[TOC]

go标准库-regexp

表达式对象

//初始化结构体对象的方法
func Compile(expr string) (*Regexp, error)
//和Compile函数相似，但是会把正则语法限制为POSIX ERE（即`egrep`）语法，并且匹配语义改为最左最长（leftmost-longest）匹配
func CompilePOSIX(expr string) (*Regexp, error)
//Must系列函数和上面两个函数相似，但是不会返回error,如果有异常直接panic
func MustCompile(str string) *Regexp
func MustCompilePOSIX(str string) *Regexp
go

是否匹配

// Sample text containing an e-mail address and a hyperlink.
str := `hello, this is a email:3232@qq.com, you can visit my site <a href="https://www.go2live.cn">site</a>`
// A pattern without metacharacters: it matches the literal text "email".
hasEmailTextReg := regexp.MustCompile(`email`)
// Plain substring test; no regular expression is involved.
// NOTE(review): t comes from an enclosing test function that is not part of this snippet.
if strings.Contains(str, "email") {
	t.Log("has email")
}
// The same test expressed through the compiled regular expression.
if hasEmailTextReg.MatchString(str) {
	t.Log("has email")
}
go

单纯的判断是否包含文本，用strings.Contains就可以。
含有一定规则的文本才需要用到正则表达式。

	// Pattern shape: a run of [a-zA-Z0-9_-], an "@", another such run, then one
	// or more groups made of a literal "." followed by a run of [a-zA-Z0-9_-].
	hasEmailReg := regexp.MustCompile(`[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+`)
	// str and t are defined outside this snippet.
	if hasEmailReg.MatchString(str) {
		t.Log("has email")
	}
go

支持的函数有：

//Match检查b中是否存在匹配pattern的子序列
func (re *Regexp) Match(b []byte) bool
//MatchString类似Match，但匹配对象是字符串。
func (re *Regexp) MatchString(s string) bool
//MatchReader类似Match，但匹配对象是io.RuneReader。
func (re *Regexp) MatchReader(r io.RuneReader) bool
go

获取匹配内容

获取首匹配项

更一般的情况，我们是需要提取内容，譬如提取邮箱。

	// Pattern shape: a run of [a-zA-Z0-9_-], an "@", another such run, then one
	// or more groups made of a literal "." followed by a run of [a-zA-Z0-9_-].
	emailReg := regexp.MustCompile(`[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+`)
	// str and t are defined outside this snippet.
	if emailReg.MatchString(str) {
		t.Log("has email")
		// FindString yields only the first (leftmost) match.
		email := emailReg.FindString(str)
		t.Log(email)
	}
go

FindString 只会返回第一个匹配项，类似的还有：

// Find返回保管正则表达式re在b中的最左侧的一个匹配结果的[]byte切片。如果没有匹配到，会返回nil。
func (re *Regexp) Find(b []byte) []byte
//FindString返回保管正则表达式re在s中的最左侧的一个匹配结果的字符串。如果没有匹配到，会返回""；但如果正则表达式成功匹配了一个空字符串，也会返回""。如果需要区分这种情况，请使用FindStringIndex 或FindStringSubmatch。
func (re *Regexp) FindString(s string) string
//FindIndex返回保管正则表达式re在b中的最左侧的一个匹配结果的起止位置的切片（显然len(loc)==2）。匹配结果可以通过起止位置对b做切片操作得到：b[loc[0]:loc[1]]。如果没有匹配到，会返回nil。
func (re *Regexp) FindIndex(b []byte) (loc []int)
//FindStringIndex返回保管正则表达式re在s中的最左侧的一个匹配结果的起止位置的切片（显然len(loc)==2）。匹配结果可以通过起止位置对s做切片操作得到：s[loc[0]:loc[1]]。如果没有匹配到，会返回nil。
func (re *Regexp) FindStringIndex(s string) (loc []int)
//FindReaderIndex返回保管正则表达式re在输入流r中的最左侧的一个匹配结果的起止位置的切片（显然len(loc)==2）。匹配结果可以在输入流r的字节偏移量loc[0]到loc[1]-1（包括二者）位置找到。如果没有匹配到，会返回nil。
func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int)
go

获取多个匹配项

我需要提取所有的邮箱。

// TestReg collects every e-mail address in the sample text and logs them
// together as one slice.
func TestReg(t *testing.T) {
	const text = `hello, this is a email:3232@qq.com(test@qq.com), you can visit my site <a href="https://www.go2live.cn">site</a>`
	emailPattern := regexp.MustCompile(`[a-zA-Z0-9_-]+@[a-zA-Z0-9_-]+(\.[a-zA-Z0-9_-]+)+`)
	// Nothing is logged when the text holds no address at all.
	if !emailPattern.MatchString(text) {
		return
	}
	t.Log("has email")
	// A negative count requests all matches instead of a capped number.
	t.Log(emailPattern.FindAllString(text, -1))
}
go

相似的函数有:

//FindAll返回保管正则表达式re在b中的所有不重叠的匹配结果的[][]byte切片。如果没有匹配到，会返回nil。
func (re *Regexp) FindAll(b []byte, n int) [][]byte
//FindAllString返回保管正则表达式re在s中的所有不重叠的匹配结果的[]string切片。如果没有匹配到，会返回nil。
func (re *Regexp) FindAllString(s string, n int) []string
//FindAllIndex返回保管正则表达式re在b中的所有不重叠的匹配结果的起止位置的切片。如果没有匹配到，会返回nil。
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int
//FindAllStringIndex返回保管正则表达式re在s中的所有不重叠的匹配结果的起止位置的切片。如果没有匹配到，会返回nil。
func (re *Regexp) FindAllStringIndex(s string, n int) [][]int
go

提取首匹配子匹配项

我要提取超链接里的文本。

// TestRegSubMatch logs every hyperlink found in the sample text, then logs
// the whole match and each capture group of the first hyperlink.
func TestRegSubMatch(t *testing.T) {
	const page = `hello, this is a email:3232@qq.com(test@qq.com), you can visit my site <a href="https://www.go2live.cn">site</a>
    or <a href='https://www.go2live.cn'>site2</a>
`
	// Groups: 1 = opening quote, 2 = href value, 3 = closing quote, 4 = link text.
	anchorRe := regexp.MustCompile(`<a href=('|")([^'"]*)('|")>([^<]*)</a>`)
	if !anchorRe.MatchString(page) {
		t.Log("no link")
		return
	}
	t.Log(anchorRe.FindAllString(page, -1))
	// Index 0 is the complete match; the capture groups follow from index 1.
	for idx, group := range anchorRe.FindStringSubmatch(page) {
		t.Log(idx, group)
	}
}
go

输出

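=== RUN TestRegSubMatch
--- PASS: TestRegSubMatch (0.00s)
reg_test.go:28: [<a href="https://www.go2live.cn">site</a> <a href='https://www.go2live.cn'>site2</a>]
reg_test.go:31: 0 <a href="https://www.go2live.cn">site</a>
reg_test.go:31: 1 "
reg_test.go:31: 2 https://www.go2live.cn
reg_test.go:31: 3 "
reg_test.go:31: 4 site
PASS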

类似的函数有：

//FindSubmatch返回一个保管正则表达式re在b中的最左侧的一个匹配结果以及（可能有的）分组匹配的结果的[][]byte切片。如果没有匹配到，会返回nil。
func (re *Regexp) FindSubmatch(b []byte) [][]byte
//FindStringSubmatch返回一个保管正则表达式re在s中的最左侧的一个匹配结果以及（可能有的）分组匹配的结果的[]string切片。如果没有匹配到，会返回nil。
func (re *Regexp) FindStringSubmatch(s string) []string
//FindSubmatchIndex返回一个保管正则表达式re在b中的最左侧的一个匹配结果以及（可能有的）分组匹配的结果的起止位置的切片。匹配结果和分组匹配结果可以通过起止位置对b做切片操作得到：b[loc[2*n]:loc[2*n+1]]。如果没有匹配到，会返回nil。
func (re *Regexp) FindSubmatchIndex(b []byte) []int
//FindStringSubmatchIndex返回一个保管正则表达式re在s中的最左侧的一个匹配结果以及（可能有的）分组匹配的结果的起止位置的切片。匹配结果和分组匹配结果可以通过起止位置对s做切片操作得到：s[loc[2*n]:loc[2*n+1]]。如果没有匹配到，会返回nil。
func (re *Regexp) FindStringSubmatchIndex(s string) []int
//FindReaderSubmatchIndex返回一个保管正则表达式re在输入流r中的最左侧的一个匹配结果以及（可能有的）分组匹配的结果的起止位置的切片。匹配结果和分组匹配结果可以在输入流r的字节偏移量loc[0]到loc[1]-1（包括二者）位置找到。如果没有匹配到，会返回nil。
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int
go

提取多个匹配的子匹配项

// TestRegSubMatch walks over every hyperlink in the sample text and, for each
// one, logs its ordinal followed by the whole match and each capture group.
func TestRegSubMatch(t *testing.T) {
	const page = `hello, this is a email:3232@qq.com(test@qq.com), you can visit my site <a href="https://www.go2live.cn">site</a>
    or <a href='https://www.go2live.cn'>site2</a>
`
	// Groups: 1 = opening quote, 2 = href value, 3 = closing quote, 4 = link text.
	anchorRe := regexp.MustCompile(`<a href=('|")([^'"]*)('|")>([^<]*)</a>`)
	if !anchorRe.MatchString(page) {
		t.Log("no link")
		return
	}
	// Outer slice: one entry per hyperlink. Inner slice: whole match, then groups.
	allGroups := anchorRe.FindAllStringSubmatch(page, -1)
	for n := 0; n < len(allGroups); n++ {
		t.Log(n, "match")
		for k, text := range allGroups[n] {
			t.Log(k, text)
		}
	}
}
go

类似的函数有：

//FindAllSubmatch返回一个保管正则表达式re在b中的所有不重叠的匹配结果及其对应的（可能有的）分组匹配的结果的[][][]byte切片。如果没有匹配到，会返回nil。
func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte
//FindAllStringSubmatch返回一个保管正则表达式re在s中的所有不重叠的匹配结果及其对应的（可能有的）分组匹配的结果的[][]string切片。如果没有匹配到，会返回nil。
func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string
//FindAllSubmatchIndex返回一个保管正则表达式re在b中的所有不重叠的匹配结果及其对应的（可能有的）分组匹配的结果的起止位置的切片（第一层表示第几个匹配结果，完整匹配和分组匹配的起止位置对在第二层）。如果没有匹配到，会返回nil。
func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int
//FindAllStringSubmatchIndex返回一个保管正则表达式re在s中的所有不重叠的匹配结果及其对应的（可能有的）分组匹配的结果的起止位置的切片（第一层表示第几个匹配结果，完整匹配和分组匹配的起止位置对在第二层）。如果没有匹配到，会返回nil。
func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int
go

总结

判断是否含有匹配项，用MatchString
提取第一个匹配项用FindString
提取所有的匹配项用FindAllString
不需要完整的匹配项，而是想要匹配项的一部分。用FindStringSubmatch

分割

按标点符号分割句子。

// TestRegSplit cuts the sample text at every listed punctuation mark that is
// followed by whitespace and logs the resulting pieces one by one.
func TestRegSplit(t *testing.T) {
	const text = `hello, this is a email:3232@qq.com(test@qq.com)。 you can visit my site <a href="https://www.go2live.cn">site</a>
    or <a href='https://www.go2live.cn'>site2</a>
`
	sentenceBreak := regexp.MustCompile(`[,.!。?]\s+`)
	// A negative count asks Split for every piece.
	parts := sentenceBreak.Split(text, -1)
	for i := 0; i < len(parts); i++ {
		t.Log(parts[i])
	}
}
go

输出

=== RUN TestRegSplit
--- PASS: TestRegSplit (0.00s)
reg_test.go:49: hello
reg_test.go:49: this is a email:3232@qq.com(test@qq.com)
reg_test.go:49: you can visit my site <a href="https://www.go2live.cn">site</a>
    or <a href='https://www.go2live.cn'>site2</a>

PASS


//Split将re在s中匹配到的结果作为分隔符将s分割成多个字符串，并返回这些正则匹配结果之间的字符串的切片。
//返回的切片不会包含正则匹配的结果，只包含匹配结果之间的片段。当正则表达式re中不含正则元字符时，本方法等价于strings.SplitN。
func (re *Regexp) Split(s string, n int) []string
go

替换

去掉超链接，只保留超链接里的文本内容。

// TestRegReplace strips the hyperlink markup from the sample text, keeping
// only the text inside each link, and logs the result.
func TestRegReplace(t *testing.T) {
	const page = `hello, this is a email:3232@qq.com(test@qq.com)。 you can visit my site <a href="https://www.go2live.cn">site</a>
    or <a href='https://www.go2live.cn'>site2</a>
`
	// Groups: 1 = opening quote, 2 = href value, 3 = closing quote, 4 = link text.
	anchorRe := regexp.MustCompile(`<a href=('|")([^'"]*)('|")>([^<]*)</a>`)
	// linkText receives one complete <a ...>...</a> match and re-runs the
	// pattern on it to pull out capture group 4, the text inside the link.
	linkText := func(anchor string) string {
		return anchorRe.FindStringSubmatch(anchor)[4]
	}
	t.Log(anchorRe.ReplaceAllStringFunc(page, linkText))
}
go

输出:

=== RUN TestRegReplace
--- PASS: TestRegReplace (0.00s)
reg_test.go:65: hello, this is a email:3232@qq.com(test@qq.com)。 you can visit my site site
    or site2

其他函数还有:

// ReplaceAllString returns a copy of src, replacing matches of the Regexp
// with the replacement string repl. Inside repl, $ signs are interpreted as
// in Expand, so for instance $1 represents the text of the first submatch.
func (re *Regexp) ReplaceAllString(src, repl string) string {
	// NOTE(review): n is forwarded to replaceAll, which is not shown here; it
	// looks like the number of match-index slots to compute, with 2 covering
	// just the bounds of the whole match — confirm against replaceAll.
	n := 2
	// Only a repl containing "$" can refer to submatches, so only then is n
	// widened to two slots per subexpression plus two for the whole match.
	if strings.Contains(repl, "$") {
		n = 2 * (re.numSubexp + 1)
	}
	b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
		// Presumably appends repl, with its $ references expanded for this
		// match, onto dst — expand is not shown here.
		return re.expand(dst, repl, nil, src, match)
	})
	return string(b)
}
// ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
// with the replacement string repl. The replacement repl is substituted directly,
// without using Expand.
func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
	// The callback ignores the match positions and appends repl verbatim, so
	// any "$" in repl is kept as literal text.
	return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
		return append(dst, repl...)
	}))
}
// ReplaceAllStringFunc returns a copy of src in which all matches of the
// Regexp have been replaced by the return value of function repl applied
// to the matched substring. The replacement returned by repl is substituted
// directly, without using Expand.
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
	b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
		// match[0] and match[1] delimit the matched substring of src; whatever
		// repl returns for that substring is appended to dst unchanged.
		return append(dst, repl(src[match[0]:match[1]])...)
	})
	return string(b)
    }
go

参考

https://studygolang.com/pkgdoc

bjmayor的又一个博客

go-正则