Merge pull request #43095 from aaronlehmann/avoid-regexp-simple-cases

fileutils: Avoid compiling a regexp for simple patterns
This commit is contained in:
Aaron Lehmann 2022-01-20 17:43:07 -08:00 committed by GitHub
commit 40bb983175
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 136 additions and 9 deletions

View file

@ -285,12 +285,23 @@ func (pm *PatternMatcher) Patterns() []*Pattern {
// Pattern defines a single regexp used to filter file paths.
type Pattern struct {
matchType matchType
cleanedPattern string
dirs []string
regexp *regexp.Regexp
exclusion bool
}
type matchType int
const (
unknownMatch matchType = iota
exactMatch
prefixMatch
suffixMatch
regexpMatch
)
func (p *Pattern) String() string {
return p.cleanedPattern
}
@ -301,18 +312,34 @@ func (p *Pattern) Exclusion() bool {
}
func (p *Pattern) match(path string) (bool, error) {
if p.regexp == nil {
if err := p.compile(); err != nil {
if p.matchType == unknownMatch {
if err := p.compile(string(os.PathSeparator)); err != nil {
return false, filepath.ErrBadPattern
}
}
b := p.regexp.MatchString(path)
switch p.matchType {
case exactMatch:
return path == p.cleanedPattern, nil
case prefixMatch:
// strip trailing **
return strings.HasPrefix(path, p.cleanedPattern[:len(p.cleanedPattern)-2]), nil
case suffixMatch:
// strip leading **
suffix := p.cleanedPattern[2:]
if strings.HasSuffix(path, suffix) {
return true, nil
}
// **/foo matches "foo"
return suffix[0] == os.PathSeparator && path == suffix[1:], nil
case regexpMatch:
return p.regexp.MatchString(path), nil
}
return b, nil
return false, nil
}
func (p *Pattern) compile() error {
func (p *Pattern) compile(sl string) error {
regStr := "^"
pattern := p.cleanedPattern
// Go through the pattern and convert it to a regexp.
@ -320,13 +347,13 @@ func (p *Pattern) compile() error {
var scan scanner.Scanner
scan.Init(strings.NewReader(pattern))
sl := string(os.PathSeparator)
escSL := sl
if sl == `\` {
escSL += `\`
}
for scan.Peek() != scanner.EOF {
p.matchType = exactMatch
for i := 0; scan.Peek() != scanner.EOF; i++ {
ch := scan.Next()
if ch == '*' {
@ -341,20 +368,32 @@ func (p *Pattern) compile() error {
if scan.Peek() == scanner.EOF {
// is "**EOF" - to align with .gitignore just accept all
regStr += ".*"
if p.matchType == exactMatch {
p.matchType = prefixMatch
} else {
regStr += ".*"
p.matchType = regexpMatch
}
} else {
// is "**"
// Note that this allows for any # of /'s (even 0) because
// the .* will eat everything, even /'s
regStr += "(.*" + escSL + ")?"
p.matchType = regexpMatch
}
if i == 0 {
p.matchType = suffixMatch
}
} else {
// is "*" so map it to anything but "/"
regStr += "[^" + escSL + "]*"
p.matchType = regexpMatch
}
} else if ch == '?' {
// "?" is any char except "/"
regStr += "[^" + escSL + "]"
p.matchType = regexpMatch
} else if shouldEscape(ch) {
// Escape some regexp special chars that have no meaning
// in golang's filepath.Match
@ -371,14 +410,22 @@ func (p *Pattern) compile() error {
}
if scan.Peek() != scanner.EOF {
regStr += `\` + string(scan.Next())
p.matchType = regexpMatch
} else {
regStr += `\`
}
} else if ch == '[' || ch == ']' {
regStr += string(ch)
p.matchType = regexpMatch
} else {
regStr += string(ch)
}
}
if p.matchType != regexpMatch {
return nil
}
regStr += "$"
re, err := regexp.Compile(regStr)
@ -387,6 +434,7 @@ func (p *Pattern) compile() error {
}
p.regexp = re
p.matchType = regexpMatch
return nil
}

View file

@ -382,10 +382,14 @@ func TestMatches(t *testing.T) {
{"a(b)c/def", "a(b)c/def", true},
{"a(b)c/def", "a(b)c/xyz", false},
{"a.|)$(}+{bc", "a.|)$(}+{bc", true},
{"dist/proxy.py-2.4.0rc3.dev36+g08acad9-py3-none-any.whl", "dist/proxy.py-2.4.0rc3.dev36+g08acad9-py3-none-any.whl", true},
{"dist/*.whl", "dist/proxy.py-2.4.0rc3.dev36+g08acad9-py3-none-any.whl", true},
}
multiPatternTests := []multiPatternTestCase{
{[]string{"**", "!util/docker/web"}, "util/docker/web/foo", false},
{[]string{"**", "!util/docker/web", "util/docker/web/foo"}, "util/docker/web/foo", true},
{[]string{"**", "!dist/proxy.py-2.4.0rc3.dev36+g08acad9-py3-none-any.whl"}, "dist/proxy.py-2.4.0rc3.dev36+g08acad9-py3-none-any.whl", false},
{[]string{"**", "!dist/*.whl"}, "dist/proxy.py-2.4.0rc3.dev36+g08acad9-py3-none-any.whl", false},
}
if runtime.GOOS != "windows" {
@ -671,7 +675,7 @@ func errp(e error) string {
return e.Error()
}
// TestMatch test's our version of filepath.Match, called regexpMatch.
// TestMatch tests our version of filepath.Match, called Matches.
func TestMatch(t *testing.T) {
for _, tt := range matchTests {
pattern := tt.pattern
@ -690,3 +694,78 @@ func TestMatch(t *testing.T) {
}
}
}
type compileTestCase struct {
pattern string
matchType matchType
compiledRegexp string
windowsCompiledRegexp string
}
var compileTests = []compileTestCase{
{"*", regexpMatch, `^[^/]*$`, `^[^\\]*$`},
{"file*", regexpMatch, `^file[^/]*$`, `^file[^\\]*$`},
{"*file", regexpMatch, `^[^/]*file$`, `^[^\\]*file$`},
{"a*/b", regexpMatch, `^a[^/]*/b$`, `^a[^\\]*\\b$`},
{"**", suffixMatch, "", ""},
{"**/**", regexpMatch, `^(.*/)?.*$`, `^(.*\\)?.*$`},
{"dir/**", prefixMatch, "", ""},
{"**/dir", suffixMatch, "", ""},
{"**/dir2/*", regexpMatch, `^(.*/)?dir2/[^/]*$`, `^(.*\\)?dir2\\[^\\]*$`},
{"**/dir2/**", regexpMatch, `^(.*/)?dir2/.*$`, `^(.*\\)?dir2\\.*$`},
{"**file", suffixMatch, "", ""},
{"**/file*txt", regexpMatch, `^(.*/)?file[^/]*txt$`, `^(.*\\)?file[^\\]*txt$`},
{"**/**/*.txt", regexpMatch, `^(.*/)?(.*/)?[^/]*\.txt$`, `^(.*\\)?(.*\\)?[^\\]*\.txt$`},
{"a[b-d]e", regexpMatch, `^a[b-d]e$`, `^a[b-d]e$`},
{".*", regexpMatch, `^\.[^/]*$`, `^\.[^\\]*$`},
{"abc.def", exactMatch, "", ""},
{"abc?def", regexpMatch, `^abc[^/]def$`, `^abc[^\\]def$`},
{"**/foo/bar", suffixMatch, "", ""},
{"a(b)c/def", exactMatch, "", ""},
{"a.|)$(}+{bc", exactMatch, "", ""},
{"dist/proxy.py-2.4.0rc3.dev36+g08acad9-py3-none-any.whl", exactMatch, "", ""},
}
// TestCompile confirms that "compile" assigns the correct match type to a
// variety of test case patterns. If the match type is regexp, it also confirms
// that the compiled regexp matches the expected regexp.
func TestCompile(t *testing.T) {
t.Run("slash", testCompile("/"))
t.Run("backslash", testCompile(`\`))
}
func testCompile(sl string) func(*testing.T) {
return func(t *testing.T) {
for _, tt := range compileTests {
// Avoid NewPatternMatcher, which has platform-specific behavior
pm := &PatternMatcher{
patterns: make([]*Pattern, 1),
}
pattern := path.Clean(tt.pattern)
if sl != "/" {
pattern = strings.ReplaceAll(pattern, "/", sl)
}
newp := &Pattern{}
newp.cleanedPattern = pattern
newp.dirs = strings.Split(pattern, sl)
pm.patterns[0] = newp
if err := pm.patterns[0].compile(sl); err != nil {
t.Fatalf("Failed to compile pattern %q: %v", pattern, err)
}
if pm.patterns[0].matchType != tt.matchType {
t.Errorf("pattern %q: matchType = %v, want %v", pattern, pm.patterns[0].matchType, tt.matchType)
continue
}
if tt.matchType == regexpMatch {
if sl == `\` {
if pm.patterns[0].regexp.String() != tt.windowsCompiledRegexp {
t.Errorf("pattern %q: regexp = %s, want %s", pattern, pm.patterns[0].regexp, tt.windowsCompiledRegexp)
}
} else if pm.patterns[0].regexp.String() != tt.compiledRegexp {
t.Errorf("pattern %q: regexp = %s, want %s", pattern, pm.patterns[0].regexp, tt.compiledRegexp)
}
}
}
}
}