fix #1653 improve filtering paths

This commit is contained in:
Shinsuke Sugaya 2018-05-17 06:23:56 +09:00
parent ee4d85d594
commit ce72889f36
2 changed files with 77 additions and 35 deletions

View file

@ -23,6 +23,7 @@ import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import javax.xml.parsers.SAXParser;
@ -223,29 +224,7 @@ public class GsaConfigParser extends DefaultHandler {
protected String parseFilterPaths(final String text, final boolean web, final boolean file) {
return split(text, "\n").get(stream -> stream.map(String::trim).filter(StringUtil::isNotBlank).map(s -> {
if (s.startsWith("#")) {
return null;
} else if (s.startsWith(CONTAINS)) {
final String v = s.substring(CONTAINS.length());
final StringBuilder buf = new StringBuilder(100);
return appendFileterPath(buf, escape(v));
} else if (s.startsWith(REGEXP_IGNORE_CASE)) {
final String v = s.substring(REGEXP_IGNORE_CASE.length());
final StringBuilder buf = new StringBuilder(100);
buf.append("(?i)");
return appendFileterPath(buf, unescape(v));
} else if (s.startsWith(REGEXP)) {
final String v = s.substring(REGEXP.length());
final StringBuilder buf = new StringBuilder(100);
return appendFileterPath(buf, unescape(v));
} else if (Arrays.stream(webProtocols).anyMatch(p -> s.startsWith(p))) {
return escape(s) + ".*";
} else if (Arrays.stream(fileProtocols).anyMatch(p -> s.startsWith(p))) {
return escape(s) + ".*";
} else {
final StringBuilder buf = new StringBuilder(100);
return appendFileterPath(buf, escape(s));
}
return getFilterPath(s);
}).filter(s -> {
if (StringUtil.isBlank(s)) {
return false;
@ -260,15 +239,43 @@ public class GsaConfigParser extends DefaultHandler {
}).collect(Collectors.joining("\n")));
}
/**
 * Converts one line of a GSA-style URL pattern list into a regular expression string.
 *
 * <p>The line is classified by its prefix, checked in this order:
 * <ol>
 * <li>{@code #} - comment line, produces no pattern;</li>
 * <li>{@code CONTAINS} - literal substring match, wrapped in {@code .*} on both sides;</li>
 * <li>{@code REGEXP_IGNORE_CASE} - raw regular expression prefixed with {@code (?i)};</li>
 * <li>{@code REGEXP} - raw regular expression;</li>
 * <li>a known web or file protocol - literal prefix match followed by {@code .*};</li>
 * <li>anything else - literal pattern handled by {@link #appendFileterPath}.</li>
 * </ol>
 *
 * @param s a single pattern line (parseFilterPaths passes it trimmed and non-blank)
 * @return the regular expression, or an empty string when the line yields no pattern
 */
protected String getFilterPath(final String s) {
    if (s.startsWith("#")) {
        return StringUtil.EMPTY;
    }
    if (s.startsWith(CONTAINS)) {
        final String literal = s.substring(CONTAINS.length());
        final String body = appendFileterPath(new StringBuilder(100), escape(literal));
        if (StringUtil.isBlank(body)) {
            // escape() yields an empty string for values it treats as comments;
            // wrapping that in ".*" would create a pattern that matches every path.
            return StringUtil.EMPTY;
        }
        return ".*" + body + ".*";
    }
    if (s.startsWith(REGEXP_IGNORE_CASE)) {
        final String expr = s.substring(REGEXP_IGNORE_CASE.length());
        final StringBuilder buf = new StringBuilder(100);
        buf.append("(?i)");
        return appendFileterPath(buf, unescape(expr));
    }
    if (s.startsWith(REGEXP)) {
        final String expr = s.substring(REGEXP.length());
        return appendFileterPath(new StringBuilder(100), unescape(expr));
    }
    // Web and file URLs are treated the same way: literal prefix, anything after it.
    final boolean hasProtocol = Arrays.stream(webProtocols).anyMatch(s::startsWith) //
            || Arrays.stream(fileProtocols).anyMatch(s::startsWith);
    if (hasProtocol) {
        return escape(s) + ".*";
    }
    return appendFileterPath(new StringBuilder(100), escape(s));
}
/**
 * Quotes a literal pattern so it can be embedded in a regular expression.
 *
 * <p>A leading {@code ^} and a trailing {@code $} are kept as regex anchors; everything
 * between them is quoted with {@link Pattern#quote(String)}, so every regex metacharacter
 * (including both parentheses) is treated literally. A value starting with {@code #} is
 * treated as a comment and produces an empty string.
 *
 * @param s the literal pattern text
 * @return the quoted pattern with its anchors preserved, or an empty string for a comment
 */
protected String escape(final String s) {
    if (s.startsWith("#")) {
        return StringUtil.EMPTY;
    } else if (s.startsWith("^") && s.endsWith("$")) {
        // Anchored on both sides: quote only the text between the anchors.
        return "^" + Pattern.quote(s.substring(1, s.length() - 1)) + "$";
    } else if (s.startsWith("^")) {
        return "^" + Pattern.quote(s.substring(1));
    } else if (s.endsWith("$")) {
        return Pattern.quote(s.substring(0, s.length() - 1)) + "$";
    }
    return Pattern.quote(s);
}
protected String unescape(final String s) {
@ -276,12 +283,24 @@ public class GsaConfigParser extends DefaultHandler {
}
protected String appendFileterPath(final StringBuilder buf, final String v) {
if (!v.startsWith("^")) {
buf.append(".*");
if (StringUtil.isBlank(v)) {
return StringUtil.EMPTY;
}
buf.append(v);
if (!v.endsWith("$")) {
if (v.startsWith("^")) {
buf.append(v);
if (!v.endsWith("$")) {
buf.append(".*");
}
} else if (v.endsWith("$")) {
buf.append(".*");
buf.append(v);
} else if (v.endsWith("/\\E")) {
buf.append(".*");
buf.append(v);
buf.append(".*");
} else {
buf.append(v);
}
return buf.toString();
}

View file

@ -40,4 +40,27 @@ public class GsaConfigParserTest extends UnitFessTestCase {
assertEquals(3, labelTypes.length);
}
/**
 * Checks the regular expression produced by getFilterPath for each supported pattern form.
 * Each row is { expected regular expression, input pattern line }.
 */
public void test_escape() {
    // Pattern syntax reference:
    // https://www.google.com/support/enterprise/static/gsa/docs/admin/70/gsa_doc_set/admin_crawl/url_patterns.html#1076127
    final String[][] cases = { //
            { "", "# Test" }, //
            { ".*\\Q!/\\E.*", "!/" }, //
            { "\\Qindex.html\\E", "index.html" }, //
            { "^\\Qhttp://\\E.*", "^http://" }, //
            { ".*\\Qindex.html\\E$", "index.html$" }, //
            { "^\\Qhttp://www.codelibs.org/page.html\\E$", "^http://www.codelibs.org/page.html$" }, //
            { "\\Qhttp://www.codelibs.org/\\E.*", "http://www.codelibs.org/" }, //
            { "\\Qsmb://server/test/\\E.*", "smb://server/test/" }, //
            { ".*\\Q?\\E.*", "contains:?" }, //
            { ".*\\Q\001\\E.*", "contains:\001" }, //
            { "(?i).*\\.exe$", "regexpIgnoreCase:\\.exe$" }, //
            { "(?i)index.html", "regexpIgnoreCase:index.html" }, //
            { ".*\\.exe$", "regexp:\\.exe$" }, //
            { "index.html", "regexp:index.html" } //
    };
    for (final String[] row : cases) {
        assertEscapePattern(row[0], row[1]);
    }
}
/**
 * Asserts that a fresh GsaConfigParser turns {@code value} into the pattern {@code expect}.
 *
 * @param expect the expected regular expression string
 * @param value the pattern line handed to getFilterPath
 */
private void assertEscapePattern(final String expect, final String value) {
    final String actual = new GsaConfigParser().getFilterPath(value);
    assertEquals(expect, actual);
}
}