浏览代码

fix #1653 improve filtering paths

Shinsuke Sugaya 7 年之前
父节点
当前提交
ce72889f36

+ 54 - 35
src/main/java/org/codelibs/fess/util/GsaConfigParser.java

@@ -23,6 +23,7 @@ import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.List;
 import java.util.Map;
 import java.util.Map;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 import java.util.stream.Collectors;
 
 
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParser;
@@ -223,29 +224,7 @@ public class GsaConfigParser extends DefaultHandler {
 
 
     protected String parseFilterPaths(final String text, final boolean web, final boolean file) {
     protected String parseFilterPaths(final String text, final boolean web, final boolean file) {
         return split(text, "\n").get(stream -> stream.map(String::trim).filter(StringUtil::isNotBlank).map(s -> {
         return split(text, "\n").get(stream -> stream.map(String::trim).filter(StringUtil::isNotBlank).map(s -> {
-            if (s.startsWith("#")) {
-                return null;
-            } else if (s.startsWith(CONTAINS)) {
-                final String v = s.substring(CONTAINS.length());
-                final StringBuilder buf = new StringBuilder(100);
-                return appendFileterPath(buf, escape(v));
-            } else if (s.startsWith(REGEXP_IGNORE_CASE)) {
-                final String v = s.substring(REGEXP_IGNORE_CASE.length());
-                final StringBuilder buf = new StringBuilder(100);
-                buf.append("(?i)");
-                return appendFileterPath(buf, unescape(v));
-            } else if (s.startsWith(REGEXP)) {
-                final String v = s.substring(REGEXP.length());
-                final StringBuilder buf = new StringBuilder(100);
-                return appendFileterPath(buf, unescape(v));
-            } else if (Arrays.stream(webProtocols).anyMatch(p -> s.startsWith(p))) {
-                return escape(s) + ".*";
-            } else if (Arrays.stream(fileProtocols).anyMatch(p -> s.startsWith(p))) {
-                return escape(s) + ".*";
-            } else {
-                final StringBuilder buf = new StringBuilder(100);
-                return appendFileterPath(buf, escape(s));
-            }
+            return getFilterPath(s);
         }).filter(s -> {
         }).filter(s -> {
             if (StringUtil.isBlank(s)) {
             if (StringUtil.isBlank(s)) {
                 return false;
                 return false;
@@ -260,15 +239,43 @@ public class GsaConfigParser extends DefaultHandler {
         }).collect(Collectors.joining("\n")));
         }).collect(Collectors.joining("\n")));
     }
     }
 
 
+    protected String getFilterPath(String s) {
+        if (s.startsWith("#")) {
+            return StringUtil.EMPTY;
+        } else if (s.startsWith(CONTAINS)) {
+            final String v = s.substring(CONTAINS.length());
+            final StringBuilder buf = new StringBuilder(100);
+            return ".*" + appendFileterPath(buf, escape(v)) + ".*";
+        } else if (s.startsWith(REGEXP_IGNORE_CASE)) {
+            final String v = s.substring(REGEXP_IGNORE_CASE.length());
+            final StringBuilder buf = new StringBuilder(100);
+            buf.append("(?i)");
+            return appendFileterPath(buf, unescape(v));
+        } else if (s.startsWith(REGEXP)) {
+            final String v = s.substring(REGEXP.length());
+            final StringBuilder buf = new StringBuilder(100);
+            return appendFileterPath(buf, unescape(v));
+        } else if (Arrays.stream(webProtocols).anyMatch(p -> s.startsWith(p))) {
+            return escape(s) + ".*";
+        } else if (Arrays.stream(fileProtocols).anyMatch(p -> s.startsWith(p))) {
+            return escape(s) + ".*";
+        } else {
+            final StringBuilder buf = new StringBuilder(100);
+            return appendFileterPath(buf, escape(s));
+        }
+    }
+
     protected String escape(final String s) {
     protected String escape(final String s) {
-        return s.replace(".", "\\.")//
-                .replace("+", "\\+")//
-                .replace("*", "\\*")//
-                .replace("[", "\\[")//
-                .replace("]", "\\]")//
-                .replace("(", "\\(")//
-                .replace("(", "\\)")//
-                .replace("?", "\\?");
+        if (s.startsWith("#")) {
+            return StringUtil.EMPTY;
+        } else if (s.startsWith("^") && s.endsWith("$")) {
+            return "^" + Pattern.quote(s.substring(1, s.length() - 1)) + "$";
+        } else if (s.startsWith("^")) {
+            return "^" + Pattern.quote(s.substring(1));
+        } else if (s.endsWith("$")) {
+            return Pattern.quote(s.substring(0, s.length() - 1)) + "$";
+        }
+        return Pattern.quote(s);
     }
     }
 
 
     protected String unescape(final String s) {
     protected String unescape(final String s) {
@@ -276,12 +283,24 @@ public class GsaConfigParser extends DefaultHandler {
     }
     }
 
 
     protected String appendFileterPath(final StringBuilder buf, final String v) {
     protected String appendFileterPath(final StringBuilder buf, final String v) {
-        if (!v.startsWith("^")) {
-            buf.append(".*");
+        if (StringUtil.isBlank(v)) {
+            return StringUtil.EMPTY;
         }
         }
-        buf.append(v);
-        if (!v.endsWith("$")) {
+
+        if (v.startsWith("^")) {
+            buf.append(v);
+            if (!v.endsWith("$")) {
+                buf.append(".*");
+            }
+        } else if (v.endsWith("$")) {
+            buf.append(".*");
+            buf.append(v);
+        } else if (v.endsWith("/\\E")) {
+            buf.append(".*");
+            buf.append(v);
             buf.append(".*");
             buf.append(".*");
+        } else {
+            buf.append(v);
         }
         }
         return buf.toString();
         return buf.toString();
     }
     }

+ 23 - 0
src/test/java/org/codelibs/fess/util/GsaConfigParserTest.java

@@ -40,4 +40,27 @@ public class GsaConfigParserTest extends UnitFessTestCase {
         assertEquals(3, labelTypes.length);
         assertEquals(3, labelTypes.length);
     }
     }
 
 
+    public void test_escape() {
+        // https://www.google.com/support/enterprise/static/gsa/docs/admin/70/gsa_doc_set/admin_crawl/url_patterns.html#1076127
+        assertEscapePattern("", "# Test");
+        assertEscapePattern(".*\\Q!/\\E.*", "!/");
+        assertEscapePattern("\\Qindex.html\\E", "index.html");
+        assertEscapePattern("^\\Qhttp://\\E.*", "^http://");
+        assertEscapePattern(".*\\Qindex.html\\E$", "index.html$");
+        assertEscapePattern("^\\Qhttp://www.codelibs.org/page.html\\E$", "^http://www.codelibs.org/page.html$");
+        assertEscapePattern("\\Qhttp://www.codelibs.org/\\E.*", "http://www.codelibs.org/");
+        assertEscapePattern("\\Qsmb://server/test/\\E.*", "smb://server/test/");
+        assertEscapePattern(".*\\Q?\\E.*", "contains:?");
+        assertEscapePattern(".*\\Q\001\\E.*", "contains:\001");
+        assertEscapePattern("(?i).*\\.exe$", "regexpIgnoreCase:\\.exe$");
+        assertEscapePattern("(?i)index.html", "regexpIgnoreCase:index.html");
+        assertEscapePattern(".*\\.exe$", "regexp:\\.exe$");
+        assertEscapePattern("index.html", "regexp:index.html");
+    }
+
+    private void assertEscapePattern(String expect, String value) {
+        GsaConfigParser parser = new GsaConfigParser();
+        assertEquals(expect, parser.getFilterPath(value));
+    }
+
 }
 }