Procházet zdrojové kódy

fix #442 : add crawler.document.max.symbol.term.size

Shinsuke Sugaya před 9 roky
rodič
revize
ca3d0e8e82

+ 24 - 2
src/main/java/org/codelibs/fess/helper/DocumentHelper.java

@@ -38,10 +38,12 @@ public class DocumentHelper implements Serializable {
             return StringUtil.EMPTY; // empty
         }
 
-        final int maxAlphanumTermSize = getMaxAlphanumSize();
+        final int maxAlphanumTermSize = getMaxAlphanumTermSize();
+        final int maxSymbolTermSize = getMaxSymbolTermSize();
         final UnsafeStringBuilder buf = new UnsafeStringBuilder(content.length());
         boolean isSpace = false;
         int alphanumSize = 0;
+        int symbolSize = 0;
         for (int i = 0; i < content.length(); i++) {
             final char c = content.charAt(i);
             if (Character.isISOControl(c) || c == '\u0020' || c == '\u3000' || c == 65533) {
@@ -51,6 +53,7 @@ public class DocumentHelper implements Serializable {
                     isSpace = true;
                 }
                 alphanumSize = 0;
+                symbolSize = 0;
             } else if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
                 // alphanum
                 if (maxAlphanumTermSize >= 0) {
@@ -62,21 +65,40 @@ public class DocumentHelper implements Serializable {
                     buf.append(c);
                 }
                 isSpace = false;
+                symbolSize = 0;
+            } else if ((c >= '!' && c <= '/') || (c >= ':' && c <= '@') || (c >= '[' && c <= '`') || (c >= '{' && c <= '~')) {
+                // symbol
+                if (maxSymbolTermSize >= 0) {
+                    if (symbolSize < maxSymbolTermSize) {
+                        buf.append(c);
+                    }
+                    symbolSize++;
+                } else {
+                    buf.append(c);
+                }
+                isSpace = false;
+                alphanumSize = 0;
             } else {
                 buf.append(c);
                 isSpace = false;
                 alphanumSize = 0;
+                symbolSize = 0;
             }
         }
 
         return buf.toUnsafeString().trim();
     }
 
-    protected int getMaxAlphanumSize() {
+    protected int getMaxAlphanumTermSize() {
         final FessConfig fessConfig = ComponentUtil.getFessConfig();
         return fessConfig.getCrawlerDocumentMaxAlphanumTermSizeAsInteger().intValue();
     }
 
+    protected int getMaxSymbolTermSize() {
+        final FessConfig fessConfig = ComponentUtil.getFessConfig();
+        return fessConfig.getCrawlerDocumentMaxSymbolTermSizeAsInteger().intValue();
+    }
+
     public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {
         if (content == null) {
             return StringUtil.EMPTY; // empty

+ 31 - 5
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -42,7 +42,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
 
     /** The key of the configuration. e.g. -Djava.awt.headless=true
     -server
-    -Xmx256m
+    -Xmx512m
     -XX:MaxMetaspaceSize=128m
     -XX:CompressedClassSpaceSize=32m
     -XX:-UseGCOverheadLimit
@@ -127,9 +127,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. true */
     String CRAWLER_DOCUMENT_APPEND_DATA = "crawler.document.append.data";
 
-    /** The key of the configuration. e.g. -1 */
+    /** The key of the configuration. e.g. 20 */
     String CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE = "crawler.document.max.alphanum.term.size";
 
+    /** The key of the configuration. e.g. 10 */
+    String CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE = "crawler.document.max.symbol.term.size";
+
     /** The key of the configuration. e.g. UTF-8 */
     String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
 
@@ -833,7 +836,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      * Get the value for the key 'jvm.crawler.options'. <br>
      * The value is, e.g. -Djava.awt.headless=true
     -server
-    -Xmx256m
+    -Xmx512m
     -XX:MaxMetaspaceSize=128m
     -XX:CompressedClassSpaceSize=32m
     -XX:-UseGCOverheadLimit
@@ -1043,19 +1046,34 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
 
     /**
      * Get the value for the key 'crawler.document.max.alphanum.term.size'. <br>
-     * The value is, e.g. -1 <br>
+     * The value is, e.g. 20 <br>
      * @return The value of found property. (NotNull: if not found, exception but basically no way)
      */
     String getCrawlerDocumentMaxAlphanumTermSize();
 
     /**
      * Get the value for the key 'crawler.document.max.alphanum.term.size' as {@link Integer}. <br>
-     * The value is, e.g. -1 <br>
+     * The value is, e.g. 20 <br>
      * @return The value of found property. (NotNull: if not found, exception but basically no way)
      * @throws NumberFormatException When the property is not integer.
      */
     Integer getCrawlerDocumentMaxAlphanumTermSizeAsInteger();
 
+    /**
+     * Get the value for the key 'crawler.document.max.symbol.term.size'. <br>
+     * The value is, e.g. 10 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentMaxSymbolTermSize();
+
+    /**
+     * Get the value for the key 'crawler.document.max.symbol.term.size' as {@link Integer}. <br>
+     * The value is, e.g. 10 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentMaxSymbolTermSizeAsInteger();
+
     /**
      * Get the value for the key 'crawler.crawling.data.encoding'. <br>
      * The value is, e.g. UTF-8 <br>
@@ -3291,6 +3309,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE);
         }
 
+        public String getCrawlerDocumentMaxSymbolTermSize() {
+            return get(FessConfig.CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE);
+        }
+
+        public Integer getCrawlerDocumentMaxSymbolTermSizeAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE);
+        }
+
         public String getCrawlerCrawlingDataEncoding() {
             return get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING);
         }

+ 2 - 0
src/main/resources/crawler/extractor+tikaExtractor.xml

@@ -8,5 +8,7 @@
 		<property name="maxCompressionRatio">2</property>
 		<property name="maxUncompressionSize">10000000</property>
 		<property name="maxAlphanumTermSize">20</property>
+		<property name="maxSymbolTermSize">10</property>
+		<property name="readAsTextIfFailed">false</property>
 	</component>
 </components>

+ 1 - 0
src/main/resources/fess_config.properties

@@ -78,6 +78,7 @@ crawler.document.unknown.hostname=unknown
 crawler.document.use.site.encoding.on.english=false
 crawler.document.append.data=true
 crawler.document.max.alphanum.term.size=20
+crawler.document.max.symbol.term.size=10
 crawler.crawling.data.encoding=UTF-8
 crawler.web.protocols=http,https
 crawler.file.protocols=file,smb

+ 33 - 1
src/test/java/org/codelibs/fess/helper/DocumentHelperTest.java

@@ -47,7 +47,7 @@ public class DocumentHelperTest extends UnitFessTestCase {
 
     public void test_getContent_maxAlphanum() {
         DocumentHelper documentHelper = new DocumentHelper() {
-            protected int getMaxAlphanumSize() {
+            protected int getMaxAlphanumTermSize() {
                 return 2;
             }
         };
@@ -67,6 +67,38 @@ public class DocumentHelperTest extends UnitFessTestCase {
         assertEquals("12", documentHelper.getContent(responseData, " 123abc ", dataMap));
     }
 
+    public void test_getContent_maxSymbol() {
+        DocumentHelper documentHelper = new DocumentHelper() {
+            protected int getMaxSymbolTermSize() {
+                return 2;
+            }
+        };
+
+        ResponseData responseData = new ResponseData();
+        Map<String, Object> dataMap = new HashMap<>();
+        assertEquals("", documentHelper.getContent(responseData, null, dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "  ", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
+        assertEquals("123 abc", documentHelper.getContent(responseData, " 123 abc ", dataMap));
+        assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
+        assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
+        assertEquals("123abc", documentHelper.getContent(responseData, " 123abc ", dataMap));
+
+        assertEquals("!!", documentHelper.getContent(responseData, "!!!", dataMap));
+        assertEquals("//", documentHelper.getContent(responseData, "///", dataMap));
+        assertEquals("::", documentHelper.getContent(responseData, ":::", dataMap));
+        assertEquals("@@", documentHelper.getContent(responseData, "@@@", dataMap));
+        assertEquals("[[", documentHelper.getContent(responseData, "[[[", dataMap));
+        assertEquals("``", documentHelper.getContent(responseData, "```", dataMap));
+        assertEquals("{{", documentHelper.getContent(responseData, "{{{", dataMap));
+        assertEquals("~~", documentHelper.getContent(responseData, "~~~", dataMap));
+        assertEquals("!\"", documentHelper.getContent(responseData, "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", dataMap));
+    }
+
     public void test_getDigest() {
         DocumentHelper documentHelper = new DocumentHelper();