Преглед изворни кода

fix #437 : add crawler.document.max.alphanum.term.size

Shinsuke Sugaya пре 9 година
родитељ
комит
3825af907a

+ 40 - 1
src/main/java/org/codelibs/fess/helper/DocumentHelper.java

@@ -21,6 +21,8 @@ import java.util.Map;
 import org.apache.commons.lang3.StringUtils;
 import org.codelibs.core.lang.StringUtil;
 import org.codelibs.fess.crawler.entity.ResponseData;
+import org.codelibs.fess.mylasta.direction.FessConfig;
+import org.codelibs.fess.util.ComponentUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -34,7 +36,44 @@ public class DocumentHelper implements Serializable {
         if (content == null) {
             return StringUtil.EMPTY; // empty
         }
-        return content.replaceAll("[\u0000-\u0020\u007f\u3000]+", " ").trim();
+
+        final int maxAlphanumTermSize = getMaxAlphanumSize();
+        final StringBuilder buf = new StringBuilder(content.length());
+        boolean isSpace = false;
+        int alphanumSize = 0;
+        for (int i = 0; i < content.length(); i++) {
+            final char c = content.charAt(i);
+            if ((c >= '\u0000' && c <= '\u0020') || c == '\u007f' || c == '\u3000') {
+                // space
+                if (!isSpace) {
+                    buf.append(' ');
+                    isSpace = true;
+                }
+                alphanumSize = 0;
+            } else if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
+                // alphanum
+                if (maxAlphanumTermSize >= 0) {
+                    if (alphanumSize < maxAlphanumTermSize) {
+                        buf.append(c);
+                    }
+                    alphanumSize++;
+                } else {
+                    buf.append(c);
+                }
+                isSpace = false;
+            } else {
+                buf.append(c);
+                isSpace = false;
+                alphanumSize = 0;
+            }
+        }
+
+        return buf.toString().trim();
+    }
+
+    protected int getMaxAlphanumSize() { // protected so a subclass can supply a fixed limit instead of reading FessConfig
+        final FessConfig fessConfig = ComponentUtil.getFessConfig();
+        return fessConfig.getCrawlerDocumentMaxAlphanumTermSizeAsInteger().intValue(); // value of 'crawler.document.max.alphanum.term.size'; getContent only truncates when this is >= 0
     }
 
     public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {

+ 26 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -127,6 +127,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. true */
     String CRAWLER_DOCUMENT_APPEND_DATA = "crawler.document.append.data";
 
+    /** The key of the configuration. e.g. -1 */
+    String CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE = "crawler.document.max.alphanum.term.size";
+
     /** The key of the configuration. e.g. UTF-8 */
     String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
 
@@ -1035,6 +1038,21 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
     boolean isCrawlerDocumentAppendData();
 
+    /**
+     * Get the value for the key 'crawler.document.max.alphanum.term.size'. <br>
+     * The value is, e.g. -1 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentMaxAlphanumTermSize();
+
+    /**
+     * Get the value for the key 'crawler.document.max.alphanum.term.size' as {@link Integer}. <br>
+     * The value is, e.g. -1 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentMaxAlphanumTermSizeAsInteger();
+
     /**
      * Get the value for the key 'crawler.crawling.data.encoding'. <br>
      * The value is, e.g. UTF-8 <br>
@@ -3247,6 +3265,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return is(FessConfig.CRAWLER_DOCUMENT_APPEND_DATA);
         }
 
+        public String getCrawlerDocumentMaxAlphanumTermSize() {
+            return get(FessConfig.CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE); // raw string value of 'crawler.document.max.alphanum.term.size'
+        }
+
+        public Integer getCrawlerDocumentMaxAlphanumTermSizeAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE); // same key as the String getter, returned as an Integer
+        }
+
         public String getCrawlerCrawlingDataEncoding() {
             return get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING);
         }

+ 1 - 0
src/main/resources/fess_config.properties

@@ -77,6 +77,7 @@ crawler.document.site.encoding=UTF-8
 crawler.document.unknown.hostname=unknown
 crawler.document.use.site.encoding.on.english=false
 crawler.document.append.data=true
+crawler.document.max.alphanum.term.size=-1
 crawler.crawling.data.encoding=UTF-8
 crawler.web.protocols=http,https
 crawler.file.protocols=file,smb

+ 26 - 2
src/test/java/org/codelibs/fess/helper/DocumentHelperTest.java

@@ -22,15 +22,15 @@ import org.codelibs.fess.crawler.entity.ResponseData;
 import org.codelibs.fess.unit.UnitFessTestCase;
 
 public class DocumentHelperTest extends UnitFessTestCase {
-    private DocumentHelper documentHelper;
 
     @Override
     public void setUp() throws Exception {
         super.setUp();
-        documentHelper = new DocumentHelper();
     }
 
     public void test_getContent() {
+        DocumentHelper documentHelper = new DocumentHelper();
+
         ResponseData responseData = new ResponseData();
         Map<String, Object> dataMap = new HashMap<>();
         assertEquals("", documentHelper.getContent(responseData, null, dataMap));
@@ -45,7 +45,31 @@ public class DocumentHelperTest extends UnitFessTestCase {
         assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
     }
 
+    public void test_getContent_maxAlphanum() { // verifies getContent when each alphanumeric run is capped at 2 characters
+        DocumentHelper documentHelper = new DocumentHelper() {
+            protected int getMaxAlphanumSize() {
+                return 2; // fixed limit so the test does not depend on FessConfig
+            }
+        };
+
+        ResponseData responseData = new ResponseData();
+        Map<String, Object> dataMap = new HashMap<>();
+        assertEquals("", documentHelper.getContent(responseData, null, dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "  ", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
+        assertEquals("12 ab", documentHelper.getContent(responseData, " 123 abc ", dataMap));
+        assertEquals("12 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap)); // ASCII digits are capped at 2; the non-alphanumeric characters are kept in full
+        assertEquals("12 ab", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
+        assertEquals("12", documentHelper.getContent(responseData, " 123abc ", dataMap)); // digits and letters form one run, so only its first two characters remain
+    }
+
     public void test_getDigest() {
+        DocumentHelper documentHelper = new DocumentHelper();
+
         ResponseData responseData = new ResponseData();
         Map<String, Object> dataMap = new HashMap<>();
         assertEquals("1234567...", documentHelper.getDigest(responseData, " 1234567890  1234567890  1234567890 ", dataMap, 10));