diff --git a/src/main/java/org/codelibs/fess/helper/DocumentHelper.java b/src/main/java/org/codelibs/fess/helper/DocumentHelper.java index 424ee7fca..c83d4c4b7 100644 --- a/src/main/java/org/codelibs/fess/helper/DocumentHelper.java +++ b/src/main/java/org/codelibs/fess/helper/DocumentHelper.java @@ -38,10 +38,12 @@ public class DocumentHelper implements Serializable { return StringUtil.EMPTY; // empty } - final int maxAlphanumTermSize = getMaxAlphanumSize(); + final int maxAlphanumTermSize = getMaxAlphanumTermSize(); + final int maxSymbolTermSize = getMaxSymbolTermSize(); final UnsafeStringBuilder buf = new UnsafeStringBuilder(content.length()); boolean isSpace = false; int alphanumSize = 0; + int symbolSize = 0; for (int i = 0; i < content.length(); i++) { final char c = content.charAt(i); if (Character.isISOControl(c) || c == '\u0020' || c == '\u3000' || c == 65533) { @@ -51,6 +53,7 @@ public class DocumentHelper implements Serializable { isSpace = true; } alphanumSize = 0; + symbolSize = 0; } else if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) { // alphanum if (maxAlphanumTermSize >= 0) { @@ -62,21 +65,40 @@ public class DocumentHelper implements Serializable { buf.append(c); } isSpace = false; + symbolSize = 0; + } else if ((c >= '!' && c <= '/') || (c >= ':' && c <= '@') || (c >= '[' && c <= '`') || (c >= '{' && c <= '~')) { + // symbol + if (maxSymbolTermSize >= 0) { + if (symbolSize < maxSymbolTermSize) { + buf.append(c); + } + symbolSize++; + } else { + buf.append(c); + } + isSpace = false; + alphanumSize = 0; } else { buf.append(c); isSpace = false; alphanumSize = 0; + symbolSize = 0; } } return buf.toUnsafeString().trim(); } - protected int getMaxAlphanumSize() { + protected int getMaxAlphanumTermSize() { final FessConfig fessConfig = ComponentUtil.getFessConfig(); return fessConfig.getCrawlerDocumentMaxAlphanumTermSizeAsInteger().intValue(); } + protected int getMaxSymbolTermSize() { + final FessConfig fessConfig = ComponentUtil.getFessConfig(); + return fessConfig.getCrawlerDocumentMaxSymbolTermSizeAsInteger().intValue(); + } + public String getDigest(final ResponseData responseData, final String content, final Map dataMap, final int maxWidth) { if (content == null) { return StringUtil.EMPTY; // empty diff --git a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java index 6ddd16ecc..489cbd38b 100644 --- a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java +++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java @@ -42,7 +42,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction /** The key of the configuration. e.g. -Djava.awt.headless=true -server - -Xmx256m + -Xmx512m -XX:MaxMetaspaceSize=128m -XX:CompressedClassSpaceSize=32m -XX:-UseGCOverheadLimit @@ -127,9 +127,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction /** The key of the configuration. e.g. true */ String CRAWLER_DOCUMENT_APPEND_DATA = "crawler.document.append.data"; - /** The key of the configuration. e.g. -1 */ + /** The key of the configuration. e.g. 20 */ String CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE = "crawler.document.max.alphanum.term.size"; + /** The key of the configuration. e.g. 10 */ + String CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE = "crawler.document.max.symbol.term.size"; + /** The key of the configuration. e.g. UTF-8 */ String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding"; @@ -833,7 +836,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction * Get the value for the key 'jvm.crawler.options'.
* The value is, e.g. -Djava.awt.headless=true -server - -Xmx256m + -Xmx512m -XX:MaxMetaspaceSize=128m -XX:CompressedClassSpaceSize=32m -XX:-UseGCOverheadLimit @@ -1043,19 +1046,34 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction /** * Get the value for the key 'crawler.document.max.alphanum.term.size'.
- * The value is, e.g. -1
+ * The value is, e.g. 20
* @return The value of found property. (NotNull: if not found, exception but basically no way) */ String getCrawlerDocumentMaxAlphanumTermSize(); /** * Get the value for the key 'crawler.document.max.alphanum.term.size' as {@link Integer}.
- * The value is, e.g. -1
+ * The value is, e.g. 20
* @return The value of found property. (NotNull: if not found, exception but basically no way) * @throws NumberFormatException When the property is not integer. */ Integer getCrawlerDocumentMaxAlphanumTermSizeAsInteger(); + /** + * Get the value for the key 'crawler.document.max.symbol.term.size'.
+ * The value is, e.g. 10
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentMaxSymbolTermSize(); + + /** + * Get the value for the key 'crawler.document.max.symbol.term.size' as {@link Integer}.
+ * The value is, e.g. 10
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + * @throws NumberFormatException When the property is not integer. + */ + Integer getCrawlerDocumentMaxSymbolTermSizeAsInteger(); + /** * Get the value for the key 'crawler.crawling.data.encoding'.
* The value is, e.g. UTF-8
@@ -3291,6 +3309,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE); } + public String getCrawlerDocumentMaxSymbolTermSize() { + return get(FessConfig.CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE); + } + + public Integer getCrawlerDocumentMaxSymbolTermSizeAsInteger() { + return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE); + } + public String getCrawlerCrawlingDataEncoding() { return get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING); } diff --git a/src/main/resources/crawler/extractor+tikaExtractor.xml b/src/main/resources/crawler/extractor+tikaExtractor.xml index 4bb9414c3..c0921c273 100644 --- a/src/main/resources/crawler/extractor+tikaExtractor.xml +++ b/src/main/resources/crawler/extractor+tikaExtractor.xml @@ -8,5 +8,7 @@ 2 10000000 20 + 10 + false diff --git a/src/main/resources/fess_config.properties b/src/main/resources/fess_config.properties index e80eecadc..d2ae6b795 100644 --- a/src/main/resources/fess_config.properties +++ b/src/main/resources/fess_config.properties @@ -78,6 +78,7 @@ crawler.document.unknown.hostname=unknown crawler.document.use.site.encoding.on.english=false crawler.document.append.data=true crawler.document.max.alphanum.term.size=20 +crawler.document.max.symbol.term.size=10 crawler.crawling.data.encoding=UTF-8 crawler.web.protocols=http,https crawler.file.protocols=file,smb diff --git a/src/test/java/org/codelibs/fess/helper/DocumentHelperTest.java b/src/test/java/org/codelibs/fess/helper/DocumentHelperTest.java index 395c375f5..18bf891e5 100644 --- a/src/test/java/org/codelibs/fess/helper/DocumentHelperTest.java +++ b/src/test/java/org/codelibs/fess/helper/DocumentHelperTest.java @@ -47,7 +47,7 @@ public class DocumentHelperTest extends UnitFessTestCase { public void test_getContent_maxAlphanum() { DocumentHelper documentHelper = new DocumentHelper() { - protected int getMaxAlphanumSize() { + protected int getMaxAlphanumTermSize() { return 2; } }; @@ -67,6 +67,38 @@ public class DocumentHelperTest extends UnitFessTestCase { assertEquals("12", documentHelper.getContent(responseData, " 123abc ", dataMap)); } + public void test_getContent_maxSymbol() { + DocumentHelper documentHelper = new DocumentHelper() { + protected int getMaxSymbolTermSize() { + return 2; + } + }; + + ResponseData responseData = new ResponseData(); + Map dataMap = new HashMap<>(); + assertEquals("", documentHelper.getContent(responseData, null, dataMap)); + assertEquals("", documentHelper.getContent(responseData, "", dataMap)); + assertEquals("", documentHelper.getContent(responseData, " ", dataMap)); + assertEquals("", documentHelper.getContent(responseData, " ", dataMap)); + assertEquals("", documentHelper.getContent(responseData, "\t", dataMap)); + assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap)); + assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap)); + assertEquals("123 abc", documentHelper.getContent(responseData, " 123 abc ", dataMap)); + assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap)); + assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap)); + assertEquals("123abc", documentHelper.getContent(responseData, " 123abc ", dataMap)); + + assertEquals("!!", documentHelper.getContent(responseData, "!!!", dataMap)); + assertEquals("//", documentHelper.getContent(responseData, "///", dataMap)); + assertEquals("::", documentHelper.getContent(responseData, ":::", dataMap)); + assertEquals("@@", documentHelper.getContent(responseData, "@@@", dataMap)); + assertEquals("[[", documentHelper.getContent(responseData, "[[[", dataMap)); + assertEquals("``", documentHelper.getContent(responseData, "```", dataMap)); + assertEquals("{{", documentHelper.getContent(responseData, "{{{", dataMap)); + assertEquals("~~", documentHelper.getContent(responseData, "~~~", dataMap)); + assertEquals("!\"", documentHelper.getContent(responseData, "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", dataMap)); + } + public void test_getDigest() { DocumentHelper documentHelper = new DocumentHelper();