fix #442: add crawler.document.max.symbol.term.size to cap the length of consecutive symbol runs in crawled content

This commit is contained in:
Shinsuke Sugaya 2016-03-20 21:29:03 +09:00
parent 8d7085187c
commit ca3d0e8e82
5 changed files with 91 additions and 8 deletions

View file

@ -38,10 +38,12 @@ public class DocumentHelper implements Serializable {
return StringUtil.EMPTY; // empty
}
final int maxAlphanumTermSize = getMaxAlphanumSize();
final int maxAlphanumTermSize = getMaxAlphanumTermSize();
final int maxSymbolTermSize = getMaxSymbolTermSize();
final UnsafeStringBuilder buf = new UnsafeStringBuilder(content.length());
boolean isSpace = false;
int alphanumSize = 0;
int symbolSize = 0;
for (int i = 0; i < content.length(); i++) {
final char c = content.charAt(i);
if (Character.isISOControl(c) || c == '\u0020' || c == '\u3000' || c == 65533) {
@ -51,6 +53,7 @@ public class DocumentHelper implements Serializable {
isSpace = true;
}
alphanumSize = 0;
symbolSize = 0;
} else if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
// alphanum
if (maxAlphanumTermSize >= 0) {
@ -62,21 +65,40 @@ public class DocumentHelper implements Serializable {
buf.append(c);
}
isSpace = false;
symbolSize = 0;
} else if ((c >= '!' && c <= '/') || (c >= ':' && c <= '@') || (c >= '[' && c <= '`') || (c >= '{' && c <= '~')) {
// symbol
if (maxSymbolTermSize >= 0) {
if (symbolSize < maxSymbolTermSize) {
buf.append(c);
}
symbolSize++;
} else {
buf.append(c);
}
isSpace = false;
alphanumSize = 0;
} else {
buf.append(c);
isSpace = false;
alphanumSize = 0;
symbolSize = 0;
}
}
return buf.toUnsafeString().trim();
}
protected int getMaxAlphanumSize() {
protected int getMaxAlphanumTermSize() {
    // Looks up the alphanumeric term-size limit from the current Fess configuration.
    // Kept as an overridable hook; callers compare the result with ">= 0" before applying it.
    return ComponentUtil.getFessConfig().getCrawlerDocumentMaxAlphanumTermSizeAsInteger().intValue();
}
/**
 * Reads the limit applied to runs of consecutive symbol characters.
 * A negative value means no limit is enforced by the content filter.
 *
 * @return the integer value of the 'crawler.document.max.symbol.term.size' setting
 */
protected int getMaxSymbolTermSize() {
    return ComponentUtil.getFessConfig().getCrawlerDocumentMaxSymbolTermSizeAsInteger().intValue();
}
public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {
if (content == null) {
return StringUtil.EMPTY; // empty

View file

@ -42,7 +42,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. -Djava.awt.headless=true
-server
-Xmx256m
-Xmx512m
-XX:MaxMetaspaceSize=128m
-XX:CompressedClassSpaceSize=32m
-XX:-UseGCOverheadLimit
@ -127,9 +127,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. true */
String CRAWLER_DOCUMENT_APPEND_DATA = "crawler.document.append.data";
/** The key of the configuration. e.g. -1 */
/** The key of the configuration. e.g. 20 */
String CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE = "crawler.document.max.alphanum.term.size";
/** The key of the configuration. e.g. 10 */
String CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE = "crawler.document.max.symbol.term.size";
/** The key of the configuration. e.g. UTF-8 */
String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
@ -833,7 +836,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
* Get the value for the key 'jvm.crawler.options'. <br>
* The value is, e.g. -Djava.awt.headless=true
-server
-Xmx256m
-Xmx512m
-XX:MaxMetaspaceSize=128m
-XX:CompressedClassSpaceSize=32m
-XX:-UseGCOverheadLimit
@ -1043,19 +1046,34 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/**
* Get the value for the key 'crawler.document.max.alphanum.term.size'. <br>
* The value is, e.g. -1 <br>
* The value is, e.g. 20 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentMaxAlphanumTermSize();
/**
* Get the value for the key 'crawler.document.max.alphanum.term.size' as {@link Integer}. <br>
* The value is, e.g. -1 <br>
* The value is, e.g. 20 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
Integer getCrawlerDocumentMaxAlphanumTermSizeAsInteger();
/**
 * Get the raw string value for the key 'crawler.document.max.symbol.term.size'. <br>
 * The value is, e.g. 10 <br>
 * @return The value of the found property. (NotNull: a missing property raises an exception, which basically should not happen)
 */
String getCrawlerDocumentMaxSymbolTermSize();
/**
 * Get the value for the key 'crawler.document.max.symbol.term.size' converted to {@link Integer}. <br>
 * The value is, e.g. 10 <br>
 * @return The value of the found property. (NotNull: a missing property raises an exception, which basically should not happen)
 * @throws NumberFormatException When the property value is not an integer.
 */
Integer getCrawlerDocumentMaxSymbolTermSizeAsInteger();
/**
* Get the value for the key 'crawler.crawling.data.encoding'. <br>
* The value is, e.g. UTF-8 <br>
@ -3291,6 +3309,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE);
}
public String getCrawlerDocumentMaxSymbolTermSize() {
    // Plain string lookup of 'crawler.document.max.symbol.term.size'.
    final String propertyKey = FessConfig.CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE;
    return get(propertyKey);
}
public Integer getCrawlerDocumentMaxSymbolTermSizeAsInteger() {
    // Integer lookup of 'crawler.document.max.symbol.term.size'; conversion is delegated to getAsInteger.
    final String propertyKey = FessConfig.CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE;
    return getAsInteger(propertyKey);
}
public String getCrawlerCrawlingDataEncoding() {
    // Plain string lookup of 'crawler.crawling.data.encoding'.
    final String propertyKey = FessConfig.CRAWLER_CRAWLING_DATA_ENCODING;
    return get(propertyKey);
}

View file

@ -8,5 +8,7 @@
<property name="maxCompressionRatio">2</property>
<property name="maxUncompressionSize">10000000</property>
<property name="maxAlphanumTermSize">20</property>
<property name="maxSymbolTermSize">10</property>
<property name="readAsTextIfFailed">false</property>
</component>
</components>

View file

@ -78,6 +78,7 @@ crawler.document.unknown.hostname=unknown
crawler.document.use.site.encoding.on.english=false
crawler.document.append.data=true
crawler.document.max.alphanum.term.size=20
crawler.document.max.symbol.term.size=10
crawler.crawling.data.encoding=UTF-8
crawler.web.protocols=http,https
crawler.file.protocols=file,smb

View file

@ -47,7 +47,7 @@ public class DocumentHelperTest extends UnitFessTestCase {
public void test_getContent_maxAlphanum() {
DocumentHelper documentHelper = new DocumentHelper() {
protected int getMaxAlphanumSize() {
protected int getMaxAlphanumTermSize() {
return 2;
}
};
@ -67,6 +67,38 @@ public class DocumentHelperTest extends UnitFessTestCase {
assertEquals("12", documentHelper.getContent(responseData, " 123abc ", dataMap));
}
/**
 * Verifies that getContent() truncates runs of consecutive ASCII symbols to the
 * configured maximum (2 in this test) while leaving whitespace normalization and
 * non-symbol text untouched.
 */
public void test_getContent_maxSymbol() {
    DocumentHelper documentHelper = new DocumentHelper() {
        // @Override makes the compiler reject this hook if the method in
        // DocumentHelper is ever renamed, instead of silently testing the default.
        @Override
        protected int getMaxSymbolTermSize() {
            return 2;
        }
    };

    ResponseData responseData = new ResponseData();
    Map<String, Object> dataMap = new HashMap<>();

    // null, empty and whitespace-only input all collapse to an empty string
    assertEquals("", documentHelper.getContent(responseData, null, dataMap));
    assertEquals("", documentHelper.getContent(responseData, "", dataMap));
    assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
    assertEquals("", documentHelper.getContent(responseData, "  ", dataMap));
    assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
    assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
    assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));

    // text without symbols is only trimmed and whitespace-normalized
    assertEquals("123 abc", documentHelper.getContent(responseData, " 123 abc ", dataMap));
    // getContent() trims its result, so the expected value cannot start with a space
    assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
    assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
    assertEquals("123abc", documentHelper.getContent(responseData, " 123abc ", dataMap));

    // runs of one repeated symbol are cut to two characters (both ends of each ASCII symbol range)
    assertEquals("!!", documentHelper.getContent(responseData, "!!!", dataMap));
    assertEquals("//", documentHelper.getContent(responseData, "///", dataMap));
    assertEquals("::", documentHelper.getContent(responseData, ":::", dataMap));
    assertEquals("@@", documentHelper.getContent(responseData, "@@@", dataMap));
    assertEquals("[[", documentHelper.getContent(responseData, "[[[", dataMap));
    assertEquals("``", documentHelper.getContent(responseData, "```", dataMap));
    assertEquals("{{", documentHelper.getContent(responseData, "{{{", dataMap));
    assertEquals("~~", documentHelper.getContent(responseData, "~~~", dataMap));

    // a run of mixed symbols counts as one term: only the first two characters survive
    assertEquals("!\"", documentHelper.getContent(responseData, "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", dataMap));
}
public void test_getDigest() {
DocumentHelper documentHelper = new DocumentHelper();