fix #442 : add crawler.document.max.symbol.term.size
This commit is contained in:
parent
8d7085187c
commit
ca3d0e8e82
5 changed files with 91 additions and 8 deletions
|
@ -38,10 +38,12 @@ public class DocumentHelper implements Serializable {
|
|||
return StringUtil.EMPTY; // empty
|
||||
}
|
||||
|
||||
final int maxAlphanumTermSize = getMaxAlphanumSize();
|
||||
final int maxAlphanumTermSize = getMaxAlphanumTermSize();
|
||||
final int maxSymbolTermSize = getMaxSymbolTermSize();
|
||||
final UnsafeStringBuilder buf = new UnsafeStringBuilder(content.length());
|
||||
boolean isSpace = false;
|
||||
int alphanumSize = 0;
|
||||
int symbolSize = 0;
|
||||
for (int i = 0; i < content.length(); i++) {
|
||||
final char c = content.charAt(i);
|
||||
if (Character.isISOControl(c) || c == '\u0020' || c == '\u3000' || c == 65533) {
|
||||
|
@ -51,6 +53,7 @@ public class DocumentHelper implements Serializable {
|
|||
isSpace = true;
|
||||
}
|
||||
alphanumSize = 0;
|
||||
symbolSize = 0;
|
||||
} else if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
|
||||
// alphanum
|
||||
if (maxAlphanumTermSize >= 0) {
|
||||
|
@ -62,21 +65,40 @@ public class DocumentHelper implements Serializable {
|
|||
buf.append(c);
|
||||
}
|
||||
isSpace = false;
|
||||
symbolSize = 0;
|
||||
} else if ((c >= '!' && c <= '/') || (c >= ':' && c <= '@') || (c >= '[' && c <= '`') || (c >= '{' && c <= '~')) {
|
||||
// symbol
|
||||
if (maxSymbolTermSize >= 0) {
|
||||
if (symbolSize < maxSymbolTermSize) {
|
||||
buf.append(c);
|
||||
}
|
||||
symbolSize++;
|
||||
} else {
|
||||
buf.append(c);
|
||||
}
|
||||
isSpace = false;
|
||||
alphanumSize = 0;
|
||||
} else {
|
||||
buf.append(c);
|
||||
isSpace = false;
|
||||
alphanumSize = 0;
|
||||
symbolSize = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return buf.toUnsafeString().trim();
|
||||
}
|
||||
|
||||
protected int getMaxAlphanumSize() {
|
||||
protected int getMaxAlphanumTermSize() {
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
return fessConfig.getCrawlerDocumentMaxAlphanumTermSizeAsInteger().intValue();
|
||||
}
|
||||
|
||||
protected int getMaxSymbolTermSize() {
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
return fessConfig.getCrawlerDocumentMaxSymbolTermSizeAsInteger().intValue();
|
||||
}
|
||||
|
||||
public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {
|
||||
if (content == null) {
|
||||
return StringUtil.EMPTY; // empty
|
||||
|
|
|
@ -42,7 +42,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
|
||||
/** The key of the configuration. e.g. -Djava.awt.headless=true
|
||||
-server
|
||||
-Xmx256m
|
||||
-Xmx512m
|
||||
-XX:MaxMetaspaceSize=128m
|
||||
-XX:CompressedClassSpaceSize=32m
|
||||
-XX:-UseGCOverheadLimit
|
||||
|
@ -127,9 +127,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. true */
|
||||
String CRAWLER_DOCUMENT_APPEND_DATA = "crawler.document.append.data";
|
||||
|
||||
/** The key of the configuration. e.g. -1 */
|
||||
/** The key of the configuration. e.g. 20 */
|
||||
String CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE = "crawler.document.max.alphanum.term.size";
|
||||
|
||||
/** The key of the configuration. e.g. 10 */
|
||||
String CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE = "crawler.document.max.symbol.term.size";
|
||||
|
||||
/** The key of the configuration. e.g. UTF-8 */
|
||||
String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
|
||||
|
||||
|
@ -833,7 +836,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
* Get the value for the key 'jvm.crawler.options'. <br>
|
||||
* The value is, e.g. -Djava.awt.headless=true
|
||||
-server
|
||||
-Xmx256m
|
||||
-Xmx512m
|
||||
-XX:MaxMetaspaceSize=128m
|
||||
-XX:CompressedClassSpaceSize=32m
|
||||
-XX:-UseGCOverheadLimit
|
||||
|
@ -1043,19 +1046,34 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.max.alphanum.term.size'. <br>
|
||||
* The value is, e.g. -1 <br>
|
||||
* The value is, e.g. 20 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentMaxAlphanumTermSize();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.max.alphanum.term.size' as {@link Integer}. <br>
|
||||
* The value is, e.g. -1 <br>
|
||||
* The value is, e.g. 20 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getCrawlerDocumentMaxAlphanumTermSizeAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.max.symbol.term.size'. <br>
|
||||
* The value is, e.g. 10 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentMaxSymbolTermSize();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.max.symbol.term.size' as {@link Integer}. <br>
|
||||
* The value is, e.g. 10 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getCrawlerDocumentMaxSymbolTermSizeAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.crawling.data.encoding'. <br>
|
||||
* The value is, e.g. UTF-8 <br>
|
||||
|
@ -3291,6 +3309,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentMaxSymbolTermSize() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE);
|
||||
}
|
||||
|
||||
public Integer getCrawlerDocumentMaxSymbolTermSizeAsInteger() {
|
||||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE);
|
||||
}
|
||||
|
||||
public String getCrawlerCrawlingDataEncoding() {
|
||||
return get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING);
|
||||
}
|
||||
|
|
|
@ -8,5 +8,7 @@
|
|||
<property name="maxCompressionRatio">2</property>
|
||||
<property name="maxUncompressionSize">10000000</property>
|
||||
<property name="maxAlphanumTermSize">20</property>
|
||||
<property name="maxSymbolTermSize">10</property>
|
||||
<property name="readAsTextIfFailed">false</property>
|
||||
</component>
|
||||
</components>
|
||||
|
|
|
@ -78,6 +78,7 @@ crawler.document.unknown.hostname=unknown
|
|||
crawler.document.use.site.encoding.on.english=false
|
||||
crawler.document.append.data=true
|
||||
crawler.document.max.alphanum.term.size=20
|
||||
crawler.document.max.symbol.term.size=10
|
||||
crawler.crawling.data.encoding=UTF-8
|
||||
crawler.web.protocols=http,https
|
||||
crawler.file.protocols=file,smb
|
||||
|
|
|
@ -47,7 +47,7 @@ public class DocumentHelperTest extends UnitFessTestCase {
|
|||
|
||||
public void test_getContent_maxAlphanum() {
|
||||
DocumentHelper documentHelper = new DocumentHelper() {
|
||||
protected int getMaxAlphanumSize() {
|
||||
protected int getMaxAlphanumTermSize() {
|
||||
return 2;
|
||||
}
|
||||
};
|
||||
|
@ -67,6 +67,38 @@ public class DocumentHelperTest extends UnitFessTestCase {
|
|||
assertEquals("12", documentHelper.getContent(responseData, " 123abc ", dataMap));
|
||||
}
|
||||
|
||||
public void test_getContent_maxSymbol() {
|
||||
DocumentHelper documentHelper = new DocumentHelper() {
|
||||
protected int getMaxSymbolTermSize() {
|
||||
return 2;
|
||||
}
|
||||
};
|
||||
|
||||
ResponseData responseData = new ResponseData();
|
||||
Map<String, Object> dataMap = new HashMap<>();
|
||||
assertEquals("", documentHelper.getContent(responseData, null, dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
|
||||
assertEquals("123 abc", documentHelper.getContent(responseData, " 123 abc ", dataMap));
|
||||
assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
|
||||
assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
|
||||
assertEquals("123abc", documentHelper.getContent(responseData, " 123abc ", dataMap));
|
||||
|
||||
assertEquals("!!", documentHelper.getContent(responseData, "!!!", dataMap));
|
||||
assertEquals("//", documentHelper.getContent(responseData, "///", dataMap));
|
||||
assertEquals("::", documentHelper.getContent(responseData, ":::", dataMap));
|
||||
assertEquals("@@", documentHelper.getContent(responseData, "@@@", dataMap));
|
||||
assertEquals("[[", documentHelper.getContent(responseData, "[[[", dataMap));
|
||||
assertEquals("``", documentHelper.getContent(responseData, "```", dataMap));
|
||||
assertEquals("{{", documentHelper.getContent(responseData, "{{{", dataMap));
|
||||
assertEquals("~~", documentHelper.getContent(responseData, "~~~", dataMap));
|
||||
assertEquals("!\"", documentHelper.getContent(responseData, "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", dataMap));
|
||||
}
|
||||
|
||||
public void test_getDigest() {
|
||||
DocumentHelper documentHelper = new DocumentHelper();
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue