fix #437 : add crawler.document.max.alphanum.term.size
This commit is contained in:
parent
132fc16c19
commit
3825af907a
4 changed files with 93 additions and 3 deletions
|
@ -21,6 +21,8 @@ import java.util.Map;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.fess.crawler.entity.ResponseData;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
|
@ -34,7 +36,44 @@ public class DocumentHelper implements Serializable {
|
|||
if (content == null) {
|
||||
return StringUtil.EMPTY; // empty
|
||||
}
|
||||
return content.replaceAll("[\u0000-\u0020\u007f\u3000]+", " ").trim();
|
||||
|
||||
final int maxAlphanumTermSize = getMaxAlphanumSize();
|
||||
final StringBuilder buf = new StringBuilder(content.length());
|
||||
boolean isSpace = false;
|
||||
int alphanumSize = 0;
|
||||
for (int i = 0; i < content.length(); i++) {
|
||||
final char c = content.charAt(i);
|
||||
if ((c >= '\u0000' && c <= '\u0020') || c == '\u007f' || c == '\u3000') {
|
||||
// space
|
||||
if (!isSpace) {
|
||||
buf.append(' ');
|
||||
isSpace = true;
|
||||
}
|
||||
alphanumSize = 0;
|
||||
} else if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
|
||||
// alphanum
|
||||
if (maxAlphanumTermSize >= 0) {
|
||||
if (alphanumSize < maxAlphanumTermSize) {
|
||||
buf.append(c);
|
||||
}
|
||||
alphanumSize++;
|
||||
} else {
|
||||
buf.append(c);
|
||||
}
|
||||
isSpace = false;
|
||||
} else {
|
||||
buf.append(c);
|
||||
isSpace = false;
|
||||
alphanumSize = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return buf.toString().trim();
|
||||
}
|
||||
|
||||
protected int getMaxAlphanumSize() {
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
return fessConfig.getCrawlerDocumentMaxAlphanumTermSizeAsInteger().intValue();
|
||||
}
|
||||
|
||||
public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {
|
||||
|
|
|
@ -127,6 +127,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. true */
|
||||
String CRAWLER_DOCUMENT_APPEND_DATA = "crawler.document.append.data";
|
||||
|
||||
/** The key of the configuration. e.g. -1 */
|
||||
String CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE = "crawler.document.max.alphanum.term.size";
|
||||
|
||||
/** The key of the configuration. e.g. UTF-8 */
|
||||
String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
|
||||
|
||||
|
@ -1035,6 +1038,21 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
*/
|
||||
boolean isCrawlerDocumentAppendData();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.max.alphanum.term.size'. <br>
|
||||
* The value is, e.g. -1 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentMaxAlphanumTermSize();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.max.alphanum.term.size' as {@link Integer}. <br>
|
||||
* The value is, e.g. -1 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getCrawlerDocumentMaxAlphanumTermSizeAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.crawling.data.encoding'. <br>
|
||||
* The value is, e.g. UTF-8 <br>
|
||||
|
@ -3247,6 +3265,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return is(FessConfig.CRAWLER_DOCUMENT_APPEND_DATA);
|
||||
}
|
||||
|
||||
/**
 * Looks up the string value stored under the key
 * 'crawler.document.max.alphanum.term.size' by delegating to get(...).
 */
public String getCrawlerDocumentMaxAlphanumTermSize() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE);
|
||||
}
|
||||
|
||||
/**
 * Looks up the value stored under the key
 * 'crawler.document.max.alphanum.term.size' as an Integer by delegating to
 * getAsInteger(...). The interface declaration documents a
 * NumberFormatException when the property is not an integer.
 */
public Integer getCrawlerDocumentMaxAlphanumTermSizeAsInteger() {
|
||||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE);
|
||||
}
|
||||
|
||||
/**
 * Looks up the string value stored under the key
 * 'crawler.crawling.data.encoding' by delegating to get(...).
 */
public String getCrawlerCrawlingDataEncoding() {
|
||||
return get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING);
|
||||
}
|
||||
|
|
|
@ -77,6 +77,7 @@ crawler.document.site.encoding=UTF-8
|
|||
crawler.document.unknown.hostname=unknown
|
||||
crawler.document.use.site.encoding.on.english=false
|
||||
crawler.document.append.data=true
|
||||
crawler.document.max.alphanum.term.size=-1
|
||||
crawler.crawling.data.encoding=UTF-8
|
||||
crawler.web.protocols=http,https
|
||||
crawler.file.protocols=file,smb
|
||||
|
|
|
@ -22,15 +22,15 @@ import org.codelibs.fess.crawler.entity.ResponseData;
|
|||
import org.codelibs.fess.unit.UnitFessTestCase;
|
||||
|
||||
public class DocumentHelperTest extends UnitFessTestCase {
|
||||
private DocumentHelper documentHelper;
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
documentHelper = new DocumentHelper();
|
||||
}
|
||||
|
||||
public void test_getContent() {
|
||||
DocumentHelper documentHelper = new DocumentHelper();
|
||||
|
||||
ResponseData responseData = new ResponseData();
|
||||
Map<String, Object> dataMap = new HashMap<>();
|
||||
assertEquals("", documentHelper.getContent(responseData, null, dataMap));
|
||||
|
@ -45,7 +45,31 @@ public class DocumentHelperTest extends UnitFessTestCase {
|
|||
assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
|
||||
}
|
||||
|
||||
public void test_getContent_maxAlphanum() {
|
||||
DocumentHelper documentHelper = new DocumentHelper() {
|
||||
protected int getMaxAlphanumSize() {
|
||||
return 2;
|
||||
}
|
||||
};
|
||||
|
||||
ResponseData responseData = new ResponseData();
|
||||
Map<String, Object> dataMap = new HashMap<>();
|
||||
assertEquals("", documentHelper.getContent(responseData, null, dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
|
||||
assertEquals("12 ab", documentHelper.getContent(responseData, " 123 abc ", dataMap));
|
||||
assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
|
||||
assertEquals("12 ab", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
|
||||
assertEquals("12", documentHelper.getContent(responseData, " 123abc ", dataMap));
|
||||
}
|
||||
|
||||
public void test_getDigest() {
|
||||
DocumentHelper documentHelper = new DocumentHelper();
|
||||
|
||||
ResponseData responseData = new ResponseData();
|
||||
Map<String, Object> dataMap = new HashMap<>();
|
||||
assertEquals("1234567...", documentHelper.getDigest(responseData, " 1234567890 1234567890 1234567890 ", dataMap, 10));
|
||||
|
|
Loading…
Add table
Reference in a new issue