fix #437 : add crawler.document.max.alphanum.term.size

This commit is contained in:
Shinsuke Sugaya 2016-03-18 06:44:14 +09:00
parent 132fc16c19
commit 3825af907a
4 changed files with 93 additions and 3 deletions

View file

@ -21,6 +21,8 @@ import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -34,7 +36,44 @@ public class DocumentHelper implements Serializable {
if (content == null) {
return StringUtil.EMPTY; // empty
}
return content.replaceAll("[\u0000-\u0020\u007f\u3000]+", " ").trim();
final int maxAlphanumTermSize = getMaxAlphanumSize();
final StringBuilder buf = new StringBuilder(content.length());
boolean isSpace = false;
int alphanumSize = 0;
for (int i = 0; i < content.length(); i++) {
final char c = content.charAt(i);
if ((c >= '\u0000' && c <= '\u0020') || c == '\u007f' || c == '\u3000') {
// space
if (!isSpace) {
buf.append(' ');
isSpace = true;
}
alphanumSize = 0;
} else if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
// alphanum
if (maxAlphanumTermSize >= 0) {
if (alphanumSize < maxAlphanumTermSize) {
buf.append(c);
}
alphanumSize++;
} else {
buf.append(c);
}
isSpace = false;
} else {
buf.append(c);
isSpace = false;
alphanumSize = 0;
}
}
return buf.toString().trim();
}
/**
 * Reads the alphanumeric term size limit from the Fess configuration.
 * Declared protected so that a subclass can supply its own limit.
 *
 * @return the configured limit, unboxed to a primitive int
 */
protected int getMaxAlphanumSize() {
    final Integer configuredSize = ComponentUtil.getFessConfig().getCrawlerDocumentMaxAlphanumTermSizeAsInteger();
    return configuredSize.intValue();
}
public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {

View file

@ -127,6 +127,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** Property key 'crawler.document.append.data'. Example value: true */
String CRAWLER_DOCUMENT_APPEND_DATA = "crawler.document.append.data";
/** Property key 'crawler.document.max.alphanum.term.size'. Example value: -1 */
String CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE = "crawler.document.max.alphanum.term.size";
/** Property key 'crawler.crawling.data.encoding'. Example value: UTF-8 */
String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
@ -1035,6 +1038,21 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
*/
boolean isCrawlerDocumentAppendData();
/**
 * Get the raw string value for the key 'crawler.document.max.alphanum.term.size'. <br>
 * Example value: -1 <br>
 * @return The value of the property. (NotNull: a missing property results in an exception, which is not expected in practice)
 */
String getCrawlerDocumentMaxAlphanumTermSize();
/**
 * Get the value for the key 'crawler.document.max.alphanum.term.size' as {@link Integer}. <br>
 * Example value: -1 <br>
 * @return The value of the property. (NotNull: a missing property results in an exception, which is not expected in practice)
 * @throws NumberFormatException When the property value is not an integer.
 */
Integer getCrawlerDocumentMaxAlphanumTermSizeAsInteger();
/**
* Get the value for the key 'crawler.crawling.data.encoding'. <br>
* The value is, e.g. UTF-8 <br>
@ -3247,6 +3265,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return is(FessConfig.CRAWLER_DOCUMENT_APPEND_DATA);
}
/** Looks up the string value stored under the CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE key. */
public String getCrawlerDocumentMaxAlphanumTermSize() {
    final String rawValue = get(FessConfig.CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE);
    return rawValue;
}
/** Looks up the value stored under the CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE key via getAsInteger. */
public Integer getCrawlerDocumentMaxAlphanumTermSizeAsInteger() {
    final Integer parsedValue = getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE);
    return parsedValue;
}
/** Looks up the string value stored under the CRAWLER_CRAWLING_DATA_ENCODING key. */
public String getCrawlerCrawlingDataEncoding() {
    final String encoding = get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING);
    return encoding;
}

View file

@ -77,6 +77,7 @@ crawler.document.site.encoding=UTF-8
crawler.document.unknown.hostname=unknown
crawler.document.use.site.encoding.on.english=false
crawler.document.append.data=true
crawler.document.max.alphanum.term.size=-1
crawler.crawling.data.encoding=UTF-8
crawler.web.protocols=http,https
crawler.file.protocols=file,smb

View file

@ -22,15 +22,15 @@ import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.unit.UnitFessTestCase;
public class DocumentHelperTest extends UnitFessTestCase {
private DocumentHelper documentHelper;
/**
 * Runs the base-class setup first, then assigns a new DocumentHelper
 * to the documentHelper field.
 */
@Override
public void setUp() throws Exception {
    super.setUp();
    this.documentHelper = new DocumentHelper();
}
public void test_getContent() {
DocumentHelper documentHelper = new DocumentHelper();
ResponseData responseData = new ResponseData();
Map<String, Object> dataMap = new HashMap<>();
assertEquals("", documentHelper.getContent(responseData, null, dataMap));
@ -45,7 +45,31 @@ public class DocumentHelperTest extends UnitFessTestCase {
assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
}
/**
 * Checks getContent when the alphanumeric term size limit is 2:
 * whitespace-only input collapses to an empty string, and every run of
 * ASCII letters/digits is cut to its first two characters.
 */
public void test_getContent_maxAlphanum() {
    // Override the limit so the test does not depend on the configured value.
    final DocumentHelper documentHelper = new DocumentHelper() {
        @Override
        protected int getMaxAlphanumSize() {
            return 2;
        }
    };
    final ResponseData responseData = new ResponseData();
    final Map<String, Object> dataMap = new HashMap<>();

    // null, empty and whitespace-only content all yield an empty string
    assertEquals("", documentHelper.getContent(responseData, null, dataMap));
    assertEquals("", documentHelper.getContent(responseData, "", dataMap));
    assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
    assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
    assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
    assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
    assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));

    // each alphanumeric run is limited to two characters
    assertEquals("12 ab", documentHelper.getContent(responseData, " 123 abc ", dataMap));
    // only ASCII letters/digits count towards the limit, so the hiragana run is kept whole;
    // the result is trimmed and therefore never starts with a space
    assertEquals("12 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
    assertEquals("12 ab", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
    // digits followed directly by letters form a single run
    assertEquals("12", documentHelper.getContent(responseData, " 123abc ", dataMap));
}
public void test_getDigest() {
DocumentHelper documentHelper = new DocumentHelper();
ResponseData responseData = new ResponseData();
Map<String, Object> dataMap = new HashMap<>();
assertEquals("1234567...", documentHelper.getDigest(responseData, " 1234567890 1234567890 1234567890 ", dataMap, 10));