fix #516 : add crawler.document.duplicate.term.removed

This commit is contained in:
Shinsuke Sugaya 2016-05-27 06:38:27 +09:00
parent 9677198214
commit ef84011e80
4 changed files with 34 additions and 3 deletions

View file

@ -36,8 +36,8 @@ public class DocumentHelper implements Serializable {
final int maxAlphanumTermSize = getMaxAlphanumTermSize();
final int maxSymbolTermSize = getMaxSymbolTermSize();
// TODO removeDuplication
return TextUtil.normalizeText(content, content.length(), maxAlphanumTermSize, maxSymbolTermSize, false);
final boolean duplicateTermRemoved = isDuplicateTermRemoved();
return TextUtil.normalizeText(content, content.length(), maxAlphanumTermSize, maxSymbolTermSize, duplicateTermRemoved);
}
protected int getMaxAlphanumTermSize() {
@ -50,6 +50,11 @@ public class DocumentHelper implements Serializable {
return fessConfig.getCrawlerDocumentMaxSymbolTermSizeAsInteger().intValue();
}
/**
 * Tells whether duplicate-term removal is switched on for content normalization.
 * The answer is read from the Fess configuration each time this is called.
 *
 * @return the value reported by {@code FessConfig#isCrawlerDocumentDuplicateTermRemoved()}
 */
protected boolean isDuplicateTermRemoved() {
    return ComponentUtil.getFessConfig().isCrawlerDocumentDuplicateTermRemoved();
}
public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {
if (content == null) {
return StringUtil.EMPTY; // empty

View file

@ -725,7 +725,7 @@ public class FessLabels extends ActionMessages {
/** The key of the message: {0} results */
public static final String LABELS_searchoptions_num = "{labels.searchoptions_num}";
/** The key of the message: Preferred Languages */
/** The key of the message: Languages */
public static final String LABELS_searchoptions_menu_lang = "{labels.searchoptions_menu_lang}";
/** The key of the message: Labels */

View file

@ -137,6 +137,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. 10 */
String CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE = "crawler.document.max.symbol.term.size";
/** The key of the configuration. e.g. false */
String CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED = "crawler.document.duplicate.term.removed";
/** The key of the configuration. e.g. UTF-8 */
String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
@ -1214,6 +1217,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
*/
Integer getCrawlerDocumentMaxSymbolTermSizeAsInteger();
/**
* Get the value for the key 'crawler.document.duplicate.term.removed'. <br>
* The value is, e.g. false <br>
* This accessor returns the String form; {@link #isCrawlerDocumentDuplicateTermRemoved()} returns the boolean reading of the same key.
* @return The value of the found property. (NotNull: a missing key raises an exception, which basically should not happen)
*/
String getCrawlerDocumentDuplicateTermRemoved();
/**
* Is the property for the key 'crawler.document.duplicate.term.removed' true? <br>
* The value is, e.g. false <br>
* DocumentHelper reads this flag through its isDuplicateTermRemoved() method and passes it as the last argument of TextUtil.normalizeText(...).
* @return The determination, true or false. (a missing key raises an exception, which basically should not happen)
*/
boolean isCrawlerDocumentDuplicateTermRemoved();
/**
* Get the value for the key 'crawler.crawling.data.encoding'. <br>
* The value is, e.g. UTF-8 <br>
@ -3782,6 +3799,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE);
}
/**
* Looks up the String value stored under the key 'crawler.document.duplicate.term.removed'.
* @return The result of get(...) for that key.
*/
public String getCrawlerDocumentDuplicateTermRemoved() {
return get(FessConfig.CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED);
}
/**
* Evaluates the key 'crawler.document.duplicate.term.removed' as a boolean flag.
* @return The result of is(...) for that key.
*/
public boolean isCrawlerDocumentDuplicateTermRemoved() {
return is(FessConfig.CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED);
}
/**
* Looks up the String value stored under the key 'crawler.crawling.data.encoding' (e.g. UTF-8).
* @return The result of get(...) for that key.
*/
public String getCrawlerCrawlingDataEncoding() {
return get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING);
}

View file

@ -84,6 +84,7 @@ crawler.document.use.site.encoding.on.english=false
crawler.document.append.data=true
crawler.document.max.alphanum.term.size=20
crawler.document.max.symbol.term.size=10
crawler.document.duplicate.term.removed=false
crawler.crawling.data.encoding=UTF-8
crawler.web.protocols=http,https
crawler.file.protocols=file,smb,ftp