fix #516 : add crawler.document.duplicate.term.removed
This commit is contained in:
parent
9677198214
commit
ef84011e80
4 changed files with 34 additions and 3 deletions
|
@ -36,8 +36,8 @@ public class DocumentHelper implements Serializable {
|
|||
|
||||
final int maxAlphanumTermSize = getMaxAlphanumTermSize();
|
||||
final int maxSymbolTermSize = getMaxSymbolTermSize();
|
||||
// TODO removeDuplication
|
||||
return TextUtil.normalizeText(content, content.length(), maxAlphanumTermSize, maxSymbolTermSize, false);
|
||||
final boolean duplicateTermRemoved = isDuplicateTermRemoved();
|
||||
return TextUtil.normalizeText(content, content.length(), maxAlphanumTermSize, maxSymbolTermSize, duplicateTermRemoved);
|
||||
}
|
||||
|
||||
protected int getMaxAlphanumTermSize() {
|
||||
|
@ -50,6 +50,11 @@ public class DocumentHelper implements Serializable {
|
|||
return fessConfig.getCrawlerDocumentMaxSymbolTermSizeAsInteger().intValue();
|
||||
}
|
||||
|
||||
protected boolean isDuplicateTermRemoved() {
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
return fessConfig.isCrawlerDocumentDuplicateTermRemoved();
|
||||
}
|
||||
|
||||
public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {
|
||||
if (content == null) {
|
||||
return StringUtil.EMPTY; // empty
|
||||
|
|
|
@ -725,7 +725,7 @@ public class FessLabels extends ActionMessages {
|
|||
/** The key of the message: {0} results */
|
||||
public static final String LABELS_searchoptions_num = "{labels.searchoptions_num}";
|
||||
|
||||
/** The key of the message: Preferred Languages */
|
||||
/** The key of the message: Languages */
|
||||
public static final String LABELS_searchoptions_menu_lang = "{labels.searchoptions_menu_lang}";
|
||||
|
||||
/** The key of the message: Labels */
|
||||
|
|
|
@ -137,6 +137,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. 10 */
|
||||
String CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE = "crawler.document.max.symbol.term.size";
|
||||
|
||||
/** The key of the configuration. e.g. false */
|
||||
String CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED = "crawler.document.duplicate.term.removed";
|
||||
|
||||
/** The key of the configuration. e.g. UTF-8 */
|
||||
String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
|
||||
|
||||
|
@ -1214,6 +1217,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
*/
|
||||
Integer getCrawlerDocumentMaxSymbolTermSizeAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.duplicate.term.removed'. <br>
|
||||
* The value is, e.g. false <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentDuplicateTermRemoved();
|
||||
|
||||
/**
|
||||
* Is the property for the key 'crawler.document.duplicate.term.removed' true? <br>
|
||||
* The value is, e.g. false <br>
|
||||
* @return The determination, true or false. (if not found, exception but basically no way)
|
||||
*/
|
||||
boolean isCrawlerDocumentDuplicateTermRemoved();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.crawling.data.encoding'. <br>
|
||||
* The value is, e.g. UTF-8 <br>
|
||||
|
@ -3782,6 +3799,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentDuplicateTermRemoved() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED);
|
||||
}
|
||||
|
||||
public boolean isCrawlerDocumentDuplicateTermRemoved() {
|
||||
return is(FessConfig.CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED);
|
||||
}
|
||||
|
||||
public String getCrawlerCrawlingDataEncoding() {
|
||||
return get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING);
|
||||
}
|
||||
|
|
|
@ -84,6 +84,7 @@ crawler.document.use.site.encoding.on.english=false
|
|||
crawler.document.append.data=true
|
||||
crawler.document.max.alphanum.term.size=20
|
||||
crawler.document.max.symbol.term.size=10
|
||||
crawler.document.duplicate.term.removed=false
|
||||
crawler.crawling.data.encoding=UTF-8
|
||||
crawler.web.protocols=http,https
|
||||
crawler.file.protocols=file,smb,ftp
|
||||
|
|
Loading…
Add table
Reference in a new issue