fix #2105 add max text length to detect language

This commit is contained in:
Shinsuke Sugaya 2019-04-30 22:45:29 +09:00
parent 4dc0898248
commit 3e8923fe91
3 changed files with 44 additions and 2 deletions

View file

@ -37,11 +37,14 @@ public class LanguageHelper {
/** Detector whose {@code detect} method is called on the text to identify its language. */
protected LanguageDetector detector;
/**
 * Maximum number of leading characters of a text that are passed to the detector.
 * Set in {@code init()} from {@code FessConfig#getIndexerLanguageDetectLengthAsInteger()}.
 */
protected int maxTextLength;
/**
 * Reads the language-related indexer settings from the Fess configuration:
 * the document fields to inspect, the supported languages, and the maximum
 * text length used for language detection.
 */
@PostConstruct
public void init() {
    final FessConfig config = ComponentUtil.getFessConfig();
    langFields = config.getIndexerLanguageFieldsAsArray();
    supportedLanguages = config.getSupportedLanguagesAsArray();
    // Auto-unboxing of the Integer setting; equivalent to calling intValue().
    final int detectLength = config.getIndexerLanguageDetectLengthAsInteger();
    maxTextLength = detectLength;
}
public void updateDocument(final Map<String, Object> doc) {
@ -80,13 +83,24 @@ public class LanguageHelper {
if (StringUtil.isBlank(text)) {
return null;
}
final LanguageResult result = detector.detect(text);
final String target = getDetectText(text);
final LanguageResult result = detector.detect(target);
if (logger.isDebugEnabled()) {
logger.debug("detected lang:{}({}) from {}", result, result.getRawScore(), text);
logger.debug("detected lang:{}({}) from {}", result, result.getRawScore(), target);
}
return getSupportedLanguage(result.getLanguage());
}
/**
 * Builds the text that is handed to the language detector.
 * <p>
 * At most the first {@code maxTextLength} UTF-16 code units of {@code text} are kept,
 * and every run of whitespace in the kept part is collapsed into a single space.
 * If the cut would fall between the two halves of a surrogate pair, the cut is moved
 * back by one code unit so that the result never ends with an unpaired high surrogate.
 *
 * @param text the text whose language is to be detected; must not be {@code null}
 * @return the truncated, whitespace-normalized text
 */
protected String getDetectText(final String text) {
    if (text.length() <= maxTextLength) {
        return text.replaceAll("\\s+", " ");
    }
    int end = maxTextLength;
    // Here text.length() > end, so charAt(end) is valid whenever end >= 0.
    // The end > 0 guard keeps charAt(end - 1) in range; a negative limit still
    // fails in substring() below, exactly as it did before.
    if (end > 0 && Character.isHighSurrogate(text.charAt(end - 1)) && Character.isLowSurrogate(text.charAt(end))) {
        end--;
    }
    return text.substring(0, end).replaceAll("\\s+", " ");
}
protected String getSupportedLanguage(final String lang) {
if (StringUtil.isBlank(lang)) {
return null;

View file

@ -408,6 +408,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. content,important_content,title */
String INDEXER_LANGUAGE_FIELDS = "indexer.language.fields";
/**
 * The key of the configuration. e.g. 1000 <br>
 * LanguageHelper uses this value as the maximum number of leading characters of a text
 * that are passed to the language detector.
 */
String INDEXER_LANGUAGE_DETECT_LENGTH = "indexer.language.detect.length";
/** The key of the configuration. e.g. default */
String INDEX_CODEC = "index.codec";
@ -2624,6 +2627,21 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
*/
String getIndexerLanguageFields();
/**
 * Get the value for the key 'indexer.language.detect.length'. <br>
 * The value is, e.g. 1000 <br>
 * LanguageHelper uses this setting as the maximum number of leading characters of a text
 * that are passed to the language detector.
 * @return The value of found property. (NotNull: if not found, exception but basically no way)
 */
String getIndexerLanguageDetectLength();
/**
 * Get the value for the key 'indexer.language.detect.length' as {@link Integer}. <br>
 * The value is, e.g. 1000 <br>
 * LanguageHelper reads this value in its init() method and uses it as the maximum number
 * of leading characters of a text that are passed to the language detector.
 * @return The value of found property. (NotNull: if not found, exception but basically no way)
 * @throws NumberFormatException When the property is not integer.
 */
Integer getIndexerLanguageDetectLengthAsInteger();
/**
* Get the value for the key 'index.codec'. <br>
* The value is, e.g. default <br>
@ -6546,6 +6564,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return get(FessConfig.INDEXER_LANGUAGE_FIELDS);
}
/** Looks up the raw string value stored under the 'indexer.language.detect.length' key. */
public String getIndexerLanguageDetectLength() {
    final String detectLength = get(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH);
    return detectLength;
}
/** Looks up the value stored under the 'indexer.language.detect.length' key as an Integer. */
public Integer getIndexerLanguageDetectLengthAsInteger() {
    final Integer detectLength = getAsInteger(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH);
    return detectLength;
}
/** Looks up the raw string value stored under the 'index.codec' key. */
public String getIndexCodec() {
    final String codec = get(FessConfig.INDEX_CODEC);
    return codec;
}
@ -8439,6 +8465,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_CACHE_SIZE, "5");
defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE, "10485760");
defaultMap.put(FessConfig.INDEXER_LANGUAGE_FIELDS, "content,important_content,title");
defaultMap.put(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH, "1000");
defaultMap.put(FessConfig.INDEX_CODEC, "default");
defaultMap.put(FessConfig.INDEX_number_of_shards, "5");
defaultMap.put(FessConfig.INDEX_auto_expand_replicas, "0-1");

View file

@ -239,6 +239,7 @@ indexer.webfs.max.document.request.size=10485760
indexer.data.max.document.cache.size=5
indexer.data.max.document.request.size=10485760
indexer.language.fields=content,important_content,title
indexer.language.detect.length=1000
# index setting
index.codec=default