Fix #2105: add a maximum text length for language detection
This commit is contained in:
parent
4dc0898248
commit
3e8923fe91
3 changed files with 44 additions and 2 deletions
|
@ -37,11 +37,14 @@ public class LanguageHelper {
|
|||
|
||||
protected LanguageDetector detector;
|
||||
|
||||
protected int maxTextLength;
|
||||
|
||||
@PostConstruct
|
||||
public void init() {
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
langFields = fessConfig.getIndexerLanguageFieldsAsArray();
|
||||
supportedLanguages = fessConfig.getSupportedLanguagesAsArray();
|
||||
maxTextLength = fessConfig.getIndexerLanguageDetectLengthAsInteger().intValue();
|
||||
}
|
||||
|
||||
public void updateDocument(final Map<String, Object> doc) {
|
||||
|
@ -80,13 +83,24 @@ public class LanguageHelper {
|
|||
if (StringUtil.isBlank(text)) {
|
||||
return null;
|
||||
}
|
||||
final LanguageResult result = detector.detect(text);
|
||||
final String target = getDetectText(text);
|
||||
final LanguageResult result = detector.detect(target);
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("detected lang:{}({}) from {}", result, result.getRawScore(), text);
|
||||
logger.debug("detected lang:{}({}) from {}", result, result.getRawScore(), target);
|
||||
}
|
||||
return getSupportedLanguage(result.getLanguage());
|
||||
}
|
||||
|
||||
protected String getDetectText(final String text) {
|
||||
final String result;
|
||||
if (text.length() <= maxTextLength) {
|
||||
result = text;
|
||||
} else {
|
||||
result = text.substring(0, maxTextLength);
|
||||
}
|
||||
return result.replaceAll("\\s+", " ");
|
||||
}
|
||||
|
||||
protected String getSupportedLanguage(final String lang) {
|
||||
if (StringUtil.isBlank(lang)) {
|
||||
return null;
|
||||
|
|
|
@ -408,6 +408,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. content,important_content,title */
|
||||
String INDEXER_LANGUAGE_FIELDS = "indexer.language.fields";
|
||||
|
||||
/** The key of the configuration. e.g. 1000 */
|
||||
String INDEXER_LANGUAGE_DETECT_LENGTH = "indexer.language.detect.length";
|
||||
|
||||
/** The key of the configuration. e.g. default */
|
||||
String INDEX_CODEC = "index.codec";
|
||||
|
||||
|
@ -2624,6 +2627,21 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
*/
|
||||
String getIndexerLanguageFields();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'indexer.language.detect.length'. <br>
|
||||
* The value is, e.g. 1000 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getIndexerLanguageDetectLength();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'indexer.language.detect.length' as {@link Integer}. <br>
|
||||
* The value is, e.g. 1000 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getIndexerLanguageDetectLengthAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'index.codec'. <br>
|
||||
* The value is, e.g. default <br>
|
||||
|
@ -6546,6 +6564,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return get(FessConfig.INDEXER_LANGUAGE_FIELDS);
|
||||
}
|
||||
|
||||
public String getIndexerLanguageDetectLength() {
|
||||
return get(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH);
|
||||
}
|
||||
|
||||
public Integer getIndexerLanguageDetectLengthAsInteger() {
|
||||
return getAsInteger(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH);
|
||||
}
|
||||
|
||||
public String getIndexCodec() {
|
||||
return get(FessConfig.INDEX_CODEC);
|
||||
}
|
||||
|
@ -8439,6 +8465,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_CACHE_SIZE, "5");
|
||||
defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE, "10485760");
|
||||
defaultMap.put(FessConfig.INDEXER_LANGUAGE_FIELDS, "content,important_content,title");
|
||||
defaultMap.put(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH, "1000");
|
||||
defaultMap.put(FessConfig.INDEX_CODEC, "default");
|
||||
defaultMap.put(FessConfig.INDEX_number_of_shards, "5");
|
||||
defaultMap.put(FessConfig.INDEX_auto_expand_replicas, "0-1");
|
||||
|
|
|
@ -239,6 +239,7 @@ indexer.webfs.max.document.request.size=10485760
|
|||
indexer.data.max.document.cache.size=5
|
||||
indexer.data.max.document.request.size=10485760
|
||||
indexer.language.fields=content,important_content,title
|
||||
indexer.language.detect.length=1000
|
||||
|
||||
# index setting
|
||||
index.codec=default
|
||||
|
|
Loading…
Add table
Reference in a new issue