diff --git a/src/main/java/org/codelibs/fess/helper/LanguageHelper.java b/src/main/java/org/codelibs/fess/helper/LanguageHelper.java
index 7d85559a8..3232de7b2 100644
--- a/src/main/java/org/codelibs/fess/helper/LanguageHelper.java
+++ b/src/main/java/org/codelibs/fess/helper/LanguageHelper.java
@@ -37,11 +37,15 @@ public class LanguageHelper {
 
     protected LanguageDetector detector;
 
+    /** Maximum number of leading characters passed to the language detector; a negative value disables truncation. */
+    protected int maxTextLength;
+
     @PostConstruct
     public void init() {
         final FessConfig fessConfig = ComponentUtil.getFessConfig();
         langFields = fessConfig.getIndexerLanguageFieldsAsArray();
         supportedLanguages = fessConfig.getSupportedLanguagesAsArray();
+        maxTextLength = fessConfig.getIndexerLanguageDetectLengthAsInteger().intValue();
     }
 
     public void updateDocument(final Map doc) {
@@ -80,13 +84,37 @@ public class LanguageHelper {
         if (StringUtil.isBlank(text)) {
             return null;
         }
-        final LanguageResult result = detector.detect(text);
+        final String target = getDetectText(text);
+        final LanguageResult result = detector.detect(target);
         if (logger.isDebugEnabled()) {
-            logger.debug("detected lang:{}({}) from {}", result, result.getRawScore(), text);
+            logger.debug("detected lang:{}({}) from {}", result, result.getRawScore(), target);
         }
         return getSupportedLanguage(result.getLanguage());
     }
 
+    /**
+     * Builds the text handed to the language detector: the leading
+     * {@code maxTextLength} characters of {@code text} (all of it when the limit is
+     * negative), with every run of whitespace collapsed to a single space.
+     *
+     * @param text the text to detect a language from (must not be null)
+     * @return the truncated, whitespace-normalized text
+     */
+    protected String getDetectText(final String text) {
+        // substring() rejects a negative end index, so a negative limit means "no limit".
+        if (maxTextLength < 0 || text.length() <= maxTextLength) {
+            return text.replaceAll("\\s+", " ");
+        }
+        int end = maxTextLength;
+        // Never cut between the two halves of a surrogate pair: a dangling high
+        // surrogate is not a valid character.
+        if (end > 0 && Character.isHighSurrogate(text.charAt(end - 1))
+                && Character.isLowSurrogate(text.charAt(end))) {
+            end--;
+        }
+        return text.substring(0, end).replaceAll("\\s+", " ");
+    }
+
     protected String getSupportedLanguage(final String lang) {
         if (StringUtil.isBlank(lang)) {
             return null;
diff --git a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java
index 17d5526a8..a31de06af 100644
--- a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java
+++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java
@@ -408,6 +408,9 @@ public interface FessConfig
extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. content,important_content,title */
     String INDEXER_LANGUAGE_FIELDS = "indexer.language.fields";
 
+    /** The key of the configuration. e.g. 1000 */
+    String INDEXER_LANGUAGE_DETECT_LENGTH = "indexer.language.detect.length";
+
     /** The key of the configuration. e.g. default */
     String INDEX_CODEC = "index.codec";
 
@@ -2624,6 +2627,21 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
     String getIndexerLanguageFields();
 
+    /**
+     * Get the value for the key 'indexer.language.detect.length'.
+     * The value is, e.g. 1000
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getIndexerLanguageDetectLength();
+
+    /**
+     * Get the value for the key 'indexer.language.detect.length' as {@link Integer}.
+     * The value is, e.g. 1000
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not an integer.
+     */
+    Integer getIndexerLanguageDetectLengthAsInteger();
+
     /**
      * Get the value for the key 'index.codec'.
      * The value is, e.g. default
@@ -6546,6 +6564,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return get(FessConfig.INDEXER_LANGUAGE_FIELDS);
         }
 
+        public String getIndexerLanguageDetectLength() {
+            return get(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH);
+        }
+
+        public Integer getIndexerLanguageDetectLengthAsInteger() {
+            return getAsInteger(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH);
+        }
+
         public String getIndexCodec() {
             return get(FessConfig.INDEX_CODEC);
         }
@@ -8439,6 +8465,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_CACHE_SIZE, "5");
             defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE, "10485760");
             defaultMap.put(FessConfig.INDEXER_LANGUAGE_FIELDS, "content,important_content,title");
+            defaultMap.put(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH, "1000");
             defaultMap.put(FessConfig.INDEX_CODEC, "default");
             defaultMap.put(FessConfig.INDEX_number_of_shards, "5");
             defaultMap.put(FessConfig.INDEX_auto_expand_replicas, "0-1");
diff --git a/src/main/resources/fess_config.properties b/src/main/resources/fess_config.properties
index db470aa9a..5bea5fd1b 100644
--- a/src/main/resources/fess_config.properties
+++ b/src/main/resources/fess_config.properties
@@ -239,6 +239,7 @@ indexer.webfs.max.document.request.size=10485760
 indexer.data.max.document.cache.size=5
 indexer.data.max.document.request.size=10485760
 indexer.language.fields=content,important_content,title
+indexer.language.detect.length=1000
 
 # index setting
 index.codec=default