Просмотр исходного кода

fix #2105 add max text length to detect language

Shinsuke Sugaya 6 лет назад
Родитель
Коммит
3e8923fe91

+ 16 - 2
src/main/java/org/codelibs/fess/helper/LanguageHelper.java

@@ -37,11 +37,14 @@ public class LanguageHelper {
 
     protected LanguageDetector detector;
 
+    protected int maxTextLength;
+
     @PostConstruct
     public void init() {
         final FessConfig fessConfig = ComponentUtil.getFessConfig();
         langFields = fessConfig.getIndexerLanguageFieldsAsArray();
         supportedLanguages = fessConfig.getSupportedLanguagesAsArray();
+        maxTextLength = fessConfig.getIndexerLanguageDetectLengthAsInteger().intValue();
     }
 
     public void updateDocument(final Map<String, Object> doc) {
@@ -80,13 +83,24 @@ public class LanguageHelper {
         if (StringUtil.isBlank(text)) {
             return null;
         }
-        final LanguageResult result = detector.detect(text);
+        final String target = getDetectText(text);
+        final LanguageResult result = detector.detect(target);
         if (logger.isDebugEnabled()) {
-            logger.debug("detected lang:{}({}) from {}", result, result.getRawScore(), text);
+            logger.debug("detected lang:{}({}) from {}", result, result.getRawScore(), target);
         }
         return getSupportedLanguage(result.getLanguage());
     }
 
+    protected String getDetectText(final String text) {
+        final String result;
+        if (text.length() <= maxTextLength) {
+            result = text;
+        } else {
+            result = text.substring(0, maxTextLength);
+        }
+        return result.replaceAll("\\s+", " ");
+    }
+
     protected String getSupportedLanguage(final String lang) {
         if (StringUtil.isBlank(lang)) {
             return null;

+ 27 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -408,6 +408,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. content,important_content,title */
     String INDEXER_LANGUAGE_FIELDS = "indexer.language.fields";
 
+    /** The key of the configuration. e.g. 1000 */
+    String INDEXER_LANGUAGE_DETECT_LENGTH = "indexer.language.detect.length";
+
     /** The key of the configuration. e.g. default */
     String INDEX_CODEC = "index.codec";
 
@@ -2624,6 +2627,21 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
     String getIndexerLanguageFields();
 
+    /**
+     * Get the value for the key 'indexer.language.detect.length'. <br>
+     * The value is, e.g. 1000 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getIndexerLanguageDetectLength();
+
+    /**
+     * Get the value for the key 'indexer.language.detect.length' as {@link Integer}. <br>
+     * The value is, e.g. 1000 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getIndexerLanguageDetectLengthAsInteger();
+
     /**
      * Get the value for the key 'index.codec'. <br>
      * The value is, e.g. default <br>
@@ -6546,6 +6564,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return get(FessConfig.INDEXER_LANGUAGE_FIELDS);
         }
 
+        public String getIndexerLanguageDetectLength() {
+            return get(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH);
+        }
+
+        public Integer getIndexerLanguageDetectLengthAsInteger() {
+            return getAsInteger(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH);
+        }
+
         public String getIndexCodec() {
             return get(FessConfig.INDEX_CODEC);
         }
@@ -8439,6 +8465,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_CACHE_SIZE, "5");
             defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE, "10485760");
             defaultMap.put(FessConfig.INDEXER_LANGUAGE_FIELDS, "content,important_content,title");
+            defaultMap.put(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH, "1000");
             defaultMap.put(FessConfig.INDEX_CODEC, "default");
             defaultMap.put(FessConfig.INDEX_number_of_shards, "5");
             defaultMap.put(FessConfig.INDEX_auto_expand_replicas, "0-1");

+ 1 - 0
src/main/resources/fess_config.properties

@@ -239,6 +239,7 @@ indexer.webfs.max.document.request.size=10485760
 indexer.data.max.document.cache.size=5
 indexer.data.max.document.request.size=10485760
 indexer.language.fields=content,important_content,title
+indexer.language.detect.length=1000
 
 # index setting
 index.codec=default