Shinsuke Sugaya пре 9 година
родитељ
комит
0a0e253cc1

+ 22 - 4
src/main/java/org/codelibs/fess/helper/DocumentHelper.java

@@ -15,6 +15,9 @@
  */
 package org.codelibs.fess.helper;
 
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
 import java.util.Map;
 
 import org.apache.commons.lang3.StringUtils;
@@ -25,7 +28,6 @@ import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 
 public class DocumentHelper {
-
     public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
         if (content == null) {
             return StringUtil.EMPTY; // empty
@@ -34,7 +36,13 @@ public class DocumentHelper {
         final int maxAlphanumTermSize = getMaxAlphanumTermSize();
         final int maxSymbolTermSize = getMaxSymbolTermSize();
         final boolean duplicateTermRemoved = isDuplicateTermRemoved();
-        return TextUtil.normalizeText(content, content.length(), maxAlphanumTermSize, maxSymbolTermSize, duplicateTermRemoved);
+        final int[] spaceChars = getSpaceChars();
+        try (final Reader reader = new StringReader(content)) {
+            return TextUtil.normalizeText(reader).initialCapacity(content.length()).maxAlphanumTermSize(maxAlphanumTermSize)
+                    .maxSymbolTermSize(maxSymbolTermSize).duplicateTermRemoved(duplicateTermRemoved).spaceChars(spaceChars).execute();
+        } catch (final IOException e) {
+            return StringUtil.EMPTY; // empty
+        }
     }
 
     protected int getMaxAlphanumTermSize() {
@@ -52,6 +60,11 @@ public class DocumentHelper {
         return fessConfig.isCrawlerDocumentDuplicateTermRemoved();
     }
 
+    /**
+     * Returns the space characters handed to the text normalizer by this helper.
+     *
+     * @return the array obtained from {@code FessConfig#getCrawlerDocumentSpaceCharsAsArray()}
+     */
+    protected int[] getSpaceChars() {
+        return ComponentUtil.getFessConfig().getCrawlerDocumentSpaceCharsAsArray();
+    }
+
     public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {
         if (content == null) {
             return StringUtil.EMPTY; // empty
@@ -64,7 +77,12 @@ public class DocumentHelper {
             subContent = content.substring(0, maxWidth * 2);
         }
 
-        final String originalStr = TextUtil.normalizeText(subContent, subContent.length(), -1, -1, false);
-        return StringUtils.abbreviate(originalStr, maxWidth);
+        final int[] spaceChars = getSpaceChars();
+        try (final Reader reader = new StringReader(subContent)) {
+            // Pass the length of the text actually read (subContent), not of the full
+            // content: the digest only ever sees the truncated prefix.
+            final String originalStr =
+                    TextUtil.normalizeText(reader).initialCapacity(subContent.length()).spaceChars(spaceChars).execute();
+            return StringUtils.abbreviate(originalStr, maxWidth);
+        } catch (final IOException e) {
+            return StringUtil.EMPTY; // empty
+        }
     }
 }

+ 2 - 2
src/main/java/org/codelibs/fess/mylasta/action/FessLabels.java

@@ -15,13 +15,13 @@
  */
 package org.codelibs.fess.mylasta.action;
 
-import org.lastaflute.web.ruts.message.ActionMessages;
+import org.lastaflute.core.message.UserMessages;
 
 /**
  * The keys for message.
  * @author FreeGen
  */
-public class FessLabels extends ActionMessages {
+public class FessLabels extends UserMessages {
 
     /** The serial version UID for object serialization. (Default) */
     private static final long serialVersionUID = 1L;

Разлика између датотеке није приказан због своје велике величине
+ 112 - 112
src/main/java/org/codelibs/fess/mylasta/action/FessMessages.java


+ 22 - 4
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -140,6 +140,11 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. false */
     String CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED = "crawler.document.duplicate.term.removed";
 
+    /** The key of the configuration. e.g. 	
+    
+       ᠎           ​   �¶ */
+    String CRAWLER_DOCUMENT_SPACE_CHARS = "crawler.document.space.chars";
+
     /** The key of the configuration. e.g. UTF-8 */
     String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
 
@@ -361,7 +366,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g.  */
     String INDEX_ADMIN_INTEGER_FIELDS = "index.admin.integer.fields";
 
-    /** The key of the configuration. e.g. favorite_count,click_count */
+    /** The key of the configuration. e.g. content_length,favorite_count,click_count */
     String INDEX_ADMIN_LONG_FIELDS = "index.admin.long.fields";
 
     /** The key of the configuration. e.g. boost */
@@ -370,7 +375,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g.  */
     String INDEX_ADMIN_DOUBLE_FIELDS = "index.admin.double.fields";
 
-    /** The key of the configuration. e.g. doc_id,url,title,role */
+    /** The key of the configuration. e.g. doc_id,url,title,role,boost */
     String INDEX_ADMIN_REQUIRED_FIELDS = "index.admin.required.fields";
 
     /** The key of the configuration. e.g. 3m */
@@ -1339,6 +1344,15 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
     boolean isCrawlerDocumentDuplicateTermRemoved();
 
+    /**
+     * Get the value for the key 'crawler.document.space.chars'. <br>
+     * The value is, e.g. 	
+    
+       ᠎           ​   �¶ <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentSpaceChars();
+
     /**
      * Get the value for the key 'crawler.crawling.data.encoding'. <br>
      * The value is, e.g. UTF-8 <br>
@@ -2041,7 +2055,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
 
     /**
      * Get the value for the key 'index.admin.long.fields'. <br>
-     * The value is, e.g. favorite_count,click_count <br>
+     * The value is, e.g. content_length,favorite_count,click_count <br>
      * @return The value of found property. (NotNull: if not found, exception but basically no way)
      */
     String getIndexAdminLongFields();
@@ -2070,7 +2084,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
 
     /**
      * Get the value for the key 'index.admin.required.fields'. <br>
-     * The value is, e.g. doc_id,url,title,role <br>
+     * The value is, e.g. doc_id,url,title,role,boost <br>
      * @return The value of found property. (NotNull: if not found, exception but basically no way)
      */
     String getIndexAdminRequiredFields();
@@ -4259,6 +4273,10 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return is(FessConfig.CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED);
         }
 
+        public String getCrawlerDocumentSpaceChars() {
+            return get(FessConfig.CRAWLER_DOCUMENT_SPACE_CHARS);
+        }
+
         public String getCrawlerCrawlingDataEncoding() {
             return get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING);
         }

+ 16 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java

@@ -55,6 +55,8 @@ import org.lastaflute.web.validation.theme.typed.LongTypeValidator;
 
 public interface FessProp {
 
+    public static final String CRAWLER_DOCUMENT_SPACE_CHARS = "crawlerDocumentSpaceChars";
+
     public static final String INDEX_ADMIN_ARRAY_FIELD_SET = "indexAdminArrayFieldSet";
 
     public static final String INDEX_ADMIN_DATE_FIELD_SET = "indexAdminDateFieldSet";
@@ -1303,4 +1305,18 @@ public interface FessProp {
         return requiredValidator.isValid(value, null);
     }
 
+    String getCrawlerDocumentSpaceChars();
+
+    /**
+     * Returns the configured space characters as an array of Unicode code points.
+     * The array is built from {@link #getCrawlerDocumentSpaceChars()} on first use and
+     * then cached in {@code propMap} under {@link #CRAWLER_DOCUMENT_SPACE_CHARS}.
+     *
+     * @return the code points of the configured space characters; this is the cached
+     *         instance itself, so callers should treat it as read-only
+     */
+    public default int[] getCrawlerDocumentSpaceCharsAsArray() {
+        int[] spaceChars = (int[]) propMap.get(CRAWLER_DOCUMENT_SPACE_CHARS);
+        if (spaceChars == null) {
+            // codePoints() yields exactly one entry per code point, so a character
+            // outside the BMP is not followed by a stray entry for its low surrogate
+            // (which indexing codePointAt(i) by char position would produce).
+            spaceChars = getCrawlerDocumentSpaceChars().codePoints().toArray();
+            propMap.put(CRAWLER_DOCUMENT_SPACE_CHARS, spaceChars);
+        }
+        return spaceChars;
+    }
 }

+ 1 - 0
src/main/java/org/codelibs/fess/thumbnail/impl/BaseThumbnailGenerator.java

@@ -16,6 +16,7 @@
 package org.codelibs.fess.thumbnail.impl;
 
 import static org.codelibs.core.stream.StreamUtil.stream;
+
 import java.io.File;
 import java.util.ArrayList;
 import java.util.HashMap;

+ 1 - 0
src/main/resources/fess_config.properties

@@ -85,6 +85,7 @@ crawler.document.append.data=true
 crawler.document.max.alphanum.term.size=20
 crawler.document.max.symbol.term.size=10
 crawler.document.duplicate.term.removed=false
+crawler.document.space.chars=\u0009\u000A\u000B\u000C\u000D\u001C\u001D\u001E\u001F\u0020\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u3000\uFEFF\uFFFD\u00B6
 crawler.crawling.data.encoding=UTF-8
 crawler.web.protocols=http,https
 crawler.file.protocols=file,smb,ftp

+ 18 - 0
src/test/java/org/codelibs/fess/mylasta/direction/FessPropTest.java

@@ -32,6 +32,7 @@ public class FessPropTest extends UnitFessTestCase {
     }
 
     public void test_maxUsernameLength() throws IOException {
+        FessProp.propMap.clear();
         FessConfig fessConfig = new FessConfig.SimpleImpl() {
             @Override
             public Integer getLdapMaxUsernameLengthAsInteger() {
@@ -52,6 +53,7 @@ public class FessPropTest extends UnitFessTestCase {
     }
 
     public void test_maxUsernameLength10() throws IOException {
+        FessProp.propMap.clear();
         FessConfig fessConfig = new FessConfig.SimpleImpl() {
             @Override
             public Integer getLdapMaxUsernameLengthAsInteger() {
@@ -73,6 +75,7 @@ public class FessPropTest extends UnitFessTestCase {
     }
 
     public void test_validateIndexRequiredFields() {
+        FessProp.propMap.clear();
         FessConfig fessConfig = new FessConfig.SimpleImpl() {
             @Override
             public String getIndexAdminRequiredFields() {
@@ -100,4 +103,19 @@ public class FessPropTest extends UnitFessTestCase {
         source.put("bbb", "a");
         assertTrue(fessConfig.validateIndexRequiredFields(source));
     }
+
+    /**
+     * Checks that the configured space-character string is converted into one
+     * code point per character, in order.
+     */
+    public void test_getCrawlerDocumentSpaceCharsAsArray() {
+        // Clear the shared property cache so a previously cached array is not returned.
+        FessProp.propMap.clear();
+        final FessConfig config = new FessConfig.SimpleImpl() {
+            @Override
+            public String getCrawlerDocumentSpaceChars() {
+                return "\u0020\u3000";
+            }
+        };
+
+        final int[] actual = config.getCrawlerDocumentSpaceCharsAsArray();
+        assertEquals(2, actual.length);
+        assertEquals(0x0020, actual[0]); // SPACE
+        assertEquals(0x3000, actual[1]); // IDEOGRAPHIC SPACE
+    }
 }

Неке датотеке нису приказане због велике количине промена