ソースを参照

fix #429 : add DocumentHelper

Shinsuke Sugaya 9 年 前
コミット
90e20a0d2c

+ 0 - 2
src/main/java/org/codelibs/fess/Constants.java

@@ -218,8 +218,6 @@ public class Constants extends CoreLibConstants {
 
 
     public static final String INDEXING_TARGET = "indexingTarget";
     public static final String INDEXING_TARGET = "indexingTarget";
 
 
-    public static final String DIGEST_PREFIX = "...";
-
     public static final String BASIC = "BASIC";
     public static final String BASIC = "BASIC";
 
 
     public static final String DIGEST = "DIGEST";
     public static final String DIGEST = "DIGEST";

+ 11 - 24
src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java

@@ -16,7 +16,6 @@
 package org.codelibs.fess.crawler.transformer;
 package org.codelibs.fess.crawler.transformer;
 
 
 import java.io.InputStream;
 import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
 import java.net.URLDecoder;
 import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Collections;
@@ -51,6 +50,7 @@ import org.codelibs.fess.es.config.exentity.CrawlingConfig;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
 import org.codelibs.fess.helper.CrawlingConfigHelper;
 import org.codelibs.fess.helper.CrawlingConfigHelper;
 import org.codelibs.fess.helper.CrawlingInfoHelper;
 import org.codelibs.fess.helper.CrawlingInfoHelper;
+import org.codelibs.fess.helper.DocumentHelper;
 import org.codelibs.fess.helper.FileTypeHelper;
 import org.codelibs.fess.helper.FileTypeHelper;
 import org.codelibs.fess.helper.LabelTypeHelper;
 import org.codelibs.fess.helper.LabelTypeHelper;
 import org.codelibs.fess.helper.PathMappingHelper;
 import org.codelibs.fess.helper.PathMappingHelper;
@@ -174,6 +174,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
         final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
         final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
         final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
         final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
         final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
+        final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
         String url = responseData.getUrl();
         String url = responseData.getUrl();
         final String indexingTarget = crawlingConfig.getIndexingTarget(url);
         final String indexingTarget = crawlingConfig.getIndexingTarget(url);
         url = pathMappingHelper.replaceUrl(sessionId, url);
         url = pathMappingHelper.replaceUrl(sessionId, url);
@@ -210,12 +211,8 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
             }
             }
             buf.append(contentMeta);
             buf.append(contentMeta);
         }
         }
-        final String body = normalizeContent(buf.toString());
-        if (StringUtil.isNotBlank(body)) {
-            putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
-        } else {
-            putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), StringUtil.EMPTY);
-        }
+        final String body = documentHelper.getContent(responseData, buf.toString(), dataMap);
+        putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
         if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
         if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
                 .isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
                 .isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
             if (responseData.getContentLength() > 0
             if (responseData.getContentLength() > 0
@@ -228,15 +225,17 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
             }
             }
         }
         }
         // digest
         // digest
-        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
-                Constants.DIGEST_PREFIX
-                        + abbreviate(normalizeContent(content), fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
+        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, buf.toString(), dataMap,
+                fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
         // title
         // title
         if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
         if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
             if (url.endsWith("/")) {
             if (url.endsWith("/")) {
                 if (StringUtil.isNotBlank(content)) {
                 if (StringUtil.isNotBlank(content)) {
-                    putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(),
-                            abbreviate(body, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
+                    putResultDataBody(
+                            dataMap,
+                            fessConfig.getIndexFieldTitle(),
+                            documentHelper.getDigest(responseData, body, dataMap,
+                                    fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
                 } else {
                 } else {
                     putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
                     putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
                 }
                 }
@@ -332,18 +331,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         return dataMap;
         return dataMap;
     }
     }
 
 
-    protected String abbreviate(final String str, final int maxWidth) {
-        String newStr = StringUtils.abbreviate(str, maxWidth);
-        try {
-            if (newStr.getBytes(Constants.UTF_8).length > maxWidth + fessConfig.getCrawlerDocumentFileAbbreviationMarginLengthAsInteger()) {
-                newStr = StringUtils.abbreviate(str, maxWidth / 2);
-            }
-        } catch (final UnsupportedEncodingException e) {
-            // NOP
-        }
-        return newStr;
-    }
-
     private String getResourceName(final ResponseData responseData) {
     private String getResourceName(final ResponseData responseData) {
         String name = responseData.getUrl();
         String name = responseData.getUrl();
         final String enc = responseData.getCharSet();
         final String enc = responseData.getCharSet();

+ 0 - 7
src/main/java/org/codelibs/fess/crawler/transformer/FessTransformer.java

@@ -100,13 +100,6 @@ public interface FessTransformer {
         return StringUtils.abbreviate(url, getMaxSiteLength());
         return StringUtils.abbreviate(url, getMaxSiteLength());
     }
     }
 
 
-    public default String normalizeContent(final String content) {
-        if (content == null) {
-            return StringUtil.EMPTY; // empty
-        }
-        return content.replaceAll("\\s+", " ");
-    }
-
     public default void putResultDataBody(final Map<String, Object> dataMap, final String key, final Object value) {
     public default void putResultDataBody(final Map<String, Object> dataMap, final String key, final Object value) {
         final FessConfig fessConfig = ComponentUtil.getFessConfig();
         final FessConfig fessConfig = ComponentUtil.getFessConfig();
         if (fessConfig.getIndexFieldUrl().equals(key)) {
         if (fessConfig.getIndexFieldUrl().equals(key)) {

+ 13 - 21
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -30,7 +30,6 @@ import java.util.Set;
 import javax.annotation.PostConstruct;
 import javax.annotation.PostConstruct;
 import javax.xml.transform.TransformerException;
 import javax.xml.transform.TransformerException;
 
 
-import org.apache.commons.lang3.StringUtils;
 import org.apache.xpath.objects.XObject;
 import org.apache.xpath.objects.XObject;
 import org.codelibs.core.io.InputStreamUtil;
 import org.codelibs.core.io.InputStreamUtil;
 import org.codelibs.core.io.SerializeUtil;
 import org.codelibs.core.io.SerializeUtil;
@@ -51,6 +50,7 @@ import org.codelibs.fess.es.config.exentity.CrawlingConfig;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
 import org.codelibs.fess.helper.CrawlingConfigHelper;
 import org.codelibs.fess.helper.CrawlingConfigHelper;
 import org.codelibs.fess.helper.CrawlingInfoHelper;
 import org.codelibs.fess.helper.CrawlingInfoHelper;
+import org.codelibs.fess.helper.DocumentHelper;
 import org.codelibs.fess.helper.DuplicateHostHelper;
 import org.codelibs.fess.helper.DuplicateHostHelper;
 import org.codelibs.fess.helper.FileTypeHelper;
 import org.codelibs.fess.helper.FileTypeHelper;
 import org.codelibs.fess.helper.LabelTypeHelper;
 import org.codelibs.fess.helper.LabelTypeHelper;
@@ -71,7 +71,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
 
 
     private static final int UTF8_BOM_SIZE = 3;
     private static final int UTF8_BOM_SIZE = 3;
 
 
-    public boolean prunedCacheContent = true;
+    public boolean prunedContent = true;
 
 
     public Map<String, String> convertUrlMap = new HashMap<>();
     public Map<String, String> convertUrlMap = new HashMap<>();
 
 
@@ -177,6 +177,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
         final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
         final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
         final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
         final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
         final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
+        final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
         String url = responseData.getUrl();
         String url = responseData.getUrl();
         final String indexingTarget = crawlingConfig.getIndexingTarget(url);
         final String indexingTarget = crawlingConfig.getIndexingTarget(url);
         url = pathMappingHelper.replaceUrl(sessionId, url);
         url = pathMappingHelper.replaceUrl(sessionId, url);
@@ -208,7 +209,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         }
         }
         // title
         // title
         // content
         // content
-        putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), getDocumentContent(responseData, document));
+        final String body = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(), prunedContent);
+        putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, dataMap));
         if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
         if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
                 .isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
                 .isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
             if (responseData.getContentLength() > 0
             if (responseData.getContentLength() > 0
@@ -230,7 +232,13 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
             }
             }
         }
         }
         // digest
         // digest
-        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), getDocumentDigest(responseData, document));
+        final String digest = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlDigestXpath(), false);
+        if (StringUtil.isNotBlank(digest)) {
+            putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
+        } else {
+            putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
+                    documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
+        }
         // segment
         // segment
         putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
         putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
         // host
         // host
@@ -314,19 +322,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         return null;
         return null;
     }
     }
 
 
-    protected String getDocumentDigest(final ResponseData responseData, final Document document) {
-        final String digest = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlDigestXpath(), false);
-        if (StringUtil.isNotBlank(digest)) {
-            return digest;
-        }
-
-        final String body =
-                normalizeContent(removeCommentTag(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(),
-                        prunedCacheContent)));
-        return StringUtils.abbreviate(body, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger());
-    }
-
-    String removeCommentTag(final String content) {
+    protected String removeCommentTag(final String content) {
         if (content == null) {
         if (content == null) {
             return StringUtil.EMPTY;
             return StringUtil.EMPTY;
         }
         }
@@ -348,10 +344,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         return value;
         return value;
     }
     }
 
 
-    private String getDocumentContent(final ResponseData responseData, final Document document) {
-        return normalizeContent(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(), true));
-    }
-
     protected String getSingleNodeValue(final Document document, final String xpath, final boolean pruned) {
     protected String getSingleNodeValue(final Document document, final String xpath, final boolean pruned) {
         StringBuilder buf = null;
         StringBuilder buf = null;
         NodeList list = null;
         NodeList list = null;

+ 55 - 0
src/main/java/org/codelibs/fess/helper/DocumentHelper.java

@@ -0,0 +1,55 @@
+/*
+ * Copyright 2012-2016 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.helper;
+
+import java.io.Serializable;
+import java.util.Map;
+
+import org.apache.commons.lang3.StringUtils;
+import org.codelibs.core.lang.StringUtil;
+import org.codelibs.fess.crawler.entity.ResponseData;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+/**
+ * Helper that turns raw extracted text into the values stored for a crawled
+ * document: a whitespace-normalized content string and a shortened digest.
+ * Both methods accept the response data and the data map but, as written,
+ * only read the text argument.
+ */
+public class DocumentHelper implements Serializable {
+
+    private static final long serialVersionUID = 1L;
+
+    private static final Logger logger = LoggerFactory.getLogger(DocumentHelper.class); // not referenced by the methods below
+    /**
+     * Normalizes whitespace in the given text.
+     * Returns an empty string for null input. Otherwise every run of
+     * characters in the ranges U+0000 to U+0020, U+007F and U+3000 is
+     * replaced by a single space and the result is trimmed.
+     *
+     * @param responseData crawled response; not read by this implementation
+     * @param content raw text, may be null
+     * @param dataMap document fields; not read by this implementation
+     * @return the normalized text, never null
+     */
+    public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
+        if (content == null) {
+            return StringUtil.EMPTY; // empty
+        }
+        return content.replaceAll("[\u0000-\u0020\u007f\u3000]+", " ").trim();
+    }
+    /**
+     * Builds a short digest from the given text.
+     * Returns an empty string for null input. Otherwise only the first
+     * (maxWidth * 2) characters of the raw text are kept, whitespace is
+     * normalized with the same pattern as in getContent, and the result is
+     * passed to StringUtils.abbreviate together with maxWidth.
+     * NOTE(review): the cut happens before whitespace is collapsed, so text
+     * that starts with long whitespace runs can yield a digest shorter than
+     * maxWidth even when more text follows.
+     * NOTE(review): StringUtils.abbreviate presumably rejects very small
+     * widths; confirm the configured maximum lengths stay above its minimum.
+     *
+     * @param responseData crawled response; not read by this implementation
+     * @param content raw text, may be null
+     * @param dataMap document fields; not read by this implementation
+     * @param maxWidth width handed to StringUtils.abbreviate
+     * @return the abbreviated, normalized text, or an empty string for null input
+     */
+    public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {
+        if (content == null) {
+            return StringUtil.EMPTY; // empty
+        }
+
+        String subContent;
+        if (content.length() < maxWidth * 2) { // short enough: use the text as is
+            subContent = content;
+        } else {
+            subContent = content.substring(0, maxWidth * 2); // keep only the leading part before normalizing
+        }
+
+        final String originalStr = subContent.replaceAll("[\u0000-\u0020\u007f\u3000]+", " ").trim();
+        return StringUtils.abbreviate(originalStr, maxWidth);
+    }
+}

+ 0 - 26
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -168,9 +168,6 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. No title. */
     /** The key of the configuration. e.g. No title. */
     String CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL = "crawler.document.file.no.title.label";
     String CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL = "crawler.document.file.no.title.label";
 
 
-    /** The key of the configuration. e.g. 10 */
-    String CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH = "crawler.document.file.abbreviation.margin.length";
-
     /** The key of the configuration. e.g. false */
     /** The key of the configuration. e.g. false */
     String CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT = "crawler.document.file.ignore.empty.content";
     String CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT = "crawler.document.file.ignore.empty.content";
 
 
@@ -1150,21 +1147,6 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
      */
     String getCrawlerDocumentFileNoTitleLabel();
     String getCrawlerDocumentFileNoTitleLabel();
 
 
-    /**
-     * Get the value for the key 'crawler.document.file.abbreviation.margin.length'. <br>
-     * The value is, e.g. 10 <br>
-     * @return The value of found property. (NotNull: if not found, exception but basically no way)
-     */
-    String getCrawlerDocumentFileAbbreviationMarginLength();
-
-    /**
-     * Get the value for the key 'crawler.document.file.abbreviation.margin.length' as {@link Integer}. <br>
-     * The value is, e.g. 10 <br>
-     * @return The value of found property. (NotNull: if not found, exception but basically no way)
-     * @throws NumberFormatException When the property is not integer.
-     */
-    Integer getCrawlerDocumentFileAbbreviationMarginLengthAsInteger();
-
     /**
     /**
      * Get the value for the key 'crawler.document.file.ignore.empty.content'. <br>
      * Get the value for the key 'crawler.document.file.ignore.empty.content'. <br>
      * The value is, e.g. false <br>
      * The value is, e.g. false <br>
@@ -3325,14 +3307,6 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return get(FessConfig.CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL);
             return get(FessConfig.CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL);
         }
         }
 
 
-        public String getCrawlerDocumentFileAbbreviationMarginLength() {
-            return get(FessConfig.CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH);
-        }
-
-        public Integer getCrawlerDocumentFileAbbreviationMarginLengthAsInteger() {
-            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH);
-        }
-
         public String getCrawlerDocumentFileIgnoreEmptyContent() {
         public String getCrawlerDocumentFileIgnoreEmptyContent() {
             return get(FessConfig.CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT);
             return get(FessConfig.CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT);
         }
         }

+ 7 - 0
src/main/java/org/codelibs/fess/util/ComponentUtil.java

@@ -27,6 +27,7 @@ import org.codelibs.fess.es.client.FessEsClient;
 import org.codelibs.fess.helper.ActivityHelper;
 import org.codelibs.fess.helper.ActivityHelper;
 import org.codelibs.fess.helper.CrawlingConfigHelper;
 import org.codelibs.fess.helper.CrawlingConfigHelper;
 import org.codelibs.fess.helper.CrawlingInfoHelper;
 import org.codelibs.fess.helper.CrawlingInfoHelper;
+import org.codelibs.fess.helper.DocumentHelper;
 import org.codelibs.fess.helper.DuplicateHostHelper;
 import org.codelibs.fess.helper.DuplicateHostHelper;
 import org.codelibs.fess.helper.FileTypeHelper;
 import org.codelibs.fess.helper.FileTypeHelper;
 import org.codelibs.fess.helper.IndexingHelper;
 import org.codelibs.fess.helper.IndexingHelper;
@@ -56,6 +57,8 @@ import org.lastaflute.job.JobManager;
 import org.lastaflute.web.servlet.request.RequestManager;
 import org.lastaflute.web.servlet.request.RequestManager;
 
 
 public final class ComponentUtil {
 public final class ComponentUtil {
+    private static final String DOCUMENT_HELPER = "documentHelper";
+
     private static final String ACTIVITY_HELPER = "activityHelper";
     private static final String ACTIVITY_HELPER = "activityHelper";
 
 
     private static final String LDAP_MANAGER = "ldapManager";
     private static final String LDAP_MANAGER = "ldapManager";
@@ -291,6 +294,10 @@ public final class ComponentUtil {
         return SingletonLaContainer.getComponent(JobManager.class);
         return SingletonLaContainer.getComponent(JobManager.class);
     }
     }
 
 
+    public static DocumentHelper getDocumentHelper() { // accessor for the DocumentHelper component
+        return SingletonLaContainer.getComponent(DOCUMENT_HELPER); // looked up by the component name held in DOCUMENT_HELPER
+    }
+
     public static <T> T getComponent(final Class<T> clazz) {
     public static <T> T getComponent(final Class<T> clazz) {
         return SingletonLaContainer.getComponent(clazz);
         return SingletonLaContainer.getComponent(clazz);
     }
     }

+ 2 - 0
src/main/resources/fess.xml

@@ -16,6 +16,8 @@
 	</component>
 	</component>
 	<component name="crawlingConfigHelper" class="org.codelibs.fess.helper.CrawlingConfigHelper">
 	<component name="crawlingConfigHelper" class="org.codelibs.fess.helper.CrawlingConfigHelper">
 	</component>
 	</component>
+	<component name="documentHelper" class="org.codelibs.fess.helper.DocumentHelper">
+	</component>
 	<component name="pathMappingHelper" class="org.codelibs.fess.helper.PathMappingHelper">
 	<component name="pathMappingHelper" class="org.codelibs.fess.helper.PathMappingHelper">
 	</component>
 	</component>
 	<component name="processHelper" class="org.codelibs.fess.helper.ProcessHelper">
 	<component name="processHelper" class="org.codelibs.fess.helper.ProcessHelper">

+ 0 - 1
src/main/resources/fess_config.properties

@@ -96,7 +96,6 @@ crawler.document.html.max.digest.length=200
 # file
 # file
 crawler.document.file.name.encoding=
 crawler.document.file.name.encoding=
 crawler.document.file.no.title.label=No title.
 crawler.document.file.no.title.label=No title.
-crawler.document.file.abbreviation.margin.length=10
 crawler.document.file.ignore.empty.content=false
 crawler.document.file.ignore.empty.content=false
 crawler.document.file.max.title.length=100
 crawler.document.file.max.title.length=100
 crawler.document.file.max.digest.length=200
 crawler.document.file.max.digest.length=200

+ 0 - 9
src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java

@@ -218,15 +218,6 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
 
 
     }
     }
 
 
-    public void test_normalizeContent() {
-        assertEquals("", fessXpathTransformer.normalizeContent(""));
-        assertEquals(" ", fessXpathTransformer.normalizeContent(" "));
-        assertEquals(" ", fessXpathTransformer.normalizeContent("  "));
-        assertEquals(" ", fessXpathTransformer.normalizeContent("\t"));
-        assertEquals(" ", fessXpathTransformer.normalizeContent("\t\t"));
-        assertEquals(" ", fessXpathTransformer.normalizeContent("\t \t"));
-    }
-
     public void test_removeCommentTag() {
     public void test_removeCommentTag() {
         assertEquals("", fessXpathTransformer.removeCommentTag(""));
         assertEquals("", fessXpathTransformer.removeCommentTag(""));
         assertEquals(" ", fessXpathTransformer.removeCommentTag("<!-- - -->"));
         assertEquals(" ", fessXpathTransformer.removeCommentTag("<!-- - -->"));

+ 67 - 0
src/test/java/org/codelibs/fess/helper/DocumentHelperTest.java

@@ -0,0 +1,67 @@
+/*
+ * Copyright 2012-2016 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.helper;
+
+import java.util.HashMap;
+import java.util.Map;
+
+import org.codelibs.fess.crawler.entity.ResponseData;
+import org.codelibs.fess.unit.UnitFessTestCase;
+/**
+ * Unit tests for the getContent and getDigest methods of DocumentHelper.
+ */
+public class DocumentHelperTest extends UnitFessTestCase {
+    private DocumentHelper documentHelper;
+    /**
+     * Runs the superclass set-up and then creates the DocumentHelper under
+     * test directly with its no-argument constructor.
+     */
+    @Override
+    public void setUp() throws Exception {
+        super.setUp();
+        documentHelper = new DocumentHelper();
+    }
+    /**
+     * Expects null, empty and whitespace-only input (spaces, tabs) to give an
+     * empty string, and expects leading/trailing whitespace to be removed and
+     * an inner newline to become a single space.
+     */
+    public void test_getContent() {
+        ResponseData responseData = new ResponseData();
+        Map<String, Object> dataMap = new HashMap<>();
+        assertEquals("", documentHelper.getContent(responseData, null, dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "  ", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
+        assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
+        assertEquals("123 abc", documentHelper.getContent(responseData, " 123 abc ", dataMap));
+        assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
+        assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
+    }
+    /**
+     * Exercises getDigest with a width of 10: text longer than 10 characters
+     * after normalization is expected as "1234567...", text of 10 characters
+     * or fewer is expected unchanged, and blank or null input is expected to
+     * give an empty string.
+     */
+    public void test_getDigest() {
+        ResponseData responseData = new ResponseData();
+        Map<String, Object> dataMap = new HashMap<>();
+        assertEquals("1234567...", documentHelper.getDigest(responseData, " 1234567890  1234567890  1234567890 ", dataMap, 10));
+        assertEquals("1234567...", documentHelper.getDigest(responseData, "123456789012345678901234567890", dataMap, 10));
+        assertEquals("1234567...", documentHelper.getDigest(responseData, "123456789012345678901", dataMap, 10));
+        assertEquals("1234567...", documentHelper.getDigest(responseData, "12345678901234567890", dataMap, 10));
+        assertEquals("1234567...", documentHelper.getDigest(responseData, "1234567890123456789", dataMap, 10));
+        assertEquals("1234567...", documentHelper.getDigest(responseData, "12345678901", dataMap, 10));
+        assertEquals("1234567890", documentHelper.getDigest(responseData, "1234567890", dataMap, 10));
+        assertEquals("123456789", documentHelper.getDigest(responseData, "123456789", dataMap, 10));
+        assertEquals("1234567", documentHelper.getDigest(responseData, "1234567", dataMap, 10));
+        assertEquals("1", documentHelper.getDigest(responseData, "1", dataMap, 10));
+        assertEquals("", documentHelper.getDigest(responseData, "", dataMap, 10));
+        assertEquals("", documentHelper.getDigest(responseData, " ", dataMap, 10));
+        assertEquals("", documentHelper.getDigest(responseData, null, dataMap, 10));
+        assertEquals("1234567...", documentHelper.getDigest(responseData, " 1234567890  1234567890  1234567890 ", dataMap, 10)); // same input as the first assertion
+        assertEquals("1234567...", documentHelper.getDigest(responseData, "12345678901234567890", dataMap, 10)); // same input as the 20-digit case above
+    }
+}