Browse Source

fix #2220 add config.keep_original_body

Shinsuke Sugaya 5 years ago
parent
commit
23eddea328

+ 10 - 1
src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java

@@ -41,6 +41,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.CrawlingAccessException;
 import org.codelibs.fess.crawler.extractor.Extractor;
+import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
 import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer;
 import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig;
@@ -218,7 +219,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
             buf.append(contentMeta);
         }
         final String bodyBase = buf.toString().trim();
-        final String body = documentHelper.getContent(responseData, bodyBase, dataMap);
+        final String body = documentHelper.getContent(crawlingConfig, responseData, bodyBase, dataMap);
         putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
         if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
                 .isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
@@ -381,6 +382,14 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         params.put(HttpHeaders.CONTENT_TYPE, responseData.getMimeType());
         params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
         params.put(ExtractData.URL, responseData.getUrl());
+        Map<String, String> configParam = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
+        if (configParam != null) {
+            String keepOriginalBody = configParam.get("keep_original_body");
+            if (StringUtil.isNotBlank(keepOriginalBody)) {
+                params.put(TikaExtractor.NORMALIZE_TEXT, Constants.TRUE.equalsIgnoreCase(keepOriginalBody) ? Constants.FALSE
+                        : Constants.TRUE);
+            }
+        }
         return params;
     }
 

+ 2 - 1
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -392,7 +392,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         // title
         // content
         final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
-        putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, dataMap));
+        putResultDataBody(dataMap, fessConfig.getIndexFieldContent(),
+                documentHelper.getContent(crawlingConfig, responseData, body, dataMap));
         if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
                 .isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
             if (responseData.getContentLength() > 0

+ 10 - 1
src/main/java/org/codelibs/fess/helper/DocumentHelper.java

@@ -50,6 +50,7 @@ import org.codelibs.fess.crawler.rule.RuleManager;
 import org.codelibs.fess.crawler.transformer.Transformer;
 import org.codelibs.fess.crawler.util.TextUtil;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig;
+import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 import org.lastaflute.di.core.SingletonLaContainer;
@@ -74,11 +75,19 @@ public class DocumentHelper {
         }
     }
 
-    public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
+    public String getContent(final CrawlingConfig crawlingConfig, final ResponseData responseData, final String content,
+            final Map<String, Object> dataMap) {
         if (content == null) {
             return StringUtil.EMPTY; // empty
         }
 
+        if (crawlingConfig != null) {
+            Map<String, String> configParam = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
+            if (configParam != null && Constants.TRUE.equalsIgnoreCase(configParam.get("keep_original_body"))) {
+                return content;
+            }
+        }
+
         final int maxAlphanumTermSize = getMaxAlphanumTermSize();
         final int maxSymbolTermSize = getMaxSymbolTermSize();
         final boolean duplicateTermRemoved = isDuplicateTermRemoved();

+ 42 - 42
src/test/java/org/codelibs/fess/helper/DocumentHelperTest.java

@@ -33,16 +33,16 @@ public class DocumentHelperTest extends UnitFessTestCase {
 
         ResponseData responseData = new ResponseData();
         Map<String, Object> dataMap = new HashMap<>();
-        assertEquals("", documentHelper.getContent(responseData, null, dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "  ", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
-        assertEquals("123 abc", documentHelper.getContent(responseData, " 123 abc ", dataMap));
-        assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
-        assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, null, dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "  ", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "\t", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "\t\t", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "\t \t", dataMap));
+        assertEquals("123 abc", documentHelper.getContent(null, responseData, " 123 abc ", dataMap));
+        assertEquals("123 あいう", documentHelper.getContent(null, responseData, " 123 あいう ", dataMap));
+        assertEquals("123 abc", documentHelper.getContent(null, responseData, " 123\nabc ", dataMap));
     }
 
     public void test_getContent_maxAlphanum() {
@@ -54,17 +54,17 @@ public class DocumentHelperTest extends UnitFessTestCase {
 
         ResponseData responseData = new ResponseData();
         Map<String, Object> dataMap = new HashMap<>();
-        assertEquals("", documentHelper.getContent(responseData, null, dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "  ", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
-        assertEquals("12 ab", documentHelper.getContent(responseData, " 123 abc ", dataMap));
-        assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
-        assertEquals("12 ab", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
-        assertEquals("12", documentHelper.getContent(responseData, " 123abc ", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, null, dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "  ", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "\t", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "\t\t", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "\t \t", dataMap));
+        assertEquals("12 ab", documentHelper.getContent(null, responseData, " 123 abc ", dataMap));
+        assertEquals("123 あいう", documentHelper.getContent(null, responseData, " 123 あいう ", dataMap));
+        assertEquals("12 ab", documentHelper.getContent(null, responseData, " 123\nabc ", dataMap));
+        assertEquals("12", documentHelper.getContent(null, responseData, " 123abc ", dataMap));
     }
 
     public void test_getContent_maxSymbol() {
@@ -76,27 +76,27 @@ public class DocumentHelperTest extends UnitFessTestCase {
 
         ResponseData responseData = new ResponseData();
         Map<String, Object> dataMap = new HashMap<>();
-        assertEquals("", documentHelper.getContent(responseData, null, dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "  ", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
-        assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
-        assertEquals("123 abc", documentHelper.getContent(responseData, " 123 abc ", dataMap));
-        assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
-        assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
-        assertEquals("123abc", documentHelper.getContent(responseData, " 123abc ", dataMap));
-
-        assertEquals("!!", documentHelper.getContent(responseData, "!!!", dataMap));
-        assertEquals("//", documentHelper.getContent(responseData, "///", dataMap));
-        assertEquals("::", documentHelper.getContent(responseData, ":::", dataMap));
-        assertEquals("@@", documentHelper.getContent(responseData, "@@@", dataMap));
-        assertEquals("[[", documentHelper.getContent(responseData, "[[[", dataMap));
-        assertEquals("``", documentHelper.getContent(responseData, "```", dataMap));
-        assertEquals("{{", documentHelper.getContent(responseData, "{{{", dataMap));
-        assertEquals("~~", documentHelper.getContent(responseData, "~~~", dataMap));
-        assertEquals("!\"", documentHelper.getContent(responseData, "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, null, dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "  ", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "\t", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "\t\t", dataMap));
+        assertEquals("", documentHelper.getContent(null, responseData, "\t \t", dataMap));
+        assertEquals("123 abc", documentHelper.getContent(null, responseData, " 123 abc ", dataMap));
+        assertEquals("123 あいう", documentHelper.getContent(null, responseData, " 123 あいう ", dataMap));
+        assertEquals("123 abc", documentHelper.getContent(null, responseData, " 123\nabc ", dataMap));
+        assertEquals("123abc", documentHelper.getContent(null, responseData, " 123abc ", dataMap));
+
+        assertEquals("!!", documentHelper.getContent(null, responseData, "!!!", dataMap));
+        assertEquals("//", documentHelper.getContent(null, responseData, "///", dataMap));
+        assertEquals("::", documentHelper.getContent(null, responseData, ":::", dataMap));
+        assertEquals("@@", documentHelper.getContent(null, responseData, "@@@", dataMap));
+        assertEquals("[[", documentHelper.getContent(null, responseData, "[[[", dataMap));
+        assertEquals("``", documentHelper.getContent(null, responseData, "```", dataMap));
+        assertEquals("{{", documentHelper.getContent(null, responseData, "{{{", dataMap));
+        assertEquals("~~", documentHelper.getContent(null, responseData, "~~~", dataMap));
+        assertEquals("!\"", documentHelper.getContent(null, responseData, "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", dataMap));
     }
 
     public void test_getDigest() {