瀏覽代碼

fix #2223 improve text normalization

Shinsuke Sugaya 6 年之前
父節點
當前提交
ae22ad845a

+ 1 - 0
src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java

@@ -220,6 +220,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
             buf.append(contentMeta);
         }
         final String bodyBase = buf.toString().trim();
+        responseData.addMetaData(Extractor.class.getSimpleName(), extractor);
         final String body = documentHelper.getContent(crawlingConfig, responseData, bodyBase, dataMap);
         putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
         if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig

+ 28 - 0
src/main/java/org/codelibs/fess/helper/DocumentHelper.java

@@ -29,6 +29,8 @@ import java.util.Set;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;
 
+import javax.annotation.PostConstruct;
+
 import org.apache.commons.lang3.StringUtils;
 import org.codelibs.core.io.ReaderUtil;
 import org.codelibs.core.io.SerializeUtil;
@@ -43,6 +45,8 @@ import org.codelibs.fess.crawler.entity.ResultData;
 import org.codelibs.fess.crawler.exception.ChildUrlsException;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.CrawlingAccessException;
+import org.codelibs.fess.crawler.extractor.Extractor;
+import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
 import org.codelibs.fess.crawler.processor.ResponseProcessor;
 import org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor;
 import org.codelibs.fess.crawler.rule.Rule;
@@ -55,6 +59,7 @@ import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 import org.lastaflute.di.core.SingletonLaContainer;
+import org.lastaflute.di.core.exception.ComponentNotFoundException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -63,6 +68,25 @@ public class DocumentHelper {
 
     protected static final String SIMILAR_DOC_HASH_PREFIX = "$";
 
+    @PostConstruct
+    public void init() {
+        try {
+            final TikaExtractor tikaExtractor = ComponentUtil.getComponent("tikaExtractor");
+            if (tikaExtractor != null) {
+                tikaExtractor.setMaxAlphanumTermSize(getMaxAlphanumTermSize());
+                tikaExtractor.setMaxSymbolTermSize(getMaxSymbolTermSize());
+                tikaExtractor.setReplaceDuplication(isDuplicateTermRemoved());
+                tikaExtractor.setSpaceChars(getSpaceChars());
+            }
+        } catch (ComponentNotFoundException e) {
+            if (logger.isDebugEnabled()) {
+                logger.debug("tikaExtractor is not found: " + e.getMessage());
+            }
+        } catch (Exception e) {
+            logger.warn("Failed to initialize TikaExtractor.", e);
+        }
+    }
+
     public String getTitle(final ResponseData responseData, final String title, final Map<String, Object> dataMap) {
         if (title == null) {
             return StringUtil.EMPTY; // empty
@@ -89,6 +113,10 @@ public class DocumentHelper {
             }
         }
 
+        if (responseData.getMetaDataMap().get(Extractor.class.getSimpleName()) instanceof TikaExtractor) {
+            return content;
+        }
+
         final int maxAlphanumTermSize = getMaxAlphanumTermSize();
         final int maxSymbolTermSize = getMaxSymbolTermSize();
         final boolean duplicateTermRemoved = isDuplicateTermRemoved();

+ 0 - 3
src/main/resources/crawler/extractor+tikaExtractor.xml

@@ -7,8 +7,5 @@
 		class="org.codelibs.fess.crawler.extractor.impl.TikaExtractor">
 		<property name="maxCompressionRatio">2</property>
 		<property name="maxUncompressionSize">10000000</property>
-		<property name="maxAlphanumTermSize">20</property>
-		<property name="maxSymbolTermSize">10</property>
-		<property name="readAsTextIfFailed">false</property>
 	</component>
 </components>