fix #2223: improve text normalization

Shinsuke Sugaya 2019-08-25 21:46:12 +09:00
parent 6bca326eaf
commit ae22ad845a
3 changed files with 29 additions and 3 deletions

AbstractFessFileTransformer.java

@@ -220,6 +220,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
            buf.append(contentMeta);
        }
        final String bodyBase = buf.toString().trim();
        responseData.addMetaData(Extractor.class.getSimpleName(), extractor);
        final String body = documentHelper.getContent(crawlingConfig, responseData, bodyBase, dataMap);
        putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
        if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
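
The single added line above is the transformer-side half of the fix: before documentHelper.getContent(...) runs, the Extractor that produced the body text is recorded in the ResponseData metadata map, keyed by the interface's simple name. A minimal sketch of that round-trip, using illustrative stand-in types rather than the real Fess classes:

    import java.util.HashMap;
    import java.util.Map;

    // Sketch only: Extractor/TikaExtractor stand in for
    // org.codelibs.fess.crawler.extractor.Extractor and its Tika-based
    // implementation; the map stands in for ResponseData#getMetaDataMap().
    public class MetadataRoundTrip {

        interface Extractor {}

        static class TikaExtractor implements Extractor {}

        public static void main(final String[] args) {
            final Map<String, Object> metaDataMap = new HashMap<>();
            final Extractor extractor = new TikaExtractor();

            // Transformer side: remember which extractor produced the text.
            metaDataMap.put(Extractor.class.getSimpleName(), extractor);

            // DocumentHelper side: TikaExtractor output is already normalized,
            // so the second normalization pass can be skipped.
            if (metaDataMap.get(Extractor.class.getSimpleName()) instanceof TikaExtractor) {
                System.out.println("skip re-normalization");
            }
        }
    }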

DocumentHelper.java

@@ -29,6 +29,8 @@ import java.util.Set;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import javax.annotation.PostConstruct;
import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.io.ReaderUtil;
import org.codelibs.core.io.SerializeUtil;
@@ -43,6 +45,8 @@ import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
import org.codelibs.fess.crawler.processor.ResponseProcessor;
import org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor;
import org.codelibs.fess.crawler.rule.Rule;
@@ -55,6 +59,7 @@ import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.lastaflute.di.core.SingletonLaContainer;
import org.lastaflute.di.core.exception.ComponentNotFoundException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -63,6 +68,25 @@ public class DocumentHelper {
    protected static final String SIMILAR_DOC_HASH_PREFIX = "$";

    @PostConstruct
    public void init() {
        try {
            final TikaExtractor tikaExtractor = ComponentUtil.getComponent("tikaExtractor");
            if (tikaExtractor != null) {
                tikaExtractor.setMaxAlphanumTermSize(getMaxAlphanumTermSize());
                tikaExtractor.setMaxSymbolTermSize(getMaxSymbolTermSize());
                tikaExtractor.setReplaceDuplication(isDuplicateTermRemoved());
                tikaExtractor.setSpaceChars(getSpaceChars());
            }
        } catch (ComponentNotFoundException e) {
            if (logger.isDebugEnabled()) {
                logger.debug("tikaExtractor is not found: " + e.getMessage());
            }
        } catch (Exception e) {
            logger.warn("Failed to initialize TikaExtractor.", e);
        }
    }
    public String getTitle(final ResponseData responseData, final String title, final Map<String, Object> dataMap) {
        if (title == null) {
            return StringUtil.EMPTY; // empty
@@ -89,6 +113,10 @@
            }
        }

        if (responseData.getMetaDataMap().get(Extractor.class.getSimpleName()) instanceof TikaExtractor) {
            return content;
        }

        final int maxAlphanumTermSize = getMaxAlphanumTermSize();
        final int maxSymbolTermSize = getMaxSymbolTermSize();
        final boolean duplicateTermRemoved = isDuplicateTermRemoved();
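
Together, the @PostConstruct block and the instanceof guard implement a "configure once, normalize once" scheme: at startup DocumentHelper pushes its own limits into the shared tikaExtractor component, so text extracted by TikaExtractor arrives already normalized with identical settings and getContent returns it untouched; all other extractors still go through the normalization below. As a rough illustration of what an alphanumeric term-size limit does, here is one simple policy (dropping over-limit runs, which are typically hashes or encoded blobs that bloat the index); this is a hypothetical helper, not the Fess or TikaExtractor implementation:

    // Hedged sketch: dropLongAlphanumRuns is illustrative only.
    static String dropLongAlphanumRuns(final String text, final int max) {
        final StringBuilder out = new StringBuilder(text.length());
        final StringBuilder run = new StringBuilder();
        for (int i = 0; i <= text.length(); i++) {
            // A sentinel space at the end flushes the final run.
            final char c = i < text.length() ? text.charAt(i) : ' ';
            if (c < 128 && Character.isLetterOrDigit(c)) {
                run.append(c);
            } else {
                if (run.length() <= max) {
                    out.append(run); // keep normal-sized terms
                }
                run.setLength(0);
                out.append(c);
            }
        }
        return out.substring(0, out.length() - 1); // strip the sentinel
    }

With max = 20, a 64-character SHA-256 digest embedded in a page would be dropped while ordinary words pass through unchanged.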

crawler/extractor.xml

@@ -7,8 +7,5 @@
class="org.codelibs.fess.crawler.extractor.impl.TikaExtractor">
<property name="maxCompressionRatio">2</property>
<property name="maxUncompressionSize">10000000</property>
<property name="maxAlphanumTermSize">20</property>
<property name="maxSymbolTermSize">10</property>
<property name="readAsTextIfFailed">false</property>
</component>
</components>
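
The two term-size properties can be dropped here because DocumentHelper#init() now overwrites them on the component at startup, so a value pinned in this XML would no longer take effect and could only drift from the live configuration. The limits are instead governed by the crawler document settings that back the getters used in init(); in fess_config.properties they look roughly like this (key names assumed, check your Fess version):

    # Assumed keys backing getMaxAlphanumTermSize(), getMaxSymbolTermSize()
    # and isDuplicateTermRemoved(); values shown are the former XML defaults.
    crawler.document.max.alphanum.term.size=20
    crawler.document.max.symbol.term.size=10
    crawler.document.duplicate.term.removed=false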