fix #2223 improve text normalization

parent 6bca326eaf
commit ae22ad845a

3 changed files with 29 additions and 3 deletions
@@ -220,6 +220,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
             buf.append(contentMeta);
         }
         final String bodyBase = buf.toString().trim();
+        responseData.addMetaData(Extractor.class.getSimpleName(), extractor);
         final String body = documentHelper.getContent(crawlingConfig, responseData, bodyBase, dataMap);
         putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
         if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
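The single added line above is the producer half of this change: the transformer records which Extractor produced the text in the response's metadata map, keyed by the extractor class's simple name. DocumentHelper.getContent (changed below) reads that entry back and skips its own normalization when the text came from a TikaExtractor, which now normalizes during extraction. A minimal sketch of the round-trip, using only the types that appear in this diff:

    // Producer side (AbstractFessFileTransformer): remember which extractor ran.
    responseData.addMetaData(Extractor.class.getSimpleName(), extractor);

    // Consumer side (DocumentHelper.getContent): text from TikaExtractor is
    // already normalized, so it is returned untouched.
    if (responseData.getMetaDataMap().get(Extractor.class.getSimpleName()) instanceof TikaExtractor) {
        return content;
    }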
@@ -29,6 +29,8 @@ import java.util.Set;
 import java.util.zip.GZIPInputStream;
 import java.util.zip.GZIPOutputStream;

+import javax.annotation.PostConstruct;
+
 import org.apache.commons.lang3.StringUtils;
 import org.codelibs.core.io.ReaderUtil;
 import org.codelibs.core.io.SerializeUtil;
@@ -43,6 +45,8 @@ import org.codelibs.fess.crawler.entity.ResultData;
 import org.codelibs.fess.crawler.exception.ChildUrlsException;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.CrawlingAccessException;
+import org.codelibs.fess.crawler.extractor.Extractor;
+import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
 import org.codelibs.fess.crawler.processor.ResponseProcessor;
 import org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor;
 import org.codelibs.fess.crawler.rule.Rule;
@@ -55,6 +59,7 @@ import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 import org.lastaflute.di.core.SingletonLaContainer;
+import org.lastaflute.di.core.exception.ComponentNotFoundException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -63,6 +68,25 @@ public class DocumentHelper {

     protected static final String SIMILAR_DOC_HASH_PREFIX = "$";

+    @PostConstruct
+    public void init() {
+        try {
+            final TikaExtractor tikaExtractor = ComponentUtil.getComponent("tikaExtractor");
+            if (tikaExtractor != null) {
+                tikaExtractor.setMaxAlphanumTermSize(getMaxAlphanumTermSize());
+                tikaExtractor.setMaxSymbolTermSize(getMaxSymbolTermSize());
+                tikaExtractor.setReplaceDuplication(isDuplicateTermRemoved());
+                tikaExtractor.setSpaceChars(getSpaceChars());
+            }
+        } catch (ComponentNotFoundException e) {
+            if (logger.isDebugEnabled()) {
+                logger.debug("tikaExtractor is not found: " + e.getMessage());
+            }
+        } catch (Exception e) {
+            logger.warn("Failed to initialize TikaExtractor.", e);
+        }
+    }
+
     public String getTitle(final ResponseData responseData, final String title, final Map<String, Object> dataMap) {
         if (title == null) {
             return StringUtil.EMPTY; // empty
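The new init() is the core of the change: once the DI container has built DocumentHelper, it pushes the helper's normalization settings (alphanumeric/symbol term-size limits, duplicate-term removal, space characters) into the shared tikaExtractor component, so text is normalized once, at extraction time. A self-contained sketch of this @PostConstruct push pattern, where every Demo* name is a hypothetical stand-in rather than a Fess API:

    import javax.annotation.PostConstruct;

    public class DemoHelper {
        // Stand-in for the shared tikaExtractor component.
        private final DemoExtractor extractor;

        public DemoHelper(final DemoExtractor extractor) {
            this.extractor = extractor;
        }

        // Invoked once by the DI container after construction; pushes the
        // helper's limits into the extractor so extraction output arrives
        // pre-normalized.
        @PostConstruct
        public void init() {
            extractor.setMaxAlphanumTermSize(20);
            extractor.setMaxSymbolTermSize(10);
        }
    }

    class DemoExtractor {
        private int maxAlphanumTermSize;
        private int maxSymbolTermSize;

        void setMaxAlphanumTermSize(final int size) {
            maxAlphanumTermSize = size;
        }

        void setMaxSymbolTermSize(final int size) {
            maxSymbolTermSize = size;
        }
    }

Catching ComponentNotFoundException keeps startup intact in configurations that register no tikaExtractor component; that case is logged only at debug level.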
@@ -89,6 +113,10 @@ public class DocumentHelper {
             }
         }

+        if (responseData.getMetaDataMap().get(Extractor.class.getSimpleName()) instanceof TikaExtractor) {
+            return content;
+        }
+
         final int maxAlphanumTermSize = getMaxAlphanumTermSize();
         final int maxSymbolTermSize = getMaxSymbolTermSize();
         final boolean duplicateTermRemoved = isDuplicateTermRemoved();
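The four added lines short-circuit DocumentHelper's own normalization pass whenever the content already came from a TikaExtractor, avoiding double normalization. As an illustration of the kind of work being skipped, here is a hedged sketch of term-size filtering; it shows the technique, not the exact Fess implementation:

    // Collapse whitespace and drop alphanumeric terms longer than the limit
    // (e.g. base64 noise). Illustrative only; assumes space-separated terms.
    static String normalize(final String content, final int maxAlphanumTermSize) {
        final StringBuilder buf = new StringBuilder(content.length());
        for (final String term : content.trim().split("\\s+")) {
            final boolean alphanum = !term.isEmpty() && term.chars().allMatch(Character::isLetterOrDigit);
            if (alphanum && term.length() > maxAlphanumTermSize) {
                continue;
            }
            if (buf.length() > 0) {
                buf.append(' ');
            }
            buf.append(term);
        }
        return buf.toString();
    }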
@@ -7,8 +7,5 @@
         class="org.codelibs.fess.crawler.extractor.impl.TikaExtractor">
         <property name="maxCompressionRatio">2</property>
         <property name="maxUncompressionSize">10000000</property>
-        <property name="maxAlphanumTermSize">20</property>
-        <property name="maxSymbolTermSize">10</property>
-        <property name="readAsTextIfFailed">false</property>
     </component>
</components>