fix #2220 add config.keep_original_body

This commit is contained in:
Shinsuke Sugaya 2019-08-24 22:33:21 +09:00
parent 50ecb18987
commit 23eddea328
4 changed files with 63 additions and 44 deletions

View file

@ -41,6 +41,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
@ -218,7 +219,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
buf.append(contentMeta);
}
final String bodyBase = buf.toString().trim();
final String body = documentHelper.getContent(responseData, bodyBase, dataMap);
final String body = documentHelper.getContent(crawlingConfig, responseData, bodyBase, dataMap);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
@ -381,6 +382,14 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
params.put(HttpHeaders.CONTENT_TYPE, responseData.getMimeType());
params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
params.put(ExtractData.URL, responseData.getUrl());
Map<String, String> configParam = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
if (configParam != null) {
String keepOriginalBody = configParam.get("keep_original_body");
if (StringUtil.isNotBlank(keepOriginalBody)) {
params.put(TikaExtractor.NORMALIZE_TEXT, Constants.TRUE.equalsIgnoreCase(keepOriginalBody) ? Constants.FALSE
: Constants.TRUE);
}
}
return params;
}

View file

@ -392,7 +392,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
// title
// content
final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, dataMap));
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(),
documentHelper.getContent(crawlingConfig, responseData, body, dataMap));
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0

View file

@ -50,6 +50,7 @@ import org.codelibs.fess.crawler.rule.RuleManager;
import org.codelibs.fess.crawler.transformer.Transformer;
import org.codelibs.fess.crawler.util.TextUtil;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.lastaflute.di.core.SingletonLaContainer;
@ -74,11 +75,19 @@ public class DocumentHelper {
}
}
public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
public String getContent(final CrawlingConfig crawlingConfig, final ResponseData responseData, final String content,
final Map<String, Object> dataMap) {
if (content == null) {
return StringUtil.EMPTY; // empty
}
if (crawlingConfig != null) {
Map<String, String> configParam = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
if (configParam != null && Constants.TRUE.equalsIgnoreCase(configParam.get("keep_original_body"))) {
return content;
}
}
final int maxAlphanumTermSize = getMaxAlphanumTermSize();
final int maxSymbolTermSize = getMaxSymbolTermSize();
final boolean duplicateTermRemoved = isDuplicateTermRemoved();

View file

@ -33,16 +33,16 @@ public class DocumentHelperTest extends UnitFessTestCase {
ResponseData responseData = new ResponseData();
Map<String, Object> dataMap = new HashMap<>();
assertEquals("", documentHelper.getContent(responseData, null, dataMap));
assertEquals("", documentHelper.getContent(responseData, "", dataMap));
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
assertEquals("123 abc", documentHelper.getContent(responseData, " 123 abc ", dataMap));
assertEquals(" あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, null, dataMap));
assertEquals("", documentHelper.getContent(null, responseData, "", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, "\t", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, "\t\t", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, "\t \t", dataMap));
assertEquals("123 abc", documentHelper.getContent(null, responseData, " 123 abc ", dataMap));
assertEquals(" あいう", documentHelper.getContent(null, responseData, " 123 あいう ", dataMap));
assertEquals("123 abc", documentHelper.getContent(null, responseData, " 123\nabc ", dataMap));
}
public void test_getContent_maxAlphanum() {
@ -54,17 +54,17 @@ public class DocumentHelperTest extends UnitFessTestCase {
ResponseData responseData = new ResponseData();
Map<String, Object> dataMap = new HashMap<>();
assertEquals("", documentHelper.getContent(responseData, null, dataMap));
assertEquals("", documentHelper.getContent(responseData, "", dataMap));
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
assertEquals("12 ab", documentHelper.getContent(responseData, " 123 abc ", dataMap));
assertEquals(" あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
assertEquals("12 ab", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
assertEquals("12", documentHelper.getContent(responseData, " 123abc ", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, null, dataMap));
assertEquals("", documentHelper.getContent(null, responseData, "", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, "\t", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, "\t\t", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, "\t \t", dataMap));
assertEquals("12 ab", documentHelper.getContent(null, responseData, " 123 abc ", dataMap));
assertEquals(" あいう", documentHelper.getContent(null, responseData, " 123 あいう ", dataMap));
assertEquals("12 ab", documentHelper.getContent(null, responseData, " 123\nabc ", dataMap));
assertEquals("12", documentHelper.getContent(null, responseData, " 123abc ", dataMap));
}
public void test_getContent_maxSymbol() {
@ -76,27 +76,27 @@ public class DocumentHelperTest extends UnitFessTestCase {
ResponseData responseData = new ResponseData();
Map<String, Object> dataMap = new HashMap<>();
assertEquals("", documentHelper.getContent(responseData, null, dataMap));
assertEquals("", documentHelper.getContent(responseData, "", dataMap));
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
assertEquals("123 abc", documentHelper.getContent(responseData, " 123 abc ", dataMap));
assertEquals(" あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
assertEquals("123abc", documentHelper.getContent(responseData, " 123abc ", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, null, dataMap));
assertEquals("", documentHelper.getContent(null, responseData, "", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, "\t", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, "\t\t", dataMap));
assertEquals("", documentHelper.getContent(null, responseData, "\t \t", dataMap));
assertEquals("123 abc", documentHelper.getContent(null, responseData, " 123 abc ", dataMap));
assertEquals(" あいう", documentHelper.getContent(null, responseData, " 123 あいう ", dataMap));
assertEquals("123 abc", documentHelper.getContent(null, responseData, " 123\nabc ", dataMap));
assertEquals("123abc", documentHelper.getContent(null, responseData, " 123abc ", dataMap));
assertEquals("!!", documentHelper.getContent(responseData, "!!!", dataMap));
assertEquals("//", documentHelper.getContent(responseData, "///", dataMap));
assertEquals("::", documentHelper.getContent(responseData, ":::", dataMap));
assertEquals("@@", documentHelper.getContent(responseData, "@@@", dataMap));
assertEquals("[[", documentHelper.getContent(responseData, "[[[", dataMap));
assertEquals("``", documentHelper.getContent(responseData, "```", dataMap));
assertEquals("{{", documentHelper.getContent(responseData, "{{{", dataMap));
assertEquals("~~", documentHelper.getContent(responseData, "~~~", dataMap));
assertEquals("!\"", documentHelper.getContent(responseData, "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", dataMap));
assertEquals("!!", documentHelper.getContent(null, responseData, "!!!", dataMap));
assertEquals("//", documentHelper.getContent(null, responseData, "///", dataMap));
assertEquals("::", documentHelper.getContent(null, responseData, ":::", dataMap));
assertEquals("@@", documentHelper.getContent(null, responseData, "@@@", dataMap));
assertEquals("[[", documentHelper.getContent(null, responseData, "[[[", dataMap));
assertEquals("``", documentHelper.getContent(null, responseData, "```", dataMap));
assertEquals("{{", documentHelper.getContent(null, responseData, "{{{", dataMap));
assertEquals("~~", documentHelper.getContent(null, responseData, "~~~", dataMap));
assertEquals("!\"", documentHelper.getContent(null, responseData, "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", dataMap));
}
public void test_getDigest() {