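Fix #2220: add the config.keep_original_body crawl-config parameter, which keeps the extracted body text as-is (no text normalization) when set to true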
This commit is contained in:
parent
50ecb18987
commit
23eddea328
4 changed files with 63 additions and 44 deletions
|
@ -41,6 +41,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
|
|||
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
|
||||
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
|
||||
import org.codelibs.fess.crawler.extractor.Extractor;
|
||||
import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
|
||||
import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer;
|
||||
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
|
||||
|
@ -218,7 +219,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
|
|||
buf.append(contentMeta);
|
||||
}
|
||||
final String bodyBase = buf.toString().trim();
|
||||
final String body = documentHelper.getContent(responseData, bodyBase, dataMap);
|
||||
final String body = documentHelper.getContent(crawlingConfig, responseData, bodyBase, dataMap);
|
||||
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
|
||||
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
|
||||
.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
|
||||
|
@ -381,6 +382,14 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
|
|||
params.put(HttpHeaders.CONTENT_TYPE, responseData.getMimeType());
|
||||
params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
|
||||
params.put(ExtractData.URL, responseData.getUrl());
|
||||
Map<String, String> configParam = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
|
||||
if (configParam != null) {
|
||||
String keepOriginalBody = configParam.get("keep_original_body");
|
||||
if (StringUtil.isNotBlank(keepOriginalBody)) {
|
||||
params.put(TikaExtractor.NORMALIZE_TEXT, Constants.TRUE.equalsIgnoreCase(keepOriginalBody) ? Constants.FALSE
|
||||
: Constants.TRUE);
|
||||
}
|
||||
}
|
||||
return params;
|
||||
}
|
||||
|
||||
|
|
|
@ -392,7 +392,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
// title
|
||||
// content
|
||||
final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
|
||||
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, dataMap));
|
||||
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(),
|
||||
documentHelper.getContent(crawlingConfig, responseData, body, dataMap));
|
||||
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
|
||||
.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
|
||||
if (responseData.getContentLength() > 0
|
||||
|
|
|
@ -50,6 +50,7 @@ import org.codelibs.fess.crawler.rule.RuleManager;
|
|||
import org.codelibs.fess.crawler.transformer.Transformer;
|
||||
import org.codelibs.fess.crawler.util.TextUtil;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.lastaflute.di.core.SingletonLaContainer;
|
||||
|
@ -74,11 +75,19 @@ public class DocumentHelper {
|
|||
}
|
||||
}
|
||||
|
||||
public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
|
||||
public String getContent(final CrawlingConfig crawlingConfig, final ResponseData responseData, final String content,
|
||||
final Map<String, Object> dataMap) {
|
||||
if (content == null) {
|
||||
return StringUtil.EMPTY; // empty
|
||||
}
|
||||
|
||||
if (crawlingConfig != null) {
|
||||
Map<String, String> configParam = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
|
||||
if (configParam != null && Constants.TRUE.equalsIgnoreCase(configParam.get("keep_original_body"))) {
|
||||
return content;
|
||||
}
|
||||
}
|
||||
|
||||
final int maxAlphanumTermSize = getMaxAlphanumTermSize();
|
||||
final int maxSymbolTermSize = getMaxSymbolTermSize();
|
||||
final boolean duplicateTermRemoved = isDuplicateTermRemoved();
|
||||
|
|
|
@ -33,16 +33,16 @@ public class DocumentHelperTest extends UnitFessTestCase {
|
|||
|
||||
ResponseData responseData = new ResponseData();
|
||||
Map<String, Object> dataMap = new HashMap<>();
|
||||
assertEquals("", documentHelper.getContent(responseData, null, dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
|
||||
assertEquals("123 abc", documentHelper.getContent(responseData, " 123 abc ", dataMap));
|
||||
assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
|
||||
assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, null, dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, "", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, "\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, "\t\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, "\t \t", dataMap));
|
||||
assertEquals("123 abc", documentHelper.getContent(null, responseData, " 123 abc ", dataMap));
|
||||
assertEquals("123 あいう", documentHelper.getContent(null, responseData, " 123 あいう ", dataMap));
|
||||
assertEquals("123 abc", documentHelper.getContent(null, responseData, " 123\nabc ", dataMap));
|
||||
}
|
||||
|
||||
public void test_getContent_maxAlphanum() {
|
||||
|
@ -54,17 +54,17 @@ public class DocumentHelperTest extends UnitFessTestCase {
|
|||
|
||||
ResponseData responseData = new ResponseData();
|
||||
Map<String, Object> dataMap = new HashMap<>();
|
||||
assertEquals("", documentHelper.getContent(responseData, null, dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
|
||||
assertEquals("12 ab", documentHelper.getContent(responseData, " 123 abc ", dataMap));
|
||||
assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
|
||||
assertEquals("12 ab", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
|
||||
assertEquals("12", documentHelper.getContent(responseData, " 123abc ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, null, dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, "", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, "\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, "\t\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, "\t \t", dataMap));
|
||||
assertEquals("12 ab", documentHelper.getContent(null, responseData, " 123 abc ", dataMap));
|
||||
assertEquals("123 あいう", documentHelper.getContent(null, responseData, " 123 あいう ", dataMap));
|
||||
assertEquals("12 ab", documentHelper.getContent(null, responseData, " 123\nabc ", dataMap));
|
||||
assertEquals("12", documentHelper.getContent(null, responseData, " 123abc ", dataMap));
|
||||
}
|
||||
|
||||
public void test_getContent_maxSymbol() {
|
||||
|
@ -76,27 +76,27 @@ public class DocumentHelperTest extends UnitFessTestCase {
|
|||
|
||||
ResponseData responseData = new ResponseData();
|
||||
Map<String, Object> dataMap = new HashMap<>();
|
||||
assertEquals("", documentHelper.getContent(responseData, null, dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
|
||||
assertEquals("123 abc", documentHelper.getContent(responseData, " 123 abc ", dataMap));
|
||||
assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
|
||||
assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
|
||||
assertEquals("123abc", documentHelper.getContent(responseData, " 123abc ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, null, dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, "", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, " ", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, "\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, "\t\t", dataMap));
|
||||
assertEquals("", documentHelper.getContent(null, responseData, "\t \t", dataMap));
|
||||
assertEquals("123 abc", documentHelper.getContent(null, responseData, " 123 abc ", dataMap));
|
||||
assertEquals("123 あいう", documentHelper.getContent(null, responseData, " 123 あいう ", dataMap));
|
||||
assertEquals("123 abc", documentHelper.getContent(null, responseData, " 123\nabc ", dataMap));
|
||||
assertEquals("123abc", documentHelper.getContent(null, responseData, " 123abc ", dataMap));
|
||||
|
||||
assertEquals("!!", documentHelper.getContent(responseData, "!!!", dataMap));
|
||||
assertEquals("//", documentHelper.getContent(responseData, "///", dataMap));
|
||||
assertEquals("::", documentHelper.getContent(responseData, ":::", dataMap));
|
||||
assertEquals("@@", documentHelper.getContent(responseData, "@@@", dataMap));
|
||||
assertEquals("[[", documentHelper.getContent(responseData, "[[[", dataMap));
|
||||
assertEquals("``", documentHelper.getContent(responseData, "```", dataMap));
|
||||
assertEquals("{{", documentHelper.getContent(responseData, "{{{", dataMap));
|
||||
assertEquals("~~", documentHelper.getContent(responseData, "~~~", dataMap));
|
||||
assertEquals("!\"", documentHelper.getContent(responseData, "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", dataMap));
|
||||
assertEquals("!!", documentHelper.getContent(null, responseData, "!!!", dataMap));
|
||||
assertEquals("//", documentHelper.getContent(null, responseData, "///", dataMap));
|
||||
assertEquals("::", documentHelper.getContent(null, responseData, ":::", dataMap));
|
||||
assertEquals("@@", documentHelper.getContent(null, responseData, "@@@", dataMap));
|
||||
assertEquals("[[", documentHelper.getContent(null, responseData, "[[[", dataMap));
|
||||
assertEquals("``", documentHelper.getContent(null, responseData, "```", dataMap));
|
||||
assertEquals("{{", documentHelper.getContent(null, responseData, "{{{", dataMap));
|
||||
assertEquals("~~", documentHelper.getContent(null, responseData, "~~~", dataMap));
|
||||
assertEquals("!\"", documentHelper.getContent(null, responseData, "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", dataMap));
|
||||
}
|
||||
|
||||
public void test_getDigest() {
|
||||
|
|
Loading…
Add table
Reference in a new issue