diff --git a/pom.xml b/pom.xml
index f14ca78b2..ef206b855 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1298,10 +1298,6 @@
 			<groupId>org.apache.httpcomponents</groupId>
 			<artifactId>httpmime</artifactId>
 		</dependency>
-		<dependency>
-			<groupId>org.apache.commons</groupId>
-			<artifactId>commons-exec</artifactId>
-		</dependency>
 		<dependency>
 			<groupId>org.slf4j</groupId>
 			<artifactId>jcl-over-slf4j</artifactId>
diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java
index 2bd44b4bc..54ca95840 100644
--- a/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java
+++ b/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java
@@ -95,17 +95,13 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
         final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
         final Extractor extractor = getExtractor(responseData);
-        final Map<String, String> params = new HashMap<>(crawlingConfig.getConfigParameterMap(ConfigName.CONFIG));
-        params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
         final String mimeType = responseData.getMimeType();
-        params.put(HttpHeaders.CONTENT_TYPE, mimeType);
-        params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
         final StringBuilder contentMetaBuf = new StringBuilder(1000);
         final Map<String, Object> dataMap = new HashMap<>();
         final Map<String, Object> metaDataMap = new HashMap<>();
         String content;
         try (final InputStream in = responseData.getResponseBody()) {
-            final ExtractData extractData = getExtractData(extractor, in, params);
+            final ExtractData extractData = getExtractData(extractor, in, createExtractParams(responseData, crawlingConfig));
             content = extractData.getContent();
             if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
                 return null;
@@ -334,7 +330,16 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         return dataMap;
     }
 
-    private ExtractData getExtractData(final Extractor extractor, final InputStream in, final Map<String, String> params) {
+    protected Map<String, String> createExtractParams(final ResponseData responseData, final CrawlingConfig crawlingConfig) {
+        final Map<String, String> params = new HashMap<>(crawlingConfig.getConfigParameterMap(ConfigName.CONFIG));
+        params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
+        params.put(HttpHeaders.CONTENT_TYPE, responseData.getMimeType());
+        params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
+        params.put(ExtractData.URL, responseData.getUrl());
+        return params;
+    }
+
+    protected ExtractData getExtractData(final Extractor extractor, final InputStream in, final Map<String, String> params) {
         try {
             return extractor.getText(in, params);
         } catch (final RuntimeException e) {
@@ -455,4 +460,4 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         metaContentMapping.put(metaname, dynamicField);
     }
 
-}
\ No newline at end of file
+}
diff --git a/src/main/java/org/codelibs/fess/job/CrawlJob.java b/src/main/java/org/codelibs/fess/job/CrawlJob.java
index 2bfa54217..10f3e5e58 100644
--- a/src/main/java/org/codelibs/fess/job/CrawlJob.java
+++ b/src/main/java/org/codelibs/fess/job/CrawlJob.java
@@ -289,6 +289,9 @@ public class CrawlJob {
             addSystemProperty(cmdList, "fess.log.level", null, null);
         } else {
             cmdList.add("-Dfess.log.level=" + logLevel);
+            if (logLevel.equalsIgnoreCase("debug")) {
+                cmdList.add("-Dorg.apache.tika.service.error.warn=true");
+            }
         }
         stream(fessConfig.getJvmCrawlerOptionsAsArray()).of(
                 stream -> stream.filter(StringUtil::isNotBlank).forEach(value -> cmdList.add(value)));
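
Note: because createExtractParams and getExtractData change from private to protected above, a subclass can now adjust the parameter map handed to the Extractor. Below is a minimal sketch of such an override, not part of this change; it assumes FessTikaTransformer remains a concrete subclass of AbstractFessFileTransformer, that the CrawlingConfig import path matches this Fess version, and the CustomFileTransformer class and "extract.custom.key" parameter are purely hypothetical.

// Sketch only (not part of the diff above). CustomFileTransformer and the
// "extract.custom.key" parameter are hypothetical; the CrawlingConfig import
// path is an assumption for this Fess version.
package org.codelibs.fess.crawler.transformer;

import java.util.Map;

import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.es.config.exentity.CrawlingConfig; // assumed package

public class CustomFileTransformer extends FessTikaTransformer {

    @Override
    protected Map<String, String> createExtractParams(final ResponseData responseData, final CrawlingConfig crawlingConfig) {
        // Start from the defaults built by the parent hook (config parameters,
        // resource name, Content-Type, Content-Encoding, URL) and add to them.
        final Map<String, String> params = super.createExtractParams(responseData, crawlingConfig);
        params.put("extract.custom.key", "value"); // hypothetical extractor parameter
        return params;
    }
}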