fix #2826 Added support for defining field configurations via field.config.fieldname in crawl settings.
This commit is contained in:
parent
89072190b2
commit
5d74bd3c3b
7 changed files with 214 additions and 12 deletions
|
@ -44,6 +44,7 @@ import org.codelibs.fess.crawler.extractor.Extractor;
|
|||
import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
|
||||
import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer;
|
||||
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
|
||||
import org.codelibs.fess.crawler.util.FieldConfigs;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config;
|
||||
|
@ -181,7 +182,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
|
|||
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
|
||||
url = pathMappingHelper.replaceUrl(sessionId, url);
|
||||
|
||||
final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
|
||||
final FieldConfigs fieldConfigs = new FieldConfigs(crawlingConfig.getConfigParameterMap(ConfigName.FIELD));
|
||||
|
||||
String urlEncoding;
|
||||
final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
|
||||
|
@ -221,7 +222,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
|
|||
responseData.addMetaData(Extractor.class.getSimpleName(), extractor);
|
||||
final String body = documentHelper.getContent(crawlingConfig, responseData, bodyBase, dataMap);
|
||||
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
|
||||
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
|
||||
if ((fieldConfigs.getConfig(fessConfig.getIndexFieldCache()).map(config -> config.isCache()).orElse(false)
|
||||
|| fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
|
||||
if (responseData.getContentLength() > 0
|
||||
&& responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
|
||||
|
@ -334,7 +335,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
|
|||
putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key), scriptType);
|
||||
}
|
||||
|
||||
return dataMap;
|
||||
return processFieldConfigs(dataMap, fieldConfigs);
|
||||
}
|
||||
|
||||
protected Date getLastModified(final Map<String, Object> dataMap, final ResponseData responseData) {
|
||||
|
|
|
@ -20,6 +20,7 @@ import java.util.Arrays;
|
|||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -31,6 +32,7 @@ import org.codelibs.fess.crawler.entity.AccessResult;
|
|||
import org.codelibs.fess.crawler.entity.AccessResultData;
|
||||
import org.codelibs.fess.crawler.entity.UrlQueue;
|
||||
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
|
||||
import org.codelibs.fess.crawler.util.FieldConfigs;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
|
||||
|
@ -248,4 +250,17 @@ public interface FessTransformer {
|
|||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
default Map<String, Object> processFieldConfigs(final Map<String, Object> dataMap, final FieldConfigs fieldConfigs) {
|
||||
final Map<String, Object> newDataMap = new LinkedHashMap<>();
|
||||
for (Map.Entry<String, Object> e : dataMap.entrySet()) {
|
||||
if (fieldConfigs.getConfig(e.getKey()).map(FieldConfigs.Config::isOverwrite).orElse(false)
|
||||
&& e.getValue() instanceof Object[] values && values.length > 0) {
|
||||
newDataMap.put(e.getKey(), values[values.length - 1]);
|
||||
} else {
|
||||
newDataMap.put(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
return newDataMap;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -56,6 +56,7 @@ import org.codelibs.fess.crawler.exception.CrawlerSystemException;
|
|||
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
|
||||
import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
|
||||
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
|
||||
import org.codelibs.fess.crawler.util.FieldConfigs;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config;
|
||||
|
@ -152,7 +153,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
processMetaRobots(responseData, resultData, document);
|
||||
processXRobotsTag(responseData, resultData);
|
||||
|
||||
final Map<String, Object> dataMap = new LinkedHashMap<>();
|
||||
Map<String, Object> dataMap = new LinkedHashMap<>();
|
||||
for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
|
||||
final String path = entry.getValue();
|
||||
try {
|
||||
|
@ -184,7 +185,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
}
|
||||
}
|
||||
|
||||
putAdditionalData(dataMap, responseData, document);
|
||||
dataMap = processAdditionalData(dataMap, responseData, document);
|
||||
normalizeData(responseData, dataMap);
|
||||
|
||||
try {
|
||||
|
@ -336,7 +337,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
return true;
|
||||
}
|
||||
|
||||
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
|
||||
protected Map<String, Object> processAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData,
|
||||
final Document document) {
|
||||
// canonical
|
||||
final String canonicalUrl = getCanonicalUrl(responseData, document);
|
||||
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
|
||||
|
@ -362,7 +364,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
url = pathMappingHelper.replaceUrl(sessionId, url);
|
||||
final String mimeType = responseData.getMimeType();
|
||||
|
||||
final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
|
||||
final FieldConfigs fieldConfigs = new FieldConfigs(crawlingConfig.getConfigParameterMap(ConfigName.FIELD));
|
||||
final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
|
||||
|
||||
String urlEncoding;
|
||||
|
@ -394,7 +396,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
prunedContent ? node -> pruneNode(node, crawlingConfig) : node -> node);
|
||||
final String fileName = getFileName(url, urlEncoding);
|
||||
putResultDataContent(dataMap, responseData, fessConfig, crawlingConfig, documentHelper, body, fileName);
|
||||
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
|
||||
if ((fieldConfigs.getConfig(fessConfig.getIndexFieldCache()).map(config -> config.isCache()).orElse(false)
|
||||
|| fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
|
||||
if (responseData.getContentLength() > 0
|
||||
&& responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
|
||||
|
@ -499,6 +501,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
final String value = e.getValue();
|
||||
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
|
||||
});
|
||||
|
||||
return processFieldConfigs(dataMap, fieldConfigs);
|
||||
}
|
||||
|
||||
protected void putResultDataContent(final Map<String, Object> dataMap, final ResponseData responseData, final FessConfig fessConfig,
|
||||
|
|
|
@ -0,0 +1,76 @@
|
|||
/*
|
||||
* Copyright 2012-2024 CodeLibs Project and the Others.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific language
|
||||
* governing permissions and limitations under the License.
|
||||
*/
|
||||
package org.codelibs.fess.crawler.util;
|
||||
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.core.stream.StreamUtil;
|
||||
import org.codelibs.fess.Constants;
|
||||
import org.dbflute.optional.OptionalThing;
|
||||
|
||||
public class FieldConfigs {
|
||||
|
||||
private final Map<String, String> params;
|
||||
|
||||
public FieldConfigs(Map<String, String> params) {
|
||||
this.params = params;
|
||||
}
|
||||
|
||||
public OptionalThing<Config> getConfig(String fieldName) {
|
||||
String value = params.get(fieldName);
|
||||
if (StringUtil.isNotBlank(value)) {
|
||||
return OptionalThing.of(new Config(value));
|
||||
}
|
||||
return OptionalThing.empty();
|
||||
}
|
||||
|
||||
public static class Config {
|
||||
|
||||
private final String[] values;
|
||||
|
||||
public Config(String value) {
|
||||
values = StreamUtil.split(value, Pattern.quote("|")).get(stream -> stream.map(s -> s.trim()).toArray(n -> new String[n]));
|
||||
}
|
||||
|
||||
public boolean isCache() {
|
||||
for (final String value : values) {
|
||||
if ("cache".equalsIgnoreCase(value)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// backward compatibility
|
||||
if (values.length == 1 && Constants.TRUE.equalsIgnoreCase(values[0])) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean isOverwrite() {
|
||||
for (final String value : values) {
|
||||
if ("overwrite".equalsIgnoreCase(value)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public String[] getValues() {
|
||||
return values;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -17,8 +17,11 @@ package org.codelibs.fess.crawler.transformer;
|
|||
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.URLEncoder;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.groovy.util.Maps;
|
||||
import org.codelibs.fess.Constants;
|
||||
import org.codelibs.fess.crawler.util.FieldConfigs;
|
||||
import org.codelibs.fess.exception.FessSystemException;
|
||||
import org.codelibs.fess.unit.UnitFessTestCase;
|
||||
|
||||
|
@ -269,6 +272,21 @@ public class FessFileTransformerTest extends UnitFessTestCase {
|
|||
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
|
||||
}
|
||||
|
||||
public void test_processFieldConfigs() {
|
||||
final FessFileTransformer transformer = createInstance();
|
||||
final Map<String, String> params = Maps.of("foo", "cache", "bar", "overwrite", "baz", "cache|overwrite");
|
||||
FieldConfigs fieldConfigs = new FieldConfigs(params);
|
||||
final Map<String, Object> dataMap = Map.of(//
|
||||
"foo", new String[] { "aaa", "bbb" }, //
|
||||
"bar", new String[] { "ccc", "ddd" }, //
|
||||
"baz", new String[] { "eee", "fff" });
|
||||
final Map<String, Object> resultMap = transformer.processFieldConfigs(dataMap, fieldConfigs);
|
||||
assertEquals("aaa", ((String[]) resultMap.get("foo"))[0]);
|
||||
assertEquals("bbb", ((String[]) resultMap.get("foo"))[1]);
|
||||
assertEquals("ddd", resultMap.get("bar"));
|
||||
assertEquals("fff", resultMap.get("baz"));
|
||||
}
|
||||
|
||||
private FessFileTransformer createInstance() {
|
||||
final FessFileTransformer transformer = new FessFileTransformer();
|
||||
transformer.init();
|
||||
|
|
|
@ -32,6 +32,7 @@ import javax.xml.transform.TransformerFactory;
|
|||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
|
||||
import org.apache.groovy.util.Maps;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.codelibs.core.lang.ClassUtil;
|
||||
|
@ -42,6 +43,7 @@ import org.codelibs.fess.crawler.entity.RequestData;
|
|||
import org.codelibs.fess.crawler.entity.ResponseData;
|
||||
import org.codelibs.fess.crawler.entity.ResultData;
|
||||
import org.codelibs.fess.crawler.exception.ChildUrlsException;
|
||||
import org.codelibs.fess.crawler.util.FieldConfigs;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
|
||||
import org.codelibs.fess.es.config.exentity.WebConfig;
|
||||
import org.codelibs.fess.helper.CrawlingConfigHelper;
|
||||
|
@ -626,7 +628,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
String data = "<html><body>aaa</body></html>";
|
||||
Document document = getDocument(data);
|
||||
try {
|
||||
transformer.putAdditionalData(dataMap, responseData, document);
|
||||
transformer.processAdditionalData(dataMap, responseData, document);
|
||||
fail();
|
||||
} catch (final ComponentNotFoundException e) {
|
||||
// ignore
|
||||
|
@ -635,7 +637,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
data = "<html><head><link rel=\"canonical\" href=\"http://example.com/\"></head><body>aaa</body></html>";
|
||||
document = getDocument(data);
|
||||
try {
|
||||
transformer.putAdditionalData(dataMap, responseData, document);
|
||||
transformer.processAdditionalData(dataMap, responseData, document);
|
||||
fail();
|
||||
} catch (final ComponentNotFoundException e) {
|
||||
// ignore
|
||||
|
@ -644,7 +646,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
data = "<html><head><link rel=\"canonical\" href=\"http://example.com/foo\"></head><body>aaa</body></html>";
|
||||
document = getDocument(data);
|
||||
try {
|
||||
transformer.putAdditionalData(dataMap, responseData, document);
|
||||
transformer.processAdditionalData(dataMap, responseData, document);
|
||||
fail();
|
||||
} catch (final ChildUrlsException e) {
|
||||
final Set<RequestData> childUrlList = e.getChildUrlList();
|
||||
|
@ -655,7 +657,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
data = "<html><link rel=\"canonical\" href=\"http://example.com/foo\"><body>aaa</body></html>";
|
||||
document = getDocument(data);
|
||||
try {
|
||||
transformer.putAdditionalData(dataMap, responseData, document);
|
||||
transformer.processAdditionalData(dataMap, responseData, document);
|
||||
fail();
|
||||
} catch (final ChildUrlsException e) {
|
||||
final Set<RequestData> childUrlList = e.getChildUrlList();
|
||||
|
@ -904,4 +906,19 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
assertFalse(transformer.isValidUrl("http://"));
|
||||
assertFalse(transformer.isValidUrl("http://http://www.example.com"));
|
||||
}
|
||||
|
||||
public void test_processFieldConfigs() {
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
final Map<String, String> params = Maps.of("foo", "cache", "bar", "overwrite", "baz", "cache|overwrite");
|
||||
FieldConfigs fieldConfigs = new FieldConfigs(params);
|
||||
final Map<String, Object> dataMap = Map.of(//
|
||||
"foo", new String[] { "aaa", "bbb" }, //
|
||||
"bar", new String[] { "ccc", "ddd" }, //
|
||||
"baz", new String[] { "eee", "fff" });
|
||||
final Map<String, Object> resultMap = transformer.processFieldConfigs(dataMap, fieldConfigs);
|
||||
assertEquals("aaa", ((String[]) resultMap.get("foo"))[0]);
|
||||
assertEquals("bbb", ((String[]) resultMap.get("foo"))[1]);
|
||||
assertEquals("ddd", resultMap.get("bar"));
|
||||
assertEquals("fff", resultMap.get("baz"));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
/*
|
||||
* Copyright 2012-2024 CodeLibs Project and the Others.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific language
|
||||
* governing permissions and limitations under the License.
|
||||
*/
|
||||
package org.codelibs.fess.crawler.util;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.groovy.util.Maps;
|
||||
import org.codelibs.fess.unit.UnitFessTestCase;
|
||||
|
||||
public class FieldConfigsTest extends UnitFessTestCase {
|
||||
public void test_empty() {
|
||||
final FieldConfigs fieldConfigs = new FieldConfigs(Collections.emptyMap());
|
||||
assertTrue(fieldConfigs.getConfig("test").isEmpty());
|
||||
}
|
||||
|
||||
public void test_values() {
|
||||
final Map<String, String> params = Maps.of("foo", "bar");
|
||||
FieldConfigs fieldConfigs = new FieldConfigs(params);
|
||||
assertTrue(fieldConfigs.getConfig("test").isEmpty());
|
||||
assertFalse(fieldConfigs.getConfig("foo").isEmpty());
|
||||
assertFalse(fieldConfigs.getConfig("foo").map(FieldConfigs.Config::isCache).orElse(false));
|
||||
assertFalse(fieldConfigs.getConfig("foo").map(FieldConfigs.Config::isOverwrite).orElse(false));
|
||||
assertEquals("bar", fieldConfigs.getConfig("foo").map(FieldConfigs.Config::getValues).orElse(new String[0])[0]);
|
||||
}
|
||||
|
||||
public void test_cache_true() {
|
||||
final Map<String, String> params = Maps.of("foo", "true");
|
||||
FieldConfigs fieldConfigs = new FieldConfigs(params);
|
||||
assertTrue(fieldConfigs.getConfig("test").isEmpty());
|
||||
assertTrue(fieldConfigs.getConfig("foo").map(FieldConfigs.Config::isCache).orElse(false));
|
||||
assertFalse(fieldConfigs.getConfig("foo").map(FieldConfigs.Config::isOverwrite).orElse(false));
|
||||
}
|
||||
|
||||
public void test_cache() {
|
||||
final Map<String, String> params = Maps.of("foo", "cache");
|
||||
FieldConfigs fieldConfigs = new FieldConfigs(params);
|
||||
assertTrue(fieldConfigs.getConfig("test").isEmpty());
|
||||
assertTrue(fieldConfigs.getConfig("foo").map(FieldConfigs.Config::isCache).orElse(false));
|
||||
assertFalse(fieldConfigs.getConfig("foo").map(FieldConfigs.Config::isOverwrite).orElse(false));
|
||||
}
|
||||
|
||||
public void test_overwrite() {
|
||||
final Map<String, String> params = Maps.of("foo", "overwrite");
|
||||
FieldConfigs fieldConfigs = new FieldConfigs(params);
|
||||
assertTrue(fieldConfigs.getConfig("test").isEmpty());
|
||||
assertFalse(fieldConfigs.getConfig("foo").map(FieldConfigs.Config::isCache).orElse(false));
|
||||
assertTrue(fieldConfigs.getConfig("foo").map(FieldConfigs.Config::isOverwrite).orElse(false));
|
||||
}
|
||||
|
||||
public void test_cache_overwrite() {
|
||||
final Map<String, String> params = Maps.of("foo", "cache|overwrite");
|
||||
FieldConfigs fieldConfigs = new FieldConfigs(params);
|
||||
assertTrue(fieldConfigs.getConfig("test").isEmpty());
|
||||
assertTrue(fieldConfigs.getConfig("foo").map(FieldConfigs.Config::isCache).orElse(false));
|
||||
assertTrue(fieldConfigs.getConfig("foo").map(FieldConfigs.Config::isOverwrite).orElse(false));
|
||||
}
|
||||
}
|
Loading…
Add table
Reference in a new issue