fix #2826 Added support for defining field configurations via field.config.fieldname in crawl settings.

This commit is contained in:
Shinsuke Sugaya 2024-07-04 15:20:49 +09:00
parent 89072190b2
commit 5d74bd3c3b
7 changed files with 214 additions and 12 deletions

View file

@ -44,6 +44,7 @@ import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.crawler.util.FieldConfigs;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config;
@ -181,7 +182,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
url = pathMappingHelper.replaceUrl(sessionId, url);
final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
final FieldConfigs fieldConfigs = new FieldConfigs(crawlingConfig.getConfigParameterMap(ConfigName.FIELD));
String urlEncoding;
final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
@ -221,7 +222,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
responseData.addMetaData(Extractor.class.getSimpleName(), extractor);
final String body = documentHelper.getContent(crawlingConfig, responseData, bodyBase, dataMap);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
if ((fieldConfigs.getConfig(fessConfig.getIndexFieldCache()).map(config -> config.isCache()).orElse(false)
|| fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0
&& responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
@ -334,7 +335,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key), scriptType);
}
return dataMap;
return processFieldConfigs(dataMap, fieldConfigs);
}
protected Date getLastModified(final Map<String, Object> dataMap, final ResponseData responseData) {

View file

@ -20,6 +20,7 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
@ -31,6 +32,7 @@ import org.codelibs.fess.crawler.entity.AccessResult;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.crawler.util.FieldConfigs;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
@ -248,4 +250,17 @@ public interface FessTransformer {
}
return null;
}
/**
 * Builds a copy of the given document data with per-field settings applied.
 *
 * For a field whose configuration has the overwrite flag and whose value is a
 * non-empty {@code Object[]}, only the last array element is kept. Every other
 * entry is copied as-is. The given map is only read, never modified.
 *
 * @param dataMap field name to value map produced by the transformer
 * @param fieldConfigs per-field settings used to look up the overwrite flag
 * @return a new {@link LinkedHashMap} populated in the iteration order of {@code dataMap}
 */
default Map<String, Object> processFieldConfigs(final Map<String, Object> dataMap, final FieldConfigs fieldConfigs) {
    final Map<String, Object> result = new LinkedHashMap<>();
    for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
        final String fieldName = entry.getKey();
        final Object fieldValue = entry.getValue();
        final boolean overwrite = fieldConfigs.getConfig(fieldName).map(FieldConfigs.Config::isOverwrite).orElse(false);
        if (overwrite && fieldValue instanceof Object[] values && values.length > 0) {
            // multiple values were accumulated for this field; the most recent one wins
            result.put(fieldName, values[values.length - 1]);
            continue;
        }
        result.put(fieldName, fieldValue);
    }
    return result;
}
}

View file

@ -56,6 +56,7 @@ import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.crawler.util.FieldConfigs;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config;
@ -152,7 +153,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
processMetaRobots(responseData, resultData, document);
processXRobotsTag(responseData, resultData);
final Map<String, Object> dataMap = new LinkedHashMap<>();
Map<String, Object> dataMap = new LinkedHashMap<>();
for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
final String path = entry.getValue();
try {
@ -184,7 +185,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
}
}
putAdditionalData(dataMap, responseData, document);
dataMap = processAdditionalData(dataMap, responseData, document);
normalizeData(responseData, dataMap);
try {
@ -336,7 +337,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return true;
}
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
protected Map<String, Object> processAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData,
final Document document) {
// canonical
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
@ -362,7 +364,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
url = pathMappingHelper.replaceUrl(sessionId, url);
final String mimeType = responseData.getMimeType();
final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
final FieldConfigs fieldConfigs = new FieldConfigs(crawlingConfig.getConfigParameterMap(ConfigName.FIELD));
final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
String urlEncoding;
@ -394,7 +396,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
prunedContent ? node -> pruneNode(node, crawlingConfig) : node -> node);
final String fileName = getFileName(url, urlEncoding);
putResultDataContent(dataMap, responseData, fessConfig, crawlingConfig, documentHelper, body, fileName);
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
if ((fieldConfigs.getConfig(fessConfig.getIndexFieldCache()).map(config -> config.isCache()).orElse(false)
|| fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0
&& responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
@ -499,6 +501,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
final String value = e.getValue();
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
});
return processFieldConfigs(dataMap, fieldConfigs);
}
protected void putResultDataContent(final Map<String, Object> dataMap, final ResponseData responseData, final FessConfig fessConfig,

View file

@ -0,0 +1,76 @@
/*
* Copyright 2012-2024 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.crawler.util;
import java.util.Map;
import java.util.regex.Pattern;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.stream.StreamUtil;
import org.codelibs.fess.Constants;
import org.dbflute.optional.OptionalThing;
/**
 * Looks up per-field settings from a map of field name to raw setting string.
 *
 * A raw setting is a list of tokens separated by {@code |}
 * (for example {@code cache|overwrite}); see {@link Config}.
 */
public class FieldConfigs {

    private final Map<String, String> params;

    /**
     * @param params field name to raw setting string; a {@code null} map is treated as having no settings
     */
    public FieldConfigs(final Map<String, String> params) {
        this.params = params;
    }

    /**
     * Returns the parsed setting for a field.
     *
     * @param fieldName name of the field to look up
     * @return the parsed setting, or an empty optional when the field has no
     *         entry or its raw value is rejected by {@code StringUtil.isNotBlank}
     */
    public OptionalThing<Config> getConfig(final String fieldName) {
        if (params == null) {
            // tolerate a missing parameter map instead of failing with a NullPointerException
            return OptionalThing.empty();
        }
        final String value = params.get(fieldName);
        if (StringUtil.isNotBlank(value)) {
            return OptionalThing.of(new Config(value));
        }
        return OptionalThing.empty();
    }

    /**
     * Parsed setting of a single field: the trimmed tokens of the raw setting string.
     */
    public static class Config {

        /** Regex matching the literal token separator; built once rather than per instance. */
        private static final String SEPARATOR_REGEX = Pattern.quote("|");

        private final String[] values;

        /**
         * @param value raw setting string whose tokens are separated by {@code |}; each token is trimmed
         */
        public Config(final String value) {
            values = StreamUtil.split(value, SEPARATOR_REGEX).get(stream -> stream.map(String::trim).toArray(String[]::new));
        }

        /**
         * @return {@code true} when a token equals {@code cache} (case-insensitive),
         *         or when the only token equals {@code Constants.TRUE} (case-insensitive)
         */
        public boolean isCache() {
            if (hasToken("cache")) {
                return true;
            }
            // backward compatibility: the setting used to be a single "true" value
            return values.length == 1 && Constants.TRUE.equalsIgnoreCase(values[0]);
        }

        /**
         * @return {@code true} when a token equals {@code overwrite} (case-insensitive)
         */
        public boolean isOverwrite() {
            return hasToken("overwrite");
        }

        /**
         * @return a copy of the parsed tokens; modifying it does not affect this instance
         */
        public String[] getValues() {
            // defensive copy so callers cannot alter the flags this instance reports
            return values.clone();
        }

        /** Checks whether any parsed token equals the given one, ignoring case. */
        private boolean hasToken(final String token) {
            for (final String value : values) {
                if (token.equalsIgnoreCase(value)) {
                    return true;
                }
            }
            return false;
        }
    }
}

View file

@ -17,8 +17,11 @@ package org.codelibs.fess.crawler.transformer;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.Map;
import org.apache.groovy.util.Maps;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.util.FieldConfigs;
import org.codelibs.fess.exception.FessSystemException;
import org.codelibs.fess.unit.UnitFessTestCase;
@ -269,6 +272,21 @@ public class FessFileTransformerTest extends UnitFessTestCase {
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
}
/**
 * Checks processFieldConfigs on the file transformer: a field configured only
 * with "cache" keeps its array, while fields whose configuration contains
 * "overwrite" are reduced to the last array element.
 */
public void test_processFieldConfigs() {
    final FessFileTransformer transformer = createInstance();
    final FieldConfigs fieldConfigs = new FieldConfigs(Maps.of("foo", "cache", "bar", "overwrite", "baz", "cache|overwrite"));
    final Map<String, Object> source = Map.of(//
            "foo", new String[] { "aaa", "bbb" }, //
            "bar", new String[] { "ccc", "ddd" }, //
            "baz", new String[] { "eee", "fff" });

    final Map<String, Object> processed = transformer.processFieldConfigs(source, fieldConfigs);

    // "foo" has no overwrite flag, so both values survive
    final String[] fooValues = (String[]) processed.get("foo");
    assertEquals("aaa", fooValues[0]);
    assertEquals("bbb", fooValues[1]);
    // overwrite keeps only the last value
    assertEquals("ddd", processed.get("bar"));
    assertEquals("fff", processed.get("baz"));
}
private FessFileTransformer createInstance() {
final FessFileTransformer transformer = new FessFileTransformer();
transformer.init();

View file

@ -32,6 +32,7 @@ import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.apache.groovy.util.Maps;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.core.lang.ClassUtil;
@ -42,6 +43,7 @@ import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.util.FieldConfigs;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.es.config.exentity.WebConfig;
import org.codelibs.fess.helper.CrawlingConfigHelper;
@ -626,7 +628,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
String data = "<html><body>aaa</body></html>";
Document document = getDocument(data);
try {
transformer.putAdditionalData(dataMap, responseData, document);
transformer.processAdditionalData(dataMap, responseData, document);
fail();
} catch (final ComponentNotFoundException e) {
// ignore
@ -635,7 +637,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
data = "<html><head><link rel=\"canonical\" href=\"http://example.com/\"></head><body>aaa</body></html>";
document = getDocument(data);
try {
transformer.putAdditionalData(dataMap, responseData, document);
transformer.processAdditionalData(dataMap, responseData, document);
fail();
} catch (final ComponentNotFoundException e) {
// ignore
@ -644,7 +646,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
data = "<html><head><link rel=\"canonical\" href=\"http://example.com/foo\"></head><body>aaa</body></html>";
document = getDocument(data);
try {
transformer.putAdditionalData(dataMap, responseData, document);
transformer.processAdditionalData(dataMap, responseData, document);
fail();
} catch (final ChildUrlsException e) {
final Set<RequestData> childUrlList = e.getChildUrlList();
@ -655,7 +657,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
data = "<html><link rel=\"canonical\" href=\"http://example.com/foo\"><body>aaa</body></html>";
document = getDocument(data);
try {
transformer.putAdditionalData(dataMap, responseData, document);
transformer.processAdditionalData(dataMap, responseData, document);
fail();
} catch (final ChildUrlsException e) {
final Set<RequestData> childUrlList = e.getChildUrlList();
@ -904,4 +906,19 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
assertFalse(transformer.isValidUrl("http://"));
assertFalse(transformer.isValidUrl("http://http://www.example.com"));
}
/**
 * Checks processFieldConfigs on the XPath transformer: without the overwrite
 * flag the value array is kept, with it only the last element remains.
 */
public void test_processFieldConfigs() {
    final Map<String, String> settings = Maps.of("foo", "cache", "bar", "overwrite", "baz", "cache|overwrite");
    final Map<String, Object> input = Map.of(//
            "foo", new String[] { "aaa", "bbb" }, //
            "bar", new String[] { "ccc", "ddd" }, //
            "baz", new String[] { "eee", "fff" });
    final FessXpathTransformer transformer = new FessXpathTransformer();

    final Map<String, Object> output = transformer.processFieldConfigs(input, new FieldConfigs(settings));

    // cache-only field: array is passed through unchanged
    final String[] kept = (String[]) output.get("foo");
    assertEquals("aaa", kept[0]);
    assertEquals("bbb", kept[1]);
    // overwrite fields: collapsed to the last element
    assertEquals("ddd", output.get("bar"));
    assertEquals("fff", output.get("baz"));
}
}

View file

@ -0,0 +1,71 @@
/*
* Copyright 2012-2024 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.crawler.util;
import java.util.Collections;
import java.util.Map;
import org.apache.groovy.util.Maps;
import org.codelibs.fess.unit.UnitFessTestCase;
/**
 * Tests for {@link FieldConfigs}: lookup of missing fields and parsing of the
 * "cache", "overwrite" and legacy "true" setting values.
 */
public class FieldConfigsTest extends UnitFessTestCase {

    /** An empty parameter map yields no config for any field. */
    public void test_empty() {
        final FieldConfigs configs = new FieldConfigs(Collections.emptyMap());
        assertTrue(configs.getConfig("test").isEmpty());
    }

    /** An unrecognized token is exposed via getValues but sets neither flag. */
    public void test_values() {
        final FieldConfigs configs = configsFor("bar");
        assertTrue(configs.getConfig("test").isEmpty());
        assertFalse(configs.getConfig("foo").isEmpty());
        assertFalse(cacheFlag(configs));
        assertFalse(overwriteFlag(configs));
        assertEquals("bar", configs.getConfig("foo").map(FieldConfigs.Config::getValues).orElse(new String[0])[0]);
    }

    /** The legacy single value "true" still enables caching. */
    public void test_cache_true() {
        final FieldConfigs configs = configsFor("true");
        assertTrue(configs.getConfig("test").isEmpty());
        assertTrue(cacheFlag(configs));
        assertFalse(overwriteFlag(configs));
    }

    /** "cache" enables caching only. */
    public void test_cache() {
        final FieldConfigs configs = configsFor("cache");
        assertTrue(configs.getConfig("test").isEmpty());
        assertTrue(cacheFlag(configs));
        assertFalse(overwriteFlag(configs));
    }

    /** "overwrite" enables overwriting only. */
    public void test_overwrite() {
        final FieldConfigs configs = configsFor("overwrite");
        assertTrue(configs.getConfig("test").isEmpty());
        assertFalse(cacheFlag(configs));
        assertTrue(overwriteFlag(configs));
    }

    /** Both flags can be combined with the "|" separator. */
    public void test_cache_overwrite() {
        final FieldConfigs configs = configsFor("cache|overwrite");
        assertTrue(configs.getConfig("test").isEmpty());
        assertTrue(cacheFlag(configs));
        assertTrue(overwriteFlag(configs));
    }

    /** Creates a FieldConfigs whose only entry maps field "foo" to the given raw setting. */
    private static FieldConfigs configsFor(final String rawSetting) {
        final Map<String, String> params = Maps.of("foo", rawSetting);
        return new FieldConfigs(params);
    }

    /** Reads the cache flag of field "foo", defaulting to false when the field has no config. */
    private static boolean cacheFlag(final FieldConfigs configs) {
        return configs.getConfig("foo").map(FieldConfigs.Config::isCache).orElse(false);
    }

    /** Reads the overwrite flag of field "foo", defaulting to false when the field has no config. */
    private static boolean overwriteFlag(final FieldConfigs configs) {
        return configs.getConfig("foo").map(FieldConfigs.Config::isOverwrite).orElse(false);
    }
}