瀏覽代碼

fix #2826 Added support for defining field configurations via field.config.fieldname in crawl settings.

Shinsuke Sugaya 1 年之前
父節點
當前提交
d0929b5994

+ 4 - 3
src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java

@@ -44,6 +44,7 @@ import org.codelibs.fess.crawler.extractor.Extractor;
 import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
 import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer;
 import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
+import org.codelibs.fess.crawler.util.FieldConfigs;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config;
@@ -181,7 +182,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         final String indexingTarget = crawlingConfig.getIndexingTarget(url);
         url = pathMappingHelper.replaceUrl(sessionId, url);
 
-        final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
+        final FieldConfigs fieldConfigs = new FieldConfigs(crawlingConfig.getConfigParameterMap(ConfigName.FIELD));
 
         String urlEncoding;
         final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
@@ -221,7 +222,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         responseData.addMetaData(Extractor.class.getSimpleName(), extractor);
         final String body = documentHelper.getContent(crawlingConfig, responseData, bodyBase, dataMap);
         putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
-        if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
+        if ((fieldConfigs.getConfig(fessConfig.getIndexFieldCache()).map(config -> config.isCache()).orElse(false)
                 || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
             if (responseData.getContentLength() > 0
                     && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
@@ -334,7 +335,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
             putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key), scriptType);
         }
 
-        return dataMap;
+        return processFieldConfigs(dataMap, fieldConfigs);
     }
 
     protected Date getLastModified(final Map<String, Object> dataMap, final ResponseData responseData) {

+ 15 - 0
src/main/java/org/codelibs/fess/crawler/transformer/FessTransformer.java

@@ -20,6 +20,7 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
+import java.util.LinkedHashMap;
 import java.util.Map;
 
 import org.apache.commons.lang3.StringUtils;
@@ -31,6 +32,7 @@ import org.codelibs.fess.crawler.entity.AccessResult;
 import org.codelibs.fess.crawler.entity.AccessResultData;
 import org.codelibs.fess.crawler.entity.UrlQueue;
 import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
+import org.codelibs.fess.crawler.util.FieldConfigs;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 
@@ -248,4 +250,17 @@ public interface FessTransformer {
         }
         return null;
     }
+
+    /**
+     * Builds a copy of the given document data with the per-field settings applied.
+     *
+     * For a field whose configuration is flagged as overwrite and whose value is a
+     * non-empty {@code Object[]}, only the last array element is kept. Every other
+     * entry is copied unchanged. Entries are copied in the iteration order of
+     * {@code dataMap}.
+     *
+     * @param dataMap field name to value pairs produced by the transformer
+     * @param fieldConfigs per-field settings to look up by field name
+     * @return a new map; {@code dataMap} itself is not modified
+     */
+    default Map<String, Object> processFieldConfigs(final Map<String, Object> dataMap, final FieldConfigs fieldConfigs) {
+        final Map<String, Object> processed = new LinkedHashMap<>();
+        for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
+            final String fieldName = entry.getKey();
+            Object fieldValue = entry.getValue();
+            final boolean overwrite = fieldConfigs.getConfig(fieldName).map(FieldConfigs.Config::isOverwrite).orElse(false);
+            // Overwrite collapses a multi-valued field to its last value.
+            if (overwrite && fieldValue instanceof Object[] items && items.length > 0) {
+                fieldValue = items[items.length - 1];
+            }
+            processed.put(fieldName, fieldValue);
+        }
+        return processed;
+    }
 }

+ 9 - 5
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -55,6 +55,7 @@ import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.CrawlingAccessException;
 import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
 import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
+import org.codelibs.fess.crawler.util.FieldConfigs;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config;
@@ -153,7 +154,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         processMetaRobots(responseData, resultData, document);
         processXRobotsTag(responseData, resultData);
 
-        final Map<String, Object> dataMap = new LinkedHashMap<>();
+        Map<String, Object> dataMap = new LinkedHashMap<>();
         for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
             final String path = entry.getValue();
             try {
@@ -185,7 +186,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
             }
         }
 
-        putAdditionalData(dataMap, responseData, document);
+        dataMap = processAdditionalData(dataMap, responseData, document);
         normalizeData(responseData, dataMap);
 
         try {
@@ -337,7 +338,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         return true;
     }
 
-    protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
+    protected Map<String, Object> processAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData,
+            final Document document) {
         // canonical
         final String canonicalUrl = getCanonicalUrl(responseData, document);
         if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
@@ -363,7 +365,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         url = pathMappingHelper.replaceUrl(sessionId, url);
         final String mimeType = responseData.getMimeType();
 
-        final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
+        final FieldConfigs fieldConfigs = new FieldConfigs(crawlingConfig.getConfigParameterMap(ConfigName.FIELD));
         final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
 
         String urlEncoding;
@@ -395,7 +397,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
                 prunedContent ? node -> pruneNode(node, crawlingConfig) : node -> node);
         final String fileName = getFileName(url, urlEncoding);
         putResultDataContent(dataMap, responseData, fessConfig, crawlingConfig, documentHelper, body, fileName);
-        if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
+        if ((fieldConfigs.getConfig(fessConfig.getIndexFieldCache()).map(config -> config.isCache()).orElse(false)
                 || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
             if (responseData.getContentLength() > 0
                     && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
@@ -500,6 +502,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
             final String value = e.getValue();
             putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
         });
+
+        return processFieldConfigs(dataMap, fieldConfigs);
     }
 
     protected void putResultDataContent(final Map<String, Object> dataMap, final ResponseData responseData, final FessConfig fessConfig,

+ 76 - 0
src/main/java/org/codelibs/fess/crawler/util/FieldConfigs.java

@@ -0,0 +1,76 @@
+/*
+ * Copyright 2012-2024 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.crawler.util;
+
+import java.util.Map;
+import java.util.regex.Pattern;
+
+import org.codelibs.core.lang.StringUtil;
+import org.codelibs.core.stream.StreamUtil;
+import org.codelibs.fess.Constants;
+import org.dbflute.optional.OptionalThing;
+
+/**
+ * Read-only view over the per-field settings of a crawl configuration
+ * (the {@code field.config.<fieldname>} parameters).
+ *
+ * Each field maps to a raw setting string holding one or more flags separated
+ * by {@code |}, for example {@code cache|overwrite}.
+ */
+public class FieldConfigs {
+
+    /** Field name to raw setting string; may be {@code null} if the caller supplied none. */
+    private final Map<String, String> params;
+
+    /**
+     * @param params field name to raw setting string; {@code null} is treated like an empty map
+     */
+    public FieldConfigs(final Map<String, String> params) {
+        this.params = params;
+    }
+
+    /**
+     * Looks up the settings of a single field.
+     *
+     * @param fieldName name of the index field
+     * @return the parsed settings, or an empty optional when the field has no
+     *         entry or its entry is blank
+     */
+    public OptionalThing<Config> getConfig(final String fieldName) {
+        // Guard against a null parameter map so lookups never throw.
+        final String value = params == null ? null : params.get(fieldName);
+        if (StringUtil.isNotBlank(value)) {
+            return OptionalThing.of(new Config(value));
+        }
+        return OptionalThing.empty();
+    }
+
+    /**
+     * Parsed flags of one field setting.
+     */
+    public static class Config {
+
+        private static final String CACHE = "cache";
+
+        private static final String OVERWRITE = "overwrite";
+
+        /** Trimmed flags in the order they appeared in the raw setting. */
+        private final String[] values;
+
+        /**
+         * @param value raw setting string; flags are separated by {@code |} and trimmed
+         */
+        public Config(final String value) {
+            values = StreamUtil.split(value, Pattern.quote("|")).get(stream -> stream.map(String::trim).toArray(String[]::new));
+        }
+
+        /**
+         * @return {@code true} if the {@code cache} flag is present (case-insensitive),
+         *         or if the setting consists solely of {@link Constants#TRUE}
+         */
+        public boolean isCache() {
+            if (hasFlag(CACHE)) {
+                return true;
+            }
+            // backward compatibility: the setting used to be a plain boolean string
+            return values.length == 1 && Constants.TRUE.equalsIgnoreCase(values[0]);
+        }
+
+        /**
+         * @return {@code true} if the {@code overwrite} flag is present (case-insensitive)
+         */
+        public boolean isOverwrite() {
+            return hasFlag(OVERWRITE);
+        }
+
+        /**
+         * @return a copy of the parsed flags; modifying it does not affect this instance
+         */
+        public String[] getValues() {
+            return values.clone();
+        }
+
+        /** Case-insensitive membership test against the parsed flags. */
+        private boolean hasFlag(final String flag) {
+            for (final String value : values) {
+                if (flag.equalsIgnoreCase(value)) {
+                    return true;
+                }
+            }
+            return false;
+        }
+    }
+}

+ 18 - 0
src/test/java/org/codelibs/fess/crawler/transformer/FessFileTransformerTest.java

@@ -17,8 +17,11 @@ package org.codelibs.fess.crawler.transformer;
 
 import java.io.UnsupportedEncodingException;
 import java.net.URLEncoder;
+import java.util.Map;
 
+import org.apache.groovy.util.Maps;
 import org.codelibs.fess.Constants;
+import org.codelibs.fess.crawler.util.FieldConfigs;
 import org.codelibs.fess.exception.FessSystemException;
 import org.codelibs.fess.unit.UnitFessTestCase;
 
@@ -269,6 +272,21 @@ public class FessFileTransformerTest extends UnitFessTestCase {
         assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
     }
 
+    /**
+     * Fields flagged with overwrite collapse to their last value; a cache-only
+     * field keeps its array untouched.
+     */
+    public void test_processFieldConfigs() {
+        final FessFileTransformer transformer = createInstance();
+        final Map<String, String> settings = Maps.of("foo", "cache", "bar", "overwrite", "baz", "cache|overwrite");
+        final FieldConfigs configs = new FieldConfigs(settings);
+        final Map<String, Object> source = Map.of(//
+                "foo", new String[] { "aaa", "bbb" }, //
+                "bar", new String[] { "ccc", "ddd" }, //
+                "baz", new String[] { "eee", "fff" });
+
+        final Map<String, Object> processed = transformer.processFieldConfigs(source, configs);
+
+        // cache only: array is passed through
+        final String[] fooValues = (String[]) processed.get("foo");
+        assertEquals("aaa", fooValues[0]);
+        assertEquals("bbb", fooValues[1]);
+        // overwrite: last element wins
+        assertEquals("ddd", processed.get("bar"));
+        assertEquals("fff", processed.get("baz"));
+    }
+
     private FessFileTransformer createInstance() {
         final FessFileTransformer transformer = new FessFileTransformer();
         transformer.init();

+ 21 - 4
src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java

@@ -32,6 +32,7 @@ import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
 
+import org.apache.groovy.util.Maps;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.codelibs.core.lang.ClassUtil;
@@ -42,6 +43,7 @@ import org.codelibs.fess.crawler.entity.RequestData;
 import org.codelibs.fess.crawler.entity.ResponseData;
 import org.codelibs.fess.crawler.entity.ResultData;
 import org.codelibs.fess.crawler.exception.ChildUrlsException;
+import org.codelibs.fess.crawler.util.FieldConfigs;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
 import org.codelibs.fess.es.config.exentity.WebConfig;
 import org.codelibs.fess.helper.CrawlingConfigHelper;
@@ -626,7 +628,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         String data = "<html><body>aaa</body></html>";
         Document document = getDocument(data);
         try {
-            transformer.putAdditionalData(dataMap, responseData, document);
+            transformer.processAdditionalData(dataMap, responseData, document);
             fail();
         } catch (final ComponentNotFoundException e) {
             // ignore
@@ -635,7 +637,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         data = "<html><head><link rel=\"canonical\" href=\"http://example.com/\"></head><body>aaa</body></html>";
         document = getDocument(data);
         try {
-            transformer.putAdditionalData(dataMap, responseData, document);
+            transformer.processAdditionalData(dataMap, responseData, document);
             fail();
         } catch (final ComponentNotFoundException e) {
             // ignore
@@ -644,7 +646,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         data = "<html><head><link rel=\"canonical\" href=\"http://example.com/foo\"></head><body>aaa</body></html>";
         document = getDocument(data);
         try {
-            transformer.putAdditionalData(dataMap, responseData, document);
+            transformer.processAdditionalData(dataMap, responseData, document);
             fail();
         } catch (final ChildUrlsException e) {
             final Set<RequestData> childUrlList = e.getChildUrlList();
@@ -655,7 +657,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         data = "<html><link rel=\"canonical\" href=\"http://example.com/foo\"><body>aaa</body></html>";
         document = getDocument(data);
         try {
-            transformer.putAdditionalData(dataMap, responseData, document);
+            transformer.processAdditionalData(dataMap, responseData, document);
             fail();
         } catch (final ChildUrlsException e) {
             final Set<RequestData> childUrlList = e.getChildUrlList();
@@ -904,4 +906,19 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         assertFalse(transformer.isValidUrl("http://"));
         assertFalse(transformer.isValidUrl("http://http://www.example.com"));
     }
+
+    /**
+     * Fields flagged with overwrite collapse to their last value; a cache-only
+     * field keeps its array untouched.
+     */
+    public void test_processFieldConfigs() {
+        final FessXpathTransformer transformer = new FessXpathTransformer();
+        final Map<String, String> settings = Maps.of("foo", "cache", "bar", "overwrite", "baz", "cache|overwrite");
+        final FieldConfigs configs = new FieldConfigs(settings);
+        final Map<String, Object> source = Map.of(//
+                "foo", new String[] { "aaa", "bbb" }, //
+                "bar", new String[] { "ccc", "ddd" }, //
+                "baz", new String[] { "eee", "fff" });
+
+        final Map<String, Object> processed = transformer.processFieldConfigs(source, configs);
+
+        // cache only: array is passed through
+        final String[] fooValues = (String[]) processed.get("foo");
+        assertEquals("aaa", fooValues[0]);
+        assertEquals("bbb", fooValues[1]);
+        // overwrite: last element wins
+        assertEquals("ddd", processed.get("bar"));
+        assertEquals("fff", processed.get("baz"));
+    }
 }

+ 71 - 0
src/test/java/org/codelibs/fess/crawler/util/FieldConfigsTest.java

@@ -0,0 +1,71 @@
+/*
+ * Copyright 2012-2024 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.crawler.util;
+
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.groovy.util.Maps;
+import org.codelibs.fess.unit.UnitFessTestCase;
+
+/**
+ * Tests for {@link FieldConfigs}: lookup of missing fields and parsing of the
+ * {@code cache} / {@code overwrite} flags.
+ */
+public class FieldConfigsTest extends UnitFessTestCase {
+
+    /** An empty parameter map yields no configuration for any field. */
+    public void test_empty() {
+        final FieldConfigs configs = new FieldConfigs(Collections.emptyMap());
+        assertTrue(configs.getConfig("test").isEmpty());
+    }
+
+    /** An unrecognized value is kept verbatim and sets neither flag. */
+    public void test_values() {
+        final Map<String, String> settings = Maps.of("foo", "bar");
+        final FieldConfigs configs = new FieldConfigs(settings);
+        assertTrue(configs.getConfig("test").isEmpty());
+        assertFalse(configs.getConfig("foo").isEmpty());
+        assertFalse(cacheFlag(configs, "foo"));
+        assertFalse(overwriteFlag(configs, "foo"));
+        final String[] parsed = configs.getConfig("foo").map(config -> config.getValues()).orElse(new String[0]);
+        assertEquals("bar", parsed[0]);
+    }
+
+    /** The legacy boolean value still enables caching. */
+    public void test_cache_true() {
+        final Map<String, String> settings = Maps.of("foo", "true");
+        final FieldConfigs configs = new FieldConfigs(settings);
+        assertTrue(configs.getConfig("test").isEmpty());
+        assertTrue(cacheFlag(configs, "foo"));
+        assertFalse(overwriteFlag(configs, "foo"));
+    }
+
+    /** The cache flag alone enables caching only. */
+    public void test_cache() {
+        final Map<String, String> settings = Maps.of("foo", "cache");
+        final FieldConfigs configs = new FieldConfigs(settings);
+        assertTrue(configs.getConfig("test").isEmpty());
+        assertTrue(cacheFlag(configs, "foo"));
+        assertFalse(overwriteFlag(configs, "foo"));
+    }
+
+    /** The overwrite flag alone enables overwriting only. */
+    public void test_overwrite() {
+        final Map<String, String> settings = Maps.of("foo", "overwrite");
+        final FieldConfigs configs = new FieldConfigs(settings);
+        assertTrue(configs.getConfig("test").isEmpty());
+        assertFalse(cacheFlag(configs, "foo"));
+        assertTrue(overwriteFlag(configs, "foo"));
+    }
+
+    /** Both flags may be combined with the pipe separator. */
+    public void test_cache_overwrite() {
+        final Map<String, String> settings = Maps.of("foo", "cache|overwrite");
+        final FieldConfigs configs = new FieldConfigs(settings);
+        assertTrue(configs.getConfig("test").isEmpty());
+        assertTrue(cacheFlag(configs, "foo"));
+        assertTrue(overwriteFlag(configs, "foo"));
+    }
+
+    /** Cache flag of the named field, or {@code false} when the field has no configuration. */
+    private static boolean cacheFlag(final FieldConfigs configs, final String fieldName) {
+        return configs.getConfig(fieldName).map(config -> config.isCache()).orElse(false);
+    }
+
+    /** Overwrite flag of the named field, or {@code false} when the field has no configuration. */
+    private static boolean overwriteFlag(final FieldConfigs configs, final String fieldName) {
+        return configs.getConfig(fieldName).map(config -> config.isOverwrite()).orElse(false);
+    }
+}