Ver código fonte

fix #2795 Updated user-agent version handling in crawler to use dynamic versioning based on Fess version

Shinsuke Sugaya 1 ano atrás
pai
commit
491e5a02c1

+ 4 - 0
src/main/java/org/codelibs/fess/Constants.java

@@ -79,6 +79,10 @@ public class Constants extends CoreLibConstants {
 
     public static final long DEFAULT_CRAWLING_EXECUTION_INTERVAL = 5000L;
 
+    public static final String CRAWLING_USER_AGENT_PREFIX = "Mozilla/5.0 (compatible; Fess/"; // part of the default crawler user agent that precedes the product version
+
+    public static final String CRAWLING_USER_AGENT_SUFFIX = "; +http://fess.codelibs.org/bot.html)"; // part of the default crawler user agent that follows the product version
+
     // fess properties
     public static final String USER_INFO_PROPERTY = "user.info";
 

+ 7 - 1
src/main/java/org/codelibs/fess/es/config/exentity/WebConfig.java

@@ -173,7 +173,13 @@ public class WebConfig extends BsWebConfig implements CrawlingConfig {
 
         final String userAgent = getUserAgent();
         if (StringUtil.isNotBlank(userAgent)) {
-            paramMap.put(Client.USER_AGENT, userAgent);
+            if (userAgent.startsWith(Constants.CRAWLING_USER_AGENT_PREFIX) && userAgent.endsWith(Constants.CRAWLING_USER_AGENT_SUFFIX)) {
+                paramMap.put(Client.USER_AGENT, fessConfig.getUserAgentName());
+            } else {
+                paramMap.put(Client.USER_AGENT, userAgent);
+            }
+        } else {
+            paramMap.put(Client.USER_AGENT, fessConfig.getUserAgentName());
         }
 
         final List<WebAuthentication> webAuthList = webAuthenticationService.getWebAuthenticationList(getId());

+ 2 - 2
src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java

@@ -694,8 +694,8 @@ public interface FessProp {
     }
 
     default String getUserAgentName() {
-        return getSystemProperty(Constants.CRAWLING_USER_AGENT_PROPERTY, "Mozilla/5.0 (compatible; Fess/"
-                + ComponentUtil.getSystemHelper().getProductVersion() + "; +http://fess.codelibs.org/bot.html)");
+        return getSystemProperty(Constants.CRAWLING_USER_AGENT_PROPERTY, Constants.CRAWLING_USER_AGENT_PREFIX
+                + ComponentUtil.getSystemHelper().getProductVersion() + Constants.CRAWLING_USER_AGENT_SUFFIX); // default argument is prefix + current product version + suffix
     }
 
     default void setLtrModelName(final String value) {

+ 8 - 1
src/main/java/org/codelibs/fess/util/ComponentUtil.java

@@ -88,6 +88,7 @@ import org.codelibs.fess.thumbnail.ThumbnailManager;
 import org.lastaflute.core.message.MessageManager;
 import org.lastaflute.core.security.PrimaryCipher;
 import org.lastaflute.di.core.SingletonLaContainer;
+import org.lastaflute.di.core.exception.AutoBindingFailureException;
 import org.lastaflute.di.core.exception.ComponentNotFoundException;
 import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
 import org.lastaflute.di.core.smart.hot.HotdeployUtil;
@@ -520,6 +521,7 @@ public final class ComponentUtil {
         return getComponent(RANK_FUSION_PROCESSOR);
     }
 
+    @SuppressWarnings("unchecked")
     public static <T> T getComponent(final Class<T> clazz) {
         try {
             return SingletonLaContainer.getComponent(clazz);
@@ -528,6 +530,11 @@ public final class ComponentUtil {
                 throw new ContainerNotAvailableException(clazz.getCanonicalName(), e);
             }
             throw new ContainerNotAvailableException(clazz.getCanonicalName());
+        } catch (final ComponentNotFoundException | AutoBindingFailureException e) {
+            if (componentMap.containsKey(clazz.getCanonicalName())) {
+                return (T) componentMap.get(clazz.getCanonicalName());
+            }
+            throw e;
         }
     }
 
@@ -540,7 +547,7 @@ public final class ComponentUtil {
                 throw new ContainerNotAvailableException(componentName, e);
             }
             throw new ContainerNotAvailableException(componentName);
-        } catch (final ComponentNotFoundException e) {
+        } catch (final ComponentNotFoundException | AutoBindingFailureException e) {
             if (componentMap.containsKey(componentName)) {
                 return (T) componentMap.get(componentName);
             }

+ 179 - 0
src/test/java/org/codelibs/fess/es/config/exentity/WebConfigTest.java

@@ -0,0 +1,179 @@
+/*
+ * Copyright 2012-2023 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.es.config.exentity;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.codelibs.core.lang.StringUtil;
+import org.codelibs.fess.Constants;
+import org.codelibs.fess.app.service.RequestHeaderService;
+import org.codelibs.fess.app.service.WebAuthenticationService;
+import org.codelibs.fess.crawler.client.CrawlerClientFactory;
+import org.codelibs.fess.crawler.client.http.Authentication;
+import org.codelibs.fess.helper.SystemHelper;
+import org.codelibs.fess.mylasta.direction.FessConfig;
+import org.codelibs.fess.mylasta.direction.FessProp;
+import org.codelibs.fess.unit.UnitFessTestCase;
+import org.codelibs.fess.util.ComponentUtil;
+import org.opensearch.common.SetOnce;
+
+public class WebConfigTest extends UnitFessTestCase { // unit tests for the client parameters (user agent, robots.txt flag, headers, auth) produced by WebConfig.initializeClientFactory
+
+    @Override
+    protected boolean isUseOneTimeContainer() {
+        return true; // NOTE(review): presumably requests a fresh DI container per test - confirm against UnitFessTestCase
+    }
+
+    public void test_initializeClientFactory() { // a stored user agent in the default Fess format must be replaced by one carrying the current product version
+        final Map<String, String> systemPropMap = new HashMap<>(); // backing store for the stubbed getSystemProperty below; left empty in this test
+        FessProp.propMap.clear(); // NOTE(review): presumably drops values cached by earlier tests - confirm what propMap holds
+        FessConfig fessConfig = new FessConfig.SimpleImpl() { // stub config: only the methods overridden here are customized
+            @Override
+            public String getSystemProperty(final String key, final String defaultValue) {
+                return systemPropMap.getOrDefault(key, defaultValue); // value from the test map, otherwise the supplied default
+            }
+
+            @Override
+            public boolean isCrawlerIgnoreRobotsTxt() {
+                return false;
+            }
+
+            @Override
+            public String getHttpProxyHost() {
+                return StringUtil.EMPTY;
+            }
+
+            @Override
+            public String getHttpProxyPort() {
+                return StringUtil.EMPTY;
+            }
+        };
+        ComponentUtil.setFessConfig(fessConfig);
+        SystemHelper systemHelper = new SystemHelper() { // fixed product version so the expected user agent is predictable
+            @Override
+            public String getProductVersion() {
+                return "98.76";
+            }
+        };
+        ComponentUtil.register(systemHelper, "systemHelper");
+        WebAuthenticationService webAuthenticationService = new WebAuthenticationService() { // stub: no authentication entries for any config id
+            @Override
+            public List<WebAuthentication> getWebAuthenticationList(final String webConfigId) {
+                return Collections.emptyList();
+            }
+        };
+        ComponentUtil.register(webAuthenticationService, WebAuthenticationService.class.getCanonicalName()); // keyed by canonical class name; presumably what a lookup by class falls back to - confirm in ComponentUtil
+        RequestHeaderService requestHeaderService = new RequestHeaderService() { // stub: no extra request headers for any config id
+            @Override
+            public List<RequestHeader> getRequestHeaderList(final String webConfigId) {
+                return Collections.emptyList();
+            }
+        };
+        ComponentUtil.register(requestHeaderService, RequestHeaderService.class.getCanonicalName());
+
+        final SetOnce<Map<String, Object>> initParamMapSet = new SetOnce<>(); // captures the parameter map handed to the factory
+        WebConfig webConfig = new WebConfig();
+        webConfig.setUserAgent(Constants.CRAWLING_USER_AGENT_PREFIX + "1.0" + Constants.CRAWLING_USER_AGENT_SUFFIX); // default-format agent pinned to version 1.0
+        CrawlerClientFactory crawlerClientFactory = webConfig.initializeClientFactory(() -> new CrawlerClientFactory() { // factory stub that only records the parameters it is given
+            public void setInitParameterMap(final Map<String, Object> params) {
+                initParamMapSet.set(params);
+            }
+        });
+        assertNotNull(crawlerClientFactory);
+        Map<String, Object> initParamMap = initParamMapSet.get();
+        assertNotNull(initParamMap);
+        assertEquals(0, ((org.codelibs.fess.crawler.client.http.RequestHeader[]) initParamMap.get("requestHeaders")).length);
+        assertEquals("Mozilla/5.0 (compatible; Fess/98.76; +http://fess.codelibs.org/bot.html)", initParamMap.get("userAgent")); // version is the stubbed 98.76, not the stored 1.0
+        assertEquals(0, ((Authentication[]) initParamMap.get("basicAuthentications")).length);
+        assertTrue(Boolean.valueOf(initParamMap.get("robotsTxtEnabled").toString()).booleanValue()); // expected enabled; presumably follows from isCrawlerIgnoreRobotsTxt() == false - confirm in WebConfig
+    }
+
+    public void test_initializeClientFactoryWithConfigParameter() { // a custom user agent is kept as-is and a client.* config parameter reaches the init parameter map
+        final Map<String, String> systemPropMap = new HashMap<>(); // backing store for the stubbed getSystemProperty below; left empty in this test
+        FessProp.propMap.clear(); // NOTE(review): presumably drops values cached by earlier tests - confirm what propMap holds
+        FessConfig fessConfig = new FessConfig.SimpleImpl() { // stub config: only the methods overridden here are customized
+            @Override
+            public String getSystemProperty(final String key, final String defaultValue) {
+                return systemPropMap.getOrDefault(key, defaultValue); // value from the test map, otherwise the supplied default
+            }
+
+            @Override
+            public boolean isCrawlerIgnoreRobotsTxt() {
+                return false;
+            }
+
+            @Override
+            public String getHttpProxyHost() {
+                return StringUtil.EMPTY;
+            }
+
+            @Override
+            public String getHttpProxyPort() {
+                return StringUtil.EMPTY;
+            }
+
+            @Override
+            public String getAppEncryptPropertyPattern() {
+                return ".*password|.*key|.*token|.*secret"; // NOTE(review): presumably consulted while the config parameter is parsed - confirm
+            }
+        };
+        ComponentUtil.setFessConfig(fessConfig);
+        SystemHelper systemHelper = new SystemHelper() { // fixed product version, same stub as in the test above
+            @Override
+            public String getProductVersion() {
+                return "98.76";
+            }
+        };
+        ComponentUtil.register(systemHelper, "systemHelper");
+        WebAuthenticationService webAuthenticationService = new WebAuthenticationService() { // stub: no authentication entries for any config id
+            @Override
+            public List<WebAuthentication> getWebAuthenticationList(final String webConfigId) {
+                return Collections.emptyList();
+            }
+        };
+        ComponentUtil.register(webAuthenticationService, WebAuthenticationService.class.getCanonicalName());
+        RequestHeaderService requestHeaderService = new RequestHeaderService() { // stub: no extra request headers for any config id
+            @Override
+            public List<RequestHeader> getRequestHeaderList(final String webConfigId) {
+                return Collections.emptyList();
+            }
+        };
+        ComponentUtil.register(requestHeaderService, RequestHeaderService.class.getCanonicalName());
+
+        final SetOnce<Map<String, Object>> initParamMapSet = new SetOnce<>(); // captures the parameter map handed to the factory
+        WebConfig webConfig = new WebConfig();
+        final String userAgent = "TestAgent"; // does not match the default prefix/suffix pattern
+        webConfig.setUserAgent(userAgent);
+        webConfig.setConfigParameter("""
+                client.robotsTxtEnabled=false
+                """); // the assertion on robotsTxtEnabled below expects this entry to take effect
+        CrawlerClientFactory crawlerClientFactory = webConfig.initializeClientFactory(() -> new CrawlerClientFactory() { // factory stub that only records the parameters it is given
+            public void setInitParameterMap(final Map<String, Object> params) {
+                initParamMapSet.set(params);
+            }
+        });
+        assertNotNull(crawlerClientFactory);
+        Map<String, Object> initParamMap = initParamMapSet.get();
+        assertNotNull(initParamMap);
+        assertEquals(0, ((org.codelibs.fess.crawler.client.http.RequestHeader[]) initParamMap.get("requestHeaders")).length);
+        assertEquals(userAgent, initParamMap.get("userAgent")); // custom agent must pass through unchanged
+        assertEquals(0, ((Authentication[]) initParamMap.get("basicAuthentications")).length);
+        assertFalse(Boolean.valueOf(initParamMap.get("robotsTxtEnabled").toString()).booleanValue()); // disabled by the config parameter set above
+    }
+}

+ 28 - 0
src/test/java/org/codelibs/fess/mylasta/direction/FessPropTest.java

@@ -21,10 +21,14 @@ import java.io.IOException;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Locale;
+import java.util.Map;
 
 import org.codelibs.core.io.FileUtil;
 import org.codelibs.core.misc.DynamicProperties;
+import org.codelibs.fess.Constants;
+import org.codelibs.fess.helper.SystemHelper;
 import org.codelibs.fess.unit.UnitFessTestCase;
+import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.PrunedTag;
 import org.codelibs.nekohtml.parsers.DOMParser;
 import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
@@ -262,6 +266,30 @@ public class FessPropTest extends UnitFessTestCase {
         assertFalse(fessConfig.isValidUserCode("123456789?"));
     }
 
+    public void test_getUserAgentName() throws IOException { // the default user agent embeds the product version; a system property value replaces it
+        final Map<String, String> systemPropMap = new HashMap<>(); // backing store for the stubbed getSystemProperty below
+        FessProp.propMap.clear(); // NOTE(review): presumably drops values cached by earlier tests - confirm what propMap holds
+        FessConfig fessConfig = new FessConfig.SimpleImpl() { // stub config that reads system properties from the local map
+            @Override
+            public String getSystemProperty(final String key, final String defaultValue) {
+                return systemPropMap.getOrDefault(key, defaultValue); // value from the test map, otherwise the supplied default
+            }
+        };
+        ComponentUtil.setFessConfig(fessConfig);
+        SystemHelper systemHelper = new SystemHelper() { // fixed product version so the expected string is predictable
+            @Override
+            public String getProductVersion() {
+                return "98.76";
+            }
+        };
+        ComponentUtil.register(systemHelper, "systemHelper");
+
+        assertEquals("Mozilla/5.0 (compatible; Fess/98.76; +http://fess.codelibs.org/bot.html)", fessConfig.getUserAgentName()); // map is empty: prefix + stubbed version + suffix
+
+        systemPropMap.put(Constants.CRAWLING_USER_AGENT_PROPERTY, "TestAgent"); // now the property is present in the map
+        assertEquals("TestAgent", fessConfig.getUserAgentName()); // the stored value wins over the generated default
+    }
+
     private void assertArrays(final String[] expected, final String[] actual) {
         Arrays.sort(expected);
         Arrays.sort(actual);