fix #2795 Updated user-agent version handling in crawler to use dynamic versioning based on Fess version
This commit is contained in:
parent
ab89d042f7
commit
491e5a02c1
6 changed files with 228 additions and 4 deletions
|
@ -79,6 +79,10 @@ public class Constants extends CoreLibConstants {
|
|||
|
||||
public static final long DEFAULT_CRAWLING_EXECUTION_INTERVAL = 5000L;
|
||||
|
||||
/**
 * Leading part of the default crawler User-Agent string. The default value is built as this prefix,
 * followed by the product version, followed by {@link #CRAWLING_USER_AGENT_SUFFIX}.
 */
public static final String CRAWLING_USER_AGENT_PREFIX = "Mozilla/5.0 (compatible; Fess/";
|
||||
|
||||
/**
 * Trailing part of the default crawler User-Agent string; it is appended after
 * {@link #CRAWLING_USER_AGENT_PREFIX} and the product version.
 */
public static final String CRAWLING_USER_AGENT_SUFFIX = "; +http://fess.codelibs.org/bot.html)";
|
||||
|
||||
// fess properties
|
||||
public static final String USER_INFO_PROPERTY = "user.info";
|
||||
|
||||
|
|
|
@ -173,7 +173,13 @@ public class WebConfig extends BsWebConfig implements CrawlingConfig {
|
|||
|
||||
final String userAgent = getUserAgent();
|
||||
if (StringUtil.isNotBlank(userAgent)) {
|
||||
paramMap.put(Client.USER_AGENT, userAgent);
|
||||
if (userAgent.startsWith(Constants.CRAWLING_USER_AGENT_PREFIX) && userAgent.endsWith(Constants.CRAWLING_USER_AGENT_SUFFIX)) {
|
||||
paramMap.put(Client.USER_AGENT, fessConfig.getUserAgentName());
|
||||
} else {
|
||||
paramMap.put(Client.USER_AGENT, userAgent);
|
||||
}
|
||||
} else {
|
||||
paramMap.put(Client.USER_AGENT, fessConfig.getUserAgentName());
|
||||
}
|
||||
|
||||
final List<WebAuthentication> webAuthList = webAuthenticationService.getWebAuthenticationList(getId());
|
||||
|
|
|
@ -694,8 +694,8 @@ public interface FessProp {
|
|||
}
|
||||
|
||||
default String getUserAgentName() {
|
||||
return getSystemProperty(Constants.CRAWLING_USER_AGENT_PROPERTY, "Mozilla/5.0 (compatible; Fess/"
|
||||
+ ComponentUtil.getSystemHelper().getProductVersion() + "; +http://fess.codelibs.org/bot.html)");
|
||||
return getSystemProperty(Constants.CRAWLING_USER_AGENT_PROPERTY, Constants.CRAWLING_USER_AGENT_PREFIX
|
||||
+ ComponentUtil.getSystemHelper().getProductVersion() + Constants.CRAWLING_USER_AGENT_SUFFIX);
|
||||
}
|
||||
|
||||
default void setLtrModelName(final String value) {
|
||||
|
|
|
@ -88,6 +88,7 @@ import org.codelibs.fess.thumbnail.ThumbnailManager;
|
|||
import org.lastaflute.core.message.MessageManager;
|
||||
import org.lastaflute.core.security.PrimaryCipher;
|
||||
import org.lastaflute.di.core.SingletonLaContainer;
|
||||
import org.lastaflute.di.core.exception.AutoBindingFailureException;
|
||||
import org.lastaflute.di.core.exception.ComponentNotFoundException;
|
||||
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
|
||||
import org.lastaflute.di.core.smart.hot.HotdeployUtil;
|
||||
|
@ -520,6 +521,7 @@ public final class ComponentUtil {
|
|||
return getComponent(RANK_FUSION_PROCESSOR);
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
public static <T> T getComponent(final Class<T> clazz) {
|
||||
try {
|
||||
return SingletonLaContainer.getComponent(clazz);
|
||||
|
@ -528,6 +530,11 @@ public final class ComponentUtil {
|
|||
throw new ContainerNotAvailableException(clazz.getCanonicalName(), e);
|
||||
}
|
||||
throw new ContainerNotAvailableException(clazz.getCanonicalName());
|
||||
} catch (final ComponentNotFoundException | AutoBindingFailureException e) {
|
||||
if (componentMap.containsKey(clazz.getCanonicalName())) {
|
||||
return (T) componentMap.get(clazz.getCanonicalName());
|
||||
}
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -540,7 +547,7 @@ public final class ComponentUtil {
|
|||
throw new ContainerNotAvailableException(componentName, e);
|
||||
}
|
||||
throw new ContainerNotAvailableException(componentName);
|
||||
} catch (final ComponentNotFoundException e) {
|
||||
} catch (final ComponentNotFoundException | AutoBindingFailureException e) {
|
||||
if (componentMap.containsKey(componentName)) {
|
||||
return (T) componentMap.get(componentName);
|
||||
}
|
||||
|
|
|
@ -0,0 +1,179 @@
|
|||
/*
|
||||
* Copyright 2012-2023 CodeLibs Project and the Others.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific language
|
||||
* governing permissions and limitations under the License.
|
||||
*/
|
||||
package org.codelibs.fess.es.config.exentity;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.fess.Constants;
|
||||
import org.codelibs.fess.app.service.RequestHeaderService;
|
||||
import org.codelibs.fess.app.service.WebAuthenticationService;
|
||||
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
|
||||
import org.codelibs.fess.crawler.client.http.Authentication;
|
||||
import org.codelibs.fess.helper.SystemHelper;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.mylasta.direction.FessProp;
|
||||
import org.codelibs.fess.unit.UnitFessTestCase;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.opensearch.common.SetOnce;
|
||||
|
||||
public class WebConfigTest extends UnitFessTestCase {
|
||||
|
||||
@Override
|
||||
protected boolean isUseOneTimeContainer() {
|
||||
return true;
|
||||
}
|
||||
|
||||
public void test_initializeClientFactory() {
|
||||
final Map<String, String> systemPropMap = new HashMap<>();
|
||||
FessProp.propMap.clear();
|
||||
FessConfig fessConfig = new FessConfig.SimpleImpl() {
|
||||
@Override
|
||||
public String getSystemProperty(final String key, final String defaultValue) {
|
||||
return systemPropMap.getOrDefault(key, defaultValue);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isCrawlerIgnoreRobotsTxt() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getHttpProxyHost() {
|
||||
return StringUtil.EMPTY;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getHttpProxyPort() {
|
||||
return StringUtil.EMPTY;
|
||||
}
|
||||
};
|
||||
ComponentUtil.setFessConfig(fessConfig);
|
||||
SystemHelper systemHelper = new SystemHelper() {
|
||||
@Override
|
||||
public String getProductVersion() {
|
||||
return "98.76";
|
||||
}
|
||||
};
|
||||
ComponentUtil.register(systemHelper, "systemHelper");
|
||||
WebAuthenticationService webAuthenticationService = new WebAuthenticationService() {
|
||||
@Override
|
||||
public List<WebAuthentication> getWebAuthenticationList(final String webConfigId) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
};
|
||||
ComponentUtil.register(webAuthenticationService, WebAuthenticationService.class.getCanonicalName());
|
||||
RequestHeaderService requestHeaderService = new RequestHeaderService() {
|
||||
@Override
|
||||
public List<RequestHeader> getRequestHeaderList(final String webConfigId) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
};
|
||||
ComponentUtil.register(requestHeaderService, RequestHeaderService.class.getCanonicalName());
|
||||
|
||||
final SetOnce<Map<String, Object>> initParamMapSet = new SetOnce<>();
|
||||
WebConfig webConfig = new WebConfig();
|
||||
webConfig.setUserAgent(Constants.CRAWLING_USER_AGENT_PREFIX + "1.0" + Constants.CRAWLING_USER_AGENT_SUFFIX);
|
||||
CrawlerClientFactory crawlerClientFactory = webConfig.initializeClientFactory(() -> new CrawlerClientFactory() {
|
||||
public void setInitParameterMap(final Map<String, Object> params) {
|
||||
initParamMapSet.set(params);
|
||||
}
|
||||
});
|
||||
assertNotNull(crawlerClientFactory);
|
||||
Map<String, Object> initParamMap = initParamMapSet.get();
|
||||
assertNotNull(initParamMap);
|
||||
assertEquals(0, ((org.codelibs.fess.crawler.client.http.RequestHeader[]) initParamMap.get("requestHeaders")).length);
|
||||
assertEquals("Mozilla/5.0 (compatible; Fess/98.76; +http://fess.codelibs.org/bot.html)", initParamMap.get("userAgent"));
|
||||
assertEquals(0, ((Authentication[]) initParamMap.get("basicAuthentications")).length);
|
||||
assertTrue(Boolean.valueOf(initParamMap.get("robotsTxtEnabled").toString()).booleanValue());
|
||||
}
|
||||
|
||||
public void test_initializeClientFactoryWithConfigParameter() {
|
||||
final Map<String, String> systemPropMap = new HashMap<>();
|
||||
FessProp.propMap.clear();
|
||||
FessConfig fessConfig = new FessConfig.SimpleImpl() {
|
||||
@Override
|
||||
public String getSystemProperty(final String key, final String defaultValue) {
|
||||
return systemPropMap.getOrDefault(key, defaultValue);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isCrawlerIgnoreRobotsTxt() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getHttpProxyHost() {
|
||||
return StringUtil.EMPTY;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getHttpProxyPort() {
|
||||
return StringUtil.EMPTY;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getAppEncryptPropertyPattern() {
|
||||
return ".*password|.*key|.*token|.*secret";
|
||||
}
|
||||
};
|
||||
ComponentUtil.setFessConfig(fessConfig);
|
||||
SystemHelper systemHelper = new SystemHelper() {
|
||||
@Override
|
||||
public String getProductVersion() {
|
||||
return "98.76";
|
||||
}
|
||||
};
|
||||
ComponentUtil.register(systemHelper, "systemHelper");
|
||||
WebAuthenticationService webAuthenticationService = new WebAuthenticationService() {
|
||||
@Override
|
||||
public List<WebAuthentication> getWebAuthenticationList(final String webConfigId) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
};
|
||||
ComponentUtil.register(webAuthenticationService, WebAuthenticationService.class.getCanonicalName());
|
||||
RequestHeaderService requestHeaderService = new RequestHeaderService() {
|
||||
@Override
|
||||
public List<RequestHeader> getRequestHeaderList(final String webConfigId) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
};
|
||||
ComponentUtil.register(requestHeaderService, RequestHeaderService.class.getCanonicalName());
|
||||
|
||||
final SetOnce<Map<String, Object>> initParamMapSet = new SetOnce<>();
|
||||
WebConfig webConfig = new WebConfig();
|
||||
final String userAgent = "TestAgent";
|
||||
webConfig.setUserAgent(userAgent);
|
||||
webConfig.setConfigParameter("""
|
||||
client.robotsTxtEnabled=false
|
||||
""");
|
||||
CrawlerClientFactory crawlerClientFactory = webConfig.initializeClientFactory(() -> new CrawlerClientFactory() {
|
||||
public void setInitParameterMap(final Map<String, Object> params) {
|
||||
initParamMapSet.set(params);
|
||||
}
|
||||
});
|
||||
assertNotNull(crawlerClientFactory);
|
||||
Map<String, Object> initParamMap = initParamMapSet.get();
|
||||
assertNotNull(initParamMap);
|
||||
assertEquals(0, ((org.codelibs.fess.crawler.client.http.RequestHeader[]) initParamMap.get("requestHeaders")).length);
|
||||
assertEquals(userAgent, initParamMap.get("userAgent"));
|
||||
assertEquals(0, ((Authentication[]) initParamMap.get("basicAuthentications")).length);
|
||||
assertFalse(Boolean.valueOf(initParamMap.get("robotsTxtEnabled").toString()).booleanValue());
|
||||
}
|
||||
}
|
|
@ -21,10 +21,14 @@ import java.io.IOException;
|
|||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import org.codelibs.core.io.FileUtil;
|
||||
import org.codelibs.core.misc.DynamicProperties;
|
||||
import org.codelibs.fess.Constants;
|
||||
import org.codelibs.fess.helper.SystemHelper;
|
||||
import org.codelibs.fess.unit.UnitFessTestCase;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.codelibs.fess.util.PrunedTag;
|
||||
import org.codelibs.nekohtml.parsers.DOMParser;
|
||||
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
|
||||
|
@ -262,6 +266,30 @@ public class FessPropTest extends UnitFessTestCase {
|
|||
assertFalse(fessConfig.isValidUserCode("123456789?"));
|
||||
}
|
||||
|
||||
public void test_getUserAgentName() throws IOException {
|
||||
final Map<String, String> systemPropMap = new HashMap<>();
|
||||
FessProp.propMap.clear();
|
||||
FessConfig fessConfig = new FessConfig.SimpleImpl() {
|
||||
@Override
|
||||
public String getSystemProperty(final String key, final String defaultValue) {
|
||||
return systemPropMap.getOrDefault(key, defaultValue);
|
||||
}
|
||||
};
|
||||
ComponentUtil.setFessConfig(fessConfig);
|
||||
SystemHelper systemHelper = new SystemHelper() {
|
||||
@Override
|
||||
public String getProductVersion() {
|
||||
return "98.76";
|
||||
}
|
||||
};
|
||||
ComponentUtil.register(systemHelper, "systemHelper");
|
||||
|
||||
assertEquals("Mozilla/5.0 (compatible; Fess/98.76; +http://fess.codelibs.org/bot.html)", fessConfig.getUserAgentName());
|
||||
|
||||
systemPropMap.put(Constants.CRAWLING_USER_AGENT_PROPERTY, "TestAgent");
|
||||
assertEquals("TestAgent", fessConfig.getUserAgentName());
|
||||
}
|
||||
|
||||
private void assertArrays(final String[] expected, final String[] actual) {
|
||||
Arrays.sort(expected);
|
||||
Arrays.sort(actual);
|
||||
|
|
Loading…
Add table
Reference in a new issue