fix #2687 add client.crawlerClients

This commit is contained in:
Shinsuke Sugaya 2022-09-17 14:08:54 +09:00
parent cc5f91c6dd
commit 34d552f9e9

View file

@ -15,9 +15,11 @@
*/
package org.codelibs.fess.crawler;
import static org.codelibs.core.stream.StreamUtil.split;
import static org.codelibs.core.stream.StreamUtil.stream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@ -25,12 +27,15 @@ import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.codelibs.core.io.CloseableUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.misc.Pair;
import org.codelibs.fess.app.service.FailureUrlService;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.client.CrawlerClient;
@ -40,6 +45,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.log.LogType;
import org.codelibs.fess.es.client.SearchEngineClient;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.exception.ContainerNotAvailableException;
import org.codelibs.fess.exception.ContentNotFoundException;
import org.codelibs.fess.helper.CrawlingConfigHelper;
@ -52,8 +58,13 @@ import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.DocumentUtil;
public class FessCrawlerThread extends CrawlerThread {
private static final Logger logger = LogManager.getLogger(FessCrawlerThread.class);
/** Key looked up in the {@code ConfigName.CLIENT} parameter map; its value is a comma-separated list of {@code prefix:regex} client rules. */
protected static final String CRAWLER_CLIENTS = "crawlerClients";
/**
 * Cache of parsed client rules, keyed by the trimmed rule text. Each value pairs the
 * text before the first {@code ':'} (used as a prefix on the URL when asking the
 * client factory for a client) with the compiled pattern built from the remainder.
 */
protected ConcurrentHashMap<String, Pair<String, Pattern>> clientRuleCache = new ConcurrentHashMap<>();
@Override
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
@ -257,4 +268,34 @@ public class FessCrawlerThread extends CrawlerThread {
super.storeChildUrl(url, parentUrl, metaData, depth);
}
}
/**
 * Resolves the crawler client for the given URL, honoring the {@code crawlerClients}
 * rules found in the {@code ConfigName.CLIENT} parameters of the crawling config that
 * belongs to the current session.
 *
 * <p>The rules are evaluated in order. The first rule whose pattern matches the whole
 * URL and whose prefix is not blank wins; the client is then requested from the
 * factory with {@code prefix + ":" + url}. When no rule applies (or the factory has
 * no client for the prefixed URL), the client for the plain URL is used.</p>
 *
 * @param url the URL about to be crawled
 * @return the client selected by the first matching rule, otherwise the factory's
 *         client for the plain URL
 */
@Override
protected CrawlerClient getClient(final String url) {
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
    final Map<String, String> clientConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.CLIENT);
    final String value = clientConfigMap.get(CRAWLER_CLIENTS);
    return getClientRuleList(value).stream()//
            // A rule that could not be parsed may show up as null; skip it instead of failing with an NPE.
            .filter(rule -> rule != null && rule.getSecond().matcher(url).matches())//
            .map(Pair::getFirst)//
            .filter(StringUtil::isNotBlank)//
            .findFirst()//
            .map(prefix -> clientFactory.getClient(prefix + ":" + url))//
            .orElseGet(() -> clientFactory.getClient(url));
}
/**
 * Parses a comma-separated list of {@code prefix:regex} rules into pairs of prefix
 * and compiled pattern, reusing already parsed rules from {@code clientRuleCache}.
 *
 * <p>Each token is trimmed and split at its first {@code ':'}. Blank tokens (for
 * example from a trailing comma) and tokens without a {@code ':'} are left out of
 * the result, so the returned list never contains {@code null}.</p>
 *
 * @param value the raw rule list; may be {@code null} or blank
 * @return the parsed rules in their original order; empty when {@code value} is blank
 */
protected List<Pair<String, Pattern>> getClientRuleList(final String value) {
    if (StringUtil.isBlank(value)) {
        return Collections.emptyList();
    }
    return split(value, ",").get(stream -> stream.map(String::trim)//
            // Empty tokens can never form a rule; drop them before touching the cache.
            .filter(StringUtil::isNotBlank)//
            .map(s -> clientRuleCache.computeIfAbsent(s, t -> {
                final String[] values = t.split(":", 2);
                if (values.length != 2) {
                    // Returning null records no mapping and makes computeIfAbsent yield null.
                    return null;
                }
                return new Pair<>(values[0], Pattern.compile(values[1]));
            }))//
            // Malformed rules come back as null from the cache lookup; keep them out of the list.
            .filter(rule -> rule != null)//
            .toList());
}
}