diff --git a/src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java b/src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java
index ec5ad746f..d487f73aa 100644
--- a/src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java
+++ b/src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java
@@ -15,9 +15,11 @@
  */
 package org.codelibs.fess.crawler;
 
+import static org.codelibs.core.stream.StreamUtil.split;
 import static org.codelibs.core.stream.StreamUtil.stream;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -25,12 +27,15 @@ import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.codelibs.core.io.CloseableUtil;
 import org.codelibs.core.lang.StringUtil;
+import org.codelibs.core.misc.Pair;
 import org.codelibs.fess.app.service.FailureUrlService;
 import org.codelibs.fess.crawler.builder.RequestDataBuilder;
 import org.codelibs.fess.crawler.client.CrawlerClient;
@@ -40,6 +45,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
 import org.codelibs.fess.crawler.log.LogType;
 import org.codelibs.fess.es.client.SearchEngineClient;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig;
+import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
 import org.codelibs.fess.exception.ContainerNotAvailableException;
 import org.codelibs.fess.exception.ContentNotFoundException;
 import org.codelibs.fess.helper.CrawlingConfigHelper;
@@ -52,8 +58,13 @@ import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.DocumentUtil;
 
 public class FessCrawlerThread extends CrawlerThread {
+
     private static final Logger logger = LogManager.getLogger(FessCrawlerThread.class);
+    protected static final String CRAWLER_CLIENTS = "crawlerClients";
+
+    protected ConcurrentHashMap<String, Pair<String, Pattern>> clientRuleCache = new ConcurrentHashMap<>();
+
 
     @Override
     protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
         if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
@@ -257,4 +268,44 @@ public class FessCrawlerThread extends CrawlerThread {
             super.storeChildUrl(url, parentUrl, metaData, depth);
         }
     }
+
+    /**
+     * Selects the crawler client for the given URL. When the crawling config
+     * declares a "crawlerClients" parameter (comma-separated "name:regex"
+     * rules), the first rule whose pattern matches the URL wins and its name
+     * is prepended as "name:url" for the factory lookup; otherwise the
+     * default factory lookup on the plain URL applies.
+     */
+    @Override
+    protected CrawlerClient getClient(final String url) {
+        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
+        final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
+        final Map<String, String> clientConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.CLIENT);
+        final String value = clientConfigMap.get(CRAWLER_CLIENTS);
+        return getClientRuleList(value).stream().map(e -> {
+            if (e.getSecond().matcher(url).matches()) {
+                return e.getFirst();
+            }
+            return null;
+        }).filter(StringUtil::isNotBlank).findFirst()//
+                .map(s -> clientFactory.getClient(s + ":" + url))//
+                .orElseGet(() -> clientFactory.getClient(url));
+    }
+
+    /**
+     * Parses a comma-separated list of "name:regex" entries into client
+     * rules, caching compiled patterns per entry. Malformed entries
+     * (missing ':') are skipped and never appear in the returned list.
+     */
+    protected List<Pair<String, Pattern>> getClientRuleList(final String value) {
+        if (StringUtil.isBlank(value)) {
+            return Collections.emptyList();
+        }
+        return split(value, ",").get(stream -> stream.map(String::trim)//
+                .map(s -> clientRuleCache.computeIfAbsent(s, t -> {
+                    final String[] values = t.split(":", 2);
+                    if (values.length != 2) {
+                        return null;
+                    }
+                    return new Pair<>(values[0], Pattern.compile(values[1]));
+                })).filter(e -> e != null).toList()); // drop malformed (null) rules to avoid NPE in getClient
+    }
 }