fix #2687 add client.crawlerClients
This commit is contained in:
parent
cc5f91c6dd
commit
34d552f9e9
1 changed files with 41 additions and 0 deletions
|
@ -15,9 +15,11 @@
|
|||
*/
|
||||
package org.codelibs.fess.crawler;
|
||||
|
||||
import static org.codelibs.core.stream.StreamUtil.split;
|
||||
import static org.codelibs.core.stream.StreamUtil.stream;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
@ -25,12 +27,15 @@ import java.util.LinkedHashSet;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.codelibs.core.io.CloseableUtil;
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.core.misc.Pair;
|
||||
import org.codelibs.fess.app.service.FailureUrlService;
|
||||
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
|
||||
import org.codelibs.fess.crawler.client.CrawlerClient;
|
||||
|
@ -40,6 +45,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
|
|||
import org.codelibs.fess.crawler.log.LogType;
|
||||
import org.codelibs.fess.es.client.SearchEngineClient;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
|
||||
import org.codelibs.fess.exception.ContainerNotAvailableException;
|
||||
import org.codelibs.fess.exception.ContentNotFoundException;
|
||||
import org.codelibs.fess.helper.CrawlingConfigHelper;
|
||||
|
@ -52,8 +58,13 @@ import org.codelibs.fess.util.ComponentUtil;
|
|||
import org.codelibs.fess.util.DocumentUtil;
|
||||
|
||||
public class FessCrawlerThread extends CrawlerThread {
|
||||
|
||||
private static final Logger logger = LogManager.getLogger(FessCrawlerThread.class);
|
||||
|
||||
/** Key looked up in the client configuration parameter map; its value is the comma-separated client rule list. */
protected static final String CRAWLER_CLIENTS = "crawlerClients";
|
||||
|
||||
/**
 * Cache of parsed client rules, keyed by the trimmed rule text. Each value pairs the text
 * before the first ':' with the compiled pattern built from the text after it.
 */
protected ConcurrentHashMap<String, Pair<String, Pattern>> clientRuleCache = new ConcurrentHashMap<>();
|
||||
|
||||
@Override
|
||||
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
|
||||
if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
|
||||
|
@ -257,4 +268,34 @@ public class FessCrawlerThread extends CrawlerThread {
|
|||
super.storeChildUrl(url, parentUrl, metaData, depth);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected CrawlerClient getClient(final String url) {
|
||||
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
|
||||
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
|
||||
final Map<String, String> clientConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.CLIENT);
|
||||
final String value = clientConfigMap.get(CRAWLER_CLIENTS);
|
||||
return getClientRuleList(value).stream().map(e -> {
|
||||
if (e.getSecond().matcher(url).matches()) {
|
||||
return e.getFirst();
|
||||
}
|
||||
return null;
|
||||
}).filter(StringUtil::isNotBlank).findFirst()//
|
||||
.map(s -> clientFactory.getClient(s + ":" + url))//
|
||||
.orElseGet(() -> clientFactory.getClient(url));
|
||||
}
|
||||
|
||||
protected List<Pair<String, Pattern>> getClientRuleList(final String value) {
|
||||
if (StringUtil.isBlank(value)) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
return split(value, ",").get(stream -> stream.map(String::trim)//
|
||||
.map(s -> clientRuleCache.computeIfAbsent(s, t -> {
|
||||
final String[] values = t.split(":", 2);
|
||||
if (values.length != 2) {
|
||||
return null;
|
||||
}
|
||||
return new Pair<>(values[0], Pattern.compile(values[1]));
|
||||
})).toList());
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue