Преглед изворног кода

fix #2687 add client.crawlerClients

Shinsuke Sugaya пре 2 година
родитељ
комит
34d552f9e9
1 измењених фајлова са 41 додато и 0 уклоњено
  1. 41 0
      src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java

+ 41 - 0
src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java

@@ -15,9 +15,11 @@
  */
 package org.codelibs.fess.crawler;
 
+import static org.codelibs.core.stream.StreamUtil.split;
 import static org.codelibs.core.stream.StreamUtil.stream;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -25,12 +27,15 @@ import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.regex.Pattern;
 import java.util.stream.Collectors;
 
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.codelibs.core.io.CloseableUtil;
 import org.codelibs.core.lang.StringUtil;
+import org.codelibs.core.misc.Pair;
 import org.codelibs.fess.app.service.FailureUrlService;
 import org.codelibs.fess.crawler.builder.RequestDataBuilder;
 import org.codelibs.fess.crawler.client.CrawlerClient;
@@ -40,6 +45,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
 import org.codelibs.fess.crawler.log.LogType;
 import org.codelibs.fess.es.client.SearchEngineClient;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig;
+import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
 import org.codelibs.fess.exception.ContainerNotAvailableException;
 import org.codelibs.fess.exception.ContentNotFoundException;
 import org.codelibs.fess.helper.CrawlingConfigHelper;
@@ -52,8 +58,13 @@ import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.DocumentUtil;
 
 public class FessCrawlerThread extends CrawlerThread {
+
     private static final Logger logger = LogManager.getLogger(FessCrawlerThread.class);
 
+    protected static final String CRAWLER_CLIENTS = "crawlerClients";
+
+    protected ConcurrentHashMap<String, Pair<String, Pattern>> clientRuleCache = new ConcurrentHashMap<>();
+
     @Override
     protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
         if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
@@ -257,4 +268,34 @@ public class FessCrawlerThread extends CrawlerThread {
             super.storeChildUrl(url, parentUrl, metaData, depth);
         }
     }
+
+    @Override
+    protected CrawlerClient getClient(final String url) {
+        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
+        final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
+        final Map<String, String> clientConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.CLIENT);
+        final String value = clientConfigMap.get(CRAWLER_CLIENTS);
+        return getClientRuleList(value).stream().map(e -> {
+            if (e.getSecond().matcher(url).matches()) {
+                return e.getFirst();
+            }
+            return null;
+        }).filter(StringUtil::isNotBlank).findFirst()//
+                .map(s -> clientFactory.getClient(s + ":" + url))//
+                .orElseGet(() -> clientFactory.getClient(url));
+    }
+
+    protected List<Pair<String, Pattern>> getClientRuleList(final String value) {
+        if (StringUtil.isBlank(value)) {
+            return Collections.emptyList();
+        }
+        return split(value, ",").get(stream -> stream.map(String::trim)//
+                .map(s -> clientRuleCache.computeIfAbsent(s, t -> {
+                    final String[] values = t.split(":", 2);
+                    if (values.length != 2) {
+                        return null;
+                    }
+                    return new Pair<>(values[0], Pattern.compile(values[1]));
+                })).toList());
+    }
 }