瀏覽代碼

fix #2688 add fess-crawler-playwright

Shinsuke Sugaya 2 年之前
父節點
當前提交
0b154758bd

+ 5 - 0
pom.xml

@@ -1391,6 +1391,11 @@
 			<artifactId>fess-crawler-es</artifactId>
 			<artifactId>fess-crawler-es</artifactId>
 			<version>${crawler.version}</version>
 			<version>${crawler.version}</version>
 		</dependency>
 		</dependency>
+		<dependency>
+			<groupId>org.codelibs.fess</groupId>
+			<artifactId>fess-crawler-playwright</artifactId>
+			<version>${crawler.version}</version>
+		</dependency>
 		<dependency>
 		<dependency>
 			<groupId>args4j</groupId>
 			<groupId>args4j</groupId>
 			<artifactId>args4j</artifactId>
 			<artifactId>args4j</artifactId>

+ 5 - 1
src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java

@@ -275,7 +275,7 @@ public class FessCrawlerThread extends CrawlerThread {
         final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
         final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
         final Map<String, String> clientConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.CLIENT);
         final Map<String, String> clientConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.CLIENT);
         final String value = clientConfigMap.get(CRAWLER_CLIENTS);
         final String value = clientConfigMap.get(CRAWLER_CLIENTS);
-        return getClientRuleList(value).stream().map(e -> {
+        final CrawlerClient client = getClientRuleList(value).stream().map(e -> {
             if (e.getSecond().matcher(url).matches()) {
             if (e.getSecond().matcher(url).matches()) {
                 return e.getFirst();
                 return e.getFirst();
             }
             }
@@ -283,6 +283,10 @@ public class FessCrawlerThread extends CrawlerThread {
         }).filter(StringUtil::isNotBlank).findFirst()//
         }).filter(StringUtil::isNotBlank).findFirst()//
                 .map(s -> clientFactory.getClient(s + ":" + url))//
                 .map(s -> clientFactory.getClient(s + ":" + url))//
                 .orElseGet(() -> clientFactory.getClient(url));
                 .orElseGet(() -> clientFactory.getClient(url));
+        if (logger.isDebugEnabled()) {
+            logger.debug("CrawlerClient: {}", client.getClass().getCanonicalName());
+        }
+        return client;
     }
     }
 
 
     protected List<Pair<String, Pattern>> getClientRuleList(final String value) {
     protected List<Pair<String, Pattern>> getClientRuleList(final String value) {

+ 7 - 0
src/main/java/org/codelibs/fess/helper/WebFsIndexHelper.java

@@ -420,6 +420,13 @@ public class WebFsIndexHelper {
                 }
                 }
             }
             }
         }
         }
+        crawlerList.forEach(crawler -> {
+            try {
+                crawler.close();
+            } catch (final Exception e) {
+                logger.warn("Failed to close the crawler.", e);
+            }
+        });
         crawlerList.clear();
         crawlerList.clear();
         crawlerStatusList.clear();
         crawlerStatusList.clear();
 
 

+ 50 - 0
src/test/java/org/codelibs/fess/crawler/FessCrawlerThreadTest.java

@@ -0,0 +1,50 @@
+/*
+ * Copyright 2012-2022 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.crawler;
+
+import java.util.List;
+import java.util.regex.Pattern;
+
+import org.codelibs.core.misc.Pair;
+import org.codelibs.fess.unit.UnitFessTestCase;
+
+public class FessCrawlerThreadTest extends UnitFessTestCase { // Unit test for FessCrawlerThread's client-rule parsing.
+
+    public void test_getClientRuleList() { // Checks the (client name, URL pattern) pairs returned by getClientRuleList.
+        FessCrawlerThread crawlerThread = new FessCrawlerThread();
+
+        List<Pair<String, Pattern>> list = crawlerThread.getClientRuleList(null); // null input: an empty list is expected, not null.
+        assertEquals(0, list.size());
+
+        list = crawlerThread.getClientRuleList(""); // empty string: no rules expected.
+        assertEquals(0, list.size());
+
+        list = crawlerThread.getClientRuleList(" "); // whitespace-only string: no rules expected.
+        assertEquals(0, list.size());
+
+        list = crawlerThread.getClientRuleList("playwright:http://.*"); // single rule in "<name>:<regex>" form.
+        assertEquals(1, list.size());
+        assertEquals("playwright", list.get(0).getFirst()); // name is the text before the first ':'.
+        assertEquals("http://.*", list.get(0).getSecond().pattern()); // the regex keeps its own ':' (only the first colon separates).
+
+        list = crawlerThread.getClientRuleList("playwright:http://.*,playwright:https://.*"); // two comma-separated rules.
+        assertEquals(2, list.size());
+        assertEquals("playwright", list.get(0).getFirst()); // rules are expected in input order.
+        assertEquals("http://.*", list.get(0).getSecond().pattern());
+        assertEquals("playwright", list.get(1).getFirst());
+        assertEquals("https://.*", list.get(1).getSecond().pattern());
+    }
+}