fix #2688 add fess-crawler-playwright

This commit is contained in:
Shinsuke Sugaya 2022-09-19 14:18:27 +09:00
parent 34d552f9e9
commit 0b154758bd
4 changed files with 67 additions and 1 deletions

View file

@ -1391,6 +1391,11 @@
<artifactId>fess-crawler-es</artifactId>
<version>${crawler.version}</version>
</dependency>
<dependency>
<groupId>org.codelibs.fess</groupId>
<artifactId>fess-crawler-playwright</artifactId>
<version>${crawler.version}</version>
</dependency>
<dependency>
<groupId>args4j</groupId>
<artifactId>args4j</artifactId>

View file

@ -275,7 +275,7 @@ public class FessCrawlerThread extends CrawlerThread {
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
final Map<String, String> clientConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.CLIENT);
final String value = clientConfigMap.get(CRAWLER_CLIENTS);
return getClientRuleList(value).stream().map(e -> {
final CrawlerClient client = getClientRuleList(value).stream().map(e -> {
if (e.getSecond().matcher(url).matches()) {
return e.getFirst();
}
@ -283,6 +283,10 @@ public class FessCrawlerThread extends CrawlerThread {
}).filter(StringUtil::isNotBlank).findFirst()//
.map(s -> clientFactory.getClient(s + ":" + url))//
.orElseGet(() -> clientFactory.getClient(url));
if (logger.isDebugEnabled()) {
logger.debug("CrawlerClient: {}", client.getClass().getCanonicalName());
}
return client;
}
protected List<Pair<String, Pattern>> getClientRuleList(final String value) {

View file

@ -420,6 +420,13 @@ public class WebFsIndexHelper {
}
}
}
// Close every crawler held in crawlerList before the list is cleared below.
// Each close is attempted independently: an Exception thrown by one crawler
// is caught inside the lambda, so the remaining crawlers are still closed.
crawlerList.forEach(crawler -> {
try {
crawler.close();
} catch (final Exception e) {
// Best-effort cleanup: report the failure as a warning and move on.
logger.warn("Failed to close the crawler.", e);
}
});
crawlerList.clear();
crawlerStatusList.clear();

View file

@ -0,0 +1,50 @@
/*
* Copyright 2012-2022 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.crawler;
import java.util.List;
import java.util.regex.Pattern;
import org.codelibs.core.misc.Pair;
import org.codelibs.fess.unit.UnitFessTestCase;
/**
 * Unit test for the client rule parsing done by {@link FessCrawlerThread}.
 */
public class FessCrawlerThreadTest extends UnitFessTestCase {

    /**
     * Verifies {@code getClientRuleList}: a null, empty or blank value gives
     * an empty list, while a comma-separated value gives one
     * (client name, URL pattern) pair per entry, in the order written.
     */
    public void test_getClientRuleList() {
        final FessCrawlerThread thread = new FessCrawlerThread();

        // Values without any rule: null, empty string, whitespace only.
        for (final String noRuleValue : new String[] { null, "", " " }) {
            assertEquals(0, thread.getClientRuleList(noRuleValue).size());
        }

        // A single "name:pattern" entry.
        final List<Pair<String, Pattern>> singleRule = thread.getClientRuleList("playwright:http://.*");
        assertEquals(1, singleRule.size());
        assertRule("playwright", "http://.*", singleRule.get(0));

        // Two entries separated by a comma keep their order.
        final List<Pair<String, Pattern>> twoRules = thread.getClientRuleList("playwright:http://.*,playwright:https://.*");
        assertEquals(2, twoRules.size());
        assertRule("playwright", "http://.*", twoRules.get(0));
        assertRule("playwright", "https://.*", twoRules.get(1));
    }

    /**
     * Asserts that a parsed rule carries the expected client name and regular expression.
     *
     * @param expectedName the client name expected as the first element of the pair
     * @param expectedRegex the regular expression source expected for the second element
     * @param rule the parsed rule to check
     */
    private void assertRule(final String expectedName, final String expectedRegex, final Pair<String, Pattern> rule) {
        assertEquals(expectedName, rule.getFirst());
        assertEquals(expectedRegex, rule.getSecond().pattern());
    }
}