fix #2852 Add support for specifying HTML child link extraction tags via html.child.url.rules in crawl settings.

This commit is contained in:
Shinsuke Sugaya 2024-10-24 22:01:38 +09:00
parent 706d952ad1
commit 6ca8f890fe
3 changed files with 40 additions and 0 deletions

View file

@ -21,6 +21,7 @@ import java.io.BufferedInputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
@ -32,6 +33,7 @@ import java.util.Map;
import java.util.Set;
import java.util.function.UnaryOperator;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.xml.xpath.XPathEvaluationResult;
import javax.xml.xpath.XPathExpressionException;
@ -42,7 +44,9 @@ import org.apache.logging.log4j.Logger;
import org.codelibs.core.io.InputStreamUtil;
import org.codelibs.core.io.SerializeUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.misc.Pair;
import org.codelibs.core.misc.ValueHolder;
import org.codelibs.core.stream.StreamUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.entity.AccessResultData;
@ -777,6 +781,17 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return new URL(currentUrl);
}
/**
 * Returns the rules used to extract child URLs from an HTML document.
 *
 * <p>The rules are read from the {@code html.child.url.rules} entry of the crawl
 * configuration's {@link ConfigName#CONFIG} parameter map. The value is a
 * comma-separated list of {@code xpath:attribute} entries, for example
 * {@code "//A:href,//AREA:href"}; whitespace around each part is ignored.
 * When the entry is missing or blank, the transformer's own
 * {@code childUrlRuleMap} is used instead.</p>
 *
 * <p>Malformed entries are skipped silently: an entry that does not split into
 * exactly two parts on {@code ':'}, or whose XPath or attribute part is empty
 * after trimming. Because {@code ','} and {@code ':'} are the delimiters, an
 * XPath that itself contains either character (for example an axis such as
 * {@code descendant::A}) cannot be expressed in this setting.</p>
 *
 * @param responseData the response being transformed; passed to
 *        {@code getConfigPrameterMap} to look up the crawl configuration
 * @param resultData the result being built (not used by this implementation)
 * @return a stream of pairs whose first element is the part before the colon
 *         (the XPath) and whose second element is the part after it (the attribute)
 */
@Override
protected Stream<Pair<String, String>> getChildUrlRules(final ResponseData responseData, final ResultData resultData) {
    final Map<String, String> configMap = getConfigPrameterMap(responseData, ConfigName.CONFIG);
    final String ruleString = configMap.get(Config.HTML_CHILD_URL_RULES);
    if (StringUtil.isBlank(ruleString)) {
        // Nothing configured for this crawl: fall back to the transformer-level rules.
        return childUrlRuleMap.entrySet().stream().map(e -> new Pair<>(e.getKey(), e.getValue()));
    }
    return Arrays.stream(ruleString.split(","))
            .map(s -> s.split(":"))
            .filter(v -> v.length == 2)
            .map(v -> new Pair<String, String>(v[0].trim(), v[1].trim()))
            // Trim before validating: " : href" or "//A: " would otherwise slip past the
            // length check and produce a rule with an empty XPath or attribute name.
            .filter(rule -> !rule.getFirst().isEmpty() && !rule.getSecond().isEmpty());
}
@Override
protected List<RequestData> convertChildUrlList(final List<RequestData> urlList) {
if (urlList != null) {

View file

@ -132,6 +132,7 @@ public interface CrawlingConfig {
public static final String PIPELINE = "pipeline";
public static final String IGNORE_ROBOTS_TAGS = "ignore.robots.tags";
public static final String SCRIPT_TYPE = "script.type";
// Config parameter key for the HTML child URL extraction rules. FessXpathTransformer#getChildUrlRules
// parses the value as comma-separated "xpath:attribute" entries, e.g. "//A:href,//AREA:href".
public static final String HTML_CHILD_URL_RULES = "html.child.url.rules";
}
// meta.*

View file

@ -26,6 +26,7 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.TransformerFactory;
@ -45,6 +46,7 @@ import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.util.FieldConfigs;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config;
import org.codelibs.fess.es.config.exentity.WebConfig;
import org.codelibs.fess.helper.CrawlingConfigHelper;
import org.codelibs.fess.helper.CrawlingInfoHelper;
@ -552,7 +554,29 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
transformer.transform(source, result);
return writer.toString();
}
/**
 * Checks how {@code getChildUrlRules} interprets the {@code html.child.url.rules}
 * config parameter: absent, a single rule, several rules, and rules padded with
 * whitespace.
 */
public void test_getChildUrlRules() {
    // No rule configured: the transformer falls back to its own rule map,
    // which yields nothing for a freshly constructed instance.
    assertEquals("", joinChildUrlRules(Map.of()));

    // A single "xpath:attribute" rule.
    assertEquals("//A:href", joinChildUrlRules(Map.of(Config.HTML_CHILD_URL_RULES, "//A:href")));

    // Several rules separated by commas keep their order.
    assertEquals("//A:href,//AREA:href", joinChildUrlRules(Map.of(Config.HTML_CHILD_URL_RULES, "//A:href,//AREA:href")));

    // Whitespace around the xpath and attribute parts is trimmed.
    assertEquals("//A:href,//AREA:href",
            joinChildUrlRules(Map.of(Config.HTML_CHILD_URL_RULES, " //A : href , //AREA : href ")));
}

/**
 * Runs {@code getChildUrlRules(null, null)} on a transformer whose config
 * parameter map is replaced by {@code configMap}, and renders the resulting
 * rules as {@code first:second} joined by commas.
 */
private String joinChildUrlRules(final Map<String, String> configMap) {
    final FessXpathTransformer transformer = new FessXpathTransformer() {
        protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
            return configMap;
        }
    };
    return transformer.getChildUrlRules(null, null)
            .map(rule -> rule.getFirst() + ":" + rule.getSecond())
            .collect(Collectors.joining(","));
}
public void test_convertChildUrlList() {