fix #2852 Add support for specifying HTML child link extraction tags via html.child.url.rules in crawl settings.
This commit is contained in:
parent
706d952ad1
commit
6ca8f890fe
3 changed files with 40 additions and 0 deletions
|
@ -21,6 +21,7 @@ import java.io.BufferedInputStream;
|
|||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
|
@ -32,6 +33,7 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
import java.util.function.UnaryOperator;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import javax.xml.xpath.XPathEvaluationResult;
|
||||
import javax.xml.xpath.XPathExpressionException;
|
||||
|
@ -42,7 +44,9 @@ import org.apache.logging.log4j.Logger;
|
|||
import org.codelibs.core.io.InputStreamUtil;
|
||||
import org.codelibs.core.io.SerializeUtil;
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.core.misc.Pair;
|
||||
import org.codelibs.core.misc.ValueHolder;
|
||||
import org.codelibs.core.stream.StreamUtil;
|
||||
import org.codelibs.fess.Constants;
|
||||
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
|
||||
import org.codelibs.fess.crawler.entity.AccessResultData;
|
||||
|
@ -777,6 +781,17 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
return new URL(currentUrl);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Stream<Pair<String, String>> getChildUrlRules(final ResponseData responseData, final ResultData resultData) {
|
||||
final Map<String, String> configMap = getConfigPrameterMap(responseData, ConfigName.CONFIG);
|
||||
final String ruleString = configMap.get(Config.HTML_CHILD_URL_RULES);
|
||||
if (StringUtil.isBlank(ruleString)) {
|
||||
return childUrlRuleMap.entrySet().stream().map(e -> new Pair<>(e.getKey(), e.getValue()));
|
||||
}
|
||||
return Arrays.stream(ruleString.split(",")).map(s -> s.split(":")).filter(v -> v.length == 2)
|
||||
.map(v -> new Pair<String, String>(v[0].trim(), v[1].trim()));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<RequestData> convertChildUrlList(final List<RequestData> urlList) {
|
||||
if (urlList != null) {
|
||||
|
|
|
@ -132,6 +132,7 @@ public interface CrawlingConfig {
|
|||
public static final String PIPELINE = "pipeline";
|
||||
public static final String IGNORE_ROBOTS_TAGS = "ignore.robots.tags";
|
||||
public static final String SCRIPT_TYPE = "script.type";
|
||||
public static final String HTML_CHILD_URL_RULES = "html.child.url.rules";
|
||||
}
|
||||
|
||||
// meta.*
|
||||
|
|
|
@ -26,6 +26,7 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import javax.xml.transform.OutputKeys;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
|
@ -45,6 +46,7 @@ import org.codelibs.fess.crawler.entity.ResultData;
|
|||
import org.codelibs.fess.crawler.exception.ChildUrlsException;
|
||||
import org.codelibs.fess.crawler.util.FieldConfigs;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config;
|
||||
import org.codelibs.fess.es.config.exentity.WebConfig;
|
||||
import org.codelibs.fess.helper.CrawlingConfigHelper;
|
||||
import org.codelibs.fess.helper.CrawlingInfoHelper;
|
||||
|
@ -552,7 +554,29 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
transformer.transform(source, result);
|
||||
|
||||
return writer.toString();
|
||||
}
|
||||
|
||||
public void test_getChildUrlRules() {
|
||||
assertEquals("", new FessXpathTransformer() {
|
||||
protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
|
||||
return Map.of();
|
||||
}
|
||||
}.getChildUrlRules(null, null).map(v -> v.getFirst() + ":" + v.getSecond()).collect(Collectors.joining(",")));
|
||||
assertEquals("//A:href", new FessXpathTransformer() {
|
||||
protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
|
||||
return Map.of(Config.HTML_CHILD_URL_RULES, "//A:href");
|
||||
}
|
||||
}.getChildUrlRules(null, null).map(v -> v.getFirst() + ":" + v.getSecond()).collect(Collectors.joining(",")));
|
||||
assertEquals("//A:href,//AREA:href", new FessXpathTransformer() {
|
||||
protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
|
||||
return Map.of(Config.HTML_CHILD_URL_RULES, "//A:href,//AREA:href");
|
||||
}
|
||||
}.getChildUrlRules(null, null).map(v -> v.getFirst() + ":" + v.getSecond()).collect(Collectors.joining(",")));
|
||||
assertEquals("//A:href,//AREA:href", new FessXpathTransformer() {
|
||||
protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
|
||||
return Map.of(Config.HTML_CHILD_URL_RULES, " //A : href , //AREA : href ");
|
||||
}
|
||||
}.getChildUrlRules(null, null).map(v -> v.getFirst() + ":" + v.getSecond()).collect(Collectors.joining(",")));
|
||||
}
|
||||
|
||||
public void test_convertChildUrlList() {
|
||||
|
|
Loading…
Add table
Reference in a new issue