diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java index 40f6bedaa..6e2960c34 100644 --- a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java +++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java @@ -21,6 +21,7 @@ import java.io.BufferedInputStream; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.Date; import java.util.HashMap; @@ -32,6 +33,7 @@ import java.util.Map; import java.util.Set; import java.util.function.UnaryOperator; import java.util.stream.Collectors; +import java.util.stream.Stream; import javax.xml.xpath.XPathEvaluationResult; import javax.xml.xpath.XPathExpressionException; @@ -42,7 +44,9 @@ import org.apache.logging.log4j.Logger; import org.codelibs.core.io.InputStreamUtil; import org.codelibs.core.io.SerializeUtil; import org.codelibs.core.lang.StringUtil; +import org.codelibs.core.misc.Pair; import org.codelibs.core.misc.ValueHolder; +import org.codelibs.core.stream.StreamUtil; import org.codelibs.fess.Constants; import org.codelibs.fess.crawler.builder.RequestDataBuilder; import org.codelibs.fess.crawler.entity.AccessResultData; @@ -777,6 +781,17 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf return new URL(currentUrl); } + /** Returns the child URL extraction rules as a stream of pairs. The rule string is read from the ConfigName.CONFIG parameter map under Config.HTML_CHILD_URL_RULES; when it is blank, the entries of childUrlRuleMap are returned instead. NOTE(review): each pair presumably holds a tag expression and an attribute name (the tests use values of the form A:href) - confirm against the parent XpathTransformer. NOTE(review): entries are split on ',' and then on ':', so an expression that itself contains either character cannot be configured here; such an entry is dropped or mis-split. @param responseData passed to getConfigPrameterMap to look up the config parameters @param resultData not used by this override @return stream of rule pairs */ @Override + protected Stream> getChildUrlRules(final ResponseData responseData, final ResultData resultData) { + final Map configMap = getConfigPrameterMap(responseData, ConfigName.CONFIG); + final String ruleString = configMap.get(Config.HTML_CHILD_URL_RULES); + if (StringUtil.isBlank(ruleString)) { + /* no rule string configured: fall back to the entries of childUrlRuleMap */ return childUrlRuleMap.entrySet().stream().map(e -> new Pair<>(e.getKey(), e.getValue())); + } + /* one rule per comma-separated entry; only entries with exactly two colon-separated parts are kept, and both parts are trimmed */ return Arrays.stream(ruleString.split(",")).map(s -> s.split(":")).filter(v -> v.length == 2) + .map(v -> new 
Pair(v[0].trim(), v[1].trim())); + } + @Override protected List convertChildUrlList(final List urlList) { if (urlList != null) { diff --git a/src/main/java/org/codelibs/fess/es/config/exentity/CrawlingConfig.java b/src/main/java/org/codelibs/fess/es/config/exentity/CrawlingConfig.java index 992461f6d..945d2c90a 100644 --- a/src/main/java/org/codelibs/fess/es/config/exentity/CrawlingConfig.java +++ b/src/main/java/org/codelibs/fess/es/config/exentity/CrawlingConfig.java @@ -132,6 +132,7 @@ public interface CrawlingConfig { public static final String PIPELINE = "pipeline"; public static final String IGNORE_ROBOTS_TAGS = "ignore.robots.tags"; public static final String SCRIPT_TYPE = "script.type"; + /* config key whose value FessXpathTransformer.getChildUrlRules parses as comma-separated entries of two colon-separated parts */ public static final String HTML_CHILD_URL_RULES = "html.child.url.rules"; } // meta.* diff --git a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java index 3fbedfab7..2b28b4297 100644 --- a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java +++ b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java @@ -26,6 +26,7 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; import javax.xml.transform.OutputKeys; import javax.xml.transform.TransformerFactory; @@ -45,6 +46,7 @@ import org.codelibs.fess.crawler.entity.ResultData; import org.codelibs.fess.crawler.exception.ChildUrlsException; import org.codelibs.fess.crawler.util.FieldConfigs; import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName; +import org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config; import org.codelibs.fess.es.config.exentity.WebConfig; import org.codelibs.fess.helper.CrawlingConfigHelper; import org.codelibs.fess.helper.CrawlingInfoHelper; @@ -552,7 +554,29 @@ public class FessXpathTransformerTest extends UnitFessTestCase { 
transformer.transform(source, result); return writer.toString(); + } + /** Exercises getChildUrlRules on anonymous FessXpathTransformer subclasses whose getConfigPrameterMap is overridden to return a fixed map, then compares the returned pairs joined as first:second and separated by commas. Covers an empty config map, a single rule, two rules, and a rule string with extra whitespace around the separators. */ public void test_getChildUrlRules() { + /* empty config map: no rule string, and the expected joined result is empty */ assertEquals("", new FessXpathTransformer() { + protected Map getConfigPrameterMap(final ResponseData responseData, final ConfigName config) { + return Map.of(); + } + }.getChildUrlRules(null, null).map(v -> v.getFirst() + ":" + v.getSecond()).collect(Collectors.joining(","))); + /* a single configured rule is returned as one pair */ assertEquals("//A:href", new FessXpathTransformer() { + protected Map getConfigPrameterMap(final ResponseData responseData, final ConfigName config) { + return Map.of(Config.HTML_CHILD_URL_RULES, "//A:href"); + } + }.getChildUrlRules(null, null).map(v -> v.getFirst() + ":" + v.getSecond()).collect(Collectors.joining(","))); + /* two comma-separated rules are returned in the configured order */ assertEquals("//A:href,//AREA:href", new FessXpathTransformer() { + protected Map getConfigPrameterMap(final ResponseData responseData, final ConfigName config) { + return Map.of(Config.HTML_CHILD_URL_RULES, "//A:href,//AREA:href"); + } + }.getChildUrlRules(null, null).map(v -> v.getFirst() + ":" + v.getSecond()).collect(Collectors.joining(","))); + /* whitespace around both parts of each rule is trimmed away */ assertEquals("//A:href,//AREA:href", new FessXpathTransformer() { + protected Map getConfigPrameterMap(final ResponseData responseData, final ConfigName config) { + return Map.of(Config.HTML_CHILD_URL_RULES, " //A : href , //AREA : href "); + } + }.getChildUrlRules(null, null).map(v -> v.getFirst() + ":" + v.getSecond()).collect(Collectors.joining(","))); } public void test_convertChildUrlList() {