diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java index dc75bfe93..edb42bc42 100644 --- a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java +++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java @@ -65,6 +65,7 @@ import org.codelibs.fess.mylasta.direction.FessConfig; import org.codelibs.fess.util.ComponentUtil; import org.codelibs.fess.util.PrunedTag; import org.cyberneko.html.parsers.DOMParser; +/* NOTE(review): URLValidator is not referenced by any code visible in this patch, and it lives in an "internal" package - confirm this import is actually needed. */ import org.hibernate.validator.internal.constraintvalidators.hv.URLValidator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; @@ -219,11 +220,38 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf } + /** Checks whether a URL string can be parsed into a URL with an acceptable host. Returns false when urlStr is blank according to StringUtil.isBlank, when java.net.URL rejects the value with MalformedURLException, when the parsed host is blank, or when the parsed host equals "http" or "https" ignoring case; returns true otherwise. Before parsing, a value that starts with a colon followed by two slashes is prefixed with "http", and a value that starts with two slashes is prefixed with "http:". @param urlStr the URL string to check (the accompanying test passes null and expects false) @return true if the value parses and its host passes the checks above, false otherwise */ protected boolean isValidUrl(final String urlStr) { + if (StringUtil.isBlank(urlStr)) { + return false; + } + final String value; + if (urlStr.startsWith("://")) { + value = "http" + urlStr; + } else if (urlStr.startsWith("//")) { + value = "http:" + urlStr; + } else { + value = urlStr; + } + try { + final URL url = new java.net.URL(value); + final String host = url.getHost(); + if (StringUtil.isBlank(host)) { + return false; + } + /* A host that is itself a scheme name is rejected; presumably this guards against a doubled scheme prefix, which is the case the accompanying test exercises - confirm. */ if ("http".equalsIgnoreCase(host) || "https".equalsIgnoreCase(host)) { + return false; + } + } catch (MalformedURLException e) { + return false; + } + return true; + } + protected void putAdditionalData(final Map dataMap, final ResponseData responseData, final Document document) { // canonical if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) { final String canonicalUrl = getCanonicalUrl(responseData, document); - if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) { + if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)) { final Set childUrlSet = new HashSet<>(); 
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build()); throw new ChildUrlsException(childUrlSet, this.getClass().getName() diff --git a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java index 1a229fe41..fd5273e8d 100644 --- a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java +++ b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java @@ -727,4 +727,19 @@ public class FessXpathTransformerTest extends UnitFessTestCase { assertEquals(expected, transformer.getThumbnailUrl(responseData, document)); } + + /** Exercises isValidUrl on a freshly constructed FessXpathTransformer: absolute http and https URLs and the two scheme-less prefix forms are expected to be accepted; null, a whitespace-only string, a scheme with no host, and a doubled-scheme URL are expected to be rejected. */ public void test_isValidUrl() { + final FessXpathTransformer transformer = new FessXpathTransformer(); + + assertTrue(transformer.isValidUrl("http://www.example.com")); + assertTrue(transformer.isValidUrl("http://www.example.com/aaa")); + assertTrue(transformer.isValidUrl("https://www.example.com")); + assertTrue(transformer.isValidUrl("://www.example.com")); + assertTrue(transformer.isValidUrl("//www.example.com")); + + assertFalse(transformer.isValidUrl(null)); + assertFalse(transformer.isValidUrl(" ")); + assertFalse(transformer.isValidUrl("http://")); + assertFalse(transformer.isValidUrl("http://http://www.example.com")); + } }