diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java index acc861455..d57ea7426 100644 --- a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java +++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java @@ -404,15 +404,13 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf if (StringUtil.isBlank(canonicalUrl)) { return null; } - if (canonicalUrl.startsWith("/")) { - return normalizeCanonicalUrl(responseData.getUrl(), canonicalUrl); - } - return canonicalUrl; + return normalizeCanonicalUrl(responseData.getUrl(), canonicalUrl); } protected String normalizeCanonicalUrl(final String baseUrl, final String canonicalUrl) { try { - return new URL(new URL(baseUrl), canonicalUrl).toString(); + final URL u = new URL(baseUrl); + return new URL(u, canonicalUrl.startsWith(":") ? u.getProtocol() + canonicalUrl : canonicalUrl).toString(); } catch (final MalformedURLException e) { logger.warn("Invalid canonical url: " + baseUrl + " : " + canonicalUrl, e); } @@ -580,7 +578,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf List anchorList = new ArrayList<>(); final String baseHref = getBaseHref(document); try { - final URL url = new URL(baseHref != null ? baseHref : responseData.getUrl()); + final URL url = getBaseUrl(responseData.getUrl(), baseHref); for (final Map.Entry entry : childUrlRuleMap.entrySet()) { for (final String u : getUrlFromTagAttribute(url, document, entry.getKey(), entry.getValue(), responseData.getCharSet())) { anchorList.add(RequestDataBuilder.newRequestData().get().url(u).build()); @@ -589,8 +587,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf anchorList = convertChildUrlList(anchorList); } catch (final Exception e) { logger.warn("Could not parse anchor tags.", e); - // } finally { - // xpathAPI.remove(); } final List urlList = new ArrayList<>(anchorList.size()); @@ -600,6 +596,22 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf return urlList; } + protected URL getBaseUrl(final String currentUrl, final String baseHref) throws MalformedURLException { + if (baseHref != null) { + if (baseHref.startsWith("://")) { + final String protocol = currentUrl.split(":")[0]; + return new URL(protocol + baseHref); + } else if (baseHref.startsWith("//")) { + final String protocol = currentUrl.split(":")[0]; + return new URL(protocol + ":" + baseHref); + } else if (baseHref.startsWith("/")) { + return new URL(new URL(currentUrl), baseHref); + } + return new URL(baseHref); + } + return new URL(currentUrl); + } + @Override protected List convertChildUrlList(final List urlList) { if (urlList != null) { @@ -638,7 +650,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf URL childUrl; String u = null; try { - childUrl = new URL(url, urlValue); + childUrl = new URL(url, urlValue.startsWith(":") ? url.getProtocol() + urlValue : urlValue); u = encodeUrl(normalizeUrl(childUrl.toExternalForm()), encoding); } catch (final MalformedURLException e) { final int pos = urlValue.indexOf(':'); diff --git a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java index c083b8afb..36ced8386 100644 --- a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java +++ b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java @@ -18,6 +18,7 @@ package org.codelibs.fess.crawler.transformer; import java.io.ByteArrayInputStream; import java.io.StringWriter; import java.lang.reflect.Field; +import java.net.URL; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; @@ -600,5 +601,46 @@ public class FessXpathTransformerTest extends UnitFessTestCase { value = transformer.normalizeCanonicalUrl("http://hoge.com/bbb", "/aaa"); assertEquals("http://hoge.com/aaa", value); + + value = transformer.normalizeCanonicalUrl("http://hoge.com/bbb", "http://hoge.com/aaa"); + assertEquals("http://hoge.com/aaa", value); + + value = transformer.normalizeCanonicalUrl("http://hoge.com/bbb", "://hoge.com/aaa"); + assertEquals("http://hoge.com/aaa", value); + + value = transformer.normalizeCanonicalUrl("http://hoge.com/bbb", "//hoge.com/aaa"); + assertEquals("http://hoge.com/aaa", value); + } + + public void test_getBaseUrl() throws Exception { + final FessXpathTransformer transformer = new FessXpathTransformer(); + URL value; + + value = transformer.getBaseUrl("http://hoge.com/", null); + assertEquals("http://hoge.com/", value.toExternalForm()); + + value = transformer.getBaseUrl("http://hoge.com/", "http://hoge.com/"); + assertEquals("http://hoge.com/", value.toExternalForm()); + + value = transformer.getBaseUrl("http://hoge.com/aaa/bbb.html", "http://hoge.com/"); + assertEquals("http://hoge.com/", value.toExternalForm()); + + value = transformer.getBaseUrl("http://hoge.com/aaa/bbb.html", "http://hoge.com/ccc/"); + assertEquals("http://hoge.com/ccc/", value.toExternalForm()); + + value = transformer.getBaseUrl("http://hoge.com/aaa/bbb.html", null); + assertEquals("http://hoge.com/aaa/bbb.html", value.toExternalForm()); + + value = transformer.getBaseUrl("http://hoge.com/", "://hoge.com/aaa/"); + assertEquals("http://hoge.com/aaa/", value.toExternalForm()); + + value = transformer.getBaseUrl("https://hoge.com/", "://hoge.com/aaa/"); + assertEquals("https://hoge.com/aaa/", value.toExternalForm()); + + value = transformer.getBaseUrl("http://hoge.com/", "//hoge.com/aaa/"); + assertEquals("http://hoge.com/aaa/", value.toExternalForm()); + + value = transformer.getBaseUrl("https://hoge.com/", "//hoge.com/aaa/"); + assertEquals("https://hoge.com/aaa/", value.toExternalForm()); } }