diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java index b14354f34..5b5afc1ea 100644 --- a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java +++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java @@ -246,11 +246,22 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf return true; } + /** Decides whether a canonical URL is acceptable for the URL it was found on. Returns false when {@code url} starts with "https:" and {@code canonicalUrl} starts with "http:" (a canonical link pointing from https to http), writing a debug-level log entry first if debug logging is enabled; returns true in every other case. Because "https:" does not itself start with "http:", an https-to-https pair is accepted. Both prefix tests use String.startsWith and are therefore case-sensitive. NOTE(review): an upper-case scheme such as "HTTPS:" would not be caught by this check; confirm URLs are lower-cased before they reach this method. NOTE(review): url is dereferenced without a null check; confirm responseData.getUrl() cannot be null. @param url URL of the crawled page; putAdditionalData passes responseData.getUrl() @param canonicalUrl canonical URL obtained for that page; putAdditionalData tests it for null before calling this method @return false for an https-to-http canonical link, true otherwise */ protected boolean isValidCanonicalUrl(final String url, final String canonicalUrl) { + if (url.startsWith("https:") && canonicalUrl.startsWith("http:")) { + if (logger.isDebugEnabled()) { + logger.debug("Invalid Canonical Url(https->http): " + url + " -> " + canonicalUrl); + } + return false; + } + return true; + } + protected void putAdditionalData(final Map dataMap, final ResponseData responseData, final Document document) { // canonical if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) { final String canonicalUrl = getCanonicalUrl(responseData, document); - if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)) { + if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl) + && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) { final Set childUrlSet = new HashSet<>(); childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build()); logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);