Kaynağa Gözat

Fix #1199: ignore an HTTP canonical URL on an HTTPS site

Shinsuke Sugaya 8 yıl önce
ebeveyn
işleme
5b8f57dca1

+ 12 - 1
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -246,11 +246,22 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         return true;
         return true;
     }
     }
 
 
+    protected boolean isValidCanonicalUrl(final String url, final String canonicalUrl) {
+        if (url.startsWith("https:") && canonicalUrl.startsWith("http:")) {
+            if (logger.isDebugEnabled()) {
+                logger.debug("Invalid Canonical Url(https->http): " + url + " -> " + canonicalUrl);
+            }
+            return false;
+        }
+        return true;
+    }
+
     protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
     protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
         // canonical
         // canonical
         if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
         if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
             final String canonicalUrl = getCanonicalUrl(responseData, document);
             final String canonicalUrl = getCanonicalUrl(responseData, document);
-            if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)) {
+            if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
+                    && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
                 final Set<RequestData> childUrlSet = new HashSet<>();
                 final Set<RequestData> childUrlSet = new HashSet<>();
                 childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
                 childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
                 logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);
                 logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);