Преглед на файлове

fix #1199 ignore http canonical url on https site

Shinsuke Sugaya преди 8 години
родител
ревизия
5b8f57dca1
променен е 1 файл, в който са добавени 12 реда и е изтрит 1 ред
  1. 12 1
      src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

+ 12 - 1
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -246,11 +246,22 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         return true;
     }
 
+    /**
+     * Checks whether a canonical URL is acceptable for the given page URL.
+     * <p>
+     * The canonical URL is rejected when the page URL uses the https scheme
+     * while the canonical URL uses plain http. The scheme comparison ignores
+     * case, because URI schemes are case-insensitive (RFC 3986, section 3.1).
+     *
+     * @param url the URL of the page the canonical link was found on
+     * @param canonicalUrl the canonical URL candidate
+     * @return {@code false} for an https to http downgrade, {@code true} otherwise
+     */
+    protected boolean isValidCanonicalUrl(final String url, final String canonicalUrl) {
+        // regionMatches(true, ...) acts as a case-insensitive startsWith, so
+        // "HTTPS://..." -> "HTTP://..." is caught as well.
+        if (url.regionMatches(true, 0, "https:", 0, 6) && canonicalUrl.regionMatches(true, 0, "http:", 0, 5)) {
+            if (logger.isDebugEnabled()) {
+                logger.debug("Invalid Canonical Url(https->http): " + url + " -> " + canonicalUrl);
+            }
+            return false;
+        }
+        return true;
+    }
+
     protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
         // canonical
         if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
             final String canonicalUrl = getCanonicalUrl(responseData, document);
-            if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)) {
+            if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
+                    && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
                 final Set<RequestData> childUrlSet = new HashSet<>();
                 childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
                 logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);