Parcourir la source

fix #1067 accept url does not contain protocol

Shinsuke Sugaya il y a 8 ans
Parent
commit
f5ffbb69bb

+ 21 - 9
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -404,15 +404,13 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         if (StringUtil.isBlank(canonicalUrl)) {
             return null;
         }
-        if (canonicalUrl.startsWith("/")) {
-            return normalizeCanonicalUrl(responseData.getUrl(), canonicalUrl);
-        }
-        return canonicalUrl;
+        return normalizeCanonicalUrl(responseData.getUrl(), canonicalUrl);
     }
 
     protected String normalizeCanonicalUrl(final String baseUrl, final String canonicalUrl) {
         try {
-            return new URL(new URL(baseUrl), canonicalUrl).toString();
+            final URL u = new URL(baseUrl);
+            return new URL(u, canonicalUrl.startsWith(":") ? u.getProtocol() + canonicalUrl : canonicalUrl).toString();
         } catch (final MalformedURLException e) {
             logger.warn("Invalid canonical url: " + baseUrl + " : " + canonicalUrl, e);
         }
@@ -580,7 +578,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         List<RequestData> anchorList = new ArrayList<>();
         final String baseHref = getBaseHref(document);
         try {
-            final URL url = new URL(baseHref != null ? baseHref : responseData.getUrl());
+            final URL url = getBaseUrl(responseData.getUrl(), baseHref);
             for (final Map.Entry<String, String> entry : childUrlRuleMap.entrySet()) {
                 for (final String u : getUrlFromTagAttribute(url, document, entry.getKey(), entry.getValue(), responseData.getCharSet())) {
                     anchorList.add(RequestDataBuilder.newRequestData().get().url(u).build());
@@ -589,8 +587,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
             anchorList = convertChildUrlList(anchorList);
         } catch (final Exception e) {
             logger.warn("Could not parse anchor tags.", e);
-            //        } finally {
-            //            xpathAPI.remove();
         }
 
         final List<String> urlList = new ArrayList<>(anchorList.size());
@@ -600,6 +596,22 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         return urlList;
     }
 
+    /**
+     * Determines the base URL against which the links of a page are resolved.
+     *
+     * <p>When {@code baseHref} is {@code null} the page URL itself is the base.
+     * Otherwise {@code baseHref} is interpreted relative to the page URL, so
+     * absolute values ({@code http://host/dir/}), scheme-relative values
+     * ({@code //host/dir/}), host-relative values ({@code /dir/}) and
+     * document-relative values ({@code dir/}) are all accepted. A value that
+     * starts with {@code ://} gets the protocol of the page URL prepended.</p>
+     *
+     * @param currentUrl URL of the page being processed
+     * @param baseHref   value of the page's base href, or {@code null} if absent
+     * @return the base URL to resolve child links against
+     * @throws MalformedURLException if {@code currentUrl} or the resulting
+     *                               base URL cannot be parsed
+     */
+    protected URL getBaseUrl(final String currentUrl, final String baseHref) throws MalformedURLException {
+        if (baseHref == null) {
+            return new URL(currentUrl);
+        }
+        if (baseHref.startsWith("://")) {
+            // "://host/path" is not a valid relative reference, so it cannot be
+            // resolved generically; borrow the protocol of the current page.
+            final String protocol = currentUrl.split(":")[0];
+            return new URL(protocol + baseHref);
+        }
+        // URL(context, spec) returns spec itself when it is absolute and otherwise
+        // resolves it against the page URL. This covers "//host/...", "/path" and
+        // document-relative values such as "dir/", which new URL(baseHref) alone
+        // rejects with MalformedURLException.
+        return new URL(new URL(currentUrl), baseHref);
+    }
+
+
     @Override
     protected List<RequestData> convertChildUrlList(final List<RequestData> urlList) {
         if (urlList != null) {
@@ -638,7 +650,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         URL childUrl;
         String u = null;
         try {
-            childUrl = new URL(url, urlValue);
+            childUrl = new URL(url, urlValue.startsWith(":") ? url.getProtocol() + urlValue : urlValue);
             u = encodeUrl(normalizeUrl(childUrl.toExternalForm()), encoding);
         } catch (final MalformedURLException e) {
             final int pos = urlValue.indexOf(':');

+ 42 - 0
src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java

@@ -18,6 +18,7 @@ package org.codelibs.fess.crawler.transformer;
 import java.io.ByteArrayInputStream;
 import java.io.StringWriter;
 import java.lang.reflect.Field;
+import java.net.URL;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashMap;
@@ -600,5 +601,46 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
 
         value = transformer.normalizeCanonicalUrl("http://hoge.com/bbb", "/aaa");
         assertEquals("http://hoge.com/aaa", value);
+
+        value = transformer.normalizeCanonicalUrl("http://hoge.com/bbb", "http://hoge.com/aaa");
+        assertEquals("http://hoge.com/aaa", value);
+
+        value = transformer.normalizeCanonicalUrl("http://hoge.com/bbb", "://hoge.com/aaa");
+        assertEquals("http://hoge.com/aaa", value);
+
+        value = transformer.normalizeCanonicalUrl("http://hoge.com/bbb", "//hoge.com/aaa");
+        assertEquals("http://hoge.com/aaa", value);
+    }
+
+    /**
+     * Checks getBaseUrl for a missing base href, absolute base hrefs, base hrefs
+     * that start with "://" and base hrefs that start with "//", using both
+     * http and https page URLs.
+     */
+    public void test_getBaseUrl() throws Exception {
+        final FessXpathTransformer xpathTransformer = new FessXpathTransformer();
+
+        // Each row: { current page URL, base href (null when absent), expected base URL }
+        final String[][] cases = { //
+                { "http://hoge.com/", null, "http://hoge.com/" }, //
+                { "http://hoge.com/", "http://hoge.com/", "http://hoge.com/" }, //
+                { "http://hoge.com/aaa/bbb.html", "http://hoge.com/", "http://hoge.com/" }, //
+                { "http://hoge.com/aaa/bbb.html", "http://hoge.com/ccc/", "http://hoge.com/ccc/" }, //
+                { "http://hoge.com/aaa/bbb.html", null, "http://hoge.com/aaa/bbb.html" }, //
+                { "http://hoge.com/", "://hoge.com/aaa/", "http://hoge.com/aaa/" }, //
+                { "https://hoge.com/", "://hoge.com/aaa/", "https://hoge.com/aaa/" }, //
+                { "http://hoge.com/", "//hoge.com/aaa/", "http://hoge.com/aaa/" }, //
+                { "https://hoge.com/", "//hoge.com/aaa/", "https://hoge.com/aaa/" } //
+        };
+
+        for (final String[] row : cases) {
+            final URL actual = xpathTransformer.getBaseUrl(row[0], row[1]);
+            assertEquals(row[2], actual.toExternalForm());
+        }
     }
 }