fix #1067 accept URLs that do not contain a protocol

This commit is contained in:
Shinsuke Sugaya 2017-05-25 17:03:00 +09:00
parent 88e01170bd
commit f5ffbb69bb
2 changed files with 63 additions and 9 deletions

View file

@ -404,15 +404,13 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
if (StringUtil.isBlank(canonicalUrl)) {
return null;
}
if (canonicalUrl.startsWith("/")) {
return normalizeCanonicalUrl(responseData.getUrl(), canonicalUrl);
}
return canonicalUrl;
return normalizeCanonicalUrl(responseData.getUrl(), canonicalUrl);
}
protected String normalizeCanonicalUrl(final String baseUrl, final String canonicalUrl) {
try {
return new URL(new URL(baseUrl), canonicalUrl).toString();
final URL u = new URL(baseUrl);
return new URL(u, canonicalUrl.startsWith(":") ? u.getProtocol() + canonicalUrl : canonicalUrl).toString();
} catch (final MalformedURLException e) {
logger.warn("Invalid canonical url: " + baseUrl + " : " + canonicalUrl, e);
}
@ -580,7 +578,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
List<RequestData> anchorList = new ArrayList<>();
final String baseHref = getBaseHref(document);
try {
final URL url = new URL(baseHref != null ? baseHref : responseData.getUrl());
final URL url = getBaseUrl(responseData.getUrl(), baseHref);
for (final Map.Entry<String, String> entry : childUrlRuleMap.entrySet()) {
for (final String u : getUrlFromTagAttribute(url, document, entry.getKey(), entry.getValue(), responseData.getCharSet())) {
anchorList.add(RequestDataBuilder.newRequestData().get().url(u).build());
@ -589,8 +587,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
anchorList = convertChildUrlList(anchorList);
} catch (final Exception e) {
logger.warn("Could not parse anchor tags.", e);
// } finally {
// xpathAPI.remove();
}
final List<String> urlList = new ArrayList<>(anchorList.size());
@ -600,6 +596,22 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return urlList;
}
protected URL getBaseUrl(final String currentUrl, final String baseHref) throws MalformedURLException {
if (baseHref != null) {
if (baseHref.startsWith("://")) {
final String protocol = currentUrl.split(":")[0];
return new URL(protocol + baseHref);
} else if (baseHref.startsWith("//")) {
final String protocol = currentUrl.split(":")[0];
return new URL(protocol + ":" + baseHref);
} else if (baseHref.startsWith("/")) {
return new URL(new URL(currentUrl), baseHref);
}
return new URL(baseHref);
}
return new URL(currentUrl);
}
@Override
protected List<RequestData> convertChildUrlList(final List<RequestData> urlList) {
if (urlList != null) {
@ -638,7 +650,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
URL childUrl;
String u = null;
try {
childUrl = new URL(url, urlValue);
childUrl = new URL(url, urlValue.startsWith(":") ? url.getProtocol() + urlValue : urlValue);
u = encodeUrl(normalizeUrl(childUrl.toExternalForm()), encoding);
} catch (final MalformedURLException e) {
final int pos = urlValue.indexOf(':');

View file

@ -18,6 +18,7 @@ package org.codelibs.fess.crawler.transformer;
import java.io.ByteArrayInputStream;
import java.io.StringWriter;
import java.lang.reflect.Field;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
@ -600,5 +601,46 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
value = transformer.normalizeCanonicalUrl("http://hoge.com/bbb", "/aaa");
assertEquals("http://hoge.com/aaa", value);
value = transformer.normalizeCanonicalUrl("http://hoge.com/bbb", "http://hoge.com/aaa");
assertEquals("http://hoge.com/aaa", value);
value = transformer.normalizeCanonicalUrl("http://hoge.com/bbb", "://hoge.com/aaa");
assertEquals("http://hoge.com/aaa", value);
value = transformer.normalizeCanonicalUrl("http://hoge.com/bbb", "//hoge.com/aaa");
assertEquals("http://hoge.com/aaa", value);
}
/**
 * Checks {@code getBaseUrl} for a missing base href, absolute base hrefs,
 * and base hrefs whose protocol is omitted ({@code "://..."} and {@code "//..."}).
 */
public void test_getBaseUrl() throws Exception {
    final FessXpathTransformer transformer = new FessXpathTransformer();

    // Each row: { current URL, base href (may be null), expected external form }
    final String[][] cases = {
        { "http://hoge.com/", null, "http://hoge.com/" },
        { "http://hoge.com/", "http://hoge.com/", "http://hoge.com/" },
        { "http://hoge.com/aaa/bbb.html", "http://hoge.com/", "http://hoge.com/" },
        { "http://hoge.com/aaa/bbb.html", "http://hoge.com/ccc/", "http://hoge.com/ccc/" },
        { "http://hoge.com/aaa/bbb.html", null, "http://hoge.com/aaa/bbb.html" },
        { "http://hoge.com/", "://hoge.com/aaa/", "http://hoge.com/aaa/" },
        { "https://hoge.com/", "://hoge.com/aaa/", "https://hoge.com/aaa/" },
        { "http://hoge.com/", "//hoge.com/aaa/", "http://hoge.com/aaa/" },
        { "https://hoge.com/", "//hoge.com/aaa/", "https://hoge.com/aaa/" },
    };

    for (final String[] row : cases) {
        final URL resolved = transformer.getBaseUrl(row[0], row[1]);
        assertEquals(row[2], resolved.toExternalForm());
    }
}
}