fix #1118 validate canonical url

This commit is contained in:
Shinsuke Sugaya 2017-06-22 07:14:17 +09:00
parent 8d49f933ec
commit ba8ae39fdb
2 changed files with 44 additions and 1 deletions

View file

@ -65,6 +65,7 @@ import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.PrunedTag;
import org.cyberneko.html.parsers.DOMParser;
import org.hibernate.validator.internal.constraintvalidators.hv.URLValidator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
@ -219,11 +220,38 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
}
/**
 * Checks whether the given string can be used as a crawlable URL.
 *
 * <p>Scheme-less forms are completed before parsing: a leading
 * {@code "://"} gets {@code "http"} prepended and a leading {@code "//"}
 * gets {@code "http:"} prepended. The result is parsed with
 * {@link java.net.URL} and is accepted only when the parsed host is
 * non-blank and is not itself the word {@code http} or {@code https}
 * (compared case-insensitively), which rejects inputs such as
 * {@code http://http://www.example.com}.</p>
 *
 * @param urlStr the URL text to check; may be {@code null}
 * @return {@code true} if the text parses as a URL with an acceptable host;
 *         {@code false} if it is blank according to {@code StringUtil.isBlank},
 *         cannot be parsed, or has a blank/{@code http}/{@code https} host
 */
protected boolean isValidUrl(final String urlStr) {
    if (StringUtil.isBlank(urlStr)) {
        return false;
    }

    // Complete scheme-less references so that java.net.URL is able to parse them.
    final String candidate = urlStr.startsWith("://") ? "http" + urlStr //
            : urlStr.startsWith("//") ? "http:" + urlStr //
                    : urlStr;

    final String hostName;
    try {
        hostName = new java.net.URL(candidate).getHost();
    } catch (final MalformedURLException ignored) {
        // An unparsable value is simply reported as invalid.
        return false;
    }

    // A host that is blank or is literally a scheme name is not a real host.
    return StringUtil.isNotBlank(hostName) //
            && !"http".equalsIgnoreCase(hostName) //
            && !"https".equalsIgnoreCase(hostName);
}
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
// canonical
if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)) {
final Set<RequestData> childUrlSet = new HashSet<>();
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
throw new ChildUrlsException(childUrlSet, this.getClass().getName()

View file

@ -727,4 +727,19 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
assertEquals(expected, transformer.getThumbnailUrl(responseData, document));
}
/**
 * Verifies which inputs {@code isValidUrl} accepts and which it rejects.
 */
public void test_isValidUrl() {
    final FessXpathTransformer xpathTransformer = new FessXpathTransformer();

    // Absolute http/https URLs are accepted.
    assertTrue(xpathTransformer.isValidUrl("http://www.example.com"));
    assertTrue(xpathTransformer.isValidUrl("http://www.example.com/aaa"));
    assertTrue(xpathTransformer.isValidUrl("https://www.example.com"));

    // Scheme-less forms are accepted as well.
    assertTrue(xpathTransformer.isValidUrl("://www.example.com"));
    assertTrue(xpathTransformer.isValidUrl("//www.example.com"));

    // Null and whitespace-only input are rejected.
    assertFalse(xpathTransformer.isValidUrl(null));
    assertFalse(xpathTransformer.isValidUrl(" "));

    // A missing host and a duplicated scheme are rejected.
    assertFalse(xpathTransformer.isValidUrl("http://"));
    assertFalse(xpathTransformer.isValidUrl("http://http://www.example.com"));
}
}