fix #1199 ignore http canonical url on https site

This commit is contained in:
Shinsuke Sugaya 2017-07-30 23:02:33 +09:00
parent af80887a18
commit 5b8f57dca1

View file

@ -246,11 +246,22 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return true;
}
/**
 * Decides whether a canonical URL declared by a page may be followed.
 *
 * <p>A canonical link that points from an {@code https:} page to an {@code http:} URL is
 * rejected; every other combination is accepted. The scheme prefixes are compared
 * case-insensitively because URI schemes are case-insensitive (RFC 3986, section 3.1), so
 * {@code HTTPS://...} -> {@code HTTP://...} is treated the same as the lowercase form.
 *
 * @param url the URL of the page that declared the canonical link; must not be {@code null}
 * @param canonicalUrl the canonical URL found in the page; only inspected when {@code url}
 *        has the {@code https:} scheme
 * @return {@code false} when {@code url} is {@code https:} and {@code canonicalUrl} is
 *         {@code http:}, {@code true} otherwise
 */
protected boolean isValidCanonicalUrl(final String url, final String canonicalUrl) {
    // regionMatches(ignoreCase = true, ...) returns false when the string is shorter than the
    // prefix, so no explicit length check is needed. The && keeps the original short-circuit:
    // canonicalUrl is not touched unless url is an https URL.
    if (url.regionMatches(true, 0, "https:", 0, "https:".length())
            && canonicalUrl.regionMatches(true, 0, "http:", 0, "http:".length())) {
        if (logger.isDebugEnabled()) {
            logger.debug("Invalid Canonical Url(https->http): " + url + " -> " + canonicalUrl);
        }
        return false;
    }
    return true;
}
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
// canonical
if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)) {
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
&& isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
final Set<RequestData> childUrlSet = new HashSet<>();
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);