diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java index 5b5afc1ea..a3900d944 100644 --- a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java +++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java @@ -76,6 +76,10 @@ import org.xml.sax.InputSource; public class FessXpathTransformer extends XpathTransformer implements FessTransformer { private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class); + private static final String HTML_CANONICAL_XPATH = "html.canonical.xpath"; + + private static final String IGNORE_META_ROBOTS = "ignore.meta.robots"; + private static final String META_NAME_THUMBNAIL_CONTENT = "//META[@name=\"thumbnail\" or @name=\"THUMBNAIL\"]/@content"; private static final String META_PROPERTY_OGIMAGE_CONTENT = "//META[@property=\"og:image\"]/@content"; @@ -134,9 +138,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf final Document document = parser.getDocument(); - if (!fessConfig.isCrawlerIgnoreMetaRobots()) { - processMetaRobots(responseData, resultData, document); - } + processMetaRobots(responseData, resultData, document); final Map dataMap = new LinkedHashMap<>(); for (final Map.Entry entry : fieldRuleMap.entrySet()) { @@ -183,6 +185,18 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf } protected void processMetaRobots(final ResponseData responseData, final ResultData resultData, final Document document) { + final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper(); + final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId()); + final Map configMap = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG); + String ignore = configMap.get(IGNORE_META_ROBOTS); + if (ignore == null) { + if (fessConfig.isCrawlerIgnoreMetaRobots()) { + return; + } + } else if (Boolean.parseBoolean(ignore)) { + return; + } + try { final Node value = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT); if (value != null) { @@ -258,16 +272,14 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf protected void putAdditionalData(final Map dataMap, final ResponseData responseData, final Document document) { // canonical - if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) { - final String canonicalUrl = getCanonicalUrl(responseData, document); - if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl) - && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) { - final Set childUrlSet = new HashSet<>(); - childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build()); - logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl); - throw new ChildUrlsException(childUrlSet, this.getClass().getName() - + "#putAdditionalData(Map, ResponseData, Document)"); - } + final String canonicalUrl = getCanonicalUrl(responseData, document); + if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl) + && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) { + final Set childUrlSet = new HashSet<>(); + childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build()); + logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl); + throw new ChildUrlsException(childUrlSet, this.getClass().getName() + + "#putAdditionalData(Map, ResponseData, Document)"); } final FessConfig fessConfig = ComponentUtil.getFessConfig(); @@ -452,7 +464,17 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf } protected String getCanonicalUrl(final ResponseData responseData, final Document document) { - final String canonicalUrl = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlCanonicalXpath(), false); + final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper(); + final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId()); + final Map configMap = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG); + String xpath = configMap.get(HTML_CANONICAL_XPATH); + if (xpath == null) { + xpath = fessConfig.getCrawlerDocumentHtmlCanonicalXpath(); + } + if (StringUtil.isBlank(xpath)) { + return null; + } + final String canonicalUrl = getSingleNodeValue(document, xpath, false); if (StringUtil.isBlank(canonicalUrl)) { return null; }