Преглед на файлове

fix #1268 add config.html.canonical.xpath/config.ignore.meta.robots

Shinsuke Sugaya преди 7 години
родител
ревизия
5f126b9931
променен е 1 файл, в който са добавени 36 реда и са изтрити 14 реда
  1. 36 14
      src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

+ 36 - 14
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -76,6 +76,10 @@ import org.xml.sax.InputSource;
 public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
     private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class);
 
+    private static final String HTML_CANONICAL_XPATH = "html.canonical.xpath";
+
+    private static final String IGNORE_META_ROBOTS = "ignore.meta.robots";
+
     private static final String META_NAME_THUMBNAIL_CONTENT = "//META[@name=\"thumbnail\" or @name=\"THUMBNAIL\"]/@content";
 
     private static final String META_PROPERTY_OGIMAGE_CONTENT = "//META[@property=\"og:image\"]/@content";
@@ -134,9 +138,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
 
         final Document document = parser.getDocument();
 
-        if (!fessConfig.isCrawlerIgnoreMetaRobots()) {
-            processMetaRobots(responseData, resultData, document);
-        }
+        processMetaRobots(responseData, resultData, document);
 
         final Map<String, Object> dataMap = new LinkedHashMap<>();
         for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
@@ -183,6 +185,18 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
     }
 
     protected void processMetaRobots(final ResponseData responseData, final ResultData resultData, final Document document) {
+        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
+        final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
+        final Map<String, String> configMap = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
+        String ignore = configMap.get(IGNORE_META_ROBOTS);
+        if (ignore == null) {
+            if (fessConfig.isCrawlerIgnoreMetaRobots()) {
+                return;
+            }
+        } else if (Boolean.parseBoolean(ignore)) {
+            return;
+        }
+
         try {
             final Node value = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT);
             if (value != null) {
@@ -258,16 +272,14 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
 
     protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
         // canonical
-        if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
-            final String canonicalUrl = getCanonicalUrl(responseData, document);
-            if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
-                    && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
-                final Set<RequestData> childUrlSet = new HashSet<>();
-                childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
-                logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);
-                throw new ChildUrlsException(childUrlSet, this.getClass().getName()
-                        + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
-            }
+        final String canonicalUrl = getCanonicalUrl(responseData, document);
+        if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
+                && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
+            final Set<RequestData> childUrlSet = new HashSet<>();
+            childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
+            logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);
+            throw new ChildUrlsException(childUrlSet, this.getClass().getName()
+                    + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
         }
 
         final FessConfig fessConfig = ComponentUtil.getFessConfig();
@@ -452,7 +464,17 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
     }
 
     protected String getCanonicalUrl(final ResponseData responseData, final Document document) {
-        final String canonicalUrl = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlCanonicalXpath(), false);
+        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
+        final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
+        final Map<String, String> configMap = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
+        String xpath = configMap.get(HTML_CANONICAL_XPATH);
+        if (xpath == null) {
+            xpath = fessConfig.getCrawlerDocumentHtmlCanonicalXpath();
+        }
+        if (StringUtil.isBlank(xpath)) {
+            return null;
+        }
+        final String canonicalUrl = getSingleNodeValue(document, xpath, false);
         if (StringUtil.isBlank(canonicalUrl)) {
             return null;
         }