Bladeren bron

fix #907 X-Robots-Tag support

Shinsuke Sugaya 7 jaren geleden
bovenliggende
commit
aed6b735d6

+ 67 - 14
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -74,23 +74,29 @@ import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;
 
 public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
+
     private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class);
 
+    private static final String X_ROBOTS_TAG = "X-Robots-Tag";
+
     private static final String HTML_CANONICAL_XPATH = "html.canonical.xpath";
 
+    @Deprecated
     private static final String IGNORE_META_ROBOTS = "ignore.meta.robots";
 
+    private static final String IGNORE_ROBOTS_TAGS = "ignore.robots.tags";
+
     private static final String META_NAME_THUMBNAIL_CONTENT = "//META[@name=\"thumbnail\" or @name=\"THUMBNAIL\"]/@content";
 
     private static final String META_PROPERTY_OGIMAGE_CONTENT = "//META[@property=\"og:image\"]/@content";
 
     private static final String META_NAME_ROBOTS_CONTENT = "//META[@name=\"robots\" or @name=\"ROBOTS\"]/@content";
 
-    private static final String META_ROBOTS_NONE = "none";
+    private static final String ROBOTS_TAG_NONE = "none";
 
-    private static final String META_ROBOTS_NOINDEX = "noindex";
+    private static final String ROBOTS_TAG_NOINDEX = "noindex";
 
-    private static final String META_ROBOTS_NOFOLLOW = "nofollow";
+    private static final String ROBOTS_TAG_NOFOLLOW = "nofollow";
 
     private static final int UTF8_BOM_SIZE = 3;
 
@@ -139,6 +145,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         final Document document = parser.getDocument();
 
         processMetaRobots(responseData, resultData, document);
+        processXRobotsTag(responseData, resultData);
 
         final Map<String, Object> dataMap = new LinkedHashMap<>();
         for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
@@ -195,40 +202,45 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
 
     protected void processMetaRobots(final ResponseData responseData, final ResultData resultData, final Document document) {
         final Map<String, String> configMap = getConfigPrameterMap(responseData, ConfigName.CONFIG);
-        String ignore = configMap.get(IGNORE_META_ROBOTS);
+        String ignore = configMap.get(IGNORE_ROBOTS_TAGS);
         if (ignore == null) {
-            if (fessConfig.isCrawlerIgnoreMetaRobots()) {
+            ignore = configMap.get(IGNORE_META_ROBOTS);
+            if (ignore == null) {
+                if (fessConfig.isCrawlerIgnoreRobotsTags()) {
+                    return;
+                }
+            } else if (Boolean.parseBoolean(ignore)) {
                 return;
             }
         } else if (Boolean.parseBoolean(ignore)) {
             return;
         }
 
+        // meta tag
         try {
             final Node value = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT);
             if (value != null) {
-                final String content = value.getTextContent().toLowerCase(Locale.ROOT);
                 boolean noindex = false;
                 boolean nofollow = false;
-                if (content.contains(META_ROBOTS_NONE)) {
+                final String content = value.getTextContent().toLowerCase(Locale.ROOT);
+                if (content.contains(ROBOTS_TAG_NONE)) {
                     noindex = true;
                     nofollow = true;
                 } else {
-                    if (content.contains(META_ROBOTS_NOINDEX)) {
+                    if (content.contains(ROBOTS_TAG_NOINDEX)) {
                         noindex = true;
                     }
-                    if (content.contains(META_ROBOTS_NOFOLLOW)) {
+                    if (content.contains(ROBOTS_TAG_NOFOLLOW)) {
                         nofollow = true;
                     }
                 }
-
                 if (noindex && nofollow) {
                     logger.info("META(robots=noindex,nofollow): " + responseData.getUrl());
-                    throw new ChildUrlsException(Collections.emptySet(), "#processMetaRobots(Document)");
+                    throw new ChildUrlsException(Collections.emptySet(), "#processMetaRobots");
                 } else if (noindex) {
                     logger.info("META(robots=noindex): " + responseData.getUrl());
                     storeChildUrls(responseData, resultData);
-                    throw new ChildUrlsException(resultData.getChildUrlSet(), "#processMetaRobots(Document)");
+                    throw new ChildUrlsException(resultData.getChildUrlSet(), "#processMetaRobots");
                 } else if (nofollow) {
                     logger.info("META(robots=nofollow): " + responseData.getUrl());
                     responseData.setNoFollow(true);
@@ -240,6 +252,48 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
 
     }
 
+    /**
+     * Applies the directives of the X-Robots-Tag response header (none, noindex, nofollow) to the crawl.
+     * Returns without checking when "ignore.robots.tags", or the Fess-wide setting if that is unset, is true.
+     *
+     * @param responseData response whose meta data map is searched for the header
+     * @param resultData result handed to storeChildUrls when only noindex is present
+     * @throws ChildUrlsException when the header value contains noindex or none
+     */
+    protected void processXRobotsTag(final ResponseData responseData, final ResultData resultData) {
+        final Map<String, String> configMap = getConfigPrameterMap(responseData, ConfigName.CONFIG);
+        final String ignore = configMap.get(IGNORE_ROBOTS_TAGS);
+        // A value in the config parameter map overrides the Fess-wide default.
+        if (ignore == null ? fessConfig.isCrawlerIgnoreRobotsTags() : Boolean.parseBoolean(ignore)) {
+            return;
+        }
+
+        // X-Robots-Tag
+        // The constant is the receiver so that a null key cannot cause a NullPointerException.
+        responseData.getMetaDataMap().entrySet().stream().filter(e -> X_ROBOTS_TAG.equalsIgnoreCase(e.getKey()) && e.getValue() != null)
+                .forEach(e -> {
+                    final String value = e.getValue().toString().toLowerCase(Locale.ROOT);
+                    // "none" switches on both noindex and nofollow.
+                    final boolean none = value.contains(ROBOTS_TAG_NONE);
+                    final boolean noindex = none || value.contains(ROBOTS_TAG_NOINDEX);
+                    final boolean nofollow = none || value.contains(ROBOTS_TAG_NOFOLLOW);
+                    if (noindex && nofollow) {
+                        // Both directives: abort with an empty child URL set.
+                        logger.info("HEADER(robots=noindex,nofollow): {}", responseData.getUrl());
+                        throw new ChildUrlsException(Collections.emptySet(), "#processXRobotsTag");
+                    } else if (noindex) {
+                        // noindex only: abort with the child URL set of resultData after storeChildUrls has run.
+                        logger.info("HEADER(robots=noindex): {}", responseData.getUrl());
+                        storeChildUrls(responseData, resultData);
+                        throw new ChildUrlsException(resultData.getChildUrlSet(), "#processXRobotsTag");
+                    } else if (nofollow) {
+                        // nofollow only: flag the response and keep processing.
+                        logger.info("HEADER(robots=nofollow): {}", responseData.getUrl());
+                        responseData.setNoFollow(true);
+                    }
+                });
+    }
+
     protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
         final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
         final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
@@ -292,8 +346,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
             final Set<RequestData> childUrlSet = new HashSet<>();
             childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
             logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);
-            throw new ChildUrlsException(childUrlSet, this.getClass().getName()
-                    + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
+            throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData");
         }
 
         final FessConfig fessConfig = ComponentUtil.getFessConfig();

+ 1 - 1
src/main/java/org/codelibs/fess/helper/DocumentHelper.java

@@ -149,7 +149,7 @@ public class DocumentHelper {
             if (responseData.getRedirectLocation() != null) {
                 final Set<RequestData> childUrlList = new HashSet<>();
                 childUrlList.add(RequestDataBuilder.newRequestData().get().url(responseData.getRedirectLocation()).build());
-                throw new ChildUrlsException(childUrlList, "Redirected from " + url);
+                throw new ChildUrlsException(childUrlList, this.getClass().getName() + "#RedirectedFrom:" + url);
             }
             responseData.setExecutionTime(System.currentTimeMillis() - startTime);
             responseData.setSessionId(crawlingInfoId);

+ 10 - 10
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -215,7 +215,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     String CRAWLER_IGNORE_ROBOTS_TXT = "crawler.ignore.robots.txt";
 
     /** The key of the configuration. e.g. false */
-    String CRAWLER_IGNORE_META_ROBOTS = "crawler.ignore.meta.robots";
+    String CRAWLER_IGNORE_ROBOTS_TAGS = "crawler.ignore.robots.tags";
 
     /** The key of the configuration. e.g. true */
     String CRAWLER_IGNORE_CONTENT_EXCEPTION = "crawler.ignore.content.exception";
@@ -1869,18 +1869,18 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     boolean isCrawlerIgnoreRobotsTxt();
 
     /**
-     * Get the value for the key 'crawler.ignore.meta.robots'. <br>
+     * Get the value for the key 'crawler.ignore.robots.tags'. <br>
      * The value is, e.g. false <br>
      * @return The value of found property. (NotNull: if not found, exception but basically no way)
      */
-    String getCrawlerIgnoreMetaRobots();
+    String getCrawlerIgnoreRobotsTags();
 
     /**
-     * Is the property for the key 'crawler.ignore.meta.robots' true? <br>
+     * Is the property for the key 'crawler.ignore.robots.tags' true? <br>
      * The value is, e.g. false <br>
      * @return The determination, true or false. (if not found, exception but basically no way)
      */
-    boolean isCrawlerIgnoreMetaRobots();
+    boolean isCrawlerIgnoreRobotsTags();
 
     /**
      * Get the value for the key 'crawler.ignore.content.exception'. <br>
@@ -5808,12 +5808,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return is(FessConfig.CRAWLER_IGNORE_ROBOTS_TXT);
         }
 
-        public String getCrawlerIgnoreMetaRobots() {
-            return get(FessConfig.CRAWLER_IGNORE_META_ROBOTS);
+        public String getCrawlerIgnoreRobotsTags() {
+            return get(FessConfig.CRAWLER_IGNORE_ROBOTS_TAGS);
         }
 
-        public boolean isCrawlerIgnoreMetaRobots() {
-            return is(FessConfig.CRAWLER_IGNORE_META_ROBOTS);
+        public boolean isCrawlerIgnoreRobotsTags() {
+            return is(FessConfig.CRAWLER_IGNORE_ROBOTS_TAGS);
         }
 
         public String getCrawlerIgnoreContentException() {
@@ -7833,7 +7833,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             defaultMap.put(FessConfig.CRAWLER_WEB_PROTOCOLS, "http,https");
             defaultMap.put(FessConfig.CRAWLER_FILE_PROTOCOLS, "file,smb,ftp");
             defaultMap.put(FessConfig.CRAWLER_IGNORE_ROBOTS_TXT, "false");
-            defaultMap.put(FessConfig.CRAWLER_IGNORE_META_ROBOTS, "false");
+            defaultMap.put(FessConfig.CRAWLER_IGNORE_ROBOTS_TAGS, "false");
             defaultMap.put(FessConfig.CRAWLER_IGNORE_CONTENT_EXCEPTION, "true");
             defaultMap.put(FessConfig.CRAWLER_FAILURE_URL_STATUS_CODES, "404");
             defaultMap.put(FessConfig.CRAWLER_SYSTEM_MONITOR_INTERVAL, "60");

+ 1 - 1
src/main/resources/fess_config.properties

@@ -126,7 +126,7 @@ crawler.crawling.data.encoding=UTF-8
 crawler.web.protocols=http,https
 crawler.file.protocols=file,smb,ftp
 crawler.ignore.robots.txt=false
-crawler.ignore.meta.robots=false
+crawler.ignore.robots.tags=false
 crawler.ignore.content.exception=true
 crawler.failure.url.status.codes=404
 crawler.system.monitor.interval=60

+ 111 - 5
src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java

@@ -267,6 +267,112 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         assertEquals("foo1<!--googleoff: index--><A href=\"index.html\"></A><!--googleon: index-->foo5", output);
     }
 
+    // Without an X-Robots-Tag header the response must not be flagged as nofollow.
+    public void test_processXRobotsTags_no() throws Exception {
+        final ResponseData response = new ResponseData();
+        response.setUrl("http://example.com/");
+
+        final FessXpathTransformer target = new FessXpathTransformer() {
+            @Override
+            protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
+                return Collections.emptyMap();
+            }
+        };
+        target.fessConfig = new FessConfig.SimpleImpl() {
+            private static final long serialVersionUID = 1L;
+            @Override
+            public boolean isCrawlerIgnoreRobotsTags() {
+                return false;
+            }
+        };
+
+        target.processXRobotsTag(response, new ResultData());
+        assertFalse(response.isNoFollow());
+    }
+
+    // A "noindex,nofollow" X-Robots-Tag header must raise ChildUrlsException with an empty child URL list.
+    public void test_processXRobotsTag_noindexnofollow() throws Exception {
+        final FessXpathTransformer transformer = new FessXpathTransformer() {
+            @Override
+            protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
+                return Collections.emptyMap();
+            }
+        };
+        transformer.fessConfig = new FessConfig.SimpleImpl() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public boolean isCrawlerIgnoreRobotsTags() {
+                return false;
+            }
+        };
+
+        final ResponseData responseData = new ResponseData();
+        responseData.setUrl("http://example.com/");
+        responseData.addMetaData("X-Robots-Tag", "noindex,nofollow");
+
+        try {
+            transformer.processXRobotsTag(responseData, new ResultData());
+            fail(); // reached only when no exception was thrown; other exceptions propagate with their cause
+        } catch (ChildUrlsException e) {
+            assertTrue(e.getChildUrlList().isEmpty());
+        }
+    }
+
+    // A noindex-only X-Robots-Tag header must raise ChildUrlsException; the child URL list is expected to be empty.
+    public void test_processXRobotsTag_noindex() throws Exception {
+        final String data = "<meta name=\"robots\" content=\"noindex\" /><a href=\"index.html\">aaa</a>";
+
+        final FessXpathTransformer transformer = new FessXpathTransformer() {
+            @Override
+            protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
+                return Collections.emptyMap();
+            }
+        };
+        transformer.fessConfig = new FessConfig.SimpleImpl() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public boolean isCrawlerIgnoreRobotsTags() {
+                return false;
+            }
+        };
+
+        final ResponseData responseData = new ResponseData();
+        responseData.setUrl("http://example.com/");
+        responseData.setResponseBody(data.getBytes(java.nio.charset.StandardCharsets.UTF_8)); // explicit charset, independent of the platform default
+        responseData.addMetaData("X-Robots-Tag", "noindex");
+
+        try {
+            transformer.processXRobotsTag(responseData, new ResultData());
+            fail(); // reached only when no exception was thrown; other exceptions propagate with their cause
+        } catch (ChildUrlsException e) {
+            assertTrue(e.getChildUrlList().isEmpty());
+        }
+    }
+
+    public void test_processXRobotsTag_nofollow() throws Exception {
+        final FessXpathTransformer transformer = new FessXpathTransformer() {
+            @Override
+            protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
+                return Collections.emptyMap();
+            }
+        };
+        transformer.fessConfig = new FessConfig.SimpleImpl() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public boolean isCrawlerIgnoreRobotsTags() {
+                return false;
+            }
+        };
+
+        final ResponseData responseData = new ResponseData();
+        responseData.addMetaData("X-Robots-Tag", "nofollow");
+        transformer.processXRobotsTag(responseData, new ResultData()); // nofollow alone must not throw
+        assertTrue(responseData.isNoFollow()); // the header value "nofollow" must flag the response
+    }
+
     public void test_processMetaRobots_no() throws Exception {
         final String data = "<html><body>foo</body></html>";
         final Document document = getDocument(data);
@@ -281,7 +387,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
             private static final long serialVersionUID = 1L;
 
             @Override
-            public boolean isCrawlerIgnoreMetaRobots() {
+            public boolean isCrawlerIgnoreRobotsTags() {
                 return false;
             };
         };
@@ -306,7 +412,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
             private static final long serialVersionUID = 1L;
 
             @Override
-            public boolean isCrawlerIgnoreMetaRobots() {
+            public boolean isCrawlerIgnoreRobotsTags() {
                 return false;
             };
         };
@@ -337,7 +443,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
             private static final long serialVersionUID = 1L;
 
             @Override
-            public boolean isCrawlerIgnoreMetaRobots() {
+            public boolean isCrawlerIgnoreRobotsTags() {
                 return false;
             };
         };
@@ -368,7 +474,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
             private static final long serialVersionUID = 1L;
 
             @Override
-            public boolean isCrawlerIgnoreMetaRobots() {
+            public boolean isCrawlerIgnoreRobotsTags() {
                 return false;
             };
         };
@@ -400,7 +506,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
             private static final long serialVersionUID = 1L;
 
             @Override
-            public boolean isCrawlerIgnoreMetaRobots() {
+            public boolean isCrawlerIgnoreRobotsTags() {
                 return false;
             };
         };