Ver Fonte

fix #666 meta robots tag support

Shinsuke Sugaya há 8 anos atrás
pai
commit
f96312da20

+ 51 - 0
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -21,11 +21,13 @@ import java.io.BufferedInputStream;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 
@@ -73,6 +75,14 @@ import org.xml.sax.InputSource;
 public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
     private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class);
 
+    /**
+     * XPath selecting the content attribute of the robots META element. The name attribute is
+     * folded to lower case with translate() before comparison, so any capitalisation of the
+     * name (robots, ROBOTS, Robots, ...) is matched.
+     */
+    private static final String META_NAME_ROBOTS_CONTENT =
+            "//META[translate(@name,\"ABCDEFGHIJKLMNOPQRSTUVWXYZ\",\"abcdefghijklmnopqrstuvwxyz\")=\"robots\"]/@content";
+
+    /** Robots directive treated as both noindex and nofollow. */
+    private static final String META_ROBOTS_NONE = "none";
+
+    /** Robots directive asking that the page itself is not indexed. */
+    private static final String META_ROBOTS_NOINDEX = "noindex";
+
+    /** Robots directive asking that links on the page are not followed. */
+    private static final String META_ROBOTS_NOFOLLOW = "nofollow";
+
     private static final int UTF8_BOM_SIZE = 3;
 
     public boolean prunedContent = true;
@@ -119,6 +129,10 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
 
         final Document document = parser.getDocument();
 
+        if (!fessConfig.isCrawlerIgnoreMetaRobots()) {
+            processMetaRobots(responseData, resultData, document);
+        }
+
         final Map<String, Object> dataMap = new LinkedHashMap<>();
         for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
             final String path = entry.getValue();
@@ -163,6 +177,43 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         resultData.setEncoding(charsetName);
     }
 
+    /**
+     * Applies the robots meta tag of the page to the crawl of the current document.
+     * <p>
+     * The content attribute of the first node matched by META_NAME_ROBOTS_CONTENT is lower-cased
+     * and split into directives on commas and whitespace. Directives are compared as whole
+     * tokens, so a value that merely contains a directive name (for example
+     * {@code max-image-preview:none}) is not mistaken for {@code none}.
+     * <ul>
+     * <li>{@code none}, or {@code noindex} together with {@code nofollow}: throws a
+     * ChildUrlsException carrying an empty URL set.</li>
+     * <li>{@code noindex} only: calls storeChildUrls and throws a ChildUrlsException carrying
+     * {@code resultData.getChildUrlSet()}.</li>
+     * <li>{@code nofollow} only: calls {@code responseData.setNoFollow(true)}.</li>
+     * </ul>
+     * A TransformerException raised inside the method is logged with its stack trace and is not
+     * propagated.
+     *
+     * @param responseData response being transformed; its URL is written to the log
+     * @param resultData result object handed to storeChildUrls
+     * @param document parsed DOM of the response
+     * @throws ChildUrlsException if the page declares {@code none} or {@code noindex}
+     */
+    protected void processMetaRobots(final ResponseData responseData, final ResultData resultData, final Document document) {
+        try {
+            final Node value = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT);
+            if (value != null) {
+                final String content = value.getTextContent().toLowerCase(Locale.ROOT);
+                boolean noindex = false;
+                boolean nofollow = false;
+                // Compare whole directives instead of substrings: "max-image-preview:none"
+                // must not be read as the "none" directive.
+                for (final String directive : content.split("[,\\s]+")) {
+                    if (META_ROBOTS_NONE.equals(directive)) {
+                        noindex = true;
+                        nofollow = true;
+                    } else if (META_ROBOTS_NOINDEX.equals(directive)) {
+                        noindex = true;
+                    } else if (META_ROBOTS_NOFOLLOW.equals(directive)) {
+                        nofollow = true;
+                    }
+                }
+
+                if (noindex && nofollow) {
+                    logger.info("META(robots=noindex,nofollow): " + responseData.getUrl());
+                    throw new ChildUrlsException(Collections.emptySet(), "#processMetaRobots(Document)");
+                } else if (noindex) {
+                    logger.info("META(robots=noindex): " + responseData.getUrl());
+                    storeChildUrls(responseData, resultData);
+                    throw new ChildUrlsException(resultData.getChildUrlSet(), "#processMetaRobots(Document)");
+                } else if (nofollow) {
+                    logger.info("META(robots=nofollow): " + responseData.getUrl());
+                    responseData.setNoFollow(true);
+                }
+            }
+        } catch (final TransformerException e) {
+            // Keep the cause in the log so a broken XPath or DOM can be diagnosed.
+            logger.warn("Could not parse a value of " + META_NAME_ROBOTS_CONTENT, e);
+        }
+
+    }
+
     protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
         // canonical
         if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCannonicalXpath())) {

+ 25 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -161,6 +161,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. false */
     String CRAWLER_IGNORE_ROBOTS_TXT = "crawler.ignore.robots.txt";
 
+    /** The key of the configuration. e.g. false */
+    String CRAWLER_IGNORE_META_ROBOTS = "crawler.ignore.meta.robots";
+
     /** The key of the configuration. e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* */
     String CRAWLER_METADATA_CONTENT_EXCLUDES = "crawler.metadata.content.excludes";
 
@@ -1446,6 +1449,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
     boolean isCrawlerIgnoreRobotsTxt();
 
+    /**
+     * Get the value for the key 'crawler.ignore.meta.robots'. <br>
+     * The value is, e.g. false <br>
+     * @return The value of the found property. (NotNull: a missing key raises an exception, which should not normally happen)
+     */
+    String getCrawlerIgnoreMetaRobots();
+
+    /**
+     * Is the property for the key 'crawler.ignore.meta.robots' true? <br>
+     * The value is, e.g. false <br>
+     * @return The determination, true or false. (a missing key raises an exception, which should not normally happen)
+     */
+    boolean isCrawlerIgnoreMetaRobots();
+
     /**
      * Get the value for the key 'crawler.metadata.content.excludes'. <br>
      * The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* <br>
@@ -4512,6 +4529,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return is(FessConfig.CRAWLER_IGNORE_ROBOTS_TXT);
         }
 
+        /** Returns the string obtained from get() for the key 'crawler.ignore.meta.robots'. */
+        public String getCrawlerIgnoreMetaRobots() {
+            return get(FessConfig.CRAWLER_IGNORE_META_ROBOTS);
+        }
+
+        /** Returns the boolean obtained from is() for the key 'crawler.ignore.meta.robots'. */
+        public boolean isCrawlerIgnoreMetaRobots() {
+            return is(FessConfig.CRAWLER_IGNORE_META_ROBOTS);
+        }
+
         public String getCrawlerMetadataContentExcludes() {
             return get(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES);
         }

+ 1 - 0
src/main/resources/fess_config.properties

@@ -92,6 +92,7 @@ crawler.crawling.data.encoding=UTF-8
 crawler.web.protocols=http,https
 crawler.file.protocols=file,smb,ftp
 crawler.ignore.robots.txt=false
+crawler.ignore.meta.robots=false
 crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*
 crawler.metadata.name.mapping=\
 title=title:string\n\

+ 85 - 0
src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java

@@ -32,6 +32,7 @@ import org.codelibs.core.misc.ValueHolder;
 import org.codelibs.fess.crawler.builder.RequestDataBuilder;
 import org.codelibs.fess.crawler.entity.RequestData;
 import org.codelibs.fess.crawler.entity.ResponseData;
+import org.codelibs.fess.crawler.entity.ResultData;
 import org.codelibs.fess.crawler.exception.ChildUrlsException;
 import org.codelibs.fess.unit.UnitFessTestCase;
 import org.cyberneko.html.parsers.DOMParser;
@@ -123,6 +124,90 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         assertEquals("foo1<!--googleoff: index--><A href=\"index.html\"></A><!--googleon: index-->foo5", output);
     }
 
+    /** A page without a robots meta tag must not be flagged as nofollow. */
+    public void test_processMetaRobots_no() throws Exception {
+        final Document dom = getDocument("<html><body>foo</body></html>");
+        final FessXpathTransformer xpathTransformer = new FessXpathTransformer();
+
+        final ResponseData response = new ResponseData();
+        response.setUrl("http://example.com/");
+        final ResultData result = new ResultData();
+
+        xpathTransformer.processMetaRobots(response, result, dom);
+
+        assertFalse(response.isNoFollow());
+    }
+
+    /** robots=none must abort processing with a ChildUrlsException that carries no child URLs. */
+    public void test_processMetaRobots_none() throws Exception {
+        final String data = "<meta name=\"robots\" content=\"none\" />";
+        final Document document = getDocument(data);
+
+        final FessXpathTransformer transformer = new FessXpathTransformer();
+
+        final ResponseData responseData = new ResponseData();
+        responseData.setUrl("http://example.com/");
+
+        // Any other exception propagates through "throws Exception" so the real cause
+        // shows up in the test report instead of being replaced by a bare fail().
+        try {
+            transformer.processMetaRobots(responseData, new ResultData(), document);
+            fail();
+        } catch (final ChildUrlsException e) {
+            assertTrue(e.getChildUrlList().isEmpty());
+        }
+    }
+
+    /** Upper-case NOINDEX,NOFOLLOW must abort processing with a ChildUrlsException that carries no child URLs. */
+    public void test_processMetaRobots_noindexnofollow() throws Exception {
+        final String data = "<meta name=\"ROBOTS\" content=\"NOINDEX,NOFOLLOW\" />";
+        final Document document = getDocument(data);
+
+        final FessXpathTransformer transformer = new FessXpathTransformer();
+
+        final ResponseData responseData = new ResponseData();
+        responseData.setUrl("http://example.com/");
+
+        // Any other exception propagates through "throws Exception" so the real cause
+        // shows up in the test report instead of being replaced by a bare fail().
+        try {
+            transformer.processMetaRobots(responseData, new ResultData(), document);
+            fail();
+        } catch (final ChildUrlsException e) {
+            assertTrue(e.getChildUrlList().isEmpty());
+        }
+    }
+
+    /** robots=noindex must abort processing with a ChildUrlsException. */
+    public void test_processMetaRobots_noindex() throws Exception {
+        final String data = "<meta name=\"robots\" content=\"noindex\" /><a href=\"index.html\">aaa</a>";
+        final Document document = getDocument(data);
+
+        final FessXpathTransformer transformer = new FessXpathTransformer();
+
+        final ResponseData responseData = new ResponseData();
+        responseData.setUrl("http://example.com/");
+        // Encode explicitly so the fixture does not depend on the platform default charset.
+        responseData.setResponseBody(data.getBytes("UTF-8"));
+
+        // Any other exception propagates through "throws Exception" so the real cause
+        // shows up in the test report instead of being replaced by a bare fail().
+        try {
+            transformer.processMetaRobots(responseData, new ResultData(), document);
+            fail();
+        } catch (final ChildUrlsException e) {
+            // NOTE(review): the markup contains a link, yet the child URL list is asserted
+            // to be empty - confirm this is the intended outcome of storeChildUrls here.
+            assertTrue(e.getChildUrlList().isEmpty());
+        }
+    }
+
+    /** robots=nofollow must flag the response as nofollow without throwing. */
+    public void test_processMetaRobots_nofollow() throws Exception {
+        final Document dom = getDocument("<meta name=\"robots\" content=\"nofollow\" />");
+        final FessXpathTransformer xpathTransformer = new FessXpathTransformer();
+
+        final ResponseData response = new ResponseData();
+        response.setUrl("http://example.com/");
+        final ResultData result = new ResultData();
+
+        xpathTransformer.processMetaRobots(response, result, dom);
+
+        assertTrue(response.isNoFollow());
+    }
+
     private Document getDocument(final String data) throws Exception {
         final DOMParser parser = new DOMParser();
         final ByteArrayInputStream is = new ByteArrayInputStream(data.getBytes("UTF-8"));