Ver Fonte

fix #1440 change xpath for a canonical tag

Shinsuke Sugaya há 7 anos atrás
pai
commit
43f2fd4009

+ 3 - 3
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -237,7 +237,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. //META[@name='description']/@content */
     /** The key of the configuration. e.g. //META[@name='description']/@content */
     String CRAWLER_DOCUMENT_HTML_DIGEST_XPATH = "crawler.document.html.digest.xpath";
     String CRAWLER_DOCUMENT_HTML_DIGEST_XPATH = "crawler.document.html.digest.xpath";
 
 
-    /** The key of the configuration. e.g. //LINK[@rel='canonical']/@href */
+    /** The key of the configuration. e.g. //LINK[@rel='canonical'][1]/@href */
     String CRAWLER_DOCUMENT_HTML_CANONICAL_XPATH = "crawler.document.html.canonical.xpath";
     String CRAWLER_DOCUMENT_HTML_CANONICAL_XPATH = "crawler.document.html.canonical.xpath";
 
 
     /** The key of the configuration. e.g. noscript,script,style,header,footer,nav,a[rel=nofollow] */
     /** The key of the configuration. e.g. noscript,script,style,header,footer,nav,a[rel=nofollow] */
@@ -1924,7 +1924,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
 
 
     /**
     /**
      * Get the value for the key 'crawler.document.html.canonical.xpath'. <br>
      * Get the value for the key 'crawler.document.html.canonical.xpath'. <br>
-     * The value is, e.g. //LINK[@rel='canonical']/@href <br>
+     * The value is, e.g. //LINK[@rel='canonical'][1]/@href <br>
      * @return The value of found property. (NotNull: if not found, exception but basically no way)
      * @return The value of found property. (NotNull: if not found, exception but basically no way)
      */
      */
     String getCrawlerDocumentHtmlCanonicalXpath();
     String getCrawlerDocumentHtmlCanonicalXpath();
@@ -7747,7 +7747,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CONTENT_XPATH, "//BODY");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CONTENT_XPATH, "//BODY");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_LANG_XPATH, "//HTML/@lang");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_LANG_XPATH, "//HTML/@lang");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DIGEST_XPATH, "//META[@name='description']/@content");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DIGEST_XPATH, "//META[@name='description']/@content");
-            defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CANONICAL_XPATH, "//LINK[@rel='canonical']/@href");
+            defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CANONICAL_XPATH, "//LINK[@rel='canonical'][1]/@href");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_PRUNED_TAGS, "noscript,script,style,header,footer,nav,a[rel=nofollow]");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_PRUNED_TAGS, "noscript,script,style,header,footer,nav,a[rel=nofollow]");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH, "120");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH, "120");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_LANG, "");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_LANG, "");

+ 1 - 1
src/main/resources/fess_config.properties

@@ -137,7 +137,7 @@ Title=title:string\n\
 crawler.document.html.content.xpath=//BODY
 crawler.document.html.content.xpath=//BODY
 crawler.document.html.lang.xpath=//HTML/@lang
 crawler.document.html.lang.xpath=//HTML/@lang
 crawler.document.html.digest.xpath=//META[@name='description']/@content
 crawler.document.html.digest.xpath=//META[@name='description']/@content
-crawler.document.html.canonical.xpath=//LINK[@rel='canonical']/@href
+crawler.document.html.canonical.xpath=//LINK[@rel='canonical'][1]/@href
 crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav,a[rel=nofollow]
 crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav,a[rel=nofollow]
 crawler.document.html.max.digest.length=120
 crawler.document.html.max.digest.length=120
 crawler.document.html.default.lang=
 crawler.document.html.default.lang=

+ 36 - 0
src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java

@@ -582,6 +582,42 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         assertEquals("bbb aaa", value);
         assertEquals("bbb aaa", value);
     }
     }
 
 
+    /**
+     * Verifies that {@code getCanonicalUrl} returns {@code null} when the document has no
+     * canonical link, the link's href when there is exactly one, and the href of the first
+     * link when two canonical links are declared under the same {@code <head>}.
+     *
+     * NOTE(review): in XPath 1.0 the {@code [1]} in {@code //LINK[@rel='canonical'][1]} is
+     * evaluated per parent element, not over the whole document; canonical links under
+     * different parents would each match. Confirm whether
+     * {@code (//LINK[@rel='canonical'])[1]/@href} is what is actually intended.
+     */
+    public void test_getCanonicalUrl() throws Exception {
+        // Stub out the per-crawl config lookup so only the XPath from fessConfig is in play.
+        final FessXpathTransformer transformer = new FessXpathTransformer() {
+            @Override
+            protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
+                return Collections.emptyMap();
+            }
+        };
+        transformer.fessConfig = new FessConfig.SimpleImpl() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public String getCrawlerDocumentHtmlCanonicalXpath() {
+                return "//LINK[@rel='canonical'][1]/@href";
+            }
+        };
+
+        final ResponseData responseData = new ResponseData();
+        responseData.setSessionId("test");
+        responseData.setUrl("http://example.com/");
+
+        // Case 1: no canonical link at all.
+        String data = "<html><head></head><body>aaa</body></html>";
+        Document document = getDocument(data);
+        String value = transformer.getCanonicalUrl(responseData, document);
+        assertNull(value);
+
+        // Case 2: a single canonical link.
+        data = "<html><head><link rel=\"canonical\" href=\"http://example.com/\"></head><body>aaa</body></html>";
+        document = getDocument(data);
+        value = transformer.getCanonicalUrl(responseData, document);
+        assertEquals("http://example.com/", value);
+
+        // Case 3: two canonical links in the same head; the first one is expected.
+        data =
+                "<html><head><link rel=\"canonical\" href=\"http://example1.com/\"><link rel=\"canonical\" href=\"http://example2.com/\"></head><body>aaa</body></html>";
+        document = getDocument(data);
+        value = transformer.getCanonicalUrl(responseData, document);
+        assertEquals("http://example1.com/", value);
+    }
+
     public void test_normalizeCanonicalUrl() throws Exception {
     public void test_normalizeCanonicalUrl() throws Exception {
         final FessXpathTransformer transformer = new FessXpathTransformer();
         final FessXpathTransformer transformer = new FessXpathTransformer();
         String value;
         String value;