fix #1440 change xpath for a canonical tag

This commit is contained in:
Shinsuke Sugaya 2018-01-15 06:39:35 +09:00
parent ce2a95b8fb
commit 43f2fd4009
3 changed files with 40 additions and 4 deletions

View file

@ -237,7 +237,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. //META[@name='description']/@content */
String CRAWLER_DOCUMENT_HTML_DIGEST_XPATH = "crawler.document.html.digest.xpath";
/** The key of the configuration. e.g. //LINK[@rel='canonical']/@href */
/** The key of the configuration. e.g. //LINK[@rel='canonical'][1]/@href */
String CRAWLER_DOCUMENT_HTML_CANONICAL_XPATH = "crawler.document.html.canonical.xpath";
/** The key of the configuration. e.g. noscript,script,style,header,footer,nav,a[rel=nofollow] */
@ -1924,7 +1924,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/**
* Get the value for the key 'crawler.document.html.canonical.xpath'. <br>
* The value is, e.g. //LINK[@rel='canonical']/@href <br>
* The value is, e.g. //LINK[@rel='canonical'][1]/@href <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentHtmlCanonicalXpath();
@ -7747,7 +7747,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CONTENT_XPATH, "//BODY");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_LANG_XPATH, "//HTML/@lang");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DIGEST_XPATH, "//META[@name='description']/@content");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CANONICAL_XPATH, "//LINK[@rel='canonical']/@href");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CANONICAL_XPATH, "//LINK[@rel='canonical'][1]/@href");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_PRUNED_TAGS, "noscript,script,style,header,footer,nav,a[rel=nofollow]");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH, "120");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_LANG, "");

View file

@ -137,7 +137,7 @@ Title=title:string\n\
crawler.document.html.content.xpath=//BODY
crawler.document.html.lang.xpath=//HTML/@lang
crawler.document.html.digest.xpath=//META[@name='description']/@content
crawler.document.html.canonical.xpath=//LINK[@rel='canonical']/@href
crawler.document.html.canonical.xpath=//LINK[@rel='canonical'][1]/@href
crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav,a[rel=nofollow]
crawler.document.html.max.digest.length=120
crawler.document.html.default.lang=

View file

@ -582,6 +582,42 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
assertEquals("bbb aaa", value);
}
/**
 * Checks {@code getCanonicalUrl} against three documents while the canonical
 * XPath is pinned to {@code //LINK[@rel='canonical'][1]/@href}:
 * <ul>
 * <li>no canonical link tag: the result is {@code null};</li>
 * <li>one canonical link tag: its {@code href} is returned;</li>
 * <li>two canonical link tags in the same head: the first {@code href} is returned.</li>
 * </ul>
 */
public void test_getCanonicalUrl() throws Exception {
    final FessXpathTransformer transformer = new FessXpathTransformer() {
        @Override
        protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
            // No per-config parameters are supplied for this test.
            return Collections.emptyMap();
        }
    };
    transformer.fessConfig = new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        // @Override makes the compiler reject this stub if the accessor is ever
        // renamed in FessConfig, instead of silently leaving it unused.
        @Override
        public String getCrawlerDocumentHtmlCanonicalXpath() {
            return "//LINK[@rel='canonical'][1]/@href";
        }
    };

    final ResponseData responseData = new ResponseData();
    responseData.setSessionId("test");
    responseData.setUrl("http://example.com/");

    // Case 1: the document has no canonical link tag.
    final String noLinkHtml = "<html><head></head><body>aaa</body></html>";
    final Document noLinkDocument = getDocument(noLinkHtml);
    assertNull(transformer.getCanonicalUrl(responseData, noLinkDocument));

    // Case 2: the document has exactly one canonical link tag.
    final String singleLinkHtml = "<html><head><link rel=\"canonical\" href=\"http://example.com/\"></head><body>aaa</body></html>";
    final Document singleLinkDocument = getDocument(singleLinkHtml);
    assertEquals("http://example.com/", transformer.getCanonicalUrl(responseData, singleLinkDocument));

    // Case 3: two canonical link tags; only the first href is expected.
    final String twoLinksHtml =
            "<html><head><link rel=\"canonical\" href=\"http://example1.com/\"><link rel=\"canonical\" href=\"http://example2.com/\"></head><body>aaa</body></html>";
    final Document twoLinksDocument = getDocument(twoLinksHtml);
    assertEquals("http://example1.com/", transformer.getCanonicalUrl(responseData, twoLinksDocument));
}
public void test_normalizeCanonicalUrl() throws Exception {
final FessXpathTransformer transformer = new FessXpathTransformer();
String value;