fix #1440 change xpath for a canonical tag
This commit is contained in:
parent
ce2a95b8fb
commit
43f2fd4009
3 changed files with 40 additions and 4 deletions
|
@ -237,7 +237,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. //META[@name='description']/@content */
|
||||
String CRAWLER_DOCUMENT_HTML_DIGEST_XPATH = "crawler.document.html.digest.xpath";
|
||||
|
||||
/** The key of the configuration. e.g. //LINK[@rel='canonical']/@href */
|
||||
/** The key of the configuration. e.g. //LINK[@rel='canonical'][1]/@href */
|
||||
String CRAWLER_DOCUMENT_HTML_CANONICAL_XPATH = "crawler.document.html.canonical.xpath";
|
||||
|
||||
/** The key of the configuration. e.g. noscript,script,style,header,footer,nav,a[rel=nofollow] */
|
||||
|
@ -1924,7 +1924,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.html.canonical.xpath'. <br>
|
||||
* The value is, e.g. //LINK[@rel='canonical']/@href <br>
|
||||
* The value is, e.g. //LINK[@rel='canonical'][1]/@href <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentHtmlCanonicalXpath();
|
||||
|
@ -7747,7 +7747,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CONTENT_XPATH, "//BODY");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_LANG_XPATH, "//HTML/@lang");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DIGEST_XPATH, "//META[@name='description']/@content");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CANONICAL_XPATH, "//LINK[@rel='canonical']/@href");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CANONICAL_XPATH, "//LINK[@rel='canonical'][1]/@href");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_PRUNED_TAGS, "noscript,script,style,header,footer,nav,a[rel=nofollow]");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH, "120");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_LANG, "");
|
||||
|
|
|
@ -137,7 +137,7 @@ Title=title:string\n\
|
|||
crawler.document.html.content.xpath=//BODY
|
||||
crawler.document.html.lang.xpath=//HTML/@lang
|
||||
crawler.document.html.digest.xpath=//META[@name='description']/@content
|
||||
crawler.document.html.canonical.xpath=//LINK[@rel='canonical']/@href
|
||||
crawler.document.html.canonical.xpath=//LINK[@rel='canonical'][1]/@href
|
||||
crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav,a[rel=nofollow]
|
||||
crawler.document.html.max.digest.length=120
|
||||
crawler.document.html.default.lang=
|
||||
|
|
|
@ -582,6 +582,42 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
assertEquals("bbb aaa", value);
|
||||
}
|
||||
|
||||
public void test_getCanonicalUrl() throws Exception {
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer() {
|
||||
@Override
|
||||
protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
};
|
||||
transformer.fessConfig = new FessConfig.SimpleImpl() {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
public String getCrawlerDocumentHtmlCanonicalXpath() {
|
||||
return "//LINK[@rel='canonical'][1]/@href";
|
||||
};
|
||||
};
|
||||
|
||||
final ResponseData responseData = new ResponseData();
|
||||
responseData.setSessionId("test");
|
||||
responseData.setUrl("http://example.com/");
|
||||
|
||||
String data = "<html><head></head><body>aaa</body></html>";
|
||||
Document document = getDocument(data);
|
||||
String value = transformer.getCanonicalUrl(responseData, document);
|
||||
assertNull(value);
|
||||
|
||||
data = "<html><head><link rel=\"canonical\" href=\"http://example.com/\"></head><body>aaa</body></html>";
|
||||
document = getDocument(data);
|
||||
value = transformer.getCanonicalUrl(responseData, document);
|
||||
assertEquals("http://example.com/", value);
|
||||
|
||||
data =
|
||||
"<html><head><link rel=\"canonical\" href=\"http://example1.com/\"><link rel=\"canonical\" href=\"http://example2.com/\"></head><body>aaa</body></html>";
|
||||
document = getDocument(data);
|
||||
value = transformer.getCanonicalUrl(responseData, document);
|
||||
assertEquals("http://example1.com/", value);
|
||||
}
|
||||
|
||||
public void test_normalizeCanonicalUrl() throws Exception {
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
String value;
|
||||
|
|
Loading…
Add table
Reference in a new issue