fix #1437 add crawler.document.html.default.lang

This commit is contained in:
Shinsuke Sugaya 2018-01-13 21:49:28 +09:00
parent 6dd9d20e6b
commit db97ecba9b
4 changed files with 38 additions and 1 deletion

View file

@ -333,7 +333,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
}
// lang
final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
final String lang = systemHelper.normalizeHtmlLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
if (lang != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
}

View file

@ -263,6 +263,15 @@ public class SystemHelper {
return StringUtils.abbreviate(str, ComponentUtil.getFessConfig().getMaxLogOutputLengthAsInteger().intValue());
}
/**
 * Determines the language value to use for an HTML document.
 * <p>
 * If the 'crawler.document.html.default.lang' setting is not blank, that configured
 * value is returned exactly as configured and {@code value} is ignored. Only when the
 * setting is blank is {@code value} handed to {@link #normalizeLang(String)}.
 *
 * @param value the language string found for the document (may be null or blank)
 * @return the configured default language when one is set; otherwise the result of
 *         {@link #normalizeLang(String)} for {@code value}
 */
public String normalizeHtmlLang(final String value) {
    final String configuredLang = ComponentUtil.getFessConfig().getCrawlerDocumentHtmlDefaultLang();
    // A configured language always wins over whatever the document declares.
    return StringUtil.isNotBlank(configuredLang) ? configuredLang : normalizeLang(value);
}
public String normalizeLang(final String value) {
if (StringUtil.isBlank(value)) {
return null;

View file

@ -246,6 +246,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. 120 */
String CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH = "crawler.document.html.max.digest.length";
/** The key of the configuration. e.g. (empty string by default) */
String CRAWLER_DOCUMENT_HTML_DEFAULT_LANG = "crawler.document.html.default.lang";
/** The key of the configuration. e.g. (empty string by default) */
String CRAWLER_DOCUMENT_FILE_NAME_ENCODING = "crawler.document.file.name.encoding";
@ -1948,6 +1951,21 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
*/
Integer getCrawlerDocumentHtmlMaxDigestLengthAsInteger();
/**
 * Get the value for the key 'crawler.document.html.default.lang'. <br>
 * The value is, e.g. (empty string by default) <br>
 * NOTE(review): judging by the key name this is a language code applied to crawled
 * HTML documents; how a non-blank value is used is up to the caller.
 * @return The value of found property. (NotNull: if not found, exception but basically no way)
 */
String getCrawlerDocumentHtmlDefaultLang();
/**
 * Get the value for the key 'crawler.document.html.default.lang' as {@link Integer}. <br>
 * The value is, e.g. (empty string by default) <br>
 * NOTE(review): this key appears to hold a language code rather than a number, so
 * {@link #getCrawlerDocumentHtmlDefaultLang()} is presumably the accessor callers want;
 * confirm whether this Integer variant is needed.
 * @return The value of found property. (NotNull: if not found, exception but basically no way)
 * @throws NumberFormatException When the property is not integer.
 */
Integer getCrawlerDocumentHtmlDefaultLangAsInteger();
/**
* Get the value for the key 'crawler.document.file.name.encoding'. <br>
* The value is, e.g. <br>
@ -5777,6 +5795,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH);
}
/** Looks up the string stored under the 'crawler.document.html.default.lang' key. */
public String getCrawlerDocumentHtmlDefaultLang() {
    final String propertyKey = FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_LANG;
    return get(propertyKey);
}
/** Looks up the 'crawler.document.html.default.lang' key through the Integer accessor. */
public Integer getCrawlerDocumentHtmlDefaultLangAsInteger() {
    final String propertyKey = FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_LANG;
    return getAsInteger(propertyKey);
}
/** Looks up the string stored under the 'crawler.document.file.name.encoding' key. */
public String getCrawlerDocumentFileNameEncoding() {
    final String propertyKey = FessConfig.CRAWLER_DOCUMENT_FILE_NAME_ENCODING;
    return get(propertyKey);
}
@ -7724,6 +7750,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_CANONICAL_XPATH, "//LINK[@rel='canonical']/@href");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_PRUNED_TAGS, "noscript,script,style,header,footer,nav,a[rel=nofollow]");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH, "120");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_LANG, "");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_NAME_ENCODING, "");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL, "No title.");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT, "false");

View file

@ -140,6 +140,7 @@ crawler.document.html.digest.xpath=//META[@name='description']/@content
crawler.document.html.canonical.xpath=//LINK[@rel='canonical']/@href
crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav,a[rel=nofollow]
crawler.document.html.max.digest.length=120
crawler.document.html.default.lang=
# file
crawler.document.file.name.encoding=