Fix #1268: add config.html.canonical.xpath and config.ignore.meta.robots

This commit is contained in:
Shinsuke Sugaya 2017-09-09 11:18:47 +09:00
parent c45bfa9c41
commit 5f126b9931

View file

@ -76,6 +76,10 @@ import org.xml.sax.InputSource;
public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class);
private static final String HTML_CANONICAL_XPATH = "html.canonical.xpath";
private static final String IGNORE_META_ROBOTS = "ignore.meta.robots";
private static final String META_NAME_THUMBNAIL_CONTENT = "//META[@name=\"thumbnail\" or @name=\"THUMBNAIL\"]/@content";
private static final String META_PROPERTY_OGIMAGE_CONTENT = "//META[@property=\"og:image\"]/@content";
@ -134,9 +138,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
final Document document = parser.getDocument();
if (!fessConfig.isCrawlerIgnoreMetaRobots()) {
processMetaRobots(responseData, resultData, document);
}
processMetaRobots(responseData, resultData, document);
final Map<String, Object> dataMap = new LinkedHashMap<>();
for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
@ -183,6 +185,18 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
}
protected void processMetaRobots(final ResponseData responseData, final ResultData resultData, final Document document) {
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
final Map<String, String> configMap = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
String ignore = configMap.get(IGNORE_META_ROBOTS);
if (ignore == null) {
if (fessConfig.isCrawlerIgnoreMetaRobots()) {
return;
}
} else if (Boolean.parseBoolean(ignore)) {
return;
}
try {
final Node value = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT);
if (value != null) {
@ -258,16 +272,14 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
// canonical
if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
&& isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
final Set<RequestData> childUrlSet = new HashSet<>();
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);
throw new ChildUrlsException(childUrlSet, this.getClass().getName()
+ "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
}
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
&& isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
final Set<RequestData> childUrlSet = new HashSet<>();
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);
throw new ChildUrlsException(childUrlSet, this.getClass().getName()
+ "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
}
final FessConfig fessConfig = ComponentUtil.getFessConfig();
@ -452,7 +464,17 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
}
protected String getCanonicalUrl(final ResponseData responseData, final Document document) {
final String canonicalUrl = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlCanonicalXpath(), false);
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
final Map<String, String> configMap = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
String xpath = configMap.get(HTML_CANONICAL_XPATH);
if (xpath == null) {
xpath = fessConfig.getCrawlerDocumentHtmlCanonicalXpath();
}
if (StringUtil.isBlank(xpath)) {
return null;
}
final String canonicalUrl = getSingleNodeValue(document, xpath, false);
if (StringUtil.isBlank(canonicalUrl)) {
return null;
}