fix #1268 add config.html.canonical.xpath/config.ignore.meta.robots
This commit is contained in:
parent
c45bfa9c41
commit
5f126b9931
1 changed files with 36 additions and 14 deletions
|
@ -76,6 +76,10 @@ import org.xml.sax.InputSource;
|
|||
public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
|
||||
private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class);
|
||||
|
||||
private static final String HTML_CANONICAL_XPATH = "html.canonical.xpath";
|
||||
|
||||
private static final String IGNORE_META_ROBOTS = "ignore.meta.robots";
|
||||
|
||||
private static final String META_NAME_THUMBNAIL_CONTENT = "//META[@name=\"thumbnail\" or @name=\"THUMBNAIL\"]/@content";
|
||||
|
||||
private static final String META_PROPERTY_OGIMAGE_CONTENT = "//META[@property=\"og:image\"]/@content";
|
||||
|
@ -134,9 +138,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
|
||||
final Document document = parser.getDocument();
|
||||
|
||||
if (!fessConfig.isCrawlerIgnoreMetaRobots()) {
|
||||
processMetaRobots(responseData, resultData, document);
|
||||
}
|
||||
processMetaRobots(responseData, resultData, document);
|
||||
|
||||
final Map<String, Object> dataMap = new LinkedHashMap<>();
|
||||
for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
|
||||
|
@ -183,6 +185,18 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
}
|
||||
|
||||
protected void processMetaRobots(final ResponseData responseData, final ResultData resultData, final Document document) {
|
||||
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
|
||||
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
|
||||
final Map<String, String> configMap = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
|
||||
String ignore = configMap.get(IGNORE_META_ROBOTS);
|
||||
if (ignore == null) {
|
||||
if (fessConfig.isCrawlerIgnoreMetaRobots()) {
|
||||
return;
|
||||
}
|
||||
} else if (Boolean.parseBoolean(ignore)) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
final Node value = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT);
|
||||
if (value != null) {
|
||||
|
@ -258,16 +272,14 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
|
||||
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
|
||||
// canonical
|
||||
if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
|
||||
final String canonicalUrl = getCanonicalUrl(responseData, document);
|
||||
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
|
||||
&& isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
|
||||
final Set<RequestData> childUrlSet = new HashSet<>();
|
||||
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
|
||||
logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);
|
||||
throw new ChildUrlsException(childUrlSet, this.getClass().getName()
|
||||
+ "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
|
||||
}
|
||||
final String canonicalUrl = getCanonicalUrl(responseData, document);
|
||||
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl)
|
||||
&& isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
|
||||
final Set<RequestData> childUrlSet = new HashSet<>();
|
||||
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
|
||||
logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);
|
||||
throw new ChildUrlsException(childUrlSet, this.getClass().getName()
|
||||
+ "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
|
||||
}
|
||||
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
|
@ -452,7 +464,17 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
}
|
||||
|
||||
protected String getCanonicalUrl(final ResponseData responseData, final Document document) {
|
||||
final String canonicalUrl = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlCanonicalXpath(), false);
|
||||
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
|
||||
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
|
||||
final Map<String, String> configMap = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
|
||||
String xpath = configMap.get(HTML_CANONICAL_XPATH);
|
||||
if (xpath == null) {
|
||||
xpath = fessConfig.getCrawlerDocumentHtmlCanonicalXpath();
|
||||
}
|
||||
if (StringUtil.isBlank(xpath)) {
|
||||
return null;
|
||||
}
|
||||
final String canonicalUrl = getSingleNodeValue(document, xpath, false);
|
||||
if (StringUtil.isBlank(canonicalUrl)) {
|
||||
return null;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue