|
@@ -74,23 +74,29 @@ import org.w3c.dom.NodeList;
|
|
|
import org.xml.sax.InputSource;
|
|
|
|
|
|
public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
|
|
|
+
|
|
|
private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class);
|
|
|
|
|
|
+ private static final String X_ROBOTS_TAG = "X-Robots-Tag";
|
|
|
+
|
|
|
private static final String HTML_CANONICAL_XPATH = "html.canonical.xpath";
|
|
|
|
|
|
+ @Deprecated
|
|
|
private static final String IGNORE_META_ROBOTS = "ignore.meta.robots";
|
|
|
|
|
|
+ private static final String IGNORE_ROBOTS_TAGS = "ignore.robots.tags";
|
|
|
+
|
|
|
private static final String META_NAME_THUMBNAIL_CONTENT = "//META[@name=\"thumbnail\" or @name=\"THUMBNAIL\"]/@content";
|
|
|
|
|
|
private static final String META_PROPERTY_OGIMAGE_CONTENT = "//META[@property=\"og:image\"]/@content";
|
|
|
|
|
|
private static final String META_NAME_ROBOTS_CONTENT = "//META[@name=\"robots\" or @name=\"ROBOTS\"]/@content";
|
|
|
|
|
|
- private static final String META_ROBOTS_NONE = "none";
|
|
|
+ private static final String ROBOTS_TAG_NONE = "none";
|
|
|
|
|
|
- private static final String META_ROBOTS_NOINDEX = "noindex";
|
|
|
+ private static final String ROBOTS_TAG_NOINDEX = "noindex";
|
|
|
|
|
|
- private static final String META_ROBOTS_NOFOLLOW = "nofollow";
|
|
|
+ private static final String ROBOTS_TAG_NOFOLLOW = "nofollow";
|
|
|
|
|
|
private static final int UTF8_BOM_SIZE = 3;
|
|
|
|
|
@@ -139,6 +145,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|
|
final Document document = parser.getDocument();
|
|
|
|
|
|
processMetaRobots(responseData, resultData, document);
|
|
|
+ processXRobotsTag(responseData, resultData);
|
|
|
|
|
|
final Map<String, Object> dataMap = new LinkedHashMap<>();
|
|
|
for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
|
|
@@ -195,40 +202,45 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|
|
|
|
|
protected void processMetaRobots(final ResponseData responseData, final ResultData resultData, final Document document) {
|
|
|
final Map<String, String> configMap = getConfigPrameterMap(responseData, ConfigName.CONFIG);
|
|
|
- String ignore = configMap.get(IGNORE_META_ROBOTS);
|
|
|
+ String ignore = configMap.get(IGNORE_ROBOTS_TAGS);
|
|
|
if (ignore == null) {
|
|
|
- if (fessConfig.isCrawlerIgnoreMetaRobots()) {
|
|
|
+ ignore = configMap.get(IGNORE_META_ROBOTS);
|
|
|
+ if (ignore == null) {
|
|
|
+ if (fessConfig.isCrawlerIgnoreRobotsTags()) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ } else if (Boolean.parseBoolean(ignore)) {
|
|
|
return;
|
|
|
}
|
|
|
} else if (Boolean.parseBoolean(ignore)) {
|
|
|
return;
|
|
|
}
|
|
|
|
|
|
+ // meta tag
|
|
|
try {
|
|
|
final Node value = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT);
|
|
|
if (value != null) {
|
|
|
- final String content = value.getTextContent().toLowerCase(Locale.ROOT);
|
|
|
boolean noindex = false;
|
|
|
boolean nofollow = false;
|
|
|
- if (content.contains(META_ROBOTS_NONE)) {
|
|
|
+ final String content = value.getTextContent().toLowerCase(Locale.ROOT);
|
|
|
+ if (content.contains(ROBOTS_TAG_NONE)) {
|
|
|
noindex = true;
|
|
|
nofollow = true;
|
|
|
} else {
|
|
|
- if (content.contains(META_ROBOTS_NOINDEX)) {
|
|
|
+ if (content.contains(ROBOTS_TAG_NOINDEX)) {
|
|
|
noindex = true;
|
|
|
}
|
|
|
- if (content.contains(META_ROBOTS_NOFOLLOW)) {
|
|
|
+ if (content.contains(ROBOTS_TAG_NOFOLLOW)) {
|
|
|
nofollow = true;
|
|
|
}
|
|
|
}
|
|
|
-
|
|
|
if (noindex && nofollow) {
|
|
|
logger.info("META(robots=noindex,nofollow): " + responseData.getUrl());
|
|
|
- throw new ChildUrlsException(Collections.emptySet(), "#processMetaRobots(Document)");
|
|
|
+ throw new ChildUrlsException(Collections.emptySet(), "#processMetaRobots");
|
|
|
} else if (noindex) {
|
|
|
logger.info("META(robots=noindex): " + responseData.getUrl());
|
|
|
storeChildUrls(responseData, resultData);
|
|
|
- throw new ChildUrlsException(resultData.getChildUrlSet(), "#processMetaRobots(Document)");
|
|
|
+ throw new ChildUrlsException(resultData.getChildUrlSet(), "#processMetaRobots");
|
|
|
} else if (nofollow) {
|
|
|
logger.info("META(robots=nofollow): " + responseData.getUrl());
|
|
|
responseData.setNoFollow(true);
|
|
@@ -240,6 +252,48 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|
|
|
|
|
}
|
|
|
|
|
|
+ protected void processXRobotsTag(final ResponseData responseData, final ResultData resultData) {
|
|
|
+ final Map<String, String> configMap = getConfigPrameterMap(responseData, ConfigName.CONFIG);
|
|
|
+ String ignore = configMap.get(IGNORE_ROBOTS_TAGS);
|
|
|
+ if (ignore == null) {
|
|
|
+ if (fessConfig.isCrawlerIgnoreRobotsTags()) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+ } else if (Boolean.parseBoolean(ignore)) {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ // X-Robots-Tag
|
|
|
+ responseData.getMetaDataMap().entrySet().stream().filter(e -> e.getKey().equalsIgnoreCase(X_ROBOTS_TAG) && e.getValue() != null)
|
|
|
+ .forEach(e -> {
|
|
|
+ boolean noindex = false;
|
|
|
+ boolean nofollow = false;
|
|
|
+ final String value = e.getValue().toString().toLowerCase(Locale.ROOT);
|
|
|
+ if (value.contains(ROBOTS_TAG_NONE)) {
|
|
|
+ noindex = true;
|
|
|
+ nofollow = true;
|
|
|
+ } else {
|
|
|
+ if (value.contains(ROBOTS_TAG_NOINDEX)) {
|
|
|
+ noindex = true;
|
|
|
+ }
|
|
|
+ if (value.contains(ROBOTS_TAG_NOFOLLOW)) {
|
|
|
+ nofollow = true;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if (noindex && nofollow) {
|
|
|
+ logger.info("HEADER(robots=noindex,nofollow): " + responseData.getUrl());
|
|
|
+ throw new ChildUrlsException(Collections.emptySet(), "#processXRobotsTag");
|
|
|
+ } else if (noindex) {
|
|
|
+ logger.info("HEADER(robots=noindex): " + responseData.getUrl());
|
|
|
+ storeChildUrls(responseData, resultData);
|
|
|
+ throw new ChildUrlsException(resultData.getChildUrlSet(), "#processXRobotsTag");
|
|
|
+ } else if (nofollow) {
|
|
|
+ logger.info("HEADER(robots=nofollow): " + responseData.getUrl());
|
|
|
+ responseData.setNoFollow(true);
|
|
|
+ }
|
|
|
+ });
|
|
|
+ }
|
|
|
+
|
|
|
protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
|
|
|
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
|
|
|
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
|
|
@@ -292,8 +346,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|
|
final Set<RequestData> childUrlSet = new HashSet<>();
|
|
|
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
|
|
|
logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);
|
|
|
- throw new ChildUrlsException(childUrlSet, this.getClass().getName()
|
|
|
- + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
|
|
|
+ throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData");
|
|
|
}
|
|
|
|
|
|
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|