fix #907 X-Robots-Tag support
This commit is contained in:
parent
f59290f336
commit
aed6b735d6
5 changed files with 190 additions and 31 deletions
|
@ -74,23 +74,29 @@ import org.w3c.dom.NodeList;
|
|||
import org.xml.sax.InputSource;
|
||||
|
||||
public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class);
|
||||
|
||||
private static final String X_ROBOTS_TAG = "X-Robots-Tag";
|
||||
|
||||
private static final String HTML_CANONICAL_XPATH = "html.canonical.xpath";
|
||||
|
||||
@Deprecated
|
||||
private static final String IGNORE_META_ROBOTS = "ignore.meta.robots";
|
||||
|
||||
private static final String IGNORE_ROBOTS_TAGS = "ignore.robots.tags";
|
||||
|
||||
private static final String META_NAME_THUMBNAIL_CONTENT = "//META[@name=\"thumbnail\" or @name=\"THUMBNAIL\"]/@content";
|
||||
|
||||
private static final String META_PROPERTY_OGIMAGE_CONTENT = "//META[@property=\"og:image\"]/@content";
|
||||
|
||||
private static final String META_NAME_ROBOTS_CONTENT = "//META[@name=\"robots\" or @name=\"ROBOTS\"]/@content";
|
||||
|
||||
private static final String META_ROBOTS_NONE = "none";
|
||||
private static final String ROBOTS_TAG_NONE = "none";
|
||||
|
||||
private static final String META_ROBOTS_NOINDEX = "noindex";
|
||||
private static final String ROBOTS_TAG_NOINDEX = "noindex";
|
||||
|
||||
private static final String META_ROBOTS_NOFOLLOW = "nofollow";
|
||||
private static final String ROBOTS_TAG_NOFOLLOW = "nofollow";
|
||||
|
||||
private static final int UTF8_BOM_SIZE = 3;
|
||||
|
||||
|
@ -139,6 +145,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
final Document document = parser.getDocument();
|
||||
|
||||
processMetaRobots(responseData, resultData, document);
|
||||
processXRobotsTag(responseData, resultData);
|
||||
|
||||
final Map<String, Object> dataMap = new LinkedHashMap<>();
|
||||
for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
|
||||
|
@ -195,40 +202,45 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
|
||||
protected void processMetaRobots(final ResponseData responseData, final ResultData resultData, final Document document) {
|
||||
final Map<String, String> configMap = getConfigPrameterMap(responseData, ConfigName.CONFIG);
|
||||
String ignore = configMap.get(IGNORE_META_ROBOTS);
|
||||
String ignore = configMap.get(IGNORE_ROBOTS_TAGS);
|
||||
if (ignore == null) {
|
||||
if (fessConfig.isCrawlerIgnoreMetaRobots()) {
|
||||
ignore = configMap.get(IGNORE_META_ROBOTS);
|
||||
if (ignore == null) {
|
||||
if (fessConfig.isCrawlerIgnoreRobotsTags()) {
|
||||
return;
|
||||
}
|
||||
} else if (Boolean.parseBoolean(ignore)) {
|
||||
return;
|
||||
}
|
||||
} else if (Boolean.parseBoolean(ignore)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// meta tag
|
||||
try {
|
||||
final Node value = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT);
|
||||
if (value != null) {
|
||||
final String content = value.getTextContent().toLowerCase(Locale.ROOT);
|
||||
boolean noindex = false;
|
||||
boolean nofollow = false;
|
||||
if (content.contains(META_ROBOTS_NONE)) {
|
||||
final String content = value.getTextContent().toLowerCase(Locale.ROOT);
|
||||
if (content.contains(ROBOTS_TAG_NONE)) {
|
||||
noindex = true;
|
||||
nofollow = true;
|
||||
} else {
|
||||
if (content.contains(META_ROBOTS_NOINDEX)) {
|
||||
if (content.contains(ROBOTS_TAG_NOINDEX)) {
|
||||
noindex = true;
|
||||
}
|
||||
if (content.contains(META_ROBOTS_NOFOLLOW)) {
|
||||
if (content.contains(ROBOTS_TAG_NOFOLLOW)) {
|
||||
nofollow = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (noindex && nofollow) {
|
||||
logger.info("META(robots=noindex,nofollow): " + responseData.getUrl());
|
||||
throw new ChildUrlsException(Collections.emptySet(), "#processMetaRobots(Document)");
|
||||
throw new ChildUrlsException(Collections.emptySet(), "#processMetaRobots");
|
||||
} else if (noindex) {
|
||||
logger.info("META(robots=noindex): " + responseData.getUrl());
|
||||
storeChildUrls(responseData, resultData);
|
||||
throw new ChildUrlsException(resultData.getChildUrlSet(), "#processMetaRobots(Document)");
|
||||
throw new ChildUrlsException(resultData.getChildUrlSet(), "#processMetaRobots");
|
||||
} else if (nofollow) {
|
||||
logger.info("META(robots=nofollow): " + responseData.getUrl());
|
||||
responseData.setNoFollow(true);
|
||||
|
@ -240,6 +252,48 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
|
||||
}
|
||||
|
||||
protected void processXRobotsTag(final ResponseData responseData, final ResultData resultData) {
|
||||
final Map<String, String> configMap = getConfigPrameterMap(responseData, ConfigName.CONFIG);
|
||||
String ignore = configMap.get(IGNORE_ROBOTS_TAGS);
|
||||
if (ignore == null) {
|
||||
if (fessConfig.isCrawlerIgnoreRobotsTags()) {
|
||||
return;
|
||||
}
|
||||
} else if (Boolean.parseBoolean(ignore)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// X-Robots-Tag
|
||||
responseData.getMetaDataMap().entrySet().stream().filter(e -> e.getKey().equalsIgnoreCase(X_ROBOTS_TAG) && e.getValue() != null)
|
||||
.forEach(e -> {
|
||||
boolean noindex = false;
|
||||
boolean nofollow = false;
|
||||
final String value = e.getValue().toString().toLowerCase(Locale.ROOT);
|
||||
if (value.contains(ROBOTS_TAG_NONE)) {
|
||||
noindex = true;
|
||||
nofollow = true;
|
||||
} else {
|
||||
if (value.contains(ROBOTS_TAG_NOINDEX)) {
|
||||
noindex = true;
|
||||
}
|
||||
if (value.contains(ROBOTS_TAG_NOFOLLOW)) {
|
||||
nofollow = true;
|
||||
}
|
||||
}
|
||||
if (noindex && nofollow) {
|
||||
logger.info("HEADER(robots=noindex,nofollow): " + responseData.getUrl());
|
||||
throw new ChildUrlsException(Collections.emptySet(), "#processXRobotsTag");
|
||||
} else if (noindex) {
|
||||
logger.info("HEADER(robots=noindex): " + responseData.getUrl());
|
||||
storeChildUrls(responseData, resultData);
|
||||
throw new ChildUrlsException(resultData.getChildUrlSet(), "#processXRobotsTag");
|
||||
} else if (nofollow) {
|
||||
logger.info("HEADER(robots=nofollow): " + responseData.getUrl());
|
||||
responseData.setNoFollow(true);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
|
||||
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
|
||||
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
|
||||
|
@ -292,8 +346,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
final Set<RequestData> childUrlSet = new HashSet<>();
|
||||
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
|
||||
logger.info("CANONICAL: " + responseData.getUrl() + " -> " + canonicalUrl);
|
||||
throw new ChildUrlsException(childUrlSet, this.getClass().getName()
|
||||
+ "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
|
||||
throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData");
|
||||
}
|
||||
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
|
|
|
@ -149,7 +149,7 @@ public class DocumentHelper {
|
|||
if (responseData.getRedirectLocation() != null) {
|
||||
final Set<RequestData> childUrlList = new HashSet<>();
|
||||
childUrlList.add(RequestDataBuilder.newRequestData().get().url(responseData.getRedirectLocation()).build());
|
||||
throw new ChildUrlsException(childUrlList, "Redirected from " + url);
|
||||
throw new ChildUrlsException(childUrlList, this.getClass().getName() + "#RedirectedFrom:" + url);
|
||||
}
|
||||
responseData.setExecutionTime(System.currentTimeMillis() - startTime);
|
||||
responseData.setSessionId(crawlingInfoId);
|
||||
|
|
|
@ -215,7 +215,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
String CRAWLER_IGNORE_ROBOTS_TXT = "crawler.ignore.robots.txt";
|
||||
|
||||
/** The key of the configuration. e.g. false */
|
||||
String CRAWLER_IGNORE_META_ROBOTS = "crawler.ignore.meta.robots";
|
||||
String CRAWLER_IGNORE_ROBOTS_TAGS = "crawler.ignore.robots.tags";
|
||||
|
||||
/** The key of the configuration. e.g. true */
|
||||
String CRAWLER_IGNORE_CONTENT_EXCEPTION = "crawler.ignore.content.exception";
|
||||
|
@ -1869,18 +1869,18 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
boolean isCrawlerIgnoreRobotsTxt();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.ignore.meta.robots'. <br>
|
||||
* Get the value for the key 'crawler.ignore.robots.tags'. <br>
|
||||
* The value is, e.g. false <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerIgnoreMetaRobots();
|
||||
String getCrawlerIgnoreRobotsTags();
|
||||
|
||||
/**
|
||||
* Is the property for the key 'crawler.ignore.meta.robots' true? <br>
|
||||
* Is the property for the key 'crawler.ignore.robots.tags' true? <br>
|
||||
* The value is, e.g. false <br>
|
||||
* @return The determination, true or false. (if not found, exception but basically no way)
|
||||
*/
|
||||
boolean isCrawlerIgnoreMetaRobots();
|
||||
boolean isCrawlerIgnoreRobotsTags();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.ignore.content.exception'. <br>
|
||||
|
@ -5808,12 +5808,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return is(FessConfig.CRAWLER_IGNORE_ROBOTS_TXT);
|
||||
}
|
||||
|
||||
public String getCrawlerIgnoreMetaRobots() {
|
||||
return get(FessConfig.CRAWLER_IGNORE_META_ROBOTS);
|
||||
public String getCrawlerIgnoreRobotsTags() {
|
||||
return get(FessConfig.CRAWLER_IGNORE_ROBOTS_TAGS);
|
||||
}
|
||||
|
||||
public boolean isCrawlerIgnoreMetaRobots() {
|
||||
return is(FessConfig.CRAWLER_IGNORE_META_ROBOTS);
|
||||
public boolean isCrawlerIgnoreRobotsTags() {
|
||||
return is(FessConfig.CRAWLER_IGNORE_ROBOTS_TAGS);
|
||||
}
|
||||
|
||||
public String getCrawlerIgnoreContentException() {
|
||||
|
@ -7833,7 +7833,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
defaultMap.put(FessConfig.CRAWLER_WEB_PROTOCOLS, "http,https");
|
||||
defaultMap.put(FessConfig.CRAWLER_FILE_PROTOCOLS, "file,smb,ftp");
|
||||
defaultMap.put(FessConfig.CRAWLER_IGNORE_ROBOTS_TXT, "false");
|
||||
defaultMap.put(FessConfig.CRAWLER_IGNORE_META_ROBOTS, "false");
|
||||
defaultMap.put(FessConfig.CRAWLER_IGNORE_ROBOTS_TAGS, "false");
|
||||
defaultMap.put(FessConfig.CRAWLER_IGNORE_CONTENT_EXCEPTION, "true");
|
||||
defaultMap.put(FessConfig.CRAWLER_FAILURE_URL_STATUS_CODES, "404");
|
||||
defaultMap.put(FessConfig.CRAWLER_SYSTEM_MONITOR_INTERVAL, "60");
|
||||
|
|
|
@ -126,7 +126,7 @@ crawler.crawling.data.encoding=UTF-8
|
|||
crawler.web.protocols=http,https
|
||||
crawler.file.protocols=file,smb,ftp
|
||||
crawler.ignore.robots.txt=false
|
||||
crawler.ignore.meta.robots=false
|
||||
crawler.ignore.robots.tags=false
|
||||
crawler.ignore.content.exception=true
|
||||
crawler.failure.url.status.codes=404
|
||||
crawler.system.monitor.interval=60
|
||||
|
|
|
@ -267,6 +267,112 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
assertEquals("foo1<!--googleoff: index--><A href=\"index.html\"></A><!--googleon: index-->foo5", output);
|
||||
}
|
||||
|
||||
public void test_processXRobotsTags_no() throws Exception {
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer() {
|
||||
@Override
|
||||
protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
};
|
||||
transformer.fessConfig = new FessConfig.SimpleImpl() {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public boolean isCrawlerIgnoreRobotsTags() {
|
||||
return false;
|
||||
};
|
||||
};
|
||||
|
||||
final ResponseData responseData = new ResponseData();
|
||||
responseData.setUrl("http://example.com/");
|
||||
|
||||
transformer.processXRobotsTag(responseData, new ResultData());
|
||||
assertFalse(responseData.isNoFollow());
|
||||
}
|
||||
|
||||
public void test_processXRobotsTag_noindexnofollow() throws Exception {
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer() {
|
||||
protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
};
|
||||
transformer.fessConfig = new FessConfig.SimpleImpl() {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public boolean isCrawlerIgnoreRobotsTags() {
|
||||
return false;
|
||||
};
|
||||
};
|
||||
|
||||
final ResponseData responseData = new ResponseData();
|
||||
responseData.setUrl("http://example.com/");
|
||||
responseData.addMetaData("X-Robots-Tag", "noindex,nofollow");
|
||||
|
||||
try {
|
||||
transformer.processXRobotsTag(responseData, new ResultData());
|
||||
fail();
|
||||
} catch (ChildUrlsException e) {
|
||||
assertTrue(e.getChildUrlList().isEmpty());
|
||||
} catch (Exception e) {
|
||||
fail();
|
||||
}
|
||||
}
|
||||
|
||||
public void test_processXRobotsTag_noindex() throws Exception {
|
||||
final String data = "<meta name=\"robots\" content=\"noindex\" /><a href=\"index.html\">aaa</a>";
|
||||
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer() {
|
||||
protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
};
|
||||
transformer.fessConfig = new FessConfig.SimpleImpl() {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public boolean isCrawlerIgnoreRobotsTags() {
|
||||
return false;
|
||||
};
|
||||
};
|
||||
|
||||
final ResponseData responseData = new ResponseData();
|
||||
responseData.setUrl("http://example.com/");
|
||||
responseData.setResponseBody(data.getBytes());
|
||||
responseData.addMetaData("X-Robots-Tag", "noindex");
|
||||
|
||||
try {
|
||||
transformer.processXRobotsTag(responseData, new ResultData());
|
||||
fail();
|
||||
} catch (ChildUrlsException e) {
|
||||
assertTrue(e.getChildUrlList().isEmpty());
|
||||
} catch (Exception e) {
|
||||
fail();
|
||||
}
|
||||
}
|
||||
|
||||
public void test_processXRobotsTag_nofollow() throws Exception {
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer() {
|
||||
protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
|
||||
return Collections.emptyMap();
|
||||
}
|
||||
};
|
||||
transformer.fessConfig = new FessConfig.SimpleImpl() {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public boolean isCrawlerIgnoreRobotsTags() {
|
||||
return false;
|
||||
};
|
||||
};
|
||||
|
||||
final ResponseData responseData = new ResponseData();
|
||||
responseData.addMetaData("X-Robots-Tag", "nofollow");
|
||||
|
||||
transformer.processXRobotsTag(responseData, new ResultData());
|
||||
assertTrue(responseData.isNoFollow());
|
||||
}
|
||||
|
||||
public void test_processMetaRobots_no() throws Exception {
|
||||
final String data = "<html><body>foo</body></html>";
|
||||
final Document document = getDocument(data);
|
||||
|
@ -281,7 +387,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public boolean isCrawlerIgnoreMetaRobots() {
|
||||
public boolean isCrawlerIgnoreRobotsTags() {
|
||||
return false;
|
||||
};
|
||||
};
|
||||
|
@ -306,7 +412,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public boolean isCrawlerIgnoreMetaRobots() {
|
||||
public boolean isCrawlerIgnoreRobotsTags() {
|
||||
return false;
|
||||
};
|
||||
};
|
||||
|
@ -337,7 +443,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public boolean isCrawlerIgnoreMetaRobots() {
|
||||
public boolean isCrawlerIgnoreRobotsTags() {
|
||||
return false;
|
||||
};
|
||||
};
|
||||
|
@ -368,7 +474,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public boolean isCrawlerIgnoreMetaRobots() {
|
||||
public boolean isCrawlerIgnoreRobotsTags() {
|
||||
return false;
|
||||
};
|
||||
};
|
||||
|
@ -400,7 +506,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public boolean isCrawlerIgnoreMetaRobots() {
|
||||
public boolean isCrawlerIgnoreRobotsTags() {
|
||||
return false;
|
||||
};
|
||||
};
|
||||
|
|
Loading…
Add table
Reference in a new issue