fix #666 meta robots tag support
This commit is contained in:
parent
c9b8ad3024
commit
f96312da20
4 changed files with 162 additions and 0 deletions
|
@ -21,11 +21,13 @@ import java.io.BufferedInputStream;
|
|||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
|
@ -73,6 +75,14 @@ import org.xml.sax.InputSource;
|
|||
public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
|
||||
private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class);
|
||||
|
||||
// XPath selecting the content attribute of the robots meta tag. The name attribute is compared
// case-insensitively: XPath 1.0 has no lower-case(), so translate() folds A-Z to a-z. This also
// accepts mixed-case spellings such as "Robots", not only "robots" and "ROBOTS".
private static final String META_NAME_ROBOTS_CONTENT =
        "//META[translate(@name,\"ABCDEFGHIJKLMNOPQRSTUVWXYZ\",\"abcdefghijklmnopqrstuvwxyz\")=\"robots\"]/@content";
|
||||
|
||||
private static final String META_ROBOTS_NONE = "none";
|
||||
|
||||
private static final String META_ROBOTS_NOINDEX = "noindex";
|
||||
|
||||
private static final String META_ROBOTS_NOFOLLOW = "nofollow";
|
||||
|
||||
private static final int UTF8_BOM_SIZE = 3;
|
||||
|
||||
public boolean prunedContent = true;
|
||||
|
@ -119,6 +129,10 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
|
||||
final Document document = parser.getDocument();
|
||||
|
||||
if (!fessConfig.isCrawlerIgnoreMetaRobots()) {
|
||||
processMetaRobots(responseData, resultData, document);
|
||||
}
|
||||
|
||||
final Map<String, Object> dataMap = new LinkedHashMap<>();
|
||||
for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
|
||||
final String path = entry.getValue();
|
||||
|
@ -163,6 +177,43 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
resultData.setEncoding(charsetName);
|
||||
}
|
||||
|
||||
protected void processMetaRobots(final ResponseData responseData, final ResultData resultData, final Document document) {
|
||||
try {
|
||||
final Node value = getXPathAPI().selectSingleNode(document, META_NAME_ROBOTS_CONTENT);
|
||||
if (value != null) {
|
||||
final String content = value.getTextContent().toLowerCase(Locale.ROOT);
|
||||
boolean noindex = false;
|
||||
boolean nofollow = false;
|
||||
if (content.contains(META_ROBOTS_NONE)) {
|
||||
noindex = true;
|
||||
nofollow = true;
|
||||
} else {
|
||||
if (content.contains(META_ROBOTS_NOINDEX)) {
|
||||
noindex = true;
|
||||
}
|
||||
if (content.contains(META_ROBOTS_NOFOLLOW)) {
|
||||
nofollow = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (noindex && nofollow) {
|
||||
logger.info("META(robots=noindex,nofollow): " + responseData.getUrl());
|
||||
throw new ChildUrlsException(Collections.emptySet(), "#processMetaRobots(Document)");
|
||||
} else if (noindex) {
|
||||
logger.info("META(robots=noindex): " + responseData.getUrl());
|
||||
storeChildUrls(responseData, resultData);
|
||||
throw new ChildUrlsException(resultData.getChildUrlSet(), "#processMetaRobots(Document)");
|
||||
} else if (nofollow) {
|
||||
logger.info("META(robots=nofollow): " + responseData.getUrl());
|
||||
responseData.setNoFollow(true);
|
||||
}
|
||||
}
|
||||
} catch (TransformerException e) {
|
||||
logger.warn("Could not parse a value of " + META_NAME_ROBOTS_CONTENT);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
|
||||
// canonical
|
||||
if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCannonicalXpath())) {
|
||||
|
|
|
@ -161,6 +161,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. false */
|
||||
String CRAWLER_IGNORE_ROBOTS_TXT = "crawler.ignore.robots.txt";
|
||||
|
||||
/** The key of the configuration. e.g. false */
|
||||
String CRAWLER_IGNORE_META_ROBOTS = "crawler.ignore.meta.robots";
|
||||
|
||||
/** The key of the configuration. e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* */
|
||||
String CRAWLER_METADATA_CONTENT_EXCLUDES = "crawler.metadata.content.excludes";
|
||||
|
||||
|
@ -1446,6 +1449,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
*/
|
||||
boolean isCrawlerIgnoreRobotsTxt();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.ignore.meta.robots'. <br>
|
||||
* The value is, e.g. false <br>
|
||||
* @return The value of the found property. (NotNull: an exception is thrown if the property is not found, which basically never happens)
|
||||
*/
|
||||
String getCrawlerIgnoreMetaRobots();
|
||||
|
||||
/**
|
||||
* Is the property for the key 'crawler.ignore.meta.robots' true? <br>
|
||||
* The value is, e.g. false <br>
|
||||
* @return The determination, true or false. (An exception is thrown if the property is not found, which basically never happens)
|
||||
*/
|
||||
boolean isCrawlerIgnoreMetaRobots();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.metadata.content.excludes'. <br>
|
||||
* The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* <br>
|
||||
|
@ -4512,6 +4529,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return is(FessConfig.CRAWLER_IGNORE_ROBOTS_TXT);
|
||||
}
|
||||
|
||||
public String getCrawlerIgnoreMetaRobots() {
|
||||
return get(FessConfig.CRAWLER_IGNORE_META_ROBOTS);
|
||||
}
|
||||
|
||||
public boolean isCrawlerIgnoreMetaRobots() {
|
||||
return is(FessConfig.CRAWLER_IGNORE_META_ROBOTS);
|
||||
}
|
||||
|
||||
public String getCrawlerMetadataContentExcludes() {
|
||||
return get(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES);
|
||||
}
|
||||
|
|
|
@ -92,6 +92,7 @@ crawler.crawling.data.encoding=UTF-8
|
|||
crawler.web.protocols=http,https
|
||||
crawler.file.protocols=file,smb,ftp
|
||||
crawler.ignore.robots.txt=false
|
||||
crawler.ignore.meta.robots=false
|
||||
crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*
|
||||
crawler.metadata.name.mapping=\
|
||||
title=title:string\n\
|
||||
|
|
|
@ -32,6 +32,7 @@ import org.codelibs.core.misc.ValueHolder;
|
|||
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
|
||||
import org.codelibs.fess.crawler.entity.RequestData;
|
||||
import org.codelibs.fess.crawler.entity.ResponseData;
|
||||
import org.codelibs.fess.crawler.entity.ResultData;
|
||||
import org.codelibs.fess.crawler.exception.ChildUrlsException;
|
||||
import org.codelibs.fess.unit.UnitFessTestCase;
|
||||
import org.cyberneko.html.parsers.DOMParser;
|
||||
|
@ -123,6 +124,90 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
assertEquals("foo1<!--googleoff: index--><A href=\"index.html\"></A><!--googleon: index-->foo5", output);
|
||||
}
|
||||
|
||||
public void test_processMetaRobots_no() throws Exception {
|
||||
final String data = "<html><body>foo</body></html>";
|
||||
final Document document = getDocument(data);
|
||||
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
|
||||
final ResponseData responseData = new ResponseData();
|
||||
responseData.setUrl("http://example.com/");
|
||||
|
||||
transformer.processMetaRobots(responseData, new ResultData(), document);
|
||||
assertFalse(responseData.isNoFollow());
|
||||
}
|
||||
|
||||
public void test_processMetaRobots_none() throws Exception {
|
||||
final String data = "<meta name=\"robots\" content=\"none\" />";
|
||||
final Document document = getDocument(data);
|
||||
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
|
||||
final ResponseData responseData = new ResponseData();
|
||||
responseData.setUrl("http://example.com/");
|
||||
|
||||
try {
|
||||
transformer.processMetaRobots(responseData, new ResultData(), document);
|
||||
fail();
|
||||
} catch (ChildUrlsException e) {
|
||||
assertTrue(e.getChildUrlList().isEmpty());
|
||||
} catch (Exception e) {
|
||||
fail();
|
||||
}
|
||||
}
|
||||
|
||||
public void test_processMetaRobots_noindexnofollow() throws Exception {
|
||||
final String data = "<meta name=\"ROBOTS\" content=\"NOINDEX,NOFOLLOW\" />";
|
||||
final Document document = getDocument(data);
|
||||
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
|
||||
final ResponseData responseData = new ResponseData();
|
||||
responseData.setUrl("http://example.com/");
|
||||
|
||||
try {
|
||||
transformer.processMetaRobots(responseData, new ResultData(), document);
|
||||
fail();
|
||||
} catch (ChildUrlsException e) {
|
||||
assertTrue(e.getChildUrlList().isEmpty());
|
||||
} catch (Exception e) {
|
||||
fail();
|
||||
}
|
||||
}
|
||||
|
||||
public void test_processMetaRobots_noindex() throws Exception {
|
||||
final String data = "<meta name=\"robots\" content=\"noindex\" /><a href=\"index.html\">aaa</a>";
|
||||
final Document document = getDocument(data);
|
||||
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
|
||||
final ResponseData responseData = new ResponseData();
|
||||
responseData.setUrl("http://example.com/");
|
||||
responseData.setResponseBody(data.getBytes());
|
||||
|
||||
try {
|
||||
transformer.processMetaRobots(responseData, new ResultData(), document);
|
||||
fail();
|
||||
} catch (ChildUrlsException e) {
|
||||
assertTrue(e.getChildUrlList().isEmpty());
|
||||
} catch (Exception e) {
|
||||
fail();
|
||||
}
|
||||
}
|
||||
|
||||
public void test_processMetaRobots_nofollow() throws Exception {
|
||||
final String data = "<meta name=\"robots\" content=\"nofollow\" />";
|
||||
final Document document = getDocument(data);
|
||||
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
|
||||
final ResponseData responseData = new ResponseData();
|
||||
responseData.setUrl("http://example.com/");
|
||||
|
||||
transformer.processMetaRobots(responseData, new ResultData(), document);
|
||||
assertTrue(responseData.isNoFollow());
|
||||
}
|
||||
|
||||
private Document getDocument(final String data) throws Exception {
|
||||
final DOMParser parser = new DOMParser();
|
||||
final ByteArrayInputStream is = new ByteArrayInputStream(data.getBytes("UTF-8"));
|
||||
|
|
Loading…
Add table
Reference in a new issue