fix #860 add crawler.ignore.content.exception

This commit is contained in:
Shinsuke Sugaya 2017-02-02 17:18:33 +09:00
parent 263fe14eaf
commit a5fe68c333
3 changed files with 38 additions and 1 deletions

View file

@ -103,7 +103,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
final Map<String, Object> metaDataMap = new HashMap<>();
String content;
try (final InputStream in = responseData.getResponseBody()) {
final ExtractData extractData = extractor.getText(in, params);
final ExtractData extractData = getExtractData(extractor, in, params);
content = extractData.getContent();
if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
return null;
@ -330,6 +330,17 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
return dataMap;
}
/**
 * Extracts text from the given stream, optionally tolerating extraction failures.
 *
 * @param extractor the extractor asked to read the stream
 * @param in the stream holding the content to extract
 * @param params the parameters handed to the extractor unchanged
 * @return the extractor's result, or a newly created empty {@code ExtractData} when the
 *         extractor raised a {@link RuntimeException} and
 *         {@code crawler.ignore.content.exception} is enabled
 * @throws RuntimeException the extractor's own failure, rethrown as-is when
 *         {@code crawler.ignore.content.exception} is disabled
 */
private ExtractData getExtractData(final Extractor extractor, final InputStream in, final Map<String, String> params) {
    try {
        return extractor.getText(in, params);
    } catch (final RuntimeException cause) {
        if (fessConfig.isCrawlerIgnoreContentException()) {
            // Failure is tolerated by configuration: hand back an empty result instead.
            // NOTE(review): the dropped exception is not logged here; consider logging it
            // once a logger is confirmed to be available in this class.
            return new ExtractData();
        }
        throw cause;
    }
}
private String getResourceName(final ResponseData responseData) {
String name = responseData.getUrl();
final String enc = responseData.getCharSet();

View file

@ -166,6 +166,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration 'crawler.ignore.meta.robots'. e.g. false */
String CRAWLER_IGNORE_META_ROBOTS = "crawler.ignore.meta.robots";
/** The key of the configuration 'crawler.ignore.content.exception' (read via getCrawlerIgnoreContentException() / isCrawlerIgnoreContentException()). e.g. true */
String CRAWLER_IGNORE_CONTENT_EXCEPTION = "crawler.ignore.content.exception";
/** The key of the configuration 'crawler.metadata.content.excludes' (read via getCrawlerMetadataContentExcludes()). e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* */
String CRAWLER_METADATA_CONTENT_EXCLUDES = "crawler.metadata.content.excludes";
@ -1497,6 +1500,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
*/
boolean isCrawlerIgnoreMetaRobots();
/**
 * Get the value for the key 'crawler.ignore.content.exception' as a string. <br>
 * The value is, e.g. true <br>
 * Use {@link #isCrawlerIgnoreContentException()} to read the same key as a boolean.
 * @return The value of found property. (NotNull: a missing key is reported by exception, which basically should not happen)
 */
String getCrawlerIgnoreContentException();
/**
 * Is the property for the key 'crawler.ignore.content.exception' true? <br>
 * The value is, e.g. true <br>
 * AbstractFessFileTransformer consults this flag when text extraction raises a RuntimeException:
 * if true, an empty ExtractData is used instead; if false, the exception is rethrown. <br>
 * @return The determination, true or false. (a missing key is reported by exception, which basically should not happen)
 */
boolean isCrawlerIgnoreContentException();
/**
* Get the value for the key 'crawler.metadata.content.excludes'. <br>
* The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* <br>
@ -4688,6 +4705,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return is(FessConfig.CRAWLER_IGNORE_META_ROBOTS);
}
/** Looks up the string value stored under the key 'crawler.ignore.content.exception'. */
public String getCrawlerIgnoreContentException() {
    final String configured = get(FessConfig.CRAWLER_IGNORE_CONTENT_EXCEPTION);
    return configured;
}
/** Evaluates the value stored under the key 'crawler.ignore.content.exception' as a boolean. */
public boolean isCrawlerIgnoreContentException() {
    final boolean enabled = is(FessConfig.CRAWLER_IGNORE_CONTENT_EXCEPTION);
    return enabled;
}
/** Looks up the string value stored under the key 'crawler.metadata.content.excludes'. */
public String getCrawlerMetadataContentExcludes() {
    final String configured = get(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES);
    return configured;
}

View file

@ -95,6 +95,7 @@ crawler.web.protocols=http,https
crawler.file.protocols=file,smb,ftp
crawler.ignore.robots.txt=false
crawler.ignore.meta.robots=false
crawler.ignore.content.exception=true
crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*
crawler.metadata.name.mapping=\
title=title:string\n\