fix #860 add crawler.ignore.content.exception
This commit is contained in:
parent
263fe14eaf
commit
a5fe68c333
3 changed files with 38 additions and 1 deletion
|
@ -103,7 +103,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
|
|||
final Map<String, Object> metaDataMap = new HashMap<>();
|
||||
String content;
|
||||
try (final InputStream in = responseData.getResponseBody()) {
|
||||
final ExtractData extractData = extractor.getText(in, params);
|
||||
final ExtractData extractData = getExtractData(extractor, in, params);
|
||||
content = extractData.getContent();
|
||||
if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
|
||||
return null;
|
||||
|
@ -330,6 +330,17 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
|
|||
return dataMap;
|
||||
}
|
||||
|
||||
private ExtractData getExtractData(final Extractor extractor, final InputStream in, final Map<String, String> params) {
|
||||
try {
|
||||
return extractor.getText(in, params);
|
||||
} catch (RuntimeException e) {
|
||||
if (!fessConfig.isCrawlerIgnoreContentException()) {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
return new ExtractData();
|
||||
}
|
||||
|
||||
private String getResourceName(final ResponseData responseData) {
|
||||
String name = responseData.getUrl();
|
||||
final String enc = responseData.getCharSet();
|
||||
|
|
|
@ -166,6 +166,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. false */
|
||||
String CRAWLER_IGNORE_META_ROBOTS = "crawler.ignore.meta.robots";
|
||||
|
||||
/** The key of the configuration. e.g. true */
|
||||
String CRAWLER_IGNORE_CONTENT_EXCEPTION = "crawler.ignore.content.exception";
|
||||
|
||||
/** The key of the configuration. e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* */
|
||||
String CRAWLER_METADATA_CONTENT_EXCLUDES = "crawler.metadata.content.excludes";
|
||||
|
||||
|
@ -1497,6 +1500,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
*/
|
||||
boolean isCrawlerIgnoreMetaRobots();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.ignore.content.exception'. <br>
|
||||
* The value is, e.g. true <br>
|
||||
* @return The value of the found property. (NotNull: an exception is thrown if the property is not found, which basically cannot happen)
|
||||
*/
|
||||
String getCrawlerIgnoreContentException();
|
||||
|
||||
/**
|
||||
* Is the property for the key 'crawler.ignore.content.exception' true? <br>
|
||||
* The value is, e.g. true <br>
|
||||
* @return The determination, true or false. (An exception is thrown if the property is not found, which basically cannot happen)
|
||||
*/
|
||||
boolean isCrawlerIgnoreContentException();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.metadata.content.excludes'. <br>
|
||||
* The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* <br>
|
||||
|
@ -4688,6 +4705,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return is(FessConfig.CRAWLER_IGNORE_META_ROBOTS);
|
||||
}
|
||||
|
||||
public String getCrawlerIgnoreContentException() {
|
||||
return get(FessConfig.CRAWLER_IGNORE_CONTENT_EXCEPTION);
|
||||
}
|
||||
|
||||
public boolean isCrawlerIgnoreContentException() {
|
||||
return is(FessConfig.CRAWLER_IGNORE_CONTENT_EXCEPTION);
|
||||
}
|
||||
|
||||
public String getCrawlerMetadataContentExcludes() {
|
||||
return get(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES);
|
||||
}
|
||||
|
|
|
@ -95,6 +95,7 @@ crawler.web.protocols=http,https
|
|||
crawler.file.protocols=file,smb,ftp
|
||||
crawler.ignore.robots.txt=false
|
||||
crawler.ignore.meta.robots=false
|
||||
crawler.ignore.content.exception=true
|
||||
crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*
|
||||
crawler.metadata.name.mapping=\
|
||||
title=title:string\n\
|
||||
|
|
Loading…
Add table
Reference in a new issue