Browse Source

fix #860 add crawler.ignore.content.exception

Shinsuke Sugaya 8 years ago
parent
commit
a5fe68c333

+ 12 - 1
src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java

@@ -103,7 +103,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         final Map<String, Object> metaDataMap = new HashMap<>();
         String content;
         try (final InputStream in = responseData.getResponseBody()) {
-            final ExtractData extractData = extractor.getText(in, params);
+            final ExtractData extractData = getExtractData(extractor, in, params);
             content = extractData.getContent();
             if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
                 return null;
@@ -330,6 +330,17 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         return dataMap;
     }
 
+    private ExtractData getExtractData(final Extractor extractor, final InputStream in, final Map<String, String> params) { // Wraps extractor.getText(in, params) so a RuntimeException can optionally be tolerated.
+        try {
+            return extractor.getText(in, params); // Normal path: return the extractor's result as-is.
+        } catch (RuntimeException e) {
+            if (!fessConfig.isCrawlerIgnoreContentException()) { // Flag is false: rethrow the same exception to the caller.
+                throw e;
+            } // NOTE(review): when the flag is true, e is discarded here without being logged - consider logging it.
+        }
+        return new ExtractData(); // Reached only after an ignored exception: a newly constructed ExtractData stands in for the result.
+    }
+
     private String getResourceName(final ResponseData responseData) {
         String name = responseData.getUrl();
         final String enc = responseData.getCharSet();

+ 25 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -166,6 +166,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. false */
     String CRAWLER_IGNORE_META_ROBOTS = "crawler.ignore.meta.robots";
 
+    /** The key of the configuration. e.g. true */
+    String CRAWLER_IGNORE_CONTENT_EXCEPTION = "crawler.ignore.content.exception";
+
     /** The key of the configuration. e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* */
     String CRAWLER_METADATA_CONTENT_EXCLUDES = "crawler.metadata.content.excludes";
 
@@ -1497,6 +1500,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
     boolean isCrawlerIgnoreMetaRobots();
 
+    /**
+     * Get the value for the key 'crawler.ignore.content.exception'. <br>
+     * The value is, e.g. true <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerIgnoreContentException();
+
+    /**
+     * Is the property for the key 'crawler.ignore.content.exception' true? <br>
+     * The value is, e.g. true <br>
+     * @return The determination, true or false. (if not found, exception but basically no way)
+     */
+    boolean isCrawlerIgnoreContentException();
+
     /**
      * Get the value for the key 'crawler.metadata.content.excludes'. <br>
      * The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* <br>
@@ -4688,6 +4705,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return is(FessConfig.CRAWLER_IGNORE_META_ROBOTS);
         }
 
+        public String getCrawlerIgnoreContentException() { // Returns the value looked up via get(...) for the key 'crawler.ignore.content.exception'.
+            return get(FessConfig.CRAWLER_IGNORE_CONTENT_EXCEPTION);
+        }
+
+        public boolean isCrawlerIgnoreContentException() { // Returns the boolean determined via is(...) for the key 'crawler.ignore.content.exception'.
+            return is(FessConfig.CRAWLER_IGNORE_CONTENT_EXCEPTION);
+        }
+
         public String getCrawlerMetadataContentExcludes() {
             return get(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES);
         }

+ 1 - 0
src/main/resources/fess_config.properties

@@ -95,6 +95,7 @@ crawler.web.protocols=http,https
 crawler.file.protocols=file,smb,ftp
 crawler.ignore.robots.txt=false
 crawler.ignore.meta.robots=false
+crawler.ignore.content.exception=true
 crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*
 crawler.metadata.name.mapping=\
 title=title:string\n\