Browse Source

fix #1070 store 404 urls to failure urls

Shinsuke Sugaya 8 years ago
parent
commit
4bbe6df6b5

+ 18 - 0
src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java

@@ -29,6 +29,7 @@ import java.util.stream.Collectors;
 
 import org.apache.commons.io.IOUtils;
 import org.codelibs.core.lang.StringUtil;
+import org.codelibs.fess.app.service.FailureUrlService;
 import org.codelibs.fess.crawler.builder.RequestDataBuilder;
 import org.codelibs.fess.crawler.client.CrawlerClient;
 import org.codelibs.fess.crawler.client.smb.SmbClient;
@@ -39,6 +40,7 @@ import org.codelibs.fess.crawler.log.LogType;
 import org.codelibs.fess.es.client.FessEsClient;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig;
 import org.codelibs.fess.exception.ContainerNotAvailableException;
+import org.codelibs.fess.exception.ContentNotFoundException;
 import org.codelibs.fess.helper.CrawlingConfigHelper;
 import org.codelibs.fess.helper.CrawlingInfoHelper;
 import org.codelibs.fess.helper.IndexingHelper;
@@ -237,4 +239,20 @@ public class FessCrawlerThread extends CrawlerThread {
         }
         return urlSet;
     }
+
+    /**
+     * Runs the parent response processing and then, when the response's HTTP status code is one
+     * of the configured failure status codes, stores the URL through {@link FailureUrlService}.
+     * The stored entry uses {@link ContentNotFoundException} both as the error name and as the cause.
+     *
+     * @param urlQueue the queue entry that was crawled; its URL is the one stored on failure
+     * @param responseData the response whose HTTP status code is checked
+     */
+    @Override
+    protected void processResponse(final UrlQueue<?> urlQueue, final ResponseData responseData) {
+        super.processResponse(urlQueue, responseData);
+
+        final FessConfig config = ComponentUtil.getFessConfig();
+        final int statusCode = responseData.getHttpStatusCode();
+        if (!config.isCrawlerFailureUrlStatusCodes(statusCode)) {
+            // Status code is not configured as a failure; nothing to record.
+            return;
+        }
+
+        final String currentSessionId = crawlerContext.getSessionId();
+        final CrawlingConfig targetConfig = ComponentUtil.getCrawlingConfigHelper().get(currentSessionId);
+        final String targetUrl = urlQueue.getUrl();
+
+        final FailureUrlService service = ComponentUtil.getComponent(FailureUrlService.class);
+        final String errorName = ContentNotFoundException.class.getCanonicalName();
+        final ContentNotFoundException cause = new ContentNotFoundException(targetUrl);
+        service.store(targetConfig, errorName, targetUrl, cause);
+    }
 }

+ 26 - 0
src/main/java/org/codelibs/fess/exception/ContentNotFoundException.java

@@ -0,0 +1,26 @@
+/*
+ * Copyright 2012-2017 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.exception;
+
+/**
+ * Exception carrying the URL of content that was not found.
+ * <p>
+ * Instances are created with both the suppression flag and the writable-stack-trace flag
+ * set to {@code false} when calling the parent constructor.
+ */
+public class ContentNotFoundException extends FessSystemException {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Creates the exception with the message {@code "Not Found: " + url}.
+     *
+     * @param url the URL to embed in the exception message
+     */
+    public ContentNotFoundException(final String url) {
+        super(buildMessage(url), false, false);
+    }
+
+    /** Builds the exception message for the given URL. */
+    private static String buildMessage(final String url) {
+        return "Not Found: " + url;
+    }
+
+}

+ 4 - 0
src/main/java/org/codelibs/fess/exception/FessSystemException.java

@@ -31,4 +31,8 @@ public class FessSystemException extends RuntimeException {
         super(cause);
     }
 
+    /**
+     * Creates an exception with the given message and no cause, forwarding the two flags to the
+     * four-argument {@link RuntimeException} constructor.
+     *
+     * @param message the detail message
+     * @param enableSuppression passed through as the suppression flag
+     * @param writableStackTrace passed through as the writable-stack-trace flag
+     */
+    protected FessSystemException(final String message, final boolean enableSuppression, final boolean writableStackTrace) {
+        // The cause slot is deliberately left empty for this constructor.
+        super(message, (Throwable) null, enableSuppression, writableStackTrace);
+    }
+
 }

+ 26 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -184,6 +184,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. true */
     String CRAWLER_IGNORE_CONTENT_EXCEPTION = "crawler.ignore.content.exception";
 
+    /** The key of the configuration. e.g. 404 */
+    String CRAWLER_FAILURE_URL_STATUS_CODES = "crawler.failure.url.status.codes";
+
     /** The key of the configuration. e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* */
     String CRAWLER_METADATA_CONTENT_EXCLUDES = "crawler.metadata.content.excludes";
 
@@ -1636,6 +1639,21 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
     boolean isCrawlerIgnoreContentException();
 
+    /**
+     * Get the value for the key 'crawler.failure.url.status.codes'. <br>
+     * The value is, e.g. 404 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerFailureUrlStatusCodes();
+
+    /**
+     * Get the value for the key 'crawler.failure.url.status.codes' as {@link Integer}. <br>
+     * The value is, e.g. 404 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not an integer (e.g. when it holds a comma-separated list of codes).
+     */
+    Integer getCrawlerFailureUrlStatusCodesAsInteger();
+
     /**
      * Get the value for the key 'crawler.metadata.content.excludes'. <br>
      * The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* <br>
@@ -5085,6 +5103,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return is(FessConfig.CRAWLER_IGNORE_CONTENT_EXCEPTION);
         }
 
+        /**
+         * Returns the value looked up under the key 'crawler.failure.url.status.codes'.
+         */
+        public String getCrawlerFailureUrlStatusCodes() {
+            final String rawCodes = get(FessConfig.CRAWLER_FAILURE_URL_STATUS_CODES);
+            return rawCodes;
+        }
+
+        /**
+         * Returns the value looked up under the key 'crawler.failure.url.status.codes',
+         * obtained through {@code getAsInteger}.
+         */
+        public Integer getCrawlerFailureUrlStatusCodesAsInteger() {
+            final Integer codeValue = getAsInteger(FessConfig.CRAWLER_FAILURE_URL_STATUS_CODES);
+            return codeValue;
+        }
+
         public String getCrawlerMetadataContentExcludes() {
             return get(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES);
         }

+ 21 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java

@@ -62,6 +62,8 @@ import org.lastaflute.web.validation.theme.typed.LongTypeValidator;
 
 public interface FessProp {
 
+    public static final String CRAWLER_FAILURE_URL_STATUS_CODES = "crawlerFailureUrlStatusCodes";
+
     public static final String VIRTUAL_HOST_HEADERS = "virtualHostHeaders";
 
     public static final String QUERY_COLLAPSE_INNER_HITS_SORTS = "queryCollapseInnerHitsSorts";
@@ -1584,4 +1586,23 @@ public interface FessProp {
             return page;
         }).orElse(page);
     }
+
+    String getCrawlerFailureUrlStatusCodes();
+
+    /**
+     * Checks whether the given HTTP status code is one of the configured failure status codes.
+     * <p>
+     * The comma-separated value returned by {@link #getCrawlerFailureUrlStatusCodes()} is parsed on
+     * the first call and the resulting {@code int[]} is stored in {@code propMap} under
+     * {@link #CRAWLER_FAILURE_URL_STATUS_CODES}; later calls reuse the stored array.
+     * Blank entries are skipped and surrounding whitespace of each entry is removed before parsing.
+     *
+     * @param code the HTTP status code to look up
+     * @return {@code true} if {@code code} is contained in the configured codes, {@code false} otherwise
+     * @throws NumberFormatException if a non-blank entry is not a valid integer after trimming
+     */
+    public default boolean isCrawlerFailureUrlStatusCodes(final int code) {
+        int[] codes = (int[]) propMap.get(CRAWLER_FAILURE_URL_STATUS_CODES);
+        if (codes == null) {
+            // Trim each entry: Integer.parseInt rejects surrounding whitespace, so a value such as
+            // "404, 410" would otherwise fail on " 410".
+            codes =
+                    split(getCrawlerFailureUrlStatusCodes(), ",").get(
+                            stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).mapToInt(Integer::parseInt).toArray());
+            propMap.put(CRAWLER_FAILURE_URL_STATUS_CODES, codes);
+        }
+        for (final int v : codes) {
+            if (v == code) {
+                return true;
+            }
+        }
+        return false;
+    }
+
 }

+ 1 - 0
src/main/resources/fess_config.properties

@@ -103,6 +103,7 @@ crawler.file.protocols=file,smb,ftp
 crawler.ignore.robots.txt=false
 crawler.ignore.meta.robots=false
 crawler.ignore.content.exception=true
+crawler.failure.url.status.codes=404
 crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*
 crawler.metadata.name.mapping=\
 title=title:string\n\