8 jaren geleden · 4bbe6df6b5
--- a/src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java
+++ b/src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java
@@ -29,6 +29,7 @@ import java.util.stream.Collectors;
 
				 
			
 
				 import org.apache.commons.io.IOUtils;
			
 
				 import org.codelibs.core.lang.StringUtil;
			
 
				+import org.codelibs.fess.app.service.FailureUrlService;
			
 
				 import org.codelibs.fess.crawler.builder.RequestDataBuilder;
			
 
				 import org.codelibs.fess.crawler.client.CrawlerClient;
			
 
				 import org.codelibs.fess.crawler.client.smb.SmbClient;
			
@@ -39,6 +40,7 @@ import org.codelibs.fess.crawler.log.LogType;
 
				 import org.codelibs.fess.es.client.FessEsClient;
			
 
				 import org.codelibs.fess.es.config.exentity.CrawlingConfig;
			
 
				 import org.codelibs.fess.exception.ContainerNotAvailableException;
			
 
				+import org.codelibs.fess.exception.ContentNotFoundException;
			
 
				 import org.codelibs.fess.helper.CrawlingConfigHelper;
			
 
				 import org.codelibs.fess.helper.CrawlingInfoHelper;
			
 
				 import org.codelibs.fess.helper.IndexingHelper;
			
@@ -237,4 +239,20 @@ public class FessCrawlerThread extends CrawlerThread {
 
				         }
			
 
				         return urlSet;
			
 
				     }
			
 
				+
			
 
				+    /**
				+     * Runs the parent response processing and afterwards records the URL as a failure URL
				+     * when {@code FessConfig#isCrawlerFailureUrlStatusCodes} accepts the response's HTTP status code.
				+     * The failure is stored through {@code FailureUrlService} with a {@code ContentNotFoundException}
				+     * built from the URL.
				+     *
				+     * @param urlQueue queue entry whose URL is being processed
				+     * @param responseData response whose HTTP status code is inspected
				+     */
				+    @Override
				+    protected void processResponse(final UrlQueue<?> urlQueue, final ResponseData responseData) {
				+        super.processResponse(urlQueue, responseData);
				+
				+        final FessConfig config = ComponentUtil.getFessConfig();
				+        if (!config.isCrawlerFailureUrlStatusCodes(responseData.getHttpStatusCode())) {
				+            // Status code is not configured as a failure code: nothing to record.
				+            return;
				+        }
				+
				+        final String crawlSessionId = crawlerContext.getSessionId();
				+        final CrawlingConfig targetConfig = ComponentUtil.getCrawlingConfigHelper().get(crawlSessionId);
				+        final String failedUrl = urlQueue.getUrl();
				+
				+        final FailureUrlService service = ComponentUtil.getComponent(FailureUrlService.class);
				+        final String errorName = ContentNotFoundException.class.getCanonicalName();
				+        final ContentNotFoundException failure = new ContentNotFoundException(failedUrl);
				+        service.store(targetConfig, errorName, failedUrl, failure);
				+    }
			
 
				 }
			
--- a/src/main/java/org/codelibs/fess/exception/ContentNotFoundException.java
+++ b/src/main/java/org/codelibs/fess/exception/ContentNotFoundException.java
@@ -0,0 +1,26 @@
 
				+/*
			
 
				+ * Copyright 2012-2017 CodeLibs Project and the Others.
			
 
				+ *
			
 
				+ * Licensed under the Apache License, Version 2.0 (the "License");
			
 
				+ * you may not use this file except in compliance with the License.
			
 
				+ * You may obtain a copy of the License at
			
 
				+ *
			
 
				+ *     http://www.apache.org/licenses/LICENSE-2.0
			
 
				+ *
			
 
				+ * Unless required by applicable law or agreed to in writing, software
			
 
				+ * distributed under the License is distributed on an "AS IS" BASIS,
			
 
				+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
			
 
				+ * either express or implied. See the License for the specific language
			
 
				+ * governing permissions and limitations under the License.
			
 
				+ */
			
 
				+package org.codelibs.fess.exception;
			
 
				+
			
 
				+/**
				+ * Exception describing a URL whose content was not found.
				+ * The detail message is {@code "Not Found: "} followed by the URL.
				+ */
				+public class ContentNotFoundException extends FessSystemException {
				+
				+    private static final long serialVersionUID = 1L;
				+
				+    /**
				+     * Creates the exception for the given URL, passing {@code false} for both the
				+     * enableSuppression and writableStackTrace flags of the parent constructor.
				+     *
				+     * @param url URL appended to the detail message
				+     */
				+    public ContentNotFoundException(final String url) {
				+        super("Not Found: " + url, false, false);
				+    }
				+
				+}
			
--- a/src/main/java/org/codelibs/fess/exception/FessSystemException.java
+++ b/src/main/java/org/codelibs/fess/exception/FessSystemException.java
@@ -31,4 +31,8 @@ public class FessSystemException extends RuntimeException {
 
				         super(cause);
			
 
				     }
			
 
				 
			
 
				+    /**
				+     * Creates an exception with the given message and a {@code null} cause, forwarding the
				+     * two flags unchanged to the four-argument superclass constructor.
				+     *
				+     * @param message detail message
				+     * @param enableSuppression forwarded as the superclass's enableSuppression argument
				+     * @param writableStackTrace forwarded as the superclass's writableStackTrace argument
				+     */
				+    protected FessSystemException(final String message, final boolean enableSuppression, final boolean writableStackTrace) {
			
 
				+        super(message, null, enableSuppression, writableStackTrace);
			
 
				+    }
			
 
				+
			
 
				 }
			
--- a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java
+++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java
@@ -184,6 +184,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
 
				     /** The key of the configuration. e.g. true */
			
 
				     String CRAWLER_IGNORE_CONTENT_EXCEPTION = "crawler.ignore.content.exception";
			
 
				 
			
 
				+    /** The key of the configuration. e.g. 404 */
			
 
				+    String CRAWLER_FAILURE_URL_STATUS_CODES = "crawler.failure.url.status.codes";
			
 
				+
			
 
				     /** The key of the configuration. e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* */
			
 
				     String CRAWLER_METADATA_CONTENT_EXCLUDES = "crawler.metadata.content.excludes";
			
 
				 
			
@@ -1636,6 +1639,21 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
 
				      */
			
 
				     boolean isCrawlerIgnoreContentException();
			
 
				 
			
 
				+    /**
			
 
				+     * Get the value for the key 'crawler.failure.url.status.codes'. <br>
			
 
				+     * The value is, e.g. 404 <br>
			
 
				+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
			
 
				+     */
			
 
				+    String getCrawlerFailureUrlStatusCodes();
			
 
				+
			
 
				+    /**
			
 
				+     * Get the value for the key 'crawler.failure.url.status.codes' as {@link Integer}. <br>
			
 
				+     * The value is, e.g. 404 <br>
				+     * NOTE(review): FessProp#isCrawlerFailureUrlStatusCodes splits this property on ",";
				+     * a value listing several codes is not an integer, so this getter presumably fails
				+     * with NumberFormatException for such a value -- confirm before relying on it. <br>
			
 
				+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
			
 
				+     * @throws NumberFormatException When the property is not integer.
			
 
				+     */
			
 
				+    Integer getCrawlerFailureUrlStatusCodesAsInteger();
			
 
				+
			
 
				     /**
			
 
				      * Get the value for the key 'crawler.metadata.content.excludes'. <br>
			
 
				      * The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* <br>
			
@@ -5085,6 +5103,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
 
				             return is(FessConfig.CRAWLER_IGNORE_CONTENT_EXCEPTION);
			
 
				         }
			
 
				 
			
 
				+        // Returns the value looked up under the key 'crawler.failure.url.status.codes'.
				+        public String getCrawlerFailureUrlStatusCodes() {
			
 
				+            return get(FessConfig.CRAWLER_FAILURE_URL_STATUS_CODES);
			
 
				+        }
			
 
				+
			
 
				+        // Returns the value under the key 'crawler.failure.url.status.codes', obtained through getAsInteger.
				+        public Integer getCrawlerFailureUrlStatusCodesAsInteger() {
			
 
				+            return getAsInteger(FessConfig.CRAWLER_FAILURE_URL_STATUS_CODES);
			
 
				+        }
			
 
				+
			
 
				         public String getCrawlerMetadataContentExcludes() {
			
 
				             return get(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES);
			
 
				         }
			
--- a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
+++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
@@ -62,6 +62,8 @@ import org.lastaflute.web.validation.theme.typed.LongTypeValidator;
 
				 
			
 
				 public interface FessProp {
			
 
				 
			
 
				+    public static final String CRAWLER_FAILURE_URL_STATUS_CODES = "crawlerFailureUrlStatusCodes";
			
 
				+
			
 
				     public static final String VIRTUAL_HOST_HEADERS = "virtualHostHeaders";
			
 
				 
			
 
				     public static final String QUERY_COLLAPSE_INNER_HITS_SORTS = "queryCollapseInnerHitsSorts";
			
@@ -1584,4 +1586,23 @@ public interface FessProp {
 
				             return page;
			
 
				         }).orElse(page);
			
 
				     }
			
 
				+
			
 
				+    String getCrawlerFailureUrlStatusCodes();
			
 
				+
			
 
				+    /**
				+     * Tells whether the given status code is listed in the value returned by
				+     * {@link #getCrawlerFailureUrlStatusCodes()}.
				+     * <p>
				+     * The value is read as a comma-separated list; blank entries are skipped and the
				+     * remaining entries are trimmed and parsed as integers. The parsed array is stored in
				+     * {@code propMap} under {@link #CRAWLER_FAILURE_URL_STATUS_CODES} and reused while present.
				+     *
				+     * @param code status code to look up
				+     * @return {@code true} if the code is listed, {@code false} otherwise
				+     * @throws NumberFormatException if a non-blank entry is not an integer
				+     */
				+    public default boolean isCrawlerFailureUrlStatusCodes(final int code) {
				+        int[] codes = (int[]) propMap.get(CRAWLER_FAILURE_URL_STATUS_CODES);
				+        if (codes == null) {
				+            // Trim each entry so a value such as "404, 410" parses instead of failing on " 410".
				+            codes =
				+                    split(getCrawlerFailureUrlStatusCodes(), ",").get(
				+                            stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).mapToInt(Integer::parseInt)
				+                                    .toArray());
				+            propMap.put(CRAWLER_FAILURE_URL_STATUS_CODES, codes);
				+        }
				+        for (final int v : codes) {
				+            if (v == code) {
				+                return true;
				+            }
				+        }
				+        return false;
				+    }
			
 
				+
			
 
				 }
			
--- a/src/main/resources/fess_config.properties
+++ b/src/main/resources/fess_config.properties
@@ -103,6 +103,7 @@ crawler.file.protocols=file,smb,ftp
 
				 crawler.ignore.robots.txt=false
			
 
				 crawler.ignore.meta.robots=false
			
 
				 crawler.ignore.content.exception=true
			
 
				+crawler.failure.url.status.codes=404
			
 
				 crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*
			
 
				 crawler.metadata.name.mapping=\
			
 
				 title=title:string\n\