fix #1070 store 404 urls to failure urls
This commit is contained in:
parent
1450c2b38a
commit
4bbe6df6b5
6 changed files with 96 additions and 0 deletions
|
@ -29,6 +29,7 @@ import java.util.stream.Collectors;
|
|||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.fess.app.service.FailureUrlService;
|
||||
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
|
||||
import org.codelibs.fess.crawler.client.CrawlerClient;
|
||||
import org.codelibs.fess.crawler.client.smb.SmbClient;
|
||||
|
@ -39,6 +40,7 @@ import org.codelibs.fess.crawler.log.LogType;
|
|||
import org.codelibs.fess.es.client.FessEsClient;
|
||||
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
|
||||
import org.codelibs.fess.exception.ContainerNotAvailableException;
|
||||
import org.codelibs.fess.exception.ContentNotFoundException;
|
||||
import org.codelibs.fess.helper.CrawlingConfigHelper;
|
||||
import org.codelibs.fess.helper.CrawlingInfoHelper;
|
||||
import org.codelibs.fess.helper.IndexingHelper;
|
||||
|
@ -237,4 +239,20 @@ public class FessCrawlerThread extends CrawlerThread {
|
|||
}
|
||||
return urlSet;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processResponse(final UrlQueue<?> urlQueue, final ResponseData responseData) {
|
||||
super.processResponse(urlQueue, responseData);
|
||||
|
||||
FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
if (fessConfig.isCrawlerFailureUrlStatusCodes(responseData.getHttpStatusCode())) {
|
||||
String sessionId = crawlerContext.getSessionId();
|
||||
final CrawlingConfig crawlingConfig = ComponentUtil.getCrawlingConfigHelper().get(sessionId);
|
||||
final String url = urlQueue.getUrl();
|
||||
|
||||
final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
|
||||
failureUrlService.store(crawlingConfig, ContentNotFoundException.class.getCanonicalName(), url, new ContentNotFoundException(
|
||||
url));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,26 @@
|
|||
/*
|
||||
* Copyright 2012-2017 CodeLibs Project and the Others.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific language
|
||||
* governing permissions and limitations under the License.
|
||||
*/
|
||||
package org.codelibs.fess.exception;
|
||||
|
||||
public class ContentNotFoundException extends FessSystemException {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
public ContentNotFoundException(String url) {
|
||||
super("Not Found: " + url, false, false);
|
||||
}
|
||||
|
||||
}
|
|
@ -31,4 +31,8 @@ public class FessSystemException extends RuntimeException {
|
|||
super(cause);
|
||||
}
|
||||
|
||||
/**
 * Creates an exception with the given message and no cause, letting the subclass choose
 * whether suppression and a writable stack trace are enabled.
 *
 * @param message the detail message
 * @param enableSuppression passed through to the parent constructor's suppression flag
 * @param writableStackTrace passed through to the parent constructor's stack-trace flag
 */
protected FessSystemException(final String message, final boolean enableSuppression, final boolean writableStackTrace) {
|
||||
super(message, null, enableSuppression, writableStackTrace);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -184,6 +184,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. true */
|
||||
String CRAWLER_IGNORE_CONTENT_EXCEPTION = "crawler.ignore.content.exception";
|
||||
|
||||
/** The key of the configuration. e.g. 404 */
|
||||
String CRAWLER_FAILURE_URL_STATUS_CODES = "crawler.failure.url.status.codes";
|
||||
|
||||
/** The key of the configuration. e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* */
|
||||
String CRAWLER_METADATA_CONTENT_EXCLUDES = "crawler.metadata.content.excludes";
|
||||
|
||||
|
@ -1636,6 +1639,21 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
*/
|
||||
boolean isCrawlerIgnoreContentException();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.failure.url.status.codes'. <br>
|
||||
* The value is, e.g. 404 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerFailureUrlStatusCodes();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.failure.url.status.codes' as {@link Integer}. <br>
|
||||
* The value is, e.g. 404 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getCrawlerFailureUrlStatusCodesAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.metadata.content.excludes'. <br>
|
||||
* The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* <br>
|
||||
|
@ -5085,6 +5103,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return is(FessConfig.CRAWLER_IGNORE_CONTENT_EXCEPTION);
|
||||
}
|
||||
|
||||
/**
 * Returns the raw string configured under the key 'crawler.failure.url.status.codes'.
 *
 * @return the value looked up via {@code get(...)} for that key
 */
public String getCrawlerFailureUrlStatusCodes() {
|
||||
return get(FessConfig.CRAWLER_FAILURE_URL_STATUS_CODES);
|
||||
}
|
||||
|
||||
/**
 * Returns the value configured under the key 'crawler.failure.url.status.codes' as an Integer.
 *
 * NOTE(review): FessProp parses this same property as a comma-separated list of codes, so this
 * accessor presumably only succeeds when a single code is configured -- confirm before relying on it.
 *
 * @return the value looked up via {@code getAsInteger(...)} for that key
 */
public Integer getCrawlerFailureUrlStatusCodesAsInteger() {
|
||||
return getAsInteger(FessConfig.CRAWLER_FAILURE_URL_STATUS_CODES);
|
||||
}
|
||||
|
||||
public String getCrawlerMetadataContentExcludes() {
|
||||
return get(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES);
|
||||
}
|
||||
|
|
|
@ -62,6 +62,8 @@ import org.lastaflute.web.validation.theme.typed.LongTypeValidator;
|
|||
|
||||
public interface FessProp {
|
||||
|
||||
public static final String CRAWLER_FAILURE_URL_STATUS_CODES = "crawlerFailureUrlStatusCodes";
|
||||
|
||||
public static final String VIRTUAL_HOST_HEADERS = "virtualHostHeaders";
|
||||
|
||||
public static final String QUERY_COLLAPSE_INNER_HITS_SORTS = "queryCollapseInnerHitsSorts";
|
||||
|
@ -1584,4 +1586,23 @@ public interface FessProp {
|
|||
return page;
|
||||
}).orElse(page);
|
||||
}
|
||||
|
||||
String getCrawlerFailureUrlStatusCodes();
|
||||
|
||||
public default boolean isCrawlerFailureUrlStatusCodes(final int code) {
|
||||
int[] codes = (int[]) propMap.get(CRAWLER_FAILURE_URL_STATUS_CODES);
|
||||
if (codes == null) {
|
||||
codes =
|
||||
split(getCrawlerFailureUrlStatusCodes(), ",").get(
|
||||
stream -> stream.filter(StringUtil::isNotBlank).mapToInt(Integer::parseInt).toArray());
|
||||
propMap.put(CRAWLER_FAILURE_URL_STATUS_CODES, codes);
|
||||
}
|
||||
for (int v : codes) {
|
||||
if (v == code) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -103,6 +103,7 @@ crawler.file.protocols=file,smb,ftp
|
|||
crawler.ignore.robots.txt=false
|
||||
crawler.ignore.meta.robots=false
|
||||
crawler.ignore.content.exception=true
|
||||
crawler.failure.url.status.codes=404
|
||||
crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*
|
||||
crawler.metadata.name.mapping=\
|
||||
title=title:string\n\
|
||||
|
|
Loading…
Add table
Reference in a new issue