fix #1070 store 404 urls to failure urls

This commit is contained in:
Shinsuke Sugaya 2017-05-25 18:41:05 +09:00
parent 1450c2b38a
commit 4bbe6df6b5
6 changed files with 96 additions and 0 deletions

View file

@ -29,6 +29,7 @@ import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.app.service.FailureUrlService;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.client.CrawlerClient;
import org.codelibs.fess.crawler.client.smb.SmbClient;
@ -39,6 +40,7 @@ import org.codelibs.fess.crawler.log.LogType;
import org.codelibs.fess.es.client.FessEsClient;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.exception.ContainerNotAvailableException;
import org.codelibs.fess.exception.ContentNotFoundException;
import org.codelibs.fess.helper.CrawlingConfigHelper;
import org.codelibs.fess.helper.CrawlingInfoHelper;
import org.codelibs.fess.helper.IndexingHelper;
@ -237,4 +239,20 @@ public class FessCrawlerThread extends CrawlerThread {
}
return urlSet;
}
/**
 * Runs the parent response processing and then, when the HTTP status code of
 * the response is one of the configured failure status codes, stores the
 * crawled URL through {@link FailureUrlService}.
 *
 * @param urlQueue the queue entry whose URL was crawled
 * @param responseData the response obtained for that URL
 */
@Override
protected void processResponse(final UrlQueue<?> urlQueue, final ResponseData responseData) {
    super.processResponse(urlQueue, responseData);

    final FessConfig config = ComponentUtil.getFessConfig();
    final boolean failureStatus = config.isCrawlerFailureUrlStatusCodes(responseData.getHttpStatusCode());
    if (!failureStatus) {
        // The status code is not registered as a failure: nothing to store.
        return;
    }

    final String crawlSessionId = crawlerContext.getSessionId();
    final CrawlingConfig targetConfig = ComponentUtil.getCrawlingConfigHelper().get(crawlSessionId);
    final String failedUrl = urlQueue.getUrl();
    final FailureUrlService service = ComponentUtil.getComponent(FailureUrlService.class);

    // The canonical class name of the exception is passed as the error name.
    final String errorName = ContentNotFoundException.class.getCanonicalName();
    final ContentNotFoundException notFound = new ContentNotFoundException(failedUrl);
    service.store(targetConfig, errorName, failedUrl, notFound);
}
}

View file

@ -0,0 +1,26 @@
/*
* Copyright 2012-2017 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.exception;
/**
 * Exception describing a URL whose content was not found.
 *
 * <p>Both the suppression flag and the writable-stack-trace flag are passed
 * as {@code false} to the parent constructor.</p>
 */
public class ContentNotFoundException extends FessSystemException {

    private static final long serialVersionUID = 1L;

    /** Text placed in front of the URL in the exception message. */
    private static final String MESSAGE_PREFIX = "Not Found: ";

    /**
     * Creates the exception for the given URL.
     *
     * @param url the URL whose content was not found; appended to the message as-is
     */
    public ContentNotFoundException(final String url) {
        super(MESSAGE_PREFIX + url, false, false);
    }
}

View file

@ -31,4 +31,8 @@ public class FessSystemException extends RuntimeException {
super(cause);
}
/**
 * Creates an exception with the given message and no cause, letting subclasses
 * decide the suppression and stack-trace flags.
 *
 * @param message the detail message
 * @param enableSuppression forwarded unchanged to the superclass constructor
 * @param writableStackTrace forwarded unchanged to the superclass constructor
 */
protected FessSystemException(final String message, final boolean enableSuppression, final boolean writableStackTrace) {
// The cause argument is always null for this constructor.
super(message, null, enableSuppression, writableStackTrace);
}
}

View file

@ -184,6 +184,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. true */
String CRAWLER_IGNORE_CONTENT_EXCEPTION = "crawler.ignore.content.exception";
/** The key of the configuration (value is read as a comma-separated list of HTTP status codes). e.g. 404 */
String CRAWLER_FAILURE_URL_STATUS_CODES = "crawler.failure.url.status.codes";
/** The key of the configuration. e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* */
String CRAWLER_METADATA_CONTENT_EXCLUDES = "crawler.metadata.content.excludes";
@ -1636,6 +1639,21 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
*/
boolean isCrawlerIgnoreContentException();
/**
* Get the value for the key 'crawler.failure.url.status.codes'. <br>
* The value is, e.g. 404 <br>
* The value is returned as a plain string; callers that need individual codes split it on commas.
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerFailureUrlStatusCodes();
/**
* Get the value for the key 'crawler.failure.url.status.codes' as {@link Integer}. <br>
* The value is, e.g. 404 <br>
* NOTE(review): the same property is also read as a comma-separated list of codes; a value with
* more than one entry is presumably not convertible to a single integer -- confirm before use.
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
Integer getCrawlerFailureUrlStatusCodesAsInteger();
/**
* Get the value for the key 'crawler.metadata.content.excludes'. <br>
* The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* <br>
@ -5085,6 +5103,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return is(FessConfig.CRAWLER_IGNORE_CONTENT_EXCEPTION);
}
// Looks up the raw string stored under the 'crawler.failure.url.status.codes' key.
public String getCrawlerFailureUrlStatusCodes() {
return get(FessConfig.CRAWLER_FAILURE_URL_STATUS_CODES);
}
// Looks up the 'crawler.failure.url.status.codes' value through getAsInteger().
// NOTE(review): presumably fails when the property lists several codes -- confirm.
public Integer getCrawlerFailureUrlStatusCodesAsInteger() {
return getAsInteger(FessConfig.CRAWLER_FAILURE_URL_STATUS_CODES);
}
// Looks up the raw string stored under the 'crawler.metadata.content.excludes' key.
public String getCrawlerMetadataContentExcludes() {
return get(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES);
}

View file

@ -62,6 +62,8 @@ import org.lastaflute.web.validation.theme.typed.LongTypeValidator;
public interface FessProp {
/** Key under which the parsed failure status codes (an {@code int[]}) are cached in {@code propMap}. */
public static final String CRAWLER_FAILURE_URL_STATUS_CODES = "crawlerFailureUrlStatusCodes";
public static final String VIRTUAL_HOST_HEADERS = "virtualHostHeaders";
public static final String QUERY_COLLAPSE_INNER_HITS_SORTS = "queryCollapseInnerHitsSorts";
@ -1584,4 +1586,23 @@ public interface FessProp {
return page;
}).orElse(page);
}
/**
 * Returns the configured failure status codes as one string;
 * {@link #isCrawlerFailureUrlStatusCodes(int)} splits it on commas.
 *
 * @return the configured value
 */
String getCrawlerFailureUrlStatusCodes();
/**
 * Tells whether the given status code is one of the configured failure status codes.
 *
 * <p>The configured value is split on commas and blank entries are skipped. The
 * parsed codes are stored in {@code propMap}, so parsing only happens while that
 * entry is absent. Whitespace around each entry is trimmed before parsing, so a
 * value such as {@code "404, 500"} is accepted instead of failing on {@code " 500"}.</p>
 *
 * @param code the status code to look up
 * @return {@code true} if {@code code} is in the configured list, {@code false} otherwise
 * @throws NumberFormatException if a non-blank entry is not a valid integer
 */
public default boolean isCrawlerFailureUrlStatusCodes(final int code) {
    int[] codes = (int[]) propMap.get(CRAWLER_FAILURE_URL_STATUS_CODES);
    if (codes == null) {
        // Integer.parseInt rejects surrounding whitespace, so trim each entry first.
        codes = split(getCrawlerFailureUrlStatusCodes(), ",").get(
                stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).mapToInt(Integer::parseInt).toArray());
        propMap.put(CRAWLER_FAILURE_URL_STATUS_CODES, codes);
    }
    for (final int v : codes) {
        if (v == code) {
            return true;
        }
    }
    return false;
}
}

View file

@ -103,6 +103,7 @@ crawler.file.protocols=file,smb,ftp
crawler.ignore.robots.txt=false
crawler.ignore.meta.robots=false
crawler.ignore.content.exception=true
# Comma-separated HTTP status codes; a crawled URL answering with one of them is stored as a failure URL.
crawler.failure.url.status.codes=404
crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*
crawler.metadata.name.mapping=\
title=title:string\n\