fix #377 : add crawler.*.protocols

Shinsuke Sugaya 2016-02-21 07:27:23 +09:00
parent df54d2ab58
commit d444d1ba5b
8 changed files with 76 additions and 8 deletions

View file

@@ -27,6 +27,7 @@ import org.codelibs.fess.app.web.CrudMode;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.StreamUtil;
import org.codelibs.fess.validation.UriType;
+import org.codelibs.fess.validation.UriTypeValidator.ProtocolType;
import org.lastaflute.web.validation.Required;
import org.lastaflute.web.validation.theme.conversion.ValidateTypeFailure;
@@ -50,7 +51,7 @@ public class CreateForm implements Serializable {
public String name;
@Required
-@UriType(protocols = { "file:", "smb:" })
+@UriType(protocolType = ProtocolType.FILE)
@Size(max = 4000)
public String paths;

View file

@@ -27,6 +27,7 @@ import org.codelibs.fess.app.web.CrudMode;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.StreamUtil;
import org.codelibs.fess.validation.UriType;
+import org.codelibs.fess.validation.UriTypeValidator.ProtocolType;
import org.lastaflute.web.validation.Required;
import org.lastaflute.web.validation.theme.conversion.ValidateTypeFailure;
@@ -51,7 +52,7 @@ public class CreateForm implements Serializable {
public String name;
@Required
-@UriType(protocols = { "http:", "https:" })
+@UriType(protocolType = ProtocolType.WEB)
@Size(max = 4000)
public String urls;
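
With this change neither create form hard-codes its accepted URI schemes any more: the annotation only declares whether a field holds web or file URIs, and the concrete scheme prefixes are resolved from crawler.web.protocols and crawler.file.protocols when validation runs. A minimal sketch of a field declared against the new annotation (the class and field names are illustrative, not part of Fess):

import org.codelibs.fess.validation.UriType;
import org.codelibs.fess.validation.UriTypeValidator.ProtocolType;

public class ExampleForm {
    // Accepted schemes come from crawler.web.protocols (http,https by default),
    // so allowing another scheme is a configuration change, not a code change.
    @UriType(protocolType = ProtocolType.WEB)
    public String urls;
}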

View file

@@ -39,6 +39,7 @@ import org.codelibs.fess.crawler.service.UrlQueueService;
import org.codelibs.fess.es.config.exentity.FileConfig;
import org.codelibs.fess.es.config.exentity.WebConfig;
import org.codelibs.fess.indexer.IndexUpdater;
+import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.lastaflute.di.core.SingletonLaContainer;
import org.slf4j.Logger;
@@ -134,6 +135,7 @@ public class WebFsIndexHelper implements Serializable {
}
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
+final FessConfig fessConfig = ComponentUtil.getFessConfig();
final long startTime = System.currentTimeMillis();
@@ -184,7 +186,7 @@
for (final String u : urls) {
if (StringUtil.isNotBlank(u)) {
final String urlValue = u.trim();
-if (!urlValue.startsWith("#")) {
+if (!urlValue.startsWith("#") && fessConfig.isValidCrawlerWebProtocol(u)) {
crawler.addUrl(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Target URL: " + urlValue);
@@ -288,7 +290,7 @@
if (StringUtil.isNotBlank(u)) {
u = u.trim();
if (!u.startsWith("#")) {
-if (!u.startsWith("file:") && !u.startsWith("smb:")) {
+if (!fessConfig.isValidCrawlerFileProtocol(u)) {
if (u.startsWith("/")) {
u = "file:" + u;
} else {
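
At crawl setup time the effect is that seed URLs whose scheme is not listed in crawler.web.protocols are now skipped instead of queued, and file-config paths are checked against crawler.file.protocols before the helper falls back to prepending "file:" to absolute paths. A standalone sketch of the web-side filtering under the default http,https setting (plain JDK streams stand in for the StreamUtil-based helpers; the class and method names are illustrative):

import java.util.Arrays;

public class SeedUrlFilterSketch {
    // Default value of crawler.web.protocols, expanded to prefix form.
    private static final String[] WEB_PREFIXES = { "http:", "https:" };

    // Mirrors the condition added above: skip comment lines and URLs
    // whose scheme is not configured.
    static boolean accepted(final String url) {
        final String u = url.trim();
        return !u.startsWith("#") && Arrays.stream(WEB_PREFIXES).anyMatch(u::startsWith);
    }

    public static void main(final String[] args) {
        for (final String u : new String[] { "https://example.com/", "# disabled", "ftp://example.com/" }) {
            System.out.println(u + " -> " + (accepted(u) ? "queued" : "skipped"));
        }
    }
}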

View file

@@ -126,6 +126,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. UTF-8 */
String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
+/** The key of the configuration. e.g. http,https */
+String CRAWLER_WEB_PROTOCOLS = "crawler.web.protocols";
+/** The key of the configuration. e.g. file,smb */
+String CRAWLER_FILE_PROTOCOLS = "crawler.file.protocols";
/** The key of the configuration. e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* */
String CRAWLER_METADATA_CONTENT_EXCLUDES = "crawler.metadata.content.excludes";
@@ -956,6 +962,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
*/
String getCrawlerCrawlingDataEncoding();
+/**
+* Get the value for the key 'crawler.web.protocols'. <br>
+* The value is, e.g. http,https <br>
+* @return The value of found property. (NotNull: if not found, exception but basically no way)
+*/
+String getCrawlerWebProtocols();
+/**
+* Get the value for the key 'crawler.file.protocols'. <br>
+* The value is, e.g. file,smb <br>
+* @return The value of found property. (NotNull: if not found, exception but basically no way)
+*/
+String getCrawlerFileProtocols();
/**
* Get the value for the key 'crawler.metadata.content.excludes'. <br>
* The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* <br>
@@ -2854,6 +2874,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING);
}
+public String getCrawlerWebProtocols() {
+return get(FessConfig.CRAWLER_WEB_PROTOCOLS);
+}
+public String getCrawlerFileProtocols() {
+return get(FessConfig.CRAWLER_FILE_PROTOCOLS);
+}
public String getCrawlerMetadataContentExcludes() {
return get(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES);
}
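
FessConfig is the generated, type-safe view of fess_config.properties: each key constant gets a getter that resolves the value through get(), falling back to the default declared in the properties file. A hedged usage sketch (only meaningful inside a running Fess instance where ComponentUtil can reach the DI container; the class name is illustrative):

import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;

public class CrawlerProtocolConfigExample {
    public static void main(final String[] args) {
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        // "http,https" and "file,smb" unless overridden in fess_config.properties.
        System.out.println(fessConfig.getCrawlerWebProtocols());
        System.out.println(fessConfig.getCrawlerFileProtocols());
    }
}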

View file

@@ -503,4 +503,25 @@ public interface FessProp {
return StreamUtil.of(getAuthenticationAdminUsers().split(",")).anyMatch(s -> s.equals(username));
}
+String getCrawlerWebProtocols();
+public default String[] getCrawlerWebProtocolsAsArray() {
+return StreamUtil.of(getCrawlerWebProtocols().split(",")).filter(s -> StringUtil.isNotBlank(s)).map(s -> s.trim() + ":")
+.toArray(n -> new String[n]);
+}
+public default boolean isValidCrawlerWebProtocol(final String url) {
+return StreamUtil.of(getCrawlerWebProtocolsAsArray()).anyMatch(s -> url.startsWith(s));
+}
+String getCrawlerFileProtocols();
+public default String[] getCrawlerFileProtocolsAsArray() {
+return StreamUtil.of(getCrawlerFileProtocols().split(",")).filter(s -> StringUtil.isNotBlank(s)).map(s -> s.trim() + ":")
+.toArray(n -> new String[n]);
+}
+public default boolean isValidCrawlerFileProtocol(final String url) {
+return StreamUtil.of(getCrawlerFileProtocolsAsArray()).anyMatch(s -> url.startsWith(s));
+}
}
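
The FessProp default methods do the actual parsing: the comma-separated value is split, blank entries are dropped, each entry is trimmed and suffixed with ":" to form a prefix, and a URL is accepted when it starts with any of those prefixes. A self-contained sketch of the same logic using plain JDK streams instead of StreamUtil/StringUtil (the class is illustrative, not part of Fess):

import java.util.Arrays;

public class CrawlerProtocolsSketch {
    // Equivalent of getCrawlerWebProtocolsAsArray(): "http,https" -> { "http:", "https:" }.
    static String[] toPrefixes(final String protocols) {
        return Arrays.stream(protocols.split(","))
                .filter(s -> !s.trim().isEmpty())
                .map(s -> s.trim() + ":")
                .toArray(String[]::new);
    }

    // Equivalent of isValidCrawlerWebProtocol(): prefix match against the configured schemes.
    static boolean isValid(final String url, final String protocols) {
        return Arrays.stream(toPrefixes(protocols)).anyMatch(url::startsWith);
    }

    public static void main(final String[] args) {
        System.out.println(Arrays.toString(toPrefixes("http,https")));      // [http:, https:]
        System.out.println(isValid("https://example.com/", "http,https"));  // true
        System.out.println(isValid("ftp://example.com/", "http,https"));    // false
    }
}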

View file

@@ -29,13 +29,15 @@ import java.lang.annotation.Target;
import javax.validation.Constraint;
import javax.validation.Payload;
+import org.codelibs.fess.validation.UriTypeValidator.ProtocolType;
@Target({ METHOD, FIELD, ANNOTATION_TYPE, CONSTRUCTOR, PARAMETER })
@Retention(RUNTIME)
@Documented
@Constraint(validatedBy = UriTypeValidator.class)
public @interface UriType {
-String[] protocols();
+ProtocolType protocolType();
String message() default "{org.lastaflute.validator.constraints.UriType.message}";

View file

@@ -20,15 +20,22 @@ import javax.validation.ConstraintValidator;
import javax.validation.ConstraintValidatorContext;
import org.codelibs.core.lang.StringUtil;
+import org.codelibs.fess.util.ComponentUtil;
public class UriTypeValidator implements ConstraintValidator<UriType, String> {
private String[] protocols;
@Override
public void initialize(final UriType uriType) {
-protocols = uriType.protocols();
-if (protocols == null || protocols.length == 0) {
-throw new ConstraintDefinitionException("protocols is emtpy.");
+switch (uriType.protocolType()) {
+case WEB:
+protocols = ComponentUtil.getFessConfig().getCrawlerWebProtocolsAsArray();
+break;
+case FILE:
+protocols = ComponentUtil.getFessConfig().getCrawlerFileProtocolsAsArray();
+break;
+default:
+throw new ConstraintDefinitionException("protocolType is emtpy.");
}
}
@@ -58,4 +65,8 @@ public class UriTypeValidator implements ConstraintValidator<UriType, String> {
}
return true;
}
+public enum ProtocolType {
+WEB, FILE;
+}
}

View file

@@ -74,6 +74,8 @@ crawler.document.unknown.hostname=unknown
crawler.document.use.site.encoding.on.english=false
crawler.document.append.data=true
crawler.crawling.data.encoding=UTF-8
+crawler.web.protocols=http,https
+crawler.file.protocols=file,smb
crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*
crawler.metadata.name.mapping=\
title=title:string\n\
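
The two new keys default to the previously hard-coded values, so existing installations behave exactly as before. Because the form validation and the crawl-time checks both read the same properties, an additional scheme only needs to be declared once, for example (assuming a crawler client for that scheme is actually available, which this commit does not add):

crawler.file.protocols=file,smb,ftp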