fix #377 : add crawler.*.protocols
This commit is contained in:
parent
df54d2ab58
commit
d444d1ba5b
8 changed files with 76 additions and 8 deletions
|
@ -27,6 +27,7 @@ import org.codelibs.fess.app.web.CrudMode;
|
|||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.codelibs.fess.util.StreamUtil;
|
||||
import org.codelibs.fess.validation.UriType;
|
||||
import org.codelibs.fess.validation.UriTypeValidator.ProtocolType;
|
||||
import org.lastaflute.web.validation.Required;
|
||||
import org.lastaflute.web.validation.theme.conversion.ValidateTypeFailure;
|
||||
|
||||
|
@ -50,7 +51,7 @@ public class CreateForm implements Serializable {
|
|||
public String name;
|
||||
|
||||
@Required
|
||||
@UriType(protocols = { "file:", "smb:" })
|
||||
@UriType(protocolType = ProtocolType.FILE)
|
||||
@Size(max = 4000)
|
||||
public String paths;
|
||||
|
||||
|
|
|
@ -27,6 +27,7 @@ import org.codelibs.fess.app.web.CrudMode;
|
|||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.codelibs.fess.util.StreamUtil;
|
||||
import org.codelibs.fess.validation.UriType;
|
||||
import org.codelibs.fess.validation.UriTypeValidator.ProtocolType;
|
||||
import org.lastaflute.web.validation.Required;
|
||||
import org.lastaflute.web.validation.theme.conversion.ValidateTypeFailure;
|
||||
|
||||
|
@ -51,7 +52,7 @@ public class CreateForm implements Serializable {
|
|||
public String name;
|
||||
|
||||
@Required
|
||||
@UriType(protocols = { "http:", "https:" })
|
||||
@UriType(protocolType = ProtocolType.WEB)
|
||||
@Size(max = 4000)
|
||||
public String urls;
|
||||
|
||||
|
|
|
@ -39,6 +39,7 @@ import org.codelibs.fess.crawler.service.UrlQueueService;
|
|||
import org.codelibs.fess.es.config.exentity.FileConfig;
|
||||
import org.codelibs.fess.es.config.exentity.WebConfig;
|
||||
import org.codelibs.fess.indexer.IndexUpdater;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.lastaflute.di.core.SingletonLaContainer;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -134,6 +135,7 @@ public class WebFsIndexHelper implements Serializable {
|
|||
}
|
||||
|
||||
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
|
||||
final long startTime = System.currentTimeMillis();
|
||||
|
||||
|
@ -184,7 +186,7 @@ public class WebFsIndexHelper implements Serializable {
|
|||
for (final String u : urls) {
|
||||
if (StringUtil.isNotBlank(u)) {
|
||||
final String urlValue = u.trim();
|
||||
if (!urlValue.startsWith("#")) {
|
||||
if (!urlValue.startsWith("#") && fessConfig.isValidCrawlerWebProtocol(u)) {
|
||||
crawler.addUrl(urlValue);
|
||||
if (logger.isInfoEnabled()) {
|
||||
logger.info("Target URL: " + urlValue);
|
||||
|
@ -288,7 +290,7 @@ public class WebFsIndexHelper implements Serializable {
|
|||
if (StringUtil.isNotBlank(u)) {
|
||||
u = u.trim();
|
||||
if (!u.startsWith("#")) {
|
||||
if (!u.startsWith("file:") && !u.startsWith("smb:")) {
|
||||
if (!fessConfig.isValidCrawlerFileProtocol(u)) {
|
||||
if (u.startsWith("/")) {
|
||||
u = "file:" + u;
|
||||
} else {
|
||||
|
|
|
@ -126,6 +126,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. UTF-8 */
|
||||
String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
|
||||
|
||||
/** The key of the configuration. e.g. http,https */
|
||||
String CRAWLER_WEB_PROTOCOLS = "crawler.web.protocols";
|
||||
|
||||
/** The key of the configuration. e.g. file,smb */
|
||||
String CRAWLER_FILE_PROTOCOLS = "crawler.file.protocols";
|
||||
|
||||
/** The key of the configuration. e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* */
|
||||
String CRAWLER_METADATA_CONTENT_EXCLUDES = "crawler.metadata.content.excludes";
|
||||
|
||||
|
@ -956,6 +962,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
*/
|
||||
String getCrawlerCrawlingDataEncoding();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.web.protocols'. <br>
|
||||
* The value is, e.g. http,https <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerWebProtocols();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.file.protocols'. <br>
|
||||
* The value is, e.g. file,smb <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerFileProtocols();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.metadata.content.excludes'. <br>
|
||||
* The value is, e.g. resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.* <br>
|
||||
|
@ -2854,6 +2874,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING);
|
||||
}
|
||||
|
||||
public String getCrawlerWebProtocols() {
|
||||
return get(FessConfig.CRAWLER_WEB_PROTOCOLS);
|
||||
}
|
||||
|
||||
public String getCrawlerFileProtocols() {
|
||||
return get(FessConfig.CRAWLER_FILE_PROTOCOLS);
|
||||
}
|
||||
|
||||
public String getCrawlerMetadataContentExcludes() {
|
||||
return get(FessConfig.CRAWLER_METADATA_CONTENT_EXCLUDES);
|
||||
}
|
||||
|
|
|
@ -503,4 +503,25 @@ public interface FessProp {
|
|||
return StreamUtil.of(getAuthenticationAdminUsers().split(",")).anyMatch(s -> s.equals(username));
|
||||
}
|
||||
|
||||
String getCrawlerWebProtocols();
|
||||
|
||||
public default String[] getCrawlerWebProtocolsAsArray() {
|
||||
return StreamUtil.of(getCrawlerWebProtocols().split(",")).filter(s -> StringUtil.isNotBlank(s)).map(s -> s.trim() + ":")
|
||||
.toArray(n -> new String[n]);
|
||||
}
|
||||
|
||||
public default boolean isValidCrawlerWebProtocol(final String url) {
|
||||
return StreamUtil.of(getCrawlerWebProtocolsAsArray()).anyMatch(s -> url.startsWith(s));
|
||||
}
|
||||
|
||||
String getCrawlerFileProtocols();
|
||||
|
||||
public default String[] getCrawlerFileProtocolsAsArray() {
|
||||
return StreamUtil.of(getCrawlerFileProtocols().split(",")).filter(s -> StringUtil.isNotBlank(s)).map(s -> s.trim() + ":")
|
||||
.toArray(n -> new String[n]);
|
||||
}
|
||||
|
||||
public default boolean isValidCrawlerFileProtocol(final String url) {
|
||||
return StreamUtil.of(getCrawlerFileProtocolsAsArray()).anyMatch(s -> url.startsWith(s));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,13 +29,15 @@ import java.lang.annotation.Target;
|
|||
import javax.validation.Constraint;
|
||||
import javax.validation.Payload;
|
||||
|
||||
import org.codelibs.fess.validation.UriTypeValidator.ProtocolType;
|
||||
|
||||
@Target({ METHOD, FIELD, ANNOTATION_TYPE, CONSTRUCTOR, PARAMETER })
|
||||
@Retention(RUNTIME)
|
||||
@Documented
|
||||
@Constraint(validatedBy = UriTypeValidator.class)
|
||||
public @interface UriType {
|
||||
|
||||
String[] protocols();
|
||||
ProtocolType protocolType();
|
||||
|
||||
String message() default "{org.lastaflute.validator.constraints.UriType.message}";
|
||||
|
||||
|
|
|
@ -20,15 +20,22 @@ import javax.validation.ConstraintValidator;
|
|||
import javax.validation.ConstraintValidatorContext;
|
||||
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
|
||||
public class UriTypeValidator implements ConstraintValidator<UriType, String> {
|
||||
private String[] protocols;
|
||||
|
||||
@Override
|
||||
public void initialize(final UriType uriType) {
|
||||
protocols = uriType.protocols();
|
||||
if (protocols == null || protocols.length == 0) {
|
||||
throw new ConstraintDefinitionException("protocols is emtpy.");
|
||||
switch (uriType.protocolType()) {
|
||||
case WEB:
|
||||
protocols = ComponentUtil.getFessConfig().getCrawlerWebProtocolsAsArray();
|
||||
break;
|
||||
case FILE:
|
||||
protocols = ComponentUtil.getFessConfig().getCrawlerFileProtocolsAsArray();
|
||||
break;
|
||||
default:
|
||||
throw new ConstraintDefinitionException("protocolType is emtpy.");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -58,4 +65,8 @@ public class UriTypeValidator implements ConstraintValidator<UriType, String> {
|
|||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
 * Kind of crawl target that a URI-type constraint is validated against.
 */
public enum ProtocolType {
    /** Web crawling targets. */
    WEB,
    /** File crawling targets. */
    FILE
}
|
||||
}
|
||||
|
|
|
@ -74,6 +74,8 @@ crawler.document.unknown.hostname=unknown
|
|||
crawler.document.use.site.encoding.on.english=false
|
||||
crawler.document.append.data=true
|
||||
crawler.crawling.data.encoding=UTF-8
|
||||
crawler.web.protocols=http,https
|
||||
crawler.file.protocols=file,smb
|
||||
crawler.metadata.content.excludes=resourceName,X-Parsed-By,Content-Encoding.*,Content-Type.*
|
||||
crawler.metadata.name.mapping=\
|
||||
title=title:string\n\
|
||||
|
|
Loading…
Add table
Reference in a new issue