fix #2704 add include/exclude patterns for crawling config
This commit is contained in:
parent
75dd7830fa
commit
628abed16f
4 changed files with 228 additions and 3 deletions
|
@ -21,6 +21,7 @@ import javax.validation.constraints.Size;
|
|||
|
||||
import org.codelibs.fess.Constants;
|
||||
import org.codelibs.fess.app.web.CrudMode;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.codelibs.fess.validation.CustomSize;
|
||||
import org.codelibs.fess.validation.UriType;
|
||||
|
@ -116,12 +117,17 @@ public class CreateForm {
|
|||
|
||||
public void initialize() {
|
||||
crudMode = CrudMode.CREATE;
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
includedPaths = fessConfig.getCrawlerDocumentFileDefaultIncludeIndexPatterns();
|
||||
excludedPaths = fessConfig.getCrawlerDocumentFileDefaultExcludeIndexPatterns();
|
||||
includedDocPaths = fessConfig.getCrawlerDocumentFileDefaultIncludeSearchPatterns();
|
||||
excludedDocPaths = fessConfig.getCrawlerDocumentFileDefaultExcludeSearchPatterns();
|
||||
boost = 1.0f;
|
||||
numOfThread = Constants.DEFAULT_NUM_OF_THREAD_FOR_FS;
|
||||
intervalTime = Constants.DEFAULT_INTERVAL_TIME_FOR_FS;
|
||||
sortOrder = 0;
|
||||
createdBy = ComponentUtil.getSystemHelper().getUsername();
|
||||
createdTime = ComponentUtil.getSystemHelper().getCurrentTimeAsLong();
|
||||
permissions = ComponentUtil.getFessConfig().getSearchDefaultDisplayPermission();
|
||||
permissions = fessConfig.getSearchDefaultDisplayPermission();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -22,6 +22,7 @@ import javax.validation.constraints.Size;
|
|||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.fess.Constants;
|
||||
import org.codelibs.fess.app.web.CrudMode;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.codelibs.fess.validation.CustomSize;
|
||||
import org.codelibs.fess.validation.UriType;
|
||||
|
@ -122,15 +123,20 @@ public class CreateForm {
|
|||
|
||||
public void initialize() {
|
||||
crudMode = CrudMode.CREATE;
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
includedUrls = fessConfig.getCrawlerDocumentHtmlDefaultIncludeIndexPatterns();
|
||||
excludedUrls = fessConfig.getCrawlerDocumentHtmlDefaultExcludeIndexPatterns();
|
||||
includedDocUrls = fessConfig.getCrawlerDocumentHtmlDefaultIncludeSearchPatterns();
|
||||
excludedDocUrls = fessConfig.getCrawlerDocumentHtmlDefaultExcludeSearchPatterns();
|
||||
boost = 1.0f;
|
||||
if (StringUtil.isBlank(userAgent)) {
|
||||
userAgent = ComponentUtil.getFessConfig().getUserAgentName();
|
||||
userAgent = fessConfig.getUserAgentName();
|
||||
}
|
||||
numOfThread = Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB;
|
||||
intervalTime = Constants.DEFAULT_INTERVAL_TIME_FOR_WEB;
|
||||
sortOrder = 0;
|
||||
createdBy = ComponentUtil.getSystemHelper().getUsername();
|
||||
createdTime = ComponentUtil.getSystemHelper().getCurrentTimeAsLong();
|
||||
permissions = ComponentUtil.getFessConfig().getSearchDefaultDisplayPermission();
|
||||
permissions = fessConfig.getSearchDefaultDisplayPermission();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -415,6 +415,18 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. The default value is empty. */
|
||||
String CRAWLER_DOCUMENT_HTML_DEFAULT_LANG = "crawler.document.html.default.lang";
|
||||
|
||||
/** The key of the configuration. The default value is empty. */
|
||||
String CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_INDEX_PATTERNS = "crawler.document.html.default.include.index.patterns";
|
||||
|
||||
/** The key of the configuration. e.g. (?i).*(css|js|jpeg|jpg|gif|png|bmp|wmv|xml|ico|exe) */
|
||||
String CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_INDEX_PATTERNS = "crawler.document.html.default.exclude.index.patterns";
|
||||
|
||||
/** The key of the configuration. The default value is empty. */
|
||||
String CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_SEARCH_PATTERNS = "crawler.document.html.default.include.search.patterns";
|
||||
|
||||
/** The key of the configuration. The default value is empty. */
|
||||
String CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_SEARCH_PATTERNS = "crawler.document.html.default.exclude.search.patterns";
|
||||
|
||||
/** The key of the configuration. The default value is empty. */
|
||||
String CRAWLER_DOCUMENT_FILE_NAME_ENCODING = "crawler.document.file.name.encoding";
|
||||
|
||||
|
@ -439,6 +451,18 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. The default value is empty. */
|
||||
String CRAWLER_DOCUMENT_FILE_DEFAULT_LANG = "crawler.document.file.default.lang";
|
||||
|
||||
/** The key of the configuration. The default value is empty. */
|
||||
String CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_INDEX_PATTERNS = "crawler.document.file.default.include.index.patterns";
|
||||
|
||||
/** The key of the configuration. The default value is empty. */
|
||||
String CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_INDEX_PATTERNS = "crawler.document.file.default.exclude.index.patterns";
|
||||
|
||||
/** The key of the configuration. The default value is empty. */
|
||||
String CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_SEARCH_PATTERNS = "crawler.document.file.default.include.search.patterns";
|
||||
|
||||
/** The key of the configuration. The default value is empty. */
|
||||
String CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_SEARCH_PATTERNS = "crawler.document.file.default.exclude.search.patterns";
|
||||
|
||||
/** The key of the configuration. e.g. true */
|
||||
String CRAWLER_DOCUMENT_CACHE_ENABLED = "crawler.document.cache.enabled";
|
||||
|
||||
|
@ -2930,6 +2954,58 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
*/
|
||||
Integer getCrawlerDocumentHtmlDefaultLangAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.html.default.include.index.patterns'. <br>
|
||||
* The default value is empty. <br>
* CreateForm.initialize() assigns this value to includedUrls. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentHtmlDefaultIncludeIndexPatterns();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.html.default.include.index.patterns' as {@link Integer}. <br>
|
||||
* The default value is empty. <br>
* NOTE(review): the key name suggests pattern text rather than a number; confirm this Integer view is needed. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getCrawlerDocumentHtmlDefaultIncludeIndexPatternsAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.html.default.exclude.index.patterns'. <br>
|
||||
* The value is, e.g. (?i).*(css|js|jpeg|jpg|gif|png|bmp|wmv|xml|ico|exe) <br>
* CreateForm.initialize() assigns this value to excludedUrls. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentHtmlDefaultExcludeIndexPatterns();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.html.default.include.search.patterns'. <br>
|
||||
* The default value is empty. <br>
* CreateForm.initialize() assigns this value to includedDocUrls. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentHtmlDefaultIncludeSearchPatterns();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.html.default.include.search.patterns' as {@link Integer}. <br>
|
||||
* The default value is empty. <br>
* NOTE(review): the key name suggests pattern text rather than a number; confirm this Integer view is needed. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getCrawlerDocumentHtmlDefaultIncludeSearchPatternsAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.html.default.exclude.search.patterns'. <br>
|
||||
* The default value is empty. <br>
* CreateForm.initialize() assigns this value to excludedDocUrls. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentHtmlDefaultExcludeSearchPatterns();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.html.default.exclude.search.patterns' as {@link Integer}. <br>
|
||||
* The default value is empty. <br>
* NOTE(review): the key name suggests pattern text rather than a number; confirm this Integer view is needed. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getCrawlerDocumentHtmlDefaultExcludeSearchPatternsAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.file.name.encoding'. <br>
|
||||
* The value is, e.g. <br>
|
||||
|
@ -3041,6 +3117,66 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
*/
|
||||
Integer getCrawlerDocumentFileDefaultLangAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.file.default.include.index.patterns'. <br>
|
||||
* The default value is empty. <br>
* CreateForm.initialize() assigns this value to includedPaths. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentFileDefaultIncludeIndexPatterns();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.file.default.include.index.patterns' as {@link Integer}. <br>
|
||||
* The default value is empty. <br>
* NOTE(review): the key name suggests pattern text rather than a number; confirm this Integer view is needed. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getCrawlerDocumentFileDefaultIncludeIndexPatternsAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.file.default.exclude.index.patterns'. <br>
|
||||
* The default value is empty. <br>
* CreateForm.initialize() assigns this value to excludedPaths. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentFileDefaultExcludeIndexPatterns();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.file.default.exclude.index.patterns' as {@link Integer}. <br>
|
||||
* The default value is empty. <br>
* NOTE(review): the key name suggests pattern text rather than a number; confirm this Integer view is needed. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getCrawlerDocumentFileDefaultExcludeIndexPatternsAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.file.default.include.search.patterns'. <br>
|
||||
* The default value is empty. <br>
* CreateForm.initialize() assigns this value to includedDocPaths. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentFileDefaultIncludeSearchPatterns();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.file.default.include.search.patterns' as {@link Integer}. <br>
|
||||
* The default value is empty. <br>
* NOTE(review): the key name suggests pattern text rather than a number; confirm this Integer view is needed. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getCrawlerDocumentFileDefaultIncludeSearchPatternsAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.file.default.exclude.search.patterns'. <br>
|
||||
* The default value is empty. <br>
* CreateForm.initialize() assigns this value to excludedDocPaths. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentFileDefaultExcludeSearchPatterns();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.file.default.exclude.search.patterns' as {@link Integer}. <br>
|
||||
* The default value is empty. <br>
* NOTE(review): the key name suggests pattern text rather than a number; confirm this Integer view is needed. <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getCrawlerDocumentFileDefaultExcludeSearchPatternsAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.cache.enabled'. <br>
|
||||
* The value is, e.g. true <br>
|
||||
|
@ -8031,6 +8167,34 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_LANG);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentHtmlDefaultIncludeIndexPatterns() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_INDEX_PATTERNS);
|
||||
}
|
||||
|
||||
public Integer getCrawlerDocumentHtmlDefaultIncludeIndexPatternsAsInteger() {
|
||||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_INDEX_PATTERNS);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentHtmlDefaultExcludeIndexPatterns() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_INDEX_PATTERNS);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentHtmlDefaultIncludeSearchPatterns() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_SEARCH_PATTERNS);
|
||||
}
|
||||
|
||||
public Integer getCrawlerDocumentHtmlDefaultIncludeSearchPatternsAsInteger() {
|
||||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_SEARCH_PATTERNS);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentHtmlDefaultExcludeSearchPatterns() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_SEARCH_PATTERNS);
|
||||
}
|
||||
|
||||
public Integer getCrawlerDocumentHtmlDefaultExcludeSearchPatternsAsInteger() {
|
||||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_SEARCH_PATTERNS);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentFileNameEncoding() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_FILE_NAME_ENCODING);
|
||||
}
|
||||
|
@ -8091,6 +8255,38 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_LANG);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentFileDefaultIncludeIndexPatterns() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_INDEX_PATTERNS);
|
||||
}
|
||||
|
||||
public Integer getCrawlerDocumentFileDefaultIncludeIndexPatternsAsInteger() {
|
||||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_INDEX_PATTERNS);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentFileDefaultExcludeIndexPatterns() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_INDEX_PATTERNS);
|
||||
}
|
||||
|
||||
public Integer getCrawlerDocumentFileDefaultExcludeIndexPatternsAsInteger() {
|
||||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_INDEX_PATTERNS);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentFileDefaultIncludeSearchPatterns() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_SEARCH_PATTERNS);
|
||||
}
|
||||
|
||||
public Integer getCrawlerDocumentFileDefaultIncludeSearchPatternsAsInteger() {
|
||||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_SEARCH_PATTERNS);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentFileDefaultExcludeSearchPatterns() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_SEARCH_PATTERNS);
|
||||
}
|
||||
|
||||
public Integer getCrawlerDocumentFileDefaultExcludeSearchPatternsAsInteger() {
|
||||
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_SEARCH_PATTERNS);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentCacheEnabled() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_CACHE_ENABLED);
|
||||
}
|
||||
|
@ -10548,6 +10744,11 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_PRUNED_TAGS, "noscript,script,style,header,footer,nav,a[rel=nofollow]");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH, "120");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_LANG, "");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_INDEX_PATTERNS, "");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_INDEX_PATTERNS,
|
||||
"(?i).*(css|js|jpeg|jpg|gif|png|bmp|wmv|xml|ico|exe)");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_SEARCH_PATTERNS, "");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_SEARCH_PATTERNS, "");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_NAME_ENCODING, "");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL, "No title.");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT, "false");
|
||||
|
@ -10556,6 +10757,10 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_META_CONTENT, "true");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_BODY_CONTENT, "true");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_LANG, "");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_INDEX_PATTERNS, "");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_INDEX_PATTERNS, "");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_SEARCH_PATTERNS, "");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_SEARCH_PATTERNS, "");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_CACHE_ENABLED, "true");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_CACHE_MAX_SIZE, "2621440");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_CACHE_SUPPORTED_MIMETYPES, "text/html");
|
||||
|
|
|
@ -239,6 +239,10 @@ crawler.document.html.canonical.xpath=//LINK[@rel='canonical'][1]/@href
|
|||
crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav,a[rel=nofollow]
|
||||
crawler.document.html.max.digest.length=120
|
||||
crawler.document.html.default.lang=
|
||||
crawler.document.html.default.include.index.patterns=
|
||||
crawler.document.html.default.exclude.index.patterns=(?i).*(css|js|jpeg|jpg|gif|png|bmp|wmv|xml|ico|exe)
|
||||
crawler.document.html.default.include.search.patterns=
|
||||
crawler.document.html.default.exclude.search.patterns=
|
||||
|
||||
# file
|
||||
crawler.document.file.name.encoding=
|
||||
|
@ -249,6 +253,10 @@ crawler.document.file.max.digest.length=200
|
|||
crawler.document.file.append.meta.content=true
|
||||
crawler.document.file.append.body.content=true
|
||||
crawler.document.file.default.lang=
|
||||
crawler.document.file.default.include.index.patterns=
|
||||
crawler.document.file.default.exclude.index.patterns=
|
||||
crawler.document.file.default.include.search.patterns=
|
||||
crawler.document.file.default.exclude.search.patterns=
|
||||
|
||||
# cache
|
||||
crawler.document.cache.enabled=true
|
||||
|
|
Loading…
Add table
Reference in a new issue