Prechádzať zdrojové kódy

fix #2704 add include/exclude patterns for crawling config

Shinsuke Sugaya pred 2 rokmi
rodič
commit
628abed16f

+ 7 - 1
src/main/java/org/codelibs/fess/app/web/admin/fileconfig/CreateForm.java

@@ -21,6 +21,7 @@ import javax.validation.constraints.Size;
 
 import org.codelibs.fess.Constants;
 import org.codelibs.fess.app.web.CrudMode;
+import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.validation.CustomSize;
 import org.codelibs.fess.validation.UriType;
@@ -116,12 +117,17 @@ public class CreateForm {
 
     public void initialize() {
         crudMode = CrudMode.CREATE;
+        final FessConfig fessConfig = ComponentUtil.getFessConfig();
+        includedPaths = fessConfig.getCrawlerDocumentFileDefaultIncludeIndexPatterns();
+        excludedPaths = fessConfig.getCrawlerDocumentFileDefaultExcludeIndexPatterns();
+        includedDocPaths = fessConfig.getCrawlerDocumentFileDefaultIncludeSearchPatterns();
+        excludedDocPaths = fessConfig.getCrawlerDocumentFileDefaultExcludeSearchPatterns();
         boost = 1.0f;
         numOfThread = Constants.DEFAULT_NUM_OF_THREAD_FOR_FS;
         intervalTime = Constants.DEFAULT_INTERVAL_TIME_FOR_FS;
         sortOrder = 0;
         createdBy = ComponentUtil.getSystemHelper().getUsername();
         createdTime = ComponentUtil.getSystemHelper().getCurrentTimeAsLong();
-        permissions = ComponentUtil.getFessConfig().getSearchDefaultDisplayPermission();
+        permissions = fessConfig.getSearchDefaultDisplayPermission();
     }
 }

+ 8 - 2
src/main/java/org/codelibs/fess/app/web/admin/webconfig/CreateForm.java

@@ -22,6 +22,7 @@ import javax.validation.constraints.Size;
 import org.codelibs.core.lang.StringUtil;
 import org.codelibs.fess.Constants;
 import org.codelibs.fess.app.web.CrudMode;
+import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.validation.CustomSize;
 import org.codelibs.fess.validation.UriType;
@@ -122,15 +123,20 @@ public class CreateForm {
 
     public void initialize() {
         crudMode = CrudMode.CREATE;
+        final FessConfig fessConfig = ComponentUtil.getFessConfig();
+        includedUrls = fessConfig.getCrawlerDocumentHtmlDefaultIncludeIndexPatterns();
+        excludedUrls = fessConfig.getCrawlerDocumentHtmlDefaultExcludeIndexPatterns();
+        includedDocUrls = fessConfig.getCrawlerDocumentHtmlDefaultIncludeSearchPatterns();
+        excludedDocUrls = fessConfig.getCrawlerDocumentHtmlDefaultExcludeSearchPatterns();
         boost = 1.0f;
         if (StringUtil.isBlank(userAgent)) {
-            userAgent = ComponentUtil.getFessConfig().getUserAgentName();
+            userAgent = fessConfig.getUserAgentName();
         }
         numOfThread = Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB;
         intervalTime = Constants.DEFAULT_INTERVAL_TIME_FOR_WEB;
         sortOrder = 0;
         createdBy = ComponentUtil.getSystemHelper().getUsername();
         createdTime = ComponentUtil.getSystemHelper().getCurrentTimeAsLong();
-        permissions = ComponentUtil.getFessConfig().getSearchDefaultDisplayPermission();
+        permissions = fessConfig.getSearchDefaultDisplayPermission();
     }
 }

+ 205 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -415,6 +415,18 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g.  */
     String CRAWLER_DOCUMENT_HTML_DEFAULT_LANG = "crawler.document.html.default.lang";
 
+    /** The key of the configuration. e.g.  */
+    String CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_INDEX_PATTERNS = "crawler.document.html.default.include.index.patterns";
+
+    /** The key of the configuration. e.g. (?i).*(css|js|jpeg|jpg|gif|png|bmp|wmv|xml|ico|exe) */
+    String CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_INDEX_PATTERNS = "crawler.document.html.default.exclude.index.patterns";
+
+    /** The key of the configuration. e.g.  */
+    String CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_SEARCH_PATTERNS = "crawler.document.html.default.include.search.patterns";
+
+    /** The key of the configuration. e.g.  */
+    String CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_SEARCH_PATTERNS = "crawler.document.html.default.exclude.search.patterns";
+
     /** The key of the configuration. e.g.  */
     String CRAWLER_DOCUMENT_FILE_NAME_ENCODING = "crawler.document.file.name.encoding";
 
@@ -439,6 +451,18 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g.  */
     String CRAWLER_DOCUMENT_FILE_DEFAULT_LANG = "crawler.document.file.default.lang";
 
+    /** The key of the configuration. e.g.  */
+    String CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_INDEX_PATTERNS = "crawler.document.file.default.include.index.patterns";
+
+    /** The key of the configuration. e.g.  */
+    String CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_INDEX_PATTERNS = "crawler.document.file.default.exclude.index.patterns";
+
+    /** The key of the configuration. e.g.  */
+    String CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_SEARCH_PATTERNS = "crawler.document.file.default.include.search.patterns";
+
+    /** The key of the configuration. e.g.  */
+    String CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_SEARCH_PATTERNS = "crawler.document.file.default.exclude.search.patterns";
+
     /** The key of the configuration. e.g. true */
     String CRAWLER_DOCUMENT_CACHE_ENABLED = "crawler.document.cache.enabled";
 
@@ -2930,6 +2954,58 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
     Integer getCrawlerDocumentHtmlDefaultLangAsInteger();
 
+    /**
+     * Get the value for the key 'crawler.document.html.default.include.index.patterns'. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentHtmlDefaultIncludeIndexPatterns();
+
+    /**
+     * Get the value for the key 'crawler.document.html.default.include.index.patterns' as {@link Integer}. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentHtmlDefaultIncludeIndexPatternsAsInteger();
+
+    /**
+     * Get the value for the key 'crawler.document.html.default.exclude.index.patterns'. <br>
+     * The value is, e.g. (?i).*(css|js|jpeg|jpg|gif|png|bmp|wmv|xml|ico|exe) <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentHtmlDefaultExcludeIndexPatterns();
+
+    /**
+     * Get the value for the key 'crawler.document.html.default.include.search.patterns'. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentHtmlDefaultIncludeSearchPatterns();
+
+    /**
+     * Get the value for the key 'crawler.document.html.default.include.search.patterns' as {@link Integer}. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentHtmlDefaultIncludeSearchPatternsAsInteger();
+
+    /**
+     * Get the value for the key 'crawler.document.html.default.exclude.search.patterns'. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentHtmlDefaultExcludeSearchPatterns();
+
+    /**
+     * Get the value for the key 'crawler.document.html.default.exclude.search.patterns' as {@link Integer}. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentHtmlDefaultExcludeSearchPatternsAsInteger();
+
     /**
      * Get the value for the key 'crawler.document.file.name.encoding'. <br>
      * The value is, e.g.  <br>
@@ -3041,6 +3117,66 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
     Integer getCrawlerDocumentFileDefaultLangAsInteger();
 
+    /**
+     * Get the value for the key 'crawler.document.file.default.include.index.patterns'. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentFileDefaultIncludeIndexPatterns();
+
+    /**
+     * Get the value for the key 'crawler.document.file.default.include.index.patterns' as {@link Integer}. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentFileDefaultIncludeIndexPatternsAsInteger();
+
+    /**
+     * Get the value for the key 'crawler.document.file.default.exclude.index.patterns'. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentFileDefaultExcludeIndexPatterns();
+
+    /**
+     * Get the value for the key 'crawler.document.file.default.exclude.index.patterns' as {@link Integer}. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentFileDefaultExcludeIndexPatternsAsInteger();
+
+    /**
+     * Get the value for the key 'crawler.document.file.default.include.search.patterns'. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentFileDefaultIncludeSearchPatterns();
+
+    /**
+     * Get the value for the key 'crawler.document.file.default.include.search.patterns' as {@link Integer}. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentFileDefaultIncludeSearchPatternsAsInteger();
+
+    /**
+     * Get the value for the key 'crawler.document.file.default.exclude.search.patterns'. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentFileDefaultExcludeSearchPatterns();
+
+    /**
+     * Get the value for the key 'crawler.document.file.default.exclude.search.patterns' as {@link Integer}. <br>
+     * The value is, e.g.  <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentFileDefaultExcludeSearchPatternsAsInteger();
+
     /**
      * Get the value for the key 'crawler.document.cache.enabled'. <br>
      * The value is, e.g. true <br>
@@ -8031,6 +8167,34 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return getAsInteger(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_LANG);
         }
 
+        public String getCrawlerDocumentHtmlDefaultIncludeIndexPatterns() {
+            return get(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_INDEX_PATTERNS);
+        }
+
+        public Integer getCrawlerDocumentHtmlDefaultIncludeIndexPatternsAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_INDEX_PATTERNS);
+        }
+
+        public String getCrawlerDocumentHtmlDefaultExcludeIndexPatterns() {
+            return get(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_INDEX_PATTERNS);
+        }
+
+        public String getCrawlerDocumentHtmlDefaultIncludeSearchPatterns() {
+            return get(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_SEARCH_PATTERNS);
+        }
+
+        public Integer getCrawlerDocumentHtmlDefaultIncludeSearchPatternsAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_SEARCH_PATTERNS);
+        }
+
+        public String getCrawlerDocumentHtmlDefaultExcludeSearchPatterns() {
+            return get(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_SEARCH_PATTERNS);
+        }
+
+        public Integer getCrawlerDocumentHtmlDefaultExcludeSearchPatternsAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_SEARCH_PATTERNS);
+        }
+
         public String getCrawlerDocumentFileNameEncoding() {
             return get(FessConfig.CRAWLER_DOCUMENT_FILE_NAME_ENCODING);
         }
@@ -8091,6 +8255,38 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_LANG);
         }
 
+        public String getCrawlerDocumentFileDefaultIncludeIndexPatterns() {
+            return get(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_INDEX_PATTERNS);
+        }
+
+        public Integer getCrawlerDocumentFileDefaultIncludeIndexPatternsAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_INDEX_PATTERNS);
+        }
+
+        public String getCrawlerDocumentFileDefaultExcludeIndexPatterns() {
+            return get(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_INDEX_PATTERNS);
+        }
+
+        public Integer getCrawlerDocumentFileDefaultExcludeIndexPatternsAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_INDEX_PATTERNS);
+        }
+
+        public String getCrawlerDocumentFileDefaultIncludeSearchPatterns() {
+            return get(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_SEARCH_PATTERNS);
+        }
+
+        public Integer getCrawlerDocumentFileDefaultIncludeSearchPatternsAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_SEARCH_PATTERNS);
+        }
+
+        public String getCrawlerDocumentFileDefaultExcludeSearchPatterns() {
+            return get(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_SEARCH_PATTERNS);
+        }
+
+        public Integer getCrawlerDocumentFileDefaultExcludeSearchPatternsAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_SEARCH_PATTERNS);
+        }
+
         public String getCrawlerDocumentCacheEnabled() {
             return get(FessConfig.CRAWLER_DOCUMENT_CACHE_ENABLED);
         }
@@ -10548,6 +10744,11 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_PRUNED_TAGS, "noscript,script,style,header,footer,nav,a[rel=nofollow]");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH, "120");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_LANG, "");
+            defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_INDEX_PATTERNS, "");
+            defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_INDEX_PATTERNS,
+                    "(?i).*(css|js|jpeg|jpg|gif|png|bmp|wmv|xml|ico|exe)");
+            defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_INCLUDE_SEARCH_PATTERNS, "");
+            defaultMap.put(FessConfig.CRAWLER_DOCUMENT_HTML_DEFAULT_EXCLUDE_SEARCH_PATTERNS, "");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_NAME_ENCODING, "");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL, "No title.");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT, "false");
@@ -10556,6 +10757,10 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_META_CONTENT, "true");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_BODY_CONTENT, "true");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_LANG, "");
+            defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_INDEX_PATTERNS, "");
+            defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_INDEX_PATTERNS, "");
+            defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_INCLUDE_SEARCH_PATTERNS, "");
+            defaultMap.put(FessConfig.CRAWLER_DOCUMENT_FILE_DEFAULT_EXCLUDE_SEARCH_PATTERNS, "");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_CACHE_ENABLED, "true");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_CACHE_MAX_SIZE, "2621440");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_CACHE_SUPPORTED_MIMETYPES, "text/html");

+ 8 - 0
src/main/resources/fess_config.properties

@@ -239,6 +239,10 @@ crawler.document.html.canonical.xpath=//LINK[@rel='canonical'][1]/@href
 crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav,a[rel=nofollow]
 crawler.document.html.max.digest.length=120
 crawler.document.html.default.lang=
+crawler.document.html.default.include.index.patterns=
+crawler.document.html.default.exclude.index.patterns=(?i).*(css|js|jpeg|jpg|gif|png|bmp|wmv|xml|ico|exe)
+crawler.document.html.default.include.search.patterns=
+crawler.document.html.default.exclude.search.patterns=
 
 # file
 crawler.document.file.name.encoding=
@@ -249,6 +253,10 @@ crawler.document.file.max.digest.length=200
 crawler.document.file.append.meta.content=true
 crawler.document.file.append.body.content=true
 crawler.document.file.default.lang=
+crawler.document.file.default.include.index.patterns=
+crawler.document.file.default.exclude.index.patterns=
+crawler.document.file.default.include.search.patterns=
+crawler.document.file.default.exclude.search.patterns=
 
 # cache
 crawler.document.cache.enabled=true