Przeglądaj źródła

fix #440 : modify index/mapping

Shinsuke Sugaya 9 lat temu
rodzic
commit
92dc3d378f

+ 12 - 0
src/main/resources/crawler/extractor+tikaExtractor.xml

@@ -0,0 +1,12 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
+	"http://dbflute.org/meta/lastadi10.dtd">
+<components namespace="fessCrawler">
+	<include path="crawler/container.xml" />
+	<component name="tikaExtractor"
+		class="org.codelibs.fess.crawler.extractor.impl.TikaExtractor">
+		<property name="maxCompressionRatio">2</property>
+		<property name="maxUncompressionSize">10000000</property>
+		<property name="maxAlphanumTermSize">20</property>
+	</component>
+</components>

+ 2 - 2
src/main/resources/fess_config.properties

@@ -21,7 +21,7 @@ app.digest.algorism=sha256
 jvm.crawler.options=\
 -Djava.awt.headless=true\n\
 -server\n\
--Xmx256m\n\
+-Xmx512m\n\
 -XX:MaxMetaspaceSize=128m\n\
 -XX:CompressedClassSpaceSize=32m\n\
 -XX:-UseGCOverheadLimit\n\
@@ -77,7 +77,7 @@ crawler.document.site.encoding=UTF-8
 crawler.document.unknown.hostname=unknown
 crawler.document.use.site.encoding.on.english=false
 crawler.document.append.data=true
-crawler.document.max.alphanum.term.size=-1
+crawler.document.max.alphanum.term.size=20
 crawler.crawling.data.encoding=UTF-8
 crawler.web.protocols=http,https
 crawler.file.protocols=file,smb

+ 89 - 2
src/main/resources/fess_indices/fess.json

@@ -10,6 +10,11 @@
         "mapping_ja_filter": {
           "type": "mapping",
           "mappings_path": "${fess.dictionary.path}ja/mapping.txt"
+        },
+        "removeall_filter": {
+          "type": "pattern_replace",
+          "pattern":"(.*)",
+          "replacement":""
         }
       },
       "filter": {
@@ -24,6 +29,72 @@
         "stopword_en_filter": {
           "type": "stop",
           "stopwords": "_english_"
+        },
+        "kuromoji_neologd_pos_filter" : {
+          "type" : "kuromoji_neologd_part_of_speech",
+          "stoptags" : [
+            "その他",
+            "その他-間投",
+            "フィラー",
+            "感動詞",
+            "記号",
+            "記号-アルファベット",
+            "記号-一般",
+            "記号-括弧開",
+            "記号-括弧閉",
+            "記号-句点",
+            "記号-空白",
+            "記号-読点",
+            "形容詞",
+            "形容詞-自立",
+            "形容詞-接尾",
+            "形容詞-非自立",
+            "語断片",
+            "助詞",
+            "助詞-格助詞",
+            "助詞-格助詞-一般",
+            "助詞-格助詞-引用",
+            "助詞-格助詞-連語",
+            "助詞-間投助詞",
+            "助詞-係助詞",
+            "助詞-終助詞",
+            "助詞-接続助詞",
+            "助詞-特殊",
+            "助詞-副詞化",
+            "助詞-副助詞",
+            "助詞-副助詞/並立助詞/終助詞",
+            "助詞-並立助詞",
+            "助詞-連体化",
+            "助動詞",
+            "接続詞",
+            "接頭詞",
+            "接頭詞-形容詞接続",
+            "接頭詞-数接続",
+            "接頭詞-動詞接続",
+            "接頭詞-名詞接続",
+            "動詞",
+            "動詞-自立",
+            "動詞-接尾",
+            "動詞-非自立",
+            "非言語音",
+            "副詞",
+            "副詞-一般",
+            "副詞-助詞類接続",
+            "未知語",
+            "連体詞"
+          ]
+        },
+        "truncate10_filter" : {
+          "type" : "truncate",
+          "length" : 10
+        },
+        "truncate20_filter" : {
+          "type" : "truncate",
+          "length" : 20
+        },
+        "alphanum_word_filter" : {
+          "type" : "alphanum_word",
+          "max_token_length" : 20
         }
       },
       "tokenizer": {
@@ -40,6 +111,13 @@
           "synonyms_path": "${fess.dictionary.path}synonym.txt",
           "dynamic_reload":true,
           "reload_interval":"1m"
+        },
+        "bigram_synonym_tokenizer": {
+          "type": "ngram_synonym",
+          "n": "2",
+          "synonyms_path": "${fess.dictionary.path}synonym.txt",
+          "dynamic_reload":true,
+          "reload_interval":"1m"
         }
       },
       "analyzer": {
@@ -51,9 +129,10 @@
           ],
           "tokenizer": "kuromoji_neologd_tokenizer",
           "filter": [
+            "truncate10_filter",
             "kuromoji_neologd_baseform",
             "kuromoji_neologd_stemmer",
-            "kuromoji_neologd_part_of_speech",
+            "kuromoji_neologd_pos_filter",
             "lowercase"
           ]
         },
@@ -61,10 +140,18 @@
           "type": "custom",
           "tokenizer": "standard",
           "filter": [
+            "truncate20_filter",
             "lowercase",
             "possessive_stemmer_en_filter"
           ]
         },
+        "empty_analyzer": {
+          "type": "custom",
+          "tokenizer": "standard",
+          "char_filter": [
+            "removeall_filter"
+          ]
+        },
         "standard_analyzer": {
           "type": "custom",
           "char_filter": [
@@ -72,7 +159,7 @@
           ],
           "tokenizer": "unigram_synonym_tokenizer",
           "filter": [
-            "alphanum_word",
+            "alphanum_word_filter",
             "cjk_bigram",
             "stopword_en_filter",
             "lowercase",

+ 45 - 45
src/main/resources/fess_indices/fess/doc.json

@@ -12,7 +12,7 @@
           "match": "*_ar",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -21,7 +21,7 @@
           "match": "*_bg",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -30,7 +30,7 @@
           "match": "*_bn",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -39,7 +39,7 @@
           "match": "*_ca",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -48,7 +48,7 @@
           "match": "*_cs",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -57,7 +57,7 @@
           "match": "*_da",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -66,7 +66,7 @@
           "match": "*_de",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -75,7 +75,7 @@
           "match": "*_el",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -93,7 +93,7 @@
           "match": "*_es",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -102,7 +102,7 @@
           "match": "*_et",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -111,7 +111,7 @@
           "match": "*_fa",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -120,7 +120,7 @@
           "match": "*_fi",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -129,7 +129,7 @@
           "match": "*_fr",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -138,7 +138,7 @@
           "match": "*_gu",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -147,7 +147,7 @@
           "match": "*_he",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -156,7 +156,7 @@
           "match": "*_hi",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -165,7 +165,7 @@
           "match": "*_hr",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -174,7 +174,7 @@
           "match": "*_hu",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -183,7 +183,7 @@
           "match": "*_id",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -192,7 +192,7 @@
           "match": "*_it",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -210,7 +210,7 @@
           "match": "*_ko",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -219,7 +219,7 @@
           "match": "*_lt",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -228,7 +228,7 @@
           "match": "*_lv",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -237,7 +237,7 @@
           "match": "*_mk",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -246,7 +246,7 @@
           "match": "*_ml",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -255,7 +255,7 @@
           "match": "*_nl",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -264,7 +264,7 @@
           "match": "*_no",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -273,7 +273,7 @@
           "match": "*_pa",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -282,7 +282,7 @@
           "match": "*_pl",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -291,7 +291,7 @@
           "match": "*_pt",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -300,7 +300,7 @@
           "match": "*_ro",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -309,7 +309,7 @@
           "match": "*_ru",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -318,7 +318,7 @@
           "match": "*_si",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -327,7 +327,7 @@
           "match": "*_sq",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -336,7 +336,7 @@
           "match": "*_sv",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -345,7 +345,7 @@
           "match": "*_ta",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -354,7 +354,7 @@
           "match": "*_te",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -363,7 +363,7 @@
           "match": "*_th",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -372,7 +372,7 @@
           "match": "*_tl",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -381,7 +381,7 @@
           "match": "*_tr",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -390,7 +390,7 @@
           "match": "*_uk",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -399,7 +399,7 @@
           "match": "*_ur",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -408,7 +408,7 @@
           "match": "*_vi",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -417,7 +417,7 @@
           "match": "*_zh-cn",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       },
@@ -426,7 +426,7 @@
           "match": "*_zh-tw",
           "mapping": {
             "type": "string",
-            "analyzer": "standard_analyzer"
+            "analyzer": "empty_analyzer"
           }
         }
       }