Fix #682: add zh-cn (Simplified Chinese) analyzer

Fix #683: add zh-tw (Traditional Chinese) analyzer
2017-06-17 17:00:22 +09:00 · 2017-06-17 17:00:22 +09:00 · 85d8395299
commit 85d8395299
parent f0d3c6f40e
6 changed files with 161 additions and 3 deletions
--- a/plugin.xml
+++ b/plugin.xml
@ -20,7 +20,7 @@
 			<param name="plugin.name.prefix" value="elasticsearch-" />
 			<param name="plugin.name" value="analysis-fess" />
 			<param name="plugin.version" value="5.4.2-SNAPSHOT" />
-			<param name="plugin.zip.version" value="5.4.2-20170616.220752-1" />
+			<param name="plugin.zip.version" value="5.4.2-20170617.065236-3" />
 		</antcall>
 		<!-- analysis-ja -->
 		<antcall target="install.plugin">
--- a/src/main/resources/esclient.xml
+++ b/src/main/resources/esclient.xml
@ -148,6 +148,14 @@
 			<arg>"fess"</arg>
 			<arg>"tr/protwords.txt"</arg>
 		</postConstruct>
+		<postConstruct name="addConfigFile">
+			<arg>"fess"</arg>
+			<arg>"zh-cn/stopwords.txt"</arg>
+		</postConstruct>
+		<postConstruct name="addConfigFile">
+			<arg>"fess"</arg>
+			<arg>"zh-tw/stopwords.txt"</arg>
+		</postConstruct>
 		<!-- fess index -->
 		<postConstruct name="addIndexConfig">
 			<arg>"fess/doc"</arg>
--- a/src/main/resources/fess_indices/fess.json
+++ b/src/main/resources/fess_indices/fess.json
@ -15,6 +15,10 @@
          "type": "mapping",
          "mappings_path": "${fess.dictionary.path}ja/mapping.txt"
        },
+        "traditional_chinese_convert": {
+          "type": "fess_traditional_chinese_convert",
+          "convert_type": "t2s"
+        },
        "zero_width_spaces": {
            "type":       "mapping",
            "mappings": [ "\\u200C=> "]
@ -431,6 +435,10 @@
          "type":       "stemmer",
          "language":   "russian"
        },
+        "simplified_chinese_stop": {
+          "type":       "stop",
+          "stopwords_path": "${fess.dictionary.path}zh-cn/stopwords.txt"
+        },
        "sorani_stop": {
          "type":       "stop",
          "stopwords":  "_sorani_"
@ -471,6 +479,10 @@
          "type":       "stop",
          "stopwords":  "_thai_"
        },
+        "traditional_chinese_stop": {
+          "type":       "stop",
+          "stopwords_path": "${fess.dictionary.path}zh-tw/stopwords.txt"
+        },
        "turkish_stop": {
          "type":       "stop",
          "stopwords":  "_turkish_"
@ -524,6 +536,9 @@
            "pos_tagging": false,
            "user_dict_path": "${fess.dictionary.path}ko/seunjeon.txt"
        },
+        "simplified_chinese_tokenizer": {
+            "type": "fess_simplified_chinese_tokenizer"
+        },
        "vietnamese_tokenizer": {
            "type": "fess_vietnamese_tokenizer",
            "sentence_detector": false,
@ -865,6 +880,14 @@
            "russian_stemmer"
          ]
        },
+        "simplified_chinese_analyzer": {
+          "tokenizer":  "simplified_chinese_tokenizer",
+          "filter": [
+            "truncate10_filter",
+            "lowercase",
+            "simplified_chinese_stop"
+          ]
+        },
        "sorani_analyzer": {
          "tokenizer":  "standard",
          "filter": [
@ -907,6 +930,15 @@
            "thai_stop"
          ]
        },
+        "traditional_chinese_analyzer": {
+          "char_filter": [ "traditional_chinese_convert" ],
+          "tokenizer":  "simplified_chinese_tokenizer",
+          "filter": [
+            "truncate10_filter",
+            "lowercase",
+            "traditional_chinese_stop"
+          ]
+        },
        "turkish_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
--- a/src/main/resources/fess_indices/fess/doc.json
+++ b/src/main/resources/fess_indices/fess/doc.json
@ -471,7 +471,7 @@
          "match": "*_zh-cn",
          "mapping": {
            "type": "text",
-            "analyzer": "empty_analyzer"
+            "analyzer": "simplified_chinese_analyzer"
          }
        }
      },
@ -480,7 +480,7 @@
          "match": "*_zh-tw",
          "mapping": {
            "type": "text",
-            "analyzer": "empty_analyzer"
+            "analyzer": "traditional_chinese_analyzer"
          }
        }
      }
--- a/src/main/resources/fess_indices/fess/zh-cn/stopwords.txt
+++ b/src/main/resources/fess_indices/fess/zh-cn/stopwords.txt
@ -0,0 +1,59 @@
+# Punctuation tokens to remove
+,
+.
+`
+-
+_
+=
+?
+'
+|
+"
+(
+)
+{
+}
+[
+]
+<
+>
+*
+#
+&
+^
+$
+@
+!
+~
+:
+;
+
+/
+\
+《
+》
+—
+－
+，
+。
+、
+：
+；
+！
+·
+？
+“
+”
+）
+（
+【
+】
+［
+］
+●
+# The line below contains an IDEOGRAPHIC SPACE character (U+3000), which is used as a space in Chinese text.
+　
+
+# English Stop Words
+
+# Chinese Stop Words
--- a/src/main/resources/fess_indices/fess/zh-tw/stopwords.txt
+++ b/src/main/resources/fess_indices/fess/zh-tw/stopwords.txt
@ -0,0 +1,59 @@
+# Punctuation tokens to remove
+,
+.
+`
+-
+_
+=
+?
+'
+|
+"
+(
+)
+{
+}
+[
+]
+<
+>
+*
+#
+&
+^
+$
+@
+!
+~
+:
+;
+
+/
+\
+《
+》
+—
+－
+，
+。
+、
+：
+；
+！
+·
+？
+“
+”
+）
+（
+【
+】
+［
+］
+●
+# The line below contains an IDEOGRAPHIC SPACE character (U+3000), which is used as a space in Chinese text.
+　
+
+# English Stop Words
+
+# Chinese Stop Words