fix #682 add zh-cn analyzer

fix #683 add zh-tw analyzer
This commit is contained in:
Shinsuke Sugaya 2017-06-17 17:00:22 +09:00
parent f0d3c6f40e
commit 85d8395299
6 changed files with 161 additions and 3 deletions

View file

@ -20,7 +20,7 @@
<param name="plugin.name.prefix" value="elasticsearch-" />
<param name="plugin.name" value="analysis-fess" />
<param name="plugin.version" value="5.4.2-SNAPSHOT" />
<param name="plugin.zip.version" value="5.4.2-20170616.220752-1" />
<param name="plugin.zip.version" value="5.4.2-20170617.065236-3" />
</antcall>
<!-- analysis-ja -->
<antcall target="install.plugin">

View file

@ -148,6 +148,14 @@
<arg>"fess"</arg>
<arg>"tr/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"zh-cn/stopwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"zh-tw/stopwords.txt"</arg>
</postConstruct>
<!-- fess index -->
<postConstruct name="addIndexConfig">
<arg>"fess/doc"</arg>

View file

@ -15,6 +15,10 @@
"type": "mapping",
"mappings_path": "${fess.dictionary.path}ja/mapping.txt"
},
"traditional_chinese_convert": {
"type": "fess_traditional_chinese_convert",
"convert_type": "t2s"
},
"zero_width_spaces": {
"type": "mapping",
"mappings": [ "\\u200C=> "]
@ -431,6 +435,10 @@
"type": "stemmer",
"language": "russian"
},
"simplified_chinese_stop": {
"type": "stop",
"stopwords_path": "${fess.dictionary.path}zh-cn/stopwords.txt"
},
"sorani_stop": {
"type": "stop",
"stopwords": "_sorani_"
@ -471,6 +479,10 @@
"type": "stop",
"stopwords": "_thai_"
},
"traditional_chinese_stop": {
"type": "stop",
"stopwords_path": "${fess.dictionary.path}zh-tw/stopwords.txt"
},
"turkish_stop": {
"type": "stop",
"stopwords": "_turkish_"
@ -524,6 +536,9 @@
"pos_tagging": false,
"user_dict_path": "${fess.dictionary.path}ko/seunjeon.txt"
},
"simplified_chinese_tokenizer": {
"type": "fess_simplified_chinese_tokenizer"
},
"vietnamese_tokenizer": {
"type": "fess_vietnamese_tokenizer",
"sentence_detector": false,
@ -865,6 +880,14 @@
"russian_stemmer"
]
},
"simplified_chinese_analyzer": {
"tokenizer": "simplified_chinese_tokenizer",
"filter": [
"truncate10_filter",
"lowercase",
"simplified_chinese_stop"
]
},
"sorani_analyzer": {
"tokenizer": "standard",
"filter": [
@ -907,6 +930,15 @@
"thai_stop"
]
},
"traditional_chinese_analyzer": {
"char_filter": [ "traditional_chinese_convert" ],
"tokenizer": "simplified_chinese_tokenizer",
"filter": [
"truncate10_filter",
"lowercase",
"traditional_chinese_stop"
]
},
"turkish_analyzer": {
"type": "custom",
"tokenizer": "standard",

View file

@ -471,7 +471,7 @@
"match": "*_zh-cn",
"mapping": {
"type": "text",
"analyzer": "empty_analyzer"
"analyzer": "simplified_chinese_analyzer"
}
}
},
@ -480,7 +480,7 @@
"match": "*_zh-tw",
"mapping": {
"type": "text",
"analyzer": "empty_analyzer"
"analyzer": "traditional_chinese_analyzer"
}
}
}

View file

@ -0,0 +1,59 @@
# Punctuation tokens to remove
,
.
`
-
_
=
?
'
|
"
(
)
{
}
[
]
<
>
*
#
&
^
$
@
!
~
:
;
+
/
\
·
# The line below contains a single IDEOGRAPHIC SPACE character (U+3000), which is used as a space in Chinese text.
 
# English Stop Words
# Chinese Stop Words

View file

@ -0,0 +1,59 @@
# Punctuation tokens to remove
,
.
`
-
_
=
?
'
|
"
(
)
{
}
[
]
<
>
*
#
&
^
$
@
!
~
:
;
+
/
\
·
# The line below contains a single IDEOGRAPHIC SPACE character (U+3000), which is used as a space in Chinese text.
 
# English Stop Words
# Chinese Stop Words