#489 language analyzer add ro de[WIP] (#536)

*  #489 language analyzer add ro de[WIP]

*  #489 language analyzer add[WIP]

*  import * to LocalDateTime

* format
This commit is contained in:
Matsutani Kenji 2016-06-11 21:51:31 +09:00 committed by Shinsuke Sugaya
parent 26df9e49b0
commit 96a4845f5c
27 changed files with 773 additions and 24 deletions

View file

@ -26,6 +26,98 @@
<arg>"fess"</arg>
<arg>"ko/seunjeon.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"de/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"ar/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"ro/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"ca/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"cs/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"da/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"es/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"el/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"fa/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"fi/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"fr/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"hi/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"hu/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"id/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"it/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"lt/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"lv/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"nl/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"no/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"pt/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"ru/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"sv/protwords.txt"</arg>
</postConstruct>
<postConstruct name="addConfigFile">
<arg>"fess"</arg>
<arg>"tr/protwords.txt"</arg>
</postConstruct>
<!-- fess index -->
<postConstruct name="addIndexConfig">
<arg>"fess/doc"</arg>

View file

@ -11,6 +11,10 @@
"type": "mapping",
"mappings_path": "${fess.dictionary.path}ja/mapping.txt"
},
"zero_width_spaces": {
"type": "mapping",
"mappings": [ "\\u200C=> "]
},
"removeall_filter": {
"type": "pattern_replace",
"pattern":"(.*)",
@ -84,6 +88,317 @@
"連体詞"
]
},
"german_stop": {
"type": "stop",
"stopwords": "_german_"
},
"german_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}de/protwords.txt"
},
"german_stemmer": {
"type": "stemmer",
"language": "light_german"
},
"french_elision": {
"type": "elision",
"articles_case": true,
"articles": [
"l", "m", "t", "qu", "n", "s",
"j", "d", "c", "jusqu", "quoiqu",
"lorsqu", "puisqu"
]
},
"french_stop": {
"type": "stop",
"stopwords": "_french_"
},
"french_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}fr/protwords.txt"
},
"french_stemmer": {
"type": "stemmer",
"language": "light_french"
},
"italian_elision": {
"type": "elision",
"articles": [
"c", "l", "all", "dall", "dell",
"nell", "sull", "coll", "pell",
"gl", "agl", "dagl", "degl", "negl",
"sugl", "un", "m", "t", "s", "v", "d"
]
},
"italian_stop": {
"type": "stop",
"stopwords": "_italian_"
},
"italian_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}it/protwords.txt"
},
"italian_stemmer": {
"type": "stemmer",
"language": "light_italian"
},
"arabic_stop": {
"type": "stop",
"stopwords": "_arabic_"
},
"arabic_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}ar/protwords.txt"
},
"arabic_stemmer": {
"type": "stemmer",
"language": "arabic"
},
"romanian_stop": {
"type": "stop",
"stopwords": "_romanian_"
},
"romanian_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}ro/protwords.txt"
},
"romanian_stemmer": {
"type": "stemmer",
"language": "romanian"
},
"catalan_elision": {
"type": "elision",
"articles": [ "d", "l", "m", "n", "s", "t"]
},
"catalan_stop": {
"type": "stop",
"stopwords": "_catalan_"
},
"catalan_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}ca/protwords.txt"
},
"catalan_stemmer": {
"type": "stemmer",
"language": "catalan"
},
"czech_stop": {
"type": "stop",
"stopwords": "_czech_"
},
"czech_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}cs/protwords.txt"
},
"czech_stemmer": {
"type": "stemmer",
"language": "czech"
},
"danish_stop": {
"type": "stop",
"stopwords": "_danish_"
},
"danish_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}da/protwords.txt"
},
"danish_stemmer": {
"type": "stemmer",
"language": "danish"
},
"spanish_stop": {
"type": "stop",
"stopwords": "_spanish_"
},
"spanish_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}es/protwords.txt"
},
"spanish_stemmer": {
"type": "stemmer",
"language": "light_spanish"
},
"greek_stop": {
"type": "stop",
"stopwords": "_greek_"
},
"greek_lowercase": {
"type": "lowercase",
"language": "greek"
},
"greek_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}el/protwords.txt"
},
"greek_stemmer": {
"type": "stemmer",
"language": "greek"
},
"persian_stop": {
"type": "stop",
"stopwords": "_persian_"
},
"finnish_stop": {
"type": "stop",
"stopwords": "_finnish_"
},
"finnish_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}fi/protwords.txt"
},
"finnish_stemmer": {
"type": "stemmer",
"language": "finnish"
},
"hindi_stop": {
"type": "stop",
"stopwords": "_hindi_"
},
"hindi_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}hi/protwords.txt"
},
"hindi_stemmer": {
"type": "stemmer",
"language": "hindi"
},
"hungarian_stop": {
"type": "stop",
"stopwords": "_hungarian_"
},
"hungarian_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}hu/protwords.txt"
},
"hungarian_stemmer": {
"type": "stemmer",
"language": "hungarian"
},
"indonesian_stop": {
"type": "stop",
"stopwords": "_indonesian_"
},
"indonesian_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}id/protwords.txt"
},
"indonesian_stemmer": {
"type": "stemmer",
"language": "indonesian"
},
"lithuanian_stop": {
"type": "stop",
"stopwords": "_lithuanian_"
},
"lithuanian_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}lt/protwords.txt"
},
"lithuanian_stemmer": {
"type": "stemmer",
"language": "lithuanian"
},
"latvian_stop": {
"type": "stop",
"stopwords": "_latvian_"
},
"latvian_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}lv/protwords.txt"
},
"latvian_stemmer": {
"type": "stemmer",
"language": "latvian"
},
"dutch_stop": {
"type": "stop",
"stopwords": "_dutch_"
},
"dutch_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}nl/protwords.txt"
},
"dutch_stemmer": {
"type": "stemmer",
"language": "dutch"
},
"dutch_override": {
"type": "stemmer_override",
"rules": [
"fiets=>fiets",
"bromfiets=>bromfiets",
"ei=>eier",
"kind=>kinder"
]
},
"norwegian_stop": {
"type": "stop",
"stopwords": "_norwegian_"
},
"norwegian_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}no/protwords.txt"
},
"norwegian_stemmer": {
"type": "stemmer",
"language": "norwegian"
},
"portuguese_stop": {
"type": "stop",
"stopwords": "_portuguese_"
},
"portuguese_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}pt/protwords.txt"
},
"portuguese_stemmer": {
"type": "stemmer",
"language": "light_portuguese"
},
"russian_stop": {
"type": "stop",
"stopwords": "_russian_"
},
"russian_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}ru/protwords.txt"
},
"russian_stemmer": {
"type": "stemmer",
"language": "russian"
},
"swedish_stop": {
"type": "stop",
"stopwords": "_swedish_"
},
"swedish_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}sv/protwords.txt"
},
"swedish_stemmer": {
"type": "stemmer",
"language": "swedish"
},
"thai_stop": {
"type": "stop",
"stopwords": "_thai_"
},
"turkish_stop": {
"type": "stop",
"stopwords": "_turkish_"
},
"turkish_lowercase": {
"type": "lowercase",
"language": "turkish"
},
"turkish_keywords": {
"type": "keyword_marker",
"keywords_path": "${fess.dictionary.path}tr/protwords.txt"
},
"turkish_stemmer": {
"type": "stemmer",
"language": "turkish"
},
"truncate10_filter" : {
"type" : "truncate",
"length" : 10
@ -153,6 +468,276 @@
"type": "custom",
"tokenizer":"seunjeon_default_tokenizer"
},
"german_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"german_stop",
"german_keywords",
"german_normalization",
"german_stemmer"
]
},
"french_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"french_elision",
"lowercase",
"french_stop",
"french_keywords",
"french_stemmer"
]
},
"italian_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"italian_elision",
"lowercase",
"italian_stop",
"italian_keywords",
"italian_stemmer"
]
},
"arabic_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"arabic_stop",
"arabic_normalization",
"arabic_keywords",
"arabic_stemmer"
]
},
"romanian_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"romanian_stop",
"romanian_keywords",
"romanian_stemmer"
]
},
"catalan_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"catalan_elision",
"lowercase",
"catalan_stop",
"catalan_keywords",
"catalan_stemmer"
]
},
"czech_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"czech_stop",
"czech_keywords",
"czech_stemmer"
]
},
"danish_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"danish_stop",
"danish_keywords",
"danish_stemmer"
]
},
"spanish_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"spanish_stop",
"spanish_keywords",
"spanish_stemmer"
]
},
"greek_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"greek_lowercase",
"greek_stop",
"greek_keywords",
"greek_stemmer"
]
},
"persian_analyzer": {
"type": "custom",
"tokenizer": "standard",
"char_filter": [ "zero_width_spaces" ],
"filter": [
"truncate20_filter",
"lowercase",
"arabic_normalization",
"persian_normalization",
"persian_stop"
]
},
"finnish_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"finnish_stop",
"finnish_keywords",
"finnish_stemmer"
]
},
"hindi_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"indic_normalization",
"hindi_normalization",
"hindi_stop",
"hindi_keywords",
"hindi_stemmer"
]
},
"hungarian_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"hungarian_stop",
"hungarian_keywords",
"hungarian_stemmer"
]
},
"indonesian_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"indonesian_stop",
"indonesian_keywords",
"indonesian_stemmer"
]
},
"lithuanian_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"lithuanian_stop",
"lithuanian_keywords",
"lithuanian_stemmer"
]
},
"latvian_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"latvian_stop",
"latvian_keywords",
"latvian_stemmer"
]
},
"dutch_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"dutch_stop",
"dutch_keywords",
"dutch_override",
"dutch_stemmer"
]
},
"norwegian_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"norwegian_stop",
"norwegian_keywords",
"norwegian_stemmer"
]
},
"portuguese_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"portuguese_stop",
"portuguese_keywords",
"portuguese_stemmer"
]
},
"russian_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"russian_stop",
"russian_keywords",
"russian_stemmer"
]
},
"swedish_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"lowercase",
"swedish_stop",
"swedish_keywords",
"swedish_stemmer"
]
},
"thai_analyzer": {
"type": "custom",
"tokenizer": "thai",
"filter": [
"truncate20_filter",
"lowercase",
"thai_stop"
]
},
"turkish_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [
"truncate20_filter",
"apostrophe",
"turkish_lowercase",
"turkish_stop",
"turkish_keywords",
"turkish_stemmer"
]
},
"empty_analyzer": {
"type": "custom",
"tokenizer": "standard",

View file

@ -0,0 +1,3 @@
مرحبا
عالم
بحث

View file

@ -0,0 +1,3 @@
Hola
món
recerca

View file

@ -0,0 +1,3 @@
Haló
svět
vyhledávání

View file

@ -0,0 +1,3 @@
Hej
verden
Søg

View file

@ -0,0 +1,3 @@
Hallo
Welt
Suche

View file

@ -12,7 +12,7 @@
"match": "*_ar",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "arabic_analyzer"
}
}
},
@ -21,7 +21,7 @@
"match": "*_bg",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "empty_analyzer"
}
}
},
@ -39,7 +39,7 @@
"match": "*_ca",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "catalan_analyzer"
}
}
},
@ -48,7 +48,7 @@
"match": "*_cs",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "czech_analyzer"
}
}
},
@ -57,7 +57,7 @@
"match": "*_da",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "danish_analyzer"
}
}
},
@ -66,7 +66,7 @@
"match": "*_de",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "german_analyzer"
}
}
},
@ -75,7 +75,7 @@
"match": "*_el",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "greek_analyzer"
}
}
},
@ -93,7 +93,7 @@
"match": "*_es",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "spanish_analyzer"
}
}
},
@ -111,7 +111,7 @@
"match": "*_fa",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "persian_analyzer"
}
}
},
@ -120,7 +120,7 @@
"match": "*_fi",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "finnish_analyzer"
}
}
},
@ -129,7 +129,7 @@
"match": "*_fr",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "french_analyzer"
}
}
},
@ -174,7 +174,7 @@
"match": "*_hu",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "hungarian_analyzer"
}
}
},
@ -183,7 +183,7 @@
"match": "*_id",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "indonesian_analyzer"
}
}
},
@ -192,7 +192,7 @@
"match": "*_it",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "italian_analyzer"
}
}
},
@ -219,7 +219,7 @@
"match": "*_lt",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "lithuanian_analyzer"
}
}
},
@ -228,7 +228,7 @@
"match": "*_lv",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "latvian_analyzer"
}
}
},
@ -255,7 +255,7 @@
"match": "*_nl",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "dutch_analyzer"
}
}
},
@ -264,7 +264,7 @@
"match": "*_no",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "norwegian_analyzer"
}
}
},
@ -291,7 +291,7 @@
"match": "*_pt",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "portuguese_analyzer"
}
}
},
@ -300,7 +300,7 @@
"match": "*_ro",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "romanian_analyzer"
}
}
},
@ -309,7 +309,7 @@
"match": "*_ru",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "russian_analyzer"
}
}
},
@ -336,7 +336,7 @@
"match": "*_sv",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "swedish_analyzer"
}
}
},
@ -363,7 +363,7 @@
"match": "*_th",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "thai_analyzer"
}
}
},
@ -381,7 +381,7 @@
"match": "*_tr",
"mapping": {
"type": "string",
"analyzer": "empty_analyzer"
"analyzer": "turkish_analyzer"
}
}
},

View file

@ -0,0 +1,3 @@
Γεια σας
κόσμος
έρευνα

View file

@ -0,0 +1,3 @@
Hola
mundo
búsqueda

View file

@ -0,0 +1,3 @@
سلام
جهان
جستجو

View file

@ -0,0 +1,3 @@
Hei
maailma
haku

View file

@ -0,0 +1,3 @@
Bonjour
monde
recherche

View file

@ -0,0 +1,3 @@
नमस्ते
दुनिया
खोज

View file

@ -0,0 +1,3 @@
Helló
világ
keresés

View file

@ -0,0 +1,3 @@
halo
dunia
pencarian

View file

@ -0,0 +1,3 @@
Ciao
mondo
ricerca

View file

@ -0,0 +1,3 @@
Sveiki
pasaulis
paieška

View file

@ -0,0 +1,3 @@
sveiki
pasaule
meklēšana

View file

@ -0,0 +1,3 @@
hallo
wereld
zoeken

View file

@ -0,0 +1,3 @@
Hallo
verden
Søk

View file

@ -0,0 +1,3 @@
Olá
mundo
pesquisa

View file

@ -0,0 +1,3 @@
Alo
lume
căutare

View file

@ -0,0 +1,3 @@
привет
мир
поиск

View file

@ -0,0 +1,3 @@
Hallå
världen
sök

View file

@ -0,0 +1,3 @@
สวัสดี
โลก
ค้นหา

View file

@ -0,0 +1,3 @@
Merhaba
Dünya
arama