diff --git a/pom.xml b/pom.xml
index 0abad3004..45e02f113 100644
--- a/pom.xml
+++ b/pom.xml
@@ -57,7 +57,7 @@ 0.6.0F - 1.0.11-SNAPSHOT + 1.0.11 2.3.0
diff --git a/src/main/java/org/codelibs/fess/ds/impl/GitBucketDataStoreImpl.java b/src/main/java/org/codelibs/fess/ds/impl/GitBucketDataStoreImpl.java
new file mode 100644
index 000000000..71e117b20
--- /dev/null
+++ b/src/main/java/org/codelibs/fess/ds/impl/GitBucketDataStoreImpl.java
@@ -0,0 +1,256 @@
+/*
+ * Copyright 2012-2016 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.ds.impl;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.FilenameUtils;
+import org.codelibs.core.lang.StringUtil;
+import org.codelibs.elasticsearch.runner.net.Curl;
+import org.codelibs.elasticsearch.runner.net.CurlResponse;
+import org.codelibs.fess.ds.IndexUpdateCallback;
+import org.codelibs.fess.es.config.exentity.DataConfig;
+import org.elasticsearch.common.xcontent.json.JsonXContent;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Data store that crawls the files hosted on a GitBucket server.
+ *
+ * <p>It lists the repositories returned by {@code api/v3/fess/repos}, collects the file
+ * paths of each repository through {@code api/v3/repos/.../contents/}, and passes the raw
+ * content of every file to the {@link IndexUpdateCallback}. The server URL and the access
+ * token are read from the {@code url} and {@code token} parameters.
+ *
+ * @author Keiichi Watanabe
+ */
+public class GitBucketDataStoreImpl extends AbstractDataStoreImpl {
+    // Bound to this class so that log records are attributed to the GitBucket data store.
+    private static final Logger logger = LoggerFactory.getLogger(GitBucketDataStoreImpl.class);
+
+    /** Maximum directory depth followed while collecting file paths. */
+    private static final int MAX_DEPTH = 20;
+
+    /** Name of the parameter holding the access token. */
+    protected static final String TOKEN_PARAM = "token";
+
+    /** Name of the parameter holding the GitBucket base URL. */
+    protected static final String GITBUCKET_URL_PARAM = "url";
+
+    /**
+     * Crawls every repository returned by the server and stores the content of each file.
+     *
+     * <p>Nothing is stored when the URL or the token is missing, or when no repository is
+     * returned. A failure in one repository is logged and the remaining ones are still
+     * processed.
+     */
+    @Override
+    protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap,
+            final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
+
+        final String rootURL = getRootURL(paramMap);
+        final String authToken = getAuthToken(paramMap);
+        final long readInterval = getReadInterval(paramMap);
+
+        if (rootURL.isEmpty() || authToken.isEmpty()) {
+            logger.warn("parameter \"" + TOKEN_PARAM + "\" and \"" + GITBUCKET_URL_PARAM + "\" are required");
+            return;
+        }
+
+        final List<Map<String, Object>> repositoryList = getRepositoryList(rootURL, authToken);
+        if (repositoryList.isEmpty()) {
+            logger.warn("Token is invalid or no Repository");
+            return;
+        }
+
+        for (final Map<String, Object> repository : repositoryList) {
+            try {
+                final String name = (String) repository.get("name");
+                final String owner = (String) repository.get("owner");
+                // TODO Use the "is_private" flag of the repository for roles. It is not read yet:
+                // unboxing a missing value would throw and skip the whole repository.
+
+                final List<String> pathList = collectFileNames(rootURL, authToken, owner, name, "", 0, readInterval);
+                for (final String path : pathList) {
+                    storeFileContent(rootURL, authToken, owner, name, path, dataConfig, callback, paramMap, scriptMap, defaultDataMap);
+                    if (readInterval > 0) {
+                        sleep(readInterval);
+                    }
+                }
+            } catch (final Exception e) {
+                logger.warn("Failed to access to " + repository, e);
+            }
+        }
+    }
+
+    /**
+     * Returns the GitBucket base URL taken from the {@code url} parameter.
+     *
+     * @param paramMap data store parameters
+     * @return the URL with a trailing "/", or an empty string when it is absent or blank
+     */
+    protected String getRootURL(final Map<String, String> paramMap) {
+        final String url = paramMap.get(GITBUCKET_URL_PARAM);
+        // A null or blank value counts as missing; reading the last character of an empty
+        // string would throw.
+        if (StringUtil.isBlank(url)) {
+            return StringUtil.EMPTY;
+        }
+        return url.endsWith("/") ? url : url + "/";
+    }
+
+    /**
+     * Returns the access token taken from the {@code token} parameter.
+     *
+     * @param paramMap data store parameters
+     * @return the token, or an empty string when it is absent
+     */
+    protected String getAuthToken(final Map<String, String> paramMap) {
+        final String token = paramMap.get(TOKEN_PARAM);
+        // Never return null: storeData checks the token with isEmpty().
+        return token == null ? StringUtil.EMPTY : token;
+    }
+
+    /**
+     * Fetches the repositories listed by {@code api/v3/fess/repos}.
+     *
+     * @param rootURL base URL ending with "/"
+     * @param authToken token sent in the Authorization header
+     * @return the value of the "repositories" entry, or an empty list when the request fails
+     *         or the response has no such list; never null
+     */
+    protected List<Map<String, Object>> getRepositoryList(final String rootURL, final String authToken) {
+        final String url = rootURL + "api/v3/fess/repos";
+        try (CurlResponse curlResponse = Curl.get(url).header("Authorization", "token " + authToken).execute()) {
+            final Map<String, Object> map = curlResponse.getContentAsMap();
+            // Checked explicitly rather than with "assert": assertions are disabled unless the
+            // JVM runs with -ea, and a missing entry used to make this method return null.
+            final Object repositories = map == null ? null : map.get("repositories");
+            if (!(repositories instanceof List)) {
+                logger.warn("No \"repositories\" list in the response from " + url);
+                return Collections.emptyList();
+            }
+            @SuppressWarnings("unchecked")
+            final List<Map<String, Object>> repoList = (List<Map<String, Object>>) repositories;
+            return repoList;
+        } catch (final Exception e) {
+            logger.warn("Failed to access to " + rootURL, e);
+            return Collections.emptyList();
+        }
+    }
+
+    /**
+     * Parses a JSON array read from the given stream.
+     *
+     * @param is stream containing JSON
+     * @return the parsed elements, or an empty list when parsing fails
+     */
+    private List<Object> parseList(final InputStream is) { // TODO This function should be moved to CurlResponse
+        try {
+            return JsonXContent.jsonXContent.createParser(is).list();
+        } catch (final Exception e) {
+            // Best effort: the caller treats an unparsable response as "no entries".
+            logger.debug("Failed to parse the response as a JSON list", e);
+            return Collections.emptyList();
+        }
+    }
+
+    /**
+     * Downloads the raw content of one file from the master branch and stores it.
+     *
+     * <p>A failure is logged and does not propagate to the caller.
+     */
+    private void storeFileContent(final String rootURL, final String authToken, final String owner, final String name, final String path,
+            final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap,
+            final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
+        final String url = rootURL + owner + "/" + name + "/blob/master/" + path;
+        final String filename = FilenameUtils.getName(url);
+
+        try (CurlResponse curlResponse = Curl.get(url).param("raw", "true").header("Authorization", "token " + authToken).execute()) {
+            logger.info("Get a content from " + url);
+            // TODO Use DocumentHelper#processRequest and scriptMap
+            final Map<String, Object> dataMap = new HashMap<>(defaultDataMap);
+            dataMap.put("title", owner + "/" + name + " : " + filename);
+            dataMap.put("url", url);
+            dataMap.put("content", curlResponse.getContentAsString());
+            dataMap.put("label", "GitBucket"); // TODO role
+
+            callback.store(paramMap, dataMap);
+        } catch (final Exception e) {
+            // TODO CrawlingAccessException?
+            logger.warn("Failed to parse " + url, e);
+        }
+    }
+
+    /**
+     * Recursively collects the paths of the files below {@code path} in a repository.
+     *
+     * @param rootURL base URL ending with "/"
+     * @param authToken token sent in the Authorization header
+     * @param owner repository owner
+     * @param name repository name
+     * @param path directory to list, relative to the repository root; empty for the root
+     * @param depth current recursion depth; nothing is collected once it reaches MAX_DEPTH
+     * @param readInterval value passed to sleep() before each sub-directory request; no
+     *        pause when it is not positive
+     * @return the collected file paths; possibly empty or partial when a request fails
+     */
+    protected List<String> collectFileNames(final String rootURL, final String authToken, final String owner, final String name,
+            final String path, final int depth, final long readInterval) {
+
+        if (MAX_DEPTH <= depth) {
+            return Collections.emptyList();
+        }
+
+        final List<String> resultList = new ArrayList<>();
+        final String url = rootURL + "api/v3/repos/" + owner + "/" + name + "/contents/" + path;
+
+        try (CurlResponse curlResponse = Curl.get(url).header("Authorization", "token " + authToken).execute();
+                InputStream iStream = curlResponse.getContentAsStream()) {
+            for (final Object entry : parseList(iStream)) {
+                if (!(entry instanceof Map)) {
+                    continue;
+                }
+                final Map<?, ?> file = (Map<?, ?>) entry;
+                final Object fileName = file.get("name");
+                if (fileName == null) {
+                    continue;
+                }
+                final String newPath = path.isEmpty() ? fileName.toString() : path + "/" + fileName;
+                // equals() on the literal tolerates a missing "type"; switching on a null
+                // string would throw and abort the rest of this directory.
+                final Object type = file.get("type");
+                if ("file".equals(type)) {
+                    resultList.add(newPath);
+                } else if ("dir".equals(type)) {
+                    if (readInterval > 0) {
+                        sleep(readInterval);
+                    }
+                    resultList.addAll(collectFileNames(rootURL, authToken, owner, name, newPath, depth + 1, readInterval));
+                }
+            }
+        } catch (final Exception e) {
+            logger.warn("Failed to access to " + url, e);
+        }
+        return resultList;
+    }
+
+}
diff --git a/src/main/resources/fess_ds.xml b/src/main/resources/fess_ds.xml
index e619d59b8..742f825bd 100644
--- a/src/main/resources/fess_ds.xml
+++ b/src/main/resources/fess_ds.xml
@@ -23,12 +23,16 @@ "EsListDataStore" esListDataStore + + "GitBucketDataStore" + gitBucketDataStore + -
@@ -38,6 +42,8 @@ + +
diff --git a/src/main/resources/suggest/fess-suggest-default-analyzer.json b/src/main/resources/suggest/fess-suggest-default-analyzer.json
index 3625fe0a4..7e3322227 100644
--- a/src/main/resources/suggest/fess-suggest-default-analyzer.json
+++ b/src/main/resources/suggest/fess-suggest-default-analyzer.json
@@ -67,6 +67,25 @@ "tokenizer" : "standard", "filter" : ["lowercase", "stopword_en_filter", "content_length_filter", "limit_token_count_filter", "english_keywords"] }, + "reading_analyzer_ko" : { + "type" : "custom", + "tokenizer" : "fess_korean_tokenizer" + }, + "reading_term_analyzer_ko" : { + "type" : "custom", + "tokenizer" : "fess_korean_tokenizer" + }, + "normalize_analyzer_ko" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_ko" : { + "type" : "custom", + "tokenizer" : "fess_korean_tokenizer", + "filter" : ["lowercase", "stopword_en_filter", "content_length_filter", "limit_token_count_filter", "english_keywords"] + }, "reading_analyzer_ar" : { "type" : "custom", "tokenizer" : "standard"
@@ -86,6 +105,25 @@ "tokenizer" : "standard", "filter" : ["lowercase", "stopword_en_filter", "content_length_filter", "limit_token_count_filter",
"arabic_stop", "arabic_normalization", "arabic_keywords"] }, + "reading_analyzer_bg" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_bg" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_bg" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_bg" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter", "bulgarian_stop", "bulgarian_keywords", "bulgarian_stemmer"] + }, "reading_analyzer_ca" : { "type" : "custom", "tokenizer" : "standard" @@ -503,6 +541,386 @@ "type" : "custom", "tokenizer" : "thai", "filter" : ["lowercase", "stopword_en_filter", "content_length_filter", "limit_token_count_filter", "thai_stop"] + }, + "reading_analyzer_bn" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_bn" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_bn" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_bn" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_et" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_et" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_et" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_et" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_gu" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_gu" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_gu" : { + 
"type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_gu" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_he" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_he" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_he" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_he" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_hi" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_hi" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_hi" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_hi" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_hr" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_hr" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_hr" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_hr" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_mk" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_mk" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_mk" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : 
["lowercase"] + }, + "contents_analyzer_mk" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_ml" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_ml" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_ml" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_ml" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_pa" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_pa" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_pa" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_pa" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_pl" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_pl" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_pl" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_pl" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_si" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_si" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_si" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_si" : { + "type" : "custom", + "tokenizer" : 
"standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_sq" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_sq" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_sq" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_sq" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_ta" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_ta" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_ta" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_ta" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_te" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_te" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_te" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_te" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_tl" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_tl" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_tl" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_tl" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + 
}, + "reading_analyzer_uk" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_uk" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_uk" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_uk" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_ur" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_ur" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_ur" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_ur" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_vi" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_vi" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_vi" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_vi" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_zh-cn" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "reading_term_analyzer_zh-cn" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_zh-cn" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_zh-cn" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] + }, + "reading_analyzer_zh-tw" : { + "type" : "custom", + "tokenizer" : "standard" + 
}, + "reading_term_analyzer_zh-tw" : { + "type" : "custom", + "tokenizer" : "standard" + }, + "normalize_analyzer_zh-tw" : { + "type" : "custom", + "tokenizer" : "keyword", + "char_filter" : ["mapping_char"], + "filter" : ["lowercase"] + }, + "contents_analyzer_zh-tw" : { + "type" : "custom", + "tokenizer" : "standard", + "filter" : ["lowercase", "content_length_filter", "limit_token_count_filter"] } }, "char_filter" : { @@ -631,6 +1049,18 @@ "type": "stemmer", "language": "arabic" }, + "bulgarian_stop": { + "type": "stop", + "stopwords": "_bulgarian_" + }, + "bulgarian_keywords": { + "type": "keyword_marker", + "keywords": ["Добър ден"] + }, + "bulgarian_stemmer": { + "type": "stemmer", + "language": "bulgarian" + }, "catalan_elision": { "type": "elision", "articles": [ "d", "l", "m", "n", "s", "t"]