crawler for GitBucket

This commit is contained in:
Keiichi Watanabe 2016-10-07 04:18:44 +09:00
parent e5c0635762
commit 797ba71f83
2 changed files with 190 additions and 1 deletions

View file

@ -0,0 +1,183 @@
/*
* Copyright 2012-2016 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.ds.impl;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.io.FilenameUtils;
import org.codelibs.elasticsearch.runner.net.Curl;
import org.codelibs.elasticsearch.runner.net.CurlResponse;
import org.codelibs.fess.ds.IndexUpdateCallback;
import org.codelibs.fess.es.config.exentity.DataConfig;
import org.elasticsearch.common.xcontent.json.JsonXContent;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author Keiichi Watanabe
*/
public class GitBucketDataStoreImpl extends AbstractDataStoreImpl {
private static final Logger logger = LoggerFactory.getLogger(CsvDataStoreImpl.class);
private static final int MAX_DEPTH = 20;
protected static final String TOKEN_PARAM = "token";
protected static final String GITBUCKET_URL_PARAM = "url";
@Override
protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap,
final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
final String rootURL = getRootURL(paramMap);
final String authToken = getAuthToken(paramMap);
final long readInterval = getReadInterval(paramMap);
if (rootURL.isEmpty() || authToken.isEmpty()) {
logger.warn("parameter \"" + TOKEN_PARAM + "\" and \"" + GITBUCKET_URL_PARAM + "\" are required");
return;
}
final List<Map<String, Object>> repositoryList = getRepositoryList(rootURL, authToken);
if (repositoryList.isEmpty()) {
logger.warn("Token is invalid or no Repository");
return;
}
for (final Map<String, Object> repository : repositoryList) {
try {
final String name = (String) repository.get("name");
final String owner = (String) repository.get("owner");
final boolean isPrivate = (boolean) repository.get("is_private"); // TODO Use this info for roles
List<String> pathList = collectFileNames(rootURL, authToken, owner, name, "", 0, readInterval);
for (String path : pathList) {
storeFileContent(rootURL, authToken, owner, name, path, dataConfig, callback, paramMap, scriptMap, defaultDataMap);
if (readInterval > 0) {
sleep(readInterval);
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
protected String getRootURL(final Map<String, String> paramMap) {
if (paramMap.containsKey(GITBUCKET_URL_PARAM)) {
String url = paramMap.get(GITBUCKET_URL_PARAM);
if (url.charAt(url.length() - 1) != '/') {
url += "/";
}
return url;
}
return "";
}
protected String getAuthToken(final Map<String, String> paramMap) {
if (paramMap.containsKey(TOKEN_PARAM)) {
return paramMap.get(TOKEN_PARAM);
}
return "";
}
protected List<Map<String, Object>> getRepositoryList(final String rootURL, final String authToken) {
final String url = rootURL + "api/v3/fess/repos";
try (CurlResponse curlResponse = Curl.get(url).header("Authorization", "token " + authToken).execute()) {
final String content = curlResponse.getContentAsString();
final Map<String, Object> map = curlResponse.getContentAsMap();
assert (map.containsKey("repositories"));
final List<Map<String, Object>> repoList = (List<Map<String, Object>>) map.get("repositories");
return repoList;
} catch (Exception e) {
e.printStackTrace();
return Collections.emptyList();
}
}
private List<Object> parseList(final InputStream is) { // TODO This function should be moved to CurlResponse
try {
return JsonXContent.jsonXContent.createParser(is).list();
} catch (final Exception e) {
return Collections.emptyList();
}
}
private void storeFileContent(final String rootURL, final String authToken, final String owner, final String name, final String path,
final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap,
final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
final String url = rootURL + owner + "/" + name + "/blob/master/" + path;
final String filename = FilenameUtils.getName(url);
try (CurlResponse curlResponse = Curl.get(url).param("raw", "true").header("Authorization", "token " + authToken).execute()) {
logger.info("Get a content from " + url);
// TODO Check mimetype
final Map<String, Object> dataMap = new HashMap<>();
dataMap.putAll(defaultDataMap);
dataMap.put("title", owner + "/" + name + " : " + filename);
dataMap.put("url", url);
dataMap.put("content", curlResponse.getContentAsString());
dataMap.put("label", "GitBucket"); // TODO role
callback.store(paramMap, dataMap);
} catch (Exception e) {
e.printStackTrace();
}
return;
}
protected List<String> collectFileNames(final String rootURL, final String authToken, final String owner, final String name,
final String path, final int depth, final long readInterval) {
if (MAX_DEPTH <= depth) {
return Collections.emptyList();
}
List<String> resultList = new ArrayList<String>();
final String url = rootURL + "api/v3/repos/" + owner + "/" + name + "/contents/" + path;
try (CurlResponse curlResponse = Curl.get(url).header("Authorization", "token " + authToken).execute()) {
final InputStream iStream = curlResponse.getContentAsStream();
List<Object> fileList = parseList(iStream);
for (int i = 0; i < fileList.size(); ++i) {
Map<String, String> file = (Map<String, String>) fileList.get(i);
final String newPath = path.isEmpty() ? file.get("name") : path + "/" + file.get("name");
switch (file.get("type")) {
case "file":
resultList.add(newPath);
break;
case "dir":
if (readInterval > 0) {
sleep(readInterval);
}
resultList.addAll(collectFileNames(rootURL, authToken, owner, name, newPath, depth + 1, readInterval));
break;
}
}
} catch (Exception e) {
e.printStackTrace();
}
return resultList;
}
}

View file

@ -23,12 +23,16 @@
<arg>"EsListDataStore"</arg>
<arg>esListDataStore</arg>
</postConstruct>
<postConstruct name="add">
<arg>"GitBucketDataStore"</arg>
<arg>gitBucketDataStore</arg>
</postConstruct>
</component>
<component name="databaseDataStore" class="org.codelibs.fess.ds.impl.DatabaseDataStoreImpl">
</component>
<component name="csvDataStore" class="org.codelibs.fess.ds.impl.CsvDataStoreImpl">
<!--
<!--
<property name="csvFileSuffixs">new String[] { ".csv", ".tsv" }</property>
-->
</component>
@ -38,6 +42,8 @@
</component>
<component name="esListDataStore" class="org.codelibs.fess.ds.impl.EsListDataStoreImpl">
</component>
<component name="gitBucketDataStore" class="org.codelibs.fess.ds.impl.GitBucketDataStoreImpl">
</component>
<component name="indexUpdateCallback" class="org.codelibs.fess.ds.impl.IndexUpdateCallbackImpl" instance="prototype">
</component>