fix #454 : add EsDataStore/EsListDataStore

parent 26b4ad5b25
commit 746c85cb24

9 changed files with 606 additions and 4 deletions
@@ -109,7 +109,7 @@ public abstract class AbstractDataStoreImpl implements DataStore {
    }

-   protected Object convertValue(final String template, final Map<String, String> paramMap) {
+   protected <T> Object convertValue(final String template, final Map<String, T> paramMap) {
        if (StringUtil.isEmpty(template)) {
            return StringUtil.EMPTY;
        }

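The signature is widened so that subclasses can pass parameter maps whose values are not strings; EsDataStoreImpl below hands a Map<String, Object> of search-hit fields to convertValue. A minimal, self-contained sketch of why the value type has to be generic (renderOld/renderNew are hypothetical stand-ins, not Fess methods):

import java.util.HashMap;
import java.util.Map;

public class ConvertValueSketch {

    // Old shape: only Map<String, String> was accepted.
    static Object renderOld(final String template, final Map<String, String> paramMap) {
        return template + " <- " + paramMap;
    }

    // New shape: any value type is accepted, mirroring the <T> signature above.
    static <T> Object renderNew(final String template, final Map<String, T> paramMap) {
        return template + " <- " + paramMap;
    }

    public static void main(final String[] args) {
        final Map<String, Object> resultMap = new HashMap<>();
        resultMap.put("id", "1");
        resultMap.put("version", 2L); // non-String value, e.g. taken from a search hit

        // renderOld("url=id", resultMap); // would not compile: Map<String, Object> is not a Map<String, String>
        System.out.println(renderNew("url=id", resultMap));
    }
}
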
@@ -0,0 +1,88 @@
/*
 * Copyright 2012-2016 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.codelibs.fess.ds.impl;

import java.io.File;
import java.util.Map;

import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.ds.DataStoreException;
import org.codelibs.fess.ds.IndexUpdateCallback;
import org.codelibs.fess.es.config.exentity.DataConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.orangesignal.csv.CsvConfig;

public class CsvListDataStoreImpl extends CsvDataStoreImpl {

    private static final Logger logger = LoggerFactory.getLogger(CsvListDataStoreImpl.class);

    public boolean deleteProcessedFile = true;

    public long csvFileTimestampMargin = 60 * 1000;// 1min

    public boolean ignoreDataStoreException = true;

    @Override
    protected boolean isCsvFile(final File parentFile, final String filename) {
        if (super.isCsvFile(parentFile, filename)) {
            final File file = new File(parentFile, filename);
            final long now = System.currentTimeMillis();
            return now - file.lastModified() > csvFileTimestampMargin;
        }
        return false;
    }

    @Override
    protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap,
            final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {

        final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getCrawlerClientFactory();
        dataConfig.initializeClientFactory(crawlerClientFactory);
        final FileListIndexUpdateCallbackImpl fileListIndexUpdateCallback =
                new FileListIndexUpdateCallbackImpl(callback, crawlerClientFactory);
        super.storeData(dataConfig, fileListIndexUpdateCallback, paramMap, scriptMap, defaultDataMap);
        fileListIndexUpdateCallback.commit();
    }

    @Override
    protected void processCsv(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap,
            final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final CsvConfig csvConfig, final File csvFile,
            final long readInterval, final String csvFileEncoding, final boolean hasHeaderLine) {
        try {
            super.processCsv(dataConfig, callback, paramMap, scriptMap, defaultDataMap, csvConfig, csvFile, readInterval, csvFileEncoding,
                    hasHeaderLine);

            // delete csv file
            if (deleteProcessedFile && !csvFile.delete()) {
                logger.warn("Failed to delete {}", csvFile.getAbsolutePath());
            }
        } catch (final DataStoreException e) {
            if (ignoreDataStoreException) {
                logger.error("Failed to process " + csvFile.getAbsolutePath(), e);
                // rename csv file, or delete it if failed
                if (!csvFile.renameTo(new File(csvFile.getParent(), csvFile.getName() + ".txt")) && !csvFile.delete()) {
                    logger.warn("Failed to delete {}", csvFile.getAbsolutePath());
                }
            } else {
                throw e;
            }
        }
    }

}

src/main/java/org/codelibs/fess/ds/impl/EsDataStoreImpl.java (new file, 248 lines)
@@ -0,0 +1,248 @@
/*
 * Copyright 2012-2016 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.codelibs.fess.ds.impl;

import java.net.InetAddress;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.stream.Collectors;

import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.app.service.FailureUrlService;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException;
import org.codelibs.fess.ds.DataStoreCrawlingException;
import org.codelibs.fess.ds.DataStoreException;
import org.codelibs.fess.ds.IndexUpdateCallback;
import org.codelibs.fess.es.config.exentity.DataConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.StreamUtil;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class EsDataStoreImpl extends AbstractDataStoreImpl {
    private static final String PREFERENCE = "preference";

    private static final String QUERY = "query";

    private static final String FIELDS = "fields";

    private static final String SIZE = "size";

    private static final String TYPE = "type";

    private static final String TIMEOUT = "timeout";

    private static final String SCROLL = "scroll";

    private static final String INDEX = "index";

    private static final String HOSTS = "hosts";

    private static final String SETTINGS_PREFIX = "settings.";

    private static final Logger logger = LoggerFactory.getLogger(EsDataStoreImpl.class);

    @Override
    protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap,
            final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
        final String hostsStr = paramMap.get(HOSTS);
        if (StringUtil.isBlank(hostsStr)) {
            logger.info("hosts is empty.");
            return;
        }

        final long readInterval = getReadInterval(paramMap);

        final Settings settings =
                Settings.settingsBuilder()
                        .put(paramMap
                                .entrySet()
                                .stream()
                                .filter(e -> e.getKey().startsWith(SETTINGS_PREFIX))
                                .collect(
                                        Collectors.toMap(e -> e.getKey().replaceFirst("^settings\\.", StringUtil.EMPTY), e -> e.getValue())))
                        .build();
        logger.info("Connecting to " + hostsStr + " with [" + settings.toDelimitedString(',') + "]");
        final InetSocketTransportAddress[] addresses = StreamUtil.of(hostsStr.split(",")).map(h -> {
            String[] values = h.trim().split(":");
            try {
                if (values.length == 1) {
                    return new InetSocketTransportAddress(InetAddress.getByName(values[0]), 9300);
                } else if (values.length == 2) {
                    return new InetSocketTransportAddress(InetAddress.getByName(values[0]), Integer.parseInt(values[1]));
                }
            } catch (Exception e) {
                logger.warn("Failed to parse address: " + h, e);
            }
            return null;
        }).filter(v -> v != null).toArray(n -> new InetSocketTransportAddress[n]);
        try (Client client = TransportClient.builder().settings(settings).build().addTransportAddresses(addresses)) {
            processData(dataConfig, callback, paramMap, scriptMap, defaultDataMap, readInterval, client);
        }
    }

    protected void processData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap,
            final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final long readInterval, final Client client) {

        final boolean deleteProcessedDoc = paramMap.getOrDefault("delete.processed.doc", Constants.FALSE).equalsIgnoreCase(Constants.TRUE);
        final String[] indices;
        if (paramMap.containsKey(INDEX)) {
            indices = paramMap.get(INDEX).trim().split(",");
        } else {
            indices = new String[] { "_all" };
        }
        final String scroll = paramMap.containsKey(SCROLL) ? paramMap.get(SCROLL).trim() : "1m";
        final String timeout = paramMap.containsKey(TIMEOUT) ? paramMap.get(TIMEOUT).trim() : "1m";
        final SearchRequestBuilder builder = client.prepareSearch(indices);
        if (paramMap.containsKey(TYPE)) {
            builder.setTypes(paramMap.get(TYPE).trim().split(","));
        }
        if (paramMap.containsKey(SIZE)) {
            builder.setSize(Integer.parseInt(paramMap.get(SIZE)));
        }
        if (paramMap.containsKey(FIELDS)) {
            builder.addFields(paramMap.get(FIELDS).trim().split(","));
        }
        builder.setQuery(paramMap.containsKey(QUERY) ? paramMap.get(QUERY).trim() : "{\"query\":{\"match_all\":{}}}");
        builder.setScroll(scroll);
        builder.setPreference(paramMap.containsKey(PREFERENCE) ? paramMap.get(PREFERENCE).trim() : Constants.SEARCH_PREFERENCE_PRIMARY);
        try {
            SearchResponse response = builder.execute().actionGet(timeout);

            String scrollId = response.getScrollId();
            while (scrollId != null) {
                final SearchHits searchHits = response.getHits();
                final SearchHit[] hits = searchHits.getHits();
                if (hits.length == 0) {
                    scrollId = null;
                    break;
                }

                boolean loop = true;
                final BulkRequestBuilder bulkRequest = deleteProcessedDoc ? client.prepareBulk() : null;
                for (final SearchHit hit : hits) {
                    if (!alive || !loop) {
                        break;
                    }

                    final Map<String, Object> dataMap = new HashMap<String, Object>();
                    dataMap.putAll(defaultDataMap);
                    final Map<String, Object> resultMap = new LinkedHashMap<>();
                    resultMap.putAll(paramMap);
                    resultMap.put("index", hit.getIndex());
                    resultMap.put("type", hit.getType());
                    resultMap.put("id", hit.getId());
                    resultMap.put("version", Long.valueOf(hit.getVersion()));
                    resultMap.put("hit", hit);
                    resultMap.put("source", hit.getSource());

                    if (logger.isDebugEnabled()) {
                        for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {
                            logger.debug(entry.getKey() + "=" + entry.getValue());
                        }
                    }

                    for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                        final Object convertValue = convertValue(entry.getValue(), resultMap);
                        if (convertValue != null) {
                            dataMap.put(entry.getKey(), convertValue);
                        }
                    }

                    if (logger.isDebugEnabled()) {
                        for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
                            logger.debug(entry.getKey() + "=" + entry.getValue());
                        }
                    }

                    try {
                        loop = callback.store(paramMap, dataMap);
                    } catch (final CrawlingAccessException e) {
                        logger.warn("Crawling Access Exception at : " + dataMap, e);

                        Throwable target = e;
                        if (target instanceof MultipleCrawlingAccessException) {
                            final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                            if (causes.length > 0) {
                                target = causes[causes.length - 1];
                            }
                        }

                        String errorName;
                        final Throwable cause = target.getCause();
                        if (cause != null) {
                            errorName = cause.getClass().getCanonicalName();
                        } else {
                            errorName = target.getClass().getCanonicalName();
                        }

                        String url;
                        if (target instanceof DataStoreCrawlingException) {
                            url = ((DataStoreCrawlingException) target).getUrl();
                        } else {
                            url = hit.getIndex() + "/" + hit.getType() + "/" + hit.getId();
                        }
                        final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                        failureUrlService.store(dataConfig, errorName, url, target);
                    } catch (final Exception e) {
                        final String url = hit.getIndex() + "/" + hit.getType() + "/" + hit.getId();
                        final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                        failureUrlService.store(dataConfig, e.getClass().getCanonicalName(), url, e);

                        logger.warn("Crawling Access Exception at : " + dataMap, e);
                    }

                    if (bulkRequest != null) {
                        bulkRequest.add(client.prepareDelete(hit.getIndex(), hit.getType(), hit.getId()));
                    }

                    if (readInterval > 0) {
                        sleep(readInterval);
                    }
                }

                if (bulkRequest != null && bulkRequest.numberOfActions() > 0) {
                    final BulkResponse bulkResponse = bulkRequest.execute().actionGet(timeout);
                    if (bulkResponse.hasFailures()) {
                        logger.warn(bulkResponse.buildFailureMessage());
                    }
                }

                if (!alive) {
                    break;
                }
                response = client.prepareSearchScroll(scrollId).setScroll(scroll).execute().actionGet(timeout);
                scrollId = response.getScrollId();
            }
        } catch (final Exception e) {
            throw new DataStoreException("Failed to crawl data when acessing elasticsearch.", e);
        }
    }

}

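The parameters the new data store reads from paramMap are hosts (required) plus the optional keys index, type, query, fields, size, scroll, timeout, preference, delete.processed.doc, and any settings.* entries, which have the prefix stripped and are passed to the transport client. A minimal sketch of such a parameter map; the host names and values below are illustrative only, not taken from the commit:

import java.util.HashMap;
import java.util.Map;

public class EsDataStoreParamSketch {
    public static void main(final String[] args) {
        // Hypothetical parameter map mirroring the keys read by EsDataStoreImpl above.
        final Map<String, String> paramMap = new HashMap<>();
        paramMap.put("hosts", "es1.example.com:9300,es2.example.com"); // required; port defaults to 9300
        paramMap.put("settings.cluster.name", "my-cluster");           // "settings." prefix is stripped for the client settings
        paramMap.put("index", "logs");                                 // defaults to "_all" when omitted
        paramMap.put("type", "access");                                // optional type filter
        paramMap.put("query", "{\"query\":{\"match_all\":{}}}");       // defaults to match_all
        paramMap.put("scroll", "1m");                                  // scroll keep-alive, default "1m"
        paramMap.put("timeout", "1m");                                 // actionGet timeout, default "1m"
        paramMap.put("size", "100");                                   // page size per scroll request
        paramMap.put("delete.processed.doc", "false");                 // bulk-delete crawled docs when "true"

        paramMap.forEach((k, v) -> System.out.println(k + "=" + v));
    }
}
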
@@ -0,0 +1,39 @@
/*
 * Copyright 2012-2016 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
package org.codelibs.fess.ds.impl;

import java.util.Map;

import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.ds.IndexUpdateCallback;
import org.codelibs.fess.es.config.exentity.DataConfig;
import org.codelibs.fess.util.ComponentUtil;

public class EsListDataStoreImpl extends EsDataStoreImpl {

    @Override
    protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap,
            final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {

        final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getCrawlerClientFactory();
        dataConfig.initializeClientFactory(crawlerClientFactory);
        final FileListIndexUpdateCallbackImpl fileListIndexUpdateCallback =
                new FileListIndexUpdateCallbackImpl(callback, crawlerClientFactory);
        super.storeData(dataConfig, fileListIndexUpdateCallback, paramMap, scriptMap, defaultDataMap);
        fileListIndexUpdateCallback.commit();
    }

}

@@ -50,6 +50,8 @@ import org.slf4j.LoggerFactory;

import com.orangesignal.csv.CsvConfig;

@Deprecated
// replace with CsvListDataStoreImpl
public class FileListDataStoreImpl extends CsvDataStoreImpl {

    private static final Logger logger = LoggerFactory.getLogger(FileListDataStoreImpl.class);

@@ -0,0 +1,199 @@
package org.codelibs.fess.ds.impl;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import org.codelibs.core.io.SerializeUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.client.CrawlerClient;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.processor.ResponseProcessor;
import org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor;
import org.codelibs.fess.crawler.rule.Rule;
import org.codelibs.fess.crawler.rule.RuleManager;
import org.codelibs.fess.crawler.transformer.Transformer;
import org.codelibs.fess.ds.DataStoreCrawlingException;
import org.codelibs.fess.ds.IndexUpdateCallback;
import org.codelibs.fess.es.client.FessEsClient;
import org.codelibs.fess.helper.IndexingHelper;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.StreamUtil;
import org.lastaflute.di.core.SingletonLaContainer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
    private static final Logger logger = LoggerFactory.getLogger(FileListIndexUpdateCallbackImpl.class);

    protected IndexUpdateCallback indexUpdateCallback;

    protected CrawlerClientFactory crawlerClientFactory;

    protected List<String> deleteIdList = new ArrayList<String>(100);

    protected int maxDeleteDocumentCacheSize = 100;

    protected FileListIndexUpdateCallbackImpl(final IndexUpdateCallback indexUpdateCallback, final CrawlerClientFactory crawlerClientFactory) {
        this.indexUpdateCallback = indexUpdateCallback;
        this.crawlerClientFactory = crawlerClientFactory;
    }

    @Override
    public boolean store(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
        final Object eventType = dataMap.remove(getParamValue(paramMap, "field.event_type", "event_type"));

        if (getParamValue(paramMap, "event.create", "create").equals(eventType)
                || getParamValue(paramMap, "event.modify", "modify").equals(eventType)) {
            // updated file
            return addDocument(paramMap, dataMap);
        } else if (getParamValue(paramMap, "event.delete", "delete").equals(eventType)) {
            // deleted file
            return deleteDocument(paramMap, dataMap);
        }

        logger.warn("unknown event: " + eventType + ", data: " + dataMap);
        return false;
    }

    protected String getParamValue(Map<String, String> paramMap, String key, String defaultValue) {
        return paramMap.getOrDefault(key, defaultValue);
    }

    protected boolean addDocument(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        synchronized (indexUpdateCallback) {
            // required check
            if (!dataMap.containsKey(fessConfig.getIndexFieldUrl()) || dataMap.get(fessConfig.getIndexFieldUrl()) == null) {
                logger.warn("Could not add a doc. Invalid data: " + dataMap);
                return false;
            }

            final String url = dataMap.get(fessConfig.getIndexFieldUrl()).toString();
            try {
                final CrawlerClient client = crawlerClientFactory.getClient(url);
                if (client == null) {
                    logger.warn("CrawlerClient is null. Data: " + dataMap);
                    return false;
                }

                final long startTime = System.currentTimeMillis();
                final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build());
                responseData.setExecutionTime(System.currentTimeMillis() - startTime);
                if (dataMap.containsKey(Constants.SESSION_ID)) {
                    responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
                } else {
                    responseData.setSessionId((String) paramMap.get(Constants.CRAWLING_INFO_ID));
                }

                final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
                final Rule rule = ruleManager.getRule(responseData);
                if (rule == null) {
                    logger.warn("No url rule. Data: " + dataMap);
                    return false;
                } else {
                    responseData.setRuleId(rule.getRuleId());
                    final ResponseProcessor responseProcessor = rule.getResponseProcessor();
                    if (responseProcessor instanceof DefaultResponseProcessor) {
                        final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                        final ResultData resultData = transformer.transform(responseData);
                        final byte[] data = resultData.getData();
                        if (data != null) {
                            try {
                                @SuppressWarnings("unchecked")
                                final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                                dataMap.putAll(responseDataMap);
                            } catch (final Exception e) {
                                throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                            }
                        }

                        // remove
                        String[] ignoreFields;
                        if (paramMap.containsKey("ignore.field.names")) {
                            ignoreFields = paramMap.get("ignore.field.names").split(",");
                        } else {
                            ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                        }
                        StreamUtil.of(ignoreFields).map(s -> s.trim()).forEach(s -> dataMap.remove(s));

                        return indexUpdateCallback.store(paramMap, dataMap);
                    } else {
                        logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor
                                + ", Data: " + dataMap);
                        return false;
                    }
                }
            } catch (final Exception e) {
                throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
            }
        }
    }

    protected boolean deleteDocument(final Map<String, String> paramMap, final Map<String, Object> dataMap) {

        if (logger.isDebugEnabled()) {
            logger.debug("Deleting " + dataMap);
        }

        final FessConfig fessConfig = ComponentUtil.getFessConfig();

        // required check
        if (!dataMap.containsKey(fessConfig.getIndexFieldUrl()) || dataMap.get(fessConfig.getIndexFieldUrl()) == null) {
            logger.warn("Could not delete a doc. Invalid data: " + dataMap);
            return false;
        }

        synchronized (indexUpdateCallback) {
            deleteIdList.add(ComponentUtil.getCrawlingInfoHelper().generateId(dataMap));

            if (deleteIdList.size() >= maxDeleteDocumentCacheSize) {
                final FessEsClient fessEsClient = ComponentUtil.getElasticsearchClient();
                final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
                for (final String id : deleteIdList) {
                    indexingHelper.deleteDocument(fessEsClient, id);
                }
                if (logger.isDebugEnabled()) {
                    logger.debug("Deleted " + deleteIdList);
                }
                deleteIdList.clear();
            }

        }
        return true;
    }

    @Override
    public void commit() {
        if (!deleteIdList.isEmpty()) {
            final FessEsClient fessEsClient = ComponentUtil.getElasticsearchClient();
            final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
            for (final String id : deleteIdList) {
                indexingHelper.deleteDocument(fessEsClient, id);
            }
            if (logger.isDebugEnabled()) {
                logger.debug("Deleted " + deleteIdList);
            }
        }
        indexUpdateCallback.commit();
    }

    @Override
    public long getDocumentSize() {
        return indexUpdateCallback.getDocumentSize();
    }

    @Override
    public long getExecuteTime() {
        return indexUpdateCallback.getExecuteTime();
    }

    public void setMaxDeleteDocumentCacheSize(int maxDeleteDocumentCacheSize) {
        this.maxDeleteDocumentCacheSize = maxDeleteDocumentCacheSize;
    }
}

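How a row reaches addDocument or deleteDocument depends only on the event-type value and a handful of overridable parameter keys. A self-contained sketch that mirrors the routing in store() above; the URL and field values are hypothetical, and the real callback reads the URL field name from fessConfig.getIndexFieldUrl():

import java.util.HashMap;
import java.util.Map;

public class FileListEventRoutingSketch {

    // Mirrors FileListIndexUpdateCallbackImpl#store: the event-type field name and the
    // accepted event values can all be overridden through paramMap entries.
    static String route(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
        final Object eventType = dataMap.remove(paramMap.getOrDefault("field.event_type", "event_type"));
        if (paramMap.getOrDefault("event.create", "create").equals(eventType)
                || paramMap.getOrDefault("event.modify", "modify").equals(eventType)) {
            return "addDocument";    // re-crawls the row's URL and indexes the result
        } else if (paramMap.getOrDefault("event.delete", "delete").equals(eventType)) {
            return "deleteDocument"; // queues the document id for deletion
        }
        return "ignored";
    }

    public static void main(final String[] args) {
        final Map<String, String> paramMap = new HashMap<>(); // defaults: event_type / create / modify / delete
        final Map<String, Object> row = new HashMap<>();
        row.put("event_type", "modify");
        row.put("url", "smb://fileserver/share/report.pdf"); // hypothetical URL field value
        System.out.println(route(paramMap, row)); // -> addDocument
    }
}
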
@@ -19,6 +19,7 @@ import org.apache.lucene.queryparser.classic.QueryParser;
import org.codelibs.core.crypto.CachedCipher;
import org.codelibs.core.misc.DynamicProperties;
import org.codelibs.fess.api.WebApiManagerFactory;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.crawler.entity.EsAccessResult;
import org.codelibs.fess.crawler.extractor.ExtractorFactory;
import org.codelibs.fess.crawler.service.DataService;

@@ -60,6 +61,7 @@ import org.lastaflute.job.JobManager;
import org.lastaflute.web.servlet.request.RequestManager;

public final class ComponentUtil {

    private static final String QUERY_PARSER = "queryParser";

    private static final String DOCUMENT_HELPER = "documentHelper";

@@ -322,6 +324,10 @@ public final class ComponentUtil {
        return getComponent(QUERY_PARSER);
    }

    public static CrawlerClientFactory getCrawlerClientFactory() {
        return getComponent(CrawlerClientFactory.class);
    }

    public static <T> T getComponent(final Class<T> clazz) {
        try {
            return SingletonLaContainer.getComponent(clazz);

@@ -30,6 +30,14 @@ public class StreamUtil {
        }
    }

    public static Stream<String> splitOf(final String value, final String regex) {
        if (value != null) {
            return Arrays.stream(value.split(regex));
        } else {
            return Collections.<String> emptyList().stream();
        }
    }

    public static <K, V> Stream<Map.Entry<K, V>> of(final Map<K, V> map) {
        if (map != null) {
            return map.entrySet().stream();

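The new splitOf helper is a null-safe wrapper around String.split that yields an empty stream for null input. A small standalone demo; the helper body is copied here so the snippet runs without the Fess classpath:

import java.util.Arrays;
import java.util.Collections;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public class SplitOfSketch {

    // Local copy of the new StreamUtil.splitOf: split a possibly-null string
    // into a Stream, returning an empty Stream instead of throwing on null.
    static Stream<String> splitOf(final String value, final String regex) {
        if (value != null) {
            return Arrays.stream(value.split(regex));
        }
        return Collections.<String> emptyList().stream();
    }

    public static void main(final String[] args) {
        System.out.println(splitOf("a, b ,c", ",").map(String::trim).collect(Collectors.toList())); // [a, b, c]
        System.out.println(splitOf(null, ",").count()); // 0, no NullPointerException
    }
}
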
@@ -12,8 +12,16 @@
            <arg>csvDataStore</arg>
        </postConstruct>
        <postConstruct name="add">
-           <arg>"FileListDataStore"</arg>
-           <arg>fileListDataStore</arg>
+           <arg>"CsvListDataStore"</arg>
+           <arg>csvListDataStore</arg>
+       </postConstruct>
+       <postConstruct name="add">
+           <arg>"EsDataStore"</arg>
+           <arg>esDataStore</arg>
+       </postConstruct>
+       <postConstruct name="add">
+           <arg>"EsListDataStore"</arg>
+           <arg>esListDataStore</arg>
        </postConstruct>
    </component>

@@ -24,7 +32,11 @@
        <property name="csvFileSuffixs">new String[] { ".csv", ".tsv" }</property>
        -->
    </component>
-   <component name="fileListDataStore" class="org.codelibs.fess.ds.impl.FileListDataStoreImpl">
+   <component name="csvListDataStore" class="org.codelibs.fess.ds.impl.CsvListDataStoreImpl">
+   </component>
+   <component name="esDataStore" class="org.codelibs.fess.ds.impl.EsDataStoreImpl">
+   </component>
+   <component name="esListDataStore" class="org.codelibs.fess.ds.impl.EsListDataStoreImpl">
    </component>

    <component name="indexUpdateCallback" class="org.codelibs.fess.ds.impl.IndexUpdateCallbackImpl" instance="prototype">