fix #2488: add recursive crawling option (field.recursive) to the file list callback

This commit is contained in:
Shinsuke Sugaya 2020-10-07 21:38:45 +09:00
parent caa36e2830
commit ad83edf75f
4 changed files with 131 additions and 14 deletions

View file

@ -18,6 +18,8 @@ package org.codelibs.fess.ds.callback;
import static org.codelibs.core.stream.StreamUtil.stream;
import java.util.ArrayList;
import java.util.Deque;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
@ -48,6 +50,7 @@ import org.codelibs.fess.exception.DataStoreCrawlingException;
import org.codelibs.fess.helper.IndexingHelper;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.elasticsearch.index.query.QueryBuilders;
import org.lastaflute.di.core.SingletonLaContainer;
public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
@ -59,9 +62,9 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
protected List<String> deleteUrlList = new ArrayList<>(100);
protected int maxDeleteDocumentCacheSize = 100;
protected int maxDeleteDocumentCacheSize;
protected int maxRedirectCount = 10;
protected int maxRedirectCount;
private final ExecutorService executor;
@ -72,6 +75,9 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
this.indexUpdateCallback = indexUpdateCallback;
this.crawlerClientFactory = crawlerClientFactory;
executor = newFixedThreadPool(nThreads < 1 ? 1 : nThreads);
final FessConfig fessConfig = ComponentUtil.getFessConfig();
maxDeleteDocumentCacheSize = fessConfig.getIndexerDataMaxDeleteCacheSizeAsInteger();
maxRedirectCount = fessConfig.getIndexerDataMaxRedirectCountAsInteger();
}
protected ExecutorService newFixedThreadPool(final int nThreads) {
@ -119,17 +125,56 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
return;
}
String processingUrl = url;
for (int i = 0; i < maxRedirectCount; i++) {
processingUrl = processRequest(paramMap, dataMap, processingUrl, client);
if (processingUrl == null) {
break;
final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
long counter = 0;
final Deque<String> urlQueue = new LinkedList<>();
urlQueue.offer(url);
while (!urlQueue.isEmpty() && (maxAccessCount < 0 || counter < maxAccessCount)) {
String processingUrl = urlQueue.poll();
if (deleteUrlList.contains(processingUrl)) {
deleteDocuments(); // delete before indexing
}
try {
for (int i = 0; i < maxRedirectCount; i++) {
processingUrl = processRequest(paramMap, dataMap, processingUrl, client);
if (processingUrl == null) {
break;
}
counter++;
dataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
}
} catch (ChildUrlsException e) {
e.getChildUrlList().stream().map(req -> req.getUrl()).forEach(urlQueue::offer);
} catch (DataStoreCrawlingException e) {
Throwable cause = e.getCause();
if (cause instanceof ChildUrlsException) {
((ChildUrlsException) cause).getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
} else {
if (maxAccessCount != 1L) {
throw e;
} else {
logger.warn("Failed to access {}.", processingUrl, e);
}
}
}
dataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
}
}
}
/**
 * Resolves the maximum number of accesses allowed for one entry from its "recursive" field.
 *
 * The field name is looked up through {@code getParamValue(paramMap, "field.recursive", "recursive")},
 * and the entry is <em>removed</em> from {@code dataMap} while it is read, so a second call with the
 * same map sees no value and returns {@code 1}.
 *
 * The value is converted to a string and trimmed before it is interpreted, because
 * {@link Long#parseLong(String)} rejects surrounding whitespace and a padded value such as
 * {@code " 5"} or {@code "true "} would otherwise silently fall back to {@code 1}.
 *
 * @param paramMap data store parameters; consulted only for the name of the recursive field
 * @param dataMap the entry being processed; the recursive field is removed from it
 * @return {@code 1} when the field is absent, blank, equal to {@code Constants.FALSE}
 *         (case-insensitive) or not a valid long; {@code -1} when it equals {@code Constants.TRUE}
 *         (case-insensitive); otherwise the parsed long value as given
 */
protected long getMaxAccessCount(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
    // remove() rather than get(): the control field is taken out of dataMap as it is read.
    final Object recursive = dataMap.remove(getParamValue(paramMap, "field.recursive", "recursive"));
    final String rawValue = recursive == null ? null : recursive.toString();
    if (rawValue == null) {
        return 1L;
    }
    // Normalize once so that the boolean comparisons and the numeric parse see the same text.
    final String value = rawValue.trim();
    if (value.isEmpty() || Constants.FALSE.equalsIgnoreCase(value)) {
        return 1L;
    }
    if (Constants.TRUE.equalsIgnoreCase(value)) {
        return -1L;
    }
    try {
        return Long.parseLong(value);
    } catch (final NumberFormatException e) {
        // Deliberate best-effort: an unparsable value is treated like "not recursive".
        return 1L;
    }
}
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url,
final CrawlerClient client) {
final long startTime = System.currentTimeMillis();
@ -176,8 +221,8 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
indexUpdateCallback.store(paramMap, dataMap);
} else {
logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor
+ ", Data: " + dataMap);
logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}",
responseProcessor, dataMap);
}
}
return null;
@ -204,12 +249,23 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
}
synchronized (indexUpdateCallback) {
deleteUrlList.add(dataMap.get(fessConfig.getIndexFieldUrl()).toString());
final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
final String url = dataMap.get(fessConfig.getIndexFieldUrl()).toString();
if (maxAccessCount != 1L) {
final FessEsClient fessEsClient = ComponentUtil.getFessEsClient();
final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
final long count =
indexingHelper.deleteDocumentByQuery(fessEsClient, QueryBuilders.prefixQuery(fessConfig.getIndexFieldUrl(), url));
if (logger.isDebugEnabled()) {
logger.debug("Deleted {} docs for {}*", count, url);
}
} else {
deleteUrlList.add(url);
if (deleteUrlList.size() >= maxDeleteDocumentCacheSize) {
deleteDocuments();
if (deleteUrlList.size() >= maxDeleteDocumentCacheSize) {
deleteDocuments();
}
}
}
return true;
}

View file

@ -156,6 +156,11 @@ public class IndexingHelper {
QueryBuilders.idsQuery().addIds(docIdList.stream().toArray(n -> new String[n])));
}
/**
 * Runs a delete-by-query against the index named by
 * {@code FessConfig#getIndexDocumentUpdateIndex()}.
 *
 * @param fessEsClient client that executes the delete-by-query request
 * @param queryBuilder query selecting the documents to delete
 * @return the value returned by {@code FessEsClient#deleteByQuery}
 *         (presumably the number of deleted documents; confirm against FessEsClient)
 */
public long deleteDocumentByQuery(final FessEsClient fessEsClient, final QueryBuilder queryBuilder) {
    return fessEsClient.deleteByQuery(ComponentUtil.getFessConfig().getIndexDocumentUpdateIndex(), queryBuilder);
}
public Map<String, Object> getDocument(final FessEsClient fessEsClient, final String id, final String[] fields) {
final FessConfig fessConfig = ComponentUtil.getFessConfig();
return fessEsClient.getDocument(fessConfig.getIndexDocumentUpdateIndex(), builder -> {

View file

@ -426,6 +426,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. 1048576 */
String INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE = "indexer.data.max.document.request.size";
/** The key of the configuration. e.g. 100 */
String INDEXER_DATA_MAX_DELETE_CACHE_SIZE = "indexer.data.max.delete.cache.size";
/** The key of the configuration. e.g. 10 */
String INDEXER_DATA_MAX_REDIRECT_COUNT = "indexer.data.max.redirect.count";
/** The key of the configuration. e.g. content,important_content,title */
String INDEXER_LANGUAGE_FIELDS = "indexer.language.fields";
@ -2861,6 +2867,36 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
*/
Integer getIndexerDataMaxDocumentRequestSizeAsInteger();
/**
* Get the value for the key 'indexer.data.max.delete.cache.size'. <br>
* The value is, e.g. 100 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getIndexerDataMaxDeleteCacheSize();
/**
* Get the value for the key 'indexer.data.max.delete.cache.size' as {@link Integer}. <br>
* The value is, e.g. 100 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
Integer getIndexerDataMaxDeleteCacheSizeAsInteger();
/**
* Get the value for the key 'indexer.data.max.redirect.count'. <br>
* The value is, e.g. 10 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getIndexerDataMaxRedirectCount();
/**
* Get the value for the key 'indexer.data.max.redirect.count' as {@link Integer}. <br>
* The value is, e.g. 10 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
Integer getIndexerDataMaxRedirectCountAsInteger();
/**
* Get the value for the key 'indexer.language.fields'. <br>
* The value is, e.g. content,important_content,title <br>
@ -7181,6 +7217,22 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return getAsInteger(FessConfig.INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE);
}
/**
 * Returns the raw string value configured for the key
 * 'indexer.data.max.delete.cache.size' ({@code INDEXER_DATA_MAX_DELETE_CACHE_SIZE}).
 *
 * @return the property value as returned by {@code get(...)}
 */
public String getIndexerDataMaxDeleteCacheSize() {
return get(FessConfig.INDEXER_DATA_MAX_DELETE_CACHE_SIZE);
}
/**
 * Returns the value configured for the key 'indexer.data.max.delete.cache.size'
 * ({@code INDEXER_DATA_MAX_DELETE_CACHE_SIZE}) converted to an {@link Integer}.
 *
 * @return the property value as returned by {@code getAsInteger(...)}
 * @throws NumberFormatException When the property is not integer.
 */
public Integer getIndexerDataMaxDeleteCacheSizeAsInteger() {
return getAsInteger(FessConfig.INDEXER_DATA_MAX_DELETE_CACHE_SIZE);
}
/**
 * Returns the raw string value configured for the key
 * 'indexer.data.max.redirect.count' ({@code INDEXER_DATA_MAX_REDIRECT_COUNT}).
 *
 * @return the property value as returned by {@code get(...)}
 */
public String getIndexerDataMaxRedirectCount() {
return get(FessConfig.INDEXER_DATA_MAX_REDIRECT_COUNT);
}
/**
 * Returns the value configured for the key 'indexer.data.max.redirect.count'
 * ({@code INDEXER_DATA_MAX_REDIRECT_COUNT}) converted to an {@link Integer}.
 *
 * @return the property value as returned by {@code getAsInteger(...)}
 * @throws NumberFormatException When the property is not integer.
 */
public Integer getIndexerDataMaxRedirectCountAsInteger() {
return getAsInteger(FessConfig.INDEXER_DATA_MAX_REDIRECT_COUNT);
}
public String getIndexerLanguageFields() {
return get(FessConfig.INDEXER_LANGUAGE_FIELDS);
}
@ -9247,6 +9299,8 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
defaultMap.put(FessConfig.INDEXER_WEBFS_MAX_DOCUMENT_REQUEST_SIZE, "1048576");
defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_CACHE_SIZE, "10000");
defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE, "1048576");
defaultMap.put(FessConfig.INDEXER_DATA_MAX_DELETE_CACHE_SIZE, "100");
defaultMap.put(FessConfig.INDEXER_DATA_MAX_REDIRECT_COUNT, "10");
defaultMap.put(FessConfig.INDEXER_LANGUAGE_FIELDS, "content,important_content,title");
defaultMap.put(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH, "1000");
defaultMap.put(FessConfig.INDEX_CODEC, "default");

View file

@ -249,6 +249,8 @@ indexer.webfs.max.document.cache.size=10
indexer.webfs.max.document.request.size=1048576
indexer.data.max.document.cache.size=10000
indexer.data.max.document.request.size=1048576
indexer.data.max.delete.cache.size=100
indexer.data.max.redirect.count=10
indexer.language.fields=content,important_content,title
indexer.language.detect.length=1000