fix #2488 add recursive
This commit is contained in:
parent
caa36e2830
commit
ad83edf75f
4 changed files with 131 additions and 14 deletions
|
@ -18,6 +18,8 @@ package org.codelibs.fess.ds.callback;
|
|||
import static org.codelibs.core.stream.StreamUtil.stream;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Deque;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
|
@ -48,6 +50,7 @@ import org.codelibs.fess.exception.DataStoreCrawlingException;
|
|||
import org.codelibs.fess.helper.IndexingHelper;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.elasticsearch.index.query.QueryBuilders;
|
||||
import org.lastaflute.di.core.SingletonLaContainer;
|
||||
|
||||
public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
|
||||
|
@ -59,9 +62,9 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|||
|
||||
protected List<String> deleteUrlList = new ArrayList<>(100);
|
||||
|
||||
protected int maxDeleteDocumentCacheSize = 100;
|
||||
protected int maxDeleteDocumentCacheSize;
|
||||
|
||||
protected int maxRedirectCount = 10;
|
||||
protected int maxRedirectCount;
|
||||
|
||||
private final ExecutorService executor;
|
||||
|
||||
|
@ -72,6 +75,9 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|||
this.indexUpdateCallback = indexUpdateCallback;
|
||||
this.crawlerClientFactory = crawlerClientFactory;
|
||||
executor = newFixedThreadPool(nThreads < 1 ? 1 : nThreads);
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
maxDeleteDocumentCacheSize = fessConfig.getIndexerDataMaxDeleteCacheSizeAsInteger();
|
||||
maxRedirectCount = fessConfig.getIndexerDataMaxRedirectCountAsInteger();
|
||||
}
|
||||
|
||||
protected ExecutorService newFixedThreadPool(final int nThreads) {
|
||||
|
@ -119,17 +125,56 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|||
return;
|
||||
}
|
||||
|
||||
String processingUrl = url;
|
||||
for (int i = 0; i < maxRedirectCount; i++) {
|
||||
processingUrl = processRequest(paramMap, dataMap, processingUrl, client);
|
||||
if (processingUrl == null) {
|
||||
break;
|
||||
final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
|
||||
long counter = 0;
|
||||
final Deque<String> urlQueue = new LinkedList<>();
|
||||
urlQueue.offer(url);
|
||||
while (!urlQueue.isEmpty() && (maxAccessCount < 0 || counter < maxAccessCount)) {
|
||||
String processingUrl = urlQueue.poll();
|
||||
if (deleteUrlList.contains(processingUrl)) {
|
||||
deleteDocuments(); // delete before indexing
|
||||
}
|
||||
try {
|
||||
for (int i = 0; i < maxRedirectCount; i++) {
|
||||
processingUrl = processRequest(paramMap, dataMap, processingUrl, client);
|
||||
if (processingUrl == null) {
|
||||
break;
|
||||
}
|
||||
counter++;
|
||||
dataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
|
||||
}
|
||||
} catch (ChildUrlsException e) {
|
||||
e.getChildUrlList().stream().map(req -> req.getUrl()).forEach(urlQueue::offer);
|
||||
} catch (DataStoreCrawlingException e) {
|
||||
Throwable cause = e.getCause();
|
||||
if (cause instanceof ChildUrlsException) {
|
||||
((ChildUrlsException) cause).getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
|
||||
} else {
|
||||
if (maxAccessCount != 1L) {
|
||||
throw e;
|
||||
} else {
|
||||
logger.warn("Failed to access {}.", processingUrl, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
dataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected long getMaxAccessCount(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
|
||||
final Object recursive = dataMap.remove(getParamValue(paramMap, "field.recursive", "recursive"));
|
||||
if (recursive == null || Constants.FALSE.equalsIgnoreCase(recursive.toString())) {
|
||||
return 1L;
|
||||
} else if (Constants.TRUE.equalsIgnoreCase(recursive.toString())) {
|
||||
return -1L;
|
||||
}
|
||||
try {
|
||||
return Long.parseLong(recursive.toString());
|
||||
} catch (NumberFormatException e) {
|
||||
return 1L;
|
||||
}
|
||||
}
|
||||
|
||||
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url,
|
||||
final CrawlerClient client) {
|
||||
final long startTime = System.currentTimeMillis();
|
||||
|
@ -176,8 +221,8 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|||
|
||||
indexUpdateCallback.store(paramMap, dataMap);
|
||||
} else {
|
||||
logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor
|
||||
+ ", Data: " + dataMap);
|
||||
logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}",
|
||||
responseProcessor, dataMap);
|
||||
}
|
||||
}
|
||||
return null;
|
||||
|
@ -204,12 +249,23 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|||
}
|
||||
|
||||
synchronized (indexUpdateCallback) {
|
||||
deleteUrlList.add(dataMap.get(fessConfig.getIndexFieldUrl()).toString());
|
||||
final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
|
||||
final String url = dataMap.get(fessConfig.getIndexFieldUrl()).toString();
|
||||
if (maxAccessCount != 1L) {
|
||||
final FessEsClient fessEsClient = ComponentUtil.getFessEsClient();
|
||||
final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
|
||||
final long count =
|
||||
indexingHelper.deleteDocumentByQuery(fessEsClient, QueryBuilders.prefixQuery(fessConfig.getIndexFieldUrl(), url));
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("Deleted {} docs for {}*", count, url);
|
||||
}
|
||||
} else {
|
||||
deleteUrlList.add(url);
|
||||
|
||||
if (deleteUrlList.size() >= maxDeleteDocumentCacheSize) {
|
||||
deleteDocuments();
|
||||
if (deleteUrlList.size() >= maxDeleteDocumentCacheSize) {
|
||||
deleteDocuments();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
|
|
@ -156,6 +156,11 @@ public class IndexingHelper {
|
|||
QueryBuilders.idsQuery().addIds(docIdList.stream().toArray(n -> new String[n])));
|
||||
}
|
||||
|
||||
public long deleteDocumentByQuery(final FessEsClient fessEsClient, final QueryBuilder queryBuilder) {
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
return fessEsClient.deleteByQuery(fessConfig.getIndexDocumentUpdateIndex(), queryBuilder);
|
||||
}
|
||||
|
||||
public Map<String, Object> getDocument(final FessEsClient fessEsClient, final String id, final String[] fields) {
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
return fessEsClient.getDocument(fessConfig.getIndexDocumentUpdateIndex(), builder -> {
|
||||
|
|
|
@ -426,6 +426,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. 1048576 */
|
||||
String INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE = "indexer.data.max.document.request.size";
|
||||
|
||||
/** The key of the configuration. e.g. 100 */
|
||||
String INDEXER_DATA_MAX_DELETE_CACHE_SIZE = "indexer.data.max.delete.cache.size";
|
||||
|
||||
/** The key of the configuration. e.g. 10 */
|
||||
String INDEXER_DATA_MAX_REDIRECT_COUNT = "indexer.data.max.redirect.count";
|
||||
|
||||
/** The key of the configuration. e.g. content,important_content,title */
|
||||
String INDEXER_LANGUAGE_FIELDS = "indexer.language.fields";
|
||||
|
||||
|
@ -2861,6 +2867,36 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
*/
|
||||
Integer getIndexerDataMaxDocumentRequestSizeAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'indexer.data.max.delete.cache.size'. <br>
|
||||
* The value is, e.g. 100 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getIndexerDataMaxDeleteCacheSize();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'indexer.data.max.delete.cache.size' as {@link Integer}. <br>
|
||||
* The value is, e.g. 100 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getIndexerDataMaxDeleteCacheSizeAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'indexer.data.max.redirect.count'. <br>
|
||||
* The value is, e.g. 10 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getIndexerDataMaxRedirectCount();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'indexer.data.max.redirect.count' as {@link Integer}. <br>
|
||||
* The value is, e.g. 10 <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
* @throws NumberFormatException When the property is not integer.
|
||||
*/
|
||||
Integer getIndexerDataMaxRedirectCountAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'indexer.language.fields'. <br>
|
||||
* The value is, e.g. content,important_content,title <br>
|
||||
|
@ -7181,6 +7217,22 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return getAsInteger(FessConfig.INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE);
|
||||
}
|
||||
|
||||
public String getIndexerDataMaxDeleteCacheSize() {
|
||||
return get(FessConfig.INDEXER_DATA_MAX_DELETE_CACHE_SIZE);
|
||||
}
|
||||
|
||||
public Integer getIndexerDataMaxDeleteCacheSizeAsInteger() {
|
||||
return getAsInteger(FessConfig.INDEXER_DATA_MAX_DELETE_CACHE_SIZE);
|
||||
}
|
||||
|
||||
public String getIndexerDataMaxRedirectCount() {
|
||||
return get(FessConfig.INDEXER_DATA_MAX_REDIRECT_COUNT);
|
||||
}
|
||||
|
||||
public Integer getIndexerDataMaxRedirectCountAsInteger() {
|
||||
return getAsInteger(FessConfig.INDEXER_DATA_MAX_REDIRECT_COUNT);
|
||||
}
|
||||
|
||||
public String getIndexerLanguageFields() {
|
||||
return get(FessConfig.INDEXER_LANGUAGE_FIELDS);
|
||||
}
|
||||
|
@ -9247,6 +9299,8 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
defaultMap.put(FessConfig.INDEXER_WEBFS_MAX_DOCUMENT_REQUEST_SIZE, "1048576");
|
||||
defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_CACHE_SIZE, "10000");
|
||||
defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE, "1048576");
|
||||
defaultMap.put(FessConfig.INDEXER_DATA_MAX_DELETE_CACHE_SIZE, "100");
|
||||
defaultMap.put(FessConfig.INDEXER_DATA_MAX_REDIRECT_COUNT, "10");
|
||||
defaultMap.put(FessConfig.INDEXER_LANGUAGE_FIELDS, "content,important_content,title");
|
||||
defaultMap.put(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH, "1000");
|
||||
defaultMap.put(FessConfig.INDEX_CODEC, "default");
|
||||
|
|
|
@ -249,6 +249,8 @@ indexer.webfs.max.document.cache.size=10
|
|||
indexer.webfs.max.document.request.size=1048576
|
||||
indexer.data.max.document.cache.size=10000
|
||||
indexer.data.max.document.request.size=1048576
|
||||
indexer.data.max.delete.cache.size=100
|
||||
indexer.data.max.redirect.count=10
|
||||
indexer.language.fields=content,important_content,title
|
||||
indexer.language.detect.length=1000
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue