Browse Source

fix #2488 add recursive

Shinsuke Sugaya 4 years ago
parent
commit
ad83edf75f

+ 70 - 14
src/main/java/org/codelibs/fess/ds/callback/FileListIndexUpdateCallbackImpl.java

@@ -18,6 +18,8 @@ package org.codelibs.fess.ds.callback;
 import static org.codelibs.core.stream.StreamUtil.stream;
 
 import java.util.ArrayList;
+import java.util.Deque;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ExecutorService;
@@ -48,6 +50,7 @@ import org.codelibs.fess.exception.DataStoreCrawlingException;
 import org.codelibs.fess.helper.IndexingHelper;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
+import org.elasticsearch.index.query.QueryBuilders;
 import org.lastaflute.di.core.SingletonLaContainer;
 
 public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
@@ -59,9 +62,9 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
 
     protected List<String> deleteUrlList = new ArrayList<>(100);
 
-    protected int maxDeleteDocumentCacheSize = 100;
+    protected int maxDeleteDocumentCacheSize;
 
-    protected int maxRedirectCount = 10;
+    protected int maxRedirectCount;
 
     private final ExecutorService executor;
 
@@ -72,6 +75,9 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
         this.indexUpdateCallback = indexUpdateCallback;
         this.crawlerClientFactory = crawlerClientFactory;
         executor = newFixedThreadPool(nThreads < 1 ? 1 : nThreads);
+        final FessConfig fessConfig = ComponentUtil.getFessConfig();
+        maxDeleteDocumentCacheSize = fessConfig.getIndexerDataMaxDeleteCacheSizeAsInteger();
+        maxRedirectCount = fessConfig.getIndexerDataMaxRedirectCountAsInteger();
     }
 
     protected ExecutorService newFixedThreadPool(final int nThreads) {
@@ -119,17 +125,56 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
                 return;
             }
 
-            String processingUrl = url;
-            for (int i = 0; i < maxRedirectCount; i++) {
-                processingUrl = processRequest(paramMap, dataMap, processingUrl, client);
-                if (processingUrl == null) {
-                    break;
+            final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
+            long counter = 0;
+            final Deque<String> urlQueue = new LinkedList<>();
+            urlQueue.offer(url);
+            while (!urlQueue.isEmpty() && (maxAccessCount < 0 || counter < maxAccessCount)) {
+                String processingUrl = urlQueue.poll();
+                if (deleteUrlList.contains(processingUrl)) {
+                    deleteDocuments(); // delete before indexing
+                }
+                try {
+                    for (int i = 0; i < maxRedirectCount; i++) {
+                        processingUrl = processRequest(paramMap, dataMap, processingUrl, client);
+                        if (processingUrl == null) {
+                            break;
+                        }
+                        counter++;
+                        dataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
+                    }
+                } catch (ChildUrlsException e) {
+                    e.getChildUrlList().stream().map(req -> req.getUrl()).forEach(urlQueue::offer);
+                } catch (DataStoreCrawlingException e) {
+                    Throwable cause = e.getCause();
+                    if (cause instanceof ChildUrlsException) {
+                        ((ChildUrlsException) cause).getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
+                    } else {
+                        if (maxAccessCount != 1L) {
+                            throw e;
+                        } else {
+                            logger.warn("Failed to access {}.", processingUrl, e);
+                        }
+                    }
                 }
-                dataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
             }
         }
     }
 
+    /**
+     * Resolves how many URL accesses are allowed for one data-store record.
+     * <p>
+     * The "recursive" field is looked up in {@code dataMap} under the name
+     * returned by {@code getParamValue(paramMap, "field.recursive", "recursive")}
+     * and is REMOVED from {@code dataMap} as a side effect of this call.
+     *
+     * @param paramMap data-store parameters, used only to resolve the field name
+     * @param dataMap  the record being processed; the recursive field is removed from it
+     * @return {@code 1} when the field is absent, equals {@code Constants.FALSE}
+     *         (case-insensitive) or is not a parsable long; {@code -1} when it
+     *         equals {@code Constants.TRUE} (case-insensitive), which the crawl
+     *         loop treats as "no limit" because it tests for a negative count;
+     *         otherwise the field value parsed as a long
+     */
+    protected long getMaxAccessCount(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
+        final String fieldName = getParamValue(paramMap, "field.recursive", "recursive");
+        final Object fieldValue = dataMap.remove(fieldName);
+        if (fieldValue == null) {
+            return 1L;
+        }
+
+        final String text = fieldValue.toString();
+        if (Constants.FALSE.equalsIgnoreCase(text)) {
+            return 1L;
+        }
+        if (Constants.TRUE.equalsIgnoreCase(text)) {
+            return -1L;
+        }
+
+        try {
+            return Long.parseLong(text);
+        } catch (final NumberFormatException ignored) {
+            // Anything that is neither a boolean keyword nor a number means "not recursive".
+            return 1L;
+        }
+    }
+
     protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url,
             final CrawlerClient client) {
         final long startTime = System.currentTimeMillis();
@@ -176,8 +221,8 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
 
                     indexUpdateCallback.store(paramMap, dataMap);
                 } else {
-                    logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor
-                            + ", Data: " + dataMap);
+                    logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}",
+                            responseProcessor, dataMap);
                 }
             }
             return null;
@@ -204,12 +249,23 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
         }
 
         synchronized (indexUpdateCallback) {
-            deleteUrlList.add(dataMap.get(fessConfig.getIndexFieldUrl()).toString());
+            final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
+            final String url = dataMap.get(fessConfig.getIndexFieldUrl()).toString();
+            if (maxAccessCount != 1L) {
+                final FessEsClient fessEsClient = ComponentUtil.getFessEsClient();
+                final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
+                final long count =
+                        indexingHelper.deleteDocumentByQuery(fessEsClient, QueryBuilders.prefixQuery(fessConfig.getIndexFieldUrl(), url));
+                if (logger.isDebugEnabled()) {
+                    logger.debug("Deleted {} docs for {}*", count, url);
+                }
+            } else {
+                deleteUrlList.add(url);
 
-            if (deleteUrlList.size() >= maxDeleteDocumentCacheSize) {
-                deleteDocuments();
+                if (deleteUrlList.size() >= maxDeleteDocumentCacheSize) {
+                    deleteDocuments();
+                }
             }
-
         }
         return true;
     }

+ 5 - 0
src/main/java/org/codelibs/fess/helper/IndexingHelper.java

@@ -156,6 +156,11 @@ public class IndexingHelper {
                 QueryBuilders.idsQuery().addIds(docIdList.stream().toArray(n -> new String[n])));
     }
 
+    /**
+     * Issues a delete-by-query against the index named by
+     * {@code FessConfig#getIndexDocumentUpdateIndex()}.
+     *
+     * @param fessEsClient client used to run the delete-by-query
+     * @param queryBuilder query selecting the documents to delete
+     * @return whatever {@code fessEsClient.deleteByQuery} returns; the caller in
+     *         FileListIndexUpdateCallbackImpl logs it as the number of deleted docs
+     */
+    public long deleteDocumentByQuery(final FessEsClient fessEsClient, final QueryBuilder queryBuilder) {
+        return fessEsClient.deleteByQuery(ComponentUtil.getFessConfig().getIndexDocumentUpdateIndex(), queryBuilder);
+    }
+
     public Map<String, Object> getDocument(final FessEsClient fessEsClient, final String id, final String[] fields) {
         final FessConfig fessConfig = ComponentUtil.getFessConfig();
         return fessEsClient.getDocument(fessConfig.getIndexDocumentUpdateIndex(), builder -> {

+ 54 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -426,6 +426,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. 1048576 */
     String INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE = "indexer.data.max.document.request.size";
 
+    /** The key of the configuration. e.g. 100 */
+    String INDEXER_DATA_MAX_DELETE_CACHE_SIZE = "indexer.data.max.delete.cache.size";
+
+    /** The key of the configuration. e.g. 10 */
+    String INDEXER_DATA_MAX_REDIRECT_COUNT = "indexer.data.max.redirect.count";
+
     /** The key of the configuration. e.g. content,important_content,title */
     String INDEXER_LANGUAGE_FIELDS = "indexer.language.fields";
 
@@ -2861,6 +2867,36 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
     Integer getIndexerDataMaxDocumentRequestSizeAsInteger();
 
+    /**
+     * Get the value for the key 'indexer.data.max.delete.cache.size'. <br>
+     * The value is, e.g. 100 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getIndexerDataMaxDeleteCacheSize();
+
+    /**
+     * Get the value for the key 'indexer.data.max.delete.cache.size' as {@link Integer}. <br>
+     * The value is, e.g. 100 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getIndexerDataMaxDeleteCacheSizeAsInteger();
+
+    /**
+     * Get the value for the key 'indexer.data.max.redirect.count'. <br>
+     * The value is, e.g. 10 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getIndexerDataMaxRedirectCount();
+
+    /**
+     * Get the value for the key 'indexer.data.max.redirect.count' as {@link Integer}. <br>
+     * The value is, e.g. 10 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getIndexerDataMaxRedirectCountAsInteger();
+
     /**
      * Get the value for the key 'indexer.language.fields'. <br>
      * The value is, e.g. content,important_content,title <br>
@@ -7181,6 +7217,22 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return getAsInteger(FessConfig.INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE);
         }
 
+        // Raw string value configured for 'indexer.data.max.delete.cache.size'.
+        public String getIndexerDataMaxDeleteCacheSize() {
+            return get(FessConfig.INDEXER_DATA_MAX_DELETE_CACHE_SIZE);
+        }
+
+        // Value of 'indexer.data.max.delete.cache.size' read through getAsInteger.
+        public Integer getIndexerDataMaxDeleteCacheSizeAsInteger() {
+            return getAsInteger(FessConfig.INDEXER_DATA_MAX_DELETE_CACHE_SIZE);
+        }
+
+        // Raw string value configured for 'indexer.data.max.redirect.count'.
+        public String getIndexerDataMaxRedirectCount() {
+            return get(FessConfig.INDEXER_DATA_MAX_REDIRECT_COUNT);
+        }
+
+        // Value of 'indexer.data.max.redirect.count' read through getAsInteger.
+        public Integer getIndexerDataMaxRedirectCountAsInteger() {
+            return getAsInteger(FessConfig.INDEXER_DATA_MAX_REDIRECT_COUNT);
+        }
+
         public String getIndexerLanguageFields() {
             return get(FessConfig.INDEXER_LANGUAGE_FIELDS);
         }
@@ -9247,6 +9299,8 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             defaultMap.put(FessConfig.INDEXER_WEBFS_MAX_DOCUMENT_REQUEST_SIZE, "1048576");
             defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_CACHE_SIZE, "10000");
             defaultMap.put(FessConfig.INDEXER_DATA_MAX_DOCUMENT_REQUEST_SIZE, "1048576");
+            defaultMap.put(FessConfig.INDEXER_DATA_MAX_DELETE_CACHE_SIZE, "100");
+            defaultMap.put(FessConfig.INDEXER_DATA_MAX_REDIRECT_COUNT, "10");
             defaultMap.put(FessConfig.INDEXER_LANGUAGE_FIELDS, "content,important_content,title");
             defaultMap.put(FessConfig.INDEXER_LANGUAGE_DETECT_LENGTH, "1000");
             defaultMap.put(FessConfig.INDEX_CODEC, "default");

+ 2 - 0
src/main/resources/fess_config.properties

@@ -249,6 +249,8 @@ indexer.webfs.max.document.cache.size=10
 indexer.webfs.max.document.request.size=1048576
 indexer.data.max.document.cache.size=10000
 indexer.data.max.document.request.size=1048576
+indexer.data.max.delete.cache.size=100
+indexer.data.max.redirect.count=10
 indexer.language.fields=content,important_content,title
 indexer.language.detect.length=1000