|
@@ -19,9 +19,11 @@ import static org.codelibs.core.stream.StreamUtil.stream;
|
|
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.ArrayList;
|
|
import java.util.Deque;
|
|
import java.util.Deque;
|
|
|
|
+import java.util.HashSet;
|
|
import java.util.LinkedList;
|
|
import java.util.LinkedList;
|
|
import java.util.List;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.Map;
|
|
|
|
+import java.util.Set;
|
|
import java.util.concurrent.ExecutorService;
|
|
import java.util.concurrent.ExecutorService;
|
|
import java.util.concurrent.LinkedBlockingQueue;
|
|
import java.util.concurrent.LinkedBlockingQueue;
|
|
import java.util.concurrent.ThreadPoolExecutor;
|
|
import java.util.concurrent.ThreadPoolExecutor;
|
|
@@ -147,11 +149,14 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|
final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
|
|
final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
|
|
long counter = 0;
|
|
long counter = 0;
|
|
final Deque<String> urlQueue = new LinkedList<>();
|
|
final Deque<String> urlQueue = new LinkedList<>();
|
|
|
|
+ final Set<String> processedUrls = new HashSet<>();
|
|
urlQueue.offer(url);
|
|
urlQueue.offer(url);
|
|
while (!urlQueue.isEmpty() && (maxAccessCount < 0 || counter < maxAccessCount)) {
|
|
while (!urlQueue.isEmpty() && (maxAccessCount < 0 || counter < maxAccessCount)) {
|
|
|
|
+ counter++;
|
|
final Map<String, Object> localDataMap =
|
|
final Map<String, Object> localDataMap =
|
|
dataMap.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
|
|
dataMap.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
|
|
String processingUrl = urlQueue.poll();
|
|
String processingUrl = urlQueue.poll();
|
|
|
|
+ processedUrls.add(processingUrl);
|
|
if (deleteUrlList.contains(processingUrl)) {
|
|
if (deleteUrlList.contains(processingUrl)) {
|
|
deleteDocuments(); // delete before indexing
|
|
deleteDocuments(); // delete before indexing
|
|
}
|
|
}
|
|
@@ -165,7 +170,6 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|
if (processingUrl == null) {
|
|
if (processingUrl == null) {
|
|
break;
|
|
break;
|
|
}
|
|
}
|
|
- counter++;
|
|
|
|
localDataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
|
|
localDataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
|
|
crawlerStatsHelper.record(keyObj, StatsAction.REDIRECTED);
|
|
crawlerStatsHelper.record(keyObj, StatsAction.REDIRECTED);
|
|
}
|
|
}
|
|
@@ -176,7 +180,11 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|
crawlerStatsHelper.record(keyObj, StatsAction.ACCESS_EXCEPTION);
|
|
crawlerStatsHelper.record(keyObj, StatsAction.ACCESS_EXCEPTION);
|
|
final Throwable cause = e.getCause();
|
|
final Throwable cause = e.getCause();
|
|
if (cause instanceof ChildUrlsException) {
|
|
if (cause instanceof ChildUrlsException) {
|
|
- ((ChildUrlsException) cause).getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
|
|
|
|
|
|
+ ((ChildUrlsException) cause).getChildUrlList().stream().map(RequestData::getUrl).forEach(s -> {
|
|
|
|
+ if (!processedUrls.contains(s)&&!urlQueue.contains(s)) {
|
|
|
|
+ urlQueue.offer(s);
|
|
|
|
+ }
|
|
|
|
+ });
|
|
} else if (maxAccessCount != 1L) {
|
|
} else if (maxAccessCount != 1L) {
|
|
throw e;
|
|
throw e;
|
|
} else {
|
|
} else {
|