Browse Source

fix #2860 Refactor URL handling logic.

Shinsuke Sugaya 6 months ago
parent
commit
c3514c5b3a

+ 8 - 9
src/main/java/org/codelibs/fess/crawler/FessCrawlerThread.java

@@ -95,7 +95,8 @@ public class FessCrawlerThread extends CrawlerThread {
                     final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper();
                     if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) {
                         // head method
-                        responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
+                        responseData =
+                                client.execute(RequestDataBuilder.newRequestData().head().url(url).weight(urlQueue.getWeight()).build());
                         if (responseData == null) {
                             return true;
                         }
@@ -202,14 +203,12 @@ public class FessCrawlerThread extends CrawlerThread {
         }
     }
 
-    @SuppressWarnings("unchecked")
     protected Set<RequestData> getAnchorSet(final Object obj) {
         List<String> anchorList;
-        if (obj instanceof String) {
-            anchorList = new ArrayList<>();
-            anchorList.add(obj.toString());
-        } else if (obj instanceof List<?>) {
-            anchorList = (List<String>) obj;
+        if (obj instanceof final String s) {
+            anchorList = List.of(s);
+        } else if (obj instanceof final List<?> l) {
+            anchorList = l.stream().map(String::valueOf).toList();
         } else {
             return null;
         }
@@ -263,11 +262,11 @@ public class FessCrawlerThread extends CrawlerThread {
     }
 
     @Override
-    protected void storeChildUrl(final String childUrl, final String parentUrl, final String metaData, final int depth) {
+    protected void storeChildUrl(final String childUrl, final String parentUrl, final float weight, final int depth) {
         if (StringUtil.isNotBlank(childUrl)) {
             final DuplicateHostHelper duplicateHostHelper = ComponentUtil.getDuplicateHostHelper();
             final String url = duplicateHostHelper.convert(childUrl);
-            super.storeChildUrl(url, parentUrl, metaData, depth);
+            super.storeChildUrl(url, parentUrl, weight, depth);
         }
     }
 

+ 9 - 4
src/main/java/org/codelibs/fess/crawler/service/FessUrlQueueService.java

@@ -34,8 +34,13 @@ import org.opensearch.search.sort.SortBuilders;
 import org.opensearch.search.sort.SortOrder;
 
 public class FessUrlQueueService extends OpenSearchUrlQueueService {
+
     private static final Logger logger = LogManager.getLogger(FessUrlQueueService.class);
 
+    protected static final String ORDER_SEQUENTIAL = "sequential";
+
+    protected static final String ORDER_RANDOM = "random";
+
     public FessUrlQueueService(final OpenSearchCrawlerConfig crawlerConfig) {
         super(crawlerConfig);
     }
@@ -45,14 +50,14 @@ public class FessUrlQueueService extends OpenSearchUrlQueueService {
         final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
         final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(sessionId);
         final Map<String, String> configParams = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
-        final String crawlOrder = configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, "sequential");
-        if ("random".equals(crawlOrder)) {
+        final String crawlOrder = configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, ORDER_SEQUENTIAL);
+        if (ORDER_RANDOM.equals(crawlOrder)) {
             return getList(OpenSearchUrlQueue.class, sessionId,
                     QueryBuilders.functionScoreQuery(QueryBuilders.matchAllQuery(),
                             new FunctionScoreQueryBuilder.FilterFunctionBuilder[] { new FunctionScoreQueryBuilder.FilterFunctionBuilder(
                                     new RandomScoreFunctionBuilder().seed(sessionId.hashCode())) }),
-                    0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.ASC));
-        } else if (!"sequential".equals(crawlOrder)) {
+                    0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.DESC));
+        } else if (!ORDER_SEQUENTIAL.equals(crawlOrder)) {
             logger.warn("Invalid crawl order specified: {}. Falling back to sequential.", crawlOrder);
         }
         return super.fetchUrlQueueList(sessionId);