Parcourir la source

fix #2856 Add crawl order configuration to control URL processing order

Shinsuke Sugaya il y a 8 mois
Parent
commit
036ebd6c3c

+ 60 - 0
src/main/java/org/codelibs/fess/crawler/service/FessUrlQueueService.java

@@ -0,0 +1,60 @@
+/*
+ * Copyright 2012-2024 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.crawler.service;
+
+import java.util.List;
+import java.util.Map;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.codelibs.fess.crawler.entity.EsUrlQueue;
+import org.codelibs.fess.crawler.service.impl.EsUrlQueueService;
+import org.codelibs.fess.crawler.util.EsCrawlerConfig;
+import org.codelibs.fess.es.config.exentity.CrawlingConfig;
+import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
+import org.codelibs.fess.helper.CrawlingConfigHelper;
+import org.codelibs.fess.util.ComponentUtil;
+import org.opensearch.index.query.QueryBuilders;
+import org.opensearch.index.query.functionscore.FunctionScoreQueryBuilder;
+import org.opensearch.index.query.functionscore.RandomScoreFunctionBuilder;
+import org.opensearch.search.sort.SortBuilders;
+import org.opensearch.search.sort.SortOrder;
+
+/**
+ * URL queue service that chooses how queued URLs are fetched for a crawler
+ * session, based on the {@code crawl.order} config parameter of the crawling
+ * config looked up by session ID.
+ *
+ * <ul>
+ * <li>{@code sequential} (default): URLs are fetched in ascending creation-time order.</li>
+ * <li>{@code random}: URLs are fetched in ascending order of a random score
+ * seeded with the hash code of the session ID.</li>
+ * </ul>
+ *
+ * Any other value is logged as a warning and handled like {@code sequential}.
+ */
+public class FessUrlQueueService extends EsUrlQueueService {
+    private static final Logger logger = LogManager.getLogger(FessUrlQueueService.class);
+
+    /** Crawl order value selecting creation-time ordering; also the default when none is configured. */
+    private static final String ORDER_SEQUENTIAL = "sequential";
+
+    /** Crawl order value selecting random-score ordering. */
+    private static final String ORDER_RANDOM = "random";
+
+    /**
+     * Creates the service with the given crawler configuration.
+     *
+     * @param crawlerConfig crawler configuration handed to the parent service
+     */
+    public FessUrlQueueService(final EsCrawlerConfig crawlerConfig) {
+        super(crawlerConfig);
+    }
+
+    /**
+     * Fetches the next batch (up to {@code pollingFetchSize} entries) of queued
+     * URLs for the session, ordered according to the configured crawl order.
+     *
+     * @param sessionId crawler session ID used to look up the crawling config and to filter the queue
+     * @return the fetched URL queue entries
+     */
+    @Override
+    protected List<EsUrlQueue> fetchUrlQueueList(final String sessionId) {
+        final CrawlingConfigHelper helper = ComponentUtil.getCrawlingConfigHelper();
+        final CrawlingConfig config = helper.get(sessionId);
+        final Map<String, String> params = config.getConfigParameterMap(ConfigName.CONFIG);
+        final String order = params.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, ORDER_SEQUENTIAL);
+
+        if (ORDER_RANDOM.equals(order)) {
+            return getList(EsUrlQueue.class, sessionId, buildRandomScoreQuery(sessionId), 0, pollingFetchSize,
+                    SortBuilders.scoreSort().order(SortOrder.ASC));
+        }
+
+        // Anything other than the two known values is reported, then treated as sequential.
+        if (!ORDER_SEQUENTIAL.equals(order)) {
+            logger.warn("Invalid crawl order specified: {}. Falling back to sequential.", order);
+        }
+        return getList(EsUrlQueue.class, sessionId, null, 0, pollingFetchSize, SortBuilders.fieldSort(CREATE_TIME).order(SortOrder.ASC));
+    }
+
+    /**
+     * Builds a match-all function score query whose only function is a random
+     * score seeded with the hash code of the session ID.
+     *
+     * @param sessionId crawler session ID whose hash code seeds the random score
+     * @return the function score query
+     */
+    private static FunctionScoreQueryBuilder buildRandomScoreQuery(final String sessionId) {
+        final RandomScoreFunctionBuilder randomScore = new RandomScoreFunctionBuilder().seed(sessionId.hashCode());
+        final FunctionScoreQueryBuilder.FilterFunctionBuilder[] functions =
+                new FunctionScoreQueryBuilder.FilterFunctionBuilder[] { new FunctionScoreQueryBuilder.FilterFunctionBuilder(randomScore) };
+        return QueryBuilders.functionScoreQuery(QueryBuilders.matchAllQuery(), functions);
+    }
+}

+ 1 - 0
src/main/java/org/codelibs/fess/es/config/exentity/CrawlingConfig.java

@@ -133,6 +133,7 @@ public interface CrawlingConfig {
             public static final String IGNORE_ROBOTS_TAGS = "ignore.robots.tags";
             public static final String SCRIPT_TYPE = "script.type";
             public static final String HTML_CHILD_URL_RULES = "html.child.url.rules";
+            public static final String CRAWL_ORDER = "crawl.order";
         }
 
         // meta.*

+ 9 - 0
src/main/resources/crawler_es+urlQueueService.xml

@@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN" 
+	"http://dbflute.org/meta/lastadi10.dtd">
+<components namespace="fessCrawler">
+	<!-- Registers FessUrlQueueService as the "urlQueueService" component, -->
+	<!-- passing "crawlerConfig" as its single argument. -->
+	<!-- NOTE(review): presumably this replaces the crawler's default urlQueueService; confirm against the LastaDi include setup. -->
+	<component name="urlQueueService"
+		class="org.codelibs.fess.crawler.service.FessUrlQueueService">
+		<arg>crawlerConfig</arg>
+	</component>
+</components>