fix #2860 Refactor URL handling logic.

This commit is contained in:
Shinsuke Sugaya 2024-12-19 17:05:54 +09:00
parent 69f21a80f1
commit c3514c5b3a
2 changed files with 17 additions and 13 deletions

View file

@ -95,7 +95,8 @@ public class FessCrawlerThread extends CrawlerThread {
final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper();
if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) {
// head method
responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
responseData =
client.execute(RequestDataBuilder.newRequestData().head().url(url).weight(urlQueue.getWeight()).build());
if (responseData == null) {
return true;
}
@ -202,14 +203,12 @@ public class FessCrawlerThread extends CrawlerThread {
}
}
@SuppressWarnings("unchecked")
protected Set<RequestData> getAnchorSet(final Object obj) {
List<String> anchorList;
if (obj instanceof String) {
anchorList = new ArrayList<>();
anchorList.add(obj.toString());
} else if (obj instanceof List<?>) {
anchorList = (List<String>) obj;
if (obj instanceof final String s) {
anchorList = List.of(s);
} else if (obj instanceof final List<?> l) {
anchorList = l.stream().map(String::valueOf).toList();
} else {
return null;
}
@ -263,11 +262,11 @@ public class FessCrawlerThread extends CrawlerThread {
}
@Override
protected void storeChildUrl(final String childUrl, final String parentUrl, final String metaData, final int depth) {
protected void storeChildUrl(final String childUrl, final String parentUrl, final float weight, final int depth) {
if (StringUtil.isNotBlank(childUrl)) {
final DuplicateHostHelper duplicateHostHelper = ComponentUtil.getDuplicateHostHelper();
final String url = duplicateHostHelper.convert(childUrl);
super.storeChildUrl(url, parentUrl, metaData, depth);
super.storeChildUrl(url, parentUrl, weight, depth);
}
}

View file

@ -34,8 +34,13 @@ import org.opensearch.search.sort.SortBuilders;
import org.opensearch.search.sort.SortOrder;
public class FessUrlQueueService extends OpenSearchUrlQueueService {
private static final Logger logger = LogManager.getLogger(FessUrlQueueService.class);
protected static final String ORDER_SEQUENTIAL = "sequential";
protected static final String ORDER_RANDOM = "random";
public FessUrlQueueService(final OpenSearchCrawlerConfig crawlerConfig) {
super(crawlerConfig);
}
@ -45,14 +50,14 @@ public class FessUrlQueueService extends OpenSearchUrlQueueService {
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(sessionId);
final Map<String, String> configParams = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
final String crawlOrder = configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, "sequential");
if ("random".equals(crawlOrder)) {
final String crawlOrder = configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, ORDER_SEQUENTIAL);
if (ORDER_RANDOM.equals(crawlOrder)) {
return getList(OpenSearchUrlQueue.class, sessionId,
QueryBuilders.functionScoreQuery(QueryBuilders.matchAllQuery(),
new FunctionScoreQueryBuilder.FilterFunctionBuilder[] { new FunctionScoreQueryBuilder.FilterFunctionBuilder(
new RandomScoreFunctionBuilder().seed(sessionId.hashCode())) }),
0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.ASC));
} else if (!"sequential".equals(crawlOrder)) {
0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.DESC));
} else if (!ORDER_SEQUENTIAL.equals(crawlOrder)) {
logger.warn("Invalid crawl order specified: {}. Falling back to sequential.", crawlOrder);
}
return super.fetchUrlQueueList(sessionId);