fix #2860 Refactor URL handling logic.
This commit is contained in:
parent
69f21a80f1
commit
c3514c5b3a
2 changed files with 17 additions and 13 deletions
|
@@ -95,7 +95,8 @@ public class FessCrawlerThread extends CrawlerThread {
|
|||
final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper();
|
||||
if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) {
|
||||
// head method
|
||||
responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
|
||||
responseData =
|
||||
client.execute(RequestDataBuilder.newRequestData().head().url(url).weight(urlQueue.getWeight()).build());
|
||||
if (responseData == null) {
|
||||
return true;
|
||||
}
|
||||
|
@@ -202,14 +203,12 @@ public class FessCrawlerThread extends CrawlerThread {
|
|||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
protected Set<RequestData> getAnchorSet(final Object obj) {
|
||||
List<String> anchorList;
|
||||
if (obj instanceof String) {
|
||||
anchorList = new ArrayList<>();
|
||||
anchorList.add(obj.toString());
|
||||
} else if (obj instanceof List<?>) {
|
||||
anchorList = (List<String>) obj;
|
||||
if (obj instanceof final String s) {
|
||||
anchorList = List.of(s);
|
||||
} else if (obj instanceof final List<?> l) {
|
||||
anchorList = l.stream().map(String::valueOf).toList();
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
|
@@ -263,11 +262,11 @@ public class FessCrawlerThread extends CrawlerThread {
|
|||
}
|
||||
|
||||
@Override
|
||||
protected void storeChildUrl(final String childUrl, final String parentUrl, final String metaData, final int depth) {
|
||||
protected void storeChildUrl(final String childUrl, final String parentUrl, final float weight, final int depth) {
|
||||
if (StringUtil.isNotBlank(childUrl)) {
|
||||
final DuplicateHostHelper duplicateHostHelper = ComponentUtil.getDuplicateHostHelper();
|
||||
final String url = duplicateHostHelper.convert(childUrl);
|
||||
super.storeChildUrl(url, parentUrl, metaData, depth);
|
||||
super.storeChildUrl(url, parentUrl, weight, depth);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@@ -34,8 +34,13 @@ import org.opensearch.search.sort.SortBuilders;
|
|||
import org.opensearch.search.sort.SortOrder;
|
||||
|
||||
public class FessUrlQueueService extends OpenSearchUrlQueueService {
|
||||
|
||||
private static final Logger logger = LogManager.getLogger(FessUrlQueueService.class);
|
||||
|
||||
protected static final String ORDER_SEQUENTIAL = "sequential";
|
||||
|
||||
protected static final String ORDER_RANDOM = "random";
|
||||
|
||||
public FessUrlQueueService(final OpenSearchCrawlerConfig crawlerConfig) {
|
||||
super(crawlerConfig);
|
||||
}
|
||||
|
@@ -45,14 +50,14 @@ public class FessUrlQueueService extends OpenSearchUrlQueueService {
|
|||
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
|
||||
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(sessionId);
|
||||
final Map<String, String> configParams = crawlingConfig.getConfigParameterMap(ConfigName.CONFIG);
|
||||
final String crawlOrder = configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, "sequential");
|
||||
if ("random".equals(crawlOrder)) {
|
||||
final String crawlOrder = configParams.getOrDefault(CrawlingConfig.Param.Config.CRAWL_ORDER, ORDER_SEQUENTIAL);
|
||||
if (ORDER_RANDOM.equals(crawlOrder)) {
|
||||
return getList(OpenSearchUrlQueue.class, sessionId,
|
||||
QueryBuilders.functionScoreQuery(QueryBuilders.matchAllQuery(),
|
||||
new FunctionScoreQueryBuilder.FilterFunctionBuilder[] { new FunctionScoreQueryBuilder.FilterFunctionBuilder(
|
||||
new RandomScoreFunctionBuilder().seed(sessionId.hashCode())) }),
|
||||
0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.ASC));
|
||||
} else if (!"sequential".equals(crawlOrder)) {
|
||||
0, pollingFetchSize, SortBuilders.scoreSort().order(SortOrder.DESC));
|
||||
} else if (!ORDER_SEQUENTIAL.equals(crawlOrder)) {
|
||||
logger.warn("Invalid crawl order specified: {}. Falling back to sequential.", crawlOrder);
|
||||
}
|
||||
return super.fetchUrlQueueList(sessionId);
|
||||
|
|
Loading…
Add table
Reference in a new issue