Web crawl: URL-encode included/excluded URL filters, with a #DISABLE_URL_ENCODE directive to skip encoding for the next filter line (#2678)

This commit is contained in:
jasongwq 2022-09-03 08:52:18 +08:00 committed by GitHub
parent 9fa9e17056
commit 63da35d257
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -162,22 +162,42 @@ public class WebFsIndexHelper {
}));
// set included urls
split(includedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(urlValue -> {
if (!urlValue.startsWith("#")) {
final AtomicBoolean urlEncodeDisabled = new AtomicBoolean(false);
split(includedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(line -> {
if (!line.startsWith("#")) {
final String urlValue;
if (urlEncodeDisabled.get()) {
urlValue = line;
urlEncodeDisabled.set(false);
} else {
urlValue = systemHelper.encodeUrlFilter(line);
}
crawler.addIncludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Included URL: {}", urlValue);
}
} else if (line.startsWith("#DISABLE_URL_ENCODE")) {
urlEncodeDisabled.set(true);
}
}));
// set excluded urls
split(excludedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(urlValue -> {
if (!urlValue.startsWith("#")) {
urlEncodeDisabled.set(false);
split(excludedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(line -> {
if (!line.startsWith("#")) {
final String urlValue;
if (urlEncodeDisabled.get()) {
urlValue = line;
urlEncodeDisabled.set(false);
} else {
urlValue = systemHelper.encodeUrlFilter(line);
}
crawler.addExcludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Excluded URL: {}", urlValue);
}
} else if (line.startsWith("#DISABLE_URL_ENCODE")) {
urlEncodeDisabled.set(true);
}
}));