WebCrawl: support URL-encoding of include/exclude filter URLs, with a #DISABLE_URL_ENCODE directive to disable it (#2678)
This commit is contained in:
parent
9fa9e17056
commit
63da35d257
1 changed file with 24 additions and 4 deletions
|
@ -162,22 +162,42 @@ public class WebFsIndexHelper {
|
|||
}));
|
||||
|
||||
// set included urls
|
||||
split(includedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(urlValue -> {
|
||||
if (!urlValue.startsWith("#")) {
|
||||
final AtomicBoolean urlEncodeDisabled = new AtomicBoolean(false);
|
||||
split(includedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(line -> {
|
||||
if (!line.startsWith("#")) {
|
||||
final String urlValue;
|
||||
if (urlEncodeDisabled.get()) {
|
||||
urlValue = line;
|
||||
urlEncodeDisabled.set(false);
|
||||
} else {
|
||||
urlValue = systemHelper.encodeUrlFilter(line);
|
||||
}
|
||||
crawler.addIncludeFilter(urlValue);
|
||||
if (logger.isInfoEnabled()) {
|
||||
logger.info("Included URL: {}", urlValue);
|
||||
}
|
||||
} else if (line.startsWith("#DISABLE_URL_ENCODE")) {
|
||||
urlEncodeDisabled.set(true);
|
||||
}
|
||||
}));
|
||||
|
||||
// set excluded urls
|
||||
split(excludedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(urlValue -> {
|
||||
if (!urlValue.startsWith("#")) {
|
||||
urlEncodeDisabled.set(false);
|
||||
split(excludedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(line -> {
|
||||
if (!line.startsWith("#")) {
|
||||
final String urlValue;
|
||||
if (urlEncodeDisabled.get()) {
|
||||
urlValue = line;
|
||||
urlEncodeDisabled.set(false);
|
||||
} else {
|
||||
urlValue = systemHelper.encodeUrlFilter(line);
|
||||
}
|
||||
crawler.addExcludeFilter(urlValue);
|
||||
if (logger.isInfoEnabled()) {
|
||||
logger.info("Excluded URL: {}", urlValue);
|
||||
}
|
||||
} else if (line.startsWith("#DISABLE_URL_ENCODE")) {
|
||||
urlEncodeDisabled.set(true);
|
||||
}
|
||||
}));
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue