fix #599 redirect handling in csv crawling
parent dcbc3a29da
commit 2aebe9d65f

2 changed files with 75 additions and 47 deletions
@@ -167,7 +167,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
             final Set<RequestData> childUrlSet = new HashSet<>();
             childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
-            throw new ChildUrlsException(childUrlSet);
+            throw new ChildUrlsException(childUrlSet, this.getClass().getName()
+                    + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
         }
     }
 
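The second constructor argument tags the queued canonical URL with the method that produced it (FessXpathTransformer#putAdditionalData). During CSV list crawling, the consumer of this exception is the catch block added to FileListIndexUpdateCallbackImpl#processRequest in the second file below; a hedged sketch of that consuming side, limited to calls that appear in this commit (the call inside the try is illustrative only):

    // Sketch of the consuming side (mirrors the catch block added below); the call in
    // the try is illustrative -- any transform that may queue child URLs would do.
    try {
        transformer.transform(responseData);
    } catch (final ChildUrlsException e) {
        // Report the queued canonical/child URLs as a redirect instead of dropping the row.
        throw new DataStoreCrawlingException(url, "Redirected to "
                + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
    }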
@@ -24,6 +24,7 @@ import java.util.concurrent.ExecutorService;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
 
 import org.codelibs.core.io.SerializeUtil;
 import org.codelibs.fess.Constants;
@@ -32,6 +33,7 @@ import org.codelibs.fess.crawler.client.CrawlerClient;
 import org.codelibs.fess.crawler.client.CrawlerClientFactory;
 import org.codelibs.fess.crawler.entity.ResponseData;
 import org.codelibs.fess.crawler.entity.ResultData;
+import org.codelibs.fess.crawler.exception.ChildUrlsException;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.processor.ResponseProcessor;
 import org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor;
@@ -59,6 +61,8 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback, Aut
 
     protected int maxDeleteDocumentCacheSize = 100;
 
+    protected int maxRedirectCount = 10;
+
     private final ExecutorService executor;
 
     protected FileListIndexUpdateCallbackImpl(final IndexUpdateCallback indexUpdateCallback,
@@ -113,57 +117,76 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback, Aut
                 return;
             }
 
-            final long startTime = System.currentTimeMillis();
-            try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
-                responseData.setExecutionTime(System.currentTimeMillis() - startTime);
-                if (dataMap.containsKey(Constants.SESSION_ID)) {
-                    responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
-                } else {
-                    responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
+            String processingUrl = url;
+            for (int i = 0; i < maxRedirectCount; i++) {
+                processingUrl = processRequest(paramMap, dataMap, processingUrl, client);
+                if (processingUrl == null) {
+                    break;
                 }
-
-                final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
-                final Rule rule = ruleManager.getRule(responseData);
-                if (rule == null) {
-                    logger.warn("No url rule. Data: " + dataMap);
-                } else {
-                    responseData.setRuleId(rule.getRuleId());
-                    final ResponseProcessor responseProcessor = rule.getResponseProcessor();
-                    if (responseProcessor instanceof DefaultResponseProcessor) {
-                        final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
-                        final ResultData resultData = transformer.transform(responseData);
-                        final byte[] data = resultData.getData();
-                        if (data != null) {
-                            try {
-                                @SuppressWarnings("unchecked")
-                                final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
-                                dataMap.putAll(responseDataMap);
-                            } catch (final Exception e) {
-                                throw new CrawlerSystemException("Could not create an instance from bytes.", e);
-                            }
-                        }
-
-                        // remove
-                        String[] ignoreFields;
-                        if (paramMap.containsKey("ignore.field.names")) {
-                            ignoreFields = paramMap.get("ignore.field.names").split(",");
-                        } else {
-                            ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
-                        }
-                        stream(ignoreFields).of(stream -> stream.map(s -> s.trim()).forEach(s -> dataMap.remove(s)));
-
-                        indexUpdateCallback.store(paramMap, dataMap);
-                    } else {
-                        logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor
-                                + ", Data: " + dataMap);
-                    }
-                }
-            } catch (final Exception e) {
-                throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
+                dataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
             }
         }
     }
 
+    protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url,
+            final CrawlerClient client) {
+        final long startTime = System.currentTimeMillis();
+        try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
+            if (responseData.getRedirectLocation() != null) {
+                return responseData.getRedirectLocation();
+            }
+            responseData.setExecutionTime(System.currentTimeMillis() - startTime);
+            if (dataMap.containsKey(Constants.SESSION_ID)) {
+                responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
+            } else {
+                responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
+            }
+
+            final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
+            final Rule rule = ruleManager.getRule(responseData);
+            if (rule == null) {
+                logger.warn("No url rule. Data: " + dataMap);
+            } else {
+                responseData.setRuleId(rule.getRuleId());
+                final ResponseProcessor responseProcessor = rule.getResponseProcessor();
+                if (responseProcessor instanceof DefaultResponseProcessor) {
+                    final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
+                    final ResultData resultData = transformer.transform(responseData);
+                    final byte[] data = resultData.getData();
+                    if (data != null) {
+                        try {
+                            @SuppressWarnings("unchecked")
+                            final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
+                            dataMap.putAll(responseDataMap);
+                        } catch (final Exception e) {
+                            throw new CrawlerSystemException("Could not create an instance from bytes.", e);
+                        }
+                    }
+
+                    // remove
+                    String[] ignoreFields;
+                    if (paramMap.containsKey("ignore.field.names")) {
+                        ignoreFields = paramMap.get("ignore.field.names").split(",");
+                    } else {
+                        ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
+                    }
+                    stream(ignoreFields).of(stream -> stream.map(s -> s.trim()).forEach(s -> dataMap.remove(s)));
+
+                    indexUpdateCallback.store(paramMap, dataMap);
+                } else {
+                    logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor
+                            + ", Data: " + dataMap);
+                }
+            }
+            return null;
+        } catch (final ChildUrlsException e) {
+            throw new DataStoreCrawlingException(url, "Redirected to "
+                    + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
+        } catch (final Exception e) {
+            throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
+        }
+    }
+
     protected boolean deleteDocument(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
 
         if (logger.isDebugEnabled()) {
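The redirect handling is now split in two: processRequest() returns the redirect target when the response carries one (and null once a response has actually been processed), while addDocument() loops over it at most maxRedirectCount times, rewriting the indexed URL on every hop. The same bounded-redirect pattern as a self-contained sketch against plain java.net.HttpURLConnection rather than the fess-crawler client; the class and method names below are illustrative only.

    import java.net.HttpURLConnection;
    import java.net.URL;

    // Stand-alone illustration of the bounded redirect loop above; it is not fess code
    // and resolves only absolute Location headers, which is enough to show the idea.
    public class BoundedRedirectExample {

        static String resolve(final String startUrl, final int maxRedirectCount) throws Exception {
            String processingUrl = startUrl;
            for (int i = 0; i < maxRedirectCount; i++) {
                final HttpURLConnection con = (HttpURLConnection) new URL(processingUrl).openConnection();
                con.setInstanceFollowRedirects(false); // handle Location headers ourselves
                con.setRequestMethod("HEAD");
                final int status = con.getResponseCode();
                final String location = con.getHeaderField("Location");
                con.disconnect();
                if (status < 300 || status >= 400 || location == null) {
                    return processingUrl; // terminal response, like processRequest() returning null
                }
                processingUrl = location; // follow one hop and try again
            }
            return processingUrl; // budget exhausted; keep the last URL seen
        }

        public static void main(final String[] args) throws Exception {
            System.out.println(resolve("http://example.com/", 10));
        }
    }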
@@ -226,6 +249,10 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback, Aut
         this.maxDeleteDocumentCacheSize = maxDeleteDocumentCacheSize;
     }
 
+    public void setMaxRedirectCount(int maxRedirectCount) {
+        this.maxRedirectCount = maxRedirectCount;
+    }
+
     @Override
     public void close() throws Exception {
         try {
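maxRedirectCount defaults to 10 and can now be overridden through the setter above by whatever code wires up FileListIndexUpdateCallbackImpl (the wiring itself is outside this diff). A toy, self-contained demonstration of the same default-plus-setter pattern; the class below is illustrative and not part of fess.

    // Toy illustration of the configuration pattern introduced above: a redirect budget
    // with a default of 10 that a caller can raise through a setter before crawling.
    public class RedirectBudgetDemo {

        protected int maxRedirectCount = 10; // same default as in FileListIndexUpdateCallbackImpl

        public void setMaxRedirectCount(final int maxRedirectCount) {
            this.maxRedirectCount = maxRedirectCount;
        }

        // Follows a redirect chain of the given length, stopping when the budget runs out.
        public int follow(final int chainLength) {
            int hops = 0;
            while (hops < chainLength && hops < maxRedirectCount) {
                hops++;
            }
            return hops;
        }

        public static void main(final String[] args) {
            final RedirectBudgetDemo demo = new RedirectBudgetDemo();
            System.out.println(demo.follow(15)); // prints 10: capped by the default budget
            demo.setMaxRedirectCount(20);
            System.out.println(demo.follow(15)); // prints 15: the chain ends before the new cap
        }
    }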