Przeglądaj źródła

fix #599 redirect handling in csv crawling

Shinsuke Sugaya 9 lat temu
rodzic
commit
2aebe9d65f

+ 2 - 1
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -167,7 +167,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
             if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
                 final Set<RequestData> childUrlSet = new HashSet<>();
                 childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
-                throw new ChildUrlsException(childUrlSet);
+                throw new ChildUrlsException(childUrlSet, this.getClass().getName()
+                        + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
             }
         }
 

+ 66 - 39
src/main/java/org/codelibs/fess/ds/impl/FileListIndexUpdateCallbackImpl.java

@@ -24,6 +24,7 @@ import java.util.concurrent.ExecutorService;
 import java.util.concurrent.LinkedBlockingQueue;
 import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
 
 import org.codelibs.core.io.SerializeUtil;
 import org.codelibs.fess.Constants;
@@ -32,6 +33,7 @@ import org.codelibs.fess.crawler.client.CrawlerClient;
 import org.codelibs.fess.crawler.client.CrawlerClientFactory;
 import org.codelibs.fess.crawler.entity.ResponseData;
 import org.codelibs.fess.crawler.entity.ResultData;
+import org.codelibs.fess.crawler.exception.ChildUrlsException;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.processor.ResponseProcessor;
 import org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor;
@@ -59,6 +61,8 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback, Aut
 
     protected int maxDeleteDocumentCacheSize = 100;
 
+    protected int maxRedirectCount = 10;
+
     private final ExecutorService executor;
 
     protected FileListIndexUpdateCallbackImpl(final IndexUpdateCallback indexUpdateCallback,
@@ -113,54 +117,73 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback, Aut
                 return;
             }
 
-            final long startTime = System.currentTimeMillis();
-            try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
-                responseData.setExecutionTime(System.currentTimeMillis() - startTime);
-                if (dataMap.containsKey(Constants.SESSION_ID)) {
-                    responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
-                } else {
-                    responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
+            String processingUrl = url;
+            for (int i = 0; i < maxRedirectCount; i++) {
+                processingUrl = processRequest(paramMap, dataMap, processingUrl, client);
+                if (processingUrl == null) {
+                    break;
                 }
+                dataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
+            }
+        }
+    }
 
-                final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
-                final Rule rule = ruleManager.getRule(responseData);
-                if (rule == null) {
-                    logger.warn("No url rule. Data: " + dataMap);
-                } else {
-                    responseData.setRuleId(rule.getRuleId());
-                    final ResponseProcessor responseProcessor = rule.getResponseProcessor();
-                    if (responseProcessor instanceof DefaultResponseProcessor) {
-                        final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
-                        final ResultData resultData = transformer.transform(responseData);
-                        final byte[] data = resultData.getData();
-                        if (data != null) {
-                            try {
-                                @SuppressWarnings("unchecked")
-                                final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
-                                dataMap.putAll(responseDataMap);
-                            } catch (final Exception e) {
-                                throw new CrawlerSystemException("Could not create an instance from bytes.", e);
-                            }
-                        }
+    protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url,
+            final CrawlerClient client) {
+        final long startTime = System.currentTimeMillis();
+        try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
+            if (responseData.getRedirectLocation() != null) {
+                return responseData.getRedirectLocation();
+            }
+            responseData.setExecutionTime(System.currentTimeMillis() - startTime);
+            if (dataMap.containsKey(Constants.SESSION_ID)) {
+                responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
+            } else {
+                responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
+            }
 
-                        // remove
-                        String[] ignoreFields;
-                        if (paramMap.containsKey("ignore.field.names")) {
-                            ignoreFields = paramMap.get("ignore.field.names").split(",");
-                        } else {
-                            ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
+            final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
+            final Rule rule = ruleManager.getRule(responseData);
+            if (rule == null) {
+                logger.warn("No url rule. Data: " + dataMap);
+            } else {
+                responseData.setRuleId(rule.getRuleId());
+                final ResponseProcessor responseProcessor = rule.getResponseProcessor();
+                if (responseProcessor instanceof DefaultResponseProcessor) {
+                    final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
+                    final ResultData resultData = transformer.transform(responseData);
+                    final byte[] data = resultData.getData();
+                    if (data != null) {
+                        try {
+                            @SuppressWarnings("unchecked")
+                            final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
+                            dataMap.putAll(responseDataMap);
+                        } catch (final Exception e) {
+                            throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                         }
-                        stream(ignoreFields).of(stream -> stream.map(s -> s.trim()).forEach(s -> dataMap.remove(s)));
+                    }
 
-                        indexUpdateCallback.store(paramMap, dataMap);
+                    // remove
+                    String[] ignoreFields;
+                    if (paramMap.containsKey("ignore.field.names")) {
+                        ignoreFields = paramMap.get("ignore.field.names").split(",");
                     } else {
-                        logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor
-                                + ", Data: " + dataMap);
+                        ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                     }
+                    stream(ignoreFields).of(stream -> stream.map(s -> s.trim()).forEach(s -> dataMap.remove(s)));
+
+                    indexUpdateCallback.store(paramMap, dataMap);
+                } else {
+                    logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor
+                            + ", Data: " + dataMap);
                 }
-            } catch (final Exception e) {
-                throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
             }
+            return null;
+        } catch (final ChildUrlsException e) {
+            throw new DataStoreCrawlingException(url, "Redirected to "
+                    + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
+        } catch (final Exception e) {
+            throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
         }
     }
 
@@ -226,6 +249,10 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback, Aut
         this.maxDeleteDocumentCacheSize = maxDeleteDocumentCacheSize;
     }
 
+    public void setMaxRedirectCount(int maxRedirectCount) {
+        this.maxRedirectCount = maxRedirectCount;
+    }
+
     @Override
     public void close() throws Exception {
         try {