|
@@ -18,14 +18,32 @@ package org.codelibs.fess.helper;
|
|
|
import java.io.IOException;
|
|
|
import java.io.Reader;
|
|
|
import java.io.StringReader;
|
|
|
+import java.util.HashSet;
|
|
|
import java.util.Map;
|
|
|
+import java.util.Set;
|
|
|
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
+import org.codelibs.core.io.SerializeUtil;
|
|
|
import org.codelibs.core.lang.StringUtil;
|
|
|
+import org.codelibs.fess.crawler.builder.RequestDataBuilder;
|
|
|
+import org.codelibs.fess.crawler.client.CrawlerClient;
|
|
|
+import org.codelibs.fess.crawler.client.CrawlerClientFactory;
|
|
|
+import org.codelibs.fess.crawler.entity.RequestData;
|
|
|
import org.codelibs.fess.crawler.entity.ResponseData;
|
|
|
+import org.codelibs.fess.crawler.entity.ResultData;
|
|
|
+import org.codelibs.fess.crawler.exception.ChildUrlsException;
|
|
|
+import org.codelibs.fess.crawler.exception.CrawlerSystemException;
|
|
|
+import org.codelibs.fess.crawler.exception.CrawlingAccessException;
|
|
|
+import org.codelibs.fess.crawler.processor.ResponseProcessor;
|
|
|
+import org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor;
|
|
|
+import org.codelibs.fess.crawler.rule.Rule;
|
|
|
+import org.codelibs.fess.crawler.rule.RuleManager;
|
|
|
+import org.codelibs.fess.crawler.transformer.Transformer;
|
|
|
import org.codelibs.fess.crawler.util.TextUtil;
|
|
|
+import org.codelibs.fess.es.config.exentity.CrawlingConfig;
|
|
|
import org.codelibs.fess.mylasta.direction.FessConfig;
|
|
|
import org.codelibs.fess.util.ComponentUtil;
|
|
|
+import org.lastaflute.di.core.SingletonLaContainer;
|
|
|
|
|
|
public class DocumentHelper {
|
|
|
public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
|
|
@@ -85,4 +103,58 @@ public class DocumentHelper {
|
|
|
return StringUtil.EMPTY; // empty
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String sessionId, final String url) {
|
|
|
+ if (StringUtil.isBlank(sessionId)) {
|
|
|
+ throw new CrawlingAccessException("sessionId is null.");
|
|
|
+ }
|
|
|
+
|
|
|
+ final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getCrawlerClientFactory();
|
|
|
+ crawlingConfig.initializeClientFactory(crawlerClientFactory);
|
|
|
+ final CrawlerClient client = crawlerClientFactory.getClient(url);
|
|
|
+ if (client == null) {
|
|
|
+ throw new CrawlingAccessException("CrawlerClient is null for " + url);
|
|
|
+ }
|
|
|
+
|
|
|
+ final long startTime = System.currentTimeMillis();
|
|
|
+ try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
|
|
|
+ if (responseData.getRedirectLocation() != null) {
|
|
|
+ Set<RequestData> childUrlList = new HashSet<>();
|
|
|
+ childUrlList.add(RequestDataBuilder.newRequestData().get().url(responseData.getRedirectLocation()).build());
|
|
|
+ throw new ChildUrlsException(childUrlList, "Redirected from " + url);
|
|
|
+ }
|
|
|
+ responseData.setExecutionTime(System.currentTimeMillis() - startTime);
|
|
|
+ responseData.setSessionId(sessionId);
|
|
|
+
|
|
|
+ final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
|
|
|
+ final Rule rule = ruleManager.getRule(responseData);
|
|
|
+ if (rule == null) {
|
|
|
+ throw new CrawlingAccessException("No url rule for " + url);
|
|
|
+ } else {
|
|
|
+ responseData.setRuleId(rule.getRuleId());
|
|
|
+ final ResponseProcessor responseProcessor = rule.getResponseProcessor();
|
|
|
+ if (responseProcessor instanceof DefaultResponseProcessor) {
|
|
|
+ final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
|
|
|
+ final ResultData resultData = transformer.transform(responseData);
|
|
|
+ final byte[] data = resultData.getData();
|
|
|
+ if (data != null) {
|
|
|
+ try {
|
|
|
+ @SuppressWarnings("unchecked")
|
|
|
+ final Map<String, Object> result = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
|
|
|
+ return result;
|
|
|
+ } catch (final Exception e) {
|
|
|
+ throw new CrawlerSystemException("Could not create an instance from bytes.", e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ } else {
|
|
|
+ throw new CrawlingAccessException("The response processor is not DefaultResponseProcessor. responseProcessor: "
|
|
|
+ + responseProcessor + ", url: " + url);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return null;
|
|
|
+ } catch (final Exception e) {
|
|
|
+ throw new CrawlingAccessException("Failed to parse " + url, e);
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
}
|