fix #727 add processRequest to DocumentHelper

Shinsuke Sugaya 2016-10-03 23:24:31 +09:00
parent a834890f24
commit 37e18d172e
4 changed files with 79 additions and 4 deletions

CsvDataStoreImpl.java

@@ -194,6 +194,7 @@ public class CsvDataStoreImpl extends AbstractDataStoreImpl {
             resultMap.putAll(paramMap);
             resultMap.put("csvfile", csvFile.getAbsolutePath());
             resultMap.put("csvfilename", csvFile.getName());
+            resultMap.put("crawlingConfig", dataConfig);
             boolean foundValues = false;
             for (int i = 0; i < list.size(); i++) {
                 String key = null;
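
Note: the new resultMap entry (here and in the matching EsDataStoreImpl hunk below) exposes the data store configuration to the field mappings evaluated against resultMap. A minimal sketch of reading it back, with variable names assumed rather than taken from this commit:

    // Sketch: inside a field mapping evaluated against resultMap,
    // "crawlingConfig" now resolves to the DataConfig driving this crawl.
    final DataConfig config = (DataConfig) resultMap.get("crawlingConfig");

Since DataConfig implements CrawlingConfig, the same value can be passed straight to the new DocumentHelper#processRequest shown in the last file.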

DatabaseDataStoreImpl.java

@@ -109,7 +109,7 @@ public class DatabaseDataStoreImpl extends AbstractDataStoreImpl {
             final Map<String, Object> dataMap = new HashMap<>();
             dataMap.putAll(defaultDataMap);
             for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
-                final Object convertValue = convertValue(entry.getValue(), rs, paramMap);
+                final Object convertValue = convertValue(config, entry.getValue(), rs, paramMap);
                 if (convertValue != null) {
                     dataMap.put(entry.getKey(), convertValue);
                 }
@@ -189,15 +189,16 @@ public class DatabaseDataStoreImpl extends AbstractDataStoreImpl {
         }
     }
 
-    protected Object convertValue(final String template, final ResultSet rs, final Map<String, String> paramMap) {
-        return convertValue(template, new ResultSetParamMap(rs, paramMap));
+    protected Object convertValue(final DataConfig config, final String template, final ResultSet rs, final Map<String, String> paramMap) {
+        return convertValue(template, new ResultSetParamMap(config, rs, paramMap));
     }
 
     protected static class ResultSetParamMap implements Map<String, Object> {
         private final Map<String, Object> paramMap = new HashMap<>();
 
-        public ResultSetParamMap(final ResultSet resultSet, final Map<String, String> paramMap) {
+        public ResultSetParamMap(final DataConfig config, final ResultSet resultSet, final Map<String, String> paramMap) {
             this.paramMap.putAll(paramMap);
+            this.paramMap.put("crawlingConfig", config);
 
             try {
                 final ResultSetMetaData metaData = resultSet.getMetaData();
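
Note: with the extra constructor parameter, every value template the database data store evaluates against a row now sees the configuration alongside the row's columns. A minimal sketch, assuming dataConfig, resultSet, and paramMap are in scope:

    // Sketch: the context handed to convertValue(template, ...) now carries
    // the config in addition to the row's column values.
    final ResultSetParamMap context = new ResultSetParamMap(dataConfig, resultSet, paramMap);
    final Object config = context.get("crawlingConfig"); // the DataConfig passed in
    final Object title = context.get("title");           // a column value (column name assumed)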

EsDataStoreImpl.java

@@ -162,6 +162,7 @@ public class EsDataStoreImpl extends AbstractDataStoreImpl {
             resultMap.put("version", Long.valueOf(hit.getVersion()));
             resultMap.put("hit", hit);
             resultMap.put("source", hit.getSource());
+            resultMap.put("crawlingConfig", dataConfig);
             if (logger.isDebugEnabled()) {
                 for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {

DocumentHelper.java

@@ -18,14 +18,32 @@ package org.codelibs.fess.helper;
 import java.io.IOException;
 import java.io.Reader;
 import java.io.StringReader;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.commons.lang3.StringUtils;
+import org.codelibs.core.io.SerializeUtil;
 import org.codelibs.core.lang.StringUtil;
+import org.codelibs.fess.crawler.builder.RequestDataBuilder;
+import org.codelibs.fess.crawler.client.CrawlerClient;
+import org.codelibs.fess.crawler.client.CrawlerClientFactory;
+import org.codelibs.fess.crawler.entity.RequestData;
 import org.codelibs.fess.crawler.entity.ResponseData;
+import org.codelibs.fess.crawler.entity.ResultData;
+import org.codelibs.fess.crawler.exception.ChildUrlsException;
+import org.codelibs.fess.crawler.exception.CrawlerSystemException;
+import org.codelibs.fess.crawler.exception.CrawlingAccessException;
+import org.codelibs.fess.crawler.processor.ResponseProcessor;
+import org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor;
+import org.codelibs.fess.crawler.rule.Rule;
+import org.codelibs.fess.crawler.rule.RuleManager;
+import org.codelibs.fess.crawler.transformer.Transformer;
 import org.codelibs.fess.crawler.util.TextUtil;
+import org.codelibs.fess.es.config.exentity.CrawlingConfig;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
+import org.lastaflute.di.core.SingletonLaContainer;
 
 public class DocumentHelper {
     public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
@@ -85,4 +103,58 @@ public class DocumentHelper {
             return StringUtil.EMPTY; // empty
         }
     }
+
+    public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String sessionId, final String url) {
+        if (StringUtil.isBlank(sessionId)) {
+            throw new CrawlingAccessException("sessionId is null.");
+        }
+
+        final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getCrawlerClientFactory();
+        crawlingConfig.initializeClientFactory(crawlerClientFactory);
+        final CrawlerClient client = crawlerClientFactory.getClient(url);
+        if (client == null) {
+            throw new CrawlingAccessException("CrawlerClient is null for " + url);
+        }
+
+        final long startTime = System.currentTimeMillis();
+        try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
+            if (responseData.getRedirectLocation() != null) {
+                Set<RequestData> childUrlList = new HashSet<>();
+                childUrlList.add(RequestDataBuilder.newRequestData().get().url(responseData.getRedirectLocation()).build());
+                throw new ChildUrlsException(childUrlList, "Redirected from " + url);
+            }
+
+            responseData.setExecutionTime(System.currentTimeMillis() - startTime);
+            responseData.setSessionId(sessionId);
+
+            final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
+            final Rule rule = ruleManager.getRule(responseData);
+            if (rule == null) {
+                throw new CrawlingAccessException("No url rule for " + url);
+            } else {
+                responseData.setRuleId(rule.getRuleId());
+                final ResponseProcessor responseProcessor = rule.getResponseProcessor();
+                if (responseProcessor instanceof DefaultResponseProcessor) {
+                    final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
+                    final ResultData resultData = transformer.transform(responseData);
+                    final byte[] data = resultData.getData();
+                    if (data != null) {
+                        try {
+                            @SuppressWarnings("unchecked")
+                            final Map<String, Object> result = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
+                            return result;
+                        } catch (final Exception e) {
+                            throw new CrawlerSystemException("Could not create an instance from bytes.", e);
+                        }
+                    }
+                } else {
+                    throw new CrawlingAccessException("The response processor is not DefaultResponseProcessor. responseProcessor: "
+                            + responseProcessor + ", url: " + url);
+                }
+            }
+            return null;
+        } catch (final Exception e) {
+            throw new CrawlingAccessException("Failed to parse " + url, e);
+        }
+    }
 }
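
Note: taken together, these changes let a data store mapping hand a row's URL plus the exposed configuration to the new helper and index the fully crawled document. A hypothetical usage sketch; the component lookup and the dataConfig, sessionId, url, and dataMap variables are assumptions, not part of this commit:

    // Hypothetical caller of the new method; assumes the variables above are in scope.
    final DocumentHelper documentHelper = SingletonLaContainer.getComponent(DocumentHelper.class);
    final Map<String, Object> parsed = documentHelper.processRequest(dataConfig, sessionId, url);
    if (parsed != null) {
        dataMap.putAll(parsed); // merge crawled fields (e.g. title, content) into the document
    }
    // A redirect surfaces as ChildUrlsException; other failures as CrawlingAccessException.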