This commit is contained in:
Shinsuke Sugaya 2014-02-22 23:50:26 +09:00
parent a637c889f8
commit 29e7c2a34c
5 changed files with 141 additions and 6 deletions

View file

@ -16,9 +16,16 @@
package jp.sf.fess.db.exentity;
import java.util.Map;
import org.seasar.robot.client.S2RobotClientFactory;
public interface CrawlingConfig {
public static final String XPATH_PREFIX = "field.xpath.";
public static final String SCRIPT_PREFIX = "field.script.";
public static final String CLIENT_PREFIX = "client.";
Long getId();
@ -38,6 +45,8 @@ public interface CrawlingConfig {
void initializeClientFactory(S2RobotClientFactory s2RobotClientFactory);
Map<String, String> getConfigParameterMap(ConfigName name);
public enum ConfigType {
WEB("W"), FILE("F"), DATA("D");
@ -59,4 +68,7 @@ public interface CrawlingConfig {
}
}
/**
 * Categories of configuration parameters that can be requested through
 * {@code getConfigParameterMap(ConfigName)}.
 */
public enum ConfigName {
    /** Parameters whose keys carry the {@code CLIENT_PREFIX} prefix. */
    CLIENT,

    /** Parameters whose keys carry the {@code XPATH_PREFIX} prefix. */
    XPATH,

    /** Parameters whose keys carry the {@code SCRIPT_PREFIX} prefix. */
    SCRIPT
}
}

View file

@ -18,6 +18,7 @@ package jp.sf.fess.db.exentity;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -383,4 +384,9 @@ public class DataCrawlingConfig extends BsDataCrawlingConfig implements
}
}
/**
 * Returns the configuration parameters for the given category.
 * <p>
 * This implementation does not inspect {@code name}; it always answers
 * with the shared immutable empty map.
 *
 * @param name the requested category (ignored)
 * @return an immutable empty map, never {@code null}
 */
@Override
public Map<String, String> getConfigParameterMap(final ConfigName name) {
    return Collections.<String, String> emptyMap();
}
}

View file

@ -18,6 +18,7 @@ package jp.sf.fess.db.exentity;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -60,6 +61,8 @@ public class FileCrawlingConfig extends BsFileCrawlingConfig implements
protected volatile Pattern[] excludedDocPathPatterns;
protected volatile Map<ConfigName, Map<String, String>> configParameterMap;
public FileCrawlingConfig() {
super();
setBoost(BigDecimal.ONE);
@ -235,9 +238,10 @@ public class FileCrawlingConfig extends BsFileCrawlingConfig implements
// Parameters
final Map<String, Object> paramMap = new HashMap<String, Object>();
clientFactory.setInitParameterMap(paramMap);
final String configParam = getConfigParameter();
if (StringUtil.isNotBlank(configParam)) {
ParameterUtil.loadConfigParams(paramMap, configParam);
final Map<String, String> clientConfigMap = getConfigParameterMap(ConfigName.CLIENT);
if (clientConfigMap != null) {
paramMap.putAll(clientConfigMap);
}
// auth params
@ -262,4 +266,39 @@ public class FileCrawlingConfig extends BsFileCrawlingConfig implements
smbAuthList.toArray(new SmbAuthentication[smbAuthList.size()]));
}
/**
 * Returns the configuration parameters that belong to the given category.
 * <p>
 * On first use the string returned by {@code getConfigParameter()} is parsed
 * with {@code ParameterUtil.parse} and every entry is sorted into one of
 * three maps according to its key prefix ({@code CLIENT_PREFIX},
 * {@code XPATH_PREFIX} or {@code SCRIPT_PREFIX}). The prefix is stripped
 * from the stored key; entries matching none of the prefixes are dropped.
 * The grouped result is cached in {@code configParameterMap}.
 *
 * @param name the category to look up
 * @return the parameters registered for {@code name}, or an empty map when
 *         the category has no entry; never {@code null}
 */
@Override
public Map<String, String> getConfigParameterMap(final ConfigName name) {
    // Read the volatile field exactly once; everything below works on this
    // snapshot so the lookup cannot observe a different value than the
    // null check did.
    Map<ConfigName, Map<String, String>> groupedParams = configParameterMap;
    if (groupedParams == null) {
        final Map<String, String> clientConfigMap = new HashMap<>();
        final Map<String, String> xpathConfigMap = new HashMap<>();
        final Map<String, String> scriptConfigMap = new HashMap<>();
        for (final Map.Entry<String, String> entry : ParameterUtil.parse(
                getConfigParameter()).entrySet()) {
            final String key = entry.getKey();
            if (key.startsWith(CLIENT_PREFIX)) {
                clientConfigMap.put(key.substring(CLIENT_PREFIX.length()),
                        entry.getValue());
            } else if (key.startsWith(XPATH_PREFIX)) {
                xpathConfigMap.put(key.substring(XPATH_PREFIX.length()),
                        entry.getValue());
            } else if (key.startsWith(SCRIPT_PREFIX)) {
                scriptConfigMap.put(key.substring(SCRIPT_PREFIX.length()),
                        entry.getValue());
            }
        }

        final Map<ConfigName, Map<String, String>> map = new HashMap<>();
        map.put(ConfigName.CLIENT, clientConfigMap);
        map.put(ConfigName.XPATH, xpathConfigMap);
        map.put(ConfigName.SCRIPT, scriptConfigMap);

        // Publish only after the map is fully populated.
        configParameterMap = map;
        groupedParams = map;
    }

    final Map<String, String> configMap = groupedParams.get(name);
    if (configMap == null) {
        // Reached when name has no entry (e.g. a null name).
        return Collections.emptyMap();
    }
    return configMap;
}
}

View file

@ -18,6 +18,7 @@ package jp.sf.fess.db.exentity;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -59,6 +60,8 @@ public class WebCrawlingConfig extends BsWebCrawlingConfig implements
protected volatile Pattern[] excludedDocUrlPatterns;
protected volatile Map<ConfigName, Map<String, String>> configParameterMap;
public WebCrawlingConfig() {
super();
setBoost(BigDecimal.ONE);
@ -242,9 +245,9 @@ public class WebCrawlingConfig extends BsWebCrawlingConfig implements
final Map<String, Object> paramMap = new HashMap<String, Object>();
clientFactory.setInitParameterMap(paramMap);
final String configParam = getConfigParameter();
if (StringUtil.isNotBlank(configParam)) {
ParameterUtil.loadConfigParams(paramMap, configParam);
final Map<String, String> clientConfigMap = getConfigParameterMap(ConfigName.CLIENT);
if (clientConfigMap != null) {
paramMap.putAll(clientConfigMap);
}
final String userAgent = getUserAgent();
@ -274,4 +277,38 @@ public class WebCrawlingConfig extends BsWebCrawlingConfig implements
}
/**
 * Returns the configuration parameters that belong to the given category.
 * <p>
 * On first use the string returned by {@code getConfigParameter()} is parsed
 * with {@code ParameterUtil.parse} and every entry is sorted into one of
 * three maps according to its key prefix ({@code CLIENT_PREFIX},
 * {@code XPATH_PREFIX} or {@code SCRIPT_PREFIX}). The prefix is stripped
 * from the stored key; entries matching none of the prefixes are dropped.
 * The grouped result is cached in {@code configParameterMap}.
 *
 * @param name the category to look up
 * @return the parameters registered for {@code name}, or an empty map when
 *         the category has no entry; never {@code null}
 */
@Override
public Map<String, String> getConfigParameterMap(final ConfigName name) {
    // Read the volatile field exactly once; everything below works on this
    // snapshot so the lookup cannot observe a different value than the
    // null check did.
    Map<ConfigName, Map<String, String>> groupedParams = configParameterMap;
    if (groupedParams == null) {
        final Map<String, String> clientConfigMap = new HashMap<>();
        final Map<String, String> xpathConfigMap = new HashMap<>();
        final Map<String, String> scriptConfigMap = new HashMap<>();
        for (final Map.Entry<String, String> entry : ParameterUtil.parse(
                getConfigParameter()).entrySet()) {
            final String key = entry.getKey();
            if (key.startsWith(CLIENT_PREFIX)) {
                clientConfigMap.put(key.substring(CLIENT_PREFIX.length()),
                        entry.getValue());
            } else if (key.startsWith(XPATH_PREFIX)) {
                xpathConfigMap.put(key.substring(XPATH_PREFIX.length()),
                        entry.getValue());
            } else if (key.startsWith(SCRIPT_PREFIX)) {
                scriptConfigMap.put(key.substring(SCRIPT_PREFIX.length()),
                        entry.getValue());
            }
        }

        final Map<ConfigName, Map<String, String>> map = new HashMap<>();
        map.put(ConfigName.CLIENT, clientConfigMap);
        map.put(ConfigName.XPATH, xpathConfigMap);
        map.put(ConfigName.SCRIPT, scriptConfigMap);

        // Publish only after the map is fully populated.
        configParameterMap = map;
        groupedParams = map;
    }

    final Map<String, String> configMap = groupedParams.get(name);
    if (configMap == null) {
        // Reached when name has no entry (e.g. a null name).
        return Collections.emptyMap();
    }
    return configMap;
}
}

View file

@ -38,6 +38,7 @@ import javax.xml.transform.TransformerException;
import jp.sf.fess.Constants;
import jp.sf.fess.db.exentity.CrawlingConfig;
import jp.sf.fess.db.exentity.CrawlingConfig.ConfigName;
import jp.sf.fess.helper.CrawlingConfigHelper;
import jp.sf.fess.helper.CrawlingSessionHelper;
import jp.sf.fess.helper.FileTypeHelper;
@ -52,6 +53,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.xpath.objects.XObject;
import org.cyberneko.html.parsers.DOMParser;
import org.seasar.framework.util.OgnlUtil;
import org.seasar.framework.util.SerializeUtil;
import org.seasar.framework.util.StringUtil;
import org.seasar.robot.RobotCrawlAccessException;
@ -331,6 +333,45 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
putResultDataBody(dataMap, "url", url); // set again
}
// from config
final Map<String, String> xpathConfigMap = crawlingConfig
.getConfigParameterMap(ConfigName.XPATH);
final Map<String, String> scriptConfigMap = crawlingConfig
.getConfigParameterMap(ConfigName.SCRIPT);
for (final Map.Entry<String, String> entry : xpathConfigMap.entrySet()) {
String value = getSingleNodeValue(document, entry.getValue(), true);
final String key = entry.getKey();
final String template = scriptConfigMap.get(key);
if (template != null) {
final Map<String, Object> paramMap = new HashMap<>(
dataMap.size());
paramMap.putAll(dataMap);
paramMap.put("value", value);
value = convertValue(template, paramMap);
}
if (value != null) {
putResultDataBody(dataMap, key, value);
}
}
}
/**
 * Evaluates {@code template} as an expression via {@code OgnlUtil} and
 * returns the result as a string.
 *
 * @param template the expression text; when empty according to
 *        {@code StringUtil.isEmpty} no evaluation takes place
 * @param paramMap the object passed to {@code OgnlUtil.getValue} together
 *        with the parsed expression
 * @return {@code Constants.EMPTY_STRING} for an empty template; otherwise
 *         the {@code toString()} of the evaluated value, or {@code null}
 *         when the value is {@code null} or evaluation throws (the failure
 *         is logged at warn level)
 */
protected String convertValue(final String template,
        final Map<String, Object> paramMap) {
    if (StringUtil.isEmpty(template)) {
        return Constants.EMPTY_STRING;
    }

    try {
        final Object evaluated = OgnlUtil.getValue(
                OgnlUtil.parseExpression(template), paramMap);
        return evaluated == null ? null : evaluated.toString();
    } catch (final Exception e) {
        // Best effort: a broken template must not abort the transformation.
        logger.warn("Invalid value format: " + template, e);
        return null;
    }
}
protected String getCanonicalUrl(final ResponseData responseData,