diff --git a/src/main/java/jp/sf/fess/db/exentity/CrawlingConfig.java b/src/main/java/jp/sf/fess/db/exentity/CrawlingConfig.java index 347ae285c..8a07d3e57 100644 --- a/src/main/java/jp/sf/fess/db/exentity/CrawlingConfig.java +++ b/src/main/java/jp/sf/fess/db/exentity/CrawlingConfig.java @@ -16,9 +16,16 @@ package jp.sf.fess.db.exentity; +import java.util.Map; + import org.seasar.robot.client.S2RobotClientFactory; public interface CrawlingConfig { + public static final String XPATH_PREFIX = "field.xpath."; + + public static final String SCRIPT_PREFIX = "field.script."; + + public static final String CLIENT_PREFIX = "client."; Long getId(); @@ -38,6 +45,8 @@ public interface CrawlingConfig { void initializeClientFactory(S2RobotClientFactory s2RobotClientFactory); + Map getConfigParameterMap(ConfigName name); + public enum ConfigType { WEB("W"), FILE("F"), DATA("D"); @@ -59,4 +68,7 @@ public interface CrawlingConfig { } } + public enum ConfigName { + CLIENT, XPATH, SCRIPT; + } } \ No newline at end of file diff --git a/src/main/java/jp/sf/fess/db/exentity/DataCrawlingConfig.java b/src/main/java/jp/sf/fess/db/exentity/DataCrawlingConfig.java index 8d230d914..4e7af60d2 100644 --- a/src/main/java/jp/sf/fess/db/exentity/DataCrawlingConfig.java +++ b/src/main/java/jp/sf/fess/db/exentity/DataCrawlingConfig.java @@ -18,6 +18,7 @@ package jp.sf.fess.db.exentity; import java.math.BigDecimal; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -383,4 +384,9 @@ public class DataCrawlingConfig extends BsDataCrawlingConfig implements } } + + @Override + public Map getConfigParameterMap(final ConfigName name) { + return Collections.emptyMap(); + } } diff --git a/src/main/java/jp/sf/fess/db/exentity/FileCrawlingConfig.java b/src/main/java/jp/sf/fess/db/exentity/FileCrawlingConfig.java index 4f7d21f5c..ea0370a6d 100644 --- a/src/main/java/jp/sf/fess/db/exentity/FileCrawlingConfig.java +++ b/src/main/java/jp/sf/fess/db/exentity/FileCrawlingConfig.java @@ -18,6 +18,7 @@ package jp.sf.fess.db.exentity; import java.math.BigDecimal; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -60,6 +61,8 @@ public class FileCrawlingConfig extends BsFileCrawlingConfig implements protected volatile Pattern[] excludedDocPathPatterns; + protected volatile Map> configParameterMap; + public FileCrawlingConfig() { super(); setBoost(BigDecimal.ONE); @@ -235,9 +238,10 @@ public class FileCrawlingConfig extends BsFileCrawlingConfig implements // Parameters final Map paramMap = new HashMap(); clientFactory.setInitParameterMap(paramMap); - final String configParam = getConfigParameter(); - if (StringUtil.isNotBlank(configParam)) { - ParameterUtil.loadConfigParams(paramMap, configParam); + + final Map clientConfigMap = getConfigParameterMap(ConfigName.CLIENT); + if (clientConfigMap != null) { + paramMap.putAll(clientConfigMap); } // auth params @@ -262,4 +266,39 @@ public class FileCrawlingConfig extends BsFileCrawlingConfig implements smbAuthList.toArray(new SmbAuthentication[smbAuthList.size()])); } + + @Override + public Map getConfigParameterMap(final ConfigName name) { + if (configParameterMap == null) { + final Map> map = new HashMap<>(); + final Map clientConfigMap = new HashMap<>(); + final Map xpathConfigMap = new HashMap<>(); + final Map scriptConfigMap = new HashMap<>(); + map.put(ConfigName.CLIENT, clientConfigMap); + map.put(ConfigName.XPATH, xpathConfigMap); + map.put(ConfigName.SCRIPT, scriptConfigMap); + for (final Map.Entry entry : ParameterUtil.parse( + getConfigParameter()).entrySet()) { + final String key = entry.getKey(); + if (key.startsWith(CLIENT_PREFIX)) { + clientConfigMap.put(key.substring(CLIENT_PREFIX.length()), + entry.getValue()); + } else if (key.startsWith(XPATH_PREFIX)) { + xpathConfigMap.put(key.substring(XPATH_PREFIX.length()), + entry.getValue()); + } else if (key.startsWith(SCRIPT_PREFIX)) { + scriptConfigMap.put(key.substring(SCRIPT_PREFIX.length()), + entry.getValue()); + } + } + + configParameterMap = map; + } + + final Map configMap = configParameterMap.get(name); + if (configMap == null) { + return Collections.emptyMap(); + } + return configMap; + } } diff --git a/src/main/java/jp/sf/fess/db/exentity/WebCrawlingConfig.java b/src/main/java/jp/sf/fess/db/exentity/WebCrawlingConfig.java index 9a1a19042..e4c19a768 100644 --- a/src/main/java/jp/sf/fess/db/exentity/WebCrawlingConfig.java +++ b/src/main/java/jp/sf/fess/db/exentity/WebCrawlingConfig.java @@ -18,6 +18,7 @@ package jp.sf.fess.db.exentity; import java.math.BigDecimal; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -59,6 +60,8 @@ public class WebCrawlingConfig extends BsWebCrawlingConfig implements protected volatile Pattern[] excludedDocUrlPatterns; + protected volatile Map> configParameterMap; + public WebCrawlingConfig() { super(); setBoost(BigDecimal.ONE); @@ -242,9 +245,9 @@ public class WebCrawlingConfig extends BsWebCrawlingConfig implements final Map paramMap = new HashMap(); clientFactory.setInitParameterMap(paramMap); - final String configParam = getConfigParameter(); - if (StringUtil.isNotBlank(configParam)) { - ParameterUtil.loadConfigParams(paramMap, configParam); + final Map clientConfigMap = getConfigParameterMap(ConfigName.CLIENT); + if (clientConfigMap != null) { + paramMap.putAll(clientConfigMap); } final String userAgent = getUserAgent(); @@ -274,4 +277,38 @@ public class WebCrawlingConfig extends BsWebCrawlingConfig implements } + @Override + public Map getConfigParameterMap(final ConfigName name) { + if (configParameterMap == null) { + final Map> map = new HashMap<>(); + final Map clientConfigMap = new HashMap<>(); + final Map xpathConfigMap = new HashMap<>(); + final Map scriptConfigMap = new HashMap<>(); + map.put(ConfigName.CLIENT, clientConfigMap); + map.put(ConfigName.XPATH, xpathConfigMap); + map.put(ConfigName.SCRIPT, scriptConfigMap); + for (final Map.Entry entry : ParameterUtil.parse( + getConfigParameter()).entrySet()) { + final String key = entry.getKey(); + if (key.startsWith(CLIENT_PREFIX)) { + clientConfigMap.put(key.substring(CLIENT_PREFIX.length()), + entry.getValue()); + } else if (key.startsWith(XPATH_PREFIX)) { + xpathConfigMap.put(key.substring(XPATH_PREFIX.length()), + entry.getValue()); + } else if (key.startsWith(SCRIPT_PREFIX)) { + scriptConfigMap.put(key.substring(SCRIPT_PREFIX.length()), + entry.getValue()); + } + } + + configParameterMap = map; + } + + final Map configMap = configParameterMap.get(name); + if (configMap == null) { + return Collections.emptyMap(); + } + return configMap; + } } diff --git a/src/main/java/jp/sf/fess/transformer/FessXpathTransformer.java b/src/main/java/jp/sf/fess/transformer/FessXpathTransformer.java index 794380d95..4129e80cc 100644 --- a/src/main/java/jp/sf/fess/transformer/FessXpathTransformer.java +++ b/src/main/java/jp/sf/fess/transformer/FessXpathTransformer.java @@ -38,6 +38,7 @@ import javax.xml.transform.TransformerException; import jp.sf.fess.Constants; import jp.sf.fess.db.exentity.CrawlingConfig; +import jp.sf.fess.db.exentity.CrawlingConfig.ConfigName; import jp.sf.fess.helper.CrawlingConfigHelper; import jp.sf.fess.helper.CrawlingSessionHelper; import jp.sf.fess.helper.FileTypeHelper; @@ -52,6 +53,7 @@ import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.xpath.objects.XObject; import org.cyberneko.html.parsers.DOMParser; +import org.seasar.framework.util.OgnlUtil; import org.seasar.framework.util.SerializeUtil; import org.seasar.framework.util.StringUtil; import org.seasar.robot.RobotCrawlAccessException; @@ -331,6 +333,45 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer { putResultDataBody(dataMap, "url", url); // set again } + // from config + final Map xpathConfigMap = crawlingConfig + .getConfigParameterMap(ConfigName.XPATH); + final Map scriptConfigMap = crawlingConfig + .getConfigParameterMap(ConfigName.SCRIPT); + for (final Map.Entry entry : xpathConfigMap.entrySet()) { + String value = getSingleNodeValue(document, entry.getValue(), true); + final String key = entry.getKey(); + final String template = scriptConfigMap.get(key); + if (template != null) { + final Map paramMap = new HashMap<>( + dataMap.size()); + paramMap.putAll(dataMap); + paramMap.put("value", value); + value = convertValue(template, paramMap); + } + if (value != null) { + putResultDataBody(dataMap, key, value); + } + } + } + + protected String convertValue(final String template, + final Map paramMap) { + if (StringUtil.isEmpty(template)) { + return Constants.EMPTY_STRING; + } + + try { + final Object exp = OgnlUtil.parseExpression(template); + final Object value = OgnlUtil.getValue(exp, paramMap); + if (value == null) { + return null; + } + return value.toString(); + } catch (final Exception e) { + logger.warn("Invalid value format: " + template, e); + return null; + } } protected String getCanonicalUrl(final ResponseData responseData,