fix #2722 add config.html.pruned.tags

This commit is contained in:
Shinsuke Sugaya 2023-02-18 21:51:23 +09:00
parent 14b9e897cc
commit 04dcf34ad7
3 changed files with 60 additions and 29 deletions

View file

@ -30,6 +30,7 @@ import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.function.UnaryOperator;
import java.util.stream.Collectors;
import javax.annotation.PostConstruct;
@ -107,6 +108,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
protected Map<String, Boolean> fieldPrunedRuleMap = new HashMap<>();
protected Map<String, PrunedTag[]> prunedTagsCache = new HashMap<>();
@PostConstruct
public void init() {
if (logger.isDebugEnabled()) {
@ -171,7 +174,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
final Boolean isPruned = fieldPrunedRuleMap.get(entry.getKey());
Node value = getXPathAPI().selectSingleNode(document, entry.getValue());
if (value != null && isPruned != null && isPruned.booleanValue()) {
value = pruneNode(value);
value = pruneNode(value, getCrawlingConfig(responseData));
}
putResultDataBody(dataMap, entry.getKey(), value != null ? value.getTextContent() : null);
break;
@ -348,8 +351,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
final CrawlingConfig crawlingConfig = getCrawlingConfig(responseData);
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
@ -381,13 +383,15 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
}
// lang
final String lang = systemHelper.normalizeHtmlLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
final String lang = systemHelper.normalizeHtmlLang(
getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), node -> pruneNode(node, crawlingConfig)));
if (lang != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
}
// title
// content
final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap),
prunedContent ? node -> pruneNode(node, crawlingConfig) : node -> node);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(),
documentHelper.getContent(crawlingConfig, responseData, body, dataMap));
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
@ -411,7 +415,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
}
}
// digest
final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), node -> node);
if (StringUtil.isNotBlank(digest)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
} else {
@ -488,7 +492,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
final String key = e.getKey();
final String value = getSingleNodeValue(document, e.getValue(), true);
final String value = getSingleNodeValue(document, e.getValue(), node -> pruneNode(node, crawlingConfig));
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
});
crawlingConfig.getConfigParameterMap(ConfigName.VALUE).entrySet().stream().forEach(e -> {
@ -498,6 +502,11 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
});
}
/**
 * Looks up the crawling configuration associated with the given response.
 *
 * The lookup is delegated to the {@code CrawlingConfigHelper} component and is
 * keyed by the session ID carried by the response data.
 *
 * @param responseData the crawled response whose session ID is used for the lookup
 * @return whatever the helper returns for that session ID
 */
protected CrawlingConfig getCrawlingConfig(final ResponseData responseData) {
    return ComponentUtil.getCrawlingConfigHelper().get(responseData.getSessionId());
}
protected String getLangXpath(final FessConfig fessConfig, final Map<String, String> xpathConfigMap) {
final String xpath = xpathConfigMap.get(XPath.DEFAULT_LANG);
if (StringUtil.isNotBlank(xpath)) {
@ -531,7 +540,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
if (StringUtil.isBlank(xpath)) {
return null;
}
final String canonicalUrl = getSingleNodeValue(document, xpath, false);
final String canonicalUrl = getSingleNodeValue(document, xpath, node -> node);
if (StringUtil.isBlank(canonicalUrl)) {
return null;
}
@ -569,7 +578,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return value;
}
protected String getSingleNodeValue(final Document document, final String xpath, final boolean pruned) {
protected String getSingleNodeValue(final Document document, final String xpath, final UnaryOperator<Node> pruneFunc) {
StringBuilder buf = null;
XPathNodes list = null;
try {
@ -582,9 +591,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
if (useGoogleOffOn) {
node = processGoogleOffOn(node, new ValueHolder<>(true));
}
if (pruned) {
node = pruneNode(node);
}
node = pruneFunc.apply(node);
parseTextContent(node, buf);
}
} catch (final Exception e) {
@ -645,13 +652,36 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return node;
}
protected Node pruneNode(final Node node) {
/**
 * Prunes the configured tags from the given node.
 *
 * When a crawling config is supplied, the tag list is taken from its
 * {@code html.pruned.tags} config parameter if that value is not blank;
 * otherwise the list from {@code fessConfig} is used. The resolved list is
 * stored in {@code prunedTagsCache} under the config ID so that the parameter
 * is parsed only on a cache miss.
 *
 * @param node the node to prune; passed on to {@link #pruneNodeByTags}
 * @param crawlingConfig the crawling config to read the tag list from, or
 *        {@code null} to use the {@code fessConfig} tag list directly
 * @return the result of {@link #pruneNodeByTags} for the resolved tag list
 */
protected Node pruneNode(final Node node, final CrawlingConfig crawlingConfig) {
    // No per-config settings available: use the system-wide tag list.
    if (crawlingConfig == null) {
        return pruneNodeByTags(node, fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray());
    }

    final String cacheKey = crawlingConfig.getConfigId();
    PrunedTag[] tags = prunedTagsCache.get(cacheKey);
    if (tags == null) {
        // Cache miss: resolve the list from the config parameter, then fall back.
        final String configured =
                crawlingConfig.getConfigParameterMap(ConfigName.CONFIG).get(CrawlingConfig.Param.Config.HTML_PRUNED_TAGS);
        tags = StringUtil.isNotBlank(configured) ? PrunedTag.parse(configured) : null;
        if (tags == null) {
            tags = fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
        }
        prunedTagsCache.put(cacheKey, tags);
    }

    // Final safety net in case nothing could be resolved above.
    if (tags == null) {
        tags = fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
    }
    return pruneNodeByTags(node, tags);
}
protected Node pruneNodeByTags(final Node node, final PrunedTag[] prunedTags) {
final NodeList nodeList = node.getChildNodes();
final List<Node> childNodeList = new ArrayList<>();
final List<Node> removedNodeList = new ArrayList<>();
for (int i = 0; i < nodeList.getLength(); i++) {
final Node childNode = nodeList.item(i);
if (isPrunedTag(childNode)) {
if (isPrunedTag(childNode, prunedTags)) {
removedNodeList.add(childNode);
} else {
childNodeList.add(childNode);
@ -663,14 +693,14 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
}
for (final Node childNode : childNodeList) {
pruneNode(childNode);
pruneNodeByTags(childNode, prunedTags);
}
return node;
}
protected boolean isPrunedTag(final Node node) {
for (final PrunedTag prunedTag : fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray()) {
protected boolean isPrunedTag(final Node node, final PrunedTag[] prunedTags) {
for (final PrunedTag prunedTag : prunedTags) {
if (prunedTag.matches(node)) {
return true;
}

View file

@ -128,6 +128,7 @@ public interface CrawlingConfig {
public static final String CLEANUP_URL_FILTERS = "cleanup.urlFilters";
public static final String JCIFS_PREFIX = "jcifs.";
public static final String HTML_CANONICAL_XPATH = "html.canonical.xpath";
public static final String HTML_PRUNED_TAGS = "html.pruned.tags";
public static final String PIPELINE = "pipeline";
public static final String IGNORE_ROBOTS_TAGS = "ignore.robots.tags";
public static final String SCRIPT_TYPE = "script.type";

View file

@ -141,7 +141,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
final FessXpathTransformer transformer = new FessXpathTransformer();
transformer.init();
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
final Node pruneNode = transformer.pruneNode(document.cloneNode(true), null);
assertEquals(getXmlString(document), getXmlString(pruneNode));
ComponentUtil.setFessConfig(null);
}
@ -161,7 +161,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
final FessXpathTransformer transformer = new FessXpathTransformer();
transformer.init();
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
final Node pruneNode = transformer.pruneNode(document.cloneNode(true), null);
final String docString = getXmlString(document);
final String pnString = getXmlString(pruneNode);
assertTrue(docString.contains("<SCRIPT>"));
@ -190,7 +190,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
final FessXpathTransformer transformer = new FessXpathTransformer();
transformer.init();
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
final Node pruneNode = transformer.pruneNode(document.cloneNode(true), null);
final String docString = getXmlString(document);
final String pnString = getXmlString(pruneNode);
assertTrue(docString.contains("<SCRIPT>"));
@ -219,7 +219,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
final FessXpathTransformer transformer = new FessXpathTransformer();
transformer.init();
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
final Node pruneNode = transformer.pruneNode(document.cloneNode(true), null);
final String docString = getXmlString(document);
final String pnString = getXmlString(pruneNode);
assertTrue(docString.contains("<DIV>"));
@ -248,7 +248,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
final FessXpathTransformer transformer = new FessXpathTransformer();
transformer.init();
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
final Node pruneNode = transformer.pruneNode(document.cloneNode(true), null);
final String docString = getXmlString(document);
final String pnString = getXmlString(pruneNode);
assertTrue(docString.contains("<DIV>"));
@ -669,22 +669,22 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
String data = "<html><body>aaa<style>bbb</style>ccc</body></html>";
Document document = getDocument(data);
String value = transformer.getSingleNodeValue(document, "//BODY", false);
String value = transformer.getSingleNodeValue(document, "//BODY", node -> node);
assertEquals("aaa bbb ccc", value);
data = "<html><body> aaa <p> bbb <b>ccc</b> </p> </body></html>";
document = getDocument(data);
value = transformer.getSingleNodeValue(document, "//BODY", false);
value = transformer.getSingleNodeValue(document, "//BODY", node -> node);
assertEquals("aaa bbb ccc", value);
data = "<html><body> aaa <p> bbb <aaa>ccc</bbb> </p> </body></html>";
document = getDocument(data);
value = transformer.getSingleNodeValue(document, "//BODY", false);
value = transformer.getSingleNodeValue(document, "//BODY", node -> node);
assertEquals("aaa bbb ccc", value);
data = "<html><body> aaa <p> bbb <!-- test -->ccc<!-- test --> </p> </body></html>";
document = getDocument(data);
value = transformer.getSingleNodeValue(document, "//BODY", false);
value = transformer.getSingleNodeValue(document, "//BODY", node -> node);
assertEquals("aaa bbb ccc", value);
}
@ -693,13 +693,13 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
final String data = "<html><head><meta name=\"keywords\" content=\"bbb\"></head><body>aaa</body></html>";
final Document document = getDocument(data);
String value = transformer.getSingleNodeValue(document, "//BODY", false);
String value = transformer.getSingleNodeValue(document, "//BODY", node -> node);
assertEquals("aaa", value);
value = transformer.getSingleNodeValue(document, "//META[@name='keywords']/@content", false);
value = transformer.getSingleNodeValue(document, "//META[@name='keywords']/@content", node -> node);
assertEquals("bbb", value);
value = transformer.getSingleNodeValue(document, "//META[@name='keywords']/@content|//BODY", false);
value = transformer.getSingleNodeValue(document, "//META[@name='keywords']/@content|//BODY", node -> node);
assertEquals("bbb aaa", value);
}