fix #1211 css attribute selector support

This commit is contained in:
Shinsuke Sugaya 2017-08-08 06:50:32 +09:00
parent 53a59b58fd
commit 8e987df540
6 changed files with 97 additions and 52 deletions

View file

@ -15,10 +15,10 @@
*/
package org.codelibs.fess.app.web.base;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

View file

@ -35,7 +35,6 @@ import org.codelibs.fess.Constants;
import org.codelibs.fess.app.service.LabelTypeService;
import org.codelibs.fess.entity.SearchRequestParams.SearchRequestType;
import org.codelibs.fess.es.config.exentity.LabelType;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View file

@ -35,6 +35,7 @@ import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
@ -47,6 +48,7 @@ import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.misc.Pair;
import org.codelibs.core.misc.Tuple3;
import org.codelibs.fess.Constants;
import org.codelibs.fess.exception.FessSystemException;
import org.codelibs.fess.helper.PermissionHelper;
import org.codelibs.fess.mylasta.action.FessUserBean;
import org.codelibs.fess.taglib.FessFunctions;
@ -635,23 +637,26 @@ public interface FessProp {
PrunedTag[] tags = (PrunedTag[]) propMap.get("crawlerDocumentHtmlPrunedTags");
if (tags == null) {
tags = split(getCrawlerDocumentHtmlPrunedTags(), ",").get(stream -> stream.filter(StringUtil::isNotBlank).map(v -> {
final String[] cssValues = v.split("\\.", 2);
final String css;
if (cssValues.length == 2) {
css = cssValues[1];
} else {
css = null;
final Pattern pattern = Pattern.compile("(\\w+)(\\[[^\\]]+\\])?(\\.\\w+)?(#\\w+)?");
final Matcher matcher = pattern.matcher(v.trim());
if (matcher.matches()) {
final PrunedTag tag = new PrunedTag(matcher.group(1));
if (matcher.group(2) != null) {
final String attrPair = matcher.group(2).substring(1, matcher.group(2).length() - 1);
final Matcher equalMatcher = Pattern.compile("(\\w+)=(\\w+)").matcher(attrPair);
if (equalMatcher.matches()) {
tag.setAttr(equalMatcher.group(1), equalMatcher.group(2));
}
}
if (matcher.group(3) != null) {
tag.setCss(matcher.group(3).substring(1));
}
if (matcher.group(4) != null) {
tag.setId(matcher.group(4).substring(1));
}
return tag;
}
final String[] idValues = cssValues[0].split("#", 2);
final String id;
if (idValues.length == 2) {
id = idValues[1];
} else {
id = null;
}
return new PrunedTag(idValues[0], id, css);
throw new FessSystemException("Invalid pruned tag: " + v);
}).toArray(n -> new PrunedTag[n]));
propMap.put("crawlerDocumentHtmlPrunedTags", tags);
}

View file

@ -15,24 +15,30 @@
*/
package org.codelibs.fess.util;
import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.stream.StreamUtil;
import org.w3c.dom.Node;
public class PrunedTag {
private final String tag;
private final String id;
private final String css;
private String id;
private String css;
private String attrName;
private String attrValue;
public PrunedTag(final String tag, final String id, final String css) {
public PrunedTag(final String tag) {
this.tag = tag;
this.id = id;
this.css = css;
}
public boolean matches(final Node node) {
if (tag.equalsIgnoreCase(node.getNodeName())) {
if (attrName != null) {
Node attr = node.getAttributes().getNamedItem(attrName);
if (attr == null || !attrValue.equals(attr.getNodeValue())) {
return false;
}
}
if (id == null) {
if (css == null) {
return true;
@ -56,11 +62,6 @@ public class PrunedTag {
return false;
}
@Override
public String toString() {
return "PrunedTag [tag=" + tag + ", id=" + id + ", css=" + css + "]";
}
@Override
public int hashCode() {
final int prime = 31;
@ -83,27 +84,28 @@ public class PrunedTag {
return false;
}
final PrunedTag other = (PrunedTag) obj;
if (css == null) {
if (other.css != null) {
return false;
}
} else if (!css.equals(other.css)) {
return false;
}
if (id == null) {
if (other.id != null) {
return false;
}
} else if (!id.equals(other.id)) {
return false;
}
if (tag == null) {
if (other.tag != null) {
return false;
}
} else if (!tag.equals(other.tag)) {
return false;
}
return true;
return StringUtils.compare(tag, other.tag) == 0 //
&& StringUtils.compare(css, other.css) == 0 //
&& StringUtils.compare(id, other.id) == 0 //
&& StringUtils.compare(attrName, other.attrName) == 0 //
&& StringUtils.compare(attrValue, other.attrValue) == 0;
}
/**
 * Sets the element id this pruned tag is restricted to.
 *
 * @param id the id value to store, or {@code null} for no id restriction
 */
public void setId(final String id) {
    this.id = id;
}
/**
 * Sets the CSS class name this pruned tag is restricted to.
 *
 * @param css the class name to store, or {@code null} for no class restriction
 */
public void setCss(final String css) {
    this.css = css;
}
/**
 * Sets the attribute name/value pair this pruned tag is restricted to.
 *
 * @param name the attribute name to store
 * @param value the attribute value to store
 */
public void setAttr(final String name, final String value) {
    this.attrValue = value;
    this.attrName = name;
}
/**
 * Returns a description listing the tag, id, css, attrName and attrValue fields.
 */
@Override
public String toString() {
    final StringBuilder buf = new StringBuilder();
    buf.append("PrunedTag [tag=").append(tag);
    buf.append(", id=").append(id);
    buf.append(", css=").append(css);
    buf.append(", attrName=").append(attrName);
    buf.append(", attrValue=").append(attrValue);
    buf.append("]");
    return buf.toString();
}
}

View file

@ -119,7 +119,7 @@ crawler.document.html.content.xpath=//BODY
crawler.document.html.lang.xpath=//HTML/@lang
crawler.document.html.digest.xpath=//META[@name='description']/@content
crawler.document.html.canonical.xpath=//LINK[@rel='canonical']/@href
crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav
# Attribute selectors must be written unquoted (tag[name=value]); a quoted value is not parsed
# as an attribute condition and the entry would then match every element with that tag name.
crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav,a[rel=nofollow]
crawler.document.html.max.digest.length=200
# file

View file

@ -15,6 +15,7 @@
*/
package org.codelibs.fess.mylasta.direction;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
@ -23,7 +24,12 @@ import java.util.HashMap;
import org.codelibs.core.io.FileUtil;
import org.codelibs.core.misc.DynamicProperties;
import org.codelibs.fess.unit.UnitFessTestCase;
import org.codelibs.fess.util.PrunedTag;
import org.cyberneko.html.parsers.DOMParser;
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
public class FessPropTest extends UnitFessTestCase {
@ -120,6 +126,39 @@ public class FessPropTest extends UnitFessTestCase {
assertEquals(12288, spaceChars[1]);
}
/**
 * Checks that the pruned-tag setting is parsed into one matcher per entry and that plain tag,
 * {@code tag#id}, {@code tag.class} and {@code tag[attr=value]} selectors match as expected.
 */
public void test_getCrawlerDocumentHtmlPrunedTagsAsArray() throws Exception {
    // The parsed array is cached in propMap, so drop any value left by another test.
    FessProp.propMap.clear();
    FessConfig fessConfig = new FessConfig.SimpleImpl() {
        @Override
        public String getCrawlerDocumentHtmlPrunedTags() {
            return "script,div#main,p.image,a[rel=nofollow]";
        }
    };

    PrunedTag[] tags = fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
    assertEquals(4, tags.length);

    // plain tag: matches regardless of attributes
    assertTrue(matchesTag(tags[0], "<script></script>"));
    assertTrue(matchesTag(tags[0], "<script id=\"main\"></script>"));
    assertFalse(matchesTag(tags[0], "<a></a>"));

    // tag#id
    assertTrue(matchesTag(tags[1], "<div id=\"main\"></div>"));
    assertFalse(matchesTag(tags[1], "<div></div>"));

    // tag.class
    assertTrue(matchesTag(tags[2], "<p class=\"image\"></p>"));
    assertFalse(matchesTag(tags[2], "<p></p>"));

    // tag[attr=value]
    assertTrue(matchesTag(tags[3], "<a rel=\"nofollow\"></a>"));
    assertFalse(matchesTag(tags[3], "<a></a>"));
    assertFalse(matchesTag(tags[3], "<a rel=\"author\"></a>"));
}
/**
 * Parses {@code text} wrapped in a minimal HTML document and applies {@code tag} to the
 * first node found inside the document's last top-level child.
 *
 * @param tag the pruned-tag matcher under test
 * @param text the HTML snippet to place inside the body element
 * @return the result of {@link PrunedTag#matches(Node)} for the parsed snippet
 * @throws Exception if the HTML cannot be parsed
 */
private boolean matchesTag(final PrunedTag tag, final String text) throws Exception {
    final String html = "<html><body>" + text + "</body></html>";
    // Use the charset constant rather than a charset-name lookup.
    final byte[] bytes = html.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    final DOMParser parser = new DOMParser();
    parser.parse(new InputSource(new ByteArrayInputStream(bytes)));
    final Document document = parser.getDocument();
    // root element -> its last child (expected to be the body) -> first child (the snippet)
    final Node node = document.getFirstChild().getLastChild().getFirstChild();
    return tag.matches(node);
}
public void test_normalizeQueryLanguages() {
FessProp.propMap.clear();
FessConfig fessConfig = new FessConfig.SimpleImpl() {