fix #1211 css attribute selector support
This commit is contained in:
parent
53a59b58fd
commit
8e987df540
6 changed files with 97 additions and 52 deletions
|
@ -15,10 +15,10 @@
|
|||
*/
|
||||
package org.codelibs.fess.app.web.base;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
|
|
@ -35,7 +35,6 @@ import org.codelibs.fess.Constants;
|
|||
import org.codelibs.fess.app.service.LabelTypeService;
|
||||
import org.codelibs.fess.entity.SearchRequestParams.SearchRequestType;
|
||||
import org.codelibs.fess.es.config.exentity.LabelType;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
|
|
@ -35,6 +35,7 @@ import java.util.Map;
|
|||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.function.Function;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
|
@ -47,6 +48,7 @@ import org.codelibs.core.lang.StringUtil;
|
|||
import org.codelibs.core.misc.Pair;
|
||||
import org.codelibs.core.misc.Tuple3;
|
||||
import org.codelibs.fess.Constants;
|
||||
import org.codelibs.fess.exception.FessSystemException;
|
||||
import org.codelibs.fess.helper.PermissionHelper;
|
||||
import org.codelibs.fess.mylasta.action.FessUserBean;
|
||||
import org.codelibs.fess.taglib.FessFunctions;
|
||||
|
@ -635,23 +637,26 @@ public interface FessProp {
|
|||
PrunedTag[] tags = (PrunedTag[]) propMap.get("crawlerDocumentHtmlPrunedTags");
|
||||
if (tags == null) {
|
||||
tags = split(getCrawlerDocumentHtmlPrunedTags(), ",").get(stream -> stream.filter(StringUtil::isNotBlank).map(v -> {
|
||||
final String[] cssValues = v.split("\\.", 2);
|
||||
final String css;
|
||||
if (cssValues.length == 2) {
|
||||
css = cssValues[1];
|
||||
} else {
|
||||
css = null;
|
||||
final Pattern pattern = Pattern.compile("(\\w+)(\\[[^\\]]+\\])?(\\.\\w+)?(#\\w+)?");
|
||||
final Matcher matcher = pattern.matcher(v.trim());
|
||||
if (matcher.matches()) {
|
||||
final PrunedTag tag = new PrunedTag(matcher.group(1));
|
||||
if (matcher.group(2) != null) {
|
||||
final String attrPair = matcher.group(2).substring(1, matcher.group(2).length() - 1);
|
||||
final Matcher equalMatcher = Pattern.compile("(\\w+)=(\\w+)").matcher(attrPair);
|
||||
if (equalMatcher.matches()) {
|
||||
tag.setAttr(equalMatcher.group(1), equalMatcher.group(2));
|
||||
}
|
||||
}
|
||||
if (matcher.group(3) != null) {
|
||||
tag.setCss(matcher.group(3).substring(1));
|
||||
}
|
||||
if (matcher.group(4) != null) {
|
||||
tag.setId(matcher.group(4).substring(1));
|
||||
}
|
||||
return tag;
|
||||
}
|
||||
|
||||
final String[] idValues = cssValues[0].split("#", 2);
|
||||
final String id;
|
||||
if (idValues.length == 2) {
|
||||
id = idValues[1];
|
||||
} else {
|
||||
id = null;
|
||||
}
|
||||
|
||||
return new PrunedTag(idValues[0], id, css);
|
||||
throw new FessSystemException("Invalid pruned tag: " + v);
|
||||
}).toArray(n -> new PrunedTag[n]));
|
||||
propMap.put("crawlerDocumentHtmlPrunedTags", tags);
|
||||
}
|
||||
|
|
|
@ -15,24 +15,30 @@
|
|||
*/
|
||||
package org.codelibs.fess.util;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.core.stream.StreamUtil;
|
||||
import org.w3c.dom.Node;
|
||||
|
||||
public class PrunedTag {
|
||||
private final String tag;
|
||||
private final String id;
|
||||
private final String css;
|
||||
private String id;
|
||||
private String css;
|
||||
private String attrName;
|
||||
private String attrValue;
|
||||
|
||||
public PrunedTag(final String tag, final String id, final String css) {
|
||||
public PrunedTag(final String tag) {
|
||||
this.tag = tag;
|
||||
this.id = id;
|
||||
this.css = css;
|
||||
|
||||
}
|
||||
|
||||
public boolean matches(final Node node) {
|
||||
if (tag.equalsIgnoreCase(node.getNodeName())) {
|
||||
if (attrName != null) {
|
||||
Node attr = node.getAttributes().getNamedItem(attrName);
|
||||
if (attr == null || !attrValue.equals(attr.getNodeValue())) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (id == null) {
|
||||
if (css == null) {
|
||||
return true;
|
||||
|
@ -56,11 +62,6 @@ public class PrunedTag {
|
|||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "PrunedTag [tag=" + tag + ", id=" + id + ", css=" + css + "]";
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
|
@ -83,27 +84,28 @@ public class PrunedTag {
|
|||
return false;
|
||||
}
|
||||
final PrunedTag other = (PrunedTag) obj;
|
||||
if (css == null) {
|
||||
if (other.css != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!css.equals(other.css)) {
|
||||
return false;
|
||||
}
|
||||
if (id == null) {
|
||||
if (other.id != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!id.equals(other.id)) {
|
||||
return false;
|
||||
}
|
||||
if (tag == null) {
|
||||
if (other.tag != null) {
|
||||
return false;
|
||||
}
|
||||
} else if (!tag.equals(other.tag)) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
return StringUtils.compare(tag, other.tag) == 0 //
|
||||
&& StringUtils.compare(css, other.css) == 0 //
|
||||
&& StringUtils.compare(id, other.id) == 0 //
|
||||
&& StringUtils.compare(attrName, other.attrName) == 0 //
|
||||
&& StringUtils.compare(attrValue, other.attrValue) == 0;
|
||||
}
|
||||
|
||||
public void setId(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public void setCss(String css) {
|
||||
this.css = css;
|
||||
}
|
||||
|
||||
public void setAttr(String name, String value) {
|
||||
this.attrName = name;
|
||||
this.attrValue = value;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "PrunedTag [tag=" + tag + ", id=" + id + ", css=" + css + ", attrName=" + attrName + ", attrValue=" + attrValue + "]";
|
||||
}
|
||||
}
|
||||
|
|
|
@ -119,7 +119,7 @@ crawler.document.html.content.xpath=//BODY
|
|||
crawler.document.html.lang.xpath=//HTML/@lang
|
||||
crawler.document.html.digest.xpath=//META[@name='description']/@content
|
||||
crawler.document.html.canonical.xpath=//LINK[@rel='canonical']/@href
|
||||
crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav
|
||||
# Attribute selectors use the form tag[attr=value] with an unquoted value; quotes are kept literally and the attribute condition would not be recognized.
crawler.document.html.pruned.tags=noscript,script,style,header,footer,nav,a[rel=nofollow]
|
||||
crawler.document.html.max.digest.length=200
|
||||
|
||||
# file
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
*/
|
||||
package org.codelibs.fess.mylasta.direction;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
|
@ -23,7 +24,12 @@ import java.util.HashMap;
|
|||
import org.codelibs.core.io.FileUtil;
|
||||
import org.codelibs.core.misc.DynamicProperties;
|
||||
import org.codelibs.fess.unit.UnitFessTestCase;
|
||||
import org.codelibs.fess.util.PrunedTag;
|
||||
import org.cyberneko.html.parsers.DOMParser;
|
||||
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Node;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
public class FessPropTest extends UnitFessTestCase {
|
||||
|
||||
|
@ -120,6 +126,39 @@ public class FessPropTest extends UnitFessTestCase {
|
|||
assertEquals(12288, spaceChars[1]);
|
||||
}
|
||||
|
||||
public void test_getCrawlerDocumentHtmlPrunedTagsAsArray() throws Exception {
|
||||
FessProp.propMap.clear();
|
||||
FessConfig fessConfig = new FessConfig.SimpleImpl() {
|
||||
@Override
|
||||
public String getCrawlerDocumentHtmlPrunedTags() {
|
||||
return "script,div#main,p.image,a[rel=nofollow]";
|
||||
}
|
||||
};
|
||||
|
||||
PrunedTag[] tags = fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
|
||||
assertTrue(matchesTag(tags[0], "<script></script>"));
|
||||
assertTrue(matchesTag(tags[0], "<script id=\\\"main\\\"></script>"));
|
||||
assertFalse(matchesTag(tags[0], "<a></a>"));
|
||||
|
||||
assertTrue(matchesTag(tags[1], "<div id=\"main\"></div>"));
|
||||
assertFalse(matchesTag(tags[1], "<div></div>"));
|
||||
|
||||
assertTrue(matchesTag(tags[2], "<p class=\"image\"></p>"));
|
||||
assertFalse(matchesTag(tags[2], "<p></p>"));
|
||||
|
||||
assertTrue(matchesTag(tags[3], "<a rel=\"nofollow\"></a>"));
|
||||
assertFalse(matchesTag(tags[3], "<a></a>"));
|
||||
}
|
||||
|
||||
private boolean matchesTag(final PrunedTag tag, final String text) throws Exception {
|
||||
final DOMParser parser = new DOMParser();
|
||||
final String html = "<html><body>" + text + "</body></html>";
|
||||
final ByteArrayInputStream is = new ByteArrayInputStream(html.getBytes("UTF-8"));
|
||||
parser.parse(new InputSource(is));
|
||||
Node node = parser.getDocument().getFirstChild().getLastChild().getFirstChild();
|
||||
return tag.matches(node);
|
||||
}
|
||||
|
||||
public void test_normalizeQueryLanguages() {
|
||||
FessProp.propMap.clear();
|
||||
FessConfig fessConfig = new FessConfig.SimpleImpl() {
|
||||
|
|
Loading…
Add table
Reference in a new issue