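fix #1534 support _ in pruned tags (attribute names may now contain '-', and attribute values may be any run of non-whitespace characters such as "a-.:_0")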

This commit is contained in:
Shinsuke Sugaya 2018-03-01 09:48:23 +09:00
parent 25b96c5978
commit 00f8e23a06
2 changed files with 5 additions and 2 deletions

View file

@@ -659,7 +659,7 @@ public interface FessProp {
final PrunedTag tag = new PrunedTag(matcher.group(1));
if (matcher.group(2) != null) {
final String attrPair = matcher.group(2).substring(1, matcher.group(2).length() - 1);
- final Matcher equalMatcher = Pattern.compile("(\\w+)=(\\w+)").matcher(attrPair);
+ final Matcher equalMatcher = Pattern.compile("([\\w\\-]+)=(\\S+)").matcher(attrPair);
if (equalMatcher.matches()) {
tag.setAttr(equalMatcher.group(1), equalMatcher.group(2));
}

View file

@@ -146,7 +146,7 @@ public class FessPropTest extends UnitFessTestCase {
FessConfig fessConfig = new FessConfig.SimpleImpl() {
@Override
public String getCrawlerDocumentHtmlPrunedTags() {
- return "script,div#main,p.image,a[rel=nofollow]";
+ return "script,div#main,p.image,a[rel=nofollow],div[x-y=a-.:_0]";
}
};
@@ -163,6 +163,9 @@ public class FessPropTest extends UnitFessTestCase {
assertTrue(matchesTag(tags[3], "<a rel=\"nofollow\"></a>"));
assertFalse(matchesTag(tags[3], "<a></a>"));
+ assertTrue(matchesTag(tags[4], "<div x-y=\"a-.:_0\"></div>"));
+ assertFalse(matchesTag(tags[4], "<div x-y=\"a 0\"></div>"));
}
private boolean matchesTag(final PrunedTag tag, final String text) throws Exception {