Merge branch '10.3.x'

This commit is contained in:
Shinsuke Sugaya 2016-11-14 23:21:08 +09:00
commit 58f11530cb
5 changed files with 183 additions and 22 deletions

View file

@ -64,6 +64,7 @@ import org.codelibs.fess.helper.PathMappingHelper;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.PrunedTag;
import org.cyberneko.html.parsers.DOMParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -507,7 +508,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
final List<Node> removedNodeList = new ArrayList<>();
for (int i = 0; i < nodeList.getLength(); i++) {
final Node childNode = nodeList.item(i);
if (isPrunedTag(childNode.getNodeName())) {
if (isPrunedTag(childNode)) {
removedNodeList.add(childNode);
} else {
childNodeList.add(childNode);
@ -525,9 +526,9 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return node;
}
protected boolean isPrunedTag(final String tagName) {
for (final String name : getCrawlerDocumentHtmlPrunedTags()) {
if (name.equalsIgnoreCase(tagName)) {
protected boolean isPrunedTag(final Node node) {
for (final PrunedTag prunedTag : fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray()) {
if (prunedTag.matches(node)) {
return true;
}
}
@ -655,10 +656,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return b[0] == (byte) 0xEF && b[1] == (byte) 0xBB && b[2] == (byte) 0xBF;
}
protected String[] getCrawlerDocumentHtmlPrunedTags() {
return fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
}
/**
 * Sets the {@code useGoogleOffOn} flag on this transformer.
 *
 * NOTE(review): presumably this toggles handling of googleoff/googleon markers
 * during transformation — confirm against the code that reads the field.
 *
 * @param useGoogleOffOn the new value of the flag
 */
public void setUseGoogleOffOn(boolean useGoogleOffOn) {
this.useGoogleOffOn = useGoogleOffOn;
}

View file

@ -44,6 +44,7 @@ import org.codelibs.fess.helper.PermissionHelper;
import org.codelibs.fess.mylasta.action.FessUserBean;
import org.codelibs.fess.taglib.FessFunctions;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.PrunedTag;
import org.dbflute.optional.OptionalThing;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.lastaflute.job.LaJob;
@ -582,8 +583,31 @@ public interface FessProp {
String getCrawlerDocumentHtmlPrunedTags();
public default String[] getCrawlerDocumentHtmlPrunedTagsAsArray() {
return getCrawlerDocumentHtmlPrunedTags().split(",");
/**
 * Returns the pruned-tag selectors parsed from {@link #getCrawlerDocumentHtmlPrunedTags()}.
 *
 * The property is a comma-separated list of entries of the form {@code tag},
 * {@code tag#id} or {@code tag.class}. Blank entries are skipped and surrounding
 * whitespace is removed from each entry, so {@code "script, noscript"} yields the tags
 * {@code script} and {@code noscript}. The parsed array is stored in {@code propMap}
 * under the key {@code crawlerDocumentHtmlPrunedTags} and reused on later calls.
 *
 * @return the parsed selectors (an empty array when the property has no usable entry)
 */
public default PrunedTag[] getCrawlerDocumentHtmlPrunedTagsAsArray() {
    PrunedTag[] tags = (PrunedTag[]) propMap.get("crawlerDocumentHtmlPrunedTags");
    if (tags == null) {
        // Trim each entry: without it a leading space becomes part of the tag name and
        // the selector can never match a node.
        tags = split(getCrawlerDocumentHtmlPrunedTags(), ",").get(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).map(v -> {
            // Everything after the first '.' is the class constraint.
            final String[] cssValues = v.split("\\.", 2);
            final String css;
            if (cssValues.length == 2) {
                css = cssValues[1];
            } else {
                css = null;
            }
            // In the part before the '.', everything after the first '#' is the id constraint.
            final String[] idValues = cssValues[0].split("#", 2);
            final String id;
            if (idValues.length == 2) {
                id = idValues[1];
            } else {
                id = null;
            }
            return new PrunedTag(idValues[0], id, css);
        }).toArray(PrunedTag[]::new));
        propMap.put("crawlerDocumentHtmlPrunedTags", tags);
    }
    return tags;
}
String getCrawlerDocumentCacheHtmlMimetypes();

View file

@ -59,6 +59,7 @@ import org.codelibs.fess.indexer.IndexUpdater;
import org.codelibs.fess.job.JobExecutor;
import org.codelibs.fess.ldap.LdapManager;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.mylasta.direction.FessProp;
import org.codelibs.fess.sso.SsoManager;
import org.codelibs.fess.thumbnail.ThumbnailManager;
import org.lastaflute.core.message.MessageManager;
@ -429,6 +430,9 @@ public final class ComponentUtil {
*/
public static void setFessConfig(final FessConfig fessConfig) {
// Replace the statically held config instance (null unregisters it).
ComponentUtil.fessConfig = fessConfig;
if (fessConfig == null) {
// Unregistering the config also empties FessProp.propMap, which holds values
// derived from config properties (e.g. the parsed pruned tags), so they are
// rebuilt from whichever config is registered next.
FessProp.propMap.clear();
}
}
}

View file

@ -0,0 +1,58 @@
/*
* Copyright 2012-2016 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.util;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.stream.StreamUtil;
import org.w3c.dom.Node;
/**
 * A simple selector for DOM nodes that should be pruned from a document.
 *
 * A selector always constrains the node name (compared case-insensitively) and may
 * additionally constrain the {@code id} attribute and/or require one token of the
 * {@code class} attribute. Every configured constraint must hold for a node to match.
 */
public class PrunedTag {
    /** Node name to match, compared case-insensitively. */
    private final String tag;

    /** Required value of the {@code id} attribute, or {@code null} for no id constraint. */
    private final String id;

    /** Required token of the {@code class} attribute, or {@code null} for no class constraint. */
    private final String css;

    /**
     * Creates a selector.
     *
     * @param tag node name to match (case-insensitive)
     * @param id required {@code id} attribute value, or {@code null} to accept any
     * @param css required {@code class} token, or {@code null} to accept any
     */
    public PrunedTag(final String tag, final String id, final String css) {
        this.tag = tag;
        this.id = id;
        this.css = css;
    }

    /**
     * Tests whether the given node satisfies this selector.
     *
     * @param node the node to test; {@code null} never matches
     * @return {@code true} if the node name equals the tag (ignoring case) and every
     *         configured id/class constraint is satisfied
     */
    public boolean matches(final Node node) {
        if (node == null || !tag.equalsIgnoreCase(node.getNodeName())) {
            return false;
        }
        if (id != null && !id.equals(attributeValue(node, "id"))) {
            return false;
        }
        // The class constraint is checked even when an id is configured, so that a
        // selector such as tag#id.class does not silently ignore its class part.
        if (css != null && !hasClassToken(attributeValue(node, "class"), css)) {
            return false;
        }
        return true;
    }

    /** Returns the value of the named attribute, or {@code null} if the node has no such attribute. */
    private static String attributeValue(final Node node, final String name) {
        // Node#getAttributes() is null for anything that is not an Element.
        final org.w3c.dom.NamedNodeMap attributes = node.getAttributes();
        if (attributes == null) {
            return null;
        }
        final Node attribute = attributes.getNamedItem(name);
        return attribute == null ? null : attribute.getNodeValue();
    }

    /** Checks whether a whitespace-separated class attribute value contains the exact token. */
    private static boolean hasClassToken(final String classValue, final String token) {
        if (classValue == null) {
            return false;
        }
        // Class lists may be separated by any whitespace (spaces, tabs, line breaks).
        for (final String candidate : classValue.trim().split("\\s+")) {
            if (token.equals(candidate)) {
                return true;
            }
        }
        return false;
    }
}

View file

@ -34,7 +34,9 @@ import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.unit.UnitFessTestCase;
import org.codelibs.fess.util.ComponentUtil;
import org.cyberneko.html.parsers.DOMParser;
import org.lastaflute.di.core.exception.ComponentNotFoundException;
import org.w3c.dom.Document;
@ -56,25 +58,36 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
final Document document = getDocument(data);
final FessXpathTransformer transformer = new FessXpathTransformer() {
protected String[] getCrawlerDocumentHtmlPrunedTags() {
return new String[0];
ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
private static final long serialVersionUID = 1L;
@Override
public String getCrawlerDocumentHtmlPrunedTags() {
return "";
}
};
});
final FessXpathTransformer transformer = new FessXpathTransformer();
transformer.init();
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
assertEquals(getXmlString(document), getXmlString(pruneNode));
ComponentUtil.setFessConfig(null);
}
public void test_pruneNode_removeNoScript() throws Exception {
final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
final Document document = getDocument(data);
final FessXpathTransformer transformer = new FessXpathTransformer() {
protected String[] getCrawlerDocumentHtmlPrunedTags() {
return new String[] { "noscript" };
ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
private static final long serialVersionUID = 1L;
@Override
public String getCrawlerDocumentHtmlPrunedTags() {
return "noscript";
}
};
});
final FessXpathTransformer transformer = new FessXpathTransformer();
transformer.init();
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
final String docString = getXmlString(document);
@ -87,17 +100,23 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
assertTrue(pnString.contains("foo"));
assertFalse(pnString.contains("<NOSCRIPT>"));
assertFalse(pnString.contains("bar"));
ComponentUtil.setFessConfig(null);
}
public void test_pruneNode_removeScriptAndNoscript() throws Exception {
final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
final Document document = getDocument(data);
final FessXpathTransformer transformer = new FessXpathTransformer() {
protected String[] getCrawlerDocumentHtmlPrunedTags() {
return new String[] { "script", "noscript" };
ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
private static final long serialVersionUID = 1L;
@Override
public String getCrawlerDocumentHtmlPrunedTags() {
return "script,noscript";
}
};
});
final FessXpathTransformer transformer = new FessXpathTransformer();
transformer.init();
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
final String docString = getXmlString(document);
@ -110,6 +129,65 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
assertFalse(pnString.contains("foo"));
assertFalse(pnString.contains("<NOSCRIPT>"));
assertFalse(pnString.contains("bar"));
ComponentUtil.setFessConfig(null);
}
/**
 * Verifies that the pruned-tag entry {@code div#barid} removes only the div whose id is
 * {@code barid} and keeps the div without an id.
 */
public void test_pruneNode_removeDivId() throws Exception {
    final String data = "<html><body><br/><div>foo</div><div id=\"barid\">bar</div></body></html>";
    final Document document = getDocument(data);

    ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public String getCrawlerDocumentHtmlPrunedTags() {
            return "div#barid";
        }
    });
    try {
        final FessXpathTransformer transformer = new FessXpathTransformer();
        transformer.init();

        // Prune a copy so the original document can be compared against the result.
        final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
        final String docString = getXmlString(document);
        final String pnString = getXmlString(pruneNode);

        // The source document is untouched.
        assertTrue(docString.contains("<DIV>"));
        assertTrue(docString.contains("foo"));
        assertTrue(docString.contains("<DIV id=\"barid\">"));
        assertTrue(docString.contains("bar"));

        // Only the div with the matching id has been removed.
        assertTrue(pnString.contains("<DIV>"));
        assertTrue(pnString.contains("foo"));
        assertFalse(pnString.contains("<DIV id=\"barid\">"));
        assertFalse(pnString.contains("bar"));
    } finally {
        // Reset the global config even when an assertion fails, so this test's
        // settings cannot leak into later tests.
        ComponentUtil.setFessConfig(null);
    }
}
/**
 * Verifies that the pruned-tag entry {@code div.barcls} removes only the div carrying the
 * class {@code barcls} and keeps the div without a class.
 */
public void test_pruneNode_removeDivClass() throws Exception {
    final String data = "<html><body><br/><div>foo</div><div class=\"barcls\">bar</div></body></html>";
    final Document document = getDocument(data);

    ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public String getCrawlerDocumentHtmlPrunedTags() {
            return "div.barcls";
        }
    });
    try {
        final FessXpathTransformer transformer = new FessXpathTransformer();
        transformer.init();

        // Prune a copy so the original document can be compared against the result.
        final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
        final String docString = getXmlString(document);
        final String pnString = getXmlString(pruneNode);

        // The source document is untouched.
        assertTrue(docString.contains("<DIV>"));
        assertTrue(docString.contains("foo"));
        assertTrue(docString.contains("<DIV class=\"barcls\">"));
        assertTrue(docString.contains("bar"));

        // Only the div with the matching class has been removed.
        assertTrue(pnString.contains("<DIV>"));
        assertTrue(pnString.contains("foo"));
        assertFalse(pnString.contains("<DIV class=\"barcls\">"));
        assertFalse(pnString.contains("bar"));
    } finally {
        // Reset the global config even when an assertion fails, so this test's
        // settings cannot leak into later tests.
        ComponentUtil.setFessConfig(null);
    }
}
public void test_processGoogleOffOn() throws Exception {