Merge branch '10.3.x'
This commit is contained in:
commit
58f11530cb
5 changed files with 183 additions and 22 deletions
|
@ -64,6 +64,7 @@ import org.codelibs.fess.helper.PathMappingHelper;
|
|||
import org.codelibs.fess.helper.SystemHelper;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.codelibs.fess.util.PrunedTag;
|
||||
import org.cyberneko.html.parsers.DOMParser;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -507,7 +508,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
final List<Node> removedNodeList = new ArrayList<>();
|
||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||
final Node childNode = nodeList.item(i);
|
||||
if (isPrunedTag(childNode.getNodeName())) {
|
||||
if (isPrunedTag(childNode)) {
|
||||
removedNodeList.add(childNode);
|
||||
} else {
|
||||
childNodeList.add(childNode);
|
||||
|
@ -525,9 +526,9 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
return node;
|
||||
}
|
||||
|
||||
protected boolean isPrunedTag(final String tagName) {
|
||||
for (final String name : getCrawlerDocumentHtmlPrunedTags()) {
|
||||
if (name.equalsIgnoreCase(tagName)) {
|
||||
protected boolean isPrunedTag(final Node node) {
|
||||
for (final PrunedTag prunedTag : fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray()) {
|
||||
if (prunedTag.matches(node)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -655,10 +656,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
return b[0] == (byte) 0xEF && b[1] == (byte) 0xBB && b[2] == (byte) 0xBF;
|
||||
}
|
||||
|
||||
protected String[] getCrawlerDocumentHtmlPrunedTags() {
|
||||
return fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
|
||||
}
|
||||
|
||||
public void setUseGoogleOffOn(boolean useGoogleOffOn) {
|
||||
this.useGoogleOffOn = useGoogleOffOn;
|
||||
}
|
||||
|
|
|
@ -44,6 +44,7 @@ import org.codelibs.fess.helper.PermissionHelper;
|
|||
import org.codelibs.fess.mylasta.action.FessUserBean;
|
||||
import org.codelibs.fess.taglib.FessFunctions;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.codelibs.fess.util.PrunedTag;
|
||||
import org.dbflute.optional.OptionalThing;
|
||||
import org.elasticsearch.action.search.SearchRequestBuilder;
|
||||
import org.lastaflute.job.LaJob;
|
||||
|
@ -582,8 +583,31 @@ public interface FessProp {
|
|||
|
||||
String getCrawlerDocumentHtmlPrunedTags();
|
||||
|
||||
public default String[] getCrawlerDocumentHtmlPrunedTagsAsArray() {
|
||||
return getCrawlerDocumentHtmlPrunedTags().split(",");
|
||||
public default PrunedTag[] getCrawlerDocumentHtmlPrunedTagsAsArray() {
|
||||
PrunedTag[] tags = (PrunedTag[]) propMap.get("crawlerDocumentHtmlPrunedTags");
|
||||
if (tags == null) {
|
||||
tags = split(getCrawlerDocumentHtmlPrunedTags(), ",").get(stream -> stream.filter(StringUtil::isNotBlank).map(v -> {
|
||||
final String[] cssValues = v.split("\\.", 2);
|
||||
final String css;
|
||||
if (cssValues.length == 2) {
|
||||
css = cssValues[1];
|
||||
} else {
|
||||
css = null;
|
||||
}
|
||||
|
||||
final String[] idValues = cssValues[0].split("#", 2);
|
||||
final String id;
|
||||
if (idValues.length == 2) {
|
||||
id = idValues[1];
|
||||
} else {
|
||||
id = null;
|
||||
}
|
||||
|
||||
return new PrunedTag(idValues[0], id, css);
|
||||
}).toArray(n -> new PrunedTag[n]));
|
||||
propMap.put("crawlerDocumentHtmlPrunedTags", tags);
|
||||
}
|
||||
return tags;
|
||||
}
|
||||
|
||||
String getCrawlerDocumentCacheHtmlMimetypes();
|
||||
|
|
|
@ -59,6 +59,7 @@ import org.codelibs.fess.indexer.IndexUpdater;
|
|||
import org.codelibs.fess.job.JobExecutor;
|
||||
import org.codelibs.fess.ldap.LdapManager;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.mylasta.direction.FessProp;
|
||||
import org.codelibs.fess.sso.SsoManager;
|
||||
import org.codelibs.fess.thumbnail.ThumbnailManager;
|
||||
import org.lastaflute.core.message.MessageManager;
|
||||
|
@ -429,6 +430,9 @@ public final class ComponentUtil {
|
|||
*/
|
||||
public static void setFessConfig(final FessConfig fessConfig) {
|
||||
ComponentUtil.fessConfig = fessConfig;
|
||||
if (fessConfig == null) {
|
||||
FessProp.propMap.clear();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
58
src/main/java/org/codelibs/fess/util/PrunedTag.java
Normal file
58
src/main/java/org/codelibs/fess/util/PrunedTag.java
Normal file
|
@ -0,0 +1,58 @@
|
|||
/*
|
||||
* Copyright 2012-2016 CodeLibs Project and the Others.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific language
|
||||
* governing permissions and limitations under the License.
|
||||
*/
|
||||
package org.codelibs.fess.util;
|
||||
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.core.stream.StreamUtil;
|
||||
import org.w3c.dom.Node;
|
||||
|
||||
/**
 * Describes an HTML element that should be pruned from a crawled document.
 *
 * A pruned tag always carries a tag name and may additionally be restricted by
 * an {@code id} attribute value and/or a CSS class name (for example the
 * configuration entries {@code noscript}, {@code div#barid} or
 * {@code div.barcls}).
 */
public class PrunedTag {
    /** Tag name, compared case-insensitively against the node name. */
    private final String tag;

    /** Required value of the {@code id} attribute, or {@code null} for any. */
    private final String id;

    /** Required CSS class name, or {@code null} for any. */
    private final String css;

    /**
     * Creates a pruned-tag definition.
     *
     * @param tag tag name to match (case-insensitive)
     * @param id required {@code id} attribute value, or {@code null} if the id is not checked
     * @param css required CSS class name, or {@code null} if the class is not checked
     */
    public PrunedTag(final String tag, final String id, final String css) {
        this.tag = tag;
        this.id = id;
        this.css = css;
    }

    /**
     * Tests whether the given node is selected by this definition.
     *
     * The node name must equal the tag name (ignoring case). When an id and/or
     * a CSS class is configured, every configured constraint must hold: the
     * {@code id} attribute must equal the id exactly and the {@code class}
     * attribute must contain the class as one of its whitespace-separated
     * tokens.
     *
     * @param node node to test
     * @return {@code true} if the node should be pruned
     */
    public boolean matches(final Node node) {
        if (!tag.equalsIgnoreCase(node.getNodeName())) {
            return false;
        }
        if (id == null && css == null) {
            return true;
        }
        if (id != null && !id.equals(attributeValue(node, "id"))) {
            return false;
        }
        // The class constraint is checked even when an id is configured, so
        // that "tag#id.cls" does not select elements lacking the class.
        return css == null || containsCssClass(attributeValue(node, "class"));
    }

    /**
     * Returns the value of the named attribute, or {@code null} when the node
     * has no such attribute. Only element nodes carry attributes; other node
     * types (e.g. processing instructions) return {@code null} from
     * {@code getAttributes()}, which is treated as "no attribute".
     */
    private static String attributeValue(final Node node, final String name) {
        if (node.getAttributes() == null) {
            return null;
        }
        final Node attr = node.getAttributes().getNamedItem(name);
        return attr == null ? null : attr.getNodeValue();
    }

    /**
     * Checks whether the configured CSS class is one of the tokens of the
     * given {@code class} attribute value. Tokens may be separated by any
     * whitespace (space, tab, line break), not only a single space.
     */
    private boolean containsCssClass(final String classValue) {
        if (classValue == null || classValue.trim().isEmpty()) {
            return false;
        }
        for (final String token : classValue.trim().split("\\s+")) {
            if (css.equals(token)) {
                return true;
            }
        }
        return false;
    }
}
|
|
@ -34,7 +34,9 @@ import org.codelibs.fess.crawler.entity.RequestData;
|
|||
import org.codelibs.fess.crawler.entity.ResponseData;
|
||||
import org.codelibs.fess.crawler.entity.ResultData;
|
||||
import org.codelibs.fess.crawler.exception.ChildUrlsException;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.unit.UnitFessTestCase;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.cyberneko.html.parsers.DOMParser;
|
||||
import org.lastaflute.di.core.exception.ComponentNotFoundException;
|
||||
import org.w3c.dom.Document;
|
||||
|
@ -56,25 +58,36 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
|
||||
final Document document = getDocument(data);
|
||||
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer() {
|
||||
protected String[] getCrawlerDocumentHtmlPrunedTags() {
|
||||
return new String[0];
|
||||
ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public String getCrawlerDocumentHtmlPrunedTags() {
|
||||
return "";
|
||||
}
|
||||
};
|
||||
});
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
transformer.init();
|
||||
|
||||
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
|
||||
assertEquals(getXmlString(document), getXmlString(pruneNode));
|
||||
ComponentUtil.setFessConfig(null);
|
||||
}
|
||||
|
||||
public void test_pruneNode_removeNoScript() throws Exception {
|
||||
final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
|
||||
final Document document = getDocument(data);
|
||||
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer() {
|
||||
protected String[] getCrawlerDocumentHtmlPrunedTags() {
|
||||
return new String[] { "noscript" };
|
||||
ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public String getCrawlerDocumentHtmlPrunedTags() {
|
||||
return "noscript";
|
||||
}
|
||||
};
|
||||
});
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
transformer.init();
|
||||
|
||||
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
|
||||
final String docString = getXmlString(document);
|
||||
|
@ -87,17 +100,23 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
assertTrue(pnString.contains("foo"));
|
||||
assertFalse(pnString.contains("<NOSCRIPT>"));
|
||||
assertFalse(pnString.contains("bar"));
|
||||
ComponentUtil.setFessConfig(null);
|
||||
}
|
||||
|
||||
public void test_pruneNode_removeScriptAndNoscript() throws Exception {
|
||||
final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
|
||||
final Document document = getDocument(data);
|
||||
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer() {
|
||||
protected String[] getCrawlerDocumentHtmlPrunedTags() {
|
||||
return new String[] { "script", "noscript" };
|
||||
ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public String getCrawlerDocumentHtmlPrunedTags() {
|
||||
return "script,noscript";
|
||||
}
|
||||
};
|
||||
});
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
transformer.init();
|
||||
|
||||
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
|
||||
final String docString = getXmlString(document);
|
||||
|
@ -110,6 +129,65 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
assertFalse(pnString.contains("foo"));
|
||||
assertFalse(pnString.contains("<NOSCRIPT>"));
|
||||
assertFalse(pnString.contains("bar"));
|
||||
ComponentUtil.setFessConfig(null);
|
||||
}
|
||||
|
||||
public void test_pruneNode_removeDivId() throws Exception {
|
||||
final String data = "<html><body><br/><div>foo</div><div id=\"barid\">bar</div></body></html>";
|
||||
final Document document = getDocument(data);
|
||||
|
||||
ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public String getCrawlerDocumentHtmlPrunedTags() {
|
||||
return "div#barid";
|
||||
}
|
||||
});
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
transformer.init();
|
||||
|
||||
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
|
||||
final String docString = getXmlString(document);
|
||||
final String pnString = getXmlString(pruneNode);
|
||||
assertTrue(docString.contains("<DIV>"));
|
||||
assertTrue(docString.contains("foo"));
|
||||
assertTrue(docString.contains("<DIV id=\"barid\">"));
|
||||
assertTrue(docString.contains("bar"));
|
||||
assertTrue(pnString.contains("<DIV>"));
|
||||
assertTrue(pnString.contains("foo"));
|
||||
assertFalse(pnString.contains("<DIV id=\"barid\">"));
|
||||
assertFalse(pnString.contains("bar"));
|
||||
ComponentUtil.setFessConfig(null);
|
||||
}
|
||||
|
||||
public void test_pruneNode_removeDivClass() throws Exception {
|
||||
final String data = "<html><body><br/><div>foo</div><div class=\"barcls\">bar</div></body></html>";
|
||||
final Document document = getDocument(data);
|
||||
|
||||
ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
||||
@Override
|
||||
public String getCrawlerDocumentHtmlPrunedTags() {
|
||||
return "div.barcls";
|
||||
}
|
||||
});
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
transformer.init();
|
||||
|
||||
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
|
||||
final String docString = getXmlString(document);
|
||||
final String pnString = getXmlString(pruneNode);
|
||||
assertTrue(docString.contains("<DIV>"));
|
||||
assertTrue(docString.contains("foo"));
|
||||
assertTrue(docString.contains("<DIV class=\"barcls\">"));
|
||||
assertTrue(docString.contains("bar"));
|
||||
assertTrue(pnString.contains("<DIV>"));
|
||||
assertTrue(pnString.contains("foo"));
|
||||
assertFalse(pnString.contains("<DIV class=\"barcls\">"));
|
||||
assertFalse(pnString.contains("bar"));
|
||||
ComponentUtil.setFessConfig(null);
|
||||
}
|
||||
|
||||
public void test_processGoogleOffOn() throws Exception {
|
||||
|
|
Loading…
Add table
Reference in a new issue