浏览代码

fix #785 prune tags by id/class

Shinsuke Sugaya 8 年之前
父节点
当前提交
a61219b117

+ 5 - 8
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -64,6 +64,7 @@ import org.codelibs.fess.helper.PathMappingHelper;
 import org.codelibs.fess.helper.SystemHelper;
 import org.codelibs.fess.helper.SystemHelper;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.ComponentUtil;
+import org.codelibs.fess.util.PrunedTag;
 import org.cyberneko.html.parsers.DOMParser;
 import org.cyberneko.html.parsers.DOMParser;
 import org.slf4j.Logger;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.slf4j.LoggerFactory;
@@ -507,7 +508,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         final List<Node> removedNodeList = new ArrayList<>();
         final List<Node> removedNodeList = new ArrayList<>();
         for (int i = 0; i < nodeList.getLength(); i++) {
         for (int i = 0; i < nodeList.getLength(); i++) {
             final Node childNode = nodeList.item(i);
             final Node childNode = nodeList.item(i);
-            if (isPrunedTag(childNode.getNodeName())) {
+            if (isPrunedTag(childNode)) {
                 removedNodeList.add(childNode);
                 removedNodeList.add(childNode);
             } else {
             } else {
                 childNodeList.add(childNode);
                 childNodeList.add(childNode);
@@ -525,9 +526,9 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         return node;
         return node;
     }
     }
 
 
-    protected boolean isPrunedTag(final String tagName) {
-        for (final String name : getCrawlerDocumentHtmlPrunedTags()) {
-            if (name.equalsIgnoreCase(tagName)) {
+    protected boolean isPrunedTag(final Node node) {
+        for (final PrunedTag prunedTag : fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray()) {
+            if (prunedTag.matches(node)) {
                 return true;
                 return true;
             }
             }
         }
         }
@@ -655,10 +656,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         return b[0] == (byte) 0xEF && b[1] == (byte) 0xBB && b[2] == (byte) 0xBF;
         return b[0] == (byte) 0xEF && b[1] == (byte) 0xBB && b[2] == (byte) 0xBF;
     }
     }
 
 
-    protected String[] getCrawlerDocumentHtmlPrunedTags() {
-        return fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
-    }
-
     public void setUseGoogleOffOn(boolean useGoogleOffOn) {
     public void setUseGoogleOffOn(boolean useGoogleOffOn) {
         this.useGoogleOffOn = useGoogleOffOn;
         this.useGoogleOffOn = useGoogleOffOn;
     }
     }

+ 26 - 2
src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java

@@ -44,6 +44,7 @@ import org.codelibs.fess.helper.PermissionHelper;
 import org.codelibs.fess.mylasta.action.FessUserBean;
 import org.codelibs.fess.mylasta.action.FessUserBean;
 import org.codelibs.fess.taglib.FessFunctions;
 import org.codelibs.fess.taglib.FessFunctions;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.ComponentUtil;
+import org.codelibs.fess.util.PrunedTag;
 import org.dbflute.optional.OptionalThing;
 import org.dbflute.optional.OptionalThing;
 import org.elasticsearch.action.search.SearchRequestBuilder;
 import org.elasticsearch.action.search.SearchRequestBuilder;
 import org.lastaflute.job.LaJob;
 import org.lastaflute.job.LaJob;
@@ -582,8 +583,31 @@ public interface FessProp {
 
 
     String getCrawlerDocumentHtmlPrunedTags();
     String getCrawlerDocumentHtmlPrunedTags();
 
 
-    public default String[] getCrawlerDocumentHtmlPrunedTagsAsArray() {
-        return getCrawlerDocumentHtmlPrunedTags().split(",");
+    /**
+     * Parses the comma-separated pruned-tag setting into {@link PrunedTag} rules.
+     *
+     * Each entry has the form {@code tag}, {@code tag#id}, {@code tag.class} or
+     * {@code tag#id.class}. Blank entries are skipped and surrounding whitespace is
+     * ignored, so {@code "script, noscript"} yields the same rules as {@code "script,noscript"}.
+     * The parsed array is cached in {@code propMap} under the key
+     * {@code crawlerDocumentHtmlPrunedTags} and reused on later calls.
+     *
+     * @return the parsed rules
+     */
+    public default PrunedTag[] getCrawlerDocumentHtmlPrunedTagsAsArray() {
+        PrunedTag[] tags = (PrunedTag[]) propMap.get("crawlerDocumentHtmlPrunedTags");
+        if (tags == null) {
+            tags = split(getCrawlerDocumentHtmlPrunedTags(), ",").get(stream -> stream.filter(StringUtil::isNotBlank).map(v -> {
+                // Trim first: an untrimmed tag such as " noscript" would never equal a node name.
+                final String selector = v.trim();
+
+                // Everything after the first '.' is the class name.
+                final String[] cssValues = selector.split("\\.", 2);
+                final String css;
+                if (cssValues.length == 2) {
+                    css = cssValues[1];
+                } else {
+                    css = null;
+                }
+
+                // In the remaining part, everything after the first '#' is the id.
+                final String[] idValues = cssValues[0].split("#", 2);
+                final String id;
+                if (idValues.length == 2) {
+                    id = idValues[1];
+                } else {
+                    id = null;
+                }
+
+                return new PrunedTag(idValues[0], id, css);
+            }).toArray(n -> new PrunedTag[n]));
+            propMap.put("crawlerDocumentHtmlPrunedTags", tags);
+        }
+        return tags;
     }
     }
 
 
     String getCrawlerDocumentCacheHtmlMimetypes();
     String getCrawlerDocumentCacheHtmlMimetypes();

+ 4 - 0
src/main/java/org/codelibs/fess/util/ComponentUtil.java

@@ -59,6 +59,7 @@ import org.codelibs.fess.indexer.IndexUpdater;
 import org.codelibs.fess.job.JobExecutor;
 import org.codelibs.fess.job.JobExecutor;
 import org.codelibs.fess.ldap.LdapManager;
 import org.codelibs.fess.ldap.LdapManager;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.mylasta.direction.FessConfig;
+import org.codelibs.fess.mylasta.direction.FessProp;
 import org.codelibs.fess.sso.SsoManager;
 import org.codelibs.fess.sso.SsoManager;
 import org.codelibs.fess.thumbnail.ThumbnailManager;
 import org.codelibs.fess.thumbnail.ThumbnailManager;
 import org.lastaflute.core.message.MessageManager;
 import org.lastaflute.core.message.MessageManager;
@@ -429,6 +430,9 @@ public final class ComponentUtil {
      */
      */
     public static void setFessConfig(final FessConfig fessConfig) {
     public static void setFessConfig(final FessConfig fessConfig) {
         ComponentUtil.fessConfig = fessConfig;
         ComponentUtil.fessConfig = fessConfig;
+        if (fessConfig == null) {
+            FessProp.propMap.clear();
+        }
     }
     }
 
 
 }
 }

+ 58 - 0
src/main/java/org/codelibs/fess/util/PrunedTag.java

@@ -0,0 +1,58 @@
+/*
+ * Copyright 2012-2016 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.util;
+
+import org.codelibs.core.lang.StringUtil;
+import org.codelibs.core.stream.StreamUtil;
+import org.w3c.dom.Node;
+
+/**
+ * A single pruning rule describing which elements to drop from a parsed HTML document.
+ *
+ * A rule always names a tag and may additionally require an id and/or a CSS class.
+ * Tag names are compared case-insensitively; id and class values are compared exactly.
+ */
+public class PrunedTag {
+    private final String tag;
+    private final String id;
+    private final String css;
+
+    /**
+     * @param tag element name to match (case-insensitive); must not be null
+     * @param id required value of the {@code id} attribute, or null for no id constraint
+     * @param css class name that must be one of the element's classes, or null for no class constraint
+     */
+    public PrunedTag(final String tag, final String id, final String css) {
+        this.tag = tag;
+        this.id = id;
+        this.css = css;
+    }
+
+    /**
+     * Tests whether the given node satisfies every constraint of this rule.
+     *
+     * @param node the DOM node to test
+     * @return true if the tag name matches and the id and class constraints (when present) both hold
+     */
+    public boolean matches(final Node node) {
+        if (!tag.equalsIgnoreCase(node.getNodeName())) {
+            return false;
+        }
+        if (id != null && !id.equals(attributeValue(node, "id"))) {
+            return false;
+        }
+        // The class constraint is checked even when an id constraint was given.
+        return css == null || hasClass(attributeValue(node, "class"), css);
+    }
+
+    /** Returns the value of the named attribute, or null if the node has no such attribute. */
+    private static String attributeValue(final Node node, final String name) {
+        // getAttributes() is null for nodes that are not elements (text, comment, document, ...).
+        if (node.getAttributes() == null) {
+            return null;
+        }
+        final Node attr = node.getAttributes().getNamedItem(name);
+        return attr == null ? null : attr.getNodeValue();
+    }
+
+    /** Returns true if the whitespace-separated class list contains the given class name. */
+    private static boolean hasClass(final String classValue, final String name) {
+        if (classValue == null) {
+            return false;
+        }
+        final String trimmed = classValue.trim();
+        if (trimmed.isEmpty()) {
+            return false;
+        }
+        // Class lists may be separated by any run of whitespace, not just a single space.
+        for (final String token : trimmed.split("\\s+")) {
+            if (name.equals(token)) {
+                return true;
+            }
+        }
+        return false;
+    }
+}

+ 90 - 12
src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java

@@ -34,7 +34,9 @@ import org.codelibs.fess.crawler.entity.RequestData;
 import org.codelibs.fess.crawler.entity.ResponseData;
 import org.codelibs.fess.crawler.entity.ResponseData;
 import org.codelibs.fess.crawler.entity.ResultData;
 import org.codelibs.fess.crawler.entity.ResultData;
 import org.codelibs.fess.crawler.exception.ChildUrlsException;
 import org.codelibs.fess.crawler.exception.ChildUrlsException;
+import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.unit.UnitFessTestCase;
 import org.codelibs.fess.unit.UnitFessTestCase;
+import org.codelibs.fess.util.ComponentUtil;
 import org.cyberneko.html.parsers.DOMParser;
 import org.cyberneko.html.parsers.DOMParser;
 import org.lastaflute.di.core.exception.ComponentNotFoundException;
 import org.lastaflute.di.core.exception.ComponentNotFoundException;
 import org.w3c.dom.Document;
 import org.w3c.dom.Document;
@@ -56,25 +58,36 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
         final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
         final Document document = getDocument(data);
         final Document document = getDocument(data);
 
 
-        final FessXpathTransformer transformer = new FessXpathTransformer() {
-            protected String[] getCrawlerDocumentHtmlPrunedTags() {
-                return new String[0];
+        ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public String getCrawlerDocumentHtmlPrunedTags() {
+                return "";
             }
             }
-        };
+        });
+        final FessXpathTransformer transformer = new FessXpathTransformer();
+        transformer.init();
 
 
         final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
         final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
         assertEquals(getXmlString(document), getXmlString(pruneNode));
         assertEquals(getXmlString(document), getXmlString(pruneNode));
+        ComponentUtil.setFessConfig(null);
     }
     }
 
 
     public void test_pruneNode_removeNoScript() throws Exception {
     public void test_pruneNode_removeNoScript() throws Exception {
         final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
         final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
         final Document document = getDocument(data);
         final Document document = getDocument(data);
 
 
-        final FessXpathTransformer transformer = new FessXpathTransformer() {
-            protected String[] getCrawlerDocumentHtmlPrunedTags() {
-                return new String[] { "noscript" };
+        ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public String getCrawlerDocumentHtmlPrunedTags() {
+                return "noscript";
             }
             }
-        };
+        });
+        final FessXpathTransformer transformer = new FessXpathTransformer();
+        transformer.init();
 
 
         final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
         final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
         final String docString = getXmlString(document);
         final String docString = getXmlString(document);
@@ -87,17 +100,23 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         assertTrue(pnString.contains("foo"));
         assertTrue(pnString.contains("foo"));
         assertFalse(pnString.contains("<NOSCRIPT>"));
         assertFalse(pnString.contains("<NOSCRIPT>"));
         assertFalse(pnString.contains("bar"));
         assertFalse(pnString.contains("bar"));
+        ComponentUtil.setFessConfig(null);
     }
     }
 
 
     public void test_pruneNode_removeScriptAndNoscript() throws Exception {
     public void test_pruneNode_removeScriptAndNoscript() throws Exception {
         final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
         final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
         final Document document = getDocument(data);
         final Document document = getDocument(data);
 
 
-        final FessXpathTransformer transformer = new FessXpathTransformer() {
-            protected String[] getCrawlerDocumentHtmlPrunedTags() {
-                return new String[] { "script", "noscript" };
+        ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public String getCrawlerDocumentHtmlPrunedTags() {
+                return "script,noscript";
             }
             }
-        };
+        });
+        final FessXpathTransformer transformer = new FessXpathTransformer();
+        transformer.init();
 
 
         final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
         final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
         final String docString = getXmlString(document);
         final String docString = getXmlString(document);
@@ -110,6 +129,65 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         assertFalse(pnString.contains("foo"));
         assertFalse(pnString.contains("foo"));
         assertFalse(pnString.contains("<NOSCRIPT>"));
         assertFalse(pnString.contains("<NOSCRIPT>"));
         assertFalse(pnString.contains("bar"));
         assertFalse(pnString.contains("bar"));
+        ComponentUtil.setFessConfig(null);
+    }
+
+    /**
+     * Checks that a {@code div#barid} rule removes the div with that id and keeps the plain div.
+     */
+    public void test_pruneNode_removeDivId() throws Exception {
+        final String data = "<html><body><br/><div>foo</div><div id=\"barid\">bar</div></body></html>";
+        final Document document = getDocument(data);
+
+        ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public String getCrawlerDocumentHtmlPrunedTags() {
+                return "div#barid";
+            }
+        });
+        try {
+            final FessXpathTransformer transformer = new FessXpathTransformer();
+            transformer.init();
+
+            final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
+            final String docString = getXmlString(document);
+            final String pnString = getXmlString(pruneNode);
+            assertTrue(docString.contains("<DIV>"));
+            assertTrue(docString.contains("foo"));
+            assertTrue(docString.contains("<DIV id=\"barid\">"));
+            assertTrue(docString.contains("bar"));
+            assertTrue(pnString.contains("<DIV>"));
+            assertTrue(pnString.contains("foo"));
+            assertFalse(pnString.contains("<DIV id=\"barid\">"));
+            assertFalse(pnString.contains("bar"));
+        } finally {
+            // Reset the global config even when an assertion fails, so it cannot leak into other tests.
+            ComponentUtil.setFessConfig(null);
+        }
+    }
+
+    /**
+     * Checks that a {@code div.barcls} rule removes the div with that class and keeps the plain div.
+     */
+    public void test_pruneNode_removeDivClass() throws Exception {
+        final String data = "<html><body><br/><div>foo</div><div class=\"barcls\">bar</div></body></html>";
+        final Document document = getDocument(data);
+
+        ComponentUtil.setFessConfig(new FessConfig.SimpleImpl() {
+            private static final long serialVersionUID = 1L;
+
+            @Override
+            public String getCrawlerDocumentHtmlPrunedTags() {
+                return "div.barcls";
+            }
+        });
+        try {
+            final FessXpathTransformer transformer = new FessXpathTransformer();
+            transformer.init();
+
+            final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
+            final String docString = getXmlString(document);
+            final String pnString = getXmlString(pruneNode);
+            assertTrue(docString.contains("<DIV>"));
+            assertTrue(docString.contains("foo"));
+            assertTrue(docString.contains("<DIV class=\"barcls\">"));
+            assertTrue(docString.contains("bar"));
+            assertTrue(pnString.contains("<DIV>"));
+            assertTrue(pnString.contains("foo"));
+            assertFalse(pnString.contains("<DIV class=\"barcls\">"));
+            assertFalse(pnString.contains("bar"));
+        } finally {
+            // Reset the global config even when an assertion fails, so it cannot leak into other tests.
+            ComponentUtil.setFessConfig(null);
+        }
     }
     }
 
 
     public void test_processGoogleOffOn() throws Exception {
     public void test_processGoogleOffOn() throws Exception {