Browse Source

fix #791 add parseTextContext

Shinsuke Sugaya 8 years ago
parent
commit
f4bd1be6ff

+ 20 - 4
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -237,6 +237,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
         final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
         final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
+        final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
         String url = responseData.getUrl();
         final String indexingTarget = crawlingConfig.getIndexingTarget(url);
         url = pathMappingHelper.replaceUrl(sessionId, url);
@@ -344,7 +345,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         for (final String labelType : crawlingConfig.getLabelTypeValues()) {
             labelTypeSet.add(labelType);
         }
-        final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
         labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
         putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
         // role: roleType
@@ -450,8 +450,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
             for (int i = 0; i < list.getLength(); i++) {
                 if (buf == null) {
                     buf = new UnsafeStringBuilder(1000);
-                } else {
-                    buf.append(' ');
                 }
                 Node node = list.item(i).cloneNode(true);
                 if (useGoogleOffOn) {
@@ -460,7 +458,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
                 if (pruned) {
                     node = pruneNode(node);
                 }
-                buf.append(node.getTextContent());
+                paseTextContent(node, buf);
             }
         } catch (final Exception e) {
             logger.warn("Could not parse a value of " + xpath);
@@ -471,6 +469,24 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         return buf.toUnsafeString().trim();
     }
 
+    /**
+     * Collects the text below {@code node} into {@code buf}, one leaf at a time.
+     * <p>
+     * Nodes with children are walked recursively. For a childless node the
+     * trimmed {@link Node#getTextContent()} is appended, preceded by a single
+     * space, unless it is {@code null} or empty after trimming. Because every
+     * fragment gets a leading space, the caller is expected to trim the result.
+     * <p>
+     * Comment and processing-instruction nodes are skipped: the
+     * {@code Element#getTextContent()} call this method replaces never included
+     * them, so without the check their content would leak into the extracted text.
+     * <p>
+     * NOTE(review): the name is a misspelling of "parseTextContent"; it is kept
+     * because it is protected API that is already invoked under this name.
+     *
+     * @param node the DOM node to read text from
+     * @param buf the builder that receives the extracted text
+     */
+    protected void paseTextContent(final Node node, final UnsafeStringBuilder buf) {
+        final short nodeType = node.getNodeType();
+        if (nodeType == Node.COMMENT_NODE || nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
+            // not part of the visible text of a document
+            return;
+        }
+        if (node.hasChildNodes()) {
+            final NodeList children = node.getChildNodes();
+            for (int i = 0; i < children.getLength(); i++) {
+                paseTextContent(children.item(i), buf);
+            }
+            return;
+        }
+        final String value = node.getTextContent();
+        if (value == null) {
+            return;
+        }
+        final String content = value.trim();
+        if (content.length() > 0) {
+            buf.append(' ').append(content);
+        }
+    }
+
     protected Node processGoogleOffOn(final Node node, final ValueHolder<Boolean> flag) {
         final NodeList nodeList = node.getChildNodes();
         List<Node> removedNodeList = null;

+ 1 - 1
src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java

@@ -173,7 +173,7 @@ public class LabelTypeHelper {
         }
     }
 
-    protected static class LabelTypePattern {
+    public static class LabelTypePattern {
 
         private final String value;
 

+ 7 - 0
src/main/java/org/codelibs/fess/util/MemoryUtil.java

@@ -35,4 +35,11 @@ public final class MemoryUtil {
     public static String byteCountToDisplaySize(final long size) {
         return FileUtils.byteCountToDisplaySize(size).replace(" ", StringUtil.EMPTY);
     }
+
+    /**
+     * Returns the amount of memory currently in use by the JVM, computed as
+     * {@link Runtime#totalMemory()} minus {@link Runtime#freeMemory()}.
+     *
+     * @return used memory in bytes
+     */
+    public static long getUsedMemory() {
+        final Runtime jvm = Runtime.getRuntime();
+        // sample the free amount first, then subtract it from the total
+        final long unused = jvm.freeMemory();
+        return jvm.totalMemory() - unused;
+    }
 }

+ 42 - 0
src/main/java/org/codelibs/fess/util/PrunedTag.java

@@ -55,4 +55,46 @@ public class PrunedTag {
         }
         return false;
     }
+
+    /**
+     * Renders this tag as {@code PrunedTag [tag=..., id=..., css=...]}.
+     *
+     * @return a human-readable description of the three fields
+     */
+    @Override
+    public String toString() {
+        final StringBuilder text = new StringBuilder("PrunedTag [tag=");
+        text.append(tag).append(", id=").append(id).append(", css=").append(css).append("]");
+        return text.toString();
+    }
+
+    /**
+     * Combines the hash codes of {@code css}, {@code id} and {@code tag}
+     * (in that order) with the multiplier 31; a {@code null} field counts as 0.
+     *
+     * @return the hash code of this tag
+     */
+    @Override
+    public int hashCode() {
+        int hash = 1;
+        for (final Object field : new Object[] { css, id, tag }) {
+            hash = 31 * hash + (field == null ? 0 : field.hashCode());
+        }
+        return hash;
+    }
+
+    /**
+     * Two tags are equal when they are of the same class and their
+     * {@code css}, {@code id} and {@code tag} fields are pairwise equal
+     * (two {@code null} values count as equal).
+     *
+     * @param obj the object to compare with
+     * @return {@code true} if {@code obj} is an equal tag
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (obj == null || getClass() != obj.getClass()) {
+            return false;
+        }
+        final PrunedTag that = (PrunedTag) obj;
+        // compared in the order css, id, tag; stops at the first mismatch
+        return (css == null ? that.css == null : css.equals(that.css))
+                && (id == null ? that.id == null : id.equals(that.id))
+                && (tag == null ? that.tag == null : tag.equals(that.tag));
+    }
 }

+ 2 - 0
src/main/resources/fess_config.properties

@@ -30,6 +30,7 @@ jvm.crawler.options=\
 -XX:+UseParNewGC\n\
 -XX:+UseTLAB\n\
 -XX:+DisableExplicitGC\n\
+-XX:+HeapDumpOnOutOfMemoryError\n\
 -XX:-OmitStackTraceInFastThrow\n\
 -Djcifs.smb.client.connTimeout=60000\n\
 -Djcifs.smb.client.soTimeout=35000\n\
@@ -49,6 +50,7 @@ jvm.suggest.options=\
 -XX:+UseParNewGC\n\
 -XX:+UseTLAB\n\
 -XX:+DisableExplicitGC\n\
+-XX:+HeapDumpOnOutOfMemoryError\n\
 -Dgroovy.use.classvalue=true\n\
 
 

+ 98 - 6
src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java

@@ -17,7 +17,9 @@ package org.codelibs.fess.crawler.transformer;
 
 import java.io.ByteArrayInputStream;
 import java.io.StringWriter;
+import java.lang.reflect.Field;
 import java.util.ArrayList;
+import java.util.Date;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -28,30 +30,90 @@ import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
 
+import org.codelibs.core.lang.ClassUtil;
+import org.codelibs.core.lang.FieldUtil;
 import org.codelibs.core.misc.ValueHolder;
 import org.codelibs.fess.crawler.builder.RequestDataBuilder;
 import org.codelibs.fess.crawler.entity.RequestData;
 import org.codelibs.fess.crawler.entity.ResponseData;
 import org.codelibs.fess.crawler.entity.ResultData;
 import org.codelibs.fess.crawler.exception.ChildUrlsException;
+import org.codelibs.fess.es.config.exentity.LabelType;
+import org.codelibs.fess.es.config.exentity.WebConfig;
+import org.codelibs.fess.helper.CrawlingConfigHelper;
+import org.codelibs.fess.helper.CrawlingInfoHelper;
+import org.codelibs.fess.helper.DocumentHelper;
+import org.codelibs.fess.helper.FileTypeHelper;
+import org.codelibs.fess.helper.LabelTypeHelper;
+import org.codelibs.fess.helper.LabelTypeHelper.LabelTypePattern;
+import org.codelibs.fess.helper.PathMappingHelper;
+import org.codelibs.fess.helper.SystemHelper;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.unit.UnitFessTestCase;
 import org.codelibs.fess.util.ComponentUtil;
+import org.codelibs.fess.util.MemoryUtil;
 import org.cyberneko.html.parsers.DOMParser;
 import org.lastaflute.di.core.exception.ComponentNotFoundException;
+import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
 import org.w3c.dom.Node;
 import org.xml.sax.InputSource;
 
 public class FessXpathTransformerTest extends UnitFessTestCase {
-    public FessXpathTransformer fessXpathTransformer;
+    private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformerTest.class);
 
-    @Override
-    public void setUp() throws Exception {
-        super.setUp();
-        fessXpathTransformer = new FessXpathTransformer();
+    /**
+     * Runs {@code transform} 10000 times on the same small HTML document and
+     * then asserts that {@code MemoryUtil.getUsedMemory()} reports less than
+     * 100000000 bytes.
+     * <p>
+     * NOTE(review): presumably a regression check for a memory leak in the
+     * transformer; the absolute threshold depends on the JVM running the test
+     * and {@code System.gc()} is only a hint, so the result may vary by environment.
+     */
+    public void test_transform() throws Exception {
+        String data = "<html><head><title>Test</title></head><body><h1>Header1</h1><p>This is a pen.</p></body></html>";
+
+        final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
         fessXpathTransformer.init();
-        fessXpathTransformer.convertUrlMap.put("feed:", "http:");
+        // register the helper components that are looked up through ComponentUtil
+        SingletonLaContainerFactory.getContainer().register(CrawlingInfoHelper.class, "crawlingInfoHelper");
+        SingletonLaContainerFactory.getContainer().register(PathMappingHelper.class, "pathMappingHelper");
+        SingletonLaContainerFactory.getContainer().register(CrawlingConfigHelper.class, "crawlingConfigHelper");
+        SingletonLaContainerFactory.getContainer().register(SystemHelper.class, "systemHelper");
+        SingletonLaContainerFactory.getContainer().register(FileTypeHelper.class, "fileTypeHelper");
+        SingletonLaContainerFactory.getContainer().register(DocumentHelper.class, "documentHelper");
+        SingletonLaContainerFactory.getContainer().register(LabelTypeHelper.class, "labelTypeHelper");
+
+        // store a web config under the id "test" and give it and the label helper empty lists via reflection
+        WebConfig webConfig = new WebConfig();
+        setValueToObject(webConfig, "labelTypeList", new ArrayList<LabelType>());
+        ComponentUtil.getCrawlingConfigHelper().store("test", webConfig);
+        setValueToObject(ComponentUtil.getLabelTypeHelper(), "labelTypePatternList", new ArrayList<LabelTypePattern>());
+
+        for (int i = 0; i < 10000; i++) {
+            // log memory usage and request a GC every 1000 iterations
+            if (i % 1000 == 0) {
+                logger.info(MemoryUtil.getMemoryUsageLog() + ":" + i);
+                System.gc();
+            }
+            ResponseData responseData = new ResponseData();
+            responseData.setCharSet("UTF-8");
+            responseData.setContentLength(data.length());
+            responseData.setExecutionTime(1000L);
+            responseData.setHttpStatusCode(200);
+            responseData.setLastModified(new Date());
+            responseData.setMethod("GET");
+            responseData.setMimeType("text/html");
+            responseData.setParentUrl("http://fess.codelibs.org/");
+            // NOTE(review): getBytes() uses the platform default charset although
+            // the charset above is declared as UTF-8; equivalent for this ASCII fixture
+            responseData.setResponseBody(data.getBytes());
+            responseData.setSessionId("test-1");
+            responseData.setStatus(0);
+            responseData.setUrl("http://fess.codelibs.org/test.html");
+            ResultData resultData = fessXpathTransformer.transform(responseData);
+            // System.out.println(resultData.toString());
+        }
+
+        // request a final GC and wait a second before measuring
+        System.gc();
+        Thread.sleep(1000L);
+        logger.info(MemoryUtil.getMemoryUsageLog());
+        assertTrue(MemoryUtil.getUsedMemory() < 100000000L);
+    }
+
+    /**
+     * Assigns {@code value} to the field called {@code name} that is declared
+     * on the runtime class of {@code obj}, making the field accessible first.
+     *
+     * @param obj the object whose field is overwritten
+     * @param name the name of the declared field
+     * @param value the value to store in the field
+     */
+    private void setValueToObject(Object obj, String name, Object value) {
+        final Class<?> ownerType = obj.getClass();
+        final Field target = ClassUtil.getDeclaredField(ownerType, name);
+        target.setAccessible(true);
+        FieldUtil.set(target, obj, value);
     }
 
     public void test_pruneNode() throws Exception {
@@ -311,6 +373,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
     }
 
     public void test_isValidPath_valid() {
+        final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
+        fessXpathTransformer.init();
+        fessXpathTransformer.convertUrlMap.put("feed:", "http:");
+
         String value;
 
         value = "foo.html";
@@ -331,6 +397,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
     }
 
     public void test_isValidPath_invalid() {
+        final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
+        fessXpathTransformer.init();
+        fessXpathTransformer.convertUrlMap.put("feed:", "http:");
+
         String value;
 
         value = "javascript:...";
@@ -365,6 +435,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
     }
 
     public void test_convertChildUrlList() {
+        final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
+        fessXpathTransformer.init();
+        fessXpathTransformer.convertUrlMap.put("feed:", "http:");
+
         List<RequestData> urlList = new ArrayList<>();
 
         urlList = fessXpathTransformer.convertChildUrlList(urlList);
@@ -395,6 +469,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
     }
 
     public void test_removeCommentTag() {
+        final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
+        fessXpathTransformer.init();
+        fessXpathTransformer.convertUrlMap.put("feed:", "http:");
+
         assertEquals("", fessXpathTransformer.removeCommentTag(""));
         assertEquals(" ", fessXpathTransformer.removeCommentTag("<!-- - -->"));
         assertEquals("abc", fessXpathTransformer.removeCommentTag("abc"));
@@ -459,6 +537,20 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         }
     }
 
+    /**
+     * Checks that {@code getSingleNodeValue} returns the text fragments of the
+     * selected node trimmed and separated by single spaces.
+     */
+    public void test_getSingleNodeValue() throws Exception {
+        final FessXpathTransformer transformer = new FessXpathTransformer();
+        final String xpath = "//BODY";
+
+        // text that surrounds a nested element is separated from the element's text
+        final Document adjacentDoc = getDocument("<html><body>aaa<style>bbb</style>ccc</body></html>");
+        assertEquals("aaa bbb ccc", transformer.getSingleNodeValue(adjacentDoc, xpath, false));
+
+        // whitespace around the fragments collapses to one space between them
+        final Document paddedDoc = getDocument("<html><body> aaa <p> bbb <b>ccc</b> </p> </body></html>");
+        assertEquals("aaa bbb ccc", transformer.getSingleNodeValue(paddedDoc, xpath, false));
+    }
+
     public void test_contentXpath() throws Exception {
         final FessXpathTransformer transformer = new FessXpathTransformer();
 

+ 7 - 0
src/test/java/org/codelibs/fess/unit/UnitFessTestCase.java

@@ -15,6 +15,7 @@
  */
 package org.codelibs.fess.unit;
 
+import org.codelibs.fess.util.ComponentUtil;
 import org.dbflute.utflute.lastaflute.WebContainerTestCase;
 
 public abstract class UnitFessTestCase extends WebContainerTestCase {
@@ -22,4 +23,10 @@ public abstract class UnitFessTestCase extends WebContainerTestCase {
     protected String prepareConfigFile() {
         return "test_app.xml";
     }
+
+    /**
+     * Passes {@code null} to {@code ComponentUtil.setFessConfig} and then runs
+     * the superclass teardown.
+     * <p>
+     * NOTE(review): presumably this keeps a config set by one test from being
+     * seen by the next one; confirm against {@code ComponentUtil}.
+     */
+    @Override
+    public void tearDown() throws Exception {
+        ComponentUtil.setFessConfig(null);
+        super.tearDown();
+    }
 }