diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
index 705e3ccf8..92eed3c17 100644
--- a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
+++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
@@ -237,6 +237,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
         final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
         final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
+        final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
         String url = responseData.getUrl();
         final String indexingTarget = crawlingConfig.getIndexingTarget(url);
         url = pathMappingHelper.replaceUrl(sessionId, url);
@@ -344,7 +345,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         for (final String labelType : crawlingConfig.getLabelTypeValues()) {
             labelTypeSet.add(labelType);
         }
-        final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
         labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
         putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
         // role: roleType
@@ -450,8 +450,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
             for (int i = 0; i < list.getLength(); i++) {
                 if (buf == null) {
                     buf = new UnsafeStringBuilder(1000);
-                } else {
-                    buf.append(' ');
                 }
                 Node node = list.item(i).cloneNode(true);
                 if (useGoogleOffOn) {
@@ -460,7 +458,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
                 if (pruned) {
                     node = pruneNode(node);
                 }
-                buf.append(node.getTextContent());
+                paseTextContent(node, buf);
             }
         } catch (final Exception e) {
             logger.warn("Could not parse a value of " + xpath);
@@ -471,6 +469,43 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         return buf.toUnsafeString().trim();
     }
 
+    /**
+     * Appends the text held by {@code node} and its descendants to {@code buf}.
+     * <p>
+     * The subtree is walked depth-first. Each leaf whose trimmed text is not
+     * empty is appended with one leading space so that fragments from adjacent
+     * elements stay separated; the caller trims the final string.
+     * <p>
+     * Comment and processing-instruction children are skipped. Read as leaves,
+     * their text content is their own data, which the
+     * {@code node.getTextContent()} call this method replaces left out.
+     *
+     * @param node the node to read text from
+     * @param buf the buffer that receives the text
+     */
+    protected void paseTextContent(Node node, UnsafeStringBuilder buf) {
+        if (node.hasChildNodes()) {
+            final NodeList nodeList = node.getChildNodes();
+            for (int i = 0; i < nodeList.getLength(); i++) {
+                final Node childNode = nodeList.item(i);
+                final short nodeType = childNode.getNodeType();
+                if (nodeType == Node.COMMENT_NODE || nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
+                    continue;
+                }
+                paseTextContent(childNode, buf);
+            }
+            return;
+        }
+        final String value = node.getTextContent();
+        if (value == null) {
+            return;
+        }
+        final String content = value.trim();
+        if (content.length() > 0) {
+            buf.append(' ').append(content);
+        }
+    }
+
     protected Node processGoogleOffOn(final Node node, final ValueHolder flag) {
         final NodeList nodeList = node.getChildNodes();
         List removedNodeList = null;
diff --git a/src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java b/src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java
index cd4b0464c..6cf7912a4 100644
--- a/src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java
+++ b/src/main/java/org/codelibs/fess/helper/LabelTypeHelper.java
@@ -173,7 +173,7 @@ public class LabelTypeHelper {
         }
     }
 
-    protected static class LabelTypePattern {
+    public static class LabelTypePattern {
 
         private final String value;
 
diff --git a/src/main/java/org/codelibs/fess/util/MemoryUtil.java b/src/main/java/org/codelibs/fess/util/MemoryUtil.java
index 7883b289a..497725442 100644
--- a/src/main/java/org/codelibs/fess/util/MemoryUtil.java
+++ b/src/main/java/org/codelibs/fess/util/MemoryUtil.java
@@ -35,4 +35,16 @@ public final class MemoryUtil {
     public static String byteCountToDisplaySize(final long size) {
         return FileUtils.byteCountToDisplaySize(size).replace(" ", StringUtil.EMPTY);
     }
+
+    /**
+     * Returns the amount of JVM memory that is currently allocated and not free.
+     *
+     * @return {@code Runtime.totalMemory()} minus {@code Runtime.freeMemory()}, in bytes
+     */
+    public static long getUsedMemory() {
+        final Runtime runtime = Runtime.getRuntime();
+        final long freeBytes = runtime.freeMemory();
+        final long totalBytes = runtime.totalMemory();
+        return totalBytes - freeBytes;
+    }
 }
diff --git a/src/main/java/org/codelibs/fess/util/PrunedTag.java 
b/src/main/java/org/codelibs/fess/util/PrunedTag.java
index 4a3819774..2f7c90322 100644
--- a/src/main/java/org/codelibs/fess/util/PrunedTag.java
+++ b/src/main/java/org/codelibs/fess/util/PrunedTag.java
@@ -55,4 +55,44 @@ public class PrunedTag {
         }
         return false;
     }
+
+    /**
+     * Returns a string listing the tag, id and css fields.
+     */
+    @Override
+    public String toString() {
+        return "PrunedTag [tag=" + tag + ", id=" + id + ", css=" + css + "]";
+    }
+
+    /**
+     * Computes a hash from css, id and tag, in that order, counting a null field as 0.
+     */
+    @Override
+    public int hashCode() {
+        int hash = 1;
+        for (final Object field : new Object[] { css, id, tag }) {
+            hash = 31 * hash + (field == null ? 0 : field.hashCode());
+        }
+        return hash;
+    }
+
+    /**
+     * Compares this tag with another object.
+     *
+     * @param obj the object to compare with
+     * @return true when obj has exactly this class and its css, id and tag are all equal to this one's
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj) {
+            return true;
+        }
+        if (obj == null || getClass() != obj.getClass()) {
+            return false;
+        }
+        final PrunedTag that = (PrunedTag) obj;
+        return (css == null ? that.css == null : css.equals(that.css))
+                && (id == null ? that.id == null : id.equals(that.id))
+                && (tag == null ? that.tag == null : tag.equals(that.tag));
+    }
 }
diff --git a/src/main/resources/fess_config.properties b/src/main/resources/fess_config.properties
index b51753ae1..f8a3727bc 100644
--- a/src/main/resources/fess_config.properties
+++ b/src/main/resources/fess_config.properties
@@ -30,6 +30,7 @@ jvm.crawler.options=\
 -XX:+UseParNewGC\n\
 -XX:+UseTLAB\n\
 -XX:+DisableExplicitGC\n\
+-XX:+HeapDumpOnOutOfMemoryError\n\
 -XX:-OmitStackTraceInFastThrow\n\
 -Djcifs.smb.client.connTimeout=60000\n\
 -Djcifs.smb.client.soTimeout=35000\n\
@@ -49,6 +50,7 @@ jvm.suggest.options=\
 -XX:+UseParNewGC\n\
 -XX:+UseTLAB\n\
 -XX:+DisableExplicitGC\n\
+-XX:+HeapDumpOnOutOfMemoryError\n\
 -Dgroovy.use.classvalue=true\n\
 
 
diff --git a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java 
b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java index 83c64c6ec..c90144bfa 100644 --- a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java +++ b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java @@ -17,7 +17,9 @@ package org.codelibs.fess.crawler.transformer; import java.io.ByteArrayInputStream; import java.io.StringWriter; +import java.lang.reflect.Field; import java.util.ArrayList; +import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -28,30 +30,90 @@ import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; +import org.codelibs.core.lang.ClassUtil; +import org.codelibs.core.lang.FieldUtil; import org.codelibs.core.misc.ValueHolder; import org.codelibs.fess.crawler.builder.RequestDataBuilder; import org.codelibs.fess.crawler.entity.RequestData; import org.codelibs.fess.crawler.entity.ResponseData; import org.codelibs.fess.crawler.entity.ResultData; import org.codelibs.fess.crawler.exception.ChildUrlsException; +import org.codelibs.fess.es.config.exentity.LabelType; +import org.codelibs.fess.es.config.exentity.WebConfig; +import org.codelibs.fess.helper.CrawlingConfigHelper; +import org.codelibs.fess.helper.CrawlingInfoHelper; +import org.codelibs.fess.helper.DocumentHelper; +import org.codelibs.fess.helper.FileTypeHelper; +import org.codelibs.fess.helper.LabelTypeHelper; +import org.codelibs.fess.helper.LabelTypeHelper.LabelTypePattern; +import org.codelibs.fess.helper.PathMappingHelper; +import org.codelibs.fess.helper.SystemHelper; import org.codelibs.fess.mylasta.direction.FessConfig; import org.codelibs.fess.unit.UnitFessTestCase; import org.codelibs.fess.util.ComponentUtil; +import org.codelibs.fess.util.MemoryUtil; import org.cyberneko.html.parsers.DOMParser; import org.lastaflute.di.core.exception.ComponentNotFoundException; +import 
org.lastaflute.di.core.factory.SingletonLaContainerFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.xml.sax.InputSource; public class FessXpathTransformerTest extends UnitFessTestCase { - public FessXpathTransformer fessXpathTransformer; + private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformerTest.class); - @Override - public void setUp() throws Exception { - super.setUp(); - fessXpathTransformer = new FessXpathTransformer(); + public void test_transform() throws Exception { + String data = "Test

Header1

This is a pen.

"; + + final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer(); fessXpathTransformer.init(); - fessXpathTransformer.convertUrlMap.put("feed:", "http:"); + SingletonLaContainerFactory.getContainer().register(CrawlingInfoHelper.class, "crawlingInfoHelper"); + SingletonLaContainerFactory.getContainer().register(PathMappingHelper.class, "pathMappingHelper"); + SingletonLaContainerFactory.getContainer().register(CrawlingConfigHelper.class, "crawlingConfigHelper"); + SingletonLaContainerFactory.getContainer().register(SystemHelper.class, "systemHelper"); + SingletonLaContainerFactory.getContainer().register(FileTypeHelper.class, "fileTypeHelper"); + SingletonLaContainerFactory.getContainer().register(DocumentHelper.class, "documentHelper"); + SingletonLaContainerFactory.getContainer().register(LabelTypeHelper.class, "labelTypeHelper"); + + WebConfig webConfig = new WebConfig(); + setValueToObject(webConfig, "labelTypeList", new ArrayList()); + ComponentUtil.getCrawlingConfigHelper().store("test", webConfig); + setValueToObject(ComponentUtil.getLabelTypeHelper(), "labelTypePatternList", new ArrayList()); + + for (int i = 0; i < 10000; i++) { + if (i % 1000 == 0) { + logger.info(MemoryUtil.getMemoryUsageLog() + ":" + i); + System.gc(); + } + ResponseData responseData = new ResponseData(); + responseData.setCharSet("UTF-8"); + responseData.setContentLength(data.length()); + responseData.setExecutionTime(1000L); + responseData.setHttpStatusCode(200); + responseData.setLastModified(new Date()); + responseData.setMethod("GET"); + responseData.setMimeType("text/html"); + responseData.setParentUrl("http://fess.codelibs.org/"); + responseData.setResponseBody(data.getBytes()); + responseData.setSessionId("test-1"); + responseData.setStatus(0); + responseData.setUrl("http://fess.codelibs.org/test.html"); + ResultData resultData = fessXpathTransformer.transform(responseData); + // System.out.println(resultData.toString()); + } + + System.gc(); + 
Thread.sleep(1000L); + logger.info(MemoryUtil.getMemoryUsageLog()); + assertTrue(MemoryUtil.getUsedMemory() < 100000000L); + } + + private void setValueToObject(Object obj, String name, Object value) { + Field field = ClassUtil.getDeclaredField(obj.getClass(), name); + field.setAccessible(true); + FieldUtil.set(field, obj, value); } public void test_pruneNode() throws Exception { @@ -311,6 +373,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase { } public void test_isValidPath_valid() { + final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer(); + fessXpathTransformer.init(); + fessXpathTransformer.convertUrlMap.put("feed:", "http:"); + String value; value = "foo.html"; @@ -331,6 +397,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase { } public void test_isValidPath_invalid() { + final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer(); + fessXpathTransformer.init(); + fessXpathTransformer.convertUrlMap.put("feed:", "http:"); + String value; value = "javascript:..."; @@ -365,6 +435,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase { } public void test_convertChildUrlList() { + final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer(); + fessXpathTransformer.init(); + fessXpathTransformer.convertUrlMap.put("feed:", "http:"); + List urlList = new ArrayList<>(); urlList = fessXpathTransformer.convertChildUrlList(urlList); @@ -395,6 +469,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase { } public void test_removeCommentTag() { + final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer(); + fessXpathTransformer.init(); + fessXpathTransformer.convertUrlMap.put("feed:", "http:"); + assertEquals("", fessXpathTransformer.removeCommentTag("")); assertEquals(" ", fessXpathTransformer.removeCommentTag("")); assertEquals("abc", fessXpathTransformer.removeCommentTag("abc")); @@ -459,6 +537,20 @@ public class 
FessXpathTransformerTest extends UnitFessTestCase { } } + public void test_getSingleNodeValue() throws Exception { + final FessXpathTransformer transformer = new FessXpathTransformer(); + + String data = "aaaccc"; + Document document = getDocument(data); + String value = transformer.getSingleNodeValue(document, "//BODY", false); + assertEquals("aaa bbb ccc", value); + + data = " aaa

bbb ccc

";
+        document = getDocument(data);
+        value = transformer.getSingleNodeValue(document, "//BODY", false);
+        assertEquals("aaa bbb ccc", value);
+    }
+
     public void test_contentXpath() throws Exception {
         final FessXpathTransformer transformer = new FessXpathTransformer();
 
diff --git a/src/test/java/org/codelibs/fess/unit/UnitFessTestCase.java b/src/test/java/org/codelibs/fess/unit/UnitFessTestCase.java
index 813a650ea..76f6ba039 100644
--- a/src/test/java/org/codelibs/fess/unit/UnitFessTestCase.java
+++ b/src/test/java/org/codelibs/fess/unit/UnitFessTestCase.java
@@ -15,6 +15,7 @@
  */
 package org.codelibs.fess.unit;
 
+import org.codelibs.fess.util.ComponentUtil;
 import org.dbflute.utflute.lastaflute.WebContainerTestCase;
 
 public abstract class UnitFessTestCase extends WebContainerTestCase {
@@ -22,4 +23,17 @@ public abstract class UnitFessTestCase extends WebContainerTestCase {
     protected String prepareConfigFile() {
         return "test_app.xml";
     }
+
+    /**
+     * Resets the FessConfig held by {@code ComponentUtil} to null, then runs the parent
+     * teardown. The parent teardown runs even if the reset throws.
+     */
+    @Override
+    public void tearDown() throws Exception {
+        try {
+            ComponentUtil.setFessConfig(null);
+        } finally {
+            super.tearDown();
+        }
+    }
 }