fix #791 add paseTextContent
This commit is contained in:
parent
d2dd3ae427
commit
f4bd1be6ff
7 changed files with 177 additions and 11 deletions
|
@ -237,6 +237,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
|
||||
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
|
||||
final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
|
||||
final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
|
||||
String url = responseData.getUrl();
|
||||
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
|
||||
url = pathMappingHelper.replaceUrl(sessionId, url);
|
||||
|
@ -344,7 +345,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
for (final String labelType : crawlingConfig.getLabelTypeValues()) {
|
||||
labelTypeSet.add(labelType);
|
||||
}
|
||||
final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
|
||||
labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
|
||||
putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
|
||||
// role: roleType
|
||||
|
@ -450,8 +450,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
for (int i = 0; i < list.getLength(); i++) {
|
||||
if (buf == null) {
|
||||
buf = new UnsafeStringBuilder(1000);
|
||||
} else {
|
||||
buf.append(' ');
|
||||
}
|
||||
Node node = list.item(i).cloneNode(true);
|
||||
if (useGoogleOffOn) {
|
||||
|
@ -460,7 +458,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
if (pruned) {
|
||||
node = pruneNode(node);
|
||||
}
|
||||
buf.append(node.getTextContent());
|
||||
paseTextContent(node, buf);
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
logger.warn("Could not parse a value of " + xpath);
|
||||
|
@ -471,6 +469,24 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
return buf.toUnsafeString().trim();
|
||||
}
|
||||
|
||||
protected void paseTextContent(Node node, UnsafeStringBuilder buf) {
|
||||
if (node.hasChildNodes()) {
|
||||
final NodeList nodeList = node.getChildNodes();
|
||||
for (int i = 0; i < nodeList.getLength(); i++) {
|
||||
final Node childNode = nodeList.item(i);
|
||||
paseTextContent(childNode, buf);
|
||||
}
|
||||
} else {
|
||||
final String value = node.getTextContent();
|
||||
if (value != null) {
|
||||
final String content = value.trim();
|
||||
if (content.length() > 0) {
|
||||
buf.append(' ').append(content);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected Node processGoogleOffOn(final Node node, final ValueHolder<Boolean> flag) {
|
||||
final NodeList nodeList = node.getChildNodes();
|
||||
List<Node> removedNodeList = null;
|
||||
|
|
|
@ -173,7 +173,7 @@ public class LabelTypeHelper {
|
|||
}
|
||||
}
|
||||
|
||||
protected static class LabelTypePattern {
|
||||
public static class LabelTypePattern {
|
||||
|
||||
private final String value;
|
||||
|
||||
|
|
|
@ -35,4 +35,11 @@ public final class MemoryUtil {
|
|||
public static String byteCountToDisplaySize(final long size) {
|
||||
return FileUtils.byteCountToDisplaySize(size).replace(" ", StringUtil.EMPTY);
|
||||
}
|
||||
|
||||
public static long getUsedMemory() {
|
||||
final Runtime runtime = Runtime.getRuntime();
|
||||
final long freeBytes = runtime.freeMemory();
|
||||
final long totalBytes = runtime.totalMemory();
|
||||
return totalBytes - freeBytes;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -55,4 +55,46 @@ public class PrunedTag {
|
|||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "PrunedTag [tag=" + tag + ", id=" + id + ", css=" + css + "]";
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
final int prime = 31;
|
||||
int result = 1;
|
||||
result = prime * result + ((css == null) ? 0 : css.hashCode());
|
||||
result = prime * result + ((id == null) ? 0 : id.hashCode());
|
||||
result = prime * result + ((tag == null) ? 0 : tag.hashCode());
|
||||
return result;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
PrunedTag other = (PrunedTag) obj;
|
||||
if (css == null) {
|
||||
if (other.css != null)
|
||||
return false;
|
||||
} else if (!css.equals(other.css))
|
||||
return false;
|
||||
if (id == null) {
|
||||
if (other.id != null)
|
||||
return false;
|
||||
} else if (!id.equals(other.id))
|
||||
return false;
|
||||
if (tag == null) {
|
||||
if (other.tag != null)
|
||||
return false;
|
||||
} else if (!tag.equals(other.tag))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,6 +30,7 @@ jvm.crawler.options=\
|
|||
-XX:+UseParNewGC\n\
|
||||
-XX:+UseTLAB\n\
|
||||
-XX:+DisableExplicitGC\n\
|
||||
-XX:+HeapDumpOnOutOfMemoryError\n\
|
||||
-XX:-OmitStackTraceInFastThrow\n\
|
||||
-Djcifs.smb.client.connTimeout=60000\n\
|
||||
-Djcifs.smb.client.soTimeout=35000\n\
|
||||
|
@ -49,6 +50,7 @@ jvm.suggest.options=\
|
|||
-XX:+UseParNewGC\n\
|
||||
-XX:+UseTLAB\n\
|
||||
-XX:+DisableExplicitGC\n\
|
||||
-XX:+HeapDumpOnOutOfMemoryError\n\
|
||||
-Dgroovy.use.classvalue=true\n\
|
||||
|
||||
|
||||
|
|
|
@ -17,7 +17,9 @@ package org.codelibs.fess.crawler.transformer;
|
|||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.StringWriter;
|
||||
import java.lang.reflect.Field;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -28,30 +30,90 @@ import javax.xml.transform.TransformerFactory;
|
|||
import javax.xml.transform.dom.DOMSource;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
|
||||
import org.codelibs.core.lang.ClassUtil;
|
||||
import org.codelibs.core.lang.FieldUtil;
|
||||
import org.codelibs.core.misc.ValueHolder;
|
||||
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
|
||||
import org.codelibs.fess.crawler.entity.RequestData;
|
||||
import org.codelibs.fess.crawler.entity.ResponseData;
|
||||
import org.codelibs.fess.crawler.entity.ResultData;
|
||||
import org.codelibs.fess.crawler.exception.ChildUrlsException;
|
||||
import org.codelibs.fess.es.config.exentity.LabelType;
|
||||
import org.codelibs.fess.es.config.exentity.WebConfig;
|
||||
import org.codelibs.fess.helper.CrawlingConfigHelper;
|
||||
import org.codelibs.fess.helper.CrawlingInfoHelper;
|
||||
import org.codelibs.fess.helper.DocumentHelper;
|
||||
import org.codelibs.fess.helper.FileTypeHelper;
|
||||
import org.codelibs.fess.helper.LabelTypeHelper;
|
||||
import org.codelibs.fess.helper.LabelTypeHelper.LabelTypePattern;
|
||||
import org.codelibs.fess.helper.PathMappingHelper;
|
||||
import org.codelibs.fess.helper.SystemHelper;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.unit.UnitFessTestCase;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.codelibs.fess.util.MemoryUtil;
|
||||
import org.cyberneko.html.parsers.DOMParser;
|
||||
import org.lastaflute.di.core.exception.ComponentNotFoundException;
|
||||
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.Node;
|
||||
import org.xml.sax.InputSource;
|
||||
|
||||
public class FessXpathTransformerTest extends UnitFessTestCase {
|
||||
public FessXpathTransformer fessXpathTransformer;
|
||||
private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformerTest.class);
|
||||
|
||||
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
fessXpathTransformer = new FessXpathTransformer();
|
||||
public void test_transform() throws Exception {
|
||||
String data = "<html><head><title>Test</title></head><body><h1>Header1</h1><p>This is a pen.</p></body></html>";
|
||||
|
||||
final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
|
||||
fessXpathTransformer.init();
|
||||
fessXpathTransformer.convertUrlMap.put("feed:", "http:");
|
||||
SingletonLaContainerFactory.getContainer().register(CrawlingInfoHelper.class, "crawlingInfoHelper");
|
||||
SingletonLaContainerFactory.getContainer().register(PathMappingHelper.class, "pathMappingHelper");
|
||||
SingletonLaContainerFactory.getContainer().register(CrawlingConfigHelper.class, "crawlingConfigHelper");
|
||||
SingletonLaContainerFactory.getContainer().register(SystemHelper.class, "systemHelper");
|
||||
SingletonLaContainerFactory.getContainer().register(FileTypeHelper.class, "fileTypeHelper");
|
||||
SingletonLaContainerFactory.getContainer().register(DocumentHelper.class, "documentHelper");
|
||||
SingletonLaContainerFactory.getContainer().register(LabelTypeHelper.class, "labelTypeHelper");
|
||||
|
||||
WebConfig webConfig = new WebConfig();
|
||||
setValueToObject(webConfig, "labelTypeList", new ArrayList<LabelType>());
|
||||
ComponentUtil.getCrawlingConfigHelper().store("test", webConfig);
|
||||
setValueToObject(ComponentUtil.getLabelTypeHelper(), "labelTypePatternList", new ArrayList<LabelTypePattern>());
|
||||
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
if (i % 1000 == 0) {
|
||||
logger.info(MemoryUtil.getMemoryUsageLog() + ":" + i);
|
||||
System.gc();
|
||||
}
|
||||
ResponseData responseData = new ResponseData();
|
||||
responseData.setCharSet("UTF-8");
|
||||
responseData.setContentLength(data.length());
|
||||
responseData.setExecutionTime(1000L);
|
||||
responseData.setHttpStatusCode(200);
|
||||
responseData.setLastModified(new Date());
|
||||
responseData.setMethod("GET");
|
||||
responseData.setMimeType("text/html");
|
||||
responseData.setParentUrl("http://fess.codelibs.org/");
|
||||
responseData.setResponseBody(data.getBytes());
|
||||
responseData.setSessionId("test-1");
|
||||
responseData.setStatus(0);
|
||||
responseData.setUrl("http://fess.codelibs.org/test.html");
|
||||
ResultData resultData = fessXpathTransformer.transform(responseData);
|
||||
// System.out.println(resultData.toString());
|
||||
}
|
||||
|
||||
System.gc();
|
||||
Thread.sleep(1000L);
|
||||
logger.info(MemoryUtil.getMemoryUsageLog());
|
||||
assertTrue(MemoryUtil.getUsedMemory() < 100000000L);
|
||||
}
|
||||
|
||||
private void setValueToObject(Object obj, String name, Object value) {
|
||||
Field field = ClassUtil.getDeclaredField(obj.getClass(), name);
|
||||
field.setAccessible(true);
|
||||
FieldUtil.set(field, obj, value);
|
||||
}
|
||||
|
||||
public void test_pruneNode() throws Exception {
|
||||
|
@ -311,6 +373,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
}
|
||||
|
||||
public void test_isValidPath_valid() {
|
||||
final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
|
||||
fessXpathTransformer.init();
|
||||
fessXpathTransformer.convertUrlMap.put("feed:", "http:");
|
||||
|
||||
String value;
|
||||
|
||||
value = "foo.html";
|
||||
|
@ -331,6 +397,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
}
|
||||
|
||||
public void test_isValidPath_invalid() {
|
||||
final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
|
||||
fessXpathTransformer.init();
|
||||
fessXpathTransformer.convertUrlMap.put("feed:", "http:");
|
||||
|
||||
String value;
|
||||
|
||||
value = "javascript:...";
|
||||
|
@ -365,6 +435,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
}
|
||||
|
||||
public void test_convertChildUrlList() {
|
||||
final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
|
||||
fessXpathTransformer.init();
|
||||
fessXpathTransformer.convertUrlMap.put("feed:", "http:");
|
||||
|
||||
List<RequestData> urlList = new ArrayList<>();
|
||||
|
||||
urlList = fessXpathTransformer.convertChildUrlList(urlList);
|
||||
|
@ -395,6 +469,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
}
|
||||
|
||||
public void test_removeCommentTag() {
|
||||
final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
|
||||
fessXpathTransformer.init();
|
||||
fessXpathTransformer.convertUrlMap.put("feed:", "http:");
|
||||
|
||||
assertEquals("", fessXpathTransformer.removeCommentTag(""));
|
||||
assertEquals(" ", fessXpathTransformer.removeCommentTag("<!-- - -->"));
|
||||
assertEquals("abc", fessXpathTransformer.removeCommentTag("abc"));
|
||||
|
@ -459,6 +537,20 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
|
|||
}
|
||||
}
|
||||
|
||||
public void test_getSingleNodeValue() throws Exception {
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
|
||||
String data = "<html><body>aaa<style>bbb</style>ccc</body></html>";
|
||||
Document document = getDocument(data);
|
||||
String value = transformer.getSingleNodeValue(document, "//BODY", false);
|
||||
assertEquals("aaa bbb ccc", value);
|
||||
|
||||
data = "<html><body> aaa <p> bbb <b>ccc</b> </p> </body></html>";
|
||||
document = getDocument(data);
|
||||
value = transformer.getSingleNodeValue(document, "//BODY", false);
|
||||
assertEquals("aaa bbb ccc", value);
|
||||
}
|
||||
|
||||
public void test_contentXpath() throws Exception {
|
||||
final FessXpathTransformer transformer = new FessXpathTransformer();
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
*/
|
||||
package org.codelibs.fess.unit;
|
||||
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.dbflute.utflute.lastaflute.WebContainerTestCase;
|
||||
|
||||
public abstract class UnitFessTestCase extends WebContainerTestCase {
|
||||
|
@ -22,4 +23,10 @@ public abstract class UnitFessTestCase extends WebContainerTestCase {
|
|||
protected String prepareConfigFile() {
|
||||
return "test_app.xml";
|
||||
}
|
||||
|
||||
/**
 * Sets the FessConfig on {@code ComponentUtil} to {@code null} and then runs the
 * superclass tear-down.
 *
 * @throws Exception if the superclass tear-down fails
 */
@Override
|
||||
public void tearDown() throws Exception {
|
||||
// NOTE(review): presumably clears the config so it is not carried into the next test - confirm.
ComponentUtil.setFessConfig(null);
|
||||
super.tearDown();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue