fix #791 add paseTextContent

This commit is contained in:
Shinsuke Sugaya 2016-11-19 10:14:56 +09:00
parent d2dd3ae427
commit f4bd1be6ff
7 changed files with 177 additions and 11 deletions

View file

@ -237,6 +237,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
String url = responseData.getUrl();
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
url = pathMappingHelper.replaceUrl(sessionId, url);
@ -344,7 +345,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
for (final String labelType : crawlingConfig.getLabelTypeValues()) {
labelTypeSet.add(labelType);
}
final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
// role: roleType
@ -450,8 +450,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
for (int i = 0; i < list.getLength(); i++) {
if (buf == null) {
buf = new UnsafeStringBuilder(1000);
} else {
buf.append(' ');
}
Node node = list.item(i).cloneNode(true);
if (useGoogleOffOn) {
@ -460,7 +458,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
if (pruned) {
node = pruneNode(node);
}
buf.append(node.getTextContent());
paseTextContent(node, buf);
}
} catch (final Exception e) {
logger.warn("Could not parse a value of " + xpath);
@ -471,6 +469,24 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return buf.toUnsafeString().trim();
}
/**
 * Collects the text below {@code node} into {@code buf}.
 *
 * The subtree is walked depth-first. Every node without children contributes
 * its trimmed text content, prefixed by a single space; values that are
 * {@code null} or empty after trimming contribute nothing.
 *
 * Comment and processing-instruction nodes met while descending are skipped.
 * {@code Element.getTextContent()} excludes them by specification, so reading
 * the leaves one by one must exclude them too or markup comments end up in
 * the extracted text. The node passed in by the caller is always read, so a
 * caller that hands in such a node directly still gets its text.
 *
 * @param node the root of the subtree to read
 * @param buf the buffer the extracted text is appended to
 */
protected void paseTextContent(final Node node, final UnsafeStringBuilder buf) {
    if (node.hasChildNodes()) {
        final NodeList nodeList = node.getChildNodes();
        for (int i = 0; i < nodeList.getLength(); i++) {
            final Node childNode = nodeList.item(i);
            final short nodeType = childNode.getNodeType();
            if (nodeType == Node.COMMENT_NODE || nodeType == Node.PROCESSING_INSTRUCTION_NODE) {
                // not part of the visible text of the document
                continue;
            }
            paseTextContent(childNode, buf);
        }
        return;
    }

    final String value = node.getTextContent();
    if (value == null) {
        return;
    }
    final String content = value.trim();
    if (content.length() > 0) {
        // leading separator; the caller trims the final result
        buf.append(' ').append(content);
    }
}
protected Node processGoogleOffOn(final Node node, final ValueHolder<Boolean> flag) {
final NodeList nodeList = node.getChildNodes();
List<Node> removedNodeList = null;

View file

@ -173,7 +173,7 @@ public class LabelTypeHelper {
}
}
protected static class LabelTypePattern {
public static class LabelTypePattern {
private final String value;

View file

@ -35,4 +35,11 @@ public final class MemoryUtil {
/**
 * Formats a byte count with {@code FileUtils.byteCountToDisplaySize} and
 * replaces every space in the result with {@code StringUtil.EMPTY}.
 *
 * @param size the number of bytes
 * @return the formatted size with spaces replaced
 */
public static String byteCountToDisplaySize(final long size) {
    final String displaySize = FileUtils.byteCountToDisplaySize(size);
    return displaySize.replace(" ", StringUtil.EMPTY);
}
/**
 * Returns the difference between the total and the free memory reported by
 * the current {@link Runtime}.
 *
 * @return {@code totalMemory() - freeMemory()} in bytes
 */
public static long getUsedMemory() {
    final Runtime jvm = Runtime.getRuntime();
    // free memory is sampled first, then the total it is subtracted from
    final long free = jvm.freeMemory();
    return jvm.totalMemory() - free;
}
}

View file

@ -55,4 +55,46 @@ public class PrunedTag {
}
return false;
}
/**
 * Renders the tag, id and css values in the form
 * {@code PrunedTag [tag=..., id=..., css=...]}.
 */
@Override
public String toString() {
    final StringBuilder text = new StringBuilder("PrunedTag [tag=");
    text.append(tag);
    text.append(", id=").append(id);
    text.append(", css=").append(css);
    return text.append("]").toString();
}
/**
 * Combines the hash codes of css, id and tag (in that order) with the
 * multiplier 31; a {@code null} value counts as 0.
 */
@Override
public int hashCode() {
    int hash = 1;
    for (final Object part : new Object[] { css, id, tag }) {
        final int partHash = part != null ? part.hashCode() : 0;
        hash = 31 * hash + partHash;
    }
    return hash;
}
/**
 * Two instances are equal when they have exactly the same class and their
 * css, id and tag values are pairwise equal (both {@code null}, or equal
 * according to {@code equals}).
 */
@Override
public boolean equals(Object obj) {
    if (this == obj) {
        return true;
    }
    if (obj == null || getClass() != obj.getClass()) {
        return false;
    }
    final PrunedTag that = (PrunedTag) obj;

    final boolean sameCss = css == null ? that.css == null : css.equals(that.css);
    if (!sameCss) {
        return false;
    }
    final boolean sameId = id == null ? that.id == null : id.equals(that.id);
    if (!sameId) {
        return false;
    }
    return tag == null ? that.tag == null : tag.equals(that.tag);
}
}

View file

@ -30,6 +30,7 @@ jvm.crawler.options=\
-XX:+UseParNewGC\n\
-XX:+UseTLAB\n\
-XX:+DisableExplicitGC\n\
-XX:+HeapDumpOnOutOfMemoryError\n\
-XX:-OmitStackTraceInFastThrow\n\
-Djcifs.smb.client.connTimeout=60000\n\
-Djcifs.smb.client.soTimeout=35000\n\
@ -49,6 +50,7 @@ jvm.suggest.options=\
-XX:+UseParNewGC\n\
-XX:+UseTLAB\n\
-XX:+DisableExplicitGC\n\
-XX:+HeapDumpOnOutOfMemoryError\n\
-Dgroovy.use.classvalue=true\n\

View file

@ -17,7 +17,9 @@ package org.codelibs.fess.crawler.transformer;
import java.io.ByteArrayInputStream;
import java.io.StringWriter;
import java.lang.reflect.Field;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -28,30 +30,90 @@ import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.codelibs.core.lang.ClassUtil;
import org.codelibs.core.lang.FieldUtil;
import org.codelibs.core.misc.ValueHolder;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.es.config.exentity.LabelType;
import org.codelibs.fess.es.config.exentity.WebConfig;
import org.codelibs.fess.helper.CrawlingConfigHelper;
import org.codelibs.fess.helper.CrawlingInfoHelper;
import org.codelibs.fess.helper.DocumentHelper;
import org.codelibs.fess.helper.FileTypeHelper;
import org.codelibs.fess.helper.LabelTypeHelper;
import org.codelibs.fess.helper.LabelTypeHelper.LabelTypePattern;
import org.codelibs.fess.helper.PathMappingHelper;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.unit.UnitFessTestCase;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.MemoryUtil;
import org.cyberneko.html.parsers.DOMParser;
import org.lastaflute.di.core.exception.ComponentNotFoundException;
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.InputSource;
public class FessXpathTransformerTest extends UnitFessTestCase {
public FessXpathTransformer fessXpathTransformer;
private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformerTest.class);
@Override
public void setUp() throws Exception {
super.setUp();
fessXpathTransformer = new FessXpathTransformer();
/**
 * Transforms the same small HTML page 10000 times and then asserts that the
 * used memory reported by MemoryUtil is below 100,000,000 bytes.
 * NOTE(review): this looks like a memory-leak regression check; the threshold
 * depends on the JVM and GC behaviour of the machine running the test.
 */
public void test_transform() throws Exception {
String data = "<html><head><title>Test</title></head><body><h1>Header1</h1><p>This is a pen.</p></body></html>";
final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
fessXpathTransformer.init();
fessXpathTransformer.convertUrlMap.put("feed:", "http:");
// register the helper components under the names used below and, presumably, by transform() (see ComponentUtil lookups)
SingletonLaContainerFactory.getContainer().register(CrawlingInfoHelper.class, "crawlingInfoHelper");
SingletonLaContainerFactory.getContainer().register(PathMappingHelper.class, "pathMappingHelper");
SingletonLaContainerFactory.getContainer().register(CrawlingConfigHelper.class, "crawlingConfigHelper");
SingletonLaContainerFactory.getContainer().register(SystemHelper.class, "systemHelper");
SingletonLaContainerFactory.getContainer().register(FileTypeHelper.class, "fileTypeHelper");
SingletonLaContainerFactory.getContainer().register(DocumentHelper.class, "documentHelper");
SingletonLaContainerFactory.getContainer().register(LabelTypeHelper.class, "labelTypeHelper");
// a web config with an empty label type list, stored under the id "test"
WebConfig webConfig = new WebConfig();
setValueToObject(webConfig, "labelTypeList", new ArrayList<LabelType>());
ComponentUtil.getCrawlingConfigHelper().store("test", webConfig);
// the label type helper gets an empty pattern list through reflection
setValueToObject(ComponentUtil.getLabelTypeHelper(), "labelTypePatternList", new ArrayList<LabelTypePattern>());
for (int i = 0; i < 10000; i++) {
// every 1000 iterations: log the memory usage and request a GC
if (i % 1000 == 0) {
logger.info(MemoryUtil.getMemoryUsageLog() + ":" + i);
System.gc();
}
// a fresh response for the same page on every iteration
ResponseData responseData = new ResponseData();
responseData.setCharSet("UTF-8");
responseData.setContentLength(data.length());
responseData.setExecutionTime(1000L);
responseData.setHttpStatusCode(200);
responseData.setLastModified(new Date());
responseData.setMethod("GET");
responseData.setMimeType("text/html");
responseData.setParentUrl("http://fess.codelibs.org/");
responseData.setResponseBody(data.getBytes());
responseData.setSessionId("test-1");
responseData.setStatus(0);
responseData.setUrl("http://fess.codelibs.org/test.html");
// the result itself is not inspected; only the memory used afterwards is checked
ResultData resultData = fessXpathTransformer.transform(responseData);
// System.out.println(resultData.toString());
}
// request a GC and wait a second before measuring
System.gc();
Thread.sleep(1000L);
logger.info(MemoryUtil.getMemoryUsageLog());
assertTrue(MemoryUtil.getUsedMemory() < 100000000L);
}
/**
 * Sets the field called {@code name}, declared on the runtime class of
 * {@code obj}, to {@code value} after making the field accessible.
 *
 * @param obj the object whose field is written
 * @param name the name of the declared field
 * @param value the value to store in the field
 */
private void setValueToObject(Object obj, String name, Object value) {
    final Class<?> owner = obj.getClass();
    final Field target = ClassUtil.getDeclaredField(owner, name);
    target.setAccessible(true);
    FieldUtil.set(target, obj, value);
}
public void test_pruneNode() throws Exception {
@ -311,6 +373,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
}
public void test_isValidPath_valid() {
final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
fessXpathTransformer.init();
fessXpathTransformer.convertUrlMap.put("feed:", "http:");
String value;
value = "foo.html";
@ -331,6 +397,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
}
public void test_isValidPath_invalid() {
final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
fessXpathTransformer.init();
fessXpathTransformer.convertUrlMap.put("feed:", "http:");
String value;
value = "javascript:...";
@ -365,6 +435,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
}
public void test_convertChildUrlList() {
final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
fessXpathTransformer.init();
fessXpathTransformer.convertUrlMap.put("feed:", "http:");
List<RequestData> urlList = new ArrayList<>();
urlList = fessXpathTransformer.convertChildUrlList(urlList);
@ -395,6 +469,10 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
}
public void test_removeCommentTag() {
final FessXpathTransformer fessXpathTransformer = new FessXpathTransformer();
fessXpathTransformer.init();
fessXpathTransformer.convertUrlMap.put("feed:", "http:");
assertEquals("", fessXpathTransformer.removeCommentTag(""));
assertEquals(" ", fessXpathTransformer.removeCommentTag("<!-- - -->"));
assertEquals("abc", fessXpathTransformer.removeCommentTag("abc"));
@ -459,6 +537,20 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
}
}
/**
 * Checks that getSingleNodeValue for //BODY returns the text pieces joined by
 * single spaces, both when they are split by an inline element and when the
 * markup contains surrounding whitespace.
 */
public void test_getSingleNodeValue() throws Exception {
    final FessXpathTransformer transformer = new FessXpathTransformer();
    final String expected = "aaa bbb ccc";

    // text interrupted by a child element
    final Document splitDoc = getDocument("<html><body>aaa<style>bbb</style>ccc</body></html>");
    final String splitValue = transformer.getSingleNodeValue(splitDoc, "//BODY", false);
    assertEquals(expected, splitValue);

    // nested elements padded with whitespace
    final Document paddedDoc = getDocument("<html><body> aaa <p> bbb <b>ccc</b> </p> </body></html>");
    final String paddedValue = transformer.getSingleNodeValue(paddedDoc, "//BODY", false);
    assertEquals(expected, paddedValue);
}
public void test_contentXpath() throws Exception {
final FessXpathTransformer transformer = new FessXpathTransformer();

View file

@ -15,6 +15,7 @@
*/
package org.codelibs.fess.unit;
import org.codelibs.fess.util.ComponentUtil;
import org.dbflute.utflute.lastaflute.WebContainerTestCase;
public abstract class UnitFessTestCase extends WebContainerTestCase {
@ -22,4 +23,10 @@ public abstract class UnitFessTestCase extends WebContainerTestCase {
protected String prepareConfigFile() {
// config file name handed to the superclass; how it is resolved is not visible here
return "test_app.xml";
}
/**
 * Sets the FessConfig held by ComponentUtil to null and then runs the
 * superclass tear-down.
 *
 * @throws Exception propagated from the superclass tear-down
 */
@Override
public void tearDown() throws Exception {
// clear the config first, then let the parent class do its own cleanup
ComponentUtil.setFessConfig(null);
super.tearDown();
}
}