fix #429 : add DocumentHelper

This commit is contained in:
Shinsuke Sugaya 2016-03-14 18:52:30 +09:00
parent 03c7d84261
commit 90e20a0d2c
11 changed files with 155 additions and 90 deletions

View file

@ -218,8 +218,6 @@ public class Constants extends CoreLibConstants {
public static final String INDEXING_TARGET = "indexingTarget";
public static final String DIGEST_PREFIX = "...";
public static final String BASIC = "BASIC";
public static final String DIGEST = "DIGEST";

View file

@ -16,7 +16,6 @@
package org.codelibs.fess.crawler.transformer;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collections;
@ -51,6 +50,7 @@ import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.helper.CrawlingConfigHelper;
import org.codelibs.fess.helper.CrawlingInfoHelper;
import org.codelibs.fess.helper.DocumentHelper;
import org.codelibs.fess.helper.FileTypeHelper;
import org.codelibs.fess.helper.LabelTypeHelper;
import org.codelibs.fess.helper.PathMappingHelper;
@ -174,6 +174,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
String url = responseData.getUrl();
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
url = pathMappingHelper.replaceUrl(sessionId, url);
@ -210,12 +211,8 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
}
buf.append(contentMeta);
}
final String body = normalizeContent(buf.toString());
if (StringUtil.isNotBlank(body)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), StringUtil.EMPTY);
}
final String body = documentHelper.getContent(responseData, buf.toString(), dataMap);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0
@ -228,15 +225,17 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
}
}
// digest
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
Constants.DIGEST_PREFIX
+ abbreviate(normalizeContent(content), fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, buf.toString(), dataMap,
fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
// title
if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
if (url.endsWith("/")) {
if (StringUtil.isNotBlank(content)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(),
abbreviate(body, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
putResultDataBody(
dataMap,
fessConfig.getIndexFieldTitle(),
documentHelper.getDigest(responseData, body, dataMap,
fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
}
@ -332,18 +331,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
return dataMap;
}
/**
 * Abbreviates {@code str} to {@code maxWidth} characters and, when the UTF-8 encoding of that
 * result is longer than {@code maxWidth} plus the configured abbreviation margin, abbreviates
 * again to half of {@code maxWidth}.
 *
 * @param str text to abbreviate; may be {@code null}
 * @param maxWidth maximum number of characters passed to {@link StringUtils#abbreviate(String, int)}
 * @return the abbreviated text, or {@code null} when {@code str} is {@code null}
 */
protected String abbreviate(final String str, final int maxWidth) {
    if (str == null) {
        // StringUtils.abbreviate returns null for null input; return it directly rather than
        // dereferencing it for the byte-length check below.
        return null;
    }
    final String abbreviated = StringUtils.abbreviate(str, maxWidth);
    try {
        final int encodedLength = abbreviated.getBytes(Constants.UTF_8).length;
        final int byteLimit = maxWidth + fessConfig.getCrawlerDocumentFileAbbreviationMarginLengthAsInteger();
        if (encodedLength > byteLimit) {
            // Multi-byte text: the character-based cut is still too large once encoded.
            return StringUtils.abbreviate(str, maxWidth / 2);
        }
    } catch (final UnsupportedEncodingException ignored) {
        // Deliberate best effort: if the byte length cannot be measured, keep the
        // character-based abbreviation.
    }
    return abbreviated;
}
private String getResourceName(final ResponseData responseData) {
String name = responseData.getUrl();
final String enc = responseData.getCharSet();

View file

@ -100,13 +100,6 @@ public interface FessTransformer {
return StringUtils.abbreviate(url, getMaxSiteLength());
}
/**
 * Replaces every run of whitespace in {@code content} with a single space.
 *
 * @param content text to normalize; may be {@code null}
 * @return the normalized text, or {@link StringUtil#EMPTY} when {@code content} is {@code null}
 */
public default String normalizeContent(final String content) {
    return content == null ? StringUtil.EMPTY : content.replaceAll("\\s+", " ");
}
public default void putResultDataBody(final Map<String, Object> dataMap, final String key, final Object value) {
final FessConfig fessConfig = ComponentUtil.getFessConfig();
if (fessConfig.getIndexFieldUrl().equals(key)) {

View file

@ -30,7 +30,6 @@ import java.util.Set;
import javax.annotation.PostConstruct;
import javax.xml.transform.TransformerException;
import org.apache.commons.lang3.StringUtils;
import org.apache.xpath.objects.XObject;
import org.codelibs.core.io.InputStreamUtil;
import org.codelibs.core.io.SerializeUtil;
@ -51,6 +50,7 @@ import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.helper.CrawlingConfigHelper;
import org.codelibs.fess.helper.CrawlingInfoHelper;
import org.codelibs.fess.helper.DocumentHelper;
import org.codelibs.fess.helper.DuplicateHostHelper;
import org.codelibs.fess.helper.FileTypeHelper;
import org.codelibs.fess.helper.LabelTypeHelper;
@ -71,7 +71,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
private static final int UTF8_BOM_SIZE = 3;
public boolean prunedCacheContent = true;
public boolean prunedContent = true;
public Map<String, String> convertUrlMap = new HashMap<>();
@ -177,6 +177,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
String url = responseData.getUrl();
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
url = pathMappingHelper.replaceUrl(sessionId, url);
@ -208,7 +209,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
}
// title
// content
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), getDocumentContent(responseData, document));
final String body = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(), prunedContent);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, dataMap));
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0
@ -230,7 +232,13 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
}
}
// digest
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), getDocumentDigest(responseData, document));
final String digest = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlDigestXpath(), false);
if (StringUtil.isNotBlank(digest)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
}
// segment
putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
// host
@ -314,19 +322,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return null;
}
/**
 * Returns the digest for an HTML document: the value selected by the configured digest XPath
 * when it is not blank, otherwise the abbreviated, normalized text selected by the content XPath.
 *
 * @param responseData crawl response (not read by this method)
 * @param document parsed document the XPath expressions are evaluated against
 * @return the digest text
 */
protected String getDocumentDigest(final ResponseData responseData, final Document document) {
    final String xpathDigest = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlDigestXpath(), false);
    if (!StringUtil.isNotBlank(xpathDigest)) {
        // No usable digest node: derive the digest from the document content instead.
        final String rawContent = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(), prunedCacheContent);
        final String normalized = normalizeContent(removeCommentTag(rawContent));
        final int maxLength = fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger();
        return StringUtils.abbreviate(normalized, maxLength);
    }
    return xpathDigest;
}
String removeCommentTag(final String content) {
protected String removeCommentTag(final String content) {
if (content == null) {
return StringUtil.EMPTY;
}
@ -348,10 +344,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return value;
}
/**
 * Returns the normalized text selected by the configured content XPath, with pruning enabled.
 *
 * @param responseData crawl response (not read by this method)
 * @param document parsed document the XPath expression is evaluated against
 * @return the normalized content text
 */
private String getDocumentContent(final ResponseData responseData, final Document document) {
    final String contentXpath = fessConfig.getCrawlerDocumentHtmlContentXpath();
    final String rawContent = getSingleNodeValue(document, contentXpath, true);
    return normalizeContent(rawContent);
}
protected String getSingleNodeValue(final Document document, final String xpath, final boolean pruned) {
StringBuilder buf = null;
NodeList list = null;

View file

@ -0,0 +1,55 @@
/*
* Copyright 2012-2016 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.helper;
import java.io.Serializable;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Produces the normalized content and digest strings for a crawled document.
 *
 * <p>Normalization treats the characters U+0000 to U+0020, U+007F and U+3000 (ideographic space)
 * as blanks: every run of blanks becomes one ASCII space and leading/trailing blanks are dropped.
 */
public class DocumentHelper implements Serializable {

    private static final long serialVersionUID = 1L;

    private static final Logger logger = LoggerFactory.getLogger(DocumentHelper.class);

    /**
     * Returns the normalized form of {@code content}.
     *
     * @param responseData crawl response (not read by this implementation)
     * @param content raw text; may be {@code null}
     * @param dataMap document fields collected so far (not read by this implementation)
     * @return the normalized text, or {@link StringUtil#EMPTY} when {@code content} is {@code null}
     */
    public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
        if (content == null) {
            return StringUtil.EMPTY; // empty
        }
        // The normalized text can never be longer than the input, so this limit never truncates.
        return normalize(content, content.length());
    }

    /**
     * Returns a digest of {@code content}: its normalized text abbreviated to {@code maxWidth}
     * characters by {@link StringUtils#abbreviate(String, int)}.
     *
     * <p>Only as much of the content as is needed to fill the digest is normalized. Blanks are
     * collapsed before the length is measured, so a long run of leading or interior blanks does
     * not shorten or empty the digest.
     *
     * @param responseData crawl response (not read by this implementation)
     * @param content raw text; may be {@code null}
     * @param dataMap document fields collected so far (not read by this implementation)
     * @param maxWidth maximum digest length in characters
     * @return the digest, or {@link StringUtil#EMPTY} when {@code content} is {@code null}
     * @throws IllegalArgumentException presumably when {@code maxWidth} is below the minimum width
     *         accepted by {@code StringUtils.abbreviate} (4 per Commons Lang docs; verify for the
     *         version in use)
     */
    public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {
        if (content == null) {
            return StringUtil.EMPTY; // empty
        }
        // Collect just over maxWidth normalized characters: enough for abbreviate() to know
        // whether an ellipsis is required, without doubling maxWidth (which could overflow).
        return StringUtils.abbreviate(normalize(content, maxWidth), maxWidth);
    }

    /**
     * Collapses blank runs in {@code text} to single spaces and trims both ends, stopping once
     * the result holds more than {@code limit} characters.
     */
    private static String normalize(final String text, final int limit) {
        final StringBuilder buf = new StringBuilder();
        boolean pendingSpace = false;
        for (int i = 0; i < text.length() && buf.length() <= limit; i++) {
            final char c = text.charAt(i);
            if (isBlank(c)) {
                // Remember the separator but emit it only if more text follows; this drops
                // leading and trailing blanks.
                pendingSpace = buf.length() > 0;
            } else {
                if (pendingSpace) {
                    buf.append(' ');
                    pendingSpace = false;
                }
                buf.append(c);
            }
        }
        return buf.toString();
    }

    /** Tells whether {@code c} is one of the characters normalization treats as a blank. */
    private static boolean isBlank(final char c) {
        return c <= 0x20 || c == 0x7f || c == 0x3000;
    }
}

View file

@ -168,9 +168,6 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. No title. */
String CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL = "crawler.document.file.no.title.label";
/** The key of the configuration. e.g. 10 */
String CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH = "crawler.document.file.abbreviation.margin.length";
/** The key of the configuration. e.g. false */
String CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT = "crawler.document.file.ignore.empty.content";
@ -1150,21 +1147,6 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
*/
String getCrawlerDocumentFileNoTitleLabel();
/**
* Get the value for the key 'crawler.document.file.abbreviation.margin.length'. <br>
* The value is, e.g. 10 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentFileAbbreviationMarginLength();
/**
* Get the value for the key 'crawler.document.file.abbreviation.margin.length' as {@link Integer}. <br>
* The value is, e.g. 10 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
Integer getCrawlerDocumentFileAbbreviationMarginLengthAsInteger();
/**
* Get the value for the key 'crawler.document.file.ignore.empty.content'. <br>
* The value is, e.g. false <br>
@ -3325,14 +3307,6 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return get(FessConfig.CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL);
}
/** Returns the raw value configured for 'crawler.document.file.abbreviation.margin.length'. */
public String getCrawlerDocumentFileAbbreviationMarginLength() {
    final String marginLength = get(FessConfig.CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH);
    return marginLength;
}
/** Returns the value configured for 'crawler.document.file.abbreviation.margin.length' as an {@link Integer}. */
public Integer getCrawlerDocumentFileAbbreviationMarginLengthAsInteger() {
    final Integer marginLength = getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH);
    return marginLength;
}
/** Returns the raw value configured for 'crawler.document.file.ignore.empty.content'. */
public String getCrawlerDocumentFileIgnoreEmptyContent() {
    final String ignoreEmptyContent = get(FessConfig.CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT);
    return ignoreEmptyContent;
}

View file

@ -27,6 +27,7 @@ import org.codelibs.fess.es.client.FessEsClient;
import org.codelibs.fess.helper.ActivityHelper;
import org.codelibs.fess.helper.CrawlingConfigHelper;
import org.codelibs.fess.helper.CrawlingInfoHelper;
import org.codelibs.fess.helper.DocumentHelper;
import org.codelibs.fess.helper.DuplicateHostHelper;
import org.codelibs.fess.helper.FileTypeHelper;
import org.codelibs.fess.helper.IndexingHelper;
@ -56,6 +57,8 @@ import org.lastaflute.job.JobManager;
import org.lastaflute.web.servlet.request.RequestManager;
public final class ComponentUtil {
private static final String DOCUMENT_HELPER = "documentHelper";
private static final String ACTIVITY_HELPER = "activityHelper";
private static final String LDAP_MANAGER = "ldapManager";
@ -291,6 +294,10 @@ public final class ComponentUtil {
return SingletonLaContainer.getComponent(JobManager.class);
}
/**
 * Looks up the component registered under the {@code DOCUMENT_HELPER} name.
 *
 * @return the document helper component
 */
public static DocumentHelper getDocumentHelper() {
    final DocumentHelper helper = SingletonLaContainer.getComponent(DOCUMENT_HELPER);
    return helper;
}
/**
 * Looks up a component by its type.
 *
 * @param clazz type of the component to look up
 * @param <T> component type
 * @return the component returned by the container for {@code clazz}
 */
public static <T> T getComponent(final Class<T> clazz) {
    final T component = SingletonLaContainer.getComponent(clazz);
    return component;
}

View file

@ -16,6 +16,8 @@
</component>
<component name="crawlingConfigHelper" class="org.codelibs.fess.helper.CrawlingConfigHelper">
</component>
<component name="documentHelper" class="org.codelibs.fess.helper.DocumentHelper">
</component>
<component name="pathMappingHelper" class="org.codelibs.fess.helper.PathMappingHelper">
</component>
<component name="processHelper" class="org.codelibs.fess.helper.ProcessHelper">

View file

@ -96,7 +96,6 @@ crawler.document.html.max.digest.length=200
# file
crawler.document.file.name.encoding=
crawler.document.file.no.title.label=No title.
crawler.document.file.abbreviation.margin.length=10
crawler.document.file.ignore.empty.content=false
crawler.document.file.max.title.length=100
crawler.document.file.max.digest.length=200

View file

@ -218,15 +218,6 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
}
/** Checks that normalizeContent collapses each whitespace run into one space. */
public void test_normalizeContent() {
    // Each row is { expected, input }; rows are asserted in order.
    final String[][] cases = { //
            { "", "" }, //
            { " ", " " }, //
            { " ", " " }, //
            { " ", "\t" }, //
            { " ", "\t\t" }, //
            { " ", "\t \t" } };
    for (final String[] row : cases) {
        assertEquals(row[0], fessXpathTransformer.normalizeContent(row[1]));
    }
}
public void test_removeCommentTag() {
assertEquals("", fessXpathTransformer.removeCommentTag(""));
assertEquals(" ", fessXpathTransformer.removeCommentTag("<!-- - -->"));

View file

@ -0,0 +1,67 @@
/*
* Copyright 2012-2016 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.helper;
import java.util.HashMap;
import java.util.Map;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.unit.UnitFessTestCase;
/**
 * Unit tests for {@link DocumentHelper}: whitespace normalization in {@code getContent} and
 * abbreviation in {@code getDigest}.
 */
public class DocumentHelperTest extends UnitFessTestCase {

    private DocumentHelper documentHelper;

    @Override
    public void setUp() throws Exception {
        super.setUp();
        documentHelper = new DocumentHelper();
    }

    /** Blank-only input becomes empty; blank runs inside text collapse to one space. */
    public void test_getContent() {
        ResponseData responseData = new ResponseData();
        Map<String, Object> dataMap = new HashMap<>();

        assertEquals("", documentHelper.getContent(responseData, null, dataMap));
        assertEquals("", documentHelper.getContent(responseData, "", dataMap));
        assertEquals("", documentHelper.getContent(responseData, " ", dataMap));
        // ideographic space (U+3000) is treated as a blank as well
        assertEquals("", documentHelper.getContent(responseData, "\u3000", dataMap));
        assertEquals("", documentHelper.getContent(responseData, "\t", dataMap));
        assertEquals("", documentHelper.getContent(responseData, "\t\t", dataMap));
        assertEquals("", documentHelper.getContent(responseData, "\t \t", dataMap));
        assertEquals("123 abc", documentHelper.getContent(responseData, " 123 abc ", dataMap));
        // Leading text must be kept: only the surrounding blanks are removed.
        assertEquals("123 あいう", documentHelper.getContent(responseData, " 123 あいう ", dataMap));
        assertEquals("123 あいう", documentHelper.getContent(responseData, "\u3000123\u3000あいう\u3000", dataMap));
        assertEquals("123 abc", documentHelper.getContent(responseData, " 123\nabc ", dataMap));
    }

    /** Text longer than the width is cut with an ellipsis; shorter text is returned as is. */
    public void test_getDigest() {
        ResponseData responseData = new ResponseData();
        Map<String, Object> dataMap = new HashMap<>();

        assertEquals("1234567...", documentHelper.getDigest(responseData, " 1234567890 1234567890 1234567890 ", dataMap, 10));
        assertEquals("1234567...", documentHelper.getDigest(responseData, "123456789012345678901234567890", dataMap, 10));
        assertEquals("1234567...", documentHelper.getDigest(responseData, "123456789012345678901", dataMap, 10));
        assertEquals("1234567...", documentHelper.getDigest(responseData, "12345678901234567890", dataMap, 10));
        assertEquals("1234567...", documentHelper.getDigest(responseData, "1234567890123456789", dataMap, 10));
        assertEquals("1234567...", documentHelper.getDigest(responseData, "12345678901", dataMap, 10));
        assertEquals("1234567890", documentHelper.getDigest(responseData, "1234567890", dataMap, 10));
        assertEquals("123456789", documentHelper.getDigest(responseData, "123456789", dataMap, 10));
        assertEquals("1234567", documentHelper.getDigest(responseData, "1234567", dataMap, 10));
        assertEquals("1", documentHelper.getDigest(responseData, "1", dataMap, 10));
        assertEquals("", documentHelper.getDigest(responseData, "", dataMap, 10));
        assertEquals("", documentHelper.getDigest(responseData, " ", dataMap, 10));
        assertEquals("", documentHelper.getDigest(responseData, null, dataMap, 10));

        // ideographic spaces and multi-byte text
        assertEquals("1234567...",
                documentHelper.getDigest(responseData, "\u30001234567890\u30001234567890\u30001234567890\u3000", dataMap, 10));
        assertEquals("あいうえおかき...", documentHelper.getDigest(responseData, "あいうえおかきくけこさしすせそ", dataMap, 10));
    }
}