|
@@ -30,7 +30,6 @@ import java.util.Set;
|
|
import javax.annotation.PostConstruct;
|
|
import javax.annotation.PostConstruct;
|
|
import javax.xml.transform.TransformerException;
|
|
import javax.xml.transform.TransformerException;
|
|
|
|
|
|
-import org.apache.commons.lang3.StringUtils;
|
|
|
|
import org.apache.xpath.objects.XObject;
|
|
import org.apache.xpath.objects.XObject;
|
|
import org.codelibs.core.io.InputStreamUtil;
|
|
import org.codelibs.core.io.InputStreamUtil;
|
|
import org.codelibs.core.io.SerializeUtil;
|
|
import org.codelibs.core.io.SerializeUtil;
|
|
@@ -51,6 +50,7 @@ import org.codelibs.fess.es.config.exentity.CrawlingConfig;
|
|
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
|
|
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
|
|
import org.codelibs.fess.helper.CrawlingConfigHelper;
|
|
import org.codelibs.fess.helper.CrawlingConfigHelper;
|
|
import org.codelibs.fess.helper.CrawlingInfoHelper;
|
|
import org.codelibs.fess.helper.CrawlingInfoHelper;
|
|
|
|
+import org.codelibs.fess.helper.DocumentHelper;
|
|
import org.codelibs.fess.helper.DuplicateHostHelper;
|
|
import org.codelibs.fess.helper.DuplicateHostHelper;
|
|
import org.codelibs.fess.helper.FileTypeHelper;
|
|
import org.codelibs.fess.helper.FileTypeHelper;
|
|
import org.codelibs.fess.helper.LabelTypeHelper;
|
|
import org.codelibs.fess.helper.LabelTypeHelper;
|
|
@@ -71,7 +71,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|
|
|
|
|
private static final int UTF8_BOM_SIZE = 3;
|
|
private static final int UTF8_BOM_SIZE = 3;
|
|
|
|
|
|
- public boolean prunedCacheContent = true;
|
|
|
|
|
|
+ public boolean prunedContent = true;
|
|
|
|
|
|
public Map<String, String> convertUrlMap = new HashMap<>();
|
|
public Map<String, String> convertUrlMap = new HashMap<>();
|
|
|
|
|
|
@@ -177,6 +177,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
|
|
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
|
|
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
|
|
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
|
|
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
|
|
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
|
|
|
|
+ final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
|
|
String url = responseData.getUrl();
|
|
String url = responseData.getUrl();
|
|
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
|
|
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
|
|
url = pathMappingHelper.replaceUrl(sessionId, url);
|
|
url = pathMappingHelper.replaceUrl(sessionId, url);
|
|
@@ -208,7 +209,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|
}
|
|
}
|
|
// title
|
|
// title
|
|
// content
|
|
// content
|
|
- putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), getDocumentContent(responseData, document));
|
|
|
|
|
|
+ final String body = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(), prunedContent);
|
|
|
|
+ putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, dataMap));
|
|
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
|
|
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
|
|
.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
|
|
.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
|
|
if (responseData.getContentLength() > 0
|
|
if (responseData.getContentLength() > 0
|
|
@@ -230,7 +232,13 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// digest
|
|
// digest
|
|
- putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), getDocumentDigest(responseData, document));
|
|
|
|
|
|
+ final String digest = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlDigestXpath(), false);
|
|
|
|
+ if (StringUtil.isNotBlank(digest)) {
|
|
|
|
+ putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
|
|
|
|
+ } else {
|
|
|
|
+ putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
|
|
|
|
+ documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
|
|
|
|
+ }
|
|
// segment
|
|
// segment
|
|
putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
|
|
putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
|
|
// host
|
|
// host
|
|
@@ -314,19 +322,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|
return null;
|
|
return null;
|
|
}
|
|
}
|
|
|
|
|
|
- protected String getDocumentDigest(final ResponseData responseData, final Document document) {
|
|
|
|
- final String digest = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlDigestXpath(), false);
|
|
|
|
- if (StringUtil.isNotBlank(digest)) {
|
|
|
|
- return digest;
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- final String body =
|
|
|
|
- normalizeContent(removeCommentTag(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(),
|
|
|
|
- prunedCacheContent)));
|
|
|
|
- return StringUtils.abbreviate(body, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger());
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- String removeCommentTag(final String content) {
|
|
|
|
|
|
+ protected String removeCommentTag(final String content) {
|
|
if (content == null) {
|
|
if (content == null) {
|
|
return StringUtil.EMPTY;
|
|
return StringUtil.EMPTY;
|
|
}
|
|
}
|
|
@@ -348,10 +344,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|
return value;
|
|
return value;
|
|
}
|
|
}
|
|
|
|
|
|
- private String getDocumentContent(final ResponseData responseData, final Document document) {
|
|
|
|
- return normalizeContent(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(), true));
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
protected String getSingleNodeValue(final Document document, final String xpath, final boolean pruned) {
|
|
protected String getSingleNodeValue(final Document document, final String xpath, final boolean pruned) {
|
|
StringBuilder buf = null;
|
|
StringBuilder buf = null;
|
|
NodeList list = null;
|
|
NodeList list = null;
|