improve cache handling, remove unused jsps, update lastaflute

This commit is contained in:
Shinsuke Sugaya 2015-12-27 07:35:04 +09:00
parent 8c1369fd8f
commit 3c1bb626fd
23 changed files with 763 additions and 328 deletions

View file

@ -41,6 +41,7 @@ map:{
; fess = map:{
; path = ..
; freeGenList = list:{ env ; config ; label ; message ; mail ; template ; jsp ; doc }
; configPluginInterface = org.codelibs.fess.mylasta.direction.FessProp
; propertiesHtmlList = list:{ env ; config ; label ; message }
}
}

View file

@ -45,6 +45,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
@ -57,36 +58,20 @@ import org.codelibs.fess.helper.SambaHelper;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import jcifs.smb.ACE;
import jcifs.smb.SID;
public abstract class AbstractFessFileTransformer extends AbstractFessXpathTransformer {
private static final Logger logger = LoggerFactory // NOPMD
.getLogger(AbstractFessFileTransformer.class);
public abstract class AbstractFessFileTransformer extends AbstractTransformer implements FessTransformer {
public String encoding = null;
public String noTitleLabel = "No title.";
public int abbreviationMarginLength = 10;
public boolean ignoreEmptyContent = false;
public int maxTitleLength = 100;
public int maxDigestLength = 200;
public boolean appendMetaContentToContent = true;
public boolean appendBodyContentToContent = true;
protected String charsetName = Constants.UTF_8;
public Map<String, String> parentEncodingMap = Collections.synchronizedMap(new LruHashMap<String, String>(1000));
protected Map<String, String> metaContentMapping;
protected FessConfig fessConfig;
protected abstract Extractor getExtractor(ResponseData responseData);
@Override
@ -109,11 +94,11 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
try {
final ExtractData extractData = extractor.getText(in, params);
content = extractData.getContent();
if (ignoreEmptyContent && StringUtil.isBlank(content)) {
if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
return null;
}
if (logger.isDebugEnabled()) {
logger.debug("ExtractData: " + extractData);
if (getLogger().isDebugEnabled()) {
getLogger().debug("ExtractData: " + extractData);
}
// meta
for (final String key : extractData.getKeySet()) {
@ -191,10 +176,10 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
// content
final StringBuilder buf = new StringBuilder(content.length() + 1000);
if (appendBodyContentToContent) {
if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) {
buf.append(content);
}
if (appendMetaContentToContent) {
if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) {
if (buf.length() > 0) {
buf.append(' ');
}
@ -206,23 +191,29 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), StringUtil.EMPTY);
}
if (Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
|| fessConfig.isCrawlerDocumentCacheEnable()) {
final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
// text cache
putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
.isCrawlerDocumentCacheEnable()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0
&& responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
// text cache
putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
}
}
// digest
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
Constants.DIGEST_PREFIX + abbreviate(normalizeContent(content), maxDigestLength));
Constants.DIGEST_PREFIX
+ abbreviate(normalizeContent(content), fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
// title
if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
if (url.endsWith("/")) {
if (StringUtil.isNotBlank(content)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), abbreviate(body, maxTitleLength));
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(),
abbreviate(body, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), noTitleLabel);
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
}
} else {
final String u = decodeUrlAsName(url, url.startsWith("file:"));
@ -235,9 +226,9 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
}
}
// host
putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url));
// site
putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
// url
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
// created
@ -287,8 +278,8 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
final SID sid = item.getSID();
roleTypeList.add(sambaHelper.getAccountId(sid));
}
if (logger.isDebugEnabled()) {
logger.debug("smbUrl:" + responseData.getUrl() + " roleType:" + roleTypeList.toString());
if (getLogger().isDebugEnabled()) {
getLogger().debug("smbUrl:" + responseData.getUrl() + " roleType:" + roleTypeList.toString());
}
}
}
@ -335,7 +326,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
protected String abbreviate(final String str, final int maxWidth) {
String newStr = StringUtils.abbreviate(str, maxWidth);
try {
if (newStr.getBytes(Constants.UTF_8).length > maxWidth + abbreviationMarginLength) {
if (newStr.getBytes(Constants.UTF_8).length > maxWidth + fessConfig.getCrawlerDocumentFileAbbreviationMarginLengthAsInteger()) {
newStr = StringUtils.abbreviate(str, maxWidth / 2);
}
} catch (final UnsupportedEncodingException e) {
@ -370,7 +361,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
}
String enc = Constants.UTF_8;
if (encoding == null) {
if (StringUtil.isBlank(fessConfig.getCrawlerDocumentFileNameEncoding())) {
final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
if (urlQueue != null) {
final String parentUrl = urlQueue.getParentUrl();
@ -385,7 +376,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
}
}
} else {
enc = encoding;
enc = fessConfig.getCrawlerDocumentFileNameEncoding();
}
final String escapedUrl = escapePlus ? url.replace("+", "%2B") : url;
@ -415,8 +406,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
return null;
}
@Override
protected String getHost(final String url) {
protected String getHostOnFile(final String url) {
if (StringUtil.isBlank(url)) {
return StringUtil.EMPTY; // empty
}
@ -435,30 +425,29 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
return "localhost";
}
return super.getHost(url);
return getHost(url);
}
@Override
protected String getSite(final String url, final String encoding) {
protected String getSiteOnFile(final String url, final String encoding) {
if (StringUtil.isBlank(url)) {
return StringUtil.EMPTY; // empty
}
if (url.startsWith("file:////")) {
final String value = decodeUrlAsName(url.substring(9), true);
return StringUtils.abbreviate("\\\\" + value.replace('/', '\\'), maxSiteLength);
return StringUtils.abbreviate("\\\\" + value.replace('/', '\\'), getMaxSiteLength());
} else if (url.startsWith("file:")) {
final String value = decodeUrlAsName(url.substring(5), true);
if (value.length() > 2 && value.charAt(2) == ':') {
// Windows
return StringUtils.abbreviate(value.substring(1).replace('/', '\\'), maxSiteLength);
return StringUtils.abbreviate(value.substring(1).replace('/', '\\'), getMaxSiteLength());
} else {
// Unix
return StringUtils.abbreviate(value, maxSiteLength);
return StringUtils.abbreviate(value, getMaxSiteLength());
}
}
return super.getSite(url, encoding);
return getSite(url, encoding);
}
@Override
@ -480,4 +469,5 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
}
metaContentMapping.put(metaname, dynamicField);
}
}

View file

@ -15,10 +15,13 @@
*/
package org.codelibs.fess.crawler.transformer;
import javax.annotation.PostConstruct;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.extractor.ExtractorFactory;
import org.codelibs.fess.exception.FessSystemException;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -26,6 +29,21 @@ import org.slf4j.LoggerFactory;
public class FessFileTransformer extends AbstractFessFileTransformer {
private static final Logger logger = LoggerFactory.getLogger(FessFileTransformer.class);
/**
 * Post-construct hook: stores the {@link FessConfig} obtained from
 * {@link ComponentUtil#getFessConfig()} in the {@code fessConfig} field.
 */
@PostConstruct
public void init() {
    this.fessConfig = ComponentUtil.getFessConfig();
}
/**
 * Returns the configuration held in the {@code fessConfig} field.
 *
 * @return the value currently stored in {@code fessConfig}
 */
@Override
public FessConfig getFessConfig() {
    return this.fessConfig;
}
/**
 * Returns the class-level logger of {@code FessFileTransformer}.
 *
 * @return the static logger declared by this class
 */
@Override
public Logger getLogger() {
    return FessFileTransformer.logger;
}
@Override
protected Extractor getExtractor(final ResponseData responseData) {
final ExtractorFactory extractorFactory = ComponentUtil.getExtractorFactory();

View file

@ -15,12 +15,35 @@
*/
package org.codelibs.fess.crawler.transformer;
import javax.annotation.PostConstruct;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.exception.FessSystemException;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.lastaflute.di.core.SingletonLaContainer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class FessTikaTransformer extends AbstractFessFileTransformer {
private static final Logger logger = LoggerFactory.getLogger(FessTikaTransformer.class);
/**
 * Post-construct hook: stores the {@link FessConfig} obtained from
 * {@link ComponentUtil#getFessConfig()} in the {@code fessConfig} field.
 */
@PostConstruct
public void init() {
    this.fessConfig = ComponentUtil.getFessConfig();
}
/**
 * Returns the configuration held in the {@code fessConfig} field.
 *
 * @return the value currently stored in {@code fessConfig}
 */
@Override
public FessConfig getFessConfig() {
    return this.fessConfig;
}
/**
 * Returns the class-level logger of {@code FessTikaTransformer}.
 *
 * @return the static logger declared by this class
 */
@Override
public Logger getLogger() {
    return FessTikaTransformer.logger;
}
@Override
protected Extractor getExtractor(final ResponseData responseData) {
final Extractor extractor = SingletonLaContainer.getComponent("tikaExtractor");

View file

@ -22,29 +22,20 @@ import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import groovy.lang.Binding;
import groovy.lang.GroovyShell;
public abstract class AbstractFessXpathTransformer extends XpathTransformer {
private static final Logger logger = LoggerFactory.getLogger(AbstractFessXpathTransformer.class);
public interface FessTransformer {
public int maxSiteLength = 50;
FessConfig getFessConfig();
public String unknownHostname = "unknown";
Logger getLogger();
public String siteEncoding;
public boolean replaceSiteEncodingWhenEnglish = false;
public boolean appendResultData = true;
protected String getHost(final String u) {
public default String getHost(final String u) {
if (StringUtil.isBlank(u)) {
return StringUtil.EMPTY; // empty
}
@ -63,13 +54,13 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer {
}
if (url.equals(originalUrl)) {
return unknownHostname;
return getFessConfig().getCrawlerDocumentUnknownHostname();
}
return url;
}
protected String getSite(final String u, final String encoding) {
public default String getSite(final String u, final String encoding) {
if (StringUtil.isBlank(u)) {
return StringUtil.EMPTY; // empty
}
@ -87,15 +78,15 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer {
if (encoding != null) {
String enc;
if (siteEncoding != null) {
if (replaceSiteEncodingWhenEnglish) {
if (StringUtil.isNotBlank(getFessConfig().getCrawlerDocumentSiteEncoding())) {
if (getFessConfig().isCrawlerDocumentUseSiteEncodingOnEnglish()) {
if ("ISO-8859-1".equalsIgnoreCase(encoding) || "US-ASCII".equalsIgnoreCase(encoding)) {
enc = siteEncoding;
enc = getFessConfig().getCrawlerDocumentSiteEncoding();
} else {
enc = encoding;
}
} else {
enc = siteEncoding;
enc = getFessConfig().getCrawlerDocumentSiteEncoding();
}
} else {
enc = encoding;
@ -106,39 +97,35 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer {
} catch (final Exception e) {}
}
return StringUtils.abbreviate(url, maxSiteLength);
return StringUtils.abbreviate(url, getMaxSiteLength());
}
protected String normalizeContent(final String content) {
public default String normalizeContent(final String content) {
if (content == null) {
return StringUtil.EMPTY; // empty
}
return content.replaceAll("\\s+", " ");
}
protected void putResultDataBody(final Map<String, Object> dataMap, final String key, final Object value) {
public default void putResultDataBody(final Map<String, Object> dataMap, final String key, final Object value) {
final FessConfig fessConfig = ComponentUtil.getFessConfig();
if (fessConfig.getIndexFieldUrl().equals(key)) {
dataMap.put(key, value);
} else if (dataMap.containsKey(key)) {
if (appendResultData) {
if (getFessConfig().isCrawlerDocumentAppendData()) {
final Object oldValue = dataMap.get(key);
if (key.endsWith("_m")) {
final Object[] oldValues = (Object[]) oldValue;
if (value.getClass().isArray()) {
final Object[] newValues = (Object[]) value;
final Object[] values = Arrays.copyOf(oldValues, oldValues.length + newValues.length);
for (int i = 0; i < newValues.length; i++) {
values[values.length - 1 + i] = newValues[i];
}
dataMap.put(key, values);
} else {
final Object[] values = Arrays.copyOf(oldValues, oldValues.length + 1);
values[values.length - 1] = value;
dataMap.put(key, values);
final Object[] oldValues = (Object[]) oldValue;
if (value.getClass().isArray()) {
final Object[] newValues = (Object[]) value;
final Object[] values = Arrays.copyOf(oldValues, oldValues.length + newValues.length);
for (int i = 0; i < newValues.length; i++) {
values[values.length - 1 + i] = newValues[i];
}
dataMap.put(key, values);
} else {
dataMap.put(key, oldValue + " " + value);
final Object[] values = Arrays.copyOf(oldValues, oldValues.length + 1);
values[values.length - 1] = value;
dataMap.put(key, values);
}
} else {
dataMap.put(key, value);
@ -148,7 +135,8 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer {
}
}
protected void putResultDataWithTemplate(final Map<String, Object> dataMap, final String key, final Object value, final String template) {
public default void putResultDataWithTemplate(final Map<String, Object> dataMap, final String key, final Object value,
final String template) {
Object target = value;
if (template != null) {
final Map<String, Object> paramMap = new HashMap<>(dataMap.size() + 1);
@ -173,7 +161,7 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer {
}
}
protected String evaluateValue(final String template, final Map<String, Object> paramMap) {
public default String evaluateValue(final String template, final Map<String, Object> paramMap) {
if (StringUtil.isEmpty(template)) {
return StringUtil.EMPTY;
}
@ -185,8 +173,13 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer {
}
return value.toString();
} catch (final Exception e) {
logger.warn("Invalid value format: " + template, e);
getLogger().warn("Invalid value format: " + template, e);
return null;
}
}
/**
 * Reads the maximum site length from the configuration returned by
 * {@link #getFessConfig()}.
 *
 * @return the configured value of
 *         {@code getCrawlerDocumentMaxSiteLengthAsInteger()}, unboxed to {@code int}
 */
public default int getMaxSiteLength() {
    final Integer configuredLength = getFessConfig().getCrawlerDocumentMaxSiteLengthAsInteger();
    return configuredLength.intValue();
}
}

View file

@ -29,6 +29,7 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.annotation.PostConstruct;
import javax.xml.transform.TransformerException;
import org.apache.commons.io.IOUtils;
@ -47,6 +48,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.crawler.util.ResponseDataUtil;
import org.codelibs.fess.es.config.exentity.CrawlingConfig;
@ -68,30 +70,31 @@ import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
public class FessXpathTransformer extends AbstractFessXpathTransformer {
public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class);
private static final int UTF8_BOM_SIZE = 3;
public String cacheXpath = "//BODY";
public String contentXpath = "//BODY";
public String langXpath = "//HTML/@lang";
public String digestXpath = "//META[@name='description']/@content";
public String canonicalXpath = "//LINK[@rel='canonical']/@href";
public List<String> prunedTagList = new ArrayList<String>();
public boolean prunedCacheContent = true;
public int maxDigestLength = 200;
public Map<String, String> convertUrlMap = new HashMap<>();
public int maxCacheLength = 2621440; // 2.5Mbytes
protected FessConfig fessConfig;
public Map<String, String> convertUrlMap = new HashMap<String, String>();
/**
 * Post-construct hook: stores the {@link FessConfig} obtained from
 * {@link ComponentUtil#getFessConfig()} in the {@code fessConfig} field.
 */
@PostConstruct
public void init() {
    this.fessConfig = ComponentUtil.getFessConfig();
}
/**
 * Returns the configuration held in the {@code fessConfig} field.
 *
 * @return the value currently stored in {@code fessConfig}
 */
@Override
public FessConfig getFessConfig() {
    return this.fessConfig;
}
/**
 * Returns the class-level logger of {@code FessXpathTransformer}.
 *
 * @return the static logger declared by this class
 */
@Override
public Logger getLogger() {
    return FessXpathTransformer.logger;
}
@Override
protected void storeData(final ResponseData responseData, final ResultData resultData) {
@ -181,7 +184,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
// canonical
if (StringUtil.isNotBlank(canonicalXpath)) {
if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCannonicalXpath())) {
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
final Set<RequestData> childUrlSet = new HashSet<>();
@ -202,6 +205,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
String url = responseData.getUrl();
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
url = pathMappingHelper.replaceUrl(sessionId, url);
final String mimeType = responseData.getMimeType();
final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
@ -223,26 +227,32 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
}
// lang
final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, langXpath, true));
final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlLangXpath(), true));
if (lang != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
}
// title
// content
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), getDocumentContent(responseData, document));
if (Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
|| fessConfig.isCrawlerDocumentCacheEnable()) {
String charSet = responseData.getCharSet();
if (charSet == null) {
charSet = Constants.UTF_8;
}
try {
// cache
putResultDataBody(dataMap, fessConfig.getIndexFieldCache(),
new String(InputStreamUtil.getBytes(responseData.getResponseBody()), charSet));
putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
} catch (final Exception e) {
logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
.isCrawlerDocumentCacheEnable()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0
&& responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
String charSet = responseData.getCharSet();
if (charSet == null) {
charSet = Constants.UTF_8;
}
try {
// cache
putResultDataBody(dataMap, fessConfig.getIndexFieldCache(),
new String(InputStreamUtil.getBytes(responseData.getResponseBody()), charSet));
putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
} catch (final Exception e) {
logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);
}
} else {
logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(),
fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
}
}
// digest
@ -261,7 +271,6 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
// anchor
putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
// mimetype
final String mimeType = responseData.getMimeType();
putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
if (fileTypeHelper != null) {
// filetype
@ -324,7 +333,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
}
protected String getCanonicalUrl(final ResponseData responseData, final Document document) {
final String canonicalUrl = getSingleNodeValue(document, canonicalXpath, false);
final String canonicalUrl = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlCannonicalXpath(), false);
if (StringUtil.isNotBlank(canonicalUrl)) {
return canonicalUrl;
}
@ -332,13 +341,15 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
}
protected String getDocumentDigest(final ResponseData responseData, final Document document) {
final String digest = getSingleNodeValue(document, digestXpath, false);
final String digest = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlDigestXpath(), false);
if (StringUtil.isNotBlank(digest)) {
return digest;
}
final String body = normalizeContent(removeCommentTag(getSingleNodeValue(document, contentXpath, prunedCacheContent)));
return StringUtils.abbreviate(body, maxDigestLength);
final String body =
normalizeContent(removeCommentTag(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(),
prunedCacheContent)));
return StringUtils.abbreviate(body, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger());
}
String removeCommentTag(final String content) {
@ -364,7 +375,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
}
private String getDocumentContent(final ResponseData responseData, final Document document) {
return normalizeContent(getSingleNodeValue(document, contentXpath, true));
return normalizeContent(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(), true));
}
protected String getSingleNodeValue(final Document document, final String xpath, final boolean pruned) {
@ -420,7 +431,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
}
protected boolean isPrunedTag(final String tagName) {
for (final String name : prunedTagList) {
for (final String name : getCrawlerDocumentHtmlPrunedTags()) {
if (name.equalsIgnoreCase(tagName)) {
return true;
}
@ -492,12 +503,6 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
return urlList;
}
/**
 * Appends a tag name to {@code prunedTagList}.
 * Names for which {@code StringUtil.isNotBlank} is false are ignored.
 *
 * @param tagName the tag name to register
 */
public void addPrunedTag(final String tagName) {
    final boolean hasName = StringUtil.isNotBlank(tagName);
    if (hasName) {
        prunedTagList.add(tagName);
    }
}
@Override
public Object getData(final AccessResultData<?> accessResultData) {
final byte[] data = accessResultData.getData();
@ -554,4 +559,9 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
/**
 * Checks whether the first three bytes of the array are EF BB BF.
 * Bytes are read in order and checking stops at the first mismatch,
 * so later indexes are only accessed when the earlier bytes matched.
 *
 * @param b the bytes to inspect
 * @return {@code true} if {@code b[0..2]} equal 0xEF, 0xBB, 0xBF
 */
private boolean isUtf8BomBytes(final byte[] b) {
    if (b[0] != (byte) 0xEF) {
        return false;
    }
    if (b[1] != (byte) 0xBB) {
        return false;
    }
    return b[2] == (byte) 0xBF;
}
/**
 * Returns the pruned tag names from the configuration.
 *
 * @return the array produced by
 *         {@code fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray()}
 */
protected String[] getCrawlerDocumentHtmlPrunedTags() {
    final String[] prunedTags = fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
    return prunedTags;
}
}

View file

@ -38,6 +38,7 @@ import java.util.regex.Pattern;
import javax.annotation.PostConstruct;
import javax.annotation.Resource;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.CoreLibConstants;
import org.codelibs.core.lang.StringUtil;
@ -398,7 +399,7 @@ public class ViewHelper implements Serializable {
if (locale == null) {
locale = Locale.ENGLISH;
}
String url = DocumentUtil.getValue(doc, "url", String.class);
String url = DocumentUtil.getValue(doc, fessConfig.getIndexFieldUrl(), String.class);
if (url == null) {
url = ComponentUtil.getMessageManager().getMessage(locale, "labels.search_unknown");
}
@ -417,6 +418,10 @@ public class ViewHelper implements Serializable {
String cache = DocumentUtil.getValue(doc, fessConfig.getIndexFieldCache(), String.class);
if (cache != null) {
String mimetype = DocumentUtil.getValue(doc, fessConfig.getIndexFieldMimetype(), String.class);
if (!ComponentUtil.getFessConfig().isHtmlMimetypeForCache(mimetype)) {
cache = StringEscapeUtils.escapeHtml4(cache);
}
cache = pathMappingHelper.replaceUrls(cache);
if (queries != null && queries.length > 0) {
doc.put("hlCache", replaceHighlightQueries(cache, queries));

View file

@ -20,7 +20,7 @@ import org.lastaflute.core.direction.exception.ConfigPropertyNotFoundException;
/**
* @author FreeGen
*/
public interface FessConfig extends FessEnv, FessProp {
public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction.FessProp {
/** The key of the configuration. e.g. Fess */
String DOMAIN_TITLE = "domain.title";
@ -66,9 +66,75 @@ public interface FessConfig extends FessEnv, FessProp {
-XX:+DisableExplicitGC */
String JVM_SUGGEST_OPTIONS = "jvm.suggest.options";
/** The key of the configuration. e.g. 50 */
String CRAWLER_DOCUMENT_MAX_SITE_LENGTH = "crawler.document.max.site.length";
/** The key of the configuration. e.g. UTF-8 */
String CRAWLER_DOCUMENT_SITE_ENCODING = "crawler.document.site.encoding";
/** The key of the configuration. e.g. unknown */
String CRAWLER_DOCUMENT_UNKNOWN_HOSTNAME = "crawler.document.unknown.hostname";
/** The key of the configuration. e.g. false */
String CRAWLER_DOCUMENT_USE_SITE_ENCODING_ON_ENGLISH = "crawler.document.use.site.encoding.on.english";
/** The key of the configuration. e.g. true */
String CRAWLER_DOCUMENT_APPEND_DATA = "crawler.document.append.data";
/** The key of the configuration. e.g. //BODY */
String CRAWLER_DOCUMENT_HTML_CONTENT_XPATH = "crawler.document.html.content.xpath";
/** The key of the configuration. e.g. //HTML/@lang */
String CRAWLER_DOCUMENT_HTML_LANG_XPATH = "crawler.document.html.lang.xpath";
/** The key of the configuration. e.g. //META[@name='description']/@content */
String CRAWLER_DOCUMENT_HTML_DIGEST_XPATH = "crawler.document.html.digest.xpath";
/** The key of the configuration. e.g. //LINK[@rel='canonical']/@href */
String CRAWLER_DOCUMENT_HTML_CANNONICAL_XPATH = "crawler.document.html.cannonical.xpath";
/** The key of the configuration. e.g. noscript,script */
String CRAWLER_DOCUMENT_HTML_PRUNED_TAGS = "crawler.document.html.pruned.tags";
/** The key of the configuration. e.g. 200 */
String CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH = "crawler.document.html.max.digest.length";
/** The key of the configuration. e.g. */
String CRAWLER_DOCUMENT_FILE_NAME_ENCODING = "crawler.document.file.name.encoding";
/** The key of the configuration. e.g. No title. */
String CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL = "crawler.document.file.no.title.label";
/** The key of the configuration. e.g. 10 */
String CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH = "crawler.document.file.abbreviation.margin.length";
/** The key of the configuration. e.g. false */
String CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT = "crawler.document.file.ignore.empty.content";
/** The key of the configuration. e.g. 100 */
String CRAWLER_DOCUMENT_FILE_MAX_TITLE_LENGTH = "crawler.document.file.max.title.length";
/** The key of the configuration. e.g. 200 */
String CRAWLER_DOCUMENT_FILE_MAX_DIGEST_LENGTH = "crawler.document.file.max.digest.length";
/** The key of the configuration. e.g. true */
String CRAWLER_DOCUMENT_FILE_APPEND_META_CONTENT = "crawler.document.file.append.meta.content";
/** The key of the configuration. e.g. true */
String CRAWLER_DOCUMENT_FILE_APPEND_BODY_CONTENT = "crawler.document.file.append.body.content";
/** The key of the configuration. e.g. true */
String CRAWLER_DOCUMENT_CACHE_ENABLE = "crawler.document.cache.enable";
/** The key of the configuration. e.g. 2621440 */
String CRAWLER_DOCUMENT_CACHE_MAX_SIZE = "crawler.document.cache.max.size";
/** The key of the configuration. e.g. text/html */
String CRAWLER_DOCUMENT_CACHE_SUPPORTED_MIMETYPES = "crawler.document.cache.supported.mimetypes";
/** The key of the configuration. e.g. text/html */
String CRAWLER_DOCUMENT_CACHE_HTML_MIMETYPES = "crawler.document.cache.html.mimetypes";
/** The key of the configuration. e.g. favorite_count */
String INDEX_FIELD_favorite_count = "index.field.favorite_count";
@ -475,19 +541,272 @@ public interface FessConfig extends FessEnv, FessProp {
String getJvmSuggestOptions();
/**
* Get the value for the key 'crawler.document.cache.enable'. <br>
* Get the value for the key 'crawler.document.max.site.length'. <br>
* The value is, e.g. 50 <br>
* comment: common
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentMaxSiteLength();
/**
* Get the value for the key 'crawler.document.max.site.length' as {@link Integer}. <br>
* The value is, e.g. 50 <br>
* comment: common
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
Integer getCrawlerDocumentMaxSiteLengthAsInteger();
/**
* Get the value for the key 'crawler.document.site.encoding'. <br>
* The value is, e.g. UTF-8 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentSiteEncoding();
/**
* Get the value for the key 'crawler.document.unknown.hostname'. <br>
* The value is, e.g. unknown <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentUnknownHostname();
/**
* Get the value for the key 'crawler.document.use.site.encoding.on.english'. <br>
* The value is, e.g. false <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentUseSiteEncodingOnEnglish();
/**
* Is the property for the key 'crawler.document.use.site.encoding.on.english' true? <br>
* The value is, e.g. false <br>
* @return The determination, true or false. (if not found, exception but basically no way)
*/
boolean isCrawlerDocumentUseSiteEncodingOnEnglish();
/**
* Get the value for the key 'crawler.document.append.data'. <br>
* The value is, e.g. true <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentAppendData();
/**
* Is the property for the key 'crawler.document.append.data' true? <br>
* The value is, e.g. true <br>
* @return The determination, true or false. (if not found, exception but basically no way)
*/
boolean isCrawlerDocumentAppendData();
/**
* Get the value for the key 'crawler.document.html.content.xpath'. <br>
* The value is, e.g. //BODY <br>
* comment: html
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentHtmlContentXpath();
/**
* Get the value for the key 'crawler.document.html.lang.xpath'. <br>
* The value is, e.g. //HTML/@lang <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentHtmlLangXpath();
/**
* Get the value for the key 'crawler.document.html.digest.xpath'. <br>
* The value is, e.g. //META[@name='description']/@content <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentHtmlDigestXpath();
/**
* Get the value for the key 'crawler.document.html.cannonical.xpath'. <br>
* The value is, e.g. //LINK[@rel='canonical']/@href <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentHtmlCannonicalXpath();
/**
* Get the value for the key 'crawler.document.html.pruned.tags'. <br>
* The value is, e.g. noscript,script <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentHtmlPrunedTags();
/**
* Get the value for the key 'crawler.document.html.max.digest.length'. <br>
* The value is, e.g. 200 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentHtmlMaxDigestLength();
/**
* Get the value for the key 'crawler.document.html.max.digest.length' as {@link Integer}. <br>
* The value is, e.g. 200 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
Integer getCrawlerDocumentHtmlMaxDigestLengthAsInteger();
/**
* Get the value for the key 'crawler.document.file.name.encoding'. <br>
* The value is, e.g. <br>
* comment: file
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentFileNameEncoding();
/**
* Get the value for the key 'crawler.document.file.name.encoding' as {@link Integer}. <br>
* The value is, e.g. <br>
* comment: file <br>
* NOTE(review): the key names an encoding and its sample value is empty, so this Integer view
* looks like a by-product of code generation rather than an intended accessor -- confirm before
* relying on it; {@link #getCrawlerDocumentFileNameEncoding()} returns the raw string.
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
Integer getCrawlerDocumentFileNameEncodingAsInteger();
/**
* Get the value for the key 'crawler.document.file.no.title.label'. <br>
* The value is, e.g. No title. <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentFileNoTitleLabel();
/**
* Get the value for the key 'crawler.document.file.abbreviation.margin.length'. <br>
* The value is, e.g. 10 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentFileAbbreviationMarginLength();
/**
* Get the value for the key 'crawler.document.file.abbreviation.margin.length' as {@link Integer}. <br>
* The value is, e.g. 10 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
Integer getCrawlerDocumentFileAbbreviationMarginLengthAsInteger();
/**
* Get the value for the key 'crawler.document.file.ignore.empty.content'. <br>
* The value is, e.g. false <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentFileIgnoreEmptyContent();
/**
* Is the property for the key 'crawler.document.file.ignore.empty.content' true? <br>
* The value is, e.g. false <br>
* @return The determination, true or false. (if not found, exception but basically no way)
*/
boolean isCrawlerDocumentFileIgnoreEmptyContent();
/**
* Get the value for the key 'crawler.document.file.max.title.length'. <br>
* The value is, e.g. 100 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentFileMaxTitleLength();
/**
* Get the value for the key 'crawler.document.file.max.title.length' as {@link Integer}. <br>
* The value is, e.g. 100 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
Integer getCrawlerDocumentFileMaxTitleLengthAsInteger();
/**
* Get the value for the key 'crawler.document.file.max.digest.length'. <br>
* The value is, e.g. 200 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentFileMaxDigestLength();
/**
* Get the value for the key 'crawler.document.file.max.digest.length' as {@link Integer}. <br>
* The value is, e.g. 200 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
Integer getCrawlerDocumentFileMaxDigestLengthAsInteger();
/**
* Get the value for the key 'crawler.document.file.append.meta.content'. <br>
* The value is, e.g. true <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentFileAppendMetaContent();
/**
* Is the property for the key 'crawler.document.file.append.meta.content' true? <br>
* The value is, e.g. true <br>
* @return The determination, true or false. (if not found, exception but basically no way)
*/
boolean isCrawlerDocumentFileAppendMetaContent();
/**
* Get the value for the key 'crawler.document.file.append.body.content'. <br>
* The value is, e.g. true <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentFileAppendBodyContent();
/**
* Is the property for the key 'crawler.document.file.append.body.content' true? <br>
* The value is, e.g. true <br>
* @return The determination, true or false. (if not found, exception but basically no way)
*/
boolean isCrawlerDocumentFileAppendBodyContent();
/**
* Get the value for the key 'crawler.document.cache.enable'. <br>
* The value is, e.g. true <br>
* comment: cache
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentCacheEnable();
/**
* Is the property for the key 'crawler.document.cache.enable' true? <br>
* The value is, e.g. false <br>
* The value is, e.g. true <br>
* comment: cache
* @return The determination, true or false. (if not found, exception but basically no way)
*/
boolean isCrawlerDocumentCacheEnable();
/**
* Get the value for the key 'crawler.document.cache.max.size'. <br>
* The value is, e.g. 2621440 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentCacheMaxSize();
/**
* Get the value for the key 'crawler.document.cache.max.size' as {@link Integer}. <br>
* The value is, e.g. 2621440 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
Integer getCrawlerDocumentCacheMaxSizeAsInteger();
/**
* Get the value for the key 'crawler.document.cache.supported.mimetypes'. <br>
* The value is, e.g. text/html <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentCacheSupportedMimetypes();
/**
* Get the value for the key 'crawler.document.cache.html.mimetypes'. <br>
* The value is, e.g. text/html <br>
* NOTE(review): the "comment:" text below is the commented-out mimetype list that sits directly
* above this key in the properties file; it appears to be an optional continuation of
* 'crawler.document.cache.supported.mimetypes' and does not describe this key. <br>
* comment: ,text/plain,application/xml,application/pdf,application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.ms-powerpoint,application/vnd.openxmlformats-officedocument.presentationml.presentation
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentCacheHtmlMimetypes();
/**
* Get the value for the key 'index.field.favorite_count'. <br>
* The value is, e.g. favorite_count <br>
@ -1515,6 +1834,126 @@ public interface FessConfig extends FessEnv, FessProp {
return get(FessConfig.JVM_SUGGEST_OPTIONS);
}
public String getCrawlerDocumentMaxSiteLength() {
return get(FessConfig.CRAWLER_DOCUMENT_MAX_SITE_LENGTH);
}
public Integer getCrawlerDocumentMaxSiteLengthAsInteger() {
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_SITE_LENGTH);
}
public String getCrawlerDocumentSiteEncoding() {
return get(FessConfig.CRAWLER_DOCUMENT_SITE_ENCODING);
}
public String getCrawlerDocumentUnknownHostname() {
return get(FessConfig.CRAWLER_DOCUMENT_UNKNOWN_HOSTNAME);
}
public String getCrawlerDocumentUseSiteEncodingOnEnglish() {
return get(FessConfig.CRAWLER_DOCUMENT_USE_SITE_ENCODING_ON_ENGLISH);
}
public boolean isCrawlerDocumentUseSiteEncodingOnEnglish() {
return is(FessConfig.CRAWLER_DOCUMENT_USE_SITE_ENCODING_ON_ENGLISH);
}
public String getCrawlerDocumentAppendData() {
return get(FessConfig.CRAWLER_DOCUMENT_APPEND_DATA);
}
public boolean isCrawlerDocumentAppendData() {
return is(FessConfig.CRAWLER_DOCUMENT_APPEND_DATA);
}
public String getCrawlerDocumentHtmlContentXpath() {
return get(FessConfig.CRAWLER_DOCUMENT_HTML_CONTENT_XPATH);
}
public String getCrawlerDocumentHtmlLangXpath() {
return get(FessConfig.CRAWLER_DOCUMENT_HTML_LANG_XPATH);
}
public String getCrawlerDocumentHtmlDigestXpath() {
return get(FessConfig.CRAWLER_DOCUMENT_HTML_DIGEST_XPATH);
}
public String getCrawlerDocumentHtmlCannonicalXpath() {
return get(FessConfig.CRAWLER_DOCUMENT_HTML_CANNONICAL_XPATH);
}
public String getCrawlerDocumentHtmlPrunedTags() {
return get(FessConfig.CRAWLER_DOCUMENT_HTML_PRUNED_TAGS);
}
public String getCrawlerDocumentHtmlMaxDigestLength() {
return get(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH);
}
public Integer getCrawlerDocumentHtmlMaxDigestLengthAsInteger() {
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH);
}
public String getCrawlerDocumentFileNameEncoding() {
return get(FessConfig.CRAWLER_DOCUMENT_FILE_NAME_ENCODING);
}
// Reads 'crawler.document.file.name.encoding' through getAsInteger().
// NOTE(review): the key holds an encoding name (empty by default), so an Integer accessor is
// presumably a generator artifact -- confirm whether any caller actually needs it.
public Integer getCrawlerDocumentFileNameEncodingAsInteger() {
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_NAME_ENCODING);
}
public String getCrawlerDocumentFileNoTitleLabel() {
return get(FessConfig.CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL);
}
public String getCrawlerDocumentFileAbbreviationMarginLength() {
return get(FessConfig.CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH);
}
public Integer getCrawlerDocumentFileAbbreviationMarginLengthAsInteger() {
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH);
}
public String getCrawlerDocumentFileIgnoreEmptyContent() {
return get(FessConfig.CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT);
}
public boolean isCrawlerDocumentFileIgnoreEmptyContent() {
return is(FessConfig.CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT);
}
public String getCrawlerDocumentFileMaxTitleLength() {
return get(FessConfig.CRAWLER_DOCUMENT_FILE_MAX_TITLE_LENGTH);
}
public Integer getCrawlerDocumentFileMaxTitleLengthAsInteger() {
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_MAX_TITLE_LENGTH);
}
public String getCrawlerDocumentFileMaxDigestLength() {
return get(FessConfig.CRAWLER_DOCUMENT_FILE_MAX_DIGEST_LENGTH);
}
public Integer getCrawlerDocumentFileMaxDigestLengthAsInteger() {
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_MAX_DIGEST_LENGTH);
}
public String getCrawlerDocumentFileAppendMetaContent() {
return get(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_META_CONTENT);
}
public boolean isCrawlerDocumentFileAppendMetaContent() {
return is(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_META_CONTENT);
}
public String getCrawlerDocumentFileAppendBodyContent() {
return get(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_BODY_CONTENT);
}
public boolean isCrawlerDocumentFileAppendBodyContent() {
return is(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_BODY_CONTENT);
}
public String getCrawlerDocumentCacheEnable() {
return get(FessConfig.CRAWLER_DOCUMENT_CACHE_ENABLE);
}
@ -1523,6 +1962,22 @@ public interface FessConfig extends FessEnv, FessProp {
return is(FessConfig.CRAWLER_DOCUMENT_CACHE_ENABLE);
}
public String getCrawlerDocumentCacheMaxSize() {
return get(FessConfig.CRAWLER_DOCUMENT_CACHE_MAX_SIZE);
}
public Integer getCrawlerDocumentCacheMaxSizeAsInteger() {
return getAsInteger(FessConfig.CRAWLER_DOCUMENT_CACHE_MAX_SIZE);
}
public String getCrawlerDocumentCacheSupportedMimetypes() {
return get(FessConfig.CRAWLER_DOCUMENT_CACHE_SUPPORTED_MIMETYPES);
}
public String getCrawlerDocumentCacheHtmlMimetypes() {
return get(FessConfig.CRAWLER_DOCUMENT_CACHE_HTML_MIMETYPES);
}
public String getIndexFieldFavoriteCount() {
return get(FessConfig.INDEX_FIELD_favorite_count);
}

View file

@ -18,6 +18,7 @@ package org.codelibs.fess.mylasta.direction;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.StreamUtil;
public interface FessProp {
public default String getProperty(String key) {
@ -79,4 +80,30 @@ public interface FessProp {
return getJvmSuggestOptions().split("\n");
}
String getCrawlerDocumentHtmlPrunedTags();
/**
 * Returns the tag names configured under 'crawler.document.html.pruned.tags' as an array.
 * <p>
 * The raw value is split on commas. Each entry is trimmed and empty entries are dropped, so
 * "noscript, script" yields {"noscript", "script"} and an empty value yields an empty array
 * (a bare split(",") would keep the leading space and return a single empty string).
 *
 * @return The configured tag names. (NotNull, EmptyAllowed)
 */
public default String[] getCrawlerDocumentHtmlPrunedTagsAsArray() {
    return StreamUtil.of(getCrawlerDocumentHtmlPrunedTags().split(","))
            .map(String::trim)
            // an empty name can never match an element, so do not hand it to callers
            .filter(tag -> !tag.isEmpty())
            .toArray(String[]::new);
}
String getCrawlerDocumentCacheHtmlMimetypes();
/**
 * Determines whether the given mimetype is listed in 'crawler.document.cache.html.mimetypes'.
 * <p>
 * The property is a comma-separated list. A blank property means "no restriction" and makes
 * every mimetype match. Entries are trimmed before the case-insensitive comparison so that a
 * value written as "text/html, application/xhtml+xml" matches both types.
 *
 * @param mimetype The mimetype to check. (NullAllowed: null matches no listed entry)
 * @return true if the property is blank or contains the mimetype.
 */
public default boolean isHtmlMimetypeForCache(String mimetype) {
    final String[] mimetypes = getCrawlerDocumentCacheHtmlMimetypes().split(",");
    if (mimetypes.length == 1 && StringUtil.isBlank(mimetypes[0])) {
        // nothing configured: treat every mimetype as accepted
        return true;
    }
    return StreamUtil.of(mimetypes).map(String::trim).anyMatch(s -> s.equalsIgnoreCase(mimetype));
}
String getCrawlerDocumentCacheSupportedMimetypes();
/**
 * Determines whether the given mimetype is listed in
 * 'crawler.document.cache.supported.mimetypes'.
 * <p>
 * The property is a comma-separated list. A blank property means "no restriction" and makes
 * every mimetype match. Entries are trimmed before the case-insensitive comparison so that a
 * value written as "text/html, text/plain" matches both types.
 *
 * @param mimetype The mimetype to check. (NullAllowed: null matches no listed entry)
 * @return true if the property is blank or contains the mimetype.
 */
public default boolean isSupportedDocumentCacheMimetypes(String mimetype) {
    final String[] mimetypes = getCrawlerDocumentCacheSupportedMimetypes().split(",");
    if (mimetypes.length == 1 && StringUtil.isBlank(mimetypes[0])) {
        // nothing configured: treat every mimetype as accepted
        return true;
    }
    return StreamUtil.of(mimetypes).map(String::trim).anyMatch(s -> s.equalsIgnoreCase(mimetype));
}
}

View file

@ -4,7 +4,6 @@
<components namespace="fessCrawler">
<include path="crawler/transformer_basic.xml"/>
<component name="fessXpathTransformer" class="org.codelibs.fess.crawler.transformer.FessXpathTransformer" instance="singleton">
<property name="name">"fessXpathTransformer"</property>
<property name="featureMap">defaultFeatureMap</property>
@ -16,31 +15,15 @@
<property name="convertUrlMap">
{"feed:" : "http:"}
</property>
<!--
<property name="cacheXpath">"//BODY"</property>
<property name="contentXpath">"//BODY"</property>
<property name="anchorXpath">"//A/@href"</property>
<property name="digestXpath">"//META[@name='description']/@content"</property>
-->
<property name="replaceSiteEncodingWhenEnglish">true</property>
<property name="siteEncoding">"UTF-8"</property>
<!-- segment -->
<postConstruct name="addFieldRule">
<arg>"title"</arg>
<arg>"//TITLE"</arg>
</postConstruct>
<postConstruct name="addPrunedTag">
<arg>"noscript"</arg>
</postConstruct>
<postConstruct name="addPrunedTag">
<arg>"script"</arg>
</postConstruct>
</component>
<component name="fessFileTransformer" class="org.codelibs.fess.crawler.transformer.FessFileTransformer" instance="singleton">
<property name="name">"fessFileTransformer"</property>
<property name="replaceSiteEncodingWhenEnglish">true</property>
<property name="siteEncoding">"UTF-8"</property>
<postConstruct name="addMetaContentMapping">
<arg>"title"</arg>
<arg>"title"</arg>
@ -60,8 +43,6 @@
<component name="fessTikaTransformer" class="org.codelibs.fess.crawler.transformer.FessTikaTransformer" instance="singleton">
<property name="name">"fessTikaTransformer"</property>
<property name="replaceSiteEncodingWhenEnglish">true</property>
<property name="siteEncoding">"UTF-8"</property>
<postConstruct name="addMetaContentMapping">
<arg>"title"</arg>
<arg>"title"</arg>

View file

@ -50,7 +50,37 @@ jvm.suggest.options=\
# Index
# ====
crawler.document.cache.enable=false
# common
crawler.document.max.site.length=50
crawler.document.site.encoding=UTF-8
crawler.document.unknown.hostname=unknown
crawler.document.use.site.encoding.on.english=false
crawler.document.append.data=true
# html
crawler.document.html.content.xpath=//BODY
crawler.document.html.lang.xpath=//HTML/@lang
crawler.document.html.digest.xpath=//META[@name='description']/@content
crawler.document.html.cannonical.xpath=//LINK[@rel='canonical']/@href
crawler.document.html.pruned.tags=noscript,script
crawler.document.html.max.digest.length=200
# file
crawler.document.file.name.encoding=
crawler.document.file.no.title.label=No title.
crawler.document.file.abbreviation.margin.length=10
crawler.document.file.ignore.empty.content=false
crawler.document.file.max.title.length=100
crawler.document.file.max.digest.length=200
crawler.document.file.append.meta.content=true
crawler.document.file.append.body.content=true
# cache
crawler.document.cache.enable=true
crawler.document.cache.max.size=2621440
crawler.document.cache.supported.mimetypes=text/html
# Optional: append the list below to crawler.document.cache.supported.mimetypes to cache more document types.
#,text/plain,application/xml,application/pdf,application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.ms-powerpoint,application/vnd.openxmlformats-officedocument.presentationml.presentation
crawler.document.cache.html.mimetypes=text/html
# field names
index.field.favorite_count=favorite_count

View file

@ -1,9 +0,0 @@
<%@page pageEncoding="UTF-8" %>
<html>
<head>
<jsp:include page="indexHtmlHead.jsp"/>
</head>
<body>
<jsp:include page="indexMain.jsp"/>
</body>
</html>

View file

@ -1,4 +0,0 @@
<%@page pageEncoding="UTF-8" %>
<meta http-equiv="Content-Type" content="text/html; charset=<m:charset/>" />
<meta content="no-cache" http-equiv="Cache-Control"/>
<title><bean:message key="labels.mobile_search_title"/></title>

View file

@ -1,14 +0,0 @@
<%@page pageEncoding="UTF-8" %>
<div>
<div style="text-align: center;">
<m:img src="logo-top.png" magniWidth="0.8" style="vertical-align: middle;" />
<br/>
<s:form>
<div>
<html:text property="query" title="Search" size="20" maxlength="1000" />
<br/>
<input type="submit" value="<bean:message key="labels.top.search"/>" name="search" />
</div>
</s:form>
</div>
</div>

View file

@ -1,18 +0,0 @@
<%@page pageEncoding="UTF-8" %>
<html>
<head>
<jsp:include page="searchHtmlHead.jsp"/>
</head>
<body>
<jsp:include page="searchHeader.jsp"/>
<c:choose>
<c:when test="${f:h(allRecordCount) != 0}">
<jsp:include page="searchResults.jsp"/>
</c:when>
<c:otherwise>
<jsp:include page="searchNoResult.jsp"/>
</c:otherwise>
</c:choose>
<jsp:include page="searchFooter.jsp"/>
</body>
</html>

View file

@ -1,5 +0,0 @@
<%@page pageEncoding="UTF-8" %>
<hr style="border-style: solid; border-color: #ffffff;"/>
<div style="font-size: x-small; text-align: center;">
<bean:message key="labels.footer.copyright"/>
</div>

View file

@ -1,13 +0,0 @@
<%@page pageEncoding="UTF-8" %>
<div id="header">
<div>
<s:form>
<div>
<m:img src="logo-top.png" magniWidth="0.3" />
<br/>
<html:text property="query" title="Search" size="16" maxlength="1000" />
<input type="submit" value="<bean:message key="labels.search"/>" name="search"/>
</div>
</s:form>
</div>
</div>

View file

@ -1,4 +0,0 @@
<%@page pageEncoding="UTF-8" %>
<meta http-equiv="Content-Type" content="text/html; charset=<m:charset/>" />
<meta content="no-cache" http-equiv="Cache-Control"/>
<title>${f:h(query)} - <bean:message key="labels.search_title"/></title>

View file

@ -1,4 +0,0 @@
<%@page pageEncoding="UTF-8" %>
<div id="result">
<bean:message key="labels.did_not_match" arg0="${f:h(query)}"/>
</div>

View file

@ -1,55 +0,0 @@
<%@page pageEncoding="UTF-8" %>
<div id="result">
<div>
<c:forEach var="doc" varStatus="s" items="${documentItems}">
<div>
<a href="${doc.urlLink}"><span>${f:h(doc.contentTitle)}</span></a>
<span id="snip">
<br/>
<span style="color: #666666;">
${doc.contentDescription}
</span>
</span>
<span style="color: #008000;">
<br/>
${f:h(doc.site)}
</span>
<br/>
</div>
<br/>
</c:forEach>
</div>
</div>
<div id="subfooter" style="text-align: center;">
<p>
<c:if test="${existPrevPage}">
<span>
<s:link href="prev?query=${f:u(query)}&pn=${f:u(currentPageNumber)}&num=${f:u(pageSize)}">
<bean:message key="labels.prev_page"/>
</s:link>
</span>
</c:if>
<c:forEach var="pageNumber" varStatus="s" items="${pageNumberList}">
<c:if test="${pageNumber == currentPageNumber}">
<span>
${pageNumber}
</span>
</c:if>
<c:if test="${pageNumber != currentPageNumber}">
<span>
<s:link href="move?query=${f:u(query)}&pn=${f:u(pageNumber)}&num=${f:u(pageSize)}">
${f:h(pageNumber)}
</s:link>
</span>
</c:if>
</c:forEach>
<c:if test="${existNextPage}">
<span>
<s:link href="next?query=${f:u(query)}&pn=${f:u(currentPageNumber)}&num=${f:u(pageSize)}">
<bean:message key="labels.next_page"/>
</s:link>
</span>
</c:if>
</p>
</div>

View file

@ -36,8 +36,10 @@
<div class="site ellipsis">
<cite>${f:h(doc.sitePath)}</cite>
<c:if test="${doc.has_cache=='true'}">
<small>
<la:link href="/cache/?docId=${doc.doc_id}${appendHighlightParams}" class="cache"><la:message
key="labels.search_result_cache" /></la:link>
</small>
</c:if>
</div>
<div class="more hidden-md-up">

View file

@ -34,7 +34,7 @@ public class FessFileTransformerTest extends UnitFessTestCase {
public void test_decodeUrl_ok() throws Exception {
String url, exp;
final FessFileTransformer transformer = new FessFileTransformer();
final FessFileTransformer transformer = createInstance();
url = "";
exp = "";
@ -62,156 +62,171 @@ public class FessFileTransformerTest extends UnitFessTestCase {
}
public void test_decodeUrl_null() throws Exception {
final FessFileTransformer transformer = new FessFileTransformer();
final FessFileTransformer transformer = createInstance();
assertNull(transformer.decodeUrlAsName(null, true));
}
public void test_getHost_ok() {
String url, exp;
final FessFileTransformer transformer = new FessFileTransformer();
final FessFileTransformer transformer = createInstance();
url = "";
exp = "";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "http://server/home/user";
exp = "server";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "file:/home/user";
exp = "localhost";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "file:/c:/home/user";
exp = "localhost";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "file:////server/home/user";
exp = "server";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "file:/" + encodeUrl("ホーム") + "/user";
exp = "localhost";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "file:/c:/" + encodeUrl("ホーム") + "/user";
exp = "localhost";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "file:////" + encodeUrl("サーバー") + "/home/user";
exp = "サーバー";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
}
public void test_getHost_unexpected() {
String url, exp;
final FessFileTransformer transformer = new FessFileTransformer();
final FessFileTransformer transformer = createInstance();
url = null;
exp = "";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "example:";
exp = "unknown";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "file:";
exp = "localhost";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "file://";
exp = "localhost";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "file:///";
exp = "localhost";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "file://///";
exp = "localhost";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "file://///example";
exp = "localhost";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
url = "file:/c:";
exp = "localhost";
assertEquals(exp, transformer.getHost(url));
assertEquals(exp, transformer.getHostOnFile(url));
}
public void test_getSite_ok() {
String url, exp;
final FessFileTransformer transformer = new FessFileTransformer();
final FessFileTransformer transformer = createInstance();
url = "";
exp = "";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
url = "http://example.com/";
exp = "example.com/";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
url = "http://example.com/index.html";
exp = "example.com/index.html";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
url = "file:/home/user";
exp = "/home/user";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
url = "file:/c:/home/user";
exp = "c:\\home\\user";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
url = "file:/c:/";
exp = "c:\\";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
url = "file:////server/user";
exp = "\\\\server\\user";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
}
transformer.maxSiteLength = 10;
public void test_getSite_ok_len10() {
String url, exp;
final FessFileTransformer transformer = new FessFileTransformer() {
@Override
public int getMaxSiteLength() {
return 10;
}
};
transformer.init();
url = "file:/home/user/foo";
exp = "/home/u...";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
}
public void test_getSite_unexpected() {
String url, exp;
final FessFileTransformer transformer = new FessFileTransformer();
final FessFileTransformer transformer = createInstance();
url = "file:";
exp = "";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
url = "file";
exp = "file";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
url = "file:/";
exp = "/";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
url = "file:/c:";
exp = "c:";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
url = "file://";
exp = "//";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
url = "file:///";
exp = "///";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
url = "file://///";
exp = "\\\\\\";
assertEquals(exp, transformer.getSite(url, "UTF-8"));
assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
}
/**
 * Builds the transformer under test and calls init() on it before handing it to a test.
 *
 * @return A new, initialized FessFileTransformer.
 */
private FessFileTransformer createInstance() {
    final FessFileTransformer initialized = new FessFileTransformer();
    initialized.init();
    return initialized;
}
}

View file

@ -46,6 +46,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
public void setUp() throws Exception {
super.setUp();
fessXpathTransformer = new FessXpathTransformer();
fessXpathTransformer.init();
fessXpathTransformer.convertUrlMap.put("feed:", "http:");
}
@ -53,7 +54,11 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
final Document document = getDocument(data);
final FessXpathTransformer transformer = new FessXpathTransformer();
final FessXpathTransformer transformer = new FessXpathTransformer() {
protected String[] getCrawlerDocumentHtmlPrunedTags() {
return new String[0];
}
};
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
assertEquals(getXmlString(document), getXmlString(pruneNode));
@ -63,8 +68,11 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
final Document document = getDocument(data);
final FessXpathTransformer transformer = new FessXpathTransformer();
transformer.prunedTagList.add("noscript");
final FessXpathTransformer transformer = new FessXpathTransformer() {
protected String[] getCrawlerDocumentHtmlPrunedTags() {
return new String[] { "noscript" };
}
};
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
final String docString = getXmlString(document);
@ -83,9 +91,11 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
final Document document = getDocument(data);
final FessXpathTransformer transformer = new FessXpathTransformer();
transformer.prunedTagList.add("script");
transformer.prunedTagList.add("noscript");
final FessXpathTransformer transformer = new FessXpathTransformer() {
protected String[] getCrawlerDocumentHtmlPrunedTags() {
return new String[] { "script", "noscript" };
}
};
final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
final String docString = getXmlString(document);
@ -235,6 +245,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
public void test_canonicalXpath() throws Exception {
final FessXpathTransformer transformer = new FessXpathTransformer();
transformer.init();
final Map<String, Object> dataMap = new HashMap<String, Object>();
final ResponseData responseData = new ResponseData();