diff --git a/dbflute_fess/dfprop/lastafluteMap.dfprop b/dbflute_fess/dfprop/lastafluteMap.dfprop index 6db7b17c8..c13b749e0 100644 --- a/dbflute_fess/dfprop/lastafluteMap.dfprop +++ b/dbflute_fess/dfprop/lastafluteMap.dfprop @@ -41,6 +41,7 @@ map:{ ; fess = map:{ ; path = .. ; freeGenList = list:{ env ; config ; label ; message ; mail ; template ; jsp ; doc } + ; configPluginInterface = org.codelibs.fess.mylasta.direction.FessProp ; propertiesHtmlList = list:{ env ; config ; label ; message } } } diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java index 40aafa426..c9185d039 100644 --- a/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java +++ b/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java @@ -45,6 +45,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue; import org.codelibs.fess.crawler.exception.CrawlerSystemException; import org.codelibs.fess.crawler.exception.CrawlingAccessException; import org.codelibs.fess.crawler.extractor.Extractor; +import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer; import org.codelibs.fess.crawler.util.CrawlingParameterUtil; import org.codelibs.fess.es.config.exentity.CrawlingConfig; import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName; @@ -57,36 +58,20 @@ import org.codelibs.fess.helper.SambaHelper; import org.codelibs.fess.helper.SystemHelper; import org.codelibs.fess.mylasta.direction.FessConfig; import org.codelibs.fess.util.ComponentUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import jcifs.smb.ACE; import jcifs.smb.SID; -public abstract class AbstractFessFileTransformer extends AbstractFessXpathTransformer { - private static final Logger logger = LoggerFactory // NOPMD - .getLogger(AbstractFessFileTransformer.class); +public abstract class AbstractFessFileTransformer extends AbstractTransformer implements FessTransformer { - public String encoding = null; - - public String noTitleLabel = "No title."; - - public int abbreviationMarginLength = 10; - - public boolean ignoreEmptyContent = false; - - public int maxTitleLength = 100; - - public int maxDigestLength = 200; - - public boolean appendMetaContentToContent = true; - - public boolean appendBodyContentToContent = true; + protected String charsetName = Constants.UTF_8; public Map parentEncodingMap = Collections.synchronizedMap(new LruHashMap(1000)); protected Map metaContentMapping; + protected FessConfig fessConfig; + protected abstract Extractor getExtractor(ResponseData responseData); @Override @@ -109,11 +94,11 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans try { final ExtractData extractData = extractor.getText(in, params); content = extractData.getContent(); - if (ignoreEmptyContent && StringUtil.isBlank(content)) { + if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) { return null; } - if (logger.isDebugEnabled()) { - logger.debug("ExtractData: " + extractData); + if (getLogger().isDebugEnabled()) { + getLogger().debug("ExtractData: " + extractData); } // meta for (final String key : extractData.getKeySet()) { @@ -191,10 +176,10 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId); // content final StringBuilder buf = new StringBuilder(content.length() + 1000); - if (appendBodyContentToContent) { + if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) { buf.append(content); } - if (appendMetaContentToContent) { + if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) { if (buf.length() > 0) { buf.append(' '); } @@ -206,23 +191,29 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans } else { putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), StringUtil.EMPTY); } - if (Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) - || fessConfig.isCrawlerDocumentCacheEnable()) { - final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " "); - // text cache - putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache); - putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE); + if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig + .isCrawlerDocumentCacheEnable()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) { + if (responseData.getContentLength() > 0 + && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) { + + final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " "); + // text cache + putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache); + putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE); + } } // digest putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), - Constants.DIGEST_PREFIX + abbreviate(normalizeContent(content), maxDigestLength)); + Constants.DIGEST_PREFIX + + abbreviate(normalizeContent(content), fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger())); // title if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) { if (url.endsWith("/")) { if (StringUtil.isNotBlank(content)) { - putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), abbreviate(body, maxTitleLength)); + putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), + abbreviate(body, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger())); } else { - putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), noTitleLabel); + putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel()); } } else { final String u = decodeUrlAsName(url, url.startsWith("file:")); @@ -235,9 +226,9 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans } } // host - putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url)); + putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url)); // site - putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding)); + putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding)); // url putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url); // created @@ -287,8 +278,8 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans final SID sid = item.getSID(); roleTypeList.add(sambaHelper.getAccountId(sid)); } - if (logger.isDebugEnabled()) { - logger.debug("smbUrl:" + responseData.getUrl() + " roleType:" + roleTypeList.toString()); + if (getLogger().isDebugEnabled()) { + getLogger().debug("smbUrl:" + responseData.getUrl() + " roleType:" + roleTypeList.toString()); } } } @@ -335,7 +326,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans protected String abbreviate(final String str, final int maxWidth) { String newStr = StringUtils.abbreviate(str, maxWidth); try { - if (newStr.getBytes(Constants.UTF_8).length > maxWidth + abbreviationMarginLength) { + if (newStr.getBytes(Constants.UTF_8).length > maxWidth + fessConfig.getCrawlerDocumentFileAbbreviationMarginLengthAsInteger()) { newStr = StringUtils.abbreviate(str, maxWidth / 2); } } catch (final UnsupportedEncodingException e) { @@ -370,7 +361,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans } String enc = Constants.UTF_8; - if (encoding == null) { + if (StringUtil.isBlank(fessConfig.getCrawlerDocumentFileNameEncoding())) { final UrlQueue urlQueue = CrawlingParameterUtil.getUrlQueue(); if (urlQueue != null) { final String parentUrl = urlQueue.getParentUrl(); @@ -385,7 +376,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans } } } else { - enc = encoding; + enc = fessConfig.getCrawlerDocumentFileNameEncoding(); } final String escapedUrl = escapePlus ? url.replace("+", "%2B") : url; @@ -415,8 +406,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans return null; } - @Override - protected String getHost(final String url) { + protected String getHostOnFile(final String url) { if (StringUtil.isBlank(url)) { return StringUtil.EMPTY; // empty } @@ -435,30 +425,29 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans return "localhost"; } - return super.getHost(url); + return getHost(url); } - @Override - protected String getSite(final String url, final String encoding) { + protected String getSiteOnFile(final String url, final String encoding) { if (StringUtil.isBlank(url)) { return StringUtil.EMPTY; // empty } if (url.startsWith("file:////")) { final String value = decodeUrlAsName(url.substring(9), true); - return StringUtils.abbreviate("\\\\" + value.replace('/', '\\'), maxSiteLength); + return StringUtils.abbreviate("\\\\" + value.replace('/', '\\'), getMaxSiteLength()); } else if (url.startsWith("file:")) { final String value = decodeUrlAsName(url.substring(5), true); if (value.length() > 2 && value.charAt(2) == ':') { // Windows - return StringUtils.abbreviate(value.substring(1).replace('/', '\\'), maxSiteLength); + return StringUtils.abbreviate(value.substring(1).replace('/', '\\'), getMaxSiteLength()); } else { // Unix - return StringUtils.abbreviate(value, maxSiteLength); + return StringUtils.abbreviate(value, getMaxSiteLength()); } } - return super.getSite(url, encoding); + return getSite(url, encoding); } @Override @@ -480,4 +469,5 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans } metaContentMapping.put(metaname, dynamicField); } + } diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessFileTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessFileTransformer.java index 64f59d0a0..bc1f39ddb 100644 --- a/src/main/java/org/codelibs/fess/crawler/transformer/FessFileTransformer.java +++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessFileTransformer.java @@ -15,10 +15,13 @@ */ package org.codelibs.fess.crawler.transformer; +import javax.annotation.PostConstruct; + import org.codelibs.fess.crawler.entity.ResponseData; import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.crawler.extractor.ExtractorFactory; import org.codelibs.fess.exception.FessSystemException; +import org.codelibs.fess.mylasta.direction.FessConfig; import org.codelibs.fess.util.ComponentUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,6 +29,21 @@ import org.slf4j.LoggerFactory; public class FessFileTransformer extends AbstractFessFileTransformer { private static final Logger logger = LoggerFactory.getLogger(FessFileTransformer.class); + @PostConstruct + public void init() { + fessConfig = ComponentUtil.getFessConfig(); + } + + @Override + public FessConfig getFessConfig() { + return fessConfig; + } + + @Override + public Logger getLogger() { + return logger; + } + @Override protected Extractor getExtractor(final ResponseData responseData) { final ExtractorFactory extractorFactory = ComponentUtil.getExtractorFactory(); diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessTikaTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessTikaTransformer.java index a6176c1c9..e93383955 100644 --- a/src/main/java/org/codelibs/fess/crawler/transformer/FessTikaTransformer.java +++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessTikaTransformer.java @@ -15,12 +15,35 @@ */ package org.codelibs.fess.crawler.transformer; +import javax.annotation.PostConstruct; + import org.codelibs.fess.crawler.entity.ResponseData; import org.codelibs.fess.crawler.extractor.Extractor; import org.codelibs.fess.exception.FessSystemException; +import org.codelibs.fess.mylasta.direction.FessConfig; +import org.codelibs.fess.util.ComponentUtil; import org.lastaflute.di.core.SingletonLaContainer; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class FessTikaTransformer extends AbstractFessFileTransformer { + private static final Logger logger = LoggerFactory.getLogger(FessTikaTransformer.class); + + @PostConstruct + public void init() { + fessConfig = ComponentUtil.getFessConfig(); + } + + @Override + public FessConfig getFessConfig() { + return fessConfig; + } + + @Override + public Logger getLogger() { + return logger; + } + @Override protected Extractor getExtractor(final ResponseData responseData) { final Extractor extractor = SingletonLaContainer.getComponent("tikaExtractor"); diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessXpathTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessTransformer.java similarity index 65% rename from src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessXpathTransformer.java rename to src/main/java/org/codelibs/fess/crawler/transformer/FessTransformer.java index b66198126..e87a386a2 100644 --- a/src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessXpathTransformer.java +++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessTransformer.java @@ -22,29 +22,20 @@ import java.util.Map; import org.apache.commons.lang3.StringUtils; import org.codelibs.core.lang.StringUtil; -import org.codelibs.fess.crawler.transformer.impl.XpathTransformer; import org.codelibs.fess.mylasta.direction.FessConfig; import org.codelibs.fess.util.ComponentUtil; import org.slf4j.Logger; -import org.slf4j.LoggerFactory; import groovy.lang.Binding; import groovy.lang.GroovyShell; -public abstract class AbstractFessXpathTransformer extends XpathTransformer { - private static final Logger logger = LoggerFactory.getLogger(AbstractFessXpathTransformer.class); +public interface FessTransformer { - public int maxSiteLength = 50; + FessConfig getFessConfig(); - public String unknownHostname = "unknown"; + Logger getLogger(); - public String siteEncoding; - - public boolean replaceSiteEncodingWhenEnglish = false; - - public boolean appendResultData = true; - - protected String getHost(final String u) { + public default String getHost(final String u) { if (StringUtil.isBlank(u)) { return StringUtil.EMPTY; // empty } @@ -63,13 +54,13 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer { } if (url.equals(originalUrl)) { - return unknownHostname; + return getFessConfig().getCrawlerDocumentUnknownHostname(); } return url; } - protected String getSite(final String u, final String encoding) { + public default String getSite(final String u, final String encoding) { if (StringUtil.isBlank(u)) { return StringUtil.EMPTY; // empty } @@ -87,15 +78,15 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer { if (encoding != null) { String enc; - if (siteEncoding != null) { - if (replaceSiteEncodingWhenEnglish) { + if (StringUtil.isNotBlank(getFessConfig().getCrawlerDocumentSiteEncoding())) { + if (getFessConfig().isCrawlerDocumentUseSiteEncodingOnEnglish()) { if ("ISO-8859-1".equalsIgnoreCase(encoding) || "US-ASCII".equalsIgnoreCase(encoding)) { - enc = siteEncoding; + enc = getFessConfig().getCrawlerDocumentSiteEncoding(); } else { enc = encoding; } } else { - enc = siteEncoding; + enc = getFessConfig().getCrawlerDocumentSiteEncoding(); } } else { enc = encoding; @@ -106,39 +97,35 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer { } catch (final Exception e) {} } - return StringUtils.abbreviate(url, maxSiteLength); + return StringUtils.abbreviate(url, getMaxSiteLength()); } - protected String normalizeContent(final String content) { + public default String normalizeContent(final String content) { if (content == null) { return StringUtil.EMPTY; // empty } return content.replaceAll("\\s+", " "); } - protected void putResultDataBody(final Map dataMap, final String key, final Object value) { + public default void putResultDataBody(final Map dataMap, final String key, final Object value) { final FessConfig fessConfig = ComponentUtil.getFessConfig(); if (fessConfig.getIndexFieldUrl().equals(key)) { dataMap.put(key, value); } else if (dataMap.containsKey(key)) { - if (appendResultData) { + if (getFessConfig().isCrawlerDocumentAppendData()) { final Object oldValue = dataMap.get(key); - if (key.endsWith("_m")) { - final Object[] oldValues = (Object[]) oldValue; - if (value.getClass().isArray()) { - final Object[] newValues = (Object[]) value; - final Object[] values = Arrays.copyOf(oldValues, oldValues.length + newValues.length); - for (int i = 0; i < newValues.length; i++) { - values[values.length - 1 + i] = newValues[i]; - } - dataMap.put(key, values); - } else { - final Object[] values = Arrays.copyOf(oldValues, oldValues.length + 1); - values[values.length - 1] = value; - dataMap.put(key, values); + final Object[] oldValues = (Object[]) oldValue; + if (value.getClass().isArray()) { + final Object[] newValues = (Object[]) value; + final Object[] values = Arrays.copyOf(oldValues, oldValues.length + newValues.length); + for (int i = 0; i < newValues.length; i++) { + values[values.length - 1 + i] = newValues[i]; } + dataMap.put(key, values); } else { - dataMap.put(key, oldValue + " " + value); + final Object[] values = Arrays.copyOf(oldValues, oldValues.length + 1); + values[values.length - 1] = value; + dataMap.put(key, values); } } else { dataMap.put(key, value); @@ -148,7 +135,8 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer { } } - protected void putResultDataWithTemplate(final Map dataMap, final String key, final Object value, final String template) { + public default void putResultDataWithTemplate(final Map dataMap, final String key, final Object value, + final String template) { Object target = value; if (template != null) { final Map paramMap = new HashMap<>(dataMap.size() + 1); @@ -173,7 +161,7 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer { } } - protected String evaluateValue(final String template, final Map paramMap) { + public default String evaluateValue(final String template, final Map paramMap) { if (StringUtil.isEmpty(template)) { return StringUtil.EMPTY; } @@ -185,8 +173,13 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer { } return value.toString(); } catch (final Exception e) { - logger.warn("Invalid value format: " + template, e); + getLogger().warn("Invalid value format: " + template, e); return null; } } + + public default int getMaxSiteLength() { + return getFessConfig().getCrawlerDocumentMaxSiteLengthAsInteger(); + } + } \ No newline at end of file diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java index dc0482261..751c2c81b 100644 --- a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java +++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java @@ -29,6 +29,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import javax.annotation.PostConstruct; import javax.xml.transform.TransformerException; import org.apache.commons.io.IOUtils; @@ -47,6 +48,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue; import org.codelibs.fess.crawler.exception.ChildUrlsException; import org.codelibs.fess.crawler.exception.CrawlerSystemException; import org.codelibs.fess.crawler.exception.CrawlingAccessException; +import org.codelibs.fess.crawler.transformer.impl.XpathTransformer; import org.codelibs.fess.crawler.util.CrawlingParameterUtil; import org.codelibs.fess.crawler.util.ResponseDataUtil; import org.codelibs.fess.es.config.exentity.CrawlingConfig; @@ -68,30 +70,31 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; -public class FessXpathTransformer extends AbstractFessXpathTransformer { +public class FessXpathTransformer extends XpathTransformer implements FessTransformer { private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class); private static final int UTF8_BOM_SIZE = 3; - public String cacheXpath = "//BODY"; - - public String contentXpath = "//BODY"; - - public String langXpath = "//HTML/@lang"; - - public String digestXpath = "//META[@name='description']/@content"; - - public String canonicalXpath = "//LINK[@rel='canonical']/@href"; - - public List prunedTagList = new ArrayList(); - public boolean prunedCacheContent = true; - public int maxDigestLength = 200; + public Map convertUrlMap = new HashMap<>(); - public int maxCacheLength = 2621440; // 2.5Mbytes + protected FessConfig fessConfig; - public Map convertUrlMap = new HashMap(); + @PostConstruct + public void init() { + fessConfig = ComponentUtil.getFessConfig(); + } + + @Override + public FessConfig getFessConfig() { + return fessConfig; + } + + @Override + public Logger getLogger() { + return logger; + } @Override protected void storeData(final ResponseData responseData, final ResultData resultData) { @@ -181,7 +184,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer { protected void putAdditionalData(final Map dataMap, final ResponseData responseData, final Document document) { // canonical - if (StringUtil.isNotBlank(canonicalXpath)) { + if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCannonicalXpath())) { final String canonicalUrl = getCanonicalUrl(responseData, document); if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) { final Set childUrlSet = new HashSet<>(); @@ -202,6 +205,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer { String url = responseData.getUrl(); final String indexingTarget = crawlingConfig.getIndexingTarget(url); url = pathMappingHelper.replaceUrl(sessionId, url); + final String mimeType = responseData.getMimeType(); final Map fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD); @@ -223,26 +227,32 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer { putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires); } // lang - final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, langXpath, true)); + final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlLangXpath(), true)); if (lang != null) { putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang); } // title // content putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), getDocumentContent(responseData, document)); - if (Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) - || fessConfig.isCrawlerDocumentCacheEnable()) { - String charSet = responseData.getCharSet(); - if (charSet == null) { - charSet = Constants.UTF_8; - } - try { - // cache - putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), - new String(InputStreamUtil.getBytes(responseData.getResponseBody()), charSet)); - putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE); - } catch (final Exception e) { - logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e); + if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig + .isCrawlerDocumentCacheEnable()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) { + if (responseData.getContentLength() > 0 + && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) { + String charSet = responseData.getCharSet(); + if (charSet == null) { + charSet = Constants.UTF_8; + } + try { + // cache + putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), + new String(InputStreamUtil.getBytes(responseData.getResponseBody()), charSet)); + putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE); + } catch (final Exception e) { + logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e); + } + } else { + logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(), + fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl()); } } // digest @@ -261,7 +271,6 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer { // anchor putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData)); // mimetype - final String mimeType = responseData.getMimeType(); putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType); if (fileTypeHelper != null) { // filetype @@ -324,7 +333,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer { } protected String getCanonicalUrl(final ResponseData responseData, final Document document) { - final String canonicalUrl = getSingleNodeValue(document, canonicalXpath, false); + final String canonicalUrl = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlCannonicalXpath(), false); if (StringUtil.isNotBlank(canonicalUrl)) { return canonicalUrl; } @@ -332,13 +341,15 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer { } protected String getDocumentDigest(final ResponseData responseData, final Document document) { - final String digest = getSingleNodeValue(document, digestXpath, false); + final String digest = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlDigestXpath(), false); if (StringUtil.isNotBlank(digest)) { return digest; } - final String body = normalizeContent(removeCommentTag(getSingleNodeValue(document, contentXpath, prunedCacheContent))); - return StringUtils.abbreviate(body, maxDigestLength); + final String body = + normalizeContent(removeCommentTag(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(), + prunedCacheContent))); + return StringUtils.abbreviate(body, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()); } String removeCommentTag(final String content) { @@ -364,7 +375,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer { } private String getDocumentContent(final ResponseData responseData, final Document document) { - return normalizeContent(getSingleNodeValue(document, contentXpath, true)); + return normalizeContent(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(), true)); } protected String getSingleNodeValue(final Document document, final String xpath, final boolean pruned) { @@ -420,7 +431,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer { } protected boolean isPrunedTag(final String tagName) { - for (final String name : prunedTagList) { + for (final String name : getCrawlerDocumentHtmlPrunedTags()) { if (name.equalsIgnoreCase(tagName)) { return true; } @@ -492,12 +503,6 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer { return urlList; } - public void addPrunedTag(final String tagName) { - if (StringUtil.isNotBlank(tagName)) { - prunedTagList.add(tagName); - } - } - @Override public Object getData(final AccessResultData accessResultData) { final byte[] data = accessResultData.getData(); @@ -554,4 +559,9 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer { private boolean isUtf8BomBytes(final byte[] b) { return b[0] == (byte) 0xEF && b[1] == (byte) 0xBB && b[2] == (byte) 0xBF; } + + protected String[] getCrawlerDocumentHtmlPrunedTags() { + return fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray(); + } + } diff --git a/src/main/java/org/codelibs/fess/helper/ViewHelper.java b/src/main/java/org/codelibs/fess/helper/ViewHelper.java index d682f98dc..022312bc1 100644 --- a/src/main/java/org/codelibs/fess/helper/ViewHelper.java +++ b/src/main/java/org/codelibs/fess/helper/ViewHelper.java @@ -38,6 +38,7 @@ import java.util.regex.Pattern; import javax.annotation.PostConstruct; import javax.annotation.Resource; +import org.apache.commons.lang3.StringEscapeUtils; import org.apache.commons.lang3.StringUtils; import org.codelibs.core.CoreLibConstants; import org.codelibs.core.lang.StringUtil; @@ -398,7 +399,7 @@ public class ViewHelper implements Serializable { if (locale == null) { locale = Locale.ENGLISH; } - String url = DocumentUtil.getValue(doc, "url", String.class); + String url = DocumentUtil.getValue(doc, fessConfig.getIndexFieldUrl(), String.class); if (url == null) { url = ComponentUtil.getMessageManager().getMessage(locale, "labels.search_unknown"); } @@ -417,6 +418,10 @@ public class ViewHelper implements Serializable { String cache = DocumentUtil.getValue(doc, fessConfig.getIndexFieldCache(), String.class); if (cache != null) { + String mimetype = DocumentUtil.getValue(doc, fessConfig.getIndexFieldMimetype(), String.class); + if (!ComponentUtil.getFessConfig().isHtmlMimetypeForCache(mimetype)) { + cache = StringEscapeUtils.escapeHtml4(cache); + } cache = pathMappingHelper.replaceUrls(cache); if (queries != null && queries.length > 0) { doc.put("hlCache", replaceHighlightQueries(cache, queries)); diff --git a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java index 80b072469..118553f52 100644 --- a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java +++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java @@ -20,7 +20,7 @@ import org.lastaflute.core.direction.exception.ConfigPropertyNotFoundException; /** * @author FreeGen */ -public interface FessConfig extends FessEnv, FessProp { +public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction.FessProp { /** The key of the configuration. e.g. Fess */ String DOMAIN_TITLE = "domain.title"; @@ -66,9 +66,75 @@ public interface FessConfig extends FessEnv, FessProp { -XX:+DisableExplicitGC */ String JVM_SUGGEST_OPTIONS = "jvm.suggest.options"; + /** The key of the configuration. e.g. 50 */ + String CRAWLER_DOCUMENT_MAX_SITE_LENGTH = "crawler.document.max.site.length"; + + /** The key of the configuration. e.g. UTF-8 */ + String CRAWLER_DOCUMENT_SITE_ENCODING = "crawler.document.site.encoding"; + + /** The key of the configuration. e.g. unknown */ + String CRAWLER_DOCUMENT_UNKNOWN_HOSTNAME = "crawler.document.unknown.hostname"; + /** The key of the configuration. e.g. false */ + String CRAWLER_DOCUMENT_USE_SITE_ENCODING_ON_ENGLISH = "crawler.document.use.site.encoding.on.english"; + + /** The key of the configuration. e.g. true */ + String CRAWLER_DOCUMENT_APPEND_DATA = "crawler.document.append.data"; + + /** The key of the configuration. e.g. //BODY */ + String CRAWLER_DOCUMENT_HTML_CONTENT_XPATH = "crawler.document.html.content.xpath"; + + /** The key of the configuration. e.g. //HTML/@lang */ + String CRAWLER_DOCUMENT_HTML_LANG_XPATH = "crawler.document.html.lang.xpath"; + + /** The key of the configuration. e.g. //META[@name='description']/@content */ + String CRAWLER_DOCUMENT_HTML_DIGEST_XPATH = "crawler.document.html.digest.xpath"; + + /** The key of the configuration. e.g. //LINK[@rel='canonical']/@href */ + String CRAWLER_DOCUMENT_HTML_CANNONICAL_XPATH = "crawler.document.html.cannonical.xpath"; + + /** The key of the configuration. e.g. noscript,script */ + String CRAWLER_DOCUMENT_HTML_PRUNED_TAGS = "crawler.document.html.pruned.tags"; + + /** The key of the configuration. e.g. 200 */ + String CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH = "crawler.document.html.max.digest.length"; + + /** The key of the configuration. e.g. */ + String CRAWLER_DOCUMENT_FILE_NAME_ENCODING = "crawler.document.file.name.encoding"; + + /** The key of the configuration. e.g. No title. */ + String CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL = "crawler.document.file.no.title.label"; + + /** The key of the configuration. e.g. 10 */ + String CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH = "crawler.document.file.abbreviation.margin.length"; + + /** The key of the configuration. e.g. false */ + String CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT = "crawler.document.file.ignore.empty.content"; + + /** The key of the configuration. e.g. 100 */ + String CRAWLER_DOCUMENT_FILE_MAX_TITLE_LENGTH = "crawler.document.file.max.title.length"; + + /** The key of the configuration. e.g. 200 */ + String CRAWLER_DOCUMENT_FILE_MAX_DIGEST_LENGTH = "crawler.document.file.max.digest.length"; + + /** The key of the configuration. e.g. true */ + String CRAWLER_DOCUMENT_FILE_APPEND_META_CONTENT = "crawler.document.file.append.meta.content"; + + /** The key of the configuration. e.g. true */ + String CRAWLER_DOCUMENT_FILE_APPEND_BODY_CONTENT = "crawler.document.file.append.body.content"; + + /** The key of the configuration. e.g. true */ String CRAWLER_DOCUMENT_CACHE_ENABLE = "crawler.document.cache.enable"; + /** The key of the configuration. e.g. 2621440 */ + String CRAWLER_DOCUMENT_CACHE_MAX_SIZE = "crawler.document.cache.max.size"; + + /** The key of the configuration. e.g. text/html */ + String CRAWLER_DOCUMENT_CACHE_SUPPORTED_MIMETYPES = "crawler.document.cache.supported.mimetypes"; + + /** The key of the configuration. e.g. text/html */ + String CRAWLER_DOCUMENT_CACHE_HTML_MIMETYPES = "crawler.document.cache.html.mimetypes"; + /** The key of the configuration. e.g. favorite_count */ String INDEX_FIELD_favorite_count = "index.field.favorite_count"; @@ -475,19 +541,272 @@ public interface FessConfig extends FessEnv, FessProp { String getJvmSuggestOptions(); /** - * Get the value for the key 'crawler.document.cache.enable'.
+ * Get the value for the key 'crawler.document.max.site.length'.
+ * The value is, e.g. 50
+ * comment: common + * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentMaxSiteLength(); + + /** + * Get the value for the key 'crawler.document.max.site.length' as {@link Integer}.
+ * The value is, e.g. 50
+ * comment: common + * @return The value of found property. (NotNull: if not found, exception but basically no way) + * @throws NumberFormatException When the property is not integer. + */ + Integer getCrawlerDocumentMaxSiteLengthAsInteger(); + + /** + * Get the value for the key 'crawler.document.site.encoding'.
+ * The value is, e.g. UTF-8
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentSiteEncoding(); + + /** + * Get the value for the key 'crawler.document.unknown.hostname'.
+ * The value is, e.g. unknown
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentUnknownHostname(); + + /** + * Get the value for the key 'crawler.document.use.site.encoding.on.english'.
* The value is, e.g. false
* @return The value of found property. (NotNull: if not found, exception but basically no way) */ + String getCrawlerDocumentUseSiteEncodingOnEnglish(); + + /** + * Is the property for the key 'crawler.document.use.site.encoding.on.english' true?
+ * The value is, e.g. false
+ * @return The determination, true or false. (if not found, exception but basically no way) + */ + boolean isCrawlerDocumentUseSiteEncodingOnEnglish(); + + /** + * Get the value for the key 'crawler.document.append.data'.
+ * The value is, e.g. true
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentAppendData(); + + /** + * Is the property for the key 'crawler.document.append.data' true?
+ * The value is, e.g. true
+ * @return The determination, true or false. (if not found, exception but basically no way) + */ + boolean isCrawlerDocumentAppendData(); + + /** + * Get the value for the key 'crawler.document.html.content.xpath'.
+ * The value is, e.g. //BODY
+ * comment: html + * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentHtmlContentXpath(); + + /** + * Get the value for the key 'crawler.document.html.lang.xpath'.
+ * The value is, e.g. //HTML/@lang
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentHtmlLangXpath(); + + /** + * Get the value for the key 'crawler.document.html.digest.xpath'.
+ * The value is, e.g. //META[@name='description']/@content
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentHtmlDigestXpath(); + + /** + * Get the value for the key 'crawler.document.html.cannonical.xpath'.
+ * The value is, e.g. //LINK[@rel='canonical']/@href
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentHtmlCannonicalXpath(); + + /** + * Get the value for the key 'crawler.document.html.pruned.tags'.
+ * The value is, e.g. noscript,script
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentHtmlPrunedTags(); + + /** + * Get the value for the key 'crawler.document.html.max.digest.length'.
+ * The value is, e.g. 200
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentHtmlMaxDigestLength(); + + /** + * Get the value for the key 'crawler.document.html.max.digest.length' as {@link Integer}.
+ * The value is, e.g. 200
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + * @throws NumberFormatException When the property is not integer. + */ + Integer getCrawlerDocumentHtmlMaxDigestLengthAsInteger(); + + /** + * Get the value for the key 'crawler.document.file.name.encoding'.
+ * The value is, e.g.
+ * comment: file + * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentFileNameEncoding(); + + /** + * Get the value for the key 'crawler.document.file.name.encoding' as {@link Integer}.
+ * The value is, e.g.
+ * comment: file + * @return The value of found property. (NotNull: if not found, exception but basically no way) + * @throws NumberFormatException When the property is not integer. + */ + Integer getCrawlerDocumentFileNameEncodingAsInteger(); + + /** + * Get the value for the key 'crawler.document.file.no.title.label'.
+ * The value is, e.g. No title.
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentFileNoTitleLabel(); + + /** + * Get the value for the key 'crawler.document.file.abbreviation.margin.length'.
+ * The value is, e.g. 10
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentFileAbbreviationMarginLength(); + + /** + * Get the value for the key 'crawler.document.file.abbreviation.margin.length' as {@link Integer}.
+ * The value is, e.g. 10
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + * @throws NumberFormatException When the property is not integer. + */ + Integer getCrawlerDocumentFileAbbreviationMarginLengthAsInteger(); + + /** + * Get the value for the key 'crawler.document.file.ignore.empty.content'.
+ * The value is, e.g. false
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentFileIgnoreEmptyContent(); + + /** + * Is the property for the key 'crawler.document.file.ignore.empty.content' true?
+ * The value is, e.g. false
+ * @return The determination, true or false. (if not found, exception but basically no way) + */ + boolean isCrawlerDocumentFileIgnoreEmptyContent(); + + /** + * Get the value for the key 'crawler.document.file.max.title.length'.
+ * The value is, e.g. 100
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentFileMaxTitleLength(); + + /** + * Get the value for the key 'crawler.document.file.max.title.length' as {@link Integer}.
+ * The value is, e.g. 100
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + * @throws NumberFormatException When the property is not integer. + */ + Integer getCrawlerDocumentFileMaxTitleLengthAsInteger(); + + /** + * Get the value for the key 'crawler.document.file.max.digest.length'.
+ * The value is, e.g. 200
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentFileMaxDigestLength(); + + /** + * Get the value for the key 'crawler.document.file.max.digest.length' as {@link Integer}.
+ * The value is, e.g. 200
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + * @throws NumberFormatException When the property is not integer. + */ + Integer getCrawlerDocumentFileMaxDigestLengthAsInteger(); + + /** + * Get the value for the key 'crawler.document.file.append.meta.content'.
+ * The value is, e.g. true
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentFileAppendMetaContent(); + + /** + * Is the property for the key 'crawler.document.file.append.meta.content' true?
+ * The value is, e.g. true
+ * @return The determination, true or false. (if not found, exception but basically no way) + */ + boolean isCrawlerDocumentFileAppendMetaContent(); + + /** + * Get the value for the key 'crawler.document.file.append.body.content'.
+ * The value is, e.g. true
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentFileAppendBodyContent(); + + /** + * Is the property for the key 'crawler.document.file.append.body.content' true?
+ * The value is, e.g. true
+ * @return The determination, true or false. (if not found, exception but basically no way) + */ + boolean isCrawlerDocumentFileAppendBodyContent(); + + /** + * Get the value for the key 'crawler.document.cache.enable'.
+ * The value is, e.g. true
+ * comment: cache + * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ String getCrawlerDocumentCacheEnable(); /** * Is the property for the key 'crawler.document.cache.enable' true?
- * The value is, e.g. false
+ * The value is, e.g. true
+ * comment: cache * @return The determination, true or false. (if not found, exception but basically no way) */ boolean isCrawlerDocumentCacheEnable(); + /** + * Get the value for the key 'crawler.document.cache.max.size'.
+ * The value is, e.g. 2621440
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentCacheMaxSize(); + + /** + * Get the value for the key 'crawler.document.cache.max.size' as {@link Integer}.
+ * The value is, e.g. 2621440
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + * @throws NumberFormatException When the property is not integer. + */ + Integer getCrawlerDocumentCacheMaxSizeAsInteger(); + + /** + * Get the value for the key 'crawler.document.cache.supported.mimetypes'.
+ * The value is, e.g. text/html
+ * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentCacheSupportedMimetypes(); + + /** + * Get the value for the key 'crawler.document.cache.html.mimetypes'.
+ * The value is, e.g. text/html
+ * comment: ,text/plain,application/xml,application/pdf,application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.ms-powerpoint,application/vnd.openxmlformats-officedocument.presentationml.presentation + * @return The value of found property. (NotNull: if not found, exception but basically no way) + */ + String getCrawlerDocumentCacheHtmlMimetypes(); + /** * Get the value for the key 'index.field.favorite_count'.
* The value is, e.g. favorite_count
@@ -1515,6 +1834,126 @@ public interface FessConfig extends FessEnv, FessProp { return get(FessConfig.JVM_SUGGEST_OPTIONS); } + public String getCrawlerDocumentMaxSiteLength() { + return get(FessConfig.CRAWLER_DOCUMENT_MAX_SITE_LENGTH); + } + + public Integer getCrawlerDocumentMaxSiteLengthAsInteger() { + return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_SITE_LENGTH); + } + + public String getCrawlerDocumentSiteEncoding() { + return get(FessConfig.CRAWLER_DOCUMENT_SITE_ENCODING); + } + + public String getCrawlerDocumentUnknownHostname() { + return get(FessConfig.CRAWLER_DOCUMENT_UNKNOWN_HOSTNAME); + } + + public String getCrawlerDocumentUseSiteEncodingOnEnglish() { + return get(FessConfig.CRAWLER_DOCUMENT_USE_SITE_ENCODING_ON_ENGLISH); + } + + public boolean isCrawlerDocumentUseSiteEncodingOnEnglish() { + return is(FessConfig.CRAWLER_DOCUMENT_USE_SITE_ENCODING_ON_ENGLISH); + } + + public String getCrawlerDocumentAppendData() { + return get(FessConfig.CRAWLER_DOCUMENT_APPEND_DATA); + } + + public boolean isCrawlerDocumentAppendData() { + return is(FessConfig.CRAWLER_DOCUMENT_APPEND_DATA); + } + + public String getCrawlerDocumentHtmlContentXpath() { + return get(FessConfig.CRAWLER_DOCUMENT_HTML_CONTENT_XPATH); + } + + public String getCrawlerDocumentHtmlLangXpath() { + return get(FessConfig.CRAWLER_DOCUMENT_HTML_LANG_XPATH); + } + + public String getCrawlerDocumentHtmlDigestXpath() { + return get(FessConfig.CRAWLER_DOCUMENT_HTML_DIGEST_XPATH); + } + + public String getCrawlerDocumentHtmlCannonicalXpath() { + return get(FessConfig.CRAWLER_DOCUMENT_HTML_CANNONICAL_XPATH); + } + + public String getCrawlerDocumentHtmlPrunedTags() { + return get(FessConfig.CRAWLER_DOCUMENT_HTML_PRUNED_TAGS); + } + + public String getCrawlerDocumentHtmlMaxDigestLength() { + return get(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH); + } + + public Integer getCrawlerDocumentHtmlMaxDigestLengthAsInteger() { + return getAsInteger(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH); + } + + public String getCrawlerDocumentFileNameEncoding() { + return get(FessConfig.CRAWLER_DOCUMENT_FILE_NAME_ENCODING); + } + + public Integer getCrawlerDocumentFileNameEncodingAsInteger() { + return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_NAME_ENCODING); + } + + public String getCrawlerDocumentFileNoTitleLabel() { + return get(FessConfig.CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL); + } + + public String getCrawlerDocumentFileAbbreviationMarginLength() { + return get(FessConfig.CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH); + } + + public Integer getCrawlerDocumentFileAbbreviationMarginLengthAsInteger() { + return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH); + } + + public String getCrawlerDocumentFileIgnoreEmptyContent() { + return get(FessConfig.CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT); + } + + public boolean isCrawlerDocumentFileIgnoreEmptyContent() { + return is(FessConfig.CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT); + } + + public String getCrawlerDocumentFileMaxTitleLength() { + return get(FessConfig.CRAWLER_DOCUMENT_FILE_MAX_TITLE_LENGTH); + } + + public Integer getCrawlerDocumentFileMaxTitleLengthAsInteger() { + return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_MAX_TITLE_LENGTH); + } + + public String getCrawlerDocumentFileMaxDigestLength() { + return get(FessConfig.CRAWLER_DOCUMENT_FILE_MAX_DIGEST_LENGTH); + } + + public Integer getCrawlerDocumentFileMaxDigestLengthAsInteger() { + return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_MAX_DIGEST_LENGTH); + } + + public String getCrawlerDocumentFileAppendMetaContent() { + return get(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_META_CONTENT); + } + + public boolean isCrawlerDocumentFileAppendMetaContent() { + return is(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_META_CONTENT); + } + + public String getCrawlerDocumentFileAppendBodyContent() { + return get(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_BODY_CONTENT); + } + + public boolean isCrawlerDocumentFileAppendBodyContent() { + return is(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_BODY_CONTENT); + } + public String getCrawlerDocumentCacheEnable() { return get(FessConfig.CRAWLER_DOCUMENT_CACHE_ENABLE); } @@ -1523,6 +1962,22 @@ public interface FessConfig extends FessEnv, FessProp { return is(FessConfig.CRAWLER_DOCUMENT_CACHE_ENABLE); } + public String getCrawlerDocumentCacheMaxSize() { + return get(FessConfig.CRAWLER_DOCUMENT_CACHE_MAX_SIZE); + } + + public Integer getCrawlerDocumentCacheMaxSizeAsInteger() { + return getAsInteger(FessConfig.CRAWLER_DOCUMENT_CACHE_MAX_SIZE); + } + + public String getCrawlerDocumentCacheSupportedMimetypes() { + return get(FessConfig.CRAWLER_DOCUMENT_CACHE_SUPPORTED_MIMETYPES); + } + + public String getCrawlerDocumentCacheHtmlMimetypes() { + return get(FessConfig.CRAWLER_DOCUMENT_CACHE_HTML_MIMETYPES); + } + public String getIndexFieldFavoriteCount() { return get(FessConfig.INDEX_FIELD_favorite_count); } diff --git a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java index 4372875e7..27d098d58 100644 --- a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java +++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java @@ -18,6 +18,7 @@ package org.codelibs.fess.mylasta.direction; import org.codelibs.core.lang.StringUtil; import org.codelibs.fess.Constants; import org.codelibs.fess.util.ComponentUtil; +import org.codelibs.fess.util.StreamUtil; public interface FessProp { public default String getProperty(String key) { @@ -79,4 +80,30 @@ public interface FessProp { return getJvmSuggestOptions().split("\n"); } + String getCrawlerDocumentHtmlPrunedTags(); + + public default String[] getCrawlerDocumentHtmlPrunedTagsAsArray() { + return getCrawlerDocumentHtmlPrunedTags().split(","); + } + + String getCrawlerDocumentCacheHtmlMimetypes(); + + public default boolean isHtmlMimetypeForCache(String mimetype) { + String[] mimetypes = getCrawlerDocumentCacheHtmlMimetypes().split(","); + if (mimetypes.length == 1 && StringUtil.isBlank(mimetypes[0])) { + return true; + } + return StreamUtil.of(mimetypes).anyMatch(s -> s.equalsIgnoreCase(mimetype)); + } + + String getCrawlerDocumentCacheSupportedMimetypes(); + + public default boolean isSupportedDocumentCacheMimetypes(String mimetype) { + String[] mimetypes = getCrawlerDocumentCacheSupportedMimetypes().split(","); + if (mimetypes.length == 1 && StringUtil.isBlank(mimetypes[0])) { + return true; + } + return StreamUtil.of(mimetypes).anyMatch(s -> s.equalsIgnoreCase(mimetype)); + } + } diff --git a/src/main/resources/crawler/transformer.xml b/src/main/resources/crawler/transformer.xml index 67d274c26..a4c39e79c 100644 --- a/src/main/resources/crawler/transformer.xml +++ b/src/main/resources/crawler/transformer.xml @@ -4,7 +4,6 @@ - "fessXpathTransformer" defaultFeatureMap @@ -16,31 +15,15 @@ {"feed:" : "http:"} - - true - "UTF-8" "title" "//TITLE" - - "noscript" - - - "script" - "fessFileTransformer" - true - "UTF-8" "title" "title" @@ -60,8 +43,6 @@ "fessTikaTransformer" - true - "UTF-8" "title" "title" diff --git a/src/main/resources/fess_config.properties b/src/main/resources/fess_config.properties index e59e22d51..0c7e9f271 100644 --- a/src/main/resources/fess_config.properties +++ b/src/main/resources/fess_config.properties @@ -50,7 +50,37 @@ jvm.suggest.options=\ # Index # ==== -crawler.document.cache.enable=false +# common +crawler.document.max.site.length=50 +crawler.document.site.encoding=UTF-8 +crawler.document.unknown.hostname=unknown +crawler.document.use.site.encoding.on.english=false +crawler.document.append.data=true + +# html +crawler.document.html.content.xpath=//BODY +crawler.document.html.lang.xpath=//HTML/@lang +crawler.document.html.digest.xpath=//META[@name='description']/@content +crawler.document.html.cannonical.xpath=//LINK[@rel='canonical']/@href +crawler.document.html.pruned.tags=noscript,script +crawler.document.html.max.digest.length=200 + +# file +crawler.document.file.name.encoding= +crawler.document.file.no.title.label=No title. +crawler.document.file.abbreviation.margin.length=10 +crawler.document.file.ignore.empty.content=false +crawler.document.file.max.title.length=100 +crawler.document.file.max.digest.length=200 +crawler.document.file.append.meta.content=true +crawler.document.file.append.body.content=true + +# cache +crawler.document.cache.enable=true +crawler.document.cache.max.size=2621440 +crawler.document.cache.supported.mimetypes=text/html +#,text/plain,application/xml,application/pdf,application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.ms-powerpoint,application/vnd.openxmlformats-officedocument.presentationml.presentation +crawler.document.cache.html.mimetypes=text/html # field names index.field.favorite_count=favorite_count diff --git a/src/main/webapp/WEB-INF/orig/view/mobile/index.jsp b/src/main/webapp/WEB-INF/orig/view/mobile/index.jsp deleted file mode 100644 index 45fd9b8db..000000000 --- a/src/main/webapp/WEB-INF/orig/view/mobile/index.jsp +++ /dev/null @@ -1,9 +0,0 @@ -<%@page pageEncoding="UTF-8" %> - - - - - - - - diff --git a/src/main/webapp/WEB-INF/orig/view/mobile/indexHtmlHead.jsp b/src/main/webapp/WEB-INF/orig/view/mobile/indexHtmlHead.jsp deleted file mode 100644 index 5592f52f2..000000000 --- a/src/main/webapp/WEB-INF/orig/view/mobile/indexHtmlHead.jsp +++ /dev/null @@ -1,4 +0,0 @@ -<%@page pageEncoding="UTF-8" %> - - -<bean:message key="labels.mobile_search_title"/> diff --git a/src/main/webapp/WEB-INF/orig/view/mobile/indexMain.jsp b/src/main/webapp/WEB-INF/orig/view/mobile/indexMain.jsp deleted file mode 100644 index 97fbeb4c3..000000000 --- a/src/main/webapp/WEB-INF/orig/view/mobile/indexMain.jsp +++ /dev/null @@ -1,14 +0,0 @@ -<%@page pageEncoding="UTF-8" %> -
-
- -
- -
- -
- " name="search" /> -
-
-
-
diff --git a/src/main/webapp/WEB-INF/orig/view/mobile/search.jsp b/src/main/webapp/WEB-INF/orig/view/mobile/search.jsp deleted file mode 100644 index 2fdf50be1..000000000 --- a/src/main/webapp/WEB-INF/orig/view/mobile/search.jsp +++ /dev/null @@ -1,18 +0,0 @@ -<%@page pageEncoding="UTF-8" %> - - - - - - - - - - - - - - - - - diff --git a/src/main/webapp/WEB-INF/orig/view/mobile/searchFooter.jsp b/src/main/webapp/WEB-INF/orig/view/mobile/searchFooter.jsp deleted file mode 100644 index aff3591db..000000000 --- a/src/main/webapp/WEB-INF/orig/view/mobile/searchFooter.jsp +++ /dev/null @@ -1,5 +0,0 @@ -<%@page pageEncoding="UTF-8" %> -
-
- -
diff --git a/src/main/webapp/WEB-INF/orig/view/mobile/searchHeader.jsp b/src/main/webapp/WEB-INF/orig/view/mobile/searchHeader.jsp deleted file mode 100644 index 835156f96..000000000 --- a/src/main/webapp/WEB-INF/orig/view/mobile/searchHeader.jsp +++ /dev/null @@ -1,13 +0,0 @@ -<%@page pageEncoding="UTF-8" %> - diff --git a/src/main/webapp/WEB-INF/orig/view/mobile/searchHtmlHead.jsp b/src/main/webapp/WEB-INF/orig/view/mobile/searchHtmlHead.jsp deleted file mode 100644 index 49d98f3a2..000000000 --- a/src/main/webapp/WEB-INF/orig/view/mobile/searchHtmlHead.jsp +++ /dev/null @@ -1,4 +0,0 @@ -<%@page pageEncoding="UTF-8" %> - - -${f:h(query)} - <bean:message key="labels.search_title"/> diff --git a/src/main/webapp/WEB-INF/orig/view/mobile/searchNoResult.jsp b/src/main/webapp/WEB-INF/orig/view/mobile/searchNoResult.jsp deleted file mode 100644 index 45b163e07..000000000 --- a/src/main/webapp/WEB-INF/orig/view/mobile/searchNoResult.jsp +++ /dev/null @@ -1,4 +0,0 @@ -<%@page pageEncoding="UTF-8" %> -
- -
diff --git a/src/main/webapp/WEB-INF/orig/view/mobile/searchResults.jsp b/src/main/webapp/WEB-INF/orig/view/mobile/searchResults.jsp deleted file mode 100644 index c5070748c..000000000 --- a/src/main/webapp/WEB-INF/orig/view/mobile/searchResults.jsp +++ /dev/null @@ -1,55 +0,0 @@ -<%@page pageEncoding="UTF-8" %> -
-
- -
- ${f:h(doc.contentTitle)} - -
- - ${doc.contentDescription} - -
- -
- ${f:h(doc.site)} -
-
-
-
-
-
-
- -
-

- - - - - - - - - - - ${pageNumber} - - - - - - ${f:h(pageNumber)} - - - - - - - - - - - -

-
diff --git a/src/main/webapp/WEB-INF/view/searchResults.jsp b/src/main/webapp/WEB-INF/view/searchResults.jsp index 7cf371771..0b7caf15c 100644 --- a/src/main/webapp/WEB-INF/view/searchResults.jsp +++ b/src/main/webapp/WEB-INF/view/searchResults.jsp @@ -36,8 +36,10 @@
${f:h(doc.sitePath)} + +
diff --git a/src/test/java/org/codelibs/fess/crawler/transformer/FessFileTransformerTest.java b/src/test/java/org/codelibs/fess/crawler/transformer/FessFileTransformerTest.java index 9c1ab33d4..78a59f242 100644 --- a/src/test/java/org/codelibs/fess/crawler/transformer/FessFileTransformerTest.java +++ b/src/test/java/org/codelibs/fess/crawler/transformer/FessFileTransformerTest.java @@ -34,7 +34,7 @@ public class FessFileTransformerTest extends UnitFessTestCase { public void test_decodeUrl_ok() throws Exception { String url, exp; - final FessFileTransformer transformer = new FessFileTransformer(); + final FessFileTransformer transformer = createInstance(); url = ""; exp = ""; @@ -62,156 +62,171 @@ public class FessFileTransformerTest extends UnitFessTestCase { } public void test_decodeUrl_null() throws Exception { - final FessFileTransformer transformer = new FessFileTransformer(); + final FessFileTransformer transformer = createInstance(); assertNull(transformer.decodeUrlAsName(null, true)); } public void test_getHost_ok() { String url, exp; - final FessFileTransformer transformer = new FessFileTransformer(); + final FessFileTransformer transformer = createInstance(); url = ""; exp = ""; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "http://server/home/user"; exp = "server"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "file:/home/user"; exp = "localhost"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "file:/c:/home/user"; exp = "localhost"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "file:////server/home/user"; exp = "server"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "file:/" + encodeUrl("ホーム") + "/user"; exp = "localhost"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "file:/c:/" + encodeUrl("ホーム") + "/user"; exp = "localhost"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "file:////" + encodeUrl("サーバー") + "/home/user"; exp = "サーバー"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); } public void test_getHost_unexpected() { String url, exp; - final FessFileTransformer transformer = new FessFileTransformer(); + final FessFileTransformer transformer = createInstance(); url = null; exp = ""; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "example:"; exp = "unknown"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "file:"; exp = "localhost"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "file://"; exp = "localhost"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "file:///"; exp = "localhost"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "file://///"; exp = "localhost"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "file://///example"; exp = "localhost"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); url = "file:/c:"; exp = "localhost"; - assertEquals(exp, transformer.getHost(url)); + assertEquals(exp, transformer.getHostOnFile(url)); } public void test_getSite_ok() { String url, exp; - final FessFileTransformer transformer = new FessFileTransformer(); + final FessFileTransformer transformer = createInstance(); url = ""; exp = ""; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); url = "http://example.com/"; exp = "example.com/"; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); url = "http://example.com/index.html"; exp = "example.com/index.html"; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); url = "file:/home/user"; exp = "/home/user"; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); url = "file:/c:/home/user"; exp = "c:\\home\\user"; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); url = "file:/c:/"; exp = "c:\\"; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); url = "file:////server/user"; exp = "\\\\server\\user"; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); + } - transformer.maxSiteLength = 10; + public void test_getSite_ok_len10() { + String url, exp; + final FessFileTransformer transformer = new FessFileTransformer() { + @Override + public int getMaxSiteLength() { + return 10; + } + }; + transformer.init(); url = "file:/home/user/foo"; exp = "/home/u..."; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); } public void test_getSite_unexpected() { String url, exp; - final FessFileTransformer transformer = new FessFileTransformer(); + final FessFileTransformer transformer = createInstance(); url = "file:"; exp = ""; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); url = "file"; exp = "file"; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); url = "file:/"; exp = "/"; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); url = "file:/c:"; exp = "c:"; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); url = "file://"; exp = "//"; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); url = "file:///"; exp = "///"; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); url = "file://///"; exp = "\\\\\\"; - assertEquals(exp, transformer.getSite(url, "UTF-8")); + assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8")); + } + + private FessFileTransformer createInstance() { + final FessFileTransformer transformer = new FessFileTransformer(); + transformer.init(); + return transformer; } } diff --git a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java index 24b8cecd2..05a5da027 100644 --- a/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java +++ b/src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java @@ -46,6 +46,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase { public void setUp() throws Exception { super.setUp(); fessXpathTransformer = new FessXpathTransformer(); + fessXpathTransformer.init(); fessXpathTransformer.convertUrlMap.put("feed:", "http:"); } @@ -53,7 +54,11 @@ public class FessXpathTransformerTest extends UnitFessTestCase { final String data = "
"; final Document document = getDocument(data); - final FessXpathTransformer transformer = new FessXpathTransformer(); + final FessXpathTransformer transformer = new FessXpathTransformer() { + protected String[] getCrawlerDocumentHtmlPrunedTags() { + return new String[0]; + } + }; final Node pruneNode = transformer.pruneNode(document.cloneNode(true)); assertEquals(getXmlString(document), getXmlString(pruneNode)); @@ -63,8 +68,11 @@ public class FessXpathTransformerTest extends UnitFessTestCase { final String data = "
"; final Document document = getDocument(data); - final FessXpathTransformer transformer = new FessXpathTransformer(); - transformer.prunedTagList.add("noscript"); + final FessXpathTransformer transformer = new FessXpathTransformer() { + protected String[] getCrawlerDocumentHtmlPrunedTags() { + return new String[] { "noscript" }; + } + }; final Node pruneNode = transformer.pruneNode(document.cloneNode(true)); final String docString = getXmlString(document); @@ -83,9 +91,11 @@ public class FessXpathTransformerTest extends UnitFessTestCase { final String data = "
"; final Document document = getDocument(data); - final FessXpathTransformer transformer = new FessXpathTransformer(); - transformer.prunedTagList.add("script"); - transformer.prunedTagList.add("noscript"); + final FessXpathTransformer transformer = new FessXpathTransformer() { + protected String[] getCrawlerDocumentHtmlPrunedTags() { + return new String[] { "script", "noscript" }; + } + }; final Node pruneNode = transformer.pruneNode(document.cloneNode(true)); final String docString = getXmlString(document); @@ -235,6 +245,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase { public void test_canonicalXpath() throws Exception { final FessXpathTransformer transformer = new FessXpathTransformer(); + transformer.init(); final Map dataMap = new HashMap(); final ResponseData responseData = new ResponseData();