Browse Source

Improve cache handling, remove unused JSPs, update LastaFlute

Shinsuke Sugaya 9 years ago
parent
commit
3c1bb626fd
23 changed files with 762 additions and 327 deletions
  1. 1 0
      dbflute_fess/dfprop/lastafluteMap.dfprop
  2. 40 50
      src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java
  3. 18 0
      src/main/java/org/codelibs/fess/crawler/transformer/FessFileTransformer.java
  4. 23 0
      src/main/java/org/codelibs/fess/crawler/transformer/FessTikaTransformer.java
  5. 33 40
      src/main/java/org/codelibs/fess/crawler/transformer/FessTransformer.java
  6. 53 43
      src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
  7. 6 1
      src/main/java/org/codelibs/fess/helper/ViewHelper.java
  8. 458 3
      src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java
  9. 27 0
      src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
  10. 0 19
      src/main/resources/crawler/transformer.xml
  11. 31 1
      src/main/resources/fess_config.properties
  12. 0 9
      src/main/webapp/WEB-INF/orig/view/mobile/index.jsp
  13. 0 4
      src/main/webapp/WEB-INF/orig/view/mobile/indexHtmlHead.jsp
  14. 0 14
      src/main/webapp/WEB-INF/orig/view/mobile/indexMain.jsp
  15. 0 18
      src/main/webapp/WEB-INF/orig/view/mobile/search.jsp
  16. 0 5
      src/main/webapp/WEB-INF/orig/view/mobile/searchFooter.jsp
  17. 0 13
      src/main/webapp/WEB-INF/orig/view/mobile/searchHeader.jsp
  18. 0 4
      src/main/webapp/WEB-INF/orig/view/mobile/searchHtmlHead.jsp
  19. 0 4
      src/main/webapp/WEB-INF/orig/view/mobile/searchNoResult.jsp
  20. 0 55
      src/main/webapp/WEB-INF/orig/view/mobile/searchResults.jsp
  21. 2 0
      src/main/webapp/WEB-INF/view/searchResults.jsp
  22. 53 38
      src/test/java/org/codelibs/fess/crawler/transformer/FessFileTransformerTest.java
  23. 17 6
      src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java

+ 1 - 0
dbflute_fess/dfprop/lastafluteMap.dfprop

@@ -41,6 +41,7 @@ map:{
         ; fess = map:{
             ; path = ..
             ; freeGenList = list:{ env ; config ; label ; message ; mail ; template ; jsp ; doc }
+            ; configPluginInterface = org.codelibs.fess.mylasta.direction.FessProp
             ; propertiesHtmlList = list:{ env ; config ; label ; message }
         }
     }

+ 40 - 50
src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java

@@ -45,6 +45,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.CrawlingAccessException;
 import org.codelibs.fess.crawler.extractor.Extractor;
+import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer;
 import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
@@ -57,36 +58,20 @@ import org.codelibs.fess.helper.SambaHelper;
 import org.codelibs.fess.helper.SystemHelper;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import jcifs.smb.ACE;
 import jcifs.smb.SID;
 
-public abstract class AbstractFessFileTransformer extends AbstractFessXpathTransformer {
-    private static final Logger logger = LoggerFactory // NOPMD
-            .getLogger(AbstractFessFileTransformer.class);
+public abstract class AbstractFessFileTransformer extends AbstractTransformer implements FessTransformer {
 
-    public String encoding = null;
-
-    public String noTitleLabel = "No title.";
-
-    public int abbreviationMarginLength = 10;
-
-    public boolean ignoreEmptyContent = false;
-
-    public int maxTitleLength = 100;
-
-    public int maxDigestLength = 200;
-
-    public boolean appendMetaContentToContent = true;
-
-    public boolean appendBodyContentToContent = true;
+    protected String charsetName = Constants.UTF_8;
 
     public Map<String, String> parentEncodingMap = Collections.synchronizedMap(new LruHashMap<String, String>(1000));
 
     protected Map<String, String> metaContentMapping;
 
+    protected FessConfig fessConfig;
+
     protected abstract Extractor getExtractor(ResponseData responseData);
 
     @Override
@@ -109,11 +94,11 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
         try {
             final ExtractData extractData = extractor.getText(in, params);
             content = extractData.getContent();
-            if (ignoreEmptyContent && StringUtil.isBlank(content)) {
+            if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
                 return null;
             }
-            if (logger.isDebugEnabled()) {
-                logger.debug("ExtractData: " + extractData);
+            if (getLogger().isDebugEnabled()) {
+                getLogger().debug("ExtractData: " + extractData);
             }
             // meta
             for (final String key : extractData.getKeySet()) {
@@ -191,10 +176,10 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
         putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
         // content
         final StringBuilder buf = new StringBuilder(content.length() + 1000);
-        if (appendBodyContentToContent) {
+        if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) {
             buf.append(content);
         }
-        if (appendMetaContentToContent) {
+        if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) {
             if (buf.length() > 0) {
                 buf.append(' ');
             }
@@ -206,23 +191,29 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
         } else {
             putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), StringUtil.EMPTY);
         }
-        if (Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
-                || fessConfig.isCrawlerDocumentCacheEnable()) {
-            final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
-            // text cache
-            putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
-            putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
+        if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
+                .isCrawlerDocumentCacheEnable()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
+            if (responseData.getContentLength() > 0
+                    && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
+
+                final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
+                // text cache
+                putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
+                putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
+            }
         }
         // digest
         putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
-                Constants.DIGEST_PREFIX + abbreviate(normalizeContent(content), maxDigestLength));
+                Constants.DIGEST_PREFIX
+                        + abbreviate(normalizeContent(content), fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
         // title
         if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
             if (url.endsWith("/")) {
                 if (StringUtil.isNotBlank(content)) {
-                    putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), abbreviate(body, maxTitleLength));
+                    putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(),
+                            abbreviate(body, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
                 } else {
-                    putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), noTitleLabel);
+                    putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
                 }
             } else {
                 final String u = decodeUrlAsName(url, url.startsWith("file:"));
@@ -235,9 +226,9 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
             }
         }
         // host
-        putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
+        putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url));
         // site
-        putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
+        putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
         // url
         putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
         // created
@@ -287,8 +278,8 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
                     final SID sid = item.getSID();
                     roleTypeList.add(sambaHelper.getAccountId(sid));
                 }
-                if (logger.isDebugEnabled()) {
-                    logger.debug("smbUrl:" + responseData.getUrl() + " roleType:" + roleTypeList.toString());
+                if (getLogger().isDebugEnabled()) {
+                    getLogger().debug("smbUrl:" + responseData.getUrl() + " roleType:" + roleTypeList.toString());
                 }
             }
         }
@@ -335,7 +326,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
     protected String abbreviate(final String str, final int maxWidth) {
         String newStr = StringUtils.abbreviate(str, maxWidth);
         try {
-            if (newStr.getBytes(Constants.UTF_8).length > maxWidth + abbreviationMarginLength) {
+            if (newStr.getBytes(Constants.UTF_8).length > maxWidth + fessConfig.getCrawlerDocumentFileAbbreviationMarginLengthAsInteger()) {
                 newStr = StringUtils.abbreviate(str, maxWidth / 2);
             }
         } catch (final UnsupportedEncodingException e) {
@@ -370,7 +361,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
         }
 
         String enc = Constants.UTF_8;
-        if (encoding == null) {
+        if (StringUtil.isBlank(fessConfig.getCrawlerDocumentFileNameEncoding())) {
             final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
             if (urlQueue != null) {
                 final String parentUrl = urlQueue.getParentUrl();
@@ -385,7 +376,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
                 }
             }
         } else {
-            enc = encoding;
+            enc = fessConfig.getCrawlerDocumentFileNameEncoding();
         }
 
         final String escapedUrl = escapePlus ? url.replace("+", "%2B") : url;
@@ -415,8 +406,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
         return null;
     }
 
-    @Override
-    protected String getHost(final String url) {
+    protected String getHostOnFile(final String url) {
         if (StringUtil.isBlank(url)) {
             return StringUtil.EMPTY; // empty
         }
@@ -435,30 +425,29 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
             return "localhost";
         }
 
-        return super.getHost(url);
+        return getHost(url);
     }
 
-    @Override
-    protected String getSite(final String url, final String encoding) {
+    protected String getSiteOnFile(final String url, final String encoding) {
         if (StringUtil.isBlank(url)) {
             return StringUtil.EMPTY; // empty
         }
 
         if (url.startsWith("file:////")) {
             final String value = decodeUrlAsName(url.substring(9), true);
-            return StringUtils.abbreviate("\\\\" + value.replace('/', '\\'), maxSiteLength);
+            return StringUtils.abbreviate("\\\\" + value.replace('/', '\\'), getMaxSiteLength());
         } else if (url.startsWith("file:")) {
             final String value = decodeUrlAsName(url.substring(5), true);
             if (value.length() > 2 && value.charAt(2) == ':') {
                 // Windows
-                return StringUtils.abbreviate(value.substring(1).replace('/', '\\'), maxSiteLength);
+                return StringUtils.abbreviate(value.substring(1).replace('/', '\\'), getMaxSiteLength());
             } else {
                 // Unix
-                return StringUtils.abbreviate(value, maxSiteLength);
+                return StringUtils.abbreviate(value, getMaxSiteLength());
             }
         }
 
-        return super.getSite(url, encoding);
+        return getSite(url, encoding);
     }
 
     @Override
@@ -480,4 +469,5 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
         }
         metaContentMapping.put(metaname, dynamicField);
     }
+
 }

+ 18 - 0
src/main/java/org/codelibs/fess/crawler/transformer/FessFileTransformer.java

@@ -15,10 +15,13 @@
  */
 package org.codelibs.fess.crawler.transformer;
 
+import javax.annotation.PostConstruct;
+
 import org.codelibs.fess.crawler.entity.ResponseData;
 import org.codelibs.fess.crawler.extractor.Extractor;
 import org.codelibs.fess.crawler.extractor.ExtractorFactory;
 import org.codelibs.fess.exception.FessSystemException;
+import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -26,6 +29,21 @@ import org.slf4j.LoggerFactory;
 public class FessFileTransformer extends AbstractFessFileTransformer {
     private static final Logger logger = LoggerFactory.getLogger(FessFileTransformer.class);
 
+    @PostConstruct
+    public void init() {
+        fessConfig = ComponentUtil.getFessConfig();
+    }
+
+    @Override
+    public FessConfig getFessConfig() {
+        return fessConfig;
+    }
+
+    @Override
+    public Logger getLogger() {
+        return logger;
+    }
+
     @Override
     protected Extractor getExtractor(final ResponseData responseData) {
         final ExtractorFactory extractorFactory = ComponentUtil.getExtractorFactory();

+ 23 - 0
src/main/java/org/codelibs/fess/crawler/transformer/FessTikaTransformer.java

@@ -15,12 +15,35 @@
  */
 package org.codelibs.fess.crawler.transformer;
 
+import javax.annotation.PostConstruct;
+
 import org.codelibs.fess.crawler.entity.ResponseData;
 import org.codelibs.fess.crawler.extractor.Extractor;
 import org.codelibs.fess.exception.FessSystemException;
+import org.codelibs.fess.mylasta.direction.FessConfig;
+import org.codelibs.fess.util.ComponentUtil;
 import org.lastaflute.di.core.SingletonLaContainer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class FessTikaTransformer extends AbstractFessFileTransformer {
+    private static final Logger logger = LoggerFactory.getLogger(FessTikaTransformer.class);
+
+    @PostConstruct
+    public void init() {
+        fessConfig = ComponentUtil.getFessConfig();
+    }
+
+    @Override
+    public FessConfig getFessConfig() {
+        return fessConfig;
+    }
+
+    @Override
+    public Logger getLogger() {
+        return logger;
+    }
+
     @Override
     protected Extractor getExtractor(final ResponseData responseData) {
         final Extractor extractor = SingletonLaContainer.getComponent("tikaExtractor");

+ 33 - 40
src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessXpathTransformer.java → src/main/java/org/codelibs/fess/crawler/transformer/FessTransformer.java

@@ -22,29 +22,20 @@ import java.util.Map;
 
 import org.apache.commons.lang3.StringUtils;
 import org.codelibs.core.lang.StringUtil;
-import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 import groovy.lang.Binding;
 import groovy.lang.GroovyShell;
 
-public abstract class AbstractFessXpathTransformer extends XpathTransformer {
-    private static final Logger logger = LoggerFactory.getLogger(AbstractFessXpathTransformer.class);
+public interface FessTransformer {
 
-    public int maxSiteLength = 50;
+    FessConfig getFessConfig();
 
-    public String unknownHostname = "unknown";
+    Logger getLogger();
 
-    public String siteEncoding;
-
-    public boolean replaceSiteEncodingWhenEnglish = false;
-
-    public boolean appendResultData = true;
-
-    protected String getHost(final String u) {
+    public default String getHost(final String u) {
         if (StringUtil.isBlank(u)) {
             return StringUtil.EMPTY; // empty
         }
@@ -63,13 +54,13 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer {
         }
 
         if (url.equals(originalUrl)) {
-            return unknownHostname;
+            return getFessConfig().getCrawlerDocumentUnknownHostname();
         }
 
         return url;
     }
 
-    protected String getSite(final String u, final String encoding) {
+    public default String getSite(final String u, final String encoding) {
         if (StringUtil.isBlank(u)) {
             return StringUtil.EMPTY; // empty
         }
@@ -87,15 +78,15 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer {
 
         if (encoding != null) {
             String enc;
-            if (siteEncoding != null) {
-                if (replaceSiteEncodingWhenEnglish) {
+            if (StringUtil.isNotBlank(getFessConfig().getCrawlerDocumentSiteEncoding())) {
+                if (getFessConfig().isCrawlerDocumentUseSiteEncodingOnEnglish()) {
                     if ("ISO-8859-1".equalsIgnoreCase(encoding) || "US-ASCII".equalsIgnoreCase(encoding)) {
-                        enc = siteEncoding;
+                        enc = getFessConfig().getCrawlerDocumentSiteEncoding();
                     } else {
                         enc = encoding;
                     }
                 } else {
-                    enc = siteEncoding;
+                    enc = getFessConfig().getCrawlerDocumentSiteEncoding();
                 }
             } else {
                 enc = encoding;
@@ -106,39 +97,35 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer {
             } catch (final Exception e) {}
         }
 
-        return StringUtils.abbreviate(url, maxSiteLength);
+        return StringUtils.abbreviate(url, getMaxSiteLength());
     }
 
-    protected String normalizeContent(final String content) {
+    public default String normalizeContent(final String content) {
         if (content == null) {
             return StringUtil.EMPTY; // empty
         }
         return content.replaceAll("\\s+", " ");
     }
 
-    protected void putResultDataBody(final Map<String, Object> dataMap, final String key, final Object value) {
+    public default void putResultDataBody(final Map<String, Object> dataMap, final String key, final Object value) {
         final FessConfig fessConfig = ComponentUtil.getFessConfig();
         if (fessConfig.getIndexFieldUrl().equals(key)) {
             dataMap.put(key, value);
         } else if (dataMap.containsKey(key)) {
-            if (appendResultData) {
+            if (getFessConfig().isCrawlerDocumentAppendData()) {
                 final Object oldValue = dataMap.get(key);
-                if (key.endsWith("_m")) {
-                    final Object[] oldValues = (Object[]) oldValue;
-                    if (value.getClass().isArray()) {
-                        final Object[] newValues = (Object[]) value;
-                        final Object[] values = Arrays.copyOf(oldValues, oldValues.length + newValues.length);
-                        for (int i = 0; i < newValues.length; i++) {
-                            values[values.length - 1 + i] = newValues[i];
-                        }
-                        dataMap.put(key, values);
-                    } else {
-                        final Object[] values = Arrays.copyOf(oldValues, oldValues.length + 1);
-                        values[values.length - 1] = value;
-                        dataMap.put(key, values);
+                final Object[] oldValues = (Object[]) oldValue;
+                if (value.getClass().isArray()) {
+                    final Object[] newValues = (Object[]) value;
+                    final Object[] values = Arrays.copyOf(oldValues, oldValues.length + newValues.length);
+                    for (int i = 0; i < newValues.length; i++) {
+                        values[values.length - 1 + i] = newValues[i];
                     }
+                    dataMap.put(key, values);
                 } else {
-                    dataMap.put(key, oldValue + " " + value);
+                    final Object[] values = Arrays.copyOf(oldValues, oldValues.length + 1);
+                    values[values.length - 1] = value;
+                    dataMap.put(key, values);
                 }
             } else {
                 dataMap.put(key, value);
@@ -148,7 +135,8 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer {
         }
     }
 
-    protected void putResultDataWithTemplate(final Map<String, Object> dataMap, final String key, final Object value, final String template) {
+    public default void putResultDataWithTemplate(final Map<String, Object> dataMap, final String key, final Object value,
+            final String template) {
         Object target = value;
         if (template != null) {
             final Map<String, Object> paramMap = new HashMap<>(dataMap.size() + 1);
@@ -173,7 +161,7 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer {
         }
     }
 
-    protected String evaluateValue(final String template, final Map<String, Object> paramMap) {
+    public default String evaluateValue(final String template, final Map<String, Object> paramMap) {
         if (StringUtil.isEmpty(template)) {
             return StringUtil.EMPTY;
         }
@@ -185,8 +173,13 @@ public abstract class AbstractFessXpathTransformer extends XpathTransformer {
             }
             return value.toString();
         } catch (final Exception e) {
-            logger.warn("Invalid value format: " + template, e);
+            getLogger().warn("Invalid value format: " + template, e);
             return null;
         }
     }
+
+    public default int getMaxSiteLength() {
+        return getFessConfig().getCrawlerDocumentMaxSiteLengthAsInteger();
+    }
+
 }

+ 53 - 43
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -29,6 +29,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
+import javax.annotation.PostConstruct;
 import javax.xml.transform.TransformerException;
 
 import org.apache.commons.io.IOUtils;
@@ -47,6 +48,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
 import org.codelibs.fess.crawler.exception.ChildUrlsException;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.CrawlingAccessException;
+import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
 import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
 import org.codelibs.fess.crawler.util.ResponseDataUtil;
 import org.codelibs.fess.es.config.exentity.CrawlingConfig;
@@ -68,30 +70,31 @@ import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.xml.sax.InputSource;
 
-public class FessXpathTransformer extends AbstractFessXpathTransformer {
+public class FessXpathTransformer extends XpathTransformer implements FessTransformer {
     private static final Logger logger = LoggerFactory.getLogger(FessXpathTransformer.class);
 
     private static final int UTF8_BOM_SIZE = 3;
 
-    public String cacheXpath = "//BODY";
-
-    public String contentXpath = "//BODY";
-
-    public String langXpath = "//HTML/@lang";
-
-    public String digestXpath = "//META[@name='description']/@content";
-
-    public String canonicalXpath = "//LINK[@rel='canonical']/@href";
+    public boolean prunedCacheContent = true;
 
-    public List<String> prunedTagList = new ArrayList<String>();
+    public Map<String, String> convertUrlMap = new HashMap<>();
 
-    public boolean prunedCacheContent = true;
+    protected FessConfig fessConfig;
 
-    public int maxDigestLength = 200;
+    @PostConstruct
+    public void init() {
+        fessConfig = ComponentUtil.getFessConfig();
+    }
 
-    public int maxCacheLength = 2621440; //  2.5Mbytes
+    @Override
+    public FessConfig getFessConfig() {
+        return fessConfig;
+    }
 
-    public Map<String, String> convertUrlMap = new HashMap<String, String>();
+    @Override
+    public Logger getLogger() {
+        return logger;
+    }
 
     @Override
     protected void storeData(final ResponseData responseData, final ResultData resultData) {
@@ -181,7 +184,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
 
     protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
         // canonical
-        if (StringUtil.isNotBlank(canonicalXpath)) {
+        if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCannonicalXpath())) {
             final String canonicalUrl = getCanonicalUrl(responseData, document);
             if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
                 final Set<RequestData> childUrlSet = new HashSet<>();
@@ -202,6 +205,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
         String url = responseData.getUrl();
         final String indexingTarget = crawlingConfig.getIndexingTarget(url);
         url = pathMappingHelper.replaceUrl(sessionId, url);
+        final String mimeType = responseData.getMimeType();
 
         final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
 
@@ -223,26 +227,32 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
             putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
         }
         // lang
-        final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, langXpath, true));
+        final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlLangXpath(), true));
         if (lang != null) {
             putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
         }
         // title
         // content
         putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), getDocumentContent(responseData, document));
-        if (Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
-                || fessConfig.isCrawlerDocumentCacheEnable()) {
-            String charSet = responseData.getCharSet();
-            if (charSet == null) {
-                charSet = Constants.UTF_8;
-            }
-            try {
-                // cache
-                putResultDataBody(dataMap, fessConfig.getIndexFieldCache(),
-                        new String(InputStreamUtil.getBytes(responseData.getResponseBody()), charSet));
-                putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
-            } catch (final Exception e) {
-                logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);
+        if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig
+                .isCrawlerDocumentCacheEnable()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
+            if (responseData.getContentLength() > 0
+                    && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
+                String charSet = responseData.getCharSet();
+                if (charSet == null) {
+                    charSet = Constants.UTF_8;
+                }
+                try {
+                    // cache
+                    putResultDataBody(dataMap, fessConfig.getIndexFieldCache(),
+                            new String(InputStreamUtil.getBytes(responseData.getResponseBody()), charSet));
+                    putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
+                } catch (final Exception e) {
+                    logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);
+                }
+            } else {
+                logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(),
+                        fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
             }
         }
         // digest
@@ -261,7 +271,6 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
         // anchor
         putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
         // mimetype
-        final String mimeType = responseData.getMimeType();
         putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
         if (fileTypeHelper != null) {
             // filetype
@@ -324,7 +333,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
     }
 
     protected String getCanonicalUrl(final ResponseData responseData, final Document document) {
-        final String canonicalUrl = getSingleNodeValue(document, canonicalXpath, false);
+        final String canonicalUrl = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlCannonicalXpath(), false);
         if (StringUtil.isNotBlank(canonicalUrl)) {
             return canonicalUrl;
         }
@@ -332,13 +341,15 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
     }
 
     protected String getDocumentDigest(final ResponseData responseData, final Document document) {
-        final String digest = getSingleNodeValue(document, digestXpath, false);
+        final String digest = getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlDigestXpath(), false);
         if (StringUtil.isNotBlank(digest)) {
             return digest;
         }
 
-        final String body = normalizeContent(removeCommentTag(getSingleNodeValue(document, contentXpath, prunedCacheContent)));
-        return StringUtils.abbreviate(body, maxDigestLength);
+        final String body =
+                normalizeContent(removeCommentTag(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(),
+                        prunedCacheContent)));
+        return StringUtils.abbreviate(body, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger());
     }
 
     String removeCommentTag(final String content) {
@@ -364,7 +375,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
     }
 
     private String getDocumentContent(final ResponseData responseData, final Document document) {
-        return normalizeContent(getSingleNodeValue(document, contentXpath, true));
+        return normalizeContent(getSingleNodeValue(document, fessConfig.getCrawlerDocumentHtmlContentXpath(), true));
     }
 
     protected String getSingleNodeValue(final Document document, final String xpath, final boolean pruned) {
@@ -420,7 +431,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
     }
 
     protected boolean isPrunedTag(final String tagName) {
-        for (final String name : prunedTagList) {
+        for (final String name : getCrawlerDocumentHtmlPrunedTags()) {
             if (name.equalsIgnoreCase(tagName)) {
                 return true;
             }
@@ -492,12 +503,6 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
         return urlList;
     }
 
-    public void addPrunedTag(final String tagName) {
-        if (StringUtil.isNotBlank(tagName)) {
-            prunedTagList.add(tagName);
-        }
-    }
-
     @Override
     public Object getData(final AccessResultData<?> accessResultData) {
         final byte[] data = accessResultData.getData();
@@ -554,4 +559,9 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
     private boolean isUtf8BomBytes(final byte[] b) {
         return b[0] == (byte) 0xEF && b[1] == (byte) 0xBB && b[2] == (byte) 0xBF;
     }
+
+    protected String[] getCrawlerDocumentHtmlPrunedTags() {
+        return fessConfig.getCrawlerDocumentHtmlPrunedTagsAsArray();
+    }
+
 }

+ 6 - 1
src/main/java/org/codelibs/fess/helper/ViewHelper.java

@@ -38,6 +38,7 @@ import java.util.regex.Pattern;
 import javax.annotation.PostConstruct;
 import javax.annotation.Resource;
 
+import org.apache.commons.lang3.StringEscapeUtils;
 import org.apache.commons.lang3.StringUtils;
 import org.codelibs.core.CoreLibConstants;
 import org.codelibs.core.lang.StringUtil;
@@ -398,7 +399,7 @@ public class ViewHelper implements Serializable {
         if (locale == null) {
             locale = Locale.ENGLISH;
         }
-        String url = DocumentUtil.getValue(doc, "url", String.class);
+        String url = DocumentUtil.getValue(doc, fessConfig.getIndexFieldUrl(), String.class);
         if (url == null) {
             url = ComponentUtil.getMessageManager().getMessage(locale, "labels.search_unknown");
         }
@@ -417,6 +418,10 @@ public class ViewHelper implements Serializable {
 
         String cache = DocumentUtil.getValue(doc, fessConfig.getIndexFieldCache(), String.class);
         if (cache != null) {
+            String mimetype = DocumentUtil.getValue(doc, fessConfig.getIndexFieldMimetype(), String.class);
+            if (!ComponentUtil.getFessConfig().isHtmlMimetypeForCache(mimetype)) {
+                cache = StringEscapeUtils.escapeHtml4(cache);
+            }
             cache = pathMappingHelper.replaceUrls(cache);
             if (queries != null && queries.length > 0) {
                 doc.put("hlCache", replaceHighlightQueries(cache, queries));

+ 458 - 3
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -20,7 +20,7 @@ import org.lastaflute.core.direction.exception.ConfigPropertyNotFoundException;
 /**
  * @author FreeGen
  */
-public interface FessConfig extends FessEnv, FessProp {
+public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction.FessProp {
 
     /** The key of the configuration. e.g. Fess */
     String DOMAIN_TITLE = "domain.title";
@@ -66,9 +66,75 @@ public interface FessConfig extends FessEnv, FessProp {
     -XX:+DisableExplicitGC */
     String JVM_SUGGEST_OPTIONS = "jvm.suggest.options";
 
+    /** The key of the configuration. e.g. 50 */
+    String CRAWLER_DOCUMENT_MAX_SITE_LENGTH = "crawler.document.max.site.length";
+
+    /** The key of the configuration. e.g. UTF-8 */
+    String CRAWLER_DOCUMENT_SITE_ENCODING = "crawler.document.site.encoding";
+
+    /** The key of the configuration. e.g. unknown */
+    String CRAWLER_DOCUMENT_UNKNOWN_HOSTNAME = "crawler.document.unknown.hostname";
+
+    /** The key of the configuration. e.g. false */
+    String CRAWLER_DOCUMENT_USE_SITE_ENCODING_ON_ENGLISH = "crawler.document.use.site.encoding.on.english";
+
+    /** The key of the configuration. e.g. true */
+    String CRAWLER_DOCUMENT_APPEND_DATA = "crawler.document.append.data";
+
+    /** The key of the configuration. e.g. //BODY */
+    String CRAWLER_DOCUMENT_HTML_CONTENT_XPATH = "crawler.document.html.content.xpath";
+
+    /** The key of the configuration. e.g. //HTML/@lang */
+    String CRAWLER_DOCUMENT_HTML_LANG_XPATH = "crawler.document.html.lang.xpath";
+
+    /** The key of the configuration. e.g. //META[@name='description']/@content */
+    String CRAWLER_DOCUMENT_HTML_DIGEST_XPATH = "crawler.document.html.digest.xpath";
+
+    /** The key of the configuration. e.g. //LINK[@rel='canonical']/@href */
+    String CRAWLER_DOCUMENT_HTML_CANNONICAL_XPATH = "crawler.document.html.cannonical.xpath";
+
+    /** The key of the configuration. e.g. noscript,script */
+    String CRAWLER_DOCUMENT_HTML_PRUNED_TAGS = "crawler.document.html.pruned.tags";
+
+    /** The key of the configuration. e.g. 200 */
+    String CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH = "crawler.document.html.max.digest.length";
+
+    /** The key of the configuration. e.g.  */
+    String CRAWLER_DOCUMENT_FILE_NAME_ENCODING = "crawler.document.file.name.encoding";
+
+    /** The key of the configuration. e.g. No title. */
+    String CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL = "crawler.document.file.no.title.label";
+
+    /** The key of the configuration. e.g. 10 */
+    String CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH = "crawler.document.file.abbreviation.margin.length";
+
     /** The key of the configuration. e.g. false */
+    String CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT = "crawler.document.file.ignore.empty.content";
+
+    /** The key of the configuration. e.g. 100 */
+    String CRAWLER_DOCUMENT_FILE_MAX_TITLE_LENGTH = "crawler.document.file.max.title.length";
+
+    /** The key of the configuration. e.g. 200 */
+    String CRAWLER_DOCUMENT_FILE_MAX_DIGEST_LENGTH = "crawler.document.file.max.digest.length";
+
+    /** The key of the configuration. e.g. true */
+    String CRAWLER_DOCUMENT_FILE_APPEND_META_CONTENT = "crawler.document.file.append.meta.content";
+
+    /** The key of the configuration. e.g. true */
+    String CRAWLER_DOCUMENT_FILE_APPEND_BODY_CONTENT = "crawler.document.file.append.body.content";
+
+    /** The key of the configuration. e.g. true */
     String CRAWLER_DOCUMENT_CACHE_ENABLE = "crawler.document.cache.enable";
 
+    /** The key of the configuration. e.g. 2621440 */
+    String CRAWLER_DOCUMENT_CACHE_MAX_SIZE = "crawler.document.cache.max.size";
+
+    /** The key of the configuration. e.g. text/html */
+    String CRAWLER_DOCUMENT_CACHE_SUPPORTED_MIMETYPES = "crawler.document.cache.supported.mimetypes";
+
+    /** The key of the configuration. e.g. text/html */
+    String CRAWLER_DOCUMENT_CACHE_HTML_MIMETYPES = "crawler.document.cache.html.mimetypes";
+
     /** The key of the configuration. e.g. favorite_count */
     String INDEX_FIELD_favorite_count = "index.field.favorite_count";
 
@@ -475,19 +541,272 @@ public interface FessConfig extends FessEnv, FessProp {
     String getJvmSuggestOptions();
 
     /**
-     * Get the value for the key 'crawler.document.cache.enable'. <br>
+     * Get the value for the key 'crawler.document.max.site.length'. <br>
+     * The value is, e.g. 50 <br>
+     * comment: common
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentMaxSiteLength();
+
+    /**
+     * Get the value for the key 'crawler.document.max.site.length' as {@link Integer}. <br>
+     * The value is, e.g. 50 <br>
+     * comment: common
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentMaxSiteLengthAsInteger();
+
+    /**
+     * Get the value for the key 'crawler.document.site.encoding'. <br>
+     * The value is, e.g. UTF-8 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentSiteEncoding();
+
+    /**
+     * Get the value for the key 'crawler.document.unknown.hostname'. <br>
+     * The value is, e.g. unknown <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentUnknownHostname();
+
+    /**
+     * Get the value for the key 'crawler.document.use.site.encoding.on.english'. <br>
+     * The value is, e.g. false <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentUseSiteEncodingOnEnglish();
+
+    /**
+     * Is the property for the key 'crawler.document.use.site.encoding.on.english' true? <br>
      * The value is, e.g. false <br>
+     * @return The determination, true or false. (if not found, exception but basically no way)
+     */
+    boolean isCrawlerDocumentUseSiteEncodingOnEnglish();
+
+    /**
+     * Get the value for the key 'crawler.document.append.data'. <br>
+     * The value is, e.g. true <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentAppendData();
+
+    /**
+     * Is the property for the key 'crawler.document.append.data' true? <br>
+     * The value is, e.g. true <br>
+     * @return The determination, true or false. (if not found, exception but basically no way)
+     */
+    boolean isCrawlerDocumentAppendData();
+
+    /**
+     * Get the value for the key 'crawler.document.html.content.xpath'. <br>
+     * The value is, e.g. //BODY <br>
+     * comment: html
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentHtmlContentXpath();
+
+    /**
+     * Get the value for the key 'crawler.document.html.lang.xpath'. <br>
+     * The value is, e.g. //HTML/@lang <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentHtmlLangXpath();
+
+    /**
+     * Get the value for the key 'crawler.document.html.digest.xpath'. <br>
+     * The value is, e.g. //META[@name='description']/@content <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentHtmlDigestXpath();
+
+    /**
+     * Get the value for the key 'crawler.document.html.cannonical.xpath'. <br>
+     * The value is, e.g. //LINK[@rel='canonical']/@href <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentHtmlCannonicalXpath();
+
+    /**
+     * Get the value for the key 'crawler.document.html.pruned.tags'. <br>
+     * The value is, e.g. noscript,script <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentHtmlPrunedTags();
+
+    /**
+     * Get the value for the key 'crawler.document.html.max.digest.length'. <br>
+     * The value is, e.g. 200 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentHtmlMaxDigestLength();
+
+    /**
+     * Get the value for the key 'crawler.document.html.max.digest.length' as {@link Integer}. <br>
+     * The value is, e.g. 200 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentHtmlMaxDigestLengthAsInteger();
+
+    /**
+     * Get the value for the key 'crawler.document.file.name.encoding'. <br>
+     * The value is, e.g.  <br>
+     * comment: file
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentFileNameEncoding();
+
+    /**
+     * Get the value for the key 'crawler.document.file.name.encoding' as {@link Integer}. <br>
+     * The value is, e.g.  <br>
+     * comment: file
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentFileNameEncodingAsInteger();
+
+    /**
+     * Get the value for the key 'crawler.document.file.no.title.label'. <br>
+     * The value is, e.g. No title. <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentFileNoTitleLabel();
+
+    /**
+     * Get the value for the key 'crawler.document.file.abbreviation.margin.length'. <br>
+     * The value is, e.g. 10 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentFileAbbreviationMarginLength();
+
+    /**
+     * Get the value for the key 'crawler.document.file.abbreviation.margin.length' as {@link Integer}. <br>
+     * The value is, e.g. 10 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentFileAbbreviationMarginLengthAsInteger();
+
+    /**
+     * Get the value for the key 'crawler.document.file.ignore.empty.content'. <br>
+     * The value is, e.g. false <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentFileIgnoreEmptyContent();
+
+    /**
+     * Is the property for the key 'crawler.document.file.ignore.empty.content' true? <br>
+     * The value is, e.g. false <br>
+     * @return The determination, true or false. (if not found, exception but basically no way)
+     */
+    boolean isCrawlerDocumentFileIgnoreEmptyContent();
+
+    /**
+     * Get the value for the key 'crawler.document.file.max.title.length'. <br>
+     * The value is, e.g. 100 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentFileMaxTitleLength();
+
+    /**
+     * Get the value for the key 'crawler.document.file.max.title.length' as {@link Integer}. <br>
+     * The value is, e.g. 100 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentFileMaxTitleLengthAsInteger();
+
+    /**
+     * Get the value for the key 'crawler.document.file.max.digest.length'. <br>
+     * The value is, e.g. 200 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentFileMaxDigestLength();
+
+    /**
+     * Get the value for the key 'crawler.document.file.max.digest.length' as {@link Integer}. <br>
+     * The value is, e.g. 200 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentFileMaxDigestLengthAsInteger();
+
+    /**
+     * Get the value for the key 'crawler.document.file.append.meta.content'. <br>
+     * The value is, e.g. true <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentFileAppendMetaContent();
+
+    /**
+     * Is the property for the key 'crawler.document.file.append.meta.content' true? <br>
+     * The value is, e.g. true <br>
+     * @return The determination, true or false. (if not found, exception but basically no way)
+     */
+    boolean isCrawlerDocumentFileAppendMetaContent();
+
+    /**
+     * Get the value for the key 'crawler.document.file.append.body.content'. <br>
+     * The value is, e.g. true <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentFileAppendBodyContent();
+
+    /**
+     * Is the property for the key 'crawler.document.file.append.body.content' true? <br>
+     * The value is, e.g. true <br>
+     * @return The determination, true or false. (if not found, exception but basically no way)
+     */
+    boolean isCrawlerDocumentFileAppendBodyContent();
+
+    /**
+     * Get the value for the key 'crawler.document.cache.enable'. <br>
+     * The value is, e.g. true <br>
+     * comment: cache
      * @return The value of found property. (NotNull: if not found, exception but basically no way)
      */
     String getCrawlerDocumentCacheEnable();
 
     /**
      * Is the property for the key 'crawler.document.cache.enable' true? <br>
-     * The value is, e.g. false <br>
+     * The value is, e.g. true <br>
+     * comment: cache
      * @return The determination, true or false. (if not found, exception but basically no way)
      */
     boolean isCrawlerDocumentCacheEnable();
 
+    /**
+     * Get the value for the key 'crawler.document.cache.max.size'. <br>
+     * The value is, e.g. 2621440 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentCacheMaxSize();
+
+    /**
+     * Get the value for the key 'crawler.document.cache.max.size' as {@link Integer}. <br>
+     * The value is, e.g. 2621440 <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     * @throws NumberFormatException When the property is not integer.
+     */
+    Integer getCrawlerDocumentCacheMaxSizeAsInteger();
+
+    /**
+     * Get the value for the key 'crawler.document.cache.supported.mimetypes'. <br>
+     * The value is, e.g. text/html <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentCacheSupportedMimetypes();
+
+    /**
+     * Get the value for the key 'crawler.document.cache.html.mimetypes'. <br>
+     * The value is, e.g. text/html <br>
+     * comment: ,text/plain,application/xml,application/pdf,application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.ms-powerpoint,application/vnd.openxmlformats-officedocument.presentationml.presentation
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDocumentCacheHtmlMimetypes();
+
     /**
      * Get the value for the key 'index.field.favorite_count'. <br>
      * The value is, e.g. favorite_count <br>
@@ -1515,6 +1834,126 @@ public interface FessConfig extends FessEnv, FessProp {
             return get(FessConfig.JVM_SUGGEST_OPTIONS);
         }
 
+        public String getCrawlerDocumentMaxSiteLength() {
+            return get(FessConfig.CRAWLER_DOCUMENT_MAX_SITE_LENGTH);
+        }
+
+        public Integer getCrawlerDocumentMaxSiteLengthAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_MAX_SITE_LENGTH);
+        }
+
+        public String getCrawlerDocumentSiteEncoding() {
+            return get(FessConfig.CRAWLER_DOCUMENT_SITE_ENCODING);
+        }
+
+        public String getCrawlerDocumentUnknownHostname() {
+            return get(FessConfig.CRAWLER_DOCUMENT_UNKNOWN_HOSTNAME);
+        }
+
+        public String getCrawlerDocumentUseSiteEncodingOnEnglish() {
+            return get(FessConfig.CRAWLER_DOCUMENT_USE_SITE_ENCODING_ON_ENGLISH);
+        }
+
+        public boolean isCrawlerDocumentUseSiteEncodingOnEnglish() {
+            return is(FessConfig.CRAWLER_DOCUMENT_USE_SITE_ENCODING_ON_ENGLISH);
+        }
+
+        public String getCrawlerDocumentAppendData() {
+            return get(FessConfig.CRAWLER_DOCUMENT_APPEND_DATA);
+        }
+
+        public boolean isCrawlerDocumentAppendData() {
+            return is(FessConfig.CRAWLER_DOCUMENT_APPEND_DATA);
+        }
+
+        public String getCrawlerDocumentHtmlContentXpath() {
+            return get(FessConfig.CRAWLER_DOCUMENT_HTML_CONTENT_XPATH);
+        }
+
+        public String getCrawlerDocumentHtmlLangXpath() {
+            return get(FessConfig.CRAWLER_DOCUMENT_HTML_LANG_XPATH);
+        }
+
+        public String getCrawlerDocumentHtmlDigestXpath() {
+            return get(FessConfig.CRAWLER_DOCUMENT_HTML_DIGEST_XPATH);
+        }
+
+        public String getCrawlerDocumentHtmlCannonicalXpath() {
+            return get(FessConfig.CRAWLER_DOCUMENT_HTML_CANNONICAL_XPATH);
+        }
+
+        public String getCrawlerDocumentHtmlPrunedTags() {
+            return get(FessConfig.CRAWLER_DOCUMENT_HTML_PRUNED_TAGS);
+        }
+
+        public String getCrawlerDocumentHtmlMaxDigestLength() {
+            return get(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH);
+        }
+
+        public Integer getCrawlerDocumentHtmlMaxDigestLengthAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_HTML_MAX_DIGEST_LENGTH);
+        }
+
+        public String getCrawlerDocumentFileNameEncoding() {
+            return get(FessConfig.CRAWLER_DOCUMENT_FILE_NAME_ENCODING);
+        }
+
+        public Integer getCrawlerDocumentFileNameEncodingAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_NAME_ENCODING);
+        }
+
+        public String getCrawlerDocumentFileNoTitleLabel() {
+            return get(FessConfig.CRAWLER_DOCUMENT_FILE_NO_TITLE_LABEL);
+        }
+
+        public String getCrawlerDocumentFileAbbreviationMarginLength() {
+            return get(FessConfig.CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH);
+        }
+
+        public Integer getCrawlerDocumentFileAbbreviationMarginLengthAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_ABBREVIATION_MARGIN_LENGTH);
+        }
+
+        public String getCrawlerDocumentFileIgnoreEmptyContent() {
+            return get(FessConfig.CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT);
+        }
+
+        public boolean isCrawlerDocumentFileIgnoreEmptyContent() {
+            return is(FessConfig.CRAWLER_DOCUMENT_FILE_IGNORE_EMPTY_CONTENT);
+        }
+
+        public String getCrawlerDocumentFileMaxTitleLength() {
+            return get(FessConfig.CRAWLER_DOCUMENT_FILE_MAX_TITLE_LENGTH);
+        }
+
+        public Integer getCrawlerDocumentFileMaxTitleLengthAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_MAX_TITLE_LENGTH);
+        }
+
+        public String getCrawlerDocumentFileMaxDigestLength() {
+            return get(FessConfig.CRAWLER_DOCUMENT_FILE_MAX_DIGEST_LENGTH);
+        }
+
+        public Integer getCrawlerDocumentFileMaxDigestLengthAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_FILE_MAX_DIGEST_LENGTH);
+        }
+
+        public String getCrawlerDocumentFileAppendMetaContent() {
+            return get(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_META_CONTENT);
+        }
+
+        public boolean isCrawlerDocumentFileAppendMetaContent() {
+            return is(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_META_CONTENT);
+        }
+
+        public String getCrawlerDocumentFileAppendBodyContent() {
+            return get(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_BODY_CONTENT);
+        }
+
+        public boolean isCrawlerDocumentFileAppendBodyContent() {
+            return is(FessConfig.CRAWLER_DOCUMENT_FILE_APPEND_BODY_CONTENT);
+        }
+
         public String getCrawlerDocumentCacheEnable() {
             return get(FessConfig.CRAWLER_DOCUMENT_CACHE_ENABLE);
         }
@@ -1523,6 +1962,22 @@ public interface FessConfig extends FessEnv, FessProp {
             return is(FessConfig.CRAWLER_DOCUMENT_CACHE_ENABLE);
         }
 
+        public String getCrawlerDocumentCacheMaxSize() {
+            return get(FessConfig.CRAWLER_DOCUMENT_CACHE_MAX_SIZE);
+        }
+
+        public Integer getCrawlerDocumentCacheMaxSizeAsInteger() {
+            return getAsInteger(FessConfig.CRAWLER_DOCUMENT_CACHE_MAX_SIZE);
+        }
+
+        public String getCrawlerDocumentCacheSupportedMimetypes() {
+            return get(FessConfig.CRAWLER_DOCUMENT_CACHE_SUPPORTED_MIMETYPES);
+        }
+
+        public String getCrawlerDocumentCacheHtmlMimetypes() {
+            return get(FessConfig.CRAWLER_DOCUMENT_CACHE_HTML_MIMETYPES);
+        }
+
         public String getIndexFieldFavoriteCount() {
             return get(FessConfig.INDEX_FIELD_favorite_count);
         }

+ 27 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java

@@ -18,6 +18,7 @@ package org.codelibs.fess.mylasta.direction;
 import org.codelibs.core.lang.StringUtil;
 import org.codelibs.fess.Constants;
 import org.codelibs.fess.util.ComponentUtil;
+import org.codelibs.fess.util.StreamUtil;
 
 public interface FessProp {
     public default String getProperty(String key) {
@@ -79,4 +80,30 @@ public interface FessProp {
         return getJvmSuggestOptions().split("\n");
     }
 
+    String getCrawlerDocumentHtmlPrunedTags();
+
+    /**
+     * Returns the value of {@link #getCrawlerDocumentHtmlPrunedTags()} split on commas.
+     * Each entry is trimmed and blank entries are dropped, so a value such as
+     * "noscript, script" yields {"noscript", "script"} and an empty value yields an empty array.
+     *
+     * @return The tag names listed in the setting. (NotNull, EmptyAllowed)
+     */
+    public default String[] getCrawlerDocumentHtmlPrunedTagsAsArray() {
+        // without trimming, " script" would never equal a tag name compared by the caller
+        return StreamUtil.of(getCrawlerDocumentHtmlPrunedTags().split(",")).map(String::trim).filter(StringUtil::isNotBlank)
+                .toArray(n -> new String[n]);
+    }
+
+    String getCrawlerDocumentCacheHtmlMimetypes();
+
+    /**
+     * Checks the given mimetype against the comma-separated list returned by
+     * {@link #getCrawlerDocumentCacheHtmlMimetypes()}. Entries are trimmed, blank entries
+     * are ignored, and the comparison is case-insensitive.
+     *
+     * @param mimetype The mimetype to test. (NullAllowed: a null value matches no entry)
+     * @return true if the configured list is blank (no restriction) or contains the mimetype.
+     */
+    public default boolean isHtmlMimetypeForCache(String mimetype) {
+        final String value = getCrawlerDocumentCacheHtmlMimetypes();
+        if (StringUtil.isBlank(value)) {
+            // a blank setting means every mimetype is accepted
+            return true;
+        }
+        // trim so that "text/html, application/xhtml+xml" matches both types
+        return StreamUtil.of(value.split(",")).map(String::trim).filter(StringUtil::isNotBlank)
+                .anyMatch(s -> s.equalsIgnoreCase(mimetype));
+    }
+
+    String getCrawlerDocumentCacheSupportedMimetypes();
+
+    /**
+     * Checks the given mimetype against the comma-separated list returned by
+     * {@link #getCrawlerDocumentCacheSupportedMimetypes()}. Entries are trimmed, blank entries
+     * are ignored, and the comparison is case-insensitive.
+     *
+     * @param mimetype The mimetype to test. (NullAllowed: a null value matches no entry)
+     * @return true if the configured list is blank (no restriction) or contains the mimetype.
+     */
+    public default boolean isSupportedDocumentCacheMimetypes(String mimetype) {
+        final String value = getCrawlerDocumentCacheSupportedMimetypes();
+        if (StringUtil.isBlank(value)) {
+            // a blank setting means every mimetype is accepted
+            return true;
+        }
+        // trim so that "text/html, text/plain" matches both types
+        return StreamUtil.of(value.split(",")).map(String::trim).filter(StringUtil::isNotBlank)
+                .anyMatch(s -> s.equalsIgnoreCase(mimetype));
+    }
+
 }

+ 0 - 19
src/main/resources/crawler/transformer.xml

@@ -4,7 +4,6 @@
 <components namespace="fessCrawler">
 	<include path="crawler/transformer_basic.xml"/>
 
-
 	<component name="fessXpathTransformer" class="org.codelibs.fess.crawler.transformer.FessXpathTransformer" instance="singleton">
 		<property name="name">"fessXpathTransformer"</property>
 		<property name="featureMap">defaultFeatureMap</property>
@@ -16,31 +15,15 @@
 		<property name="convertUrlMap">
 			{"feed:" : "http:"}
 		</property>
-		<!-- 
-		<property name="cacheXpath">"//BODY"</property>
-		<property name="contentXpath">"//BODY"</property>
-		<property name="anchorXpath">"//A/@href"</property>
-		<property name="digestXpath">"//META[@name='description']/@content"</property>
-		 -->
-		<property name="replaceSiteEncodingWhenEnglish">true</property>
-		<property name="siteEncoding">"UTF-8"</property>
 		<!-- segment -->
 		<postConstruct name="addFieldRule">
 			<arg>"title"</arg>
 			<arg>"//TITLE"</arg>
 		</postConstruct>
-		<postConstruct name="addPrunedTag">
-			<arg>"noscript"</arg>
-		</postConstruct>
-		<postConstruct name="addPrunedTag">
-			<arg>"script"</arg>
-		</postConstruct>
 	</component>
 
 	<component name="fessFileTransformer" class="org.codelibs.fess.crawler.transformer.FessFileTransformer" instance="singleton">
 		<property name="name">"fessFileTransformer"</property>
-		<property name="replaceSiteEncodingWhenEnglish">true</property>
-		<property name="siteEncoding">"UTF-8"</property>
 		<postConstruct name="addMetaContentMapping">
 			<arg>"title"</arg>
 			<arg>"title"</arg>
@@ -60,8 +43,6 @@
 
 	<component name="fessTikaTransformer" class="org.codelibs.fess.crawler.transformer.FessTikaTransformer" instance="singleton">
 		<property name="name">"fessTikaTransformer"</property>
-		<property name="replaceSiteEncodingWhenEnglish">true</property>
-		<property name="siteEncoding">"UTF-8"</property>
 		<postConstruct name="addMetaContentMapping">
 			<arg>"title"</arg>
 			<arg>"title"</arg>

+ 31 - 1
src/main/resources/fess_config.properties

@@ -50,7 +50,37 @@ jvm.suggest.options=\
 #                                                                                   Index
 #                                                                                     ====
 
-crawler.document.cache.enable=false
+# common
+crawler.document.max.site.length=50
+crawler.document.site.encoding=UTF-8
+crawler.document.unknown.hostname=unknown
+crawler.document.use.site.encoding.on.english=false
+crawler.document.append.data=true
+
+# html
+crawler.document.html.content.xpath=//BODY
+crawler.document.html.lang.xpath=//HTML/@lang
+crawler.document.html.digest.xpath=//META[@name='description']/@content
+crawler.document.html.cannonical.xpath=//LINK[@rel='canonical']/@href
+crawler.document.html.pruned.tags=noscript,script
+crawler.document.html.max.digest.length=200
+
+# file
+crawler.document.file.name.encoding=
+crawler.document.file.no.title.label=No title.
+crawler.document.file.abbreviation.margin.length=10
+crawler.document.file.ignore.empty.content=false
+crawler.document.file.max.title.length=100
+crawler.document.file.max.digest.length=200
+crawler.document.file.append.meta.content=true
+crawler.document.file.append.body.content=true
+
+# cache
+crawler.document.cache.enable=true
+crawler.document.cache.max.size=2621440
+crawler.document.cache.supported.mimetypes=text/html
+#,text/plain,application/xml,application/pdf,application/msword,application/vnd.openxmlformats-officedocument.wordprocessingml.document,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/vnd.ms-powerpoint,application/vnd.openxmlformats-officedocument.presentationml.presentation
+crawler.document.cache.html.mimetypes=text/html
 
 # field names
 index.field.favorite_count=favorite_count

+ 0 - 9
src/main/webapp/WEB-INF/orig/view/mobile/index.jsp

@@ -1,9 +0,0 @@
-<%@page pageEncoding="UTF-8" %>
-<html>
-<head>
-<jsp:include page="indexHtmlHead.jsp"/>
-</head>
-<body>
-<jsp:include page="indexMain.jsp"/>
-</body>
-</html>

+ 0 - 4
src/main/webapp/WEB-INF/orig/view/mobile/indexHtmlHead.jsp

@@ -1,4 +0,0 @@
-<%@page pageEncoding="UTF-8" %>
-<meta http-equiv="Content-Type" content="text/html; charset=<m:charset/>" />
-<meta content="no-cache" http-equiv="Cache-Control"/>
-<title><bean:message key="labels.mobile_search_title"/></title>

+ 0 - 14
src/main/webapp/WEB-INF/orig/view/mobile/indexMain.jsp

@@ -1,14 +0,0 @@
-<%@page pageEncoding="UTF-8" %>
-    <div>
-      <div style="text-align: center;">
-        <m:img src="logo-top.png" magniWidth="0.8" style="vertical-align: middle;" />
-        <br/>
-        <s:form>
-          <div>
-          	<html:text property="query" title="Search" size="20" maxlength="1000" />
-            <br/>
-            <input type="submit" value="<bean:message key="labels.top.search"/>" name="search" />
-          </div>
-        </s:form>
-      </div>
-    </div>

+ 0 - 18
src/main/webapp/WEB-INF/orig/view/mobile/search.jsp

@@ -1,18 +0,0 @@
-<%@page pageEncoding="UTF-8" %>
-<html>
-<head>
-<jsp:include page="searchHtmlHead.jsp"/>
-</head>
-<body>
-<jsp:include page="searchHeader.jsp"/>
-<c:choose>
-<c:when test="${f:h(allRecordCount) != 0}">
-<jsp:include page="searchResults.jsp"/>
-</c:when>
-<c:otherwise>
-<jsp:include page="searchNoResult.jsp"/>
-</c:otherwise>
-</c:choose>
-<jsp:include page="searchFooter.jsp"/>
-</body>
-</html>

+ 0 - 5
src/main/webapp/WEB-INF/orig/view/mobile/searchFooter.jsp

@@ -1,5 +0,0 @@
-<%@page pageEncoding="UTF-8" %>
-<hr style="border-style: solid; border-color: #ffffff;"/>
-<div style="font-size: x-small; text-align: center;">
-  <bean:message key="labels.footer.copyright"/>
-</div>

+ 0 - 13
src/main/webapp/WEB-INF/orig/view/mobile/searchHeader.jsp

@@ -1,13 +0,0 @@
-<%@page pageEncoding="UTF-8" %>
-<div id="header">
-  <div>
-    <s:form>
-      <div>
-<m:img src="logo-top.png" magniWidth="0.3" />
-<br/>
-<html:text property="query" title="Search" size="16" maxlength="1000" />
-<input type="submit" value="<bean:message key="labels.search"/>" name="search"/>
-      </div>
-    </s:form>
-  </div>
-</div>

+ 0 - 4
src/main/webapp/WEB-INF/orig/view/mobile/searchHtmlHead.jsp

@@ -1,4 +0,0 @@
-<%@page pageEncoding="UTF-8" %>
-<meta http-equiv="Content-Type" content="text/html; charset=<m:charset/>" />
-<meta content="no-cache" http-equiv="Cache-Control"/>
-<title>${f:h(query)} - <bean:message key="labels.search_title"/></title>

+ 0 - 4
src/main/webapp/WEB-INF/orig/view/mobile/searchNoResult.jsp

@@ -1,4 +0,0 @@
-<%@page pageEncoding="UTF-8" %>
-              <div id="result">
-                <bean:message key="labels.did_not_match" arg0="${f:h(query)}"/>
-              </div>

+ 0 - 55
src/main/webapp/WEB-INF/orig/view/mobile/searchResults.jsp

@@ -1,55 +0,0 @@
-<%@page pageEncoding="UTF-8" %>
-              <div id="result">
-                <div>
-                  <c:forEach var="doc" varStatus="s" items="${documentItems}">
-                    <div>
-                      <a href="${doc.urlLink}"><span>${f:h(doc.contentTitle)}</span></a>
-                      <span id="snip">
-                        <br/>
-                        <span style="color: #666666;">
-                          ${doc.contentDescription}
-                        </span>
-                      </span>
-                      <span style="color: #008000;">
-                        <br/>
-                        ${f:h(doc.site)}
-                      </span>
-                      <br/>
-                    </div>
-                    <br/>
-                  </c:forEach>
-                </div>
-              </div>
-
-              <div id="subfooter"  style="text-align: center;">
-                <p>
-                  <c:if test="${existPrevPage}">
-                    <span>
-                      <s:link href="prev?query=${f:u(query)}&pn=${f:u(currentPageNumber)}&num=${f:u(pageSize)}">
-                        <bean:message key="labels.prev_page"/>
-                      </s:link>
-                    </span>
-                  </c:if>
-                  <c:forEach var="pageNumber" varStatus="s" items="${pageNumberList}">
-                    <c:if test="${pageNumber == currentPageNumber}">
-                      <span>
-                        ${pageNumber}
-                      </span>
-                    </c:if>
-                    <c:if test="${pageNumber != currentPageNumber}">
-                      <span>
-                        <s:link href="move?query=${f:u(query)}&pn=${f:u(pageNumber)}&num=${f:u(pageSize)}">
-                          ${f:h(pageNumber)}
-                        </s:link>
-                      </span>
-                    </c:if>
-                  </c:forEach>
-                  <c:if test="${existNextPage}">
-                    <span>
-                      <s:link href="next?query=${f:u(query)}&pn=${f:u(currentPageNumber)}&num=${f:u(pageSize)}">
-                        <bean:message key="labels.next_page"/>
-                      </s:link>
-                    </span>
-                  </c:if>
-                </p>
-              </div>

+ 2 - 0
src/main/webapp/WEB-INF/view/searchResults.jsp

@@ -36,8 +36,10 @@
 						<div class="site ellipsis">
 							<cite>${f:h(doc.sitePath)}</cite>
 							<c:if test="${doc.has_cache=='true'}">
+								<small>
 								<la:link href="/cache/?docId=${doc.doc_id}${appendHighlightParams}" class="cache"><la:message
 										key="labels.search_result_cache" /></la:link>
+								</small>
 							</c:if>
 						</div>
 						<div class="more hidden-md-up">

+ 53 - 38
src/test/java/org/codelibs/fess/crawler/transformer/FessFileTransformerTest.java

@@ -34,7 +34,7 @@ public class FessFileTransformerTest extends UnitFessTestCase {
 
     public void test_decodeUrl_ok() throws Exception {
         String url, exp;
-        final FessFileTransformer transformer = new FessFileTransformer();
+        final FessFileTransformer transformer = createInstance();
 
         url = "";
         exp = "";
@@ -62,156 +62,171 @@ public class FessFileTransformerTest extends UnitFessTestCase {
     }
 
     public void test_decodeUrl_null() throws Exception {
-        final FessFileTransformer transformer = new FessFileTransformer();
+        final FessFileTransformer transformer = createInstance();
         assertNull(transformer.decodeUrlAsName(null, true));
     }
 
     public void test_getHost_ok() {
         String url, exp;
-        final FessFileTransformer transformer = new FessFileTransformer();
+        final FessFileTransformer transformer = createInstance();
 
         url = "";
         exp = "";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "http://server/home/user";
         exp = "server";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "file:/home/user";
         exp = "localhost";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "file:/c:/home/user";
         exp = "localhost";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "file:////server/home/user";
         exp = "server";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "file:/" + encodeUrl("ホーム") + "/user";
         exp = "localhost";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "file:/c:/" + encodeUrl("ホーム") + "/user";
         exp = "localhost";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "file:////" + encodeUrl("サーバー") + "/home/user";
         exp = "サーバー";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
     }
 
     public void test_getHost_unexpected() {
         String url, exp;
-        final FessFileTransformer transformer = new FessFileTransformer();
+        final FessFileTransformer transformer = createInstance();
 
         url = null;
         exp = "";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "example:";
         exp = "unknown";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "file:";
         exp = "localhost";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "file://";
         exp = "localhost";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "file:///";
         exp = "localhost";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "file://///";
         exp = "localhost";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "file://///example";
         exp = "localhost";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
         url = "file:/c:";
         exp = "localhost";
-        assertEquals(exp, transformer.getHost(url));
+        assertEquals(exp, transformer.getHostOnFile(url));
 
     }
 
     public void test_getSite_ok() {
         String url, exp;
-        final FessFileTransformer transformer = new FessFileTransformer();
+        final FessFileTransformer transformer = createInstance();
 
         url = "";
         exp = "";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
         url = "http://example.com/";
         exp = "example.com/";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
         url = "http://example.com/index.html";
         exp = "example.com/index.html";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
         url = "file:/home/user";
         exp = "/home/user";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
         url = "file:/c:/home/user";
         exp = "c:\\home\\user";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
         url = "file:/c:/";
         exp = "c:\\";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
         url = "file:////server/user";
         exp = "\\\\server\\user";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
+    }
 
-        transformer.maxSiteLength = 10;
+    public void test_getSite_ok_len10() {
+        String url, exp;
+        final FessFileTransformer transformer = new FessFileTransformer() {
+            @Override
+            public int getMaxSiteLength() {
+                return 10;
+            }
+        };
+        transformer.init();
 
         url = "file:/home/user/foo";
         exp = "/home/u...";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
     }
 
     public void test_getSite_unexpected() {
         String url, exp;
-        final FessFileTransformer transformer = new FessFileTransformer();
+        final FessFileTransformer transformer = createInstance();
 
         url = "file:";
         exp = "";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
         url = "file";
         exp = "file";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
         url = "file:/";
         exp = "/";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
         url = "file:/c:";
         exp = "c:";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
         url = "file://";
         exp = "//";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
         url = "file:///";
         exp = "///";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
 
         url = "file://///";
         exp = "\\\\\\";
-        assertEquals(exp, transformer.getSite(url, "UTF-8"));
+        assertEquals(exp, transformer.getSiteOnFile(url, "UTF-8"));
+    }
+
+    private FessFileTransformer createInstance() {
+        final FessFileTransformer transformer = new FessFileTransformer();
+        transformer.init();
+        return transformer;
     }
 }

+ 17 - 6
src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java

@@ -46,6 +46,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
     public void setUp() throws Exception {
         super.setUp();
         fessXpathTransformer = new FessXpathTransformer();
+        fessXpathTransformer.init();
         fessXpathTransformer.convertUrlMap.put("feed:", "http:");
     }
 
@@ -53,7 +54,11 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
         final Document document = getDocument(data);
 
-        final FessXpathTransformer transformer = new FessXpathTransformer();
+        final FessXpathTransformer transformer = new FessXpathTransformer() {
+            protected String[] getCrawlerDocumentHtmlPrunedTags() {
+                return new String[0];
+            }
+        };
 
         final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
         assertEquals(getXmlString(document), getXmlString(pruneNode));
@@ -63,8 +68,11 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
         final Document document = getDocument(data);
 
-        final FessXpathTransformer transformer = new FessXpathTransformer();
-        transformer.prunedTagList.add("noscript");
+        final FessXpathTransformer transformer = new FessXpathTransformer() {
+            protected String[] getCrawlerDocumentHtmlPrunedTags() {
+                return new String[] { "noscript" };
+            }
+        };
 
         final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
         final String docString = getXmlString(document);
@@ -83,9 +91,11 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
         final String data = "<html><body><br/><script>foo</script><noscript>bar</noscript></body></html>";
         final Document document = getDocument(data);
 
-        final FessXpathTransformer transformer = new FessXpathTransformer();
-        transformer.prunedTagList.add("script");
-        transformer.prunedTagList.add("noscript");
+        final FessXpathTransformer transformer = new FessXpathTransformer() {
+            protected String[] getCrawlerDocumentHtmlPrunedTags() {
+                return new String[] { "script", "noscript" };
+            }
+        };
 
         final Node pruneNode = transformer.pruneNode(document.cloneNode(true));
         final String docString = getXmlString(document);
@@ -235,6 +245,7 @@ public class FessXpathTransformerTest extends UnitFessTestCase {
 
     public void test_canonicalXpath() throws Exception {
         final FessXpathTransformer transformer = new FessXpathTransformer();
+        transformer.init();
 
         final Map<String, Object> dataMap = new HashMap<String, Object>();
         final ResponseData responseData = new ResponseData();