瀏覽代碼

fix #1193 add thumbnail.html.image.xpath and thumbnail.html.image.exclude.extensions

Shinsuke Sugaya 8 年之前
父節點
當前提交
c6b6f0bbea

+ 31 - 24
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -740,44 +740,34 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
                 }
                 }
             }
             }
 
 
-            final NodeList imgNodeList = getXPathAPI().selectNodeList(document, "//IMG");
-            Node firstSrcNode = null;
+            final NodeList imgNodeList = getXPathAPI().selectNodeList(document, fessConfig.getThumbnailHtmlImageXpath());
+            String firstThumbnailUrl = null;
             for (int i = 0; i < imgNodeList.getLength(); i++) {
             for (int i = 0; i < imgNodeList.getLength(); i++) {
                 final Node imgNode = imgNodeList.item(i);
                 final Node imgNode = imgNodeList.item(i);
+                if (logger.isDebugEnabled()) {
+                    logger.debug("img tag: " + imgNode);
+                }
                 final NamedNodeMap attributes = imgNode.getAttributes();
                 final NamedNodeMap attributes = imgNode.getAttributes();
+                final String thumbnailUrl = getThumbnailSrc(responseData.getUrl(), attributes);
                 final Integer height = getAttributeAsInteger(attributes, "height");
                 final Integer height = getAttributeAsInteger(attributes, "height");
                 final Integer width = getAttributeAsInteger(attributes, "width");
                 final Integer width = getAttributeAsInteger(attributes, "width");
-                if (height != null && width != null) {
+                if (!fessConfig.isThumbnailHtmlImageUrl(thumbnailUrl)) {
+                    continue;
+                } else if (height != null && width != null) {
                     try {
                     try {
                         if (fessConfig.validateThumbnailSize(width, height)) {
                         if (fessConfig.validateThumbnailSize(width, height)) {
-                            final Node srcNode = attributes.getNamedItem("src");
-                            if (srcNode != null) {
-                                final URL thumbnailUrl = getURL(responseData.getUrl(), srcNode.getTextContent());
-                                if (thumbnailUrl != null) {
-                                    return thumbnailUrl.toExternalForm();
-                                }
-                            }
+                            return thumbnailUrl;
                         }
                         }
                     } catch (final Exception e) {
                     } catch (final Exception e) {
                         logger.debug("Failed to parse " + imgNode + " at " + responseData.getUrl(), e);
                         logger.debug("Failed to parse " + imgNode + " at " + responseData.getUrl(), e);
                     }
                     }
-                } else if (firstSrcNode == null) {
-                    final Node srcNode = attributes.getNamedItem("src");
-                    if (srcNode != null) {
-                        firstSrcNode = srcNode;
-                    }
+                } else if (firstThumbnailUrl == null) {
+                    firstThumbnailUrl = thumbnailUrl;
                 }
                 }
             }
             }
 
 
-            if (firstSrcNode != null) {
-                try {
-                    final URL thumbnailUrl = getURL(responseData.getUrl(), firstSrcNode.getTextContent());
-                    if (thumbnailUrl != null) {
-                        return thumbnailUrl.toExternalForm();
-                    }
-                } catch (final Exception e) {
-                    logger.debug("Failed to parse " + firstSrcNode + " at " + responseData.getUrl(), e);
-                }
+            if (firstThumbnailUrl != null) {
+                return firstThumbnailUrl;
             }
             }
         } catch (final Exception e) {
         } catch (final Exception e) {
             logger.warn("Failed to retrieve thumbnail url from " + responseData.getUrl(), e);
             logger.warn("Failed to retrieve thumbnail url from " + responseData.getUrl(), e);
@@ -785,6 +775,23 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         return null;
         return null;
     }
     }
 
 
+    protected String getThumbnailSrc(final String url, final NamedNodeMap attributes) { // Returns the external form of the URL built from the "src" attribute, or null when it is missing or cannot be built.
+        final Node srcNode = attributes.getNamedItem("src");
+        if (srcNode != null) {
+            try {
+                final URL thumbnailUrl = getURL(url, srcNode.getTextContent()); // presumably resolves the src text against the page url; getURL is defined outside this view, TODO confirm
+                if (thumbnailUrl != null) {
+                    return thumbnailUrl.toExternalForm();
+                }
+            } catch (Exception e) { // best-effort: the failure is only logged (at debug level) and the method falls through to return null
+                if (logger.isDebugEnabled()) {
+                    logger.debug("Failed to parse thumbnail url for " + url + " : " + attributes, e);
+                }
+            }
+        }
+        return null; // reached when there is no src attribute, getURL returned null, or an exception was caught
+    }
+
     protected Integer getAttributeAsInteger(final NamedNodeMap attributes, final String name) {
     protected Integer getAttributeAsInteger(final NamedNodeMap attributes, final String name) {
         final Node namedItem = attributes.getNamedItem(name);
         final Node namedItem = attributes.getNamedItem(name);
         if (namedItem == null) {
         if (namedItem == null) {

+ 30 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -812,6 +812,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. png */
     /** The key of the configuration. e.g. png */
     String THUMBNAIL_HTML_IMAGE_FORMAT = "thumbnail.html.image.format";
     String THUMBNAIL_HTML_IMAGE_FORMAT = "thumbnail.html.image.format";
 
 
+    /** The key of the configuration. e.g. //IMG */
+    String THUMBNAIL_HTML_IMAGE_XPATH = "thumbnail.html.image.xpath";
+
+    /** The key of the configuration. e.g. svg,html,css,js */
+    String THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS = "thumbnail.html.image.exclude.extensions";
+
     /** The key of the configuration. e.g. 0 */
     /** The key of the configuration. e.g. 0 */
     String THUMBNAIL_GENERATOR_INTERVAL = "thumbnail.generator.interval";
     String THUMBNAIL_GENERATOR_INTERVAL = "thumbnail.generator.interval";
 
 
@@ -3971,6 +3977,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
      */
     String getThumbnailHtmlImageFormat();
     String getThumbnailHtmlImageFormat();
 
 
+    /**
+     * Get the value for the key 'thumbnail.html.image.xpath'. <br>
+     * The value is, e.g. //IMG <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getThumbnailHtmlImageXpath();
+
+    /**
+     * Get the value for the key 'thumbnail.html.image.exclude.extensions'. <br>
+     * The value is, e.g. svg,html,css,js <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getThumbnailHtmlImageExcludeExtensions();
+
     /**
     /**
      * Get the value for the key 'thumbnail.generator.interval'. <br>
      * Get the value for the key 'thumbnail.generator.interval'. <br>
      * The value is, e.g. 0 <br>
      * The value is, e.g. 0 <br>
@@ -6661,6 +6681,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return get(FessConfig.THUMBNAIL_HTML_IMAGE_FORMAT);
             return get(FessConfig.THUMBNAIL_HTML_IMAGE_FORMAT);
         }
         }
 
 
+        public String getThumbnailHtmlImageXpath() { // Returns whatever get(...) yields for the key 'thumbnail.html.image.xpath'.
+            return get(FessConfig.THUMBNAIL_HTML_IMAGE_XPATH);
+        }
+
+        public String getThumbnailHtmlImageExcludeExtensions() { // Returns whatever get(...) yields for the key 'thumbnail.html.image.exclude.extensions'.
+            return get(FessConfig.THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS);
+        }
+
         public String getThumbnailGeneratorInterval() {
         public String getThumbnailGeneratorInterval() {
             return get(FessConfig.THUMBNAIL_GENERATOR_INTERVAL);
             return get(FessConfig.THUMBNAIL_GENERATOR_INTERVAL);
         }
         }
@@ -7617,6 +7645,8 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_THUMBNAIL_WIDTH, "100");
             defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_THUMBNAIL_WIDTH, "100");
             defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_THUMBNAIL_HEIGHT, "100");
             defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_THUMBNAIL_HEIGHT, "100");
             defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_FORMAT, "png");
             defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_FORMAT, "png");
+            defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_XPATH, "//IMG");
+            defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS, "svg,html,css,js");
             defaultMap.put(FessConfig.THUMBNAIL_GENERATOR_INTERVAL, "0");
             defaultMap.put(FessConfig.THUMBNAIL_GENERATOR_INTERVAL, "0");
             defaultMap.put(FessConfig.THUMBNAIL_GENERATOR_TARGETS, "all");
             defaultMap.put(FessConfig.THUMBNAIL_GENERATOR_TARGETS, "all");
             defaultMap.put(FessConfig.THUMBNAIL_CRAWLER_ENABLED, "true");
             defaultMap.put(FessConfig.THUMBNAIL_CRAWLER_ENABLED, "true");

+ 23 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java

@@ -69,6 +69,8 @@ import org.lastaflute.web.validation.theme.typed.LongTypeValidator;
 
 
 public interface FessProp {
 public interface FessProp {
 
 
+    public static final String THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS = "ThumbnailHtmlImageExcludeExtensions";
+
     public static final String VIRTUAL_HOST_VALUE = "VirtualHostValue";
     public static final String VIRTUAL_HOST_VALUE = "VirtualHostValue";
 
 
     public static final String QUERY_DEFAULT_LANGUAGES = "queryDefaultLanguages";
     public static final String QUERY_DEFAULT_LANGUAGES = "queryDefaultLanguages";
@@ -1736,4 +1738,25 @@ public interface FessProp {
         }
         }
         return proxy;
         return proxy;
     }
     }
+
+    String getThumbnailHtmlImageExcludeExtensions(); // Raw exclude-extension setting; isThumbnailHtmlImageUrl splits this value on ",".
+
+    public default boolean isThumbnailHtmlImageUrl(final String url) { // Returns false for a blank url or one whose lower-cased form ends with a configured exclude extension; true otherwise.
+        if (StringUtil.isBlank(url)) {
+            return false;
+        }
+
+        String[] excludeExtensions = (String[]) propMap.get(THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS); // reuse the parsed list stored in propMap by an earlier call, if any
+        if (excludeExtensions == null) {
+            excludeExtensions =
+                    split(getThumbnailHtmlImageExcludeExtensions(), ",").get(
+                            stream -> stream.map(s -> s.toLowerCase(Locale.ROOT).trim()).filter(StringUtil::isNotBlank)
+                                    .toArray(n -> new String[n])); // entries are lower-cased and trimmed; blank entries are dropped
+            propMap.put(THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS, excludeExtensions);
+        }
+
+        final String u = url.toLowerCase(Locale.ROOT); // lower-case the url so the suffix comparison below is case-insensitive
+        return !stream(excludeExtensions).get(stream -> stream.anyMatch(s -> u.endsWith(s))); // NOTE(review): plain suffix match on the entry as configured (no "." is added here), so "svg" also matches a url ending in "xsvg", and a url with a trailing query string such as "a.svg?v=1" is not excluded -- confirm this is intended
+    }
+
 }
 }

+ 2 - 0
src/main/resources/fess_config.properties

@@ -414,6 +414,8 @@ thumbnail.html.image.window.height=800
 thumbnail.html.image.thumbnail.width=100
 thumbnail.html.image.thumbnail.width=100
 thumbnail.html.image.thumbnail.height=100
 thumbnail.html.image.thumbnail.height=100
 thumbnail.html.image.format=png
 thumbnail.html.image.format=png
+thumbnail.html.image.xpath=//IMG
+thumbnail.html.image.exclude.extensions=svg,html,css,js
 thumbnail.generator.interval=0
 thumbnail.generator.interval=0
 thumbnail.generator.targets=all
 thumbnail.generator.targets=all
 thumbnail.crawler.enabled=true
 thumbnail.crawler.enabled=true