Make the HTML thumbnail image lookup configurable.

FessXpathTransformer: the XPath used to collect thumbnail candidates is now
read from fessConfig.getThumbnailHtmlImageXpath() instead of the hard-coded
"//IMG". Each candidate's "src" attribute is resolved to an absolute URL by
the new getThumbnailSrc() helper, which returns null when there is no "src"
attribute or the URL cannot be built. Candidates rejected by
fessConfig.isThumbnailHtmlImageUrl() are skipped. As before, the first
candidate whose width/height pass validateThumbnailSize() wins; otherwise the
first accepted candidate that lacks a width or height attribute is returned.

FessConfig: adds the keys 'thumbnail.html.image.xpath' and
'thumbnail.html.image.exclude.extensions' together with their getters.

diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
index 418b54ab1..776f4254a 100644
--- a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
+++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
@@ -740,44 +740,34 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
                 }
             }
 
-            final NodeList imgNodeList = getXPathAPI().selectNodeList(document, "//IMG");
-            Node firstSrcNode = null;
+            final NodeList imgNodeList = getXPathAPI().selectNodeList(document, fessConfig.getThumbnailHtmlImageXpath());
+            String firstThumbnailUrl = null;
             for (int i = 0; i < imgNodeList.getLength(); i++) {
                 final Node imgNode = imgNodeList.item(i);
+                if (logger.isDebugEnabled()) {
+                    logger.debug("img tag: " + imgNode);
+                }
                 final NamedNodeMap attributes = imgNode.getAttributes();
+                final String thumbnailUrl = getThumbnailSrc(responseData.getUrl(), attributes);
                 final Integer height = getAttributeAsInteger(attributes, "height");
                 final Integer width = getAttributeAsInteger(attributes, "width");
-                if (height != null && width != null) {
+                if (!fessConfig.isThumbnailHtmlImageUrl(thumbnailUrl)) {
+                    continue;
+                } else if (height != null && width != null) {
                     try {
                         if (fessConfig.validateThumbnailSize(width, height)) {
-                            final Node srcNode = attributes.getNamedItem("src");
-                            if (srcNode != null) {
-                                final URL thumbnailUrl = getURL(responseData.getUrl(), srcNode.getTextContent());
-                                if (thumbnailUrl != null) {
-                                    return thumbnailUrl.toExternalForm();
-                                }
-                            }
+                            return thumbnailUrl;
                         }
                     } catch (final Exception e) {
                         logger.debug("Failed to parse " + imgNode + " at " + responseData.getUrl(), e);
                     }
-                } else if (firstSrcNode == null) {
-                    final Node srcNode = attributes.getNamedItem("src");
-                    if (srcNode != null) {
-                        firstSrcNode = srcNode;
-                    }
+                } else if (firstThumbnailUrl == null) {
+                    firstThumbnailUrl = thumbnailUrl;
                 }
             }
 
-            if (firstSrcNode != null) {
-                try {
-                    final URL thumbnailUrl = getURL(responseData.getUrl(), firstSrcNode.getTextContent());
-                    if (thumbnailUrl != null) {
-                        return thumbnailUrl.toExternalForm();
-                    }
-                } catch (final Exception e) {
-                    logger.debug("Failed to parse " + firstSrcNode + " at " + responseData.getUrl(), e);
-                }
+            if (firstThumbnailUrl != null) {
+                return firstThumbnailUrl;
             }
         } catch (final Exception e) {
             logger.warn("Failed to retrieve thumbnail url from " + responseData.getUrl(), e);
@@ -785,6 +775,23 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         return null;
     }
 
+    protected String getThumbnailSrc(final String url, final NamedNodeMap attributes) {
+        final Node srcNode = attributes.getNamedItem("src");
+        if (srcNode != null) {
+            try {
+                final URL thumbnailUrl = getURL(url, srcNode.getTextContent());
+                if (thumbnailUrl != null) {
+                    return thumbnailUrl.toExternalForm();
+                }
+            } catch (Exception e) {
+                if (logger.isDebugEnabled()) {
+                    logger.debug("Failed to parse thumbnail url for " + url + " : " + attributes, e);
+                }
+            }
+        }
+        return null;
+    }
+
     protected Integer getAttributeAsInteger(final NamedNodeMap attributes, final String name) {
         final Node namedItem = attributes.getNamedItem(name);
         if (namedItem == null) {
diff --git a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java
index 9122baaef..9a71d5cb8 100644
--- a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java
+++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java
@@ -812,6 +812,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. png */
     String THUMBNAIL_HTML_IMAGE_FORMAT = "thumbnail.html.image.format";
 
+    /** The key of the configuration. e.g. //IMG */
+    String THUMBNAIL_HTML_IMAGE_XPATH = "thumbnail.html.image.xpath";
+
+    /** The key of the configuration. e.g. svg,html,css,js */
+    String THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS = "thumbnail.html.image.exclude.extensions";
+
     /** The key of the configuration. e.g. 0 */
     String THUMBNAIL_GENERATOR_INTERVAL = "thumbnail.generator.interval";
 
@@ -3971,6 +3977,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
     String getThumbnailHtmlImageFormat();
 
+    /**
+     * Get the value for the key 'thumbnail.html.image.xpath'.
+     * The value is, e.g. //IMG
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getThumbnailHtmlImageXpath();
+
+    /**
+     * Get the value for the key 'thumbnail.html.image.exclude.extensions'.
+     * The value is, e.g. svg,html,css,js
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getThumbnailHtmlImageExcludeExtensions();
+
     /**
      * Get the value for the key 'thumbnail.generator.interval'.
      * The value is, e.g. 0
@@ -6661,6 +6681,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return get(FessConfig.THUMBNAIL_HTML_IMAGE_FORMAT);
         }
 
+        public String getThumbnailHtmlImageXpath() {
+            return get(FessConfig.THUMBNAIL_HTML_IMAGE_XPATH);
+        }
+
+        public String getThumbnailHtmlImageExcludeExtensions() {
+            return get(FessConfig.THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS);
+        }
+
         public String getThumbnailGeneratorInterval() {
             return get(FessConfig.THUMBNAIL_GENERATOR_INTERVAL);
         }
@@ -7617,6 +7645,8 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_THUMBNAIL_WIDTH, "100");
             defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_THUMBNAIL_HEIGHT, "100");
             defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_FORMAT, "png");
+            defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_XPATH, "//IMG");
+            defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS, "svg,html,css,js");
             defaultMap.put(FessConfig.THUMBNAIL_GENERATOR_INTERVAL, "0");
             defaultMap.put(FessConfig.THUMBNAIL_GENERATOR_TARGETS, "all");
             defaultMap.put(FessConfig.THUMBNAIL_CRAWLER_ENABLED, "true");
diff --git a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
index f16d783e6..a5629be21 100644
--- a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
+++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
@@ -69,6 +69,8 @@ import org.lastaflute.web.validation.theme.typed.LongTypeValidator;
 
 public interface FessProp {
 
+    public static final String THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS = "ThumbnailHtmlImageExcludeExtensions";
+
     public static final String VIRTUAL_HOST_VALUE = "VirtualHostValue";
 
     public static final String QUERY_DEFAULT_LANGUAGES = "queryDefaultLanguages";
@@ -1736,4 +1738,36 @@ public interface FessProp {
         }
         return proxy;
     }
+
+    String getThumbnailHtmlImageExcludeExtensions();
+
+    /**
+     * Checks whether the given URL can be used as the source image of an HTML thumbnail.
+     * A blank URL is rejected, and so is a URL whose extension is listed in
+     * 'thumbnail.html.image.exclude.extensions'. Extensions are compared case-insensitively
+     * as a ".ext" suffix of the URL, with and without its query string and fragment.
+     * @param url The URL of the image candidate.
+     * @return true if the URL is not blank and does not end with an excluded extension.
+     */
+    public default boolean isThumbnailHtmlImageUrl(final String url) {
+        if (StringUtil.isBlank(url)) {
+            return false;
+        }
+
+        String[] excludeExtensions = (String[]) propMap.get(THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS);
+        if (excludeExtensions == null) {
+            // Cache the extensions as ".ext" suffixes so that "js" cannot match a URL such as ".../nojs".
+            excludeExtensions =
+                    split(getThumbnailHtmlImageExcludeExtensions(), ",").get(
+                            stream -> stream.map(s -> s.toLowerCase(Locale.ROOT).trim()).filter(StringUtil::isNotBlank)
+                                    .map(s -> s.startsWith(".") ? s : "." + s).toArray(n -> new String[n]));
+            propMap.put(THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS, excludeExtensions);
+        }
+
+        final String u = url.toLowerCase(Locale.ROOT);
+        // Also test the URL without "?query" and "#fragment" (e.g. logo.svg?v=2).
+        final String p = u.replaceFirst("(?s)[?#].*", "");
+        return !stream(excludeExtensions).get(stream -> stream.anyMatch(s -> u.endsWith(s) || p.endsWith(s)));
+    }
+
 }
diff --git a/src/main/resources/fess_config.properties b/src/main/resources/fess_config.properties
index 6f5339c71..ade07c824 100644
--- a/src/main/resources/fess_config.properties
+++ b/src/main/resources/fess_config.properties
@@ -414,6 +414,8 @@ thumbnail.html.image.window.height=800
 thumbnail.html.image.thumbnail.width=100
 thumbnail.html.image.thumbnail.height=100
 thumbnail.html.image.format=png
+thumbnail.html.image.xpath=//IMG
+thumbnail.html.image.exclude.extensions=svg,html,css,js
 thumbnail.generator.interval=0
 thumbnail.generator.targets=all
 thumbnail.crawler.enabled=true