diff --git a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
index 418b54ab1..776f4254a 100644
--- a/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
+++ b/src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java
@@ -740,44 +740,34 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
}
}
- final NodeList imgNodeList = getXPathAPI().selectNodeList(document, "//IMG");
- Node firstSrcNode = null;
+ final NodeList imgNodeList = getXPathAPI().selectNodeList(document, fessConfig.getThumbnailHtmlImageXpath());
+ String firstThumbnailUrl = null;
for (int i = 0; i < imgNodeList.getLength(); i++) {
final Node imgNode = imgNodeList.item(i);
+ if (logger.isDebugEnabled()) {
+ logger.debug("img tag: " + imgNode);
+ }
final NamedNodeMap attributes = imgNode.getAttributes();
+ final String thumbnailUrl = getThumbnailSrc(responseData.getUrl(), attributes);
final Integer height = getAttributeAsInteger(attributes, "height");
final Integer width = getAttributeAsInteger(attributes, "width");
- if (height != null && width != null) {
+ if (!fessConfig.isThumbnailHtmlImageUrl(thumbnailUrl)) {
+ continue;
+ } else if (height != null && width != null) {
try {
if (fessConfig.validateThumbnailSize(width, height)) {
- final Node srcNode = attributes.getNamedItem("src");
- if (srcNode != null) {
- final URL thumbnailUrl = getURL(responseData.getUrl(), srcNode.getTextContent());
- if (thumbnailUrl != null) {
- return thumbnailUrl.toExternalForm();
- }
- }
+ return thumbnailUrl;
}
} catch (final Exception e) {
logger.debug("Failed to parse " + imgNode + " at " + responseData.getUrl(), e);
}
- } else if (firstSrcNode == null) {
- final Node srcNode = attributes.getNamedItem("src");
- if (srcNode != null) {
- firstSrcNode = srcNode;
- }
+ } else if (firstThumbnailUrl == null) {
+ firstThumbnailUrl = thumbnailUrl;
}
}
- if (firstSrcNode != null) {
- try {
- final URL thumbnailUrl = getURL(responseData.getUrl(), firstSrcNode.getTextContent());
- if (thumbnailUrl != null) {
- return thumbnailUrl.toExternalForm();
- }
- } catch (final Exception e) {
- logger.debug("Failed to parse " + firstSrcNode + " at " + responseData.getUrl(), e);
- }
+ if (firstThumbnailUrl != null) {
+ return firstThumbnailUrl;
}
} catch (final Exception e) {
logger.warn("Failed to retrieve thumbnail url from " + responseData.getUrl(), e);
@@ -785,6 +775,23 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
return null;
}
+ /**
+  * Builds the thumbnail URL string for an image element from its {@code src} attribute.
+  * The attribute text is handed to {@code getURL(url, src)}; presumably that resolves it
+  * against the page URL (confirm against {@code getURL}).
+  *
+  * @param url the URL of the page being processed; also used in the debug message
+  * @param attributes the attributes of the image element
+  * @return the external form of the URL returned by {@code getURL}, or {@code null} when the
+  *         element has no {@code src} attribute, {@code getURL} returns {@code null}, or an
+  *         exception is thrown while building the URL
+  */
+ protected String getThumbnailSrc(final String url, final NamedNodeMap attributes) {
+     final Node srcAttr = attributes.getNamedItem("src");
+     if (srcAttr == null) {
+         return null;
+     }
+     try {
+         final URL resolved = getURL(url, srcAttr.getTextContent());
+         return resolved == null ? null : resolved.toExternalForm();
+     } catch (final Exception e) {
+         // Best effort: a src that cannot be turned into a URL only disqualifies this image.
+         if (logger.isDebugEnabled()) {
+             logger.debug("Failed to parse thumbnail url for " + url + " : " + attributes, e);
+         }
+         return null;
+     }
+ }
+
protected Integer getAttributeAsInteger(final NamedNodeMap attributes, final String name) {
final Node namedItem = attributes.getNamedItem(name);
if (namedItem == null) {
diff --git a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java
index 9122baaef..9a71d5cb8 100644
--- a/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java
+++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java
@@ -812,6 +812,12 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. png */
String THUMBNAIL_HTML_IMAGE_FORMAT = "thumbnail.html.image.format";
+ /** The key of the configuration. e.g. //IMG */
+ String THUMBNAIL_HTML_IMAGE_XPATH = "thumbnail.html.image.xpath";
+
+ /** The key of the configuration. e.g. svg,html,css,js */
+ String THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS = "thumbnail.html.image.exclude.extensions";
+
/** The key of the configuration. e.g. 0 */
String THUMBNAIL_GENERATOR_INTERVAL = "thumbnail.generator.interval";
@@ -3971,6 +3977,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
*/
String getThumbnailHtmlImageFormat();
+ /**
+ * Get the value for the key 'thumbnail.html.image.xpath'.
+ * The value is, e.g. //IMG <br>
+ * FessXpathTransformer passes this value to selectNodeList() as the XPath expression that
+ * selects the image nodes it examines as thumbnail candidates.
+ * @return The value of found property. (NotNull: if not found, exception but basically no way)
+ */
+ String getThumbnailHtmlImageXpath();
+
+ /**
+ * Get the value for the key 'thumbnail.html.image.exclude.extensions'.
+ * The value is, e.g. svg,html,css,js <br>
+ * The value is a comma-separated list; FessProp#isThumbnailHtmlImageUrl(String) splits it and
+ * rejects image URLs that end with one of the entries (compared in lower case).
+ * @return The value of found property. (NotNull: if not found, exception but basically no way)
+ */
+ String getThumbnailHtmlImageExcludeExtensions();
+
/**
* Get the value for the key 'thumbnail.generator.interval'.
* The value is, e.g. 0
@@ -6661,6 +6681,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return get(FessConfig.THUMBNAIL_HTML_IMAGE_FORMAT);
}
+ /** Looks up the value stored under the key {@code thumbnail.html.image.xpath}. */
+ public String getThumbnailHtmlImageXpath() {
+     final String xpath = get(FessConfig.THUMBNAIL_HTML_IMAGE_XPATH);
+     return xpath;
+ }
+
+ /** Looks up the value stored under the key {@code thumbnail.html.image.exclude.extensions}. */
+ public String getThumbnailHtmlImageExcludeExtensions() {
+     final String extensions = get(FessConfig.THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS);
+     return extensions;
+ }
+
public String getThumbnailGeneratorInterval() {
return get(FessConfig.THUMBNAIL_GENERATOR_INTERVAL);
}
@@ -7617,6 +7645,8 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_THUMBNAIL_WIDTH, "100");
defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_THUMBNAIL_HEIGHT, "100");
defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_FORMAT, "png");
+ defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_XPATH, "//IMG");
+ defaultMap.put(FessConfig.THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS, "svg,html,css,js");
defaultMap.put(FessConfig.THUMBNAIL_GENERATOR_INTERVAL, "0");
defaultMap.put(FessConfig.THUMBNAIL_GENERATOR_TARGETS, "all");
defaultMap.put(FessConfig.THUMBNAIL_CRAWLER_ENABLED, "true");
diff --git a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
index f16d783e6..a5629be21 100644
--- a/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
+++ b/src/main/java/org/codelibs/fess/mylasta/direction/FessProp.java
@@ -69,6 +69,8 @@ import org.lastaflute.web.validation.theme.typed.LongTypeValidator;
public interface FessProp {
+ public static final String THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS = "ThumbnailHtmlImageExcludeExtensions";
+
public static final String VIRTUAL_HOST_VALUE = "VirtualHostValue";
public static final String QUERY_DEFAULT_LANGUAGES = "queryDefaultLanguages";
@@ -1736,4 +1738,25 @@ public interface FessProp {
}
return proxy;
}
+
+ // Raw exclude list as one string; isThumbnailHtmlImageUrl(String) splits it on "," and caches the result in propMap.
+ String getThumbnailHtmlImageExcludeExtensions();
+
+ /**
+  * Tells whether the given URL may be used as an HTML thumbnail image: it must not be blank
+  * and must not end with one of the excluded file extensions.
+  * <p>
+  * The excluded extensions come from {@link #getThumbnailHtmlImageExcludeExtensions()}
+  * (comma-separated). Each entry is trimmed, lower-cased and normalized to a {@code ".ext"}
+  * suffix, so {@code js} rejects {@code a.js} but not a URL that merely ends with the letters
+  * "js". The parsed array is stored in {@code propMap} and reused on later calls.
+  * <p>
+  * NOTE(review): the comparison is made against the end of the whole URL, so a URL carrying a
+  * query string or fragment (e.g. {@code icon.svg?v=1}) is not rejected.
+  *
+  * @param url the candidate image URL (may be {@code null} or blank)
+  * @return {@code false} if the URL is blank or ends with an excluded extension; otherwise {@code true}
+  */
+ public default boolean isThumbnailHtmlImageUrl(final String url) {
+     if (StringUtil.isBlank(url)) {
+         return false;
+     }
+
+     String[] excludeExtensions = (String[]) propMap.get(THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS);
+     if (excludeExtensions == null) {
+         // Parse once: lower-case, trim, drop empties, and make sure every entry starts with a dot
+         // so that only real file extensions match (entries already written as ".ext" are kept).
+         excludeExtensions =
+                 split(getThumbnailHtmlImageExcludeExtensions(), ",").get(
+                         stream -> stream.map(s -> s.toLowerCase(Locale.ROOT).trim()).filter(StringUtil::isNotBlank)
+                                 .map(s -> s.startsWith(".") ? s : "." + s).toArray(n -> new String[n]));
+         propMap.put(THUMBNAIL_HTML_IMAGE_EXCLUDE_EXTENSIONS, excludeExtensions);
+     }
+
+     final String u = url.toLowerCase(Locale.ROOT);
+     return !stream(excludeExtensions).get(stream -> stream.anyMatch(u::endsWith));
+ }
+
}
diff --git a/src/main/resources/fess_config.properties b/src/main/resources/fess_config.properties
index 6f5339c71..ade07c824 100644
--- a/src/main/resources/fess_config.properties
+++ b/src/main/resources/fess_config.properties
@@ -414,6 +414,8 @@ thumbnail.html.image.window.height=800
thumbnail.html.image.thumbnail.width=100
thumbnail.html.image.thumbnail.height=100
thumbnail.html.image.format=png
+thumbnail.html.image.xpath=//IMG
+thumbnail.html.image.exclude.extensions=svg,html,css,js
thumbnail.generator.interval=0
thumbnail.generator.targets=all
thumbnail.crawler.enabled=true