fix #2735 add crawler.document.append.filename

This commit is contained in:
Shinsuke Sugaya 2023-04-03 21:54:36 +09:00
parent d952b31ee0
commit e9c4a0c0b2
4 changed files with 47 additions and 8 deletions

View file

@ -213,6 +213,10 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
}
buf.append(contentMeta);
}
final String fileName = getFileName(url, urlEncoding);
if (StringUtil.isNotBlank(fileName) && fessConfig.isCrawlerDocumentAppendFilename()) {
buf.append(' ').append(fileName);
}
final String bodyBase = buf.toString().trim();
responseData.addMetaData(Extractor.class.getSimpleName(), extractor);
final String body = documentHelper.getContent(crawlingConfig, responseData, bodyBase, dataMap);
@ -232,7 +236,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
// title
final String fileName = getFileName(url, urlEncoding);
if (!hasTitle(dataMap)) {
final String titleField = fessConfig.getIndexFieldTitle();
dataMap.remove(titleField);

View file

@ -392,8 +392,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
// content
final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap),
prunedContent ? node -> pruneNode(node, crawlingConfig) : node -> node);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(),
documentHelper.getContent(crawlingConfig, responseData, body, dataMap));
final String fileName = getFileName(url, urlEncoding);
putResultDataContent(dataMap, responseData, fessConfig, crawlingConfig, documentHelper, body, fileName);
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache()))
|| fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0
@ -429,7 +429,6 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
// site
putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
// filename
final String fileName = getFileName(url, urlEncoding);
if (StringUtil.isNotBlank(fileName)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
}
@ -502,6 +501,16 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
});
}
/**
 * Stores the document content under the content index field, optionally followed by the file name.
 *
 * The content is obtained from {@code documentHelper.getContent(...)}. When {@code fileName} is not
 * blank and {@code fessConfig.isCrawlerDocumentAppendFilename()} is true, the stored value is the
 * content, a single space, and the file name; otherwise the content is stored unchanged.
 *
 * @param dataMap The result data map that receives the content value.
 * @param responseData The response data handed to the document helper.
 * @param fessConfig The configuration supplying the content field name and the append-filename flag.
 * @param crawlingConfig The crawling configuration handed to the document helper.
 * @param documentHelper The helper that produces the content from the body.
 * @param body The extracted body text passed to the document helper.
 * @param fileName The file name to append when the feature is enabled and the name is not blank.
 */
protected void putResultDataContent(final Map<String, Object> dataMap, final ResponseData responseData, final FessConfig fessConfig,
        final CrawlingConfig crawlingConfig, final DocumentHelper documentHelper, final String body, final String fileName) {
    final String extracted = documentHelper.getContent(crawlingConfig, responseData, body, dataMap);
    // The blank check comes first so the config flag is only consulted when there is a name to append.
    final boolean appendFileName = StringUtil.isNotBlank(fileName) && fessConfig.isCrawlerDocumentAppendFilename();
    final String value = appendFileName ? extracted + " " + fileName : extracted;
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), value);
}
protected CrawlingConfig getCrawlingConfig(final ResponseData responseData) {
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
return crawlingConfigHelper.get(responseData.getSessionId());

View file

@ -325,6 +325,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. true */
String CRAWLER_DOCUMENT_APPEND_DATA = "crawler.document.append.data";
/** The key of the configuration. e.g. false (when the property is true, the file name is appended to the document content) */
String CRAWLER_DOCUMENT_APPEND_FILENAME = "crawler.document.append.filename";
/** The key of the configuration. e.g. 20 */
String CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE = "crawler.document.max.alphanum.term.size";
@ -1164,7 +1167,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. org.codelibs,org.dbflute,org.lastaflute */
String LOGGING_APP_PACKAGES = "logging.app.packages";
/** The key of the configuration. e.g. 4000 */
/** The key of the configuration. e.g. 10000 */
String FORM_ADMIN_MAX_INPUT_SIZE = "form.admin.max.input.size";
/** The key of the configuration. e.g. false */
@ -2651,6 +2654,20 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
*/
boolean isCrawlerDocumentAppendData();
/**
* Get the value for the key 'crawler.document.append.filename'. <br>
* The value is, e.g. false <br>
* This is the raw string form of the property; use {@link #isCrawlerDocumentAppendFilename()}
* for the boolean form that decides whether the file name is appended to the document content.
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentAppendFilename();
/**
* Is the property for the key 'crawler.document.append.filename' true? <br>
* The value is, e.g. false <br>
* When this returns true and a document has a non-blank file name, the file name is appended
* to the document content, separated by a single space.
* @return The determination, true or false. (if not found, exception but basically no way)
*/
boolean isCrawlerDocumentAppendFilename();
/**
* Get the value for the key 'crawler.document.max.alphanum.term.size'. <br>
* The value is, e.g. 20 <br>
@ -5422,14 +5439,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/**
* Get the value for the key 'form.admin.max.input.size'. <br>
* The value is, e.g. 4000 <br>
* The value is, e.g. 10000 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getFormAdminMaxInputSize();
/**
* Get the value for the key 'form.admin.max.input.size' as {@link Integer}. <br>
* The value is, e.g. 4000 <br>
* The value is, e.g. 10000 <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
* @throws NumberFormatException When the property is not integer.
*/
@ -8009,6 +8026,14 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return is(FessConfig.CRAWLER_DOCUMENT_APPEND_DATA);
}
/**
 * Looks up the value stored for the key 'crawler.document.append.filename'.
 *
 * @return The result of {@code get} for that key.
 */
public String getCrawlerDocumentAppendFilename() {
    final String propertyKey = FessConfig.CRAWLER_DOCUMENT_APPEND_FILENAME;
    return get(propertyKey);
}
/**
 * Evaluates the property for the key 'crawler.document.append.filename' as a boolean.
 *
 * @return The result of {@code is} for that key.
 */
public boolean isCrawlerDocumentAppendFilename() {
    final String propertyKey = FessConfig.CRAWLER_DOCUMENT_APPEND_FILENAME;
    return is(propertyKey);
}
/**
 * Looks up the value stored for the key 'crawler.document.max.alphanum.term.size'.
 *
 * @return The result of {@code get} for that key.
 */
public String getCrawlerDocumentMaxAlphanumTermSize() {
    final String propertyKey = FessConfig.CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE;
    return get(propertyKey);
}
@ -10727,6 +10752,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_UNKNOWN_HOSTNAME, "unknown");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_USE_SITE_ENCODING_ON_ENGLISH, "false");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_APPEND_DATA, "true");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_APPEND_FILENAME, "false");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_MAX_ALPHANUM_TERM_SIZE, "20");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_MAX_SYMBOL_TERM_SIZE, "10");
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED, "false");
@ -10978,7 +11004,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
"filetype,created,click_count,title,doc_id,url,score,site,filename,host,digest,boost,mimetype,favorite_count,_id,lang,last_modified,content_length,timestamp");
defaultMap.put(FessConfig.LOGGING_SEARCH_USE_LOGFILE, "true");
defaultMap.put(FessConfig.LOGGING_APP_PACKAGES, "org.codelibs,org.dbflute,org.lastaflute");
defaultMap.put(FessConfig.FORM_ADMIN_MAX_INPUT_SIZE, "4000");
defaultMap.put(FessConfig.FORM_ADMIN_MAX_INPUT_SIZE, "10000");
defaultMap.put(FessConfig.FORM_ADMIN_LABEL_IN_CONFIG_ENABLED, "false");
defaultMap.put(FessConfig.FORM_ADMIN_DEFAULT_TEMPLATE_NAME, "__TEMPLATE__");
defaultMap.put(FessConfig.OSDD_LINK_ENABLED, "true");

View file

@ -205,6 +205,7 @@ crawler.document.site.encoding=UTF-8
crawler.document.unknown.hostname=unknown
crawler.document.use.site.encoding.on.english=false
crawler.document.append.data=true
crawler.document.append.filename=false
crawler.document.max.alphanum.term.size=20
crawler.document.max.symbol.term.size=10
crawler.document.duplicate.term.removed=false