Merge branch '10.3.x'

commit 884c70513a

6 changed files with 73 additions and 72 deletions
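
In substance this merge contains two changes. First, decodeUrlAsName(...) and getParentEncoding(...) move, together with the parentEncodingMap cache, from AbstractFessFileTransformer into the FessTransformer interface as default methods, and getFileName(...) now URL-decodes the name it extracts, so file-based titles and filenames agree. Second, the crawler-report property dataFsIndexSize is renamed to dataIndexSize in Crawler, CrawlerPostcard, and the notification mail template.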

AbstractFessFileTransformer.java

@@ -20,7 +20,6 @@ import static org.codelibs.core.stream.StreamUtil.stream;
 import java.io.InputStream;
-import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -31,13 +30,11 @@ import java.util.Set;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.tika.metadata.HttpHeaders;
 import org.apache.tika.metadata.TikaMetadataKeys;
-import org.codelibs.core.collection.LruHashMap;
 import org.codelibs.core.io.SerializeUtil;
 import org.codelibs.core.lang.StringUtil;
 import org.codelibs.core.misc.Pair;
 import org.codelibs.fess.Constants;
 import org.codelibs.fess.crawler.client.smb.SmbClient;
-import org.codelibs.fess.crawler.entity.AccessResult;
 import org.codelibs.fess.crawler.entity.AccessResultData;
 import org.codelibs.fess.crawler.entity.ExtractData;
 import org.codelibs.fess.crawler.entity.ResponseData;
@@ -71,8 +68,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
 
     private static final Logger logger = LoggerFactory.getLogger(AbstractFessFileTransformer.class);
 
-    public Map<String, String> parentEncodingMap = Collections.synchronizedMap(new LruHashMap<String, String>(1000));
-
     protected Map<String, String> metaContentMapping;
 
     protected FessConfig fessConfig;
@@ -232,6 +227,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
                 documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
         // title
+        final String fileName = getFileName(url, urlEncoding);
         if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
             if (url.endsWith("/")) {
                 if (StringUtil.isNotBlank(content)) {
@@ -244,12 +240,10 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
                     putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
                 }
             } else {
-                final String u = decodeUrlAsName(url, url.startsWith("file:"));
-                final int pos = u.lastIndexOf('/');
-                if (pos == -1) {
-                    putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), u);
+                if (StringUtil.isBlank(fileName)) {
+                    putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:")));
                 } else {
-                    putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), u.substring(pos + 1));
+                    putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName);
                 }
             }
         }
@@ -258,7 +252,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         // site
         putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
         // filename
-        final String fileName = getFileName(url, urlEncoding);
         if (StringUtil.isNotBlank(fileName)) {
             putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
         }
@@ -358,57 +351,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         }
     }
 
-    protected String decodeUrlAsName(final String url, final boolean escapePlus) {
-        if (url == null) {
-            return null;
-        }
-
-        String enc = Constants.UTF_8;
-        if (StringUtil.isBlank(fessConfig.getCrawlerDocumentFileNameEncoding())) {
-            final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
-            if (urlQueue != null) {
-                final String parentUrl = urlQueue.getParentUrl();
-                if (StringUtil.isNotEmpty(parentUrl)) {
-                    final String sessionId = urlQueue.getSessionId();
-                    final String pageEnc = getParentEncoding(parentUrl, sessionId);
-                    if (pageEnc != null) {
-                        enc = pageEnc;
-                    } else if (urlQueue.getEncoding() != null) {
-                        enc = urlQueue.getEncoding();
-                    }
-                }
-            }
-        } else {
-            enc = fessConfig.getCrawlerDocumentFileNameEncoding();
-        }
-
-        final String escapedUrl = escapePlus ? url.replace("+", "%2B") : url;
-        try {
-            return URLDecoder.decode(escapedUrl, enc);
-        } catch (final Exception e) {
-            return url;
-        }
-    }
-
-    protected String getParentEncoding(final String parentUrl, final String sessionId) {
-        final String key = sessionId + ":" + parentUrl;
-        String enc = parentEncodingMap.get(key);
-        if (enc != null) {
-            return enc;
-        }
-
-        final AccessResult<?> accessResult = ComponentUtil.getDataService().getAccessResult(sessionId, parentUrl);
-        if (accessResult != null) {
-            final AccessResultData<?> accessResultData = accessResult.getAccessResultData();
-            if (accessResultData != null && accessResultData.getEncoding() != null) {
-                enc = accessResultData.getEncoding();
-                parentEncodingMap.put(key, enc);
-                return enc;
-            }
-        }
-        return null;
-    }
-
     protected String getHostOnFile(final String url) {
         if (StringUtil.isBlank(url)) {
             return StringUtil.EMPTY; // empty
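
One detail that survives the move is the escapePlus flag: java.net.URLDecoder treats '+' as an encoded space, which is correct for query strings but wrong for file: URLs, where '+' is a literal character in a file name, so decodeUrlAsName escapes it to %2B before decoding. A minimal standalone sketch (class name and sample path are made up for illustration):

import java.net.URLDecoder;

public class DecodePlusDemo {
    public static void main(final String[] args) throws Exception {
        final String url = "file:/tmp/a+b%20c.txt"; // hypothetical file URL containing a literal '+'
        // Naive decoding turns '+' into a space and mangles the name:
        System.out.println(URLDecoder.decode(url, "UTF-8"));
        // -> file:/tmp/a b c.txt
        // Escaping '+' first, as decodeUrlAsName does when escapePlus is true, keeps it literal:
        System.out.println(URLDecoder.decode(url.replace("+", "%2B"), "UTF-8"));
        // -> file:/tmp/a+b c.txt
    }
}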

FessTransformer.java

@@ -17,11 +17,18 @@ package org.codelibs.fess.crawler.transformer;
 
+import java.net.URLDecoder;
 import java.util.Arrays;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.Map;
 
 import org.apache.commons.lang3.StringUtils;
+import org.codelibs.core.collection.LruHashMap;
 import org.codelibs.core.lang.StringUtil;
 import org.codelibs.fess.Constants;
+import org.codelibs.fess.crawler.entity.AccessResult;
+import org.codelibs.fess.crawler.entity.AccessResultData;
+import org.codelibs.fess.crawler.entity.UrlQueue;
+import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.GroovyUtil;
@@ -29,6 +36,8 @@ import org.slf4j.Logger;
 
 public interface FessTransformer {
 
+    public static Map<String, String> parentEncodingMap = Collections.synchronizedMap(new LruHashMap<String, String>(1000));
+
     FessConfig getFessConfig();
 
     Logger getLogger();
@@ -173,7 +182,7 @@ public interface FessTransformer {
             return StringUtil.EMPTY;
         }
 
-        String u = url;
+        String u = decodeUrlAsName(url, url.startsWith("file:"));
 
         int idx = u.lastIndexOf('?');
         if (idx >= 0) {
@@ -201,4 +210,56 @@ public interface FessTransformer {
         }
         return u;
     }
+
+    public default String decodeUrlAsName(final String url, final boolean escapePlus) {
+        if (url == null) {
+            return null;
+        }
+
+        final FessConfig fessConfig = getFessConfig();
+        String enc = Constants.UTF_8;
+        if (StringUtil.isBlank(fessConfig.getCrawlerDocumentFileNameEncoding())) {
+            final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
+            if (urlQueue != null) {
+                final String parentUrl = urlQueue.getParentUrl();
+                if (StringUtil.isNotEmpty(parentUrl)) {
+                    final String sessionId = urlQueue.getSessionId();
+                    final String pageEnc = getParentEncoding(parentUrl, sessionId);
+                    if (pageEnc != null) {
+                        enc = pageEnc;
+                    } else if (urlQueue.getEncoding() != null) {
+                        enc = urlQueue.getEncoding();
+                    }
+                }
+            }
+        } else {
+            enc = fessConfig.getCrawlerDocumentFileNameEncoding();
+        }
+
+        final String escapedUrl = escapePlus ? url.replace("+", "%2B") : url;
+        try {
+            return URLDecoder.decode(escapedUrl, enc);
+        } catch (final Exception e) {
+            return url;
+        }
+    }
+
+    public default String getParentEncoding(final String parentUrl, final String sessionId) {
+        final String key = sessionId + ":" + parentUrl;
+        String enc = parentEncodingMap.get(key);
+        if (enc != null) {
+            return enc;
+        }
+
+        final AccessResult<?> accessResult = ComponentUtil.getDataService().getAccessResult(sessionId, parentUrl);
+        if (accessResult != null) {
+            final AccessResultData<?> accessResultData = accessResult.getAccessResultData();
+            if (accessResultData != null && accessResultData.getEncoding() != null) {
+                enc = accessResultData.getEncoding();
+                parentEncodingMap.put(key, enc);
+                return enc;
+            }
+        }
+        return null;
+    }
 }
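
Two details worth noting in this hunk. A field declared on a Java interface is implicitly public static final, so parentEncodingMap becomes one cache shared by every FessTransformer implementation, where the old AbstractFessFileTransformer field was per instance. And LruHashMap is codelibs' bounded map; a rough stand-in built on the JDK's LinkedHashMap (the class and method names below are illustrative, not the codelibs implementation) behaves like this:

import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;

public class LruDemo {
    // Bounded LRU map: LinkedHashMap in access order, evicting the eldest entry past maxSize.
    static <K, V> Map<K, V> lruMap(final int maxSize) {
        return Collections.synchronizedMap(new LinkedHashMap<K, V>(16, 0.75f, true) {
            @Override
            protected boolean removeEldestEntry(final Map.Entry<K, V> eldest) {
                return size() > maxSize;
            }
        });
    }

    public static void main(final String[] args) {
        final Map<String, String> cache = lruMap(2);
        cache.put("a", "1");
        cache.put("b", "2");
        cache.get("a");      // touch "a" so "b" becomes the eldest entry
        cache.put("c", "3"); // evicts "b"
        System.out.println(cache.keySet()); // [a, c]
    }
}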

GitBucketDataStoreImpl.java

@@ -24,7 +24,6 @@ import java.util.Map;
 import java.util.function.Consumer;
 import java.util.stream.Collectors;
 
-import org.apache.commons.io.FilenameUtils;
 import org.codelibs.core.lang.StringUtil;
 import org.codelibs.elasticsearch.runner.net.Curl;
 import org.codelibs.elasticsearch.runner.net.CurlResponse;
@@ -189,7 +188,6 @@ public class GitBucketDataStoreImpl extends AbstractDataStoreImpl {
         dataMap.putAll(ComponentUtil.getDocumentHelper().processRequest(crawlingConfig, paramMap.get("crawlingInfoId"),
                 apiUrl + "?large_file=true"));
 
-        dataMap.put("title", FilenameUtils.getName(apiUrl));
         dataMap.put("url", viewUrl);
         dataMap.put("role", roleList);
 
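
The GitBucketDataStoreImpl change rides on the same cleanup: the explicit dataMap.put("title", FilenameUtils.getName(apiUrl)) is dropped, presumably because ComponentUtil.getDocumentHelper().processRequest(...) already supplies a title for the crawled file, which in turn lets the commons-io FilenameUtils import go.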

Crawler.java

@@ -331,7 +331,7 @@ public class Crawler {
             postcard.setDataCrawlEndTime(getValueFromMap(dataMap, "dataCrawlEndTime", StringUtil.EMPTY));
             postcard.setDataCrawlExecTime(getValueFromMap(dataMap, "dataCrawlExecTime", "0"));
             postcard.setDataCrawlStartTime(getValueFromMap(dataMap, "dataCrawlStartTime", StringUtil.EMPTY));
-            postcard.setDataFsIndexSize(getValueFromMap(dataMap, "dataFsIndexSize", "0"));
+            postcard.setDataIndexSize(getValueFromMap(dataMap, "dataIndexSize", "0"));
             postcard.setDataIndexExecTime(getValueFromMap(dataMap, "dataIndexExecTime", "0"));
             postcard.setHostname(getValueFromMap(dataMap, "hostname", StringUtil.EMPTY));
             postcard.setWebFsCrawlEndTime(getValueFromMap(dataMap, "webFsCrawlEndTime", StringUtil.EMPTY));

CrawlerPostcard.java

@@ -51,7 +51,7 @@ public class CrawlerPostcard extends LaTypicalPostcard {
     @Override
     protected String[] getPropertyNames() {
         return new String[] { "hostname", "webFsCrawlStartTime", "webFsCrawlEndTime", "webFsCrawlExecTime", "webFsIndexExecTime",
-                "webFsIndexSize", "dataCrawlStartTime", "dataCrawlEndTime", "dataCrawlExecTime", "dataIndexExecTime", "dataFsIndexSize",
+                "webFsIndexSize", "dataCrawlStartTime", "dataCrawlEndTime", "dataCrawlExecTime", "dataIndexExecTime", "dataIndexSize",
                 "crawlerStartTime", "crawlerEndTime", "crawlerExecTime", "status" };
     }
 
@@ -191,12 +191,12 @@ public class CrawlerPostcard extends LaTypicalPostcard {
     }
 
     /**
-     * Set the value of dataFsIndexSize, used in parameter comment. <br>
+     * Set the value of dataIndexSize, used in parameter comment. <br>
      * Even if empty string, treated as empty plainly. So "IF pmb != null" is false if empty.
-     * @param dataFsIndexSize The parameter value of dataFsIndexSize. (NotNull)
+     * @param dataIndexSize The parameter value of dataIndexSize. (NotNull)
      */
-    public void setDataFsIndexSize(String dataFsIndexSize) {
-        registerVariable("dataFsIndexSize", dataFsIndexSize);
+    public void setDataIndexSize(String dataIndexSize) {
+        registerVariable("dataIndexSize", dataIndexSize);
     }
 
     /**

crawler notification mail template (.dfmail)

@@ -23,7 +23,7 @@ Exec Time: /*pmb.dataCrawlExecTime:orElse('-')*/ ms
 
 --- Data Store Indexer ---
 Exec Time: /*pmb.dataIndexExecTime:orElse('-')*/ ms
-Num of Doc: /*pmb.dataFsIndexSize:orElse('-')*/ docs
+Num of Doc: /*pmb.dataIndexSize:orElse('-')*/ docs
 
 --- Total ---
 Start Time: /*pmb.crawlerStartTime:orElse('-')*/
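
Note how the dataFsIndexSize to dataIndexSize rename has to land in four places at once: the property list in getPropertyNames(), the setter that calls registerVariable(...), the caller in Crawler, and the /*pmb.dataIndexSize*/ placeholder in the template, since LastaFlute resolves pmb comments against the variables registered on the postcard. A missed Java caller fails at compile time; a stale template reference would only surface when the notification mail is rendered.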