fix #748 remove parameters from title
parent bc0caef065
commit 3d5972a5c8
3 changed files with 66 additions and 65 deletions
@@ -20,7 +20,6 @@ import static org.codelibs.core.stream.StreamUtil.stream;
import java.io.InputStream;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@@ -31,13 +30,11 @@ import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.codelibs.core.collection.LruHashMap;
import org.codelibs.core.io.SerializeUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.misc.Pair;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.client.smb.SmbClient;
import org.codelibs.fess.crawler.entity.AccessResult;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.entity.ResponseData;
@@ -71,8 +68,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im

private static final Logger logger = LoggerFactory.getLogger(AbstractFessFileTransformer.class);

public Map<String, String> parentEncodingMap = Collections.synchronizedMap(new LruHashMap<String, String>(1000));

protected Map<String, String> metaContentMapping;

protected FessConfig fessConfig;
@@ -232,6 +227,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
// title
final String fileName = getFileName(url, urlEncoding);
if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
if (url.endsWith("/")) {
if (StringUtil.isNotBlank(content)) {
@@ -244,12 +240,10 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
}
} else {
final String u = decodeUrlAsName(url, url.startsWith("file:"));
final int pos = u.lastIndexOf('/');
if (pos == -1) {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), u);
if (StringUtil.isBlank(fileName)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:")));
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), u.substring(pos + 1));
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName);
}
}
}
@@ -258,7 +252,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
// site
putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
// filename
final String fileName = getFileName(url, urlEncoding);
if (StringUtil.isNotBlank(fileName)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
}
@@ -358,57 +351,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
}
}

protected String decodeUrlAsName(final String url, final boolean escapePlus) {
if (url == null) {
return null;
}

String enc = Constants.UTF_8;
if (StringUtil.isBlank(fessConfig.getCrawlerDocumentFileNameEncoding())) {
final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
if (urlQueue != null) {
final String parentUrl = urlQueue.getParentUrl();
if (StringUtil.isNotEmpty(parentUrl)) {
final String sessionId = urlQueue.getSessionId();
final String pageEnc = getParentEncoding(parentUrl, sessionId);
if (pageEnc != null) {
enc = pageEnc;
} else if (urlQueue.getEncoding() != null) {
enc = urlQueue.getEncoding();
}
}
}
} else {
enc = fessConfig.getCrawlerDocumentFileNameEncoding();
}

final String escapedUrl = escapePlus ? url.replace("+", "%2B") : url;
try {
return URLDecoder.decode(escapedUrl, enc);
} catch (final Exception e) {
return url;
}
}

protected String getParentEncoding(final String parentUrl, final String sessionId) {
final String key = sessionId + ":" + parentUrl;
String enc = parentEncodingMap.get(key);
if (enc != null) {
return enc;
}

final AccessResult<?> accessResult = ComponentUtil.getDataService().getAccessResult(sessionId, parentUrl);
if (accessResult != null) {
final AccessResultData<?> accessResultData = accessResult.getAccessResultData();
if (accessResultData != null && accessResultData.getEncoding() != null) {
enc = accessResultData.getEncoding();
parentEncodingMap.put(key, enc);
return enc;
}
}
return null;
}

protected String getHostOnFile(final String url) {
if (StringUtil.isBlank(url)) {
return StringUtil.EMPTY; // empty
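End of the AbstractFessFileTransformer changes: the title is no longer built from a substring of the decoded URL (the old u/pos logic) but filled from getFileName(url, urlEncoding), which decodes the URL and drops any query string, i.e. it "removes parameters from the title". Below is a rough, self-contained sketch of that combined behavior; the class and method names (TitleSketch, deriveTitle) are invented for illustration and are not Fess code:

```java
import java.net.URLDecoder;

public class TitleSketch {

    // Hypothetical helper mirroring the new title logic: decode the URL,
    // drop the query string, then keep the last path segment.
    static String deriveTitle(final String url, final String encoding) throws Exception {
        final String decoded = URLDecoder.decode(url, encoding);
        String name = decoded;
        final int q = name.lastIndexOf('?');
        if (q >= 0) {
            name = name.substring(0, q); // "remove parameters from title"
        }
        final int slash = name.lastIndexOf('/');
        return slash >= 0 ? name.substring(slash + 1) : name;
    }

    public static void main(final String[] args) throws Exception {
        // Before the fix the query string could leak into the title; now it is stripped.
        System.out.println(deriveTitle("http://example.com/docs/report%20v2.pdf?version=3", "UTF-8"));
        // -> report v2.pdf
    }
}
```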
@@ -17,11 +17,18 @@ package org.codelibs.fess.crawler.transformer;

import java.net.URLDecoder;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.collection.LruHashMap;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.entity.AccessResult;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.GroovyUtil;
@@ -29,6 +36,8 @@ import org.slf4j.Logger;

public interface FessTransformer {

public static Map<String, String> parentEncodingMap = Collections.synchronizedMap(new LruHashMap<String, String>(1000));

FessConfig getFessConfig();

Logger getLogger();
@@ -173,7 +182,7 @@ public interface FessTransformer {
return StringUtil.EMPTY;
}

String u = url;
String u = decodeUrlAsName(url, url.startsWith("file:"));

int idx = u.lastIndexOf('?');
if (idx >= 0) {
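In getFileName above, decodeUrlAsName is called with escapePlus set only for file: URLs. The reason is that URLDecoder treats a literal "+" as an encoded space, so a plus sign in a local file name would be lost; pre-escaping it as %2B keeps it intact. A small standalone demonstration of that quirk (not Fess code; the sample path is invented):

```java
import java.net.URLDecoder;

public class PlusDecodingSketch {
    public static void main(final String[] args) throws Exception {
        final String url = "file:/data/C++%20notes.txt"; // hypothetical file URL containing '+'
        // Naive decoding: URLDecoder turns each '+' into a space, mangling the name.
        System.out.println(URLDecoder.decode(url, "UTF-8"));
        // Pre-escaping '+' as %2B first (what escapePlus does) preserves the plus signs.
        System.out.println(URLDecoder.decode(url.replace("+", "%2B"), "UTF-8")); // file:/data/C++ notes.txt
    }
}
```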
@@ -201,4 +210,56 @@ public interface FessTransformer {
}
return u;
}

public default String decodeUrlAsName(final String url, final boolean escapePlus) {
if (url == null) {
return null;
}

final FessConfig fessConfig = getFessConfig();
String enc = Constants.UTF_8;
if (StringUtil.isBlank(fessConfig.getCrawlerDocumentFileNameEncoding())) {
final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
if (urlQueue != null) {
final String parentUrl = urlQueue.getParentUrl();
if (StringUtil.isNotEmpty(parentUrl)) {
final String sessionId = urlQueue.getSessionId();
final String pageEnc = getParentEncoding(parentUrl, sessionId);
if (pageEnc != null) {
enc = pageEnc;
} else if (urlQueue.getEncoding() != null) {
enc = urlQueue.getEncoding();
}
}
}
} else {
enc = fessConfig.getCrawlerDocumentFileNameEncoding();
}

final String escapedUrl = escapePlus ? url.replace("+", "%2B") : url;
try {
return URLDecoder.decode(escapedUrl, enc);
} catch (final Exception e) {
return url;
}
}

public default String getParentEncoding(final String parentUrl, final String sessionId) {
final String key = sessionId + ":" + parentUrl;
String enc = parentEncodingMap.get(key);
if (enc != null) {
return enc;
}

final AccessResult<?> accessResult = ComponentUtil.getDataService().getAccessResult(sessionId, parentUrl);
if (accessResult != null) {
final AccessResultData<?> accessResultData = accessResult.getAccessResultData();
if (accessResultData != null && accessResultData.getEncoding() != null) {
enc = accessResultData.getEncoding();
parentEncodingMap.put(key, enc);
return enc;
}
}
return null;
}
}
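With this move into the interface, parentEncodingMap becomes one shared, size-bounded cache (an LruHashMap capped at 1000 entries) keyed by sessionId + ":" + parentUrl, so the parent document's encoding is fetched from the data service at most once per key. Below is a rough illustration of that look-up-then-cache pattern using a plain LinkedHashMap in place of codelibs' LruHashMap; fetchEncodingFromStore is a made-up stand-in for the ComponentUtil.getDataService().getAccessResult(...) lookup:

```java
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.Map;

public class ParentEncodingCacheSketch {

    // LRU map capped at 1000 entries, analogous to LruHashMap(1000) in the diff.
    private final Map<String, String> parentEncodingMap =
            Collections.synchronizedMap(new LinkedHashMap<String, String>(16, 0.75f, true) {
                @Override
                protected boolean removeEldestEntry(final Map.Entry<String, String> eldest) {
                    return size() > 1000;
                }
            });

    // Hypothetical stand-in for the data-service lookup; returns the encoding
    // recorded for the parent page, or null if unknown.
    protected String fetchEncodingFromStore(final String sessionId, final String parentUrl) {
        return null;
    }

    public String getParentEncoding(final String parentUrl, final String sessionId) {
        final String key = sessionId + ":" + parentUrl;
        String enc = parentEncodingMap.get(key);
        if (enc != null) {
            return enc; // cache hit
        }
        enc = fetchEncodingFromStore(sessionId, parentUrl);
        if (enc != null) {
            parentEncodingMap.put(key, enc); // cache for later children of the same page
        }
        return enc;
    }
}
```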
@@ -24,7 +24,6 @@ import java.util.Map;
import java.util.function.Consumer;
import java.util.stream.Collectors;

import org.apache.commons.io.FilenameUtils;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.elasticsearch.runner.net.Curl;
import org.codelibs.elasticsearch.runner.net.CurlResponse;
@@ -189,7 +188,6 @@ public class GitBucketDataStoreImpl extends AbstractDataStoreImpl {
dataMap.putAll(ComponentUtil.getDocumentHelper().processRequest(crawlingConfig, paramMap.get("crawlingInfoId"),
apiUrl + "?large_file=true"));

dataMap.put("title", FilenameUtils.getName(apiUrl));
dataMap.put("url", viewUrl);
dataMap.put("role", roleList);
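For context on the dataMap.put("title", ...) line above: commons-io's FilenameUtils.getName returns everything after the last path separator and does not treat "?" specially, so any query string would remain in the resulting name. A tiny standalone example (sample paths invented):

```java
import org.apache.commons.io.FilenameUtils;

public class FilenameSketch {
    public static void main(final String[] args) {
        // FilenameUtils.getName keeps only the last path segment of a path-like string.
        System.out.println(FilenameUtils.getName("repo/src/main/java/App.java")); // App.java
        // Unlike the transformer's getFileName, it does not strip a query string.
        System.out.println(FilenameUtils.getName("docs/readme.md?ref=main"));     // readme.md?ref=main
    }
}
```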