fix #748 remove parameters from title

This commit is contained in:
Shinsuke Sugaya 2016-10-19 06:41:16 +09:00
parent bc0caef065
commit 3d5972a5c8
3 changed files with 66 additions and 65 deletions

View file

@ -20,7 +20,6 @@ import static org.codelibs.core.stream.StreamUtil.stream;
import java.io.InputStream;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
@ -31,13 +30,11 @@ import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import org.apache.tika.metadata.HttpHeaders;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.codelibs.core.collection.LruHashMap;
import org.codelibs.core.io.SerializeUtil;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.core.misc.Pair;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.client.smb.SmbClient;
import org.codelibs.fess.crawler.entity.AccessResult;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.entity.ResponseData;
@ -71,8 +68,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
private static final Logger logger = LoggerFactory.getLogger(AbstractFessFileTransformer.class);
public Map<String, String> parentEncodingMap = Collections.synchronizedMap(new LruHashMap<String, String>(1000));
protected Map<String, String> metaContentMapping;
protected FessConfig fessConfig;
@ -232,6 +227,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(),
documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
// title
final String fileName = getFileName(url, urlEncoding);
if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
if (url.endsWith("/")) {
if (StringUtil.isNotBlank(content)) {
@ -244,12 +240,10 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
}
} else {
final String u = decodeUrlAsName(url, url.startsWith("file:"));
final int pos = u.lastIndexOf('/');
if (pos == -1) {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), u);
if (StringUtil.isBlank(fileName)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:")));
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), u.substring(pos + 1));
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName);
}
}
}
@ -258,7 +252,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
// site
putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
// filename
final String fileName = getFileName(url, urlEncoding);
if (StringUtil.isNotBlank(fileName)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
}
@ -358,57 +351,6 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
}
}
/**
 * Decodes a URL-encoded string so it can be used as a human-readable name.
 *
 * <p>The charset used for decoding is chosen in this order:
 * <ol>
 * <li>the configured crawler document file name encoding, when it is not blank;</li>
 * <li>otherwise, when the current URL queue has a non-empty parent URL, the
 * encoding returned by {@link #getParentEncoding(String, String)} for that parent;</li>
 * <li>otherwise (still only when a parent URL exists) the URL queue's own encoding,
 * when it is not null;</li>
 * <li>otherwise UTF-8.</li>
 * </ol>
 *
 * @param url the URL-encoded text; may be {@code null}
 * @param escapePlus when {@code true}, every literal {@code +} is rewritten to
 *        {@code %2B} before decoding, so it is not decoded into a space
 * @return the decoded text, {@code null} when {@code url} is {@code null}, or
 *         {@code url} unchanged when decoding fails
 */
protected String decodeUrlAsName(final String url, final boolean escapePlus) {
    if (url == null) {
        return null;
    }

    String charsetName = Constants.UTF_8;
    if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileNameEncoding())) {
        // An explicitly configured encoding always wins.
        charsetName = fessConfig.getCrawlerDocumentFileNameEncoding();
    } else {
        final UrlQueue<?> queue = CrawlingParameterUtil.getUrlQueue();
        final String parent = queue == null ? null : queue.getParentUrl();
        // isNotEmpty(null) is false, so queue is non-null inside this branch.
        if (StringUtil.isNotEmpty(parent)) {
            final String parentEnc = getParentEncoding(parent, queue.getSessionId());
            if (parentEnc != null) {
                charsetName = parentEnc;
            } else if (queue.getEncoding() != null) {
                charsetName = queue.getEncoding();
            }
        }
    }

    final String target;
    if (escapePlus) {
        target = url.replace("+", "%2B");
    } else {
        target = url;
    }

    try {
        return URLDecoder.decode(target, charsetName);
    } catch (final Exception ignored) {
        // Best effort: an unknown charset or malformed escape leaves the name undecoded.
        return url;
    }
}
/**
 * Resolves the encoding recorded for a parent URL within a crawling session.
 *
 * <p>{@code parentEncodingMap} is consulted first, using the key
 * {@code sessionId + ":" + parentUrl}. On a miss, the access result for the
 * session and parent URL is fetched from the data service; when it carries a
 * non-null encoding, that value is stored in the map and returned.
 *
 * @param parentUrl the URL of the parent page
 * @param sessionId the crawling session id
 * @return the parent's encoding, or {@code null} when none could be determined
 */
protected String getParentEncoding(final String parentUrl, final String sessionId) {
    final String cacheKey = sessionId + ":" + parentUrl;
    final String cached = parentEncodingMap.get(cacheKey);
    if (cached != null) {
        return cached;
    }

    final AccessResult<?> result = ComponentUtil.getDataService().getAccessResult(sessionId, parentUrl);
    if (result == null) {
        return null;
    }
    final AccessResultData<?> data = result.getAccessResultData();
    if (data == null || data.getEncoding() == null) {
        return null;
    }

    // Only successful lookups are remembered; misses are retried on the next call.
    final String encoding = data.getEncoding();
    parentEncodingMap.put(cacheKey, encoding);
    return encoding;
}
protected String getHostOnFile(final String url) {
if (StringUtil.isBlank(url)) {
return StringUtil.EMPTY; // empty

View file

@ -17,11 +17,18 @@ package org.codelibs.fess.crawler.transformer;
import java.net.URLDecoder;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.collection.LruHashMap;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.entity.AccessResult;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.GroovyUtil;
@ -29,6 +36,8 @@ import org.slf4j.Logger;
public interface FessTransformer {
public static Map<String, String> parentEncodingMap = Collections.synchronizedMap(new LruHashMap<String, String>(1000));
FessConfig getFessConfig();
Logger getLogger();
@ -173,7 +182,7 @@ public interface FessTransformer {
return StringUtil.EMPTY;
}
String u = url;
String u = decodeUrlAsName(url, url.startsWith("file:"));
int idx = u.lastIndexOf('?');
if (idx >= 0) {
@ -201,4 +210,56 @@ public interface FessTransformer {
}
return u;
}
/**
 * Decodes a URL-encoded string so it can be used as a human-readable name.
 *
 * <p>The charset used for decoding is chosen in this order:
 * <ol>
 * <li>the configured crawler document file name encoding, when it is not blank;</li>
 * <li>otherwise, when the current URL queue has a non-empty parent URL, the
 * encoding returned by {@link #getParentEncoding(String, String)} for that parent;</li>
 * <li>otherwise (still only when a parent URL exists) the URL queue's own encoding,
 * when it is not null;</li>
 * <li>otherwise UTF-8.</li>
 * </ol>
 *
 * @param url the URL-encoded text; may be {@code null}
 * @param escapePlus when {@code true}, every literal {@code +} is rewritten to
 *        {@code %2B} before decoding, so it is not decoded into a space
 * @return the decoded text, {@code null} when {@code url} is {@code null}, or
 *         {@code url} unchanged when decoding fails
 */
public default String decodeUrlAsName(final String url, final boolean escapePlus) {
    if (url == null) {
        return null;
    }

    final FessConfig config = getFessConfig();
    String charsetName = Constants.UTF_8;
    if (StringUtil.isNotBlank(config.getCrawlerDocumentFileNameEncoding())) {
        // An explicitly configured encoding always wins.
        charsetName = config.getCrawlerDocumentFileNameEncoding();
    } else {
        final UrlQueue<?> queue = CrawlingParameterUtil.getUrlQueue();
        final String parent = queue == null ? null : queue.getParentUrl();
        // isNotEmpty(null) is false, so queue is non-null inside this branch.
        if (StringUtil.isNotEmpty(parent)) {
            final String parentEnc = getParentEncoding(parent, queue.getSessionId());
            if (parentEnc != null) {
                charsetName = parentEnc;
            } else if (queue.getEncoding() != null) {
                charsetName = queue.getEncoding();
            }
        }
    }

    final String target;
    if (escapePlus) {
        target = url.replace("+", "%2B");
    } else {
        target = url;
    }

    try {
        return URLDecoder.decode(target, charsetName);
    } catch (final Exception ignored) {
        // Best effort: an unknown charset or malformed escape leaves the name undecoded.
        return url;
    }
}
/**
 * Resolves the encoding recorded for a parent URL within a crawling session.
 *
 * <p>{@code parentEncodingMap} is consulted first, using the key
 * {@code sessionId + ":" + parentUrl}. On a miss, the access result for the
 * session and parent URL is fetched from the data service; when it carries a
 * non-null encoding, that value is stored in the map and returned.
 *
 * @param parentUrl the URL of the parent page
 * @param sessionId the crawling session id
 * @return the parent's encoding, or {@code null} when none could be determined
 */
public default String getParentEncoding(final String parentUrl, final String sessionId) {
    final String cacheKey = sessionId + ":" + parentUrl;
    final String cached = parentEncodingMap.get(cacheKey);
    if (cached != null) {
        return cached;
    }

    final AccessResult<?> result = ComponentUtil.getDataService().getAccessResult(sessionId, parentUrl);
    if (result == null) {
        return null;
    }
    final AccessResultData<?> data = result.getAccessResultData();
    if (data == null || data.getEncoding() == null) {
        return null;
    }

    // Only successful lookups are remembered; misses are retried on the next call.
    final String encoding = data.getEncoding();
    parentEncodingMap.put(cacheKey, encoding);
    return encoding;
}
}

View file

@ -24,7 +24,6 @@ import java.util.Map;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import org.apache.commons.io.FilenameUtils;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.elasticsearch.runner.net.Curl;
import org.codelibs.elasticsearch.runner.net.CurlResponse;
@ -189,7 +188,6 @@ public class GitBucketDataStoreImpl extends AbstractDataStoreImpl {
dataMap.putAll(ComponentUtil.getDocumentHelper().processRequest(crawlingConfig, paramMap.get("crawlingInfoId"),
apiUrl + "?large_file=true"));
dataMap.put("title", FilenameUtils.getName(apiUrl));
dataMap.put("url", viewUrl);
dataMap.put("role", roleList);