fix #445 : use TextUtil

This commit is contained in:
Shinsuke Sugaya 2016-03-21 10:40:03 +09:00
parent d8224b5cea
commit 6ff0658e4e

View file

@ -21,18 +21,14 @@ import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.util.UnsafeStringBuilder;
import org.codelibs.fess.crawler.util.TextUtil;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class DocumentHelper implements Serializable {
private static final long serialVersionUID = 1L;
private static final Logger logger = LoggerFactory.getLogger(DocumentHelper.class);
public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
if (content == null) {
return StringUtil.EMPTY; // empty
@ -40,53 +36,7 @@ public class DocumentHelper implements Serializable {
final int maxAlphanumTermSize = getMaxAlphanumTermSize();
final int maxSymbolTermSize = getMaxSymbolTermSize();
final UnsafeStringBuilder buf = new UnsafeStringBuilder(content.length());
boolean isSpace = false;
int alphanumSize = 0;
int symbolSize = 0;
for (int i = 0; i < content.length(); i++) {
final char c = content.charAt(i);
if (Character.isISOControl(c) || c == '\u0020' || c == '\u3000' || c == 65533) {
// space
if (!isSpace) {
buf.append(' ');
isSpace = true;
}
alphanumSize = 0;
symbolSize = 0;
} else if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
// alphanum
if (maxAlphanumTermSize >= 0) {
if (alphanumSize < maxAlphanumTermSize) {
buf.append(c);
}
alphanumSize++;
} else {
buf.append(c);
}
isSpace = false;
symbolSize = 0;
} else if ((c >= '!' && c <= '/') || (c >= ':' && c <= '@') || (c >= '[' && c <= '`') || (c >= '{' && c <= '~')) {
// symbol
if (maxSymbolTermSize >= 0) {
if (symbolSize < maxSymbolTermSize) {
buf.append(c);
}
symbolSize++;
} else {
buf.append(c);
}
isSpace = false;
alphanumSize = 0;
} else {
buf.append(c);
isSpace = false;
alphanumSize = 0;
symbolSize = 0;
}
}
return buf.toUnsafeString().trim();
return TextUtil.normalizeText(content, content.length(), maxAlphanumTermSize, maxSymbolTermSize);
}
protected int getMaxAlphanumTermSize() {
@ -111,7 +61,7 @@ public class DocumentHelper implements Serializable {
subContent = content.substring(0, maxWidth * 2);
}
final String originalStr = subContent.replaceAll("[\u0000-\u0020\u007f\u3000]+", " ").trim();
final String originalStr = TextUtil.normalizeText(subContent, subContent.length(), -1, -1);
return StringUtils.abbreviate(originalStr, maxWidth);
}
}