|
@@ -21,18 +21,14 @@ import java.util.Map;
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
import org.codelibs.core.lang.StringUtil;
|
|
|
import org.codelibs.fess.crawler.entity.ResponseData;
|
|
|
-import org.codelibs.fess.crawler.util.UnsafeStringBuilder;
|
|
|
+import org.codelibs.fess.crawler.util.TextUtil;
|
|
|
import org.codelibs.fess.mylasta.direction.FessConfig;
|
|
|
import org.codelibs.fess.util.ComponentUtil;
|
|
|
-import org.slf4j.Logger;
|
|
|
-import org.slf4j.LoggerFactory;
|
|
|
|
|
|
public class DocumentHelper implements Serializable {
|
|
|
|
|
|
private static final long serialVersionUID = 1L;
|
|
|
|
|
|
- private static final Logger logger = LoggerFactory.getLogger(DocumentHelper.class);
|
|
|
-
|
|
|
public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
|
|
|
if (content == null) {
|
|
|
return StringUtil.EMPTY; // empty
|
|
@@ -40,53 +36,7 @@ public class DocumentHelper implements Serializable {
|
|
|
|
|
|
final int maxAlphanumTermSize = getMaxAlphanumTermSize();
|
|
|
final int maxSymbolTermSize = getMaxSymbolTermSize();
|
|
|
- final UnsafeStringBuilder buf = new UnsafeStringBuilder(content.length());
|
|
|
- boolean isSpace = false;
|
|
|
- int alphanumSize = 0;
|
|
|
- int symbolSize = 0;
|
|
|
- for (int i = 0; i < content.length(); i++) {
|
|
|
- final char c = content.charAt(i);
|
|
|
- if (Character.isISOControl(c) || c == '\u0020' || c == '\u3000' || c == 65533) {
|
|
|
- // space
|
|
|
- if (!isSpace) {
|
|
|
- buf.append(' ');
|
|
|
- isSpace = true;
|
|
|
- }
|
|
|
- alphanumSize = 0;
|
|
|
- symbolSize = 0;
|
|
|
- } else if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
|
|
|
- // alphanum
|
|
|
- if (maxAlphanumTermSize >= 0) {
|
|
|
- if (alphanumSize < maxAlphanumTermSize) {
|
|
|
- buf.append(c);
|
|
|
- }
|
|
|
- alphanumSize++;
|
|
|
- } else {
|
|
|
- buf.append(c);
|
|
|
- }
|
|
|
- isSpace = false;
|
|
|
- symbolSize = 0;
|
|
|
- } else if ((c >= '!' && c <= '/') || (c >= ':' && c <= '@') || (c >= '[' && c <= '`') || (c >= '{' && c <= '~')) {
|
|
|
- // symbol
|
|
|
- if (maxSymbolTermSize >= 0) {
|
|
|
- if (symbolSize < maxSymbolTermSize) {
|
|
|
- buf.append(c);
|
|
|
- }
|
|
|
- symbolSize++;
|
|
|
- } else {
|
|
|
- buf.append(c);
|
|
|
- }
|
|
|
- isSpace = false;
|
|
|
- alphanumSize = 0;
|
|
|
- } else {
|
|
|
- buf.append(c);
|
|
|
- isSpace = false;
|
|
|
- alphanumSize = 0;
|
|
|
- symbolSize = 0;
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- return buf.toUnsafeString().trim();
|
|
|
+ return TextUtil.normalizeText(content, content.length(), maxAlphanumTermSize, maxSymbolTermSize);
|
|
|
}
|
|
|
|
|
|
protected int getMaxAlphanumTermSize() {
|
|
@@ -111,7 +61,7 @@ public class DocumentHelper implements Serializable {
|
|
|
subContent = content.substring(0, maxWidth * 2);
|
|
|
}
|
|
|
|
|
|
- final String originalStr = subContent.replaceAll("[\u0000-\u0020\u007f\u3000]+", " ").trim();
|
|
|
+ final String originalStr = TextUtil.normalizeText(subContent, subContent.length(), -1, -1);
|
|
|
return StringUtils.abbreviate(originalStr, maxWidth);
|
|
|
}
|
|
|
}
|