Explorar el Código

fix #445 : use TextUtil

Shinsuke Sugaya hace 9 años
padre
commit
6ff0658e4e
Se ha modificado 1 fichero con 3 adiciones y 53 borrados
  1. 3 53
      src/main/java/org/codelibs/fess/helper/DocumentHelper.java

+ 3 - 53
src/main/java/org/codelibs/fess/helper/DocumentHelper.java

@@ -21,18 +21,14 @@ import java.util.Map;
 import org.apache.commons.lang3.StringUtils;
 import org.codelibs.core.lang.StringUtil;
 import org.codelibs.fess.crawler.entity.ResponseData;
-import org.codelibs.fess.crawler.util.UnsafeStringBuilder;
+import org.codelibs.fess.crawler.util.TextUtil;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 
 public class DocumentHelper implements Serializable {
 
     private static final long serialVersionUID = 1L;
 
-    private static final Logger logger = LoggerFactory.getLogger(DocumentHelper.class);
-
     public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
         if (content == null) {
             return StringUtil.EMPTY; // empty
@@ -40,53 +36,7 @@ public class DocumentHelper implements Serializable {
 
         final int maxAlphanumTermSize = getMaxAlphanumTermSize();
         final int maxSymbolTermSize = getMaxSymbolTermSize();
-        final UnsafeStringBuilder buf = new UnsafeStringBuilder(content.length());
-        boolean isSpace = false;
-        int alphanumSize = 0;
-        int symbolSize = 0;
-        for (int i = 0; i < content.length(); i++) {
-            final char c = content.charAt(i);
-            if (Character.isISOControl(c) || c == '\u0020' || c == '\u3000' || c == 65533) {
-                // space
-                if (!isSpace) {
-                    buf.append(' ');
-                    isSpace = true;
-                }
-                alphanumSize = 0;
-                symbolSize = 0;
-            } else if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) {
-                // alphanum
-                if (maxAlphanumTermSize >= 0) {
-                    if (alphanumSize < maxAlphanumTermSize) {
-                        buf.append(c);
-                    }
-                    alphanumSize++;
-                } else {
-                    buf.append(c);
-                }
-                isSpace = false;
-                symbolSize = 0;
-            } else if ((c >= '!' && c <= '/') || (c >= ':' && c <= '@') || (c >= '[' && c <= '`') || (c >= '{' && c <= '~')) {
-                // symbol
-                if (maxSymbolTermSize >= 0) {
-                    if (symbolSize < maxSymbolTermSize) {
-                        buf.append(c);
-                    }
-                    symbolSize++;
-                } else {
-                    buf.append(c);
-                }
-                isSpace = false;
-                alphanumSize = 0;
-            } else {
-                buf.append(c);
-                isSpace = false;
-                alphanumSize = 0;
-                symbolSize = 0;
-            }
-        }
-
-        return buf.toUnsafeString().trim();
+        return TextUtil.normalizeText(content, content.length(), maxAlphanumTermSize, maxSymbolTermSize);
     }
 
     protected int getMaxAlphanumTermSize() {
@@ -111,7 +61,7 @@ public class DocumentHelper implements Serializable {
             subContent = content.substring(0, maxWidth * 2);
         }
 
-        final String originalStr = subContent.replaceAll("[\u0000-\u0020\u007f\u3000]+", " ").trim();
+        final String originalStr = TextUtil.normalizeText(subContent, subContent.length(), -1, -1);
         return StringUtils.abbreviate(originalStr, maxWidth);
     }
 }