fix #602 remove \u00b6

This commit is contained in:
Shinsuke Sugaya 2016-08-18 15:58:02 +09:00
parent 9b37890dde
commit 0a0e253cc1
8 changed files with 212 additions and 140 deletions

View file

@ -15,6 +15,9 @@
*/
package org.codelibs.fess.helper;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
@ -25,7 +28,6 @@ import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
public class DocumentHelper {
public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
if (content == null) {
return StringUtil.EMPTY; // empty
@ -34,7 +36,13 @@ public class DocumentHelper {
final int maxAlphanumTermSize = getMaxAlphanumTermSize();
final int maxSymbolTermSize = getMaxSymbolTermSize();
final boolean duplicateTermRemoved = isDuplicateTermRemoved();
return TextUtil.normalizeText(content, content.length(), maxAlphanumTermSize, maxSymbolTermSize, duplicateTermRemoved);
final int[] spaceChars = getSpaceChars();
try (final Reader reader = new StringReader(content)) {
return TextUtil.normalizeText(reader).initialCapacity(content.length()).maxAlphanumTermSize(maxAlphanumTermSize)
.maxSymbolTermSize(maxSymbolTermSize).duplicateTermRemoved(duplicateTermRemoved).spaceChars(spaceChars).execute();
} catch (final IOException e) {
return StringUtil.EMPTY; // empty
}
}
protected int getMaxAlphanumTermSize() {
@ -52,6 +60,11 @@ public class DocumentHelper {
return fessConfig.isCrawlerDocumentDuplicateTermRemoved();
}
/**
 * Returns the code points configured as space characters for document text.
 *
 * @return the array obtained from {@code FessConfig#getCrawlerDocumentSpaceCharsAsArray()}
 */
protected int[] getSpaceChars() {
    return ComponentUtil.getFessConfig().getCrawlerDocumentSpaceCharsAsArray();
}
public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {
if (content == null) {
return StringUtil.EMPTY; // empty
@ -64,7 +77,12 @@ public class DocumentHelper {
subContent = content.substring(0, maxWidth * 2);
}
final String originalStr = TextUtil.normalizeText(subContent, subContent.length(), -1, -1, false);
return StringUtils.abbreviate(originalStr, maxWidth);
final int[] spaceChars = getSpaceChars();
try (final Reader reader = new StringReader(subContent)) {
final String originalStr = TextUtil.normalizeText(reader).initialCapacity(content.length()).spaceChars(spaceChars).execute();
return StringUtils.abbreviate(originalStr, maxWidth);
} catch (final IOException e) {
return StringUtil.EMPTY; // empty
}
}
}

View file

@ -15,13 +15,13 @@
*/
package org.codelibs.fess.mylasta.action;
import org.lastaflute.web.ruts.message.ActionMessages;
import org.lastaflute.core.message.UserMessages;
/**
* The keys for message.
* @author FreeGen
*/
public class FessLabels extends ActionMessages {
public class FessLabels extends UserMessages {
/** The serial version UID for object serialization. (Default) */
private static final long serialVersionUID = 1L;

View file

@ -140,6 +140,11 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. false */
String CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED = "crawler.document.duplicate.term.removed";
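/** The key of the configuration. e.g. the code points U+0009 U+000A U+000B U+000C U+000D U+001C U+001D U+001E U+001F U+0020 U+00A0 U+1680 U+180E U+2000 U+2001 U+2002 U+2003 U+2004 U+2005 U+2006 U+2007 U+2008 U+2009 U+200A U+200B U+202F U+205F U+3000 U+FEFF U+FFFD U+00B6 */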
String CRAWLER_DOCUMENT_SPACE_CHARS = "crawler.document.space.chars";
/** The key of the configuration. e.g. UTF-8 */
String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
@ -361,7 +366,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. */
String INDEX_ADMIN_INTEGER_FIELDS = "index.admin.integer.fields";
/** The key of the configuration. e.g. favorite_count,click_count */
/** The key of the configuration. e.g. content_length,favorite_count,click_count */
String INDEX_ADMIN_LONG_FIELDS = "index.admin.long.fields";
/** The key of the configuration. e.g. boost */
@ -370,7 +375,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/** The key of the configuration. e.g. */
String INDEX_ADMIN_DOUBLE_FIELDS = "index.admin.double.fields";
/** The key of the configuration. e.g. doc_id,url,title,role */
/** The key of the configuration. e.g. doc_id,url,title,role,boost */
String INDEX_ADMIN_REQUIRED_FIELDS = "index.admin.required.fields";
/** The key of the configuration. e.g. 3m */
@ -1339,6 +1344,15 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
*/
boolean isCrawlerDocumentDuplicateTermRemoved();
/**
* Get the value for the key 'crawler.document.space.chars'. <br>
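* The value is, e.g. the code points U+0009 U+000A U+000B U+000C U+000D U+001C U+001D U+001E U+001F U+0020 U+00A0 U+1680 U+180E U+2000 U+2001 U+2002 U+2003 U+2004 U+2005 U+2006 U+2007 U+2008 U+2009 U+200A U+200B U+202F U+205F U+3000 U+FEFF U+FFFD U+00B6 <br>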
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getCrawlerDocumentSpaceChars();
/**
* Get the value for the key 'crawler.crawling.data.encoding'. <br>
* The value is, e.g. UTF-8 <br>
@ -2041,7 +2055,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/**
* Get the value for the key 'index.admin.long.fields'. <br>
* The value is, e.g. favorite_count,click_count <br>
* The value is, e.g. content_length,favorite_count,click_count <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getIndexAdminLongFields();
@ -2070,7 +2084,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
/**
* Get the value for the key 'index.admin.required.fields'. <br>
* The value is, e.g. doc_id,url,title,role <br>
* The value is, e.g. doc_id,url,title,role,boost <br>
* @return The value of found property. (NotNull: if not found, exception but basically no way)
*/
String getIndexAdminRequiredFields();
@ -4259,6 +4273,10 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
return is(FessConfig.CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED);
}
/** Looks up the value stored under the key 'crawler.document.space.chars'. */
public String getCrawlerDocumentSpaceChars() {
    final String spaceChars = get(FessConfig.CRAWLER_DOCUMENT_SPACE_CHARS);
    return spaceChars;
}
/** Looks up the value stored under the key 'crawler.crawling.data.encoding'. */
public String getCrawlerCrawlingDataEncoding() {
    final String encoding = get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING);
    return encoding;
}

View file

@ -55,6 +55,8 @@ import org.lastaflute.web.validation.theme.typed.LongTypeValidator;
public interface FessProp {
public static final String CRAWLER_DOCUMENT_SPACE_CHARS = "crawlerDocumentSpaceChars";
public static final String INDEX_ADMIN_ARRAY_FIELD_SET = "indexAdminArrayFieldSet";
public static final String INDEX_ADMIN_DATE_FIELD_SET = "indexAdminDateFieldSet";
@ -1303,4 +1305,18 @@ public interface FessProp {
return requiredValidator.isValid(value, null);
}
String getCrawlerDocumentSpaceChars();
/**
 * Returns the configured space characters as an array of Unicode code points.
 * <p>
 * The array is built once from {@link #getCrawlerDocumentSpaceChars()} and then stored in
 * {@code propMap} under {@code CRAWLER_DOCUMENT_SPACE_CHARS}; later calls return the stored
 * array until that entry is removed from {@code propMap}.
 * <p>
 * The string is split by code point, not by UTF-16 code unit, so a supplementary character
 * (encoded as a surrogate pair) contributes exactly one element. Indexing {@code codePointAt}
 * by char position would otherwise emit the pair's low surrogate as a spurious extra entry.
 *
 * @return the code points of the configured space-character string, in string order
 */
public default int[] getCrawlerDocumentSpaceCharsAsArray() {
    int[] spaceChars = (int[]) propMap.get(CRAWLER_DOCUMENT_SPACE_CHARS);
    if (spaceChars == null) {
        // Read the property once and let the JDK handle surrogate pairs.
        spaceChars = getCrawlerDocumentSpaceChars().codePoints().toArray();
        propMap.put(CRAWLER_DOCUMENT_SPACE_CHARS, spaceChars);
    }
    return spaceChars;
}
}

View file

@ -16,6 +16,7 @@
package org.codelibs.fess.thumbnail.impl;
import static org.codelibs.core.stream.StreamUtil.stream;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;

View file

@ -85,6 +85,7 @@ crawler.document.append.data=true
crawler.document.max.alphanum.term.size=20
crawler.document.max.symbol.term.size=10
crawler.document.duplicate.term.removed=false
crawler.document.space.chars=\u0009\u000A\u000B\u000C\u000D\u001C\u001D\u001E\u001F\u0020\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u3000\uFEFF\uFFFD\u00B6
crawler.crawling.data.encoding=UTF-8
crawler.web.protocols=http,https
crawler.file.protocols=file,smb,ftp

View file

@ -32,6 +32,7 @@ public class FessPropTest extends UnitFessTestCase {
}
public void test_maxUsernameLength() throws IOException {
FessProp.propMap.clear();
FessConfig fessConfig = new FessConfig.SimpleImpl() {
@Override
public Integer getLdapMaxUsernameLengthAsInteger() {
@ -52,6 +53,7 @@ public class FessPropTest extends UnitFessTestCase {
}
public void test_maxUsernameLength10() throws IOException {
FessProp.propMap.clear();
FessConfig fessConfig = new FessConfig.SimpleImpl() {
@Override
public Integer getLdapMaxUsernameLengthAsInteger() {
@ -73,6 +75,7 @@ public class FessPropTest extends UnitFessTestCase {
}
public void test_validateIndexRequiredFields() {
FessProp.propMap.clear();
FessConfig fessConfig = new FessConfig.SimpleImpl() {
@Override
public String getIndexAdminRequiredFields() {
@ -100,4 +103,19 @@ public class FessPropTest extends UnitFessTestCase {
source.put("bbb", "a");
assertTrue(fessConfig.validateIndexRequiredFields(source));
}
/** Checks that a two-character configured string is returned as its two code points, in order. */
public void test_getCrawlerDocumentSpaceCharsAsArray() {
    // Empty the shared property map first so the stubbed getter below is consulted.
    FessProp.propMap.clear();
    final FessConfig config = new FessConfig.SimpleImpl() {
        @Override
        public String getCrawlerDocumentSpaceChars() {
            return "\u0020\u3000";
        }
    };

    final int[] codePoints = config.getCrawlerDocumentSpaceCharsAsArray();

    assertEquals(2, codePoints.length);
    assertEquals(0x20, codePoints[0]); // U+0020
    assertEquals(0x3000, codePoints[1]); // U+3000
}
}