fix #602 remove \u00b6
This commit is contained in:
parent
9b37890dde
commit
0a0e253cc1
8 changed files with 212 additions and 140 deletions
|
@ -15,6 +15,9 @@
|
|||
*/
|
||||
package org.codelibs.fess.helper;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.io.StringReader;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -25,7 +28,6 @@ import org.codelibs.fess.mylasta.direction.FessConfig;
|
|||
import org.codelibs.fess.util.ComponentUtil;
|
||||
|
||||
public class DocumentHelper {
|
||||
|
||||
public String getContent(final ResponseData responseData, final String content, final Map<String, Object> dataMap) {
|
||||
if (content == null) {
|
||||
return StringUtil.EMPTY; // empty
|
||||
|
@ -34,7 +36,13 @@ public class DocumentHelper {
|
|||
final int maxAlphanumTermSize = getMaxAlphanumTermSize();
|
||||
final int maxSymbolTermSize = getMaxSymbolTermSize();
|
||||
final boolean duplicateTermRemoved = isDuplicateTermRemoved();
|
||||
return TextUtil.normalizeText(content, content.length(), maxAlphanumTermSize, maxSymbolTermSize, duplicateTermRemoved);
|
||||
final int[] spaceChars = getSpaceChars();
|
||||
try (final Reader reader = new StringReader(content)) {
|
||||
return TextUtil.normalizeText(reader).initialCapacity(content.length()).maxAlphanumTermSize(maxAlphanumTermSize)
|
||||
.maxSymbolTermSize(maxSymbolTermSize).duplicateTermRemoved(duplicateTermRemoved).spaceChars(spaceChars).execute();
|
||||
} catch (final IOException e) {
|
||||
return StringUtil.EMPTY; // empty
|
||||
}
|
||||
}
|
||||
|
||||
protected int getMaxAlphanumTermSize() {
|
||||
|
@ -52,6 +60,11 @@ public class DocumentHelper {
|
|||
return fessConfig.isCrawlerDocumentDuplicateTermRemoved();
|
||||
}
|
||||
|
||||
protected int[] getSpaceChars() {
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
return fessConfig.getCrawlerDocumentSpaceCharsAsArray();
|
||||
}
|
||||
|
||||
public String getDigest(final ResponseData responseData, final String content, final Map<String, Object> dataMap, final int maxWidth) {
|
||||
if (content == null) {
|
||||
return StringUtil.EMPTY; // empty
|
||||
|
@ -64,7 +77,12 @@ public class DocumentHelper {
|
|||
subContent = content.substring(0, maxWidth * 2);
|
||||
}
|
||||
|
||||
final String originalStr = TextUtil.normalizeText(subContent, subContent.length(), -1, -1, false);
|
||||
return StringUtils.abbreviate(originalStr, maxWidth);
|
||||
final int[] spaceChars = getSpaceChars();
|
||||
try (final Reader reader = new StringReader(subContent)) {
|
||||
final String originalStr = TextUtil.normalizeText(reader).initialCapacity(content.length()).spaceChars(spaceChars).execute();
|
||||
return StringUtils.abbreviate(originalStr, maxWidth);
|
||||
} catch (final IOException e) {
|
||||
return StringUtil.EMPTY; // empty
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -15,13 +15,13 @@
|
|||
*/
|
||||
package org.codelibs.fess.mylasta.action;
|
||||
|
||||
import org.lastaflute.web.ruts.message.ActionMessages;
|
||||
import org.lastaflute.core.message.UserMessages;
|
||||
|
||||
/**
|
||||
* The keys for message.
|
||||
* @author FreeGen
|
||||
*/
|
||||
public class FessLabels extends ActionMessages {
|
||||
public class FessLabels extends UserMessages {
|
||||
|
||||
/** The serial version UID for object serialization. (Default) */
|
||||
private static final long serialVersionUID = 1L;
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -140,6 +140,11 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. false */
|
||||
String CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED = "crawler.document.duplicate.term.removed";
|
||||
|
||||
/** The key of the configuration. e.g. a sequence of whitespace and control characters (U+0009 through U+3000, plus U+FEFF, U+FFFD and U+00B6) */
|
||||
String CRAWLER_DOCUMENT_SPACE_CHARS = "crawler.document.space.chars";
|
||||
|
||||
/** The key of the configuration. e.g. UTF-8 */
|
||||
String CRAWLER_CRAWLING_DATA_ENCODING = "crawler.crawling.data.encoding";
|
||||
|
||||
|
@ -361,7 +366,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. */
|
||||
String INDEX_ADMIN_INTEGER_FIELDS = "index.admin.integer.fields";
|
||||
|
||||
/** The key of the configuration. e.g. favorite_count,click_count */
|
||||
/** The key of the configuration. e.g. content_length,favorite_count,click_count */
|
||||
String INDEX_ADMIN_LONG_FIELDS = "index.admin.long.fields";
|
||||
|
||||
/** The key of the configuration. e.g. boost */
|
||||
|
@ -370,7 +375,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. */
|
||||
String INDEX_ADMIN_DOUBLE_FIELDS = "index.admin.double.fields";
|
||||
|
||||
/** The key of the configuration. e.g. doc_id,url,title,role */
|
||||
/** The key of the configuration. e.g. doc_id,url,title,role,boost */
|
||||
String INDEX_ADMIN_REQUIRED_FIELDS = "index.admin.required.fields";
|
||||
|
||||
/** The key of the configuration. e.g. 3m */
|
||||
|
@ -1339,6 +1344,15 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
*/
|
||||
boolean isCrawlerDocumentDuplicateTermRemoved();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.space.chars'. <br>
|
||||
* The value is, e.g. a sequence of whitespace and control characters (U+0009 through U+3000, plus U+FEFF, U+FFFD and U+00B6) <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDocumentSpaceChars();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.crawling.data.encoding'. <br>
|
||||
* The value is, e.g. UTF-8 <br>
|
||||
|
@ -2041,7 +2055,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
|
||||
/**
|
||||
* Get the value for the key 'index.admin.long.fields'. <br>
|
||||
* The value is, e.g. favorite_count,click_count <br>
|
||||
* The value is, e.g. content_length,favorite_count,click_count <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getIndexAdminLongFields();
|
||||
|
@ -2070,7 +2084,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
|
||||
/**
|
||||
* Get the value for the key 'index.admin.required.fields'. <br>
|
||||
* The value is, e.g. doc_id,url,title,role <br>
|
||||
* The value is, e.g. doc_id,url,title,role,boost <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getIndexAdminRequiredFields();
|
||||
|
@ -4259,6 +4273,10 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return is(FessConfig.CRAWLER_DOCUMENT_DUPLICATE_TERM_REMOVED);
|
||||
}
|
||||
|
||||
public String getCrawlerDocumentSpaceChars() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_SPACE_CHARS);
|
||||
}
|
||||
|
||||
public String getCrawlerCrawlingDataEncoding() {
|
||||
return get(FessConfig.CRAWLER_CRAWLING_DATA_ENCODING);
|
||||
}
|
||||
|
|
|
@ -55,6 +55,8 @@ import org.lastaflute.web.validation.theme.typed.LongTypeValidator;
|
|||
|
||||
public interface FessProp {
|
||||
|
||||
public static final String CRAWLER_DOCUMENT_SPACE_CHARS = "crawlerDocumentSpaceChars";
|
||||
|
||||
public static final String INDEX_ADMIN_ARRAY_FIELD_SET = "indexAdminArrayFieldSet";
|
||||
|
||||
public static final String INDEX_ADMIN_DATE_FIELD_SET = "indexAdminDateFieldSet";
|
||||
|
@ -1303,4 +1305,18 @@ public interface FessProp {
|
|||
return requiredValidator.isValid(value, null);
|
||||
}
|
||||
|
||||
String getCrawlerDocumentSpaceChars();
|
||||
|
||||
public default int[] getCrawlerDocumentSpaceCharsAsArray() {
|
||||
int[] spaceChars = (int[]) propMap.get(CRAWLER_DOCUMENT_SPACE_CHARS);
|
||||
if (spaceChars == null) {
|
||||
int length = getCrawlerDocumentSpaceChars().length();
|
||||
spaceChars = new int[length];
|
||||
for (int i = 0; i < length; i++) {
|
||||
spaceChars[i] = getCrawlerDocumentSpaceChars().codePointAt(i);
|
||||
}
|
||||
propMap.put(CRAWLER_DOCUMENT_SPACE_CHARS, spaceChars);
|
||||
}
|
||||
return spaceChars;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
package org.codelibs.fess.thumbnail.impl;
|
||||
|
||||
import static org.codelibs.core.stream.StreamUtil.stream;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
|
|
|
@ -85,6 +85,7 @@ crawler.document.append.data=true
|
|||
crawler.document.max.alphanum.term.size=20
|
||||
crawler.document.max.symbol.term.size=10
|
||||
crawler.document.duplicate.term.removed=false
|
||||
crawler.document.space.chars=\u0009\u000A\u000B\u000C\u000D\u001C\u001D\u001E\u001F\u0020\u00A0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u3000\uFEFF\uFFFD\u00B6
|
||||
crawler.crawling.data.encoding=UTF-8
|
||||
crawler.web.protocols=http,https
|
||||
crawler.file.protocols=file,smb,ftp
|
||||
|
|
|
@ -32,6 +32,7 @@ public class FessPropTest extends UnitFessTestCase {
|
|||
}
|
||||
|
||||
public void test_maxUsernameLength() throws IOException {
|
||||
FessProp.propMap.clear();
|
||||
FessConfig fessConfig = new FessConfig.SimpleImpl() {
|
||||
@Override
|
||||
public Integer getLdapMaxUsernameLengthAsInteger() {
|
||||
|
@ -52,6 +53,7 @@ public class FessPropTest extends UnitFessTestCase {
|
|||
}
|
||||
|
||||
public void test_maxUsernameLength10() throws IOException {
|
||||
FessProp.propMap.clear();
|
||||
FessConfig fessConfig = new FessConfig.SimpleImpl() {
|
||||
@Override
|
||||
public Integer getLdapMaxUsernameLengthAsInteger() {
|
||||
|
@ -73,6 +75,7 @@ public class FessPropTest extends UnitFessTestCase {
|
|||
}
|
||||
|
||||
public void test_validateIndexRequiredFields() {
|
||||
FessProp.propMap.clear();
|
||||
FessConfig fessConfig = new FessConfig.SimpleImpl() {
|
||||
@Override
|
||||
public String getIndexAdminRequiredFields() {
|
||||
|
@ -100,4 +103,19 @@ public class FessPropTest extends UnitFessTestCase {
|
|||
source.put("bbb", "a");
|
||||
assertTrue(fessConfig.validateIndexRequiredFields(source));
|
||||
}
|
||||
|
||||
public void test_getCrawlerDocumentSpaceCharsAsArray() {
|
||||
FessProp.propMap.clear();
|
||||
FessConfig fessConfig = new FessConfig.SimpleImpl() {
|
||||
@Override
|
||||
public String getCrawlerDocumentSpaceChars() {
|
||||
return "\u0020\u3000";
|
||||
}
|
||||
};
|
||||
|
||||
int[] spaceChars = fessConfig.getCrawlerDocumentSpaceCharsAsArray();
|
||||
assertEquals(2, spaceChars.length);
|
||||
assertEquals(32, spaceChars[0]);
|
||||
assertEquals(12288, spaceChars[1]);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue