fix #2036 add langdetect

This commit is contained in:
Shinsuke Sugaya 2019-03-14 11:05:53 +09:00
parent c2fcefce71
commit 9cdabea4b8
3 changed files with 63 additions and 3 deletions

12
pom.xml
View file

@ -47,6 +47,7 @@
<jackson.version>2.8.11</jackson.version>
<commons.fileupload.version>1.3.3</commons.fileupload.version>
<asm.version>5.1</asm.version>
<tika.version>1.20</tika.version>
<!-- Testing -->
<junit.version>4.12</junit.version>
@ -1320,6 +1321,17 @@
</exclusion>
</exclusions>
</dependency>
<!-- language detection library -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-langdetect</artifactId>
<version>${tika.version}</version>
<exclusions>
<!-- NOTE(review): javax.annotation-api is excluded here; presumably it is already supplied elsewhere on the classpath - confirm -->
<exclusion>
<groupId>javax.annotation</groupId>
<artifactId>javax.annotation-api</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- suggest library -->
<dependency>

View file

@ -19,16 +19,24 @@ import java.util.Map;
import javax.annotation.PostConstruct;
import org.apache.tika.language.detect.LanguageDetector;
import org.apache.tika.language.detect.LanguageResult;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.DocumentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class LanguageHelper {
private static final Logger logger = LoggerFactory.getLogger(LanguageHelper.class);
protected String[] langFields;
protected String[] supportedLanguages;
protected LanguageDetector detector;
@PostConstruct
public void init() {
final FessConfig fessConfig = ComponentUtil.getFessConfig();
@ -37,21 +45,52 @@ public class LanguageHelper {
}
/**
 * Adds language-specific copies of the configured language fields to a document.
 *
 * The language is first read from the document's lang field (the key returned by
 * {@code fessConfig.getIndexFieldLang()}) and filtered through
 * {@link #getSupportedLanguage(String)}. When that yields {@code null}, each entry of
 * {@code langFields} present in the document is passed to {@link #detectLanguage(String)};
 * the first non-null result is written back to the lang field and used as the language.
 * If no language can be determined the document is left as it is.
 *
 * For every entry {@code f} of {@code langFields} that exists in the document, the value is
 * copied to the key {@code f + "_" + language} unless that key is already present.
 *
 * NOTE(review): this listing appears to be a rendered diff with the +/- markers stripped, so
 * the first {@code language} declaration and the first bare {@code return;} below look like
 * the pre-change lines that the following lines replace - confirm against the real file.
 *
 * @param doc the document map; it is modified in place
 */
public void updateDocument(final Map<String, Object> doc) {
final String language =
getSupportedLanguage(DocumentUtil.getValue(doc, ComponentUtil.getFessConfig().getIndexFieldLang(), String.class));
final FessConfig fessConfig = ComponentUtil.getFessConfig();
String language = getSupportedLanguage(DocumentUtil.getValue(doc, fessConfig.getIndexFieldLang(), String.class));
if (language == null) {
return;
// No supported language on the document: try to detect one from the content fields.
for (final String f : langFields) {
if (doc.containsKey(f)) {
language = detectLanguage(DocumentUtil.getValue(doc, f, String.class));
if (language != null) {
if (logger.isDebugEnabled()) {
logger.debug("set {} to lang field", language);
}
// Store the detected language on the document and stop at the first hit.
doc.put(fessConfig.getIndexFieldLang(), language);
break;
}
}
}
// Detection produced nothing usable either, so there is nothing to add.
if (language == null) {
return;
}
}
for (final String f : langFields) {
// Target key is the source field name suffixed with "_" and the language.
final String lf = f + "_" + language;
// Copy only when the source exists and the target has not been set already.
if (doc.containsKey(f) && !doc.containsKey(lf)) {
doc.put(lf, doc.get(f));
if (logger.isDebugEnabled()) {
logger.debug("add {} field", lf);
}
}
}
}
/**
 * Detects the language of the given text and maps it to a supported language.
 *
 * The text is handed to the configured {@code detector}; the language of the returned
 * result is then filtered through {@link #getSupportedLanguage(String)}.
 *
 * NOTE(review): the same {@code detector} instance is used for every call; confirm that it
 * is safe to share if this method can be invoked from several threads.
 *
 * @param text the text to analyse; may be {@code null} or blank
 * @return the value returned by {@code getSupportedLanguage} for the detected language,
 *         or {@code null} when the text is blank
 */
protected String detectLanguage(final String text) {
    if (!StringUtil.isBlank(text)) {
        final LanguageResult detected = detector.detect(text);
        if (logger.isDebugEnabled()) {
            logger.debug("detected lang:{}({}) from {}", detected, detected.getRawScore(), text);
        }
        final String detectedLang = detected.getLanguage();
        return getSupportedLanguage(detectedLang);
    }
    // Blank input: nothing to detect from.
    return null;
}
protected String getSupportedLanguage(final String lang) {
if (StringUtil.isBlank(lang)) {
return null;
}
for (final String l : supportedLanguages) {
if (l.equals(lang)) {
return l;
@ -60,4 +99,8 @@ public class LanguageHelper {
return null;
}
/**
 * Sets the language detector used by {@link #detectLanguage(String)}.
 *
 * @param detector the detector instance to use
 */
public void setDetector(final LanguageDetector detector) {
    this.detector = detector;
}
}

View file

@ -11,6 +11,11 @@
<component name="curlHelper" class="org.codelibs.fess.helper.CurlHelper">
</component>
<component name="languageHelper" class="org.codelibs.fess.helper.LanguageHelper">
<!-- Supplies the detector property of LanguageHelper with an OptimaizeLangDetector;
     the postConstruct element names loadModels as the method to run on the detector
     once it has been created. -->
<property name="detector">
<component class="org.apache.tika.langdetect.OptimaizeLangDetector">
<postConstruct name="loadModels"></postConstruct>
</component>
</property>
</component>
<component name="searchLogHelper" class="org.codelibs.fess.helper.SearchLogHelper">
<!--