fix #2036 add langdetect
This commit is contained in:
parent
c2fcefce71
commit
9cdabea4b8
3 changed files with 63 additions and 3 deletions
12
pom.xml
12
pom.xml
|
@ -47,6 +47,7 @@
|
|||
<jackson.version>2.8.11</jackson.version>
|
||||
<commons.fileupload.version>1.3.3</commons.fileupload.version>
|
||||
<asm.version>5.1</asm.version>
|
||||
<tika.version>1.20</tika.version>
|
||||
|
||||
<!-- Testing -->
|
||||
<junit.version>4.12</junit.version>
|
||||
|
@ -1320,6 +1321,17 @@
|
|||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.apache.tika</groupId>
|
||||
<artifactId>tika-langdetect</artifactId>
|
||||
<version>${tika.version}</version>
|
||||
<exclusions>
|
||||
<exclusion>
|
||||
<groupId>javax.annotation</groupId>
|
||||
<artifactId>javax.annotation-api</artifactId>
|
||||
</exclusion>
|
||||
</exclusions>
|
||||
</dependency>
|
||||
|
||||
<!-- suggest library -->
|
||||
<dependency>
|
||||
|
|
|
@ -19,16 +19,24 @@ import java.util.Map;
|
|||
|
||||
import javax.annotation.PostConstruct;
|
||||
|
||||
import org.apache.tika.language.detect.LanguageDetector;
|
||||
import org.apache.tika.language.detect.LanguageResult;
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.codelibs.fess.util.DocumentUtil;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class LanguageHelper {
|
||||
private static final Logger logger = LoggerFactory.getLogger(LanguageHelper.class);
|
||||
|
||||
protected String[] langFields;
|
||||
|
||||
protected String[] supportedLanguages;
|
||||
|
||||
protected LanguageDetector detector;
|
||||
|
||||
@PostConstruct
|
||||
public void init() {
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
|
@ -37,21 +45,52 @@ public class LanguageHelper {
|
|||
}
|
||||
|
||||
public void updateDocument(final Map<String, Object> doc) {
|
||||
final String language =
|
||||
getSupportedLanguage(DocumentUtil.getValue(doc, ComponentUtil.getFessConfig().getIndexFieldLang(), String.class));
|
||||
final FessConfig fessConfig = ComponentUtil.getFessConfig();
|
||||
String language = getSupportedLanguage(DocumentUtil.getValue(doc, fessConfig.getIndexFieldLang(), String.class));
|
||||
if (language == null) {
|
||||
return;
|
||||
for (final String f : langFields) {
|
||||
if (doc.containsKey(f)) {
|
||||
language = detectLanguage(DocumentUtil.getValue(doc, f, String.class));
|
||||
if (language != null) {
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("set {} to lang field", language);
|
||||
}
|
||||
doc.put(fessConfig.getIndexFieldLang(), language);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (language == null) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
for (final String f : langFields) {
|
||||
final String lf = f + "_" + language;
|
||||
if (doc.containsKey(f) && !doc.containsKey(lf)) {
|
||||
doc.put(lf, doc.get(f));
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("add {} field", lf);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected String detectLanguage(final String text) {
|
||||
if (StringUtil.isBlank(text)) {
|
||||
return null;
|
||||
}
|
||||
final LanguageResult result = detector.detect(text);
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("detected lang:{}({}) from {}", result, result.getRawScore(), text);
|
||||
}
|
||||
return getSupportedLanguage(result.getLanguage());
|
||||
}
|
||||
|
||||
protected String getSupportedLanguage(final String lang) {
|
||||
if (StringUtil.isBlank(lang)) {
|
||||
return null;
|
||||
}
|
||||
for (final String l : supportedLanguages) {
|
||||
if (l.equals(lang)) {
|
||||
return l;
|
||||
|
@ -60,4 +99,8 @@ public class LanguageHelper {
|
|||
return null;
|
||||
}
|
||||
|
||||
public void setDetector(LanguageDetector detector) {
|
||||
this.detector = detector;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -11,6 +11,11 @@
|
|||
<component name="curlHelper" class="org.codelibs.fess.helper.CurlHelper">
|
||||
</component>
|
||||
<component name="languageHelper" class="org.codelibs.fess.helper.LanguageHelper">
|
||||
<property name="detector">
|
||||
<component class="org.apache.tika.langdetect.OptimaizeLangDetector">
|
||||
<postConstruct name="loadModels"></postConstruct>
|
||||
</component>
|
||||
</property>
|
||||
</component>
|
||||
<component name="searchLogHelper" class="org.codelibs.fess.helper.SearchLogHelper">
|
||||
<!--
|
||||
|
|
Loading…
Add table
Reference in a new issue