diff --git a/pom.xml b/pom.xml
index 486b58775..cc197ec02 100644
--- a/pom.xml
+++ b/pom.xml
@@ -47,6 +47,7 @@
2.8.11
1.3.3
5.1
+ 1.20
4.12
@@ -1320,6 +1321,17 @@
+
+ org.apache.tika
+ tika-langdetect
+ ${tika.version}
+
+
+ javax.annotation
+ javax.annotation-api
+
+
+
diff --git a/src/main/java/org/codelibs/fess/helper/LanguageHelper.java b/src/main/java/org/codelibs/fess/helper/LanguageHelper.java
index 323a7c2f2..473b7e274 100644
--- a/src/main/java/org/codelibs/fess/helper/LanguageHelper.java
+++ b/src/main/java/org/codelibs/fess/helper/LanguageHelper.java
@@ -19,16 +19,24 @@ import java.util.Map;
import javax.annotation.PostConstruct;
+import org.apache.tika.language.detect.LanguageDetector;
+import org.apache.tika.language.detect.LanguageResult;
+import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.DocumentUtil;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class LanguageHelper {
+ private static final Logger logger = LoggerFactory.getLogger(LanguageHelper.class);
protected String[] langFields;
protected String[] supportedLanguages;
+ protected LanguageDetector detector;
+
@PostConstruct
public void init() {
final FessConfig fessConfig = ComponentUtil.getFessConfig();
@@ -37,21 +45,52 @@ public class LanguageHelper {
}
public void updateDocument(final Map doc) {
- final String language =
- getSupportedLanguage(DocumentUtil.getValue(doc, ComponentUtil.getFessConfig().getIndexFieldLang(), String.class));
+ final FessConfig fessConfig = ComponentUtil.getFessConfig();
+ String language = getSupportedLanguage(DocumentUtil.getValue(doc, fessConfig.getIndexFieldLang(), String.class));
if (language == null) {
- return;
+ for (final String f : langFields) {
+ if (doc.containsKey(f)) {
+ language = detectLanguage(DocumentUtil.getValue(doc, f, String.class));
+ if (language != null) {
+ if (logger.isDebugEnabled()) {
+ logger.debug("set {} to lang field", language);
+ }
+ doc.put(fessConfig.getIndexFieldLang(), language);
+ break;
+ }
+ }
+ }
+ if (language == null) {
+ return;
+ }
}
for (final String f : langFields) {
final String lf = f + "_" + language;
if (doc.containsKey(f) && !doc.containsKey(lf)) {
doc.put(lf, doc.get(f));
+ if (logger.isDebugEnabled()) {
+ logger.debug("add {} field", lf);
+ }
}
}
}
+ protected String detectLanguage(final String text) {
+ if (StringUtil.isBlank(text)) {
+ return null;
+ }
+ final LanguageResult result = detector.detect(text);
+ if (logger.isDebugEnabled()) {
+ logger.debug("detected lang:{}({}) from {}", result, result.getRawScore(), text);
+ }
+ return getSupportedLanguage(result.getLanguage());
+ }
+
protected String getSupportedLanguage(final String lang) {
+ if (StringUtil.isBlank(lang)) {
+ return null;
+ }
for (final String l : supportedLanguages) {
if (l.equals(lang)) {
return l;
@@ -60,4 +99,8 @@ public class LanguageHelper {
return null;
}
+ public void setDetector(LanguageDetector detector) {
+ this.detector = detector;
+ }
+
}
diff --git a/src/main/resources/fess.xml b/src/main/resources/fess.xml
index d2c30a641..1528580b4 100644
--- a/src/main/resources/fess.xml
+++ b/src/main/resources/fess.xml
@@ -11,6 +11,11 @@
+
+
+
+
+