fix #1646 import GSA configuration

This commit is contained in:
Shinsuke Sugaya 2018-05-13 15:50:58 +09:00
parent f0b4a2429e
commit ce58769519
6 changed files with 499 additions and 0 deletions

View file

@ -44,12 +44,16 @@ import org.codelibs.core.misc.Pair;
import org.codelibs.elasticsearch.runner.net.CurlResponse;
import org.codelibs.fess.Constants;
import org.codelibs.fess.app.web.base.FessAdminAction;
import org.codelibs.fess.es.config.exbhv.FileConfigBhv;
import org.codelibs.fess.es.config.exbhv.LabelTypeBhv;
import org.codelibs.fess.es.config.exbhv.WebConfigBhv;
import org.codelibs.fess.es.log.exbhv.ClickLogBhv;
import org.codelibs.fess.es.log.exbhv.FavoriteLogBhv;
import org.codelibs.fess.es.log.exbhv.SearchLogBhv;
import org.codelibs.fess.es.log.exbhv.UserInfoBhv;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.GsaConfigParser;
import org.codelibs.fess.util.RenderDataUtil;
import org.lastaflute.core.magic.async.AsyncManager;
import org.lastaflute.web.Execute;
@ -59,6 +63,7 @@ import org.lastaflute.web.response.StreamResponse;
import org.lastaflute.web.ruts.process.ActionRuntime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.InputSource;
/**
* @author shinsuke
@ -74,6 +79,15 @@ public class AdminBackupAction extends FessAdminAction {
@Resource
private AsyncManager asyncManager;
@Resource
private WebConfigBhv webConfigBhv;
@Resource
private FileConfigBhv fileConfigBhv;
@Resource
private LabelTypeBhv labelTypeBhv;
@Override
protected void setupHtmlData(final ActionRuntime runtime) {
super.setupHtmlData(runtime);
@ -98,6 +112,16 @@ public class AdminBackupAction extends FessAdminAction {
} catch (final IOException e) {
logger.warn("Failed to process system.properties file: " + form.bulkFile.getFileName(), e);
}
} else if (fileName.startsWith("gsa") && fileName.endsWith(".xml")) {
GsaConfigParser configParser = ComponentUtil.getComponent(GsaConfigParser.class);
try (final InputStream in = form.bulkFile.getInputStream()) {
configParser.parse(new InputSource(in));
} catch (final IOException e) {
logger.warn("Failed to process gsa.xml file: " + form.bulkFile.getFileName(), e);
}
configParser.getWebConfig().ifPresent(c -> webConfigBhv.insert(c));
configParser.getFileConfig().ifPresent(c -> fileConfigBhv.insert(c));
labelTypeBhv.batchInsert(Arrays.stream(configParser.getLabelTypes()).collect(Collectors.toList()));
} else {
try (CurlResponse response = ComponentUtil.getCurlHelper().post("/_bulk").onConnect((req, con) -> {
con.setDoOutput(true);

View file

@ -0,0 +1,30 @@
/*
* Copyright 2012-2018 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.exception;
/**
 * Unchecked exception signalling that a GSA configuration could not be processed.
 */
public class GsaConfigException extends FessSystemException {

    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception carrying only a description of the problem.
     *
     * @param message description of the problem
     */
    public GsaConfigException(final String message) {
        super(message);
    }

    /**
     * Creates an exception carrying a description and the underlying cause.
     *
     * @param message description of the problem
     * @param cause the exception that triggered this one
     */
    public GsaConfigException(final String message, final Throwable cause) {
        super(message, cause);
    }
}

View file

@ -0,0 +1,314 @@
/*
* Copyright 2012-2018 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.util;
import static org.codelibs.core.stream.StreamUtil.split;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.es.config.exentity.FileConfig;
import org.codelibs.fess.es.config.exentity.LabelType;
import org.codelibs.fess.es.config.exentity.WebConfig;
import org.codelibs.fess.exception.GsaConfigException;
import org.dbflute.optional.OptionalEntity;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that reads a GSA configuration XML document and derives Fess
 * configuration entities from it.
 *
 * <p>The document root must be an {@code eef} element. Each {@code collection}
 * element directly below a {@code collections} element becomes a {@link LabelType};
 * the {@code start_urls}, {@code good_urls}, {@code bad_urls} and {@code user_agent}
 * children of {@code globalparams} are turned into at most one {@link WebConfig}
 * and at most one {@link FileConfig}.</p>
 *
 * <p>An instance keeps the parse result in its fields, so it is not safe to share
 * one instance between concurrent parses.</p>
 */
public class GsaConfigParser extends DefaultHandler {

    private static final Logger logger = LoggerFactory.getLogger(GsaConfigParser.class);

    /** Prefix of a filter line whose remainder is a regular expression. */
    protected static final String REGEXP = "regexp:";

    /** Prefix of a filter line whose remainder is a case-insensitive regular expression. */
    protected static final String REGEXP_IGNORE_CASE = "regexpIgnoreCase:";

    /** Prefix of a filter line whose remainder is a literal substring to match. */
    protected static final String CONTAINS = "contains:";

    protected static final String COLLECTIONS = "collections";

    protected static final String COLLECTION = "collection";

    protected static final String GLOBALPARAMS = "globalparams";

    protected static final String START_URLS = "start_urls";

    protected static final String GOOD_URLS = "good_urls";

    protected static final String BAD_URLS = "bad_urls";

    /** URL prefixes that route a start URL / filter line to the web crawl configuration. */
    protected String[] webProtocols = new String[] { "http:", "https:" };

    /** URL prefixes that route a start URL / filter line to the file crawl configuration. */
    protected String[] fileProtocols = new String[] { "file:", "smb:" };

    /** Names of the currently open elements; the tail is the innermost element. */
    protected LinkedList<String> tagQueue;

    /** Labels collected so far; created in {@link #startDocument()}. */
    protected List<LabelType> labelList;

    /** Label for the {@code collection} element currently being read, or {@code null} outside of one. */
    protected LabelType labelType;

    /** Raw text of the {@code globalparams} children, keyed by element name. */
    protected Map<String, String> globalParams = new HashMap<>();

    protected WebConfig webConfig = null;

    protected FileConfig fileConfig = null;

    /** Character data accumulated since the last end tag. */
    protected StringBuilder textBuf = new StringBuilder(1000);

    /** User agent stored on the generated web configuration; replaced by a {@code user_agent} global parameter. */
    protected String userAgent = "gsa-crawler";

    /**
     * Parses the given XML source with this instance as the SAX handler.
     *
     * @param is the XML document to read
     * @throws GsaConfigException if the parser cannot be configured or the document cannot be parsed
     */
    public void parse(final InputSource is) {
        try {
            final SAXParserFactory factory = SAXParserFactory.newInstance();
            // The document typically comes from an uploaded file, so it must not be able to
            // pull in external entities (XXE: local file disclosure / server-side requests).
            factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING, true);
            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
            factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
            final SAXParser parser = factory.newSAXParser();
            parser.parse(is, this);
        } catch (final Exception e) {
            throw new GsaConfigException("Failed to parse XML file.", e);
        }
    }

    @Override
    public void startDocument() throws SAXException {
        tagQueue = new LinkedList<>();
        labelList = new ArrayList<>();
        labelType = null;
    }

    @Override
    public void endDocument() throws SAXException {
        globalParams.clear();
        tagQueue.clear();
    }

    /**
     * Tracks the open element and starts a new label when a {@code collection}
     * element is opened directly inside {@code collections}.
     *
     * @throws GsaConfigException if the root element is not {@code eef}
     */
    @Override
    public void startElement(final String uri, final String localName, final String qName, final Attributes attributes) throws SAXException {
        logger.debug("Start Element: {}", qName);
        if (tagQueue.isEmpty() && !"eef".equalsIgnoreCase(qName)) {
            throw new GsaConfigException("Invalid format.");
        } else if (COLLECTION.equalsIgnoreCase(qName) && COLLECTIONS.equalsIgnoreCase(tagQueue.peekLast())) {
            final long now = System.currentTimeMillis();
            final String name = attributes.getValue("Name");
            labelType = new LabelType();
            labelType.setName(name);
            labelType.setValue(name);
            labelType.setCreatedBy(Constants.SYSTEM_USER);
            labelType.setCreatedTime(now);
            labelType.setUpdatedBy(Constants.SYSTEM_USER);
            labelType.setUpdatedTime(now);
        }
        tagQueue.offer(qName);
    }

    /**
     * Consumes the text collected for the element that just closed, then pops the
     * element from the tag stack and clears the text buffer.
     */
    @Override
    public void endElement(final String uri, final String localName, final String qName) throws SAXException {
        logger.debug("End Element: {}", qName);
        if (GOOD_URLS.equalsIgnoreCase(qName)) {
            if (labelType != null) {
                labelType.setIncludedPaths(parseFilterPaths(textBuf.toString(), true, true));
            } else if (isChildOfGlobalParams()) {
                globalParams.put(GOOD_URLS, textBuf.toString());
            }
        } else if (BAD_URLS.equalsIgnoreCase(qName)) {
            if (labelType != null) {
                labelType.setExcludedPaths(parseFilterPaths(textBuf.toString(), true, true));
            } else if (isChildOfGlobalParams()) {
                globalParams.put(BAD_URLS, textBuf.toString());
            }
        } else if (START_URLS.equalsIgnoreCase(qName) && isChildOfGlobalParams()) {
            globalParams.put(START_URLS, textBuf.toString());
        } else if (labelType != null && COLLECTION.equalsIgnoreCase(qName)) {
            labelList.add(labelType);
            labelType = null;
        } else if (GLOBALPARAMS.equalsIgnoreCase(qName)) {
            // All children have been seen by now, so the collected values are complete.
            buildCrawlConfigs();
        } else if ("user_agent".equalsIgnoreCase(qName) && isChildOfGlobalParams()) {
            userAgent = textBuf.toString().trim();
        }
        tagQueue.pollLast();
        textBuf.setLength(0);
    }

    @Override
    public void characters(final char[] ch, final int start, final int length) throws SAXException {
        if (logger.isDebugEnabled()) {
            logger.debug("Text: {}", new String(ch, start, length));
        }
        textBuf.append(ch, start, length);
    }

    /**
     * Converts newline-separated filter lines into newline-separated regular expressions.
     *
     * <p>Blank lines and lines starting with {@code #} are dropped. Lines prefixed with
     * {@link #CONTAINS}, {@link #REGEXP} or {@link #REGEXP_IGNORE_CASE} are converted
     * according to their prefix; a line starting with a web or file protocol becomes an
     * escaped prefix match; any other line is treated as a literal substring.</p>
     *
     * @param text the raw filter lines; may be {@code null} (NOTE(review): relies on the
     *        imported {@code split} helper accepting {@code null} — confirm)
     * @param web whether patterns starting with a web protocol are kept
     * @param file whether patterns starting with a file protocol are kept
     * @return the resulting patterns joined by {@code "\n"}
     */
    protected String parseFilterPaths(final String text, final boolean web, final boolean file) {
        return split(text, "\n").get(stream -> stream.map(String::trim).filter(StringUtil::isNotBlank).map(s -> {
            if (s.startsWith("#")) {
                // comment line; removed by the blank check in the filter below
                return null;
            } else if (s.startsWith(CONTAINS)) {
                final String v = s.substring(CONTAINS.length());
                return appendFileterPath(new StringBuilder(100), escape(v));
            } else if (s.startsWith(REGEXP_IGNORE_CASE)) {
                final String v = s.substring(REGEXP_IGNORE_CASE.length());
                final StringBuilder buf = new StringBuilder(100);
                buf.append("(?i)");
                return appendFileterPath(buf, unescape(v));
            } else if (s.startsWith(REGEXP)) {
                final String v = s.substring(REGEXP.length());
                return appendFileterPath(new StringBuilder(100), unescape(v));
            } else if (startsWithAny(s, webProtocols) || startsWithAny(s, fileProtocols)) {
                return escape(s) + ".*";
            } else {
                return appendFileterPath(new StringBuilder(100), escape(s));
            }
        }).filter(s -> {
            if (StringUtil.isBlank(s)) {
                return false;
            }
            if (startsWithAny(s, webProtocols)) {
                return web;
            }
            if (startsWithAny(s, fileProtocols)) {
                return file;
            }
            return true;
        }).collect(Collectors.joining("\n")));
    }

    /**
     * Backslash-escapes the regular-expression metacharacters
     * {@code . + * [ ] ( ) ?} in the given literal text.
     *
     * @param s the literal text
     * @return the text with those characters escaped
     */
    protected String escape(final String s) {
        return s.replace(".", "\\.")//
                .replace("+", "\\+")//
                .replace("*", "\\*")//
                .replace("[", "\\[")//
                .replace("]", "\\]")//
                .replace("(", "\\(")//
                .replace(")", "\\)")//
                .replace("?", "\\?");
    }

    /**
     * Collapses every doubled backslash into a single backslash.
     *
     * @param s the text to convert
     * @return the converted text
     */
    protected String unescape(final String s) {
        return s.replace("\\\\", "\\");
    }

    /**
     * Appends {@code v} to {@code buf}, surrounding it with {@code .*} on each side
     * that is not already anchored with {@code ^} or {@code $}.
     *
     * @param buf the buffer to append to; may already hold a prefix such as flags
     * @param v the pattern body
     * @return the content of {@code buf} after appending
     */
    protected String appendFileterPath(final StringBuilder buf, final String v) {
        if (!v.startsWith("^")) {
            buf.append(".*");
        }
        buf.append(v);
        if (!v.endsWith("$")) {
            buf.append(".*");
        }
        return buf.toString();
    }

    public void setWebProtocols(final String[] webProtocols) {
        this.webProtocols = webProtocols;
    }

    public void setFileProtocols(final String[] fileProtocols) {
        this.fileProtocols = fileProtocols;
    }

    @Override
    public String toString() {
        return "GsaConfigParser [labelList=" + labelList + ", webConfig=" + webConfig + ", fileConfig=" + fileConfig + "]";
    }

    /**
     * @return the web crawl configuration, empty if no web start URL was found
     */
    public OptionalEntity<WebConfig> getWebConfig() {
        return OptionalUtil.ofNullable(webConfig);
    }

    /**
     * @return the file crawl configuration, empty if no file start URL was found
     */
    public OptionalEntity<FileConfig> getFileConfig() {
        return OptionalUtil.ofNullable(fileConfig);
    }

    /**
     * @return the labels collected from the document; an empty array if no document has been started
     */
    public LabelType[] getLabelTypes() {
        if (labelList == null) {
            // startDocument() was never reached, e.g. parse() was not called or failed early
            return new LabelType[0];
        }
        return labelList.toArray(new LabelType[labelList.size()]);
    }

    /** Tells whether the parent of the innermost open element is {@code globalparams}. */
    private boolean isChildOfGlobalParams() {
        return GLOBALPARAMS.equalsIgnoreCase(tagQueue.get(tagQueue.size() - 2));
    }

    /** Tells whether {@code s} starts with any of the given prefixes. */
    private static boolean startsWithAny(final String s, final String[] prefixes) {
        return Arrays.stream(prefixes).anyMatch(s::startsWith);
    }

    /** Creates the web and file configurations from the collected global parameters, if start URLs exist. */
    private void buildCrawlConfigs() {
        final String startUrls = globalParams.get(START_URLS);
        if (startUrls == null) {
            return;
        }
        final long now = System.currentTimeMillis();
        final List<String> urlList =
                split(startUrls, "\n").get(stream -> stream.map(String::trim).filter(StringUtil::isNotBlank).collect(Collectors.toList()));
        final String webUrls = urlList.stream().filter(s -> startsWithAny(s, webProtocols)).collect(Collectors.joining("\n"));
        if (StringUtil.isNotBlank(webUrls)) {
            webConfig = createWebConfig(webUrls, now);
        }
        final String fileUrls = urlList.stream().filter(s -> startsWithAny(s, fileProtocols)).collect(Collectors.joining("\n"));
        if (StringUtil.isNotBlank(fileUrls)) {
            fileConfig = createFileConfig(fileUrls, now);
        }
    }

    /** Builds the "Default" web configuration for the given newline-separated start URLs. */
    private WebConfig createWebConfig(final String webUrls, final long now) {
        final WebConfig config = new WebConfig();
        config.setName("Default");
        config.setAvailable(true);
        config.setBoost(1.0f);
        config.setConfigParameter(StringUtil.EMPTY);
        config.setIntervalTime(1000);
        config.setNumOfThread(3);
        config.setSortOrder(1);
        config.setUrls(webUrls);
        config.setIncludedUrls(parseFilterPaths(globalParams.get(GOOD_URLS), true, false));
        config.setIncludedDocUrls(StringUtil.EMPTY);
        config.setExcludedUrls(parseFilterPaths(globalParams.get(BAD_URLS), true, false));
        config.setExcludedDocUrls(StringUtil.EMPTY);
        config.setUserAgent(userAgent);
        config.setCreatedBy(Constants.SYSTEM_USER);
        config.setCreatedTime(now);
        config.setUpdatedBy(Constants.SYSTEM_USER);
        config.setUpdatedTime(now);
        return config;
    }

    /** Builds the "Default" file configuration for the given newline-separated start paths. */
    private FileConfig createFileConfig(final String fileUrls, final long now) {
        final FileConfig config = new FileConfig();
        config.setName("Default");
        config.setAvailable(true);
        config.setBoost(1.0f);
        config.setConfigParameter(StringUtil.EMPTY);
        config.setIntervalTime(0);
        config.setNumOfThread(5);
        config.setSortOrder(2);
        config.setPaths(fileUrls);
        config.setIncludedPaths(parseFilterPaths(globalParams.get(GOOD_URLS), false, true));
        config.setIncludedDocPaths(StringUtil.EMPTY);
        config.setExcludedPaths(parseFilterPaths(globalParams.get(BAD_URLS), false, true));
        config.setExcludedDocPaths(StringUtil.EMPTY);
        config.setCreatedBy(Constants.SYSTEM_USER);
        config.setCreatedTime(now);
        config.setUpdatedBy(Constants.SYSTEM_USER);
        config.setUpdatedTime(now);
        return config;
    }
}

View file

@ -383,5 +383,7 @@
</component>
<component name="queryResponseList" class="org.codelibs.fess.util.QueryResponseList" instance="prototype">
</component>
<component name="gsaConfigParser" class="org.codelibs.fess.util.GsaConfigParser" instance="prototype">
</component>
</components>

View file

@ -0,0 +1,43 @@
/*
* Copyright 2012-2018 CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package org.codelibs.fess.util;
import java.io.IOException;
import java.io.InputStream;
import org.codelibs.core.io.ResourceUtil;
import org.codelibs.fess.es.config.exentity.LabelType;
import org.codelibs.fess.unit.UnitFessTestCase;
import org.xml.sax.InputSource;
/**
 * Exercises {@link GsaConfigParser} against the {@code data/gsaconfig.xml} resource.
 */
public class GsaConfigParserTest extends UnitFessTestCase {

    /**
     * Parses the sample document and expects a web configuration, a file
     * configuration and three labels to come out of it.
     */
    public void test_parse() throws IOException {
        final GsaConfigParser configParser = new GsaConfigParser();
        try (InputStream in = ResourceUtil.getResourceAsStream("data/gsaconfig.xml")) {
            final InputSource source = new InputSource(in);
            configParser.parse(source);
        }

        // Both crawl configurations must be present; each one is printed for inspection.
        configParser.getWebConfig().ifPresent(webConfig -> System.out.println(webConfig.toString())).orElse(() -> {
            fail();
        });
        configParser.getFileConfig().ifPresent(fileConfig -> System.out.println(fileConfig.toString())).orElse(() -> {
            fail();
        });

        final int labelCount = configParser.getLabelTypes().length;
        assertEquals(3, labelCount);
    }
}

View file

@ -0,0 +1,86 @@
<?xml version="1.0" encoding="UTF-8" ?>
<eef>
<config Schema="2.0" EnterpriseVersion="'7.6.50'">
<collections Count="3">
<collection Name="fess">
<bad_urls><![CDATA[
https://fess.codelibs.org/images/
]]></bad_urls>
<good_urls><![CDATA[
https://fess.codelibs.org/
https://www.codelibs.org/
]]></good_urls>
<prerequisite_results><![CDATA[
20
]]></prerequisite_results>
<testwords><![CDATA[
]]></testwords>
</collection>
<collection Name="n2sm">
<bad_urls><![CDATA[
contains:\\.xml$
]]></bad_urls>
<good_urls><![CDATA[
https://www.n2sm.net/
]]></good_urls>
<prerequisite_results><![CDATA[
20
]]></prerequisite_results>
<testwords><![CDATA[
]]></testwords>
</collection>
<collection Name="smb">
<bad_urls><![CDATA[
smb://storage/sample/
]]></bad_urls>
<good_urls><![CDATA[
smb://storage/
]]></good_urls>
<prerequisite_results><![CDATA[
20
]]></prerequisite_results>
<testwords><![CDATA[
]]></testwords>
</collection>
</collections>
<globalparams>
<bad_urls><![CDATA[
contains:/images/
contains:?
contains:\\.xml$
# test
regexp:/([^/]*)/\\1/\\1/
.gif$
.jpg$
.jpeg$
.png$
regexpIgnoreCase:\\.dll$
regexpIgnoreCase:\\.exe$
/?S=A$
/?S=D$
contains:\001
contains:\002
contains:\003
.html/$
]]></bad_urls>
<good_urls><![CDATA[
https://fess.codelibs.org/
https://www.codelibs.org/
https://www.n2sm.net/
smb://storage/
]]></good_urls>
<start_urls><![CDATA[
https://fess.codelibs.org/
https://www.codelibs.org/
https://www.n2sm.net/
smb://storage/
]]></start_urls>
</globalparams>
</config>
</eef>