Browse Source

fix #1646 import GSA configuration

Shinsuke Sugaya 7 years ago
parent
commit
ce58769519

+ 24 - 0
src/main/java/org/codelibs/fess/app/web/admin/backup/AdminBackupAction.java

@@ -44,12 +44,16 @@ import org.codelibs.core.misc.Pair;
 import org.codelibs.elasticsearch.runner.net.CurlResponse;
 import org.codelibs.fess.Constants;
 import org.codelibs.fess.app.web.base.FessAdminAction;
+import org.codelibs.fess.es.config.exbhv.FileConfigBhv;
+import org.codelibs.fess.es.config.exbhv.LabelTypeBhv;
+import org.codelibs.fess.es.config.exbhv.WebConfigBhv;
 import org.codelibs.fess.es.log.exbhv.ClickLogBhv;
 import org.codelibs.fess.es.log.exbhv.FavoriteLogBhv;
 import org.codelibs.fess.es.log.exbhv.SearchLogBhv;
 import org.codelibs.fess.es.log.exbhv.UserInfoBhv;
 import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.util.ComponentUtil;
+import org.codelibs.fess.util.GsaConfigParser;
 import org.codelibs.fess.util.RenderDataUtil;
 import org.lastaflute.core.magic.async.AsyncManager;
 import org.lastaflute.web.Execute;
@@ -59,6 +63,7 @@ import org.lastaflute.web.response.StreamResponse;
 import org.lastaflute.web.ruts.process.ActionRuntime;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+import org.xml.sax.InputSource;
 
 /**
  * @author shinsuke
@@ -74,6 +79,15 @@ public class AdminBackupAction extends FessAdminAction {
     @Resource
     private AsyncManager asyncManager;
 
+    @Resource
+    private WebConfigBhv webConfigBhv;
+
+    @Resource
+    private FileConfigBhv fileConfigBhv;
+
+    @Resource
+    private LabelTypeBhv labelTypeBhv;
+
     @Override
     protected void setupHtmlData(final ActionRuntime runtime) {
         super.setupHtmlData(runtime);
@@ -98,6 +112,16 @@ public class AdminBackupAction extends FessAdminAction {
                 } catch (final IOException e) {
                     logger.warn("Failed to process system.properties file: " + form.bulkFile.getFileName(), e);
                 }
+            } else if (fileName.startsWith("gsa") && fileName.endsWith(".xml")) {
+                GsaConfigParser configParser = ComponentUtil.getComponent(GsaConfigParser.class);
+                try (final InputStream in = form.bulkFile.getInputStream()) {
+                    configParser.parse(new InputSource(in));
+                } catch (final IOException e) {
+                    logger.warn("Failed to process gsa.xml file: " + form.bulkFile.getFileName(), e);
+                }
+                configParser.getWebConfig().ifPresent(c -> webConfigBhv.insert(c));
+                configParser.getFileConfig().ifPresent(c -> fileConfigBhv.insert(c));
+                labelTypeBhv.batchInsert(Arrays.stream(configParser.getLabelTypes()).collect(Collectors.toList()));
             } else {
                 try (CurlResponse response = ComponentUtil.getCurlHelper().post("/_bulk").onConnect((req, con) -> {
                     con.setDoOutput(true);

+ 30 - 0
src/main/java/org/codelibs/fess/exception/GsaConfigException.java

@@ -0,0 +1,30 @@
+/*
+ * Copyright 2012-2018 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.exception;
+
+/**
+ * Exception raised when a GSA configuration file cannot be processed.
+ */
+public class GsaConfigException extends FessSystemException {
+
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * Creates an exception that only carries a description of the problem.
+     *
+     * @param message description of the failure
+     */
+    public GsaConfigException(final String message) {
+        super(message);
+    }
+
+    /**
+     * Creates an exception that wraps the underlying failure.
+     *
+     * @param message description of the failure
+     * @param cause the exception that triggered this one
+     */
+    public GsaConfigException(final String message, final Throwable cause) {
+        super(message, cause);
+    }
+
+}

+ 314 - 0
src/main/java/org/codelibs/fess/util/GsaConfigParser.java

@@ -0,0 +1,314 @@
+/*
+ * Copyright 2012-2018 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.util;
+
+import static org.codelibs.core.stream.StreamUtil.split;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.codelibs.core.lang.StringUtil;
+import org.codelibs.fess.Constants;
+import org.codelibs.fess.es.config.exentity.FileConfig;
+import org.codelibs.fess.es.config.exentity.LabelType;
+import org.codelibs.fess.es.config.exentity.WebConfig;
+import org.codelibs.fess.exception.GsaConfigException;
+import org.dbflute.optional.OptionalEntity;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+public class GsaConfigParser extends DefaultHandler {
+
+    private static final Logger logger = LoggerFactory.getLogger(GsaConfigParser.class);
+
+    protected static final String REGEXP = "regexp:"; // pattern-line prefix: the remainder is used as a regular expression
+
+    protected static final String REGEXP_IGNORE_CASE = "regexpIgnoreCase:"; // pattern-line prefix: remainder is a regex, emitted with a leading "(?i)"
+
+    protected static final String CONTAINS = "contains:"; // pattern-line prefix: remainder is escaped and wrapped in ".*" by parseFilterPaths
+
+    protected static final String COLLECTIONS = "collections"; // element that must directly enclose a collection element
+
+    protected static final String COLLECTION = "collection"; // element that produces one LabelType
+
+    protected static final String GLOBALPARAMS = "globalparams"; // element whose children feed the web/file crawl configs
+
+    protected static final String START_URLS = "start_urls"; // element name, also used as key in globalParams
+
+    protected static final String GOOD_URLS = "good_urls"; // element with include patterns; also a key in globalParams
+
+    protected static final String BAD_URLS = "bad_urls"; // element with exclude patterns; also a key in globalParams
+
+    protected String[] webProtocols = new String[] { "http:", "https:" }; // URL prefixes that route a start URL to the web config
+
+    protected String[] fileProtocols = new String[] { "file:", "smb:" }; // URL prefixes that route a start URL to the file config
+
+    protected LinkedList<String> tagQueue; // names of the currently open elements, innermost last; created in startDocument
+
+    protected List<LabelType> labelList; // labels completed so far; created in startDocument
+
+    protected LabelType labelType; // label being filled while inside a collection element, otherwise null
+
+    protected Map<String, String> globalParams = new HashMap<>(); // raw text of globalparams children keyed by element name; cleared in endDocument
+
+    protected WebConfig webConfig = null; // set when globalparams closes and at least one start URL has a web protocol
+
+    protected FileConfig fileConfig = null; // set when globalparams closes and at least one start URL has a file protocol
+
+    protected StringBuilder textBuf = new StringBuilder(1000); // character data collected since the previous closing tag
+
+    protected String userAgent = "gsa-crawler"; // default; replaced by the text of a user_agent element under globalparams
+
+    /**
+     * Parses a GSA configuration document with a SAX parser, using this object as the handler.
+     * The results are available afterwards through {@link #getWebConfig()},
+     * {@link #getFileConfig()} and {@link #getLabelTypes()}.
+     *
+     * @param is the XML input to read
+     * @throws GsaConfigException if the parser cannot be configured or the document cannot be parsed
+     */
+    public void parse(final InputSource is) {
+        try {
+            final SAXParserFactory factory = SAXParserFactory.newInstance();
+            // The document may come from an uploaded file, so never resolve external
+            // entities (XXE). The literals avoid a new import: the first one is the value
+            // of javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING.
+            factory.setFeature("http://javax.xml.XMLConstants/feature/secure-processing", true);
+            factory.setFeature("http://xml.org/sax/features/external-general-entities", false);
+            factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
+            final SAXParser parser = factory.newSAXParser();
+            parser.parse(is, this);
+        } catch (final Exception e) {
+            throw new GsaConfigException("Failed to parse XML file.", e);
+        }
+    }
+
+    /**
+     * Resets the per-document state: no label in progress, an empty label list and an empty tag stack.
+     */
+    @Override
+    public void startDocument() throws SAXException {
+        labelType = null;
+        labelList = new ArrayList<>();
+        tagQueue = new LinkedList<>();
+    }
+
+    /**
+     * Drops the intermediate parsing state (open-tag stack and collected global parameters).
+     * The parsed labels and crawl configurations are kept.
+     */
+    @Override
+    public void endDocument() throws SAXException {
+        tagQueue.clear();
+        globalParams.clear();
+    }
+
+    /**
+     * Handles an opening tag. The root element must be {@code eef}; a {@code collection}
+     * directly inside {@code collections} starts a new label named after its {@code Name}
+     * attribute. Every opened tag is pushed onto the tag stack.
+     *
+     * @throws GsaConfigException if the root element is not {@code eef}
+     */
+    @Override
+    public void startElement(final String uri, final String localName, final String qName, final Attributes attributes) throws SAXException {
+        if (logger.isDebugEnabled()) {
+            logger.debug("Start Element: " + qName);
+        }
+        if (tagQueue.isEmpty()) {
+            // nothing is open yet, so this is the document root
+            if (!"eef".equalsIgnoreCase(qName)) {
+                throw new GsaConfigException("Invalid format.");
+            }
+        } else if (COLLECTION.equalsIgnoreCase(qName) && COLLECTIONS.equalsIgnoreCase(tagQueue.peekLast())) {
+            final long timestamp = System.currentTimeMillis();
+            final String collectionName = attributes.getValue("Name");
+            final LabelType label = new LabelType();
+            label.setName(collectionName);
+            label.setValue(collectionName);
+            label.setCreatedBy(Constants.SYSTEM_USER);
+            label.setCreatedTime(timestamp);
+            label.setUpdatedBy(Constants.SYSTEM_USER);
+            label.setUpdatedTime(timestamp);
+            labelType = label;
+        }
+        tagQueue.offer(qName);
+    }
+
+    /**
+     * Handles a closing tag using the text collected since the previous closing tag.
+     * <ul>
+     * <li>{@code good_urls} / {@code bad_urls}: stored on the label in progress, or kept as a
+     * global parameter when the parent element is {@code globalparams}</li>
+     * <li>{@code start_urls} / {@code user_agent} under {@code globalparams}: remembered</li>
+     * <li>{@code collection}: the label in progress is added to the label list</li>
+     * <li>{@code globalparams}: the web and file crawl configurations are created</li>
+     * </ul>
+     * Afterwards the tag is popped from the stack and the text buffer is emptied.
+     */
+    @Override
+    public void endElement(final String uri, final String localName, final String qName) throws SAXException {
+        if (logger.isDebugEnabled()) {
+            logger.debug("End Element: " + qName);
+        }
+        final String text = textBuf.toString();
+        if (GOOD_URLS.equalsIgnoreCase(qName)) {
+            if (labelType != null) {
+                labelType.setIncludedPaths(parseFilterPaths(text, true, true));
+            } else if (isChildOfGlobalParams()) {
+                globalParams.put(GOOD_URLS, text);
+            }
+        } else if (BAD_URLS.equalsIgnoreCase(qName)) {
+            if (labelType != null) {
+                labelType.setExcludedPaths(parseFilterPaths(text, true, true));
+            } else if (isChildOfGlobalParams()) {
+                globalParams.put(BAD_URLS, text);
+            }
+        } else if (START_URLS.equalsIgnoreCase(qName) && isChildOfGlobalParams()) {
+            globalParams.put(START_URLS, text);
+        } else if (labelType != null && COLLECTION.equalsIgnoreCase(qName)) {
+            labelList.add(labelType);
+            labelType = null;
+        } else if (GLOBALPARAMS.equalsIgnoreCase(qName)) {
+            createCrawlConfigs(globalParams.get(START_URLS));
+        } else if ("user_agent".equalsIgnoreCase(qName) && isChildOfGlobalParams()) {
+            userAgent = text.trim();
+        }
+        tagQueue.pollLast();
+        textBuf.setLength(0);
+    }
+
+    /** Tells whether the parent of the element being closed is {@code globalparams}. */
+    private boolean isChildOfGlobalParams() {
+        // the element being closed is still the last entry, so its parent sits right before it
+        final int parentIndex = tagQueue.size() - 2;
+        return GLOBALPARAMS.equalsIgnoreCase(tagQueue.get(parentIndex));
+    }
+
+    /** Creates the web and/or file configuration from the newline-separated start URLs, if any. */
+    private void createCrawlConfigs(final String startUrls) {
+        if (startUrls == null) {
+            return;
+        }
+        final long now = System.currentTimeMillis();
+        final List<String> urlList =
+                split(startUrls, "\n").get(stream -> stream.map(String::trim).filter(StringUtil::isNotBlank).collect(Collectors.toList()));
+
+        final String webUrls = joinUrlsStartingWith(urlList, webProtocols);
+        if (StringUtil.isNotBlank(webUrls)) {
+            webConfig = newWebConfig(webUrls, now);
+        }
+
+        final String fileUrls = joinUrlsStartingWith(urlList, fileProtocols);
+        if (StringUtil.isNotBlank(fileUrls)) {
+            fileConfig = newFileConfig(fileUrls, now);
+        }
+    }
+
+    /** Joins, one per line, the URLs that start with any of the given protocol prefixes. */
+    private String joinUrlsStartingWith(final List<String> urlList, final String[] protocols) {
+        return urlList.stream().filter(url -> Arrays.stream(protocols).anyMatch(url::startsWith)).collect(Collectors.joining("\n"));
+    }
+
+    /** Builds the "Default" web crawl configuration for the given start URLs. */
+    private WebConfig newWebConfig(final String webUrls, final long now) {
+        final WebConfig config = new WebConfig();
+        config.setName("Default");
+        config.setAvailable(true);
+        config.setBoost(1.0f);
+        config.setConfigParameter(StringUtil.EMPTY);
+        config.setIntervalTime(1000);
+        config.setNumOfThread(3);
+        config.setSortOrder(1);
+        config.setUrls(webUrls);
+        config.setIncludedUrls(parseFilterPaths(globalParams.get(GOOD_URLS), true, false));
+        config.setIncludedDocUrls(StringUtil.EMPTY);
+        config.setExcludedUrls(parseFilterPaths(globalParams.get(BAD_URLS), true, false));
+        config.setExcludedDocUrls(StringUtil.EMPTY);
+        config.setUserAgent(userAgent);
+        config.setCreatedBy(Constants.SYSTEM_USER);
+        config.setCreatedTime(now);
+        config.setUpdatedBy(Constants.SYSTEM_USER);
+        config.setUpdatedTime(now);
+        return config;
+    }
+
+    /** Builds the "Default" file crawl configuration for the given start paths. */
+    private FileConfig newFileConfig(final String fileUrls, final long now) {
+        final FileConfig config = new FileConfig();
+        config.setName("Default");
+        config.setAvailable(true);
+        config.setBoost(1.0f);
+        config.setConfigParameter(StringUtil.EMPTY);
+        config.setIntervalTime(0);
+        config.setNumOfThread(5);
+        config.setSortOrder(2);
+        config.setPaths(fileUrls);
+        config.setIncludedPaths(parseFilterPaths(globalParams.get(GOOD_URLS), false, true));
+        config.setIncludedDocPaths(StringUtil.EMPTY);
+        config.setExcludedPaths(parseFilterPaths(globalParams.get(BAD_URLS), false, true));
+        config.setExcludedDocPaths(StringUtil.EMPTY);
+        config.setCreatedBy(Constants.SYSTEM_USER);
+        config.setCreatedTime(now);
+        config.setUpdatedBy(Constants.SYSTEM_USER);
+        config.setUpdatedTime(now);
+        return config;
+    }
+
+    /**
+     * Appends a chunk of character data to the text buffer; the buffer is consumed and
+     * emptied when the next closing tag is handled.
+     */
+    @Override
+    public void characters(final char[] ch, final int start, final int length) throws SAXException {
+        if (logger.isDebugEnabled()) {
+            logger.debug("Text: " + new String(ch, start, length));
+        }
+        textBuf.append(ch, start, length);
+    }
+
+    /**
+     * Converts newline-separated GSA URL patterns into newline-separated regular expressions.
+     * Blank lines and lines starting with {@code #} are dropped.
+     *
+     * @param text the raw pattern list, one pattern per line
+     * @param web whether results starting with a web protocol are kept
+     * @param file whether results starting with a file protocol are kept
+     * @return the converted patterns joined with {@code "\n"}
+     */
+    protected String parseFilterPaths(final String text, final boolean web, final boolean file) {
+        return split(text, "\n").get(lines -> lines.map(String::trim)//
+                .filter(StringUtil::isNotBlank)//
+                .map(this::toFilterPattern)//
+                .filter(pattern -> isAcceptedPattern(pattern, web, file))//
+                .collect(Collectors.joining("\n")));
+    }
+
+    /** Converts one trimmed, non-blank GSA pattern line to a regular expression, or null for a comment line. */
+    private String toFilterPattern(final String line) {
+        if (line.startsWith("#")) {
+            return null;
+        }
+        if (line.startsWith(CONTAINS)) {
+            return appendFileterPath(new StringBuilder(100), escape(line.substring(CONTAINS.length())));
+        }
+        // must be tested before REGEXP because "regexpIgnoreCase:" also starts with "regexp"
+        if (line.startsWith(REGEXP_IGNORE_CASE)) {
+            final StringBuilder buf = new StringBuilder(100).append("(?i)");
+            return appendFileterPath(buf, unescape(line.substring(REGEXP_IGNORE_CASE.length())));
+        }
+        if (line.startsWith(REGEXP)) {
+            return appendFileterPath(new StringBuilder(100), unescape(line.substring(REGEXP.length())));
+        }
+        if (startsWithAny(line, webProtocols) || startsWithAny(line, fileProtocols)) {
+            // a plain URL acts as a prefix match
+            return escape(line) + ".*";
+        }
+        return appendFileterPath(new StringBuilder(100), escape(line));
+    }
+
+    /** Decides whether a converted pattern is kept, based on the protocol it starts with. */
+    private boolean isAcceptedPattern(final String pattern, final boolean web, final boolean file) {
+        if (StringUtil.isBlank(pattern)) {
+            return false;
+        }
+        if (startsWithAny(pattern, webProtocols)) {
+            return web;
+        }
+        if (startsWithAny(pattern, fileProtocols)) {
+            return file;
+        }
+        return true;
+    }
+
+    /** Tells whether the value starts with at least one of the given prefixes. */
+    private boolean startsWithAny(final String value, final String[] prefixes) {
+        for (final String prefix : prefixes) {
+            if (value.startsWith(prefix)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Escapes regular-expression metacharacters in a literal GSA pattern by prefixing each
+     * of {@code . + * [ ] ( ) ?} with a backslash.
+     * <p>
+     * NOTE(review): {@code ^}, {@code $}, <code>{</code>, {@code |} and the backslash itself are
+     * left untouched, presumably so that GSA anchors keep working; confirm before extending.
+     *
+     * @param s the literal text to escape
+     * @return the text with the listed characters escaped
+     */
+    protected String escape(final String s) {
+        return s.replace(".", "\\.")//
+                .replace("+", "\\+")//
+                .replace("*", "\\*")//
+                .replace("[", "\\[")//
+                .replace("]", "\\]")//
+                .replace("(", "\\(")//
+                .replace(")", "\\)")// was replacing "(" a second time, so ")" was never escaped
+                .replace("?", "\\?");
+    }
+
+    /**
+     * Collapses every doubled backslash in the given text into a single backslash.
+     *
+     * @param s the text taken from a {@code regexp:} or {@code regexpIgnoreCase:} line
+     * @return the text with each pair of backslashes replaced by one
+     */
+    protected String unescape(final String s) {
+        final String doubledBackslash = "\\\\";
+        final String singleBackslash = "\\";
+        return s.replace(doubledBackslash, singleBackslash);
+    }
+
+    /**
+     * Appends the pattern to the buffer, padding it with {@code .*} on each side that is not
+     * anchored ({@code ^} at the start, {@code $} at the end).
+     *
+     * @param buf the buffer to append to; anything already in it stays in front
+     * @param v the pattern to append
+     * @return the content of the buffer after appending
+     */
+    protected String appendFileterPath(final StringBuilder buf, final String v) {
+        final String leading = v.startsWith("^") ? "" : ".*";
+        final String trailing = v.endsWith("$") ? "" : ".*";
+        return buf.append(leading).append(v).append(trailing).toString();
+    }
+
+    public void setWebProtocols(final String[] webProtocols) { // replaces the URL prefixes treated as web URLs (initially "http:" and "https:")
+        this.webProtocols = webProtocols; // the array is stored as given, not copied
+    }
+
+    public void setFileProtocols(final String[] fileProtocols) { // replaces the URL prefixes treated as file URLs (initially "file:" and "smb:")
+        this.fileProtocols = fileProtocols; // the array is stored as given, not copied
+    }
+
+    /**
+     * Returns a debug representation listing the parsed labels and both crawl configurations.
+     */
+    @Override
+    public String toString() {
+        final StringBuilder buf = new StringBuilder();
+        buf.append("GsaConfigParser [labelList=").append(labelList);
+        buf.append(", webConfig=").append(webConfig);
+        buf.append(", fileConfig=").append(fileConfig);
+        buf.append("]");
+        return buf.toString();
+    }
+
+    public OptionalEntity<WebConfig> getWebConfig() { // web crawl config held by this parser; webConfig stays null until one is built
+        return OptionalUtil.ofNullable(webConfig);
+    }
+
+    public OptionalEntity<FileConfig> getFileConfig() { // file crawl config held by this parser; fileConfig stays null until one is built
+        return OptionalUtil.ofNullable(fileConfig);
+    }
+
+    /**
+     * Returns the labels created from the {@code collection} elements of the parsed document.
+     *
+     * @return a new array with the parsed labels; empty when no document has been parsed yet
+     */
+    public LabelType[] getLabelTypes() {
+        // labelList is only created in startDocument, so it is still null if parsing never started
+        if (labelList == null) {
+            return new LabelType[0];
+        }
+        return labelList.toArray(new LabelType[labelList.size()]);
+    }
+
+}

+ 2 - 0
src/main/resources/app.xml

@@ -383,5 +383,7 @@
 	</component>
 	<component name="queryResponseList" class="org.codelibs.fess.util.QueryResponseList" instance="prototype">
 	</component>
+	<component name="gsaConfigParser" class="org.codelibs.fess.util.GsaConfigParser" instance="prototype">
+	</component>
 
 </components>

+ 43 - 0
src/test/java/org/codelibs/fess/util/GsaConfigParserTest.java

@@ -0,0 +1,43 @@
+/*
+ * Copyright 2012-2018 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.util;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.codelibs.core.io.ResourceUtil;
+import org.codelibs.fess.es.config.exentity.LabelType;
+import org.codelibs.fess.unit.UnitFessTestCase;
+import org.xml.sax.InputSource;
+
+/**
+ * Test for {@link GsaConfigParser} based on the sample file {@code data/gsaconfig.xml}.
+ */
+public class GsaConfigParserTest extends UnitFessTestCase {
+
+    /**
+     * Parses the sample file and expects a web configuration, a file configuration and three labels.
+     */
+    public void test_parse() throws IOException {
+        final GsaConfigParser configParser = new GsaConfigParser();
+        try (InputStream in = ResourceUtil.getResourceAsStream("data/gsaconfig.xml")) {
+            final InputSource source = new InputSource(in);
+            configParser.parse(source);
+        }
+
+        configParser.getWebConfig().ifPresent(webConfig -> {
+            System.out.println(webConfig.toString());
+        }).orElse(() -> fail());
+
+        configParser.getFileConfig().ifPresent(fileConfig -> {
+            System.out.println(fileConfig.toString());
+        }).orElse(() -> fail());
+
+        final int labelCount = configParser.getLabelTypes().length;
+        assertEquals(3, labelCount);
+    }
+
+}

+ 86 - 0
src/test/resources/data/gsaconfig.xml

@@ -0,0 +1,86 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<eef>
+	<config Schema="2.0" EnterpriseVersion="'7.6.50'">
+		<collections Count="3">
+			<collection Name="fess">
+				<bad_urls><![CDATA[
+https://fess.codelibs.org/images/
+              ]]></bad_urls>
+				<good_urls><![CDATA[
+https://fess.codelibs.org/
+https://www.codelibs.org/
+              ]]></good_urls>
+				<prerequisite_results><![CDATA[
+20
+              ]]></prerequisite_results>
+				<testwords><![CDATA[
+
+              ]]></testwords>
+			</collection>
+			<collection Name="n2sm">
+				<bad_urls><![CDATA[
+contains:\\.xml$
+              ]]></bad_urls>
+				<good_urls><![CDATA[
+https://www.n2sm.net/
+              ]]></good_urls>
+				<prerequisite_results><![CDATA[
+20
+              ]]></prerequisite_results>
+				<testwords><![CDATA[
+
+              ]]></testwords>
+			</collection>
+			<collection Name="smb">
+				<bad_urls><![CDATA[
+smb://storage/sample/
+              ]]></bad_urls>
+				<good_urls><![CDATA[
+smb://storage/
+              ]]></good_urls>
+				<prerequisite_results><![CDATA[
+20
+              ]]></prerequisite_results>
+				<testwords><![CDATA[
+
+              ]]></testwords>
+			</collection>
+		</collections>
+		<globalparams>
+			<bad_urls><![CDATA[
+contains:/images/
+contains:?
+contains:\\.xml$
+# test
+regexp:/([^/]*)/\\1/\\1/
+.gif$
+.jpg$
+.jpeg$
+.png$
+regexpIgnoreCase:\\.dll$
+regexpIgnoreCase:\\.exe$
+/?S=A$
+/?S=D$
+contains:\001
+contains:\002
+contains:\003
+.html/$
+
+          ]]></bad_urls>
+			<good_urls><![CDATA[
+https://fess.codelibs.org/
+https://www.codelibs.org/
+https://www.n2sm.net/
+smb://storage/
+
+          ]]></good_urls>
+			<start_urls><![CDATA[
+https://fess.codelibs.org/
+https://www.codelibs.org/
+https://www.n2sm.net/
+smb://storage/
+
+          ]]></start_urls>
+		</globalparams>
+	</config>
+</eef>