Browse Source

fix #2861 Add Kryo support for temporary data serialization in the crawler

Shinsuke Sugaya 6 months ago
parent
commit
80e9cb0ddf

+ 5 - 0
pom.xml

@@ -1355,6 +1355,11 @@
 			<artifactId>bcprov-jdk18on</artifactId>
 			<version>${bouncycastle.version}</version>
 		</dependency>
+		<dependency>
+			<groupId>com.esotericsoftware</groupId>
+			<artifactId>kryo</artifactId>
+			<version>${kryo.version}</version>
+		</dependency>
 
 		<!-- suggest library -->
 		<dependency>

+ 0 - 6
src/main/java/org/codelibs/fess/app/web/admin/upgrade/AdminUpgradeAction.java

@@ -18,18 +18,13 @@ package org.codelibs.fess.app.web.admin.upgrade;
 import java.io.IOException;
 import java.util.List;
 import java.util.Map;
-import java.util.stream.Collectors;
 
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
-import org.codelibs.core.lang.StringUtil;
-import org.codelibs.core.stream.StreamUtil;
 import org.codelibs.curl.CurlResponse;
-import org.codelibs.fess.Constants;
 import org.codelibs.fess.annotation.Secured;
 import org.codelibs.fess.app.service.ScheduledJobService;
 import org.codelibs.fess.app.web.base.FessAdminAction;
-import org.codelibs.fess.mylasta.direction.FessConfig;
 import org.codelibs.fess.opensearch.client.SearchEngineClient;
 import org.codelibs.fess.opensearch.config.exbhv.DataConfigBhv;
 import org.codelibs.fess.opensearch.config.exbhv.ElevateWordBhv;
@@ -39,7 +34,6 @@ import org.codelibs.fess.opensearch.config.exbhv.RoleTypeBhv;
 import org.codelibs.fess.opensearch.config.exbhv.WebConfigBhv;
 import org.codelibs.fess.opensearch.user.exbhv.RoleBhv;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.fess.util.UpgradeUtil;
 import org.codelibs.opensearch.runner.net.OpenSearchCurl;
 import org.lastaflute.web.Execute;
 import org.lastaflute.web.response.HtmlResponse;

+ 93 - 0
src/main/java/org/codelibs/fess/crawler/serializer/DataSerializer.java

@@ -0,0 +1,93 @@
+/*
+ * Copyright 2012-2024 CodeLibs Project and the Others.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+ * either express or implied. See the License for the specific language
+ * governing permissions and limitations under the License.
+ */
+package org.codelibs.fess.crawler.serializer;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.codelibs.core.exception.IORuntimeException;
+import org.codelibs.core.io.SerializeUtil;
+import org.codelibs.fess.util.ComponentUtil;
+
+import com.esotericsoftware.kryo.Kryo;
+import com.esotericsoftware.kryo.io.Input;
+import com.esotericsoftware.kryo.io.Output;
+
+public class DataSerializer {
+
+    private static final Logger logger = LogManager.getLogger(DataSerializer.class);
+
+    protected static final String JAVABIN = "javabin";
+
+    protected static final String KRYO = "kryo";
+
+    protected final ThreadLocal<Kryo> kryoThreadLocal;
+
+    public DataSerializer() {
+        kryoThreadLocal = ThreadLocal.withInitial(() -> {
+            final Kryo kryo = new Kryo();
+            // TODO use kryo.register
+            kryo.setRegistrationRequired(false);
+            if (logger.isDebugEnabled()) {
+                kryo.setWarnUnregisteredClasses(true);
+            }
+            return kryo;
+        });
+    }
+
+    protected String getSerializerType() {
+        return ComponentUtil.getFessConfig().getCrawlerDataSerializer();
+    }
+
+    public byte[] fromObjectToBinary(final Object obj) {
+        final String serializer = getSerializerType();
+        return switch (serializer) {
+        case KRYO -> serializeWithKryo(obj);
+        case JAVABIN -> SerializeUtil.fromObjectToBinary(obj);
+        default -> throw new IllegalArgumentException("Unexpected value: " + serializer);
+        };
+    }
+
+    public Object fromBinaryToObject(final byte[] bytes) {
+        final String serializer = getSerializerType();
+        return switch (serializer) {
+        case KRYO -> deserializeWithKryo(bytes);
+        case JAVABIN -> SerializeUtil.fromBinaryToObject(bytes);
+        default -> throw new IllegalArgumentException("Unexpected value: " + serializer);
+        };
+    }
+
+    protected byte[] serializeWithKryo(final Object obj) {
+        final Kryo kryo = kryoThreadLocal.get();
+        try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); final Output output = new Output(baos)) {
+            kryo.writeClassAndObject(output, obj);
+            output.flush();
+            return baos.toByteArray();
+        } catch (final IOException e) {
+            throw new IORuntimeException(e);
+        }
+    }
+
+    protected Object deserializeWithKryo(final byte[] bytes) {
+        final Kryo kryo = kryoThreadLocal.get();
+        try (final Input input = new Input(new ByteArrayInputStream(bytes))) {
+            return kryo.readClassAndObject(input);
+        }
+    }
+}

+ 5 - 3
src/main/java/org/codelibs/fess/crawler/transformer/AbstractFessFileTransformer.java

@@ -29,7 +29,6 @@ import java.util.stream.Collectors;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
-import org.codelibs.core.io.SerializeUtil;
 import org.codelibs.core.lang.StringUtil;
 import org.codelibs.core.misc.Tuple3;
 import org.codelibs.fess.Constants;
@@ -42,6 +41,7 @@ import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.CrawlingAccessException;
 import org.codelibs.fess.crawler.extractor.Extractor;
 import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
+import org.codelibs.fess.crawler.serializer.DataSerializer;
 import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer;
 import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
 import org.codelibs.fess.crawler.util.FieldConfigs;
@@ -68,6 +68,8 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
 
     protected FessConfig fessConfig;
 
+    protected DataSerializer dataSerializer;
+
     protected abstract Extractor getExtractor(ResponseData responseData);
 
     @Override
@@ -79,7 +81,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         final ResultData resultData = new ResultData();
         resultData.setTransformerName(getName());
         try {
-            resultData.setData(SerializeUtil.fromObjectToBinary(generateData(responseData)));
+            resultData.setData(dataSerializer.fromObjectToBinary(generateData(responseData)));
         } catch (final Exception e) {
             throw new CrawlingAccessException("Could not serialize object", e);
         }
@@ -485,7 +487,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
         final byte[] data = accessResultData.getData();
         if (data != null) {
             try {
-                return SerializeUtil.fromBinaryToObject(data);
+                return dataSerializer.fromBinaryToObject(data);
             } catch (final Exception e) {
                 throw new CrawlerSystemException("Could not create an instanced from bytes.", e);
             }

+ 1 - 0
src/main/java/org/codelibs/fess/crawler/transformer/FessFileTransformer.java

@@ -35,6 +35,7 @@ public class FessFileTransformer extends AbstractFessFileTransformer {
             logger.debug("Initialize {}", this.getClass().getSimpleName());
         }
         fessConfig = ComponentUtil.getFessConfig();
+        dataSerializer = ComponentUtil.getComponent("dataSerializer");
     }
 
     @Override

+ 1 - 0
src/main/java/org/codelibs/fess/crawler/transformer/FessStandardTransformer.java

@@ -35,6 +35,7 @@ public class FessStandardTransformer extends AbstractFessFileTransformer {
             logger.debug("Initialize {}", this.getClass().getSimpleName());
         }
         fessConfig = ComponentUtil.getFessConfig();
+        dataSerializer = ComponentUtil.getComponent("dataSerializer");
     }
 
     @Override

+ 1 - 0
src/main/java/org/codelibs/fess/crawler/transformer/FessTransformer.java

@@ -263,4 +263,5 @@ public interface FessTransformer {
         }
         return newDataMap;
     }
+
 }

+ 6 - 3
src/main/java/org/codelibs/fess/crawler/transformer/FessXpathTransformer.java

@@ -42,7 +42,6 @@ import javax.xml.xpath.XPathNodes;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.codelibs.core.io.InputStreamUtil;
-import org.codelibs.core.io.SerializeUtil;
 import org.codelibs.core.lang.StringUtil;
 import org.codelibs.core.misc.Pair;
 import org.codelibs.core.misc.ValueHolder;
@@ -56,6 +55,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
 import org.codelibs.fess.crawler.exception.ChildUrlsException;
 import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 import org.codelibs.fess.crawler.exception.CrawlingAccessException;
+import org.codelibs.fess.crawler.serializer.DataSerializer;
 import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
 import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
 import org.codelibs.fess.crawler.util.FieldConfigs;
@@ -109,6 +109,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
 
     protected FessConfig fessConfig;
 
+    protected DataSerializer dataSerializer;
+
     protected boolean useGoogleOffOn = true;
 
     protected Map<String, Boolean> fieldPrunedRuleMap = new HashMap<>();
@@ -121,6 +123,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
             logger.debug("Initialize {}", this.getClass().getSimpleName());
         }
         fessConfig = ComponentUtil.getFessConfig();
+        dataSerializer = ComponentUtil.getComponent("dataSerializer");
     }
 
     @Override
@@ -193,7 +196,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         normalizeData(responseData, dataMap);
 
         try {
-            resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
+            resultData.setData(dataSerializer.fromObjectToBinary(dataMap));
         } catch (final Exception e) {
             throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
         }
@@ -816,7 +819,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
         final byte[] data = accessResultData.getData();
         if (data != null) {
             try {
-                return SerializeUtil.fromBinaryToObject(data);
+                return dataSerializer.fromBinaryToObject(data);
             } catch (final Exception e) {
                 throw new CrawlerSystemException("Could not create an instanced from bytes.", e);
             }

+ 15 - 0
src/main/java/org/codelibs/fess/mylasta/direction/FessConfig.java

@@ -319,6 +319,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
     /** The key of the configuration. e.g. 0 */
     String CRAWLER_HTTP_thread_pool_SIZE = "crawler.http.thread_pool.size";
 
+    /** The key of the configuration. e.g. kryo */
+    String CRAWLER_DATA_SERIALIZER = "crawler.data.serializer";
+
     /** The key of the configuration. e.g. 100 */
     String CRAWLER_DOCUMENT_MAX_SITE_LENGTH = "crawler.document.max.site.length";
 
@@ -2687,6 +2690,13 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
      */
     Integer getCrawlerHttpThreadPoolSizeAsInteger();
 
+    /**
+     * Get the value for the key 'crawler.data.serializer'. <br>
+     * The value is, e.g. kryo <br>
+     * @return The value of found property. (NotNull: if not found, exception but basically no way)
+     */
+    String getCrawlerDataSerializer();
+
     /**
      * Get the value for the key 'crawler.document.max.site.length'. <br>
      * The value is, e.g. 100 <br>
@@ -8259,6 +8269,10 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             return getAsInteger(FessConfig.CRAWLER_HTTP_thread_pool_SIZE);
         }
 
+        public String getCrawlerDataSerializer() {
+            return get(FessConfig.CRAWLER_DATA_SERIALIZER);
+        }
+
         public String getCrawlerDocumentMaxSiteLength() {
             return get(FessConfig.CRAWLER_DOCUMENT_MAX_SITE_LENGTH);
         }
@@ -11095,6 +11109,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
             defaultMap.put(FessConfig.HTTP_FILEUPLOAD_MAX_FILE_COUNT, "10");
             defaultMap.put(FessConfig.CRAWLER_DEFAULT_SCRIPT, "groovy");
             defaultMap.put(FessConfig.CRAWLER_HTTP_thread_pool_SIZE, "0");
+            defaultMap.put(FessConfig.CRAWLER_DATA_SERIALIZER, "kryo");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_MAX_SITE_LENGTH, "100");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_SITE_ENCODING, "UTF-8");
             defaultMap.put(FessConfig.CRAWLER_DOCUMENT_UNKNOWN_HOSTNAME, "unknown");

+ 3 - 0
src/main/resources/crawler/transformer.xml

@@ -35,4 +35,7 @@
 	<component name="fessStandardTransformer" class="org.codelibs.fess.crawler.transformer.FessStandardTransformer" instance="singleton">
 		<property name="name">"fessStandardTransformer"</property>
 	</component>
+
+	<component name="dataSerializer" class="org.codelibs.fess.crawler.serializer.DataSerializer" instance="singleton">
+	</component>
 </components>

+ 1 - 0
src/main/resources/fess_config.properties

@@ -204,6 +204,7 @@ http.fileupload.max.file.count=10
 # common
 crawler.default.script=groovy
 crawler.http.thread_pool.size=0
+crawler.data.serializer=kryo
 crawler.document.max.site.length=100
 crawler.document.site.encoding=UTF-8
 crawler.document.unknown.hostname=unknown

+ 15 - 0
src/test/java/org/codelibs/fess/crawler/transformer/FessFileTransformerTest.java

@@ -21,12 +21,26 @@ import java.util.Map;
 
 import org.apache.groovy.util.Maps;
 import org.codelibs.fess.Constants;
+import org.codelibs.fess.crawler.serializer.DataSerializer;
 import org.codelibs.fess.crawler.util.FieldConfigs;
 import org.codelibs.fess.exception.FessSystemException;
 import org.codelibs.fess.unit.UnitFessTestCase;
+import org.codelibs.fess.util.ComponentUtil;
 
 public class FessFileTransformerTest extends UnitFessTestCase {
 
+    @Override
+    public void setUp() throws Exception {
+        super.setUp();
+        ComponentUtil.register(new DataSerializer(), "dataSerializer");
+    }
+
+    @Override
+    public void tearDown() throws Exception {
+        ComponentUtil.setFessConfig(null);
+        super.tearDown();
+    }
+
     private String encodeUrl(final String url) {
         try {
             return URLEncoder.encode(url, Constants.UTF_8);
@@ -292,4 +306,5 @@ public class FessFileTransformerTest extends UnitFessTestCase {
         transformer.init();
         return transformer;
     }
+
 }

+ 13 - 0
src/test/java/org/codelibs/fess/crawler/transformer/FessXpathTransformerTest.java

@@ -44,6 +44,7 @@ import org.codelibs.fess.crawler.entity.RequestData;
 import org.codelibs.fess.crawler.entity.ResponseData;
 import org.codelibs.fess.crawler.entity.ResultData;
 import org.codelibs.fess.crawler.exception.ChildUrlsException;
+import org.codelibs.fess.crawler.serializer.DataSerializer;
 import org.codelibs.fess.crawler.util.FieldConfigs;
 import org.codelibs.fess.helper.CrawlingConfigHelper;
 import org.codelibs.fess.helper.CrawlingInfoHelper;
@@ -70,6 +71,18 @@ import org.xml.sax.InputSource;
 public class FessXpathTransformerTest extends UnitFessTestCase {
     private static final Logger logger = LogManager.getLogger(FessXpathTransformerTest.class);
 
+    @Override
+    public void setUp() throws Exception {
+        super.setUp();
+        ComponentUtil.register(new DataSerializer(), "dataSerializer");
+    }
+
+    @Override
+    public void tearDown() throws Exception {
+        ComponentUtil.setFessConfig(null);
+        super.tearDown();
+    }
+
     public void test_transform() throws Exception {
         String data = "<html><head><title>Test</title></head><body><h1>Header1</h1><p>This is a pen.</p></body></html>";