fix #2861 Add Kryo support for temporary data serialization in the crawler
This commit is contained in:
parent
c3514c5b3a
commit
80e9cb0ddf
13 changed files with 159 additions and 12 deletions
5
pom.xml
5
pom.xml
|
@ -1355,6 +1355,11 @@
|
|||
<artifactId>bcprov-jdk18on</artifactId>
|
||||
<version>${bouncycastle.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>com.esotericsoftware</groupId>
|
||||
<artifactId>kryo</artifactId>
|
||||
<version>${kryo.version}</version>
|
||||
</dependency>
|
||||
|
||||
<!-- suggest library -->
|
||||
<dependency>
|
||||
|
|
|
@ -18,18 +18,13 @@ package org.codelibs.fess.app.web.admin.upgrade;
|
|||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.core.stream.StreamUtil;
|
||||
import org.codelibs.curl.CurlResponse;
|
||||
import org.codelibs.fess.Constants;
|
||||
import org.codelibs.fess.annotation.Secured;
|
||||
import org.codelibs.fess.app.service.ScheduledJobService;
|
||||
import org.codelibs.fess.app.web.base.FessAdminAction;
|
||||
import org.codelibs.fess.mylasta.direction.FessConfig;
|
||||
import org.codelibs.fess.opensearch.client.SearchEngineClient;
|
||||
import org.codelibs.fess.opensearch.config.exbhv.DataConfigBhv;
|
||||
import org.codelibs.fess.opensearch.config.exbhv.ElevateWordBhv;
|
||||
|
@ -39,7 +34,6 @@ import org.codelibs.fess.opensearch.config.exbhv.RoleTypeBhv;
|
|||
import org.codelibs.fess.opensearch.config.exbhv.WebConfigBhv;
|
||||
import org.codelibs.fess.opensearch.user.exbhv.RoleBhv;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
import org.codelibs.fess.util.UpgradeUtil;
|
||||
import org.codelibs.opensearch.runner.net.OpenSearchCurl;
|
||||
import org.lastaflute.web.Execute;
|
||||
import org.lastaflute.web.response.HtmlResponse;
|
||||
|
|
|
@ -0,0 +1,93 @@
|
|||
/*
|
||||
* Copyright 2012-2024 CodeLibs Project and the Others.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific language
|
||||
* governing permissions and limitations under the License.
|
||||
*/
|
||||
package org.codelibs.fess.crawler.serializer;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.codelibs.core.exception.IORuntimeException;
|
||||
import org.codelibs.core.io.SerializeUtil;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
|
||||
import com.esotericsoftware.kryo.Kryo;
|
||||
import com.esotericsoftware.kryo.io.Input;
|
||||
import com.esotericsoftware.kryo.io.Output;
|
||||
|
||||
public class DataSerializer {
|
||||
|
||||
private static final Logger logger = LogManager.getLogger(DataSerializer.class);
|
||||
|
||||
protected static final String JAVABIN = "javabin";
|
||||
|
||||
protected static final String KRYO = "kryo";
|
||||
|
||||
protected final ThreadLocal<Kryo> kryoThreadLocal;
|
||||
|
||||
public DataSerializer() {
|
||||
kryoThreadLocal = ThreadLocal.withInitial(() -> {
|
||||
final Kryo kryo = new Kryo();
|
||||
// TODO use kryo.register
|
||||
kryo.setRegistrationRequired(false);
|
||||
if (logger.isDebugEnabled()) {
|
||||
kryo.setWarnUnregisteredClasses(true);
|
||||
}
|
||||
return kryo;
|
||||
});
|
||||
}
|
||||
|
||||
protected String getSerializerType() {
|
||||
return ComponentUtil.getFessConfig().getCrawlerDataSerializer();
|
||||
}
|
||||
|
||||
public byte[] fromObjectToBinary(final Object obj) {
|
||||
final String serializer = getSerializerType();
|
||||
return switch (serializer) {
|
||||
case KRYO -> serializeWithKryo(obj);
|
||||
case JAVABIN -> SerializeUtil.fromObjectToBinary(obj);
|
||||
default -> throw new IllegalArgumentException("Unexpected value: " + serializer);
|
||||
};
|
||||
}
|
||||
|
||||
public Object fromBinaryToObject(final byte[] bytes) {
|
||||
final String serializer = getSerializerType();
|
||||
return switch (serializer) {
|
||||
case KRYO -> deserializeWithKryo(bytes);
|
||||
case JAVABIN -> SerializeUtil.fromBinaryToObject(bytes);
|
||||
default -> throw new IllegalArgumentException("Unexpected value: " + serializer);
|
||||
};
|
||||
}
|
||||
|
||||
protected byte[] serializeWithKryo(final Object obj) {
|
||||
final Kryo kryo = kryoThreadLocal.get();
|
||||
try (ByteArrayOutputStream baos = new ByteArrayOutputStream(); final Output output = new Output(baos)) {
|
||||
kryo.writeClassAndObject(output, obj);
|
||||
output.flush();
|
||||
return baos.toByteArray();
|
||||
} catch (final IOException e) {
|
||||
throw new IORuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
protected Object deserializeWithKryo(final byte[] bytes) {
|
||||
final Kryo kryo = kryoThreadLocal.get();
|
||||
try (final Input input = new Input(new ByteArrayInputStream(bytes))) {
|
||||
return kryo.readClassAndObject(input);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -29,7 +29,6 @@ import java.util.stream.Collectors;
|
|||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.codelibs.core.io.SerializeUtil;
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.core.misc.Tuple3;
|
||||
import org.codelibs.fess.Constants;
|
||||
|
@ -42,6 +41,7 @@ import org.codelibs.fess.crawler.exception.CrawlerSystemException;
|
|||
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
|
||||
import org.codelibs.fess.crawler.extractor.Extractor;
|
||||
import org.codelibs.fess.crawler.extractor.impl.TikaExtractor;
|
||||
import org.codelibs.fess.crawler.serializer.DataSerializer;
|
||||
import org.codelibs.fess.crawler.transformer.impl.AbstractTransformer;
|
||||
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
|
||||
import org.codelibs.fess.crawler.util.FieldConfigs;
|
||||
|
@ -68,6 +68,8 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
|
|||
|
||||
protected FessConfig fessConfig;
|
||||
|
||||
protected DataSerializer dataSerializer;
|
||||
|
||||
protected abstract Extractor getExtractor(ResponseData responseData);
|
||||
|
||||
@Override
|
||||
|
@ -79,7 +81,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
|
|||
final ResultData resultData = new ResultData();
|
||||
resultData.setTransformerName(getName());
|
||||
try {
|
||||
resultData.setData(SerializeUtil.fromObjectToBinary(generateData(responseData)));
|
||||
resultData.setData(dataSerializer.fromObjectToBinary(generateData(responseData)));
|
||||
} catch (final Exception e) {
|
||||
throw new CrawlingAccessException("Could not serialize object", e);
|
||||
}
|
||||
|
@ -485,7 +487,7 @@ public abstract class AbstractFessFileTransformer extends AbstractTransformer im
|
|||
final byte[] data = accessResultData.getData();
|
||||
if (data != null) {
|
||||
try {
|
||||
return SerializeUtil.fromBinaryToObject(data);
|
||||
return dataSerializer.fromBinaryToObject(data);
|
||||
} catch (final Exception e) {
|
||||
throw new CrawlerSystemException("Could not create an instanced from bytes.", e);
|
||||
}
|
||||
|
|
|
@ -35,6 +35,7 @@ public class FessFileTransformer extends AbstractFessFileTransformer {
|
|||
logger.debug("Initialize {}", this.getClass().getSimpleName());
|
||||
}
|
||||
fessConfig = ComponentUtil.getFessConfig();
|
||||
dataSerializer = ComponentUtil.getComponent("dataSerializer");
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -35,6 +35,7 @@ public class FessStandardTransformer extends AbstractFessFileTransformer {
|
|||
logger.debug("Initialize {}", this.getClass().getSimpleName());
|
||||
}
|
||||
fessConfig = ComponentUtil.getFessConfig();
|
||||
dataSerializer = ComponentUtil.getComponent("dataSerializer");
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -263,4 +263,5 @@ public interface FessTransformer {
|
|||
}
|
||||
return newDataMap;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -42,7 +42,6 @@ import javax.xml.xpath.XPathNodes;
|
|||
import org.apache.logging.log4j.LogManager;
|
||||
import org.apache.logging.log4j.Logger;
|
||||
import org.codelibs.core.io.InputStreamUtil;
|
||||
import org.codelibs.core.io.SerializeUtil;
|
||||
import org.codelibs.core.lang.StringUtil;
|
||||
import org.codelibs.core.misc.Pair;
|
||||
import org.codelibs.core.misc.ValueHolder;
|
||||
|
@ -56,6 +55,7 @@ import org.codelibs.fess.crawler.entity.UrlQueue;
|
|||
import org.codelibs.fess.crawler.exception.ChildUrlsException;
|
||||
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
|
||||
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
|
||||
import org.codelibs.fess.crawler.serializer.DataSerializer;
|
||||
import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
|
||||
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
|
||||
import org.codelibs.fess.crawler.util.FieldConfigs;
|
||||
|
@ -109,6 +109,8 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
|
||||
protected FessConfig fessConfig;
|
||||
|
||||
protected DataSerializer dataSerializer;
|
||||
|
||||
protected boolean useGoogleOffOn = true;
|
||||
|
||||
protected Map<String, Boolean> fieldPrunedRuleMap = new HashMap<>();
|
||||
|
@ -121,6 +123,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
logger.debug("Initialize {}", this.getClass().getSimpleName());
|
||||
}
|
||||
fessConfig = ComponentUtil.getFessConfig();
|
||||
dataSerializer = ComponentUtil.getComponent("dataSerializer");
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -193,7 +196,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
normalizeData(responseData, dataMap);
|
||||
|
||||
try {
|
||||
resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
|
||||
resultData.setData(dataSerializer.fromObjectToBinary(dataMap));
|
||||
} catch (final Exception e) {
|
||||
throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
|
||||
}
|
||||
|
@ -816,7 +819,7 @@ public class FessXpathTransformer extends XpathTransformer implements FessTransf
|
|||
final byte[] data = accessResultData.getData();
|
||||
if (data != null) {
|
||||
try {
|
||||
return SerializeUtil.fromBinaryToObject(data);
|
||||
return dataSerializer.fromBinaryToObject(data);
|
||||
} catch (final Exception e) {
|
||||
throw new CrawlerSystemException("Could not create an instanced from bytes.", e);
|
||||
}
|
||||
|
|
|
@ -319,6 +319,9 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
/** The key of the configuration. e.g. 0 */
|
||||
String CRAWLER_HTTP_thread_pool_SIZE = "crawler.http.thread_pool.size";
|
||||
|
||||
/** The key of the configuration. e.g. kryo */
|
||||
String CRAWLER_DATA_SERIALIZER = "crawler.data.serializer";
|
||||
|
||||
/** The key of the configuration. e.g. 100 */
|
||||
String CRAWLER_DOCUMENT_MAX_SITE_LENGTH = "crawler.document.max.site.length";
|
||||
|
||||
|
@ -2687,6 +2690,13 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
*/
|
||||
Integer getCrawlerHttpThreadPoolSizeAsInteger();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.data.serializer'. <br>
|
||||
* The value is, e.g. kryo <br>
|
||||
* @return The value of found property. (NotNull: if not found, exception but basically no way)
|
||||
*/
|
||||
String getCrawlerDataSerializer();
|
||||
|
||||
/**
|
||||
* Get the value for the key 'crawler.document.max.site.length'. <br>
|
||||
* The value is, e.g. 100 <br>
|
||||
|
@ -8259,6 +8269,10 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
return getAsInteger(FessConfig.CRAWLER_HTTP_thread_pool_SIZE);
|
||||
}
|
||||
|
||||
/**
 * Returns the value stored under the key 'crawler.data.serializer'
 * (FessConfig.CRAWLER_DATA_SERIALIZER).
 */
public String getCrawlerDataSerializer() {
|
||||
return get(FessConfig.CRAWLER_DATA_SERIALIZER);
|
||||
}
|
||||
|
||||
/**
 * Returns the value stored under the key 'crawler.document.max.site.length'
 * (FessConfig.CRAWLER_DOCUMENT_MAX_SITE_LENGTH) as a String.
 */
public String getCrawlerDocumentMaxSiteLength() {
|
||||
return get(FessConfig.CRAWLER_DOCUMENT_MAX_SITE_LENGTH);
|
||||
}
|
||||
|
@ -11095,6 +11109,7 @@ public interface FessConfig extends FessEnv, org.codelibs.fess.mylasta.direction
|
|||
defaultMap.put(FessConfig.HTTP_FILEUPLOAD_MAX_FILE_COUNT, "10");
|
||||
defaultMap.put(FessConfig.CRAWLER_DEFAULT_SCRIPT, "groovy");
|
||||
defaultMap.put(FessConfig.CRAWLER_HTTP_thread_pool_SIZE, "0");
|
||||
defaultMap.put(FessConfig.CRAWLER_DATA_SERIALIZER, "kryo");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_MAX_SITE_LENGTH, "100");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_SITE_ENCODING, "UTF-8");
|
||||
defaultMap.put(FessConfig.CRAWLER_DOCUMENT_UNKNOWN_HOSTNAME, "unknown");
|
||||
|
|
|
@ -35,4 +35,7 @@
|
|||
<component name="fessStandardTransformer" class="org.codelibs.fess.crawler.transformer.FessStandardTransformer" instance="singleton">
|
||||
<property name="name">"fessStandardTransformer"</property>
|
||||
</component>
|
||||
|
||||
<component name="dataSerializer" class="org.codelibs.fess.crawler.serializer.DataSerializer" instance="singleton">
|
||||
</component>
|
||||
</components>
|
||||
|
|
|
@ -204,6 +204,7 @@ http.fileupload.max.file.count=10
|
|||
# common
|
||||
crawler.default.script=groovy
|
||||
crawler.http.thread_pool.size=0
|
||||
crawler.data.serializer=kryo
|
||||
crawler.document.max.site.length=100
|
||||
crawler.document.site.encoding=UTF-8
|
||||
crawler.document.unknown.hostname=unknown
|
||||
|
|
|
@ -21,12 +21,26 @@ import java.util.Map;
|
|||
|
||||
import org.apache.groovy.util.Maps;
|
||||
import org.codelibs.fess.Constants;
|
||||
import org.codelibs.fess.crawler.serializer.DataSerializer;
|
||||
import org.codelibs.fess.crawler.util.FieldConfigs;
|
||||
import org.codelibs.fess.exception.FessSystemException;
|
||||
import org.codelibs.fess.unit.UnitFessTestCase;
|
||||
import org.codelibs.fess.util.ComponentUtil;
|
||||
|
||||
public class FessFileTransformerTest extends UnitFessTestCase {
|
||||
|
||||
/**
 * Runs the base test setup, then registers a new DataSerializer under the
 * component name "dataSerializer".
 */
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
// Registered after the base setup has run.
ComponentUtil.register(new DataSerializer(), "dataSerializer");
|
||||
}
|
||||
|
||||
/**
 * Resets the FessConfig held by ComponentUtil to null, then runs the base
 * test tear-down.
 */
@Override
|
||||
public void tearDown() throws Exception {
|
||||
ComponentUtil.setFessConfig(null);
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
private String encodeUrl(final String url) {
|
||||
try {
|
||||
return URLEncoder.encode(url, Constants.UTF_8);
|
||||
|
@ -292,4 +306,5 @@ public class FessFileTransformerTest extends UnitFessTestCase {
|
|||
transformer.init();
|
||||
return transformer;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -44,6 +44,7 @@ import org.codelibs.fess.crawler.entity.RequestData;
|
|||
import org.codelibs.fess.crawler.entity.ResponseData;
|
||||
import org.codelibs.fess.crawler.entity.ResultData;
|
||||
import org.codelibs.fess.crawler.exception.ChildUrlsException;
|
||||
import org.codelibs.fess.crawler.serializer.DataSerializer;
|
||||
import org.codelibs.fess.crawler.util.FieldConfigs;
|
||||
import org.codelibs.fess.helper.CrawlingConfigHelper;
|
||||
import org.codelibs.fess.helper.CrawlingInfoHelper;
|
||||
|
@ -70,6 +71,18 @@ import org.xml.sax.InputSource;
|
|||
public class FessXpathTransformerTest extends UnitFessTestCase {
|
||||
private static final Logger logger = LogManager.getLogger(FessXpathTransformerTest.class);
|
||||
|
||||
/**
 * Runs the base test setup, then registers a new DataSerializer under the
 * component name "dataSerializer".
 */
@Override
|
||||
public void setUp() throws Exception {
|
||||
super.setUp();
|
||||
// Registered after the base setup has run.
ComponentUtil.register(new DataSerializer(), "dataSerializer");
|
||||
}
|
||||
|
||||
/**
 * Resets the FessConfig held by ComponentUtil to null, then runs the base
 * test tear-down.
 */
@Override
|
||||
public void tearDown() throws Exception {
|
||||
ComponentUtil.setFessConfig(null);
|
||||
super.tearDown();
|
||||
}
|
||||
|
||||
public void test_transform() throws Exception {
|
||||
String data = "<html><head><title>Test</title></head><body><h1>Header1</h1><p>This is a pen.</p></body></html>";
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue