diff --git a/deps.xml b/deps.xml index b58f858ee..067dab2c9 100644 --- a/deps.xml +++ b/deps.xml @@ -45,27 +45,27 @@ - + - - + + - + - - + + - + - - + + diff --git a/src/main/java/org/codelibs/fess/Constants.java b/src/main/java/org/codelibs/fess/Constants.java index bb482f7dc..17853a6d1 100644 --- a/src/main/java/org/codelibs/fess/Constants.java +++ b/src/main/java/org/codelibs/fess/Constants.java @@ -458,4 +458,5 @@ public class Constants extends CoreLibConstants { public static final String TEXT_FRAGMENT_TYPE_HIGHLIGHT = "highlight"; + public static final String CRAWLER_STATS_KEY = "crawler.stats.key"; } diff --git a/src/main/java/org/codelibs/fess/ds/AbstractDataStore.java b/src/main/java/org/codelibs/fess/ds/AbstractDataStore.java index 8b870de9b..203153912 100644 --- a/src/main/java/org/codelibs/fess/ds/AbstractDataStore.java +++ b/src/main/java/org/codelibs/fess/ds/AbstractDataStore.java @@ -31,6 +31,7 @@ import org.codelibs.core.lang.ThreadUtil; import org.codelibs.core.misc.Pair; import org.codelibs.fess.Constants; import org.codelibs.fess.ds.callback.IndexUpdateCallback; +import org.codelibs.fess.entity.DataStoreParams; import org.codelibs.fess.es.config.exentity.DataConfig; import org.codelibs.fess.helper.CrawlingInfoHelper; import org.codelibs.fess.helper.SystemHelper; @@ -59,7 +60,7 @@ public abstract class AbstractDataStore implements DataStore { } @Override - public void store(final DataConfig config, final IndexUpdateCallback callback, final Map initParamMap) { + public void store(final DataConfig config, final IndexUpdateCallback callback, final DataStoreParams initParamMap) { final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper(); final SystemHelper systemHelper = ComponentUtil.getSystemHelper(); final Date documentExpires = crawlingInfoHelper.getDocumentExpires(config); @@ -76,7 +77,7 @@ public abstract class AbstractDataStore implements DataStore { final Map configScriptMap = config.getHandlerScriptMap(); initParamMap.putAll(configParamMap); - final Map paramMap = initParamMap; + final DataStoreParams paramMap = initParamMap; // default values final Map defaultDataMap = new HashMap<>(); @@ -91,7 +92,7 @@ public abstract class AbstractDataStore implements DataStore { defaultDataMap.put(fessConfig.getIndexFieldExpires(), documentExpires); } // segment - defaultDataMap.put(fessConfig.getIndexFieldSegment(), initParamMap.get(Constants.SESSION_ID)); + defaultDataMap.put(fessConfig.getIndexFieldSegment(), initParamMap.getAsString(Constants.SESSION_ID)); // created defaultDataMap.put(fessConfig.getIndexFieldCreated(), systemHelper.getCurrentTime()); // boost @@ -118,12 +119,12 @@ public abstract class AbstractDataStore implements DataStore { defaultDataMap.put(fessConfig.getIndexFieldVirtualHost(), stream(config.getVirtualHosts()).get(stream -> stream.filter(StringUtil::isNotBlank).collect(Collectors.toList()))); - storeData(config, callback, new ParamMap<>(paramMap), configScriptMap, defaultDataMap); + storeData(config, callback, paramMap.newInstance(), configScriptMap, defaultDataMap); } - protected String getScriptType(final Map paramMap) { - final String value = paramMap.get(SCRIPT_TYPE); + protected String getScriptType(final DataStoreParams paramMap) { + final String value = paramMap.getAsString(SCRIPT_TYPE); if (StringUtil.isBlank(value)) { return Constants.DEFAULT_SCRIPT; } @@ -142,9 +143,9 @@ public abstract class AbstractDataStore implements DataStore { return ComponentUtil.getScriptEngineFactory().getScriptEngine(scriptType).evaluate(template, paramMap); } - protected long getReadInterval(final Map paramMap) { + protected long getReadInterval(final DataStoreParams paramMap) { long readInterval = 0; - final String value = paramMap.get("readInterval"); + final String value = paramMap.getAsString("readInterval"); if (StringUtil.isNotBlank(value)) { try { readInterval = Long.parseLong(value); @@ -159,6 +160,6 @@ public abstract class AbstractDataStore implements DataStore { ThreadUtil.sleepQuietly(interval); } - protected abstract void storeData(DataConfig dataConfig, IndexUpdateCallback callback, Map paramMap, + protected abstract void storeData(DataConfig dataConfig, IndexUpdateCallback callback, DataStoreParams paramMap, Map scriptMap, Map defaultDataMap); } diff --git a/src/main/java/org/codelibs/fess/ds/DataStore.java b/src/main/java/org/codelibs/fess/ds/DataStore.java index aea7b9c55..75449c4b9 100644 --- a/src/main/java/org/codelibs/fess/ds/DataStore.java +++ b/src/main/java/org/codelibs/fess/ds/DataStore.java @@ -15,14 +15,13 @@ */ package org.codelibs.fess.ds; -import java.util.Map; - import org.codelibs.fess.ds.callback.IndexUpdateCallback; +import org.codelibs.fess.entity.DataStoreParams; import org.codelibs.fess.es.config.exentity.DataConfig; public interface DataStore { - void store(DataConfig config, IndexUpdateCallback callback, Map initParamMap); + void store(DataConfig config, IndexUpdateCallback callback, DataStoreParams initParamMap); void stop(); diff --git a/src/main/java/org/codelibs/fess/ds/callback/FileListIndexUpdateCallbackImpl.java b/src/main/java/org/codelibs/fess/ds/callback/FileListIndexUpdateCallbackImpl.java index f02bde53b..c3522ace4 100644 --- a/src/main/java/org/codelibs/fess/ds/callback/FileListIndexUpdateCallbackImpl.java +++ b/src/main/java/org/codelibs/fess/ds/callback/FileListIndexUpdateCallbackImpl.java @@ -45,8 +45,11 @@ import org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor; import org.codelibs.fess.crawler.rule.Rule; import org.codelibs.fess.crawler.rule.RuleManager; import org.codelibs.fess.crawler.transformer.Transformer; +import org.codelibs.fess.entity.DataStoreParams; import org.codelibs.fess.es.client.SearchEngineClient; import org.codelibs.fess.exception.DataStoreCrawlingException; +import org.codelibs.fess.helper.CrawlerStatsHelper; +import org.codelibs.fess.helper.CrawlerStatsHelper.StatsKeyObject; import org.codelibs.fess.helper.IndexingHelper; import org.codelibs.fess.mylasta.direction.FessConfig; import org.codelibs.fess.util.ComponentUtil; @@ -89,7 +92,7 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback { } @Override - public void store(final Map paramMap, final Map dataMap) { + public void store(final DataStoreParams paramMap, final Map dataMap) { executor.execute(() -> { final Object eventType = dataMap.remove(getParamValue(paramMap, "field.event_type", "event_type")); if (getParamValue(paramMap, "event.create", "create").equals(eventType) @@ -105,12 +108,13 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback { }); } - protected String getParamValue(final Map paramMap, final String key, final String defaultValue) { - return paramMap.getOrDefault(key, defaultValue); + protected String getParamValue(final DataStoreParams paramMap, final String key, final String defaultValue) { + return paramMap.getAsString(key, defaultValue); } - protected void addDocument(final Map paramMap, final Map dataMap) { + protected void addDocument(final DataStoreParams paramMap, final Map dataMap) { final FessConfig fessConfig = ComponentUtil.getFessConfig(); + final CrawlerStatsHelper crawlerStatsHelper = ComponentUtil.getCrawlerStatsHelper(); synchronized (indexUpdateCallback) { // required check if (!dataMap.containsKey(fessConfig.getIndexFieldUrl()) || dataMap.get(fessConfig.getIndexFieldUrl()) == null) { @@ -125,6 +129,8 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback { return; } + final StatsKeyObject keyObj = paramMap.get(Constants.CRAWLER_STATS_KEY) instanceof StatsKeyObject sko ? sko : null; + final long maxAccessCount = getMaxAccessCount(paramMap, dataMap); long counter = 0; final Deque urlQueue = new LinkedList<>(); @@ -138,16 +144,23 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback { } try { for (int i = 0; i < maxRedirectCount; i++) { + if (keyObj != null) { + keyObj.setUrl(processingUrl); + } + crawlerStatsHelper.record(keyObj, "prepared"); processingUrl = processRequest(paramMap, localDataMap, processingUrl, client); if (processingUrl == null) { break; } counter++; localDataMap.put(fessConfig.getIndexFieldUrl(), processingUrl); + crawlerStatsHelper.record(keyObj, "redirected"); } } catch (final ChildUrlsException e) { + crawlerStatsHelper.record(keyObj, "child_urls"); e.getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer); } catch (final DataStoreCrawlingException e) { + crawlerStatsHelper.record(keyObj, "crawling_exception"); final Throwable cause = e.getCause(); if (cause instanceof ChildUrlsException) { ((ChildUrlsException) cause).getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer); @@ -161,7 +174,7 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback { } } - protected long getMaxAccessCount(final Map paramMap, final Map dataMap) { + protected long getMaxAccessCount(final DataStoreParams paramMap, final Map dataMap) { final Object recursive = dataMap.remove(getParamValue(paramMap, "field.recursive", "recursive")); if (recursive == null || Constants.FALSE.equalsIgnoreCase(recursive.toString())) { return 1L; @@ -176,9 +189,11 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback { } } - protected String processRequest(final Map paramMap, final Map dataMap, final String url, + protected String processRequest(final DataStoreParams paramMap, final Map dataMap, final String url, final CrawlerClient client) { final long startTime = System.currentTimeMillis(); + final CrawlerStatsHelper crawlerStatsHelper = ComponentUtil.getCrawlerStatsHelper(); + final StatsKeyObject keyObj = paramMap.get(Constants.CRAWLER_STATS_KEY) instanceof StatsKeyObject sko ? sko : null; try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) { if (responseData.getRedirectLocation() != null) { return responseData.getRedirectLocation(); @@ -187,7 +202,7 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback { if (dataMap.containsKey(Constants.SESSION_ID)) { responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID)); } else { - responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID)); + responseData.setSessionId((String) paramMap.get(Constants.CRAWLING_INFO_ID)); } final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class); @@ -210,17 +225,19 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback { throw new CrawlerSystemException("Could not create an instance from bytes.", e); } } + crawlerStatsHelper.record(keyObj, "accessed"); // remove String[] ignoreFields; if (paramMap.containsKey("ignore.field.names")) { - ignoreFields = paramMap.get("ignore.field.names").split(","); + ignoreFields = ((String) paramMap.get("ignore.field.names")).split(","); } else { ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID }; } stream(ignoreFields).of(stream -> stream.map(String::trim).forEach(s -> dataMap.remove(s))); indexUpdateCallback.store(paramMap, dataMap); + crawlerStatsHelper.record(keyObj, "processed"); } else { logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", responseProcessor, dataMap); @@ -235,7 +252,7 @@ public class FileListIndexUpdateCallbackImpl implements IndexUpdateCallback { } } - protected boolean deleteDocument(final Map paramMap, final Map dataMap) { + protected boolean deleteDocument(final DataStoreParams paramMap, final Map dataMap) { if (logger.isDebugEnabled()) { logger.debug("Deleting {}", dataMap); diff --git a/src/main/java/org/codelibs/fess/ds/callback/IndexUpdateCallback.java b/src/main/java/org/codelibs/fess/ds/callback/IndexUpdateCallback.java index 423401c91..f5272233c 100644 --- a/src/main/java/org/codelibs/fess/ds/callback/IndexUpdateCallback.java +++ b/src/main/java/org/codelibs/fess/ds/callback/IndexUpdateCallback.java @@ -17,9 +17,11 @@ package org.codelibs.fess.ds.callback; import java.util.Map; +import org.codelibs.fess.entity.DataStoreParams; + public interface IndexUpdateCallback { - void store(Map paramMap, Map dataMap); + void store(DataStoreParams paramMap, Map dataMap); long getDocumentSize(); diff --git a/src/main/java/org/codelibs/fess/ds/callback/IndexUpdateCallbackImpl.java b/src/main/java/org/codelibs/fess/ds/callback/IndexUpdateCallbackImpl.java index 3f4168459..47ea3200d 100644 --- a/src/main/java/org/codelibs/fess/ds/callback/IndexUpdateCallbackImpl.java +++ b/src/main/java/org/codelibs/fess/ds/callback/IndexUpdateCallbackImpl.java @@ -25,6 +25,7 @@ import javax.annotation.PostConstruct; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.codelibs.core.stream.StreamUtil; +import org.codelibs.fess.entity.DataStoreParams; import org.codelibs.fess.es.client.SearchEngineClient; import org.codelibs.fess.exception.DataStoreException; import org.codelibs.fess.helper.CrawlingInfoHelper; @@ -70,7 +71,7 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback { * @see org.codelibs.fess.ds.callback.IndexUpdateCallback#store(java.util.Map) */ @Override - public void store(final Map paramMap, final Map dataMap) { + public void store(final DataStoreParams paramMap, final Map dataMap) { final SystemHelper systemHelper = ComponentUtil.getSystemHelper(); systemHelper.calibrateCpuLoad(); @@ -142,7 +143,7 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback { } - protected Map ingest(final Map paramMap, final Map dataMap) { + protected Map ingest(final DataStoreParams paramMap, final Map dataMap) { if (ingestFactory == null) { return dataMap; } diff --git a/src/main/java/org/codelibs/fess/entity/DataStoreParams.java b/src/main/java/org/codelibs/fess/entity/DataStoreParams.java new file mode 100644 index 000000000..be1340163 --- /dev/null +++ b/src/main/java/org/codelibs/fess/entity/DataStoreParams.java @@ -0,0 +1,75 @@ +/* + * Copyright 2012-2022 CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ +package org.codelibs.fess.entity; + +import java.util.HashMap; +import java.util.Map; + +public class DataStoreParams { + + protected final Map params; + + public DataStoreParams() { + params = new HashMap<>(); + } + + protected DataStoreParams(final Map params) { + this.params = new HashMap<>(params); + } + + public void put(final String key, final Object value) { + params.put(key, value); + } + + public Object get(final String key) { + return params.get(key); + } + + public String getAsString(final String key) { + if (params.get(key) instanceof String strValue) { + return strValue; + } + final Object value = params.get(key); + if (value != null) { + return value.toString(); + } + return null; + } + + public String getAsString(final String key, final String defaultValue) { + final String value = getAsString(key); + if (value != null) { + return value; + } + return defaultValue; + } + + public DataStoreParams newInstance() { + return new DataStoreParams(params); + } + + public void putAll(final Map map) { + params.putAll(map); + } + + public boolean containsKey(final String key) { + return params.containsKey(key); + } + + public Map asMap() { + return new HashMap<>(params); + } +} diff --git a/src/main/java/org/codelibs/fess/helper/DataIndexHelper.java b/src/main/java/org/codelibs/fess/helper/DataIndexHelper.java index e06fe682d..d5443a808 100644 --- a/src/main/java/org/codelibs/fess/helper/DataIndexHelper.java +++ b/src/main/java/org/codelibs/fess/helper/DataIndexHelper.java @@ -17,9 +17,7 @@ package org.codelibs.fess.helper; import java.util.ArrayList; import java.util.Collections; -import java.util.HashMap; import java.util.List; -import java.util.Map; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -30,6 +28,7 @@ import org.codelibs.fess.app.service.FailureUrlService; import org.codelibs.fess.ds.DataStore; import org.codelibs.fess.ds.DataStoreFactory; import org.codelibs.fess.ds.callback.IndexUpdateCallback; +import org.codelibs.fess.entity.DataStoreParams; import org.codelibs.fess.es.client.SearchEngineClient; import org.codelibs.fess.es.config.exentity.DataConfig; import org.codelibs.fess.mylasta.direction.FessConfig; @@ -88,7 +87,7 @@ public class DataIndexHelper { dataCrawlingThreadList.clear(); final List dataCrawlingThreadStatusList = new ArrayList<>(); for (final DataConfig dataConfig : configList) { - final Map initParamMap = new HashMap<>(); + final DataStoreParams initParamMap = new DataStoreParams(); final String sid = ComponentUtil.getCrawlingConfigHelper().store(sessionId, dataConfig); sessionIdList.add(sid); @@ -180,7 +179,7 @@ public class DataIndexHelper { private final IndexUpdateCallback indexUpdateCallback; - private final Map initParamMap; + private final DataStoreParams initParamMap; protected boolean finished = false; @@ -189,7 +188,7 @@ public class DataIndexHelper { private DataStore dataStore; protected DataCrawlingThread(final DataConfig dataConfig, final IndexUpdateCallback indexUpdateCallback, - final Map initParamMap) { + final DataStoreParams initParamMap) { this.dataConfig = dataConfig; this.indexUpdateCallback = indexUpdateCallback; this.initParamMap = initParamMap; @@ -226,10 +225,10 @@ public class DataIndexHelper { } private void deleteOldDocs() { - if (Constants.FALSE.equals(initParamMap.get(DELETE_OLD_DOCS))) { + if (Constants.FALSE.equals(initParamMap.getAsString(DELETE_OLD_DOCS))) { return; } - final String sessionId = initParamMap.get(Constants.SESSION_ID); + final String sessionId = initParamMap.getAsString(Constants.SESSION_ID); if (StringUtil.isBlank(sessionId)) { logger.warn("Invalid sessionId at {}", dataConfig); return; @@ -262,7 +261,7 @@ public class DataIndexHelper { } public String getCrawlingInfoId() { - return initParamMap.get(Constants.CRAWLING_INFO_ID); + return initParamMap.getAsString(Constants.CRAWLING_INFO_ID); } public boolean isRunning() { diff --git a/src/main/java/org/codelibs/fess/ingest/Ingester.java b/src/main/java/org/codelibs/fess/ingest/Ingester.java index fb632fb02..1a0398837 100644 --- a/src/main/java/org/codelibs/fess/ingest/Ingester.java +++ b/src/main/java/org/codelibs/fess/ingest/Ingester.java @@ -20,6 +20,7 @@ import java.util.Map; import org.codelibs.fess.crawler.entity.AccessResult; import org.codelibs.fess.crawler.entity.ResponseData; import org.codelibs.fess.crawler.entity.ResultData; +import org.codelibs.fess.entity.DataStoreParams; import org.codelibs.fess.util.ComponentUtil; public abstract class Ingester { @@ -49,7 +50,7 @@ public abstract class Ingester { } // datastore - public Map process(final Map target, final Map params) { + public Map process(final Map target, final DataStoreParams params) { return process(target); } diff --git a/src/test/java/org/codelibs/fess/ds/AbstractDataStoreTest.java b/src/test/java/org/codelibs/fess/ds/AbstractDataStoreTest.java index 2e8b6786e..339baafdc 100644 --- a/src/test/java/org/codelibs/fess/ds/AbstractDataStoreTest.java +++ b/src/test/java/org/codelibs/fess/ds/AbstractDataStoreTest.java @@ -20,6 +20,7 @@ import java.util.Map; import org.codelibs.fess.Constants; import org.codelibs.fess.ds.callback.IndexUpdateCallback; +import org.codelibs.fess.entity.DataStoreParams; import org.codelibs.fess.es.config.exentity.DataConfig; import org.codelibs.fess.exception.JobProcessingException; import org.codelibs.fess.script.AbstractScriptEngine; @@ -45,7 +46,7 @@ public class AbstractDataStoreTest extends UnitFessTestCase { } @Override - protected void storeData(DataConfig dataConfig, IndexUpdateCallback callback, Map paramMap, + protected void storeData(DataConfig dataConfig, IndexUpdateCallback callback, DataStoreParams paramMap, Map scriptMap, Map defaultDataMap) { // TODO nothing }