replace with fess-crawler
This commit is contained in:
parent
40a081f433
commit
8dbc3e9b77
40 changed files with 333 additions and 326 deletions
18 pom.xml
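The whole commit is a mechanical rename: the s2robot dependency becomes fess-crawler, and every org.codelibs.robot class moves to org.codelibs.fess.crawler, usually with "S2Robot"/"Robot" in the class name replaced by "Crawler". A minimal before/after sketch of the pattern, using only classes that appear in the hunks below (illustrative, not an exhaustive mapping):

// Before this commit (s2robot artifact):
//   import org.codelibs.robot.S2Robot;
//   import org.codelibs.robot.S2RobotContext;
//   import org.codelibs.robot.client.S2RobotClient;
//   import org.codelibs.robot.exception.RobotCrawlAccessException;
//   import org.codelibs.robot.exception.RobotSystemException;

// After this commit (fess-crawler artifact):
import org.codelibs.fess.crawler.exception.CrawlingAccessException;

class RenameSketch {
    // Illustrative only: call sites keep their shape, only the types change,
    // e.g. S2Robot -> Crawler, RobotSystemException -> CrawlerSystemException.
    void reportNoBody() {
        throw new CrawlingAccessException("No response body.");
    }
}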
pom.xml
@@ -55,8 +55,8 @@
 <junit.version>4.8.2</junit.version>
 <utflute.version>0.5.2</utflute.version>
 
-<!-- S2Robot -->
-<s2robot.version>1.0.0-SNAPSHOT</s2robot.version>
+<!-- Crawler -->
+<crawler.version>1.0.0-SNAPSHOT</crawler.version>
 <tika.version>1.6</tika.version>
 <poi.version>3.11-beta2</poi.version>
 <pdfbox.version>1.8.7</pdfbox.version>
@@ -466,11 +466,11 @@
 <version>2.2.1</version>
 </dependency>
 
-<!-- s2robot -->
+<!-- Crawler -->
 <dependency>
-    <groupId>org.codelibs.robot</groupId>
-    <artifactId>s2robot-lasta</artifactId>
-    <version>${s2robot.version}</version>
+    <groupId>org.codelibs.fess</groupId>
+    <artifactId>fess-crawler-lasta</artifactId>
+    <version>${crawler.version}</version>
     <exclusions>
         <exclusion>
             <groupId>commons-logging</groupId>
@@ -479,9 +479,9 @@
 </exclusions>
 </dependency>
 <dependency>
-    <groupId>org.codelibs.robot</groupId>
-    <artifactId>s2robot-es</artifactId>
-    <version>${s2robot.version}</version>
+    <groupId>org.codelibs.fess</groupId>
+    <artifactId>fess-crawler-es</artifactId>
+    <version>${crawler.version}</version>
 </dependency>
 <dependency>
     <groupId>org.bouncycastle</groupId>

@@ -139,7 +139,7 @@ public class WebConfigEditForm implements Serializable {
 sortOrder = "0";
 userAgent = ComponentUtil.getUserAgentName();
 if (StringUtil.isBlank(userAgent)) {
-    userAgent = "Fess Robot/" + Constants.FESS_VERSION;
+    userAgent = "FessCrawler/" + Constants.FESS_VERSION;
 }
 numOfThread = Integer.toString(Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB);
 intervalTime = Integer.toString(Constants.DEFAULT_INTERVAL_TIME_FOR_WEB);

@@ -39,7 +39,7 @@ import org.codelibs.fess.helper.JobHelper;
 import org.codelibs.fess.helper.SystemHelper;
 import org.codelibs.fess.job.TriggeredJob;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.robot.util.CharUtil;
+import org.codelibs.fess.crawler.util.CharUtil;
 import org.lastaflute.web.Execute;
 import org.lastaflute.web.callback.ActionRuntime;
 import org.lastaflute.web.response.HtmlResponse;

@@ -34,7 +34,7 @@ import org.codelibs.fess.helper.SearchLogHelper;
 import org.codelibs.fess.helper.ViewHelper;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.DocumentUtil;
-import org.codelibs.robot.util.CharUtil;
+import org.codelibs.fess.crawler.util.CharUtil;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.query.TermQueryBuilder;
 import org.lastaflute.web.Execute;

@@ -36,29 +36,29 @@ import org.codelibs.fess.helper.FieldHelper;
 import org.codelibs.fess.helper.IndexingHelper;
 import org.codelibs.fess.helper.SambaHelper;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.robot.S2RobotThread;
-import org.codelibs.robot.builder.RequestDataBuilder;
-import org.codelibs.robot.client.S2RobotClient;
-import org.codelibs.robot.client.smb.SmbClient;
-import org.codelibs.robot.entity.RequestData;
-import org.codelibs.robot.entity.ResponseData;
-import org.codelibs.robot.entity.UrlQueue;
-import org.codelibs.robot.log.LogType;
+import org.codelibs.fess.crawler.CrawlerThread;
+import org.codelibs.fess.crawler.builder.RequestDataBuilder;
+import org.codelibs.fess.crawler.client.CrawlerClient;
+import org.codelibs.fess.crawler.client.smb.SmbClient;
+import org.codelibs.fess.crawler.entity.RequestData;
+import org.codelibs.fess.crawler.entity.ResponseData;
+import org.codelibs.fess.crawler.entity.UrlQueue;
+import org.codelibs.fess.crawler.log.LogType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import jcifs.smb.ACE;
 import jcifs.smb.SID;
 
-public class FessS2RobotThread extends S2RobotThread {
-    private static final Logger logger = LoggerFactory.getLogger(FessS2RobotThread.class);
+public class FessCrawlerThread extends CrawlerThread {
+    private static final Logger logger = LoggerFactory.getLogger(FessCrawlerThread.class);
 
     @Override
-    protected boolean isContentUpdated(final S2RobotClient client, final UrlQueue urlQueue) {
+    protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue urlQueue) {
         final DynamicProperties crawlerProperties = ComponentUtil.getCrawlerProperties();
         if (crawlerProperties.getProperty(Constants.DIFF_CRAWLING_PROPERTY, Constants.TRUE).equals(Constants.TRUE)) {
 
-            log(logHelper, LogType.CHECK_LAST_MODIFIED, robotContext, urlQueue);
+            log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
             final long startTime = System.currentTimeMillis();
 
             final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
@@ -72,7 +72,7 @@ public class FessS2RobotThread extends S2RobotThread {
             final String url = urlQueue.getUrl();
             ResponseData responseData = null;
             try {
-                final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(robotContext.getSessionId());
+                final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
                 final Map<String, Object> dataMap = new HashMap<String, Object>();
                 dataMap.put(fieldHelper.urlField, url);
                 final List<String> roleTypeList = new ArrayList<String>();
@@ -140,12 +140,12 @@ public class FessS2RobotThread extends S2RobotThread {
                     return true;
                 } else if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
 
-                    log(logHelper, LogType.NOT_MODIFIED, robotContext, urlQueue);
+                    log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
 
                     responseData.setExecutionTime(System.currentTimeMillis() - startTime);
                     responseData.setParentUrl(urlQueue.getParentUrl());
-                    responseData.setSessionId(robotContext.getSessionId());
-                    responseData.setHttpStatusCode(org.codelibs.robot.Constants.NOT_MODIFIED_STATUS);
+                    responseData.setSessionId(crawlerContext.getSessionId());
+                    responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
                     processResponse(urlQueue, responseData);
 
                     storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fieldHelper.anchorField)));
@@ -163,7 +163,7 @@ public class FessS2RobotThread extends S2RobotThread {
 
     protected void storeChildUrlsToQueue(final UrlQueue urlQueue, final Set<RequestData> childUrlSet) {
         if (childUrlSet != null) {
-            synchronized (robotContext.getAccessCountLock()) {
+            synchronized (crawlerContext.getAccessCountLock()) {
                 // add an url
                 storeChildUrls(childUrlSet, urlQueue.getUrl(), urlQueue.getDepth() != null ? urlQueue.getDepth() + 1 : 1);
             }

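As the hunks above show, subclasses keep the same hook points after the rename; only the base class and the context field (robotContext -> crawlerContext) change. A minimal custom-thread sketch under the new API (the trivial body is illustrative, not Fess's real diff-crawling logic):

import org.codelibs.fess.crawler.CrawlerThread;
import org.codelibs.fess.crawler.client.CrawlerClient;
import org.codelibs.fess.crawler.entity.UrlQueue;

public class MinimalCrawlerThread extends CrawlerThread {
    @Override
    protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue urlQueue) {
        // Returning true forces a re-crawl; FessCrawlerThread instead compares
        // the stored last-modified date and replays cached child URLs on a hit.
        return true;
    }
}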
@@ -18,7 +18,7 @@ package org.codelibs.fess.crawler.interval;
 
 import org.codelibs.fess.helper.IntervalControlHelper;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.robot.interval.impl.DefaultIntervalController;
+import org.codelibs.fess.crawler.interval.impl.DefaultIntervalController;
 
 public class FessIntervalController extends DefaultIntervalController {
 

@@ -48,17 +48,17 @@ import org.codelibs.fess.helper.PathMappingHelper;
 import org.codelibs.fess.helper.SambaHelper;
 import org.codelibs.fess.helper.SystemHelper;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.robot.client.smb.SmbClient;
-import org.codelibs.robot.entity.AccessResult;
-import org.codelibs.robot.entity.AccessResultData;
-import org.codelibs.robot.entity.ExtractData;
-import org.codelibs.robot.entity.ResponseData;
-import org.codelibs.robot.entity.ResultData;
-import org.codelibs.robot.entity.UrlQueue;
-import org.codelibs.robot.exception.RobotCrawlAccessException;
-import org.codelibs.robot.exception.RobotSystemException;
-import org.codelibs.robot.extractor.Extractor;
-import org.codelibs.robot.util.CrawlingParameterUtil;
+import org.codelibs.fess.crawler.client.smb.SmbClient;
+import org.codelibs.fess.crawler.entity.AccessResult;
+import org.codelibs.fess.crawler.entity.AccessResultData;
+import org.codelibs.fess.crawler.entity.ExtractData;
+import org.codelibs.fess.crawler.entity.ResponseData;
+import org.codelibs.fess.crawler.entity.ResultData;
+import org.codelibs.fess.crawler.entity.UrlQueue;
+import org.codelibs.fess.crawler.exception.CrawlingAccessException;
+import org.codelibs.fess.crawler.exception.CrawlerSystemException;
+import org.codelibs.fess.crawler.extractor.Extractor;
+import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -96,7 +96,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
     @Override
     public ResultData transform(final ResponseData responseData) {
         if (responseData == null || responseData.getResponseBody() == null) {
-            throw new RobotCrawlAccessException("No response body.");
+            throw new CrawlingAccessException("No response body.");
         }
 
         final Extractor extractor = getExtractor(responseData);
@@ -144,8 +144,8 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
             }
         }
     } catch (final Exception e) {
-        final RobotCrawlAccessException rcae = new RobotCrawlAccessException("Could not get a text from " + responseData.getUrl(), e);
-        rcae.setLogLevel(RobotCrawlAccessException.WARN);
+        final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
+        rcae.setLogLevel(CrawlingAccessException.WARN);
         throw rcae;
     } finally {
         IOUtils.closeQuietly(in);
@@ -323,7 +323,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
     try {
         resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
     } catch (final Exception e) {
-        throw new RobotCrawlAccessException("Could not serialize object: " + url, e);
+        throw new CrawlingAccessException("Could not serialize object: " + url, e);
     }
     resultData.setEncoding(charsetName);
 
@@ -466,7 +466,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
     try {
         return SerializeUtil.fromBinaryToObject(data);
     } catch (final Exception e) {
-        throw new RobotSystemException("Could not create an instanced from bytes.", e);
+        throw new CrawlerSystemException("Could not create an instanced from bytes.", e);
     }
 }
 return new HashMap<String, Object>();

@@ -25,7 +25,7 @@ import org.apache.commons.lang3.StringUtils;
 import org.codelibs.core.lang.StringUtil;
 import org.codelibs.fess.helper.FieldHelper;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.robot.transformer.impl.XpathTransformer;
+import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 

@@ -18,9 +18,9 @@ package org.codelibs.fess.crawler.transformer;
 
 import org.codelibs.fess.exception.FessSystemException;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.robot.entity.ResponseData;
-import org.codelibs.robot.extractor.Extractor;
-import org.codelibs.robot.extractor.ExtractorFactory;
+import org.codelibs.fess.crawler.entity.ResponseData;
+import org.codelibs.fess.crawler.extractor.Extractor;
+import org.codelibs.fess.crawler.extractor.ExtractorFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 

@@ -17,8 +17,8 @@
 package org.codelibs.fess.crawler.transformer;
 
 import org.codelibs.fess.exception.FessSystemException;
-import org.codelibs.robot.entity.ResponseData;
-import org.codelibs.robot.extractor.Extractor;
+import org.codelibs.fess.crawler.entity.ResponseData;
+import org.codelibs.fess.crawler.extractor.Extractor;
 import org.lastaflute.di.core.SingletonLaContainer;
 
 public class FessTikaTransformer extends AbstractFessFileTransformer {

@@ -50,17 +50,17 @@ import org.codelibs.fess.helper.OverlappingHostHelper;
 import org.codelibs.fess.helper.PathMappingHelper;
 import org.codelibs.fess.helper.SystemHelper;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.robot.builder.RequestDataBuilder;
-import org.codelibs.robot.entity.AccessResultData;
-import org.codelibs.robot.entity.RequestData;
-import org.codelibs.robot.entity.ResponseData;
-import org.codelibs.robot.entity.ResultData;
-import org.codelibs.robot.entity.UrlQueue;
-import org.codelibs.robot.exception.ChildUrlsException;
-import org.codelibs.robot.exception.RobotCrawlAccessException;
-import org.codelibs.robot.exception.RobotSystemException;
-import org.codelibs.robot.util.CrawlingParameterUtil;
-import org.codelibs.robot.util.ResponseDataUtil;
+import org.codelibs.fess.crawler.builder.RequestDataBuilder;
+import org.codelibs.fess.crawler.entity.AccessResultData;
+import org.codelibs.fess.crawler.entity.RequestData;
+import org.codelibs.fess.crawler.entity.ResponseData;
+import org.codelibs.fess.crawler.entity.ResultData;
+import org.codelibs.fess.crawler.entity.UrlQueue;
+import org.codelibs.fess.crawler.exception.ChildUrlsException;
+import org.codelibs.fess.crawler.exception.CrawlingAccessException;
+import org.codelibs.fess.crawler.exception.CrawlerSystemException;
+import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
+import org.codelibs.fess.crawler.util.ResponseDataUtil;
 import org.cyberneko.html.parsers.DOMParser;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -116,7 +116,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
     }
     parser.parse(is);
 } catch (final Exception e) {
-    throw new RobotCrawlAccessException("Could not parse " + responseData.getUrl(), e);
+    throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
 } finally {
     IOUtils.closeQuietly(bis);
 }
@@ -172,7 +172,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
     try {
         resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
     } catch (final Exception e) {
-        throw new RobotCrawlAccessException("Could not serialize object: " + responseData.getUrl(), e);
+        throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
     }
     resultData.setEncoding(charsetName);
 } finally {
@@ -500,7 +500,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
     try {
         return SerializeUtil.fromBinaryToObject(data);
     } catch (final Exception e) {
-        throw new RobotSystemException("Could not create an instanced from bytes.", e);
+        throw new CrawlerSystemException("Could not create an instanced from bytes.", e);
     }
 }
 return new HashMap<String, Object>();

@@ -16,9 +16,9 @@
 
 package org.codelibs.fess.ds;
 
-import org.codelibs.robot.exception.RobotCrawlAccessException;
+import org.codelibs.fess.crawler.exception.CrawlingAccessException;
 
-public class DataStoreCrawlingException extends RobotCrawlAccessException {
+public class DataStoreCrawlingException extends CrawlingAccessException {
 
     private static final long serialVersionUID = 1L;
 

@@ -36,8 +36,8 @@ import org.codelibs.fess.ds.DataStoreCrawlingException;
 import org.codelibs.fess.ds.DataStoreException;
 import org.codelibs.fess.ds.IndexUpdateCallback;
 import org.codelibs.fess.es.exentity.DataConfig;
-import org.codelibs.robot.exception.RobotCrawlAccessException;
-import org.codelibs.robot.exception.RobotMultipleCrawlAccessException;
+import org.codelibs.fess.crawler.exception.CrawlingAccessException;
+import org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException;
 import org.lastaflute.di.core.SingletonLaContainer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -230,10 +230,10 @@ public class CsvDataStoreImpl extends AbstractDataStoreImpl {
 
     try {
         loop = callback.store(dataMap);
-    } catch (final RobotCrawlAccessException e) {
+    } catch (final CrawlingAccessException e) {
         Throwable target = e;
-        if (target instanceof RobotMultipleCrawlAccessException) {
-            final Throwable[] causes = ((RobotMultipleCrawlAccessException) target).getCauses();
+        if (target instanceof MultipleCrawlingAccessException) {
+            final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
             if (causes.length > 0) {
                 target = causes[causes.length - 1];
             }

@@ -34,17 +34,17 @@ import org.codelibs.fess.helper.CrawlingSessionHelper;
 import org.codelibs.fess.helper.FieldHelper;
 import org.codelibs.fess.helper.IndexingHelper;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.robot.builder.RequestDataBuilder;
-import org.codelibs.robot.client.S2RobotClient;
-import org.codelibs.robot.client.S2RobotClientFactory;
-import org.codelibs.robot.entity.ResponseData;
-import org.codelibs.robot.entity.ResultData;
-import org.codelibs.robot.exception.RobotSystemException;
-import org.codelibs.robot.processor.ResponseProcessor;
-import org.codelibs.robot.processor.impl.DefaultResponseProcessor;
-import org.codelibs.robot.rule.Rule;
-import org.codelibs.robot.rule.RuleManager;
-import org.codelibs.robot.transformer.Transformer;
+import org.codelibs.fess.crawler.builder.RequestDataBuilder;
+import org.codelibs.fess.crawler.client.CrawlerClient;
+import org.codelibs.fess.crawler.client.CrawlerClientFactory;
+import org.codelibs.fess.crawler.entity.ResponseData;
+import org.codelibs.fess.crawler.entity.ResultData;
+import org.codelibs.fess.crawler.exception.CrawlerSystemException;
+import org.codelibs.fess.crawler.processor.ResponseProcessor;
+import org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor;
+import org.codelibs.fess.crawler.rule.Rule;
+import org.codelibs.fess.crawler.rule.RuleManager;
+import org.codelibs.fess.crawler.transformer.Transformer;
 import org.lastaflute.di.core.SingletonLaContainer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -71,7 +71,7 @@ public class FileListDataStoreImpl extends CsvDataStoreImpl {
 
     public int maxDeleteDocumentCacheSize = 100;
 
-    protected S2RobotClientFactory robotClientFactory;
+    protected CrawlerClientFactory crawlerClientFactory;
 
     protected CrawlingSessionHelper crawlingSessionHelper;
 
@@ -92,9 +92,9 @@ public class FileListDataStoreImpl extends CsvDataStoreImpl {
     @Override
     public void store(final DataConfig config, final IndexUpdateCallback callback, final Map<String, String> initParamMap) {
 
-        robotClientFactory = SingletonLaContainer.getComponent(S2RobotClientFactory.class);
+        crawlerClientFactory = SingletonLaContainer.getComponent(CrawlerClientFactory.class);
 
-        config.initializeClientFactory(robotClientFactory);
+        config.initializeClientFactory(crawlerClientFactory);
 
         super.store(config, callback, initParamMap);
     }
@@ -170,9 +170,9 @@ public class FileListDataStoreImpl extends CsvDataStoreImpl {
 
     final String url = dataMap.get(fieldHelper.urlField).toString();
     try {
-        final S2RobotClient client = robotClientFactory.getClient(url);
+        final CrawlerClient client = crawlerClientFactory.getClient(url);
         if (client == null) {
-            logger.warn("S2RobotClient is null. Data: " + dataMap);
+            logger.warn("CrawlerClient is null. Data: " + dataMap);
             return false;
         }
 
@@ -200,7 +200,7 @@ public class FileListDataStoreImpl extends CsvDataStoreImpl {
             (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
         dataMap.putAll(responseDataMap);
     } catch (final Exception e) {
-        throw new RobotSystemException("Could not create an instance from bytes.", e);
+        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
     }
 }
 

@@ -2,7 +2,7 @@ package org.codelibs.fess.es.exentity;
 
 import java.util.Map;
 
-import org.codelibs.robot.client.S2RobotClientFactory;
+import org.codelibs.fess.crawler.client.CrawlerClientFactory;
 
 public interface CrawlingConfig {
 
@@ -20,7 +20,7 @@ public interface CrawlingConfig {
 
     String getConfigId();
 
-    void initializeClientFactory(S2RobotClientFactory s2RobotClientFactory);
+    void initializeClientFactory(CrawlerClientFactory crawlerClientFactory);
 
     Map<String, String> getConfigParameterMap(ConfigName name);
 

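This signature change ripples through every CrawlingConfig implementation shown below (DataConfig, FileConfig, WebConfig). A minimal implementation sketch under the new interface (the empty parameter map is illustrative):

import java.util.HashMap;
import java.util.Map;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;

public class MinimalCrawlingConfigSketch {
    public void initializeClientFactory(final CrawlerClientFactory crawlerClientFactory) {
        // Hand the factory its init parameters before any client is created,
        // mirroring what DataConfig does in the hunks below.
        final Map<String, Object> factoryParamMap = new HashMap<String, Object>();
        crawlerClientFactory.setInitParameterMap(factoryParamMap);
    }
}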
@@ -24,13 +24,13 @@ import org.codelibs.fess.es.exbhv.LabelTypeBhv;
 import org.codelibs.fess.es.exbhv.RoleTypeBhv;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.ParameterUtil;
-import org.codelibs.robot.client.S2RobotClientFactory;
-import org.codelibs.robot.client.http.Authentication;
-import org.codelibs.robot.client.http.HcHttpClient;
-import org.codelibs.robot.client.http.impl.AuthenticationImpl;
-import org.codelibs.robot.client.http.ntlm.JcifsEngine;
-import org.codelibs.robot.client.smb.SmbAuthentication;
-import org.codelibs.robot.client.smb.SmbClient;
+import org.codelibs.fess.crawler.client.CrawlerClientFactory;
+import org.codelibs.fess.crawler.client.http.Authentication;
+import org.codelibs.fess.crawler.client.http.HcHttpClient;
+import org.codelibs.fess.crawler.client.http.impl.AuthenticationImpl;
+import org.codelibs.fess.crawler.client.http.ntlm.JcifsEngine;
+import org.codelibs.fess.crawler.client.smb.SmbAuthentication;
+import org.codelibs.fess.crawler.client.smb.SmbClient;
 import org.dbflute.cbean.result.ListResultBean;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -44,15 +44,15 @@ public class DataConfig extends BsDataConfig implements CrawlingConfig {
 
     private static final Logger logger = LoggerFactory.getLogger(DataConfig.class);
 
-    private static final String S2ROBOT_WEB_HEADER_PREFIX = "s2robot.web.header.";
+    private static final String S2ROBOT_WEB_HEADER_PREFIX = "crawler.web.header.";
 
-    private static final String S2ROBOT_WEB_AUTH = "s2robot.web.auth";
+    private static final String S2ROBOT_WEB_AUTH = "crawler.web.auth";
 
-    private static final String S2ROBOT_USERAGENT = "s2robot.useragent";
+    private static final String S2ROBOT_USERAGENT = "crawler.useragent";
 
-    private static final String S2ROBOT_PARAM_PREFIX = "s2robot.param.";
+    private static final String S2ROBOT_PARAM_PREFIX = "crawler.param.";
 
-    private static final Object S2ROBOT_FILE_AUTH = "s2robot.file.auth";
+    private static final Object S2ROBOT_FILE_AUTH = "crawler.file.auth";
 
     private String[] labelTypeIds;
 
@@ -211,11 +211,11 @@ public class DataConfig extends BsDataConfig implements CrawlingConfig {
     }
 
     @Override
-    public void initializeClientFactory(final S2RobotClientFactory robotClientFactory) {
+    public void initializeClientFactory(final CrawlerClientFactory crawlerClientFactory) {
         final Map<String, String> paramMap = getHandlerParameterMap();
 
         final Map<String, Object> factoryParamMap = new HashMap<String, Object>();
-        robotClientFactory.setInitParameterMap(factoryParamMap);
+        crawlerClientFactory.setInitParameterMap(factoryParamMap);
 
         // parameters
         for (final Map.Entry<String, String> entry : paramMap.entrySet()) {
@@ -301,18 +301,19 @@ public class DataConfig extends BsDataConfig implements CrawlingConfig {
     }
 
     // request header
-    final List<org.codelibs.robot.client.http.RequestHeader> rhList = new ArrayList<org.codelibs.robot.client.http.RequestHeader>();
+    final List<org.codelibs.fess.crawler.client.http.RequestHeader> rhList =
+            new ArrayList<org.codelibs.fess.crawler.client.http.RequestHeader>();
     int count = 1;
     String headerName = paramMap.get(S2ROBOT_WEB_HEADER_PREFIX + count + ".name");
     while (StringUtil.isNotBlank(headerName)) {
         final String headerValue = paramMap.get(S2ROBOT_WEB_HEADER_PREFIX + count + ".value");
-        rhList.add(new org.codelibs.robot.client.http.RequestHeader(headerName, headerValue));
+        rhList.add(new org.codelibs.fess.crawler.client.http.RequestHeader(headerName, headerValue));
         count++;
         headerName = paramMap.get(S2ROBOT_WEB_HEADER_PREFIX + count + ".name");
     }
     if (!rhList.isEmpty()) {
         factoryParamMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY,
-                rhList.toArray(new org.codelibs.robot.client.http.RequestHeader[rhList.size()]));
+                rhList.toArray(new org.codelibs.fess.crawler.client.http.RequestHeader[rhList.size()]));
     }
 
     // file auth

@@ -18,9 +18,9 @@ import org.codelibs.fess.es.exbhv.RoleTypeBhv;
 import org.codelibs.fess.helper.SystemHelper;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.ParameterUtil;
-import org.codelibs.robot.client.S2RobotClientFactory;
-import org.codelibs.robot.client.smb.SmbAuthentication;
-import org.codelibs.robot.client.smb.SmbClient;
+import org.codelibs.fess.crawler.client.CrawlerClientFactory;
+import org.codelibs.fess.crawler.client.smb.SmbAuthentication;
+import org.codelibs.fess.crawler.client.smb.SmbClient;
 import org.dbflute.cbean.result.ListResultBean;
 import org.lastaflute.di.core.SingletonLaContainer;
 
@@ -228,7 +228,7 @@ public class FileConfig extends BsFileConfig implements CrawlingConfig {
     }
 
     @Override
-    public void initializeClientFactory(final S2RobotClientFactory clientFactory) {
+    public void initializeClientFactory(final CrawlerClientFactory clientFactory) {
         final FileAuthenticationService fileAuthenticationService = SingletonLaContainer.getComponent(FileAuthenticationService.class);
 
         // Parameters

@@ -30,8 +30,8 @@ public class RequestHeader extends BsRequestHeader {
         asDocMeta().version(version);
     }
 
-    public org.codelibs.robot.client.http.RequestHeader getS2RobotRequestHeader() {
-        return new org.codelibs.robot.client.http.RequestHeader(getName(), getValue());
+    public org.codelibs.fess.crawler.client.http.RequestHeader getCrawlerRequestHeader() {
+        return new org.codelibs.fess.crawler.client.http.RequestHeader(getName(), getValue());
     }
 
     public WebConfig getWebConfig() {

@@ -16,10 +16,10 @@ import org.codelibs.fess.app.service.WebConfigService;
 import org.codelibs.fess.es.bsentity.BsWebAuthentication;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.ParameterUtil;
-import org.codelibs.robot.client.http.Authentication;
-import org.codelibs.robot.client.http.impl.AuthenticationImpl;
-import org.codelibs.robot.client.http.ntlm.JcifsEngine;
-import org.codelibs.robot.exception.RobotSystemException;
+import org.codelibs.fess.crawler.client.http.Authentication;
+import org.codelibs.fess.crawler.client.http.impl.AuthenticationImpl;
+import org.codelibs.fess.crawler.client.http.ntlm.JcifsEngine;
+import org.codelibs.fess.crawler.exception.CrawlerSystemException;
 
 /**
  * @author FreeGen
@@ -71,7 +71,7 @@ public class WebAuthentication extends BsWebAuthentication {
 
     private Credentials getCredentials() {
         if (StringUtil.isEmpty(getUsername())) {
-            throw new RobotSystemException("username is empty.");
+            throw new CrawlerSystemException("username is empty.");
         }
 
         if (Constants.NTLM.equals(getProtocolScheme())) {

@@ -18,9 +18,9 @@ import org.codelibs.fess.es.exbhv.WebConfigToLabelBhv;
 import org.codelibs.fess.es.exbhv.WebConfigToRoleBhv;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.ParameterUtil;
-import org.codelibs.robot.client.S2RobotClientFactory;
-import org.codelibs.robot.client.http.Authentication;
-import org.codelibs.robot.client.http.HcHttpClient;
+import org.codelibs.fess.crawler.client.CrawlerClientFactory;
+import org.codelibs.fess.crawler.client.http.Authentication;
+import org.codelibs.fess.crawler.client.http.HcHttpClient;
 import org.dbflute.cbean.result.ListResultBean;
 import org.lastaflute.di.core.SingletonLaContainer;
 
@@ -232,7 +232,7 @@ public class WebConfig extends BsWebConfig implements CrawlingConfig {
     }
 
     @Override
-    public void initializeClientFactory(final S2RobotClientFactory clientFactory) {
+    public void initializeClientFactory(final CrawlerClientFactory clientFactory) {
         final WebAuthenticationService webAuthenticationService = SingletonLaContainer.getComponent(WebAuthenticationService.class);
         final RequestHeaderService requestHeaderService = SingletonLaContainer.getComponent(RequestHeaderService.class);
 
@@ -259,11 +259,13 @@ public class WebConfig extends BsWebConfig implements CrawlingConfig {
 
     // request header
     final List<RequestHeader> requestHeaderList = requestHeaderService.getRequestHeaderList(getId());
-    final List<org.codelibs.robot.client.http.RequestHeader> rhList = new ArrayList<org.codelibs.robot.client.http.RequestHeader>();
+    final List<org.codelibs.fess.crawler.client.http.RequestHeader> rhList =
+            new ArrayList<org.codelibs.fess.crawler.client.http.RequestHeader>();
     for (final RequestHeader requestHeader : requestHeaderList) {
-        rhList.add(requestHeader.getS2RobotRequestHeader());
+        rhList.add(requestHeader.getCrawlerRequestHeader());
     }
-    paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, rhList.toArray(new org.codelibs.robot.client.http.RequestHeader[rhList.size()]));
+    paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY,
+            rhList.toArray(new org.codelibs.fess.crawler.client.http.RequestHeader[rhList.size()]));
 
 }
 

@@ -46,7 +46,7 @@ import org.codelibs.fess.helper.PathMappingHelper;
 import org.codelibs.fess.helper.WebFsIndexHelper;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.ResourceUtil;
-import org.codelibs.robot.client.EsClient;
+import org.codelibs.fess.crawler.client.EsClient;
 import org.elasticsearch.index.query.QueryBuilder;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.kohsuke.args4j.CmdLineException;

@@ -19,29 +19,29 @@ package org.codelibs.fess.helper;
 import org.codelibs.fess.app.service.FailureUrlService;
 import org.codelibs.fess.es.exentity.CrawlingConfig;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.robot.S2RobotContext;
-import org.codelibs.robot.entity.UrlQueue;
-import org.codelibs.robot.exception.RobotMultipleCrawlAccessException;
-import org.codelibs.robot.helper.impl.LogHelperImpl;
-import org.codelibs.robot.log.LogType;
+import org.codelibs.fess.crawler.CrawlerContext;
+import org.codelibs.fess.crawler.entity.UrlQueue;
+import org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException;
+import org.codelibs.fess.crawler.helper.impl.LogHelperImpl;
+import org.codelibs.fess.crawler.log.LogType;
 import org.lastaflute.di.core.SingletonLaContainer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-public class RobotLogHelper extends LogHelperImpl {
+public class CrawlerLogHelper extends LogHelperImpl {
     private static final Logger logger = LoggerFactory // NOPMD
-            .getLogger(RobotLogHelper.class);
+            .getLogger(CrawlerLogHelper.class);
 
     @Override
     public void log(final LogType key, final Object... objs) {
         try {
             switch (key) {
             case CRAWLING_ACCESS_EXCEPTION: {
-                final S2RobotContext robotContext = (S2RobotContext) objs[0];
+                final CrawlerContext crawlerContext = (CrawlerContext) objs[0];
                 final UrlQueue urlQueue = (UrlQueue) objs[1];
                 Throwable e = (Throwable) objs[2];
-                if (e instanceof RobotMultipleCrawlAccessException) {
-                    final Throwable[] causes = ((RobotMultipleCrawlAccessException) e).getCauses();
+                if (e instanceof MultipleCrawlingAccessException) {
+                    final Throwable[] causes = ((MultipleCrawlingAccessException) e).getCauses();
                     if (causes.length > 0) {
                         e = causes[causes.length - 1];
                     }
@@ -54,15 +54,15 @@ public class RobotLogHelper extends LogHelperImpl {
                 } else {
                     errorName = e.getClass().getCanonicalName();
                 }
-                storeFailureUrl(robotContext, urlQueue, errorName, e);
+                storeFailureUrl(crawlerContext, urlQueue, errorName, e);
                 break;
             }
             case CRAWLING_EXCETPION: {
-                final S2RobotContext robotContext = (S2RobotContext) objs[0];
+                final CrawlerContext crawlerContext = (CrawlerContext) objs[0];
                 final UrlQueue urlQueue = (UrlQueue) objs[1];
                 final Throwable e = (Throwable) objs[2];
 
-                storeFailureUrl(robotContext, urlQueue, e.getClass().getCanonicalName(), e);
+                storeFailureUrl(crawlerContext, urlQueue, e.getClass().getCanonicalName(), e);
                 break;
             }
             default:
@@ -75,9 +75,9 @@ public class RobotLogHelper extends LogHelperImpl {
         super.log(key, objs);
     }
 
-    private void storeFailureUrl(final S2RobotContext robotContext, final UrlQueue urlQueue, final String errorName, final Throwable e) {
+    private void storeFailureUrl(final CrawlerContext crawlerContext, final UrlQueue urlQueue, final String errorName, final Throwable e) {
 
-        final CrawlingConfig crawlingConfig = getCrawlingConfig(robotContext.getSessionId());
+        final CrawlingConfig crawlingConfig = getCrawlingConfig(crawlerContext.getSessionId());
         final String url = urlQueue.getUrl();
 
         final FailureUrlService failureUrlService = SingletonLaContainer.getComponent(FailureUrlService.class);

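The renamed helper keeps the same unwrapping idiom as CsvDataStoreImpl above: a MultipleCrawlingAccessException aggregates several causes, and Fess reports only the most recent one. The idiom, condensed into a standalone sketch (the method name is invented here):

import org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException;

final class CauseUnwrapSketch {
    static Throwable unwrapLastCause(final Throwable e) {
        if (e instanceof MultipleCrawlingAccessException) {
            final Throwable[] causes = ((MultipleCrawlingAccessException) e).getCauses();
            if (causes.length > 0) {
                // Report the last (most recent) aggregated cause.
                return causes[causes.length - 1];
            }
        }
        return e;
    }
}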
@@ -132,8 +132,8 @@ public class DataIndexHelper implements Serializable {
 while (startedCrawlerNum < dataCrawlingThreadList.size()) {
     // Force to stop crawl
     if (systemHelper.isForceStop()) {
-        for (final DataCrawlingThread s2Robot : dataCrawlingThreadList) {
-            s2Robot.stopCrawling();
+        for (final DataCrawlingThread crawlerThread : dataCrawlingThreadList) {
+            crawlerThread.stopCrawling();
         }
         break;
     }

@@ -42,7 +42,7 @@ import org.codelibs.fess.Constants;
 import org.codelibs.fess.app.service.RoleTypeService;
 import org.codelibs.fess.es.exentity.RoleType;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.robot.util.CharUtil;
+import org.codelibs.fess.crawler.util.CharUtil;
 import org.lastaflute.di.core.SingletonLaContainer;
 import org.lastaflute.web.util.LaRequestUtil;
 

@@ -56,11 +56,11 @@ import org.codelibs.fess.helper.UserAgentHelper.UserAgentType;
 import org.codelibs.fess.util.ComponentUtil;
 import org.codelibs.fess.util.DocumentUtil;
 import org.codelibs.fess.util.ResourceUtil;
-import org.codelibs.robot.builder.RequestDataBuilder;
-import org.codelibs.robot.client.S2RobotClient;
-import org.codelibs.robot.client.S2RobotClientFactory;
-import org.codelibs.robot.entity.ResponseData;
-import org.codelibs.robot.util.CharUtil;
+import org.codelibs.fess.crawler.builder.RequestDataBuilder;
+import org.codelibs.fess.crawler.client.CrawlerClient;
+import org.codelibs.fess.crawler.client.CrawlerClientFactory;
+import org.codelibs.fess.crawler.entity.ResponseData;
+import org.codelibs.fess.crawler.util.CharUtil;
 import org.lastaflute.di.core.SingletonLaContainer;
 import org.lastaflute.taglib.function.LaFunctions;
 import org.lastaflute.web.response.StreamResponse;
@@ -508,11 +508,11 @@ public class ViewHelper implements Serializable {
         throw new FessSystemException("No crawlingConfig: " + configIdObj);
     }
     final String url = (String) doc.get(fieldHelper.urlField);
-    final S2RobotClientFactory robotClientFactory = SingletonLaContainer.getComponent(S2RobotClientFactory.class);
-    config.initializeClientFactory(robotClientFactory);
-    final S2RobotClient client = robotClientFactory.getClient(url);
+    final CrawlerClientFactory crawlerClientFactory = SingletonLaContainer.getComponent(CrawlerClientFactory.class);
+    config.initializeClientFactory(crawlerClientFactory);
+    final CrawlerClient client = crawlerClientFactory.getClient(url);
     if (client == null) {
-        throw new FessSystemException("No S2RobotClient: " + configIdObj + ", url: " + url);
+        throw new FessSystemException("No CrawlerClient: " + configIdObj + ", url: " + url);
     }
     final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build());
     final StreamResponse response = new StreamResponse(StringUtil.EMPTY);

@@ -37,11 +37,11 @@ import org.codelibs.fess.es.exentity.FileConfig;
 import org.codelibs.fess.es.exentity.WebConfig;
 import org.codelibs.fess.indexer.IndexUpdater;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.robot.S2Robot;
-import org.codelibs.robot.S2RobotContext;
-import org.codelibs.robot.service.DataService;
-import org.codelibs.robot.service.UrlFilterService;
-import org.codelibs.robot.service.UrlQueueService;
+import org.codelibs.fess.crawler.Crawler;
+import org.codelibs.fess.crawler.CrawlerContext;
+import org.codelibs.fess.crawler.service.DataService;
+import org.codelibs.fess.crawler.service.UrlFilterService;
+import org.codelibs.fess.crawler.service.UrlQueueService;
 import org.lastaflute.di.core.SingletonLaContainer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -81,7 +81,7 @@ public class WebFsIndexHelper implements Serializable {
 
     public int crawlerPriority = Thread.NORM_PRIORITY;
 
-    private final List<S2Robot> s2RobotList = Collections.synchronizedList(new ArrayList<S2Robot>());
+    private final List<Crawler> crawlerList = Collections.synchronizedList(new ArrayList<Crawler>());
 
     // needed?
     @Deprecated
@@ -140,15 +140,15 @@ public class WebFsIndexHelper implements Serializable {
     final long startTime = System.currentTimeMillis();
 
     final List<String> sessionIdList = new ArrayList<String>();
-    s2RobotList.clear();
-    final List<String> s2RobotStatusList = new ArrayList<String>();
+    crawlerList.clear();
+    final List<String> crawlerStatusList = new ArrayList<String>();
     // Web
     for (final WebConfig webConfig : webConfigList) {
         final String sid = crawlingConfigHelper.store(sessionId, webConfig);
 
-        // create s2robot
-        final S2Robot s2Robot = SingletonLaContainer.getComponent(S2Robot.class);
-        s2Robot.setSessionId(sid);
+        // create crawler
+        final Crawler crawler = SingletonLaContainer.getComponent(Crawler.class);
+        crawler.setSessionId(sid);
         sessionIdList.add(sid);
 
         final String urlsStr = webConfig.getUrls();
@@ -160,26 +160,26 @@ public class WebFsIndexHelper implements Serializable {
         // interval time
         final int intervalTime =
                 webConfig.getIntervalTime() != null ? webConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_WEB;
-        ((FessIntervalController) s2Robot.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
+        ((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
 
         final String includedUrlsStr = webConfig.getIncludedUrls() != null ? webConfig.getIncludedUrls() : StringUtil.EMPTY;
         final String excludedUrlsStr = webConfig.getExcludedUrls() != null ? webConfig.getExcludedUrls() : StringUtil.EMPTY;
 
         // num of threads
-        final S2RobotContext robotContext = s2Robot.getRobotContext();
+        final CrawlerContext crawlerContext = crawler.getCrawlerContext();
         final int numOfThread =
                 webConfig.getNumOfThread() != null ? webConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB;
-        robotContext.setNumOfThread(numOfThread);
+        crawlerContext.setNumOfThread(numOfThread);
 
         // depth
         final int depth = webConfig.getDepth() != null ? webConfig.getDepth() : -1;
-        robotContext.setMaxDepth(depth);
+        crawlerContext.setMaxDepth(depth);
 
         // max count
         final long maxCount = webConfig.getMaxAccessCount() != null ? webConfig.getMaxAccessCount() : maxAccessCount;
-        robotContext.setMaxAccessCount(maxCount);
+        crawlerContext.setMaxAccessCount(maxCount);
 
-        webConfig.initializeClientFactory(s2Robot.getClientFactory());
+        webConfig.initializeClientFactory(crawler.getClientFactory());
 
         // set urls
         final String[] urls = urlsStr.split("[\r\n]");
@@ -187,7 +187,7 @@ public class WebFsIndexHelper implements Serializable {
     if (StringUtil.isNotBlank(u)) {
         final String urlValue = u.trim();
         if (!urlValue.startsWith("#")) {
-            s2Robot.addUrl(urlValue);
+            crawler.addUrl(urlValue);
             if (logger.isInfoEnabled()) {
                 logger.info("Target URL: " + urlValue);
             }
@@ -201,7 +201,7 @@ public class WebFsIndexHelper implements Serializable {
     if (StringUtil.isNotBlank(u)) {
         final String urlValue = u.trim();
         if (!urlValue.startsWith("#")) {
-            s2Robot.addIncludeFilter(urlValue);
+            crawler.addIncludeFilter(urlValue);
             if (logger.isInfoEnabled()) {
                 logger.info("Included URL: " + urlValue);
             }
@@ -215,7 +215,7 @@ public class WebFsIndexHelper implements Serializable {
     if (StringUtil.isNotBlank(u)) {
         final String urlValue = u.trim();
         if (!urlValue.startsWith("#")) {
-            s2Robot.addExcludeFilter(urlValue);
+            crawler.addExcludeFilter(urlValue);
             if (logger.isInfoEnabled()) {
                 logger.info("Excluded URL: " + urlValue);
             }
@@ -229,7 +229,7 @@ public class WebFsIndexHelper implements Serializable {
     for (final String u : excludedUrlList) {
         if (StringUtil.isNotBlank(u)) {
             final String urlValue = u.trim();
-            s2Robot.addExcludeFilter(urlValue);
+            crawler.addExcludeFilter(urlValue);
             if (logger.isInfoEnabled()) {
                 logger.info("Excluded URL from failures: " + urlValue);
             }
@@ -241,20 +241,20 @@ public class WebFsIndexHelper implements Serializable {
         logger.debug("Crawling " + urlsStr);
     }
 
-    s2Robot.setBackground(true);
-    s2Robot.setThreadPriority(crawlerPriority);
+    crawler.setBackground(true);
+    crawler.setThreadPriority(crawlerPriority);
 
-    s2RobotList.add(s2Robot);
-    s2RobotStatusList.add(Constants.READY);
+    crawlerList.add(crawler);
+    crawlerStatusList.add(Constants.READY);
 }
 
 // File
 for (final FileConfig fileConfig : fileConfigList) {
     final String sid = crawlingConfigHelper.store(sessionId, fileConfig);
 
-    // create s2robot
-    final S2Robot s2Robot = SingletonLaContainer.getComponent(S2Robot.class);
-    s2Robot.setSessionId(sid);
+    // create crawler
+    final Crawler crawler = SingletonLaContainer.getComponent(Crawler.class);
+    crawler.setSessionId(sid);
     sessionIdList.add(sid);
 
     final String pathsStr = fileConfig.getPaths();
@@ -265,26 +265,26 @@ public class WebFsIndexHelper implements Serializable {
 
     final int intervalTime =
             fileConfig.getIntervalTime() != null ? fileConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_FS;
-    ((FessIntervalController) s2Robot.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
+    ((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
 
     final String includedPathsStr = fileConfig.getIncludedPaths() != null ? fileConfig.getIncludedPaths() : StringUtil.EMPTY;
     final String excludedPathsStr = fileConfig.getExcludedPaths() != null ? fileConfig.getExcludedPaths() : StringUtil.EMPTY;
 
     // num of threads
-    final S2RobotContext robotContext = s2Robot.getRobotContext();
+    final CrawlerContext crawlerContext = crawler.getCrawlerContext();
    final int numOfThread =
             fileConfig.getNumOfThread() != null ? fileConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_FS;
-    robotContext.setNumOfThread(numOfThread);
+    crawlerContext.setNumOfThread(numOfThread);
 
     // depth
     final int depth = fileConfig.getDepth() != null ? fileConfig.getDepth() : -1;
-    robotContext.setMaxDepth(depth);
+    crawlerContext.setMaxDepth(depth);
 
     // max count
     final long maxCount = fileConfig.getMaxAccessCount() != null ? fileConfig.getMaxAccessCount() : maxAccessCount;
-    robotContext.setMaxAccessCount(maxCount);
+    crawlerContext.setMaxAccessCount(maxCount);
 
-    fileConfig.initializeClientFactory(s2Robot.getClientFactory());
+    fileConfig.initializeClientFactory(crawler.getClientFactory());
 
     // set paths
     final String[] paths = pathsStr.split("[\r\n]");
@@ -299,7 +299,7 @@ public class WebFsIndexHelper implements Serializable {
             u = "file:/" + u;
         }
     }
-    s2Robot.addUrl(u);
+    crawler.addUrl(u);
     if (logger.isInfoEnabled()) {
         logger.info("Target Path: " + u);
     }
@@ -321,7 +321,7 @@ public class WebFsIndexHelper implements Serializable {
     } else {
         urlValue = systemHelper.encodeUrlFilter(line);
     }
-    s2Robot.addIncludeFilter(urlValue);
+    crawler.addIncludeFilter(urlValue);
     if (logger.isInfoEnabled()) {
         logger.info("Included Path: " + urlValue);
     }
@@ -345,7 +345,7 @@ public class WebFsIndexHelper implements Serializable {
     } else {
         urlValue = systemHelper.encodeUrlFilter(line);
     }
-    s2Robot.addExcludeFilter(urlValue);
+    crawler.addExcludeFilter(urlValue);
     if (logger.isInfoEnabled()) {
         logger.info("Excluded Path: " + urlValue);
     }
@@ -361,7 +361,7 @@ public class WebFsIndexHelper implements Serializable {
     for (final String u : excludedUrlList) {
         if (StringUtil.isNotBlank(u)) {
             final String urlValue = u.trim();
-            s2Robot.addExcludeFilter(urlValue);
+            crawler.addExcludeFilter(urlValue);
             if (logger.isInfoEnabled()) {
                 logger.info("Excluded Path from failures: " + urlValue);
             }
@@ -373,11 +373,11 @@ public class WebFsIndexHelper implements Serializable {
         logger.debug("Crawling " + pathsStr);
     }
 
-    s2Robot.setBackground(true);
-    s2Robot.setThreadPriority(crawlerPriority);
+    crawler.setBackground(true);
+    crawler.setThreadPriority(crawlerPriority);
 
-    s2RobotList.add(s2Robot);
-    s2RobotStatusList.add(Constants.READY);
+    crawlerList.add(crawler);
+    crawlerStatusList.add(Constants.READY);
 }
 
 // run index update
@@ -386,7 +386,7 @@ public class WebFsIndexHelper implements Serializable {
 indexUpdater.setPriority(indexUpdaterPriority);
 indexUpdater.setSessionIdList(sessionIdList);
 indexUpdater.setDaemon(true);
-indexUpdater.setS2RobotList(s2RobotList);
+indexUpdater.setCrawlerList(crawlerList);
 for (final BoostDocumentRule rule : boostDocumentRuleService.getAvailableBoostDocumentRuleList()) {
     indexUpdater.addBoostDocumentRule(new org.codelibs.fess.indexer.BoostDocumentRule(rule));
 }
@@ -394,19 +394,19 @@ public class WebFsIndexHelper implements Serializable {
 
 int startedCrawlerNum = 0;
 int activeCrawlerNum = 0;
-while (startedCrawlerNum < s2RobotList.size()) {
+while (startedCrawlerNum < crawlerList.size()) {
     // Force to stop crawl
     if (systemHelper.isForceStop()) {
-        for (final S2Robot s2Robot : s2RobotList) {
-            s2Robot.stop();
+        for (final Crawler crawler : crawlerList) {
+            crawler.stop();
         }
         break;
     }
 
     if (activeCrawlerNum < multiprocessCrawlingCount) {
         // start crawling
-        s2RobotList.get(startedCrawlerNum).execute();
-        s2RobotStatusList.set(startedCrawlerNum, Constants.RUNNING);
+        crawlerList.get(startedCrawlerNum).execute();
+        crawlerStatusList.set(startedCrawlerNum, Constants.RUNNING);
         startedCrawlerNum++;
         activeCrawlerNum++;
         try {
@@ -419,10 +419,10 @@ public class WebFsIndexHelper implements Serializable {
 
 // check status
 for (int i = 0; i < startedCrawlerNum; i++) {
-    if (!s2RobotList.get(i).getRobotContext().isRunning() && s2RobotStatusList.get(i).equals(Constants.RUNNING)) {
-        s2RobotList.get(i).awaitTermination();
-        s2RobotStatusList.set(i, Constants.DONE);
-        final String sid = s2RobotList.get(i).getRobotContext().getSessionId();
+    if (!crawlerList.get(i).getCrawlerContext().isRunning() && crawlerStatusList.get(i).equals(Constants.RUNNING)) {
+        crawlerList.get(i).awaitTermination();
+        crawlerStatusList.set(i, Constants.DONE);
+        final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
         indexUpdater.addFinishedSessionId(sid);
         activeCrawlerNum--;
     }
@@ -437,20 +437,20 @@ public class WebFsIndexHelper implements Serializable {
 boolean finishedAll = false;
 while (!finishedAll) {
     finishedAll = true;
-    for (int i = 0; i < s2RobotList.size(); i++) {
-        s2RobotList.get(i).awaitTermination(crawlingExecutionInterval);
-        if (!s2RobotList.get(i).getRobotContext().isRunning() && !s2RobotStatusList.get(i).equals(Constants.DONE)) {
-            s2RobotStatusList.set(i, Constants.DONE);
-            final String sid = s2RobotList.get(i).getRobotContext().getSessionId();
+    for (int i = 0; i < crawlerList.size(); i++) {
+        crawlerList.get(i).awaitTermination(crawlingExecutionInterval);
+        if (!crawlerList.get(i).getCrawlerContext().isRunning() && !crawlerStatusList.get(i).equals(Constants.DONE)) {
+            crawlerStatusList.set(i, Constants.DONE);
+            final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
             indexUpdater.addFinishedSessionId(sid);
         }
-        if (!s2RobotStatusList.get(i).equals(Constants.DONE)) {
+        if (!crawlerStatusList.get(i).equals(Constants.DONE)) {
             finishedAll = false;
        }
     }
 }
-s2RobotList.clear();
-s2RobotStatusList.clear();
+crawlerList.clear();
+crawlerStatusList.clear();
 
 // put cralwing info
 final CrawlingSessionHelper crawlingSessionHelper = ComponentUtil.getCrawlingSessionHelper();

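Taken together, the WebFsIndexHelper hunks drive the renamed Crawler through the same lifecycle as the old S2Robot. A condensed sketch of that lifecycle, using only calls that appear in this diff (the component lookup and the literal settings are simplified):

import org.codelibs.fess.crawler.Crawler;
import org.codelibs.fess.crawler.CrawlerContext;
import org.lastaflute.di.core.SingletonLaContainer;

public class CrawlerLifecycleSketch {
    public void crawl(final String sessionId, final String url) {
        final Crawler crawler = SingletonLaContainer.getComponent(Crawler.class);
        crawler.setSessionId(sessionId);
        crawler.addUrl(url);

        final CrawlerContext context = crawler.getCrawlerContext();
        context.setNumOfThread(2);      // illustrative values
        context.setMaxDepth(-1);        // -1 means unlimited in the code above
        context.setMaxAccessCount(100L);

        crawler.setBackground(true);    // execute() then returns immediately
        crawler.execute();
        crawler.awaitTermination();     // block until this crawl finishes
    }
}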
@@ -53,7 +53,11 @@ public class BoostDocumentRule {
         return ((Boolean) value).booleanValue();
     }
 } catch (final Exception e) {
-    logger.warn("Failed to parse a doc for boost: " + map, e);
+    if (logger.isDebugEnabled()) {
+        logger.debug("Failed to evaluate \"" + matchExpression + "\" for " + map, e);
+    } else {
+        logger.warn("Failed to evaluate \"" + matchExpression + "\".");
+    }
 }
 
 return false;

@@ -36,17 +36,17 @@ import org.codelibs.fess.helper.IntervalControlHelper;
 import org.codelibs.fess.helper.SearchLogHelper;
 import org.codelibs.fess.helper.SystemHelper;
 import org.codelibs.fess.util.ComponentUtil;
-import org.codelibs.robot.S2Robot;
-import org.codelibs.robot.entity.AccessResult;
-import org.codelibs.robot.entity.AccessResultData;
-import org.codelibs.robot.entity.EsAccessResult;
-import org.codelibs.robot.entity.EsUrlQueue;
-import org.codelibs.robot.service.DataService;
-import org.codelibs.robot.service.UrlFilterService;
-import org.codelibs.robot.service.UrlQueueService;
-import org.codelibs.robot.service.impl.EsDataService;
-import org.codelibs.robot.transformer.Transformer;
-import org.codelibs.robot.util.EsResultList;
+import org.codelibs.fess.crawler.Crawler;
+import org.codelibs.fess.crawler.entity.AccessResult;
+import org.codelibs.fess.crawler.entity.AccessResultData;
+import org.codelibs.fess.crawler.entity.EsAccessResult;
+import org.codelibs.fess.crawler.entity.EsUrlQueue;
+import org.codelibs.fess.crawler.service.DataService;
+import org.codelibs.fess.crawler.service.UrlFilterService;
+import org.codelibs.fess.crawler.service.UrlQueueService;
+import org.codelibs.fess.crawler.service.impl.EsDataService;
+import org.codelibs.fess.crawler.transformer.Transformer;
+import org.codelibs.fess.crawler.util.EsResultList;
 import org.elasticsearch.action.search.SearchRequestBuilder;
 import org.elasticsearch.index.query.FilterBuilders;
 import org.elasticsearch.index.query.QueryBuilder;
@@ -122,7 +122,7 @@ public class IndexUpdater extends Thread {
 
     private final Map<String, Object> docValueMap = new HashMap<String, Object>();
 
-    private List<S2Robot> s2RobotList;
+    private List<Crawler> crawlerList;
 
     public IndexUpdater() {
         // nothing
@@ -176,7 +176,7 @@ public class IndexUpdater extends Thread {
             .boolFilter()
             .must(FilterBuilders.termsFilter(EsAccessResult.SESSION_ID, sessionIdList))
             .must(FilterBuilders.termFilter(EsAccessResult.STATUS,
-                    org.codelibs.robot.Constants.OK_STATUS)));
+                    org.codelibs.fess.crawler.Constants.OK_STATUS)));
     builder.setQuery(queryBuilder);
     builder.setFrom(0);
     if (maxDocumentCacheSize <= 0) {
@@ -507,8 +507,8 @@ public class IndexUpdater extends Thread {
 
     private void forceStop() {
         systemHelper.setForceStop(true);
-        for (final S2Robot s2Robot : s2RobotList) {
-            s2Robot.stop();
+        for (final Crawler crawler : crawlerList) {
+            crawler.stop();
         }
     }
 
@@ -557,7 +557,7 @@ public class IndexUpdater extends Thread {
         docValueMap.put(fieldName, value);
     }
 
-    public void setS2RobotList(final List<S2Robot> s2RobotList) {
-        this.s2RobotList = s2RobotList;
+    public void setCrawlerList(final List<Crawler> crawlerList) {
+        this.crawlerList = crawlerList;
     }
 }

@@ -45,9 +45,9 @@ import org.codelibs.fess.helper.UserAgentHelper;
 import org.codelibs.fess.helper.ViewHelper;
 import org.codelibs.fess.indexer.IndexUpdater;
 import org.codelibs.fess.job.JobExecutor;
-import org.codelibs.robot.entity.EsAccessResult;
-import org.codelibs.robot.extractor.ExtractorFactory;
-import org.codelibs.robot.service.DataService;
+import org.codelibs.fess.crawler.entity.EsAccessResult;
+import org.codelibs.fess.crawler.extractor.ExtractorFactory;
+import org.codelibs.fess.crawler.service.DataService;
 import org.lastaflute.core.message.MessageManager;
 import org.lastaflute.di.core.SingletonLaContainer;
 import org.lastaflute.di.core.factory.SingletonLaContainerFactory;
@@ -73,19 +73,19 @@ public class ResourceUtil {
         } catch (final Throwable e) { // NOSONAR
             // ignore
         }
-        Path path = Paths.get(".", names);
-        if (Files.exists(path)) {
-            return path;
+        final Path defaultPath = Paths.get("WEB-INF/" + base, names);
+        if (Files.exists(defaultPath)) {
+            return defaultPath;
         }
-        path = Paths.get("src/main/webapps/WEB-INF/" + base, names);
-        if (Files.exists(path)) {
-            return path;
+        final Path srcBasePath = Paths.get("src/main/webapps/WEB-INF/" + base, names);
+        if (Files.exists(srcBasePath)) {
+            return srcBasePath;
         }
-        path = Paths.get("target/fess/WEB-INF/" + base, names);
-        if (Files.exists(path)) {
-            return path;
+        final Path targetBasePath = Paths.get("target/fess/WEB-INF/" + base, names);
+        if (Files.exists(targetBasePath)) {
+            return targetBasePath;
         }
-        return path;
+        return defaultPath;
     }
 
     public static File[] getJarFiles(final String namePrefix) {
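The rewritten lookup above is a candidate-path search: try each known WEB-INF location in order, return the first that exists, and otherwise fall back to the default location. A compact equivalent sketch (illustrative only; the real method keeps three named locals rather than an array):

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class PathResolutionSketch {
    public static Path resolve(final String base, final String... names) {
        // Candidate roots mirror the hunk: packaged webapp, source
        // tree, then build output.
        final Path[] candidates = {
                Paths.get("WEB-INF/" + base, names),
                Paths.get("src/main/webapps/WEB-INF/" + base, names),
                Paths.get("target/fess/WEB-INF/" + base, names) };
        for (final Path candidate : candidates) {
            if (Files.exists(candidate)) {
                return candidate;
            }
        }
        // Same fallback as the patched method: return the default
        // location even if it does not exist yet.
        return candidates[0];
    }

    public static void main(final String[] args) {
        System.out.println(resolve("conf", "crawler.xml"));
    }
}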
@@ -10,8 +10,8 @@
     <include path="fess_api.xml"/>
     <include path="fess_dict.xml"/>
 
-    <include path="s2robot/client.xml" />
-    <include path="s2robot/mimetype.xml" />
+    <include path="crawler/client.xml" />
+    <include path="crawler/mimetype.xml" />
 
     <component name="authenticationCipher" class="org.codelibs.core.crypto.CachedCipher">
         <!-- CHANGE THE FOLLOWING KEY -->
src/main/resources/crawler.xml (new file, 52 lines)
@@ -0,0 +1,52 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
+    "http://dbflute.org/meta/lastadi10.dtd">
+<components namespace="fessCrawler">
+    <include path="crawler/container.xml"/>
+    <include path="crawler/client.xml"/>
+    <include path="crawler/rule.xml"/>
+    <include path="crawler/filter.xml"/>
+    <include path="crawler/interval.xml"/>
+    <include path="crawler/extractor.xml"/>
+    <include path="crawler/mimetype.xml"/>
+    <include path="crawler/encoding.xml"/>
+    <include path="crawler/urlconverter.xml"/>
+    <include path="crawler/log.xml"/>
+    <include path="crawler/sitemaps.xml"/>
+
+    <include path="crawler/es.xml"/>
+
+    <!-- Crawler -->
+    <component name="crawler" class="org.codelibs.fess.crawler.Crawler" instance="prototype" >
+    </component>
+
+    <!-- Crawler Thread -->
+    <component name="crawlerThread" class="org.codelibs.fess.crawler.FessCrawlerThread" instance="prototype" >
+    </component>
+
+    <!-- Entity -->
+    <component name="accessResult"
+        class="org.codelibs.fess.crawler.entity.EsAccessResult" instance="prototype">
+    </component>
+    <component name="urlQueue"
+        class="org.codelibs.fess.crawler.entity.EsUrlQueue" instance="prototype">
+    </component>
+
+    <!-- Service -->
+    <component name="urlQueueService"
+        class="org.codelibs.fess.crawler.service.impl.EsUrlQueueService">
+        <property name="index">".crawler"</property>
+        <property name="type">"queue"</property>
+    </component>
+    <component name="dataService"
+        class="org.codelibs.fess.crawler.service.impl.EsDataService">
+        <property name="index">".crawler"</property>
+        <property name="type">"data"</property>
+    </component>
+    <component name="urlFilterService"
+        class="org.codelibs.fess.crawler.service.impl.EsUrlFilterService">
+        <property name="index">".crawler"</property>
+        <property name="type">"filter"</property>
+    </component>
+
+</components>
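The new crawler.xml wires the crawler components into the LastaDi container under the fessCrawler namespace. A hedged sketch of how a prototype component declared this way is typically looked up at runtime; it assumes the container has already been initialized by the application, and the cast target is simply the class named in the XML.

import org.codelibs.fess.crawler.Crawler;
import org.lastaflute.di.core.SingletonLaContainer;

public class CrawlerLookupSketch {
    public static Crawler newCrawler() {
        // instance="prototype" in the XML means each lookup yields a
        // fresh Crawler instance.
        return SingletonLaContainer.getComponent("crawler");
    }
}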
@@ -1,11 +1,11 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
     "http://dbflute.org/meta/lastadi10.dtd">
-<components namespace="s2robot">
-    <include path="s2robot/container.xml" />
+<components namespace="fessCrawler">
+    <include path="crawler/container.xml" />
 
     <component name="contentLengthHelper"
-        class="org.codelibs.robot.helper.ContentLengthHelper" instance="singleton">
+        class="org.codelibs.fess.crawler.helper.ContentLengthHelper" instance="singleton">
         <property name="defaultMaxLength">10485760</property><!-- 10M -->
         <postConstruct name="addMaxLength">
             <arg>"text/html"</arg>
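The contentLengthHelper component above caps how much content the crawler will download, with per-MIME-type overrides registered via postConstruct. In Java terms it does roughly the following; the 2,621,440-byte figure for text/html is an assumption (the hunk is cut off before the argument list ends), as is the exact setter/adder signature.

import org.codelibs.fess.crawler.helper.ContentLengthHelper;

public class ContentLengthSketch {
    public static ContentLengthHelper configure() {
        final ContentLengthHelper helper = new ContentLengthHelper();
        helper.setDefaultMaxLength(10485760L); // 10M, as in the XML
        helper.addMaxLength("text/html", 2621440L); // assumed override value
        return helper;
    }
}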
@@ -3,6 +3,6 @@
     "http://dbflute.org/meta/lastadi10.dtd">
 <components>
     <component name="esClient"
-        class="org.codelibs.robot.client.EsClient">
+        class="org.codelibs.fess.crawler.client.EsClient">
     </component>
 </components>
@@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
     "http://dbflute.org/meta/lastadi10.dtd">
-<components namespace="s2robot">
-    <include path="s2robot/container.xml" />
+<components namespace="fessCrawler">
+    <include path="crawler/container.xml" />
 
     <component name="intervalController"
         class="org.codelibs.fess.crawler.interval.FessIntervalController"
@@ -1,10 +1,10 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
     "http://dbflute.org/meta/lastadi10.dtd">
-<components namespace="s2robot">
-    <include path="s2robot/container.xml" />
+<components namespace="fessCrawler">
+    <include path="crawler/container.xml" />
 
     <component name="logHelper"
-        class="org.codelibs.fess.helper.RobotLogHelper">
+        class="org.codelibs.fess.helper.CrawlerLogHelper">
     </component>
 </components>
@@ -1,11 +1,11 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
     "http://dbflute.org/meta/lastadi10.dtd">
-<components namespace="s2robot">
-    <include path="s2robot/container.xml" />
-    <include path="s2robot/transformer.xml" />
+<components namespace="fessCrawler">
+    <include path="crawler/container.xml" />
+    <include path="crawler/transformer.xml" />
 
-    <component name="ruleManager" class="org.codelibs.robot.rule.impl.RuleManagerImpl" instance="prototype">
+    <component name="ruleManager" class="org.codelibs.fess.crawler.rule.impl.RuleManagerImpl" instance="prototype">
         <postConstruct name="addRule">
             <arg>sitemapsRule</arg>
         </postConstruct>
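The ruleManager above registers rules in order (starting with sitemapsRule), and the first rule whose pattern matches a response supplies its response processor. An illustrative sketch of that first-match dispatch, using stand-in types rather than the fess-crawler API:

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

public class RuleDispatchSketch {
    // Stand-in for fess-crawler's rule abstraction.
    interface Rule {
        boolean matches(String mimeType, String url);
        String ruleId();
    }

    private final List<Rule> rules = new ArrayList<>();

    public void addRule(final Rule rule) {
        // Registration order matters: the first match wins, so the
        // catch-all defaultRule is added last.
        rules.add(rule);
    }

    public Optional<Rule> getRule(final String mimeType, final String url) {
        return rules.stream().filter(r -> r.matches(mimeType, url)).findFirst();
    }
}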
@@ -23,10 +23,10 @@
         </postConstruct>
     </component>
 
-    <component name="sitemapsRule" class="org.codelibs.robot.rule.impl.RegexRule" >
+    <component name="sitemapsRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
         <property name="ruleId">"sitemapsRule"</property>
         <property name="responseProcessor">
-            <component class="org.codelibs.robot.processor.impl.SitemapsResponseProcessor">
+            <component class="org.codelibs.fess.crawler.processor.impl.SitemapsResponseProcessor">
             </component>
         </property>
         <postConstruct name="addRule">
@@ -35,10 +35,10 @@
         </postConstruct>
     </component>
 
-    <component name="webHtmlRule" class="org.codelibs.robot.rule.impl.RegexRule" >
+    <component name="webHtmlRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
         <property name="ruleId">"webHtmlRule"</property>
         <property name="responseProcessor">
-            <component class="org.codelibs.robot.processor.impl.DefaultResponseProcessor">
+            <component class="org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor">
                 <property name="transformer">fessXpathTransformer</property>
                 <property name="successfulHttpCodes">(int[])[200]</property>
                 <property name="notModifiedHttpCodes">(int[])[304]</property>
@@ -56,10 +56,10 @@
         </postConstruct>
     </component>
 
-    <component name="webFileRule" class="org.codelibs.robot.rule.impl.RegexRule" >
+    <component name="webFileRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
         <property name="ruleId">"webFileRule"</property>
         <property name="responseProcessor">
-            <component class="org.codelibs.robot.processor.impl.DefaultResponseProcessor">
+            <component class="org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor">
                 <property name="transformer">fessFileTransformer</property>
                 <property name="successfulHttpCodes">(int[])[200]</property>
                 <property name="notModifiedHttpCodes">(int[])[304]</property>
@@ -85,10 +85,10 @@
         </postConstruct>
     </component>
 
-    <component name="fsFileRule" class="org.codelibs.robot.rule.impl.RegexRule" >
+    <component name="fsFileRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
         <property name="ruleId">"fsFileRule"</property>
         <property name="responseProcessor">
-            <component class="org.codelibs.robot.processor.impl.DefaultResponseProcessor">
+            <component class="org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor">
                 <property name="transformer">fessFileTransformer</property>
                 <property name="successfulHttpCodes">(int[])[200]</property>
                 <property name="notModifiedHttpCodes">(int[])[304]</property>
@@ -116,10 +116,10 @@
     </component>
 
 
-    <component name="defaultRule" class="org.codelibs.robot.rule.impl.RegexRule" >
+    <component name="defaultRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
         <property name="ruleId">"defaultRule"</property>
         <property name="responseProcessor">
-            <component class="org.codelibs.robot.processor.impl.DefaultResponseProcessor">
+            <component class="org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor">
                 <property name="transformer">fessTikaTransformer</property>
                 <property name="successfulHttpCodes">(int[])[200]</property>
                 <property name="notModifiedHttpCodes">(int[])[304]</property>
@@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
     "http://dbflute.org/meta/lastadi10.dtd">
-<components namespace="s2robot">
-    <include path="s2robot/transformer_basic.xml"/>
+<components namespace="fessCrawler">
+    <include path="crawler/transformer_basic.xml"/>
 
 
     <component name="fessXpathTransformer" class="org.codelibs.fess.crawler.transformer.FessXpathTransformer" instance="singleton">
@@ -1,52 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
-    "http://dbflute.org/meta/lastadi10.dtd">
-<components namespace="s2robot">
-    <include path="s2robot/container.xml"/>
-    <include path="s2robot/client.xml"/>
-    <include path="s2robot/rule.xml"/>
-    <include path="s2robot/filter.xml"/>
-    <include path="s2robot/interval.xml"/>
-    <include path="s2robot/extractor.xml"/>
-    <include path="s2robot/mimetype.xml"/>
-    <include path="s2robot/encoding.xml"/>
-    <include path="s2robot/urlconverter.xml"/>
-    <include path="s2robot/log.xml"/>
-    <include path="s2robot/sitemaps.xml"/>
-
-    <include path="s2robot/es.xml"/>
-
-    <!-- S2Robot -->
-    <component name="s2Robot" class="org.codelibs.robot.S2Robot" instance="prototype" >
-    </component>
-
-    <!-- Robot Thread -->
-    <component name="robotThread" class="org.codelibs.fess.crawler.FessS2RobotThread" instance="prototype" >
-    </component>
-
-    <!-- Entity -->
-    <component name="accessResult"
-        class="org.codelibs.robot.entity.EsAccessResult" instance="prototype">
-    </component>
-    <component name="urlQueue"
-        class="org.codelibs.robot.entity.EsUrlQueue" instance="prototype">
-    </component>
-
-    <!-- Service -->
-    <component name="urlQueueService"
-        class="org.codelibs.robot.service.impl.EsUrlQueueService">
-        <property name="index">".robot"</property>
-        <property name="type">"queue"</property>
-    </component>
-    <component name="dataService"
-        class="org.codelibs.robot.service.impl.EsDataService">
-        <property name="index">".robot"</property>
-        <property name="type">"data"</property>
-    </component>
-    <component name="urlFilterService"
-        class="org.codelibs.robot.service.impl.EsUrlFilterService">
-        <property name="index">".robot"</property>
-        <property name="type">"filter"</property>
-    </component>
-
-</components>
@@ -5,7 +5,7 @@
     <include path="convention.xml" />
     <include path="fess.xml" />
 
-    <include path="s2robot_es.xml" />
+    <include path="crawler_es.xml" />
 
     <component name="indexingHelper" class="org.codelibs.fess.helper.IndexingHelper">
     </component>