replace with fess-crawler

Shinsuke Sugaya 2015-10-11 15:51:14 +09:00
parent 40a081f433
commit 8dbc3e9b77
40 changed files with 333 additions and 326 deletions

pom.xml
View file

@ -55,8 +55,8 @@
<junit.version>4.8.2</junit.version>
<utflute.version>0.5.2</utflute.version>
<!-- S2Robot -->
<s2robot.version>1.0.0-SNAPSHOT</s2robot.version>
<!-- Crawler -->
<crawler.version>1.0.0-SNAPSHOT</crawler.version>
<tika.version>1.6</tika.version>
<poi.version>3.11-beta2</poi.version>
<pdfbox.version>1.8.7</pdfbox.version>
@ -466,11 +466,11 @@
<version>2.2.1</version>
</dependency>
<!-- s2robot -->
<!-- Crawler -->
<dependency>
<groupId>org.codelibs.robot</groupId>
<artifactId>s2robot-lasta</artifactId>
<version>${s2robot.version}</version>
<groupId>org.codelibs.fess</groupId>
<artifactId>fess-crawler-lasta</artifactId>
<version>${crawler.version}</version>
<exclusions>
<exclusion>
<groupId>commons-logging</groupId>
@ -479,9 +479,9 @@
</exclusions>
</dependency>
<dependency>
<groupId>org.codelibs.robot</groupId>
<artifactId>s2robot-es</artifactId>
<version>${s2robot.version}</version>
<groupId>org.codelibs.fess</groupId>
<artifactId>fess-crawler-es</artifactId>
<version>${crawler.version}</version>
</dependency>
<dependency>
<groupId>org.bouncycastle</groupId>

View file

@ -139,7 +139,7 @@ public class WebConfigEditForm implements Serializable {
sortOrder = "0";
userAgent = ComponentUtil.getUserAgentName();
if (StringUtil.isBlank(userAgent)) {
userAgent = "Fess Robot/" + Constants.FESS_VERSION;
userAgent = "FessCrawler/" + Constants.FESS_VERSION;
}
numOfThread = Integer.toString(Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB);
intervalTime = Integer.toString(Constants.DEFAULT_INTERVAL_TIME_FOR_WEB);

View file

@ -39,7 +39,7 @@ import org.codelibs.fess.helper.JobHelper;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.job.TriggeredJob;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.util.CharUtil;
import org.codelibs.fess.crawler.util.CharUtil;
import org.lastaflute.web.Execute;
import org.lastaflute.web.callback.ActionRuntime;
import org.lastaflute.web.response.HtmlResponse;

View file

@ -34,7 +34,7 @@ import org.codelibs.fess.helper.SearchLogHelper;
import org.codelibs.fess.helper.ViewHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.DocumentUtil;
import org.codelibs.robot.util.CharUtil;
import org.codelibs.fess.crawler.util.CharUtil;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.index.query.TermQueryBuilder;
import org.lastaflute.web.Execute;

View file

@ -36,29 +36,29 @@ import org.codelibs.fess.helper.FieldHelper;
import org.codelibs.fess.helper.IndexingHelper;
import org.codelibs.fess.helper.SambaHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.S2RobotThread;
import org.codelibs.robot.builder.RequestDataBuilder;
import org.codelibs.robot.client.S2RobotClient;
import org.codelibs.robot.client.smb.SmbClient;
import org.codelibs.robot.entity.RequestData;
import org.codelibs.robot.entity.ResponseData;
import org.codelibs.robot.entity.UrlQueue;
import org.codelibs.robot.log.LogType;
import org.codelibs.fess.crawler.CrawlerThread;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.client.CrawlerClient;
import org.codelibs.fess.crawler.client.smb.SmbClient;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.log.LogType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import jcifs.smb.ACE;
import jcifs.smb.SID;
public class FessS2RobotThread extends S2RobotThread {
private static final Logger logger = LoggerFactory.getLogger(FessS2RobotThread.class);
public class FessCrawlerThread extends CrawlerThread {
private static final Logger logger = LoggerFactory.getLogger(FessCrawlerThread.class);
@Override
protected boolean isContentUpdated(final S2RobotClient client, final UrlQueue urlQueue) {
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue urlQueue) {
final DynamicProperties crawlerProperties = ComponentUtil.getCrawlerProperties();
if (crawlerProperties.getProperty(Constants.DIFF_CRAWLING_PROPERTY, Constants.TRUE).equals(Constants.TRUE)) {
log(logHelper, LogType.CHECK_LAST_MODIFIED, robotContext, urlQueue);
log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
final long startTime = System.currentTimeMillis();
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
@ -72,7 +72,7 @@ public class FessS2RobotThread extends S2RobotThread {
final String url = urlQueue.getUrl();
ResponseData responseData = null;
try {
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(robotContext.getSessionId());
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
final Map<String, Object> dataMap = new HashMap<String, Object>();
dataMap.put(fieldHelper.urlField, url);
final List<String> roleTypeList = new ArrayList<String>();
@ -140,12 +140,12 @@ public class FessS2RobotThread extends S2RobotThread {
return true;
} else if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
log(logHelper, LogType.NOT_MODIFIED, robotContext, urlQueue);
log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
responseData.setExecutionTime(System.currentTimeMillis() - startTime);
responseData.setParentUrl(urlQueue.getParentUrl());
responseData.setSessionId(robotContext.getSessionId());
responseData.setHttpStatusCode(org.codelibs.robot.Constants.NOT_MODIFIED_STATUS);
responseData.setSessionId(crawlerContext.getSessionId());
responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
processResponse(urlQueue, responseData);
storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fieldHelper.anchorField)));
@ -163,7 +163,7 @@ public class FessS2RobotThread extends S2RobotThread {
protected void storeChildUrlsToQueue(final UrlQueue urlQueue, final Set<RequestData> childUrlSet) {
if (childUrlSet != null) {
synchronized (robotContext.getAccessCountLock()) {
synchronized (crawlerContext.getAccessCountLock()) {
// add an url
storeChildUrls(childUrlSet, urlQueue.getUrl(), urlQueue.getDepth() != null ? urlQueue.getDepth() + 1 : 1);
}

View file

@ -18,7 +18,7 @@ package org.codelibs.fess.crawler.interval;
import org.codelibs.fess.helper.IntervalControlHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.interval.impl.DefaultIntervalController;
import org.codelibs.fess.crawler.interval.impl.DefaultIntervalController;
public class FessIntervalController extends DefaultIntervalController {

View file

@ -48,17 +48,17 @@ import org.codelibs.fess.helper.PathMappingHelper;
import org.codelibs.fess.helper.SambaHelper;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.client.smb.SmbClient;
import org.codelibs.robot.entity.AccessResult;
import org.codelibs.robot.entity.AccessResultData;
import org.codelibs.robot.entity.ExtractData;
import org.codelibs.robot.entity.ResponseData;
import org.codelibs.robot.entity.ResultData;
import org.codelibs.robot.entity.UrlQueue;
import org.codelibs.robot.exception.RobotCrawlAccessException;
import org.codelibs.robot.exception.RobotSystemException;
import org.codelibs.robot.extractor.Extractor;
import org.codelibs.robot.util.CrawlingParameterUtil;
import org.codelibs.fess.crawler.client.smb.SmbClient;
import org.codelibs.fess.crawler.entity.AccessResult;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.ExtractData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -96,7 +96,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
@Override
public ResultData transform(final ResponseData responseData) {
if (responseData == null || responseData.getResponseBody() == null) {
throw new RobotCrawlAccessException("No response body.");
throw new CrawlingAccessException("No response body.");
}
final Extractor extractor = getExtractor(responseData);
@ -144,8 +144,8 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
}
}
} catch (final Exception e) {
final RobotCrawlAccessException rcae = new RobotCrawlAccessException("Could not get a text from " + responseData.getUrl(), e);
rcae.setLogLevel(RobotCrawlAccessException.WARN);
final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
rcae.setLogLevel(CrawlingAccessException.WARN);
throw rcae;
} finally {
IOUtils.closeQuietly(in);
@ -323,7 +323,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
try {
resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
} catch (final Exception e) {
throw new RobotCrawlAccessException("Could not serialize object: " + url, e);
throw new CrawlingAccessException("Could not serialize object: " + url, e);
}
resultData.setEncoding(charsetName);
@ -466,7 +466,7 @@ public abstract class AbstractFessFileTransformer extends AbstractFessXpathTrans
try {
return SerializeUtil.fromBinaryToObject(data);
} catch (final Exception e) {
throw new RobotSystemException("Could not create an instanced from bytes.", e);
throw new CrawlerSystemException("Could not create an instanced from bytes.", e);
}
}
return new HashMap<String, Object>();

View file

@ -25,7 +25,7 @@ import org.apache.commons.lang3.StringUtils;
import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.helper.FieldHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.transformer.impl.XpathTransformer;
import org.codelibs.fess.crawler.transformer.impl.XpathTransformer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View file

@ -18,9 +18,9 @@ package org.codelibs.fess.crawler.transformer;
import org.codelibs.fess.exception.FessSystemException;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.entity.ResponseData;
import org.codelibs.robot.extractor.Extractor;
import org.codelibs.robot.extractor.ExtractorFactory;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.codelibs.fess.crawler.extractor.ExtractorFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

View file

@ -17,8 +17,8 @@
package org.codelibs.fess.crawler.transformer;
import org.codelibs.fess.exception.FessSystemException;
import org.codelibs.robot.entity.ResponseData;
import org.codelibs.robot.extractor.Extractor;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.extractor.Extractor;
import org.lastaflute.di.core.SingletonLaContainer;
public class FessTikaTransformer extends AbstractFessFileTransformer {

View file

@ -50,17 +50,17 @@ import org.codelibs.fess.helper.OverlappingHostHelper;
import org.codelibs.fess.helper.PathMappingHelper;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.builder.RequestDataBuilder;
import org.codelibs.robot.entity.AccessResultData;
import org.codelibs.robot.entity.RequestData;
import org.codelibs.robot.entity.ResponseData;
import org.codelibs.robot.entity.ResultData;
import org.codelibs.robot.entity.UrlQueue;
import org.codelibs.robot.exception.ChildUrlsException;
import org.codelibs.robot.exception.RobotCrawlAccessException;
import org.codelibs.robot.exception.RobotSystemException;
import org.codelibs.robot.util.CrawlingParameterUtil;
import org.codelibs.robot.util.ResponseDataUtil;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.util.CrawlingParameterUtil;
import org.codelibs.fess.crawler.util.ResponseDataUtil;
import org.cyberneko.html.parsers.DOMParser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -116,7 +116,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
}
parser.parse(is);
} catch (final Exception e) {
throw new RobotCrawlAccessException("Could not parse " + responseData.getUrl(), e);
throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
} finally {
IOUtils.closeQuietly(bis);
}
@ -172,7 +172,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
try {
resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
} catch (final Exception e) {
throw new RobotCrawlAccessException("Could not serialize object: " + responseData.getUrl(), e);
throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
}
resultData.setEncoding(charsetName);
} finally {
@ -500,7 +500,7 @@ public class FessXpathTransformer extends AbstractFessXpathTransformer {
try {
return SerializeUtil.fromBinaryToObject(data);
} catch (final Exception e) {
throw new RobotSystemException("Could not create an instanced from bytes.", e);
throw new CrawlerSystemException("Could not create an instanced from bytes.", e);
}
}
return new HashMap<String, Object>();

View file

@ -16,9 +16,9 @@
package org.codelibs.fess.ds;
import org.codelibs.robot.exception.RobotCrawlAccessException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
public class DataStoreCrawlingException extends RobotCrawlAccessException {
public class DataStoreCrawlingException extends CrawlingAccessException {
private static final long serialVersionUID = 1L;

View file

@ -36,8 +36,8 @@ import org.codelibs.fess.ds.DataStoreCrawlingException;
import org.codelibs.fess.ds.DataStoreException;
import org.codelibs.fess.ds.IndexUpdateCallback;
import org.codelibs.fess.es.exentity.DataConfig;
import org.codelibs.robot.exception.RobotCrawlAccessException;
import org.codelibs.robot.exception.RobotMultipleCrawlAccessException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;
import org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException;
import org.lastaflute.di.core.SingletonLaContainer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -230,10 +230,10 @@ public class CsvDataStoreImpl extends AbstractDataStoreImpl {
try {
loop = callback.store(dataMap);
} catch (final RobotCrawlAccessException e) {
} catch (final CrawlingAccessException e) {
Throwable target = e;
if (target instanceof RobotMultipleCrawlAccessException) {
final Throwable[] causes = ((RobotMultipleCrawlAccessException) target).getCauses();
if (target instanceof MultipleCrawlingAccessException) {
final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
if (causes.length > 0) {
target = causes[causes.length - 1];
}

View file

@ -34,17 +34,17 @@ import org.codelibs.fess.helper.CrawlingSessionHelper;
import org.codelibs.fess.helper.FieldHelper;
import org.codelibs.fess.helper.IndexingHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.builder.RequestDataBuilder;
import org.codelibs.robot.client.S2RobotClient;
import org.codelibs.robot.client.S2RobotClientFactory;
import org.codelibs.robot.entity.ResponseData;
import org.codelibs.robot.entity.ResultData;
import org.codelibs.robot.exception.RobotSystemException;
import org.codelibs.robot.processor.ResponseProcessor;
import org.codelibs.robot.processor.impl.DefaultResponseProcessor;
import org.codelibs.robot.rule.Rule;
import org.codelibs.robot.rule.RuleManager;
import org.codelibs.robot.transformer.Transformer;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.client.CrawlerClient;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
import org.codelibs.fess.crawler.processor.ResponseProcessor;
import org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor;
import org.codelibs.fess.crawler.rule.Rule;
import org.codelibs.fess.crawler.rule.RuleManager;
import org.codelibs.fess.crawler.transformer.Transformer;
import org.lastaflute.di.core.SingletonLaContainer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -71,7 +71,7 @@ public class FileListDataStoreImpl extends CsvDataStoreImpl {
public int maxDeleteDocumentCacheSize = 100;
protected S2RobotClientFactory robotClientFactory;
protected CrawlerClientFactory crawlerClientFactory;
protected CrawlingSessionHelper crawlingSessionHelper;
@ -92,9 +92,9 @@ public class FileListDataStoreImpl extends CsvDataStoreImpl {
@Override
public void store(final DataConfig config, final IndexUpdateCallback callback, final Map<String, String> initParamMap) {
robotClientFactory = SingletonLaContainer.getComponent(S2RobotClientFactory.class);
crawlerClientFactory = SingletonLaContainer.getComponent(CrawlerClientFactory.class);
config.initializeClientFactory(robotClientFactory);
config.initializeClientFactory(crawlerClientFactory);
super.store(config, callback, initParamMap);
}
@ -170,9 +170,9 @@ public class FileListDataStoreImpl extends CsvDataStoreImpl {
final String url = dataMap.get(fieldHelper.urlField).toString();
try {
final S2RobotClient client = robotClientFactory.getClient(url);
final CrawlerClient client = crawlerClientFactory.getClient(url);
if (client == null) {
logger.warn("S2RobotClient is null. Data: " + dataMap);
logger.warn("CrawlerClient is null. Data: " + dataMap);
return false;
}
@ -200,7 +200,7 @@ public class FileListDataStoreImpl extends CsvDataStoreImpl {
(Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
dataMap.putAll(responseDataMap);
} catch (final Exception e) {
throw new RobotSystemException("Could not create an instance from bytes.", e);
throw new CrawlerSystemException("Could not create an instance from bytes.", e);
}
}

View file

@ -2,7 +2,7 @@ package org.codelibs.fess.es.exentity;
import java.util.Map;
import org.codelibs.robot.client.S2RobotClientFactory;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
public interface CrawlingConfig {
@ -20,7 +20,7 @@ public interface CrawlingConfig {
String getConfigId();
void initializeClientFactory(S2RobotClientFactory s2RobotClientFactory);
void initializeClientFactory(CrawlerClientFactory crawlerClientFactory);
Map<String, String> getConfigParameterMap(ConfigName name);

View file

@ -24,13 +24,13 @@ import org.codelibs.fess.es.exbhv.LabelTypeBhv;
import org.codelibs.fess.es.exbhv.RoleTypeBhv;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.ParameterUtil;
import org.codelibs.robot.client.S2RobotClientFactory;
import org.codelibs.robot.client.http.Authentication;
import org.codelibs.robot.client.http.HcHttpClient;
import org.codelibs.robot.client.http.impl.AuthenticationImpl;
import org.codelibs.robot.client.http.ntlm.JcifsEngine;
import org.codelibs.robot.client.smb.SmbAuthentication;
import org.codelibs.robot.client.smb.SmbClient;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.crawler.client.http.Authentication;
import org.codelibs.fess.crawler.client.http.HcHttpClient;
import org.codelibs.fess.crawler.client.http.impl.AuthenticationImpl;
import org.codelibs.fess.crawler.client.http.ntlm.JcifsEngine;
import org.codelibs.fess.crawler.client.smb.SmbAuthentication;
import org.codelibs.fess.crawler.client.smb.SmbClient;
import org.dbflute.cbean.result.ListResultBean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -44,15 +44,15 @@ public class DataConfig extends BsDataConfig implements CrawlingConfig {
private static final Logger logger = LoggerFactory.getLogger(DataConfig.class);
private static final String S2ROBOT_WEB_HEADER_PREFIX = "s2robot.web.header.";
private static final String S2ROBOT_WEB_HEADER_PREFIX = "crawler.web.header.";
private static final String S2ROBOT_WEB_AUTH = "s2robot.web.auth";
private static final String S2ROBOT_WEB_AUTH = "crawler.web.auth";
private static final String S2ROBOT_USERAGENT = "s2robot.useragent";
private static final String S2ROBOT_USERAGENT = "crawler.useragent";
private static final String S2ROBOT_PARAM_PREFIX = "s2robot.param.";
private static final String S2ROBOT_PARAM_PREFIX = "crawler.param.";
private static final Object S2ROBOT_FILE_AUTH = "s2robot.file.auth";
private static final Object S2ROBOT_FILE_AUTH = "crawler.file.auth";
private String[] labelTypeIds;
@ -211,11 +211,11 @@ public class DataConfig extends BsDataConfig implements CrawlingConfig {
}
@Override
public void initializeClientFactory(final S2RobotClientFactory robotClientFactory) {
public void initializeClientFactory(final CrawlerClientFactory crawlerClientFactory) {
final Map<String, String> paramMap = getHandlerParameterMap();
final Map<String, Object> factoryParamMap = new HashMap<String, Object>();
robotClientFactory.setInitParameterMap(factoryParamMap);
crawlerClientFactory.setInitParameterMap(factoryParamMap);
// parameters
for (final Map.Entry<String, String> entry : paramMap.entrySet()) {
@ -301,18 +301,19 @@ public class DataConfig extends BsDataConfig implements CrawlingConfig {
}
// request header
final List<org.codelibs.robot.client.http.RequestHeader> rhList = new ArrayList<org.codelibs.robot.client.http.RequestHeader>();
final List<org.codelibs.fess.crawler.client.http.RequestHeader> rhList =
new ArrayList<org.codelibs.fess.crawler.client.http.RequestHeader>();
int count = 1;
String headerName = paramMap.get(S2ROBOT_WEB_HEADER_PREFIX + count + ".name");
while (StringUtil.isNotBlank(headerName)) {
final String headerValue = paramMap.get(S2ROBOT_WEB_HEADER_PREFIX + count + ".value");
rhList.add(new org.codelibs.robot.client.http.RequestHeader(headerName, headerValue));
rhList.add(new org.codelibs.fess.crawler.client.http.RequestHeader(headerName, headerValue));
count++;
headerName = paramMap.get(S2ROBOT_WEB_HEADER_PREFIX + count + ".name");
}
if (!rhList.isEmpty()) {
factoryParamMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY,
rhList.toArray(new org.codelibs.robot.client.http.RequestHeader[rhList.size()]));
rhList.toArray(new org.codelibs.fess.crawler.client.http.RequestHeader[rhList.size()]));
}
// file auth

View file

@ -18,9 +18,9 @@ import org.codelibs.fess.es.exbhv.RoleTypeBhv;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.ParameterUtil;
import org.codelibs.robot.client.S2RobotClientFactory;
import org.codelibs.robot.client.smb.SmbAuthentication;
import org.codelibs.robot.client.smb.SmbClient;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.crawler.client.smb.SmbAuthentication;
import org.codelibs.fess.crawler.client.smb.SmbClient;
import org.dbflute.cbean.result.ListResultBean;
import org.lastaflute.di.core.SingletonLaContainer;
@ -228,7 +228,7 @@ public class FileConfig extends BsFileConfig implements CrawlingConfig {
}
@Override
public void initializeClientFactory(final S2RobotClientFactory clientFactory) {
public void initializeClientFactory(final CrawlerClientFactory clientFactory) {
final FileAuthenticationService fileAuthenticationService = SingletonLaContainer.getComponent(FileAuthenticationService.class);
// Parameters

View file

@ -30,8 +30,8 @@ public class RequestHeader extends BsRequestHeader {
asDocMeta().version(version);
}
public org.codelibs.robot.client.http.RequestHeader getS2RobotRequestHeader() {
return new org.codelibs.robot.client.http.RequestHeader(getName(), getValue());
public org.codelibs.fess.crawler.client.http.RequestHeader getCrawlerRequestHeader() {
return new org.codelibs.fess.crawler.client.http.RequestHeader(getName(), getValue());
}
public WebConfig getWebConfig() {

View file

@ -16,10 +16,10 @@ import org.codelibs.fess.app.service.WebConfigService;
import org.codelibs.fess.es.bsentity.BsWebAuthentication;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.ParameterUtil;
import org.codelibs.robot.client.http.Authentication;
import org.codelibs.robot.client.http.impl.AuthenticationImpl;
import org.codelibs.robot.client.http.ntlm.JcifsEngine;
import org.codelibs.robot.exception.RobotSystemException;
import org.codelibs.fess.crawler.client.http.Authentication;
import org.codelibs.fess.crawler.client.http.impl.AuthenticationImpl;
import org.codelibs.fess.crawler.client.http.ntlm.JcifsEngine;
import org.codelibs.fess.crawler.exception.CrawlerSystemException;
/**
* @author FreeGen
@ -71,7 +71,7 @@ public class WebAuthentication extends BsWebAuthentication {
private Credentials getCredentials() {
if (StringUtil.isEmpty(getUsername())) {
throw new RobotSystemException("username is empty.");
throw new CrawlerSystemException("username is empty.");
}
if (Constants.NTLM.equals(getProtocolScheme())) {

View file

@ -18,9 +18,9 @@ import org.codelibs.fess.es.exbhv.WebConfigToLabelBhv;
import org.codelibs.fess.es.exbhv.WebConfigToRoleBhv;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.ParameterUtil;
import org.codelibs.robot.client.S2RobotClientFactory;
import org.codelibs.robot.client.http.Authentication;
import org.codelibs.robot.client.http.HcHttpClient;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.crawler.client.http.Authentication;
import org.codelibs.fess.crawler.client.http.HcHttpClient;
import org.dbflute.cbean.result.ListResultBean;
import org.lastaflute.di.core.SingletonLaContainer;
@ -232,7 +232,7 @@ public class WebConfig extends BsWebConfig implements CrawlingConfig {
}
@Override
public void initializeClientFactory(final S2RobotClientFactory clientFactory) {
public void initializeClientFactory(final CrawlerClientFactory clientFactory) {
final WebAuthenticationService webAuthenticationService = SingletonLaContainer.getComponent(WebAuthenticationService.class);
final RequestHeaderService requestHeaderService = SingletonLaContainer.getComponent(RequestHeaderService.class);
@ -259,11 +259,13 @@ public class WebConfig extends BsWebConfig implements CrawlingConfig {
// request header
final List<RequestHeader> requestHeaderList = requestHeaderService.getRequestHeaderList(getId());
final List<org.codelibs.robot.client.http.RequestHeader> rhList = new ArrayList<org.codelibs.robot.client.http.RequestHeader>();
final List<org.codelibs.fess.crawler.client.http.RequestHeader> rhList =
new ArrayList<org.codelibs.fess.crawler.client.http.RequestHeader>();
for (final RequestHeader requestHeader : requestHeaderList) {
rhList.add(requestHeader.getS2RobotRequestHeader());
rhList.add(requestHeader.getCrawlerRequestHeader());
}
paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, rhList.toArray(new org.codelibs.robot.client.http.RequestHeader[rhList.size()]));
paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY,
rhList.toArray(new org.codelibs.fess.crawler.client.http.RequestHeader[rhList.size()]));
}

View file

@ -46,7 +46,7 @@ import org.codelibs.fess.helper.PathMappingHelper;
import org.codelibs.fess.helper.WebFsIndexHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.ResourceUtil;
import org.codelibs.robot.client.EsClient;
import org.codelibs.fess.crawler.client.EsClient;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.kohsuke.args4j.CmdLineException;

View file

@ -19,29 +19,29 @@ package org.codelibs.fess.helper;
import org.codelibs.fess.app.service.FailureUrlService;
import org.codelibs.fess.es.exentity.CrawlingConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.S2RobotContext;
import org.codelibs.robot.entity.UrlQueue;
import org.codelibs.robot.exception.RobotMultipleCrawlAccessException;
import org.codelibs.robot.helper.impl.LogHelperImpl;
import org.codelibs.robot.log.LogType;
import org.codelibs.fess.crawler.CrawlerContext;
import org.codelibs.fess.crawler.entity.UrlQueue;
import org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException;
import org.codelibs.fess.crawler.helper.impl.LogHelperImpl;
import org.codelibs.fess.crawler.log.LogType;
import org.lastaflute.di.core.SingletonLaContainer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class RobotLogHelper extends LogHelperImpl {
public class CrawlerLogHelper extends LogHelperImpl {
private static final Logger logger = LoggerFactory // NOPMD
.getLogger(RobotLogHelper.class);
.getLogger(CrawlerLogHelper.class);
@Override
public void log(final LogType key, final Object... objs) {
try {
switch (key) {
case CRAWLING_ACCESS_EXCEPTION: {
final S2RobotContext robotContext = (S2RobotContext) objs[0];
final CrawlerContext crawlerContext = (CrawlerContext) objs[0];
final UrlQueue urlQueue = (UrlQueue) objs[1];
Throwable e = (Throwable) objs[2];
if (e instanceof RobotMultipleCrawlAccessException) {
final Throwable[] causes = ((RobotMultipleCrawlAccessException) e).getCauses();
if (e instanceof MultipleCrawlingAccessException) {
final Throwable[] causes = ((MultipleCrawlingAccessException) e).getCauses();
if (causes.length > 0) {
e = causes[causes.length - 1];
}
@ -54,15 +54,15 @@ public class RobotLogHelper extends LogHelperImpl {
} else {
errorName = e.getClass().getCanonicalName();
}
storeFailureUrl(robotContext, urlQueue, errorName, e);
storeFailureUrl(crawlerContext, urlQueue, errorName, e);
break;
}
case CRAWLING_EXCETPION: {
final S2RobotContext robotContext = (S2RobotContext) objs[0];
final CrawlerContext crawlerContext = (CrawlerContext) objs[0];
final UrlQueue urlQueue = (UrlQueue) objs[1];
final Throwable e = (Throwable) objs[2];
storeFailureUrl(robotContext, urlQueue, e.getClass().getCanonicalName(), e);
storeFailureUrl(crawlerContext, urlQueue, e.getClass().getCanonicalName(), e);
break;
}
default:
@ -75,9 +75,9 @@ public class RobotLogHelper extends LogHelperImpl {
super.log(key, objs);
}
private void storeFailureUrl(final S2RobotContext robotContext, final UrlQueue urlQueue, final String errorName, final Throwable e) {
private void storeFailureUrl(final CrawlerContext crawlerContext, final UrlQueue urlQueue, final String errorName, final Throwable e) {
final CrawlingConfig crawlingConfig = getCrawlingConfig(robotContext.getSessionId());
final CrawlingConfig crawlingConfig = getCrawlingConfig(crawlerContext.getSessionId());
final String url = urlQueue.getUrl();
final FailureUrlService failureUrlService = SingletonLaContainer.getComponent(FailureUrlService.class);

View file

@ -132,8 +132,8 @@ public class DataIndexHelper implements Serializable {
while (startedCrawlerNum < dataCrawlingThreadList.size()) {
// Force to stop crawl
if (systemHelper.isForceStop()) {
for (final DataCrawlingThread s2Robot : dataCrawlingThreadList) {
s2Robot.stopCrawling();
for (final DataCrawlingThread crawlerThread : dataCrawlingThreadList) {
crawlerThread.stopCrawling();
}
break;
}

View file

@ -42,7 +42,7 @@ import org.codelibs.fess.Constants;
import org.codelibs.fess.app.service.RoleTypeService;
import org.codelibs.fess.es.exentity.RoleType;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.util.CharUtil;
import org.codelibs.fess.crawler.util.CharUtil;
import org.lastaflute.di.core.SingletonLaContainer;
import org.lastaflute.web.util.LaRequestUtil;

View file

@ -56,11 +56,11 @@ import org.codelibs.fess.helper.UserAgentHelper.UserAgentType;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.fess.util.DocumentUtil;
import org.codelibs.fess.util.ResourceUtil;
import org.codelibs.robot.builder.RequestDataBuilder;
import org.codelibs.robot.client.S2RobotClient;
import org.codelibs.robot.client.S2RobotClientFactory;
import org.codelibs.robot.entity.ResponseData;
import org.codelibs.robot.util.CharUtil;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.client.CrawlerClient;
import org.codelibs.fess.crawler.client.CrawlerClientFactory;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.util.CharUtil;
import org.lastaflute.di.core.SingletonLaContainer;
import org.lastaflute.taglib.function.LaFunctions;
import org.lastaflute.web.response.StreamResponse;
@ -508,11 +508,11 @@ public class ViewHelper implements Serializable {
throw new FessSystemException("No crawlingConfig: " + configIdObj);
}
final String url = (String) doc.get(fieldHelper.urlField);
final S2RobotClientFactory robotClientFactory = SingletonLaContainer.getComponent(S2RobotClientFactory.class);
config.initializeClientFactory(robotClientFactory);
final S2RobotClient client = robotClientFactory.getClient(url);
final CrawlerClientFactory crawlerClientFactory = SingletonLaContainer.getComponent(CrawlerClientFactory.class);
config.initializeClientFactory(crawlerClientFactory);
final CrawlerClient client = crawlerClientFactory.getClient(url);
if (client == null) {
throw new FessSystemException("No S2RobotClient: " + configIdObj + ", url: " + url);
throw new FessSystemException("No CrawlerClient: " + configIdObj + ", url: " + url);
}
final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build());
final StreamResponse response = new StreamResponse(StringUtil.EMPTY);

View file

@ -37,11 +37,11 @@ import org.codelibs.fess.es.exentity.FileConfig;
import org.codelibs.fess.es.exentity.WebConfig;
import org.codelibs.fess.indexer.IndexUpdater;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.S2Robot;
import org.codelibs.robot.S2RobotContext;
import org.codelibs.robot.service.DataService;
import org.codelibs.robot.service.UrlFilterService;
import org.codelibs.robot.service.UrlQueueService;
import org.codelibs.fess.crawler.Crawler;
import org.codelibs.fess.crawler.CrawlerContext;
import org.codelibs.fess.crawler.service.DataService;
import org.codelibs.fess.crawler.service.UrlFilterService;
import org.codelibs.fess.crawler.service.UrlQueueService;
import org.lastaflute.di.core.SingletonLaContainer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -81,7 +81,7 @@ public class WebFsIndexHelper implements Serializable {
public int crawlerPriority = Thread.NORM_PRIORITY;
private final List<S2Robot> s2RobotList = Collections.synchronizedList(new ArrayList<S2Robot>());
private final List<Crawler> crawlerList = Collections.synchronizedList(new ArrayList<Crawler>());
// needed?
@Deprecated
@ -140,15 +140,15 @@ public class WebFsIndexHelper implements Serializable {
final long startTime = System.currentTimeMillis();
final List<String> sessionIdList = new ArrayList<String>();
s2RobotList.clear();
final List<String> s2RobotStatusList = new ArrayList<String>();
crawlerList.clear();
final List<String> crawlerStatusList = new ArrayList<String>();
// Web
for (final WebConfig webConfig : webConfigList) {
final String sid = crawlingConfigHelper.store(sessionId, webConfig);
// create s2robot
final S2Robot s2Robot = SingletonLaContainer.getComponent(S2Robot.class);
s2Robot.setSessionId(sid);
// create crawler
final Crawler crawler = SingletonLaContainer.getComponent(Crawler.class);
crawler.setSessionId(sid);
sessionIdList.add(sid);
final String urlsStr = webConfig.getUrls();
@ -160,26 +160,26 @@ public class WebFsIndexHelper implements Serializable {
// interval time
final int intervalTime =
webConfig.getIntervalTime() != null ? webConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_WEB;
((FessIntervalController) s2Robot.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
final String includedUrlsStr = webConfig.getIncludedUrls() != null ? webConfig.getIncludedUrls() : StringUtil.EMPTY;
final String excludedUrlsStr = webConfig.getExcludedUrls() != null ? webConfig.getExcludedUrls() : StringUtil.EMPTY;
// num of threads
final S2RobotContext robotContext = s2Robot.getRobotContext();
final CrawlerContext crawlerContext = crawler.getCrawlerContext();
final int numOfThread =
webConfig.getNumOfThread() != null ? webConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB;
robotContext.setNumOfThread(numOfThread);
crawlerContext.setNumOfThread(numOfThread);
// depth
final int depth = webConfig.getDepth() != null ? webConfig.getDepth() : -1;
robotContext.setMaxDepth(depth);
crawlerContext.setMaxDepth(depth);
// max count
final long maxCount = webConfig.getMaxAccessCount() != null ? webConfig.getMaxAccessCount() : maxAccessCount;
robotContext.setMaxAccessCount(maxCount);
crawlerContext.setMaxAccessCount(maxCount);
webConfig.initializeClientFactory(s2Robot.getClientFactory());
webConfig.initializeClientFactory(crawler.getClientFactory());
// set urls
final String[] urls = urlsStr.split("[\r\n]");
@ -187,7 +187,7 @@ public class WebFsIndexHelper implements Serializable {
if (StringUtil.isNotBlank(u)) {
final String urlValue = u.trim();
if (!urlValue.startsWith("#")) {
s2Robot.addUrl(urlValue);
crawler.addUrl(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Target URL: " + urlValue);
}
@ -201,7 +201,7 @@ public class WebFsIndexHelper implements Serializable {
if (StringUtil.isNotBlank(u)) {
final String urlValue = u.trim();
if (!urlValue.startsWith("#")) {
s2Robot.addIncludeFilter(urlValue);
crawler.addIncludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Included URL: " + urlValue);
}
@ -215,7 +215,7 @@ public class WebFsIndexHelper implements Serializable {
if (StringUtil.isNotBlank(u)) {
final String urlValue = u.trim();
if (!urlValue.startsWith("#")) {
s2Robot.addExcludeFilter(urlValue);
crawler.addExcludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Excluded URL: " + urlValue);
}
@ -229,7 +229,7 @@ public class WebFsIndexHelper implements Serializable {
for (final String u : excludedUrlList) {
if (StringUtil.isNotBlank(u)) {
final String urlValue = u.trim();
s2Robot.addExcludeFilter(urlValue);
crawler.addExcludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Excluded URL from failures: " + urlValue);
}
@ -241,20 +241,20 @@ public class WebFsIndexHelper implements Serializable {
logger.debug("Crawling " + urlsStr);
}
s2Robot.setBackground(true);
s2Robot.setThreadPriority(crawlerPriority);
crawler.setBackground(true);
crawler.setThreadPriority(crawlerPriority);
s2RobotList.add(s2Robot);
s2RobotStatusList.add(Constants.READY);
crawlerList.add(crawler);
crawlerStatusList.add(Constants.READY);
}
// File
for (final FileConfig fileConfig : fileConfigList) {
final String sid = crawlingConfigHelper.store(sessionId, fileConfig);
// create s2robot
final S2Robot s2Robot = SingletonLaContainer.getComponent(S2Robot.class);
s2Robot.setSessionId(sid);
// create crawler
final Crawler crawler = SingletonLaContainer.getComponent(Crawler.class);
crawler.setSessionId(sid);
sessionIdList.add(sid);
final String pathsStr = fileConfig.getPaths();
@ -265,26 +265,26 @@ public class WebFsIndexHelper implements Serializable {
final int intervalTime =
fileConfig.getIntervalTime() != null ? fileConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_FS;
((FessIntervalController) s2Robot.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
final String includedPathsStr = fileConfig.getIncludedPaths() != null ? fileConfig.getIncludedPaths() : StringUtil.EMPTY;
final String excludedPathsStr = fileConfig.getExcludedPaths() != null ? fileConfig.getExcludedPaths() : StringUtil.EMPTY;
// num of threads
final S2RobotContext robotContext = s2Robot.getRobotContext();
final CrawlerContext crawlerContext = crawler.getCrawlerContext();
final int numOfThread =
fileConfig.getNumOfThread() != null ? fileConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_FS;
robotContext.setNumOfThread(numOfThread);
crawlerContext.setNumOfThread(numOfThread);
// depth
final int depth = fileConfig.getDepth() != null ? fileConfig.getDepth() : -1;
robotContext.setMaxDepth(depth);
crawlerContext.setMaxDepth(depth);
// max count
final long maxCount = fileConfig.getMaxAccessCount() != null ? fileConfig.getMaxAccessCount() : maxAccessCount;
robotContext.setMaxAccessCount(maxCount);
crawlerContext.setMaxAccessCount(maxCount);
fileConfig.initializeClientFactory(s2Robot.getClientFactory());
fileConfig.initializeClientFactory(crawler.getClientFactory());
// set paths
final String[] paths = pathsStr.split("[\r\n]");
@ -299,7 +299,7 @@ public class WebFsIndexHelper implements Serializable {
u = "file:/" + u;
}
}
s2Robot.addUrl(u);
crawler.addUrl(u);
if (logger.isInfoEnabled()) {
logger.info("Target Path: " + u);
}
@ -321,7 +321,7 @@ public class WebFsIndexHelper implements Serializable {
} else {
urlValue = systemHelper.encodeUrlFilter(line);
}
s2Robot.addIncludeFilter(urlValue);
crawler.addIncludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Included Path: " + urlValue);
}
@ -345,7 +345,7 @@ public class WebFsIndexHelper implements Serializable {
} else {
urlValue = systemHelper.encodeUrlFilter(line);
}
s2Robot.addExcludeFilter(urlValue);
crawler.addExcludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Excluded Path: " + urlValue);
}
@ -361,7 +361,7 @@ public class WebFsIndexHelper implements Serializable {
for (final String u : excludedUrlList) {
if (StringUtil.isNotBlank(u)) {
final String urlValue = u.trim();
s2Robot.addExcludeFilter(urlValue);
crawler.addExcludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Excluded Path from failures: " + urlValue);
}
@ -373,11 +373,11 @@ public class WebFsIndexHelper implements Serializable {
logger.debug("Crawling " + pathsStr);
}
s2Robot.setBackground(true);
s2Robot.setThreadPriority(crawlerPriority);
crawler.setBackground(true);
crawler.setThreadPriority(crawlerPriority);
s2RobotList.add(s2Robot);
s2RobotStatusList.add(Constants.READY);
crawlerList.add(crawler);
crawlerStatusList.add(Constants.READY);
}
// run index update
@ -386,7 +386,7 @@ public class WebFsIndexHelper implements Serializable {
indexUpdater.setPriority(indexUpdaterPriority);
indexUpdater.setSessionIdList(sessionIdList);
indexUpdater.setDaemon(true);
indexUpdater.setS2RobotList(s2RobotList);
indexUpdater.setCrawlerList(crawlerList);
for (final BoostDocumentRule rule : boostDocumentRuleService.getAvailableBoostDocumentRuleList()) {
indexUpdater.addBoostDocumentRule(new org.codelibs.fess.indexer.BoostDocumentRule(rule));
}
@ -394,19 +394,19 @@ public class WebFsIndexHelper implements Serializable {
int startedCrawlerNum = 0;
int activeCrawlerNum = 0;
while (startedCrawlerNum < s2RobotList.size()) {
while (startedCrawlerNum < crawlerList.size()) {
// Force to stop crawl
if (systemHelper.isForceStop()) {
for (final S2Robot s2Robot : s2RobotList) {
s2Robot.stop();
for (final Crawler crawler : crawlerList) {
crawler.stop();
}
break;
}
if (activeCrawlerNum < multiprocessCrawlingCount) {
// start crawling
s2RobotList.get(startedCrawlerNum).execute();
s2RobotStatusList.set(startedCrawlerNum, Constants.RUNNING);
crawlerList.get(startedCrawlerNum).execute();
crawlerStatusList.set(startedCrawlerNum, Constants.RUNNING);
startedCrawlerNum++;
activeCrawlerNum++;
try {
@ -419,10 +419,10 @@ public class WebFsIndexHelper implements Serializable {
// check status
for (int i = 0; i < startedCrawlerNum; i++) {
if (!s2RobotList.get(i).getRobotContext().isRunning() && s2RobotStatusList.get(i).equals(Constants.RUNNING)) {
s2RobotList.get(i).awaitTermination();
s2RobotStatusList.set(i, Constants.DONE);
final String sid = s2RobotList.get(i).getRobotContext().getSessionId();
if (!crawlerList.get(i).getCrawlerContext().isRunning() && crawlerStatusList.get(i).equals(Constants.RUNNING)) {
crawlerList.get(i).awaitTermination();
crawlerStatusList.set(i, Constants.DONE);
final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
indexUpdater.addFinishedSessionId(sid);
activeCrawlerNum--;
}
@ -437,20 +437,20 @@ public class WebFsIndexHelper implements Serializable {
boolean finishedAll = false;
while (!finishedAll) {
finishedAll = true;
for (int i = 0; i < s2RobotList.size(); i++) {
s2RobotList.get(i).awaitTermination(crawlingExecutionInterval);
if (!s2RobotList.get(i).getRobotContext().isRunning() && !s2RobotStatusList.get(i).equals(Constants.DONE)) {
s2RobotStatusList.set(i, Constants.DONE);
final String sid = s2RobotList.get(i).getRobotContext().getSessionId();
for (int i = 0; i < crawlerList.size(); i++) {
crawlerList.get(i).awaitTermination(crawlingExecutionInterval);
if (!crawlerList.get(i).getCrawlerContext().isRunning() && !crawlerStatusList.get(i).equals(Constants.DONE)) {
crawlerStatusList.set(i, Constants.DONE);
final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
indexUpdater.addFinishedSessionId(sid);
}
if (!s2RobotStatusList.get(i).equals(Constants.DONE)) {
if (!crawlerStatusList.get(i).equals(Constants.DONE)) {
finishedAll = false;
}
}
}
s2RobotList.clear();
s2RobotStatusList.clear();
crawlerList.clear();
crawlerStatusList.clear();
// put cralwing info
final CrawlingSessionHelper crawlingSessionHelper = ComponentUtil.getCrawlingSessionHelper();

View file

@ -53,7 +53,11 @@ public class BoostDocumentRule {
return ((Boolean) value).booleanValue();
}
} catch (final Exception e) {
logger.warn("Failed to parse a doc for boost: " + map, e);
if (logger.isDebugEnabled()) {
logger.debug("Failed to evaluate \"" + matchExpression + "\" for " + map, e);
} else {
logger.warn("Failed to evaluate \"" + matchExpression + "\".");
}
}
return false;

View file

@ -36,17 +36,17 @@ import org.codelibs.fess.helper.IntervalControlHelper;
import org.codelibs.fess.helper.SearchLogHelper;
import org.codelibs.fess.helper.SystemHelper;
import org.codelibs.fess.util.ComponentUtil;
import org.codelibs.robot.S2Robot;
import org.codelibs.robot.entity.AccessResult;
import org.codelibs.robot.entity.AccessResultData;
import org.codelibs.robot.entity.EsAccessResult;
import org.codelibs.robot.entity.EsUrlQueue;
import org.codelibs.robot.service.DataService;
import org.codelibs.robot.service.UrlFilterService;
import org.codelibs.robot.service.UrlQueueService;
import org.codelibs.robot.service.impl.EsDataService;
import org.codelibs.robot.transformer.Transformer;
import org.codelibs.robot.util.EsResultList;
import org.codelibs.fess.crawler.Crawler;
import org.codelibs.fess.crawler.entity.AccessResult;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.EsAccessResult;
import org.codelibs.fess.crawler.entity.EsUrlQueue;
import org.codelibs.fess.crawler.service.DataService;
import org.codelibs.fess.crawler.service.UrlFilterService;
import org.codelibs.fess.crawler.service.UrlQueueService;
import org.codelibs.fess.crawler.service.impl.EsDataService;
import org.codelibs.fess.crawler.transformer.Transformer;
import org.codelibs.fess.crawler.util.EsResultList;
import org.elasticsearch.action.search.SearchRequestBuilder;
import org.elasticsearch.index.query.FilterBuilders;
import org.elasticsearch.index.query.QueryBuilder;
@ -122,7 +122,7 @@ public class IndexUpdater extends Thread {
private final Map<String, Object> docValueMap = new HashMap<String, Object>();
private List<S2Robot> s2RobotList;
private List<Crawler> crawlerList;
public IndexUpdater() {
// nothing
@ -176,7 +176,7 @@ public class IndexUpdater extends Thread {
.boolFilter()
.must(FilterBuilders.termsFilter(EsAccessResult.SESSION_ID, sessionIdList))
.must(FilterBuilders.termFilter(EsAccessResult.STATUS,
org.codelibs.robot.Constants.OK_STATUS)));
org.codelibs.fess.crawler.Constants.OK_STATUS)));
builder.setQuery(queryBuilder);
builder.setFrom(0);
if (maxDocumentCacheSize <= 0) {
@ -507,8 +507,8 @@ public class IndexUpdater extends Thread {
private void forceStop() {
systemHelper.setForceStop(true);
for (final S2Robot s2Robot : s2RobotList) {
s2Robot.stop();
for (final Crawler crawler : crawlerList) {
crawler.stop();
}
}
@ -557,7 +557,7 @@ public class IndexUpdater extends Thread {
docValueMap.put(fieldName, value);
}
public void setS2RobotList(final List<S2Robot> s2RobotList) {
this.s2RobotList = s2RobotList;
public void setCrawlerList(final List<Crawler> crawlerList) {
this.crawlerList = crawlerList;
}
}

View file

@ -45,9 +45,9 @@ import org.codelibs.fess.helper.UserAgentHelper;
import org.codelibs.fess.helper.ViewHelper;
import org.codelibs.fess.indexer.IndexUpdater;
import org.codelibs.fess.job.JobExecutor;
import org.codelibs.robot.entity.EsAccessResult;
import org.codelibs.robot.extractor.ExtractorFactory;
import org.codelibs.robot.service.DataService;
import org.codelibs.fess.crawler.entity.EsAccessResult;
import org.codelibs.fess.crawler.extractor.ExtractorFactory;
import org.codelibs.fess.crawler.service.DataService;
import org.lastaflute.core.message.MessageManager;
import org.lastaflute.di.core.SingletonLaContainer;
import org.lastaflute.di.core.factory.SingletonLaContainerFactory;

View file

@ -73,19 +73,19 @@ public class ResourceUtil {
} catch (final Throwable e) { // NOSONAR
// ignore
}
Path path = Paths.get(".", names);
if (Files.exists(path)) {
return path;
final Path defaultPath = Paths.get("WEB-INF/" + base, names);
if (Files.exists(defaultPath)) {
return defaultPath;
}
path = Paths.get("src/main/webapps/WEB-INF/" + base, names);
if (Files.exists(path)) {
return path;
final Path srcBasePath = Paths.get("src/main/webapps/WEB-INF/" + base, names);
if (Files.exists(srcBasePath)) {
return srcBasePath;
}
path = Paths.get("target/fess/WEB-INF/" + base, names);
if (Files.exists(path)) {
return path;
final Path targetBasePath = Paths.get("target/fess/WEB-INF/" + base, names);
if (Files.exists(targetBasePath)) {
return targetBasePath;
}
return path;
return defaultPath;
}
public static File[] getJarFiles(final String namePrefix) {

View file

@ -10,8 +10,8 @@
<include path="fess_api.xml"/>
<include path="fess_dict.xml"/>
<include path="s2robot/client.xml" />
<include path="s2robot/mimetype.xml" />
<include path="crawler/client.xml" />
<include path="crawler/mimetype.xml" />
<component name="authenticationCipher" class="org.codelibs.core.crypto.CachedCipher">
<!-- CHANGE THE FOLLOWING KEY -->

View file

@ -0,0 +1,52 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
"http://dbflute.org/meta/lastadi10.dtd">
<components namespace="fessCrawler">
<include path="crawler/container.xml"/>
<include path="crawler/client.xml"/>
<include path="crawler/rule.xml"/>
<include path="crawler/filter.xml"/>
<include path="crawler/interval.xml"/>
<include path="crawler/extractor.xml"/>
<include path="crawler/mimetype.xml"/>
<include path="crawler/encoding.xml"/>
<include path="crawler/urlconverter.xml"/>
<include path="crawler/log.xml"/>
<include path="crawler/sitemaps.xml"/>
<include path="crawler/es.xml"/>
<!-- Crawler -->
<component name="crawler" class="org.codelibs.fess.crawler.Crawler" instance="prototype" >
</component>
<!-- Crawler Thread -->
<component name="crawlerThread" class="org.codelibs.fess.crawler.FessCrawlerThread" instance="prototype" >
</component>
<!-- Entity -->
<component name="accessResult"
class="org.codelibs.fess.crawler.entity.EsAccessResult" instance="prototype">
</component>
<component name="urlQueue"
class="org.codelibs.fess.crawler.entity.EsUrlQueue" instance="prototype">
</component>
<!-- Service -->
<component name="urlQueueService"
class="org.codelibs.fess.crawler.service.impl.EsUrlQueueService">
<property name="index">".crawler"</property>
<property name="type">"queue"</property>
</component>
<component name="dataService"
class="org.codelibs.fess.crawler.service.impl.EsDataService">
<property name="index">".crawler"</property>
<property name="type">"data"</property>
</component>
<component name="urlFilterService"
class="org.codelibs.fess.crawler.service.impl.EsUrlFilterService">
<property name="index">".crawler"</property>
<property name="type">"filter"</property>
</component>
</components>

View file

@ -1,11 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
"http://dbflute.org/meta/lastadi10.dtd">
<components namespace="s2robot">
<include path="s2robot/container.xml" />
<components namespace="fessCrawler">
<include path="crawler/container.xml" />
<component name="contentLengthHelper"
class="org.codelibs.robot.helper.ContentLengthHelper" instance="singleton">
class="org.codelibs.fess.crawler.helper.ContentLengthHelper" instance="singleton">
<property name="defaultMaxLength">10485760</property><!-- 10M -->
<postConstruct name="addMaxLength">
<arg>"text/html"</arg>

View file

@ -3,6 +3,6 @@
"http://dbflute.org/meta/lastadi10.dtd">
<components>
<component name="esClient"
class="org.codelibs.robot.client.EsClient">
class="org.codelibs.fess.crawler.client.EsClient">
</component>
</components>

View file

@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
"http://dbflute.org/meta/lastadi10.dtd">
<components namespace="s2robot">
<include path="s2robot/container.xml" />
<components namespace="fessCrawler">
<include path="crawler/container.xml" />
<component name="intervalController"
class="org.codelibs.fess.crawler.interval.FessIntervalController"

View file

@ -1,10 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
"http://dbflute.org/meta/lastadi10.dtd">
<components namespace="s2robot">
<include path="s2robot/container.xml" />
<components namespace="fessCrawler">
<include path="crawler/container.xml" />
<component name="logHelper"
class="org.codelibs.fess.helper.RobotLogHelper">
class="org.codelibs.fess.helper.CrawlerLogHelper">
</component>
</components>

View file

@ -1,11 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
"http://dbflute.org/meta/lastadi10.dtd">
<components namespace="s2robot">
<include path="s2robot/container.xml" />
<include path="s2robot/transformer.xml" />
<components namespace="fessCrawler">
<include path="crawler/container.xml" />
<include path="crawler/transformer.xml" />
<component name="ruleManager" class="org.codelibs.robot.rule.impl.RuleManagerImpl" instance="prototype">
<component name="ruleManager" class="org.codelibs.fess.crawler.rule.impl.RuleManagerImpl" instance="prototype">
<postConstruct name="addRule">
<arg>sitemapsRule</arg>
</postConstruct>
@ -23,10 +23,10 @@
</postConstruct>
</component>
<component name="sitemapsRule" class="org.codelibs.robot.rule.impl.RegexRule" >
<component name="sitemapsRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
<property name="ruleId">"sitemapsRule"</property>
<property name="responseProcessor">
<component class="org.codelibs.robot.processor.impl.SitemapsResponseProcessor">
<component class="org.codelibs.fess.crawler.processor.impl.SitemapsResponseProcessor">
</component>
</property>
<postConstruct name="addRule">
@ -35,10 +35,10 @@
</postConstruct>
</component>
<component name="webHtmlRule" class="org.codelibs.robot.rule.impl.RegexRule" >
<component name="webHtmlRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
<property name="ruleId">"webHtmlRule"</property>
<property name="responseProcessor">
<component class="org.codelibs.robot.processor.impl.DefaultResponseProcessor">
<component class="org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor">
<property name="transformer">fessXpathTransformer</property>
<property name="successfulHttpCodes">(int[])[200]</property>
<property name="notModifiedHttpCodes">(int[])[304]</property>
@ -56,10 +56,10 @@
</postConstruct>
</component>
<component name="webFileRule" class="org.codelibs.robot.rule.impl.RegexRule" >
<component name="webFileRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
<property name="ruleId">"webFileRule"</property>
<property name="responseProcessor">
<component class="org.codelibs.robot.processor.impl.DefaultResponseProcessor">
<component class="org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor">
<property name="transformer">fessFileTransformer</property>
<property name="successfulHttpCodes">(int[])[200]</property>
<property name="notModifiedHttpCodes">(int[])[304]</property>
@ -85,10 +85,10 @@
</postConstruct>
</component>
<component name="fsFileRule" class="org.codelibs.robot.rule.impl.RegexRule" >
<component name="fsFileRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
<property name="ruleId">"fsFileRule"</property>
<property name="responseProcessor">
<component class="org.codelibs.robot.processor.impl.DefaultResponseProcessor">
<component class="org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor">
<property name="transformer">fessFileTransformer</property>
<property name="successfulHttpCodes">(int[])[200]</property>
<property name="notModifiedHttpCodes">(int[])[304]</property>
@ -116,10 +116,10 @@
</component>
<component name="defaultRule" class="org.codelibs.robot.rule.impl.RegexRule" >
<component name="defaultRule" class="org.codelibs.fess.crawler.rule.impl.RegexRule" >
<property name="ruleId">"defaultRule"</property>
<property name="responseProcessor">
<component class="org.codelibs.robot.processor.impl.DefaultResponseProcessor">
<component class="org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor">
<property name="transformer">fessTikaTransformer</property>
<property name="successfulHttpCodes">(int[])[200]</property>
<property name="notModifiedHttpCodes">(int[])[304]</property>

View file

@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
"http://dbflute.org/meta/lastadi10.dtd">
<components namespace="s2robot">
<include path="s2robot/transformer_basic.xml"/>
<components namespace="fessCrawler">
<include path="crawler/transformer_basic.xml"/>
<component name="fessXpathTransformer" class="org.codelibs.fess.crawler.transformer.FessXpathTransformer" instance="singleton">

View file

@ -1,52 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE components PUBLIC "-//DBFLUTE//DTD LastaDi 1.0//EN"
"http://dbflute.org/meta/lastadi10.dtd">
<components namespace="s2robot">
<include path="s2robot/container.xml"/>
<include path="s2robot/client.xml"/>
<include path="s2robot/rule.xml"/>
<include path="s2robot/filter.xml"/>
<include path="s2robot/interval.xml"/>
<include path="s2robot/extractor.xml"/>
<include path="s2robot/mimetype.xml"/>
<include path="s2robot/encoding.xml"/>
<include path="s2robot/urlconverter.xml"/>
<include path="s2robot/log.xml"/>
<include path="s2robot/sitemaps.xml"/>
<include path="s2robot/es.xml"/>
<!-- S2Robot -->
<component name="s2Robot" class="org.codelibs.robot.S2Robot" instance="prototype" >
</component>
<!-- Robot Thread -->
<component name="robotThread" class="org.codelibs.fess.crawler.FessS2RobotThread" instance="prototype" >
</component>
<!-- Entity -->
<component name="accessResult"
class="org.codelibs.robot.entity.EsAccessResult" instance="prototype">
</component>
<component name="urlQueue"
class="org.codelibs.robot.entity.EsUrlQueue" instance="prototype">
</component>
<!-- Service -->
<component name="urlQueueService"
class="org.codelibs.robot.service.impl.EsUrlQueueService">
<property name="index">".robot"</property>
<property name="type">"queue"</property>
</component>
<component name="dataService"
class="org.codelibs.robot.service.impl.EsDataService">
<property name="index">".robot"</property>
<property name="type">"data"</property>
</component>
<component name="urlFilterService"
class="org.codelibs.robot.service.impl.EsUrlFilterService">
<property name="index">".robot"</property>
<property name="type">"filter"</property>
</component>
</components>

View file

@ -5,7 +5,7 @@
<include path="convention.xml" />
<include path="fess.xml" />
<include path="s2robot_es.xml" />
<include path="crawler_es.xml" />
<component name="indexingHelper" class="org.codelibs.fess.helper.IndexingHelper">
</component>