#24 use expires_dt on incremental crawling
This commit is contained in:
parent
15b76c3c3a
commit
c604d92551
4 changed files with 50 additions and 84 deletions
|
@ -60,6 +60,8 @@ public class CrawlAction implements Serializable {
|
|||
}
|
||||
|
||||
protected String showIndex(final boolean redirect) {
|
||||
crawlForm.diffCrawling = crawlerProperties.getProperty(
|
||||
Constants.DIFF_CRAWLING_PROPERTY, Constants.TRUE);
|
||||
crawlForm.useAclAsRole = crawlerProperties.getProperty(
|
||||
Constants.USE_ACL_AS_ROLE, Constants.FALSE);
|
||||
crawlForm.serverRotation = crawlerProperties.getProperty(
|
||||
|
@ -134,6 +136,13 @@ public class CrawlAction implements Serializable {
|
|||
@Token(save = false, validate = true)
|
||||
@Execute(validator = true, input = "index.jsp")
|
||||
public String update() {
|
||||
crawlerProperties
|
||||
.setProperty(
|
||||
Constants.DIFF_CRAWLING_PROPERTY,
|
||||
crawlForm.diffCrawling != null
|
||||
&& Constants.ON
|
||||
.equalsIgnoreCase(crawlForm.diffCrawling) ? Constants.TRUE
|
||||
: Constants.FALSE);
|
||||
crawlerProperties
|
||||
.setProperty(
|
||||
Constants.USE_ACL_AS_ROLE,
|
||||
|
|
|
@ -412,21 +412,6 @@ public class Crawler implements Serializable {
|
|||
crawlingSessionService.deleteSessionIdsBefore(options.sessionId,
|
||||
options.name, new Date());
|
||||
|
||||
// expired session ids
|
||||
final List<Map<String, String>> sessionIdInfoList = crawlingSessionHelper
|
||||
.getSessionIdList(updateSolrGroup);
|
||||
for (final Map<String, String> sessionIdInfoMap : sessionIdInfoList) {
|
||||
final String sid = sessionIdInfoMap
|
||||
.get(CrawlingSessionHelper.FACET_SEGMENT_KEY);
|
||||
if (crawlingSessionService.get(sid) == null) {
|
||||
crawlingSessionHelper.addExpiredSessions(sid);
|
||||
}
|
||||
}
|
||||
if (logger.isInfoEnabled()) {
|
||||
logger.info("Expired Session Ids: "
|
||||
+ crawlingSessionHelper.getExpiredSessionIdSet());
|
||||
}
|
||||
|
||||
final List<Long> webConfigIdList = options.getWebConfigIdList();
|
||||
final List<Long> fileConfigIdList = options.getFileConfigIdList();
|
||||
final List<Long> dataConfigIdList = options.getDataConfigIdList();
|
||||
|
|
|
@ -22,11 +22,9 @@ import java.util.ArrayList;
|
|||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
import jp.sf.fess.Constants;
|
||||
import jp.sf.fess.FessSystemException;
|
||||
|
@ -53,8 +51,6 @@ public class CrawlingSessionHelper implements Serializable {
|
|||
|
||||
protected Map<String, String> infoMap;
|
||||
|
||||
protected Set<String> expiredSessionIdSet = new HashSet<String>();
|
||||
|
||||
protected Date documentExpires;
|
||||
|
||||
protected String expiresField = "expires_dt";
|
||||
|
@ -63,20 +59,6 @@ public class CrawlingSessionHelper implements Serializable {
|
|||
return SingletonS2Container.getComponent(CrawlingSessionService.class);
|
||||
}
|
||||
|
||||
public void addExpiredSessions(final String sessionId) {
|
||||
if (sessionId != null) {
|
||||
expiredSessionIdSet.add(sessionId);
|
||||
}
|
||||
}
|
||||
|
||||
public Set<String> getExpiredSessionIdSet() {
|
||||
return expiredSessionIdSet;
|
||||
}
|
||||
|
||||
public boolean expired(final String sessionId) {
|
||||
return expiredSessionIdSet.contains(sessionId);
|
||||
}
|
||||
|
||||
public String getCanonicalSessionId(final String sessionId) {
|
||||
final int idx = sessionId.indexOf('-');
|
||||
if (idx >= 0) {
|
||||
|
|
|
@ -44,7 +44,6 @@ import org.codelibs.solr.lib.SolrGroup;
|
|||
import org.codelibs.solr.lib.SolrGroupManager;
|
||||
import org.codelibs.solr.lib.policy.QueryType;
|
||||
import org.seasar.framework.container.SingletonS2Container;
|
||||
import org.seasar.framework.util.StringUtil;
|
||||
import org.seasar.robot.S2RobotThread;
|
||||
import org.seasar.robot.client.S2RobotClient;
|
||||
import org.seasar.robot.client.smb.SmbClient;
|
||||
|
@ -78,10 +77,11 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
final CrawlingSessionHelper crawlingSessionHelper = SingletonS2Container
|
||||
.getComponent(CrawlingSessionHelper.class);
|
||||
final SambaHelper sambaHelper = SingletonS2Container
|
||||
.getComponent("sambaHelper");
|
||||
.getComponent(SambaHelper.class);
|
||||
final boolean useAclAsRole = crawlerProperties.getProperty(
|
||||
Constants.USE_ACL_AS_ROLE, Constants.FALSE).equals(
|
||||
Constants.TRUE);
|
||||
final String expiresField = crawlingSessionHelper.getExpiresField();
|
||||
|
||||
ResponseData responseData = null;
|
||||
try {
|
||||
|
@ -91,7 +91,7 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
return true;
|
||||
}
|
||||
|
||||
SolrDocumentList oldSolrDocumentList = null;
|
||||
SolrDocumentList oldDocWithRoleList = null;
|
||||
final CrawlingConfig crawlingConfig = crawlingConfigHelper
|
||||
.get(robotContext.getSessionId());
|
||||
final Map<String, Object> dataMap = new HashMap<String, Object>();
|
||||
|
@ -108,7 +108,8 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
}
|
||||
if (useAclAsRole && responseData.getUrl().startsWith("smb://")) {
|
||||
final String id = crawlingSessionHelper.generateId(dataMap);
|
||||
oldSolrDocumentList = getSolrDocumentList(id, true);
|
||||
oldDocWithRoleList = getSolrDocumentList(id, true,
|
||||
expiresField);
|
||||
|
||||
final ACE[] aces = (ACE[]) responseData.getMetaDataMap()
|
||||
.get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES);
|
||||
|
@ -127,59 +128,44 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
final String id = crawlingSessionHelper.generateId(dataMap);
|
||||
|
||||
final SolrDocumentList solrDocumentList = getSolrDocumentList(
|
||||
id, false);
|
||||
id, false, expiresField);
|
||||
if (solrDocumentList == null) {
|
||||
final Set<String> childUrlSet = getChildUrlSet(id);
|
||||
if (childUrlSet != null) {
|
||||
synchronized (robotContext.getAccessCountLock()) {
|
||||
// add an url
|
||||
storeChildUrls(
|
||||
childUrlSet,
|
||||
urlQueue.getUrl(),
|
||||
urlQueue.getDepth() != null ? urlQueue
|
||||
.getDepth() + 1 : 1);
|
||||
}
|
||||
}
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
storeChildUrlsToQueue(urlQueue, getChildUrlSet(id));
|
||||
return true;
|
||||
}
|
||||
|
||||
deleteSolrDocumentList(oldSolrDocumentList);
|
||||
if (solrDocumentList.size() > 1) {
|
||||
// invalid state
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
deleteSolrDocumentList(solrDocumentList);
|
||||
return true;
|
||||
}
|
||||
|
||||
final SolrDocument solrDocument = solrDocumentList.get(0);
|
||||
|
||||
final String sessionId = (String) solrDocument.get("segment");
|
||||
if (StringUtil.isNotBlank(sessionId)
|
||||
&& crawlingSessionHelper.expired(sessionId)) {
|
||||
deleteSolrDocumentList(oldSolrDocumentList);
|
||||
final Date expires = (Date) solrDocument.get(expiresField);
|
||||
if (expires != null
|
||||
&& expires.getTime() < System.currentTimeMillis()) {
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
return true;
|
||||
}
|
||||
|
||||
final Date lastModified = (Date) solrDocument
|
||||
.get("lastModified");
|
||||
if (lastModified == null) {
|
||||
deleteSolrDocumentList(oldSolrDocumentList);
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
return true;
|
||||
}
|
||||
|
||||
final int httpStatusCode = responseData.getHttpStatusCode();
|
||||
if (httpStatusCode == 404) {
|
||||
deleteSolrDocument(id);
|
||||
final Set<String> childUrlSet = getAnchorSet(solrDocument
|
||||
.get("anchor"));
|
||||
if (childUrlSet != null) {
|
||||
synchronized (robotContext.getAccessCountLock()) {
|
||||
// add an url
|
||||
storeChildUrls(
|
||||
childUrlSet,
|
||||
urlQueue.getUrl(),
|
||||
urlQueue.getDepth() != null ? urlQueue
|
||||
.getDepth() + 1 : 1);
|
||||
}
|
||||
}
|
||||
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
storeChildUrlsToQueue(urlQueue,
|
||||
getAnchorSet(solrDocument.get("anchor")));
|
||||
return false;
|
||||
} else if (responseData.getLastModified() == null) {
|
||||
deleteSolrDocumentList(oldSolrDocumentList);
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
return true;
|
||||
} else if (responseData.getLastModified().getTime() <= lastModified
|
||||
.getTime() && httpStatusCode == 200) {
|
||||
|
@ -194,18 +180,8 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
.setStatus(org.seasar.robot.Constants.NOT_MODIFIED_STATUS);
|
||||
processResponse(urlQueue, responseData);
|
||||
|
||||
final Set<String> childUrlSet = getAnchorSet(solrDocument
|
||||
.get("anchor"));
|
||||
if (childUrlSet != null) {
|
||||
synchronized (robotContext.getAccessCountLock()) {
|
||||
// add an url
|
||||
storeChildUrls(
|
||||
childUrlSet,
|
||||
urlQueue.getUrl(),
|
||||
urlQueue.getDepth() != null ? urlQueue
|
||||
.getDepth() + 1 : 1);
|
||||
}
|
||||
}
|
||||
storeChildUrlsToQueue(urlQueue,
|
||||
getAnchorSet(solrDocument.get("anchor")));
|
||||
|
||||
return false;
|
||||
}
|
||||
|
@ -218,6 +194,19 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
return true;
|
||||
}
|
||||
|
||||
protected void storeChildUrlsToQueue(final UrlQueue urlQueue,
|
||||
final Set<String> childUrlSet) {
|
||||
if (childUrlSet != null) {
|
||||
synchronized (robotContext.getAccessCountLock()) {
|
||||
// add an url
|
||||
storeChildUrls(childUrlSet, urlQueue.getUrl(),
|
||||
urlQueue.getDepth() != null ? urlQueue.getDepth() + 1
|
||||
: 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
protected Set<String> getAnchorSet(final Object obj) {
|
||||
List<String> anchorList;
|
||||
if (obj instanceof String) {
|
||||
|
@ -241,7 +230,7 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
}
|
||||
|
||||
protected SolrDocumentList getSolrDocumentList(final String id,
|
||||
final boolean wildcard) {
|
||||
final boolean wildcard, final String expiresField) {
|
||||
final SolrGroupManager solrGroupManager = SingletonS2Container
|
||||
.getComponent(SolrGroupManager.class);
|
||||
final SolrGroup solrGroup = solrGroupManager
|
||||
|
@ -255,7 +244,8 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
}
|
||||
queryBuf.append(id);
|
||||
solrQuery.setQuery(queryBuf.toString());
|
||||
solrQuery.setFields("id", "lastModified", "anchor", "segment", "role");
|
||||
solrQuery.setFields("id", "lastModified", "anchor", "segment", "role",
|
||||
expiresField);
|
||||
for (int i = 0; i < maxSolrQueryRetryCount; i++) {
|
||||
try {
|
||||
final QueryResponse response = solrGroup.query(solrQuery);
|
||||
|
|
Loading…
Add table
Reference in a new issue