Browse Source

#24 use expires_dt on incremental crawling

Shinsuke Sugaya 11 years ago
parent
commit
c604d92551

+ 9 - 0
src/main/java/jp/sf/fess/action/admin/CrawlAction.java

@@ -60,6 +60,8 @@ public class CrawlAction implements Serializable {
     }
     }
 
 
     protected String showIndex(final boolean redirect) {
     protected String showIndex(final boolean redirect) {
+        crawlForm.diffCrawling = crawlerProperties.getProperty(
+                Constants.DIFF_CRAWLING_PROPERTY, Constants.TRUE);
         crawlForm.useAclAsRole = crawlerProperties.getProperty(
         crawlForm.useAclAsRole = crawlerProperties.getProperty(
                 Constants.USE_ACL_AS_ROLE, Constants.FALSE);
                 Constants.USE_ACL_AS_ROLE, Constants.FALSE);
         crawlForm.serverRotation = crawlerProperties.getProperty(
         crawlForm.serverRotation = crawlerProperties.getProperty(
@@ -134,6 +136,13 @@ public class CrawlAction implements Serializable {
     @Token(save = false, validate = true)
     @Token(save = false, validate = true)
     @Execute(validator = true, input = "index.jsp")
     @Execute(validator = true, input = "index.jsp")
     public String update() {
     public String update() {
+        crawlerProperties
+                .setProperty(
+                        Constants.DIFF_CRAWLING_PROPERTY,
+                        crawlForm.diffCrawling != null
+                                && Constants.ON
+                                        .equalsIgnoreCase(crawlForm.diffCrawling) ? Constants.TRUE
+                                : Constants.FALSE);
         crawlerProperties
         crawlerProperties
                 .setProperty(
                 .setProperty(
                         Constants.USE_ACL_AS_ROLE,
                         Constants.USE_ACL_AS_ROLE,

+ 0 - 15
src/main/java/jp/sf/fess/exec/Crawler.java

@@ -412,21 +412,6 @@ public class Crawler implements Serializable {
             crawlingSessionService.deleteSessionIdsBefore(options.sessionId,
             crawlingSessionService.deleteSessionIdsBefore(options.sessionId,
                     options.name, new Date());
                     options.name, new Date());
 
 
-            // expired session ids
-            final List<Map<String, String>> sessionIdInfoList = crawlingSessionHelper
-                    .getSessionIdList(updateSolrGroup);
-            for (final Map<String, String> sessionIdInfoMap : sessionIdInfoList) {
-                final String sid = sessionIdInfoMap
-                        .get(CrawlingSessionHelper.FACET_SEGMENT_KEY);
-                if (crawlingSessionService.get(sid) == null) {
-                    crawlingSessionHelper.addExpiredSessions(sid);
-                }
-            }
-            if (logger.isInfoEnabled()) {
-                logger.info("Expired Session Ids: "
-                        + crawlingSessionHelper.getExpiredSessionIdSet());
-            }
-
             final List<Long> webConfigIdList = options.getWebConfigIdList();
             final List<Long> webConfigIdList = options.getWebConfigIdList();
             final List<Long> fileConfigIdList = options.getFileConfigIdList();
             final List<Long> fileConfigIdList = options.getFileConfigIdList();
             final List<Long> dataConfigIdList = options.getDataConfigIdList();
             final List<Long> dataConfigIdList = options.getDataConfigIdList();

+ 0 - 18
src/main/java/jp/sf/fess/helper/CrawlingSessionHelper.java

@@ -22,11 +22,9 @@ import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Collections;
 import java.util.Date;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.LinkedHashMap;
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.List;
 import java.util.Map;
 import java.util.Map;
-import java.util.Set;
 
 
 import jp.sf.fess.Constants;
 import jp.sf.fess.Constants;
 import jp.sf.fess.FessSystemException;
 import jp.sf.fess.FessSystemException;
@@ -53,8 +51,6 @@ public class CrawlingSessionHelper implements Serializable {
 
 
     protected Map<String, String> infoMap;
     protected Map<String, String> infoMap;
 
 
-    protected Set<String> expiredSessionIdSet = new HashSet<String>();
-
     protected Date documentExpires;
     protected Date documentExpires;
 
 
     protected String expiresField = "expires_dt";
     protected String expiresField = "expires_dt";
@@ -63,20 +59,6 @@ public class CrawlingSessionHelper implements Serializable {
         return SingletonS2Container.getComponent(CrawlingSessionService.class);
         return SingletonS2Container.getComponent(CrawlingSessionService.class);
     }
     }
 
 
-    public void addExpiredSessions(final String sessionId) {
-        if (sessionId != null) {
-            expiredSessionIdSet.add(sessionId);
-        }
-    }
-
-    public Set<String> getExpiredSessionIdSet() {
-        return expiredSessionIdSet;
-    }
-
-    public boolean expired(final String sessionId) {
-        return expiredSessionIdSet.contains(sessionId);
-    }
-
     public String getCanonicalSessionId(final String sessionId) {
     public String getCanonicalSessionId(final String sessionId) {
         final int idx = sessionId.indexOf('-');
         final int idx = sessionId.indexOf('-');
         if (idx >= 0) {
         if (idx >= 0) {

+ 41 - 51
src/main/java/jp/sf/fess/robot/FessS2RobotThread.java

@@ -44,7 +44,6 @@ import org.codelibs.solr.lib.SolrGroup;
 import org.codelibs.solr.lib.SolrGroupManager;
 import org.codelibs.solr.lib.SolrGroupManager;
 import org.codelibs.solr.lib.policy.QueryType;
 import org.codelibs.solr.lib.policy.QueryType;
 import org.seasar.framework.container.SingletonS2Container;
 import org.seasar.framework.container.SingletonS2Container;
-import org.seasar.framework.util.StringUtil;
 import org.seasar.robot.S2RobotThread;
 import org.seasar.robot.S2RobotThread;
 import org.seasar.robot.client.S2RobotClient;
 import org.seasar.robot.client.S2RobotClient;
 import org.seasar.robot.client.smb.SmbClient;
 import org.seasar.robot.client.smb.SmbClient;
@@ -78,10 +77,11 @@ public class FessS2RobotThread extends S2RobotThread {
             final CrawlingSessionHelper crawlingSessionHelper = SingletonS2Container
             final CrawlingSessionHelper crawlingSessionHelper = SingletonS2Container
                     .getComponent(CrawlingSessionHelper.class);
                     .getComponent(CrawlingSessionHelper.class);
             final SambaHelper sambaHelper = SingletonS2Container
             final SambaHelper sambaHelper = SingletonS2Container
-                    .getComponent("sambaHelper");
+                    .getComponent(SambaHelper.class);
             final boolean useAclAsRole = crawlerProperties.getProperty(
             final boolean useAclAsRole = crawlerProperties.getProperty(
                     Constants.USE_ACL_AS_ROLE, Constants.FALSE).equals(
                     Constants.USE_ACL_AS_ROLE, Constants.FALSE).equals(
                     Constants.TRUE);
                     Constants.TRUE);
+            final String expiresField = crawlingSessionHelper.getExpiresField();
 
 
             ResponseData responseData = null;
             ResponseData responseData = null;
             try {
             try {
@@ -91,7 +91,7 @@ public class FessS2RobotThread extends S2RobotThread {
                     return true;
                     return true;
                 }
                 }
 
 
-                SolrDocumentList oldSolrDocumentList = null;
+                SolrDocumentList oldDocWithRoleList = null;
                 final CrawlingConfig crawlingConfig = crawlingConfigHelper
                 final CrawlingConfig crawlingConfig = crawlingConfigHelper
                         .get(robotContext.getSessionId());
                         .get(robotContext.getSessionId());
                 final Map<String, Object> dataMap = new HashMap<String, Object>();
                 final Map<String, Object> dataMap = new HashMap<String, Object>();
@@ -108,7 +108,8 @@ public class FessS2RobotThread extends S2RobotThread {
                 }
                 }
                 if (useAclAsRole && responseData.getUrl().startsWith("smb://")) {
                 if (useAclAsRole && responseData.getUrl().startsWith("smb://")) {
                     final String id = crawlingSessionHelper.generateId(dataMap);
                     final String id = crawlingSessionHelper.generateId(dataMap);
-                    oldSolrDocumentList = getSolrDocumentList(id, true);
+                    oldDocWithRoleList = getSolrDocumentList(id, true,
+                            expiresField);
 
 
                     final ACE[] aces = (ACE[]) responseData.getMetaDataMap()
                     final ACE[] aces = (ACE[]) responseData.getMetaDataMap()
                             .get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES);
                             .get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES);
@@ -127,59 +128,44 @@ public class FessS2RobotThread extends S2RobotThread {
                 final String id = crawlingSessionHelper.generateId(dataMap);
                 final String id = crawlingSessionHelper.generateId(dataMap);
 
 
                 final SolrDocumentList solrDocumentList = getSolrDocumentList(
                 final SolrDocumentList solrDocumentList = getSolrDocumentList(
-                        id, false);
+                        id, false, expiresField);
                 if (solrDocumentList == null) {
                 if (solrDocumentList == null) {
-                    final Set<String> childUrlSet = getChildUrlSet(id);
-                    if (childUrlSet != null) {
-                        synchronized (robotContext.getAccessCountLock()) {
-                            //  add an url
-                            storeChildUrls(
-                                    childUrlSet,
-                                    urlQueue.getUrl(),
-                                    urlQueue.getDepth() != null ? urlQueue
-                                            .getDepth() + 1 : 1);
-                        }
-                    }
+                    deleteSolrDocumentList(oldDocWithRoleList);
+                    storeChildUrlsToQueue(urlQueue, getChildUrlSet(id));
+                    return true;
+                }
 
 
-                    deleteSolrDocumentList(oldSolrDocumentList);
+                if (solrDocumentList.size() > 1) {
+                    // invalid state
+                    deleteSolrDocumentList(oldDocWithRoleList);
+                    deleteSolrDocumentList(solrDocumentList);
                     return true;
                     return true;
                 }
                 }
 
 
                 final SolrDocument solrDocument = solrDocumentList.get(0);
                 final SolrDocument solrDocument = solrDocumentList.get(0);
-
-                final String sessionId = (String) solrDocument.get("segment");
-                if (StringUtil.isNotBlank(sessionId)
-                        && crawlingSessionHelper.expired(sessionId)) {
-                    deleteSolrDocumentList(oldSolrDocumentList);
+                final Date expires = (Date) solrDocument.get(expiresField);
+                if (expires != null
+                        && expires.getTime() < System.currentTimeMillis()) {
+                    deleteSolrDocumentList(oldDocWithRoleList);
                     return true;
                     return true;
                 }
                 }
 
 
                 final Date lastModified = (Date) solrDocument
                 final Date lastModified = (Date) solrDocument
                         .get("lastModified");
                         .get("lastModified");
                 if (lastModified == null) {
                 if (lastModified == null) {
-                    deleteSolrDocumentList(oldSolrDocumentList);
+                    deleteSolrDocumentList(oldDocWithRoleList);
                     return true;
                     return true;
                 }
                 }
 
 
                 final int httpStatusCode = responseData.getHttpStatusCode();
                 final int httpStatusCode = responseData.getHttpStatusCode();
                 if (httpStatusCode == 404) {
                 if (httpStatusCode == 404) {
                     deleteSolrDocument(id);
                     deleteSolrDocument(id);
-                    final Set<String> childUrlSet = getAnchorSet(solrDocument
-                            .get("anchor"));
-                    if (childUrlSet != null) {
-                        synchronized (robotContext.getAccessCountLock()) {
-                            //  add an url
-                            storeChildUrls(
-                                    childUrlSet,
-                                    urlQueue.getUrl(),
-                                    urlQueue.getDepth() != null ? urlQueue
-                                            .getDepth() + 1 : 1);
-                        }
-                    }
-
+                    deleteSolrDocumentList(oldDocWithRoleList);
+                    storeChildUrlsToQueue(urlQueue,
+                            getAnchorSet(solrDocument.get("anchor")));
                     return false;
                     return false;
                 } else if (responseData.getLastModified() == null) {
                 } else if (responseData.getLastModified() == null) {
-                    deleteSolrDocumentList(oldSolrDocumentList);
+                    deleteSolrDocumentList(oldDocWithRoleList);
                     return true;
                     return true;
                 } else if (responseData.getLastModified().getTime() <= lastModified
                 } else if (responseData.getLastModified().getTime() <= lastModified
                         .getTime() && httpStatusCode == 200) {
                         .getTime() && httpStatusCode == 200) {
@@ -194,18 +180,8 @@ public class FessS2RobotThread extends S2RobotThread {
                             .setStatus(org.seasar.robot.Constants.NOT_MODIFIED_STATUS);
                             .setStatus(org.seasar.robot.Constants.NOT_MODIFIED_STATUS);
                     processResponse(urlQueue, responseData);
                     processResponse(urlQueue, responseData);
 
 
-                    final Set<String> childUrlSet = getAnchorSet(solrDocument
-                            .get("anchor"));
-                    if (childUrlSet != null) {
-                        synchronized (robotContext.getAccessCountLock()) {
-                            //  add an url
-                            storeChildUrls(
-                                    childUrlSet,
-                                    urlQueue.getUrl(),
-                                    urlQueue.getDepth() != null ? urlQueue
-                                            .getDepth() + 1 : 1);
-                        }
-                    }
+                    storeChildUrlsToQueue(urlQueue,
+                            getAnchorSet(solrDocument.get("anchor")));
 
 
                     return false;
                     return false;
                 }
                 }
@@ -218,6 +194,19 @@ public class FessS2RobotThread extends S2RobotThread {
         return true;
         return true;
     }
     }
 
 
+    protected void storeChildUrlsToQueue(final UrlQueue urlQueue,
+            final Set<String> childUrlSet) {
+        if (childUrlSet != null) {
+            synchronized (robotContext.getAccessCountLock()) {
+                /*
+                 * A null childUrlSet is skipped entirely (no lock taken).
+                 * Otherwise, while holding the lock returned by
+                 * robotContext.getAccessCountLock(), hand the URLs to
+                 * storeChildUrls together with the current entry's URL and a
+                 * depth one greater than the current entry's depth; when the
+                 * current entry has no depth (null), 1 is passed instead.
+                 */
+                storeChildUrls(childUrlSet, urlQueue.getUrl(),
+                        urlQueue.getDepth() != null ? urlQueue.getDepth() + 1
+                                : 1);
+            }
+        }
+    }
+
+    @SuppressWarnings("unchecked")
     protected Set<String> getAnchorSet(final Object obj) {
     protected Set<String> getAnchorSet(final Object obj) {
         List<String> anchorList;
         List<String> anchorList;
         if (obj instanceof String) {
         if (obj instanceof String) {
@@ -241,7 +230,7 @@ public class FessS2RobotThread extends S2RobotThread {
     }
     }
 
 
     protected SolrDocumentList getSolrDocumentList(final String id,
     protected SolrDocumentList getSolrDocumentList(final String id,
-            final boolean wildcard) {
+            final boolean wildcard, final String expiresField) {
         final SolrGroupManager solrGroupManager = SingletonS2Container
         final SolrGroupManager solrGroupManager = SingletonS2Container
                 .getComponent(SolrGroupManager.class);
                 .getComponent(SolrGroupManager.class);
         final SolrGroup solrGroup = solrGroupManager
         final SolrGroup solrGroup = solrGroupManager
@@ -255,7 +244,8 @@ public class FessS2RobotThread extends S2RobotThread {
         }
         }
         queryBuf.append(id);
         queryBuf.append(id);
         solrQuery.setQuery(queryBuf.toString());
         solrQuery.setQuery(queryBuf.toString());
-        solrQuery.setFields("id", "lastModified", "anchor", "segment", "role");
+        solrQuery.setFields("id", "lastModified", "anchor", "segment", "role",
+                expiresField);
         for (int i = 0; i < maxSolrQueryRetryCount; i++) {
         for (int i = 0; i < maxSolrQueryRetryCount; i++) {
             try {
             try {
                 final QueryResponse response = solrGroup.query(solrQuery);
                 final QueryResponse response = solrGroup.query(solrQuery);