This commit is contained in:
Shinsuke Sugaya 2014-10-28 18:32:23 +09:00
parent be91289e37
commit 70bd6fa900
9 changed files with 364 additions and 277 deletions

View file

@ -25,6 +25,7 @@ import jp.sf.fess.Constants;
import jp.sf.fess.FessSystemException;
import jp.sf.fess.ds.IndexUpdateCallback;
import jp.sf.fess.helper.CrawlingSessionHelper;
import jp.sf.fess.helper.IndexingHelper;
import jp.sf.fess.helper.SearchLogHelper;
import jp.sf.fess.helper.SystemHelper;
import jp.sf.fess.util.ComponentUtil;
@ -40,7 +41,7 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback {
protected SolrGroup solrGroup;
public int maxDocumentCacheSize = 10;
public int maxDocumentCacheSize = 5;
public boolean clickCountEnabled = true;
@ -71,6 +72,7 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback {
throw new FessSystemException("url is null. dataMap=" + dataMap);
}
final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
final CrawlingSessionHelper crawlingSessionHelper = ComponentUtil
.getCrawlingSessionHelper();
dataMap.put("id", crawlingSessionHelper.generateId(dataMap));
@ -85,13 +87,13 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback {
}
if (docList.size() >= maxDocumentCacheSize) {
sendDocuments();
indexingHelper.sendDocuments(solrGroup, docList);
}
documentSize.getAndIncrement();
// commit
if (commitPerCount > 0 && documentSize.get() % commitPerCount == 0) {
if (!docList.isEmpty()) {
sendDocuments();
indexingHelper.sendDocuments(solrGroup, docList);
}
commitDocuments();
}
@ -142,7 +144,9 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback {
@Override
public void commit() {
if (!docList.isEmpty()) {
sendDocuments();
final IndexingHelper indexingHelper = ComponentUtil
.getIndexingHelper();
indexingHelper.sendDocuments(solrGroup, docList);
}
commitDocuments();
}
@ -161,22 +165,6 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback {
}
}
protected void sendDocuments() {
final long execTime = System.currentTimeMillis();
if (logger.isInfoEnabled()) {
logger.info("Sending " + docList.size() + " document to a server.");
}
synchronized (solrGroup) {
solrGroup.add(docList);
}
if (logger.isInfoEnabled()) {
logger.info("Sent " + docList.size()
+ " documents. The execution time is "
+ (System.currentTimeMillis() - execTime) + "ms.");
}
docList.clear();
}
protected void addClickCountField(final SolrInputDocument doc,
final String url) {
final SearchLogHelper searchLogHelper = ComponentUtil

View file

@ -0,0 +1,253 @@
/*
* Copyright 2009-2014 the CodeLibs Project and the Others.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
* either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/
package jp.sf.fess.helper;
import java.util.ArrayList;
import java.util.List;
import jp.sf.fess.util.ComponentUtil;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.codelibs.solr.lib.SolrGroup;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class IndexingHelper {
private static final Logger logger = LoggerFactory
.getLogger(IndexingHelper.class);
public int maxRetryCount = 5;
public int defaultRowSize = 100;
public long requestInterval = 500;
/**
 * Sends the buffered documents to Solr and then empties the buffer.
 *
 * While holding the monitor of {@code solrGroup}, stale index entries for
 * the same URLs are removed first and the new documents are added
 * afterwards. The list is cleared only after the add call has returned
 * normally, so a failed request leaves the buffer untouched.
 *
 * @param solrGroup the Solr group that receives the documents
 * @param docList the buffered documents; a {@code null} or empty list is
 *            ignored
 */
public void sendDocuments(final SolrGroup solrGroup,
        final List<SolrInputDocument> docList) {
    if (docList == null || docList.isEmpty()) {
        // Nothing is buffered: skip the Solr round trip instead of
        // issuing an empty update request.
        return;
    }

    final long execTime = System.currentTimeMillis();
    // Captured up front so both log lines report the same number.
    final int docCount = docList.size();
    if (logger.isDebugEnabled()) {
        logger.debug("Sending " + docCount + " documents to a server.");
    }
    synchronized (solrGroup) {
        deleteOldDocuments(solrGroup, docList);
        solrGroup.add(docList);
    }
    if (logger.isInfoEnabled()) {
        logger.info("Sent " + docCount + " docs (Solr: "
                + (System.currentTimeMillis() - execTime) + "ms)");
    }
    docList.clear();
}
/**
 * Removes index entries that share a URL and crawling-config id with one of
 * the given documents but were stored under a different document id.
 *
 * For every input document a query on the URL field, filtered by the config
 * id field, is executed; each hit whose id differs from the input
 * document's id is collected and deleted afterwards.
 *
 * @param solrGroup the Solr group to query and delete from
 * @param docList the documents that are about to be added
 */
private void deleteOldDocuments(final SolrGroup solrGroup,
        final List<SolrInputDocument> docList) {
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final List<String> ids = new ArrayList<String>();
    final StringBuilder q = new StringBuilder(1000);
    final StringBuilder fq = new StringBuilder(100);
    for (final SolrInputDocument inputDoc : docList) {
        final Object idValue = inputDoc.getFieldValue(systemHelper.idField);
        if (idValue == null) {
            continue;
        }

        final Object configIdValue = inputDoc
                .getFieldValue(systemHelper.configIdField);
        if (configIdValue == null) {
            continue;
        }

        final Object urlValue = inputDoc
                .getFieldValue(systemHelper.urlField);
        if (urlValue == null) {
            // Without a URL there is nothing to look up; escaping a null
            // value would fail with a NullPointerException.
            continue;
        }

        q.setLength(0);
        q.append(systemHelper.urlField).append(":\"");
        q.append(ClientUtils.escapeQueryChars(urlValue.toString()));
        q.append('"');

        fq.setLength(0);
        fq.append(systemHelper.configIdField).append(':');
        // Escape the config id as well so that query syntax characters in
        // it cannot break or alter the filter query.
        fq.append(ClientUtils.escapeQueryChars(configIdValue.toString()));

        final SolrDocumentList docs = getSolrDocumentList(solrGroup,
                fq.toString(), q.toString(),
                new String[] { systemHelper.idField });
        for (final SolrDocument doc : docs) {
            final Object oldIdValue = doc
                    .getFieldValue(systemHelper.idField);
            if (oldIdValue != null && !idValue.equals(oldIdValue)) {
                final String oldId = oldIdValue.toString();
                // The same stale id can be found for several input
                // documents; delete it only once.
                if (!ids.contains(oldId)) {
                    ids.add(oldId);
                }
            }
        }
        if (logger.isDebugEnabled()) {
            logger.debug(q + " in " + fq + " => " + docs);
        }
    }

    for (final String id : ids) {
        deleteDocument(solrGroup, id);
    }
}
/**
 * Queries Solr using {@link #defaultRowSize} as the initial row limit.
 *
 * @param solrGroup the Solr group to query
 * @param fq the filter query, or {@code null} for none
 * @param q the main query
 * @param fields the fields to fetch, or {@code null} to leave the field
 *            list unset
 * @return the documents returned by the row-limited overload
 */
public SolrDocumentList getSolrDocumentList(final SolrGroup solrGroup,
        final String fq, final String q, final String[] fields) {
    final int initialRows = defaultRowSize;
    return getSolrDocumentList(solrGroup, fq, q, fields, initialRows);
}
/**
 * Queries Solr and returns all matching documents.
 *
 * The query is first executed with {@code row} as the row limit. If Solr
 * reports more hits than the limit, the query is executed again with the
 * limit raised to the reported hit count.
 *
 * The previous implementation returned only when the hit count was
 * strictly smaller than the limit, and retried with the limit set to the
 * hit count. The retry could therefore never satisfy the condition, and the
 * method recursed until the stack overflowed whenever the hit count reached
 * the limit.
 *
 * @param solrGroup the Solr group to query
 * @param fq the filter query, or {@code null} for none
 * @param q the main query
 * @param fields the fields to fetch, or {@code null} to leave the field
 *            list unset
 * @param row the initial row limit
 * @return the result list of the last executed query
 */
protected SolrDocumentList getSolrDocumentList(final SolrGroup solrGroup,
        final String fq, final String q, final String[] fields,
        final int row) {
    final SolrQuery sq = new SolrQuery();
    if (fq != null) {
        sq.setFilterQueries(fq);
    }
    sq.setQuery(q);
    if (fields != null) {
        sq.setFields(fields);
    }

    int rows = row;
    while (true) {
        sq.setRows(rows);
        final SolrDocumentList docList = solrGroup.query(sq).getResults();
        final long numFound = docList.getNumFound();
        // Done when the limit covers every hit, or when the limit cannot
        // be raised any further.
        if (numFound <= rows || rows == Integer.MAX_VALUE) {
            return docList;
        }
        // Clamp instead of casting so a huge hit count cannot overflow
        // into a negative row limit.
        rows = (int) Math.min(numFound, Integer.MAX_VALUE);
    }
}
/**
 * Deletes the document(s) stored under the given id, retrying on failure.
 *
 * Up to {@link #maxRetryCount} attempts are made, waiting
 * {@link #requestInterval} milliseconds between attempts. An attempt counts
 * as failed when the delete call throws or when any returned response has a
 * status other than 200. If the thread is interrupted while waiting, the
 * interrupt flag is restored and no further attempt is made.
 *
 * @param solrGroup the Solr group to delete from
 * @param id the value of the {@code id} field to delete
 */
public void deleteDocument(final SolrGroup solrGroup, final String id) {
    final String query = "{!raw f=id}" + id;
    for (int i = 0; i < maxRetryCount; i++) {
        boolean done = true;
        try {
            for (final UpdateResponse response : solrGroup
                    .deleteByQuery(query)) {
                if (response.getStatus() != 200) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Failed to delete: " + response);
                    }
                    done = false;
                }
            }
        } catch (final Exception e) {
            logger.warn("Could not delete a document from Solr."
                    + " It might be busy. " + "Retrying.. id:" + id
                    + ", cause: " + e.getMessage());
            done = false;
        }
        if (done) {
            logger.info("Deleted from Solr: " + id);
            return;
        }
        if (i + 1 >= maxRetryCount) {
            // That was the last attempt; waiting again would only delay
            // the caller.
            break;
        }
        try {
            Thread.sleep(requestInterval);
        } catch (final InterruptedException e) {
            // Keep the interrupt visible to the caller and stop retrying.
            Thread.currentThread().interrupt();
            logger.warn("Interrupted while waiting to retry the deletion."
                    + " id:" + id);
            return;
        }
    }
    // Previously the method gave up without leaving any trace in the log.
    logger.warn("Could not delete a document from Solr after "
            + maxRetryCount + " attempts. id:" + id);
}
/**
 * Looks up the single document stored under the given id.
 *
 * If the raw id query returns more than one document, the index is treated
 * as inconsistent: an error is logged, everything stored under that id is
 * deleted and {@code null} is returned.
 *
 * @param solrGroup the Solr group to query
 * @param id the value of the {@code id} field to look up
 * @param fields the fields to fetch, or {@code null} to leave the field
 *            list unset
 * @return the matching document, or {@code null} if none was found or
 *         duplicates had to be removed
 */
public SolrDocument getSolrDocument(final SolrGroup solrGroup,
        final String id, final String[] fields) {
    final SolrQuery solrQuery = new SolrQuery();
    solrQuery.setQuery("{!raw f=id}" + id);
    if (fields != null) {
        solrQuery.setFields(fields);
    }

    final QueryResponse response = solrGroup.query(solrQuery);
    final SolrDocumentList docList = response.getResults();
    if (docList.isEmpty()) {
        return null;
    }
    if (docList.size() > 1) {
        logger.error("Invalid multiple docs for " + id);
        // Every hit was matched by the raw query on the id field, so one
        // delete for that id removes all duplicates. This also works when
        // the caller's field list does not include the id field, where
        // reading the id back from each hit would find nothing to delete.
        deleteDocument(solrGroup, id);
        return null;
    }
    return docList.get(0);
}
/**
 * Fetches the documents whose {@code id} field starts with the given
 * prefix, using Solr's {@code prefix} query parser.
 *
 * @param solrGroup the Solr group to query
 * @param id the id prefix to match
 * @param fields the fields to fetch, or {@code null} to leave the field
 *            list unset
 * @return the matching documents, or {@code null} when the result is empty
 */
public SolrDocumentList getSolrDocumentListByPrefixId(
        final SolrGroup solrGroup, final String id, final String[] fields) {
    final SolrQuery prefixQuery = new SolrQuery();
    prefixQuery.setQuery("{!prefix f=id}" + id);
    if (fields != null) {
        prefixQuery.setFields(fields);
    }

    final SolrDocumentList found = solrGroup.query(prefixQuery)
            .getResults();
    if (!found.isEmpty()) {
        if (logger.isDebugEnabled()) {
            logger.debug("Found solr documents: " + found);
        }
        return found;
    }
    // Callers receive null rather than an empty list when nothing matched.
    return null;
}
/**
 * Deletes every document whose {@code parentId} field equals the given id.
 *
 * A response with a status other than 200 is reported at debug level only;
 * the deletion is not retried.
 *
 * @param solrGroup the Solr group to delete from
 * @param id the parent document id
 */
public void deleteChildSolrDocument(final SolrGroup solrGroup,
        final String id) {
    final StringBuilder queryBuf = new StringBuilder();
    queryBuf.append("{!raw f=parentId v=\"").append(id).append("\"}");
    for (final UpdateResponse res : solrGroup.deleteByQuery(queryBuf
            .toString())) {
        if (res.getStatus() != 200 && logger.isDebugEnabled()) {
            logger.debug("Failed to delete: " + res);
        }
    }
}
/**
 * Fetches the child documents of the given parent id, using
 * {@link #defaultRowSize} as the initial row limit.
 *
 * @param solrGroup the Solr group to query
 * @param id the parent document id
 * @param fields the fields to fetch, or {@code null} to leave the field
 *            list unset
 * @return the documents returned by the row-limited overload
 */
public SolrDocumentList getChildSolrDocumentList(final SolrGroup solrGroup,
        final String id, final String[] fields) {
    final int initialRows = defaultRowSize;
    return getChildSolrDocumentList(solrGroup, id, fields, initialRows);
}
/**
 * Fetches all documents whose {@code parentId} field equals the given id.
 *
 * The query is first executed with {@code row} as the row limit. If Solr
 * reports more hits than the limit, the query is executed again with the
 * limit raised to the reported hit count.
 *
 * The previous implementation returned only when the hit count was
 * strictly smaller than the limit, and retried with the limit set to the
 * hit count. The retry could therefore never satisfy the condition, and the
 * method recursed until the stack overflowed whenever the hit count reached
 * the limit.
 *
 * @param solrGroup the Solr group to query
 * @param id the parent document id
 * @param fields the fields to fetch, or {@code null} to leave the field
 *            list unset
 * @param row the initial row limit
 * @return the result list of the last executed query
 */
protected SolrDocumentList getChildSolrDocumentList(
        final SolrGroup solrGroup, final String id, final String[] fields,
        final int row) {
    final SolrQuery solrQuery = new SolrQuery();
    solrQuery.setQuery("{!raw f=parentId v=\"" + id + "\"}");
    if (fields != null) {
        solrQuery.setFields(fields);
    }

    int rows = row;
    while (true) {
        solrQuery.setRows(rows);
        final SolrDocumentList docList = solrGroup.query(solrQuery)
                .getResults();
        final long numFound = docList.getNumFound();
        // Done when the limit covers every hit, or when the limit cannot
        // be raised any further.
        if (numFound <= rows || rows == Integer.MAX_VALUE) {
            return docList;
        }
        // Clamp instead of casting so a huge hit count cannot overflow
        // into a negative row limit.
        rows = (int) Math.min(numFound, Integer.MAX_VALUE);
    }
}
}

View file

@ -107,13 +107,13 @@ public class QueryHelper implements Serializable {
protected String[] responseFields = new String[] { "id", "docId", "score",
"boost", "contentLength", "host", "site", "lastModified",
"mimetype", "filetype_s", "created", TITLE_FIELD, "digest", "url",
"clickCount_l_x_dv", "favoriteCount_l_x_dv", "cid_s_s", "lang_s",
"clickCount_l_x_dv", "favoriteCount_l_x_dv", "cid_s", "lang_s",
"hasCache_s_s" };
protected String[] cacheResponseFields = new String[] { "id", "docId",
"score", "boost", "contentLength", "host", "site", "lastModified",
"mimetype", "filetype_s", "created", TITLE_FIELD, "digest", "url",
"clickCount_l_x_dv", "favoriteCount_l_x_dv", "cid_s_s", "lang_s",
"clickCount_l_x_dv", "favoriteCount_l_x_dv", "cid_s", "lang_s",
"cache" };
protected String[] responseDocValuesFields = new String[] {

View file

@ -122,7 +122,7 @@ public class SystemHelper implements Serializable {
public String clickCountField = "clickCount_l_x_dv";
public String configIdField = "cid_s_s";
public String configIdField = "cid_s";
public String expiresField = "expires_dt";
@ -136,6 +136,14 @@ public class SystemHelper implements Serializable {
public String hasCacheField = "hasCache_s_s";
public String lastModifiedField = "lastModified";
public String anchorField = "anchor";
public String segmentField = "segment";
public String roleField = "role";
protected String[] supportedLanguages = new String[] { "ar", "bg", "ca",
"da", "de", "el", "en", "es", "eu", "fa", "fi", "fr", "ga", "gl",
"hi", "hu", "hy", "id", "it", "ja", "lv", "ko", "nl", "no", "pt",

View file

@ -31,15 +31,13 @@ import jp.sf.fess.Constants;
import jp.sf.fess.db.exentity.CrawlingConfig;
import jp.sf.fess.helper.CrawlingConfigHelper;
import jp.sf.fess.helper.CrawlingSessionHelper;
import jp.sf.fess.helper.IndexingHelper;
import jp.sf.fess.helper.SambaHelper;
import jp.sf.fess.helper.SearchLogHelper;
import jp.sf.fess.helper.SystemHelper;
import jp.sf.fess.util.ComponentUtil;
import org.apache.commons.io.IOUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.codelibs.core.util.DynamicProperties;
@ -61,10 +59,6 @@ public class FessS2RobotThread extends S2RobotThread {
private static final Logger logger = LoggerFactory
.getLogger(FessS2RobotThread.class);
public int maxSolrQueryRetryCount = 5;
public int childUrlSize = 10000;
@Override
protected boolean isContentUpdated(final S2RobotClient client,
final UrlQueue urlQueue) {
@ -82,34 +76,35 @@ public class FessS2RobotThread extends S2RobotThread {
.getCrawlingSessionHelper();
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final SambaHelper sambaHelper = ComponentUtil.getSambaHelper();
final IndexingHelper indexingHelper = ComponentUtil
.getIndexingHelper();
final SolrGroupManager solrGroupManager = ComponentUtil
.getSolrGroupManager();
final boolean useAclAsRole = crawlerProperties.getProperty(
Constants.USE_ACL_AS_ROLE, Constants.FALSE).equals(
Constants.TRUE);
final String expiresField = systemHelper.expiresField;
final SolrGroup solrGroup = solrGroupManager
.getSolrGroup(QueryType.ADD);
final String url = urlQueue.getUrl();
ResponseData responseData = null;
try {
// head method
responseData = client
.execute(RequestDataBuilder.newRequestData().head()
.url(urlQueue.getUrl()).build());
if (responseData == null) {
return true;
}
SolrDocumentList oldDocWithRoleList = null;
final CrawlingConfig crawlingConfig = crawlingConfigHelper
.get(robotContext.getSessionId());
final Map<String, Object> dataMap = new HashMap<String, Object>();
dataMap.put("url", urlQueue.getUrl());
dataMap.put(systemHelper.urlField, url);
final List<String> roleTypeList = new ArrayList<String>();
for (final String roleType : crawlingConfig.getRoleTypeValues()) {
roleTypeList.add(roleType);
}
if (useAclAsRole && responseData.getUrl().startsWith("smb://")) {
final String id = crawlingSessionHelper.generateId(dataMap);
oldDocWithRoleList = getSolrDocumentList(id, true,
expiresField);
if (useAclAsRole && url.startsWith("smb://")) {
// head method
responseData = client.execute(RequestDataBuilder
.newRequestData().head().url(url).build());
if (responseData == null) {
return true;
}
final ACE[] aces = (ACE[]) responseData.getMetaDataMap()
.get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES);
@ -124,36 +119,40 @@ public class FessS2RobotThread extends S2RobotThread {
}
}
}
dataMap.put("role", roleTypeList);
dataMap.put(systemHelper.roleField, roleTypeList);
final String id = crawlingSessionHelper.generateId(dataMap);
final SolrDocumentList solrDocumentList = getSolrDocumentList(
id, false, expiresField);
if (solrDocumentList == null) {
deleteSolrDocumentList(oldDocWithRoleList);
storeChildUrlsToQueue(urlQueue, getChildUrlSet(id));
final SolrDocument solrDocument = indexingHelper
.getSolrDocument(solrGroup, id, new String[] {
systemHelper.idField,
systemHelper.lastModifiedField,
systemHelper.anchorField,
systemHelper.segmentField,
systemHelper.expiresField,
systemHelper.clickCountField,
systemHelper.favoriteCountField });
if (solrDocument == null) {
storeChildUrlsToQueue(urlQueue,
getChildUrlSet(solrGroup, id)); // TODO
return true;
}
if (solrDocumentList.size() > 1) {
// invalid state
deleteSolrDocumentList(oldDocWithRoleList);
deleteSolrDocumentList(solrDocumentList);
return true;
}
final SolrDocument solrDocument = solrDocumentList.get(0);
final Date expires = (Date) solrDocument.get(expiresField);
final Date expires = (Date) solrDocument
.get(systemHelper.expiresField);
if (expires != null
&& expires.getTime() < System.currentTimeMillis()) {
deleteSolrDocumentList(oldDocWithRoleList);
final Object idValue = solrDocument
.getFieldValue(systemHelper.idField);
if (idValue != null) {
indexingHelper.deleteDocument(solrGroup,
idValue.toString());
}
return true;
}
final Date lastModified = (Date) solrDocument
.get("lastModified");
.get(systemHelper.lastModifiedField);
if (lastModified == null) {
deleteSolrDocumentList(oldDocWithRoleList);
return true;
}
@ -162,10 +161,8 @@ public class FessS2RobotThread extends S2RobotThread {
if (clickCount != null) {
final SearchLogHelper searchLogHelper = ComponentUtil
.getSearchLogHelper();
final int count = searchLogHelper.getClickCount(urlQueue
.getUrl());
final int count = searchLogHelper.getClickCount(url);
if (count != clickCount.intValue()) {
deleteSolrDocumentList(oldDocWithRoleList);
return true;
}
}
@ -175,23 +172,28 @@ public class FessS2RobotThread extends S2RobotThread {
if (favoriteCount != null) {
final SearchLogHelper searchLogHelper = ComponentUtil
.getSearchLogHelper();
final long count = searchLogHelper
.getFavoriteCount(urlQueue.getUrl());
final long count = searchLogHelper.getFavoriteCount(url);
if (count != favoriteCount.longValue()) {
deleteSolrDocumentList(oldDocWithRoleList);
return true;
}
}
if (responseData == null) {
// head method
responseData = client.execute(RequestDataBuilder
.newRequestData().head().url(url).build());
if (responseData == null) {
return true;
}
}
final int httpStatusCode = responseData.getHttpStatusCode();
if (httpStatusCode == 404) {
deleteSolrDocument(id);
deleteSolrDocumentList(oldDocWithRoleList);
storeChildUrlsToQueue(urlQueue,
getAnchorSet(solrDocument.get("anchor")));
indexingHelper.deleteDocument(solrGroup, id);
return false;
} else if (responseData.getLastModified() == null) {
deleteSolrDocumentList(oldDocWithRoleList);
return true;
} else if (responseData.getLastModified().getTime() <= lastModified
.getTime() && httpStatusCode == 200) {
@ -256,134 +258,28 @@ public class FessS2RobotThread extends S2RobotThread {
return childUrlSet;
}
protected SolrDocumentList getSolrDocumentList(final String id,
final boolean wildcard, final String expiresField) {
final SolrGroupManager solrGroupManager = ComponentUtil
.getSolrGroupManager();
protected Set<RequestData> getChildUrlSet(final SolrGroup solrGroup,
final String id) {
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final SolrGroup solrGroup = solrGroupManager
.getSolrGroup(QueryType.ADD);
final SolrQuery solrQuery = new SolrQuery();
final StringBuilder queryBuf = new StringBuilder(200);
if (wildcard) {
queryBuf.append("{!prefix f=id}");
} else {
queryBuf.append("{!raw f=id}");
final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
final SolrDocumentList docList = indexingHelper
.getChildSolrDocumentList(solrGroup, id,
new String[] { systemHelper.urlField });
if (docList.isEmpty()) {
return null;
}
queryBuf.append(id);
solrQuery.setQuery(queryBuf.toString());
solrQuery.setFields("id", "lastModified", "anchor", "segment", "role",
expiresField, systemHelper.clickCountField,
systemHelper.favoriteCountField);
for (int i = 0; i < maxSolrQueryRetryCount; i++) {
try {
final QueryResponse response = solrGroup.query(solrQuery);
final SolrDocumentList docList = response.getResults();
if (docList.isEmpty()) {
return null;
}
if (logger.isDebugEnabled()) {
logger.debug("Found solr documents: " + docList);
}
return docList;
} catch (final Exception e) {
logger.info("Could not get a response from Solr."
+ " It might be busy. " + "Retrying.. id:" + id
+ ", cause: " + e.getMessage());
}
try {
Thread.sleep(500);
} catch (final InterruptedException e) {
if (logger.isDebugEnabled()) {
logger.debug("Found solr documents: " + docList);
}
final Set<RequestData> urlSet = new HashSet<>(docList.size());
for (final SolrDocument doc : docList) {
final Object obj = doc.get(systemHelper.urlField);
if (obj != null) {
urlSet.add(RequestDataBuilder.newRequestData().get()
.url(obj.toString()).build());
}
}
return null;
return urlSet;
}
protected Set<RequestData> getChildUrlSet(final String id) {
final SolrGroupManager solrGroupManager = ComponentUtil
.getSolrGroupManager();
final SolrGroup solrGroup = solrGroupManager
.getSolrGroup(QueryType.ADD);
final SolrQuery solrQuery = new SolrQuery();
solrQuery.setQuery("{!raw f=parentId v=\"" + id + "\"}");
solrQuery.setFields("url");
solrQuery.setRows(childUrlSize);
for (int i = 0; i < maxSolrQueryRetryCount; i++) {
try {
final QueryResponse response = solrGroup.query(solrQuery);
final SolrDocumentList docList = response.getResults();
if (docList.isEmpty()) {
return null;
}
if (logger.isDebugEnabled()) {
logger.debug("Found solr documents: " + docList);
}
final Set<RequestData> urlSet = new HashSet<>(docList.size());
for (final SolrDocument doc : docList) {
final Object obj = doc.get("url");
if (obj != null) {
urlSet.add(RequestDataBuilder.newRequestData().get()
.url(obj.toString()).build());
}
}
return urlSet;
} catch (final Exception e) {
logger.info("Could not get a response from Solr."
+ " It might be busy. " + "Retrying.. id:" + id
+ ", cause: " + e.getMessage());
}
try {
Thread.sleep(500);
} catch (final InterruptedException e) {
}
}
return null;
}
protected void deleteSolrDocument(final String id) {
final SolrGroupManager solrGroupManager = ComponentUtil
.getSolrGroupManager();
final SolrGroup solrGroup = solrGroupManager
.getSolrGroup(QueryType.DELETE);
final String query = "{!raw f=parentId v=\"" + id + "\"}";
for (int i = 0; i < maxSolrQueryRetryCount; i++) {
boolean done = true;
try {
for (final UpdateResponse response : solrGroup
.deleteByQuery(query)) {
if (response.getStatus() != 200) {
if (logger.isDebugEnabled()) {
logger.debug("Failed to delete: " + response);
}
done = false;
}
}
} catch (final Exception e) {
logger.info("Could not delete a document from Solr."
+ " It might be busy. " + "Retrying.. id:" + id
+ ", cause: " + e.getMessage());
done = false;
}
if (done) {
logger.info("Deleted from Solr: " + id);
break;
}
try {
Thread.sleep(500);
} catch (final InterruptedException e) {
}
}
}
protected void deleteSolrDocumentList(
final SolrDocumentList solrDocumentList) {
if (solrDocumentList != null) {
for (final SolrDocument solrDocument : solrDocumentList) {
final Object idObj = solrDocument.get("id");
if (idObj != null) {
deleteSolrDocument(idObj.toString());
}
}
}
}
}

View file

@ -17,7 +17,6 @@
package jp.sf.fess.solr;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
@ -32,15 +31,11 @@ import jp.sf.fess.db.exbhv.ClickLogBhv;
import jp.sf.fess.db.exbhv.FavoriteLogBhv;
import jp.sf.fess.db.exbhv.pmbean.FavoriteUrlCountPmb;
import jp.sf.fess.db.exentity.customize.FavoriteUrlCount;
import jp.sf.fess.helper.IndexingHelper;
import jp.sf.fess.helper.IntervalControlHelper;
import jp.sf.fess.helper.SystemHelper;
import jp.sf.fess.util.ComponentUtil;
import org.apache.commons.collections.CollectionUtils;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.codelibs.core.util.StringUtil;
import org.codelibs.robot.S2Robot;
@ -95,8 +90,13 @@ public class IndexUpdater extends Thread {
@Resource
protected SystemHelper systemHelper;
@Resource
protected IndexingHelper indexingHelper;
public int maxDocumentCacheSize = 5;
public int maxInvalidDocumentSize = 100;
protected boolean finishCrawling = false;
public long updateInterval = 60000; // 1 min
@ -243,7 +243,7 @@ public class IndexUpdater extends Thread {
}
if (!docList.isEmpty()) {
sendDocuments(docList);
indexingHelper.sendDocuments(solrGroup, docList);
}
synchronized (finishedSessionIdList) {
@ -395,14 +395,14 @@ public class IndexUpdater extends Thread {
}
if (docList.size() >= maxDocumentCacheSize) {
sendDocuments(docList);
indexingHelper.sendDocuments(solrGroup, docList);
}
documentSize++;
// commit
if (commitPerCount > 0
&& documentSize % commitPerCount == 0) {
if (!docList.isEmpty()) {
sendDocuments(docList);
indexingHelper.sendDocuments(solrGroup, docList);
}
commitDocuments();
}
@ -563,17 +563,15 @@ public class IndexUpdater extends Thread {
}
}
if (logger.isInfoEnabled()) {
logger.info("The number of a crawled document is "
+ arList.getAllRecordCount() + ". The processing size is "
+ arList.size() + ". The execution time is "
+ (System.currentTimeMillis() - execTime) + "ms.");
logger.info("Processing " + arList.size() + "/"
+ arList.getAllRecordCount() + " docs (DB: "
+ (System.currentTimeMillis() - execTime) + "ms)");
}
if (arList.getAllRecordCount() > unprocessedDocumentSize) {
if (logger.isInfoEnabled()) {
logger.info("Stopped all crawler threads. " + " You have "
+ arList.getAllRecordCount() + " (>"
+ unprocessedDocumentSize + ") "
+ " unprocessed documents.");
+ unprocessedDocumentSize + ") " + " unprocessed docs.");
}
final IntervalControlHelper intervalControlHelper = ComponentUtil
.getIntervalControlHelper();
@ -620,71 +618,6 @@ public class IndexUpdater extends Thread {
}
}
@SuppressWarnings("unchecked")
private void deleteDocuments(final List<SolrInputDocument> docList) {
final List<String> ids = new ArrayList<String>();
for (final SolrInputDocument inputDoc : docList) {
final Collection<Object> roleList = inputDoc.getFieldValues("role");
final StringBuilder query = new StringBuilder();
query.append("url:\"");
query.append(ClientUtils.escapeQueryChars((String) inputDoc
.getFieldValue("url")));
query.append("\"");
final SolrQuery sq = new SolrQuery();
sq.setRows(1);
sq.setFields(new String[] { "id", "role" });
sq.setQuery(query.toString());
final SolrDocumentList docs = solrGroup.query(sq).getResults();
if (docs.size() > 0) {
for (final SolrDocument doc : docs) {
// checking changed roles
final Collection<Object> docRoleList = doc
.getFieldValues("role");
if (CollectionUtils.isEmpty(roleList)
&& CollectionUtils.isEmpty(docRoleList)) {
// neither have role
continue;
}
if (CollectionUtils.isNotEmpty(roleList)
&& CollectionUtils.isNotEmpty(docRoleList)) {
final List<String> diff = (List<String>) CollectionUtils
.disjunction(roleList, docRoleList);
if (diff.size() == 0) {
// has same role(s)
continue;
}
}
// has different role(s)
ids.add((String) doc.getFieldValue("id"));
}
}
}
if (ids.size() > 0) {
synchronized (solrGroup) {
solrGroup.deleteById(ids);
}
}
}
private void sendDocuments(final List<SolrInputDocument> docList) {
final long execTime = System.currentTimeMillis();
if (logger.isInfoEnabled()) {
logger.info("Sending " + docList.size() + " document to a server.");
}
synchronized (solrGroup) {
deleteDocuments(docList);
solrGroup.add(docList);
}
if (logger.isInfoEnabled()) {
logger.info("Sent " + docList.size()
+ " documents. The execution time is "
+ (System.currentTimeMillis() - execTime) + "ms.");
}
docList.clear();
}
private void forceStop() {
systemHelper.setForceStop(true);
for (final S2Robot s2Robot : s2RobotList) {

View file

@ -25,6 +25,7 @@ import jp.sf.fess.helper.DatabaseHelper;
import jp.sf.fess.helper.DocumentHelper;
import jp.sf.fess.helper.FileTypeHelper;
import jp.sf.fess.helper.HotSearchWordHelper;
import jp.sf.fess.helper.IndexingHelper;
import jp.sf.fess.helper.IntervalControlHelper;
import jp.sf.fess.helper.JobHelper;
import jp.sf.fess.helper.KeyMatchHelper;
@ -111,6 +112,8 @@ public final class ComponentUtil {
private static final String KEY_MATCH_HELPER = "keyMatchHelper";
private static final String INDEXING_HELPER = "indexingHelper";
// Private constructor: this final class only exposes static accessors,
// so it is not meant to be instantiated.
private ComponentUtil() {
}
@ -242,4 +245,8 @@ public final class ComponentUtil {
/**
 * Looks up the component registered under the key-match helper name in the
 * singleton container.
 *
 * @return the component returned by the container for that name
 */
public static KeyMatchHelper getKeyMatchHelper() {
    final KeyMatchHelper helper = SingletonS2Container
            .getComponent(KEY_MATCH_HELPER);
    return helper;
}
/**
 * Looks up the component registered under the indexing helper name in the
 * singleton container.
 *
 * @return the component returned by the container for that name
 */
public static IndexingHelper getIndexingHelper() {
    final IndexingHelper helper = SingletonS2Container
            .getComponent(INDEXING_HELPER);
    return helper;
}
}

View file

@ -81,7 +81,7 @@
"boost", "contentLength", "host", "site", "lastModified",
"mimetype", "filetype_s", "created", "title", "digest", "url",
"clickCount_l_x_dv", "favoriteCount_l_x_dv",
"cid_s_s", "lang_s", "hasCache_s_s" }</property>
"cid_s", "lang_s", "hasCache_s_s" }</property>
<property name="responseDocValuesFields">new String[]{
"clickCount_l_x_dv", "favoriteCount_l_x_dv"}</property>
<property name="highlightingFields">new String[]{"digest", "cache" }</property>

View file

@ -10,6 +10,8 @@
<include path="s2robot_db.dicon" />
<component name="indexingHelper" class="jp.sf.fess.helper.IndexingHelper">
</component>
<component name="labelTypeHelper" class="jp.sf.fess.helper.LabelTypeHelper">
</component>
<component name="webFsIndexHelper" class="jp.sf.fess.helper.WebFsIndexHelper">