fix #216
This commit is contained in:
parent
be91289e37
commit
70bd6fa900
9 changed files with 364 additions and 277 deletions
|
@ -25,6 +25,7 @@ import jp.sf.fess.Constants;
|
|||
import jp.sf.fess.FessSystemException;
|
||||
import jp.sf.fess.ds.IndexUpdateCallback;
|
||||
import jp.sf.fess.helper.CrawlingSessionHelper;
|
||||
import jp.sf.fess.helper.IndexingHelper;
|
||||
import jp.sf.fess.helper.SearchLogHelper;
|
||||
import jp.sf.fess.helper.SystemHelper;
|
||||
import jp.sf.fess.util.ComponentUtil;
|
||||
|
@ -40,7 +41,7 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|||
|
||||
protected SolrGroup solrGroup;
|
||||
|
||||
public int maxDocumentCacheSize = 10;
|
||||
public int maxDocumentCacheSize = 5;
|
||||
|
||||
public boolean clickCountEnabled = true;
|
||||
|
||||
|
@ -71,6 +72,7 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|||
throw new FessSystemException("url is null. dataMap=" + dataMap);
|
||||
}
|
||||
|
||||
final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
|
||||
final CrawlingSessionHelper crawlingSessionHelper = ComponentUtil
|
||||
.getCrawlingSessionHelper();
|
||||
dataMap.put("id", crawlingSessionHelper.generateId(dataMap));
|
||||
|
@ -85,13 +87,13 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|||
}
|
||||
|
||||
if (docList.size() >= maxDocumentCacheSize) {
|
||||
sendDocuments();
|
||||
indexingHelper.sendDocuments(solrGroup, docList);
|
||||
}
|
||||
documentSize.getAndIncrement();
|
||||
// commit
|
||||
if (commitPerCount > 0 && documentSize.get() % commitPerCount == 0) {
|
||||
if (!docList.isEmpty()) {
|
||||
sendDocuments();
|
||||
indexingHelper.sendDocuments(solrGroup, docList);
|
||||
}
|
||||
commitDocuments();
|
||||
}
|
||||
|
@ -142,7 +144,9 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|||
@Override
|
||||
public void commit() {
|
||||
if (!docList.isEmpty()) {
|
||||
sendDocuments();
|
||||
final IndexingHelper indexingHelper = ComponentUtil
|
||||
.getIndexingHelper();
|
||||
indexingHelper.sendDocuments(solrGroup, docList);
|
||||
}
|
||||
commitDocuments();
|
||||
}
|
||||
|
@ -161,22 +165,6 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback {
|
|||
}
|
||||
}
|
||||
|
||||
protected void sendDocuments() {
|
||||
final long execTime = System.currentTimeMillis();
|
||||
if (logger.isInfoEnabled()) {
|
||||
logger.info("Sending " + docList.size() + " document to a server.");
|
||||
}
|
||||
synchronized (solrGroup) {
|
||||
solrGroup.add(docList);
|
||||
}
|
||||
if (logger.isInfoEnabled()) {
|
||||
logger.info("Sent " + docList.size()
|
||||
+ " documents. The execution time is "
|
||||
+ (System.currentTimeMillis() - execTime) + "ms.");
|
||||
}
|
||||
docList.clear();
|
||||
}
|
||||
|
||||
protected void addClickCountField(final SolrInputDocument doc,
|
||||
final String url) {
|
||||
final SearchLogHelper searchLogHelper = ComponentUtil
|
||||
|
|
253
src/main/java/jp/sf/fess/helper/IndexingHelper.java
Normal file
253
src/main/java/jp/sf/fess/helper/IndexingHelper.java
Normal file
|
@ -0,0 +1,253 @@
|
|||
/*
|
||||
* Copyright 2009-2014 the CodeLibs Project and the Others.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
|
||||
* either express or implied. See the License for the specific language
|
||||
* governing permissions and limitations under the License.
|
||||
*/
|
||||
|
||||
package jp.sf.fess.helper;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import jp.sf.fess.util.ComponentUtil;
|
||||
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.client.solrj.response.UpdateResponse;
|
||||
import org.apache.solr.client.solrj.util.ClientUtils;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.codelibs.solr.lib.SolrGroup;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class IndexingHelper {
|
||||
private static final Logger logger = LoggerFactory
|
||||
.getLogger(IndexingHelper.class);
|
||||
|
||||
public int maxRetryCount = 5;
|
||||
|
||||
public int defaultRowSize = 100;
|
||||
|
||||
public long requestInterval = 500;
|
||||
|
||||
public void sendDocuments(final SolrGroup solrGroup,
|
||||
final List<SolrInputDocument> docList) {
|
||||
final long execTime = System.currentTimeMillis();
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("Sending " + docList.size()
|
||||
+ " documents to a server.");
|
||||
}
|
||||
synchronized (solrGroup) {
|
||||
deleteOldDocuments(solrGroup, docList);
|
||||
solrGroup.add(docList);
|
||||
}
|
||||
if (logger.isInfoEnabled()) {
|
||||
logger.info("Sent " + docList.size() + " docs (Solr: "
|
||||
+ (System.currentTimeMillis() - execTime) + "ms)");
|
||||
}
|
||||
docList.clear();
|
||||
}
|
||||
|
||||
private void deleteOldDocuments(final SolrGroup solrGroup,
|
||||
final List<SolrInputDocument> docList) {
|
||||
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
|
||||
|
||||
final List<String> ids = new ArrayList<String>();
|
||||
final StringBuilder q = new StringBuilder(1000);
|
||||
final StringBuilder fq = new StringBuilder(100);
|
||||
for (final SolrInputDocument inputDoc : docList) {
|
||||
final Object idValue = inputDoc.getFieldValue(systemHelper.idField);
|
||||
if (idValue == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
final Object configIdValue = inputDoc
|
||||
.getFieldValue(systemHelper.configIdField);
|
||||
if (configIdValue == null) {
|
||||
continue;
|
||||
}
|
||||
|
||||
q.setLength(0);
|
||||
q.append(systemHelper.urlField).append(":\"");
|
||||
q.append(ClientUtils.escapeQueryChars((String) inputDoc
|
||||
.getFieldValue(systemHelper.urlField)));
|
||||
q.append('"');
|
||||
|
||||
fq.setLength(0);
|
||||
fq.append(systemHelper.configIdField).append(':');
|
||||
fq.append(configIdValue.toString());
|
||||
|
||||
final SolrDocumentList docs = getSolrDocumentList(solrGroup,
|
||||
fq.toString(), q.toString(),
|
||||
new String[] { systemHelper.idField });
|
||||
for (final SolrDocument doc : docs) {
|
||||
final Object oldIdValue = doc
|
||||
.getFieldValue(systemHelper.idField);
|
||||
if (!idValue.equals(oldIdValue) && oldIdValue != null) {
|
||||
ids.add(oldIdValue.toString());
|
||||
}
|
||||
}
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug(q + " in " + fq + " => " + docs);
|
||||
}
|
||||
}
|
||||
if (!ids.isEmpty()) {
|
||||
for (final String id : ids) {
|
||||
deleteDocument(solrGroup, id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public SolrDocumentList getSolrDocumentList(final SolrGroup solrGroup,
|
||||
final String fq, final String q, final String[] fields) {
|
||||
return getSolrDocumentList(solrGroup, fq, q, fields, defaultRowSize);
|
||||
}
|
||||
|
||||
protected SolrDocumentList getSolrDocumentList(final SolrGroup solrGroup,
|
||||
final String fq, final String q, final String[] fields,
|
||||
final int row) {
|
||||
final SolrQuery sq = new SolrQuery();
|
||||
if (fq != null) {
|
||||
sq.setFilterQueries(fq);
|
||||
}
|
||||
sq.setQuery(q);
|
||||
if (fields != null) {
|
||||
sq.setFields(fields);
|
||||
}
|
||||
sq.setRows(row);
|
||||
final SolrDocumentList docList = solrGroup.query(sq).getResults();
|
||||
if (docList.getNumFound() < row) {
|
||||
return docList;
|
||||
}
|
||||
return getSolrDocumentList(solrGroup, fq, q, fields,
|
||||
(int) docList.getNumFound());
|
||||
}
|
||||
|
||||
public void deleteDocument(final SolrGroup solrGroup, final String id) {
|
||||
final String query = "{!raw f=id}" + id;
|
||||
for (int i = 0; i < maxRetryCount; i++) {
|
||||
boolean done = true;
|
||||
try {
|
||||
for (final UpdateResponse response : solrGroup
|
||||
.deleteByQuery(query)) {
|
||||
if (response.getStatus() != 200) {
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("Failed to delete: " + response);
|
||||
}
|
||||
done = false;
|
||||
}
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
logger.warn("Could not delete a document from Solr."
|
||||
+ " It might be busy. " + "Retrying.. id:" + id
|
||||
+ ", cause: " + e.getMessage());
|
||||
done = false;
|
||||
}
|
||||
if (done) {
|
||||
logger.info("Deleted from Solr: " + id);
|
||||
break;
|
||||
}
|
||||
try {
|
||||
Thread.sleep(requestInterval);
|
||||
} catch (final InterruptedException e) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public SolrDocument getSolrDocument(final SolrGroup solrGroup,
|
||||
final String id, final String[] fields) {
|
||||
final SolrQuery solrQuery = new SolrQuery();
|
||||
final StringBuilder queryBuf = new StringBuilder(200);
|
||||
queryBuf.append("{!raw f=id}");
|
||||
queryBuf.append(id);
|
||||
solrQuery.setQuery(queryBuf.toString());
|
||||
if (fields != null) {
|
||||
solrQuery.setFields(fields);
|
||||
}
|
||||
final QueryResponse response = solrGroup.query(solrQuery);
|
||||
final SolrDocumentList docList = response.getResults();
|
||||
if (docList.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
if (docList.size() > 1) {
|
||||
logger.error("Invalid multiple docs for " + id);
|
||||
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
|
||||
for (final SolrDocument doc : docList) {
|
||||
final Object idValue = doc.getFieldValue(systemHelper.idField);
|
||||
if (idValue != null) {
|
||||
deleteDocument(solrGroup, idValue.toString());
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
return docList.get(0);
|
||||
}
|
||||
|
||||
public SolrDocumentList getSolrDocumentListByPrefixId(
|
||||
final SolrGroup solrGroup, final String id, final String[] fields) {
|
||||
final SolrQuery solrQuery = new SolrQuery();
|
||||
final StringBuilder queryBuf = new StringBuilder(200);
|
||||
queryBuf.append("{!prefix f=id}");
|
||||
queryBuf.append(id);
|
||||
solrQuery.setQuery(queryBuf.toString());
|
||||
if (fields != null) {
|
||||
solrQuery.setFields(fields);
|
||||
}
|
||||
final QueryResponse response = solrGroup.query(solrQuery);
|
||||
final SolrDocumentList docList = response.getResults();
|
||||
if (docList.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("Found solr documents: " + docList);
|
||||
}
|
||||
return docList;
|
||||
}
|
||||
|
||||
public void deleteChildSolrDocument(final SolrGroup solrGroup,
|
||||
final String id) {
|
||||
final String query = "{!raw f=parentId v=\"" + id + "\"}";
|
||||
for (final UpdateResponse response : solrGroup.deleteByQuery(query)) {
|
||||
if (response.getStatus() != 200) {
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("Failed to delete: " + response);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public SolrDocumentList getChildSolrDocumentList(final SolrGroup solrGroup,
|
||||
final String id, final String[] fields) {
|
||||
return getChildSolrDocumentList(solrGroup, id, fields, defaultRowSize);
|
||||
}
|
||||
|
||||
protected SolrDocumentList getChildSolrDocumentList(
|
||||
final SolrGroup solrGroup, final String id, final String[] fields,
|
||||
final int row) {
|
||||
final SolrQuery solrQuery = new SolrQuery();
|
||||
solrQuery.setQuery("{!raw f=parentId v=\"" + id + "\"}");
|
||||
if (fields != null) {
|
||||
solrQuery.setFields(fields);
|
||||
}
|
||||
solrQuery.setRows(row);
|
||||
final SolrDocumentList docList = solrGroup.query(solrQuery)
|
||||
.getResults();
|
||||
if (docList.getNumFound() < row) {
|
||||
return docList;
|
||||
}
|
||||
return getChildSolrDocumentList(solrGroup, id, fields,
|
||||
(int) docList.getNumFound());
|
||||
}
|
||||
}
|
|
@ -107,13 +107,13 @@ public class QueryHelper implements Serializable {
|
|||
protected String[] responseFields = new String[] { "id", "docId", "score",
|
||||
"boost", "contentLength", "host", "site", "lastModified",
|
||||
"mimetype", "filetype_s", "created", TITLE_FIELD, "digest", "url",
|
||||
"clickCount_l_x_dv", "favoriteCount_l_x_dv", "cid_s_s", "lang_s",
|
||||
"clickCount_l_x_dv", "favoriteCount_l_x_dv", "cid_s", "lang_s",
|
||||
"hasCache_s_s" };
|
||||
|
||||
protected String[] cacheResponseFields = new String[] { "id", "docId",
|
||||
"score", "boost", "contentLength", "host", "site", "lastModified",
|
||||
"mimetype", "filetype_s", "created", TITLE_FIELD, "digest", "url",
|
||||
"clickCount_l_x_dv", "favoriteCount_l_x_dv", "cid_s_s", "lang_s",
|
||||
"clickCount_l_x_dv", "favoriteCount_l_x_dv", "cid_s", "lang_s",
|
||||
"cache" };
|
||||
|
||||
protected String[] responseDocValuesFields = new String[] {
|
||||
|
|
|
@ -122,7 +122,7 @@ public class SystemHelper implements Serializable {
|
|||
|
||||
public String clickCountField = "clickCount_l_x_dv";
|
||||
|
||||
public String configIdField = "cid_s_s";
|
||||
public String configIdField = "cid_s";
|
||||
|
||||
public String expiresField = "expires_dt";
|
||||
|
||||
|
@ -136,6 +136,14 @@ public class SystemHelper implements Serializable {
|
|||
|
||||
public String hasCacheField = "hasCache_s_s";
|
||||
|
||||
public String lastModifiedField = "lastModified";
|
||||
|
||||
public String anchorField = "anchor";
|
||||
|
||||
public String segmentField = "segment";
|
||||
|
||||
public String roleField = "role";
|
||||
|
||||
protected String[] supportedLanguages = new String[] { "ar", "bg", "ca",
|
||||
"da", "de", "el", "en", "es", "eu", "fa", "fi", "fr", "ga", "gl",
|
||||
"hi", "hu", "hy", "id", "it", "ja", "lv", "ko", "nl", "no", "pt",
|
||||
|
|
|
@ -31,15 +31,13 @@ import jp.sf.fess.Constants;
|
|||
import jp.sf.fess.db.exentity.CrawlingConfig;
|
||||
import jp.sf.fess.helper.CrawlingConfigHelper;
|
||||
import jp.sf.fess.helper.CrawlingSessionHelper;
|
||||
import jp.sf.fess.helper.IndexingHelper;
|
||||
import jp.sf.fess.helper.SambaHelper;
|
||||
import jp.sf.fess.helper.SearchLogHelper;
|
||||
import jp.sf.fess.helper.SystemHelper;
|
||||
import jp.sf.fess.util.ComponentUtil;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.response.QueryResponse;
|
||||
import org.apache.solr.client.solrj.response.UpdateResponse;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.codelibs.core.util.DynamicProperties;
|
||||
|
@ -61,10 +59,6 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
private static final Logger logger = LoggerFactory
|
||||
.getLogger(FessS2RobotThread.class);
|
||||
|
||||
public int maxSolrQueryRetryCount = 5;
|
||||
|
||||
public int childUrlSize = 10000;
|
||||
|
||||
@Override
|
||||
protected boolean isContentUpdated(final S2RobotClient client,
|
||||
final UrlQueue urlQueue) {
|
||||
|
@ -82,34 +76,35 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
.getCrawlingSessionHelper();
|
||||
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
|
||||
final SambaHelper sambaHelper = ComponentUtil.getSambaHelper();
|
||||
final IndexingHelper indexingHelper = ComponentUtil
|
||||
.getIndexingHelper();
|
||||
final SolrGroupManager solrGroupManager = ComponentUtil
|
||||
.getSolrGroupManager();
|
||||
final boolean useAclAsRole = crawlerProperties.getProperty(
|
||||
Constants.USE_ACL_AS_ROLE, Constants.FALSE).equals(
|
||||
Constants.TRUE);
|
||||
final String expiresField = systemHelper.expiresField;
|
||||
|
||||
final SolrGroup solrGroup = solrGroupManager
|
||||
.getSolrGroup(QueryType.ADD);
|
||||
|
||||
final String url = urlQueue.getUrl();
|
||||
ResponseData responseData = null;
|
||||
try {
|
||||
// head method
|
||||
responseData = client
|
||||
.execute(RequestDataBuilder.newRequestData().head()
|
||||
.url(urlQueue.getUrl()).build());
|
||||
if (responseData == null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
SolrDocumentList oldDocWithRoleList = null;
|
||||
final CrawlingConfig crawlingConfig = crawlingConfigHelper
|
||||
.get(robotContext.getSessionId());
|
||||
final Map<String, Object> dataMap = new HashMap<String, Object>();
|
||||
dataMap.put("url", urlQueue.getUrl());
|
||||
dataMap.put(systemHelper.urlField, url);
|
||||
final List<String> roleTypeList = new ArrayList<String>();
|
||||
for (final String roleType : crawlingConfig.getRoleTypeValues()) {
|
||||
roleTypeList.add(roleType);
|
||||
}
|
||||
if (useAclAsRole && responseData.getUrl().startsWith("smb://")) {
|
||||
final String id = crawlingSessionHelper.generateId(dataMap);
|
||||
oldDocWithRoleList = getSolrDocumentList(id, true,
|
||||
expiresField);
|
||||
if (useAclAsRole && url.startsWith("smb://")) {
|
||||
// head method
|
||||
responseData = client.execute(RequestDataBuilder
|
||||
.newRequestData().head().url(url).build());
|
||||
if (responseData == null) {
|
||||
return true;
|
||||
}
|
||||
|
||||
final ACE[] aces = (ACE[]) responseData.getMetaDataMap()
|
||||
.get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES);
|
||||
|
@ -124,36 +119,40 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
}
|
||||
}
|
||||
}
|
||||
dataMap.put("role", roleTypeList);
|
||||
dataMap.put(systemHelper.roleField, roleTypeList);
|
||||
final String id = crawlingSessionHelper.generateId(dataMap);
|
||||
|
||||
final SolrDocumentList solrDocumentList = getSolrDocumentList(
|
||||
id, false, expiresField);
|
||||
if (solrDocumentList == null) {
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
storeChildUrlsToQueue(urlQueue, getChildUrlSet(id));
|
||||
final SolrDocument solrDocument = indexingHelper
|
||||
.getSolrDocument(solrGroup, id, new String[] {
|
||||
systemHelper.idField,
|
||||
systemHelper.lastModifiedField,
|
||||
systemHelper.anchorField,
|
||||
systemHelper.segmentField,
|
||||
systemHelper.expiresField,
|
||||
systemHelper.clickCountField,
|
||||
systemHelper.favoriteCountField });
|
||||
if (solrDocument == null) {
|
||||
storeChildUrlsToQueue(urlQueue,
|
||||
getChildUrlSet(solrGroup, id)); // TODO
|
||||
return true;
|
||||
}
|
||||
|
||||
if (solrDocumentList.size() > 1) {
|
||||
// invalid state
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
deleteSolrDocumentList(solrDocumentList);
|
||||
return true;
|
||||
}
|
||||
|
||||
final SolrDocument solrDocument = solrDocumentList.get(0);
|
||||
final Date expires = (Date) solrDocument.get(expiresField);
|
||||
final Date expires = (Date) solrDocument
|
||||
.get(systemHelper.expiresField);
|
||||
if (expires != null
|
||||
&& expires.getTime() < System.currentTimeMillis()) {
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
final Object idValue = solrDocument
|
||||
.getFieldValue(systemHelper.idField);
|
||||
if (idValue != null) {
|
||||
indexingHelper.deleteDocument(solrGroup,
|
||||
idValue.toString());
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
final Date lastModified = (Date) solrDocument
|
||||
.get("lastModified");
|
||||
.get(systemHelper.lastModifiedField);
|
||||
if (lastModified == null) {
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -162,10 +161,8 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
if (clickCount != null) {
|
||||
final SearchLogHelper searchLogHelper = ComponentUtil
|
||||
.getSearchLogHelper();
|
||||
final int count = searchLogHelper.getClickCount(urlQueue
|
||||
.getUrl());
|
||||
final int count = searchLogHelper.getClickCount(url);
|
||||
if (count != clickCount.intValue()) {
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -175,23 +172,28 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
if (favoriteCount != null) {
|
||||
final SearchLogHelper searchLogHelper = ComponentUtil
|
||||
.getSearchLogHelper();
|
||||
final long count = searchLogHelper
|
||||
.getFavoriteCount(urlQueue.getUrl());
|
||||
final long count = searchLogHelper.getFavoriteCount(url);
|
||||
if (count != favoriteCount.longValue()) {
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (responseData == null) {
|
||||
// head method
|
||||
responseData = client.execute(RequestDataBuilder
|
||||
.newRequestData().head().url(url).build());
|
||||
if (responseData == null) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
final int httpStatusCode = responseData.getHttpStatusCode();
|
||||
if (httpStatusCode == 404) {
|
||||
deleteSolrDocument(id);
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
storeChildUrlsToQueue(urlQueue,
|
||||
getAnchorSet(solrDocument.get("anchor")));
|
||||
indexingHelper.deleteDocument(solrGroup, id);
|
||||
return false;
|
||||
} else if (responseData.getLastModified() == null) {
|
||||
deleteSolrDocumentList(oldDocWithRoleList);
|
||||
return true;
|
||||
} else if (responseData.getLastModified().getTime() <= lastModified
|
||||
.getTime() && httpStatusCode == 200) {
|
||||
|
@ -256,134 +258,28 @@ public class FessS2RobotThread extends S2RobotThread {
|
|||
return childUrlSet;
|
||||
}
|
||||
|
||||
protected SolrDocumentList getSolrDocumentList(final String id,
|
||||
final boolean wildcard, final String expiresField) {
|
||||
final SolrGroupManager solrGroupManager = ComponentUtil
|
||||
.getSolrGroupManager();
|
||||
protected Set<RequestData> getChildUrlSet(final SolrGroup solrGroup,
|
||||
final String id) {
|
||||
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
|
||||
final SolrGroup solrGroup = solrGroupManager
|
||||
.getSolrGroup(QueryType.ADD);
|
||||
final SolrQuery solrQuery = new SolrQuery();
|
||||
final StringBuilder queryBuf = new StringBuilder(200);
|
||||
if (wildcard) {
|
||||
queryBuf.append("{!prefix f=id}");
|
||||
} else {
|
||||
queryBuf.append("{!raw f=id}");
|
||||
final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
|
||||
final SolrDocumentList docList = indexingHelper
|
||||
.getChildSolrDocumentList(solrGroup, id,
|
||||
new String[] { systemHelper.urlField });
|
||||
if (docList.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
queryBuf.append(id);
|
||||
solrQuery.setQuery(queryBuf.toString());
|
||||
solrQuery.setFields("id", "lastModified", "anchor", "segment", "role",
|
||||
expiresField, systemHelper.clickCountField,
|
||||
systemHelper.favoriteCountField);
|
||||
for (int i = 0; i < maxSolrQueryRetryCount; i++) {
|
||||
try {
|
||||
final QueryResponse response = solrGroup.query(solrQuery);
|
||||
final SolrDocumentList docList = response.getResults();
|
||||
if (docList.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("Found solr documents: " + docList);
|
||||
}
|
||||
return docList;
|
||||
} catch (final Exception e) {
|
||||
logger.info("Could not get a response from Solr."
|
||||
+ " It might be busy. " + "Retrying.. id:" + id
|
||||
+ ", cause: " + e.getMessage());
|
||||
}
|
||||
try {
|
||||
Thread.sleep(500);
|
||||
} catch (final InterruptedException e) {
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("Found solr documents: " + docList);
|
||||
}
|
||||
final Set<RequestData> urlSet = new HashSet<>(docList.size());
|
||||
for (final SolrDocument doc : docList) {
|
||||
final Object obj = doc.get(systemHelper.urlField);
|
||||
if (obj != null) {
|
||||
urlSet.add(RequestDataBuilder.newRequestData().get()
|
||||
.url(obj.toString()).build());
|
||||
}
|
||||
}
|
||||
return null;
|
||||
return urlSet;
|
||||
}
|
||||
|
||||
protected Set<RequestData> getChildUrlSet(final String id) {
|
||||
final SolrGroupManager solrGroupManager = ComponentUtil
|
||||
.getSolrGroupManager();
|
||||
final SolrGroup solrGroup = solrGroupManager
|
||||
.getSolrGroup(QueryType.ADD);
|
||||
final SolrQuery solrQuery = new SolrQuery();
|
||||
solrQuery.setQuery("{!raw f=parentId v=\"" + id + "\"}");
|
||||
solrQuery.setFields("url");
|
||||
solrQuery.setRows(childUrlSize);
|
||||
for (int i = 0; i < maxSolrQueryRetryCount; i++) {
|
||||
try {
|
||||
final QueryResponse response = solrGroup.query(solrQuery);
|
||||
final SolrDocumentList docList = response.getResults();
|
||||
if (docList.isEmpty()) {
|
||||
return null;
|
||||
}
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("Found solr documents: " + docList);
|
||||
}
|
||||
final Set<RequestData> urlSet = new HashSet<>(docList.size());
|
||||
for (final SolrDocument doc : docList) {
|
||||
final Object obj = doc.get("url");
|
||||
if (obj != null) {
|
||||
urlSet.add(RequestDataBuilder.newRequestData().get()
|
||||
.url(obj.toString()).build());
|
||||
}
|
||||
}
|
||||
return urlSet;
|
||||
} catch (final Exception e) {
|
||||
logger.info("Could not get a response from Solr."
|
||||
+ " It might be busy. " + "Retrying.. id:" + id
|
||||
+ ", cause: " + e.getMessage());
|
||||
}
|
||||
try {
|
||||
Thread.sleep(500);
|
||||
} catch (final InterruptedException e) {
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
protected void deleteSolrDocument(final String id) {
|
||||
final SolrGroupManager solrGroupManager = ComponentUtil
|
||||
.getSolrGroupManager();
|
||||
final SolrGroup solrGroup = solrGroupManager
|
||||
.getSolrGroup(QueryType.DELETE);
|
||||
final String query = "{!raw f=parentId v=\"" + id + "\"}";
|
||||
for (int i = 0; i < maxSolrQueryRetryCount; i++) {
|
||||
boolean done = true;
|
||||
try {
|
||||
for (final UpdateResponse response : solrGroup
|
||||
.deleteByQuery(query)) {
|
||||
if (response.getStatus() != 200) {
|
||||
if (logger.isDebugEnabled()) {
|
||||
logger.debug("Failed to delete: " + response);
|
||||
}
|
||||
done = false;
|
||||
}
|
||||
}
|
||||
} catch (final Exception e) {
|
||||
logger.info("Could not delete a document from Solr."
|
||||
+ " It might be busy. " + "Retrying.. id:" + id
|
||||
+ ", cause: " + e.getMessage());
|
||||
done = false;
|
||||
}
|
||||
if (done) {
|
||||
logger.info("Deleted from Solr: " + id);
|
||||
break;
|
||||
}
|
||||
try {
|
||||
Thread.sleep(500);
|
||||
} catch (final InterruptedException e) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected void deleteSolrDocumentList(
|
||||
final SolrDocumentList solrDocumentList) {
|
||||
if (solrDocumentList != null) {
|
||||
for (final SolrDocument solrDocument : solrDocumentList) {
|
||||
final Object idObj = solrDocument.get("id");
|
||||
if (idObj != null) {
|
||||
deleteSolrDocument(idObj.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
package jp.sf.fess.solr;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -32,15 +31,11 @@ import jp.sf.fess.db.exbhv.ClickLogBhv;
|
|||
import jp.sf.fess.db.exbhv.FavoriteLogBhv;
|
||||
import jp.sf.fess.db.exbhv.pmbean.FavoriteUrlCountPmb;
|
||||
import jp.sf.fess.db.exentity.customize.FavoriteUrlCount;
|
||||
import jp.sf.fess.helper.IndexingHelper;
|
||||
import jp.sf.fess.helper.IntervalControlHelper;
|
||||
import jp.sf.fess.helper.SystemHelper;
|
||||
import jp.sf.fess.util.ComponentUtil;
|
||||
|
||||
import org.apache.commons.collections.CollectionUtils;
|
||||
import org.apache.solr.client.solrj.SolrQuery;
|
||||
import org.apache.solr.client.solrj.util.ClientUtils;
|
||||
import org.apache.solr.common.SolrDocument;
|
||||
import org.apache.solr.common.SolrDocumentList;
|
||||
import org.apache.solr.common.SolrInputDocument;
|
||||
import org.codelibs.core.util.StringUtil;
|
||||
import org.codelibs.robot.S2Robot;
|
||||
|
@ -95,8 +90,13 @@ public class IndexUpdater extends Thread {
|
|||
@Resource
|
||||
protected SystemHelper systemHelper;
|
||||
|
||||
@Resource
|
||||
protected IndexingHelper indexingHelper;
|
||||
|
||||
public int maxDocumentCacheSize = 5;
|
||||
|
||||
public int maxInvalidDocumentSize = 100;
|
||||
|
||||
protected boolean finishCrawling = false;
|
||||
|
||||
public long updateInterval = 60000; // 1 min
|
||||
|
@ -243,7 +243,7 @@ public class IndexUpdater extends Thread {
|
|||
}
|
||||
|
||||
if (!docList.isEmpty()) {
|
||||
sendDocuments(docList);
|
||||
indexingHelper.sendDocuments(solrGroup, docList);
|
||||
}
|
||||
|
||||
synchronized (finishedSessionIdList) {
|
||||
|
@ -395,14 +395,14 @@ public class IndexUpdater extends Thread {
|
|||
}
|
||||
|
||||
if (docList.size() >= maxDocumentCacheSize) {
|
||||
sendDocuments(docList);
|
||||
indexingHelper.sendDocuments(solrGroup, docList);
|
||||
}
|
||||
documentSize++;
|
||||
// commit
|
||||
if (commitPerCount > 0
|
||||
&& documentSize % commitPerCount == 0) {
|
||||
if (!docList.isEmpty()) {
|
||||
sendDocuments(docList);
|
||||
indexingHelper.sendDocuments(solrGroup, docList);
|
||||
}
|
||||
commitDocuments();
|
||||
}
|
||||
|
@ -563,17 +563,15 @@ public class IndexUpdater extends Thread {
|
|||
}
|
||||
}
|
||||
if (logger.isInfoEnabled()) {
|
||||
logger.info("The number of a crawled document is "
|
||||
+ arList.getAllRecordCount() + ". The processing size is "
|
||||
+ arList.size() + ". The execution time is "
|
||||
+ (System.currentTimeMillis() - execTime) + "ms.");
|
||||
logger.info("Processing " + arList.size() + "/"
|
||||
+ arList.getAllRecordCount() + " docs (DB: "
|
||||
+ (System.currentTimeMillis() - execTime) + "ms)");
|
||||
}
|
||||
if (arList.getAllRecordCount() > unprocessedDocumentSize) {
|
||||
if (logger.isInfoEnabled()) {
|
||||
logger.info("Stopped all crawler threads. " + " You have "
|
||||
+ arList.getAllRecordCount() + " (>"
|
||||
+ unprocessedDocumentSize + ") "
|
||||
+ " unprocessed documents.");
|
||||
+ unprocessedDocumentSize + ") " + " unprocessed docs.");
|
||||
}
|
||||
final IntervalControlHelper intervalControlHelper = ComponentUtil
|
||||
.getIntervalControlHelper();
|
||||
|
@ -620,71 +618,6 @@ public class IndexUpdater extends Thread {
|
|||
}
|
||||
}
|
||||
|
||||
@SuppressWarnings("unchecked")
|
||||
private void deleteDocuments(final List<SolrInputDocument> docList) {
|
||||
final List<String> ids = new ArrayList<String>();
|
||||
for (final SolrInputDocument inputDoc : docList) {
|
||||
final Collection<Object> roleList = inputDoc.getFieldValues("role");
|
||||
final StringBuilder query = new StringBuilder();
|
||||
query.append("url:\"");
|
||||
query.append(ClientUtils.escapeQueryChars((String) inputDoc
|
||||
.getFieldValue("url")));
|
||||
query.append("\"");
|
||||
|
||||
final SolrQuery sq = new SolrQuery();
|
||||
sq.setRows(1);
|
||||
sq.setFields(new String[] { "id", "role" });
|
||||
sq.setQuery(query.toString());
|
||||
final SolrDocumentList docs = solrGroup.query(sq).getResults();
|
||||
if (docs.size() > 0) {
|
||||
for (final SolrDocument doc : docs) {
|
||||
// checking changed roles
|
||||
final Collection<Object> docRoleList = doc
|
||||
.getFieldValues("role");
|
||||
|
||||
if (CollectionUtils.isEmpty(roleList)
|
||||
&& CollectionUtils.isEmpty(docRoleList)) {
|
||||
// neither have role
|
||||
continue;
|
||||
}
|
||||
if (CollectionUtils.isNotEmpty(roleList)
|
||||
&& CollectionUtils.isNotEmpty(docRoleList)) {
|
||||
final List<String> diff = (List<String>) CollectionUtils
|
||||
.disjunction(roleList, docRoleList);
|
||||
if (diff.size() == 0) {
|
||||
// has same role(s)
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// has different role(s)
|
||||
ids.add((String) doc.getFieldValue("id"));
|
||||
}
|
||||
}
|
||||
}
|
||||
if (ids.size() > 0) {
|
||||
synchronized (solrGroup) {
|
||||
solrGroup.deleteById(ids);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void sendDocuments(final List<SolrInputDocument> docList) {
|
||||
final long execTime = System.currentTimeMillis();
|
||||
if (logger.isInfoEnabled()) {
|
||||
logger.info("Sending " + docList.size() + " document to a server.");
|
||||
}
|
||||
synchronized (solrGroup) {
|
||||
deleteDocuments(docList);
|
||||
solrGroup.add(docList);
|
||||
}
|
||||
if (logger.isInfoEnabled()) {
|
||||
logger.info("Sent " + docList.size()
|
||||
+ " documents. The execution time is "
|
||||
+ (System.currentTimeMillis() - execTime) + "ms.");
|
||||
}
|
||||
docList.clear();
|
||||
}
|
||||
|
||||
private void forceStop() {
|
||||
systemHelper.setForceStop(true);
|
||||
for (final S2Robot s2Robot : s2RobotList) {
|
||||
|
|
|
@ -25,6 +25,7 @@ import jp.sf.fess.helper.DatabaseHelper;
|
|||
import jp.sf.fess.helper.DocumentHelper;
|
||||
import jp.sf.fess.helper.FileTypeHelper;
|
||||
import jp.sf.fess.helper.HotSearchWordHelper;
|
||||
import jp.sf.fess.helper.IndexingHelper;
|
||||
import jp.sf.fess.helper.IntervalControlHelper;
|
||||
import jp.sf.fess.helper.JobHelper;
|
||||
import jp.sf.fess.helper.KeyMatchHelper;
|
||||
|
@ -111,6 +112,8 @@ public final class ComponentUtil {
|
|||
|
||||
private static final String KEY_MATCH_HELPER = "keyMatchHelper";
|
||||
|
||||
private static final String INDEXING_HELPER = "indexingHelper";
|
||||
|
||||
private ComponentUtil() {
|
||||
}
|
||||
|
||||
|
@ -242,4 +245,8 @@ public final class ComponentUtil {
|
|||
public static KeyMatchHelper getKeyMatchHelper() {
|
||||
return SingletonS2Container.getComponent(KEY_MATCH_HELPER);
|
||||
}
|
||||
|
||||
public static IndexingHelper getIndexingHelper() {
|
||||
return SingletonS2Container.getComponent(INDEXING_HELPER);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -81,7 +81,7 @@
|
|||
"boost", "contentLength", "host", "site", "lastModified",
|
||||
"mimetype", "filetype_s", "created", "title", "digest", "url",
|
||||
"clickCount_l_x_dv", "favoriteCount_l_x_dv",
|
||||
"cid_s_s", "lang_s", "hasCache_s_s" }</property>
|
||||
"cid_s", "lang_s", "hasCache_s_s" }</property>
|
||||
<property name="responseDocValuesFields">new String[]{
|
||||
"clickCount_l_x_dv", "favoriteCount_l_x_dv"}</property>
|
||||
<property name="highlightingFields">new String[]{"digest", "cache" }</property>
|
||||
|
|
|
@ -10,6 +10,8 @@
|
|||
|
||||
<include path="s2robot_db.dicon" />
|
||||
|
||||
<component name="indexingHelper" class="jp.sf.fess.helper.IndexingHelper">
|
||||
</component>
|
||||
<component name="labelTypeHelper" class="jp.sf.fess.helper.LabelTypeHelper">
|
||||
</component>
|
||||
<component name="webFsIndexHelper" class="jp.sf.fess.helper.WebFsIndexHelper">
|
||||
|
|
Loading…
Add table
Reference in a new issue