diff --git a/src/main/java/jp/sf/fess/ds/impl/IndexUpdateCallbackImpl.java b/src/main/java/jp/sf/fess/ds/impl/IndexUpdateCallbackImpl.java index 9f84c39ab..53b1f3ac7 100644 --- a/src/main/java/jp/sf/fess/ds/impl/IndexUpdateCallbackImpl.java +++ b/src/main/java/jp/sf/fess/ds/impl/IndexUpdateCallbackImpl.java @@ -25,6 +25,7 @@ import jp.sf.fess.Constants; import jp.sf.fess.FessSystemException; import jp.sf.fess.ds.IndexUpdateCallback; import jp.sf.fess.helper.CrawlingSessionHelper; +import jp.sf.fess.helper.IndexingHelper; import jp.sf.fess.helper.SearchLogHelper; import jp.sf.fess.helper.SystemHelper; import jp.sf.fess.util.ComponentUtil; @@ -40,7 +41,7 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback { protected SolrGroup solrGroup; - public int maxDocumentCacheSize = 10; + public int maxDocumentCacheSize = 5; public boolean clickCountEnabled = true; @@ -71,6 +72,7 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback { throw new FessSystemException("url is null. dataMap=" + dataMap); } + final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper(); final CrawlingSessionHelper crawlingSessionHelper = ComponentUtil .getCrawlingSessionHelper(); dataMap.put("id", crawlingSessionHelper.generateId(dataMap)); @@ -85,13 +87,13 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback { } if (docList.size() >= maxDocumentCacheSize) { - sendDocuments(); + indexingHelper.sendDocuments(solrGroup, docList); } documentSize.getAndIncrement(); // commit if (commitPerCount > 0 && documentSize.get() % commitPerCount == 0) { if (!docList.isEmpty()) { - sendDocuments(); + indexingHelper.sendDocuments(solrGroup, docList); } commitDocuments(); } @@ -142,7 +144,9 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback { @Override public void commit() { if (!docList.isEmpty()) { - sendDocuments(); + final IndexingHelper indexingHelper = ComponentUtil + .getIndexingHelper(); + indexingHelper.sendDocuments(solrGroup, docList); } commitDocuments(); } @@ -161,22 +165,6 @@ public class IndexUpdateCallbackImpl implements IndexUpdateCallback { } } - protected void sendDocuments() { - final long execTime = System.currentTimeMillis(); - if (logger.isInfoEnabled()) { - logger.info("Sending " + docList.size() + " document to a server."); - } - synchronized (solrGroup) { - solrGroup.add(docList); - } - if (logger.isInfoEnabled()) { - logger.info("Sent " + docList.size() - + " documents. The execution time is " - + (System.currentTimeMillis() - execTime) + "ms."); - } - docList.clear(); - } - protected void addClickCountField(final SolrInputDocument doc, final String url) { final SearchLogHelper searchLogHelper = ComponentUtil diff --git a/src/main/java/jp/sf/fess/helper/IndexingHelper.java b/src/main/java/jp/sf/fess/helper/IndexingHelper.java new file mode 100644 index 000000000..85c9a7cdb --- /dev/null +++ b/src/main/java/jp/sf/fess/helper/IndexingHelper.java @@ -0,0 +1,253 @@ +/* + * Copyright 2009-2014 the CodeLibs Project and the Others. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, + * either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +package jp.sf.fess.helper; + +import java.util.ArrayList; +import java.util.List; + +import jp.sf.fess.util.ComponentUtil; + +import org.apache.solr.client.solrj.SolrQuery; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.client.solrj.response.UpdateResponse; +import org.apache.solr.client.solrj.util.ClientUtils; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrInputDocument; +import org.codelibs.solr.lib.SolrGroup; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class IndexingHelper { + private static final Logger logger = LoggerFactory + .getLogger(IndexingHelper.class); + + public int maxRetryCount = 5; + + public int defaultRowSize = 100; + + public long requestInterval = 500; + + public void sendDocuments(final SolrGroup solrGroup, + final List docList) { + final long execTime = System.currentTimeMillis(); + if (logger.isDebugEnabled()) { + logger.debug("Sending " + docList.size() + + " documents to a server."); + } + synchronized (solrGroup) { + deleteOldDocuments(solrGroup, docList); + solrGroup.add(docList); + } + if (logger.isInfoEnabled()) { + logger.info("Sent " + docList.size() + " docs (Solr: " + + (System.currentTimeMillis() - execTime) + "ms)"); + } + docList.clear(); + } + + private void deleteOldDocuments(final SolrGroup solrGroup, + final List docList) { + final SystemHelper systemHelper = ComponentUtil.getSystemHelper(); + + final List ids = new ArrayList(); + final StringBuilder q = new StringBuilder(1000); + final StringBuilder fq = new StringBuilder(100); + for (final SolrInputDocument inputDoc : docList) { + final Object idValue = inputDoc.getFieldValue(systemHelper.idField); + if (idValue == null) { + continue; + } + + final Object configIdValue = inputDoc + .getFieldValue(systemHelper.configIdField); + if (configIdValue == null) { + continue; + } + + q.setLength(0); + q.append(systemHelper.urlField).append(":\""); + q.append(ClientUtils.escapeQueryChars((String) inputDoc + .getFieldValue(systemHelper.urlField))); + q.append('"'); + + fq.setLength(0); + fq.append(systemHelper.configIdField).append(':'); + fq.append(configIdValue.toString()); + + final SolrDocumentList docs = getSolrDocumentList(solrGroup, + fq.toString(), q.toString(), + new String[] { systemHelper.idField }); + for (final SolrDocument doc : docs) { + final Object oldIdValue = doc + .getFieldValue(systemHelper.idField); + if (!idValue.equals(oldIdValue) && oldIdValue != null) { + ids.add(oldIdValue.toString()); + } + } + if (logger.isDebugEnabled()) { + logger.debug(q + " in " + fq + " => " + docs); + } + } + if (!ids.isEmpty()) { + for (final String id : ids) { + deleteDocument(solrGroup, id); + } + } + } + + public SolrDocumentList getSolrDocumentList(final SolrGroup solrGroup, + final String fq, final String q, final String[] fields) { + return getSolrDocumentList(solrGroup, fq, q, fields, defaultRowSize); + } + + protected SolrDocumentList getSolrDocumentList(final SolrGroup solrGroup, + final String fq, final String q, final String[] fields, + final int row) { + final SolrQuery sq = new SolrQuery(); + if (fq != null) { + sq.setFilterQueries(fq); + } + sq.setQuery(q); + if (fields != null) { + sq.setFields(fields); + } + sq.setRows(row); + final SolrDocumentList docList = solrGroup.query(sq).getResults(); + if (docList.getNumFound() < row) { + return docList; + } + return getSolrDocumentList(solrGroup, fq, q, fields, + (int) docList.getNumFound()); + } + + public void deleteDocument(final SolrGroup solrGroup, final String id) { + final String query = "{!raw f=id}" + id; + for (int i = 0; i < maxRetryCount; i++) { + boolean done = true; + try { + for (final UpdateResponse response : solrGroup + .deleteByQuery(query)) { + if (response.getStatus() != 200) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to delete: " + response); + } + done = false; + } + } + } catch (final Exception e) { + logger.warn("Could not delete a document from Solr." + + " It might be busy. " + "Retrying.. id:" + id + + ", cause: " + e.getMessage()); + done = false; + } + if (done) { + logger.info("Deleted from Solr: " + id); + break; + } + try { + Thread.sleep(requestInterval); + } catch (final InterruptedException e) { + } + } + } + + public SolrDocument getSolrDocument(final SolrGroup solrGroup, + final String id, final String[] fields) { + final SolrQuery solrQuery = new SolrQuery(); + final StringBuilder queryBuf = new StringBuilder(200); + queryBuf.append("{!raw f=id}"); + queryBuf.append(id); + solrQuery.setQuery(queryBuf.toString()); + if (fields != null) { + solrQuery.setFields(fields); + } + final QueryResponse response = solrGroup.query(solrQuery); + final SolrDocumentList docList = response.getResults(); + if (docList.isEmpty()) { + return null; + } + if (docList.size() > 1) { + logger.error("Invalid multiple docs for " + id); + final SystemHelper systemHelper = ComponentUtil.getSystemHelper(); + for (final SolrDocument doc : docList) { + final Object idValue = doc.getFieldValue(systemHelper.idField); + if (idValue != null) { + deleteDocument(solrGroup, idValue.toString()); + } + } + return null; + } + return docList.get(0); + } + + public SolrDocumentList getSolrDocumentListByPrefixId( + final SolrGroup solrGroup, final String id, final String[] fields) { + final SolrQuery solrQuery = new SolrQuery(); + final StringBuilder queryBuf = new StringBuilder(200); + queryBuf.append("{!prefix f=id}"); + queryBuf.append(id); + solrQuery.setQuery(queryBuf.toString()); + if (fields != null) { + solrQuery.setFields(fields); + } + final QueryResponse response = solrGroup.query(solrQuery); + final SolrDocumentList docList = response.getResults(); + if (docList.isEmpty()) { + return null; + } + if (logger.isDebugEnabled()) { + logger.debug("Found solr documents: " + docList); + } + return docList; + } + + public void deleteChildSolrDocument(final SolrGroup solrGroup, + final String id) { + final String query = "{!raw f=parentId v=\"" + id + "\"}"; + for (final UpdateResponse response : solrGroup.deleteByQuery(query)) { + if (response.getStatus() != 200) { + if (logger.isDebugEnabled()) { + logger.debug("Failed to delete: " + response); + } + } + } + } + + public SolrDocumentList getChildSolrDocumentList(final SolrGroup solrGroup, + final String id, final String[] fields) { + return getChildSolrDocumentList(solrGroup, id, fields, defaultRowSize); + } + + protected SolrDocumentList getChildSolrDocumentList( + final SolrGroup solrGroup, final String id, final String[] fields, + final int row) { + final SolrQuery solrQuery = new SolrQuery(); + solrQuery.setQuery("{!raw f=parentId v=\"" + id + "\"}"); + if (fields != null) { + solrQuery.setFields(fields); + } + solrQuery.setRows(row); + final SolrDocumentList docList = solrGroup.query(solrQuery) + .getResults(); + if (docList.getNumFound() < row) { + return docList; + } + return getChildSolrDocumentList(solrGroup, id, fields, + (int) docList.getNumFound()); + } +} \ No newline at end of file diff --git a/src/main/java/jp/sf/fess/helper/QueryHelper.java b/src/main/java/jp/sf/fess/helper/QueryHelper.java index a17827dd8..3a085b201 100644 --- a/src/main/java/jp/sf/fess/helper/QueryHelper.java +++ b/src/main/java/jp/sf/fess/helper/QueryHelper.java @@ -107,13 +107,13 @@ public class QueryHelper implements Serializable { protected String[] responseFields = new String[] { "id", "docId", "score", "boost", "contentLength", "host", "site", "lastModified", "mimetype", "filetype_s", "created", TITLE_FIELD, "digest", "url", - "clickCount_l_x_dv", "favoriteCount_l_x_dv", "cid_s_s", "lang_s", + "clickCount_l_x_dv", "favoriteCount_l_x_dv", "cid_s", "lang_s", "hasCache_s_s" }; protected String[] cacheResponseFields = new String[] { "id", "docId", "score", "boost", "contentLength", "host", "site", "lastModified", "mimetype", "filetype_s", "created", TITLE_FIELD, "digest", "url", - "clickCount_l_x_dv", "favoriteCount_l_x_dv", "cid_s_s", "lang_s", + "clickCount_l_x_dv", "favoriteCount_l_x_dv", "cid_s", "lang_s", "cache" }; protected String[] responseDocValuesFields = new String[] { diff --git a/src/main/java/jp/sf/fess/helper/SystemHelper.java b/src/main/java/jp/sf/fess/helper/SystemHelper.java index 63a088fb7..77f71cc8c 100644 --- a/src/main/java/jp/sf/fess/helper/SystemHelper.java +++ b/src/main/java/jp/sf/fess/helper/SystemHelper.java @@ -122,7 +122,7 @@ public class SystemHelper implements Serializable { public String clickCountField = "clickCount_l_x_dv"; - public String configIdField = "cid_s_s"; + public String configIdField = "cid_s"; public String expiresField = "expires_dt"; @@ -136,6 +136,14 @@ public class SystemHelper implements Serializable { public String hasCacheField = "hasCache_s_s"; + public String lastModifiedField = "lastModified"; + + public String anchorField = "anchor"; + + public String segmentField = "segment"; + + public String roleField = "role"; + protected String[] supportedLanguages = new String[] { "ar", "bg", "ca", "da", "de", "el", "en", "es", "eu", "fa", "fi", "fr", "ga", "gl", "hi", "hu", "hy", "id", "it", "ja", "lv", "ko", "nl", "no", "pt", diff --git a/src/main/java/jp/sf/fess/robot/FessS2RobotThread.java b/src/main/java/jp/sf/fess/robot/FessS2RobotThread.java index 6835c36db..3e354d1b7 100644 --- a/src/main/java/jp/sf/fess/robot/FessS2RobotThread.java +++ b/src/main/java/jp/sf/fess/robot/FessS2RobotThread.java @@ -31,15 +31,13 @@ import jp.sf.fess.Constants; import jp.sf.fess.db.exentity.CrawlingConfig; import jp.sf.fess.helper.CrawlingConfigHelper; import jp.sf.fess.helper.CrawlingSessionHelper; +import jp.sf.fess.helper.IndexingHelper; import jp.sf.fess.helper.SambaHelper; import jp.sf.fess.helper.SearchLogHelper; import jp.sf.fess.helper.SystemHelper; import jp.sf.fess.util.ComponentUtil; import org.apache.commons.io.IOUtils; -import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.response.QueryResponse; -import org.apache.solr.client.solrj.response.UpdateResponse; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; import org.codelibs.core.util.DynamicProperties; @@ -61,10 +59,6 @@ public class FessS2RobotThread extends S2RobotThread { private static final Logger logger = LoggerFactory .getLogger(FessS2RobotThread.class); - public int maxSolrQueryRetryCount = 5; - - public int childUrlSize = 10000; - @Override protected boolean isContentUpdated(final S2RobotClient client, final UrlQueue urlQueue) { @@ -82,34 +76,35 @@ public class FessS2RobotThread extends S2RobotThread { .getCrawlingSessionHelper(); final SystemHelper systemHelper = ComponentUtil.getSystemHelper(); final SambaHelper sambaHelper = ComponentUtil.getSambaHelper(); + final IndexingHelper indexingHelper = ComponentUtil + .getIndexingHelper(); + final SolrGroupManager solrGroupManager = ComponentUtil + .getSolrGroupManager(); final boolean useAclAsRole = crawlerProperties.getProperty( Constants.USE_ACL_AS_ROLE, Constants.FALSE).equals( Constants.TRUE); - final String expiresField = systemHelper.expiresField; + final SolrGroup solrGroup = solrGroupManager + .getSolrGroup(QueryType.ADD); + + final String url = urlQueue.getUrl(); ResponseData responseData = null; try { - // head method - responseData = client - .execute(RequestDataBuilder.newRequestData().head() - .url(urlQueue.getUrl()).build()); - if (responseData == null) { - return true; - } - - SolrDocumentList oldDocWithRoleList = null; final CrawlingConfig crawlingConfig = crawlingConfigHelper .get(robotContext.getSessionId()); final Map dataMap = new HashMap(); - dataMap.put("url", urlQueue.getUrl()); + dataMap.put(systemHelper.urlField, url); final List roleTypeList = new ArrayList(); for (final String roleType : crawlingConfig.getRoleTypeValues()) { roleTypeList.add(roleType); } - if (useAclAsRole && responseData.getUrl().startsWith("smb://")) { - final String id = crawlingSessionHelper.generateId(dataMap); - oldDocWithRoleList = getSolrDocumentList(id, true, - expiresField); + if (useAclAsRole && url.startsWith("smb://")) { + // head method + responseData = client.execute(RequestDataBuilder + .newRequestData().head().url(url).build()); + if (responseData == null) { + return true; + } final ACE[] aces = (ACE[]) responseData.getMetaDataMap() .get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES); @@ -124,36 +119,40 @@ public class FessS2RobotThread extends S2RobotThread { } } } - dataMap.put("role", roleTypeList); + dataMap.put(systemHelper.roleField, roleTypeList); final String id = crawlingSessionHelper.generateId(dataMap); - final SolrDocumentList solrDocumentList = getSolrDocumentList( - id, false, expiresField); - if (solrDocumentList == null) { - deleteSolrDocumentList(oldDocWithRoleList); - storeChildUrlsToQueue(urlQueue, getChildUrlSet(id)); + final SolrDocument solrDocument = indexingHelper + .getSolrDocument(solrGroup, id, new String[] { + systemHelper.idField, + systemHelper.lastModifiedField, + systemHelper.anchorField, + systemHelper.segmentField, + systemHelper.expiresField, + systemHelper.clickCountField, + systemHelper.favoriteCountField }); + if (solrDocument == null) { + storeChildUrlsToQueue(urlQueue, + getChildUrlSet(solrGroup, id)); // TODO return true; } - if (solrDocumentList.size() > 1) { - // invalid state - deleteSolrDocumentList(oldDocWithRoleList); - deleteSolrDocumentList(solrDocumentList); - return true; - } - - final SolrDocument solrDocument = solrDocumentList.get(0); - final Date expires = (Date) solrDocument.get(expiresField); + final Date expires = (Date) solrDocument + .get(systemHelper.expiresField); if (expires != null && expires.getTime() < System.currentTimeMillis()) { - deleteSolrDocumentList(oldDocWithRoleList); + final Object idValue = solrDocument + .getFieldValue(systemHelper.idField); + if (idValue != null) { + indexingHelper.deleteDocument(solrGroup, + idValue.toString()); + } return true; } final Date lastModified = (Date) solrDocument - .get("lastModified"); + .get(systemHelper.lastModifiedField); if (lastModified == null) { - deleteSolrDocumentList(oldDocWithRoleList); return true; } @@ -162,10 +161,8 @@ public class FessS2RobotThread extends S2RobotThread { if (clickCount != null) { final SearchLogHelper searchLogHelper = ComponentUtil .getSearchLogHelper(); - final int count = searchLogHelper.getClickCount(urlQueue - .getUrl()); + final int count = searchLogHelper.getClickCount(url); if (count != clickCount.intValue()) { - deleteSolrDocumentList(oldDocWithRoleList); return true; } } @@ -175,23 +172,28 @@ public class FessS2RobotThread extends S2RobotThread { if (favoriteCount != null) { final SearchLogHelper searchLogHelper = ComponentUtil .getSearchLogHelper(); - final long count = searchLogHelper - .getFavoriteCount(urlQueue.getUrl()); + final long count = searchLogHelper.getFavoriteCount(url); if (count != favoriteCount.longValue()) { - deleteSolrDocumentList(oldDocWithRoleList); + return true; + } + } + + if (responseData == null) { + // head method + responseData = client.execute(RequestDataBuilder + .newRequestData().head().url(url).build()); + if (responseData == null) { return true; } } final int httpStatusCode = responseData.getHttpStatusCode(); if (httpStatusCode == 404) { - deleteSolrDocument(id); - deleteSolrDocumentList(oldDocWithRoleList); storeChildUrlsToQueue(urlQueue, getAnchorSet(solrDocument.get("anchor"))); + indexingHelper.deleteDocument(solrGroup, id); return false; } else if (responseData.getLastModified() == null) { - deleteSolrDocumentList(oldDocWithRoleList); return true; } else if (responseData.getLastModified().getTime() <= lastModified .getTime() && httpStatusCode == 200) { @@ -256,134 +258,28 @@ public class FessS2RobotThread extends S2RobotThread { return childUrlSet; } - protected SolrDocumentList getSolrDocumentList(final String id, - final boolean wildcard, final String expiresField) { - final SolrGroupManager solrGroupManager = ComponentUtil - .getSolrGroupManager(); + protected Set getChildUrlSet(final SolrGroup solrGroup, + final String id) { final SystemHelper systemHelper = ComponentUtil.getSystemHelper(); - final SolrGroup solrGroup = solrGroupManager - .getSolrGroup(QueryType.ADD); - final SolrQuery solrQuery = new SolrQuery(); - final StringBuilder queryBuf = new StringBuilder(200); - if (wildcard) { - queryBuf.append("{!prefix f=id}"); - } else { - queryBuf.append("{!raw f=id}"); + final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper(); + final SolrDocumentList docList = indexingHelper + .getChildSolrDocumentList(solrGroup, id, + new String[] { systemHelper.urlField }); + if (docList.isEmpty()) { + return null; } - queryBuf.append(id); - solrQuery.setQuery(queryBuf.toString()); - solrQuery.setFields("id", "lastModified", "anchor", "segment", "role", - expiresField, systemHelper.clickCountField, - systemHelper.favoriteCountField); - for (int i = 0; i < maxSolrQueryRetryCount; i++) { - try { - final QueryResponse response = solrGroup.query(solrQuery); - final SolrDocumentList docList = response.getResults(); - if (docList.isEmpty()) { - return null; - } - if (logger.isDebugEnabled()) { - logger.debug("Found solr documents: " + docList); - } - return docList; - } catch (final Exception e) { - logger.info("Could not get a response from Solr." - + " It might be busy. " + "Retrying.. id:" + id - + ", cause: " + e.getMessage()); - } - try { - Thread.sleep(500); - } catch (final InterruptedException e) { + if (logger.isDebugEnabled()) { + logger.debug("Found solr documents: " + docList); + } + final Set urlSet = new HashSet<>(docList.size()); + for (final SolrDocument doc : docList) { + final Object obj = doc.get(systemHelper.urlField); + if (obj != null) { + urlSet.add(RequestDataBuilder.newRequestData().get() + .url(obj.toString()).build()); } } - return null; + return urlSet; } - protected Set getChildUrlSet(final String id) { - final SolrGroupManager solrGroupManager = ComponentUtil - .getSolrGroupManager(); - final SolrGroup solrGroup = solrGroupManager - .getSolrGroup(QueryType.ADD); - final SolrQuery solrQuery = new SolrQuery(); - solrQuery.setQuery("{!raw f=parentId v=\"" + id + "\"}"); - solrQuery.setFields("url"); - solrQuery.setRows(childUrlSize); - for (int i = 0; i < maxSolrQueryRetryCount; i++) { - try { - final QueryResponse response = solrGroup.query(solrQuery); - final SolrDocumentList docList = response.getResults(); - if (docList.isEmpty()) { - return null; - } - if (logger.isDebugEnabled()) { - logger.debug("Found solr documents: " + docList); - } - final Set urlSet = new HashSet<>(docList.size()); - for (final SolrDocument doc : docList) { - final Object obj = doc.get("url"); - if (obj != null) { - urlSet.add(RequestDataBuilder.newRequestData().get() - .url(obj.toString()).build()); - } - } - return urlSet; - } catch (final Exception e) { - logger.info("Could not get a response from Solr." - + " It might be busy. " + "Retrying.. id:" + id - + ", cause: " + e.getMessage()); - } - try { - Thread.sleep(500); - } catch (final InterruptedException e) { - } - } - return null; - } - - protected void deleteSolrDocument(final String id) { - final SolrGroupManager solrGroupManager = ComponentUtil - .getSolrGroupManager(); - final SolrGroup solrGroup = solrGroupManager - .getSolrGroup(QueryType.DELETE); - final String query = "{!raw f=parentId v=\"" + id + "\"}"; - for (int i = 0; i < maxSolrQueryRetryCount; i++) { - boolean done = true; - try { - for (final UpdateResponse response : solrGroup - .deleteByQuery(query)) { - if (response.getStatus() != 200) { - if (logger.isDebugEnabled()) { - logger.debug("Failed to delete: " + response); - } - done = false; - } - } - } catch (final Exception e) { - logger.info("Could not delete a document from Solr." - + " It might be busy. " + "Retrying.. id:" + id - + ", cause: " + e.getMessage()); - done = false; - } - if (done) { - logger.info("Deleted from Solr: " + id); - break; - } - try { - Thread.sleep(500); - } catch (final InterruptedException e) { - } - } - } - - protected void deleteSolrDocumentList( - final SolrDocumentList solrDocumentList) { - if (solrDocumentList != null) { - for (final SolrDocument solrDocument : solrDocumentList) { - final Object idObj = solrDocument.get("id"); - if (idObj != null) { - deleteSolrDocument(idObj.toString()); - } - } - } - } } diff --git a/src/main/java/jp/sf/fess/solr/IndexUpdater.java b/src/main/java/jp/sf/fess/solr/IndexUpdater.java index e2b76df5b..034369b38 100644 --- a/src/main/java/jp/sf/fess/solr/IndexUpdater.java +++ b/src/main/java/jp/sf/fess/solr/IndexUpdater.java @@ -17,7 +17,6 @@ package jp.sf.fess.solr; import java.util.ArrayList; -import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -32,15 +31,11 @@ import jp.sf.fess.db.exbhv.ClickLogBhv; import jp.sf.fess.db.exbhv.FavoriteLogBhv; import jp.sf.fess.db.exbhv.pmbean.FavoriteUrlCountPmb; import jp.sf.fess.db.exentity.customize.FavoriteUrlCount; +import jp.sf.fess.helper.IndexingHelper; import jp.sf.fess.helper.IntervalControlHelper; import jp.sf.fess.helper.SystemHelper; import jp.sf.fess.util.ComponentUtil; -import org.apache.commons.collections.CollectionUtils; -import org.apache.solr.client.solrj.SolrQuery; -import org.apache.solr.client.solrj.util.ClientUtils; -import org.apache.solr.common.SolrDocument; -import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrInputDocument; import org.codelibs.core.util.StringUtil; import org.codelibs.robot.S2Robot; @@ -95,8 +90,13 @@ public class IndexUpdater extends Thread { @Resource protected SystemHelper systemHelper; + @Resource + protected IndexingHelper indexingHelper; + public int maxDocumentCacheSize = 5; + public int maxInvalidDocumentSize = 100; + protected boolean finishCrawling = false; public long updateInterval = 60000; // 1 min @@ -243,7 +243,7 @@ public class IndexUpdater extends Thread { } if (!docList.isEmpty()) { - sendDocuments(docList); + indexingHelper.sendDocuments(solrGroup, docList); } synchronized (finishedSessionIdList) { @@ -395,14 +395,14 @@ public class IndexUpdater extends Thread { } if (docList.size() >= maxDocumentCacheSize) { - sendDocuments(docList); + indexingHelper.sendDocuments(solrGroup, docList); } documentSize++; // commit if (commitPerCount > 0 && documentSize % commitPerCount == 0) { if (!docList.isEmpty()) { - sendDocuments(docList); + indexingHelper.sendDocuments(solrGroup, docList); } commitDocuments(); } @@ -563,17 +563,15 @@ public class IndexUpdater extends Thread { } } if (logger.isInfoEnabled()) { - logger.info("The number of a crawled document is " - + arList.getAllRecordCount() + ". The processing size is " - + arList.size() + ". The execution time is " - + (System.currentTimeMillis() - execTime) + "ms."); + logger.info("Processing " + arList.size() + "/" + + arList.getAllRecordCount() + " docs (DB: " + + (System.currentTimeMillis() - execTime) + "ms)"); } if (arList.getAllRecordCount() > unprocessedDocumentSize) { if (logger.isInfoEnabled()) { logger.info("Stopped all crawler threads. " + " You have " + arList.getAllRecordCount() + " (>" - + unprocessedDocumentSize + ") " - + " unprocessed documents."); + + unprocessedDocumentSize + ") " + " unprocessed docs."); } final IntervalControlHelper intervalControlHelper = ComponentUtil .getIntervalControlHelper(); @@ -620,71 +618,6 @@ public class IndexUpdater extends Thread { } } - @SuppressWarnings("unchecked") - private void deleteDocuments(final List docList) { - final List ids = new ArrayList(); - for (final SolrInputDocument inputDoc : docList) { - final Collection roleList = inputDoc.getFieldValues("role"); - final StringBuilder query = new StringBuilder(); - query.append("url:\""); - query.append(ClientUtils.escapeQueryChars((String) inputDoc - .getFieldValue("url"))); - query.append("\""); - - final SolrQuery sq = new SolrQuery(); - sq.setRows(1); - sq.setFields(new String[] { "id", "role" }); - sq.setQuery(query.toString()); - final SolrDocumentList docs = solrGroup.query(sq).getResults(); - if (docs.size() > 0) { - for (final SolrDocument doc : docs) { - // checking changed roles - final Collection docRoleList = doc - .getFieldValues("role"); - - if (CollectionUtils.isEmpty(roleList) - && CollectionUtils.isEmpty(docRoleList)) { - // neither have role - continue; - } - if (CollectionUtils.isNotEmpty(roleList) - && CollectionUtils.isNotEmpty(docRoleList)) { - final List diff = (List) CollectionUtils - .disjunction(roleList, docRoleList); - if (diff.size() == 0) { - // has same role(s) - continue; - } - } - // has different role(s) - ids.add((String) doc.getFieldValue("id")); - } - } - } - if (ids.size() > 0) { - synchronized (solrGroup) { - solrGroup.deleteById(ids); - } - } - } - - private void sendDocuments(final List docList) { - final long execTime = System.currentTimeMillis(); - if (logger.isInfoEnabled()) { - logger.info("Sending " + docList.size() + " document to a server."); - } - synchronized (solrGroup) { - deleteDocuments(docList); - solrGroup.add(docList); - } - if (logger.isInfoEnabled()) { - logger.info("Sent " + docList.size() - + " documents. The execution time is " - + (System.currentTimeMillis() - execTime) + "ms."); - } - docList.clear(); - } - private void forceStop() { systemHelper.setForceStop(true); for (final S2Robot s2Robot : s2RobotList) { diff --git a/src/main/java/jp/sf/fess/util/ComponentUtil.java b/src/main/java/jp/sf/fess/util/ComponentUtil.java index 49899cef9..202992360 100644 --- a/src/main/java/jp/sf/fess/util/ComponentUtil.java +++ b/src/main/java/jp/sf/fess/util/ComponentUtil.java @@ -25,6 +25,7 @@ import jp.sf.fess.helper.DatabaseHelper; import jp.sf.fess.helper.DocumentHelper; import jp.sf.fess.helper.FileTypeHelper; import jp.sf.fess.helper.HotSearchWordHelper; +import jp.sf.fess.helper.IndexingHelper; import jp.sf.fess.helper.IntervalControlHelper; import jp.sf.fess.helper.JobHelper; import jp.sf.fess.helper.KeyMatchHelper; @@ -111,6 +112,8 @@ public final class ComponentUtil { private static final String KEY_MATCH_HELPER = "keyMatchHelper"; + private static final String INDEXING_HELPER = "indexingHelper"; + private ComponentUtil() { } @@ -242,4 +245,8 @@ public final class ComponentUtil { public static KeyMatchHelper getKeyMatchHelper() { return SingletonS2Container.getComponent(KEY_MATCH_HELPER); } + + public static IndexingHelper getIndexingHelper() { + return SingletonS2Container.getComponent(INDEXING_HELPER); + } } diff --git a/src/main/resources/app.dicon b/src/main/resources/app.dicon index 071e40d36..79ef8d107 100644 --- a/src/main/resources/app.dicon +++ b/src/main/resources/app.dicon @@ -81,7 +81,7 @@ "boost", "contentLength", "host", "site", "lastModified", "mimetype", "filetype_s", "created", "title", "digest", "url", "clickCount_l_x_dv", "favoriteCount_l_x_dv", - "cid_s_s", "lang_s", "hasCache_s_s" } + "cid_s", "lang_s", "hasCache_s_s" } new String[]{ "clickCount_l_x_dv", "favoriteCount_l_x_dv"} new String[]{"digest", "cache" } diff --git a/src/main/webapp/WEB-INF/cmd/resources/app.dicon b/src/main/webapp/WEB-INF/cmd/resources/app.dicon index e9ec09acc..9801c481d 100644 --- a/src/main/webapp/WEB-INF/cmd/resources/app.dicon +++ b/src/main/webapp/WEB-INF/cmd/resources/app.dicon @@ -10,6 +10,8 @@ + +