IndexingHelper.java 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267
/*
 * Copyright 2012-2022 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 * either express or implied. See the License for the specific language
 * governing permissions and limitations under the License.
 */
  16. package org.codelibs.fess.helper;
  17. import java.util.ArrayList;
  18. import java.util.List;
  19. import java.util.Map;
  20. import org.apache.logging.log4j.LogManager;
  21. import org.apache.logging.log4j.Logger;
  22. import org.codelibs.fess.es.client.SearchEngineClient;
  23. import org.codelibs.fess.mylasta.direction.FessConfig;
  24. import org.codelibs.fess.thumbnail.ThumbnailManager;
  25. import org.codelibs.fess.util.ComponentUtil;
  26. import org.codelibs.fess.util.DocList;
  27. import org.codelibs.fess.util.MemoryUtil;
  28. import org.opensearch.action.search.SearchResponse;
  29. import org.opensearch.index.query.QueryBuilder;
  30. import org.opensearch.index.query.QueryBuilders;
  31. public class IndexingHelper {
  32. private static final Logger logger = LogManager.getLogger(IndexingHelper.class);
  33. protected int maxRetryCount = 5;
  34. protected int defaultRowSize = 100;
  35. protected long requestInterval = 500;
  36. public void sendDocuments(final SearchEngineClient searchEngineClient, final DocList docList) {
  37. if (docList.isEmpty()) {
  38. return;
  39. }
  40. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  41. final long execTime = System.currentTimeMillis();
  42. if (logger.isDebugEnabled()) {
  43. logger.debug("Sending {} documents to a server.", docList.size());
  44. }
  45. try {
  46. if (fessConfig.isThumbnailCrawlerEnabled()) {
  47. final ThumbnailManager thumbnailManager = ComponentUtil.getThumbnailManager();
  48. docList.stream().forEach(doc -> {
  49. if (!thumbnailManager.offer(doc)) {
  50. if (logger.isDebugEnabled()) {
  51. logger.debug("Removing {} from {}", doc.get(fessConfig.getIndexFieldThumbnail()),
  52. doc.get(fessConfig.getIndexFieldUrl()));
  53. }
  54. doc.remove(fessConfig.getIndexFieldThumbnail());
  55. }
  56. });
  57. }
  58. final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
  59. synchronized (searchEngineClient) {
  60. deleteOldDocuments(searchEngineClient, docList);
  61. searchEngineClient.addAll(fessConfig.getIndexDocumentUpdateIndex(), docList, (doc, builder) -> {
  62. final String configId = (String) doc.get(fessConfig.getIndexFieldConfigId());
  63. crawlingConfigHelper.getPipeline(configId).ifPresent(s -> builder.setPipeline(s));
  64. });
  65. }
  66. if (logger.isInfoEnabled()) {
  67. if (docList.getContentSize() > 0) {
  68. logger.info("Sent {} docs (Doc:{process {}ms, send {}ms, size {}}, {})", docList.size(), docList.getProcessingTime(),
  69. (System.currentTimeMillis() - execTime), MemoryUtil.byteCountToDisplaySize(docList.getContentSize()),
  70. MemoryUtil.getMemoryUsageLog());
  71. } else {
  72. logger.info("Sent {} docs (Doc:{send {}ms}, {})", docList.size(), (System.currentTimeMillis() - execTime),
  73. MemoryUtil.getMemoryUsageLog());
  74. }
  75. }
  76. } finally {
  77. docList.clear();
  78. }
  79. }
  80. private void deleteOldDocuments(final SearchEngineClient searchEngineClient, final DocList docList) {
  81. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  82. final List<String> docIdList = new ArrayList<>();
  83. for (final Map<String, Object> inputDoc : docList) {
  84. final Object idValue = inputDoc.get(fessConfig.getIndexFieldId());
  85. if (idValue == null) {
  86. continue;
  87. }
  88. final Object configIdValue = inputDoc.get(fessConfig.getIndexFieldConfigId());
  89. if (configIdValue == null) {
  90. continue;
  91. }
  92. final QueryBuilder queryBuilder = QueryBuilders.boolQuery()
  93. .must(QueryBuilders.termQuery(fessConfig.getIndexFieldUrl(), inputDoc.get(fessConfig.getIndexFieldUrl())))
  94. .filter(QueryBuilders.termQuery(fessConfig.getIndexFieldConfigId(), configIdValue));
  95. final List<Map<String, Object>> docs = getDocumentListByQuery(searchEngineClient, queryBuilder,
  96. new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldDocId() });
  97. for (final Map<String, Object> doc : docs) {
  98. final Object oldIdValue = doc.get(fessConfig.getIndexFieldId());
  99. if (!idValue.equals(oldIdValue) && oldIdValue != null) {
  100. final Object oldDocIdValue = doc.get(fessConfig.getIndexFieldDocId());
  101. if (oldDocIdValue != null) {
  102. docIdList.add(oldDocIdValue.toString());
  103. }
  104. }
  105. }
  106. if (logger.isDebugEnabled()) {
  107. logger.debug("{} => {}", queryBuilder, docs);
  108. }
  109. }
  110. if (!docIdList.isEmpty()) {
  111. searchEngineClient.deleteByQuery(fessConfig.getIndexDocumentUpdateIndex(),
  112. QueryBuilders.idsQuery().addIds(docIdList.stream().toArray(n -> new String[n])));
  113. }
  114. }
  115. public boolean updateDocument(final SearchEngineClient searchEngineClient, final String id, final String field, final Object value) {
  116. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  117. return searchEngineClient.update(fessConfig.getIndexDocumentUpdateIndex(), id, field, value);
  118. }
  119. public boolean deleteDocument(final SearchEngineClient searchEngineClient, final String id) {
  120. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  121. return searchEngineClient.delete(fessConfig.getIndexDocumentUpdateIndex(), id);
  122. }
  123. public long deleteDocumentByUrl(final SearchEngineClient searchEngineClient, final String url) {
  124. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  125. return searchEngineClient.deleteByQuery(fessConfig.getIndexDocumentUpdateIndex(),
  126. QueryBuilders.termQuery(fessConfig.getIndexFieldUrl(), url));
  127. }
  128. public long deleteDocumentsByDocId(final SearchEngineClient searchEngineClient, final List<String> docIdList) {
  129. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  130. return searchEngineClient.deleteByQuery(fessConfig.getIndexDocumentUpdateIndex(),
  131. QueryBuilders.idsQuery().addIds(docIdList.stream().toArray(n -> new String[n])));
  132. }
  133. public long deleteDocumentByQuery(final SearchEngineClient searchEngineClient, final QueryBuilder queryBuilder) {
  134. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  135. return searchEngineClient.deleteByQuery(fessConfig.getIndexDocumentUpdateIndex(), queryBuilder);
  136. }
  137. public Map<String, Object> getDocument(final SearchEngineClient searchEngineClient, final String id, final String[] fields) {
  138. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  139. return searchEngineClient.getDocument(fessConfig.getIndexDocumentUpdateIndex(), builder -> {
  140. builder.setQuery(QueryBuilders.idsQuery().addIds(id));
  141. builder.setFetchSource(fields, null);
  142. return true;
  143. }).orElse(null);
  144. }
  145. public List<Map<String, Object>> getDocumentListByPrefixId(final SearchEngineClient searchEngineClient, final String id,
  146. final String[] fields) {
  147. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  148. final QueryBuilder queryBuilder = QueryBuilders.prefixQuery(fessConfig.getIndexFieldId(), id);
  149. return getDocumentListByQuery(searchEngineClient, queryBuilder, fields);
  150. }
  151. public void deleteChildDocument(final SearchEngineClient searchEngineClient, final String id) {
  152. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  153. searchEngineClient.deleteByQuery(fessConfig.getIndexDocumentUpdateIndex(),
  154. QueryBuilders.termQuery(fessConfig.getIndexFieldParentId(), id));
  155. }
  156. public List<Map<String, Object>> getChildDocumentList(final SearchEngineClient searchEngineClient, final String id,
  157. final String[] fields) {
  158. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  159. final QueryBuilder queryBuilder = QueryBuilders.termQuery(fessConfig.getIndexFieldParentId(), id);
  160. return getDocumentListByQuery(searchEngineClient, queryBuilder, fields);
  161. }
  162. protected List<Map<String, Object>> getDocumentListByQuery(final SearchEngineClient searchEngineClient, final QueryBuilder queryBuilder,
  163. final String[] fields) {
  164. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  165. final SearchResponse countResponse = searchEngineClient.prepareSearch(fessConfig.getIndexDocumentUpdateIndex())
  166. .setQuery(queryBuilder).setSize(0).execute().actionGet(fessConfig.getIndexSearchTimeout());
  167. final long numFound = countResponse.getHits().getTotalHits().value;
  168. final long maxSearchDocSize = fessConfig.getIndexerMaxSearchDocSizeAsInteger().longValue();
  169. final boolean exceeded = numFound > maxSearchDocSize;
  170. if (exceeded) {
  171. logger.warn("Max document size is exceeded({}>{}): {}", numFound, fessConfig.getIndexerMaxSearchDocSize(), queryBuilder);
  172. }
  173. if (numFound > fessConfig.getIndexerMaxResultWindowSizeAsInteger().longValue()) {
  174. final List<Map<String, Object>> entityList = new ArrayList<>(Long.valueOf(numFound).intValue());
  175. searchEngineClient.scrollSearch(fessConfig.getIndexDocumentUpdateIndex(), requestBuilder -> {
  176. requestBuilder.setQuery(queryBuilder).setSize((int) numFound);
  177. if (fields != null) {
  178. requestBuilder.setFetchSource(fields, null);
  179. }
  180. return true;
  181. }, entity -> {
  182. entityList.add(entity);
  183. return entityList.size() <= (exceeded ? maxSearchDocSize : numFound);
  184. });
  185. return entityList;
  186. } else {
  187. return searchEngineClient.getDocumentList(fessConfig.getIndexDocumentUpdateIndex(), requestBuilder -> {
  188. requestBuilder.setQuery(queryBuilder).setSize((int) numFound);
  189. if (fields != null) {
  190. requestBuilder.setFetchSource(fields, null);
  191. }
  192. return true;
  193. });
  194. }
  195. }
  196. public long deleteBySessionId(final String sessionId) {
  197. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  198. final String index = fessConfig.getIndexDocumentUpdateIndex();
  199. final QueryBuilder queryBuilder = QueryBuilders.termQuery(fessConfig.getIndexFieldSegment(), sessionId);
  200. return deleteByQueryBuilder(index, queryBuilder);
  201. }
  202. public long deleteByConfigId(final String configId) {
  203. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  204. final String index = fessConfig.getIndexDocumentUpdateIndex();
  205. final QueryBuilder queryBuilder = QueryBuilders.termQuery(fessConfig.getIndexFieldConfigId(), configId);
  206. return deleteByQueryBuilder(index, queryBuilder);
  207. }
  208. public long deleteByVirtualHost(final String virtualHost) {
  209. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  210. final String index = fessConfig.getIndexDocumentUpdateIndex();
  211. final QueryBuilder queryBuilder = QueryBuilders.termQuery(fessConfig.getIndexFieldVirtualHost(), virtualHost);
  212. return deleteByQueryBuilder(index, queryBuilder);
  213. }
  214. protected long deleteByQueryBuilder(final String index, final QueryBuilder queryBuilder) {
  215. final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
  216. searchEngineClient.admin().indices().prepareRefresh(index).execute().actionGet();
  217. final long numOfDeleted = searchEngineClient.deleteByQuery(index, queryBuilder);
  218. logger.info("Deleted {} old docs.", numOfDeleted);
  219. return numOfDeleted;
  220. }
  221. public long calculateDocumentSize(final Map<String, Object> dataMap) {
  222. return MemoryUtil.sizeOf(dataMap);
  223. }
  224. public void setMaxRetryCount(final int maxRetryCount) {
  225. this.maxRetryCount = maxRetryCount;
  226. }
  227. public void setDefaultRowSize(final int defaultRowSize) {
  228. this.defaultRowSize = defaultRowSize;
  229. }
  230. public void setRequestInterval(final long requestInterval) {
  231. this.requestInterval = requestInterval;
  232. }
  233. }