FessCrawlerThread.java 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276
  1. /*
  2. * Copyright 2012-2017 CodeLibs Project and the Others.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
  13. * either express or implied. See the License for the specific language
  14. * governing permissions and limitations under the License.
  15. */
  16. package org.codelibs.fess.crawler;
  17. import static org.codelibs.core.stream.StreamUtil.stream;
  18. import java.util.ArrayList;
  19. import java.util.Date;
  20. import java.util.HashMap;
  21. import java.util.HashSet;
  22. import java.util.LinkedHashSet;
  23. import java.util.List;
  24. import java.util.Map;
  25. import java.util.Set;
  26. import java.util.stream.Collectors;
  27. import org.apache.commons.io.IOUtils;
  28. import org.codelibs.core.lang.StringUtil;
  29. import org.codelibs.fess.app.service.FailureUrlService;
  30. import org.codelibs.fess.crawler.builder.RequestDataBuilder;
  31. import org.codelibs.fess.crawler.client.CrawlerClient;
  32. import org.codelibs.fess.crawler.client.smb.SmbClient;
  33. import org.codelibs.fess.crawler.entity.RequestData;
  34. import org.codelibs.fess.crawler.entity.ResponseData;
  35. import org.codelibs.fess.crawler.entity.UrlQueue;
  36. import org.codelibs.fess.crawler.log.LogType;
  37. import org.codelibs.fess.es.client.FessEsClient;
  38. import org.codelibs.fess.es.config.exentity.CrawlingConfig;
  39. import org.codelibs.fess.exception.ContainerNotAvailableException;
  40. import org.codelibs.fess.exception.ContentNotFoundException;
  41. import org.codelibs.fess.helper.CrawlingConfigHelper;
  42. import org.codelibs.fess.helper.CrawlingInfoHelper;
  43. import org.codelibs.fess.helper.DuplicateHostHelper;
  44. import org.codelibs.fess.helper.IndexingHelper;
  45. import org.codelibs.fess.helper.SambaHelper;
  46. import org.codelibs.fess.mylasta.direction.FessConfig;
  47. import org.codelibs.fess.util.ComponentUtil;
  48. import org.codelibs.fess.util.DocumentUtil;
  49. import org.slf4j.Logger;
  50. import org.slf4j.LoggerFactory;
  51. import jcifs.smb.ACE;
  52. import jcifs.smb.SID;
  53. public class FessCrawlerThread extends CrawlerThread {
  54. private static final Logger logger = LoggerFactory.getLogger(FessCrawlerThread.class);
  55. @Override
  56. protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
  57. if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
  58. final long startTime = System.currentTimeMillis();
  59. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  60. final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
  61. final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
  62. final SambaHelper sambaHelper = ComponentUtil.getSambaHelper();
  63. final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
  64. final FessEsClient fessEsClient = ComponentUtil.getFessEsClient();
  65. final String url = urlQueue.getUrl();
  66. ResponseData responseData = null;
  67. try {
  68. final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
  69. final Map<String, Object> dataMap = new HashMap<>();
  70. dataMap.put(fessConfig.getIndexFieldUrl(), url);
  71. final List<String> roleTypeList = new ArrayList<>();
  72. stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
  73. if (url.startsWith("smb://")) {
  74. if (url.endsWith("/")) {
  75. // directory
  76. return true;
  77. }
  78. if (fessConfig.isSmbRoleFromFile()) {
  79. // head method
  80. responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
  81. if (responseData == null) {
  82. return true;
  83. }
  84. final ACE[] aces = (ACE[]) responseData.getMetaDataMap().get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES);
  85. if (aces != null) {
  86. for (final ACE item : aces) {
  87. final SID sid = item.getSID();
  88. final String accountId = sambaHelper.getAccountId(sid);
  89. if (accountId != null) {
  90. roleTypeList.add(accountId);
  91. }
  92. }
  93. if (logger.isDebugEnabled()) {
  94. logger.debug("smbUrl:" + responseData.getUrl() + " roleType:" + roleTypeList.toString());
  95. }
  96. }
  97. }
  98. }
  99. dataMap.put(fessConfig.getIndexFieldRole(), roleTypeList);
  100. final String id = crawlingInfoHelper.generateId(dataMap);
  101. if (logger.isDebugEnabled()) {
  102. logger.debug("Searching indexed document: " + id);
  103. }
  104. final Map<String, Object> document =
  105. indexingHelper.getDocument(
  106. fessEsClient,
  107. id,
  108. new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(),
  109. fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(),
  110. fessConfig.getIndexFieldExpires(), fessConfig.getIndexFieldClickCount(),
  111. fessConfig.getIndexFieldFavoriteCount() });
  112. if (document == null) {
  113. storeChildUrlsToQueue(urlQueue, getChildUrlSet(fessEsClient, id));
  114. return true;
  115. }
  116. final Date expires = DocumentUtil.getValue(document, fessConfig.getIndexFieldExpires(), Date.class);
  117. if (expires != null && expires.getTime() < System.currentTimeMillis()) {
  118. final Object idValue = document.get(fessConfig.getIndexFieldId());
  119. if (idValue != null && !indexingHelper.deleteDocument(fessEsClient, idValue.toString())) {
  120. logger.debug("Failed to delete expired document: " + url);
  121. }
  122. return true;
  123. }
  124. final Date lastModified = DocumentUtil.getValue(document, fessConfig.getIndexFieldLastModified(), Date.class);
  125. if (lastModified == null) {
  126. return true;
  127. }
  128. urlQueue.setLastModified(lastModified.getTime());
  129. log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
  130. if (responseData == null) {
  131. // head method
  132. responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
  133. if (responseData == null) {
  134. return true;
  135. }
  136. }
  137. final int httpStatusCode = responseData.getHttpStatusCode();
  138. if (logger.isDebugEnabled()) {
  139. logger.debug("Accessing document: " + url + ", status: " + httpStatusCode);
  140. }
  141. if (httpStatusCode == 404) {
  142. storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
  143. if (!indexingHelper.deleteDocument(fessEsClient, id)) {
  144. logger.debug("Failed to delete 404 document: " + url);
  145. }
  146. return false;
  147. } else if (responseData.getLastModified() == null) {
  148. return true;
  149. } else if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
  150. log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
  151. responseData.setExecutionTime(System.currentTimeMillis() - startTime);
  152. responseData.setParentUrl(urlQueue.getParentUrl());
  153. responseData.setSessionId(crawlerContext.getSessionId());
  154. responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
  155. processResponse(urlQueue, responseData);
  156. storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
  157. final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
  158. if (documentExpires != null
  159. && !indexingHelper.updateDocument(fessEsClient, id, fessConfig.getIndexFieldExpires(), documentExpires)) {
  160. logger.debug("Failed to update " + fessConfig.getIndexFieldExpires() + " at " + url);
  161. }
  162. return false;
  163. }
  164. } finally {
  165. if (responseData != null) {
  166. IOUtils.closeQuietly(responseData);
  167. }
  168. }
  169. }
  170. return true;
  171. }
  172. protected void storeChildUrlsToQueue(final UrlQueue<?> urlQueue, final Set<RequestData> childUrlSet) {
  173. if (childUrlSet != null) {
  174. // add an url
  175. try {
  176. storeChildUrls(childUrlSet.stream().filter(rd -> StringUtil.isNotBlank(rd.getUrl())).collect(Collectors.toSet()),
  177. urlQueue.getUrl(), urlQueue.getDepth() != null ? urlQueue.getDepth() + 1 : 1);
  178. } catch (final Throwable t) {
  179. if (!ComponentUtil.available()) {
  180. throw new ContainerNotAvailableException(t);
  181. }
  182. throw t;
  183. }
  184. }
  185. }
  186. @SuppressWarnings("unchecked")
  187. protected Set<RequestData> getAnchorSet(final Object obj) {
  188. List<String> anchorList;
  189. if (obj instanceof String) {
  190. anchorList = new ArrayList<>();
  191. anchorList.add(obj.toString());
  192. } else if (obj instanceof List<?>) {
  193. anchorList = (List<String>) obj;
  194. } else {
  195. return null;
  196. }
  197. if (anchorList.isEmpty()) {
  198. return null;
  199. }
  200. final Set<RequestData> childUrlSet = new LinkedHashSet<>();
  201. for (final String anchor : anchorList) {
  202. childUrlSet.add(RequestDataBuilder.newRequestData().get().url(anchor).build());
  203. }
  204. return childUrlSet;
  205. }
  206. protected Set<RequestData> getChildUrlSet(final FessEsClient fessEsClient, final String id) {
  207. final FessConfig fessConfig = ComponentUtil.getFessConfig();
  208. final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
  209. final List<Map<String, Object>> docList =
  210. indexingHelper.getChildDocumentList(fessEsClient, id, new String[] { fessConfig.getIndexFieldUrl() });
  211. if (docList.isEmpty()) {
  212. return null;
  213. }
  214. if (logger.isDebugEnabled()) {
  215. logger.debug("Found documents: " + docList);
  216. }
  217. final Set<RequestData> urlSet = new HashSet<>(docList.size());
  218. for (final Map<String, Object> doc : docList) {
  219. final String url = DocumentUtil.getValue(doc, fessConfig.getIndexFieldUrl(), String.class);
  220. if (StringUtil.isNotBlank(url)) {
  221. urlSet.add(RequestDataBuilder.newRequestData().get().url(url).build());
  222. }
  223. }
  224. return urlSet;
  225. }
  226. @Override
  227. protected void processResponse(final UrlQueue<?> urlQueue, final ResponseData responseData) {
  228. super.processResponse(urlQueue, responseData);
  229. FessConfig fessConfig = ComponentUtil.getFessConfig();
  230. if (fessConfig.isCrawlerFailureUrlStatusCodes(responseData.getHttpStatusCode())) {
  231. String sessionId = crawlerContext.getSessionId();
  232. final CrawlingConfig crawlingConfig = ComponentUtil.getCrawlingConfigHelper().get(sessionId);
  233. final String url = urlQueue.getUrl();
  234. final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
  235. failureUrlService.store(crawlingConfig, ContentNotFoundException.class.getCanonicalName(), url, new ContentNotFoundException(
  236. url));
  237. }
  238. }
  239. @Override
  240. protected void storeChildUrl(final String childUrl, final String parentUrl, final String metaData, final int depth) {
  241. if (StringUtil.isNotBlank(childUrl)) {
  242. final DuplicateHostHelper duplicateHostHelper = ComponentUtil.getDuplicateHostHelper();
  243. final String url = duplicateHostHelper.convert(childUrl);
  244. super.storeChildUrl(url, parentUrl, metaData, depth);
  245. }
  246. }
  247. }