/*
 * Copyright 2012-2019 CodeLibs Project and the Others.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.codelibs.fess.helper;

import static org.codelibs.core.stream.StreamUtil.split;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Pattern;

import org.codelibs.core.lang.StringUtil;
import org.codelibs.fess.Constants;
import org.codelibs.fess.crawler.Crawler;
import org.codelibs.fess.crawler.CrawlerContext;
import org.codelibs.fess.crawler.CrawlerStatus;
import org.codelibs.fess.crawler.interval.FessIntervalController;
import org.codelibs.fess.crawler.service.impl.EsDataService;
import org.codelibs.fess.crawler.service.impl.EsUrlFilterService;
import org.codelibs.fess.crawler.service.impl.EsUrlQueueService;
import org.codelibs.fess.es.config.exbhv.BoostDocumentRuleBhv;
import org.codelibs.fess.es.config.exentity.BoostDocumentRule;
import org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName;
import org.codelibs.fess.es.config.exentity.FileConfig;
import org.codelibs.fess.es.config.exentity.WebConfig;
import org.codelibs.fess.indexer.IndexUpdater;
import org.codelibs.fess.mylasta.direction.FessConfig;
import org.codelibs.fess.util.ComponentUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
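
/**
 * Helper that runs web and file-system crawling: it builds one {@link Crawler}
 * per crawling configuration, schedules the crawlers, and hands their session
 * ids to an {@link IndexUpdater} that writes the crawled documents to the
 * search index.
 *
 * <p>A minimal usage sketch (in Fess this helper is normally obtained from the
 * container; {@code webFsIndexHelper} below is assumed to be such an instance):
 *
 * <pre>
 * // null id lists for both arguments crawl every web and file config
 * webFsIndexHelper.crawl(sessionId, null, null);
 * </pre>
 */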
public class WebFsIndexHelper {

    private static final Logger logger = LoggerFactory.getLogger(WebFsIndexHelper.class);

    protected long maxAccessCount = Long.MAX_VALUE;

    protected long crawlingExecutionInterval = Constants.DEFAULT_CRAWLING_EXECUTION_INTERVAL;

    protected int indexUpdaterPriority = Thread.MAX_PRIORITY;

    protected int crawlerPriority = Thread.NORM_PRIORITY;

    protected final List<Crawler> crawlerList = Collections.synchronizedList(new ArrayList<Crawler>());
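
    /**
     * Crawls the web and file configurations with the given ids under the
     * given session. When both id lists are {@code null}, the lookup is
     * delegated with null ids, which selects every configuration; when only
     * one list is {@code null}, no configuration of that type is crawled.
     */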
    public void crawl(final String sessionId, final List<String> webConfigIdList, final List<String> fileConfigIdList) {
        final boolean runAll = webConfigIdList == null && fileConfigIdList == null;

        final List<WebConfig> webConfigList;
        if (runAll || webConfigIdList != null) {
            webConfigList = ComponentUtil.getCrawlingConfigHelper().getWebConfigListByIds(webConfigIdList);
        } else {
            webConfigList = Collections.emptyList();
        }

        final List<FileConfig> fileConfigList;
        if (runAll || fileConfigIdList != null) {
            fileConfigList = ComponentUtil.getCrawlingConfigHelper().getFileConfigListByIds(fileConfigIdList);
        } else {
            fileConfigList = Collections.emptyList();
        }

        if (webConfigList.isEmpty() && fileConfigList.isEmpty()) {
            // nothing to crawl
            if (logger.isInfoEnabled()) {
                logger.info("No crawling target urls.");
            }
            return;
        }

        doCrawl(sessionId, webConfigList, fileConfigList);
    }
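
    /**
     * Creates and schedules one crawler per configuration, running at most the
     * configured number of crawlers concurrently, then blocks until all
     * crawlers and the index updater have finished. Unless a forced stop was
     * requested, the per-session crawl data is deleted afterwards.
     */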
    protected void doCrawl(final String sessionId, final List<WebConfig> webConfigList, final List<FileConfig> fileConfigList) {
        final int multiprocessCrawlingCount = ComponentUtil.getFessConfig().getCrawlingThreadCount();
        final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final long startTime = System.currentTimeMillis();
        final List<String> sessionIdList = new ArrayList<>();
        crawlerList.clear();
        final List<String> crawlerStatusList = new ArrayList<>();

        // Web
        for (final WebConfig webConfig : webConfigList) {
            final String sid = ComponentUtil.getCrawlingConfigHelper().store(sessionId, webConfig);

            // create crawler
            final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
            crawler.setSessionId(sid);
            sessionIdList.add(sid);

            final String urlsStr = webConfig.getUrls();
            if (StringUtil.isBlank(urlsStr)) {
                logger.warn("No target urls. Skipped");
                continue; // skip only this config instead of aborting the remaining ones
            }
            // interval time
            final int intervalTime =
                    webConfig.getIntervalTime() != null ? webConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_WEB;
            ((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);

            final String includedUrlsStr = webConfig.getIncludedUrls() != null ? webConfig.getIncludedUrls() : StringUtil.EMPTY;
            final String excludedUrlsStr = webConfig.getExcludedUrls() != null ? webConfig.getExcludedUrls() : StringUtil.EMPTY;

            // num of threads
            final CrawlerContext crawlerContext = crawler.getCrawlerContext();
            final int numOfThread =
                    webConfig.getNumOfThread() != null ? webConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB;
            crawlerContext.setNumOfThread(numOfThread);

            // depth (-1: no limit)
            final int depth = webConfig.getDepth() != null ? webConfig.getDepth() : -1;
            crawlerContext.setMaxDepth(depth);

            // max count
            final long maxCount = webConfig.getMaxAccessCount() != null ? webConfig.getMaxAccessCount() : maxAccessCount;
            crawlerContext.setMaxAccessCount(maxCount);

            webConfig.initializeClientFactory(crawler.getClientFactory());
            final Map<String, String> configParamMap = webConfig.getConfigParameterMap(ConfigName.CONFIG);

            // clean up data from a previous crawl if requested
            if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Constants.CONFIG_CLEANUP_ALL))) {
                deleteCrawlData(sid);
            } else if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Constants.CONFIG_CLEANUP_FILTERS))) {
                final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
                try {
                    urlFilterService.delete(sid);
                } catch (final Exception e) {
                    logger.warn("Failed to delete url filters for " + sid, e);
                }
            }
            // set urls
            split(urlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(urlValue -> {
                if (!urlValue.startsWith("#") && fessConfig.isValidCrawlerWebProtocol(urlValue)) {
                    crawler.addUrl(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Target URL: " + urlValue);
                    }
                }
            }));

            // set included urls
            split(includedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(urlValue -> {
                if (!urlValue.startsWith("#")) {
                    crawler.addIncludeFilter(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Included URL: " + urlValue);
                    }
                }
            }));

            // set excluded urls
            split(excludedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(urlValue -> {
                if (!urlValue.startsWith("#")) {
                    crawler.addExcludeFilter(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Excluded URL: " + urlValue);
                    }
                }
            }));

            // exclude previously failed urls (quoted so they match literally)
            final List<String> excludedUrlList = ComponentUtil.getCrawlingConfigHelper().getExcludedUrlList(webConfig.getConfigId());
            if (excludedUrlList != null) {
                excludedUrlList.stream().filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(u -> {
                    final String urlValue = Pattern.quote(u);
                    crawler.addExcludeFilter(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Excluded URL from failures: " + urlValue);
                    }
                });
            }

            if (logger.isDebugEnabled()) {
                logger.debug("Crawling " + urlsStr);
            }

            crawler.setBackground(true);
            crawler.setThreadPriority(crawlerPriority);

            crawlerList.add(crawler);
            crawlerStatusList.add(Constants.READY);
        }
        // File
        for (final FileConfig fileConfig : fileConfigList) {
            final String sid = ComponentUtil.getCrawlingConfigHelper().store(sessionId, fileConfig);

            // create crawler
            final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
            crawler.setSessionId(sid);
            sessionIdList.add(sid);

            final String pathsStr = fileConfig.getPaths();
            if (StringUtil.isBlank(pathsStr)) {
                logger.warn("No target uris. Skipped");
                continue; // skip only this config instead of aborting the remaining ones
            }
            // interval time
            final int intervalTime =
                    fileConfig.getIntervalTime() != null ? fileConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_FS;
            ((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);

            final String includedPathsStr = fileConfig.getIncludedPaths() != null ? fileConfig.getIncludedPaths() : StringUtil.EMPTY;
            final String excludedPathsStr = fileConfig.getExcludedPaths() != null ? fileConfig.getExcludedPaths() : StringUtil.EMPTY;

            // num of threads
            final CrawlerContext crawlerContext = crawler.getCrawlerContext();
            final int numOfThread =
                    fileConfig.getNumOfThread() != null ? fileConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_FS;
            crawlerContext.setNumOfThread(numOfThread);

            // depth (-1: no limit)
            final int depth = fileConfig.getDepth() != null ? fileConfig.getDepth() : -1;
            crawlerContext.setMaxDepth(depth);

            // max count
            final long maxCount = fileConfig.getMaxAccessCount() != null ? fileConfig.getMaxAccessCount() : maxAccessCount;
            crawlerContext.setMaxAccessCount(maxCount);

            fileConfig.initializeClientFactory(crawler.getClientFactory());
            final Map<String, String> configParamMap = fileConfig.getConfigParameterMap(ConfigName.CONFIG);

            // clean up data from a previous crawl if requested
            if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Constants.CONFIG_CLEANUP_ALL))) {
                deleteCrawlData(sid);
            } else if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Constants.CONFIG_CLEANUP_FILTERS))) {
                final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
                try {
                    urlFilterService.delete(sid);
                } catch (final Exception e) {
                    logger.warn("Failed to delete url filters for " + sid, e);
                }
            }
            // set paths (bare paths are promoted to file: urls)
            split(pathsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(urlValue -> {
                if (!urlValue.startsWith("#")) {
                    final String u;
                    if (!fessConfig.isValidCrawlerFileProtocol(urlValue)) {
                        if (urlValue.startsWith("/")) {
                            u = "file:" + urlValue;
                        } else {
                            u = "file:/" + urlValue;
                        }
                    } else {
                        u = urlValue;
                    }
                    crawler.addUrl(u);
                    if (logger.isInfoEnabled()) {
                        logger.info("Target Path: " + u);
                    }
                }
            }));
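
            // In the include/exclude lists below, a line starting with
            // "#DISABLE_URL_ENCODE" turns off URL-filter encoding for the next
            // (single) pattern; any other "#" line is treated as a comment.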
            // set included paths
            final AtomicBoolean urlEncodeDisabled = new AtomicBoolean(false);
            split(includedPathsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(line -> {
                if (!line.startsWith("#")) {
                    final String urlValue;
                    if (urlEncodeDisabled.get()) {
                        urlValue = line;
                        urlEncodeDisabled.set(false);
                    } else {
                        urlValue = systemHelper.encodeUrlFilter(line);
                    }
                    crawler.addIncludeFilter(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Included Path: " + urlValue);
                    }
                } else if (line.startsWith("#DISABLE_URL_ENCODE")) {
                    urlEncodeDisabled.set(true);
                }
            }));

            // set excluded paths
            urlEncodeDisabled.set(false);
            split(excludedPathsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(line -> {
                if (!line.startsWith("#")) {
                    final String urlValue;
                    if (urlEncodeDisabled.get()) {
                        urlValue = line;
                        urlEncodeDisabled.set(false);
                    } else {
                        urlValue = systemHelper.encodeUrlFilter(line);
                    }
                    crawler.addExcludeFilter(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Excluded Path: " + urlValue);
                    }
                } else if (line.startsWith("#DISABLE_URL_ENCODE")) {
                    urlEncodeDisabled.set(true);
                }
            }));
            // exclude previously failed paths (quoted so they match literally)
            final List<String> excludedUrlList = ComponentUtil.getCrawlingConfigHelper().getExcludedUrlList(fileConfig.getConfigId());
            if (excludedUrlList != null) {
                excludedUrlList.stream().filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(u -> {
                    final String urlValue = Pattern.quote(u);
                    crawler.addExcludeFilter(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Excluded Path from failures: " + urlValue);
                    }
                });
            }

            if (logger.isDebugEnabled()) {
                logger.debug("Crawling " + pathsStr);
            }

            crawler.setBackground(true);
            crawler.setThreadPriority(crawlerPriority);

            crawlerList.add(crawler);
            crawlerStatusList.add(Constants.READY);
        }
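
        // The IndexUpdater is a daemon thread that drains the documents
        // produced by the crawlers above and sends them to the search index.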
        // run index update
        final IndexUpdater indexUpdater = ComponentUtil.getIndexUpdater();
        indexUpdater.setName("IndexUpdater");
        indexUpdater.setPriority(indexUpdaterPriority);
        indexUpdater.setSessionIdList(sessionIdList);
        indexUpdater.setDaemon(true);
        indexUpdater.setCrawlerList(crawlerList);
        getAvailableBoostDocumentRuleList().forEach(rule -> {
            indexUpdater.addDocBoostMatcher(new org.codelibs.fess.indexer.DocBoostMatcher(rule));
        });
        indexUpdater.start();
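
        // Start crawlers, at most multiprocessCrawlingCount at a time, polling
        // for finished ones; each crawler that reaches DONE is awaited and its
        // session id is reported to the index updater.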
        int startedCrawlerNum = 0;
        int activeCrawlerNum = 0;
        while (startedCrawlerNum < crawlerList.size()) {
            // forced stop requested: stop all crawlers
            if (systemHelper.isForceStop()) {
                for (final Crawler crawler : crawlerList) {
                    crawler.stop();
                }
                break;
            }
            if (activeCrawlerNum < multiprocessCrawlingCount) {
                // start crawling
                crawlerList.get(startedCrawlerNum).execute();
                crawlerStatusList.set(startedCrawlerNum, Constants.RUNNING);
                startedCrawlerNum++;
                activeCrawlerNum++;
                try {
                    Thread.sleep(crawlingExecutionInterval);
                } catch (final InterruptedException e) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Interrupted.", e);
                    }
                }
                continue;
            }
            // check status
            for (int i = 0; i < startedCrawlerNum; i++) {
                if (crawlerList.get(i).getCrawlerContext().getStatus() == CrawlerStatus.DONE
                        && crawlerStatusList.get(i).equals(Constants.RUNNING)) {
                    crawlerList.get(i).awaitTermination();
                    crawlerStatusList.set(i, Constants.DONE);
                    final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
                    indexUpdater.addFinishedSessionId(sid);
                    activeCrawlerNum--;
                }
            }
            try {
                Thread.sleep(crawlingExecutionInterval);
            } catch (final InterruptedException e) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Interrupted.", e);
                }
            }
        }
        // wait until every crawler has finished
        boolean finishedAll = false;
        while (!finishedAll) {
            finishedAll = true;
            for (int i = 0; i < crawlerList.size(); i++) {
                crawlerList.get(i).awaitTermination(crawlingExecutionInterval);
                if (crawlerList.get(i).getCrawlerContext().getStatus() == CrawlerStatus.DONE
                        && !crawlerStatusList.get(i).equals(Constants.DONE)) {
                    crawlerStatusList.set(i, Constants.DONE);
                    final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
                    indexUpdater.addFinishedSessionId(sid);
                }
                if (!crawlerStatusList.get(i).equals(Constants.DONE)) {
                    finishedAll = false;
                }
            }
        }
        crawlerList.clear();
        crawlerStatusList.clear();

        // put crawling info
        final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
        final long execTime = System.currentTimeMillis() - startTime;
        crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_CRAWLING_EXEC_TIME, Long.toString(execTime));
        if (logger.isInfoEnabled()) {
            logger.info("[EXEC TIME] crawling time: " + execTime + "ms");
        }

        indexUpdater.setFinishCrawling(true);
        try {
            indexUpdater.join();
        } catch (final InterruptedException e) {
            logger.warn("Interrupted index update.", e);
        }
        crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_INDEX_EXEC_TIME, Long.toString(indexUpdater.getExecuteTime()));
        crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_INDEX_SIZE, Long.toString(indexUpdater.getDocumentSize()));

        if (systemHelper.isForceStop()) {
            return;
        }

        for (final String sid : sessionIdList) {
            // remove config and crawl data for the finished session
            ComponentUtil.getCrawlingConfigHelper().remove(sid);
            deleteCrawlData(sid);
        }
    }
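
    /**
     * Returns the available boost document rules, ordered by sort order, up to
     * the configured maximum fetch size.
     */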
    protected List<BoostDocumentRule> getAvailableBoostDocumentRuleList() {
        return ComponentUtil.getComponent(BoostDocumentRuleBhv.class).selectList(cb -> {
            cb.query().matchAll();
            cb.query().addOrderBy_SortOrder_Asc();
            cb.fetchFirst(ComponentUtil.getFessConfig().getPageDocboostMaxFetchSizeAsInteger());
        });
    }
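
    /**
     * Deletes the url filters, the url queue, and the access results stored
     * for the given session id. Failures are logged and not rethrown.
     */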
    protected void deleteCrawlData(final String sid) {
        final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
        final EsUrlQueueService urlQueueService = ComponentUtil.getComponent(EsUrlQueueService.class);
        final EsDataService dataService = ComponentUtil.getComponent(EsDataService.class);

        try {
            // clear url filter
            urlFilterService.delete(sid);
        } catch (final Exception e) {
            logger.warn("Failed to delete UrlFilter for " + sid, e);
        }

        try {
            // clear queue
            urlQueueService.clearCache();
            urlQueueService.delete(sid);
        } catch (final Exception e) {
            logger.warn("Failed to delete UrlQueue for " + sid, e);
        }

        try {
            // clear access results
            dataService.delete(sid);
        } catch (final Exception e) {
            logger.warn("Failed to delete AccessResult for " + sid, e);
        }
    }
    public void setMaxAccessCount(final long maxAccessCount) {
        this.maxAccessCount = maxAccessCount;
    }

    public void setCrawlingExecutionInterval(final long crawlingExecutionInterval) {
        this.crawlingExecutionInterval = crawlingExecutionInterval;
    }

    public void setIndexUpdaterPriority(final int indexUpdaterPriority) {
        this.indexUpdaterPriority = indexUpdaterPriority;
    }

    public void setCrawlerPriority(final int crawlerPriority) {
        this.crawlerPriority = crawlerPriority;
    }
}