@@ -19,7 +19,6 @@ use futures::StreamExt;
 use indicatif::ProgressIterator;
 use itertools::Itertools;
 use rayon::{prelude::*, ThreadPoolBuilder};
-use rustc_hash::FxHashSet;
 use std::cmp::Reverse;
 use std::collections::BTreeMap;
 use std::hash::{Hash, Hasher};
@@ -49,7 +48,6 @@ use super::Domain;
 const MAX_UNCOMMITTED_INSERTS_PER_GROUP: usize = 50_000;
 const NUM_GROUPS: usize = 1024;
 const CONCURRENCY_LIMIT: usize = 32;
-const CONVERT_HIGH_WANDER_BUDGET: bool = false;
 
 #[derive(bincode::Encode, bincode::Decode, Clone, Debug, PartialEq, Eq, Hash)]
 struct StoredUrl(#[bincode(with_serde)] Url);
@@ -205,7 +203,7 @@ impl CrawlPlanner {
             .map(|id| self.host_centrality.get(&id).unwrap().unwrap_or_default())
             .sum();
 
-        let mut wander_budget = ((wander_budget * host_centrality) / total_host_centralities)
+        let wander_budget = ((wander_budget * host_centrality) / total_host_centralities)
             .max(1.0)
             .round() as u64;
 
@@ -230,53 +228,6 @@ impl CrawlPlanner {
         urls.sort_by(|a, b| a.weight.total_cmp(&b.weight));
         urls.reverse();
 
-        if CONVERT_HIGH_WANDER_BUDGET {
-            // convert some of the wander budget to scheduled urls if the wander budget is too large
-            if wander_budget > urls.len() as u64 {
-                // we should maybe convert the budget on a per-host basis
-                // instead of aggregating all hosts for the domain. some blog sites
-                // like wordpress/blogspot/github.io etc. will have lots of hosts,
-                // and if we simply convert the budget for the domain, we might end up
-                // with a lot of urls from the same host.
-
-                let mut bloom = bloom::U64BloomFilter::new(urls.len() as u64, 0.05);
-                let mut hosts = FxHashSet::default();
-                for url in &urls {
-                    let node = Node::from(url.url.clone());
-                    bloom.insert(node.id().as_u64());
-                    hosts.insert(node.into_host().id());
-                }
-
-                let budget_to_convert = wander_budget - urls.len() as u64;
-
-                let hosts = hosts.into_iter().collect::<Vec<_>>();
-                let nodes: Vec<_> = crate::block_on(self.page_graph.pages_by_hosts(&hosts))
-                    .unwrap_or_default()
-                    .into_iter()
-                    .filter(|node| !bloom.contains(node.as_u64()))
-                    .take(budget_to_convert as usize)
-                    .collect();
-
-                let nodes =
-                    crate::block_on(self.page_graph.batch_get_node(&nodes)).unwrap_or_default();
-
-                let mut new_scheduled = 0;
-                for node in nodes.into_iter().flatten() {
-                    if let Ok(url) = Url::parse(&format!("https://{}", node.as_str())) {
-                        urls.push(WeightedUrl { url, weight: 0.0 });
-                        new_scheduled += 1;
-                    }
-                }
-
-                tracing::info!(
-                    "converted {} wander budget to scheduled urls",
-                    new_scheduled
-                );
-
-                wander_budget -= new_scheduled as u64;
-            }
-        }
-
         let job = Job {
             domain: domain.clone(),
             urls: urls.into_iter().collect(),