
disable robotstxt retry on unreachable

Mikkel Denker, 11 months ago
commit ac9fb2ebbd

crates/core/src/crawler/planner.rs (+1, -50)

@@ -19,7 +19,6 @@ use futures::StreamExt;
 use indicatif::ProgressIterator;
 use itertools::Itertools;
 use rayon::{prelude::*, ThreadPoolBuilder};
-use rustc_hash::FxHashSet;
 use std::cmp::Reverse;
 use std::collections::BTreeMap;
 use std::hash::{Hash, Hasher};
@@ -49,7 +48,6 @@ use super::Domain;
 const MAX_UNCOMMITTED_INSERTS_PER_GROUP: usize = 50_000;
 const NUM_GROUPS: usize = 1024;
 const CONCURRENCY_LIMIT: usize = 32;
-const CONVERT_HIGH_WANDER_BUDGET: bool = false;
 
 #[derive(bincode::Encode, bincode::Decode, Clone, Debug, PartialEq, Eq, Hash)]
 struct StoredUrl(#[bincode(with_serde)] Url);
@@ -205,7 +203,7 @@ impl CrawlPlanner {
             .map(|id| self.host_centrality.get(&id).unwrap().unwrap_or_default())
             .sum();
 
-        let mut wander_budget = ((wander_budget * host_centrality) / total_host_centralities)
+        let wander_budget = ((wander_budget * host_centrality) / total_host_centralities)
             .max(1.0)
             .round() as u64;
 
@@ -230,53 +228,6 @@ impl CrawlPlanner {
         urls.sort_by(|a, b| a.weight.total_cmp(&b.weight));
         urls.reverse();
 
-        if CONVERT_HIGH_WANDER_BUDGET {
-            // convert some of the wander budget to scheduled urls if the wander budget is too large
-            if wander_budget > urls.len() as u64 {
-                // we should maybe convert the budget on a per-host basis
-                // instead of aggregating all hosts for the domain. some blog sites
-                // like wordpress/blogspot/github.io etc. will have lots of hosts,
-                // and if we simply convert the budget for the domain, we might end up
-                // with a lot of urls from the same host.
-
-                let mut bloom = bloom::U64BloomFilter::new(urls.len() as u64, 0.05);
-                let mut hosts = FxHashSet::default();
-                for url in &urls {
-                    let node = Node::from(url.url.clone());
-                    bloom.insert(node.id().as_u64());
-                    hosts.insert(node.into_host().id());
-                }
-
-                let budget_to_convert = wander_budget - urls.len() as u64;
-
-                let hosts = hosts.into_iter().collect::<Vec<_>>();
-                let nodes: Vec<_> = crate::block_on(self.page_graph.pages_by_hosts(&hosts))
-                    .unwrap_or_default()
-                    .into_iter()
-                    .filter(|node| !bloom.contains(node.as_u64()))
-                    .take(budget_to_convert as usize)
-                    .collect();
-
-                let nodes =
-                    crate::block_on(self.page_graph.batch_get_node(&nodes)).unwrap_or_default();
-
-                let mut new_scheduled = 0;
-                for node in nodes.into_iter().flatten() {
-                    if let Ok(url) = Url::parse(&format!("https://{}", node.as_str())) {
-                        urls.push(WeightedUrl { url, weight: 0.0 });
-                        new_scheduled += 1;
-                    }
-                }
-
-                tracing::info!(
-                    "converted {} wander budget to scheduled urls",
-                    new_scheduled
-                );
-
-                wander_budget -= new_scheduled as u64;
-            }
-        }
-
         let job = Job {
             domain: domain.clone(),
             urls: urls.into_iter().collect(),

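The planner change only removes the dead `CONVERT_HIGH_WANDER_BUDGET` path (the flag was hard-coded to `false`, so the branch never ran), leaving the proportional budget scaling shown in the hunk above. A standalone sketch of that remaining calculation, with invented numbers (the real inputs come from the host centrality store):

```rust
// Standalone sketch of the wander-budget scaling kept by this commit:
// a domain receives a share of the global wander budget proportional to
// its host centrality, with a floor of 1. All values below are invented.
fn scaled_wander_budget(
    wander_budget: f64,
    host_centrality: f64,
    total_host_centralities: f64,
) -> u64 {
    ((wander_budget * host_centrality) / total_host_centralities)
        .max(1.0)
        .round() as u64
}

fn main() {
    assert_eq!(scaled_wander_budget(100.0, 2.5, 10.0), 25); // proportional share
    assert_eq!(scaled_wander_budget(100.0, 0.001, 10.0), 1); // floor kicks in
}
```

With the conversion branch gone, the budget is never reduced after this point, which is why the binding also loses its `mut` in the first hunk.
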
crates/core/src/crawler/robots_txt.rs (+8, -2)

@@ -22,6 +22,8 @@ use crate::{config::CrawlerConfig, crawler};
 
 use super::{encoded_body, Result, Site};
 
+const RETRY_ROBOTSTXT_UNREACHABLE: bool = false;
+
 enum Lookup<T> {
     Found(T),
     /// 404
@@ -139,6 +141,10 @@ impl RobotsTxtManager {
     }
 
     async fn fetch_robots_txt(&self, site: &Site) -> Lookup<RobotsTxt> {
+        if !RETRY_ROBOTSTXT_UNREACHABLE {
+            return self.fetch_robots_txt_without_retry(site).await;
+        }
+
         for _ in 0..3 {
             match self.fetch_robots_txt_without_retry(site).await {
                 Lookup::Found(robots_txt) => return Lookup::Found(robots_txt),
@@ -171,8 +177,8 @@ impl RobotsTxtManager {
 
         let cache_should_update = match self.cache.get_mut(&site) {
             Some(Lookup::Found(robots_txt)) => robots_txt.is_expired(&self.cache_expiration),
-            Some(Lookup::Unavailable) => false,
-            _ => true,
+            Some(Lookup::Unavailable) | Some(Lookup::Unreachable) => false,
+            None => true,
         };
 
         if cache_should_update {

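Two things change in `robots_txt.rs`: `fetch_robots_txt` short-circuits to a single attempt while `RETRY_ROBOTSTXT_UNREACHABLE` is `false`, and the cache predicate now treats a cached `Unreachable` result as final instead of letting it fall through the old `_ => true` arm and trigger another fetch. A simplified standalone model of the new predicate (non-generic `Lookup`, no cache type, invented inputs):

```rust
// Simplified model of the cache-refresh decision after this commit:
// only a cache miss, or an expired Found entry, triggers a new fetch;
// Unavailable (404) and Unreachable results are both kept as final.
struct RobotsTxt;

enum Lookup {
    Found(RobotsTxt),
    Unavailable,
    Unreachable,
}

fn cache_should_update(cached: Option<&Lookup>, found_is_expired: bool) -> bool {
    match cached {
        Some(Lookup::Found(_)) => found_is_expired,
        Some(Lookup::Unavailable) | Some(Lookup::Unreachable) => false,
        None => true,
    }
}

fn main() {
    assert!(cache_should_update(None, false)); // nothing cached: fetch robots.txt
    assert!(!cache_should_update(Some(&Lookup::Unreachable), false)); // host down: do not retry
    assert!(!cache_should_update(Some(&Lookup::Unavailable), false)); // 404: cached as final
    assert!(cache_should_update(Some(&Lookup::Found(RobotsTxt)), true)); // expired: refresh
}
```

Replacing the wildcard with an explicit `None => true` also means any future `Lookup` variant has to be handled here explicitly rather than silently causing refetches.
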
crates/robotstxt/src/lib.rs (+2, -2)

@@ -672,8 +672,8 @@ disallow: /x/
 
         let robotstxt = r#"
 user-agent: FooBot
-disallow: 
-allow: 
+disallow:
+allow:
 "#;
 
         assert!(is_user_agent_allowed(robotstxt, "FooBot", url));
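
The `lib.rs` hunk only trims trailing whitespace from the test fixture; the assertion still rests on the REP rule that an empty `disallow:` value disallows nothing. A standalone illustration of that rule (not the crate's matcher, just a prefix-match simplification that ignores wildcards and precedence):

```rust
// Standalone illustration, not the crate's matcher: an empty disallow
// value specifies no path prefix, so it never matches and every URL
// stays allowed; non-empty values are treated as plain prefixes here.
fn disallow_matches(disallow_value: &str, path: &str) -> bool {
    !disallow_value.is_empty() && path.starts_with(disallow_value)
}

fn main() {
    assert!(!disallow_matches("", "/x/y")); // empty rule: everything allowed
    assert!(disallow_matches("/x/", "/x/y")); // non-empty rule: prefix match
}
```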