
disable robotstxt retry on unreachable

Mikkel Denker, 11 months ago
commit ac9fb2ebbd

crates/core/src/crawler/planner.rs (+1, -50)

@@ -19,7 +19,6 @@ use futures::StreamExt;
 use indicatif::ProgressIterator;
 use itertools::Itertools;
 use rayon::{prelude::*, ThreadPoolBuilder};
-use rustc_hash::FxHashSet;
 use std::cmp::Reverse;
 use std::collections::BTreeMap;
 use std::hash::{Hash, Hasher};
@@ -49,7 +48,6 @@ use super::Domain;
 const MAX_UNCOMMITTED_INSERTS_PER_GROUP: usize = 50_000;
 const NUM_GROUPS: usize = 1024;
 const CONCURRENCY_LIMIT: usize = 32;
-const CONVERT_HIGH_WANDER_BUDGET: bool = false;
 
 #[derive(bincode::Encode, bincode::Decode, Clone, Debug, PartialEq, Eq, Hash)]
 struct StoredUrl(#[bincode(with_serde)] Url);
@@ -205,7 +203,7 @@ impl CrawlPlanner {
             .map(|id| self.host_centrality.get(&id).unwrap().unwrap_or_default())
             .sum();
 
-        let mut wander_budget = ((wander_budget * host_centrality) / total_host_centralities)
+        let wander_budget = ((wander_budget * host_centrality) / total_host_centralities)
             .max(1.0)
             .round() as u64;
 
@@ -230,53 +228,6 @@ impl CrawlPlanner {
         urls.sort_by(|a, b| a.weight.total_cmp(&b.weight));
         urls.reverse();
 
-        if CONVERT_HIGH_WANDER_BUDGET {
-            // convert some of the wander budget to scheduled urls if the wander budget is too large
-            if wander_budget > urls.len() as u64 {
-                // we should maybe convert the budget on a per-host basis
-                // instead of aggregating all hosts for the domain. some blog sites
-                // like wordpress/blogspot/github.io etc. will have lots of hosts,
-                // and if we simply convert the budget for the domain, we might end up
-                // with a lot of urls from the same host.
-
-                let mut bloom = bloom::U64BloomFilter::new(urls.len() as u64, 0.05);
-                let mut hosts = FxHashSet::default();
-                for url in &urls {
-                    let node = Node::from(url.url.clone());
-                    bloom.insert(node.id().as_u64());
-                    hosts.insert(node.into_host().id());
-                }
-
-                let budget_to_convert = wander_budget - urls.len() as u64;
-
-                let hosts = hosts.into_iter().collect::<Vec<_>>();
-                let nodes: Vec<_> = crate::block_on(self.page_graph.pages_by_hosts(&hosts))
-                    .unwrap_or_default()
-                    .into_iter()
-                    .filter(|node| !bloom.contains(node.as_u64()))
-                    .take(budget_to_convert as usize)
-                    .collect();
-
-                let nodes =
-                    crate::block_on(self.page_graph.batch_get_node(&nodes)).unwrap_or_default();
-
-                let mut new_scheduled = 0;
-                for node in nodes.into_iter().flatten() {
-                    if let Ok(url) = Url::parse(&format!("https://{}", node.as_str())) {
-                        urls.push(WeightedUrl { url, weight: 0.0 });
-                        new_scheduled += 1;
-                    }
-                }
-
-                tracing::info!(
-                    "converted {} wander budget to scheduled urls",
-                    new_scheduled
-                );
-
-                wander_budget -= new_scheduled as u64;
-            }
-        }
-
         let job = Job {
             domain: domain.clone(),
             urls: urls.into_iter().collect(),

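The planner change only removes the dead `CONVERT_HIGH_WANDER_BUDGET` path (the flag was hard-coded to `false`, so the branch never ran), leaving the proportional budget scaling shown in the hunk above. A standalone sketch of that remaining calculation, with invented numbers (the real inputs come from the host centrality store):

```rust
// Standalone sketch of the wander-budget scaling kept by this commit:
// a domain receives a share of the global wander budget proportional to
// its host centrality, with a floor of 1. All values below are invented.
fn scaled_wander_budget(
    wander_budget: f64,
    host_centrality: f64,
    total_host_centralities: f64,
) -> u64 {
    ((wander_budget * host_centrality) / total_host_centralities)
        .max(1.0)
        .round() as u64
}

fn main() {
    assert_eq!(scaled_wander_budget(100.0, 2.5, 10.0), 25); // proportional share
    assert_eq!(scaled_wander_budget(100.0, 0.001, 10.0), 1); // floor kicks in
}
```

With the conversion branch gone, the budget is never reduced after this point, which is why the binding also loses its `mut` in the first hunk.
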
crates/core/src/crawler/robots_txt.rs (+8, -2)

@@ -22,6 +22,8 @@ use crate::{config::CrawlerConfig, crawler};
 
 use super::{encoded_body, Result, Site};
 
+const RETRY_ROBOTSTXT_UNREACHABLE: bool = false;
+
 enum Lookup<T> {
     Found(T),
     /// 404
@@ -139,6 +141,10 @@ impl RobotsTxtManager {
     }
 
     async fn fetch_robots_txt(&self, site: &Site) -> Lookup<RobotsTxt> {
+        if !RETRY_ROBOTSTXT_UNREACHABLE {
+            return self.fetch_robots_txt_without_retry(site).await;
+        }
+
         for _ in 0..3 {
             match self.fetch_robots_txt_without_retry(site).await {
                 Lookup::Found(robots_txt) => return Lookup::Found(robots_txt),
@@ -171,8 +177,8 @@ impl RobotsTxtManager {
 
         let cache_should_update = match self.cache.get_mut(&site) {
             Some(Lookup::Found(robots_txt)) => robots_txt.is_expired(&self.cache_expiration),
-            Some(Lookup::Unavailable) => false,
-            _ => true,
+            Some(Lookup::Unavailable) | Some(Lookup::Unreachable) => false,
+            None => true,
         };
 
         if cache_should_update {

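Two things change in `robots_txt.rs`: `fetch_robots_txt` short-circuits to a single attempt while `RETRY_ROBOTSTXT_UNREACHABLE` is `false`, and the cache predicate now treats a cached `Unreachable` result as final instead of letting it fall through the old `_ => true` arm and trigger another fetch. A simplified standalone model of the new predicate (non-generic `Lookup`, no cache type, invented inputs):

```rust
// Simplified model of the cache-refresh decision after this commit:
// only a cache miss, or an expired Found entry, triggers a new fetch;
// Unavailable (404) and Unreachable results are both kept as final.
struct RobotsTxt;

enum Lookup {
    Found(RobotsTxt),
    Unavailable,
    Unreachable,
}

fn cache_should_update(cached: Option<&Lookup>, found_is_expired: bool) -> bool {
    match cached {
        Some(Lookup::Found(_)) => found_is_expired,
        Some(Lookup::Unavailable) | Some(Lookup::Unreachable) => false,
        None => true,
    }
}

fn main() {
    assert!(cache_should_update(None, false)); // nothing cached: fetch robots.txt
    assert!(!cache_should_update(Some(&Lookup::Unreachable), false)); // host down: do not retry
    assert!(!cache_should_update(Some(&Lookup::Unavailable), false)); // 404: cached as final
    assert!(cache_should_update(Some(&Lookup::Found(RobotsTxt)), true)); // expired: refresh
}
```

Replacing the wildcard with an explicit `None => true` also means any future `Lookup` variant has to be handled here explicitly rather than silently causing refetches.
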
crates/robotstxt/src/lib.rs (+2, -2)

@@ -672,8 +672,8 @@ disallow: /x/
 
         let robotstxt = r#"
 user-agent: FooBot
-disallow: 
-allow: 
+disallow:
+allow:
 "#;
 
         assert!(is_user_agent_allowed(robotstxt, "FooBot", url));
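
The `lib.rs` hunk only trims trailing whitespace from the test fixture; the assertion still rests on the REP rule that an empty `disallow:` value disallows nothing. A standalone illustration of that rule (not the crate's matcher, just a prefix-match simplification that ignores wildcards and precedence):

```rust
// Standalone illustration, not the crate's matcher: an empty disallow
// value specifies no path prefix, so it never matches and every URL
// stays allowed; non-empty values are treated as plain prefixes here.
fn disallow_matches(disallow_value: &str, path: &str) -> bool {
    !disallow_value.is_empty() && path.starts_with(disallow_value)
}

fn main() {
    assert!(!disallow_matches("", "/x/y")); // empty rule: everything allowed
    assert!(disallow_matches("/x/", "/x/y")); // non-empty rule: prefix match
}
```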