Pārlūkot izejas kodu

[webgraph] add from_centrality, to_centrality etc. fields to edges

Mikkel Denker 8 mēneši atpakaļ
vecāks
revīzija
13f06e41ba

+ 2 - 2
crates/core/examples/distributed_harmonic.rs

@@ -104,7 +104,7 @@ fn build_graphs_if_not_exist(warc_path: &str, graph_path: &str) -> anyhow::Resul
                         to: destination,
                         rel_flags: link.rel,
                         label: link.text,
-                        sort_score: 0.0,
+                        ..Edge::empty()
                     })?;
                 } else {
                     b.insert(Edge {
@@ -112,7 +112,7 @@ fn build_graphs_if_not_exist(warc_path: &str, graph_path: &str) -> anyhow::Resul
                         to: destination,
                         rel_flags: link.rel,
                         label: link.text,
-                        sort_score: 0.0,
+                        ..Edge::empty()
                     })?;
                 }
             }

+ 1 - 0
crates/core/src/config/mod.rs

@@ -100,6 +100,7 @@ pub struct WebgraphConstructConfig {
     pub graph_base_path: String,
     pub shard: ShardId,
     pub host_centrality_store_path: String,
+    pub host_rank_store_path: String,
     pub warc_source: WarcSource,
     pub limit_warc_files: Option<usize>,
     pub skip_warc_files: Option<usize>,

+ 3 - 12
crates/core/src/entrypoint/ampc/harmonic_centrality/mod.rs

@@ -83,10 +83,7 @@ mod tests {
     use tracing_test::traced_test;
     use webgraph::{Edge, Webgraph};
 
-    use crate::{
-        free_socket_addr, webgraph::centrality::harmonic::HarmonicCentrality,
-        webpage::html::links::RelFlags,
-    };
+    use crate::{free_socket_addr, webgraph::centrality::harmonic::HarmonicCentrality};
 
     use super::*;
 
@@ -106,14 +103,8 @@ mod tests {
 
         let edges = crate::webgraph::tests::test_edges();
 
-        for (i, (from, to, label)) in edges.into_iter().enumerate() {
-            let e = Edge {
-                from: from.clone(),
-                to: to.clone(),
-                label: label.clone(),
-                rel_flags: RelFlags::default(),
-                sort_score: 0.0,
-            };
+        for (i, (from, to)) in edges.into_iter().enumerate() {
+            let e = Edge::new_test(from.clone(), to.clone());
             combined.insert(e.clone()).unwrap();
 
             if i % 2 == 0 {

+ 1 - 0
crates/core/src/entrypoint/configure.rs

@@ -117,6 +117,7 @@ fn create_webgraph() -> Result<()> {
     let mut worker = webgraph::WebgraphWorker {
         graph: crate::webgraph::Webgraph::open(&out_path, 0u64.into()).unwrap(),
         host_centrality_store: None,
+        host_rank_store: None,
         canonical_index: None,
     };
 

+ 27 - 0
crates/core/src/entrypoint/webgraph.rs

@@ -72,6 +72,7 @@ fn canonical_or_self(index: &CanonicalIndex, url: Url) -> Url {
 
 pub struct WebgraphWorker {
     pub host_centrality_store: Option<Arc<speedy_kv::Db<NodeID, f64>>>,
+    pub host_rank_store: Option<Arc<speedy_kv::Db<NodeID, u64>>>,
     pub graph: webgraph::Webgraph,
     pub canonical_index: Option<Arc<CanonicalIndex>>,
 }
@@ -109,6 +110,18 @@ impl WebgraphWorker {
                     .as_ref()
                     .and_then(|store| store.get(&source.clone().into_host().id()).unwrap())
                     .unwrap_or(0.0);
+                let source_rank = self
+                    .host_rank_store
+                    .as_ref()
+                    .and_then(|store| store.get(&source.clone().into_host().id()).unwrap())
+                    .unwrap_or(u64::MAX);
+
+                let num_outgoing_hosts_from_page = webpage
+                    .anchor_links()
+                    .into_iter()
+                    .filter_map(|l| l.destination.host_str().map(|h| h.to_string()))
+                    .unique()
+                    .count() as u64;
 
                 for mut link in webpage.anchor_links().into_iter() {
                     let mut destination = link.destination.clone();
@@ -126,6 +139,11 @@ impl WebgraphWorker {
                         .as_ref()
                         .and_then(|store| store.get(&destination.clone().into_host().id()).unwrap())
                         .unwrap_or(0.0);
+                    let destination_rank = self
+                        .host_rank_store
+                        .as_ref()
+                        .and_then(|store| store.get(&destination.clone().into_host().id()).unwrap())
+                        .unwrap_or(u64::MAX);
 
                     trace!("inserting link {:?}", link);
                     self.graph
@@ -135,6 +153,11 @@ impl WebgraphWorker {
                             rel_flags: link.rel,
                             label: link.text,
                             sort_score: source_centrality + destination_centrality,
+                            from_centrality: source_centrality,
+                            to_centrality: destination_centrality,
+                            from_rank: source_rank,
+                            to_rank: destination_rank,
+                            num_outgoing_hosts_from_page,
                         })
                         .unwrap();
                 }
@@ -177,6 +200,9 @@ impl Webgraph {
             &config.host_centrality_store_path,
         )?);
 
+        let host_rank_store =
+            Arc::new(speedy_kv::Db::open_or_create(&config.host_rank_store_path)?);
+
         let num_workers = usize::from(std::thread::available_parallelism()?);
 
         let mut handlers = Vec::new();
@@ -196,6 +222,7 @@ impl Webgraph {
             let mut worker = WebgraphWorker {
                 graph: webgraph::Webgraph::open(graph_path, config.shard)?,
                 host_centrality_store: Some(host_centrality_store.clone()),
+                host_rank_store: Some(host_rank_store.clone()),
                 canonical_index: canonical_index.clone(),
             };
 

+ 21 - 36
crates/core/src/query/optic.rs

@@ -262,7 +262,7 @@ mod tests {
             LocalSearchClient, LocalSearcher, SearchQuery,
         },
         webgraph::{Edge, Node, Webgraph},
-        webpage::{html::links::RelFlags, Html, Webpage},
+        webpage::{Html, Webpage},
     };
 
     const CONTENT: &str = "this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever";
@@ -610,49 +610,34 @@ mod tests {
         let mut graph = Webgraph::open(&dir, 0u64.into()).unwrap();
 
         graph
-            .insert(Edge {
-                from: Node::from("https://www.e.com").into_host(),
-                to: Node::from("https://www.a.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.e.com").into_host(),
+                Node::from("https://www.a.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("https://www.a.com").into_host(),
-                to: Node::from("https://www.e.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.a.com").into_host(),
+                Node::from("https://www.e.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("https://www.c.com").into_host(),
-                to: Node::from("https://www.c.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.c.com").into_host(),
+                Node::from("https://www.c.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("https://www.b.com").into_host(),
-                to: Node::from("https://www.e.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.b.com").into_host(),
+                Node::from("https://www.e.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("https://www.e.com").into_host(),
-                to: Node::from("https://www.b.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.e.com").into_host(),
+                Node::from("https://www.b.com").into_host(),
+            ))
             .unwrap();
 
         graph.commit().unwrap();

+ 2 - 5
crates/core/src/ranking/bitvec_similarity.rs

@@ -309,8 +309,7 @@ mod tests {
                 from: a.clone(),
                 to: b.clone(),
                 rel_flags: RelFlags::NOFOLLOW,
-                label: String::new(),
-                sort_score: 0.0,
+                ..Edge::empty()
             })
             .unwrap();
 
@@ -318,9 +317,7 @@ mod tests {
             .insert(Edge {
                 from: a.clone(),
                 to: c.clone(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
+                ..Edge::empty()
             })
             .unwrap();
 

+ 53 - 92
crates/core/src/ranking/inbound_similarity.rs

@@ -150,7 +150,7 @@ mod tests {
         rand_words,
         searcher::{api::ApiSearcher, LocalSearchClient, LocalSearcher, SearchQuery},
         webgraph::{Edge, EdgeLimit, Node, Webgraph},
-        webpage::{html::links::RelFlags, Html, Webpage},
+        webpage::{Html, Webpage},
     };
 
     use super::*;
@@ -171,86 +171,59 @@ mod tests {
         let mut graph = Webgraph::open(&dir, 0u64.into()).unwrap();
 
         graph
-            .insert(Edge {
-                from: Node::from("a.com").into_host(),
-                to: Node::from("b.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("a.com").into_host(),
+                Node::from("b.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("c.com").into_host(),
-                to: Node::from("d.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("c.com").into_host(),
+                Node::from("d.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("a.com").into_host(),
-                to: Node::from("e.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("a.com").into_host(),
+                Node::from("e.com").into_host(),
+            ))
             .unwrap();
 
         graph
-            .insert(Edge {
-                from: Node::from("z.com").into_host(),
-                to: Node::from("a.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("z.com").into_host(),
+                Node::from("a.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("z.com").into_host(),
-                to: Node::from("b.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("z.com").into_host(),
+                Node::from("b.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("z.com").into_host(),
-                to: Node::from("c.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("z.com").into_host(),
+                Node::from("c.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("z.com").into_host(),
-                to: Node::from("d.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("z.com").into_host(),
+                Node::from("d.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("z.com").into_host(),
-                to: Node::from("d.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("z.com").into_host(),
+                Node::from("d.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("z.com").into_host(),
-                to: Node::from("e.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("z.com").into_host(),
+                Node::from("e.com").into_host(),
+            ))
             .unwrap();
 
         graph.commit().unwrap();
@@ -269,40 +242,28 @@ mod tests {
         let mut graph = Webgraph::open(&dir, 0u64.into()).unwrap();
 
         graph
-            .insert(Edge {
-                from: Node::from("b.com").into_host(),
-                to: Node::from("a.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("b.com").into_host(),
+                Node::from("a.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("c.com").into_host(),
-                to: Node::from("d.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("c.com").into_host(),
+                Node::from("d.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("b.com").into_host(),
-                to: Node::from("e.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("b.com").into_host(),
+                Node::from("e.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("c.com").into_host(),
-                to: Node::from("b.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("c.com").into_host(),
+                Node::from("b.com").into_host(),
+            ))
             .unwrap();
 
         graph.commit().unwrap();

+ 33 - 57
crates/core/src/ranking/optics.rs

@@ -26,7 +26,7 @@ mod tests {
         index::Index,
         searcher::{api::ApiSearcher, LocalSearchClient, LocalSearcher, SearchQuery},
         webgraph::{Edge, Node, Webgraph},
-        webpage::{html::links::RelFlags, Html, Webpage},
+        webpage::{Html, Webpage},
     };
     const CONTENT: &str = "this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever this is the best example website ever";
 
@@ -39,76 +39,52 @@ mod tests {
         let mut graph = Webgraph::open(&dir, 0u64.into()).unwrap();
 
         graph
-            .insert(Edge {
-                from: Node::from("https://www.first.com").into_host(),
-                to: Node::from("https://www.nan.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.first.com").into_host(),
+                Node::from("https://www.nan.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("https://www.nan.com").into_host(),
-                to: Node::from("https://www.first.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.nan.com").into_host(),
+                Node::from("https://www.first.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("https://www.third.com").into_host(),
-                to: Node::from("https://www.third.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.third.com").into_host(),
+                Node::from("https://www.third.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("https://www.nan.com").into_host(),
-                to: Node::from("https://www.second.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.nan.com").into_host(),
+                Node::from("https://www.second.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("https://www.second.com").into_host(),
-                to: Node::from("https://www.nan.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.second.com").into_host(),
+                Node::from("https://www.nan.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("https://www.second.com").into_host(),
-                to: Node::from("https://www.third.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.second.com").into_host(),
+                Node::from("https://www.third.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("https://www.extra.com").into_host(),
-                to: Node::from("https://www.first.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.extra.com").into_host(),
+                Node::from("https://www.first.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("https://www.second.com").into_host(),
-                to: Node::from("https://www.extra.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.second.com").into_host(),
+                Node::from("https://www.extra.com").into_host(),
+            ))
             .unwrap();
         graph.commit().unwrap();
 

+ 5 - 11
crates/core/src/webgraph/centrality/betweenness.rs

@@ -177,10 +177,7 @@ mod tests {
     use file_store::temp::TempDir;
     use maplit::hashmap;
 
-    use crate::{
-        webgraph::{Edge, Webgraph},
-        webpage::html::links::RelFlags,
-    };
+    use crate::webgraph::{Edge, Webgraph};
 
     use super::*;
 
@@ -192,13 +189,10 @@ mod tests {
 
         for i in 0..n - 1 {
             graph
-                .insert(Edge {
-                    from: Node::from(i.to_string()),
-                    to: Node::from((i + 1).to_string()),
-                    label: String::new(),
-                    rel_flags: RelFlags::default(),
-                    sort_score: 0.0,
-                })
+                .insert(Edge::new_test(
+                    Node::from(i.to_string()),
+                    Node::from((i + 1).to_string()),
+                ))
                 .unwrap();
         }
         graph.commit().unwrap();

+ 81 - 179
crates/core/src/webgraph/centrality/harmonic.rs

@@ -320,7 +320,7 @@ mod tests {
         webpage::html::links::RelFlags,
     };
 
-    fn test_edges() -> Vec<(Node, Node, String)> {
+    fn test_edges() -> Vec<(Node, Node)> {
         //     ┌────┐
         //     │    │
         // ┌───A◄─┐ │
@@ -332,11 +332,11 @@ mod tests {
         //        │
         //        D
         vec![
-            (Node::from("A"), Node::from("B"), String::new()),
-            (Node::from("B"), Node::from("C"), String::new()),
-            (Node::from("A"), Node::from("C"), String::new()),
-            (Node::from("C"), Node::from("A"), String::new()),
-            (Node::from("D"), Node::from("C"), String::new()),
+            (Node::from("A"), Node::from("B")),
+            (Node::from("B"), Node::from("C")),
+            (Node::from("A"), Node::from("C")),
+            (Node::from("C"), Node::from("A")),
+            (Node::from("D"), Node::from("C")),
         ]
     }
 
@@ -346,16 +346,8 @@ mod tests {
             .open()
             .unwrap();
 
-        for (from, to, label) in test_edges() {
-            graph
-                .insert(Edge {
-                    from,
-                    to,
-                    rel_flags: RelFlags::default(),
-                    label,
-                    sort_score: 0.0,
-                })
-                .unwrap();
+        for (from, to) in test_edges() {
+            graph.insert(Edge::new_test(from, to)).unwrap();
         }
 
         graph.commit().unwrap();
@@ -371,130 +363,88 @@ mod tests {
             .unwrap();
 
         graph
-            .insert(Edge {
-                from: Node::from("A.com/1").into_host(),
-                to: Node::from("A.com/2").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("A.com/1").into_host(),
+                Node::from("A.com/2").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A.com/1").into_host(),
-                to: Node::from("A.com/3").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("A.com/1").into_host(),
+                Node::from("A.com/3").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A.com/1").into_host(),
-                to: Node::from("A.com/4").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("A.com/1").into_host(),
+                Node::from("A.com/4").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A.com/2").into_host(),
-                to: Node::from("A.com/1").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("A.com/2").into_host(),
+                Node::from("A.com/1").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A.com/2").into_host(),
-                to: Node::from("A.com/3").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("A.com/2").into_host(),
+                Node::from("A.com/3").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A.com/2").into_host(),
-                to: Node::from("A.com/4").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("A.com/2").into_host(),
+                Node::from("A.com/4").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A.com/3").into_host(),
-                to: Node::from("A.com/1").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("A.com/3").into_host(),
+                Node::from("A.com/1").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A.com/3").into_host(),
-                to: Node::from("A.com/2").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("A.com/3").into_host(),
+                Node::from("A.com/2").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A.com/3").into_host(),
-                to: Node::from("A.com/4").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("A.com/3").into_host(),
+                Node::from("A.com/4").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A.com/4").into_host(),
-                to: Node::from("A.com/1").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("A.com/4").into_host(),
+                Node::from("A.com/1").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A.com/4").into_host(),
-                to: Node::from("A.com/2").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("A.com/4").into_host(),
+                Node::from("A.com/2").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A.com/4").into_host(),
-                to: Node::from("A.com/3").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("A.com/4").into_host(),
+                Node::from("A.com/3").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("C.com").into_host(),
-                to: Node::from("B.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("C.com").into_host(),
+                Node::from("B.com").into_host(),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("D.com").into_host(),
-                to: Node::from("B.com").into_host(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("D.com").into_host(),
+                Node::from("B.com").into_host(),
+            ))
             .unwrap();
 
         graph.commit().unwrap();
@@ -533,86 +483,40 @@ mod tests {
             .open()
             .unwrap();
 
-        for (from, to, label) in test_edges() {
-            graph
-                .insert(Edge {
-                    from,
-                    to,
-                    rel_flags: RelFlags::default(),
-                    label,
-                    sort_score: 0.0,
-                })
-                .unwrap();
+        for (from, to) in test_edges() {
+            graph.insert(Edge::new_test(from, to)).unwrap();
         }
 
         graph
-            .insert(Edge {
-                from: Node::from("A"),
-                to: Node::from("B"),
-                rel_flags: RelFlags::default(),
-                label: "1".to_string(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(Node::from("A"), Node::from("B")))
+            .unwrap();
+        graph.commit().unwrap();
+        graph
+            .insert(Edge::new_test(Node::from("A"), Node::from("B")))
             .unwrap();
         graph.commit().unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A"),
-                to: Node::from("B"),
-                rel_flags: RelFlags::default(),
-                label: "2".to_string(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(Node::from("A"), Node::from("B")))
             .unwrap();
         graph.commit().unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A"),
-                to: Node::from("B"),
-                rel_flags: RelFlags::default(),
-                label: "3".to_string(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(Node::from("A"), Node::from("B")))
             .unwrap();
         graph.commit().unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A"),
-                to: Node::from("B"),
-                rel_flags: RelFlags::default(),
-                label: "4".to_string(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(Node::from("A"), Node::from("B")))
             .unwrap();
         graph.commit().unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A"),
-                to: Node::from("B"),
-                rel_flags: RelFlags::default(),
-                label: "5".to_string(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(Node::from("A"), Node::from("B")))
             .unwrap();
         graph.commit().unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A"),
-                to: Node::from("B"),
-                rel_flags: RelFlags::default(),
-                label: "6".to_string(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(Node::from("A"), Node::from("B")))
             .unwrap();
         graph.commit().unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("A"),
-                to: Node::from("B"),
-                rel_flags: RelFlags::default(),
-                label: "7".to_string(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(Node::from("A"), Node::from("B")))
             .unwrap();
         graph.commit().unwrap();
 
@@ -630,14 +534,13 @@ mod tests {
             .open()
             .unwrap();
 
-        for (from, to, label) in test_edges() {
+        for (from, to) in test_edges() {
             graph
                 .insert(Edge {
                     from,
                     to,
                     rel_flags: RelFlags::TAG,
-                    label,
-                    sort_score: 0.0,
+                    ..Edge::empty()
                 })
                 .unwrap();
         }
@@ -656,14 +559,13 @@ mod tests {
             .open()
             .unwrap();
 
-        for (from, to, label) in test_edges() {
+        for (from, to) in test_edges() {
             graph
                 .insert(Edge {
                     from,
                     to,
                     rel_flags: RelFlags::SAME_ICANN_DOMAIN,
-                    label,
-                    sort_score: 0.0,
+                    ..Edge::empty()
                 })
                 .unwrap();
         }

+ 19 - 0
crates/core/src/webgraph/document.rs

@@ -172,6 +172,11 @@ pub struct Edge {
     pub rel_flags: RelFlags,
     pub label: String,
     pub sort_score: f64,
+    pub from_centrality: f64,
+    pub to_centrality: f64,
+    pub from_rank: u64,
+    pub to_rank: u64,
+    pub num_outgoing_hosts_from_page: u64,
 }
 
 impl Edge {
@@ -182,6 +187,20 @@ impl Edge {
             rel_flags: RelFlags::default(),
             label: String::default(),
             sort_score: 0.0,
+            from_centrality: 0.0,
+            to_centrality: 0.0,
+            from_rank: 0,
+            to_rank: 0,
+            num_outgoing_hosts_from_page: 0,
+        }
+    }
+
+    #[cfg(test)]
+    pub fn new_test(from: Node, to: Node) -> Self {
+        Self {
+            from,
+            to,
+            ..Self::empty()
         }
     }
 }

+ 2 - 8
crates/core/src/webgraph/query/between.rs

@@ -153,7 +153,7 @@ impl Query for FullLinksBetweenQuery {
 
 #[cfg(test)]
 mod tests {
-    use crate::{webgraph::Webgraph, webpage::RelFlags};
+    use crate::webgraph::Webgraph;
 
     use super::*;
 
@@ -166,13 +166,7 @@ mod tests {
         let mut graph = Webgraph::builder(&temp_dir, 0u64.into()).open().unwrap();
 
         graph
-            .insert(Edge {
-                from: from.clone(),
-                to: to.clone(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(from.clone(), to.clone()))
             .unwrap();
         graph.commit().unwrap();
 

+ 26 - 49
crates/core/src/webgraph/query/collector/top_docs.rs

@@ -449,12 +449,9 @@ impl<S: DocumentScorer + 'static, D: Deduplicator + 'static> SegmentCollector
 
 #[cfg(test)]
 mod tests {
-    use crate::{
-        webgraph::{
-            query::{BacklinksQuery, HostBacklinksQuery},
-            Edge, EdgeLimit, Node, Webgraph,
-        },
-        webpage::RelFlags,
+    use crate::webgraph::{
+        query::{BacklinksQuery, HostBacklinksQuery},
+        Edge, EdgeLimit, Node, Webgraph,
     };
 
     #[test]
@@ -463,13 +460,10 @@ mod tests {
         let mut graph = Webgraph::builder(&temp_dir, 0u64.into()).open().unwrap();
 
         graph
-            .insert(Edge {
-                from: Node::from("https://A.com/1"),
-                to: Node::from("https://B.com/1"),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://A.com/1"),
+                Node::from("https://B.com/1"),
+            ))
             .unwrap();
 
         graph.commit().unwrap();
@@ -488,22 +482,16 @@ mod tests {
         let mut graph = Webgraph::builder(&temp_dir, 0u64.into()).open().unwrap();
 
         graph
-            .insert(Edge {
-                from: Node::from("https://A.com/1"),
-                to: Node::from("https://B.com/1"),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://A.com/1"),
+                Node::from("https://B.com/1"),
+            ))
             .unwrap();
         graph
-            .insert(Edge {
-                from: Node::from("https://A.com/2"),
-                to: Node::from("https://B.com/1"),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://A.com/2"),
+                Node::from("https://B.com/1"),
+            ))
             .unwrap();
 
         graph.commit().unwrap();
@@ -529,24 +517,16 @@ mod tests {
         let mut graph = Webgraph::builder(&temp_dir, 0u64.into()).open().unwrap();
 
         graph
-            .insert(Edge {
-                from: Node::from("https://A.com/1"),
-                to: Node::from("https://B.com/1"),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://A.com/1"),
+                Node::from("https://B.com/1"),
+            ))
             .unwrap();
-        graph.commit().unwrap();
-
         graph
-            .insert(Edge {
-                from: Node::from("https://A.com/2"),
-                to: Node::from("https://B.com/1"),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://A.com/2"),
+                Node::from("https://B.com/1"),
+            ))
             .unwrap();
 
         graph.commit().unwrap();
@@ -569,27 +549,24 @@ mod tests {
             .insert(Edge {
                 from: Node::from("https://A.com/1"),
                 to: Node::from("https://B.com/1"),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
                 sort_score: 0.1,
+                ..Edge::empty()
             })
             .unwrap();
         graph
             .insert(Edge {
                 from: Node::from("https://A.com/2"),
                 to: Node::from("https://B.com/1"),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
                 sort_score: 0.1,
+                ..Edge::empty()
             })
             .unwrap();
         graph
             .insert(Edge {
                 from: Node::from("https://C.com/1"),
                 to: Node::from("https://B.com/1"),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
                 sort_score: 0.0,
+                ..Edge::empty()
             })
             .unwrap();
 

+ 11 - 22
crates/core/src/webgraph/query/filter/and.rs

@@ -124,21 +124,18 @@ impl super::InvertedIndexFilter for AndInvertedIndexFilter {
 mod tests {
     use file_store::temp::TempDir;
 
-    use crate::{
-        webgraph::{
-            query::{ForwardlinksQuery, TextFilter},
-            schema::ToUrl,
-            Edge, Node, Webgraph,
-        },
-        webpage::RelFlags,
+    use crate::webgraph::{
+        query::{ForwardlinksQuery, TextFilter},
+        schema::ToUrl,
+        Edge, Node, Webgraph,
     };
 
-    pub fn test_edges() -> Vec<(Node, Node, String)> {
+    pub fn test_edges() -> Vec<(Node, Node)> {
         vec![
-            (Node::from("a.com"), Node::from("b.com/123"), String::new()),
-            (Node::from("a.com"), Node::from("b.dk/123"), String::new()),
-            (Node::from("a.com"), Node::from("b.com/321"), String::new()),
-            (Node::from("a.com"), Node::from("c.com"), String::new()),
+            (Node::from("a.com"), Node::from("b.com/123")),
+            (Node::from("a.com"), Node::from("b.dk/123")),
+            (Node::from("a.com"), Node::from("b.com/321")),
+            (Node::from("a.com"), Node::from("c.com")),
         ]
     }
 
@@ -146,16 +143,8 @@ mod tests {
         let temp_dir = crate::gen_temp_dir().unwrap();
         let mut graph = Webgraph::builder(&temp_dir, 0u64.into()).open().unwrap();
 
-        for (from, to, label) in test_edges() {
-            graph
-                .insert(Edge {
-                    from,
-                    to,
-                    rel_flags: RelFlags::default(),
-                    label,
-                    sort_score: 0.0,
-                })
-                .unwrap();
+        for (from, to) in test_edges() {
+            graph.insert(Edge::new_test(from, to)).unwrap();
         }
 
         graph.commit().unwrap();

+ 12 - 23
crates/core/src/webgraph/query/filter/not.rs

@@ -93,24 +93,21 @@ impl super::InvertedIndexFilter for NotInvertedIndexFilter {
 mod tests {
     use file_store::temp::TempDir;
 
-    use crate::{
-        webgraph::{
-            query::{FullForwardlinksQuery, OrFilter, TextFilter},
-            schema::ToUrl,
-            Edge, Node, Webgraph,
-        },
-        webpage::RelFlags,
+    use crate::webgraph::{
+        query::{FullForwardlinksQuery, OrFilter, TextFilter},
+        schema::ToUrl,
+        Edge, Node, Webgraph,
     };
 
     use super::*;
 
-    pub fn test_edges() -> Vec<(Node, Node, String)> {
+    pub fn test_edges() -> Vec<(Node, Node)> {
         vec![
-            (Node::from("a.com"), Node::from("b.com/123"), String::new()),
-            (Node::from("a.com"), Node::from("b.dk/123"), String::new()),
-            (Node::from("a.com"), Node::from("b.se/123"), String::new()),
-            (Node::from("a.com"), Node::from("b.com/321"), String::new()),
-            (Node::from("a.com"), Node::from("c.com"), String::new()),
+            (Node::from("a.com"), Node::from("b.com/123")),
+            (Node::from("a.com"), Node::from("b.dk/123")),
+            (Node::from("a.com"), Node::from("b.se/123")),
+            (Node::from("a.com"), Node::from("b.com/321")),
+            (Node::from("a.com"), Node::from("c.com")),
         ]
     }
 
@@ -118,16 +115,8 @@ mod tests {
         let temp_dir = crate::gen_temp_dir().unwrap();
         let mut graph = Webgraph::builder(&temp_dir, 0u64.into()).open().unwrap();
 
-        for (from, to, label) in test_edges() {
-            graph
-                .insert(Edge {
-                    from,
-                    to,
-                    rel_flags: RelFlags::default(),
-                    label,
-                    sort_score: 0.0,
-                })
-                .unwrap();
+        for (from, to) in test_edges() {
+            graph.insert(Edge::new_test(from, to)).unwrap();
         }
 
         graph.commit().unwrap();

+ 12 - 23
crates/core/src/webgraph/query/filter/or.rs

@@ -137,24 +137,21 @@ impl super::InvertedIndexFilter for OrInvertedIndexFilter {
 mod tests {
     use file_store::temp::TempDir;
 
-    use crate::{
-        webgraph::{
-            query::{FullForwardlinksQuery, TextFilter},
-            schema::ToUrl,
-            Edge, Node, Webgraph,
-        },
-        webpage::RelFlags,
+    use crate::webgraph::{
+        query::{FullForwardlinksQuery, TextFilter},
+        schema::ToUrl,
+        Edge, Node, Webgraph,
     };
 
     use super::*;
 
-    pub fn test_edges() -> Vec<(Node, Node, String)> {
+    pub fn test_edges() -> Vec<(Node, Node)> {
         vec![
-            (Node::from("a.com"), Node::from("b.com/123"), String::new()),
-            (Node::from("a.com"), Node::from("b.dk/123"), String::new()),
-            (Node::from("a.com"), Node::from("b.se/123"), String::new()),
-            (Node::from("a.com"), Node::from("b.com/321"), String::new()),
-            (Node::from("a.com"), Node::from("c.com"), String::new()),
+            (Node::from("a.com"), Node::from("b.com/123")),
+            (Node::from("a.com"), Node::from("b.dk/123")),
+            (Node::from("a.com"), Node::from("b.se/123")),
+            (Node::from("a.com"), Node::from("b.com/321")),
+            (Node::from("a.com"), Node::from("c.com")),
         ]
     }
 
@@ -162,16 +159,8 @@ mod tests {
         let temp_dir = crate::gen_temp_dir().unwrap();
         let mut graph = Webgraph::builder(&temp_dir, 0u64.into()).open().unwrap();
 
-        for (from, to, label) in test_edges() {
-            graph
-                .insert(Edge {
-                    from,
-                    to,
-                    rel_flags: RelFlags::default(),
-                    label,
-                    sort_score: 0.0,
-                })
-                .unwrap();
+        for (from, to) in test_edges() {
+            graph.insert(Edge::new_test(from, to)).unwrap();
         }
 
         graph.commit().unwrap();

+ 1 - 2
crates/core/src/webgraph/query/filter/rel_flag.rs

@@ -113,8 +113,7 @@ mod tests {
                     from,
                     to,
                     rel_flags,
-                    label: String::new(),
-                    sort_score: 0.0,
+                    ..Edge::empty()
                 })
                 .unwrap();
         }

+ 12 - 23
crates/core/src/webgraph/query/filter/text.rs

@@ -76,24 +76,21 @@ impl super::InvertedIndexFilter for TextInvertedIndexFilter {
 mod tests {
     use file_store::temp::TempDir;
 
-    use crate::{
-        webgraph::{
-            query::{BacklinksQuery, ForwardlinksQuery, FullBacklinksQuery, FullForwardlinksQuery},
-            schema::{FromUrl, ToUrl},
-            Edge, Node, Webgraph,
-        },
-        webpage::RelFlags,
+    use crate::webgraph::{
+        query::{BacklinksQuery, ForwardlinksQuery, FullBacklinksQuery, FullForwardlinksQuery},
+        schema::{FromUrl, ToUrl},
+        Edge, Node, Webgraph,
     };
 
     use super::*;
 
-    pub fn test_edges() -> Vec<(Node, Node, String)> {
+    pub fn test_edges() -> Vec<(Node, Node)> {
         vec![
-            (Node::from("a.com"), Node::from("b.com"), String::new()),
-            (Node::from("a.com"), Node::from("b.dk"), String::new()),
-            (Node::from("b.com"), Node::from("b.dk"), String::new()),
-            (Node::from("c.dk"), Node::from("b.dk"), String::new()),
-            (Node::from("c.com"), Node::from("a.com"), String::new()),
+            (Node::from("a.com"), Node::from("b.com")),
+            (Node::from("a.com"), Node::from("b.dk")),
+            (Node::from("b.com"), Node::from("b.dk")),
+            (Node::from("c.dk"), Node::from("b.dk")),
+            (Node::from("c.com"), Node::from("a.com")),
         ]
     }
 
@@ -101,16 +98,8 @@ mod tests {
         let temp_dir = crate::gen_temp_dir().unwrap();
         let mut graph = Webgraph::builder(&temp_dir, 0u64.into()).open().unwrap();
 
-        for (from, to, label) in test_edges() {
-            graph
-                .insert(Edge {
-                    from,
-                    to,
-                    rel_flags: RelFlags::default(),
-                    label,
-                    sort_score: 0.0,
-                })
-                .unwrap();
+        for (from, to) in test_edges() {
+            graph.insert(Edge::new_test(from, to)).unwrap();
         }
 
         graph.commit().unwrap();

+ 17 - 53
crates/core/src/webgraph/query/raw/links.rs

@@ -238,10 +238,7 @@ impl tantivy::DocSet for LinksScorer {
 
 #[cfg(test)]
 mod tests {
-    use crate::{
-        webgraph::{query::HostBacklinksQuery, Edge, Node, Webgraph},
-        webpage::RelFlags,
-    };
+    use crate::webgraph::{query::HostBacklinksQuery, Edge, Node, Webgraph};
 
     #[test]
     fn test_simple() {
@@ -253,23 +250,11 @@ mod tests {
         let node_c = Node::from("C");
 
         graph
-            .insert(Edge {
-                from: node_a.clone(),
-                to: node_b.clone(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(node_a.clone(), node_b.clone()))
             .unwrap();
 
         graph
-            .insert(Edge {
-                from: node_c.clone(),
-                to: node_b.clone(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(node_c.clone(), node_b.clone()))
             .unwrap();
 
         graph.commit().unwrap();
@@ -290,23 +275,11 @@ mod tests {
         let node_b = Node::from("B");
 
         graph
-            .insert(Edge {
-                from: node_a.clone(),
-                to: node_b.clone(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(node_a.clone(), node_b.clone()))
             .unwrap();
 
         graph
-            .insert(Edge {
-                from: node_b.clone(),
-                to: node_b.clone(),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(node_b.clone(), node_b.clone()))
             .unwrap();
 
         graph.commit().unwrap();
@@ -323,33 +296,24 @@ mod tests {
         let mut graph = Webgraph::builder(&temp_dir, 0u64.into()).open().unwrap();
 
         graph
-            .insert(Edge {
-                from: Node::from("https://A.com/1"),
-                to: Node::from("https://B.com/1"),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://A.com/1"),
+                Node::from("https://B.com/1"),
+            ))
             .unwrap();
 
         graph
-            .insert(Edge {
-                from: Node::from("https://A.com/2"),
-                to: Node::from("https://B.com/2"),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://A.com/2"),
+                Node::from("https://B.com/2"),
+            ))
             .unwrap();
 
         graph
-            .insert(Edge {
-                from: Node::from("https://A.com/3"),
-                to: Node::from("https://B.com/3"),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://A.com/3"),
+                Node::from("https://B.com/3"),
+            ))
             .unwrap();
 
         graph.commit().unwrap();

+ 164 - 0
crates/core/src/webgraph/schema.rs

@@ -321,6 +321,160 @@ impl Field for SortScore {
     }
 }
 
+#[derive(Clone, Copy, Debug, bincode::Encode, bincode::Decode)]
+pub struct FromCentrality;
+impl Field for FromCentrality {
+    fn name(&self) -> &'static str {
+        "from_centrality"
+    }
+
+    fn document_value<'a>(&self, edge: &'a Edge) -> ReferenceValue<'a> {
+        ReferenceValue::F64(edge.from_centrality)
+    }
+
+    fn set_value(&self, edge: &mut Edge, value: OwnedValue) -> Result<()> {
+        // `ok_or_else` builds the "Invalid centrality" error only when `as_f64()` is `None`.
+        edge.from_centrality = value
+            .as_ref()
+            .as_f64()
+            .ok_or_else(|| anyhow::anyhow!("Invalid centrality"))?;
+        Ok(())
+    }
+
+    fn field_type(&self) -> tantivy::schema::FieldType {
+        FieldType::F64(
+            NumericOptions::default()
+                .set_indexed()
+                .set_stored()
+                .set_columnar(),
+        )
+    }
+}
+
+#[derive(Clone, Copy, Debug, bincode::Encode, bincode::Decode)]
+pub struct ToCentrality;
+
+impl Field for ToCentrality {
+    fn name(&self) -> &'static str {
+        "to_centrality"
+    }
+
+    fn document_value<'a>(&self, edge: &'a Edge) -> ReferenceValue<'a> {
+        ReferenceValue::F64(edge.to_centrality)
+    }
+
+    fn set_value(&self, edge: &mut Edge, value: OwnedValue) -> Result<()> {
+        // `ok_or_else` builds the "Invalid centrality" error only when `as_f64()` is `None`.
+        edge.to_centrality = value
+            .as_ref()
+            .as_f64()
+            .ok_or_else(|| anyhow::anyhow!("Invalid centrality"))?;
+        Ok(())
+    }
+
+    fn field_type(&self) -> tantivy::schema::FieldType {
+        FieldType::F64(
+            NumericOptions::default()
+                .set_indexed()
+                .set_stored()
+                .set_columnar(),
+        )
+    }
+}
+
+#[derive(Clone, Copy, Debug, bincode::Encode, bincode::Decode)]
+pub struct FromRank;
+
+impl Field for FromRank {
+    fn name(&self) -> &'static str {
+        "from_rank"
+    }
+
+    fn document_value<'a>(&self, edge: &'a Edge) -> ReferenceValue<'a> {
+        ReferenceValue::U64(edge.from_rank)
+    }
+
+    fn set_value(&self, edge: &mut Edge, value: OwnedValue) -> Result<()> {
+        // `ok_or_else` builds the "Invalid rank" error only when `as_u64()` is `None`.
+        edge.from_rank = value
+            .as_ref()
+            .as_u64()
+            .ok_or_else(|| anyhow::anyhow!("Invalid rank"))?;
+        Ok(())
+    }
+
+    fn field_type(&self) -> tantivy::schema::FieldType {
+        FieldType::U64(
+            NumericOptions::default()
+                .set_indexed()
+                .set_stored()
+                .set_columnar(),
+        )
+    }
+}
+
+#[derive(Clone, Copy, Debug, bincode::Encode, bincode::Decode)]
+pub struct ToRank;
+
+impl Field for ToRank {
+    fn name(&self) -> &'static str {
+        "to_rank"
+    }
+
+    fn document_value<'a>(&self, edge: &'a Edge) -> ReferenceValue<'a> {
+        ReferenceValue::U64(edge.to_rank)
+    }
+
+    fn set_value(&self, edge: &mut Edge, value: OwnedValue) -> Result<()> {
+        // `ok_or_else` builds the "Invalid rank" error only when `as_u64()` is `None`.
+        edge.to_rank = value
+            .as_ref()
+            .as_u64()
+            .ok_or_else(|| anyhow::anyhow!("Invalid rank"))?;
+        Ok(())
+    }
+
+    fn field_type(&self) -> tantivy::schema::FieldType {
+        FieldType::U64(
+            NumericOptions::default()
+                .set_indexed()
+                .set_stored()
+                .set_columnar(),
+        )
+    }
+}
+
+#[derive(Clone, Copy, Debug, bincode::Encode, bincode::Decode)]
+pub struct NumOutgoingHostsFromPage;
+
+impl Field for NumOutgoingHostsFromPage {
+    fn name(&self) -> &'static str {
+        "num_outgoing_hosts_from_page"
+    }
+
+    fn document_value<'a>(&self, edge: &'a Edge) -> ReferenceValue<'a> {
+        ReferenceValue::U64(edge.num_outgoing_hosts_from_page)
+    }
+
+    fn set_value(&self, edge: &mut Edge, value: OwnedValue) -> Result<()> {
+        // `ok_or_else` builds the error only when `as_u64()` is `None`, not on every call.
+        edge.num_outgoing_hosts_from_page = value
+            .as_ref()
+            .as_u64()
+            .ok_or_else(|| anyhow::anyhow!("Invalid number of outgoing hosts"))?;
+        Ok(())
+    }
+
+    fn field_type(&self) -> tantivy::schema::FieldType {
+        FieldType::U64(
+            NumericOptions::default()
+                .set_indexed()
+                .set_stored()
+                .set_columnar(),
+        )
+    }
+}
+
 #[enum_dispatch(Field)]
 #[derive(Clone, Copy, Debug, EnumDiscriminants, bincode::Encode, bincode::Decode)]
 #[strum_discriminants(derive(VariantArray))]
@@ -334,6 +488,11 @@ pub enum FieldEnum {
     RelFlags,
     Label,
     SortScore,
+    FromCentrality,
+    ToCentrality,
+    FromRank,
+    ToRank,
+    NumOutgoingHostsFromPage,
 }
 
 impl FieldEnum {
@@ -356,6 +515,11 @@ enum_dispatch_from_discriminant!(FieldEnumDiscriminants => FieldEnum,
   RelFlags,
   Label,
   SortScore,
+  FromCentrality,
+  ToCentrality,
+  FromRank,
+  ToRank,
+  NumOutgoingHostsFromPage,
 ]);
 
 impl crate::enum_map::InsertEnumMapKey for FieldEnumDiscriminants {

+ 31 - 28
crates/core/src/webgraph/store.rs

@@ -425,13 +425,10 @@ mod tests {
         let temp_dir = crate::gen_temp_dir().unwrap();
         let mut store = EdgeStore::open(&temp_dir, ShardId::new(0)).unwrap();
 
-        let e = Edge {
-            from: Node::from("https://www.first.com").into_host(),
-            to: Node::from("https://www.second.com").into_host(),
-            label: "test".to_string(),
-            rel_flags: RelFlags::default(),
-            sort_score: 0.0,
-        };
+        let e = Edge::new_test(
+            Node::from("https://www.first.com").into_host(),
+            Node::from("https://www.second.com").into_host(),
+        );
         let from_node_id = e.from.id();
         let to_node_id = e.to.id();
 
@@ -477,6 +474,11 @@ mod tests {
             label: "test".to_string(),
             rel_flags: RelFlags::default(),
             sort_score: a_centrality + b_centrality,
+            from_centrality: b_centrality,
+            to_centrality: a_centrality,
+            from_rank: 2,
+            to_rank: 1,
+            ..Edge::empty()
         };
 
         let e2 = Edge {
@@ -485,6 +487,11 @@ mod tests {
             label: "2".to_string(),
             rel_flags: RelFlags::default(),
             sort_score: a_centrality + c_centrality,
+            from_centrality: c_centrality,
+            to_centrality: a_centrality,
+            from_rank: 3,
+            to_rank: 1,
+            ..Edge::empty()
         };
 
         let e3 = Edge {
@@ -493,6 +500,11 @@ mod tests {
             label: "3".to_string(),
             rel_flags: RelFlags::default(),
             sort_score: a_centrality + d_centrality,
+            from_centrality: d_centrality,
+            to_centrality: a_centrality,
+            from_rank: 4,
+            to_rank: 1,
+            ..Edge::empty()
         };
 
         store.insert(e1.clone()).unwrap();
@@ -517,35 +529,26 @@ mod tests {
         let mut store = EdgeStore::open(&temp_dir, ShardId::new(0)).unwrap();
 
         store
-            .insert(Edge {
-                from: Node::from("https://www.first.com").into_host(),
-                to: Node::from("https://www.second.com").into_host(),
-                label: String::new(),
-                rel_flags: RelFlags::default(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.first.com").into_host(),
+                Node::from("https://www.second.com").into_host(),
+            ))
             .unwrap();
         store.commit().unwrap();
 
         store
-            .insert(Edge {
-                from: Node::from("https://www.second.com").into_host(),
-                to: Node::from("https://www.first.com").into_host(),
-                label: String::new(),
-                rel_flags: RelFlags::default(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.second.com").into_host(),
+                Node::from("https://www.first.com").into_host(),
+            ))
             .unwrap();
         store.commit().unwrap();
 
         store
-            .insert(Edge {
-                from: Node::from("https://www.third.com").into_host(),
-                to: Node::from("https://www.first.com").into_host(),
-                label: String::new(),
-                rel_flags: RelFlags::default(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::from("https://www.third.com").into_host(),
+                Node::from("https://www.first.com").into_host(),
+            ))
             .unwrap();
         store.commit().unwrap();
 

+ 51 - 113
crates/core/src/webgraph/tests.rs

@@ -55,13 +55,13 @@ use crate::webpage::html::links::RelFlags;
 use file_store::temp::TempDir;
 use proptest::prelude::*;
 
-pub fn test_edges() -> Vec<(Node, Node, String)> {
+pub fn test_edges() -> Vec<(Node, Node)> {
     vec![
-        (Node::from("A"), Node::from("B"), String::new()),
-        (Node::from("B"), Node::from("C"), String::new()),
-        (Node::from("A"), Node::from("C"), String::new()),
-        (Node::from("C"), Node::from("A"), String::new()),
-        (Node::from("D"), Node::from("C"), String::new()),
+        (Node::from("A"), Node::from("B")),
+        (Node::from("B"), Node::from("C")),
+        (Node::from("A"), Node::from("C")),
+        (Node::from("C"), Node::from("A")),
+        (Node::from("D"), Node::from("C")),
     ]
 }
 
@@ -80,16 +80,8 @@ pub fn test_graph() -> (Webgraph, TempDir) {
     let temp_dir = crate::gen_temp_dir().unwrap();
     let mut graph = Webgraph::builder(&temp_dir, 0u64.into()).open().unwrap();
 
-    for (from, to, label) in test_edges() {
-        graph
-            .insert(Edge {
-                from,
-                to,
-                rel_flags: RelFlags::default(),
-                label,
-                sort_score: 0.0,
-            })
-            .unwrap();
+    for (from, to) in test_edges() {
+        graph.insert(Edge::new_test(from, to)).unwrap();
     }
 
     graph.commit().unwrap();
@@ -146,28 +138,20 @@ fn reversed_distance_calculation() {
 fn merge_path() {
     let mut graphs = Vec::new();
     let temp_dir = crate::gen_temp_dir().unwrap();
-    for (i, (from, to, label)) in (0..).zip([
-        (Node::from("A"), Node::from("B"), String::new()),
-        (Node::from("B"), Node::from("C"), String::new()),
-        (Node::from("C"), Node::from("D"), String::new()),
-        (Node::from("D"), Node::from("E"), String::new()),
-        (Node::from("E"), Node::from("F"), String::new()),
-        (Node::from("F"), Node::from("G"), String::new()),
-        (Node::from("G"), Node::from("H"), String::new()),
+    for (i, (from, to)) in (0..).zip([
+        (Node::from("A"), Node::from("B")),
+        (Node::from("B"), Node::from("C")),
+        (Node::from("C"), Node::from("D")),
+        (Node::from("D"), Node::from("E")),
+        (Node::from("E"), Node::from("F")),
+        (Node::from("F"), Node::from("G")),
+        (Node::from("G"), Node::from("H")),
     ]) {
         let mut graph =
             Webgraph::builder(&temp_dir.as_ref().join(format!("test_{}", i)), 0u64.into())
                 .open()
                 .unwrap();
-        graph
-            .insert(Edge {
-                from,
-                to,
-                rel_flags: RelFlags::default(),
-                label,
-                sort_score: 0.0,
-            })
-            .unwrap();
+        graph.insert(Edge::new_test(from, to)).unwrap();
         graph.commit().unwrap();
         graphs.push(graph);
     }
@@ -199,20 +183,12 @@ fn merge_path() {
 fn merge_simple() {
     let mut graphs = Vec::new();
     let temp_dir = crate::gen_temp_dir().unwrap();
-    for (i, (from, to, label)) in (0..).zip(test_edges()) {
+    for (i, (from, to)) in (0..).zip(test_edges()) {
         let mut graph =
             Webgraph::builder(&temp_dir.as_ref().join(format!("test_{}", i)), 0u64.into())
                 .open()
                 .unwrap();
-        graph
-            .insert(Edge {
-                from,
-                to,
-                rel_flags: RelFlags::default(),
-                label,
-                sort_score: 0.0,
-            })
-            .unwrap();
+        graph.insert(Edge::new_test(from, to)).unwrap();
         graph.commit().unwrap();
         graphs.push(graph);
     }
@@ -275,25 +251,17 @@ fn merge_simple() {
 fn merge_cycle() {
     let mut graphs = Vec::new();
     let temp_dir = crate::gen_temp_dir().unwrap();
-    for (i, (from, to, label)) in (0..).zip([
-        (Node::from("A"), Node::from("B"), String::new()),
-        (Node::from("B"), Node::from("A"), String::new()),
-        (Node::from("B"), Node::from("C"), String::new()),
-        (Node::from("C"), Node::from("A"), String::new()),
+    for (i, (from, to)) in (0..).zip([
+        (Node::from("A"), Node::from("B")),
+        (Node::from("B"), Node::from("A")),
+        (Node::from("B"), Node::from("C")),
+        (Node::from("C"), Node::from("A")),
     ]) {
         let mut graph =
             Webgraph::builder(&temp_dir.as_ref().join(format!("test_{}", i)), 0u64.into())
                 .open()
                 .unwrap();
-        graph
-            .insert(Edge {
-                from,
-                to,
-                rel_flags: RelFlags::default(),
-                label,
-                sort_score: 0.0,
-            })
-            .unwrap();
+        graph.insert(Edge::new_test(from, to)).unwrap();
         graph.commit().unwrap();
         graphs.push(graph);
     }
@@ -344,25 +312,17 @@ fn merge_cycle() {
 fn merge_star() {
     let mut graphs = Vec::new();
     let temp_dir = crate::gen_temp_dir().unwrap();
-    for (i, (from, to, label)) in (0..).zip([
-        (Node::from("A"), Node::from("B"), String::new()),
-        (Node::from("A"), Node::from("C"), String::new()),
-        (Node::from("A"), Node::from("D"), String::new()),
-        (Node::from("A"), Node::from("E"), String::new()),
+    for (i, (from, to)) in (0..).zip([
+        (Node::from("A"), Node::from("B")),
+        (Node::from("A"), Node::from("C")),
+        (Node::from("A"), Node::from("D")),
+        (Node::from("A"), Node::from("E")),
     ]) {
         let mut graph =
             Webgraph::builder(&temp_dir.as_ref().join(format!("test_{}", i)), 0u64.into())
                 .open()
                 .unwrap();
-        graph
-            .insert(Edge {
-                from,
-                to,
-                rel_flags: RelFlags::default(),
-                label,
-                sort_score: 0.0,
-            })
-            .unwrap();
+        graph.insert(Edge::new_test(from, to)).unwrap();
         graph.commit().unwrap();
         graphs.push(graph);
     }
@@ -408,25 +368,17 @@ fn merge_star() {
 fn merge_reverse_star() {
     let mut graphs = Vec::new();
     let temp_dir = crate::gen_temp_dir().unwrap();
-    for (i, (from, to, label)) in (0..).zip([
-        (Node::from("B"), Node::from("A"), String::new()),
-        (Node::from("C"), Node::from("A"), String::new()),
-        (Node::from("D"), Node::from("A"), String::new()),
-        (Node::from("E"), Node::from("A"), String::new()),
+    for (i, (from, to)) in (0..).zip([
+        (Node::from("B"), Node::from("A")),
+        (Node::from("C"), Node::from("A")),
+        (Node::from("D"), Node::from("A")),
+        (Node::from("E"), Node::from("A")),
     ]) {
         let mut graph =
             Webgraph::builder(&temp_dir.as_ref().join(format!("test_{}", i)), 0u64.into())
                 .open()
                 .unwrap();
-        graph
-            .insert(Edge {
-                from,
-                to,
-                rel_flags: RelFlags::default(),
-                label,
-                sort_score: 0.0,
-            })
-            .unwrap();
+        graph.insert(Edge::new_test(from, to)).unwrap();
         graph.commit().unwrap();
         graphs.push(graph);
     }
@@ -484,13 +436,10 @@ proptest! {
             .open()
             .unwrap();
         for (from, to) in nodes.clone() {
-            graph.insert(Edge {
-                from: Node::new_for_test(from.as_str()),
-                to: Node::new_for_test(to.as_str()),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            }).unwrap();
+            graph.insert(Edge::new_test(
+                Node::new_for_test(from.as_str()),
+                Node::new_for_test(to.as_str()),
+            )).unwrap();
 
             if rand::random::<usize>() % 10 == 0 {
                 graph.commit().unwrap();
@@ -544,13 +493,10 @@ fn proptest_case(nodes: &[(&str, &str)]) {
 
     for (i, (from, to)) in nodes.iter().enumerate() {
         graph
-            .insert(Edge {
-                from: Node::new_for_test(from),
-                to: Node::new_for_test(to),
-                rel_flags: RelFlags::default(),
-                label: String::new(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(
+                Node::new_for_test(from),
+                Node::new_for_test(to),
+            ))
             .unwrap();
 
         if i % 2 == 0 {
@@ -642,9 +588,8 @@ fn cap_label_length() {
         .insert(Edge {
             from: Node::from("A"),
             to: Node::from("B"),
-            rel_flags: RelFlags::default(),
             label: "a".repeat(MAX_LABEL_LENGTH + 1),
-            sort_score: 0.0,
+            ..Edge::empty()
         })
         .unwrap();
 
@@ -675,9 +620,9 @@ fn test_edge_limits() {
     );
 
     let mut graphs = Vec::new();
-    for (from, to, label) in &[
-        (Node::from("A"), Node::from("B"), String::new()),
-        (Node::from("A"), Node::from("C"), String::new()),
+    for (from, to) in &[
+        (Node::from("A"), Node::from("B")),
+        (Node::from("A"), Node::from("C")),
     ] {
         let mut graph = Webgraph::builder(
             &temp_dir.as_ref().join(uuid::Uuid::new_v4().to_string()),
@@ -686,13 +631,7 @@ fn test_edge_limits() {
         .open()
         .unwrap();
         graph
-            .insert(Edge {
-                from: from.clone(),
-                to: to.clone(),
-                rel_flags: RelFlags::default(),
-                label: label.clone(),
-                sort_score: 0.0,
-            })
+            .insert(Edge::new_test(from.clone(), to.clone()))
             .unwrap();
         graph.commit().unwrap();
         graphs.push(graph);
@@ -759,8 +698,7 @@ fn test_rel_flags() {
             from: Node::from("A"),
             to: Node::from("B"),
             rel_flags: RelFlags::IS_IN_FOOTER | RelFlags::TAG,
-            label: String::new(),
-            sort_score: 0.0,
+            ..Edge::empty()
         })
         .unwrap();