automatically build autosuggest from search index keywords

Mikkel Denker 1 year ago
parent
commit
cef67d6aaf

+ 2 - 1
assets/licenses.html

@@ -48,7 +48,7 @@
             <li><a href="#MIT">MIT License</a> (187)</li>
             <li><a href="#MIT">MIT License</a> (187)</li>
             <li><a href="#BSD-3-Clause">BSD 3-Clause &quot;New&quot; or &quot;Revised&quot; License</a> (9)</li>
             <li><a href="#BSD-3-Clause">BSD 3-Clause &quot;New&quot; or &quot;Revised&quot; License</a> (9)</li>
             <li><a href="#MPL-2.0">Mozilla Public License 2.0</a> (8)</li>
             <li><a href="#MPL-2.0">Mozilla Public License 2.0</a> (8)</li>
-            <li><a href="#AGPL-3.0">GNU Affero General Public License v3.0</a> (5)</li>
+            <li><a href="#AGPL-3.0">GNU Affero General Public License v3.0</a> (6)</li>
             <li><a href="#ISC">ISC License</a> (4)</li>
             <li><a href="#ISC">ISC License</a> (4)</li>
             <li><a href="#Unicode-3.0">Unicode License v3</a> (4)</li>
             <li><a href="#Unicode-3.0">Unicode License v3</a> (4)</li>
             <li><a href="#BSD-2-Clause">BSD 2-Clause &quot;Simplified&quot; License</a> (3)</li>
             <li><a href="#BSD-2-Clause">BSD 2-Clause &quot;Simplified&quot; License</a> (3)</li>
@@ -70,6 +70,7 @@
                 <ul class="license-used-by">
                     <li><a href=" https://crates.io/crates/bloom ">bloom 0.1.0</a></li>
                     <li><a href=" https://crates.io/crates/file_store ">file_store 0.1.0</a></li>
+                    <li><a href=" https://crates.io/crates/lending_iter ">lending_iter 0.1.0</a></li>
                     <li><a href=" https://crates.io/crates/ownedbytes ">ownedbytes 0.1.0</a></li>
                     <li><a href=" https://crates.io/crates/robotstxt ">robotstxt 0.1.0</a></li>
                     <li><a href=" https://crates.io/crates/speedy_kv ">speedy_kv 0.1.0</a></li>

+ 1 - 1
configs/api.toml

@@ -7,7 +7,7 @@ host = "0.0.0.0:3000"
 bangs_path = "data/bangs.json"
 # dual_encoder_model_path = "data/dual_encoder"
 prometheus_host = "0.0.0.0:3001"
-queries_csv_path = "data/queries_us.csv"
+top_phrases_for_autosuggest = 1_000

 [spell_check]
 path = "data/web_spell/checker"

+ 7 - 7
crates/core/examples/search_preindexed.rs

@@ -22,7 +22,6 @@ pub async fn main() {
     };

     let config = ApiConfig {
-        queries_csv_path: Some("data/queries_us.csv".to_string()),
         host: "0.0.0.0:8000".parse().unwrap(),
         prometheus_host: "0.0.0.0:8001".parse().unwrap(),
         crossencoder_model_path: None,
@@ -45,17 +44,18 @@ pub async fn main() {
         }),
         max_concurrent_searches: defaults::Api::max_concurrent_searches(),
         max_similar_hosts: defaults::Api::max_similar_hosts(),
+        top_phrases_for_autosuggest: defaults::Api::top_phrases_for_autosuggest(),
     };

-    let mut queries =
-        stract::autosuggest::Autosuggest::load_csv(config.queries_csv_path.as_ref().unwrap())
-            .unwrap()
-            .all()
-            .unwrap();
+    let mut searcher = LocalSearcher::new(index);

+    let mut queries: Vec<_> = searcher
+        .top_key_phrases(1_000_000)
+        .into_iter()
+        .map(|phrase| phrase.text().to_string())
+        .collect();
     queries.shuffle(&mut rand::thread_rng());

-    let mut searcher = LocalSearcher::new(index);
     searcher.set_collector_config(collector_conf);
     searcher.set_snippet_config(SnippetConfig {
         num_words_for_lang_detection: Some(250),

+ 18 - 6
crates/core/src/api/mod.rs

@@ -34,7 +34,7 @@ use crate::{
     leaky_queue::LeakyQueue,
     models::dual_encoder::DualEncoder,
     ranking::models::lambdamart::LambdaMART,
-    searcher::{api::ApiSearcher, live::LiveSearcher, DistributedSearcher},
+    searcher::{api::ApiSearcher, live::LiveSearcher, DistributedSearcher, SearchClient},
     similar_hosts::SimilarHostsFinder,
     webgraph::remote::RemoteWebgraph,
 };
@@ -148,11 +148,6 @@ fn build_router(state: Arc<State>) -> Router {
 }

 pub async fn router(config: &ApiConfig, counters: Counters) -> Result<Router> {
-    let autosuggest = match &config.queries_csv_path {
-        Some(queries_csv_path) => Autosuggest::load_csv(queries_csv_path)?,
-        None => Autosuggest::empty(),
-    };
-
     let lambda_model = match &config.lambda_model_path {
         Some(path) => Some(LambdaMART::open(path)?),
         None => None,
@@ -194,6 +189,23 @@ pub async fn router(config: &ApiConfig, counters: Counters) -> Result<Router> {
     let dist_searcher = DistributedSearcher::new(Arc::clone(&cluster)).await;
     let live_searcher = LiveSearcher::new(Arc::clone(&cluster));

+    if !cluster
+        .members()
+        .await
+        .iter()
+        .any(|m| m.service.is_searcher())
+    {
+        log::warn!("Waiting for search nodes to join the cluster");
+        cluster.await_member(|m| m.service.is_searcher()).await;
+        log::info!("Search nodes joined the cluster");
+    }
+
+    let autosuggest = Autosuggest::from_key_phrases(
+        dist_searcher
+            .top_key_phrases(config.top_phrases_for_autosuggest)
+            .await,
+    )?;
+
     let state = {
         let mut cross_encoder = None;


+ 5 - 15
crates/core/src/autosuggest.rs

@@ -21,38 +21,28 @@

 use fst::{automaton::Str, Automaton, IntoStreamer};

-use crate::Result;
-use std::path::Path;
+use crate::{inverted_index::KeyPhrase, Result};

 pub struct Autosuggest {
     queries: fst::Set<Vec<u8>>,
 }

 impl Autosuggest {
-    pub fn load_csv<P: AsRef<Path>>(path: P) -> Result<Self> {
+    pub fn from_key_phrases(key_phrases: Vec<KeyPhrase>) -> Result<Self> {
         let mut queries: Vec<String> = Vec::new();

-        let mut rdr = csv::Reader::from_path(path)?;
-        for result in rdr.records() {
-            let record = result?;
-            if let Some(query) = record.get(0) {
-                queries.push(query.to_string());
-            }
+        for key_phrase in key_phrases {
+            queries.push(key_phrase.text().to_string());
         }

         queries.sort();
+        queries.dedup();

         let queries = fst::Set::from_iter(queries)?;

         Ok(Self { queries })
     }

-    pub fn empty() -> Self {
-        Self {
-            queries: fst::Set::default(),
-        }
-    }
-
     pub fn suggestions(&self, query: &str) -> Result<Vec<String>> {
         let query = query.to_ascii_lowercase();
         let q = Str::new(query.as_str()).starts_with();

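The autosuggest dictionary is now derived from the search index itself instead of a scraped CSV. A minimal usage sketch (hypothetical variable names; it assumes a searcher exposing the new top_key_phrases method is already in scope):

    // Build the fst-backed suggester once at startup, then serve prefix queries.
    let phrases = dist_searcher.top_key_phrases(1_000).await;
    let autosuggest = Autosuggest::from_key_phrases(phrases)?;
    // Returns stored phrases that start with the lowercased prefix.
    let completions = autosuggest.suggestions("rust se")?;
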
+ 4 - 0
crates/core/src/config/defaults.rs

@@ -56,6 +56,10 @@ impl Api {
     pub fn max_similar_hosts() -> usize {
         1_000
     }
+
+    pub fn top_phrases_for_autosuggest() -> usize {
+        1_000_000
+    }
 }

 pub struct Snippet;

+ 3 - 1
crates/core/src/config/mod.rs

@@ -224,7 +224,6 @@ pub struct ApiSpellCheck {

 #[derive(Debug, serde::Serialize, serde::Deserialize, Clone)]
 pub struct ApiConfig {
-    pub queries_csv_path: Option<String>,
     pub host: SocketAddr,
     pub prometheus_host: SocketAddr,
     pub crossencoder_model_path: Option<String>,
@@ -239,6 +238,9 @@ pub struct ApiConfig {
     #[serde(default = "defaults::Api::max_similar_hosts")]
     #[serde(default = "defaults::Api::max_similar_hosts")]
     pub max_similar_hosts: usize,
     pub max_similar_hosts: usize,
 
 
+    #[serde(default = "defaults::Api::top_phrases_for_autosuggest")]
+    pub top_phrases_for_autosuggest: usize,
+
     pub spell_check: Option<ApiSpellCheck>,
     pub spell_check: Option<ApiSpellCheck>,
 
 
     #[serde(default)]
     #[serde(default)]

+ 16 - 0
crates/core/src/distributed/cluster.rs

@@ -184,4 +184,20 @@

         res
     }
+
+    pub async fn await_member<P>(&self, pred: P) -> Member
+    where
+        P: Fn(&Member) -> bool,
+    {
+        loop {
+            let members = self.members().await;
+            for member in members {
+                if pred(&member) {
+                    return member;
+                }
+            }
+
+            tokio::time::sleep(Duration::from_secs(1)).await;
+        }
+    }
 }

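await_member polls the member list once per second until the predicate matches, which is what lets the API entrypoint above block until a search node is available. A usage sketch (assuming a Cluster handle named cluster is in scope):

    // Blocks until at least one searcher has joined, then returns that member.
    let member = cluster.await_member(|m| m.service.is_searcher()).await;
    log::info!("searcher joined: {}", member.id);
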
+ 6 - 0
crates/core/src/distributed/member.rs

@@ -107,6 +107,12 @@ pub enum Service {
     },
 }

+impl Service {
+    pub fn is_searcher(&self) -> bool {
+        matches!(self, Self::Searcher { .. })
+    }
+}
+
 #[derive(PartialEq, Eq, Hash, Clone, Debug)]
 pub struct Member {
     pub id: String,

+ 3 - 3
crates/core/src/distributed/retry_strategy.rs

@@ -8,7 +8,7 @@ use std::time::Duration;
 /// A retry strategy driven by exponential back-off.
 ///
 /// The power corresponds to the number of past attempts.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Copy)]
 pub struct ExponentialBackoff {
     current: u64,
     base: u64,
@@ -21,7 +21,7 @@ impl ExponentialBackoff {
     ///
     /// The resulting duration is calculated by taking the base to the `n`-th power,
     /// where `n` denotes the number of past attempts.
-    pub fn from_millis(base: u64) -> ExponentialBackoff {
+    pub const fn from_millis(base: u64) -> ExponentialBackoff {
         ExponentialBackoff {
             current: base,
             base,
@@ -29,7 +29,7 @@ impl ExponentialBackoff {
         }
     }

-    pub fn with_limit(mut self, limit: Duration) -> Self {
+    pub const fn with_limit(mut self, limit: Duration) -> Self {
         self.max_delay = Some(limit);
         self
     }

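Deriving Copy and making the constructors const fn is what allows the shared retry policy in sonic/replication.rs below to become a compile-time constant instead of being rebuilt on every request:

    const DEFAULT_RETRY: ExponentialBackoff =
        ExponentialBackoff::from_millis(500).with_limit(Duration::from_secs(3));
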
+ 48 - 19
crates/core/src/distributed/sonic/replication.rs

@@ -21,6 +21,10 @@ use super::Result;
 use crate::distributed::{cluster::Cluster, retry_strategy::ExponentialBackoff, sonic};
 use std::{net::SocketAddr, ops::DerefMut, sync::Arc, time::Duration};

+const DEFAULT_TIMEOUT: Duration = Duration::from_secs(60);
+const DEFAULT_RETRY: ExponentialBackoff =
+    ExponentialBackoff::from_millis(500).with_limit(Duration::from_secs(3));
+
 #[derive(Debug)]
 pub struct RemoteClient<S>
 where
@@ -73,24 +77,16 @@ where
     }

     pub async fn send<R: sonic::service::Wrapper<S> + Clone>(&self, req: R) -> Result<R::Response> {
-        self.send_with_timeout_retry(
-            req,
-            Duration::from_secs(60),
-            ExponentialBackoff::from_millis(500).with_limit(Duration::from_secs(3)),
-        )
-        .await
+        self.send_with_timeout_retry(req, DEFAULT_TIMEOUT, DEFAULT_RETRY)
+            .await
     }

     pub async fn batch_send<R: sonic::service::Wrapper<S> + Clone>(
         &self,
         reqs: &[R],
     ) -> Result<Vec<R::Response>> {
-        self.batch_send_with_timeout_retry(
-            reqs,
-            Duration::from_secs(60),
-            ExponentialBackoff::from_millis(500).with_limit(Duration::from_secs(3)),
-        )
-        .await
+        self.batch_send_with_timeout_retry(reqs, DEFAULT_TIMEOUT, DEFAULT_RETRY)
+            .await
     }

     pub async fn send_with_timeout<R: sonic::service::Wrapper<S> + Clone>(
@@ -201,17 +197,18 @@ where
         &self,
         req: Req,
         client: &RemoteClient<S>,
+        timeout: Duration,
     ) -> Result<(SocketAddr, Req::Response)>
     where
         Req: sonic::service::Wrapper<S> + Clone,
     {
-        Ok((client.addr(), client.send(req).await?))
+        Ok((client.addr(), client.send_with_timeout(req, timeout).await?))
     }
-
-    pub async fn send<Req, Sel>(
+    pub async fn send_with_timeout<Req, Sel>(
         &self,
         req: Req,
         selector: &Sel,
+        timeout: Duration,
     ) -> Result<Vec<(SocketAddr, Req::Response)>>
     where
         Req: sonic::service::Wrapper<S> + Clone,
@@ -219,7 +216,7 @@ where
     {
         let mut futures = Vec::new();
         for client in selector.select(&self.clients) {
-            futures.push(self.send_single(req.clone(), client));
+            futures.push(self.send_single(req.clone(), client, timeout));
         }

         let mut results = Vec::new();
@@ -235,6 +232,18 @@ where
         Ok(results)
     }

+    pub async fn send<Req, Sel>(
+        &self,
+        req: Req,
+        selector: &Sel,
+    ) -> Result<Vec<(SocketAddr, Req::Response)>>
+    where
+        Req: sonic::service::Wrapper<S> + Clone,
+        Sel: ReplicaSelector<S>,
+    {
+        self.send_with_timeout(req, selector, DEFAULT_TIMEOUT).await
+    }
+
     async fn batch_send_single<Req>(
         &self,
         reqs: &[Req],
@@ -356,6 +365,7 @@ where
         req: Req,
         shard: &Shard<S, Id>,
         replica_selector: &Sel,
+        timeout: Duration,
     ) -> Result<(Id, Vec<(SocketAddr, Req::Response)>)>
     where
         Req: sonic::service::Wrapper<S> + Clone,
@@ -363,15 +373,19 @@ where
     {
         Ok((
             shard.id.clone(),
-            shard.replicas.send(req, replica_selector).await?,
+            shard
+                .replicas
+                .send_with_timeout(req, replica_selector, timeout)
+                .await?,
         ))
     }

-    pub async fn send<Req, SSel, RSel>(
+    pub async fn send_with_timeout<Req, SSel, RSel>(
         &self,
         req: Req,
         shard_selector: &SSel,
         replica_selector: &RSel,
+        timeout: Duration,
     ) -> Result<Vec<(Id, Vec<(SocketAddr, Req::Response)>)>>
     where
         Req: sonic::service::Wrapper<S> + Clone,
@@ -380,7 +394,7 @@ where
     {
         let mut futures = Vec::new();
         for shard in shard_selector.select(&self.shards) {
-            futures.push(self.send_single(req.clone(), shard, replica_selector));
+            futures.push(self.send_single(req.clone(), shard, replica_selector, timeout));
         }

         let mut results = Vec::new();
@@ -396,6 +410,21 @@ where
         Ok(results)
     }

+    pub async fn send<Req, SSel, RSel>(
+        &self,
+        req: Req,
+        shard_selector: &SSel,
+        replica_selector: &RSel,
+    ) -> Result<Vec<(Id, Vec<(SocketAddr, Req::Response)>)>>
+    where
+        Req: sonic::service::Wrapper<S> + Clone,
+        SSel: ShardSelector<S, Id>,
+        RSel: ReplicaSelector<S>,
+    {
+        self.send_with_timeout(req, shard_selector, replica_selector, DEFAULT_TIMEOUT)
+            .await
+    }
+
     async fn batch_send_single<Req, Sel>(
         &self,
         reqs: &[Req],

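send and batch_send keep their old signatures and delegate to the new *_with_timeout variants with the 60-second default, so existing call sites compile unchanged while new callers can choose a tighter deadline. A hedged caller-side sketch (the request and selector values are assumed to be in scope):

    // Explicit 5 s deadline for one sharded fan-out; plain send() uses DEFAULT_TIMEOUT.
    let results = shards
        .send_with_timeout(req.clone(), &shard_selector, &replica_selector, Duration::from_secs(5))
        .await?;
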
+ 0 - 188
crates/core/src/entrypoint/autosuggest_scrape.rs

@@ -1,188 +0,0 @@
-// Stract is an open source web search engine.
-// Copyright (C) 2023 Stract ApS
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-use std::fmt::Display;
-use std::{
-    collections::{HashSet, VecDeque},
-    path::Path,
-    str::FromStr,
-};
-
-use csv::Writer;
-use indicatif::{ProgressBar, ProgressStyle};
-
-use crate::Result;
-
-fn suggestions(query: &str, gl: &str) -> Result<Vec<String>> {
-    let url = format!(
-        "https://www.google.com/complete/search?q={}&gl={}&client=gws-wiz&xssi=t",
-        crate::urlencode(query),
-        gl
-    );
-
-    let client = reqwest::blocking::Client::new();
-    let builder = client.get(url);
-
-    let input = builder.send()?.text()?;
-
-    let mut input = input.split('\n');
-    input.next();
-    let input = input.next().expect("None option");
-
-    let output: serde_json::Value = serde_json::from_str(input)?;
-    let mut suggestions = Vec::new();
-
-    if let serde_json::Value::Array(arr) = output {
-        if let serde_json::Value::Array(arr) = arr[0].clone() {
-            for result in arr {
-                if let serde_json::Value::Array(result) = result {
-                    if let serde_json::Value::String(result) = result[0].clone() {
-                        let result = result.replace("<b>", "").replace("</b>", "");
-                        suggestions.push(result);
-                    }
-                }
-            }
-        }
-    }
-
-    Ok(suggestions)
-}
-
-#[derive(Clone)]
-pub enum Gl {
-    Us,
-}
-
-impl Display for Gl {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let code = match self {
-            Gl::Us => "us",
-        };
-        write!(f, "{code}")
-    }
-}
-
-impl FromStr for Gl {
-    type Err = crate::Error;
-
-    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
-        match s.to_ascii_lowercase().as_str() {
-            "us" => Ok(Gl::Us),
-            _ => Err(crate::Error::UnknownCLIOption),
-        }
-    }
-}
-
-fn save_queries<P: AsRef<Path>>(queries: &HashSet<String>, path: P) -> Result<()> {
-    let mut wtr = Writer::from_path(&path)?;
-
-    let mut queries: Vec<_> = queries.iter().collect();
-    queries.sort();
-
-    for query in queries {
-        wtr.write_record([query])?;
-    }
-
-    wtr.flush()?;
-
-    Ok(())
-}
-
-pub fn run<P: AsRef<Path>>(
-    queries_to_scrape: usize,
-    gl: Gl,
-    ms_sleep_between_req: u64,
-    output_dir: P,
-) -> Result<()> {
-    let mut queries = HashSet::new();
-    let mut queue = VecDeque::new();
-
-    let pb = ProgressBar::new(queries_to_scrape as u64);
-    pb.set_style(
-        ProgressStyle::default_bar()
-            .template("{spinner:.green} [{elapsed_precise}] [{wide_bar}] {pos:>7}/{len:7} ({eta})")
-            .unwrap()
-            .progress_chars("#>-"),
-    );
-
-    for c in 'a'..='z' {
-        queue.push_back(c.to_string());
-    }
-
-    let path = output_dir
-        .as_ref()
-        .join(format!("queries_{:}.csv", gl.to_string().as_str()));
-
-    let mut queries_since_last_save = 0;
-
-    while let Some(query) = queue.pop_front() {
-        let res = suggestions(&query, gl.to_string().as_str());
-
-        if res.is_err() {
-            continue;
-        }
-
-        let res = res.unwrap();
-        let mut new_queries = 0;
-
-        for next_query in res {
-            if queries.contains(&next_query) {
-                continue;
-            }
-
-            let mut new = Vec::new();
-            for c in next_query.chars() {
-                let q = new.clone().into_iter().collect();
-                new.push(c);
-
-                if !queries.contains(&q) {
-                    queue.push_back(q);
-                }
-            }
-
-            for q in next_query.split_whitespace() {
-                if !queries.contains(q) {
-                    queries.insert(q.to_string());
-                    queue.push_back(q.to_string());
-                }
-            }
-
-            new_queries += 1;
-            queries.insert(next_query);
-            pb.tick();
-            pb.set_position(queries.len() as u64);
-        }
-
-        if queries.len() >= queries_to_scrape {
-            break;
-        }
-
-        queries_since_last_save += new_queries;
-
-        if queries_since_last_save > 1_000 {
-            save_queries(&queries, &path)?;
-            queries_since_last_save = 0;
-        }
-
-        std::thread::sleep(std::time::Duration::from_millis(ms_sleep_between_req));
-    }
-
-    pb.finish();
-
-    save_queries(&queries, &path)?;
-
-    Ok(())
-}

+ 0 - 1
crates/core/src/entrypoint/configure.rs

@@ -41,7 +41,6 @@ fn download_files() {
         .unwrap()
         .block_on(async {
             for name in [
-                "queries_us.csv",
                 "sample.warc.gz",
                 "bangs.json",
                 "english-wordnet-2022-subset.ttl",

+ 0 - 1
crates/core/src/entrypoint/mod.rs

@@ -17,7 +17,6 @@
 //! The entrypoint module contains all entrypoints that runs the executables.
 pub mod ampc;
 pub mod api;
-pub mod autosuggest_scrape;
 pub mod canonical;
 mod centrality;
 #[cfg(feature = "dev")]

+ 18 - 7
crates/core/src/entrypoint/search_server.rs

@@ -21,14 +21,13 @@ use url::Url;

 use crate::{
     config,
-    distributed::sonic::service::sonic_service,
     distributed::{
         cluster::Cluster,
         member::{Member, Service},
-        sonic,
+        sonic::{self, service::sonic_service},
     },
     index::Index,
-    inverted_index::{self, RetrievedWebpage},
+    inverted_index::{self, KeyPhrase, RetrievedWebpage},
     models::dual_encoder::DualEncoder,
     ranking::models::{lambdamart::LambdaMART, linear::LinearRegression},
     searcher::{InitialWebsiteResult, LocalSearcher, SearchQuery},
@@ -42,6 +41,7 @@ sonic_service!(
         Search,
         GetWebpage,
         GetHomepageDescriptions,
+        TopKeyPhrases,
     ]
 );

@@ -93,7 +93,7 @@ impl SearchService {
     }
 }

-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode)]
+#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
 pub struct RetrieveWebsites {
     pub websites: Vec<inverted_index::WebpagePointer>,
     pub query: String,
@@ -108,7 +108,7 @@ impl sonic::service::Message<SearchService> for RetrieveWebsites {
     }
 }

-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode)]
+#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
 pub struct Search {
     pub query: SearchQuery,
 }
@@ -119,7 +119,7 @@ impl sonic::service::Message<SearchService> for Search {
     }
 }

-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode)]
+#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
 pub struct GetWebpage {
     pub url: String,
 }
@@ -130,7 +130,7 @@ impl sonic::service::Message<SearchService> for GetWebpage {
     }
 }

-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode)]
+#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
 pub struct GetHomepageDescriptions {
     #[bincode(with_serde)]
     pub urls: Vec<Url>,
@@ -152,6 +152,17 @@ impl sonic::service::Message<SearchService> for GetHomepageDescriptions {
     }
 }

+#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
+pub struct TopKeyPhrases {
+    pub top_n: usize,
+}
+impl sonic::service::Message<SearchService> for TopKeyPhrases {
+    type Response = Vec<KeyPhrase>;
+    async fn handle(self, server: &SearchService) -> Self::Response {
+        server.local_searcher.top_key_phrases(self.top_n)
+    }
+}
+
 pub async fn run(config: config::SearchServerConfig) -> Result<()> {
     let addr = config.host;
     let server = SearchService::new(config).await?.bind(addr).await.unwrap();

+ 204 - 0
crates/core/src/inverted_index/key_phrase.rs

@@ -0,0 +1,204 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+use itertools::Itertools;
+use std::str;
+use std::{cmp::Reverse, collections::HashMap};
+
+use crate::{
+    schema::text_field::{self, TextField},
+    SortableFloat,
+};
+
+const DONT_SCORE_TOP_PERCENT_OF_WORDS: f64 = 0.002;
+const NON_ALPHABETIC_CHAR_THRESHOLD: f64 = 0.25;
+
+struct Scorer {
+    word_freq: HashMap<String, u64>,
+    word_docs: HashMap<String, u64>,
+    total_doc_freq: f64,
+    num_docs: u64,
+    word_freq_threshold: u64,
+}
+
+impl Scorer {
+    fn new(searcher: &tantivy::Searcher, field: tantivy::schema::Field) -> Self {
+        let mut word_freq = HashMap::new();
+        let mut word_docs = HashMap::new();
+        let mut total_doc_freq = 0.0;
+        let mut num_docs = 0;
+
+        for seg_reader in searcher.segment_readers() {
+            let inv_index = seg_reader.inverted_index(field).unwrap();
+            let mut stream = inv_index.terms().stream().unwrap();
+
+            while let Some((term, _)) = stream.next() {
+                let term_str = str::from_utf8(term).unwrap().to_string();
+
+                let words = term_str.split_whitespace().collect::<Vec<_>>();
+                if words.is_empty() {
+                    continue;
+                }
+
+                for word in &words {
+                    *word_freq.entry(word.to_string()).or_insert(0) += 1;
+                }
+
+                for word in words.into_iter().unique() {
+                    *word_docs.entry(word.to_string()).or_insert(0) += 1;
+                }
+            }
+
+            total_doc_freq += seg_reader.num_docs() as f64;
+            num_docs += inv_index.terms().num_terms() as u64;
+        }
+
+        let num_words = word_freq.len() as f64;
+        let word_freq_threshold = *word_freq
+            .values()
+            .sorted_by(|a, b| b.cmp(a))
+            .nth((num_words * DONT_SCORE_TOP_PERCENT_OF_WORDS).ceil() as usize)
+            .unwrap_or(&0);
+
+        Self {
+            word_freq,
+            word_docs,
+            total_doc_freq,
+            num_docs,
+            word_freq_threshold,
+        }
+    }
+
+    #[inline]
+    fn word_freq(&self) -> &HashMap<String, u64> {
+        &self.word_freq
+    }
+
+    #[inline]
+    fn word_docs(&self) -> &HashMap<String, u64> {
+        &self.word_docs
+    }
+
+    #[inline]
+    fn total_doc_freq(&self) -> f64 {
+        self.total_doc_freq
+    }
+
+    #[inline]
+    fn num_docs(&self) -> u64 {
+        self.num_docs
+    }
+
+    #[inline]
+    fn word_freq_threshold(&self) -> u64 {
+        self.word_freq_threshold
+    }
+
+    fn score(&self, words: &[&str], doc_freq: u32) -> f64 {
+        let word_freq_threshold = self.word_freq_threshold();
+        let mut score = 0.0;
+        let num_words = words.len();
+        for word in words.iter().unique() {
+            let word_chars = word.chars().count();
+            if word.chars().filter(|c| !c.is_alphabetic()).count() as f64 / word_chars as f64
+                > NON_ALPHABETIC_CHAR_THRESHOLD
+            {
+                continue;
+            }
+
+            let word_docs = self.word_docs().get(*word).unwrap_or(&0);
+            let wf = *self.word_freq().get(*word).unwrap_or(&0);
+
+            if wf > word_freq_threshold {
+                continue;
+            }
+
+            let tf = (wf as f64) / num_words as f64;
+            let idf = ((self.num_docs() as f64) / (*word_docs as f64) + 1.0).ln();
+
+            score += tf * idf;
+        }
+
+        let cf = doc_freq as f64 / (self.total_doc_freq() + 1.0);
+
+        score * (1.0 - cf)
+    }
+}
+
+#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
+pub struct KeyPhrase {
+    phrase: String,
+    score: f64,
+}
+
+impl KeyPhrase {
+    pub fn new(phrase: String, score: f64) -> Self {
+        Self { phrase, score }
+    }
+
+    pub fn compute_top(reader: &tantivy::IndexReader, top_n: usize) -> Vec<Self> {
+        let searcher = reader.searcher();
+        let field = searcher
+            .schema()
+            .get_field(text_field::KeyPhrases.name())
+            .unwrap();
+
+        let scorer = Scorer::new(&searcher, field);
+
+        let mut keywords: HashMap<String, f64> = HashMap::new();
+
+        for seg_reader in searcher.segment_readers() {
+            let inv_index = seg_reader.inverted_index(field).unwrap();
+            let mut stream = inv_index.terms().stream().unwrap();
+            while let Some((term, info)) = stream.next() {
+                let term_str = str::from_utf8(term).unwrap().to_string();
+                let num_chars = term_str.chars().count();
+
+                if term_str.chars().filter(|c| !c.is_alphabetic()).count() as f64 / num_chars as f64
+                    > NON_ALPHABETIC_CHAR_THRESHOLD
+                {
+                    continue;
+                }
+
+                let words = term_str.split_whitespace().collect::<Vec<_>>();
+                let score = scorer.score(&words, info.doc_freq);
+
+                if score.is_normal() {
+                    let term_str = words.join(" ");
+                    *keywords.entry(term_str).or_default() += score;
+                }
+            }
+        }
+
+        crate::sorted_k(
+            keywords
+                .into_iter()
+                .map(|(phrase, score)| (Reverse(SortableFloat::from(score)), phrase)),
+            top_n,
+        )
+        .into_iter()
+        .map(|(Reverse(score), phrase)| KeyPhrase::new(phrase, score.into()))
+        .collect()
+    }
+
+    pub fn score(&self) -> f64 {
+        self.score
+    }
+
+    pub fn text(&self) -> &str {
+        &self.phrase
+    }
+}

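Scorer::score is a TF-IDF-style heuristic over the words of a candidate phrase: words that are mostly non-alphabetic or that fall in the most frequent 0.2% of words are skipped, and the summed score is damped by how common the phrase itself is. Schematically (names follow the struct fields; note that num_docs is accumulated from the number of terms in the key_phrases field):

    score(phrase) = (1 - cf) * sum over unique words w in phrase of tf(w) * idf(w)
        tf(w)  = word_freq(w) / len(phrase)
        idf(w) = ln(num_docs / word_docs(w) + 1)
        cf     = doc_freq(phrase) / (total_doc_freq + 1)
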
+ 12 - 123
crates/core/src/inverted_index/mod.rs

@@ -27,17 +27,19 @@
 //! but the principle is the same.

 mod indexing;
+mod key_phrase;
+mod retrieved_webpage;
 mod search;

 pub use indexing::merge_tantivy_segments;
-
-use chrono::{DateTime, NaiveDateTime};
+pub use key_phrase::KeyPhrase;
+pub use retrieved_webpage::RetrievedWebpage;

 use tantivy::directory::MmapDirectory;

-use tantivy::schema::{Schema, Value};
+use tantivy::schema::Schema;
 use tantivy::tokenizer::TokenizerManager;
-use tantivy::{IndexReader, IndexWriter, TantivyDocument};
+use tantivy::{IndexReader, IndexWriter};

 use crate::collector::{approx_count, Hashes};
 use crate::config::SnippetConfig;
@@ -45,15 +47,10 @@ use crate::numericalfield_reader::NumericalFieldReader;

 use crate::ranking::initial::Score;

-use crate::schema::text_field::TextField;
-use crate::schema::{numerical_field, text_field, Field, NumericalFieldEnum, TextFieldEnum};
-use crate::snippet::TextSnippet;
+use crate::schema::{numerical_field, Field, NumericalFieldEnum};
 use crate::tokenizer::fields::{
-    BigramTokenizer, Identity, JsonField, Stemmed, TrigramTokenizer, UrlTokenizer,
+    BigramTokenizer, Identity, JsonField, NewlineTokenizer, Stemmed, TrigramTokenizer, UrlTokenizer,
 };
-use crate::webpage::region::Region;
-
-use crate::webpage::schema_org;
 use crate::Result;
 use crate::{schema::create_schema, tokenizer::FieldTokenizer};
 use std::fs;
@@ -129,6 +126,9 @@ fn register_tokenizers(manager: &TokenizerManager) {

     let tokenizer = FieldTokenizer::Json(JsonField);
     manager.register(tokenizer.as_str(), tokenizer);
+
+    let tokenizer = FieldTokenizer::Newline(NewlineTokenizer::default());
+    manager.register(tokenizer.as_str(), tokenizer);
 }

 pub struct InvertedIndex {
@@ -169,7 +169,6 @@ impl InvertedIndex {
         register_tokenizers(tantivy_index.tokenizers());

         let reader: IndexReader = tantivy_index.reader_builder().try_into()?;
-
         let columnfield_reader = NumericalFieldReader::new(&reader.searcher());

         Ok(InvertedIndex {
@@ -220,116 +219,6 @@ pub struct SearchResult {
     pub documents: Vec<RetrievedWebpage>,
 }

-#[derive(
-    Default,
-    Debug,
-    Clone,
-    serde::Serialize,
-    serde::Deserialize,
-    bincode::Encode,
-    bincode::Decode,
-    PartialEq,
-)]
-pub struct RetrievedWebpage {
-    pub title: String,
-    pub url: String,
-    pub body: String,
-    pub snippet: TextSnippet,
-    pub dirty_body: String,
-    pub description: Option<String>,
-    pub dmoz_description: Option<String>,
-    #[bincode(with_serde)]
-    pub updated_time: Option<NaiveDateTime>,
-    pub schema_org: Vec<schema_org::Item>,
-    pub region: Region,
-    pub likely_has_ads: bool,
-    pub likely_has_paywall: bool,
-    pub recipe_first_ingredient_tag_id: Option<String>,
-    pub keywords: Vec<String>,
-}
-impl RetrievedWebpage {
-    pub fn description(&self) -> Option<&String> {
-        self.description.as_ref().or(self.dmoz_description.as_ref())
-    }
-}
-
-fn str_value(name: &str, value: &tantivy::schema::document::CompactDocValue) -> String {
-    value
-        .as_str()
-        .unwrap_or_else(|| panic!("{} field should be text", name))
-        .to_string()
-}
-
-impl From<TantivyDocument> for RetrievedWebpage {
-    fn from(doc: TantivyDocument) -> Self {
-        let mut webpage = RetrievedWebpage::default();
-
-        for (field, value) in doc.field_values() {
-            match Field::get(field.field_id() as usize) {
-                Some(Field::Text(TextFieldEnum::Title(_))) => {
-                    webpage.title = str_value(text_field::Title.name(), &value);
-                }
-                Some(Field::Text(TextFieldEnum::StemmedCleanBody(_))) => {
-                    webpage.body = str_value(text_field::StemmedCleanBody.name(), &value);
-                }
-                Some(Field::Text(TextFieldEnum::Description(_))) => {
-                    let desc = str_value(text_field::Description.name(), &value);
-                    webpage.description = if desc.is_empty() { None } else { Some(desc) }
-                }
-                Some(Field::Text(TextFieldEnum::Url(_))) => {
-                    webpage.url = str_value(text_field::Url.name(), &value);
-                }
-                Some(Field::Numerical(NumericalFieldEnum::LastUpdated(_))) => {
-                    webpage.updated_time = {
-                        let timestamp = value.as_u64().unwrap() as i64;
-                        if timestamp == 0 {
-                            None
-                        } else {
-                            DateTime::from_timestamp(timestamp, 0).map(|dt| dt.naive_utc())
-                        }
-                    }
-                }
-                Some(Field::Text(TextFieldEnum::AllBody(_))) => {
-                    webpage.dirty_body = str_value(text_field::AllBody.name(), &value);
-                }
-                Some(Field::Numerical(NumericalFieldEnum::Region(_))) => {
-                    webpage.region = {
-                        let id = value.as_u64().unwrap();
-                        Region::from_id(id)
-                    }
-                }
-                Some(Field::Text(TextFieldEnum::DmozDescription(_))) => {
-                    let desc = str_value(text_field::DmozDescription.name(), &value);
-                    webpage.dmoz_description = if desc.is_empty() { None } else { Some(desc) }
-                }
-                Some(Field::Text(TextFieldEnum::SchemaOrgJson(_))) => {
-                    let json = str_value(text_field::SchemaOrgJson.name(), &value);
-                    webpage.schema_org = serde_json::from_str(&json).unwrap_or_default();
-                }
-                Some(Field::Numerical(NumericalFieldEnum::LikelyHasAds(_))) => {
-                    webpage.likely_has_ads = value.as_bool().unwrap_or_default();
-                }
-                Some(Field::Numerical(NumericalFieldEnum::LikelyHasPaywall(_))) => {
-                    webpage.likely_has_paywall = value.as_bool().unwrap_or_default();
-                }
-                Some(Field::Text(TextFieldEnum::RecipeFirstIngredientTagId(_))) => {
-                    let tag_id = str_value(text_field::RecipeFirstIngredientTagId.name(), &value);
-                    if !tag_id.is_empty() {
-                        webpage.recipe_first_ingredient_tag_id = Some(tag_id);
-                    }
-                }
-                Some(Field::Text(TextFieldEnum::Keywords(_))) => {
-                    let keywords = str_value(text_field::Keywords.name(), &value);
-                    webpage.keywords = keywords.split('\n').map(|s| s.to_string()).collect();
-                }
-                _ => {}
-            }
-        }
-
-        webpage
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use candle_core::Tensor;
     use candle_core::Tensor;
@@ -343,7 +232,7 @@ mod tests {
         ranking::{Ranker, SignalComputer},
         search_ctx::Ctx,
         searcher::SearchQuery,
-        webpage::{Html, Webpage},
+        webpage::{schema_org, Html, Webpage},
         OneOrMany,
     };

+ 137 - 0
crates/core/src/inverted_index/retrieved_webpage.rs

@@ -0,0 +1,137 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+use chrono::{DateTime, NaiveDateTime};
+use tantivy::{schema::Value, TantivyDocument};
+
+use crate::{
+    schema::{
+        text_field::{self, TextField},
+        Field, NumericalFieldEnum, TextFieldEnum,
+    },
+    snippet::TextSnippet,
+    webpage::{schema_org, Region},
+};
+
+#[derive(
+    Default,
+    Debug,
+    Clone,
+    serde::Serialize,
+    serde::Deserialize,
+    bincode::Encode,
+    bincode::Decode,
+    PartialEq,
+)]
+pub struct RetrievedWebpage {
+    pub title: String,
+    pub url: String,
+    pub body: String,
+    pub snippet: TextSnippet,
+    pub dirty_body: String,
+    pub description: Option<String>,
+    pub dmoz_description: Option<String>,
+    #[bincode(with_serde)]
+    pub updated_time: Option<NaiveDateTime>,
+    pub schema_org: Vec<schema_org::Item>,
+    pub region: Region,
+    pub likely_has_ads: bool,
+    pub likely_has_paywall: bool,
+    pub recipe_first_ingredient_tag_id: Option<String>,
+    pub keywords: Vec<String>,
+}
+impl RetrievedWebpage {
+    pub fn description(&self) -> Option<&String> {
+        self.description.as_ref().or(self.dmoz_description.as_ref())
+    }
+}
+
+fn str_value(name: &str, value: &tantivy::schema::document::CompactDocValue) -> String {
+    value
+        .as_str()
+        .unwrap_or_else(|| panic!("{} field should be text", name))
+        .to_string()
+}
+
+impl From<TantivyDocument> for RetrievedWebpage {
+    fn from(doc: TantivyDocument) -> Self {
+        let mut webpage = RetrievedWebpage::default();
+
+        for (field, value) in doc.field_values() {
+            match Field::get(field.field_id() as usize) {
+                Some(Field::Text(TextFieldEnum::Title(_))) => {
+                    webpage.title = str_value(text_field::Title.name(), &value);
+                }
+                Some(Field::Text(TextFieldEnum::StemmedCleanBody(_))) => {
+                    webpage.body = str_value(text_field::StemmedCleanBody.name(), &value);
+                }
+                Some(Field::Text(TextFieldEnum::Description(_))) => {
+                    let desc = str_value(text_field::Description.name(), &value);
+                    webpage.description = if desc.is_empty() { None } else { Some(desc) }
+                }
+                Some(Field::Text(TextFieldEnum::Url(_))) => {
+                    webpage.url = str_value(text_field::Url.name(), &value);
+                }
+                Some(Field::Numerical(NumericalFieldEnum::LastUpdated(_))) => {
+                    webpage.updated_time = {
+                        let timestamp = value.as_u64().unwrap() as i64;
+                        if timestamp == 0 {
+                            None
+                        } else {
+                            DateTime::from_timestamp(timestamp, 0).map(|dt| dt.naive_utc())
+                        }
+                    }
+                }
+                Some(Field::Text(TextFieldEnum::AllBody(_))) => {
+                    webpage.dirty_body = str_value(text_field::AllBody.name(), &value);
+                }
+                Some(Field::Numerical(NumericalFieldEnum::Region(_))) => {
+                    webpage.region = {
+                        let id = value.as_u64().unwrap();
+                        Region::from_id(id)
+                    }
+                }
+                Some(Field::Text(TextFieldEnum::DmozDescription(_))) => {
+                    let desc = str_value(text_field::DmozDescription.name(), &value);
+                    webpage.dmoz_description = if desc.is_empty() { None } else { Some(desc) }
+                }
+                Some(Field::Text(TextFieldEnum::SchemaOrgJson(_))) => {
+                    let json = str_value(text_field::SchemaOrgJson.name(), &value);
+                    webpage.schema_org = serde_json::from_str(&json).unwrap_or_default();
+                }
+                Some(Field::Numerical(NumericalFieldEnum::LikelyHasAds(_))) => {
+                    webpage.likely_has_ads = value.as_bool().unwrap_or_default();
+                }
+                Some(Field::Numerical(NumericalFieldEnum::LikelyHasPaywall(_))) => {
+                    webpage.likely_has_paywall = value.as_bool().unwrap_or_default();
+                }
+                Some(Field::Text(TextFieldEnum::RecipeFirstIngredientTagId(_))) => {
+                    let tag_id = str_value(text_field::RecipeFirstIngredientTagId.name(), &value);
+                    if !tag_id.is_empty() {
+                        webpage.recipe_first_ingredient_tag_id = Some(tag_id);
+                    }
+                }
+                Some(Field::Text(TextFieldEnum::Keywords(_))) => {
+                    let keywords = str_value(text_field::Keywords.name(), &value);
+                    webpage.keywords = keywords.split('\n').map(|s| s.to_string()).collect();
+                }
+                _ => {}
+            }
+        }
+
+        webpage
+    }
+}

+ 5 - 0
crates/core/src/inverted_index/search.rs

@@ -14,6 +14,7 @@
 // You should have received a copy of the GNU Affero General Public License
 // along with this program.  If not, see <https://www.gnu.org/licenses/>

+use super::key_phrase::KeyPhrase;
 use super::{DocAddress, InitialSearchResult, InvertedIndex, RetrievedWebpage, WebpagePointer};
 use itertools::Itertools;
 use tantivy::collector::Count;
@@ -298,4 +299,8 @@ impl InvertedIndex {
         res.pop()
             .map(|(_, doc)| self.retrieve_doc(doc.into(), &tv_searcher).unwrap())
     }
+
+    pub(crate) fn top_key_phrases(&self, top_n: usize) -> Vec<KeyPhrase> {
+        KeyPhrase::compute_top(&self.reader, top_n)
+    }
 }

+ 8 - 5
crates/core/src/lib.rs

@@ -344,14 +344,17 @@ impl<T> OneOrMany<T> {
     }
 }

-pub trait TopKOrderable: Ord {
+pub trait TopKOrderable {
     type SortKey: Ord + Copy;

     fn sort_key(&self) -> Self::SortKey;
 }

-impl TopKOrderable for (SortableFloat, webgraph::NodeID) {
-    type SortKey = SortableFloat;
+impl<K, T> TopKOrderable for (K, T)
+where
+    K: Ord + Copy,
+{
+    type SortKey = K;

     fn sort_key(&self) -> Self::SortKey {
         self.0
@@ -391,12 +394,12 @@ where
         top_k.push(hit);
         if top_k.len() >= 2 * k {
             // The standard library does all of the heavy lifting here.
-            let (_, median_el, _) = top_k.select_nth_unstable(k - 1);
+            let (_, median_el, _) = top_k.select_nth_unstable_by_key(k - 1, |el| el.sort_key());
             threshold = Some(median_el.sort_key());
             top_k.truncate(k);
         }
     }
-    top_k.sort_unstable();
+    top_k.sort_unstable_by_key(|el| el.sort_key());
     top_k.truncate(k);
     top_k
 }

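Loosening TopKOrderable from a concrete (SortableFloat, NodeID) impl to any (K, T) tuple with K: Ord + Copy is what lets key_phrase.rs push (Reverse(SortableFloat), String) pairs through sorted_k. A sketch, under the assumption that sorted_k takes an iterator plus k and returns the k smallest elements by sort key:

    use std::cmp::Reverse;

    // Reverse turns "k smallest keys" into "k highest scores".
    let scored = vec![(Reverse(3u64), "c"), (Reverse(1u64), "a"), (Reverse(2u64), "b")];
    let top2 = sorted_k(scored.into_iter(), 2);
    assert_eq!(top2, vec![(Reverse(3u64), "c"), (Reverse(2u64), "b")]);
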
+ 0 - 17
crates/core/src/main.rs

@@ -19,7 +19,6 @@ use serde::de::DeserializeOwned;
 use std::fs;
 use std::path::Path;
 use stract::config;
-use stract::entrypoint::autosuggest_scrape::{self, Gl};

 #[cfg(feature = "dev")]
 use stract::entrypoint::configure;
@@ -81,14 +80,6 @@ enum Commands {
         config_path: String,
     },

-    /// Scrape the Google autosuggest API for search queries.
-    AutosuggestScrape {
-        num_queries: usize,
-        gl: Gl,
-        ms_sleep_between_req: u64,
-        output_dir: String,
-    },
-
     /// Deploy the crawler.
     Crawler {
         #[clap(subcommand)]
@@ -375,14 +366,6 @@ fn main() -> Result<()> {
                 .build()?
                 .block_on(entity_search_server::run(config))?;
         }
-        Commands::AutosuggestScrape {
-            num_queries: queries_to_scrape,
-            gl,
-            ms_sleep_between_req,
-            output_dir,
-        } => {
-            autosuggest_scrape::run(queries_to_scrape, gl, ms_sleep_between_req, output_dir)?;
-        }
         #[cfg(feature = "dev")]
         Commands::Configure { skip_download } => {
             configure::run(skip_download)?;

+ 72 - 22
crates/core/src/schema/text_field.rs

@@ -29,9 +29,12 @@ use crate::{
     enum_dispatch_from_discriminant,
     enum_map::InsertEnumMapKey,
     ranking::bm25::Bm25Constants,
-    tokenizer,
-    tokenizer::fields::{
-        BigramTokenizer, FieldTokenizer, Identity, JsonField, TrigramTokenizer, UrlTokenizer,
+    tokenizer::{
+        self,
+        fields::{
+            BigramTokenizer, FieldTokenizer, Identity, JsonField, NewlineTokenizer,
+            TrigramTokenizer, UrlTokenizer,
+        },
     },
     webpage::Html,
     Result,
@@ -95,6 +98,10 @@ pub trait TextField:
         false
     }

+    fn has_freqs(&self) -> bool {
+        true
+    }
+
     fn is_phrase_searchable(&self) -> bool {
         self.is_searchable() && self.has_pos()
     }
@@ -115,10 +122,10 @@ pub trait TextField:
     }

     fn record_option(&self) -> IndexRecordOption {
-        if self.has_pos() {
-            IndexRecordOption::WithFreqsAndPositions
-        } else {
-            IndexRecordOption::WithFreqs
+        match (self.has_freqs(), self.has_pos()) {
+            (true, true) => IndexRecordOption::WithFreqsAndPositions,
+            (true, false) => IndexRecordOption::WithFreqs,
+            (false, _) => IndexRecordOption::Basic,
         }
     }

@@ -190,6 +197,7 @@ pub enum TextFieldEnum {
     InsertionTimestamp,
     RecipeFirstIngredientTagId,
     Keywords,
+    KeyPhrases,
     Links,
 }

@@ -227,6 +235,7 @@ enum_dispatch_from_discriminant!(TextFieldEnumDiscriminants => TextFieldEnum,
     InsertionTimestamp,
     RecipeFirstIngredientTagId,
     Keywords,
+    KeyPhrases,
     Links,
 ]);

@@ -1516,6 +1525,41 @@ impl TextField for RecipeFirstIngredientTagId {
     }
 }

+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct Links;
+impl TextField for Links {
+    fn name(&self) -> &str {
+        "links"
+    }
+
+    fn has_pos(&self) -> bool {
+        true
+    }
+
+    fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer {
+        FieldTokenizer::Url(UrlTokenizer)
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_text(
+            self.tantivy_field(schema)
+                .unwrap_or_else(|| panic!("could not find field '{}' in index", self.name())),
+            html.anchor_links()
+                .into_iter()
+                .map(|l| l.destination.as_str().to_string())
+                .join("\n"),
+        );
+
+        Ok(())
+    }
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub struct Keywords;
 impl TextField for Keywords {
@@ -1554,36 +1598,42 @@ impl TextField for Keywords {
     }
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub struct Links;
-impl TextField for Links {
+pub struct KeyPhrases;
+impl TextField for KeyPhrases {
     fn name(&self) -> &str {
-        "links"
+        "key_phrases"
     }
 
-    fn has_pos(&self) -> bool {
+    fn is_stored(&self) -> bool {
         true
     }
 
-    fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer {
-        FieldTokenizer::Url(UrlTokenizer)
-    }
-
-    fn add_html_tantivy(
+    fn add_webpage_tantivy(
         &self,
-        html: &Html,
-        _cache: &mut FnCache,
+        webpage: &crate::webpage::Webpage,
         doc: &mut TantivyDocument,
         schema: &tantivy::schema::Schema,
     ) -> Result<()> {
         doc.add_text(
             self.tantivy_field(schema)
                 .unwrap_or_else(|| panic!("could not find field '{}' in index", self.name())),
-            html.anchor_links()
-                .into_iter()
-                .map(|l| l.destination.as_str().to_string())
-                .join("\n"),
+            webpage.keywords.join("\n"),
         );
 
         Ok(())
     }
+
+    fn add_html_tantivy(
+        &self,
+        _: &Html,
+        _: &mut FnCache,
+        _: &mut TantivyDocument,
+        _: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer {
+        FieldTokenizer::Newline(NewlineTokenizer::default())
+    }
 }
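
Two things to note in this file: fields can now opt out of term frequencies entirely via `has_freqs`, which maps to `IndexRecordOption::Basic`, and the new stored `KeyPhrases` field writes one key phrase per line so the newline tokenizer can index each whole phrase as a single term. A minimal sketch of that round trip, with illustrative keyword values:

```rust
// Illustrative only: the value KeyPhrases::add_webpage_tantivy stores for a
// page whose extracted keywords are these two phrases. The NewlineTokenizer
// (added later in this commit) splits this back into one token per line, so
// each whole phrase becomes a single indexable term.
let keywords = vec![
    "rust web search".to_string(),
    "open source engine".to_string(),
];
let stored = keywords.join("\n");
assert_eq!(stored, "rust web search\nopen source engine");
```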

+ 47 - 2
crates/core/src/searcher/distributed.rs

@@ -31,12 +31,12 @@ use crate::{
     },
     image_store::Image,
     index::Index,
-    inverted_index::{RetrievedWebpage, WebpagePointer},
+    inverted_index::{KeyPhrase, RetrievedWebpage, WebpagePointer},
     ranking::pipeline::{PrecisionRankingWebpage, RecallRankingWebpage},
     Result,
 };
 
-use std::{collections::HashMap, sync::Arc};
+use std::{collections::HashMap, sync::Arc, time::Duration};
 
 use fnv::FnvHashMap;
 use futures::future::join_all;
@@ -92,6 +92,8 @@ pub trait SearchClient {
         max_height: Option<u64>,
         max_width: Option<u64>,
     ) -> impl Future<Output = Result<Option<Image>>> + Send;
+
+    fn top_key_phrases(&self, top_n: usize) -> impl Future<Output = Vec<KeyPhrase>> + Send;
 }
 
 #[derive(Clone, Debug)]
@@ -382,6 +384,45 @@ impl SearchClient for DistributedSearcher {
             .and_then(|(_, mut v)| v.pop())
             .and_then(|(_, v)| v)
     }
+
+    async fn top_key_phrases(&self, top_n: usize) -> Vec<KeyPhrase> {
+        let client = self.conn().await;
+
+        let res = client
+            .send_with_timeout(
+                search_server::TopKeyPhrases { top_n },
+                &AllShardsSelector,
+                &RandomReplicaSelector,
+                Duration::from_secs(60 * 60),
+            )
+            .await;
+
+        match res {
+            Ok(res) => {
+                let mut phrases = HashMap::new();
+
+                for (_, v) in res {
+                    for (_, v) in v {
+                        for phrase in v {
+                            *phrases.entry(phrase.text().to_string()).or_default() +=
+                                phrase.score();
+                        }
+                    }
+                }
+
+                phrases
+                    .into_iter()
+                    .map(|(phrase, score)| KeyPhrase::new(phrase, score))
+                    .sorted_by(|a, b| b.score().partial_cmp(&a.score()).unwrap())
+                    .take(top_n)
+                    .collect()
+            }
+            Err(e) => {
+                tracing::error!("failed to get key phrases: {:?}", e);
+                Vec::new()
+            }
+        }
+    }
 }
 
 /// This should only be used for testing and benchmarks.
@@ -457,4 +498,8 @@ impl SearchClient for LocalSearchClient {
     async fn search_entity(&self, _query: &str) -> Option<EntityMatch> {
         None
     }
+
+    async fn top_key_phrases(&self, top_n: usize) -> Vec<KeyPhrase> {
+        self.0.top_key_phrases(top_n)
+    }
 }
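
The distributed implementation above queries all shards, sums the scores of identical phrases across shard responses, and only then takes the global top `n`. The same merge logic as a standalone function for clarity, assuming the `f64` score that the `partial_cmp` sort implies:

```rust
use std::collections::HashMap;

use itertools::Itertools;

use crate::inverted_index::KeyPhrase;

// Cross-shard merge: a phrase reported by several shards has its scores
// summed before the global top_n cut, so globally common phrases win even
// if no single shard ranks them first.
fn merge_shard_phrases(shards: Vec<Vec<KeyPhrase>>, top_n: usize) -> Vec<KeyPhrase> {
    let mut scores: HashMap<String, f64> = HashMap::new();

    for shard in shards {
        for phrase in shard {
            *scores.entry(phrase.text().to_string()).or_default() += phrase.score();
        }
    }

    scores
        .into_iter()
        .map(|(text, score)| KeyPhrase::new(text, score))
        .sorted_by(|a, b| b.score().partial_cmp(&a.score()).unwrap())
        .take(top_n)
        .collect()
}
```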

+ 5 - 1
crates/core/src/searcher/local.rs

@@ -23,7 +23,7 @@ use url::Url;
 use crate::collector::approx_count;
 use crate::config::{CollectorConfig, SnippetConfig};
 use crate::index::Index;
-use crate::inverted_index::{InvertedIndex, RetrievedWebpage};
+use crate::inverted_index::{InvertedIndex, KeyPhrase, RetrievedWebpage};
 use crate::models::dual_encoder::DualEncoder;
 use crate::query::Query;
 use crate::ranking::models::lambdamart::LambdaMART;
@@ -388,6 +388,10 @@ where
     pub fn get_homepage(&self, url: &Url) -> Option<RetrievedWebpage> {
         self.index.guard().inverted_index().get_homepage(url)
     }
+
+    pub fn top_key_phrases(&self, top_n: usize) -> Vec<KeyPhrase> {
+        self.index.guard().inverted_index().top_key_phrases(top_n)
+    }
 }
 
 #[cfg(test)]
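
A quick local usage sketch. The `searcher` binding is an already-constructed local searcher (construction elided), and `1_000` mirrors the new `top_phrases_for_autosuggest` config value from this commit:

```rust
// Sketch: inspect the highest-scoring key phrases in a local index.
let phrases = searcher.top_key_phrases(1_000);
for phrase in phrases.iter().take(10) {
    println!("{}\t{}", phrase.text(), phrase.score());
}
```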

+ 6 - 1
crates/core/src/tokenizer/fields/mod.rs

@@ -18,12 +18,14 @@ use tantivy::tokenizer::BoxTokenStream;
 
 pub use self::{
     bigram::BigramTokenizer, default::DefaultTokenizer, identity::Identity, json::FlattenedJson,
-    json::JsonField, stemmed::Stemmed, trigram::TrigramTokenizer, url::UrlTokenizer,
+    json::JsonField, split_newlines::NewlineTokenizer, stemmed::Stemmed, trigram::TrigramTokenizer,
+    url::UrlTokenizer,
 };
 
 mod default;
 mod identity;
 mod json;
+mod split_newlines;
 mod stemmed;
 mod url;
 
@@ -40,6 +42,7 @@ pub enum FieldTokenizer {
     Trigram(TrigramTokenizer),
     Json(JsonField),
     Url(UrlTokenizer),
+    Newline(NewlineTokenizer),
 }
 
 impl FieldTokenizer {
@@ -52,6 +55,7 @@ impl FieldTokenizer {
             FieldTokenizer::Trigram(_) => TrigramTokenizer::as_str(),
             FieldTokenizer::Json(_) => JsonField::as_str(),
             FieldTokenizer::Url(_) => UrlTokenizer::as_str(),
+            FieldTokenizer::Newline(_) => NewlineTokenizer::as_str(),
         }
     }
 }
@@ -79,6 +83,7 @@ impl tantivy::tokenizer::Tokenizer for FieldTokenizer {
             FieldTokenizer::Bigram(tokenizer) => tokenizer.token_stream(text),
             FieldTokenizer::Trigram(tokenizer) => tokenizer.token_stream(text),
             FieldTokenizer::Url(tokenizer) => tokenizer.token_stream(text),
+            FieldTokenizer::Newline(tokenizer) => tokenizer.token_stream(text),
         }
     }
 }
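
Wiring the new variant into the enum is all the dispatch needs. For completeness, a hedged sketch of how a tokenizer like this gets registered with a tantivy index by name; Stract's actual registration happens elsewhere and is not part of this diff:

```rust
use tantivy::tokenizer::TextAnalyzer;

// Assumption: registration elsewhere looks roughly like this. `index` is an
// existing tantivy Index; the name returned by NewlineTokenizer::as_str()
// ("newline") is what schema fields reference to select this tokenizer.
let analyzer = TextAnalyzer::builder(NewlineTokenizer::default()).build();
index
    .tokenizers()
    .register(NewlineTokenizer::as_str(), analyzer);
```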

+ 145 - 0
crates/core/src/tokenizer/fields/split_newlines.rs

@@ -0,0 +1,145 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+use tantivy::tokenizer::{BoxTokenStream, TextAnalyzer};
+
+use crate::tokenizer::{self, normalizer, split_with_range::SplitWithRange, Normalize};
+
+#[derive(Clone, Default)]
+pub struct NewlineTokenizer {
+    analyzer: Option<TextAnalyzer>,
+}
+
+impl NewlineTokenizer {
+    pub fn as_str() -> &'static str {
+        "newline"
+    }
+}
+
+impl tantivy::tokenizer::Tokenizer for NewlineTokenizer {
+    type TokenStream<'a> = BoxTokenStream<'a>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        let builder = TextAnalyzer::builder(Newline);
+
+        self.analyzer = Some(builder.build());
+
+        self.analyzer.as_mut().unwrap().token_stream(text)
+    }
+}
+
+#[derive(Clone)]
+pub struct Newline;
+
+pub struct NewlineTokenStream<'a> {
+    stream: Box<dyn Iterator<Item = tokenizer::Token<'a>> + 'a>,
+    token: Option<tantivy::tokenizer::Token>,
+    next_position: usize,
+}
+
+impl tantivy::tokenizer::Tokenizer for Newline {
+    type TokenStream<'a> = BoxTokenStream<'a>;
+
+    fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> {
+        let stream = Box::new(
+            text.split_with_range(|c| c == '\n' || c == '\r')
+                .map(|(s, range)| tokenizer::Token::new(s, range))
+                .normalize(&normalizer::Lowercase)
+                .normalize(&normalizer::UnicodeNFKD)
+                .normalize(&normalizer::UnicodeDiacritics),
+        );
+
+        BoxTokenStream::new(NewlineTokenStream::new_boxed(stream))
+    }
+}
+
+impl<'a> tantivy::tokenizer::TokenStream for NewlineTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        self.token = self.stream.next().map(|token| {
+            let span = token.span();
+            let pos = self.next_position;
+            self.next_position += 1;
+            tantivy::tokenizer::Token {
+                offset_from: span.start,
+                offset_to: span.end,
+                position: pos,
+                text: token.text().to_string(),
+                ..Default::default()
+            }
+        });
+
+        self.token.is_some()
+    }
+
+    fn token(&self) -> &tantivy::tokenizer::Token {
+        self.token.as_ref().unwrap()
+    }
+
+    fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
+        self.token.as_mut().unwrap()
+    }
+}
+
+impl<'a> NewlineTokenStream<'a> {
+    fn new_boxed(
+        stream: Box<dyn Iterator<Item = tokenizer::Token<'a>> + 'a>,
+    ) -> BoxTokenStream<'a> {
+        BoxTokenStream::new(Self {
+            stream,
+            token: None,
+            next_position: 0,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tantivy::tokenizer::Tokenizer as _;
+
+    fn tokenize_newline(s: &str) -> Vec<String> {
+        let mut res = Vec::new();
+        let mut tokenizer = NewlineTokenizer::default();
+        let mut stream = tokenizer.token_stream(s);
+
+        while let Some(token) = stream.next() {
+            res.push(token.text.clone());
+        }
+
+        res
+    }
+
+    #[test]
+    fn newline_tokenizer() {
+        assert!(tokenize_newline("").is_empty());
+        assert_eq!(tokenize_newline("a\nb"), vec!["a", "b"]);
+        assert_eq!(tokenize_newline("a\nb\n"), vec!["a", "b"]);
+        assert_eq!(tokenize_newline("\na\nb\n"), vec!["a", "b"]);
+        assert_eq!(tokenize_newline("\na\nb\nc"), vec!["a", "b", "c"]);
+    }
+
+    #[test]
+    fn newline_tokenizer_without_newlines() {
+        assert!(tokenize_newline("").is_empty());
+        assert_eq!(tokenize_newline("test"), vec!["test"]);
+
+        assert_eq!(tokenize_newline("this is"), vec!["this is"]);
+        assert_eq!(tokenize_newline("this is a"), vec!["this is a",]);
+        assert_eq!(tokenize_newline("this is a test"), vec!["this is a test",]);
+
+        assert_eq!(tokenize_newline("this.is"), vec!["this.is"]);
+    }
+}
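
Beyond the tests above, a short behavior check. Assuming the `Lowercase`, `UnicodeNFKD`, and `UnicodeDiacritics` normalizers do what their names suggest, each line comes out as a single lowercased, diacritic-free token with an increasing position:

```rust
use tantivy::tokenizer::Tokenizer as _;

// Each line becomes one token; the normalizer chain lowercases and strips
// diacritics, so "Café" is indexed as "cafe".
let mut tokenizer = NewlineTokenizer::default();
let mut stream = tokenizer.token_stream("Rust Web Search\nCafé");

let mut tokens = Vec::new();
while let Some(token) = stream.next() {
    tokens.push((token.position, token.text.clone()));
}

assert_eq!(
    tokens,
    vec![(0, "rust web search".to_string()), (1, "cafe".to_string())]
);
```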

+ 4 - 2
crates/core/src/web_spell/term_freqs.rs

@@ -216,15 +216,17 @@ impl TermDict {
             return;
         }
 
+        let num_chars = term.chars().count();
+
         let punctuation_percentage =
-            term.chars().filter(|c| c.is_ascii_punctuation()).count() as f64 / term.len() as f64;
+            term.chars().filter(|c| c.is_ascii_punctuation()).count() as f64 / num_chars as f64;
 
         if punctuation_percentage > 0.5 {
             return;
         }
 
         let non_alphabetic_percentage =
-            term.chars().filter(|c| !c.is_alphabetic()).count() as f64 / term.len() as f64;
+            term.chars().filter(|c| !c.is_alphabetic()).count() as f64 / num_chars as f64;
 
         if non_alphabetic_percentage > 0.25 {
             return;
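
The denominators here previously used `term.len()`, which counts bytes, while the numerators count chars; the two only agree on ASCII. A worked example of the difference the fix makes:

```rust
// `str::len()` is a byte count, `chars().count()` a character count, so the
// old ratios were biased low for multi-byte UTF-8 terms.
let term = "東京1"; // two CJK chars (3 bytes each) plus one ASCII digit
assert_eq!(term.len(), 7);
assert_eq!(term.chars().count(), 3);

// Non-alphabetic ratio for the filter above:
//   before: 1 / 7 ≈ 0.14 -> term kept
//   after:  1 / 3 ≈ 0.33 -> term rejected (> 0.25)
```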