automatically build autosuggest from search index keywords

Mikkel Denker 1 year ago
parent
commit
cef67d6aaf

+ 2 - 1
assets/licenses.html

@@ -48,7 +48,7 @@
             <li><a href="#MIT">MIT License</a> (187)</li>
             <li><a href="#MIT">MIT License</a> (187)</li>
             <li><a href="#BSD-3-Clause">BSD 3-Clause &quot;New&quot; or &quot;Revised&quot; License</a> (9)</li>
             <li><a href="#BSD-3-Clause">BSD 3-Clause &quot;New&quot; or &quot;Revised&quot; License</a> (9)</li>
             <li><a href="#MPL-2.0">Mozilla Public License 2.0</a> (8)</li>
             <li><a href="#MPL-2.0">Mozilla Public License 2.0</a> (8)</li>
-            <li><a href="#AGPL-3.0">GNU Affero General Public License v3.0</a> (5)</li>
+            <li><a href="#AGPL-3.0">GNU Affero General Public License v3.0</a> (6)</li>
             <li><a href="#ISC">ISC License</a> (4)</li>
             <li><a href="#ISC">ISC License</a> (4)</li>
             <li><a href="#Unicode-3.0">Unicode License v3</a> (4)</li>
             <li><a href="#Unicode-3.0">Unicode License v3</a> (4)</li>
             <li><a href="#BSD-2-Clause">BSD 2-Clause &quot;Simplified&quot; License</a> (3)</li>
             <li><a href="#BSD-2-Clause">BSD 2-Clause &quot;Simplified&quot; License</a> (3)</li>
@@ -70,6 +70,7 @@
                 <ul class="license-used-by">
                     <li><a href=" https://crates.io/crates/bloom ">bloom 0.1.0</a></li>
                     <li><a href=" https://crates.io/crates/file_store ">file_store 0.1.0</a></li>
+                    <li><a href=" https://crates.io/crates/lending_iter ">lending_iter 0.1.0</a></li>
                     <li><a href=" https://crates.io/crates/ownedbytes ">ownedbytes 0.1.0</a></li>
                     <li><a href=" https://crates.io/crates/robotstxt ">robotstxt 0.1.0</a></li>
                     <li><a href=" https://crates.io/crates/speedy_kv ">speedy_kv 0.1.0</a></li>

+ 1 - 1
configs/api.toml

@@ -7,7 +7,7 @@ host = "0.0.0.0:3000"
 bangs_path = "data/bangs.json"
 # dual_encoder_model_path = "data/dual_encoder"
 prometheus_host = "0.0.0.0:3001"
-queries_csv_path = "data/queries_us.csv"
+top_phrases_for_autosuggest = 1_000

 [spell_check]
 path = "data/web_spell/checker"

+ 7 - 7
crates/core/examples/search_preindexed.rs

@@ -22,7 +22,6 @@ pub async fn main() {
     };

     let config = ApiConfig {
-        queries_csv_path: Some("data/queries_us.csv".to_string()),
         host: "0.0.0.0:8000".parse().unwrap(),
         prometheus_host: "0.0.0.0:8001".parse().unwrap(),
         crossencoder_model_path: None,
@@ -45,17 +44,18 @@ pub async fn main() {
         }),
         max_concurrent_searches: defaults::Api::max_concurrent_searches(),
         max_similar_hosts: defaults::Api::max_similar_hosts(),
+        top_phrases_for_autosuggest: defaults::Api::top_phrases_for_autosuggest(),
     };

-    let mut queries =
-        stract::autosuggest::Autosuggest::load_csv(config.queries_csv_path.as_ref().unwrap())
-            .unwrap()
-            .all()
-            .unwrap();
+    let mut searcher = LocalSearcher::new(index);

+    let mut queries: Vec<_> = searcher
+        .top_key_phrases(1_000_000)
+        .into_iter()
+        .map(|phrase| phrase.text().to_string())
+        .collect();
     queries.shuffle(&mut rand::thread_rng());

-    let mut searcher = LocalSearcher::new(index);
     searcher.set_collector_config(collector_conf);
     searcher.set_snippet_config(SnippetConfig {
         num_words_for_lang_detection: Some(250),

+ 18 - 6
crates/core/src/api/mod.rs

@@ -34,7 +34,7 @@ use crate::{
     leaky_queue::LeakyQueue,
     models::dual_encoder::DualEncoder,
     ranking::models::lambdamart::LambdaMART,
-    searcher::{api::ApiSearcher, live::LiveSearcher, DistributedSearcher},
+    searcher::{api::ApiSearcher, live::LiveSearcher, DistributedSearcher, SearchClient},
     similar_hosts::SimilarHostsFinder,
     webgraph::remote::RemoteWebgraph,
 };
@@ -148,11 +148,6 @@ fn build_router(state: Arc<State>) -> Router {
 }

 pub async fn router(config: &ApiConfig, counters: Counters) -> Result<Router> {
-    let autosuggest = match &config.queries_csv_path {
-        Some(queries_csv_path) => Autosuggest::load_csv(queries_csv_path)?,
-        None => Autosuggest::empty(),
-    };
-
     let lambda_model = match &config.lambda_model_path {
         Some(path) => Some(LambdaMART::open(path)?),
         None => None,
@@ -194,6 +189,23 @@ pub async fn router(config: &ApiConfig, counters: Counters) -> Result<Router> {
     let dist_searcher = DistributedSearcher::new(Arc::clone(&cluster)).await;
     let live_searcher = LiveSearcher::new(Arc::clone(&cluster));

+    if !cluster
+        .members()
+        .await
+        .iter()
+        .any(|m| m.service.is_searcher())
+    {
+        log::warn!("Waiting for search nodes to join the cluster");
+        cluster.await_member(|m| m.service.is_searcher()).await;
+        log::info!("Search nodes joined the cluster");
+    }
+
+    let autosuggest = Autosuggest::from_key_phrases(
+        dist_searcher
+            .top_key_phrases(config.top_phrases_for_autosuggest)
+            .await,
+    )?;
+
     let state = {
         let mut cross_encoder = None;


+ 5 - 15
crates/core/src/autosuggest.rs

@@ -21,38 +21,28 @@

 use fst::{automaton::Str, Automaton, IntoStreamer};

-use crate::Result;
-use std::path::Path;
+use crate::{inverted_index::KeyPhrase, Result};

 pub struct Autosuggest {
     queries: fst::Set<Vec<u8>>,
 }

 impl Autosuggest {
-    pub fn load_csv<P: AsRef<Path>>(path: P) -> Result<Self> {
+    pub fn from_key_phrases(key_phrases: Vec<KeyPhrase>) -> Result<Self> {
         let mut queries: Vec<String> = Vec::new();

-        let mut rdr = csv::Reader::from_path(path)?;
-        for result in rdr.records() {
-            let record = result?;
-            if let Some(query) = record.get(0) {
-                queries.push(query.to_string());
-            }
+        for key_phrase in key_phrases {
+            queries.push(key_phrase.text().to_string());
         }

         queries.sort();
+        queries.dedup();

         let queries = fst::Set::from_iter(queries)?;

         Ok(Self { queries })
     }

-    pub fn empty() -> Self {
-        Self {
-            queries: fst::Set::default(),
-        }
-    }
-
     pub fn suggestions(&self, query: &str) -> Result<Vec<String>> {
         let query = query.to_ascii_lowercase();
         let q = Str::new(query.as_str()).starts_with();

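The autosuggest dictionary is now derived from the search index itself instead of a scraped CSV. A minimal usage sketch (hypothetical variable names; it assumes a searcher exposing the new top_key_phrases method is already in scope):

    // Build the fst-backed suggester once at startup, then serve prefix queries.
    let phrases = dist_searcher.top_key_phrases(1_000).await;
    let autosuggest = Autosuggest::from_key_phrases(phrases)?;
    // Returns stored phrases that start with the lowercased prefix.
    let completions = autosuggest.suggestions("rust se")?;
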
+ 4 - 0
crates/core/src/config/defaults.rs

@@ -56,6 +56,10 @@ impl Api {
     pub fn max_similar_hosts() -> usize {
         1_000
     }
+
+    pub fn top_phrases_for_autosuggest() -> usize {
+        1_000_000
+    }
 }

 pub struct Snippet;

+ 3 - 1
crates/core/src/config/mod.rs

@@ -224,7 +224,6 @@ pub struct ApiSpellCheck {

 #[derive(Debug, serde::Serialize, serde::Deserialize, Clone)]
 pub struct ApiConfig {
-    pub queries_csv_path: Option<String>,
     pub host: SocketAddr,
     pub prometheus_host: SocketAddr,
     pub crossencoder_model_path: Option<String>,
@@ -239,6 +238,9 @@ pub struct ApiConfig {
     #[serde(default = "defaults::Api::max_similar_hosts")]
     #[serde(default = "defaults::Api::max_similar_hosts")]
     pub max_similar_hosts: usize,
     pub max_similar_hosts: usize,
 
 
+    #[serde(default = "defaults::Api::top_phrases_for_autosuggest")]
+    pub top_phrases_for_autosuggest: usize,
+
     pub spell_check: Option<ApiSpellCheck>,
     pub spell_check: Option<ApiSpellCheck>,
 
 
     #[serde(default)]
     #[serde(default)]

+ 16 - 0
crates/core/src/distributed/cluster.rs

@@ -184,4 +184,20 @@

         res
     }
+
+    pub async fn await_member<P>(&self, pred: P) -> Member
+    where
+        P: Fn(&Member) -> bool,
+    {
+        loop {
+            let members = self.members().await;
+            for member in members {
+                if pred(&member) {
+                    return member;
+                }
+            }
+
+            tokio::time::sleep(Duration::from_secs(1)).await;
+        }
+    }
 }

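await_member polls the member list once per second until the predicate matches, which is what lets the API entrypoint above block until a search node is available. A usage sketch (assuming a Cluster handle named cluster is in scope):

    // Blocks until at least one searcher has joined, then returns that member.
    let member = cluster.await_member(|m| m.service.is_searcher()).await;
    log::info!("searcher joined: {}", member.id);
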
+ 6 - 0
crates/core/src/distributed/member.rs

@@ -107,6 +107,12 @@ pub enum Service {
     },
 }

+impl Service {
+    pub fn is_searcher(&self) -> bool {
+        matches!(self, Self::Searcher { .. })
+    }
+}
+
 #[derive(PartialEq, Eq, Hash, Clone, Debug)]
 pub struct Member {
     pub id: String,

+ 3 - 3
crates/core/src/distributed/retry_strategy.rs

@@ -8,7 +8,7 @@ use std::time::Duration;
 /// A retry strategy driven by exponential back-off.
 ///
 /// The power corresponds to the number of past attempts.
-#[derive(Debug, Clone)]
+#[derive(Debug, Clone, Copy)]
 pub struct ExponentialBackoff {
     current: u64,
     base: u64,
@@ -21,7 +21,7 @@ impl ExponentialBackoff {
     ///
     /// The resulting duration is calculated by taking the base to the `n`-th power,
     /// where `n` denotes the number of past attempts.
-    pub fn from_millis(base: u64) -> ExponentialBackoff {
+    pub const fn from_millis(base: u64) -> ExponentialBackoff {
         ExponentialBackoff {
             current: base,
             base,
@@ -29,7 +29,7 @@ impl ExponentialBackoff {
         }
     }

-    pub fn with_limit(mut self, limit: Duration) -> Self {
+    pub const fn with_limit(mut self, limit: Duration) -> Self {
         self.max_delay = Some(limit);
         self
     }

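Deriving Copy and making the constructors const fn is what allows the shared retry policy in sonic/replication.rs below to become a compile-time constant instead of being rebuilt on every request:

    const DEFAULT_RETRY: ExponentialBackoff =
        ExponentialBackoff::from_millis(500).with_limit(Duration::from_secs(3));
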
+ 48 - 19
crates/core/src/distributed/sonic/replication.rs

@@ -21,6 +21,10 @@ use super::Result;
 use crate::distributed::{cluster::Cluster, retry_strategy::ExponentialBackoff, sonic};
 use std::{net::SocketAddr, ops::DerefMut, sync::Arc, time::Duration};

+const DEFAULT_TIMEOUT: Duration = Duration::from_secs(60);
+const DEFAULT_RETRY: ExponentialBackoff =
+    ExponentialBackoff::from_millis(500).with_limit(Duration::from_secs(3));
+
 #[derive(Debug)]
 pub struct RemoteClient<S>
 where
@@ -73,24 +77,16 @@ where
     }

     pub async fn send<R: sonic::service::Wrapper<S> + Clone>(&self, req: R) -> Result<R::Response> {
-        self.send_with_timeout_retry(
-            req,
-            Duration::from_secs(60),
-            ExponentialBackoff::from_millis(500).with_limit(Duration::from_secs(3)),
-        )
-        .await
+        self.send_with_timeout_retry(req, DEFAULT_TIMEOUT, DEFAULT_RETRY)
+            .await
     }

     pub async fn batch_send<R: sonic::service::Wrapper<S> + Clone>(
         &self,
         reqs: &[R],
     ) -> Result<Vec<R::Response>> {
-        self.batch_send_with_timeout_retry(
-            reqs,
-            Duration::from_secs(60),
-            ExponentialBackoff::from_millis(500).with_limit(Duration::from_secs(3)),
-        )
-        .await
+        self.batch_send_with_timeout_retry(reqs, DEFAULT_TIMEOUT, DEFAULT_RETRY)
+            .await
     }

     pub async fn send_with_timeout<R: sonic::service::Wrapper<S> + Clone>(
@@ -201,17 +197,18 @@ where
         &self,
         req: Req,
         client: &RemoteClient<S>,
+        timeout: Duration,
     ) -> Result<(SocketAddr, Req::Response)>
     where
         Req: sonic::service::Wrapper<S> + Clone,
     {
-        Ok((client.addr(), client.send(req).await?))
+        Ok((client.addr(), client.send_with_timeout(req, timeout).await?))
     }
-
-    pub async fn send<Req, Sel>(
+    pub async fn send_with_timeout<Req, Sel>(
         &self,
         req: Req,
         selector: &Sel,
+        timeout: Duration,
     ) -> Result<Vec<(SocketAddr, Req::Response)>>
     where
         Req: sonic::service::Wrapper<S> + Clone,
@@ -219,7 +216,7 @@ where
     {
         let mut futures = Vec::new();
         for client in selector.select(&self.clients) {
-            futures.push(self.send_single(req.clone(), client));
+            futures.push(self.send_single(req.clone(), client, timeout));
         }

         let mut results = Vec::new();
@@ -235,6 +232,18 @@ where
         Ok(results)
     }

+    pub async fn send<Req, Sel>(
+        &self,
+        req: Req,
+        selector: &Sel,
+    ) -> Result<Vec<(SocketAddr, Req::Response)>>
+    where
+        Req: sonic::service::Wrapper<S> + Clone,
+        Sel: ReplicaSelector<S>,
+    {
+        self.send_with_timeout(req, selector, DEFAULT_TIMEOUT).await
+    }
+
     async fn batch_send_single<Req>(
         &self,
         reqs: &[Req],
@@ -356,6 +365,7 @@ where
         req: Req,
         shard: &Shard<S, Id>,
         replica_selector: &Sel,
+        timeout: Duration,
     ) -> Result<(Id, Vec<(SocketAddr, Req::Response)>)>
     where
         Req: sonic::service::Wrapper<S> + Clone,
@@ -363,15 +373,19 @@ where
     {
         Ok((
             shard.id.clone(),
-            shard.replicas.send(req, replica_selector).await?,
+            shard
+                .replicas
+                .send_with_timeout(req, replica_selector, timeout)
+                .await?,
         ))
     }

-    pub async fn send<Req, SSel, RSel>(
+    pub async fn send_with_timeout<Req, SSel, RSel>(
         &self,
         req: Req,
         shard_selector: &SSel,
         replica_selector: &RSel,
+        timeout: Duration,
     ) -> Result<Vec<(Id, Vec<(SocketAddr, Req::Response)>)>>
     where
         Req: sonic::service::Wrapper<S> + Clone,
@@ -380,7 +394,7 @@ where
     {
         let mut futures = Vec::new();
         for shard in shard_selector.select(&self.shards) {
-            futures.push(self.send_single(req.clone(), shard, replica_selector));
+            futures.push(self.send_single(req.clone(), shard, replica_selector, timeout));
         }

         let mut results = Vec::new();
@@ -396,6 +410,21 @@ where
         Ok(results)
     }

+    pub async fn send<Req, SSel, RSel>(
+        &self,
+        req: Req,
+        shard_selector: &SSel,
+        replica_selector: &RSel,
+    ) -> Result<Vec<(Id, Vec<(SocketAddr, Req::Response)>)>>
+    where
+        Req: sonic::service::Wrapper<S> + Clone,
+        SSel: ShardSelector<S, Id>,
+        RSel: ReplicaSelector<S>,
+    {
+        self.send_with_timeout(req, shard_selector, replica_selector, DEFAULT_TIMEOUT)
+            .await
+    }
+
     async fn batch_send_single<Req, Sel>(
         &self,
         reqs: &[Req],

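send and batch_send keep their old signatures and delegate to the new *_with_timeout variants with the 60-second default, so existing call sites compile unchanged while new callers can choose a tighter deadline. A hedged caller-side sketch (the request and selector values are assumed to be in scope):

    // Explicit 5 s deadline for one sharded fan-out; plain send() uses DEFAULT_TIMEOUT.
    let results = shards
        .send_with_timeout(req.clone(), &shard_selector, &replica_selector, Duration::from_secs(5))
        .await?;
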
+ 0 - 188
crates/core/src/entrypoint/autosuggest_scrape.rs

@@ -1,188 +0,0 @@
-// Stract is an open source web search engine.
-// Copyright (C) 2023 Stract ApS
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-use std::fmt::Display;
-use std::{
-    collections::{HashSet, VecDeque},
-    path::Path,
-    str::FromStr,
-};
-
-use csv::Writer;
-use indicatif::{ProgressBar, ProgressStyle};
-
-use crate::Result;
-
-fn suggestions(query: &str, gl: &str) -> Result<Vec<String>> {
-    let url = format!(
-        "https://www.google.com/complete/search?q={}&gl={}&client=gws-wiz&xssi=t",
-        crate::urlencode(query),
-        gl
-    );
-
-    let client = reqwest::blocking::Client::new();
-    let builder = client.get(url);
-
-    let input = builder.send()?.text()?;
-
-    let mut input = input.split('\n');
-    input.next();
-    let input = input.next().expect("None option");
-
-    let output: serde_json::Value = serde_json::from_str(input)?;
-    let mut suggestions = Vec::new();
-
-    if let serde_json::Value::Array(arr) = output {
-        if let serde_json::Value::Array(arr) = arr[0].clone() {
-            for result in arr {
-                if let serde_json::Value::Array(result) = result {
-                    if let serde_json::Value::String(result) = result[0].clone() {
-                        let result = result.replace("<b>", "").replace("</b>", "");
-                        suggestions.push(result);
-                    }
-                }
-            }
-        }
-    }
-
-    Ok(suggestions)
-}
-
-#[derive(Clone)]
-pub enum Gl {
-    Us,
-}
-
-impl Display for Gl {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let code = match self {
-            Gl::Us => "us",
-        };
-        write!(f, "{code}")
-    }
-}
-
-impl FromStr for Gl {
-    type Err = crate::Error;
-
-    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
-        match s.to_ascii_lowercase().as_str() {
-            "us" => Ok(Gl::Us),
-            _ => Err(crate::Error::UnknownCLIOption),
-        }
-    }
-}
-
-fn save_queries<P: AsRef<Path>>(queries: &HashSet<String>, path: P) -> Result<()> {
-    let mut wtr = Writer::from_path(&path)?;
-
-    let mut queries: Vec<_> = queries.iter().collect();
-    queries.sort();
-
-    for query in queries {
-        wtr.write_record([query])?;
-    }
-
-    wtr.flush()?;
-
-    Ok(())
-}
-
-pub fn run<P: AsRef<Path>>(
-    queries_to_scrape: usize,
-    gl: Gl,
-    ms_sleep_between_req: u64,
-    output_dir: P,
-) -> Result<()> {
-    let mut queries = HashSet::new();
-    let mut queue = VecDeque::new();
-
-    let pb = ProgressBar::new(queries_to_scrape as u64);
-    pb.set_style(
-        ProgressStyle::default_bar()
-            .template("{spinner:.green} [{elapsed_precise}] [{wide_bar}] {pos:>7}/{len:7} ({eta})")
-            .unwrap()
-            .progress_chars("#>-"),
-    );
-
-    for c in 'a'..='z' {
-        queue.push_back(c.to_string());
-    }
-
-    let path = output_dir
-        .as_ref()
-        .join(format!("queries_{:}.csv", gl.to_string().as_str()));
-
-    let mut queries_since_last_save = 0;
-
-    while let Some(query) = queue.pop_front() {
-        let res = suggestions(&query, gl.to_string().as_str());
-
-        if res.is_err() {
-            continue;
-        }
-
-        let res = res.unwrap();
-        let mut new_queries = 0;
-
-        for next_query in res {
-            if queries.contains(&next_query) {
-                continue;
-            }
-
-            let mut new = Vec::new();
-            for c in next_query.chars() {
-                let q = new.clone().into_iter().collect();
-                new.push(c);
-
-                if !queries.contains(&q) {
-                    queue.push_back(q);
-                }
-            }
-
-            for q in next_query.split_whitespace() {
-                if !queries.contains(q) {
-                    queries.insert(q.to_string());
-                    queue.push_back(q.to_string());
-                }
-            }
-
-            new_queries += 1;
-            queries.insert(next_query);
-            pb.tick();
-            pb.set_position(queries.len() as u64);
-        }
-
-        if queries.len() >= queries_to_scrape {
-            break;
-        }
-
-        queries_since_last_save += new_queries;
-
-        if queries_since_last_save > 1_000 {
-            save_queries(&queries, &path)?;
-            queries_since_last_save = 0;
-        }
-
-        std::thread::sleep(std::time::Duration::from_millis(ms_sleep_between_req));
-    }
-
-    pb.finish();
-
-    save_queries(&queries, &path)?;
-
-    Ok(())
-}

+ 0 - 1
crates/core/src/entrypoint/configure.rs

@@ -41,7 +41,6 @@ fn download_files() {
         .unwrap()
         .block_on(async {
             for name in [
-                "queries_us.csv",
                 "sample.warc.gz",
                 "bangs.json",
                 "english-wordnet-2022-subset.ttl",

+ 0 - 1
crates/core/src/entrypoint/mod.rs

@@ -17,7 +17,6 @@
 //! The entrypoint module contains all entrypoints that runs the executables.
 pub mod ampc;
 pub mod api;
-pub mod autosuggest_scrape;
 pub mod canonical;
 mod centrality;
 #[cfg(feature = "dev")]

+ 18 - 7
crates/core/src/entrypoint/search_server.rs

@@ -21,14 +21,13 @@ use url::Url;

 use crate::{
     config,
-    distributed::sonic::service::sonic_service,
     distributed::{
         cluster::Cluster,
         member::{Member, Service},
-        sonic,
+        sonic::{self, service::sonic_service},
     },
     index::Index,
-    inverted_index::{self, RetrievedWebpage},
+    inverted_index::{self, KeyPhrase, RetrievedWebpage},
     models::dual_encoder::DualEncoder,
     ranking::models::{lambdamart::LambdaMART, linear::LinearRegression},
     searcher::{InitialWebsiteResult, LocalSearcher, SearchQuery},
@@ -42,6 +41,7 @@ sonic_service!(
         Search,
         GetWebpage,
         GetHomepageDescriptions,
+        TopKeyPhrases,
     ]
 );

@@ -93,7 +93,7 @@ impl SearchService {
     }
 }

-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode)]
+#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
 pub struct RetrieveWebsites {
     pub websites: Vec<inverted_index::WebpagePointer>,
     pub query: String,
@@ -108,7 +108,7 @@ impl sonic::service::Message<SearchService> for RetrieveWebsites {
     }
 }

-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode)]
+#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
 pub struct Search {
     pub query: SearchQuery,
 }
@@ -119,7 +119,7 @@ impl sonic::service::Message<SearchService> for Search {
     }
 }

-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode)]
+#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
 pub struct GetWebpage {
     pub url: String,
 }
@@ -130,7 +130,7 @@ impl sonic::service::Message<SearchService> for GetWebpage {
     }
 }

-#[derive(Debug, Clone, serde::Serialize, serde::Deserialize, bincode::Encode, bincode::Decode)]
+#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
 pub struct GetHomepageDescriptions {
     #[bincode(with_serde)]
     pub urls: Vec<Url>,
@@ -152,6 +152,17 @@ impl sonic::service::Message<SearchService> for GetHomepageDescriptions {
     }
 }

+#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
+pub struct TopKeyPhrases {
+    pub top_n: usize,
+}
+impl sonic::service::Message<SearchService> for TopKeyPhrases {
+    type Response = Vec<KeyPhrase>;
+    async fn handle(self, server: &SearchService) -> Self::Response {
+        server.local_searcher.top_key_phrases(self.top_n)
+    }
+}
+
 pub async fn run(config: config::SearchServerConfig) -> Result<()> {
     let addr = config.host;
     let server = SearchService::new(config).await?.bind(addr).await.unwrap();

+ 204 - 0
crates/core/src/inverted_index/key_phrase.rs

@@ -0,0 +1,204 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+use itertools::Itertools;
+use std::str;
+use std::{cmp::Reverse, collections::HashMap};
+
+use crate::{
+    schema::text_field::{self, TextField},
+    SortableFloat,
+};
+
+const DONT_SCORE_TOP_PERCENT_OF_WORDS: f64 = 0.002;
+const NON_ALPHABETIC_CHAR_THRESHOLD: f64 = 0.25;
+
+struct Scorer {
+    word_freq: HashMap<String, u64>,
+    word_docs: HashMap<String, u64>,
+    total_doc_freq: f64,
+    num_docs: u64,
+    word_freq_threshold: u64,
+}
+
+impl Scorer {
+    fn new(searcher: &tantivy::Searcher, field: tantivy::schema::Field) -> Self {
+        let mut word_freq = HashMap::new();
+        let mut word_docs = HashMap::new();
+        let mut total_doc_freq = 0.0;
+        let mut num_docs = 0;
+
+        for seg_reader in searcher.segment_readers() {
+            let inv_index = seg_reader.inverted_index(field).unwrap();
+            let mut stream = inv_index.terms().stream().unwrap();
+
+            while let Some((term, _)) = stream.next() {
+                let term_str = str::from_utf8(term).unwrap().to_string();
+
+                let words = term_str.split_whitespace().collect::<Vec<_>>();
+                if words.is_empty() {
+                    continue;
+                }
+
+                for word in &words {
+                    *word_freq.entry(word.to_string()).or_insert(0) += 1;
+                }
+
+                for word in words.into_iter().unique() {
+                    *word_docs.entry(word.to_string()).or_insert(0) += 1;
+                }
+            }
+
+            total_doc_freq += seg_reader.num_docs() as f64;
+            num_docs += inv_index.terms().num_terms() as u64;
+        }
+
+        let num_words = word_freq.len() as f64;
+        let word_freq_threshold = *word_freq
+            .values()
+            .sorted_by(|a, b| b.cmp(a))
+            .nth((num_words * DONT_SCORE_TOP_PERCENT_OF_WORDS).ceil() as usize)
+            .unwrap_or(&0);
+
+        Self {
+            word_freq,
+            word_docs,
+            total_doc_freq,
+            num_docs,
+            word_freq_threshold,
+        }
+    }
+
+    #[inline]
+    fn word_freq(&self) -> &HashMap<String, u64> {
+        &self.word_freq
+    }
+
+    #[inline]
+    fn word_docs(&self) -> &HashMap<String, u64> {
+        &self.word_docs
+    }
+
+    #[inline]
+    fn total_doc_freq(&self) -> f64 {
+        self.total_doc_freq
+    }
+
+    #[inline]
+    fn num_docs(&self) -> u64 {
+        self.num_docs
+    }
+
+    #[inline]
+    fn word_freq_threshold(&self) -> u64 {
+        self.word_freq_threshold
+    }
+
+    fn score(&self, words: &[&str], doc_freq: u32) -> f64 {
+        let word_freq_threshold = self.word_freq_threshold();
+        let mut score = 0.0;
+        let num_words = words.len();
+        for word in words.iter().unique() {
+            let word_chars = word.chars().count();
+            if word.chars().filter(|c| !c.is_alphabetic()).count() as f64 / word_chars as f64
+                > NON_ALPHABETIC_CHAR_THRESHOLD
+            {
+                continue;
+            }
+
+            let word_docs = self.word_docs().get(*word).unwrap_or(&0);
+            let wf = *self.word_freq().get(*word).unwrap_or(&0);
+
+            if wf > word_freq_threshold {
+                continue;
+            }
+
+            let tf = (wf as f64) / num_words as f64;
+            let idf = ((self.num_docs() as f64) / (*word_docs as f64) + 1.0).ln();
+
+            score += tf * idf;
+        }
+
+        let cf = doc_freq as f64 / (self.total_doc_freq() + 1.0);
+
+        score * (1.0 - cf)
+    }
+}
+
+#[derive(Debug, Clone, bincode::Encode, bincode::Decode)]
+pub struct KeyPhrase {
+    phrase: String,
+    score: f64,
+}
+
+impl KeyPhrase {
+    pub fn new(phrase: String, score: f64) -> Self {
+        Self { phrase, score }
+    }
+
+    pub fn compute_top(reader: &tantivy::IndexReader, top_n: usize) -> Vec<Self> {
+        let searcher = reader.searcher();
+        let field = searcher
+            .schema()
+            .get_field(text_field::KeyPhrases.name())
+            .unwrap();
+
+        let scorer = Scorer::new(&searcher, field);
+
+        let mut keywords: HashMap<String, f64> = HashMap::new();
+
+        for seg_reader in searcher.segment_readers() {
+            let inv_index = seg_reader.inverted_index(field).unwrap();
+            let mut stream = inv_index.terms().stream().unwrap();
+            while let Some((term, info)) = stream.next() {
+                let term_str = str::from_utf8(term).unwrap().to_string();
+                let num_chars = term_str.chars().count();
+
+                if term_str.chars().filter(|c| !c.is_alphabetic()).count() as f64 / num_chars as f64
+                    > NON_ALPHABETIC_CHAR_THRESHOLD
+                {
+                    continue;
+                }
+
+                let words = term_str.split_whitespace().collect::<Vec<_>>();
+                let score = scorer.score(&words, info.doc_freq);
+
+                if score.is_normal() {
+                    let term_str = words.join(" ");
+                    *keywords.entry(term_str).or_default() += score;
+                }
+            }
+        }
+
+        crate::sorted_k(
+            keywords
+                .into_iter()
+                .map(|(phrase, score)| (Reverse(SortableFloat::from(score)), phrase)),
+            top_n,
+        )
+        .into_iter()
+        .map(|(Reverse(score), phrase)| KeyPhrase::new(phrase, score.into()))
+        .collect()
+    }
+
+    pub fn score(&self) -> f64 {
+        self.score
+    }
+
+    pub fn text(&self) -> &str {
+        &self.phrase
+    }
+}

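Scorer::score is a TF-IDF-style heuristic over the words of a candidate phrase: words that are mostly non-alphabetic or that fall in the most frequent 0.2% of words are skipped, and the summed score is damped by how common the phrase itself is. Schematically (names follow the struct fields; note that num_docs is accumulated from the number of terms in the key_phrases field):

    score(phrase) = (1 - cf) * sum over unique words w in phrase of tf(w) * idf(w)
        tf(w)  = word_freq(w) / len(phrase)
        idf(w) = ln(num_docs / word_docs(w) + 1)
        cf     = doc_freq(phrase) / (total_doc_freq + 1)
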
+ 12 - 123
crates/core/src/inverted_index/mod.rs

@@ -27,17 +27,19 @@
 //! but the principle is the same.

 mod indexing;
+mod key_phrase;
+mod retrieved_webpage;
 mod search;

 pub use indexing::merge_tantivy_segments;
-
-use chrono::{DateTime, NaiveDateTime};
+pub use key_phrase::KeyPhrase;
+pub use retrieved_webpage::RetrievedWebpage;

 use tantivy::directory::MmapDirectory;

-use tantivy::schema::{Schema, Value};
+use tantivy::schema::Schema;
 use tantivy::tokenizer::TokenizerManager;
-use tantivy::{IndexReader, IndexWriter, TantivyDocument};
+use tantivy::{IndexReader, IndexWriter};

 use crate::collector::{approx_count, Hashes};
 use crate::config::SnippetConfig;
@@ -45,15 +47,10 @@ use crate::numericalfield_reader::NumericalFieldReader;

 use crate::ranking::initial::Score;

-use crate::schema::text_field::TextField;
-use crate::schema::{numerical_field, text_field, Field, NumericalFieldEnum, TextFieldEnum};
-use crate::snippet::TextSnippet;
+use crate::schema::{numerical_field, Field, NumericalFieldEnum};
 use crate::tokenizer::fields::{
-    BigramTokenizer, Identity, JsonField, Stemmed, TrigramTokenizer, UrlTokenizer,
+    BigramTokenizer, Identity, JsonField, NewlineTokenizer, Stemmed, TrigramTokenizer, UrlTokenizer,
 };
-use crate::webpage::region::Region;
-
-use crate::webpage::schema_org;
 use crate::Result;
 use crate::{schema::create_schema, tokenizer::FieldTokenizer};
 use std::fs;
@@ -129,6 +126,9 @@ fn register_tokenizers(manager: &TokenizerManager) {

     let tokenizer = FieldTokenizer::Json(JsonField);
     manager.register(tokenizer.as_str(), tokenizer);
+
+    let tokenizer = FieldTokenizer::Newline(NewlineTokenizer::default());
+    manager.register(tokenizer.as_str(), tokenizer);
 }

 pub struct InvertedIndex {
@@ -169,7 +169,6 @@ impl InvertedIndex {
         register_tokenizers(tantivy_index.tokenizers());

         let reader: IndexReader = tantivy_index.reader_builder().try_into()?;
-
         let columnfield_reader = NumericalFieldReader::new(&reader.searcher());

         Ok(InvertedIndex {
@@ -220,116 +219,6 @@ pub struct SearchResult {
     pub documents: Vec<RetrievedWebpage>,
 }

-#[derive(
-    Default,
-    Debug,
-    Clone,
-    serde::Serialize,
-    serde::Deserialize,
-    bincode::Encode,
-    bincode::Decode,
-    PartialEq,
-)]
-pub struct RetrievedWebpage {
-    pub title: String,
-    pub url: String,
-    pub body: String,
-    pub snippet: TextSnippet,
-    pub dirty_body: String,
-    pub description: Option<String>,
-    pub dmoz_description: Option<String>,
-    #[bincode(with_serde)]
-    pub updated_time: Option<NaiveDateTime>,
-    pub schema_org: Vec<schema_org::Item>,
-    pub region: Region,
-    pub likely_has_ads: bool,
-    pub likely_has_paywall: bool,
-    pub recipe_first_ingredient_tag_id: Option<String>,
-    pub keywords: Vec<String>,
-}
-impl RetrievedWebpage {
-    pub fn description(&self) -> Option<&String> {
-        self.description.as_ref().or(self.dmoz_description.as_ref())
-    }
-}
-
-fn str_value(name: &str, value: &tantivy::schema::document::CompactDocValue) -> String {
-    value
-        .as_str()
-        .unwrap_or_else(|| panic!("{} field should be text", name))
-        .to_string()
-}
-
-impl From<TantivyDocument> for RetrievedWebpage {
-    fn from(doc: TantivyDocument) -> Self {
-        let mut webpage = RetrievedWebpage::default();
-
-        for (field, value) in doc.field_values() {
-            match Field::get(field.field_id() as usize) {
-                Some(Field::Text(TextFieldEnum::Title(_))) => {
-                    webpage.title = str_value(text_field::Title.name(), &value);
-                }
-                Some(Field::Text(TextFieldEnum::StemmedCleanBody(_))) => {
-                    webpage.body = str_value(text_field::StemmedCleanBody.name(), &value);
-                }
-                Some(Field::Text(TextFieldEnum::Description(_))) => {
-                    let desc = str_value(text_field::Description.name(), &value);
-                    webpage.description = if desc.is_empty() { None } else { Some(desc) }
-                }
-                Some(Field::Text(TextFieldEnum::Url(_))) => {
-                    webpage.url = str_value(text_field::Url.name(), &value);
-                }
-                Some(Field::Numerical(NumericalFieldEnum::LastUpdated(_))) => {
-                    webpage.updated_time = {
-                        let timestamp = value.as_u64().unwrap() as i64;
-                        if timestamp == 0 {
-                            None
-                        } else {
-                            DateTime::from_timestamp(timestamp, 0).map(|dt| dt.naive_utc())
-                        }
-                    }
-                }
-                Some(Field::Text(TextFieldEnum::AllBody(_))) => {
-                    webpage.dirty_body = str_value(text_field::AllBody.name(), &value);
-                }
-                Some(Field::Numerical(NumericalFieldEnum::Region(_))) => {
-                    webpage.region = {
-                        let id = value.as_u64().unwrap();
-                        Region::from_id(id)
-                    }
-                }
-                Some(Field::Text(TextFieldEnum::DmozDescription(_))) => {
-                    let desc = str_value(text_field::DmozDescription.name(), &value);
-                    webpage.dmoz_description = if desc.is_empty() { None } else { Some(desc) }
-                }
-                Some(Field::Text(TextFieldEnum::SchemaOrgJson(_))) => {
-                    let json = str_value(text_field::SchemaOrgJson.name(), &value);
-                    webpage.schema_org = serde_json::from_str(&json).unwrap_or_default();
-                }
-                Some(Field::Numerical(NumericalFieldEnum::LikelyHasAds(_))) => {
-                    webpage.likely_has_ads = value.as_bool().unwrap_or_default();
-                }
-                Some(Field::Numerical(NumericalFieldEnum::LikelyHasPaywall(_))) => {
-                    webpage.likely_has_paywall = value.as_bool().unwrap_or_default();
-                }
-                Some(Field::Text(TextFieldEnum::RecipeFirstIngredientTagId(_))) => {
-                    let tag_id = str_value(text_field::RecipeFirstIngredientTagId.name(), &value);
-                    if !tag_id.is_empty() {
-                        webpage.recipe_first_ingredient_tag_id = Some(tag_id);
-                    }
-                }
-                Some(Field::Text(TextFieldEnum::Keywords(_))) => {
-                    let keywords = str_value(text_field::Keywords.name(), &value);
-                    webpage.keywords = keywords.split('\n').map(|s| s.to_string()).collect();
-                }
-                _ => {}
-            }
-        }
-
-        webpage
-    }
-}
-
 #[cfg(test)]
 mod tests {
     use candle_core::Tensor;
     use candle_core::Tensor;
@@ -343,7 +232,7 @@ mod tests {
         ranking::{Ranker, SignalComputer},
         search_ctx::Ctx,
         searcher::SearchQuery,
-        webpage::{Html, Webpage},
+        webpage::{schema_org, Html, Webpage},
         OneOrMany,
     };

+ 137 - 0
crates/core/src/inverted_index/retrieved_webpage.rs

@@ -0,0 +1,137 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+use chrono::{DateTime, NaiveDateTime};
+use tantivy::{schema::Value, TantivyDocument};
+
+use crate::{
+    schema::{
+        text_field::{self, TextField},
+        Field, NumericalFieldEnum, TextFieldEnum,
+    },
+    snippet::TextSnippet,
+    webpage::{schema_org, Region},
+};
+
+#[derive(
+    Default,
+    Debug,
+    Clone,
+    serde::Serialize,
+    serde::Deserialize,
+    bincode::Encode,
+    bincode::Decode,
+    PartialEq,
+)]
+pub struct RetrievedWebpage {
+    pub title: String,
+    pub url: String,
+    pub body: String,
+    pub snippet: TextSnippet,
+    pub dirty_body: String,
+    pub description: Option<String>,
+    pub dmoz_description: Option<String>,
+    #[bincode(with_serde)]
+    pub updated_time: Option<NaiveDateTime>,
+    pub schema_org: Vec<schema_org::Item>,
+    pub region: Region,
+    pub likely_has_ads: bool,
+    pub likely_has_paywall: bool,
+    pub recipe_first_ingredient_tag_id: Option<String>,
+    pub keywords: Vec<String>,
+}
+impl RetrievedWebpage {
+    pub fn description(&self) -> Option<&String> {
+        self.description.as_ref().or(self.dmoz_description.as_ref())
+    }
+}
+
+fn str_value(name: &str, value: &tantivy::schema::document::CompactDocValue) -> String {
+    value
+        .as_str()
+        .unwrap_or_else(|| panic!("{} field should be text", name))
+        .to_string()
+}
+
+impl From<TantivyDocument> for RetrievedWebpage {
+    fn from(doc: TantivyDocument) -> Self {
+        let mut webpage = RetrievedWebpage::default();
+
+        for (field, value) in doc.field_values() {
+            match Field::get(field.field_id() as usize) {
+                Some(Field::Text(TextFieldEnum::Title(_))) => {
+                    webpage.title = str_value(text_field::Title.name(), &value);
+                }
+                Some(Field::Text(TextFieldEnum::StemmedCleanBody(_))) => {
+                    webpage.body = str_value(text_field::StemmedCleanBody.name(), &value);
+                }
+                Some(Field::Text(TextFieldEnum::Description(_))) => {
+                    let desc = str_value(text_field::Description.name(), &value);
+                    webpage.description = if desc.is_empty() { None } else { Some(desc) }
+                }
+                Some(Field::Text(TextFieldEnum::Url(_))) => {
+                    webpage.url = str_value(text_field::Url.name(), &value);
+                }
+                Some(Field::Numerical(NumericalFieldEnum::LastUpdated(_))) => {
+                    webpage.updated_time = {
+                        let timestamp = value.as_u64().unwrap() as i64;
+                        if timestamp == 0 {
+                            None
+                        } else {
+                            DateTime::from_timestamp(timestamp, 0).map(|dt| dt.naive_utc())
+                        }
+                    }
+                }
+                Some(Field::Text(TextFieldEnum::AllBody(_))) => {
+                    webpage.dirty_body = str_value(text_field::AllBody.name(), &value);
+                }
+                Some(Field::Numerical(NumericalFieldEnum::Region(_))) => {
+                    webpage.region = {
+                        let id = value.as_u64().unwrap();
+                        Region::from_id(id)
+                    }
+                }
+                Some(Field::Text(TextFieldEnum::DmozDescription(_))) => {
+                    let desc = str_value(text_field::DmozDescription.name(), &value);
+                    webpage.dmoz_description = if desc.is_empty() { None } else { Some(desc) }
+                }
+                Some(Field::Text(TextFieldEnum::SchemaOrgJson(_))) => {
+                    let json = str_value(text_field::SchemaOrgJson.name(), &value);
+                    webpage.schema_org = serde_json::from_str(&json).unwrap_or_default();
+                }
+                Some(Field::Numerical(NumericalFieldEnum::LikelyHasAds(_))) => {
+                    webpage.likely_has_ads = value.as_bool().unwrap_or_default();
+                }
+                Some(Field::Numerical(NumericalFieldEnum::LikelyHasPaywall(_))) => {
+                    webpage.likely_has_paywall = value.as_bool().unwrap_or_default();
+                }
+                Some(Field::Text(TextFieldEnum::RecipeFirstIngredientTagId(_))) => {
+                    let tag_id = str_value(text_field::RecipeFirstIngredientTagId.name(), &value);
+                    if !tag_id.is_empty() {
+                        webpage.recipe_first_ingredient_tag_id = Some(tag_id);
+                    }
+                }
+                Some(Field::Text(TextFieldEnum::Keywords(_))) => {
+                    let keywords = str_value(text_field::Keywords.name(), &value);
+                    webpage.keywords = keywords.split('\n').map(|s| s.to_string()).collect();
+                }
+                _ => {}
+            }
+        }
+
+        webpage
+    }
+}

+ 5 - 0
crates/core/src/inverted_index/search.rs

@@ -14,6 +14,7 @@
 // You should have received a copy of the GNU Affero General Public License
 // along with this program.  If not, see <https://www.gnu.org/licenses/>

+use super::key_phrase::KeyPhrase;
 use super::{DocAddress, InitialSearchResult, InvertedIndex, RetrievedWebpage, WebpagePointer};
 use itertools::Itertools;
 use tantivy::collector::Count;
@@ -298,4 +299,8 @@ impl InvertedIndex {
         res.pop()
             .map(|(_, doc)| self.retrieve_doc(doc.into(), &tv_searcher).unwrap())
     }
+
+    pub(crate) fn top_key_phrases(&self, top_n: usize) -> Vec<KeyPhrase> {
+        KeyPhrase::compute_top(&self.reader, top_n)
+    }
 }

+ 8 - 5
crates/core/src/lib.rs

@@ -344,14 +344,17 @@ impl<T> OneOrMany<T> {
     }
 }

-pub trait TopKOrderable: Ord {
+pub trait TopKOrderable {
     type SortKey: Ord + Copy;

     fn sort_key(&self) -> Self::SortKey;
 }

-impl TopKOrderable for (SortableFloat, webgraph::NodeID) {
-    type SortKey = SortableFloat;
+impl<K, T> TopKOrderable for (K, T)
+where
+    K: Ord + Copy,
+{
+    type SortKey = K;

     fn sort_key(&self) -> Self::SortKey {
         self.0
@@ -391,12 +394,12 @@ where
         top_k.push(hit);
         if top_k.len() >= 2 * k {
             // The standard library does all of the heavy lifting here.
-            let (_, median_el, _) = top_k.select_nth_unstable(k - 1);
+            let (_, median_el, _) = top_k.select_nth_unstable_by_key(k - 1, |el| el.sort_key());
             threshold = Some(median_el.sort_key());
             top_k.truncate(k);
         }
     }
-    top_k.sort_unstable();
+    top_k.sort_unstable_by_key(|el| el.sort_key());
     top_k.truncate(k);
     top_k
 }

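Loosening TopKOrderable from a concrete (SortableFloat, NodeID) impl to any (K, T) tuple with K: Ord + Copy is what lets key_phrase.rs push (Reverse(SortableFloat), String) pairs through sorted_k. A sketch, under the assumption that sorted_k takes an iterator plus k and returns the k smallest elements by sort key:

    use std::cmp::Reverse;

    // Reverse turns "k smallest keys" into "k highest scores".
    let scored = vec![(Reverse(3u64), "c"), (Reverse(1u64), "a"), (Reverse(2u64), "b")];
    let top2 = sorted_k(scored.into_iter(), 2);
    assert_eq!(top2, vec![(Reverse(3u64), "c"), (Reverse(2u64), "b")]);
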
+ 0 - 17
crates/core/src/main.rs

@@ -19,7 +19,6 @@ use serde::de::DeserializeOwned;
 use std::fs;
 use std::path::Path;
 use stract::config;
-use stract::entrypoint::autosuggest_scrape::{self, Gl};

 #[cfg(feature = "dev")]
 use stract::entrypoint::configure;
@@ -81,14 +80,6 @@ enum Commands {
         config_path: String,
     },

-    /// Scrape the Google autosuggest API for search queries.
-    AutosuggestScrape {
-        num_queries: usize,
-        gl: Gl,
-        ms_sleep_between_req: u64,
-        output_dir: String,
-    },
-
     /// Deploy the crawler.
     Crawler {
         #[clap(subcommand)]
@@ -375,14 +366,6 @@ fn main() -> Result<()> {
                 .build()?
                 .block_on(entity_search_server::run(config))?;
         }
-        Commands::AutosuggestScrape {
-            num_queries: queries_to_scrape,
-            gl,
-            ms_sleep_between_req,
-            output_dir,
-        } => {
-            autosuggest_scrape::run(queries_to_scrape, gl, ms_sleep_between_req, output_dir)?;
-        }
         #[cfg(feature = "dev")]
         Commands::Configure { skip_download } => {
             configure::run(skip_download)?;

+ 72 - 22
crates/core/src/schema/text_field.rs

@@ -29,9 +29,12 @@ use crate::{
     enum_dispatch_from_discriminant,
     enum_map::InsertEnumMapKey,
     ranking::bm25::Bm25Constants,
-    tokenizer,
-    tokenizer::fields::{
-        BigramTokenizer, FieldTokenizer, Identity, JsonField, TrigramTokenizer, UrlTokenizer,
+    tokenizer::{
+        self,
+        fields::{
+            BigramTokenizer, FieldTokenizer, Identity, JsonField, NewlineTokenizer,
+            TrigramTokenizer, UrlTokenizer,
+        },
     },
     webpage::Html,
     Result,
@@ -95,6 +98,10 @@ pub trait TextField:
         false
     }

+    fn has_freqs(&self) -> bool {
+        true
+    }
+
     fn is_phrase_searchable(&self) -> bool {
         self.is_searchable() && self.has_pos()
     }
@@ -115,10 +122,10 @@ pub trait TextField:
     }

     fn record_option(&self) -> IndexRecordOption {
-        if self.has_pos() {
-            IndexRecordOption::WithFreqsAndPositions
-        } else {
-            IndexRecordOption::WithFreqs
+        match (self.has_freqs(), self.has_pos()) {
+            (true, true) => IndexRecordOption::WithFreqsAndPositions,
+            (true, false) => IndexRecordOption::WithFreqs,
+            (false, _) => IndexRecordOption::Basic,
         }
     }

@@ -190,6 +197,7 @@ pub enum TextFieldEnum {
     InsertionTimestamp,
     RecipeFirstIngredientTagId,
     Keywords,
+    KeyPhrases,
     Links,
 }

@@ -227,6 +235,7 @@ enum_dispatch_from_discriminant!(TextFieldEnumDiscriminants => TextFieldEnum,
     InsertionTimestamp,
     RecipeFirstIngredientTagId,
     Keywords,
+    KeyPhrases,
     Links,
 ]);

@@ -1516,6 +1525,41 @@ impl TextField for RecipeFirstIngredientTagId {
     }
 }

+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct Links;
+impl TextField for Links {
+    fn name(&self) -> &str {
+        "links"
+    }
+
+    fn has_pos(&self) -> bool {
+        true
+    }
+
+    fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer {
+        FieldTokenizer::Url(UrlTokenizer)
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_text(
+            self.tantivy_field(schema)
+                .unwrap_or_else(|| panic!("could not find field '{}' in index", self.name())),
+            html.anchor_links()
+                .into_iter()
+                .map(|l| l.destination.as_str().to_string())
+                .join("\n"),
+        );
+
+        Ok(())
+    }
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub struct Keywords;
 impl TextField for Keywords {
@@ -1554,36 +1598,42 @@ impl TextField for Keywords {
     }
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub struct Links;
-impl TextField for Links {
+pub struct KeyPhrases;
+impl TextField for KeyPhrases {
     fn name(&self) -> &str {
-        "links"
+        "key_phrases"
     }
 
-    fn has_pos(&self) -> bool {
+    fn is_stored(&self) -> bool {
         true
     }
 
-    fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer {
-        FieldTokenizer::Url(UrlTokenizer)
-    }
-
-    fn add_html_tantivy(
+    fn add_webpage_tantivy(
         &self,
-        html: &Html,
-        _cache: &mut FnCache,
+        webpage: &crate::webpage::Webpage,
         doc: &mut TantivyDocument,
         schema: &tantivy::schema::Schema,
     ) -> Result<()> {
         doc.add_text(
             self.tantivy_field(schema)
                 .unwrap_or_else(|| panic!("could not find field '{}' in index", self.name())),
-            html.anchor_links()
-                .into_iter()
-                .map(|l| l.destination.as_str().to_string())
-                .join("\n"),
+            webpage.keywords.join("\n"),
         );
 
         Ok(())
     }
+
+    fn add_html_tantivy(
+        &self,
+        _: &Html,
+        _: &mut FnCache,
+        _: &mut TantivyDocument,
+        _: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn tokenizer(&self, _: Option<&whatlang::Lang>) -> FieldTokenizer {
+        FieldTokenizer::Newline(NewlineTokenizer::default())
+    }
 }
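
Two things to note in this file: fields can now opt out of term frequencies entirely via `has_freqs`, which maps to `IndexRecordOption::Basic`, and the new stored `KeyPhrases` field writes one key phrase per line so the newline tokenizer can index each whole phrase as a single term. A minimal sketch of that round trip, with illustrative keyword values:

```rust
// Illustrative only: the value KeyPhrases::add_webpage_tantivy stores for a
// page whose extracted keywords are these two phrases. The NewlineTokenizer
// (added later in this commit) splits this back into one token per line, so
// each whole phrase becomes a single indexable term.
let keywords = vec![
    "rust web search".to_string(),
    "open source engine".to_string(),
];
let stored = keywords.join("\n");
assert_eq!(stored, "rust web search\nopen source engine");
```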

+ 47 - 2
crates/core/src/searcher/distributed.rs

@@ -31,12 +31,12 @@ use crate::{
     },
     image_store::Image,
     index::Index,
-    inverted_index::{RetrievedWebpage, WebpagePointer},
+    inverted_index::{KeyPhrase, RetrievedWebpage, WebpagePointer},
     ranking::pipeline::{PrecisionRankingWebpage, RecallRankingWebpage},
     Result,
 };
 
-use std::{collections::HashMap, sync::Arc};
+use std::{collections::HashMap, sync::Arc, time::Duration};
 
 use fnv::FnvHashMap;
 use futures::future::join_all;
@@ -92,6 +92,8 @@ pub trait SearchClient {
         max_height: Option<u64>,
         max_width: Option<u64>,
     ) -> impl Future<Output = Result<Option<Image>>> + Send;
+
+    fn top_key_phrases(&self, top_n: usize) -> impl Future<Output = Vec<KeyPhrase>> + Send;
 }
 
 #[derive(Clone, Debug)]
@@ -382,6 +384,45 @@ impl SearchClient for DistributedSearcher {
             .and_then(|(_, mut v)| v.pop())
             .and_then(|(_, v)| v)
     }
+
+    async fn top_key_phrases(&self, top_n: usize) -> Vec<KeyPhrase> {
+        let client = self.conn().await;
+
+        let res = client
+            .send_with_timeout(
+                search_server::TopKeyPhrases { top_n },
+                &AllShardsSelector,
+                &RandomReplicaSelector,
+                Duration::from_secs(60 * 60),
+            )
+            .await;
+
+        match res {
+            Ok(res) => {
+                let mut phrases = HashMap::new();
+
+                for (_, v) in res {
+                    for (_, v) in v {
+                        for phrase in v {
+                            *phrases.entry(phrase.text().to_string()).or_default() +=
+                                phrase.score();
+                        }
+                    }
+                }
+
+                phrases
+                    .into_iter()
+                    .map(|(phrase, score)| KeyPhrase::new(phrase, score))
+                    .sorted_by(|a, b| b.score().partial_cmp(&a.score()).unwrap())
+                    .take(top_n)
+                    .collect()
+            }
+            Err(e) => {
+                tracing::error!("failed to get key phrases: {:?}", e);
+                Vec::new()
+            }
+        }
+    }
 }
 
 /// This should only be used for testing and benchmarks.
@@ -457,4 +498,8 @@ impl SearchClient for LocalSearchClient {
     async fn search_entity(&self, _query: &str) -> Option<EntityMatch> {
         None
     }
+
+    async fn top_key_phrases(&self, top_n: usize) -> Vec<KeyPhrase> {
+        self.0.top_key_phrases(top_n)
+    }
 }
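
The distributed implementation above queries all shards, sums the scores of identical phrases across shard responses, and only then takes the global top `n`. The same merge logic as a standalone function for clarity, assuming the `f64` score that the `partial_cmp` sort implies:

```rust
use std::collections::HashMap;

use itertools::Itertools;

use crate::inverted_index::KeyPhrase;

// Cross-shard merge: a phrase reported by several shards has its scores
// summed before the global top_n cut, so globally common phrases win even
// if no single shard ranks them first.
fn merge_shard_phrases(shards: Vec<Vec<KeyPhrase>>, top_n: usize) -> Vec<KeyPhrase> {
    let mut scores: HashMap<String, f64> = HashMap::new();

    for shard in shards {
        for phrase in shard {
            *scores.entry(phrase.text().to_string()).or_default() += phrase.score();
        }
    }

    scores
        .into_iter()
        .map(|(text, score)| KeyPhrase::new(text, score))
        .sorted_by(|a, b| b.score().partial_cmp(&a.score()).unwrap())
        .take(top_n)
        .collect()
}
```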

+ 5 - 1
crates/core/src/searcher/local.rs

@@ -23,7 +23,7 @@ use url::Url;
 use crate::collector::approx_count;
 use crate::config::{CollectorConfig, SnippetConfig};
 use crate::index::Index;
-use crate::inverted_index::{InvertedIndex, RetrievedWebpage};
+use crate::inverted_index::{InvertedIndex, KeyPhrase, RetrievedWebpage};
 use crate::models::dual_encoder::DualEncoder;
 use crate::query::Query;
 use crate::ranking::models::lambdamart::LambdaMART;
@@ -388,6 +388,10 @@ where
     pub fn get_homepage(&self, url: &Url) -> Option<RetrievedWebpage> {
         self.index.guard().inverted_index().get_homepage(url)
     }
+
+    pub fn top_key_phrases(&self, top_n: usize) -> Vec<KeyPhrase> {
+        self.index.guard().inverted_index().top_key_phrases(top_n)
+    }
 }
 
 #[cfg(test)]
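
A quick local usage sketch. The `searcher` binding is an already-constructed local searcher (construction elided), and `1_000` mirrors the new `top_phrases_for_autosuggest` config value from this commit:

```rust
// Sketch: inspect the highest-scoring key phrases in a local index.
let phrases = searcher.top_key_phrases(1_000);
for phrase in phrases.iter().take(10) {
    println!("{}\t{}", phrase.text(), phrase.score());
}
```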

+ 6 - 1
crates/core/src/tokenizer/fields/mod.rs

@@ -18,12 +18,14 @@ use tantivy::tokenizer::BoxTokenStream;
 
 pub use self::{
     bigram::BigramTokenizer, default::DefaultTokenizer, identity::Identity, json::FlattenedJson,
-    json::JsonField, stemmed::Stemmed, trigram::TrigramTokenizer, url::UrlTokenizer,
+    json::JsonField, split_newlines::NewlineTokenizer, stemmed::Stemmed, trigram::TrigramTokenizer,
+    url::UrlTokenizer,
 };
 
 mod default;
 mod identity;
 mod json;
+mod split_newlines;
 mod stemmed;
 mod url;
 
@@ -40,6 +42,7 @@ pub enum FieldTokenizer {
     Trigram(TrigramTokenizer),
     Json(JsonField),
     Url(UrlTokenizer),
+    Newline(NewlineTokenizer),
 }
 
 impl FieldTokenizer {
@@ -52,6 +55,7 @@ impl FieldTokenizer {
             FieldTokenizer::Trigram(_) => TrigramTokenizer::as_str(),
             FieldTokenizer::Json(_) => JsonField::as_str(),
             FieldTokenizer::Url(_) => UrlTokenizer::as_str(),
+            FieldTokenizer::Newline(_) => NewlineTokenizer::as_str(),
         }
     }
 }
@@ -79,6 +83,7 @@ impl tantivy::tokenizer::Tokenizer for FieldTokenizer {
             FieldTokenizer::Bigram(tokenizer) => tokenizer.token_stream(text),
             FieldTokenizer::Trigram(tokenizer) => tokenizer.token_stream(text),
             FieldTokenizer::Url(tokenizer) => tokenizer.token_stream(text),
+            FieldTokenizer::Newline(tokenizer) => tokenizer.token_stream(text),
         }
     }
 }
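
Wiring the new variant into the enum is all the dispatch needs. For completeness, a hedged sketch of how a tokenizer like this gets registered with a tantivy index by name; Stract's actual registration happens elsewhere and is not part of this diff:

```rust
use tantivy::tokenizer::TextAnalyzer;

// Assumption: registration elsewhere looks roughly like this. `index` is an
// existing tantivy Index; the name returned by NewlineTokenizer::as_str()
// ("newline") is what schema fields reference to select this tokenizer.
let analyzer = TextAnalyzer::builder(NewlineTokenizer::default()).build();
index
    .tokenizers()
    .register(NewlineTokenizer::as_str(), analyzer);
```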

+ 145 - 0
crates/core/src/tokenizer/fields/split_newlines.rs

@@ -0,0 +1,145 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+use tantivy::tokenizer::{BoxTokenStream, TextAnalyzer};
+
+use crate::tokenizer::{self, normalizer, split_with_range::SplitWithRange, Normalize};
+
+#[derive(Clone, Default)]
+pub struct NewlineTokenizer {
+    analyzer: Option<TextAnalyzer>,
+}
+
+impl NewlineTokenizer {
+    pub fn as_str() -> &'static str {
+        "newline"
+    }
+}
+
+impl tantivy::tokenizer::Tokenizer for NewlineTokenizer {
+    type TokenStream<'a> = BoxTokenStream<'a>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        let builder = TextAnalyzer::builder(Newline);
+
+        self.analyzer = Some(builder.build());
+
+        self.analyzer.as_mut().unwrap().token_stream(text)
+    }
+}
+
+#[derive(Clone)]
+pub struct Newline;
+
+pub struct NewlineTokenStream<'a> {
+    stream: Box<dyn Iterator<Item = tokenizer::Token<'a>> + 'a>,
+    token: Option<tantivy::tokenizer::Token>,
+    next_position: usize,
+}
+
+impl tantivy::tokenizer::Tokenizer for Newline {
+    type TokenStream<'a> = BoxTokenStream<'a>;
+
+    fn token_stream<'a>(&mut self, text: &'a str) -> Self::TokenStream<'a> {
+        let stream = Box::new(
+            text.split_with_range(|c| c == '\n' || c == '\r')
+                .map(|(s, range)| tokenizer::Token::new(s, range))
+                .normalize(&normalizer::Lowercase)
+                .normalize(&normalizer::UnicodeNFKD)
+                .normalize(&normalizer::UnicodeDiacritics),
+        );
+
+        BoxTokenStream::new(NewlineTokenStream::new_boxed(stream))
+    }
+}
+
+impl<'a> tantivy::tokenizer::TokenStream for NewlineTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        self.token = self.stream.next().map(|token| {
+            let span = token.span();
+            let pos = self.next_position;
+            self.next_position += 1;
+            tantivy::tokenizer::Token {
+                offset_from: span.start,
+                offset_to: span.end,
+                position: pos,
+                text: token.text().to_string(),
+                ..Default::default()
+            }
+        });
+
+        self.token.is_some()
+    }
+
+    fn token(&self) -> &tantivy::tokenizer::Token {
+        self.token.as_ref().unwrap()
+    }
+
+    fn token_mut(&mut self) -> &mut tantivy::tokenizer::Token {
+        self.token.as_mut().unwrap()
+    }
+}
+
+impl<'a> NewlineTokenStream<'a> {
+    fn new_boxed(
+        stream: Box<dyn Iterator<Item = tokenizer::Token<'a>> + 'a>,
+    ) -> BoxTokenStream<'a> {
+        BoxTokenStream::new(Self {
+            stream,
+            token: None,
+            next_position: 0,
+        })
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use tantivy::tokenizer::Tokenizer as _;
+
+    fn tokenize_newline(s: &str) -> Vec<String> {
+        let mut res = Vec::new();
+        let mut tokenizer = NewlineTokenizer::default();
+        let mut stream = tokenizer.token_stream(s);
+
+        while let Some(token) = stream.next() {
+            res.push(token.text.clone());
+        }
+
+        res
+    }
+
+    #[test]
+    fn newline_tokenizer() {
+        assert!(tokenize_newline("").is_empty());
+        assert_eq!(tokenize_newline("a\nb"), vec!["a", "b"]);
+        assert_eq!(tokenize_newline("a\nb\n"), vec!["a", "b"]);
+        assert_eq!(tokenize_newline("\na\nb\n"), vec!["a", "b"]);
+        assert_eq!(tokenize_newline("\na\nb\nc"), vec!["a", "b", "c"]);
+    }
+
+    #[test]
+    fn newline_tokenizer_without_newlines() {
+        assert!(tokenize_newline("").is_empty());
+        assert_eq!(tokenize_newline("test"), vec!["test"]);
+
+        assert_eq!(tokenize_newline("this is"), vec!["this is"]);
+        assert_eq!(tokenize_newline("this is a"), vec!["this is a",]);
+        assert_eq!(tokenize_newline("this is a test"), vec!["this is a test",]);
+
+        assert_eq!(tokenize_newline("this.is"), vec!["this.is"]);
+    }
+}
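
Beyond the tests above, a short behavior check. Assuming the `Lowercase`, `UnicodeNFKD`, and `UnicodeDiacritics` normalizers do what their names suggest, each line comes out as a single lowercased, diacritic-free token with an increasing position:

```rust
use tantivy::tokenizer::Tokenizer as _;

// Each line becomes one token; the normalizer chain lowercases and strips
// diacritics, so "Café" is indexed as "cafe".
let mut tokenizer = NewlineTokenizer::default();
let mut stream = tokenizer.token_stream("Rust Web Search\nCafé");

let mut tokens = Vec::new();
while let Some(token) = stream.next() {
    tokens.push((token.position, token.text.clone()));
}

assert_eq!(
    tokens,
    vec![(0, "rust web search".to_string()), (1, "cafe".to_string())]
);
```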

+ 4 - 2
crates/core/src/web_spell/term_freqs.rs

@@ -216,15 +216,17 @@ impl TermDict {
             return;
         }
 
+        let num_chars = term.chars().count();
+
         let punctuation_percentage =
-            term.chars().filter(|c| c.is_ascii_punctuation()).count() as f64 / term.len() as f64;
+            term.chars().filter(|c| c.is_ascii_punctuation()).count() as f64 / num_chars as f64;
 
         if punctuation_percentage > 0.5 {
             return;
         }
 
         let non_alphabetic_percentage =
-            term.chars().filter(|c| !c.is_alphabetic()).count() as f64 / term.len() as f64;
+            term.chars().filter(|c| !c.is_alphabetic()).count() as f64 / num_chars as f64;
 
         if non_alphabetic_percentage > 0.25 {
             return;
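
The denominators here previously used `term.len()`, which counts bytes, while the numerators count chars; the two only agree on ASCII. A worked example of the difference the fix makes:

```rust
// `str::len()` is a byte count, `chars().count()` a character count, so the
// old ratios were biased low for multi-byte UTF-8 terms.
let term = "東京1"; // two CJK chars (3 bytes each) plus one ASCII digit
assert_eq!(term.len(), 7);
assert_eq!(term.chars().count(), 3);

// Non-alphabetic ratio for the filter above:
//   before: 1 / 7 ≈ 0.14 -> term kept
//   after:  1 / 3 ≈ 0.33 -> term rejected (> 0.25)
```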