View source

Schema fields as traits (#185)

* refactor data that is re-used across fields for a particular page during indexing into an 'FnCache'

* automatically generate ALL_FIELDS and ALL_SIGNALS arrays with strum macro. ensures the arrays are always fully up to date

* split up schema fields into submodules

* add textfield trait with enum-dispatch

* add fastfield trait with enum-dispatch

* move field names into trait

* move some trivial functions from 'FastFieldEnum' and 'TextFieldEnum' into their respective traits

* move methods from Field into TextField and FastField traits

* extract html .as_tantivy into textfield trait

* extract html .as_tantivy into fastfield trait

* extract webpage .as_tantivy into field traits

* fix indexer example cleanup
Mikkel Denker, 1 year ago
Commit
2dadbf70d6
34 files changed, 3325 insertions, 1751 deletions
  1. 1 0
      .gitignore
  2. 38 2
      Cargo.lock
  3. 2 0
      Cargo.toml
  4. 5 2
      assets/licenses.html
  5. 2 0
      crates/core/Cargo.toml
  6. 1 1
      crates/core/examples/indexer.rs
  7. 20 8
      crates/core/src/collector.rs
  8. 24 19
      crates/core/src/enum_map.rs
  9. 5 5
      crates/core/src/fastfield_reader.rs
  10. 36 33
      crates/core/src/inverted_index.rs
  11. 4 4
      crates/core/src/mapreduce/dht/mod.rs
  12. 10 10
      crates/core/src/mapreduce/dht/network/mod.rs
  13. 12 16
      crates/core/src/mapreduce/dht/network/raft.rs
  14. 2 2
      crates/core/src/query/mod.rs
  15. 10 10
      crates/core/src/query/optic.rs
  16. 10 10
      crates/core/src/query/parser/as_tantivy.rs
  17. 3 2
      crates/core/src/query/parser/mod.rs
  18. 4 4
      crates/core/src/query/pattern_query/mod.rs
  19. 4 4
      crates/core/src/query/pattern_query/scorer.rs
  20. 26 16
      crates/core/src/query/pattern_query/weight.rs
  21. 3 3
      crates/core/src/ranking/pipeline/stages/recall.rs
  22. 62 96
      crates/core/src/ranking/signal.rs
  23. 0 850
      crates/core/src/schema.rs
  24. 1233 0
      crates/core/src/schema/fast_field.rs
  25. 139 0
      crates/core/src/schema/mod.rs
  26. 1427 0
      crates/core/src/schema/text_field.rs
  27. 3 3
      crates/core/src/searcher/api/mod.rs
  28. 2 2
      crates/core/src/searcher/local.rs
  29. 136 0
      crates/core/src/webpage/html/fn_cache.rs
  30. 70 461
      crates/core/src/webpage/html/into_tantivy.rs
  31. 13 15
      crates/core/src/webpage/html/microformats.rs
  32. 3 0
      crates/core/src/webpage/html/mod.rs
  33. 7 4
      crates/core/src/webpage/html/robots_meta.rs
  34. 8 169
      crates/core/src/webpage/mod.rs

+ 1 - 0
.gitignore

@@ -16,3 +16,4 @@ proptest-regressions
 *.pending-snap
 .ipynb_checkpoints
 .zed
+rustc-ice-*

+ 38 - 2
Cargo.lock

@@ -1373,6 +1373,18 @@ dependencies = [
  "syn 2.0.52",
  "syn 2.0.52",
 ]
 ]
 
 
+[[package]]
+name = "enum_dispatch"
+version = "0.3.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f33313078bb8d4d05a2733a94ac4c2d8a0df9a2b84424ebf4f33bfc224a890e"
+dependencies = [
+ "once_cell",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.52",
+]
+
 [[package]]
 [[package]]
 name = "equivalent"
 name = "equivalent"
 version = "1.0.1"
 version = "1.0.1"
@@ -4174,8 +4186,8 @@ dependencies = [
  "smallvec",
  "smallvec",
  "snap",
  "snap",
  "socket2",
  "socket2",
- "strum",
- "strum_macros",
+ "strum 0.23.0",
+ "strum_macros 0.23.1",
  "thiserror",
  "thiserror",
  "tokio",
  "tokio",
  "tracing",
  "tracing",
@@ -4536,6 +4548,7 @@ dependencies = [
  "csv",
  "csv",
  "dashmap",
  "dashmap",
  "encoding_rs",
  "encoding_rs",
+ "enum_dispatch",
  "eventsource-stream",
  "eventsource-stream",
  "fend-core",
  "fend-core",
  "flate2",
  "flate2",
@@ -4586,6 +4599,7 @@ dependencies = [
  "serde",
  "serde",
  "serde_json",
  "serde_json",
  "serde_urlencoded",
  "serde_urlencoded",
+ "strum 0.26.2",
  "tantivy",
  "tantivy",
  "thiserror",
  "thiserror",
  "tikv-jemallocator",
  "tikv-jemallocator",
@@ -4650,6 +4664,15 @@ version = "0.23.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb"
 checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb"
 
 
+[[package]]
+name = "strum"
+version = "0.26.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5d8cec3501a5194c432b2b7976db6b7d10ec95c253208b45f83f7136aa985e29"
+dependencies = [
+ "strum_macros 0.26.2",
+]
+
 [[package]]
 [[package]]
 name = "strum_macros"
 name = "strum_macros"
 version = "0.23.1"
 version = "0.23.1"
@@ -4663,6 +4686,19 @@ dependencies = [
  "syn 1.0.109",
  "syn 1.0.109",
 ]
 ]
 
 
+[[package]]
+name = "strum_macros"
+version = "0.26.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6cf59daf282c0a494ba14fd21610a0325f9f90ec9d1231dea26bcb1d696c946"
+dependencies = [
+ "heck 0.4.1",
+ "proc-macro2",
+ "quote",
+ "rustversion",
+ "syn 2.0.52",
+]
+
 [[package]]
 [[package]]
 name = "subtle"
 name = "subtle"
 version = "2.5.0"
 version = "2.5.0"

+ 2 - 0
Cargo.toml

@@ -40,6 +40,7 @@ crossbeam-channel = "0.5.6"
 csv = "1.1.6"
 dashmap = { version = "5.4.0", features = ["rayon"] }
 encoding_rs = "0.8.31"
+enum_dispatch = "0.3.12"
 eventsource-stream = "0.2.3"
 fend-core = "1.2.2"
 flate2 = "1.0.28"
@@ -95,6 +96,7 @@ scylla = { version = "0.12.0", features = ["chrono"] }
 serde = { version = "1.0.137", features = ["rc", "derive"] }
 serde_json = "1.0.81"
 serde_urlencoded = "0.7.1"
+strum = { version = "0.26.2", features = ["derive"] }
 tantivy = { git = "https://github.com/quickwit-oss/tantivy", rev = "182f58cea" }
 thiserror = "1.0.31"
 tikv-jemallocator = "0.5"

+ 5 - 2
assets/licenses.html

@@ -44,8 +44,8 @@
     
     
         <h2>Overview of licenses:</h2>
         <h2>Overview of licenses:</h2>
         <ul class="licenses-overview">
         <ul class="licenses-overview">
-            <li><a href="#Apache-2.0">Apache License 2.0</a> (394)</li>
-            <li><a href="#MIT">MIT License</a> (180)</li>
+            <li><a href="#Apache-2.0">Apache License 2.0</a> (395)</li>
+            <li><a href="#MIT">MIT License</a> (182)</li>
             <li><a href="#MPL-2.0">Mozilla Public License 2.0</a> (9)</li>
             <li><a href="#MPL-2.0">Mozilla Public License 2.0</a> (9)</li>
             <li><a href="#BSD-3-Clause">BSD 3-Clause &quot;New&quot; or &quot;Revised&quot; License</a> (8)</li>
             <li><a href="#BSD-3-Clause">BSD 3-Clause &quot;New&quot; or &quot;Revised&quot; License</a> (8)</li>
             <li><a href="#Unicode-DFS-2016">Unicode License Agreement - Data Files and Software (2016)</a> (5)</li>
             <li><a href="#Unicode-DFS-2016">Unicode License Agreement - Data Files and Software (2016)</a> (5)</li>
@@ -9889,6 +9889,7 @@ limitations under the License.
                     <li><a href=" https://github.com/zrzka/anes-rs ">anes 0.1.6</a></li>
                     <li><a href=" https://github.com/zrzka/anes-rs ">anes 0.1.6</a></li>
                     <li><a href=" https://github.com/huggingface/candle ">candle-nn 0.3.3</a></li>
                     <li><a href=" https://github.com/huggingface/candle ">candle-nn 0.3.3</a></li>
                     <li><a href=" https://github.com/huggingface/candle ">candle-transformers 0.3.3</a></li>
                     <li><a href=" https://github.com/huggingface/candle ">candle-transformers 0.3.3</a></li>
+                    <li><a href=" https://gitlab.com/antonok/enum_dispatch ">enum_dispatch 0.3.12</a></li>
                     <li><a href=" https://github.com/jpopesculian/eventsource-stream ">eventsource-stream 0.2.3</a></li>
                     <li><a href=" https://github.com/jpopesculian/eventsource-stream ">eventsource-stream 0.2.3</a></li>
                     <li><a href=" https://github.com/cbreeden/fxhash ">fxhash 0.2.1</a></li>
                     <li><a href=" https://github.com/cbreeden/fxhash ">fxhash 0.2.1</a></li>
                     <li><a href=" https://github.com/starkat99/half-rs ">half 2.4.0</a></li>
                     <li><a href=" https://github.com/starkat99/half-rs ">half 2.4.0</a></li>
@@ -13161,7 +13162,9 @@ SOFTWARE.
                 <h4>Used by:</h4>
                 <h4>Used by:</h4>
                 <ul class="license-used-by">
                 <ul class="license-used-by">
                     <li><a href=" https://github.com/Peternator7/strum ">strum 0.23.0</a></li>
                     <li><a href=" https://github.com/Peternator7/strum ">strum 0.23.0</a></li>
+                    <li><a href=" https://github.com/Peternator7/strum ">strum 0.26.2</a></li>
                     <li><a href=" https://github.com/Peternator7/strum ">strum_macros 0.23.1</a></li>
                     <li><a href=" https://github.com/Peternator7/strum ">strum_macros 0.23.1</a></li>
+                    <li><a href=" https://github.com/Peternator7/strum ">strum_macros 0.26.2</a></li>
                 </ul>
                 </ul>
                 <pre class="license-text">MIT License
                 <pre class="license-text">MIT License
 
 

+ 2 - 0
crates/core/Cargo.toml

@@ -45,6 +45,7 @@ crossbeam-channel = { workspace = true }
 csv = { workspace = true }
 dashmap = { workspace = true }
 encoding_rs = { workspace = true }
+enum_dispatch = { workspace = true }
 eventsource-stream = { workspace = true }
 fend-core = { workspace = true }
 flate2 = { workspace = true }
@@ -91,6 +92,7 @@ scylla = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
 serde_urlencoded = { workspace = true }
+strum = { workspace = true }
 tantivy = { workspace = true }
 thiserror = { workspace = true }
 tokenizers = { workspace = true }

+ 1 - 1
crates/core/examples/indexer.rs

@@ -62,6 +62,6 @@ fn main() -> anyhow::Result<()> {
 
     println!("Indexing took {:?}", start.elapsed());
 
-    std::fs::remove_dir(path)?;
+    std::fs::remove_dir_all(path)?;
     Ok(())
 }

+ 20 - 8
crates/core/src/collector.rs

@@ -30,7 +30,7 @@ use crate::{
     inverted_index::{DocAddress, WebpagePointer},
     inverted_index::{DocAddress, WebpagePointer},
     prehashed::Prehashed,
     prehashed::Prehashed,
     ranking::initial::{InitialScoreTweaker, Score},
     ranking::initial::{InitialScoreTweaker, Score},
-    schema::FastField,
+    schema::{fast_field, FastFieldEnum},
     simhash,
     simhash,
 };
 };
 
 
@@ -139,7 +139,7 @@ pub struct TopSegmentCollector {
 }
 }
 
 
 impl TopSegmentCollector {
 impl TopSegmentCollector {
-    fn get_hash(&self, doc: DocId, field1: FastField, field2: FastField) -> Prehashed {
+    fn get_hash(&self, doc: DocId, field1: FastFieldEnum, field2: FastFieldEnum) -> Prehashed {
         let field_reader = self.fastfield_segment_reader.get_field_reader(doc);
         let field_reader = self.fastfield_segment_reader.get_field_reader(doc);
 
 
         let hash = [
         let hash = [
@@ -169,19 +169,31 @@ impl TopSegmentCollector {
         let simhash: Option<u64> = self
         let simhash: Option<u64> = self
             .fastfield_segment_reader
             .fastfield_segment_reader
             .get_field_reader(doc)
             .get_field_reader(doc)
-            .get(FastField::SimHash)
+            .get(fast_field::SimHash.into())
             .unwrap()
             .unwrap()
             .into();
             .into();
 
 
         self.bucket_collector.insert(SegmentDoc {
         self.bucket_collector.insert(SegmentDoc {
             hashes: Hashes {
             hashes: Hashes {
-                site: self.get_hash(doc, FastField::SiteHash1, FastField::SiteHash2),
-                title: self.get_hash(doc, FastField::TitleHash1, FastField::TitleHash2),
-                url: self.get_hash(doc, FastField::UrlHash1, FastField::UrlHash2),
+                site: self.get_hash(
+                    doc,
+                    fast_field::SiteHash1.into(),
+                    fast_field::SiteHash2.into(),
+                ),
+                title: self.get_hash(
+                    doc,
+                    fast_field::TitleHash1.into(),
+                    fast_field::TitleHash2.into(),
+                ),
+                url: self.get_hash(
+                    doc,
+                    fast_field::UrlHash1.into(),
+                    fast_field::UrlHash2.into(),
+                ),
                 url_without_tld: self.get_hash(
                 url_without_tld: self.get_hash(
                     doc,
                     doc,
-                    FastField::UrlWithoutTldHash1,
-                    FastField::UrlWithoutTldHash2,
+                    fast_field::UrlWithoutTldHash1.into(),
+                    fast_field::UrlWithoutTldHash2.into(),
                 ),
                 ),
                 simhash: simhash.unwrap(),
                 simhash: simhash.unwrap(),
             },
             },

+ 24 - 19
crates/core/src/enum_map.rs

@@ -16,14 +16,22 @@
 
 
 use serde::{Deserialize, Serialize};
 use serde::{Deserialize, Serialize};
 
 
+pub trait InsertEnumMapKey: Sized {
+    fn into_usize(self) -> usize;
+}
+
+pub trait GetEnumMapKey: Sized {
+    fn from_usize(value: usize) -> Option<Self>;
+}
+
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct EnumMap<K: Into<usize>, V> {
+pub struct EnumMap<K: InsertEnumMapKey, V> {
     inner: Vec<Option<V>>,
     inner: Vec<Option<V>>,
     len: usize,
     len: usize,
     _phantom: std::marker::PhantomData<K>,
     _phantom: std::marker::PhantomData<K>,
 }
 }
 
 
-impl<K: Into<usize>, V> Default for EnumMap<K, V> {
+impl<K: InsertEnumMapKey, V> Default for EnumMap<K, V> {
     fn default() -> Self {
     fn default() -> Self {
         Self::new()
         Self::new()
     }
     }
@@ -31,7 +39,7 @@ impl<K: Into<usize>, V> Default for EnumMap<K, V> {
 
 
 impl<K, V> EnumMap<K, V>
 impl<K, V> EnumMap<K, V>
 where
 where
-    K: Into<usize>,
+    K: InsertEnumMapKey,
 {
 {
     pub fn new() -> Self {
     pub fn new() -> Self {
         Self {
         Self {
@@ -42,7 +50,7 @@ where
     }
     }
 
 
     pub fn insert(&mut self, key: K, value: V) {
     pub fn insert(&mut self, key: K, value: V) {
-        let key = key.into();
+        let key = key.into_usize();
 
 
         if key >= self.inner.len() {
         if key >= self.inner.len() {
             self.inner.resize_with(key + 1, || None);
             self.inner.resize_with(key + 1, || None);
@@ -60,7 +68,7 @@ where
     }
     }
 
 
     pub fn get(&self, key: K) -> Option<&V> {
     pub fn get(&self, key: K) -> Option<&V> {
-        let key = key.into();
+        let key = key.into_usize();
         if key >= self.inner.len() {
         if key >= self.inner.len() {
             None
             None
         } else {
         } else {
@@ -81,7 +89,7 @@ where
     }
     }
 
 
     pub fn get_mut(&mut self, key: K) -> Option<&mut V> {
     pub fn get_mut(&mut self, key: K) -> Option<&mut V> {
-        let key = key.into();
+        let key = key.into_usize();
         if key >= self.inner.len() {
         if key >= self.inner.len() {
             None
             None
         } else {
         } else {
@@ -92,19 +100,19 @@ where
 
 
 impl<K, V> EnumMap<K, V>
 impl<K, V> EnumMap<K, V>
 where
 where
-    K: TryFrom<usize> + Into<usize>,
+    K: GetEnumMapKey + InsertEnumMapKey,
 {
 {
     pub fn keys(&self) -> impl Iterator<Item = K> + '_ {
     pub fn keys(&self) -> impl Iterator<Item = K> + '_ {
         self.inner
         self.inner
             .iter()
             .iter()
             .enumerate()
             .enumerate()
-            .filter_map(|(key, value)| value.as_ref().and_then(|_| K::try_from(key).ok()))
+            .filter_map(|(key, value)| value.as_ref().and_then(|_| K::from_usize(key)))
     }
     }
 }
 }
 
 
 impl<K, V> FromIterator<(K, V)> for EnumMap<K, V>
 impl<K, V> FromIterator<(K, V)> for EnumMap<K, V>
 where
 where
-    K: Into<usize>,
+    K: InsertEnumMapKey,
 {
 {
     fn from_iter<T: IntoIterator<Item = (K, V)>>(iter: T) -> Self {
     fn from_iter<T: IntoIterator<Item = (K, V)>>(iter: T) -> Self {
         let mut map = Self::new();
         let mut map = Self::new();
@@ -118,17 +126,17 @@ where
 }
 }
 
 
 #[derive(Clone, Debug, Serialize, Deserialize)]
 #[derive(Clone, Debug, Serialize, Deserialize)]
-pub struct EnumSet<K: Into<usize>> {
+pub struct EnumSet<K: InsertEnumMapKey> {
     map: EnumMap<K, ()>,
     map: EnumMap<K, ()>,
 }
 }
 
 
-impl<K: Into<usize>> Default for EnumSet<K> {
+impl<K: InsertEnumMapKey> Default for EnumSet<K> {
     fn default() -> Self {
     fn default() -> Self {
         Self::new()
         Self::new()
     }
     }
 }
 }
 
 
-impl<K: Into<usize>> EnumSet<K> {
+impl<K: InsertEnumMapKey> EnumSet<K> {
     pub fn new() -> Self {
     pub fn new() -> Self {
         Self {
         Self {
             map: EnumMap::new(),
             map: EnumMap::new(),
@@ -148,10 +156,7 @@ impl<K: Into<usize>> EnumSet<K> {
     }
     }
 }
 }
 
 
-impl<K> EnumSet<K>
-where
-    K: TryFrom<usize> + Into<usize>,
-{
+impl<K: InsertEnumMapKey + GetEnumMapKey> EnumSet<K> {
     pub fn iter(&self) -> impl Iterator<Item = K> + '_ {
     pub fn iter(&self) -> impl Iterator<Item = K> + '_ {
         self.map.keys()
         self.map.keys()
     }
     }
@@ -168,9 +173,9 @@ mod tests {
         C,
         C,
     }
     }
 
 
-    impl From<TestEnum> for usize {
-        fn from(val: TestEnum) -> Self {
-            val as usize
+    impl InsertEnumMapKey for TestEnum {
+        fn into_usize(self) -> usize {
+            self as usize
         }
         }
     }
     }
 
 

+ 5 - 5
crates/core/src/fastfield_reader.rs

@@ -20,7 +20,7 @@ use tantivy::{columnar::ColumnValues, DocId, SegmentId};
 
 
 use crate::{
 use crate::{
     enum_map::EnumMap,
     enum_map::EnumMap,
-    schema::{DataType, FastField, Field},
+    schema::{fast_field::FastField, DataType, FastFieldEnum, Field},
 };
 };
 
 
 #[derive(Default, Clone)]
 #[derive(Default, Clone)]
@@ -53,7 +53,7 @@ impl FastFieldReader {
             let mut u64s = EnumMap::new();
             let mut u64s = EnumMap::new();
             let mut bytes = EnumMap::new();
             let mut bytes = EnumMap::new();
 
 
-            for field in Field::all().filter_map(Field::as_fast) {
+            for field in Field::all().filter_map(|f| f.as_fast()) {
                 match field.data_type() {
                 match field.data_type() {
                     DataType::U64 => {
                     DataType::U64 => {
                         if let Ok(reader) = fastfield_readers.u64(field.name()) {
                         if let Ok(reader) = fastfield_readers.u64(field.name()) {
@@ -83,8 +83,8 @@ impl FastFieldReader {
 }
 }
 
 
 struct AllReaders {
 struct AllReaders {
-    u64s: EnumMap<FastField, tantivy::columnar::Column<u64>>,
-    bytes: EnumMap<FastField, tantivy::columnar::BytesColumn>,
+    u64s: EnumMap<FastFieldEnum, tantivy::columnar::Column<u64>>,
+    bytes: EnumMap<FastFieldEnum, tantivy::columnar::BytesColumn>,
 }
 }
 
 
 pub enum Value {
 pub enum Value {
@@ -147,7 +147,7 @@ pub struct FieldReader<'a> {
 }
 }
 
 
 impl<'a> FieldReader<'a> {
 impl<'a> FieldReader<'a> {
-    pub fn get(&self, field: FastField) -> Option<Value> {
+    pub fn get(&self, field: FastFieldEnum) -> Option<Value> {
         match field.data_type() {
         match field.data_type() {
             DataType::U64 => Some(
             DataType::U64 => Some(
                 self.readers
                 self.readers

+ 36 - 33
crates/core/src/inverted_index.rs

@@ -45,7 +45,8 @@ use crate::query::Query;
 use crate::ranking::initial::Score;
 use crate::ranking::initial::Score;
 use crate::ranking::pipeline::RecallRankingWebpage;
 use crate::ranking::pipeline::RecallRankingWebpage;
 use crate::ranking::SignalAggregator;
 use crate::ranking::SignalAggregator;
-use crate::schema::{FastField, Field, TextField};
+use crate::schema::text_field::TextField;
+use crate::schema::{fast_field, text_field, FastFieldEnum, Field, TextFieldEnum};
 use crate::search_ctx::Ctx;
 use crate::search_ctx::Ctx;
 use crate::snippet;
 use crate::snippet;
 use crate::snippet::TextSnippet;
 use crate::snippet::TextSnippet;
@@ -177,7 +178,9 @@ impl InvertedIndex {
         } else {
         } else {
             let index_settings = tantivy::IndexSettings {
             let index_settings = tantivy::IndexSettings {
                 sort_by_field: Some(tantivy::IndexSortByField {
                 sort_by_field: Some(tantivy::IndexSortByField {
-                    field: Field::Fast(FastField::PreComputedScore).name().to_string(),
+                    field: Field::Fast(FastFieldEnum::from(fast_field::PreComputedScore))
+                        .name()
+                        .to_string(),
                     order: tantivy::Order::Desc,
                     order: tantivy::Order::Desc,
                 }),
                 }),
                 ..Default::default()
                 ..Default::default()
@@ -316,9 +319,7 @@ impl InvertedIndex {
 
 
     pub fn delete_all_before(&self, timestamp: tantivy::DateTime) -> Result<()> {
     pub fn delete_all_before(&self, timestamp: tantivy::DateTime) -> Result<()> {
         let query = tantivy::query::RangeQuery::new_date_bounds(
         let query = tantivy::query::RangeQuery::new_date_bounds(
-            Field::Text(TextField::InsertionTimestamp)
-                .name()
-                .to_string(),
+            text_field::InsertionTimestamp.name().to_string(),
             std::ops::Bound::Unbounded,
             std::ops::Bound::Unbounded,
             std::ops::Bound::Excluded(timestamp),
             std::ops::Bound::Excluded(timestamp),
         );
         );
@@ -425,7 +426,7 @@ impl InvertedIndex {
 
 
         let field = self
         let field = self
             .schema()
             .schema()
-            .get_field(Field::Fast(FastField::HostNodeID).name())
+            .get_field(Field::Fast(FastFieldEnum::from(fast_field::HostNodeID)).name())
             .unwrap();
             .unwrap();
 
 
         let id = doc.get_first(field).unwrap().as_u64().unwrap();
         let id = doc.get_first(field).unwrap().as_u64().unwrap();
@@ -641,7 +642,7 @@ impl InvertedIndex {
         let tv_searcher = self.reader.searcher();
         let tv_searcher = self.reader.searcher();
         let field = tv_searcher
         let field = tv_searcher
             .schema()
             .schema()
-            .get_field(Field::Text(TextField::UrlNoTokenizer).name())
+            .get_field(Field::Text(TextFieldEnum::from(text_field::UrlNoTokenizer)).name())
             .unwrap();
             .unwrap();
 
 
         let term = tantivy::Term::from_field_text(field, url.as_str());
         let term = tantivy::Term::from_field_text(field, url.as_str());
@@ -660,7 +661,9 @@ impl InvertedIndex {
         let tv_searcher = self.reader.searcher();
         let tv_searcher = self.reader.searcher();
         let field = tv_searcher
         let field = tv_searcher
             .schema()
             .schema()
-            .get_field(Field::Text(TextField::SiteIfHomepageNoTokenizer).name())
+            .get_field(
+                Field::Text(TextFieldEnum::from(text_field::SiteIfHomepageNoTokenizer)).name(),
+            )
             .unwrap();
             .unwrap();
 
 
         let host = url.normalized_host().unwrap_or_default();
         let host = url.normalized_host().unwrap_or_default();
@@ -707,12 +710,12 @@ impl RetrievedWebpage {
     }
     }
 }
 }
 
 
-fn str_value(field: TextField, value: &tantivy::schema::FieldValue) -> String {
+fn str_value(name: &str, value: &tantivy::schema::FieldValue) -> String {
     value
     value
         .value()
         .value()
         .as_value()
         .as_value()
         .as_str()
         .as_str()
-        .unwrap_or_else(|| panic!("{} field should be text", field.name()))
+        .unwrap_or_else(|| panic!("{} field should be text", name))
         .to_string()
         .to_string()
 }
 }
 
 
@@ -721,21 +724,21 @@ impl From<TantivyDocument> for RetrievedWebpage {
         let mut webpage = RetrievedWebpage::default();
         let mut webpage = RetrievedWebpage::default();
 
 
         for value in doc.field_values() {
         for value in doc.field_values() {
-            match Field::get(value.field.field_id() as usize).copied() {
-                Some(Field::Text(TextField::Title)) => {
-                    webpage.title = str_value(TextField::Title, value);
+            match Field::get(value.field.field_id() as usize) {
+                Some(Field::Text(TextFieldEnum::Title(_))) => {
+                    webpage.title = str_value(text_field::Title.name(), value);
                 }
                 }
-                Some(Field::Text(TextField::StemmedCleanBody)) => {
-                    webpage.body = str_value(TextField::StemmedCleanBody, value);
+                Some(Field::Text(TextFieldEnum::StemmedCleanBody(_))) => {
+                    webpage.body = str_value(text_field::StemmedCleanBody.name(), value);
                 }
                 }
-                Some(Field::Text(TextField::Description)) => {
-                    let desc = str_value(TextField::Description, value);
+                Some(Field::Text(TextFieldEnum::Description(_))) => {
+                    let desc = str_value(text_field::Description.name(), value);
                     webpage.description = if desc.is_empty() { None } else { Some(desc) }
                     webpage.description = if desc.is_empty() { None } else { Some(desc) }
                 }
                 }
-                Some(Field::Text(TextField::Url)) => {
-                    webpage.url = str_value(TextField::Url, value);
+                Some(Field::Text(TextFieldEnum::Url(_))) => {
+                    webpage.url = str_value(text_field::Url.name(), value);
                 }
                 }
-                Some(Field::Fast(FastField::LastUpdated)) => {
+                Some(Field::Fast(FastFieldEnum::LastUpdated(_))) => {
                     webpage.updated_time = {
                     webpage.updated_time = {
                         let timestamp = value.value().as_value().as_u64().unwrap() as i64;
                         let timestamp = value.value().as_value().as_u64().unwrap() as i64;
                         if timestamp == 0 {
                         if timestamp == 0 {
@@ -745,39 +748,39 @@ impl From<TantivyDocument> for RetrievedWebpage {
                         }
                         }
                     }
                     }
                 }
                 }
-                Some(Field::Text(TextField::AllBody)) => {
-                    webpage.dirty_body = str_value(TextField::AllBody, value);
+                Some(Field::Text(TextFieldEnum::AllBody(_))) => {
+                    webpage.dirty_body = str_value(text_field::AllBody.name(), value);
                 }
                 }
-                Some(Field::Fast(FastField::Region)) => {
+                Some(Field::Fast(FastFieldEnum::Region(_))) => {
                     webpage.region = {
                     webpage.region = {
                         let id = value.value().as_value().as_u64().unwrap();
                         let id = value.value().as_value().as_u64().unwrap();
                         Region::from_id(id)
                         Region::from_id(id)
                     }
                     }
                 }
                 }
-                Some(Field::Text(TextField::DmozDescription)) => {
-                    let desc = str_value(TextField::DmozDescription, value);
+                Some(Field::Text(TextFieldEnum::DmozDescription(_))) => {
+                    let desc = str_value(text_field::DmozDescription.name(), value);
                     webpage.dmoz_description = if desc.is_empty() { None } else { Some(desc) }
                     webpage.dmoz_description = if desc.is_empty() { None } else { Some(desc) }
                 }
                 }
-                Some(Field::Text(TextField::SchemaOrgJson)) => {
-                    let json = str_value(TextField::SchemaOrgJson, value);
+                Some(Field::Text(TextFieldEnum::SchemaOrgJson(_))) => {
+                    let json = str_value(text_field::SchemaOrgJson.name(), value);
                     webpage.schema_org = serde_json::from_str(&json).unwrap_or_default();
                     webpage.schema_org = serde_json::from_str(&json).unwrap_or_default();
                 }
                 }
-                Some(Field::Fast(FastField::LikelyHasAds)) => {
+                Some(Field::Fast(FastFieldEnum::LikelyHasAds(_))) => {
                     webpage.likely_has_ads =
                     webpage.likely_has_ads =
                         value.value().as_value().as_u64().unwrap_or_default() != 0;
                         value.value().as_value().as_u64().unwrap_or_default() != 0;
                 }
                 }
-                Some(Field::Fast(FastField::LikelyHasPaywall)) => {
+                Some(Field::Fast(FastFieldEnum::LikelyHasPaywall(_))) => {
                     webpage.likely_has_paywall =
                     webpage.likely_has_paywall =
                         value.value().as_value().as_u64().unwrap_or_default() != 0;
                         value.value().as_value().as_u64().unwrap_or_default() != 0;
                 }
                 }
-                Some(Field::Text(TextField::RecipeFirstIngredientTagId)) => {
-                    let tag_id = str_value(TextField::RecipeFirstIngredientTagId, value);
+                Some(Field::Text(TextFieldEnum::RecipeFirstIngredientTagId(_))) => {
+                    let tag_id = str_value(text_field::RecipeFirstIngredientTagId.name(), value);
                     if !tag_id.is_empty() {
                     if !tag_id.is_empty() {
                         webpage.recipe_first_ingredient_tag_id = Some(tag_id);
                         webpage.recipe_first_ingredient_tag_id = Some(tag_id);
                     }
                     }
                 }
                 }
-                Some(Field::Text(TextField::Keywords)) => {
-                    let keywords = str_value(TextField::Keywords, value);
+                Some(Field::Text(TextFieldEnum::Keywords(_))) => {
+                    let keywords = str_value(text_field::Keywords.name(), value);
                     webpage.keywords = keywords.split('\n').map(|s| s.to_string()).collect();
                     webpage.keywords = keywords.split('\n').map(|s| s.to_string()).collect();
                 }
                 }
                 _ => {}
                 _ => {}

+ 4 - 4
crates/core/src/mapreduce/dht/mod.rs

@@ -73,18 +73,18 @@ macro_rules! raft_sonic_request_response {
         #[derive(serde::Serialize, serde::Deserialize, Clone, Debug)]
         #[derive(serde::Serialize, serde::Deserialize, Clone, Debug)]
         pub enum Response {
         pub enum Response {
             $(
             $(
-                $req(<$req as crate::distributed::sonic::service::Message<$service>>::Response),
+                $req(<$req as $crate::distributed::sonic::service::Message<$service>>::Response),
             )*
             )*
             Empty,
             Empty,
         }
         }
 
 
         $(
         $(
-        impl TryFrom<Response> for <$req as crate::distributed::sonic::service::Message<$service>>::Response {
-            type Error = crate::distributed::sonic::Error;
+        impl TryFrom<Response> for <$req as $crate::distributed::sonic::service::Message<$service>>::Response {
+            type Error = $crate::distributed::sonic::Error;
             fn try_from(res: Response) -> Result<Self, Self::Error> {
             fn try_from(res: Response) -> Result<Self, Self::Error> {
                 match res {
                 match res {
                     Response::$req(res) => Ok(res),
                     Response::$req(res) => Ok(res),
-                    _ => Err(crate::distributed::sonic::Error::Application(anyhow::anyhow!("Invalid response for request from Raft"))),
+                    _ => Err($crate::distributed::sonic::Error::Application(anyhow::anyhow!("Invalid response for request from Raft"))),
                 }
                 }
             }
             }
         }
         }

+ 10 - 10
crates/core/src/mapreduce/dht/network/mod.rs

@@ -39,34 +39,34 @@ impl RaftNetworkFactory<TypeConfig> for Network {
     }
     }
 }
 }
 
 
-pub type AppendEntriesRequest = openraft::raft::AppendEntriesRequest<TypeConfig>;
+pub type AppendEntries = openraft::raft::AppendEntriesRequest<TypeConfig>;
 pub type AppendEntriesResponse = openraft::raft::AppendEntriesResponse<NodeId>;
 pub type AppendEntriesResponse = openraft::raft::AppendEntriesResponse<NodeId>;
 
 
-pub type InstallSnapshotRequest = openraft::raft::InstallSnapshotRequest<TypeConfig>;
+pub type InstallSnapshot = openraft::raft::InstallSnapshotRequest<TypeConfig>;
 pub type InstallSnapshotResponse = openraft::raft::InstallSnapshotResponse<NodeId>;
 pub type InstallSnapshotResponse = openraft::raft::InstallSnapshotResponse<NodeId>;
 
 
-pub type VoteRequest = openraft::raft::VoteRequest<NodeId>;
+pub type Vote = openraft::raft::VoteRequest<NodeId>;
 pub type VoteResponse = openraft::raft::VoteResponse<NodeId>;
 pub type VoteResponse = openraft::raft::VoteResponse<NodeId>;
 
 
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
-pub struct AddLearnerRequest {
+pub struct AddLearner {
     pub id: NodeId,
     pub id: NodeId,
     pub addr: SocketAddr,
     pub addr: SocketAddr,
 }
 }
 
 
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
 #[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
-pub struct AddNodesRequest {
+pub struct AddNodes {
     members: BTreeMap<NodeId, BasicNode>,
     members: BTreeMap<NodeId, BasicNode>,
 }
 }
 
 
 sonic_service!(
 sonic_service!(
     Server,
     Server,
     [
     [
-        AppendEntriesRequest,
-        InstallSnapshotRequest,
-        VoteRequest,
-        AddLearnerRequest,
-        AddNodesRequest,
+        AppendEntries,
+        InstallSnapshot,
+        Vote,
+        AddLearner,
+        AddNodes,
         Get,
         Get,
         Set,
         Set,
     ]
     ]

+ 12 - 16
crates/core/src/mapreduce/dht/network/raft.rs

@@ -37,11 +37,11 @@ use crate::{
 };
 };
 
 
 use super::{
 use super::{
-    AddLearnerRequest, AddNodesRequest, AppendEntriesRequest, AppendEntriesResponse,
-    InstallSnapshotRequest, InstallSnapshotResponse, Server, VoteRequest, VoteResponse,
+    AddLearner, AddNodes, AppendEntries, AppendEntriesResponse, InstallSnapshot,
+    InstallSnapshotResponse, Server, Vote, VoteResponse,
 };
 };
 
 
-impl sonic::service::Message<Server> for AppendEntriesRequest {
+impl sonic::service::Message<Server> for AppendEntries {
     type Response = Result<AppendEntriesResponse, RaftError<NodeId>>;
     type Response = Result<AppendEntriesResponse, RaftError<NodeId>>;
 
 
     async fn handle(self, server: &Server) -> Self::Response {
     async fn handle(self, server: &Server) -> Self::Response {
@@ -50,7 +50,7 @@ impl sonic::service::Message<Server> for AppendEntriesRequest {
     }
     }
 }
 }
 
 
-impl sonic::service::Message<Server> for InstallSnapshotRequest {
+impl sonic::service::Message<Server> for InstallSnapshot {
     type Response = Result<InstallSnapshotResponse, RaftError<NodeId, InstallSnapshotError>>;
     type Response = Result<InstallSnapshotResponse, RaftError<NodeId, InstallSnapshotError>>;
 
 
     async fn handle(self, server: &Server) -> Self::Response {
     async fn handle(self, server: &Server) -> Self::Response {
@@ -59,7 +59,7 @@ impl sonic::service::Message<Server> for InstallSnapshotRequest {
     }
     }
 }
 }
 
 
-impl sonic::service::Message<Server> for VoteRequest {
+impl sonic::service::Message<Server> for Vote {
     type Response = Result<VoteResponse, RaftError<NodeId>>;
     type Response = Result<VoteResponse, RaftError<NodeId>>;
 
 
     async fn handle(self, server: &Server) -> Self::Response {
     async fn handle(self, server: &Server) -> Self::Response {
@@ -68,7 +68,7 @@ impl sonic::service::Message<Server> for VoteRequest {
     }
     }
 }
 }
 
 
-impl sonic::service::Message<Server> for AddLearnerRequest {
+impl sonic::service::Message<Server> for AddLearner {
     type Response = Result<(), RaftError<NodeId, ClientWriteError<NodeId, BasicNode>>>;
     type Response = Result<(), RaftError<NodeId, ClientWriteError<NodeId, BasicNode>>>;
 
 
     async fn handle(self, server: &Server) -> Self::Response {
     async fn handle(self, server: &Server) -> Self::Response {
@@ -93,7 +93,7 @@ impl sonic::service::Message<Server> for AddLearnerRequest {
     }
     }
 }
 }
 
 
-impl sonic::service::Message<Server> for AddNodesRequest {
+impl sonic::service::Message<Server> for AddNodes {
     type Response = Result<(), RaftError<NodeId, ClientWriteError<NodeId, BasicNode>>>;
     type Response = Result<(), RaftError<NodeId, ClientWriteError<NodeId, BasicNode>>>;
 
 
     async fn handle(self, server: &Server) -> Self::Response {
     async fn handle(self, server: &Server) -> Self::Response {
@@ -160,7 +160,7 @@ impl RemoteClient {
     }
     }
 
 
     async fn add_learner(&self, id: NodeId, addr: SocketAddr) -> Result<()> {
     async fn add_learner(&self, id: NodeId, addr: SocketAddr) -> Result<()> {
-        let rpc = AddLearnerRequest { id, addr };
+        let rpc = AddLearner { id, addr };
         let retry = ExponentialBackoff::from_millis(500)
         let retry = ExponentialBackoff::from_millis(500)
             .with_limit(Duration::from_secs(60))
             .with_limit(Duration::from_secs(60))
             .take(5);
             .take(5);
@@ -220,7 +220,7 @@ impl RemoteClient {
     }
     }
 
 
     async fn add_nodes(&self, members: BTreeMap<NodeId, BasicNode>) -> Result<()> {
     async fn add_nodes(&self, members: BTreeMap<NodeId, BasicNode>) -> Result<()> {
-        let rpc = AddNodesRequest { members };
+        let rpc = AddNodes { members };
         let retry = ExponentialBackoff::from_millis(500).with_limit(Duration::from_secs(10));
         let retry = ExponentialBackoff::from_millis(500).with_limit(Duration::from_secs(10));
 
 
         for backoff in retry {
         for backoff in retry {
@@ -293,7 +293,7 @@ impl RemoteClient {
 impl RaftNetwork<TypeConfig> for RemoteClient {
 impl RaftNetwork<TypeConfig> for RemoteClient {
     async fn append_entries(
     async fn append_entries(
         &mut self,
         &mut self,
-        rpc: AppendEntriesRequest,
+        rpc: AppendEntries,
         option: RPCOption,
         option: RPCOption,
     ) -> Result<AppendEntriesResponse, RPCError> {
     ) -> Result<AppendEntriesResponse, RPCError> {
         self.send_raft_rpc(rpc, option).await?.map_err(|e| {
         self.send_raft_rpc(rpc, option).await?.map_err(|e| {
@@ -308,7 +308,7 @@ impl RaftNetwork<TypeConfig> for RemoteClient {
 
 
     async fn install_snapshot(
     async fn install_snapshot(
         &mut self,
         &mut self,
-        rpc: InstallSnapshotRequest,
+        rpc: InstallSnapshot,
         option: RPCOption,
         option: RPCOption,
     ) -> Result<InstallSnapshotResponse, RPCError<InstallSnapshotError>> {
     ) -> Result<InstallSnapshotResponse, RPCError<InstallSnapshotError>> {
         self.send_raft_rpc(rpc, option).await?.map_err(|e| {
         self.send_raft_rpc(rpc, option).await?.map_err(|e| {
@@ -321,11 +321,7 @@ impl RaftNetwork<TypeConfig> for RemoteClient {
         })
         })
     }
     }
 
 
-    async fn vote(
-        &mut self,
-        rpc: VoteRequest,
-        option: RPCOption,
-    ) -> Result<VoteResponse, RPCError> {
+    async fn vote(&mut self, rpc: Vote, option: RPCOption) -> Result<VoteResponse, RPCError> {
         self.send_raft_rpc(rpc, option).await?.map_err(|e| {
         self.send_raft_rpc(rpc, option).await?.map_err(|e| {
             openraft::error::RemoteError {
             openraft::error::RemoteError {
                 target: self.target,
                 target: self.target,

+ 2 - 2
crates/core/src/query/mod.rs

@@ -18,7 +18,7 @@ use crate::{
     inverted_index::InvertedIndex,
     inverted_index::InvertedIndex,
     query::parser::TermCompound,
     query::parser::TermCompound,
     ranking::SignalCoefficient,
     ranking::SignalCoefficient,
-    schema::{Field, TextField},
+    schema::{text_field, Field},
     search_ctx::Ctx,
     search_ctx::Ctx,
     searcher::SearchQuery,
     searcher::SearchQuery,
     webpage::{region::Region, safety_classifier},
     webpage::{region::Region, safety_classifier},
@@ -116,7 +116,7 @@ impl Query {
             .collect();
             .collect();
 
 
         if query.safe_search {
         if query.safe_search {
-            let field = Field::Text(TextField::SafetyClassification);
+            let field = Field::Text(text_field::SafetyClassification.into());
             let field = schema.get_field(field.name()).unwrap();
             let field = schema.get_field(field.name()).unwrap();
 
 
             queries.push((
             queries.push((

+ 10 - 10
crates/core/src/query/optic.rs

@@ -22,7 +22,7 @@ use tantivy::{
     schema::Schema,
     schema::Schema,
 };
 };
 
 
-use crate::{fastfield_reader::FastFieldReader, schema::TextField};
+use crate::{fastfield_reader::FastFieldReader, schema::text_field};
 
 
 use super::{const_query::ConstQuery, pattern_query::PatternQuery, union::UnionQuery};
 use super::{const_query::ConstQuery, pattern_query::PatternQuery, union::UnionQuery};
 
 
@@ -173,7 +173,7 @@ impl AsTantivyQuery for Matching {
             MatchLocation::Site => ConstQuery::new(
             MatchLocation::Site => ConstQuery::new(
                 PatternQuery::new(
                 PatternQuery::new(
                     self.pattern.clone(),
                     self.pattern.clone(),
-                    TextField::UrlForSiteOperator,
+                    text_field::UrlForSiteOperator.into(),
                     schema,
                     schema,
                     fastfield_reader.clone(),
                     fastfield_reader.clone(),
                 )
                 )
@@ -184,7 +184,7 @@ impl AsTantivyQuery for Matching {
             MatchLocation::Url => Box::new(ConstQuery::new(
             MatchLocation::Url => Box::new(ConstQuery::new(
                 Box::new(PatternQuery::new(
                 Box::new(PatternQuery::new(
                     self.pattern.clone(),
                     self.pattern.clone(),
-                    TextField::Url,
+                    text_field::Url.into(),
                     schema,
                     schema,
                     fastfield_reader.clone(),
                     fastfield_reader.clone(),
                 )),
                 )),
@@ -193,7 +193,7 @@ impl AsTantivyQuery for Matching {
             MatchLocation::Domain => Box::new(ConstQuery::new(
             MatchLocation::Domain => Box::new(ConstQuery::new(
                 Box::new(PatternQuery::new(
                 Box::new(PatternQuery::new(
                     self.pattern.clone(),
                     self.pattern.clone(),
-                    TextField::Domain,
+                    text_field::Domain.into(),
                     schema,
                     schema,
                     fastfield_reader.clone(),
                     fastfield_reader.clone(),
                 )),
                 )),
@@ -202,7 +202,7 @@ impl AsTantivyQuery for Matching {
             MatchLocation::Title => Box::new(ConstQuery::new(
             MatchLocation::Title => Box::new(ConstQuery::new(
                 Box::new(PatternQuery::new(
                 Box::new(PatternQuery::new(
                     self.pattern.clone(),
                     self.pattern.clone(),
-                    TextField::Title,
+                    text_field::Title.into(),
                     schema,
                     schema,
                     fastfield_reader.clone(),
                     fastfield_reader.clone(),
                 )),
                 )),
@@ -212,7 +212,7 @@ impl AsTantivyQuery for Matching {
                 Box::new(ConstQuery::new(
                 Box::new(ConstQuery::new(
                     Box::new(PatternQuery::new(
                     Box::new(PatternQuery::new(
                         self.pattern.clone(),
                         self.pattern.clone(),
-                        TextField::Description,
+                        text_field::Description.into(),
                         schema,
                         schema,
                         fastfield_reader.clone(),
                         fastfield_reader.clone(),
                     )),
                     )),
@@ -221,7 +221,7 @@ impl AsTantivyQuery for Matching {
                 Box::new(ConstQuery::new(
                 Box::new(ConstQuery::new(
                     Box::new(PatternQuery::new(
                     Box::new(PatternQuery::new(
                         self.pattern.clone(),
                         self.pattern.clone(),
-                        TextField::DmozDescription,
+                        text_field::DmozDescription.into(),
                         schema,
                         schema,
                         fastfield_reader.clone(),
                         fastfield_reader.clone(),
                     )),
                     )),
@@ -232,7 +232,7 @@ impl AsTantivyQuery for Matching {
             MatchLocation::Content => Box::new(ConstQuery::new(
             MatchLocation::Content => Box::new(ConstQuery::new(
                 Box::new(PatternQuery::new(
                 Box::new(PatternQuery::new(
                     self.pattern.clone(),
                     self.pattern.clone(),
-                    TextField::CleanBody,
+                    text_field::CleanBody.into(),
                     schema,
                     schema,
                     fastfield_reader.clone(),
                     fastfield_reader.clone(),
                 )),
                 )),
@@ -241,7 +241,7 @@ impl AsTantivyQuery for Matching {
             MatchLocation::MicroformatTag => Box::new(ConstQuery::new(
             MatchLocation::MicroformatTag => Box::new(ConstQuery::new(
                 Box::new(PatternQuery::new(
                 Box::new(PatternQuery::new(
                     self.pattern.clone(),
                     self.pattern.clone(),
-                    TextField::MicroformatTags,
+                    text_field::MicroformatTags.into(),
                     schema,
                     schema,
                     fastfield_reader.clone(),
                     fastfield_reader.clone(),
                 )),
                 )),
@@ -250,7 +250,7 @@ impl AsTantivyQuery for Matching {
             MatchLocation::Schema => Box::new(ConstQuery::new(
             MatchLocation::Schema => Box::new(ConstQuery::new(
                 Box::new(PatternQuery::new(
                 Box::new(PatternQuery::new(
                     self.pattern.clone(),
                     self.pattern.clone(),
-                    TextField::FlattenedSchemaOrgJson,
+                    text_field::FlattenedSchemaOrgJson.into(),
                     schema,
                     schema,
                     fastfield_reader.clone(),
                     fastfield_reader.clone(),
                 )),
                 )),

+ 10 - 10
crates/core/src/query/parser/as_tantivy.rs

@@ -21,7 +21,7 @@ use tantivy::{
 
 
 use crate::{
 use crate::{
     bangs::BANG_PREFIXES,
     bangs::BANG_PREFIXES,
-    schema::{Field, TextField},
+    schema::{text_field::TextField, Field, TextFieldEnum},
 };
 };
 
 
 use super::{CompoundAwareTerm, SimpleOrPhrase, SimpleTerm, Term, TermCompound};
 use super::{CompoundAwareTerm, SimpleOrPhrase, SimpleTerm, Term, TermCompound};
@@ -53,9 +53,9 @@ fn simple_into_tantivy(
         .filter(|field| {
         .filter(|field| {
             matches!(
             matches!(
                 Field::get(field.field_id() as usize),
                 Field::get(field.field_id() as usize),
-                Some(Field::Text(TextField::AllBody))
-                    | Some(Field::Text(TextField::Title))
-                    | Some(Field::Text(TextField::Url))
+                Some(Field::Text(TextFieldEnum::AllBody(_)))
+                    | Some(Field::Text(TextFieldEnum::Title(_)))
+                    | Some(Field::Text(TextFieldEnum::Url(_)))
             )
             )
         })
         })
         .copied()
         .copied()
@@ -98,7 +98,7 @@ fn phrase_query(
         }
         }
 
 
         if processed_terms.len() == 1 {
         if processed_terms.len() == 1 {
-            let options = field.as_text().unwrap().index_option();
+            let options = field.as_text().unwrap().record_option();
 
 
             phrases.push((
             phrases.push((
                 Occur::Should,
                 Occur::Should,
@@ -140,7 +140,7 @@ impl Term {
                     .find(|field| {
                     .find(|field| {
                         matches!(
                         matches!(
                             Field::get(field.field_id() as usize),
                             Field::get(field.field_id() as usize),
-                            Some(Field::Text(TextField::Title))
+                            Some(Field::Text(TextFieldEnum::Title(_)))
                         )
                         )
                     })
                     })
                     .unwrap();
                     .unwrap();
@@ -156,7 +156,7 @@ impl Term {
                     .find(|field| {
                     .find(|field| {
                         matches!(
                         matches!(
                             Field::get(field.field_id() as usize),
                             Field::get(field.field_id() as usize),
-                            Some(Field::Text(TextField::AllBody))
+                            Some(Field::Text(TextFieldEnum::AllBody(_)))
                         )
                         )
                     })
                     })
                     .unwrap();
                     .unwrap();
@@ -172,7 +172,7 @@ impl Term {
                     .find(|field| {
                     .find(|field| {
                         matches!(
                         matches!(
                             Field::get(field.field_id() as usize),
                             Field::get(field.field_id() as usize),
-                            Some(Field::Text(TextField::Url))
+                            Some(Field::Text(TextFieldEnum::Url(_)))
                         )
                         )
                     })
                     })
                     .unwrap();
                     .unwrap();
@@ -222,7 +222,7 @@ impl Term {
             .filter(|field| {
             .filter(|field| {
                 matches!(
                 matches!(
                     Field::get(field.field_id() as usize),
                     Field::get(field.field_id() as usize),
-                    Some(Field::Text(TextField::UrlForSiteOperator))
+                    Some(Field::Text(TextFieldEnum::UrlForSiteOperator(_)))
                 )
                 )
             })
             })
             .map(|field| {
             .map(|field| {
@@ -258,7 +258,7 @@ impl Term {
             .unwrap()
             .unwrap()
             .as_text()
             .as_text()
             .unwrap()
             .unwrap()
-            .index_option();
+            .record_option();
 
 
         let processed_query = if processed_terms.len() == 1 {
         let processed_query = if processed_terms.len() == 1 {
             let term = processed_terms.remove(0);
             let term = processed_terms.remove(0);

+ 3 - 2
crates/core/src/query/parser/mod.rs

@@ -93,8 +93,9 @@ fn single_bang(input: &str, pref: char) -> nom::IResult<&str, Term> {
 
 
 fn bang(input: &str) -> nom::IResult<&str, Term> {
 fn bang(input: &str) -> nom::IResult<&str, Term> {
     for pref in BANG_PREFIXES.iter() {
     for pref in BANG_PREFIXES.iter() {
-        let (input, output) = single_bang(input, *pref)?;
-        return Ok((input, output));
+        if let Ok((input, output)) = single_bang(input, *pref) {
+            return Ok((input, output));
+        }
     }
     }
 
 
     Err(nom::Err::Error(nom::error::Error::new(
     Err(nom::Err::Error(nom::error::Error::new(

+ 4 - 4
crates/core/src/query/pattern_query/mod.rs

@@ -23,7 +23,7 @@ use tantivy::tokenizer::Tokenizer;
 
 
 use crate::{
 use crate::{
     fastfield_reader::FastFieldReader,
     fastfield_reader::FastFieldReader,
-    schema::{Field, TextField},
+    schema::{text_field::TextField, Field, TextFieldEnum},
 };
 };
 
 
 use self::weight::{FastSiteDomainPatternWeight, PatternWeight};
 use self::weight::{FastSiteDomainPatternWeight, PatternWeight};
@@ -50,7 +50,7 @@ impl std::fmt::Debug for PatternQuery {
 impl PatternQuery {
 impl PatternQuery {
     pub fn new(
     pub fn new(
         patterns: Vec<PatternPart>,
         patterns: Vec<PatternPart>,
-        field: TextField,
+        field: TextFieldEnum,
         schema: &tantivy::schema::Schema,
         schema: &tantivy::schema::Schema,
         fastfield_reader: FastFieldReader,
         fastfield_reader: FastFieldReader,
     ) -> Self {
     ) -> Self {
@@ -165,6 +165,6 @@ fn can_optimize_site_domain(patterns: &[PatternPart], field: Field) -> bool {
         && patterns[1..patterns.len() - 1]
         && patterns[1..patterns.len() - 1]
             .iter()
             .iter()
             .all(|pattern| matches!(pattern, PatternPart::Raw(_)))
             .all(|pattern| matches!(pattern, PatternPart::Raw(_)))
-        && (matches!(field, Field::Text(TextField::UrlForSiteOperator))
-            || matches!(field, Field::Text(TextField::Domain)))
+        && (matches!(field, Field::Text(TextFieldEnum::UrlForSiteOperator(_)))
+            || matches!(field, Field::Text(TextFieldEnum::Domain(_))))
 }
 }

+ 4 - 4
crates/core/src/query/pattern_query/scorer.rs

@@ -24,7 +24,7 @@ use tantivy::{
 use crate::{
 use crate::{
     fastfield_reader::{self, FastFieldReader},
     fastfield_reader::{self, FastFieldReader},
     query::intersection::Intersection,
     query::intersection::Intersection,
-    schema::FastField,
+    schema::FastFieldEnum,
 };
 };
 
 
 use super::SmallPatternPart;
 use super::SmallPatternPart;
@@ -126,7 +126,7 @@ impl Scorer for AllScorer {
 
 
 pub struct EmptyFieldScorer {
 pub struct EmptyFieldScorer {
     pub segment_reader: Arc<fastfield_reader::SegmentReader>,
     pub segment_reader: Arc<fastfield_reader::SegmentReader>,
-    pub num_tokens_fastfield: FastField,
+    pub num_tokens_fastfield: FastFieldEnum,
     pub all_scorer: AllScorer,
     pub all_scorer: AllScorer,
 }
 }
 
 
@@ -210,7 +210,7 @@ pub struct NormalPatternScorer {
     left: Vec<u32>,
     left: Vec<u32>,
     right: Vec<u32>,
     right: Vec<u32>,
     phrase_count: u32,
     phrase_count: u32,
-    num_tokens_field: FastField,
+    num_tokens_field: FastFieldEnum,
     segment_reader: Arc<fastfield_reader::SegmentReader>,
     segment_reader: Arc<fastfield_reader::SegmentReader>,
 }
 }
 
 
@@ -219,7 +219,7 @@ impl NormalPatternScorer {
         term_postings_list: Vec<SegmentPostings>,
         term_postings_list: Vec<SegmentPostings>,
         pattern: Vec<SmallPatternPart>,
         pattern: Vec<SmallPatternPart>,
         segment: tantivy::SegmentId,
         segment: tantivy::SegmentId,
-        num_tokens_field: FastField,
+        num_tokens_field: FastFieldEnum,
         fastfield_reader: FastFieldReader,
         fastfield_reader: FastFieldReader,
     ) -> Self {
     ) -> Self {
         let num_query_terms = term_postings_list.len();
         let num_query_terms = term_postings_list.len();

+ 26 - 16
crates/core/src/query/pattern_query/weight.rs

@@ -24,7 +24,11 @@ use tantivy::{
 
 
 use crate::{
 use crate::{
     fastfield_reader::FastFieldReader,
     fastfield_reader::FastFieldReader,
-    schema::{FastField, Field, TextField},
+    schema::{
+        fast_field,
+        text_field::{self, TextField},
+        Field, TextFieldEnum,
+    },
 };
 };
 
 
 use super::scorer::{
 use super::scorer::{
@@ -52,10 +56,12 @@ impl FastSiteDomainPatternWeight {
         let fieldnorm_reader = self.fieldnorm_reader(reader)?;
         let fieldnorm_reader = self.fieldnorm_reader(reader)?;
 
 
         let field_no_tokenizer = match Field::get(self.field.field_id() as usize) {
         let field_no_tokenizer = match Field::get(self.field.field_id() as usize) {
-            Some(Field::Text(TextField::UrlForSiteOperator)) => {
-                Field::Text(TextField::SiteNoTokenizer)
+            Some(Field::Text(TextFieldEnum::UrlForSiteOperator(_))) => {
+                Field::Text(text_field::SiteNoTokenizer.into())
+            }
+            Some(Field::Text(TextFieldEnum::Domain(_))) => {
+                Field::Text(text_field::DomainNoTokenizer.into())
             }
             }
-            Some(Field::Text(TextField::Domain)) => Field::Text(TextField::DomainNoTokenizer),
             _ => unreachable!(),
             _ => unreachable!(),
         };
         };
 
 
@@ -65,7 +71,7 @@ impl FastSiteDomainPatternWeight {
             .unwrap();
             .unwrap();
 
 
         let opt = match field_no_tokenizer {
         let opt = match field_no_tokenizer {
-            Field::Text(t) => t.index_option(),
+            Field::Text(t) => t.record_option(),
             Field::Fast(_) => unreachable!(),
             Field::Fast(_) => unreachable!(),
         };
         };
 
 
@@ -134,19 +140,23 @@ impl PatternWeight {
         }
         }
 
 
         let num_tokens_fastfield = match Field::get(self.field.field_id() as usize) {
         let num_tokens_fastfield = match Field::get(self.field.field_id() as usize) {
-            Some(Field::Text(TextField::Title)) => Ok(FastField::NumTitleTokens),
-            Some(Field::Text(TextField::CleanBody)) => Ok(FastField::NumCleanBodyTokens),
-            Some(Field::Text(TextField::Url)) => Ok(FastField::NumUrlTokens),
-            Some(Field::Text(TextField::Domain)) => Ok(FastField::NumDomainTokens),
-            Some(Field::Text(TextField::UrlForSiteOperator)) => {
-                Ok(FastField::NumUrlForSiteOperatorTokens)
+            Some(Field::Text(TextFieldEnum::Title(_))) => Ok(fast_field::NumTitleTokens.into()),
+            Some(Field::Text(TextFieldEnum::CleanBody(_))) => {
+                Ok(fast_field::NumCleanBodyTokens.into())
+            }
+            Some(Field::Text(TextFieldEnum::Url(_))) => Ok(fast_field::NumUrlTokens.into()),
+            Some(Field::Text(TextFieldEnum::Domain(_))) => Ok(fast_field::NumDomainTokens.into()),
+            Some(Field::Text(TextFieldEnum::UrlForSiteOperator(_))) => {
+                Ok(fast_field::NumUrlForSiteOperatorTokens.into())
+            }
+            Some(Field::Text(TextFieldEnum::Description(_))) => {
+                Ok(fast_field::NumDescriptionTokens.into())
             }
             }
-            Some(Field::Text(TextField::Description)) => Ok(FastField::NumDescriptionTokens),
-            Some(Field::Text(TextField::MicroformatTags)) => {
-                Ok(FastField::NumMicroformatTagsTokens)
+            Some(Field::Text(TextFieldEnum::MicroformatTags(_))) => {
+                Ok(fast_field::NumMicroformatTagsTokens.into())
             }
             }
-            Some(Field::Text(TextField::FlattenedSchemaOrgJson)) => {
-                Ok(FastField::NumFlattenedSchemaTokens)
+            Some(Field::Text(TextFieldEnum::FlattenedSchemaOrgJson(_))) => {
+                Ok(fast_field::NumFlattenedSchemaTokens.into())
             }
             }
             Some(field) => Err(TantivyError::InvalidArgument(format!(
             Some(field) => Err(TantivyError::InvalidArgument(format!(
                 "{} is not supported in pattern query",
                 "{} is not supported in pattern query",

+ 3 - 3
crates/core/src/ranking/pipeline/stages/recall.rs

@@ -30,7 +30,7 @@ use crate::{
         pipeline::{RankableWebpage, RankingPipeline, RankingStage, Recall, Scorer},
         pipeline::{RankableWebpage, RankingPipeline, RankingStage, Recall, Scorer},
         Signal, SignalAggregator, SignalScore,
         Signal, SignalAggregator, SignalScore,
     },
     },
-    schema::FastField,
+    schema::fast_field,
     searcher::SearchQuery,
     searcher::SearchQuery,
 };
 };
 
 
@@ -62,11 +62,11 @@ impl RecallRankingWebpage {
         let fastfields = fastfield_reader.get_field_reader(pointer.address.doc_id);
         let fastfields = fastfield_reader.get_field_reader(pointer.address.doc_id);
 
 
         let title_embedding: Option<Vec<u8>> = fastfields
         let title_embedding: Option<Vec<u8>> = fastfields
-            .get(FastField::TitleEmbeddings)
+            .get(fast_field::TitleEmbeddings.into())
             .and_then(|v| v.into());
             .and_then(|v| v.into());
 
 
         let keyword_embedding: Option<Vec<u8>> = fastfields
         let keyword_embedding: Option<Vec<u8>> = fastfields
-            .get(FastField::KeywordEmbeddings)
+            .get(fast_field::KeywordEmbeddings.into())
             .and_then(|v| v.into());
             .and_then(|v| v.into());
 
 
         let mut res = RecallRankingWebpage {
         let mut res = RecallRankingWebpage {

+ 62 - 96
crates/core/src/ranking/signal.rs

@@ -14,13 +14,16 @@
 // You should have received a copy of the GNU Affero General Public License
 // You should have received a copy of the GNU Affero General Public License
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 
+use crate::enum_map::InsertEnumMapKey;
 use crate::query::optic::AsSearchableRule;
 use crate::query::optic::AsSearchableRule;
 use crate::query::Query;
 use crate::query::Query;
+use crate::schema::text_field::TextField;
+use crate::schema::{fast_field, text_field};
 use crate::Result;
 use crate::Result;
 use crate::{
 use crate::{
     enum_map::EnumMap,
     enum_map::EnumMap,
     fastfield_reader,
     fastfield_reader,
-    schema::{FastField, TextField},
+    schema::{FastFieldEnum, TextFieldEnum},
     webgraph::NodeID,
     webgraph::NodeID,
     webpage::Webpage,
     webpage::Webpage,
 };
 };
@@ -31,10 +34,11 @@ use serde::{Deserialize, Serialize};
 use std::cell::RefCell;
 use std::cell::RefCell;
 use std::str::FromStr;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::sync::Arc;
+use strum::VariantArray;
 use tantivy::fieldnorm::FieldNormReader;
 use tantivy::fieldnorm::FieldNormReader;
 use tantivy::postings::SegmentPostings;
 use tantivy::postings::SegmentPostings;
 use tantivy::query::{Query as _, Scorer};
 use tantivy::query::{Query as _, Scorer};
-use tantivy::tokenizer::Tokenizer;
+use tantivy::tokenizer::Tokenizer as _;
 use thiserror::Error;
 use thiserror::Error;
 use utoipa::ToSchema;
 use utoipa::ToSchema;
 
 
@@ -56,7 +60,9 @@ pub enum Error {
     UnknownSignal(#[from] serde_json::Error),
     UnknownSignal(#[from] serde_json::Error),
 }
 }
 
 
-#[derive(Debug, serde::Serialize, serde::Deserialize, Clone, Copy, PartialEq, Eq, Hash)]
+#[derive(
+    Debug, serde::Serialize, serde::Deserialize, Clone, Copy, PartialEq, Eq, Hash, VariantArray,
+)]
 pub enum Signal {
 pub enum Signal {
     #[serde(rename = "bm25_title")]
     #[serde(rename = "bm25_title")]
     Bm25Title,
     Bm25Title,
@@ -140,55 +146,12 @@ pub enum Signal {
     KeywordEmbeddingSimilarity,
     KeywordEmbeddingSimilarity,
 }
 }
 
 
-impl From<Signal> for usize {
-    fn from(signal: Signal) -> Self {
-        signal as usize
+impl InsertEnumMapKey for Signal {
+    fn into_usize(self) -> usize {
+        self as usize
     }
     }
 }
 }
 
 
-pub const ALL_SIGNALS: [Signal; 40] = [
-    Signal::Bm25Title,
-    Signal::Bm25TitleBigrams,
-    Signal::Bm25TitleTrigrams,
-    Signal::Bm25CleanBody,
-    Signal::Bm25CleanBodyBigrams,
-    Signal::Bm25CleanBodyTrigrams,
-    Signal::Bm25StemmedTitle,
-    Signal::Bm25StemmedCleanBody,
-    Signal::Bm25AllBody,
-    Signal::Bm25BacklinkText,
-    Signal::Bm25Keywords,
-    Signal::IdfSumUrl,
-    Signal::IdfSumSite,
-    Signal::IdfSumDomain,
-    Signal::IdfSumSiteNoTokenizer,
-    Signal::IdfSumDomainNoTokenizer,
-    Signal::IdfSumDomainNameNoTokenizer,
-    Signal::IdfSumDomainIfHomepage,
-    Signal::IdfSumDomainNameIfHomepageNoTokenizer,
-    Signal::IdfSumDomainIfHomepageNoTokenizer,
-    Signal::IdfSumTitleIfHomepage,
-    Signal::CrossEncoderSnippet,
-    Signal::CrossEncoderTitle,
-    Signal::HostCentrality,
-    Signal::HostCentralityRank,
-    Signal::PageCentrality,
-    Signal::PageCentralityRank,
-    Signal::IsHomepage,
-    Signal::FetchTimeMs,
-    Signal::UpdateTimestamp,
-    Signal::TrackerScore,
-    Signal::Region,
-    Signal::QueryCentrality,
-    Signal::InboundSimilarity,
-    Signal::LambdaMART,
-    Signal::UrlDigits,
-    Signal::UrlSlashes,
-    Signal::LinkDensity,
-    Signal::TitleEmbeddingSimilarity,
-    Signal::KeywordEmbeddingSimilarity,
-];
-
 fn score_timestamp(timestamp: usize, signal_aggregator: &SignalAggregator) -> f64 {
 fn score_timestamp(timestamp: usize, signal_aggregator: &SignalAggregator) -> f64 {
     if timestamp >= signal_aggregator.current_timestamp.unwrap_or(0) {
     if timestamp >= signal_aggregator.current_timestamp.unwrap_or(0) {
         return 0.0;
         return 0.0;
@@ -359,7 +322,7 @@ impl Signal {
         let fastfield_reader = seg_reader.fastfield_reader.get_field_reader(doc);
         let fastfield_reader = seg_reader.fastfield_reader.get_field_reader(doc);
 
 
         let node_id = fastfield_reader
         let node_id = fastfield_reader
-            .get(FastField::HostNodeID)
+            .get(fast_field::HostNodeID.into())
             .and_then(|n| n.as_u64())
             .and_then(|n| n.as_u64())
             .unwrap();
             .unwrap();
 
 
@@ -610,56 +573,60 @@ impl Signal {
         })
         })
     }
     }
 
 
-    fn as_fastfield(&self) -> Option<FastField> {
+    fn as_fastfield(&self) -> Option<FastFieldEnum> {
         match self {
         match self {
-            Signal::HostCentrality => Some(FastField::HostCentrality),
-            Signal::HostCentralityRank => Some(FastField::HostCentralityRank),
-            Signal::PageCentrality => Some(FastField::PageCentrality),
-            Signal::PageCentralityRank => Some(FastField::PageCentralityRank),
-            Signal::IsHomepage => Some(FastField::IsHomepage),
-            Signal::FetchTimeMs => Some(FastField::FetchTimeMs),
-            Signal::UpdateTimestamp => Some(FastField::LastUpdated),
-            Signal::TrackerScore => Some(FastField::TrackerScore),
-            Signal::Region => Some(FastField::Region),
-            Signal::UrlSlashes => Some(FastField::NumPathAndQuerySlashes),
-            Signal::UrlDigits => Some(FastField::NumPathAndQueryDigits),
-            Signal::LinkDensity => Some(FastField::LinkDensity),
-            Signal::TitleEmbeddingSimilarity => Some(FastField::TitleEmbeddings),
-            Signal::KeywordEmbeddingSimilarity => Some(FastField::KeywordEmbeddings),
+            Signal::HostCentrality => Some(fast_field::HostCentrality.into()),
+            Signal::HostCentralityRank => Some(fast_field::HostCentralityRank.into()),
+            Signal::PageCentrality => Some(fast_field::PageCentrality.into()),
+            Signal::PageCentralityRank => Some(fast_field::PageCentralityRank.into()),
+            Signal::IsHomepage => Some(fast_field::IsHomepage.into()),
+            Signal::FetchTimeMs => Some(fast_field::FetchTimeMs.into()),
+            Signal::UpdateTimestamp => Some(fast_field::LastUpdated.into()),
+            Signal::TrackerScore => Some(fast_field::TrackerScore.into()),
+            Signal::Region => Some(fast_field::Region.into()),
+            Signal::UrlSlashes => Some(fast_field::NumPathAndQuerySlashes.into()),
+            Signal::UrlDigits => Some(fast_field::NumPathAndQueryDigits.into()),
+            Signal::LinkDensity => Some(fast_field::LinkDensity.into()),
+            Signal::TitleEmbeddingSimilarity => Some(fast_field::TitleEmbeddings.into()),
+            Signal::KeywordEmbeddingSimilarity => Some(fast_field::KeywordEmbeddings.into()),
             _ => None,
             _ => None,
         }
         }
     }
     }
 
 
-    fn as_textfield(&self) -> Option<TextField> {
+    fn as_textfield(&self) -> Option<TextFieldEnum> {
         match self {
         match self {
-            Signal::Bm25Title => Some(TextField::Title),
-            Signal::Bm25TitleBigrams => Some(TextField::TitleBigrams),
-            Signal::Bm25TitleTrigrams => Some(TextField::TitleTrigrams),
-            Signal::Bm25CleanBody => Some(TextField::CleanBody),
-            Signal::Bm25CleanBodyBigrams => Some(TextField::CleanBodyBigrams),
-            Signal::Bm25CleanBodyTrigrams => Some(TextField::CleanBodyTrigrams),
-            Signal::Bm25StemmedTitle => Some(TextField::StemmedTitle),
-            Signal::Bm25StemmedCleanBody => Some(TextField::StemmedCleanBody),
-            Signal::Bm25AllBody => Some(TextField::AllBody),
-            Signal::Bm25BacklinkText => Some(TextField::BacklinkText),
-            Signal::Bm25Keywords => Some(TextField::Keywords),
-            Signal::IdfSumUrl => Some(TextField::Url),
-            Signal::IdfSumSite => Some(TextField::SiteWithout),
-            Signal::IdfSumDomain => Some(TextField::Domain),
-            Signal::IdfSumSiteNoTokenizer => Some(TextField::SiteNoTokenizer),
-            Signal::IdfSumDomainNoTokenizer => Some(TextField::DomainNoTokenizer),
-            Signal::IdfSumDomainNameNoTokenizer => Some(TextField::DomainNameNoTokenizer),
-            Signal::IdfSumDomainIfHomepage => Some(TextField::DomainIfHomepage),
+            Signal::Bm25Title => Some(text_field::Title.into()),
+            Signal::Bm25TitleBigrams => Some(text_field::TitleBigrams.into()),
+            Signal::Bm25TitleTrigrams => Some(text_field::TitleTrigrams.into()),
+            Signal::Bm25CleanBody => Some(text_field::CleanBody.into()),
+            Signal::Bm25CleanBodyBigrams => Some(text_field::CleanBodyBigrams.into()),
+            Signal::Bm25CleanBodyTrigrams => Some(text_field::CleanBodyTrigrams.into()),
+            Signal::Bm25StemmedTitle => Some(text_field::StemmedTitle.into()),
+            Signal::Bm25StemmedCleanBody => Some(text_field::StemmedCleanBody.into()),
+            Signal::Bm25AllBody => Some(text_field::AllBody.into()),
+            Signal::Bm25BacklinkText => Some(text_field::BacklinkText.into()),
+            Signal::Bm25Keywords => Some(text_field::Keywords.into()),
+            Signal::IdfSumUrl => Some(text_field::Url.into()),
+            Signal::IdfSumSite => Some(text_field::SiteWithout.into()),
+            Signal::IdfSumDomain => Some(text_field::Domain.into()),
+            Signal::IdfSumSiteNoTokenizer => Some(text_field::SiteNoTokenizer.into()),
+            Signal::IdfSumDomainNoTokenizer => Some(text_field::DomainNoTokenizer.into()),
+            Signal::IdfSumDomainNameNoTokenizer => Some(text_field::DomainNameNoTokenizer.into()),
+            Signal::IdfSumDomainIfHomepage => Some(text_field::DomainIfHomepage.into()),
             Signal::IdfSumDomainNameIfHomepageNoTokenizer => {
             Signal::IdfSumDomainNameIfHomepageNoTokenizer => {
-                Some(TextField::DomainNameIfHomepageNoTokenizer)
+                Some(text_field::DomainNameIfHomepageNoTokenizer.into())
             }
             }
-            Signal::IdfSumTitleIfHomepage => Some(TextField::TitleIfHomepage),
+            Signal::IdfSumTitleIfHomepage => Some(text_field::TitleIfHomepage.into()),
             Signal::IdfSumDomainIfHomepageNoTokenizer => {
             Signal::IdfSumDomainIfHomepageNoTokenizer => {
-                Some(TextField::DomainIfHomepageNoTokenizer)
+                Some(text_field::DomainIfHomepageNoTokenizer.into())
             }
             }
             _ => None,
             _ => None,
         }
         }
     }
     }
+
+    pub fn all() -> impl Iterator<Item = Self> {
+        Self::VARIANTS.iter().copied()
+    }
 }
 }
 
 
 impl FromStr for Signal {
 impl FromStr for Signal {
@@ -706,7 +673,7 @@ impl SignalCoefficient {
     }
     }
 
 
     pub fn merge_into(&mut self, coeffs: SignalCoefficient) {
     pub fn merge_into(&mut self, coeffs: SignalCoefficient) {
-        for signal in ALL_SIGNALS {
+        for signal in Signal::all() {
             if let Some(coeff) = coeffs.map.get(signal).copied() {
             if let Some(coeff) = coeffs.map.get(signal).copied() {
                 match self.map.get_mut(signal) {
                 match self.map.get_mut(signal) {
                     Some(existing_coeff) => *existing_coeff += coeff,
                     Some(existing_coeff) => *existing_coeff += coeff,
@@ -736,7 +703,7 @@ struct OpticBoosts {
 }
 }
 
 
 struct SegmentReader {
 struct SegmentReader {
-    text_fields: EnumMap<TextField, TextFieldData>,
+    text_fields: EnumMap<TextFieldEnum, TextFieldData>,
     optic_boosts: OpticBoosts,
     optic_boosts: OpticBoosts,
     fastfield_reader: Arc<fastfield_reader::SegmentReader>,
     fastfield_reader: Arc<fastfield_reader::SegmentReader>,
 }
 }
@@ -856,13 +823,13 @@ impl SignalAggregator {
         &self,
         &self,
         tv_searcher: &tantivy::Searcher,
         tv_searcher: &tantivy::Searcher,
         segment_reader: &tantivy::SegmentReader,
         segment_reader: &tantivy::SegmentReader,
-    ) -> Result<EnumMap<TextField, TextFieldData>> {
+    ) -> Result<EnumMap<TextFieldEnum, TextFieldData>> {
         let mut text_fields = EnumMap::new();
         let mut text_fields = EnumMap::new();
         let schema = tv_searcher.schema();
         let schema = tv_searcher.schema();
 
 
         if let Some(query) = &self.query_data {
         if let Some(query) = &self.query_data {
             if !query.simple_terms.is_empty() {
             if !query.simple_terms.is_empty() {
-                for signal in ALL_SIGNALS {
+                for signal in Signal::all() {
                     if let Some(text_field) = signal.as_textfield() {
                     if let Some(text_field) = signal.as_textfield() {
                         let tv_field = schema.get_field(text_field.name()).unwrap();
                         let tv_field = schema.get_field(text_field.name()).unwrap();
                         let simple_query = itertools::intersperse(
                         let simple_query = itertools::intersperse(
@@ -891,7 +858,7 @@ impl SignalAggregator {
                         let mut postings = Vec::with_capacity(terms.len());
                         let mut postings = Vec::with_capacity(terms.len());
                         for term in &terms {
                         for term in &terms {
                             if let Some(p) =
                             if let Some(p) =
-                                inverted_index.read_postings(term, text_field.index_option())?
+                                inverted_index.read_postings(term, text_field.record_option())?
                             {
                             {
                                 postings.push(p);
                                 postings.push(p);
                                 matching_terms.push(term.clone());
                                 matching_terms.push(term.clone());
@@ -1043,8 +1010,7 @@ impl SignalAggregator {
     }
     }
 
 
     pub fn precompute_score(&self, webpage: &Webpage) -> f64 {
     pub fn precompute_score(&self, webpage: &Webpage) -> f64 {
-        ALL_SIGNALS
-            .into_iter()
+        Signal::all()
             .filter_map(|signal| signal.precompute(self, webpage))
             .filter_map(|signal| signal.precompute(self, webpage))
             .map(|computed| computed.score.coefficient * computed.score.value)
             .map(|computed| computed.score.coefficient * computed.score.value)
             .sum()
             .sum()
@@ -1078,7 +1044,7 @@ pub struct SignalScore {
 
 
 #[derive(Clone)]
 #[derive(Clone)]
 pub struct SignalOrder {
 pub struct SignalOrder {
-    text_signals: EnumMap<TextField, NGramSignalOrder>,
+    text_signals: EnumMap<TextFieldEnum, NGramSignalOrder>,
     other_signals: Vec<Signal>,
     other_signals: Vec<Signal>,
 }
 }
 
 
@@ -1094,7 +1060,7 @@ impl SignalOrder {
         let mut text_signals = EnumMap::new();
         let mut text_signals = EnumMap::new();
         let mut other_signals = Vec::new();
         let mut other_signals = Vec::new();
 
 
-        for signal in ALL_SIGNALS {
+        for signal in Signal::all() {
             if signal_aggregator.coefficient(&signal) == 0.0 {
             if signal_aggregator.coefficient(&signal) == 0.0 {
                 continue;
                 continue;
             }
             }

+ 0 - 850
crates/core/src/schema.rs

@@ -1,850 +0,0 @@
-// Stract is an open source web search engine.
-// Copyright (C) 2023 Stract ApS
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU Affero General Public License as
-// published by the Free Software Foundation, either version 3 of the
-// License, or (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU Affero General Public License for more details.
-//
-// You should have received a copy of the GNU Affero General Public License
-// along with this program.  If not, see <https://www.gnu.org/licenses/>.
-
-use tantivy::{
-    schema::{BytesOptions, IndexRecordOption, NumericOptions, TextFieldIndexing, TextOptions},
-    DateOptions,
-};
-
-use crate::tokenizer::{
-    BigramTokenizer, Identity, JsonField, SiteOperatorUrlTokenizer, Tokenizer, TrigramTokenizer,
-};
-
-pub const FLOAT_SCALING: u64 = 1_000_000_000;
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum TextField {
-    Title,
-    CleanBody,
-    StemmedTitle,
-    StemmedCleanBody,
-    AllBody,
-    Url,
-    UrlNoTokenizer,
-    UrlForSiteOperator,
-    SiteWithout,
-    Domain,
-    SiteNoTokenizer,
-    DomainNoTokenizer,
-    DomainNameNoTokenizer,
-    /// this field is only set if the webpage is the homepage for the site. Allows us to boost
-    SiteIfHomepageNoTokenizer,
-    /// this field is only set if the webpage is the homepage for the site. Allows us to boost
-    DomainIfHomepage,
-    /// this field is only set if the webpage is the homepage for the site. Allows us to boost
-    DomainNameIfHomepageNoTokenizer,
-    /// this field is only set if the webpage is the homepage for the site. Allows us to boost
-    DomainIfHomepageNoTokenizer,
-    /// this field is only set if the webpage is the homepage for the site. Allows us to boost
-    TitleIfHomepage,
-    BacklinkText,
-    Description,
-    DmozDescription,
-    SchemaOrgJson,
-    FlattenedSchemaOrgJson,
-    CleanBodyBigrams,
-    TitleBigrams,
-    CleanBodyTrigrams,
-    TitleTrigrams,
-    MicroformatTags,
-    /// can either be NSFW or SFW (see safety classifier)
-    SafetyClassification,
-    InsertionTimestamp,
-    RecipeFirstIngredientTagId,
-    Keywords,
-}
-
-impl From<TextField> for usize {
-    fn from(value: TextField) -> Self {
-        value as usize
-    }
-}
-
-impl TextField {
-    pub fn ngram_size(&self) -> usize {
-        match self {
-            TextField::Title => 1,
-            TextField::CleanBody => 1,
-            TextField::StemmedTitle => 1,
-            TextField::StemmedCleanBody => 1,
-            TextField::AllBody => 1,
-            TextField::Url => 1,
-            TextField::UrlNoTokenizer => 1,
-            TextField::UrlForSiteOperator => 1,
-            TextField::SiteWithout => 1,
-            TextField::Domain => 1,
-            TextField::SiteNoTokenizer => 1,
-            TextField::DomainNoTokenizer => 1,
-            TextField::DomainNameNoTokenizer => 1,
-            TextField::SiteIfHomepageNoTokenizer => 1,
-            TextField::DomainIfHomepage => 1,
-            TextField::DomainNameIfHomepageNoTokenizer => 1,
-            TextField::DomainIfHomepageNoTokenizer => 1,
-            TextField::TitleIfHomepage => 1,
-            TextField::BacklinkText => 1,
-            TextField::Description => 1,
-            TextField::DmozDescription => 1,
-            TextField::SchemaOrgJson => 1,
-            TextField::FlattenedSchemaOrgJson => 1,
-            TextField::CleanBodyBigrams => 2,
-            TextField::TitleBigrams => 2,
-            TextField::CleanBodyTrigrams => 3,
-            TextField::TitleTrigrams => 3,
-            TextField::MicroformatTags => 1,
-            TextField::SafetyClassification => 1,
-            TextField::InsertionTimestamp => 1,
-            TextField::RecipeFirstIngredientTagId => 1,
-            TextField::Keywords => 1,
-        }
-    }
-
-    pub fn monogram_field(&self) -> TextField {
-        match self {
-            TextField::Title => TextField::Title,
-            TextField::CleanBody => TextField::CleanBody,
-            TextField::StemmedTitle => TextField::StemmedTitle,
-            TextField::StemmedCleanBody => TextField::StemmedCleanBody,
-            TextField::AllBody => TextField::AllBody,
-            TextField::Url => TextField::Url,
-            TextField::UrlNoTokenizer => TextField::UrlNoTokenizer,
-            TextField::UrlForSiteOperator => TextField::UrlForSiteOperator,
-            TextField::SiteWithout => TextField::SiteWithout,
-            TextField::Domain => TextField::Domain,
-            TextField::SiteNoTokenizer => TextField::SiteNoTokenizer,
-            TextField::DomainNoTokenizer => TextField::DomainNoTokenizer,
-            TextField::DomainNameNoTokenizer => TextField::DomainNameNoTokenizer,
-            TextField::SiteIfHomepageNoTokenizer => TextField::SiteIfHomepageNoTokenizer,
-            TextField::DomainIfHomepage => TextField::DomainIfHomepage,
-            TextField::DomainNameIfHomepageNoTokenizer => {
-                TextField::DomainNameIfHomepageNoTokenizer
-            }
-            TextField::DomainIfHomepageNoTokenizer => TextField::DomainIfHomepageNoTokenizer,
-            TextField::TitleIfHomepage => TextField::TitleIfHomepage,
-            TextField::BacklinkText => TextField::BacklinkText,
-            TextField::Description => TextField::Description,
-            TextField::DmozDescription => TextField::DmozDescription,
-            TextField::SchemaOrgJson => TextField::SchemaOrgJson,
-            TextField::FlattenedSchemaOrgJson => TextField::FlattenedSchemaOrgJson,
-            TextField::CleanBodyBigrams => TextField::CleanBody,
-            TextField::TitleBigrams => TextField::Title,
-            TextField::CleanBodyTrigrams => TextField::CleanBody,
-            TextField::TitleTrigrams => TextField::Title,
-            TextField::MicroformatTags => TextField::MicroformatTags,
-            TextField::SafetyClassification => TextField::SafetyClassification,
-            TextField::InsertionTimestamp => TextField::InsertionTimestamp,
-            TextField::RecipeFirstIngredientTagId => TextField::RecipeFirstIngredientTagId,
-            TextField::Keywords => TextField::Keywords,
-        }
-    }
-
-    pub fn query_tokenizer(&self) -> Tokenizer {
-        match self {
-            TextField::TitleBigrams => Tokenizer::default(),
-            TextField::CleanBodyBigrams => Tokenizer::default(),
-            TextField::TitleTrigrams => Tokenizer::default(),
-            TextField::CleanBodyTrigrams => Tokenizer::default(),
-            _ => self.indexing_tokenizer(),
-        }
-    }
-
-    pub fn indexing_tokenizer(&self) -> Tokenizer {
-        match self {
-            TextField::Title => Tokenizer::default(),
-            TextField::CleanBody => Tokenizer::default(),
-            TextField::StemmedTitle => Tokenizer::new_stemmed(),
-            TextField::StemmedCleanBody => Tokenizer::new_stemmed(),
-            TextField::AllBody => Tokenizer::default(),
-            TextField::Url => Tokenizer::default(),
-            TextField::UrlNoTokenizer => Tokenizer::Identity(Identity {}),
-            TextField::UrlForSiteOperator => Tokenizer::SiteOperator(SiteOperatorUrlTokenizer),
-            TextField::SiteWithout => Tokenizer::default(),
-            TextField::Domain => Tokenizer::default(),
-            TextField::SiteNoTokenizer => Tokenizer::Identity(Identity {}),
-            TextField::SiteIfHomepageNoTokenizer => Tokenizer::Identity(Identity {}),
-            TextField::DomainNoTokenizer => Tokenizer::Identity(Identity {}),
-            TextField::DomainNameNoTokenizer => Tokenizer::Identity(Identity {}),
-            TextField::DomainIfHomepage => Tokenizer::default(),
-            TextField::DomainNameIfHomepageNoTokenizer => Tokenizer::Identity(Identity {}),
-            TextField::DomainIfHomepageNoTokenizer => Tokenizer::Identity(Identity {}),
-            TextField::TitleIfHomepage => Tokenizer::default(),
-            TextField::BacklinkText => Tokenizer::default(),
-            TextField::Description => Tokenizer::default(),
-            TextField::DmozDescription => Tokenizer::default(),
-            TextField::SchemaOrgJson => Tokenizer::Identity(Identity {}),
-            TextField::FlattenedSchemaOrgJson => Tokenizer::Json(JsonField),
-            TextField::CleanBodyBigrams => Tokenizer::Bigram(BigramTokenizer::default()),
-            TextField::TitleBigrams => Tokenizer::Bigram(BigramTokenizer::default()),
-            TextField::CleanBodyTrigrams => Tokenizer::Trigram(TrigramTokenizer::default()),
-            TextField::TitleTrigrams => Tokenizer::Trigram(TrigramTokenizer::default()),
-            TextField::MicroformatTags => Tokenizer::default(),
-            TextField::SafetyClassification => Tokenizer::Identity(Identity {}),
-            TextField::InsertionTimestamp => Tokenizer::Identity(Identity {}),
-            TextField::RecipeFirstIngredientTagId => Tokenizer::Identity(Identity {}),
-            TextField::Keywords => Tokenizer::default(),
-        }
-    }
-
-    pub fn index_option(&self) -> IndexRecordOption {
-        if self.has_pos() {
-            IndexRecordOption::WithFreqsAndPositions
-        } else {
-            IndexRecordOption::WithFreqs
-        }
-    }
-
-    pub fn has_pos(&self) -> bool {
-        match self {
-            TextField::Title => true,
-            TextField::CleanBody => true,
-            TextField::StemmedTitle => false,
-            TextField::StemmedCleanBody => false,
-            TextField::AllBody => false,
-            TextField::Url => true,
-            TextField::UrlNoTokenizer => false,
-            TextField::UrlForSiteOperator => true,
-            TextField::SiteWithout => true,
-            TextField::Domain => true,
-            TextField::SiteNoTokenizer => false,
-            TextField::SiteIfHomepageNoTokenizer => false,
-            TextField::DomainNoTokenizer => false,
-            TextField::DomainNameNoTokenizer => false,
-            TextField::DomainIfHomepage => false,
-            TextField::DomainNameIfHomepageNoTokenizer => false,
-            TextField::DomainIfHomepageNoTokenizer => false,
-            TextField::TitleIfHomepage => false,
-            TextField::BacklinkText => false,
-            TextField::Description => true,
-            TextField::DmozDescription => true,
-            TextField::SchemaOrgJson => false,
-            TextField::FlattenedSchemaOrgJson => true,
-            TextField::CleanBodyBigrams => false,
-            TextField::TitleBigrams => false,
-            TextField::CleanBodyTrigrams => false,
-            TextField::TitleTrigrams => false,
-            TextField::MicroformatTags => true,
-            TextField::SafetyClassification => false,
-            TextField::InsertionTimestamp => false,
-            TextField::RecipeFirstIngredientTagId => false,
-            TextField::Keywords => false,
-        }
-    }
-
-    pub fn name(&self) -> &str {
-        match self {
-            TextField::Title => "title",
-            TextField::CleanBody => "body",
-            TextField::Url => "url",
-            TextField::UrlNoTokenizer => "url_no_tokenizer",
-            TextField::UrlForSiteOperator => "url_for_site_operator",
-            TextField::SiteWithout => "site",
-            TextField::Domain => "domain",
-            TextField::SiteNoTokenizer => "site_no_tokenizer",
-            TextField::SiteIfHomepageNoTokenizer => "site_if_homepage_no_tokenizer",
-            TextField::DomainNoTokenizer => "domain_no_tokenizer",
-            TextField::DomainNameNoTokenizer => "domain_name_no_tokenizer",
-            TextField::BacklinkText => "backlink_text",
-            TextField::StemmedTitle => "stemmed_title",
-            TextField::StemmedCleanBody => "stemmed_body",
-            TextField::DomainIfHomepage => "domain_if_homepage",
-            TextField::DomainNameIfHomepageNoTokenizer => "domain_name_if_homepage_no_tokenizer",
-            TextField::DomainIfHomepageNoTokenizer => "domain_if_homepage_no_tokenizer",
-            TextField::Description => "description",
-            TextField::TitleIfHomepage => "title_if_homepage",
-            TextField::AllBody => "all_body",
-            TextField::DmozDescription => "dmoz_description",
-            TextField::SchemaOrgJson => "schema_org_json",
-            TextField::FlattenedSchemaOrgJson => "flattened_schema_org_json",
-            TextField::CleanBodyBigrams => "clean_body_bigrams",
-            TextField::TitleBigrams => "title_bigrams",
-            TextField::CleanBodyTrigrams => "clean_body_trigrams",
-            TextField::TitleTrigrams => "title_trigrams",
-            TextField::MicroformatTags => "microformat_tags",
-            TextField::SafetyClassification => "safety_classification",
-            TextField::InsertionTimestamp => "insertion_timestamp",
-            TextField::RecipeFirstIngredientTagId => "recipe_first_ingredient_tag_id",
-            TextField::Keywords => "keywords",
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum FastField {
-    IsHomepage,
-    HostCentrality,
-    HostCentralityRank,
-    PageCentrality,
-    PageCentralityRank,
-    FetchTimeMs,
-    LastUpdated,
-    TrackerScore,
-    Region,
-    NumUrlTokens,
-    NumTitleTokens,
-    NumCleanBodyTokens,
-    NumDescriptionTokens,
-    NumUrlForSiteOperatorTokens,
-    NumDomainTokens,
-    NumMicroformatTagsTokens,
-    SiteHash1,
-    SiteHash2,
-    UrlWithoutQueryHash1,
-    UrlWithoutQueryHash2,
-    TitleHash1,
-    TitleHash2,
-    UrlHash1,
-    UrlHash2,
-    DomainHash1,
-    DomainHash2,
-    UrlWithoutTldHash1,
-    UrlWithoutTldHash2,
-    PreComputedScore,
-    HostNodeID,
-    SimHash,
-    NumFlattenedSchemaTokens,
-    NumPathAndQuerySlashes,
-    NumPathAndQueryDigits,
-    LikelyHasAds,
-    LikelyHasPaywall,
-    LinkDensity,
-    TitleEmbeddings,
-    KeywordEmbeddings,
-}
-
-impl FastField {
-    pub fn name(&self) -> &str {
-        match self {
-            FastField::HostCentrality => "host_centrality",
-            FastField::HostCentralityRank => "host_centrality_rank",
-            FastField::PageCentrality => "page_centrality",
-            FastField::PageCentralityRank => "page_centrality_rank",
-            FastField::IsHomepage => "is_homepage",
-            FastField::FetchTimeMs => "fetch_time_ms",
-            FastField::LastUpdated => "last_updated",
-            FastField::TrackerScore => "tracker_score",
-            FastField::Region => "region",
-            FastField::NumUrlTokens => "num_url_tokens",
-            FastField::NumTitleTokens => "num_title_tokens",
-            FastField::NumCleanBodyTokens => "num_clean_body_tokens",
-            FastField::NumDescriptionTokens => "num_description_tokens",
-            FastField::NumDomainTokens => "num_domain_tokens",
-            FastField::NumUrlForSiteOperatorTokens => "num_url_for_site_operator_tokens",
-            FastField::NumFlattenedSchemaTokens => "num_flattened_schema_tokens",
-            FastField::NumMicroformatTagsTokens => "num_microformat_tags_tokens",
-            FastField::SiteHash1 => "site_hash1",
-            FastField::SiteHash2 => "site_hash2",
-            FastField::UrlWithoutQueryHash1 => "url_without_query_hash1",
-            FastField::UrlWithoutQueryHash2 => "url_without_query_hash2",
-            FastField::TitleHash1 => "title_hash1",
-            FastField::TitleHash2 => "title_hash2",
-            FastField::UrlHash1 => "url_hash1",
-            FastField::UrlHash2 => "url_hash2",
-            FastField::DomainHash1 => "domain_hash1",
-            FastField::DomainHash2 => "domain_hash2",
-            FastField::UrlWithoutTldHash1 => "url_without_tld_hash1",
-            FastField::UrlWithoutTldHash2 => "url_without_tld_hash2",
-            FastField::PreComputedScore => "pre_computed_score",
-            FastField::HostNodeID => "host_node_id",
-            FastField::SimHash => "sim_hash",
-            FastField::NumPathAndQuerySlashes => "num_path_and_query_slashes",
-            FastField::NumPathAndQueryDigits => "num_path_and_query_digits",
-            FastField::LikelyHasAds => "likely_has_ads",
-            FastField::LikelyHasPaywall => "likely_has_paywall",
-            FastField::LinkDensity => "link_density",
-            FastField::TitleEmbeddings => "title_embeddings",
-            FastField::KeywordEmbeddings => "keyword_embeddings",
-        }
-    }
-}
-
-impl From<FastField> for usize {
-    fn from(value: FastField) -> Self {
-        value as usize
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum Field {
-    Fast(FastField),
-    Text(TextField),
-}
-
-static ALL_FIELDS: [Field; 70] = [
-    Field::Text(TextField::Title),
-    Field::Text(TextField::CleanBody),
-    Field::Text(TextField::StemmedTitle),
-    Field::Text(TextField::StemmedCleanBody),
-    Field::Text(TextField::AllBody),
-    Field::Text(TextField::Url),
-    Field::Text(TextField::UrlNoTokenizer),
-    Field::Text(TextField::UrlForSiteOperator),
-    Field::Text(TextField::SiteWithout),
-    Field::Text(TextField::Domain),
-    Field::Text(TextField::SiteNoTokenizer),
-    Field::Text(TextField::SiteIfHomepageNoTokenizer),
-    Field::Text(TextField::DomainNoTokenizer),
-    Field::Text(TextField::DomainNameNoTokenizer),
-    Field::Text(TextField::DomainIfHomepage),
-    Field::Text(TextField::DomainNameIfHomepageNoTokenizer),
-    Field::Text(TextField::DomainIfHomepageNoTokenizer),
-    Field::Text(TextField::TitleIfHomepage),
-    Field::Text(TextField::BacklinkText),
-    Field::Text(TextField::Description),
-    Field::Text(TextField::DmozDescription),
-    Field::Text(TextField::SchemaOrgJson),
-    Field::Text(TextField::FlattenedSchemaOrgJson),
-    Field::Text(TextField::CleanBodyBigrams),
-    Field::Text(TextField::TitleBigrams),
-    Field::Text(TextField::CleanBodyTrigrams),
-    Field::Text(TextField::TitleTrigrams),
-    Field::Text(TextField::MicroformatTags),
-    Field::Text(TextField::SafetyClassification),
-    Field::Text(TextField::InsertionTimestamp),
-    Field::Text(TextField::Keywords),
-    // FAST FIELDS
-    Field::Fast(FastField::IsHomepage),
-    Field::Fast(FastField::HostCentrality),
-    Field::Fast(FastField::HostCentralityRank),
-    Field::Fast(FastField::PageCentrality),
-    Field::Fast(FastField::PageCentralityRank),
-    Field::Fast(FastField::FetchTimeMs),
-    Field::Fast(FastField::LastUpdated),
-    Field::Fast(FastField::TrackerScore),
-    Field::Fast(FastField::Region),
-    Field::Fast(FastField::NumUrlTokens),
-    Field::Fast(FastField::NumTitleTokens),
-    Field::Fast(FastField::NumCleanBodyTokens),
-    Field::Fast(FastField::NumDescriptionTokens),
-    Field::Fast(FastField::NumDomainTokens),
-    Field::Fast(FastField::NumUrlForSiteOperatorTokens),
-    Field::Fast(FastField::NumFlattenedSchemaTokens),
-    Field::Fast(FastField::NumMicroformatTagsTokens),
-    Field::Fast(FastField::SiteHash1),
-    Field::Fast(FastField::SiteHash2),
-    Field::Fast(FastField::UrlWithoutQueryHash1),
-    Field::Fast(FastField::UrlWithoutQueryHash2),
-    Field::Fast(FastField::TitleHash1),
-    Field::Fast(FastField::TitleHash2),
-    Field::Fast(FastField::UrlHash1),
-    Field::Fast(FastField::UrlHash2),
-    Field::Fast(FastField::DomainHash1),
-    Field::Fast(FastField::DomainHash2),
-    Field::Fast(FastField::UrlWithoutTldHash1),
-    Field::Fast(FastField::UrlWithoutTldHash2),
-    Field::Fast(FastField::PreComputedScore),
-    Field::Fast(FastField::HostNodeID),
-    Field::Fast(FastField::SimHash),
-    Field::Fast(FastField::NumPathAndQuerySlashes),
-    Field::Fast(FastField::NumPathAndQueryDigits),
-    Field::Fast(FastField::LikelyHasAds),
-    Field::Fast(FastField::LikelyHasPaywall),
-    Field::Fast(FastField::LinkDensity),
-    Field::Fast(FastField::TitleEmbeddings),
-    Field::Fast(FastField::KeywordEmbeddings),
-];
-
-impl Field {
-    #[inline]
-    pub fn get(field_id: usize) -> Option<&'static Field> {
-        ALL_FIELDS.get(field_id)
-    }
-
-    #[inline]
-    pub fn all() -> impl Iterator<Item = &'static Field> {
-        ALL_FIELDS.iter()
-    }
-    fn default_text_options(&self) -> tantivy::schema::TextOptions {
-        let tokenizer = self.as_text().unwrap().indexing_tokenizer();
-        let option = self.as_text().unwrap().index_option();
-
-        TextOptions::default().set_indexing_options(
-            TextFieldIndexing::default()
-                .set_tokenizer(tokenizer.as_str())
-                .set_index_option(option),
-        )
-    }
-
-    pub fn has_pos(&self) -> bool {
-        match self {
-            Field::Fast(_) => false,
-            Field::Text(text) => text.has_pos(),
-        }
-    }
-
-    pub fn options(&self) -> IndexingOption {
-        match self {
-            Field::Text(TextField::Title) => {
-                IndexingOption::Text(self.default_text_options().set_stored())
-            }
-            Field::Text(TextField::CleanBody) => IndexingOption::Text(self.default_text_options()),
-            Field::Text(TextField::Url) => {
-                IndexingOption::Text(self.default_text_options().set_stored())
-            }
-            Field::Text(TextField::UrlNoTokenizer) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::UrlForSiteOperator) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::SiteWithout) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::SiteIfHomepageNoTokenizer) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::Domain) => IndexingOption::Text(self.default_text_options()),
-            Field::Text(TextField::SiteNoTokenizer) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::DomainNoTokenizer) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::DomainNameNoTokenizer) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::AllBody) => IndexingOption::Text(self.default_text_options()),
-            Field::Text(TextField::DomainIfHomepage) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::TitleIfHomepage) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::DomainNameIfHomepageNoTokenizer) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::DomainIfHomepageNoTokenizer) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::BacklinkText) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::StemmedTitle) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::StemmedCleanBody) => {
-                IndexingOption::Text(self.default_text_options().set_stored())
-            }
-            Field::Text(TextField::Description) => {
-                IndexingOption::Text(self.default_text_options().set_stored())
-            }
-            Field::Text(TextField::DmozDescription) => {
-                IndexingOption::Text(self.default_text_options().set_stored())
-            }
-            Field::Text(TextField::SchemaOrgJson) => {
-                IndexingOption::Text(self.default_text_options().set_stored())
-            }
-            Field::Text(TextField::FlattenedSchemaOrgJson) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::CleanBodyBigrams) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::TitleBigrams) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::CleanBodyTrigrams) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::TitleTrigrams) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::MicroformatTags) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::SafetyClassification) => {
-                IndexingOption::Text(self.default_text_options())
-            }
-            Field::Text(TextField::RecipeFirstIngredientTagId) => {
-                IndexingOption::Text(self.default_text_options().set_stored())
-            }
-            Field::Text(TextField::InsertionTimestamp) => {
-                IndexingOption::DateTime(tantivy::schema::DateOptions::default().set_indexed())
-            }
-            Field::Text(TextField::Keywords) => {
-                IndexingOption::Text(self.default_text_options().set_stored())
-            }
-            Field::Fast(FastField::IsHomepage) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::HostCentrality) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::HostCentralityRank) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::PageCentrality) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::PageCentralityRank) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::FetchTimeMs) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::TrackerScore) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::LastUpdated) => IndexingOption::Integer(
-                NumericOptions::default()
-                    .set_fast()
-                    .set_stored()
-                    .set_indexed(),
-            ),
-            Field::Fast(FastField::Region) => IndexingOption::Integer(
-                NumericOptions::default()
-                    .set_fast()
-                    .set_stored()
-                    .set_indexed(),
-            ),
-            Field::Fast(FastField::NumCleanBodyTokens) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::NumDescriptionTokens) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::NumTitleTokens) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::NumMicroformatTagsTokens) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::NumUrlTokens) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::NumDomainTokens) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::NumUrlForSiteOperatorTokens) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::NumFlattenedSchemaTokens) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_indexed())
-            }
-            Field::Fast(FastField::SiteHash1) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast())
-            }
-            Field::Fast(FastField::SiteHash2) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast())
-            }
-            Field::Fast(FastField::UrlWithoutQueryHash1) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast())
-            }
-            Field::Fast(FastField::UrlWithoutQueryHash2) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast())
-            }
-            Field::Fast(FastField::UrlHash1) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast())
-            }
-            Field::Fast(FastField::UrlHash2) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast())
-            }
-            Field::Fast(FastField::UrlWithoutTldHash1) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast())
-            }
-            Field::Fast(FastField::UrlWithoutTldHash2) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast())
-            }
-            Field::Fast(FastField::DomainHash1) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast())
-            }
-            Field::Fast(FastField::DomainHash2) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast())
-            }
-            Field::Fast(FastField::TitleHash1) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast())
-            }
-            Field::Fast(FastField::TitleHash2) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast())
-            }
-            Field::Fast(FastField::PreComputedScore) => IndexingOption::Integer(
-                NumericOptions::default()
-                    .set_fast()
-                    .set_indexed()
-                    .set_stored(),
-            ),
-            Field::Fast(FastField::HostNodeID) => IndexingOption::Integer(
-                NumericOptions::default()
-                    .set_fast()
-                    .set_indexed()
-                    .set_stored(),
-            ),
-            Field::Fast(FastField::SimHash) => IndexingOption::Integer(
-                NumericOptions::default()
-                    .set_fast()
-                    .set_indexed()
-                    .set_stored(),
-            ),
-            Field::Fast(FastField::NumPathAndQuerySlashes) => IndexingOption::Integer(
-                NumericOptions::default()
-                    .set_fast()
-                    .set_indexed()
-                    .set_stored(),
-            ),
-            Field::Fast(FastField::NumPathAndQueryDigits) => IndexingOption::Integer(
-                NumericOptions::default()
-                    .set_fast()
-                    .set_indexed()
-                    .set_stored(),
-            ),
-            Field::Fast(FastField::LikelyHasAds) => IndexingOption::Integer(
-                NumericOptions::default()
-                    .set_fast()
-                    .set_indexed()
-                    .set_stored(),
-            ),
-            Field::Fast(FastField::LikelyHasPaywall) => IndexingOption::Integer(
-                NumericOptions::default()
-                    .set_fast()
-                    .set_indexed()
-                    .set_stored(),
-            ),
-            Field::Fast(FastField::LinkDensity) => {
-                IndexingOption::Integer(NumericOptions::default().set_fast().set_stored())
-            }
-            Field::Fast(FastField::TitleEmbeddings) => {
-                IndexingOption::Bytes(BytesOptions::default().set_fast())
-            }
-            Field::Fast(FastField::KeywordEmbeddings) => {
-                IndexingOption::Bytes(BytesOptions::default().set_fast())
-            }
-        }
-    }
-
-    pub fn name(&self) -> &str {
-        match self {
-            Field::Text(t) => t.name(),
-            Field::Fast(f) => f.name(),
-        }
-    }
-
-    /// Whether or not the field should be included
-    /// in the fields that the `Query` searches.
-    ///
-    /// The fields can still be searched by manually
-    /// constructing a tantivy query.
-    pub fn is_searchable(&self) -> bool {
-        !matches!(
-            self,
-            Field::Text(TextField::BacklinkText)
-                | Field::Text(TextField::SchemaOrgJson)
-                | Field::Text(TextField::MicroformatTags)
-                | Field::Text(TextField::SafetyClassification)
-                | Field::Text(TextField::FlattenedSchemaOrgJson)
-                | Field::Text(TextField::UrlForSiteOperator)
-                | Field::Text(TextField::Description)
-                | Field::Text(TextField::DmozDescription)
-                | Field::Text(TextField::SiteIfHomepageNoTokenizer)
-                | Field::Text(TextField::DomainIfHomepage)
-                | Field::Text(TextField::DomainNameIfHomepageNoTokenizer)
-                | Field::Text(TextField::DomainIfHomepageNoTokenizer)
-                | Field::Text(TextField::TitleIfHomepage)
-                | Field::Text(TextField::SiteWithout) // will match url
-                | Field::Text(TextField::Domain) // will match url
-                | Field::Text(TextField::InsertionTimestamp)
-                | Field::Text(TextField::RecipeFirstIngredientTagId)
-        ) && !self.is_fast()
-    }
-
-    pub fn is_fast(&self) -> bool {
-        matches!(self, Field::Fast(_))
-    }
-
-    pub fn as_text(&self) -> Option<TextField> {
-        match self {
-            Field::Fast(_) => None,
-            Field::Text(field) => Some(*field),
-        }
-    }
-
-    pub fn as_fast(&self) -> Option<FastField> {
-        match self {
-            Field::Fast(field) => Some(*field),
-            Field::Text(_) => None,
-        }
-    }
-}
-
-pub fn create_schema() -> tantivy::schema::Schema {
-    let mut builder = tantivy::schema::Schema::builder();
-
-    for field in &ALL_FIELDS {
-        match field.options() {
-            IndexingOption::Text(options) => builder.add_text_field(field.name(), options),
-            IndexingOption::Integer(options) => builder.add_u64_field(field.name(), options),
-            IndexingOption::DateTime(options) => builder.add_date_field(field.name(), options),
-            IndexingOption::Bytes(options) => builder.add_bytes_field(field.name(), options),
-        };
-    }
-
-    builder.build()
-}
-
-pub enum IndexingOption {
-    Text(TextOptions),
-    Integer(NumericOptions),
-    DateTime(DateOptions),
-    Bytes(BytesOptions),
-}
-
-pub enum DataType {
-    U64,
-    Bytes,
-}
-
-impl FastField {
-    pub fn data_type(&self) -> DataType {
-        match self {
-            FastField::IsHomepage => DataType::U64,
-            FastField::HostCentrality => DataType::U64,
-            FastField::HostCentralityRank => DataType::U64,
-            FastField::PageCentrality => DataType::U64,
-            FastField::PageCentralityRank => DataType::U64,
-            FastField::FetchTimeMs => DataType::U64,
-            FastField::LastUpdated => DataType::U64,
-            FastField::TrackerScore => DataType::U64,
-            FastField::Region => DataType::U64,
-            FastField::NumUrlTokens => DataType::U64,
-            FastField::NumTitleTokens => DataType::U64,
-            FastField::NumMicroformatTagsTokens => DataType::U64,
-            FastField::NumCleanBodyTokens => DataType::U64,
-            FastField::NumDescriptionTokens => DataType::U64,
-            FastField::NumDomainTokens => DataType::U64,
-            FastField::NumUrlForSiteOperatorTokens => DataType::U64,
-            FastField::NumFlattenedSchemaTokens => DataType::U64,
-            FastField::SiteHash1 => DataType::U64,
-            FastField::SiteHash2 => DataType::U64,
-            FastField::UrlWithoutQueryHash1 => DataType::U64,
-            FastField::UrlWithoutQueryHash2 => DataType::U64,
-            FastField::TitleHash1 => DataType::U64,
-            FastField::TitleHash2 => DataType::U64,
-            FastField::UrlHash1 => DataType::U64,
-            FastField::UrlHash2 => DataType::U64,
-            FastField::DomainHash1 => DataType::U64,
-            FastField::DomainHash2 => DataType::U64,
-            FastField::UrlWithoutTldHash1 => DataType::U64,
-            FastField::UrlWithoutTldHash2 => DataType::U64,
-            FastField::PreComputedScore => DataType::U64,
-            FastField::HostNodeID => DataType::U64,
-            FastField::SimHash => DataType::U64,
-            FastField::NumPathAndQuerySlashes => DataType::U64,
-            FastField::NumPathAndQueryDigits => DataType::U64,
-            FastField::LikelyHasAds => DataType::U64,
-            FastField::LikelyHasPaywall => DataType::U64,
-            FastField::LinkDensity => DataType::U64,
-            FastField::TitleEmbeddings => DataType::Bytes,
-            FastField::KeywordEmbeddings => DataType::Bytes,
-        }
-    }
-}

+ 1233 - 0
crates/core/src/schema/fast_field.rs

@@ -0,0 +1,1233 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>
+
+use enum_dispatch::enum_dispatch;
+use strum::{EnumDiscriminants, VariantArray};
+use tantivy::{
+    schema::{BytesOptions, NumericOptions},
+    TantivyDocument,
+};
+
+use crate::{
+    enum_map::InsertEnumMapKey,
+    from_discriminant, simhash,
+    webpage::{html::FnCache, Html, Webpage},
+    Result,
+};
+
+use super::{IndexingOption, FLOAT_SCALING};
+
+#[enum_dispatch]
+pub trait FastField: Clone + Copy + std::fmt::Debug + PartialEq + Eq + std::hash::Hash {
+    fn name(&self) -> &str;
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()>;
+
+    fn add_webpage_tantivy(
+        &self,
+        _webpage: &crate::webpage::Webpage,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn data_type(&self) -> DataType {
+        DataType::U64
+    }
+
+    fn is_stored(&self) -> bool {
+        false
+    }
+
+    fn is_indexed(&self) -> bool {
+        true
+    }
+
+    fn indexing_option(&self) -> IndexingOption {
+        debug_assert!(matches!(self.data_type(), DataType::U64));
+
+        let mut opt = NumericOptions::default().set_fast();
+
+        if self.is_stored() {
+            opt = opt.set_stored();
+        }
+
+        if self.is_indexed() {
+            opt = opt.set_indexed();
+        }
+
+        IndexingOption::Integer(opt)
+    }
+
+    fn tantivy_field(&self, schema: &tantivy::schema::Schema) -> tantivy::schema::Field {
+        schema
+            .get_field(self.name())
+            .unwrap_or_else(|_| unreachable!("Unknown field: {}", self.name()))
+    }
+}
+
+#[enum_dispatch(FastField)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EnumDiscriminants)]
+#[strum_discriminants(derive(VariantArray))]
+pub enum FastFieldEnum {
+    IsHomepage,
+    HostCentrality,
+    HostCentralityRank,
+    PageCentrality,
+    PageCentralityRank,
+    FetchTimeMs,
+    LastUpdated,
+    TrackerScore,
+    Region,
+    NumUrlTokens,
+    NumTitleTokens,
+    NumCleanBodyTokens,
+    NumDescriptionTokens,
+    NumUrlForSiteOperatorTokens,
+    NumDomainTokens,
+    NumMicroformatTagsTokens,
+    SiteHash1,
+    SiteHash2,
+    UrlWithoutQueryHash1,
+    UrlWithoutQueryHash2,
+    TitleHash1,
+    TitleHash2,
+    UrlHash1,
+    UrlHash2,
+    DomainHash1,
+    DomainHash2,
+    UrlWithoutTldHash1,
+    UrlWithoutTldHash2,
+    PreComputedScore,
+    HostNodeID,
+    SimHash,
+    NumFlattenedSchemaTokens,
+    NumPathAndQuerySlashes,
+    NumPathAndQueryDigits,
+    LikelyHasAds,
+    LikelyHasPaywall,
+    LinkDensity,
+    TitleEmbeddings,
+    KeywordEmbeddings,
+}
+
+from_discriminant!(FastFieldEnumDiscriminants => FastFieldEnum,
+[
+    IsHomepage,
+    HostCentrality,
+    HostCentralityRank,
+    PageCentrality,
+    PageCentralityRank,
+    FetchTimeMs,
+    LastUpdated,
+    TrackerScore,
+    Region,
+    NumUrlTokens,
+    NumTitleTokens,
+    NumCleanBodyTokens,
+    NumDescriptionTokens,
+    NumUrlForSiteOperatorTokens,
+    NumDomainTokens,
+    NumMicroformatTagsTokens,
+    SiteHash1,
+    SiteHash2,
+    UrlWithoutQueryHash1,
+    UrlWithoutQueryHash2,
+    TitleHash1,
+    TitleHash2,
+    UrlHash1,
+    UrlHash2,
+    DomainHash1,
+    DomainHash2,
+    UrlWithoutTldHash1,
+    UrlWithoutTldHash2,
+    PreComputedScore,
+    HostNodeID,
+    SimHash,
+    NumFlattenedSchemaTokens,
+    NumPathAndQuerySlashes,
+    NumPathAndQueryDigits,
+    LikelyHasAds,
+    LikelyHasPaywall,
+    LinkDensity,
+    TitleEmbeddings,
+    KeywordEmbeddings,
+]);
+
+impl FastFieldEnum {
+    pub fn all() -> impl Iterator<Item = FastFieldEnum> {
+        FastFieldEnumDiscriminants::VARIANTS
+            .iter()
+            .copied()
+            .map(|v| v.into())
+    }
+
+    pub fn get(field_id: usize) -> Option<FastFieldEnum> {
+        FastFieldEnumDiscriminants::VARIANTS
+            .get(field_id)
+            .copied()
+            .map(FastFieldEnum::from)
+    }
+
+    pub fn num_variants() -> usize {
+        FastFieldEnumDiscriminants::VARIANTS.len()
+    }
+}
+
+pub enum DataType {
+    U64,
+    Bytes,
+}
+
+impl InsertEnumMapKey for FastFieldEnum {
+    fn into_usize(self) -> usize {
+        FastFieldEnumDiscriminants::from(self) as usize
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct IsHomepage;
+impl FastField for IsHomepage {
+    fn name(&self) -> &str {
+        "is_homepage"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), (html.is_homepage()).into());
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct HostCentrality;
+impl FastField for HostCentrality {
+    fn name(&self) -> &str {
+        "host_centrality"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            (webpage.host_centrality * FLOAT_SCALING as f64) as u64,
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct HostCentralityRank;
+impl FastField for HostCentralityRank {
+    fn name(&self) -> &str {
+        "host_centrality_rank"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), webpage.host_centrality_rank);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct PageCentrality;
+impl FastField for PageCentrality {
+    fn name(&self) -> &str {
+        "page_centrality"
+    }
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            (webpage.page_centrality * FLOAT_SCALING as f64) as u64,
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct PageCentralityRank;
+impl FastField for PageCentralityRank {
+    fn name(&self) -> &str {
+        "page_centrality_rank"
+    }
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), webpage.page_centrality_rank);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct FetchTimeMs;
+impl FastField for FetchTimeMs {
+    fn name(&self) -> &str {
+        "fetch_time_ms"
+    }
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), webpage.fetch_time_ms);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct LastUpdated;
+impl FastField for LastUpdated {
+    fn name(&self) -> &str {
+        "last_updated"
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            html.updated_time()
+                .map_or(0, |time| time.timestamp().max(0) as u64),
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct TrackerScore;
+impl FastField for TrackerScore {
+    fn name(&self) -> &str {
+        "tracker_score"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), html.trackers().len() as u64);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct Region;
+impl FastField for Region {
+    fn name(&self) -> &str {
+        "region"
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let region = crate::webpage::region::Region::guess_from(webpage);
+        if let Ok(region) = region {
+            doc.add_u64(self.tantivy_field(schema), region.id());
+        } else {
+            doc.add_u64(
+                self.tantivy_field(schema),
+                crate::webpage::region::Region::All.id(),
+            );
+        }
+
+        Ok(())
+    }
+}
+
+// Token-count fast fields: each stores the number of tokens produced by the
+// corresponding pretokenization cached in `FnCache`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Number of tokens in the tokenized URL.
+pub struct NumUrlTokens;
+impl FastField for NumUrlTokens {
+    fn name(&self) -> &str {
+        "num_url_tokens"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            cache.pretokenize_url().tokens.len() as u64,
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Number of tokens in the tokenized title.
+pub struct NumTitleTokens;
+impl FastField for NumTitleTokens {
+    fn name(&self) -> &str {
+        "num_title_tokens"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Title pretokenization is cached as a `Result`; a failure aborts
+        // indexing of this document by propagating the error.
+        doc.add_u64(
+            self.tantivy_field(schema),
+            cache
+                .pretokenize_title()
+                .as_ref()
+                .map(|n| n.tokens.len() as u64)
+                .map_err(|e| anyhow::anyhow!("{}", e))?,
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Number of tokens in the cleaned body text.
+pub struct NumCleanBodyTokens;
+impl FastField for NumCleanBodyTokens {
+    fn name(&self) -> &str {
+        "num_clean_body_tokens"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            cache.pretokenize_clean_text().tokens.len() as u64,
+        );
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Number of tokens in the meta description.
+pub struct NumDescriptionTokens;
+impl FastField for NumDescriptionTokens {
+    fn name(&self) -> &str {
+        "num_description_tokens"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            cache.pretokenize_description().tokens.len() as u64,
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Number of tokens in the URL as tokenized for the `site:` operator.
+pub struct NumUrlForSiteOperatorTokens;
+impl FastField for NumUrlForSiteOperatorTokens {
+    fn name(&self) -> &str {
+        "num_url_for_site_operator_tokens"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            cache.pretokenize_url_for_site_operator().tokens.len() as u64,
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Number of tokens in the tokenized domain.
+pub struct NumDomainTokens;
+impl FastField for NumDomainTokens {
+    fn name(&self) -> &str {
+        "num_domain_tokens"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            cache.pretokenize_domain().tokens.len() as u64,
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Number of tokens in the tokenized microformat tags.
+pub struct NumMicroformatTagsTokens;
+impl FastField for NumMicroformatTagsTokens {
+    fn name(&self) -> &str {
+        "num_microformat_tags_tokens"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            cache.pretokenize_microformats().tokens.len() as u64,
+        );
+
+        Ok(())
+    }
+}
+
+// The `*Hash1`/`*Hash2` field pairs store the two `u64` words (index 0 and 1)
+// of the corresponding hash computed by `FnCache`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// First `u64` word of `FnCache::site_hash`.
+pub struct SiteHash1;
+impl FastField for SiteHash1 {
+    fn name(&self) -> &str {
+        "site_hash1"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), cache.site_hash()[0]);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Second `u64` word of `FnCache::site_hash`.
+pub struct SiteHash2;
+impl FastField for SiteHash2 {
+    fn name(&self) -> &str {
+        "site_hash2"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), cache.site_hash()[1]);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// First `u64` word of `FnCache::url_without_query_hash`.
+pub struct UrlWithoutQueryHash1;
+impl FastField for UrlWithoutQueryHash1 {
+    fn name(&self) -> &str {
+        "url_without_query_hash1"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            cache.url_without_query_hash()[0],
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Second `u64` word of `FnCache::url_without_query_hash`.
+pub struct UrlWithoutQueryHash2;
+impl FastField for UrlWithoutQueryHash2 {
+    fn name(&self) -> &str {
+        "url_without_query_hash2"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            cache.url_without_query_hash()[1],
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// First `u64` word of `FnCache::title_hash`.
+pub struct TitleHash1;
+impl FastField for TitleHash1 {
+    fn name(&self) -> &str {
+        "title_hash1"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), cache.title_hash()[0]);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Second `u64` word of `FnCache::title_hash`.
+pub struct TitleHash2;
+impl FastField for TitleHash2 {
+    fn name(&self) -> &str {
+        "title_hash2"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), cache.title_hash()[1]);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// First `u64` word of `FnCache::url_hash`.
+pub struct UrlHash1;
+impl FastField for UrlHash1 {
+    fn name(&self) -> &str {
+        "url_hash1"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), cache.url_hash()[0]);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Second `u64` word of `FnCache::url_hash`.
+pub struct UrlHash2;
+impl FastField for UrlHash2 {
+    fn name(&self) -> &str {
+        "url_hash2"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), cache.url_hash()[1]);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// First `u64` word of `FnCache::domain_hash`.
+pub struct DomainHash1;
+impl FastField for DomainHash1 {
+    fn name(&self) -> &str {
+        "domain_hash1"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), cache.domain_hash()[0]);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Second `u64` word of `FnCache::domain_hash`.
+pub struct DomainHash2;
+impl FastField for DomainHash2 {
+    fn name(&self) -> &str {
+        "domain_hash2"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), cache.domain_hash()[1]);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// First `u64` word of `FnCache::url_without_tld_hash`.
+pub struct UrlWithoutTldHash1;
+impl FastField for UrlWithoutTldHash1 {
+    fn name(&self) -> &str {
+        "url_without_tld_hash1"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), cache.url_without_tld_hash()[0]);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Second `u64` word of `FnCache::url_without_tld_hash`.
+pub struct UrlWithoutTldHash2;
+impl FastField for UrlWithoutTldHash2 {
+    fn name(&self) -> &str {
+        "url_without_tld_hash2"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), cache.url_without_tld_hash()[1]);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// The webpage's pre-computed score, stored as fixed-point
+/// (`score * FLOAT_SCALING`, truncated to `u64`).
+pub struct PreComputedScore;
+impl FastField for PreComputedScore {
+    fn name(&self) -> &str {
+        "pre_computed_score"
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // The score lives on `Webpage`, not on the parsed HTML.
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            (webpage.pre_computed_score * FLOAT_SCALING as f64) as u64,
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Id of the page's host node in the web graph; `u64::MAX` is the sentinel
+/// for "no node id known".
+pub struct HostNodeID;
+impl FastField for HostNodeID {
+    fn name(&self) -> &str {
+        "host_node_id"
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // The node id lives on `Webpage`, not on the parsed HTML.
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // `map_or` expresses the `u64::MAX` sentinel without duplicating the
+        // `add_u64` call in two match arms.
+        let id = webpage
+            .node_id
+            .as_ref()
+            .map_or(u64::MAX, |node_id| node_id.as_u64());
+        doc.add_u64(self.tantivy_field(schema), id);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Simhash of the cleaned body text, used for near-duplicate detection;
+/// `0` marks an empty body.
+pub struct SimHash;
+impl FastField for SimHash {
+    fn name(&self) -> &str {
+        "sim_hash"
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let clean_text = cache.pretokenize_clean_text();
+
+        // Hashing the empty string is skipped; `0` acts as the "no text" value.
+        let hash = if !clean_text.text.is_empty() {
+            simhash::hash(&clean_text.text)
+        } else {
+            0
+        };
+        doc.add_u64(self.tantivy_field(schema), hash);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Number of tokens in the flattened schema.org JSON.
+pub struct NumFlattenedSchemaTokens;
+impl FastField for NumFlattenedSchemaTokens {
+    fn name(&self) -> &str {
+        "num_flattened_schema_tokens"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            cache.pretokenized_schema_json().tokens.len() as u64,
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Number of slashes in the URL path.
+pub struct NumPathAndQuerySlashes;
+impl FastField for NumPathAndQuerySlashes {
+    fn name(&self) -> &str {
+        "num_path_and_query_slashes"
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // `path_segments()` yields one segment per '/' in the path, so the
+        // count equals the number of path slashes; `None` (cannot-be-a-base
+        // URLs) maps to 0.
+        // NOTE(review): despite the field name, the query string is never
+        // inspected here — confirm whether query slashes should count.
+        let num_slashes = html
+            .url()
+            .path_segments()
+            .map(|segments| segments.count())
+            .unwrap_or(0);
+
+        doc.add_u64(self.tantivy_field(schema), num_slashes as u64);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Number of ASCII digits in the URL path plus the query string.
+pub struct NumPathAndQueryDigits;
+impl FastField for NumPathAndQueryDigits {
+    fn name(&self) -> &str {
+        "num_path_and_query_digits"
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Digits are counted in the path and in the query (missing query
+        // contributes 0 via `unwrap_or_default`).
+        let num_digits = html
+            .url()
+            .path()
+            .chars()
+            .filter(|c| c.is_ascii_digit())
+            .count()
+            + html
+                .url()
+                .query()
+                .unwrap_or_default()
+                .chars()
+                .filter(|c| c.is_ascii_digit())
+                .count();
+
+        doc.add_u64(self.tantivy_field(schema), num_digits as u64);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Whether the page likely serves ads (bool stored as 0/1).
+pub struct LikelyHasAds;
+impl FastField for LikelyHasAds {
+    fn name(&self) -> &str {
+        "likely_has_ads"
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), html.likely_has_ads() as u64);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Whether the page likely sits behind a paywall (bool stored as 0/1).
+pub struct LikelyHasPaywall;
+impl FastField for LikelyHasPaywall {
+    fn name(&self) -> &str {
+        "likely_has_paywall"
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(self.tantivy_field(schema), html.likely_has_paywall() as u64);
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Link density of the page, stored as fixed-point
+/// (`density * FLOAT_SCALING`, truncated to `u64`).
+pub struct LinkDensity;
+impl FastField for LinkDensity {
+    fn name(&self) -> &str {
+        "link_density"
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_u64(
+            self.tantivy_field(schema),
+            (html.link_density() * FLOAT_SCALING as f64) as u64,
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Serialized title embedding stored as bytes; an empty payload marks
+/// "no embedding".
+pub struct TitleEmbeddings;
+impl FastField for TitleEmbeddings {
+    fn name(&self) -> &str {
+        "title_embeddings"
+    }
+
+    fn data_type(&self) -> DataType {
+        DataType::Bytes
+    }
+
+    fn indexing_option(&self) -> IndexingOption {
+        IndexingOption::Bytes(BytesOptions::default().set_fast().set_stored())
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Embeddings live on `Webpage`, not on the parsed HTML.
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Always write the field so the fast-field column exists for every
+        // document; empty bytes encode a missing embedding.
+        if let Some(emb) = &webpage.title_embedding {
+            let mut serialized = Vec::new();
+            emb.write_bytes(&mut serialized)?;
+
+            doc.add_bytes(self.tantivy_field(schema), serialized);
+        } else {
+            doc.add_bytes(self.tantivy_field(schema), Vec::new());
+        }
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// Serialized keyword embedding stored as bytes; an empty payload marks
+/// "no embedding".
+pub struct KeywordEmbeddings;
+impl FastField for KeywordEmbeddings {
+    fn name(&self) -> &str {
+        "keyword_embeddings"
+    }
+
+    fn data_type(&self) -> DataType {
+        DataType::Bytes
+    }
+
+    fn indexing_option(&self) -> IndexingOption {
+        IndexingOption::Bytes(BytesOptions::default().set_fast().set_stored())
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Embeddings live on `Webpage`, not on the parsed HTML.
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Same convention as `TitleEmbeddings`: empty bytes mean "missing".
+        if let Some(emb) = &webpage.keyword_embedding {
+            let mut serialized = Vec::new();
+            emb.write_bytes(&mut serialized)?;
+
+            doc.add_bytes(self.tantivy_field(schema), serialized);
+        } else {
+            doc.add_bytes(self.tantivy_field(schema), Vec::new());
+        }
+
+        Ok(())
+    }
+}

+ 139 - 0
crates/core/src/schema/mod.rs

@@ -0,0 +1,139 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+pub mod fast_field;
+pub mod text_field;
+
+use tantivy::{
+    schema::{BytesOptions, NumericOptions, TextOptions},
+    DateOptions,
+};
+
+pub use fast_field::{DataType, FastFieldEnum};
+pub use text_field::TextFieldEnum;
+
+use self::{fast_field::FastField, text_field::TextField};
+
+/// Scaling factor for storing fractional values in `u64` fast fields
+/// (fixed-point: `value * FLOAT_SCALING` truncated to `u64`).
+pub const FLOAT_SCALING: u64 = 1_000_000_000;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// A schema field: either a fast field or a text field.
+pub enum Field {
+    Fast(FastFieldEnum),
+    Text(TextFieldEnum),
+}
+
+impl Field {
+    /// Look up a field by its global id. Ids cover all text fields first,
+    /// then all fast fields; out-of-range ids yield `None`.
+    #[inline]
+    pub fn get(field_id: usize) -> Option<Field> {
+        if field_id < TextFieldEnum::num_variants() {
+            return TextFieldEnum::get(field_id).map(Field::Text);
+        }
+        let field_id = field_id - TextFieldEnum::num_variants();
+
+        // `get` already bounds-checks, so no separate `< num_variants()`
+        // test (and no trailing `return None;`) is needed.
+        FastFieldEnum::get(field_id).map(Field::Fast)
+    }
+
+    /// Iterate over every field in global-id order (text before fast).
+    #[inline]
+    pub fn all() -> impl Iterator<Item = Field> {
+        TextFieldEnum::all()
+            .map(Field::Text)
+            .chain(FastFieldEnum::all().map(Field::Fast))
+    }
+
+    /// Whether token positions are recorded; never true for fast fields.
+    pub fn has_pos(&self) -> bool {
+        match self {
+            Field::Fast(_) => false,
+            Field::Text(text) => text.has_pos(),
+        }
+    }
+
+    /// How this field is declared in the tantivy schema builder.
+    pub fn indexing_option(&self) -> IndexingOption {
+        match self {
+            Field::Text(f) => f.indexing_option(),
+            Field::Fast(f) => f.indexing_option(),
+        }
+    }
+
+    /// Name of the field in the tantivy schema.
+    pub fn name(&self) -> &str {
+        match self {
+            Field::Text(f) => f.name(),
+            Field::Fast(f) => f.name(),
+        }
+    }
+
+    /// Whether the default query searches this field; fast fields never are.
+    pub fn is_searchable(&self) -> bool {
+        match self {
+            Field::Text(f) => f.is_searchable(),
+            Field::Fast(_) => false,
+        }
+    }
+
+    pub fn as_text(&self) -> Option<TextFieldEnum> {
+        match self {
+            Field::Fast(_) => None,
+            Field::Text(field) => Some(*field),
+        }
+    }
+
+    pub fn as_fast(&self) -> Option<FastFieldEnum> {
+        match self {
+            Field::Fast(field) => Some(*field),
+            Field::Text(_) => None,
+        }
+    }
+}
+
+/// Build the tantivy schema by declaring every field from `Field::all()`
+/// with its indexing options.
+pub fn create_schema() -> tantivy::schema::Schema {
+    let mut builder = tantivy::schema::Schema::builder();
+
+    for field in Field::all() {
+        match field.indexing_option() {
+            IndexingOption::Text(options) => builder.add_text_field(field.name(), options),
+            IndexingOption::Integer(options) => builder.add_u64_field(field.name(), options),
+            IndexingOption::DateTime(options) => builder.add_date_field(field.name(), options),
+            IndexingOption::Bytes(options) => builder.add_bytes_field(field.name(), options),
+        };
+    }
+
+    builder.build()
+}
+
+/// How a field is declared in the tantivy schema builder
+/// (see `create_schema`).
+pub enum IndexingOption {
+    Text(TextOptions),
+    Integer(NumericOptions),
+    DateTime(DateOptions),
+    Bytes(BytesOptions),
+}
+
+#[macro_export]
+/// Generates `impl From<$discenum> for $enum` by mapping every listed
+/// discriminant to its unit struct and converting via `.into()`.
+macro_rules! from_discriminant {
+    ($discenum:ident => $enum:ident, [$($disc:ident),*$(,)?]) => {
+        impl From<$discenum> for $enum {
+            fn from(value: $discenum) -> Self {
+                match value {
+                    $(
+                    $discenum::$disc => $disc.into(),
+                    )*
+                }
+            }
+        }
+    };
+}

+ 1427 - 0
crates/core/src/schema/text_field.rs

@@ -0,0 +1,1427 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+use enum_dispatch::enum_dispatch;
+use strum::{EnumDiscriminants, VariantArray};
+use tantivy::{
+    schema::{IndexRecordOption, TextFieldIndexing, TextOptions},
+    time::OffsetDateTime,
+    tokenizer::PreTokenizedString,
+    TantivyDocument,
+};
+use whatlang::Lang;
+
+use crate::{
+    enum_map::InsertEnumMapKey,
+    from_discriminant,
+    tokenizer::{
+        BigramTokenizer, Identity, JsonField, SiteOperatorUrlTokenizer, Tokenizer, TrigramTokenizer,
+    },
+    webpage::Html,
+    Result,
+};
+
+use crate::webpage::html::FnCache;
+
+use super::IndexingOption;
+
+#[enum_dispatch]
+/// Behaviour shared by all text fields in the schema.
+pub trait TextField:
+    Clone + Copy + std::fmt::Debug + PartialEq + Eq + std::hash::Hash + Into<TextFieldEnum>
+{
+    /// Name of the field in the tantivy schema.
+    fn name(&self) -> &str;
+    /// Add this field's value for the parsed HTML to the tantivy document.
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()>;
+
+    /// Add webpage-level values to the document; default is a no-op.
+    fn add_webpage_tantivy(
+        &self,
+        _webpage: &crate::webpage::Webpage,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    /// Tokenizer used at indexing time.
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::default()
+    }
+
+    /// Tokenizer used at query time; defaults to the indexing tokenizer.
+    fn query_tokenizer(&self) -> Tokenizer {
+        self.indexing_tokenizer()
+    }
+
+    /// N-gram size of this field's tokens (1 = unigrams).
+    fn ngram_size(&self) -> usize {
+        1
+    }
+
+    /// The unigram variant of this field; only valid for unigram fields
+    /// (checked by `debug_assert`).
+    fn monogram_field(&self) -> TextFieldEnum {
+        debug_assert_eq!(self.ngram_size(), 1)
+        (*self).into()
+    }
+
+    /// Whether or not the field should be included
+    /// in the fields that the `Query` searches.
+    ///
+    /// The fields can still be searched by manually
+    /// constructing a tantivy query.
+    fn is_searchable(&self) -> bool {
+        false
+    }
+
+    /// Whether token positions are recorded (enables phrase queries).
+    fn has_pos(&self) -> bool {
+        false
+    }
+
+    /// Whether the raw value is stored and retrievable from the index.
+    fn is_stored(&self) -> bool {
+        false
+    }
+
+    /// Frequencies are always recorded; positions only when `has_pos()`.
+    fn record_option(&self) -> IndexRecordOption {
+        if self.has_pos() {
+            IndexRecordOption::WithFreqsAndPositions
+        } else {
+            IndexRecordOption::WithFreqs
+        }
+    }
+
+    /// Text options derived from the tokenizer, record option and stored flag.
+    fn indexing_option(&self) -> IndexingOption {
+        let tokenizer = self.indexing_tokenizer();
+        let option = self.record_option();
+
+        let mut opt = TextOptions::default().set_indexing_options(
+            TextFieldIndexing::default()
+                .set_tokenizer(tokenizer.as_str())
+                .set_index_option(option),
+        );
+
+        if self.is_stored() {
+            opt = opt.set_stored();
+        }
+
+        IndexingOption::Text(opt)
+    }
+
+    /// Resolve this field in `schema`. Panics if absent — the schema is
+    /// always built from these same field definitions.
+    fn tantivy_field(&self, schema: &tantivy::schema::Schema) -> tantivy::schema::Field {
+        schema
+            .get_field(self.name())
+            .unwrap_or_else(|_| unreachable!("Unknown field: {}", self.name()))
+    }
+}
+
+#[enum_dispatch(TextField)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EnumDiscriminants)]
+#[strum_discriminants(derive(VariantArray))]
+/// All text fields; `enum_dispatch` forwards `TextField` calls to the
+/// concrete unit structs, and the strum discriminants provide stable ids.
+pub enum TextFieldEnum {
+    Title,
+    CleanBody,
+    StemmedTitle,
+    StemmedCleanBody,
+    AllBody,
+    Url,
+    UrlNoTokenizer,
+    UrlForSiteOperator,
+    SiteWithout,
+    Domain,
+    SiteNoTokenizer,
+    DomainNoTokenizer,
+    DomainNameNoTokenizer,
+    /// this field is only set if the webpage is the homepage for the site. Allows us to boost
+    SiteIfHomepageNoTokenizer,
+    /// this field is only set if the webpage is the homepage for the site. Allows us to boost
+    DomainIfHomepage,
+    /// this field is only set if the webpage is the homepage for the site. Allows us to boost
+    DomainNameIfHomepageNoTokenizer,
+    /// this field is only set if the webpage is the homepage for the site. Allows us to boost
+    DomainIfHomepageNoTokenizer,
+    /// this field is only set if the webpage is the homepage for the site. Allows us to boost
+    TitleIfHomepage,
+    BacklinkText,
+    Description,
+    DmozDescription,
+    SchemaOrgJson,
+    FlattenedSchemaOrgJson,
+    CleanBodyBigrams,
+    TitleBigrams,
+    CleanBodyTrigrams,
+    TitleTrigrams,
+    MicroformatTags,
+    /// can either be NSFW or SFW (see safety classifier)
+    SafetyClassification,
+    InsertionTimestamp,
+    RecipeFirstIngredientTagId,
+    Keywords,
+}
+
+// Maps each discriminant back to the corresponding unit struct; must list
+// the variants in the same order as the enum above.
+from_discriminant!(TextFieldEnumDiscriminants => TextFieldEnum,
+[
+    Title,
+    CleanBody,
+    StemmedTitle,
+    StemmedCleanBody,
+    AllBody,
+    Url,
+    UrlNoTokenizer,
+    UrlForSiteOperator,
+    SiteWithout,
+    Domain,
+    SiteNoTokenizer,
+    DomainNoTokenizer,
+    DomainNameNoTokenizer,
+    SiteIfHomepageNoTokenizer,
+    DomainIfHomepage,
+    DomainNameIfHomepageNoTokenizer,
+    DomainIfHomepageNoTokenizer,
+    TitleIfHomepage,
+    BacklinkText,
+    Description,
+    DmozDescription,
+    SchemaOrgJson,
+    FlattenedSchemaOrgJson,
+    CleanBodyBigrams,
+    TitleBigrams,
+    CleanBodyTrigrams,
+    TitleTrigrams,
+    MicroformatTags,
+    SafetyClassification,
+    InsertionTimestamp,
+    RecipeFirstIngredientTagId,
+    Keywords,
+]);
+
+impl TextFieldEnum {
+    /// Total number of text fields, derived from the strum discriminants so
+    /// it always tracks the enum definition.
+    pub fn num_variants() -> usize {
+        TextFieldEnumDiscriminants::VARIANTS.len()
+    }
+
+    /// Iterate over every text field in declaration order.
+    pub fn all() -> impl Iterator<Item = TextFieldEnum> {
+        TextFieldEnumDiscriminants::VARIANTS
+            .iter()
+            .copied()
+            .map(|v| v.into())
+    }
+
+    /// Look up a text field by its id (declaration index); `None` if out of range.
+    pub fn get(field_id: usize) -> Option<TextFieldEnum> {
+        TextFieldEnumDiscriminants::VARIANTS
+            .get(field_id)
+            .copied()
+            .map(TextFieldEnum::from)
+    }
+}
+
+impl InsertEnumMapKey for TextFieldEnum {
+    // Dense map key: the variant's discriminant index.
+    fn into_usize(self) -> usize {
+        TextFieldEnumDiscriminants::from(self) as usize
+    }
+}
+
+/// Pick the snowball stemmer for a detected language; languages without a
+/// stemmer fall back to English.
+fn stemmer_from_lang(lang: &Lang) -> rust_stemmers::Stemmer {
+    use rust_stemmers::Algorithm;
+
+    let algorithm = match lang {
+        Lang::Ara => Algorithm::Arabic,
+        Lang::Dan => Algorithm::Danish,
+        Lang::Nld => Algorithm::Dutch,
+        Lang::Fin => Algorithm::Finnish,
+        Lang::Fra => Algorithm::French,
+        Lang::Deu => Algorithm::German,
+        Lang::Ell => Algorithm::Greek,
+        Lang::Hun => Algorithm::Hungarian,
+        Lang::Ita => Algorithm::Italian,
+        Lang::Por => Algorithm::Portuguese,
+        Lang::Ron => Algorithm::Romanian,
+        Lang::Rus => Algorithm::Russian,
+        Lang::Spa => Algorithm::Spanish,
+        Lang::Swe => Algorithm::Swedish,
+        Lang::Tam => Algorithm::Tamil,
+        Lang::Tur => Algorithm::Turkish,
+        _ => Algorithm::English,
+    };
+
+    rust_stemmers::Stemmer::create(algorithm)
+}
+
+/// Stem every token in place using the stemmer for `lang`.
+fn stem_tokens(tokens: &mut [tantivy::tokenizer::Token], lang: Lang) {
+    let stemmer = stemmer_from_lang(&lang);
+    for token in tokens {
+        // TODO remove allocation
+        // `catch_unwind` guards against panics inside the stemmer; a token
+        // that makes the stemmer panic is left unstemmed.
+        if let Ok(stemmed_str) = std::panic::catch_unwind(|| stemmer.stem(&token.text).into_owned())
+        {
+            token.text.clear();
+            token.text.push_str(&stemmed_str);
+        }
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// The page title: searchable, stored, with positions.
+pub struct Title;
+impl TextField for Title {
+    fn name(&self) -> &str {
+        "title"
+    }
+
+    fn has_pos(&self) -> bool {
+        true
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Title pretokenization is cached as a `Result`; failure aborts
+        // indexing of this document by propagating the error.
+        doc.add_pre_tokenized_text(
+            self.tantivy_field(schema),
+            cache
+                .pretokenize_title()
+                .as_ref()
+                .map(Clone::clone)
+                .map_err(|e| anyhow::anyhow!("{}", e))?,
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// The cleaned body text; note the schema field name is "body".
+pub struct CleanBody;
+impl TextField for CleanBody {
+    fn name(&self) -> &str {
+        "body"
+    }
+
+    fn has_pos(&self) -> bool {
+        true
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_pre_tokenized_text(
+            self.tantivy_field(schema),
+            cache.pretokenize_clean_text().clone(),
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+/// The title with language-specific stemming applied to each token.
+pub struct StemmedTitle;
+impl TextField for StemmedTitle {
+    fn name(&self) -> &str {
+        "stemmed_title"
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::new_stemmed()
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let title = cache
+            .pretokenize_title()
+            .as_ref()
+            .map(Clone::clone)
+            .map_err(|e| anyhow::anyhow!("{}", e))?;
+        let mut tokens = title.tokens.clone();
+        // Stem with the detected page language, defaulting to English.
+        stem_tokens(&mut tokens, html.lang().copied().unwrap_or(Lang::Eng));
+
+        doc.add_pre_tokenized_text(
+            self.tantivy_field(schema),
+            PreTokenizedString {
+                text: title.text.clone(),
+                tokens,
+            },
+        );
+
+        Ok(())
+    }
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct StemmedCleanBody;
+impl TextField for StemmedCleanBody {
+    fn name(&self) -> &str {
+        "stemmed_body"
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::new_stemmed()
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let clean_text = cache.pretokenize_clean_text();
+        let mut tokens = clean_text.tokens.clone();
+        stem_tokens(&mut tokens, html.lang().copied().unwrap_or(Lang::Eng));
+
+        doc.add_pre_tokenized_text(
+            self.tantivy_field(schema),
+            PreTokenizedString {
+                text: clean_text.text.clone(),
+                tokens,
+            },
+        );
+
+        Ok(())
+    }
+}
+
+/// The pretokenized "all text" of the page (from the cache). Searchable.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct AllBody;
+impl TextField for AllBody {
+    fn name(&self) -> &str {
+        "all_body"
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let all_text = cache
+            .pretokenize_all_text()
+            .as_ref()
+            .map(Clone::clone)
+            .map_err(|e| anyhow::anyhow!("{}", e))?;
+
+        // NOTE(review): `all_text` is already an owned clone at this point,
+        // so the extra `.clone()` below looks redundant.
+        doc.add_pre_tokenized_text(self.tantivy_field(schema), all_text.clone());
+
+        Ok(())
+    }
+}
+
+/// The page URL, pretokenized. Indexed with positions, stored, searchable.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct Url;
+impl TextField for Url {
+    fn name(&self) -> &str {
+        "url"
+    }
+
+    fn has_pos(&self) -> bool {
+        true
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let url = cache.pretokenize_url();
+        doc.add_pre_tokenized_text(self.tantivy_field(schema), url.clone());
+
+        Ok(())
+    }
+}
+
+/// The full URL indexed as a single token (identity tokenizer). Searchable.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct UrlNoTokenizer;
+impl TextField for UrlNoTokenizer {
+    fn name(&self) -> &str {
+        "url_no_tokenizer"
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Identity(Identity {})
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let url = html.url().to_string();
+
+        // Hand-build a PreTokenizedString containing the whole URL as one token.
+        doc.add_pre_tokenized_text(
+            self.tantivy_field(schema),
+            PreTokenizedString {
+                text: url.clone(),
+                tokens: vec![tantivy::tokenizer::Token {
+                    offset_from: 0,
+                    offset_to: url.len(),
+                    position: 0,
+                    text: url,
+                    position_length: 1,
+                }],
+            },
+        );
+
+        Ok(())
+    }
+}
+
+/// The URL tokenized with the site-operator tokenizer, with positions.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct UrlForSiteOperator;
+impl TextField for UrlForSiteOperator {
+    fn name(&self) -> &str {
+        "url_for_site_operator"
+    }
+
+    fn has_pos(&self) -> bool {
+        true
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::SiteOperator(SiteOperatorUrlTokenizer)
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_pre_tokenized_text(
+            self.tantivy_field(schema),
+            cache.pretokenize_url_for_site_operator().clone(),
+        );
+
+        Ok(())
+    }
+}
+
+/// The site of the page, pretokenized, indexed with positions.
+/// (Tantivy field name is "site".)
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct SiteWithout;
+impl TextField for SiteWithout {
+    fn name(&self) -> &str {
+        "site"
+    }
+
+    fn has_pos(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_pre_tokenized_text(self.tantivy_field(schema), cache.pretokenize_site().clone());
+        Ok(())
+    }
+}
+
+/// The domain of the page, pretokenized, indexed with positions.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct Domain;
+impl TextField for Domain {
+    fn name(&self) -> &str {
+        "domain"
+    }
+
+    fn has_pos(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_pre_tokenized_text(
+            self.tantivy_field(schema),
+            cache.pretokenize_domain().clone(),
+        );
+
+        Ok(())
+    }
+}
+
+/// The site as a single token (identity tokenizer). Searchable.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct SiteNoTokenizer;
+impl TextField for SiteNoTokenizer {
+    fn name(&self) -> &str {
+        "site_no_tokenizer"
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Identity(Identity {})
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let site = cache.pretokenize_site();
+
+        // Whole site string as one token.
+        doc.add_pre_tokenized_text(
+            self.tantivy_field(schema),
+            PreTokenizedString {
+                text: site.text.clone(),
+                tokens: vec![tantivy::tokenizer::Token {
+                    offset_from: 0,
+                    offset_to: site.text.len(),
+                    position: 0,
+                    text: site.text.clone(),
+                    position_length: 1,
+                }],
+            },
+        );
+
+        Ok(())
+    }
+}
+
+/// The domain as a single token (identity tokenizer). Searchable.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct DomainNoTokenizer;
+impl TextField for DomainNoTokenizer {
+    fn name(&self) -> &str {
+        "domain_no_tokenizer"
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Identity(Identity {})
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let domain = cache.pretokenize_domain();
+
+        // Whole domain string as one token.
+        doc.add_pre_tokenized_text(
+            self.tantivy_field(schema),
+            PreTokenizedString {
+                text: domain.text.clone(),
+                tokens: vec![tantivy::tokenizer::Token {
+                    offset_from: 0,
+                    offset_to: domain.text.len(),
+                    position: 0,
+                    text: domain.text.clone(),
+                    position_length: 1,
+                }],
+            },
+        );
+
+        Ok(())
+    }
+}
+
+/// The domain name (from the cache) as a single token. Searchable.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct DomainNameNoTokenizer;
+impl TextField for DomainNameNoTokenizer {
+    fn name(&self) -> &str {
+        "domain_name_no_tokenizer"
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Identity(Identity {})
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let domain_name = cache.domain_name();
+
+        // Whole domain name as one token.
+        doc.add_pre_tokenized_text(
+            self.tantivy_field(schema),
+            PreTokenizedString {
+                text: domain_name.clone(),
+                tokens: vec![tantivy::tokenizer::Token {
+                    offset_from: 0,
+                    offset_to: domain_name.len(),
+                    position: 0,
+                    text: domain_name.clone(),
+                    position_length: 1,
+                }],
+            },
+        );
+
+        Ok(())
+    }
+}
+
+/// The site as a single identity token, but only when the page is the
+/// homepage of the site; otherwise an empty string is added.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct SiteIfHomepageNoTokenizer;
+impl TextField for SiteIfHomepageNoTokenizer {
+    fn name(&self) -> &str {
+        "site_if_homepage_no_tokenizer"
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Identity(Identity {})
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let site = cache.pretokenize_site();
+
+        if html.is_homepage() {
+            doc.add_pre_tokenized_text(
+                self.tantivy_field(schema),
+                PreTokenizedString {
+                    text: site.text.clone(),
+                    tokens: vec![tantivy::tokenizer::Token {
+                        offset_from: 0,
+                        offset_to: site.text.len(),
+                        position: 0,
+                        text: site.text.clone(),
+                        position_length: 1,
+                    }],
+                },
+            );
+        } else {
+            doc.add_text(self.tantivy_field(schema), "");
+        }
+
+        Ok(())
+    }
+}
+
+/// The domain text, but only populated when the page is a homepage;
+/// otherwise an empty string is added.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct DomainIfHomepage;
+impl TextField for DomainIfHomepage {
+    fn name(&self) -> &str {
+        "domain_if_homepage"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let domain = cache.pretokenize_domain();
+        if html.is_homepage() {
+            doc.add_text(self.tantivy_field(schema), domain.text.clone());
+        } else {
+            doc.add_text(self.tantivy_field(schema), "");
+        }
+
+        Ok(())
+    }
+}
+
+/// The domain name as a single identity token, only for homepages;
+/// otherwise an empty string is added.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct DomainNameIfHomepageNoTokenizer;
+impl TextField for DomainNameIfHomepageNoTokenizer {
+    fn name(&self) -> &str {
+        "domain_name_if_homepage_no_tokenizer"
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Identity(Identity {})
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let domain_name = cache.domain_name();
+
+        if html.is_homepage() {
+            doc.add_pre_tokenized_text(
+                self.tantivy_field(schema),
+                PreTokenizedString {
+                    text: domain_name.clone(),
+                    tokens: vec![tantivy::tokenizer::Token {
+                        offset_from: 0,
+                        offset_to: domain_name.len(),
+                        position: 0,
+                        text: domain_name.clone(),
+                        position_length: 1,
+                    }],
+                },
+            );
+        } else {
+            doc.add_text(self.tantivy_field(schema), "");
+        }
+
+        Ok(())
+    }
+}
+
+/// The domain as a single identity token, only for homepages;
+/// otherwise an empty string is added.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct DomainIfHomepageNoTokenizer;
+impl TextField for DomainIfHomepageNoTokenizer {
+    fn name(&self) -> &str {
+        "domain_if_homepage_no_tokenizer"
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Identity(Identity {})
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let domain = cache.pretokenize_domain();
+
+        if html.is_homepage() {
+            doc.add_pre_tokenized_text(
+                self.tantivy_field(schema),
+                PreTokenizedString {
+                    text: domain.text.clone(),
+                    tokens: vec![tantivy::tokenizer::Token {
+                        offset_from: 0,
+                        offset_to: domain.text.len(),
+                        position: 0,
+                        text: domain.text.clone(),
+                        position_length: 1,
+                    }],
+                },
+            );
+        } else {
+            doc.add_text(self.tantivy_field(schema), "");
+        }
+
+        Ok(())
+    }
+}
+
+/// The pretokenized title, only for homepages; otherwise an empty string.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct TitleIfHomepage;
+impl TextField for TitleIfHomepage {
+    fn name(&self) -> &str {
+        "title_if_homepage"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let title = cache
+            .pretokenize_title()
+            .as_ref()
+            .map(Clone::clone)
+            .map_err(|e| anyhow::anyhow!("{}", e))?;
+
+        if html.is_homepage() {
+            doc.add_pre_tokenized_text(self.tantivy_field(schema), title);
+        } else {
+            doc.add_text(self.tantivy_field(schema), "");
+        }
+
+        Ok(())
+    }
+}
+
+/// Anchor text of backlinks pointing at the page. Populated from the
+/// `Webpage` (not the HTML), joined with newlines; the HTML hook is a no-op.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct BacklinkText;
+impl TextField for BacklinkText {
+    fn name(&self) -> &str {
+        "backlink_text"
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &crate::webpage::Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_text(
+            self.tantivy_field(schema),
+            webpage.backlink_labels.join("\n"),
+        );
+
+        Ok(())
+    }
+}
+
+/// The page description, pretokenized. Indexed with positions, stored,
+/// and searchable.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct Description;
+impl TextField for Description {
+    fn name(&self) -> &str {
+        "description"
+    }
+
+    fn has_pos(&self) -> bool {
+        true
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let description = cache.pretokenize_description();
+        doc.add_pre_tokenized_text(self.tantivy_field(schema), description.clone());
+
+        Ok(())
+    }
+}
+
+/// The DMOZ description of the page, taken from the `Webpage`
+/// (empty when absent); the HTML hook is a no-op. Positions, stored.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct DmozDescription;
+impl TextField for DmozDescription {
+    fn name(&self) -> &str {
+        "dmoz_description"
+    }
+
+    fn has_pos(&self) -> bool {
+        true
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &crate::webpage::Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_text(
+            self.tantivy_field(schema),
+            webpage.dmoz_description().unwrap_or_default(),
+        );
+
+        Ok(())
+    }
+}
+
+/// Raw schema.org JSON for the page, indexed as one identity token
+/// and stored.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct SchemaOrgJson;
+impl TextField for SchemaOrgJson {
+    fn name(&self) -> &str {
+        "schema_org_json"
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Identity(Identity {})
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_text(self.tantivy_field(schema), cache.schema_json());
+
+        Ok(())
+    }
+}
+
+/// Flattened schema.org JSON, tokenized with the JSON tokenizer and
+/// indexed with positions.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct FlattenedSchemaOrgJson;
+impl TextField for FlattenedSchemaOrgJson {
+    fn name(&self) -> &str {
+        "flattened_schema_org_json"
+    }
+
+    fn has_pos(&self) -> bool {
+        true
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Json(JsonField)
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_pre_tokenized_text(
+            self.tantivy_field(schema),
+            cache.pretokenized_schema_json().clone(),
+        );
+
+        Ok(())
+    }
+}
+
+/// Bigrams of the clean body. Indexed with the bigram tokenizer but
+/// queried with the default tokenizer; `monogram_field` links back to
+/// the unigram `CleanBody` field. The raw clean text is added and
+/// tokenized at indexing time.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct CleanBodyBigrams;
+impl TextField for CleanBodyBigrams {
+    fn name(&self) -> &str {
+        "clean_body_bigrams"
+    }
+
+    fn ngram_size(&self) -> usize {
+        2
+    }
+
+    fn monogram_field(&self) -> TextFieldEnum {
+        CleanBody.into()
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Bigram(BigramTokenizer::default())
+    }
+
+    fn query_tokenizer(&self) -> Tokenizer {
+        Tokenizer::default()
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_text(
+            self.tantivy_field(schema),
+            html.clean_text().cloned().unwrap_or_default(),
+        );
+
+        Ok(())
+    }
+}
+
+/// Bigrams of the title. Same tokenizer split as `CleanBodyBigrams`;
+/// monogram counterpart is `Title`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct TitleBigrams;
+impl TextField for TitleBigrams {
+    fn name(&self) -> &str {
+        "title_bigrams"
+    }
+
+    fn ngram_size(&self) -> usize {
+        2
+    }
+
+    fn monogram_field(&self) -> TextFieldEnum {
+        Title.into()
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Bigram(BigramTokenizer::default())
+    }
+
+    fn query_tokenizer(&self) -> Tokenizer {
+        Tokenizer::default()
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let title = cache
+            .pretokenize_title()
+            .as_ref()
+            .map(Clone::clone)
+            .map_err(|e| anyhow::anyhow!("{}", e))?;
+
+        // Add the raw title text; the bigram tokenizer splits it at indexing time.
+        doc.add_text(self.tantivy_field(schema), title.text.clone());
+
+        Ok(())
+    }
+}
+
+/// Trigrams of the clean body; monogram counterpart is `CleanBody`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct CleanBodyTrigrams;
+impl TextField for CleanBodyTrigrams {
+    fn name(&self) -> &str {
+        "clean_body_trigrams"
+    }
+
+    fn ngram_size(&self) -> usize {
+        3
+    }
+
+    fn monogram_field(&self) -> TextFieldEnum {
+        CleanBody.into()
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Trigram(TrigramTokenizer::default())
+    }
+
+    fn query_tokenizer(&self) -> Tokenizer {
+        Tokenizer::default()
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        html: &Html,
+        _cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_text(
+            self.tantivy_field(schema),
+            html.clean_text().cloned().unwrap_or_default(),
+        );
+
+        Ok(())
+    }
+}
+
+/// Trigrams of the title; monogram counterpart is `Title`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct TitleTrigrams;
+impl TextField for TitleTrigrams {
+    fn name(&self) -> &str {
+        "title_trigrams"
+    }
+
+    fn ngram_size(&self) -> usize {
+        3
+    }
+
+    fn monogram_field(&self) -> TextFieldEnum {
+        Title.into()
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Trigram(TrigramTokenizer::default())
+    }
+
+    fn query_tokenizer(&self) -> Tokenizer {
+        Tokenizer::default()
+    }
+
+    fn is_searchable(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let title = cache
+            .pretokenize_title()
+            .as_ref()
+            .map(Clone::clone)
+            .map_err(|e| anyhow::anyhow!("{}", e))?;
+
+        doc.add_text(self.tantivy_field(schema), title.text.clone());
+
+        Ok(())
+    }
+}
+
+/// Microformat tags found on the page, pretokenized, with positions.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct MicroformatTags;
+impl TextField for MicroformatTags {
+    fn name(&self) -> &str {
+        "microformat_tags"
+    }
+
+    fn has_pos(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_pre_tokenized_text(
+            self.tantivy_field(schema),
+            cache.pretokenize_microformats().clone(),
+        );
+
+        Ok(())
+    }
+}
+
+/// Safety-classification label of the page, from the `Webpage`
+/// (empty when unclassified); indexed as one identity token.
+/// The HTML hook is a no-op.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct SafetyClassification;
+impl TextField for SafetyClassification {
+    fn name(&self) -> &str {
+        "safety_classification"
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Identity(Identity {})
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &crate::webpage::Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        let safety = webpage
+            .safety_classification
+            .map(|label| label.to_string())
+            .unwrap_or_default();
+
+        doc.add_text(self.tantivy_field(schema), safety);
+
+        Ok(())
+    }
+}
+
+/// The time the page was inserted into the index, stored as an indexed
+/// tantivy DateTime. Populated from the `Webpage`; the HTML hook is a no-op.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct InsertionTimestamp;
+impl TextField for InsertionTimestamp {
+    fn name(&self) -> &str {
+        "insertion_timestamp"
+    }
+
+    // NOTE(review): this is a DateTime field (see indexing_option), so the
+    // tokenizer choice should be inert here — confirm the trait requires it.
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Identity(Identity {})
+    }
+
+    fn indexing_option(&self) -> IndexingOption {
+        IndexingOption::DateTime(tantivy::schema::DateOptions::default().set_indexed())
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &crate::webpage::Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        // Bridge the webpage timestamp into tantivy's DateTime via unix
+        // seconds; `?` propagates an out-of-range timestamp as an error.
+        doc.add_date(
+            self.tantivy_field(schema),
+            tantivy::DateTime::from_utc(OffsetDateTime::from_unix_timestamp(
+                webpage.inserted_at.timestamp(),
+            )?),
+        );
+
+        Ok(())
+    }
+}
+
+/// Tag id of the first recipe ingredient on the page (empty when absent),
+/// indexed as one identity token and stored.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct RecipeFirstIngredientTagId;
+impl TextField for RecipeFirstIngredientTagId {
+    fn name(&self) -> &str {
+        "recipe_first_ingredient_tag_id"
+    }
+
+    fn indexing_tokenizer(&self) -> Tokenizer {
+        Tokenizer::Identity(Identity {})
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        cache: &mut FnCache,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_text(
+            self.tantivy_field(schema),
+            cache.first_ingredient_tag_id().cloned().unwrap_or_default(),
+        );
+
+        Ok(())
+    }
+}
+
+/// Keywords of the page, from the `Webpage`, joined with newlines; stored.
+/// The HTML hook is a no-op.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub struct Keywords;
+impl TextField for Keywords {
+    fn name(&self) -> &str {
+        "keywords"
+    }
+
+    fn is_stored(&self) -> bool {
+        true
+    }
+
+    fn add_html_tantivy(
+        &self,
+        _html: &Html,
+        _cache: &mut FnCache,
+        _doc: &mut TantivyDocument,
+        _schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        Ok(())
+    }
+
+    fn add_webpage_tantivy(
+        &self,
+        webpage: &crate::webpage::Webpage,
+        doc: &mut TantivyDocument,
+        schema: &tantivy::schema::Schema,
+    ) -> Result<()> {
+        doc.add_text(self.tantivy_field(schema), webpage.keywords.join("\n"));
+
+        Ok(())
+    }
+}

+ 3 - 3
crates/core/src/searcher/api/mod.rs

@@ -32,7 +32,7 @@ use crate::inverted_index::RetrievedWebpage;
 use crate::models::dual_encoder::DualEncoder;
 use crate::ranking::models::cross_encoder::CrossEncoderModel;
 use crate::ranking::pipeline::{PrecisionRankingWebpage, RankableWebpage, RecallRankingWebpage};
-use crate::ranking::ALL_SIGNALS;
+use crate::ranking::Signal;
 use crate::search_prettifier::{DisplayedSidebar, DisplayedWebpage, HighlightedSpellCorrection};
 use crate::web_spell::SpellChecker;
 use crate::widgets::{Widget, Widgets};
@@ -145,9 +145,9 @@ pub fn combine_results(
 }
 pub fn add_ranking_signals(websites: &mut [DisplayedWebpage], pointers: &[ScoredWebpagePointer]) {
     for (website, pointer) in websites.iter_mut().zip(pointers.iter()) {
-        let mut signals = HashMap::with_capacity(ALL_SIGNALS.len());
+        let mut signals = HashMap::new();
 
-        for signal in ALL_SIGNALS {
+        for signal in Signal::all() {
             if let Some(signal_value) = pointer.as_ranking().signals.get(signal) {
                 signals.insert(signal, *signal_value);
             }

+ 2 - 2
crates/core/src/searcher/local.rs

@@ -29,7 +29,7 @@ use crate::ranking::inbound_similarity::InboundSimilarity;
 use crate::ranking::models::lambdamart::LambdaMART;
 use crate::ranking::models::linear::LinearRegression;
 use crate::ranking::pipeline::{PrecisionRankingWebpage, RankingPipeline, RecallRankingWebpage};
-use crate::ranking::{query_centrality, Ranker, Signal, SignalAggregator, ALL_SIGNALS};
+use crate::ranking::{query_centrality, Ranker, Signal, SignalAggregator};
 use crate::search_ctx::Ctx;
 use crate::search_prettifier::DisplayedWebpage;
 use crate::webgraph::Node;
@@ -409,7 +409,7 @@ where
         for (webpage, ranking) in webpages.iter_mut().zip(top_websites) {
             let mut ranking_signals = HashMap::new();
 
-            for signal in ALL_SIGNALS {
+            for signal in Signal::all() {
                 if let Some(score) = ranking.ranking.signals.get(signal) {
                     ranking_signals.insert(signal, *score);
                 }

+ 136 - 0
crates/core/src/webpage/html/fn_cache.rs

@@ -0,0 +1,136 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+use crate::{webpage::schema_org, Result};
+use tantivy::tokenizer::PreTokenizedString;
+
+use super::{find_recipe_first_ingredient_tag_id, Html};
+
+#[macro_export]
+macro_rules! cache {
+    ($($fn:ident -> $res:ty),*$(,)?) => {
+        /// Dynamically compute the different webpage functions
+        /// and cache the results for subsequent calls.
+        ///
+        /// Used during indexing as some of the fields use
+        /// the same data from the webpage and we don't want to
+        /// recompute the same data multiple times.
+        pub struct FnCache<'a> {
+            html: &'a Html,
+            first_ingredient_tag_id: Option<String>,
+            schema_json: Option<String>,
+            pretokenized_schema_json: Option<PreTokenizedString>,
+            $($fn: Option<$res>,)*
+        }
+
+        impl<'a> FnCache<'a> {
+            /// Create a new instance of the IndexingCacher
+            pub fn new(html: &'a Html) -> Self {
+                Self {
+                    html,
+                    first_ingredient_tag_id: None,
+                    schema_json: None,
+                    pretokenized_schema_json: None,
+                    $($fn: None,)*
+                }
+            }
+
+            $(
+                /// Compute $fn from webpage and cache the result
+                pub fn $fn(&mut self) -> &$res {
+                    if self.$fn.is_none() {
+                        self.$fn = Some(self.html.$fn());
+                    }
+
+                    self.$fn.as_ref().unwrap()
+                }
+            )*
+        }
+    };
+}
+
+cache! {
+    pretokenize_title -> Result<PreTokenizedString>,
+    pretokenize_all_text -> Result<PreTokenizedString>,
+    pretokenize_clean_text -> PreTokenizedString,
+    pretokenize_url -> PreTokenizedString,
+    pretokenize_url_for_site_operator -> PreTokenizedString,
+    pretokenize_domain -> PreTokenizedString,
+    pretokenize_site -> PreTokenizedString,
+    pretokenize_description -> PreTokenizedString,
+    pretokenize_microformats -> PreTokenizedString,
+    domain_name -> String,
+    schema_org -> Vec<schema_org::Item>,
+    site_hash -> [u64; 2],
+    url_without_query_hash -> [u64; 2],
+    url_hash -> [u64; 2],
+    url_without_tld_hash -> [u64; 2],
+    domain_hash -> [u64; 2],
+    title_hash -> [u64; 2],
+}
+
+/// Some manual implementations so we can use previously cached data
+/// to compute the next field.
+impl<'a> FnCache<'a> {
+    pub fn first_ingredient_tag_id(&mut self) -> Option<&String> {
+        if self.first_ingredient_tag_id.is_none() {
+            let root = self.html.root.clone(); // Node is just a NodeRef, so it's cheap to clone
+
+            self.first_ingredient_tag_id =
+                find_recipe_first_ingredient_tag_id(self.schema_org().as_slice(), &root);
+        }
+
+        self.first_ingredient_tag_id.as_ref()
+    }
+
+    pub fn schema_json(&mut self) -> &String {
+        if self.schema_json.is_none() {
+            self.schema_json = Some(serde_json::to_string(self.schema_org()).unwrap());
+        }
+
+        self.schema_json.as_ref().unwrap()
+    }
+
+    pub fn pretokenized_schema_json(&mut self) -> &PreTokenizedString {
+        if self.pretokenized_schema_json.is_none() {
+            self.pretokenized_schema_json =
+                match schema_org::flattened_json(self.schema_org().clone()) {
+                    Ok(mut f) => {
+                        let mut tokens = Vec::new();
+
+                        {
+                            let mut stream = f.token_stream();
+
+                            while let Some(token) = stream.next() {
+                                tokens.push(token.clone());
+                            }
+                        }
+
+                        Some(PreTokenizedString {
+                            text: f.text().to_string(),
+                            tokens,
+                        })
+                    }
+                    Err(_) => Some(PreTokenizedString {
+                        text: String::new(),
+                        tokens: Vec::new(),
+                    }),
+                };
+        }
+
+        self.pretokenized_schema_json.as_ref().unwrap()
+    }
+}

+ 70 - 461
crates/core/src/webpage/html/into_tantivy.rs

@@ -1,5 +1,5 @@
 // Stract is an open source web search engine.
 // Stract is an open source web search engine.
-// Copyright (C) 2023 Stract ApS
+// Copyright (C) 2024 Stract ApS
 //
 //
 // This program is free software: you can redistribute it and/or modify
 // This program is free software: you can redistribute it and/or modify
 // it under the terms of the GNU Affero General Public License as
 // it under the terms of the GNU Affero General Public License as
@@ -18,8 +18,12 @@ use crate::{
     ceil_char_boundary,
     ceil_char_boundary,
     prehashed::hash,
     prehashed::hash,
     rake::RakeModel,
     rake::RakeModel,
-    schema::{FastField, TextField},
-    simhash, split_u128, tokenizer,
+    schema::{
+        fast_field::FastField,
+        text_field::{self, TextField},
+        TextFieldEnum,
+    },
+    split_u128, tokenizer,
     webpage::url_ext::UrlExt,
     webpage::url_ext::UrlExt,
     Error, Result,
     Error, Result,
 };
 };
@@ -29,12 +33,12 @@ use tantivy::{
 };
 };
 use whatlang::Lang;
 use whatlang::Lang;
 
 
-use super::{find_recipe_first_ingredient_tag_id, schema_org, Html};
+use super::{fn_cache::FnCache, Html};
 
 
-use crate::schema::{Field, FLOAT_SCALING};
+use crate::schema::Field;
 
 
 impl Html {
 impl Html {
-    fn pretokenize_title(&self) -> Result<PreTokenizedString> {
+    pub fn pretokenize_title(&self) -> Result<PreTokenizedString> {
         let title = self.title();
         let title = self.title();
 
 
         if title.is_none() {
         if title.is_none() {
@@ -42,10 +46,10 @@ impl Html {
         }
         }
         let title = title.unwrap();
         let title = title.unwrap();
 
 
-        Ok(self.pretokenize_string(title, TextField::Title))
+        Ok(self.pretokenize_string(title, text_field::Title.into()))
     }
     }
 
 
-    fn pretokenize_all_text(&self) -> Result<PreTokenizedString> {
+    pub fn pretokenize_all_text(&self) -> Result<PreTokenizedString> {
         let all_text = self.all_text();
         let all_text = self.all_text();
 
 
         if all_text.is_none() {
         if all_text.is_none() {
@@ -53,38 +57,45 @@ impl Html {
         }
         }
         let all_text = all_text.unwrap();
         let all_text = all_text.unwrap();
 
 
-        Ok(self.pretokenize_string(all_text, TextField::AllBody))
+        Ok(self.pretokenize_string(all_text, text_field::AllBody.into()))
     }
     }
 
 
-    fn pretokenize_clean_text(&self) -> PreTokenizedString {
+    pub fn pretokenize_clean_text(&self) -> PreTokenizedString {
         let clean_text = self.clean_text().cloned().unwrap_or_default();
         let clean_text = self.clean_text().cloned().unwrap_or_default();
-        self.pretokenize_string(clean_text, TextField::CleanBody)
+        self.pretokenize_string(clean_text, text_field::CleanBody.into())
     }
     }
 
 
-    fn pretokenize_url(&self) -> PreTokenizedString {
+    pub fn pretokenize_url(&self) -> PreTokenizedString {
         let url = self.url().to_string();
         let url = self.url().to_string();
-        self.pretokenize_string(url, TextField::Url)
+        self.pretokenize_string(url, text_field::Url.into())
     }
     }
 
 
-    fn pretokenize_domain(&self) -> PreTokenizedString {
+    pub fn pretokenize_url_for_site_operator(&self) -> PreTokenizedString {
+        self.pretokenize_string_with(
+            self.url().to_string(),
+            tokenizer::Tokenizer::SiteOperator(tokenizer::SiteOperatorUrlTokenizer),
+        )
+    }
+
+    pub fn pretokenize_domain(&self) -> PreTokenizedString {
         let domain = self.url().root_domain().unwrap_or_default().to_string();
         let domain = self.url().root_domain().unwrap_or_default().to_string();
 
 
-        self.pretokenize_string(domain, TextField::Domain)
+        self.pretokenize_string(domain, text_field::Domain.into())
     }
     }
 
 
-    fn pretokenize_site(&self) -> PreTokenizedString {
+    pub fn pretokenize_site(&self) -> PreTokenizedString {
         let site = self.url().normalized_host().unwrap_or_default().to_string();
         let site = self.url().normalized_host().unwrap_or_default().to_string();
 
 
-        self.pretokenize_string(site, TextField::SiteWithout)
+        self.pretokenize_string(site, text_field::SiteWithout.into())
     }
     }
 
 
-    fn pretokenize_description(&self) -> PreTokenizedString {
+    pub fn pretokenize_description(&self) -> PreTokenizedString {
         let text = self.description().unwrap_or_default();
         let text = self.description().unwrap_or_default();
 
 
-        self.pretokenize_string(text, TextField::Description)
+        self.pretokenize_string(text, text_field::Description.into())
     }
     }
 
 
-    fn pretokenize_microformats(&self) -> PreTokenizedString {
+    pub fn pretokenize_microformats(&self) -> PreTokenizedString {
         let mut text = String::new();
         let mut text = String::new();
 
 
         for microformat in self.microformats().iter() {
         for microformat in self.microformats().iter() {
@@ -92,10 +103,10 @@ impl Html {
             text.push(' ');
             text.push(' ');
         }
         }
 
 
-        self.pretokenize_string(text, TextField::MicroformatTags)
+        self.pretokenize_string(text, text_field::MicroformatTags.into())
     }
     }
 
 
-    fn pretokenize_string(&self, text: String, field: TextField) -> PreTokenizedString {
+    fn pretokenize_string(&self, text: String, field: TextFieldEnum) -> PreTokenizedString {
         self.pretokenize_string_with(text, field.indexing_tokenizer())
         self.pretokenize_string_with(text, field.indexing_tokenizer())
     }
     }
 
 
@@ -118,6 +129,17 @@ impl Html {
         PreTokenizedString { text, tokens }
         PreTokenizedString { text, tokens }
     }
     }
 
 
+    pub fn domain_name(&self) -> String {
+        let domain = self.url().domain().unwrap_or_default();
+        self.url()
+            .root_domain()
+            .unwrap_or_default()
+            .find('.')
+            .map(|index| &domain[..ceil_char_boundary(&domain, index).min(domain.len())])
+            .unwrap_or_default()
+            .to_string()
+    }
+
     pub fn keywords(&self, rake: &RakeModel) -> Vec<String> {
     pub fn keywords(&self, rake: &RakeModel) -> Vec<String> {
         self.clean_text()
         self.clean_text()
             .map(|text| {
             .map(|text| {
@@ -129,70 +151,18 @@ impl Html {
             .unwrap_or_default()
             .unwrap_or_default()
     }
     }
 
 
-    pub fn as_tantivy(&self, schema: &tantivy::schema::Schema) -> Result<TantivyDocument> {
-        let mut doc = TantivyDocument::new();
-
-        let title = self.pretokenize_title()?;
-        let all_text = self.pretokenize_all_text()?;
-        let clean_text = self.pretokenize_clean_text();
-        let url = self.pretokenize_url();
-        let domain = self.pretokenize_domain();
-        let site = self.pretokenize_site();
-        let description = self.pretokenize_description();
-        let microformats = self.pretokenize_microformats();
-        let url_for_site_operator = self.pretokenize_string_with(
-            self.url().to_string(),
-            tokenizer::Tokenizer::SiteOperator(tokenizer::SiteOperatorUrlTokenizer),
-        );
-
-        let domain_name = self
-            .url()
-            .root_domain()
-            .unwrap_or_default()
-            .find('.')
-            .map(|index| {
-                &domain.text[..ceil_char_boundary(&domain.text, index).min(domain.text.len())]
-            })
-            .unwrap_or_default()
-            .to_string();
-
-        let schemas: Vec<_> = self.schema_org();
-        let first_ingredient_tag_id =
-            find_recipe_first_ingredient_tag_id(&schemas, &self.root).unwrap_or_default();
-
-        let schema_json = serde_json::to_string(&schemas).ok().unwrap_or_default();
-
-        let pretokenized_schema_json = match schema_org::flattened_json(schemas) {
-            Ok(mut f) => {
-                let mut tokens = Vec::new();
-
-                {
-                    let mut stream = f.token_stream();
-
-                    while let Some(token) = stream.next() {
-                        tokens.push(token.clone());
-                    }
-                }
-
-                PreTokenizedString {
-                    text: f.text().to_string(),
-                    tokens,
-                }
-            }
-            Err(_) => PreTokenizedString {
-                text: String::new(),
-                tokens: Vec::new(),
-            },
-        };
-
-        let site_hash = split_u128(hash(self.url().normalized_host().unwrap_or_default()).0);
+    pub fn site_hash(&self) -> [u64; 2] {
+        split_u128(hash(self.url().normalized_host().unwrap_or_default()).0)
+    }
 
 
+    pub fn url_without_query_hash(&self) -> [u64; 2] {
         let mut url_without_query = self.url().clone();
         let mut url_without_query = self.url().clone();
         url_without_query.set_query(None);
         url_without_query.set_query(None);
 
 
-        let url_without_query_hash = split_u128(hash(url_without_query.as_str()).0);
-        let url_hash = split_u128(hash(self.url().as_str()).0);
+        split_u128(hash(url_without_query.as_str()).0)
+    }
 
 
+    pub fn url_without_tld_hash(&self) -> [u64; 2] {
         let tld = self.url().tld().unwrap_or_default();
         let tld = self.url().tld().unwrap_or_default();
         let url_without_tld = self
         let url_without_tld = self
             .url()
             .url()
@@ -205,396 +175,35 @@ impl Html {
             + "?"
             + "?"
             + self.url().query().unwrap_or_default();
             + self.url().query().unwrap_or_default();
 
 
-        let url_without_tld_hash = split_u128(hash(url_without_tld).0);
+        split_u128(hash(url_without_tld).0)
+    }
+
+    pub fn url_hash(&self) -> [u64; 2] {
+        split_u128(hash(self.url().as_str()).0)
+    }
 
 
-        let domain_hash = split_u128(hash(self.url().root_domain().unwrap_or_default()).0);
-        let title_hash = split_u128(hash(self.title().unwrap_or_default()).0);
+    pub fn domain_hash(&self) -> [u64; 2] {
+        split_u128(hash(self.url().root_domain().unwrap_or_default()).0)
+    }
+
+    pub fn title_hash(&self) -> [u64; 2] {
+        split_u128(hash(self.title().unwrap_or_default()).0)
+    }
+
+    pub fn as_tantivy(&self, schema: &tantivy::schema::Schema) -> Result<TantivyDocument> {
+        let mut doc = TantivyDocument::new();
+        let mut cache = FnCache::new(self);
 
 
         for field in schema
         for field in schema
             .fields()
             .fields()
             .filter_map(|(field, _)| Field::get(field.field_id() as usize))
             .filter_map(|(field, _)| Field::get(field.field_id() as usize))
         {
         {
-            let tantivy_field = schema
-                .get_field(field.name())
-                .unwrap_or_else(|_| panic!("Unknown field: {}", field.name()));
-
             match field {
             match field {
-                Field::Text(TextField::Title) => {
-                    doc.add_pre_tokenized_text(tantivy_field, title.clone())
-                }
-                Field::Text(TextField::StemmedTitle) => {
-                    let mut tokens = title.tokens.clone();
-                    stem_tokens(&mut tokens, self.lang.unwrap_or(Lang::Eng));
-
-                    doc.add_pre_tokenized_text(
-                        tantivy_field,
-                        PreTokenizedString {
-                            text: title.text.clone(),
-                            tokens,
-                        },
-                    );
-                }
-                Field::Text(TextField::CleanBody) => {
-                    doc.add_pre_tokenized_text(tantivy_field, clean_text.clone())
-                }
-                Field::Text(TextField::StemmedCleanBody) => {
-                    let mut tokens = clean_text.tokens.clone();
-                    stem_tokens(&mut tokens, self.lang.unwrap_or(Lang::Eng));
-
-                    doc.add_pre_tokenized_text(
-                        tantivy_field,
-                        PreTokenizedString {
-                            text: clean_text.text.clone(),
-                            tokens,
-                        },
-                    );
-                }
-                Field::Text(TextField::CleanBodyBigrams) => {
-                    doc.add_text(
-                        tantivy_field,
-                        self.clean_text().cloned().unwrap_or_default(),
-                    );
-                }
-                Field::Text(TextField::CleanBodyTrigrams) => {
-                    doc.add_text(
-                        tantivy_field,
-                        self.clean_text().cloned().unwrap_or_default(),
-                    );
-                }
-                Field::Text(TextField::TitleBigrams) => {
-                    doc.add_text(tantivy_field, title.text.clone());
-                }
-                Field::Text(TextField::TitleTrigrams) => {
-                    doc.add_text(tantivy_field, title.text.clone());
-                }
-                Field::Text(TextField::Description) => {
-                    doc.add_pre_tokenized_text(tantivy_field, description.clone());
-                }
-                Field::Text(TextField::Url) => {
-                    doc.add_pre_tokenized_text(tantivy_field, url.clone())
-                }
-                Field::Text(TextField::UrlForSiteOperator) => {
-                    doc.add_pre_tokenized_text(tantivy_field, url_for_site_operator.clone())
-                }
-                Field::Text(TextField::UrlNoTokenizer) => {
-                    let url = self.url().to_string();
-
-                    doc.add_pre_tokenized_text(
-                        tantivy_field,
-                        PreTokenizedString {
-                            text: url.clone(),
-                            tokens: vec![tantivy::tokenizer::Token {
-                                offset_from: 0,
-                                offset_to: url.len(),
-                                position: 0,
-                                text: url,
-                                position_length: 1,
-                            }],
-                        },
-                    );
-                }
-                Field::Text(TextField::SiteWithout) => {
-                    doc.add_pre_tokenized_text(tantivy_field, site.clone())
-                }
-                Field::Text(TextField::Domain) => {
-                    doc.add_pre_tokenized_text(tantivy_field, domain.clone())
-                }
-                Field::Text(TextField::SiteNoTokenizer) => doc.add_pre_tokenized_text(
-                    tantivy_field,
-                    PreTokenizedString {
-                        text: site.text.clone(),
-                        tokens: vec![tantivy::tokenizer::Token {
-                            offset_from: 0,
-                            offset_to: site.text.len(),
-                            position: 0,
-                            text: site.text.clone(),
-                            position_length: 1,
-                        }],
-                    },
-                ),
-                Field::Text(TextField::SiteIfHomepageNoTokenizer) => {
-                    if self.is_homepage() {
-                        doc.add_pre_tokenized_text(
-                            tantivy_field,
-                            PreTokenizedString {
-                                text: site.text.clone(),
-                                tokens: vec![tantivy::tokenizer::Token {
-                                    offset_from: 0,
-                                    offset_to: site.text.len(),
-                                    position: 0,
-                                    text: site.text.clone(),
-                                    position_length: 1,
-                                }],
-                            },
-                        )
-                    } else {
-                        doc.add_text(tantivy_field, "");
-                    }
-                }
-                Field::Text(TextField::DomainNoTokenizer) => doc.add_pre_tokenized_text(
-                    tantivy_field,
-                    PreTokenizedString {
-                        text: domain.text.clone(),
-                        tokens: vec![tantivy::tokenizer::Token {
-                            offset_from: 0,
-                            offset_to: domain.text.len(),
-                            position: 0,
-                            text: domain.text.clone(),
-                            position_length: 1,
-                        }],
-                    },
-                ),
-                Field::Text(TextField::TitleIfHomepage) => {
-                    if self.is_homepage() {
-                        doc.add_pre_tokenized_text(tantivy_field, title.clone());
-                    } else {
-                        doc.add_text(tantivy_field, "");
-                    }
-                }
-                Field::Text(TextField::DomainIfHomepage) => {
-                    if self.is_homepage() {
-                        doc.add_text(tantivy_field, domain.text.clone());
-                    } else {
-                        doc.add_text(tantivy_field, "");
-                    }
-                }
-                Field::Text(TextField::DomainNameNoTokenizer) => {
-                    doc.add_pre_tokenized_text(
-                        tantivy_field,
-                        PreTokenizedString {
-                            text: domain_name.to_string(),
-                            tokens: vec![tantivy::tokenizer::Token {
-                                offset_from: 0,
-                                offset_to: domain_name.len(),
-                                position: 0,
-                                text: domain_name.to_string(),
-                                position_length: 1,
-                            }],
-                        },
-                    );
-                }
-                Field::Text(TextField::DomainNameIfHomepageNoTokenizer) => {
-                    if self.is_homepage() {
-                        doc.add_pre_tokenized_text(
-                            tantivy_field,
-                            PreTokenizedString {
-                                text: domain_name.to_string(),
-                                tokens: vec![tantivy::tokenizer::Token {
-                                    offset_from: 0,
-                                    offset_to: domain_name.len(),
-                                    position: 0,
-                                    text: domain_name.to_string(),
-                                    position_length: 1,
-                                }],
-                            },
-                        );
-                    } else {
-                        doc.add_text(tantivy_field, "");
-                    }
-                }
-                Field::Text(TextField::DomainIfHomepageNoTokenizer) => {
-                    if self.is_homepage() {
-                        doc.add_pre_tokenized_text(
-                            tantivy_field,
-                            PreTokenizedString {
-                                text: domain.text.clone(),
-                                tokens: vec![tantivy::tokenizer::Token {
-                                    offset_from: 0,
-                                    offset_to: domain.text.len(),
-                                    position: 0,
-                                    text: domain.text.clone(),
-                                    position_length: 1,
-                                }],
-                            },
-                        );
-                    } else {
-                        doc.add_text(tantivy_field, "");
-                    }
-                }
-                Field::Text(TextField::AllBody) => {
-                    doc.add_pre_tokenized_text(tantivy_field, all_text.clone())
-                }
-                Field::Text(TextField::RecipeFirstIngredientTagId) => {
-                    doc.add_text(tantivy_field, first_ingredient_tag_id.clone());
-                }
-                Field::Text(TextField::SchemaOrgJson) => {
-                    doc.add_text(tantivy_field, schema_json.clone());
-                }
-                Field::Text(TextField::FlattenedSchemaOrgJson) => {
-                    doc.add_pre_tokenized_text(tantivy_field, pretokenized_schema_json.clone());
-                }
-                Field::Text(TextField::MicroformatTags) => {
-                    doc.add_pre_tokenized_text(tantivy_field, microformats.clone());
-                }
-                Field::Fast(FastField::IsHomepage) => {
-                    doc.add_u64(tantivy_field, (self.is_homepage()).into());
-                }
-                Field::Fast(FastField::LastUpdated) => doc.add_u64(
-                    tantivy_field,
-                    self.updated_time()
-                        .map_or(0, |time| time.timestamp().max(0) as u64),
-                ),
-                Field::Fast(FastField::TrackerScore) => {
-                    doc.add_u64(tantivy_field, self.trackers().len() as u64)
-                }
-                Field::Fast(FastField::NumUrlTokens) => {
-                    doc.add_u64(tantivy_field, url.tokens.len() as u64)
-                }
-                Field::Fast(FastField::NumMicroformatTagsTokens) => {
-                    doc.add_u64(tantivy_field, microformats.tokens.len() as u64)
-                }
-                Field::Fast(FastField::NumTitleTokens) => {
-                    doc.add_u64(tantivy_field, title.tokens.len() as u64)
-                }
-                Field::Fast(FastField::NumCleanBodyTokens) => {
-                    doc.add_u64(tantivy_field, clean_text.tokens.len() as u64)
-                }
-                Field::Fast(FastField::NumDescriptionTokens) => {
-                    doc.add_u64(tantivy_field, description.tokens.len() as u64)
-                }
-                Field::Fast(FastField::NumUrlForSiteOperatorTokens) => {
-                    doc.add_u64(tantivy_field, url_for_site_operator.tokens.len() as u64)
-                }
-                Field::Fast(FastField::NumDomainTokens) => {
-                    doc.add_u64(tantivy_field, domain.tokens.len() as u64)
-                }
-                Field::Fast(FastField::NumFlattenedSchemaTokens) => {
-                    doc.add_u64(tantivy_field, pretokenized_schema_json.tokens.len() as u64)
-                }
-                Field::Fast(FastField::SiteHash1) => {
-                    doc.add_u64(tantivy_field, site_hash[0]);
-                }
-                Field::Fast(FastField::SiteHash2) => {
-                    doc.add_u64(tantivy_field, site_hash[1]);
-                }
-                Field::Fast(FastField::UrlWithoutQueryHash1) => {
-                    doc.add_u64(tantivy_field, url_without_query_hash[0]);
-                }
-                Field::Fast(FastField::UrlWithoutQueryHash2) => {
-                    doc.add_u64(tantivy_field, url_without_query_hash[1]);
-                }
-                Field::Fast(FastField::UrlHash1) => {
-                    doc.add_u64(tantivy_field, url_hash[0]);
-                }
-                Field::Fast(FastField::UrlHash2) => {
-                    doc.add_u64(tantivy_field, url_hash[1]);
-                }
-                Field::Fast(FastField::UrlWithoutTldHash1) => {
-                    doc.add_u64(tantivy_field, url_without_tld_hash[0]);
-                }
-                Field::Fast(FastField::UrlWithoutTldHash2) => {
-                    doc.add_u64(tantivy_field, url_without_tld_hash[1]);
-                }
-                Field::Fast(FastField::DomainHash1) => {
-                    doc.add_u64(tantivy_field, domain_hash[0]);
-                }
-                Field::Fast(FastField::DomainHash2) => {
-                    doc.add_u64(tantivy_field, domain_hash[1]);
-                }
-                Field::Fast(FastField::TitleHash1) => {
-                    doc.add_u64(tantivy_field, title_hash[0]);
-                }
-                Field::Fast(FastField::TitleHash2) => {
-                    doc.add_u64(tantivy_field, title_hash[1]);
-                }
-                Field::Fast(FastField::SimHash) => {
-                    let hash = if !clean_text.text.is_empty() {
-                        simhash::hash(&clean_text.text)
-                    } else {
-                        0
-                    };
-                    doc.add_u64(tantivy_field, hash);
-                }
-                Field::Fast(FastField::NumPathAndQuerySlashes) => {
-                    let num_slashes = self
-                        .url()
-                        .path_segments()
-                        .map(|segments| segments.count())
-                        .unwrap_or(0);
-
-                    doc.add_u64(tantivy_field, num_slashes as u64);
-                }
-                Field::Fast(FastField::NumPathAndQueryDigits) => {
-                    let num_digits = self
-                        .url()
-                        .path()
-                        .chars()
-                        .filter(|c| c.is_ascii_digit())
-                        .count()
-                        + self
-                            .url()
-                            .query()
-                            .unwrap_or_default()
-                            .chars()
-                            .filter(|c| c.is_ascii_digit())
-                            .count();
-
-                    doc.add_u64(tantivy_field, num_digits as u64);
-                }
-                Field::Fast(FastField::LikelyHasAds) => {
-                    doc.add_u64(tantivy_field, self.likely_has_ads() as u64);
-                }
-                Field::Fast(FastField::LikelyHasPaywall) => {
-                    doc.add_u64(tantivy_field, self.likely_has_paywall() as u64);
-                }
-                Field::Fast(FastField::LinkDensity) => {
-                    doc.add_u64(
-                        tantivy_field,
-                        (self.link_density() * FLOAT_SCALING as f64) as u64,
-                    );
-                }
-                Field::Text(TextField::BacklinkText)
-                | Field::Text(TextField::SafetyClassification)
-                | Field::Text(TextField::InsertionTimestamp)
-                | Field::Fast(FastField::HostCentrality)
-                | Field::Fast(FastField::HostCentralityRank)
-                | Field::Fast(FastField::PageCentrality)
-                | Field::Fast(FastField::PageCentralityRank)
-                | Field::Fast(FastField::FetchTimeMs)
-                | Field::Fast(FastField::PreComputedScore)
-                | Field::Fast(FastField::Region)
-                | Field::Fast(FastField::HostNodeID)
-                | Field::Fast(FastField::TitleEmbeddings)
-                | Field::Fast(FastField::KeywordEmbeddings)
-                | Field::Text(TextField::Keywords)
-                | Field::Text(TextField::DmozDescription) => {}
+                Field::Text(f) => f.add_html_tantivy(self, &mut cache, &mut doc, schema)?,
+                Field::Fast(f) => f.add_html_tantivy(self, &mut cache, &mut doc, schema)?,
             }
             }
         }
         }
 
 
         Ok(doc)
         Ok(doc)
     }
     }
 }
 }
-
-fn stemmer_from_lang(lang: &Lang) -> rust_stemmers::Stemmer {
-    match lang {
-        Lang::Ara => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Arabic),
-        Lang::Dan => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Danish),
-        Lang::Nld => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Dutch),
-        Lang::Fin => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Finnish),
-        Lang::Fra => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::French),
-        Lang::Deu => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::German),
-        Lang::Ell => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Greek),
-        Lang::Hun => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Hungarian),
-        Lang::Ita => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Italian),
-        Lang::Por => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Portuguese),
-        Lang::Ron => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Romanian),
-        Lang::Rus => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Russian),
-        Lang::Spa => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Spanish),
-        Lang::Swe => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Swedish),
-        Lang::Tam => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Tamil),
-        Lang::Tur => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::Turkish),
-        _ => rust_stemmers::Stemmer::create(rust_stemmers::Algorithm::English),
-    }
-}
-
-fn stem_tokens(tokens: &mut [tantivy::tokenizer::Token], lang: Lang) {
-    let stemmer = stemmer_from_lang(&lang);
-    for token in tokens {
-        // TODO remove allocation
-        if let Ok(stemmed_str) = std::panic::catch_unwind(|| stemmer.stem(&token.text).into_owned())
-        {
-            token.text.clear();
-            token.text.push_str(&stemmed_str);
-        }
-    }
-}

+ 13 - 15
crates/core/src/webpage/html/microformats.rs

@@ -13,7 +13,7 @@
 //
 //
 // You should have received a copy of the GNU Affero General Public License
 // You should have received a copy of the GNU Affero General Public License
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.
-use crate::{enum_map::EnumSet, Result};
+use crate::enum_map::{EnumSet, GetEnumMapKey, InsertEnumMapKey};
 
 
 use super::Html;
 use super::Html;
 
 
@@ -49,9 +49,9 @@ impl Microformat {
     }
     }
 }
 }
 
 
-impl From<Microformat> for usize {
-    fn from(value: Microformat) -> Self {
-        match value {
+impl InsertEnumMapKey for Microformat {
+    fn into_usize(self) -> usize {
+        match self {
             Microformat::HCard => 0,
             Microformat::HCard => 0,
             Microformat::HEvent => 1,
             Microformat::HEvent => 1,
             Microformat::HEntry => 2,
             Microformat::HEntry => 2,
@@ -62,18 +62,16 @@ impl From<Microformat> for usize {
     }
     }
 }
 }
 
 
-impl TryFrom<usize> for Microformat {
-    type Error = anyhow::Error;
-
-    fn try_from(value: usize) -> Result<Self> {
+impl GetEnumMapKey for Microformat {
+    fn from_usize(value: usize) -> Option<Self> {
         match value {
         match value {
-            0 => Ok(Microformat::HCard),
-            1 => Ok(Microformat::HEvent),
-            2 => Ok(Microformat::HEntry),
-            3 => Ok(Microformat::HRecipe),
-            4 => Ok(Microformat::HReview),
-            5 => Ok(Microformat::HProduct),
-            _ => Err(anyhow::anyhow!("Unknown microformat")),
+            0 => Some(Microformat::HCard),
+            1 => Some(Microformat::HEvent),
+            2 => Some(Microformat::HEntry),
+            3 => Some(Microformat::HRecipe),
+            4 => Some(Microformat::HReview),
+            5 => Some(Microformat::HProduct),
+            _ => None,
         }
         }
     }
     }
 }
 }

+ 3 - 0
crates/core/src/webpage/html/mod.rs

@@ -28,6 +28,9 @@ use super::{adservers::AD_SERVERS, schema_org, Meta, Script};
 
 
 use super::url_ext::UrlExt;
 use super::url_ext::UrlExt;
 
 
+pub use fn_cache::FnCache;
+
+mod fn_cache;
 mod into_tantivy;
 mod into_tantivy;
 mod links;
 mod links;
 mod microformats;
 mod microformats;

+ 7 - 4
crates/core/src/webpage/html/robots_meta.rs

@@ -16,7 +16,10 @@
 
 
 use std::str::FromStr;
 use std::str::FromStr;
 
 
-use crate::{enum_map::EnumSet, Error, Result};
+use crate::{
+    enum_map::{EnumSet, InsertEnumMapKey},
+    Error, Result,
+};
 
 
 use super::Html;
 use super::Html;
 
 
@@ -38,9 +41,9 @@ impl FromStr for RobotsMeta {
     }
     }
 }
 }
 
 
-impl From<RobotsMeta> for usize {
-    fn from(val: RobotsMeta) -> Self {
-        match val {
+impl InsertEnumMapKey for RobotsMeta {
+    fn into_usize(self) -> usize {
+        match self {
             RobotsMeta::NoIndex => 0,
             RobotsMeta::NoIndex => 0,
             RobotsMeta::NoFollow => 1,
             RobotsMeta::NoFollow => 1,
         }
         }

+ 8 - 169
crates/core/src/webpage/mod.rs

@@ -15,7 +15,7 @@
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.
 // along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 
 use crate::{
 use crate::{
-    schema::{FastField, TextField},
+    schema::{fast_field::FastField, text_field::TextField, Field},
     webgraph::NodeID,
     webgraph::NodeID,
     Result,
     Result,
 };
 };
@@ -23,15 +23,11 @@ use candle_core::Tensor;
 use chrono::{DateTime, Utc};
 use chrono::{DateTime, Utc};
 
 
 use std::collections::HashMap;
 use std::collections::HashMap;
-use tantivy::{time::OffsetDateTime, TantivyDocument};
+use tantivy::TantivyDocument;
 use url::Url;
 use url::Url;
 
 
-use crate::schema::{Field, FLOAT_SCALING};
-
-use self::region::Region;
-
 mod adservers;
 mod adservers;
-mod html;
+pub mod html;
 mod just_text;
 mod just_text;
 pub mod region;
 pub mod region;
 pub mod safety_classifier;
 pub mod safety_classifier;
@@ -114,7 +110,7 @@ impl Webpage {
         })
         })
     }
     }
 
 
-    fn dmoz_description(&self) -> Option<String> {
+    pub fn dmoz_description(&self) -> Option<String> {
         self.dmoz_description.as_ref().and_then(|desc| {
         self.dmoz_description.as_ref().and_then(|desc| {
             if !self.html.metadata().iter().any(|metadata| {
             if !self.html.metadata().iter().any(|metadata| {
                 if let Some(content) = metadata.get(&"content".to_string()) {
                 if let Some(content) = metadata.get(&"content".to_string()) {
@@ -131,172 +127,15 @@ impl Webpage {
     }
     }
 
 
     pub fn as_tantivy(&self, schema: &tantivy::schema::Schema) -> Result<TantivyDocument> {
     pub fn as_tantivy(&self, schema: &tantivy::schema::Schema) -> Result<TantivyDocument> {
-        let region = Region::guess_from(self);
-
-        let dmoz_description = self.dmoz_description();
-
         let mut doc = self.html.as_tantivy(schema)?;
         let mut doc = self.html.as_tantivy(schema)?;
 
 
-        if let Ok(region) = region {
-            doc.add_u64(
-                schema
-                    .get_field(Field::Fast(FastField::Region).name())
-                    .expect("Failed to get region field"),
-                region.id(),
-            );
-        } else {
-            doc.add_u64(
-                schema
-                    .get_field(Field::Fast(FastField::Region).name())
-                    .expect("Failed to get region field"),
-                Region::All.id(),
-            );
-        }
-
-        let backlink_text: String =
-            itertools::intersperse(self.backlink_labels.clone(), "\n".to_string()).collect();
-
-        doc.add_text(
-            schema
-                .get_field(Field::Text(TextField::BacklinkText).name())
-                .expect("Failed to get backlink-text field"),
-            backlink_text,
-        );
-
-        doc.add_text(
-            schema
-                .get_field(Field::Text(TextField::Keywords).name())
-                .expect("Failed to get keywords field"),
-            self.keywords.join("\n"),
-        );
-
-        doc.add_date(
-            schema
-                .get_field(Field::Text(TextField::InsertionTimestamp).name())
-                .expect("Failed to get insertion-timestamp field"),
-            tantivy::DateTime::from_utc(OffsetDateTime::from_unix_timestamp(
-                self.inserted_at.timestamp(),
-            )?),
-        );
-
-        let safety = self
-            .safety_classification
-            .map(|label| label.to_string())
-            .unwrap_or_default();
-
-        doc.add_text(
-            schema
-                .get_field(Field::Text(TextField::SafetyClassification).name())
-                .expect("Failed to get safety_classification field"),
-            safety,
-        );
-
-        doc.add_u64(
-            schema
-                .get_field(Field::Fast(FastField::HostCentrality).name())
-                .expect("Failed to get host_centrality field"),
-            (self.host_centrality * FLOAT_SCALING as f64) as u64,
-        );
-
-        doc.add_u64(
-            schema
-                .get_field(Field::Fast(FastField::HostCentralityRank).name())
-                .expect("Failed to get host_centrality_rank field"),
-            self.host_centrality_rank,
-        );
-
-        doc.add_u64(
-            schema
-                .get_field(Field::Fast(FastField::PageCentrality).name())
-                .expect("Failed to get page_centrality field"),
-            (self.page_centrality * FLOAT_SCALING as f64) as u64,
-        );
-
-        doc.add_u64(
-            schema
-                .get_field(Field::Fast(FastField::PageCentralityRank).name())
-                .expect("Failed to get page_centrality_rank field"),
-            self.page_centrality_rank,
-        );
-
-        doc.add_u64(
-            schema
-                .get_field(Field::Fast(FastField::FetchTimeMs).name())
-                .expect("Failed to get fetch_time_ms field"),
-            self.fetch_time_ms,
-        );
-
-        doc.add_u64(
-            schema
-                .get_field(Field::Fast(FastField::PreComputedScore).name())
-                .expect("failed to get pre_computed_score field"),
-            (self.pre_computed_score * FLOAT_SCALING as f64) as u64,
-        );
-
-        if let Some(emb) = &self.title_embedding {
-            let mut serialized = Vec::new();
-            emb.write_bytes(&mut serialized)?;
-
-            doc.add_bytes(
-                schema
-                    .get_field(Field::Fast(FastField::TitleEmbeddings).name())
-                    .expect("Failed to get title_embeddings field"),
-                serialized,
-            );
-        } else {
-            doc.add_bytes(
-                schema
-                    .get_field(Field::Fast(FastField::TitleEmbeddings).name())
-                    .expect("Failed to get title_embeddings field"),
-                Vec::new(),
-            );
-        }
-
-        if let Some(emb) = &self.keyword_embedding {
-            let mut serialized = Vec::new();
-            emb.write_bytes(&mut serialized)?;
-
-            doc.add_bytes(
-                schema
-                    .get_field(Field::Fast(FastField::KeywordEmbeddings).name())
-                    .expect("Failed to get keyword_embeddings field"),
-                serialized,
-            );
-        } else {
-            doc.add_bytes(
-                schema
-                    .get_field(Field::Fast(FastField::KeywordEmbeddings).name())
-                    .expect("Failed to get keyword_embeddings field"),
-                Vec::new(),
-            );
-        }
-
-        match &self.node_id {
-            Some(node_id) => {
-                doc.add_u64(
-                    schema
-                        .get_field(Field::Fast(FastField::HostNodeID).name())
-                        .expect("Failed to get node_id field"),
-                    node_id.as_u64(),
-                );
-            }
-            None => {
-                doc.add_u64(
-                    schema
-                        .get_field(Field::Fast(FastField::HostNodeID).name())
-                        .expect("Failed to get node_id field"),
-                    u64::MAX,
-                );
+        for field in Field::all() {
+            match field {
+                Field::Fast(f) => f.add_webpage_tantivy(self, &mut doc, schema)?,
+                Field::Text(f) => f.add_webpage_tantivy(self, &mut doc, schema)?,
             }
             }
         }
         }
 
 
-        doc.add_text(
-            schema
-                .get_field(Field::Text(TextField::DmozDescription).name())
-                .expect("failed to get dmoz_description field"),
-            dmoz_description.unwrap_or_default(),
-        );
-
         Ok(doc)
         Ok(doc)
     }
     }
 }
 }