Переглянути джерело

Very simple WAL built on top of file-store primitives (#219)

Doesn't handle concurrent writes and flushes after each write. This will cause a lot of fsync's which will impact performance, but as this will be used for the live index where each item (a full webpage) is quite large, this will hopefully not be too detrimental.
Mikkel Denker 11 місяців тому
батько
коміт
365ed02813

+ 9 - 0
Cargo.lock

@@ -5042,6 +5042,15 @@ version = "2.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1de1d4f81173b03af4c0cbed3c898f6bff5b870e4a7f5d6f4057d62a7a4b686e"
 
+[[package]]
+name = "simple_wal"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "bincode",
+ "file_store",
+]
+
 [[package]]
 name = "siphasher"
 version = "0.3.11"

+ 1 - 0
Cargo.toml

@@ -15,6 +15,7 @@ members = [
   "crates/tantivy",
   "crates/leechy",
   "crates/common",
+  "crates/simple-wal",
   "fuzz",
 ]
 resolver = "2"

+ 2 - 1
assets/licenses.html

@@ -46,9 +46,9 @@
         <ul class="licenses-overview">
             <li><a href="#Apache-2.0">Apache License 2.0</a> (428)</li>
             <li><a href="#MIT">MIT License</a> (187)</li>
+            <li><a href="#AGPL-3.0">GNU Affero General Public License v3.0</a> (9)</li>
             <li><a href="#BSD-3-Clause">BSD 3-Clause &quot;New&quot; or &quot;Revised&quot; License</a> (9)</li>
             <li><a href="#MPL-2.0">Mozilla Public License 2.0</a> (9)</li>
-            <li><a href="#AGPL-3.0">GNU Affero General Public License v3.0</a> (8)</li>
             <li><a href="#ISC">ISC License</a> (4)</li>
             <li><a href="#Unicode-3.0">Unicode License v3</a> (4)</li>
             <li><a href="#BSD-2-Clause">BSD 2-Clause &quot;Simplified&quot; License</a> (3)</li>
@@ -74,6 +74,7 @@
                     <li><a href=" https://crates.io/crates/lending-iter ">lending-iter 0.1.0</a></li>
                     <li><a href=" https://crates.io/crates/ownedbytes ">ownedbytes 0.1.0</a></li>
                     <li><a href=" https://crates.io/crates/robotstxt ">robotstxt 0.1.0</a></li>
+                    <li><a href=" https://crates.io/crates/simple_wal ">simple_wal 0.1.0</a></li>
                     <li><a href=" https://crates.io/crates/speedy_kv ">speedy_kv 0.1.0</a></li>
                 </ul>
                 <pre class="license-text">GNU AFFERO GENERAL PUBLIC LICENSE

+ 2 - 28
crates/core/src/lib.rs

@@ -33,7 +33,8 @@ use distributed::{
     cluster::Cluster,
     member::{Member, Service},
 };
-use std::{cmp::Reverse, path::PathBuf, sync::Arc};
+pub use file_store::gen_temp_path;
+use std::{cmp::Reverse, sync::Arc};
 use thiserror::Error;
 
 pub mod entrypoint;
@@ -139,33 +140,6 @@ pub enum Error {
 
 pub type Result<T, E = anyhow::Error> = std::result::Result<T, E>;
 
-// taken from https://docs.rs/sled/0.34.7/src/sled/config.rs.html#445
-pub fn gen_temp_path() -> PathBuf {
-    use std::sync::atomic::{AtomicUsize, Ordering};
-    use std::time::SystemTime;
-
-    static SALT_COUNTER: AtomicUsize = AtomicUsize::new(0);
-
-    let seed = SALT_COUNTER.fetch_add(1, Ordering::SeqCst) as u128;
-
-    let now = SystemTime::now()
-        .duration_since(SystemTime::UNIX_EPOCH)
-        .unwrap()
-        .as_nanos()
-        << 48;
-
-    let pid = u128::from(std::process::id());
-
-    let salt = (pid << 16) + now + seed;
-
-    if cfg!(target_os = "linux") {
-        // use shared memory for temporary linux files
-        format!("/dev/shm/pagecache.tmp.{salt}").into()
-    } else {
-        std::env::temp_dir().join(format!("pagecache.tmp.{salt}"))
-    }
-}
-
 /// Starts a gossip cluster in the background and returns a handle to it.
 /// This is useful for blocking contexts where there is no runtime to spawn the cluster on.
 pub fn start_gossip_cluster_thread(config: GossipConfig, service: Option<Service>) -> Arc<Cluster> {

+ 6 - 2
crates/file-store/src/iterable.rs

@@ -87,10 +87,8 @@ where
     next_start: u64,
     _marker: std::marker::PhantomData<T>,
 }
-
 impl<T, W> IterableStoreWriter<T, W>
 where
-    T: bincode::Encode,
     W: io::Write,
 {
     pub fn new(writer: W) -> Self {
@@ -100,7 +98,13 @@ where
             next_start: 0,
         }
     }
+}
 
+impl<T, W> IterableStoreWriter<T, W>
+where
+    T: bincode::Encode,
+    W: io::Write,
+{
     pub fn write(&mut self, item: &T) -> Result<WrittenOffset> {
         let serialized = bincode::encode_to_vec(item, common::bincode_config())?;
         let header = IterableHeader {

+ 27 - 0
crates/file-store/src/lib.rs

@@ -25,3 +25,30 @@ pub mod random_lookup;
 
 pub use const_serializable::ConstSerializable;
 pub use peekable::Peekable;
+
+// taken from https://docs.rs/sled/0.34.7/src/sled/config.rs.html#445
+pub fn gen_temp_path() -> std::path::PathBuf {
+    use std::sync::atomic::{AtomicUsize, Ordering};
+    use std::time::SystemTime;
+
+    static SALT_COUNTER: AtomicUsize = AtomicUsize::new(0);
+
+    let seed = SALT_COUNTER.fetch_add(1, Ordering::SeqCst) as u128;
+
+    let now = SystemTime::now()
+        .duration_since(SystemTime::UNIX_EPOCH)
+        .unwrap()
+        .as_nanos()
+        << 48;
+
+    let pid = u128::from(std::process::id());
+
+    let salt = (pid << 16) + now + seed;
+
+    if cfg!(target_os = "linux") {
+        // use shared memory for temporary linux files
+        format!("/dev/shm/pagecache.tmp.{salt}").into()
+    } else {
+        std::env::temp_dir().join(format!("pagecache.tmp.{salt}"))
+    }
+}

+ 12 - 0
crates/simple-wal/Cargo.toml

@@ -0,0 +1,12 @@
+[package]
+edition = "2021"
+license = "AGPL-3.0"
+name = "simple_wal"
+version = "0.1.0"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+anyhow.workspace = true
+bincode.workspace = true
+file_store = {path = "../file-store"}

+ 151 - 0
crates/simple-wal/src/lib.rs

@@ -0,0 +1,151 @@
+// Stract is an open source web search engine.
+// Copyright (C) 2024 Stract ApS
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU Affero General Public License as
+// published by the Free Software Foundation, either version 3 of the
+// License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU Affero General Public License for more details.
+//
+// You should have received a copy of the GNU Affero General Public License
+// along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+use anyhow::Result;
+use std::{
+    fs::File,
+    path::{Path, PathBuf},
+};
+
+pub struct Wal<T> {
+    writer: file_store::iterable::IterableStoreWriter<T, File>,
+    path: PathBuf,
+}
+
+impl<T> Wal<T> {
+    pub fn open<P: AsRef<Path>>(file: P) -> Result<Self> {
+        let path = file.as_ref().to_path_buf();
+        let file = if file.as_ref().exists() {
+            File::open(file)?
+        } else {
+            File::create(file)?
+        };
+
+        Ok(Wal {
+            writer: file_store::iterable::IterableStoreWriter::new(file),
+            path,
+        })
+    }
+
+    pub fn clear(&mut self) -> Result<()> {
+        std::fs::remove_file(&self.path)?;
+        self.writer = file_store::iterable::IterableStoreWriter::new(File::create(&self.path)?);
+
+        Ok(())
+    }
+}
+
+impl<T> Wal<T>
+where
+    T: bincode::Encode,
+{
+    pub fn write(&mut self, item: &T) -> Result<()> {
+        self.writer.write(item)?;
+        self.writer.flush()?;
+
+        Ok(())
+    }
+}
+
+impl<T> Wal<T>
+where
+    T: bincode::Decode,
+{
+    pub fn iter(&self) -> Result<WalIterator<T>> {
+        WalIterator::open(&self.path)
+    }
+}
+
+pub struct WalIterator<T> {
+    iter: file_store::iterable::IterableStoreReader<T>,
+}
+
+impl<T> WalIterator<T> {
+    pub fn open<P: AsRef<Path>>(file: P) -> Result<Self> {
+        Ok(Self {
+            iter: file_store::iterable::IterableStoreReader::open(file)?,
+        })
+    }
+}
+
+impl<T> Iterator for WalIterator<T>
+where
+    T: bincode::Decode,
+{
+    type Item = T;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.iter.next()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use anyhow::Result;
+
+    #[test]
+    fn test_write_read() -> Result<()> {
+        let mut writer = Wal::open(file_store::gen_temp_path())?;
+
+        writer.write(&1u64)?;
+        writer.write(&2u64)?;
+        writer.write(&3u64)?;
+
+        let res: Vec<_> = writer.iter()?.collect();
+
+        assert_eq!(&res, &[1, 2, 3]);
+
+        writer.write(&4u64)?;
+
+        let res: Vec<_> = writer.iter()?.collect();
+
+        assert_eq!(&res, &[1, 2, 3, 4]);
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_empty_write() -> Result<()> {
+        let writer: Wal<u64> = Wal::open(file_store::gen_temp_path())?;
+
+        let res: Vec<_> = writer.iter()?.collect();
+
+        assert!(res.is_empty());
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_clear() -> Result<()> {
+        let mut writer = Wal::open(file_store::gen_temp_path())?;
+
+        writer.write(&1u64)?;
+        writer.write(&2u64)?;
+        writer.write(&3u64)?;
+
+        let res: Vec<_> = writer.iter()?.collect();
+
+        assert_eq!(&res, &[1, 2, 3]);
+
+        writer.clear()?;
+
+        let res: Vec<_> = writer.iter()?.collect();
+        assert!(res.is_empty());
+
+        Ok(())
+    }
+}

+ 0 - 4
frontend/src/routes/sorry/+page.svelte

@@ -19,7 +19,3 @@
     <Button title="Verify challenge" type="submit" kind="accent">VERIFY</Button>
   </div>
 </Captcha>
-
-<!-- <audio controls="controls">
-  <source src={audioBase64} />
-</audio> -->