Use the URL path as the local storage path, so that a local path can easily be derived from a URL

This commit is contained in:
Daoud Clarke 2022-07-20 22:21:35 +01:00
parent 0d1e7d841c
commit a8a6c67239
3 changed files with 17 additions and 8 deletions

View file

@ -6,7 +6,7 @@ from pathlib import Path
from time import sleep
from mwmbl.indexer import historical
from mwmbl.indexer.batch_repo import BatchCache
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_PATH, BATCH_DIR_NAME
from mwmbl.indexer.preprocess import run_preprocessing
from mwmbl.indexer.update_pages import run_update
@ -15,7 +15,7 @@ logger = getLogger(__name__)
def run(data_path: str):
historical.run()
# historical.run()
index_path = Path(data_path) / INDEX_PATH
batch_cache = BatchCache(Path(data_path) / BATCH_DIR_NAME)
while True:

View file

@ -7,7 +7,9 @@ import gzip
import json
import os
from multiprocessing.pool import ThreadPool
from pathlib import Path
from tempfile import NamedTemporaryFile
from urllib.parse import urlparse
from pydantic import ValidationError
@ -24,10 +26,6 @@ class BatchCache:
os.makedirs(repo_path, exist_ok=True)
self.path = repo_path
def store(self, batch: HashedBatch):
    """Persist *batch* as JSON in a uniquely named file in the cache directory.

    The file is created with a random `batch_*.json` name via
    NamedTemporaryFile and kept on disk (delete=False).
    """
    serialized = batch.json()
    with NamedTemporaryFile(mode='w', dir=self.path, prefix='batch_',
                            suffix='.json', delete=False) as handle:
        handle.write(serialized)
def get(self, num_batches) -> dict[str, HashedBatch]:
batches = {}
for path in os.listdir(self.path):
@ -65,5 +63,17 @@ class BatchCache:
print("Failed to validate batch", data)
return 0
if len(batch.items) > 0:
self.store(batch)
self.store(batch, url)
return len(batch.items)
def store(self, batch, url):
    """Persist *batch* (a HashedBatch) as gzip-compressed JSON at the local
    path derived from *url*.

    Parent directories mirroring the URL path are created as needed, so the
    on-disk layout matches the remote URL structure and a local path can be
    recovered from a URL later.
    """
    path = self.get_path_from_url(url)
    # Removed leftover debug print of the target path; callers rely only on
    # the file side effect.
    os.makedirs(path.parent, exist_ok=True)
    with open(path, 'wb') as output_file:
        output_file.write(gzip.compress(batch.json().encode('utf8')))
def get_path_from_url(self, url) -> Path:
    """Map a batch *url* to its storage location under the cache directory.

    The URL's path component (minus its leading slash) becomes a path
    relative to ``self.path``.
    """
    # NOTE(review): a URL whose path contains '..' segments could resolve
    # outside self.path — confirm upstream URLs are trusted.
    relative = urlparse(url).path.lstrip('/')
    return Path(self.path) / relative

View file

@ -9,7 +9,6 @@ import uvicorn
from fastapi import FastAPI
from mwmbl import background
from mwmbl.indexer import historical, retrieve, preprocess, update_pages
from mwmbl.crawler.app import router as crawler_router
from mwmbl.indexer.paths import INDEX_NAME
from mwmbl.tinysearchengine import search