Browse Source

Use URL path to store locally so that we can easily get a local path from a URL

Daoud Clarke 3 years ago
parent
commit
a8a6c67239
3 changed files with 17 additions and 8 deletions
  1. 2 2
      mwmbl/background.py
  2. 15 5
      mwmbl/indexer/batch_cache.py
  3. 0 1
      mwmbl/main.py

+ 2 - 2
mwmbl/background.py

@@ -6,7 +6,7 @@ from pathlib import Path
 from time import sleep
 
 from mwmbl.indexer import historical
-from mwmbl.indexer.batch_repo import BatchCache
+from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.paths import INDEX_PATH, BATCH_DIR_NAME
 from mwmbl.indexer.preprocess import run_preprocessing
 from mwmbl.indexer.update_pages import run_update
@@ -15,7 +15,7 @@ logger = getLogger(__name__)
 
 
 def run(data_path: str):
-    historical.run()
+    # historical.run()
     index_path = Path(data_path) / INDEX_PATH
     batch_cache = BatchCache(Path(data_path) / BATCH_DIR_NAME)
     while True:

+ 15 - 5
mwmbl/indexer/batch_cache.py

@@ -7,7 +7,9 @@ import gzip
 import json
 import os
 from multiprocessing.pool import ThreadPool
+from pathlib import Path
 from tempfile import NamedTemporaryFile
+from urllib.parse import urlparse
 
 from pydantic import ValidationError
 
@@ -24,10 +26,6 @@ class BatchCache:
         os.makedirs(repo_path, exist_ok=True)
         self.path = repo_path
 
-    def store(self, batch: HashedBatch):
-        with NamedTemporaryFile(mode='w', dir=self.path, prefix='batch_', suffix='.json', delete=False) as output_file:
-            output_file.write(batch.json())
-
     def get(self, num_batches) -> dict[str, HashedBatch]:
         batches = {}
         for path in os.listdir(self.path):
@@ -65,5 +63,17 @@ class BatchCache:
             print("Failed to validate batch", data)
             return 0
         if len(batch.items) > 0:
-            self.store(batch)
+            self.store(batch, url)
         return len(batch.items)
+
+    def store(self, batch, url):  # Write `batch` as gzip-compressed JSON to the local cache path derived from `url`.
+        path = self.get_path_from_url(url)  # local file location is computed from the URL, so it can be recomputed from the URL later
+        print("Path", path)  # debug output of the target path
+        os.makedirs(path.parent, exist_ok=True)  # create any missing parent directories; no error if they already exist
+        with open(path, 'wb') as output_file:  # binary write mode: an existing file at this path is overwritten
+            data = gzip.compress(batch.json().encode('utf8'))  # presumably `batch` is a pydantic HashedBatch whose .json() returns a str - confirm against caller
+            output_file.write(data)  # the whole compressed payload is written in a single call
+
+    def get_path_from_url(self, url) -> Path:  # Map `url` to a file path underneath self.path using only the URL's path component.
+        url_path = urlparse(url).path  # scheme, host, query and fragment are not used
+        return Path(self.path) / url_path.lstrip('/')  # leading '/' is stripped so the join stays relative to self.path; NOTE(review): '..' segments are not normalised here - confirm URLs are trusted

+ 0 - 1
mwmbl/main.py

@@ -9,7 +9,6 @@ import uvicorn
 from fastapi import FastAPI
 
 from mwmbl import background
-from mwmbl.indexer import historical, retrieve, preprocess, update_pages
 from mwmbl.crawler.app import router as crawler_router
 from mwmbl.indexer.paths import INDEX_NAME
 from mwmbl.tinysearchengine import search