From a8a6c6723985b8a86c9968ccb266b132f6a63941 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Wed, 20 Jul 2022 22:21:35 +0100
Subject: [PATCH] Use URL path to store locally so that we can easily get a local path from a URL

---
 mwmbl/background.py          |  4 ++--
 mwmbl/indexer/batch_cache.py | 20 +++++++++++++++-----
 mwmbl/main.py                |  1 -
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/mwmbl/background.py b/mwmbl/background.py
index 3e6df0d..7ff2edb 100644
--- a/mwmbl/background.py
+++ b/mwmbl/background.py
@@ -6,7 +6,7 @@ from pathlib import Path
 from time import sleep
 
 from mwmbl.indexer import historical
-from mwmbl.indexer.batch_repo import BatchCache
+from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.paths import INDEX_PATH, BATCH_DIR_NAME
 from mwmbl.indexer.preprocess import run_preprocessing
 from mwmbl.indexer.update_pages import run_update
@@ -15,7 +15,7 @@ logger = getLogger(__name__)
 
 
 def run(data_path: str):
-    historical.run()
+    # historical.run()
     index_path = Path(data_path) / INDEX_PATH
     batch_cache = BatchCache(Path(data_path) / BATCH_DIR_NAME)
     while True:
diff --git a/mwmbl/indexer/batch_cache.py b/mwmbl/indexer/batch_cache.py
index c012212..f811578 100644
--- a/mwmbl/indexer/batch_cache.py
+++ b/mwmbl/indexer/batch_cache.py
@@ -7,7 +7,9 @@ import gzip
 import json
 import os
 from multiprocessing.pool import ThreadPool
+from pathlib import Path
 from tempfile import NamedTemporaryFile
+from urllib.parse import urlparse
 
 from pydantic import ValidationError
 
@@ -24,10 +26,6 @@ class BatchCache:
         os.makedirs(repo_path, exist_ok=True)
         self.path = repo_path
 
-    def store(self, batch: HashedBatch):
-        with NamedTemporaryFile(mode='w', dir=self.path, prefix='batch_', suffix='.json', delete=False) as output_file:
-            output_file.write(batch.json())
-
     def get(self, num_batches) -> dict[str, HashedBatch]:
         batches = {}
         for path in os.listdir(self.path):
@@ -65,5 +63,17 @@ class BatchCache:
             print("Failed to validate batch", data)
             return 0
         if len(batch.items) > 0:
-            self.store(batch)
+            self.store(batch, url)
             return len(batch.items)
+
+    def store(self, batch, url):
+        path = self.get_path_from_url(url)
+        print("Path", path)
+        os.makedirs(path.parent, exist_ok=True)
+        with open(path, 'wb') as output_file:
+            data = gzip.compress(batch.json().encode('utf8'))
+            output_file.write(data)
+
+    def get_path_from_url(self, url) -> Path:
+        url_path = urlparse(url).path
+        return Path(self.path) / url_path.lstrip('/')
diff --git a/mwmbl/main.py b/mwmbl/main.py
index ac2fade..9896cc9 100644
--- a/mwmbl/main.py
+++ b/mwmbl/main.py
@@ -9,7 +9,6 @@ import uvicorn
 from fastapi import FastAPI
 
 from mwmbl import background
-from mwmbl.indexer import historical, retrieve, preprocess, update_pages
 from mwmbl.crawler.app import router as crawler_router
 from mwmbl.indexer.paths import INDEX_NAME
 from mwmbl.tinysearchengine import search