Use the URL path as the local storage path, so that a local path can easily be derived from a URL

This commit is contained in:
Daoud Clarke 2022-07-20 22:21:35 +01:00
parent 0d1e7d841c
commit a8a6c67239
3 changed files with 17 additions and 8 deletions

View file

@ -6,7 +6,7 @@ from pathlib import Path
from time import sleep
from mwmbl.indexer import historical
from mwmbl.indexer.batch_repo import BatchCache
from mwmbl.indexer.batch_cache import BatchCache
from mwmbl.indexer.paths import INDEX_PATH, BATCH_DIR_NAME
from mwmbl.indexer.preprocess import run_preprocessing
from mwmbl.indexer.update_pages import run_update
@ -15,7 +15,7 @@ logger = getLogger(__name__)
def run(data_path: str):
historical.run()
# historical.run()
index_path = Path(data_path) / INDEX_PATH
batch_cache = BatchCache(Path(data_path) / BATCH_DIR_NAME)
while True:

View file

@ -7,7 +7,9 @@ import gzip
import json
import os
from multiprocessing.pool import ThreadPool
from pathlib import Path
from tempfile import NamedTemporaryFile
from urllib.parse import urlparse
from pydantic import ValidationError
@ -24,10 +26,6 @@ class BatchCache:
os.makedirs(repo_path, exist_ok=True)
self.path = repo_path
def store(self, batch: HashedBatch):
    """Persist *batch* as JSON in a uniquely named file in the cache directory.

    The file is created with a random `batch_*.json` name via
    NamedTemporaryFile and kept on disk (delete=False).
    """
    serialized = batch.json()
    with NamedTemporaryFile(mode='w', dir=self.path, prefix='batch_',
                            suffix='.json', delete=False) as handle:
        handle.write(serialized)
def get(self, num_batches) -> dict[str, HashedBatch]:
batches = {}
for path in os.listdir(self.path):
@ -65,5 +63,17 @@ class BatchCache:
print("Failed to validate batch", data)
return 0
if len(batch.items) > 0:
self.store(batch)
self.store(batch, url)
return len(batch.items)
def store(self, batch, url):
    """Persist *batch* (a HashedBatch) as gzip-compressed JSON at the local
    path derived from *url*.

    Parent directories mirroring the URL path are created as needed, so the
    on-disk layout matches the remote URL structure and a local path can be
    recovered from a URL later.
    """
    path = self.get_path_from_url(url)
    # Removed leftover debug print of the target path; callers rely only on
    # the file side effect.
    os.makedirs(path.parent, exist_ok=True)
    with open(path, 'wb') as output_file:
        output_file.write(gzip.compress(batch.json().encode('utf8')))
def get_path_from_url(self, url) -> Path:
    """Map a batch *url* to its storage location under the cache directory.

    The URL's path component (minus its leading slash) becomes a path
    relative to ``self.path``.
    """
    # NOTE(review): a URL whose path contains '..' segments could resolve
    # outside self.path — confirm upstream URLs are trusted.
    relative = urlparse(url).path.lstrip('/')
    return Path(self.path) / relative

View file

@ -9,7 +9,6 @@ import uvicorn
from fastapi import FastAPI
from mwmbl import background
from mwmbl.indexer import historical, retrieve, preprocess, update_pages
from mwmbl.crawler.app import router as crawler_router
from mwmbl.indexer.paths import INDEX_NAME
from mwmbl.tinysearchengine import search