Use URL path to store locally so that we can easily get a local path from a URL
This commit is contained in:
parent
0d1e7d841c
commit
a8a6c67239
3 changed files with 17 additions and 8 deletions
|
@ -6,7 +6,7 @@ from pathlib import Path
|
|||
from time import sleep
|
||||
|
||||
from mwmbl.indexer import historical
|
||||
from mwmbl.indexer.batch_repo import BatchCache
|
||||
from mwmbl.indexer.batch_cache import BatchCache
|
||||
from mwmbl.indexer.paths import INDEX_PATH, BATCH_DIR_NAME
|
||||
from mwmbl.indexer.preprocess import run_preprocessing
|
||||
from mwmbl.indexer.update_pages import run_update
|
||||
|
@ -15,7 +15,7 @@ logger = getLogger(__name__)
|
|||
|
||||
|
||||
def run(data_path: str):
|
||||
historical.run()
|
||||
# historical.run()
|
||||
index_path = Path(data_path) / INDEX_PATH
|
||||
batch_cache = BatchCache(Path(data_path) / BATCH_DIR_NAME)
|
||||
while True:
|
||||
|
|
|
@ -7,7 +7,9 @@ import gzip
|
|||
import json
|
||||
import os
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from pathlib import Path
|
||||
from tempfile import NamedTemporaryFile
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from pydantic import ValidationError
|
||||
|
||||
|
@ -24,10 +26,6 @@ class BatchCache:
|
|||
os.makedirs(repo_path, exist_ok=True)
|
||||
self.path = repo_path
|
||||
|
||||
def store(self, batch: HashedBatch):
    """Serialise *batch* to JSON and write it to a fresh, uniquely named
    file in the cache directory.

    ``delete=False`` keeps the file on disk after the context manager
    closes it, so the batch persists in ``self.path``.
    """
    with NamedTemporaryFile(mode='w', dir=self.path, prefix='batch_', suffix='.json', delete=False) as batch_file:
        batch_file.write(batch.json())
|
||||
|
||||
def get(self, num_batches) -> dict[str, HashedBatch]:
|
||||
batches = {}
|
||||
for path in os.listdir(self.path):
|
||||
|
@ -65,5 +63,17 @@ class BatchCache:
|
|||
print("Failed to validate batch", data)
|
||||
return 0
|
||||
if len(batch.items) > 0:
|
||||
self.store(batch)
|
||||
self.store(batch, url)
|
||||
return len(batch.items)
|
||||
|
||||
def store(self, batch, url):
    """Persist *batch* locally at the path derived from *url*.

    The batch is serialised to JSON, gzip-compressed and written to
    ``self.get_path_from_url(url)``; any missing parent directories are
    created first.
    """
    target = self.get_path_from_url(url)
    print("Path", target)
    os.makedirs(target.parent, exist_ok=True)
    with open(target, 'wb') as batch_file:
        batch_file.write(gzip.compress(batch.json().encode('utf8')))
||||
|
||||
def get_path_from_url(self, url) -> Path:
    """Map *url* to the local cache path that mirrors its path component.

    The URL's path (leading slash stripped) is joined onto the cache
    root, so the on-disk layout reproduces the remote URL structure.
    """
    # NOTE(review): a '..' segment in the URL path could resolve outside
    # the cache root — confirm callers only pass trusted URLs.
    relative = urlparse(url).path.lstrip('/')
    return Path(self.path) / relative
|
||||
|
|
|
@ -9,7 +9,6 @@ import uvicorn
|
|||
from fastapi import FastAPI
|
||||
|
||||
from mwmbl import background
|
||||
from mwmbl.indexer import historical, retrieve, preprocess, update_pages
|
||||
from mwmbl.crawler.app import router as crawler_router
|
||||
from mwmbl.indexer.paths import INDEX_NAME
|
||||
from mwmbl.tinysearchengine import search
|
||||
|
|
Loading…
Add table
Reference in a new issue