57 lines
1.5 KiB
Python
57 lines
1.5 KiB
Python
"""
|
|
Index batches stored locally on the filesystem for the purpose of evaluation.
|
|
"""
|
|
import glob
|
|
import gzip
|
|
import json
|
|
import logging
|
|
import os
|
|
import sys
|
|
from datetime import datetime
|
|
|
|
import spacy
|
|
|
|
from mwmbl.crawler import HashedBatch
|
|
from mwmbl.crawler.urls import URLDatabase
|
|
from mwmbl.database import Database
|
|
from mwmbl.indexer import index_batches
|
|
from mwmbl.tinysearchengine import TinyIndex, Document
|
|
|
|
# Resolve the home directory once. Unlike os.environ["HOME"], expanduser does
# not raise KeyError at import time when HOME is unset; on POSIX it returns
# HOME when that variable is set, so the resulting paths are unchanged there.
_HOME_DIR = os.path.expanduser("~")

# Recursive glob pattern matching the gzipped JSON batch files stored locally.
LOCAL_BATCHES_PATH = f'{_HOME_DIR}/data/mwmbl/file/**/*.json.gz'
# Maximum number of batch files (taken in sorted path order) to index.
NUM_BATCHES = 10000
# Path of the evaluation index; run() deletes and recreates this file.
EVALUATE_INDEX_PATH = f'{_HOME_DIR}/data/mwmbl/evaluate-index.tinysearch'
# Passed as num_pages / page_size to TinyIndex.create when the index is built.
NUM_PAGES = 1_024_000
PAGE_SIZE = 4096


# Send all log records, down to DEBUG level, to stdout.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
|
|
|
|
|
|
def get_batches():
    """
    Lazily yield the locally stored batches, one per file.

    Files matching LOCAL_BATCHES_PATH are visited in sorted path order and at
    most NUM_BATCHES of them are read. Each file is gunzipped, decoded as JSON
    and converted with HashedBatch.parse_obj before being yielded.
    """
    batch_paths = sorted(glob.glob(LOCAL_BATCHES_PATH, recursive=True))
    for path in batch_paths[:NUM_BATCHES]:
        # Close each gzip handle as soon as it has been read; otherwise up to
        # NUM_BATCHES file descriptors stay open until garbage collection.
        with gzip.open(path) as batch_file:
            data = json.load(batch_file)
        yield HashedBatch.parse_obj(data)
|
|
|
|
|
|
def run():
    """
    Rebuild the evaluation index from the local batches and print how long it took.

    Any existing file at EVALUATE_INDEX_PATH is removed, a fresh index is
    created there, and every batch from get_batches() is indexed into it.
    The elapsed wall-clock time is printed as "total_seconds: <value>".
    """
    # Start from a clean slate; a missing index file is not an error.
    try:
        os.remove(EVALUATE_INDEX_PATH)
    except FileNotFoundError:
        pass

    TinyIndex.create(
        item_factory=Document,
        index_path=EVALUATE_INDEX_PATH,
        num_pages=NUM_PAGES,
        page_size=PAGE_SIZE,
    )

    # get_batches() is a generator, so no file is read until indexing starts.
    batch_stream = get_batches()

    # The timed section covers opening the database and loading the spaCy
    # model as well as the indexing itself.
    started_at = datetime.now()
    with Database() as db:
        nlp = spacy.load("en_core_web_sm")
        url_db = URLDatabase(db.connection)
        index_batches(batch_stream, EVALUATE_INDEX_PATH, nlp, url_db)
    finished_at = datetime.now()

    elapsed = finished_at - started_at
    print("total_seconds:", elapsed.total_seconds())
|
|
|
|
|
|
# Script entry point: only build the index when this file is executed directly.
if __name__ == "__main__":
    run()
|