Script to index local batch for evaluation

Daoud Clarke 2022-08-22 22:47:42 +01:00
parent 480be85cfd
commit b1eea2457f
3 changed files with 82 additions and 20 deletions

analyse/index_local.py (new file, 51 additions)

@@ -0,0 +1,51 @@
"""
Index batches stored locally on the filesystem for the purpose of evaluation.
"""
import glob
import gzip
import json
import logging
import os
import sys
from pathlib import Path
import spacy
from mwmbl.crawler.batch import HashedBatch
from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
from mwmbl.indexer.index_batches import index_batches
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
NUM_BATCHES = 10000
EVALUATE_INDEX_PATH = f'{os.environ["HOME"]}/data/mwmbl/evaluate-index.tinysearch'
NUM_PAGES = 1_024_000
PAGE_SIZE = 4096
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
def get_batches():
for path in sorted(glob.glob(LOCAL_BATCHES_PATH, recursive=True))[:NUM_BATCHES]:
data = json.load(gzip.open(path))
yield HashedBatch.parse_obj(data)
def run():
try:
os.remove(EVALUATE_INDEX_PATH)
except FileNotFoundError:
pass
TinyIndex.create(item_factory=Document, index_path=EVALUATE_INDEX_PATH, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
batches = get_batches()
with Database() as db:
nlp = spacy.load("en_core_web_sm")
url_db = URLDatabase(db.connection)
index_batches(batches, EVALUATE_INDEX_PATH, nlp, url_db)
if __name__ == '__main__':
run()
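
A minimal sketch, not part of this commit, of how the resulting evaluation index could be sanity-checked after the script above has run. The helper name count_indexed_documents is hypothetical; the sketch only uses names that appear in this commit (EVALUATE_INDEX_PATH, Document, TinyIndex.num_pages and TinyIndex.get_page).

# Hypothetical check: count documents across all pages of the evaluation index.
# Assumes analyse/index_local.py has already been run, so EVALUATE_INDEX_PATH exists.
from analyse.index_local import EVALUATE_INDEX_PATH
from mwmbl.tinysearchengine.indexer import TinyIndex, Document


def count_indexed_documents(index_path: str = EVALUATE_INDEX_PATH) -> int:
    total = 0
    with TinyIndex(Document, index_path) as tiny_index:
        for i in range(tiny_index.num_pages):
            page = tiny_index.get_page(i)
            if page:
                total += len(page)
    return total


if __name__ == '__main__':
    print("Documents in evaluation index:", count_indexed_documents())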


@@ -1,8 +1,10 @@
 import logging
 import sys
 
+import numpy as np
 import spacy
 
+from analyse.index_local import EVALUATE_INDEX_PATH
 from mwmbl.indexer.index import tokenize_document
 from mwmbl.indexer.paths import INDEX_PATH
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
@@ -35,16 +37,24 @@ def get_items():
         print("Items", item)
 
 
-def run():
-    with TinyIndex(Document, INDEX_PATH) as tiny_index:
-        for i in range(100000):
+def run(index_path):
+    with TinyIndex(Document, index_path) as tiny_index:
+        sizes = {}
+        for i in range(tiny_index.num_pages):
             page = tiny_index.get_page(i)
-            for item in page:
-                if ' search' in item.title:
-                    print("Page", i, item)
+            if page:
+                sizes[i] = len(page)
+                if len(page) > 50:
+                    print("Page", len(page), page)
+            # for item in page:
+            #     if ' search' in item.title:
+            #         print("Page", i, item)
+
+    print("Max", max(sizes.values()))
+    print("Top", sorted(sizes.values())[-100:])
+    print("Mean", np.mean(list(sizes.values())))
 
 
 if __name__ == '__main__':
     # store()
-    # run()
-    get_items()
+    run(EVALUATE_INDEX_PATH)
+    # get_items()


@@ -8,6 +8,7 @@ from typing import Iterable
 from urllib.parse import urlparse
 
 import spacy
+from spacy import Language
 
 from mwmbl.crawler.batch import HashedBatch, Item
 from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
@@ -49,23 +50,23 @@ def run(batch_cache: BatchCache, index_path: str):
         record_urls_in_database(batch_data.values())
 
-        document_tuples = list(get_documents_from_batches(batch_data.values()))
-        urls = [url for title, url, extract in document_tuples]
-        logger.info(f"Got {len(urls)} document tuples")
-
         url_db = URLDatabase(db.connection)
-        url_scores = url_db.get_url_scores(urls)
-        logger.info(f"Got {len(url_scores)} scores")
-        documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
-        page_documents = preprocess_documents(documents, index_path, nlp)
-        index_pages(index_path, page_documents)
+        index_batches(batch_data.values(), index_path, nlp, url_db)
         logger.info("Indexed pages")
 
         index_db.update_batch_status([batch.url for batch in batches], BatchStatus.INDEXED)
 
 
+def index_batches(batch_data: Iterable[HashedBatch], index_path: str, nlp: Language, url_db: URLDatabase):
+    document_tuples = list(get_documents_from_batches(batch_data))
+    urls = [url for title, url, extract in document_tuples]
+    logger.info(f"Got {len(urls)} document tuples")
+    url_scores = url_db.get_url_scores(urls)
+    logger.info(f"Got {len(url_scores)} scores")
+    documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
+    page_documents = preprocess_documents(documents, index_path, nlp)
+    index_pages(index_path, page_documents)
+
+
 def index_pages(index_path, page_documents):
     with TinyIndex(Document, index_path, 'w') as indexer:
         for page, documents in page_documents.items():