Script to index local batch for evaluation

Daoud Clarke 2022-08-22 22:47:42 +01:00
parent 480be85cfd
commit b1eea2457f
3 changed files with 82 additions and 20 deletions

analyse/index_local.py (new file, 51 additions)

@@ -0,0 +1,51 @@
"""
Index batches stored locally on the filesystem for the purpose of evaluation.
"""
import glob
import gzip
import json
import logging
import os
import sys
from pathlib import Path
import spacy
from mwmbl.crawler.batch import HashedBatch
from mwmbl.crawler.urls import URLDatabase
from mwmbl.database import Database
from mwmbl.indexer.index_batches import index_batches
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
NUM_BATCHES = 10000
EVALUATE_INDEX_PATH = f'{os.environ["HOME"]}/data/mwmbl/evaluate-index.tinysearch'
NUM_PAGES = 1_024_000
PAGE_SIZE = 4096
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
def get_batches():
for path in sorted(glob.glob(LOCAL_BATCHES_PATH, recursive=True))[:NUM_BATCHES]:
data = json.load(gzip.open(path))
yield HashedBatch.parse_obj(data)
def run():
try:
os.remove(EVALUATE_INDEX_PATH)
except FileNotFoundError:
pass
TinyIndex.create(item_factory=Document, index_path=EVALUATE_INDEX_PATH, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
batches = get_batches()
with Database() as db:
nlp = spacy.load("en_core_web_sm")
url_db = URLDatabase(db.connection)
index_batches(batches, EVALUATE_INDEX_PATH, nlp, url_db)
if __name__ == '__main__':
run()
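
A minimal sketch, not part of this commit, of how the resulting evaluation index could be sanity-checked after the script above has run. The helper name count_indexed_documents is hypothetical; the sketch only uses names that appear in this commit (EVALUATE_INDEX_PATH, Document, TinyIndex.num_pages and TinyIndex.get_page).

# Hypothetical check: count documents across all pages of the evaluation index.
# Assumes analyse/index_local.py has already been run, so EVALUATE_INDEX_PATH exists.
from analyse.index_local import EVALUATE_INDEX_PATH
from mwmbl.tinysearchengine.indexer import TinyIndex, Document


def count_indexed_documents(index_path: str = EVALUATE_INDEX_PATH) -> int:
    total = 0
    with TinyIndex(Document, index_path) as tiny_index:
        for i in range(tiny_index.num_pages):
            page = tiny_index.get_page(i)
            if page:
                total += len(page)
    return total


if __name__ == '__main__':
    print("Documents in evaluation index:", count_indexed_documents())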


@@ -1,8 +1,10 @@
 import logging
 import sys
 
+import numpy as np
 import spacy
 
+from analyse.index_local import EVALUATE_INDEX_PATH
 from mwmbl.indexer.index import tokenize_document
 from mwmbl.indexer.paths import INDEX_PATH
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
@@ -35,16 +37,24 @@ def get_items():
         print("Items", item)
 
 
-def run():
-    with TinyIndex(Document, INDEX_PATH) as tiny_index:
-        for i in range(100000):
+def run(index_path):
+    with TinyIndex(Document, index_path) as tiny_index:
+        sizes = {}
+        for i in range(tiny_index.num_pages):
             page = tiny_index.get_page(i)
-            for item in page:
-                if ' search' in item.title:
-                    print("Page", i, item)
+            if page:
+                sizes[i] = len(page)
+                if len(page) > 50:
+                    print("Page", len(page), page)
+            # for item in page:
+            #     if ' search' in item.title:
+            #         print("Page", i, item)
+
+    print("Max", max(sizes.values()))
+    print("Top", sorted(sizes.values())[-100:])
+    print("Mean", np.mean(list(sizes.values())))
 
 
 if __name__ == '__main__':
     # store()
-    # run()
-    get_items()
+    run(EVALUATE_INDEX_PATH)
+    # get_items()


@@ -8,6 +8,7 @@ from typing import Iterable
 from urllib.parse import urlparse
 
 import spacy
+from spacy import Language
 
 from mwmbl.crawler.batch import HashedBatch, Item
 from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
@@ -49,23 +50,23 @@ def run(batch_cache: BatchCache, index_path: str):
         record_urls_in_database(batch_data.values())
 
-        document_tuples = list(get_documents_from_batches(batch_data.values()))
-        urls = [url for title, url, extract in document_tuples]
-        logger.info(f"Got {len(urls)} document tuples")
-
         url_db = URLDatabase(db.connection)
-        url_scores = url_db.get_url_scores(urls)
-        logger.info(f"Got {len(url_scores)} scores")
-        documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
-        page_documents = preprocess_documents(documents, index_path, nlp)
-        index_pages(index_path, page_documents)
+        index_batches(batch_data.values(), index_path, nlp, url_db)
         logger.info("Indexed pages")
 
         index_db.update_batch_status([batch.url for batch in batches], BatchStatus.INDEXED)
 
 
+def index_batches(batch_data: Iterable[HashedBatch], index_path: str, nlp: Language, url_db: URLDatabase):
+    document_tuples = list(get_documents_from_batches(batch_data))
+    urls = [url for title, url, extract in document_tuples]
+    logger.info(f"Got {len(urls)} document tuples")
+    url_scores = url_db.get_url_scores(urls)
+    logger.info(f"Got {len(url_scores)} scores")
+    documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
+    page_documents = preprocess_documents(documents, index_path, nlp)
+    index_pages(index_path, page_documents)
+
+
 def index_pages(index_path, page_documents):
     with TinyIndex(Document, index_path, 'w') as indexer:
         for page, documents in page_documents.items():