Script to index local batch for evaluation

commit b1eea2457f
parent 480be85cfd

3 changed files with 82 additions and 20 deletions
analyse/index_local.py (Normal file, 51 lines)
@@ -0,0 +1,51 @@
+"""
+Index batches stored locally on the filesystem for the purpose of evaluation.
+"""
+import glob
+import gzip
+import json
+import logging
+import os
+import sys
+from pathlib import Path
+
+import spacy
+
+from mwmbl.crawler.batch import HashedBatch
+from mwmbl.crawler.urls import URLDatabase
+from mwmbl.database import Database
+from mwmbl.indexer.index_batches import index_batches
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+
+LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
+NUM_BATCHES = 10000
+EVALUATE_INDEX_PATH = f'{os.environ["HOME"]}/data/mwmbl/evaluate-index.tinysearch'
+NUM_PAGES = 1_024_000
+PAGE_SIZE = 4096
+
+
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+
+
+def get_batches():
+    for path in sorted(glob.glob(LOCAL_BATCHES_PATH, recursive=True))[:NUM_BATCHES]:
+        data = json.load(gzip.open(path))
+        yield HashedBatch.parse_obj(data)
+
+
+def run():
+    try:
+        os.remove(EVALUATE_INDEX_PATH)
+    except FileNotFoundError:
+        pass
+    TinyIndex.create(item_factory=Document, index_path=EVALUATE_INDEX_PATH, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
+
+    batches = get_batches()
+    with Database() as db:
+        nlp = spacy.load("en_core_web_sm")
+        url_db = URLDatabase(db.connection)
+        index_batches(batches, EVALUATE_INDEX_PATH, nlp, url_db)
+
+
+if __name__ == '__main__':
+    run()
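Note: the new script glob-loads gzipped batch JSON from the local mirror and replays it through the indexer. As a quick sanity check before a full run, a single batch can be loaded the same way get_batches does. A minimal sketch; it assumes at least one *.json.gz file exists under the LOCAL_BATCHES_PATH pattern, and the items field is assumed from the HashedBatch model:

    import glob
    import gzip
    import json

    from analyse.index_local import LOCAL_BATCHES_PATH
    from mwmbl.crawler.batch import HashedBatch

    # Parse the first mirrored batch exactly as get_batches() does, to confirm
    # the local files decode into HashedBatch before kicking off a full run.
    paths = sorted(glob.glob(LOCAL_BATCHES_PATH, recursive=True))
    batch = HashedBatch.parse_obj(json.load(gzip.open(paths[0])))
    print(f"Parsed {paths[0]} with {len(batch.items)} items")  # items assumed from the model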
@@ -1,8 +1,10 @@
 import logging
 import sys
 
+import numpy as np
 import spacy
 
+from analyse.index_local import EVALUATE_INDEX_PATH
 from mwmbl.indexer.index import tokenize_document
 from mwmbl.indexer.paths import INDEX_PATH
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
@@ -35,16 +37,24 @@ def get_items():
         print("Items", item)
 
 
-def run():
-    with TinyIndex(Document, INDEX_PATH) as tiny_index:
-        for i in range(100000):
+def run(index_path):
+    with TinyIndex(Document, index_path) as tiny_index:
+        sizes = {}
+        for i in range(tiny_index.num_pages):
             page = tiny_index.get_page(i)
-            for item in page:
-                if ' search' in item.title:
-                    print("Page", i, item)
+            if page:
+                sizes[i] = len(page)
+            if len(page) > 50:
+                print("Page", len(page), page)
+            # for item in page:
+            #     if ' search' in item.title:
+            #         print("Page", i, item)
+    print("Max", max(sizes.values()))
+    print("Top", sorted(sizes.values())[-100:])
+    print("Mean", np.mean(list(sizes.values())))
 
 
 if __name__ == '__main__':
     # store()
-    # run()
-    get_items()
+    run(EVALUATE_INDEX_PATH)
+    # get_items()
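Note: the reworked run() takes the index path as a parameter and, instead of scanning a fixed 100000 pages of the production index, walks every page of the given index and reports per-page document counts. A condensed sketch of the same scan against the new evaluation index, using only calls that appear in the diff (TinyIndex, get_page, num_pages):

    from analyse.index_local import EVALUATE_INDEX_PATH
    from mwmbl.tinysearchengine.indexer import TinyIndex, Document

    # Per-page document counts, as in run() above, collected in one pass.
    with TinyIndex(Document, EVALUATE_INDEX_PATH) as tiny_index:
        sizes = {i: len(tiny_index.get_page(i)) for i in range(tiny_index.num_pages)}
    print("Non-empty pages:", sum(1 for n in sizes.values() if n))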
@@ -8,6 +8,7 @@ from typing import Iterable
 from urllib.parse import urlparse
 
 import spacy
+from spacy import Language
 
 from mwmbl.crawler.batch import HashedBatch, Item
 from mwmbl.crawler.urls import URLDatabase, URLStatus, FoundURL
@@ -49,23 +50,23 @@ def run(batch_cache: BatchCache, index_path: str):
 
         record_urls_in_database(batch_data.values())
 
-        document_tuples = list(get_documents_from_batches(batch_data.values()))
-        urls = [url for title, url, extract in document_tuples]
-
-        logger.info(f"Got {len(urls)} document tuples")
-
         url_db = URLDatabase(db.connection)
-        url_scores = url_db.get_url_scores(urls)
-
-        logger.info(f"Got {len(url_scores)} scores")
-        documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
-
-        page_documents = preprocess_documents(documents, index_path, nlp)
-        index_pages(index_path, page_documents)
+        index_batches(batch_data.values(), index_path, nlp, url_db)
         logger.info("Indexed pages")
         index_db.update_batch_status([batch.url for batch in batches], BatchStatus.INDEXED)
 
 
+def index_batches(batch_data: Iterable[HashedBatch], index_path: str, nlp: Language, url_db: URLDatabase):
+    document_tuples = list(get_documents_from_batches(batch_data))
+    urls = [url for title, url, extract in document_tuples]
+    logger.info(f"Got {len(urls)} document tuples")
+    url_scores = url_db.get_url_scores(urls)
+    logger.info(f"Got {len(url_scores)} scores")
+    documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
+    page_documents = preprocess_documents(documents, index_path, nlp)
+    index_pages(index_path, page_documents)
+
+
 def index_pages(index_path, page_documents):
     with TinyIndex(Document, index_path, 'w') as indexer:
         for page, documents in page_documents.items():
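Note: this last hunk extracts the per-batch indexing steps out of run() into a standalone index_batches(), which is what analyse/index_local.py imports. Its call pattern, mirroring the new script (a sketch; index_hashed_batches is a hypothetical wrapper name, and it assumes a database reachable by mwmbl.database.Database):

    import spacy

    from mwmbl.crawler.urls import URLDatabase
    from mwmbl.database import Database
    from mwmbl.indexer.index_batches import index_batches

    def index_hashed_batches(batches, index_path):
        # batches: Iterable[HashedBatch]; index_path must point at an existing
        # TinyIndex file (e.g. one created via TinyIndex.create as above).
        with Database() as db:
            nlp = spacy.load("en_core_web_sm")
            url_db = URLDatabase(db.connection)
            index_batches(batches, index_path, nlp, url_db)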