Add term info to index

2023-11-18 18:49:41 +00:00 · 2023-11-18 18:49:41 +00:00 · 204304e18e
commit 204304e18e
parent a2b872008f
11 changed files with 71 additions and 150 deletions
--- a/analyse/add_term_info.py
+++ b/analyse/add_term_info.py
@ -10,24 +10,15 @@ from random import Random
 import numpy as np
 from scipy.stats import sem

-from mwmbl.indexer.index import tokenize_document
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document, _trim_items_to_page, astuple

 from zstandard import ZstdCompressor

+from mwmbl.utils import add_term_info
+
 random = Random(1)

-INDEX_PATH = Path(os.environ["HOME"]) / "Downloads" / "index-v2.tinysearch"
-# INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"
-
-
-def add_term_info(document: Document, index: TinyIndex, page_index: int):
-    tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
-    for token in tokenized.tokens:
-        token_page_index = index.get_key_page_index(token)
-        if token_page_index == page_index:
-            return Document(document.title, document.url, document.extract, document.score, token)
-    raise ValueError("Could not find token in page index")
+INDEX_PATH = Path(__file__).parent.parent / "devdata" / "index-v2.tinysearch"


 def run():
--- a/analyse/index_local.py
+++ b/analyse/index_local.py
@ -1,57 +0,0 @@
-"""
-Index batches stored locally on the filesystem for the purpose of evaluation.
-"""
-import glob
-import gzip
-import json
-import logging
-import os
-import sys
-from datetime import datetime
-
-import spacy
-
-from mwmbl.crawler import HashedBatch
-from mwmbl.crawler.urls import URLDatabase
-from mwmbl.database import Database
-from mwmbl.indexer import index_batches
-from mwmbl.tinysearchengine import TinyIndex, Document
-
-LOCAL_BATCHES_PATH = f'{os.environ["HOME"]}/data/mwmbl/file/**/*.json.gz'
-NUM_BATCHES = 10000
-EVALUATE_INDEX_PATH = f'{os.environ["HOME"]}/data/mwmbl/evaluate-index.tinysearch'
-NUM_PAGES = 1_024_000
-PAGE_SIZE = 4096
-
-
-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
-
-
-def get_batches():
-    for path in sorted(glob.glob(LOCAL_BATCHES_PATH, recursive=True))[:NUM_BATCHES]:
-        data = json.load(gzip.open(path))
-        yield HashedBatch.parse_obj(data)
-
-
-def run():
-    try:
-        os.remove(EVALUATE_INDEX_PATH)
-    except FileNotFoundError:
-        pass
-    TinyIndex.create(item_factory=Document, index_path=EVALUATE_INDEX_PATH, num_pages=NUM_PAGES, page_size=PAGE_SIZE)
-
-    batches = get_batches()
-
-    start = datetime.now()
-    with Database() as db:
-        nlp = spacy.load("en_core_web_sm")
-        url_db = URLDatabase(db.connection)
-        index_batches(batches, EVALUATE_INDEX_PATH, nlp, url_db)
-    end = datetime.now()
-
-    total_time = (end - start).total_seconds()
-    print("total_seconds:", total_time)
-
-
-if __name__ == '__main__':
-    run()
--- a/analyse/inspect_index.py
+++ b/analyse/inspect_index.py
@ -1,60 +0,0 @@
-import logging
-import sys
-
-import numpy as np
-import spacy
-
-from analyse.index_local import EVALUATE_INDEX_PATH
-from mwmbl.indexer import tokenize_document
-from mwmbl.indexer import INDEX_PATH
-from mwmbl.tinysearchengine import TinyIndex, Document
-
-
-logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
-nlp = spacy.load("en_core_web_sm")
-
-
-def store():
-    document = Document(
-        title='A nation in search of the new black | Theatre | The Guardian',
-        url='https://www.theguardian.com/stage/2007/nov/18/theatre',
-        extract="Topic-stuffed and talk-filled, Kwame Kwei-Armah's new play proves that issue-driven drama is (despite reports of its death) still being written and staged…",
-        score=1.0
-    )
-    with TinyIndex(Document, INDEX_PATH, 'w') as tiny_index:
-        tokenized = tokenize_document(document.url, document.title, document.extract, 1, nlp)
-        print("Tokenized", tokenized)
-        # for token in tokenized.tokens:
-        #
-        #     tiny_index.index(token, document)
-
-
-def get_items():
-    with TinyIndex(Document, INDEX_PATH) as tiny_index:
-        items = tiny_index.retrieve('wikipedia')
-        if items:
-            for item in items:
-                print("Items", item)
-
-
-def run(index_path):
-    with TinyIndex(Document, index_path) as tiny_index:
-        sizes = {}
-        for i in range(tiny_index.num_pages):
-            page = tiny_index.get_page(i)
-            if page:
-                sizes[i] = len(page)
-            if len(page) > 50:
-                print("Page", len(page), page)
-            # for item in page:
-            #     if ' search' in item.title:
-            #         print("Page", i, item)
-        print("Max", max(sizes.values()))
-        print("Top", sorted(sizes.values())[-100:])
-        print("Mean", np.mean(list(sizes.values())))
-
-
-if __name__ == '__main__':
-    # store()
-    run(EVALUATE_INDEX_PATH)
-    # get_items()
--- a/devdata/index-v2.tinysearch
+++ b/devdata/index-v2.tinysearch
--- a/mwmbl/apps.py
+++ b/mwmbl/apps.py
@ -6,6 +6,10 @@ from pathlib import Path
 from django.apps import AppConfig
 from django.conf import settings

+from mwmbl.crawler.urls import URLDatabase
+from mwmbl.database import Database
+from mwmbl.indexer.indexdb import IndexDatabase
+

 class MwmblConfig(AppConfig):
    name = "mwmbl"
@ -31,6 +35,12 @@ class MwmblConfig(AppConfig):
            TinyIndex.create(item_factory=Document, index_path=index_path, num_pages=settings.NUM_PAGES,
                             page_size=PAGE_SIZE)

+        with Database() as db:
+            url_db = URLDatabase(db.connection)
+            url_db.create_tables()
+            index_db = IndexDatabase(db.connection)
+            index_db.create_tables()
+
        if settings.RUN_BACKGROUND_PROCESSES:
            new_item_queue = Queue()
            Process(target=background.run, args=(settings.DATA_PATH,)).start()
--- a/mwmbl/background.py
+++ b/mwmbl/background.py
@ -1,7 +1,9 @@
 """
 Script that updates data in a background process.
 """
-from logging import getLogger
+import logging
+import sys
+from logging import getLogger, basicConfig
 from pathlib import Path
 from time import sleep

@ -11,6 +13,8 @@ from mwmbl.indexer import index_batches, historical
 from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.paths import BATCH_DIR_NAME, INDEX_NAME

+
+basicConfig(stream=sys.stdout, level=logging.INFO)
 logger = getLogger(__name__)


--- a/mwmbl/indexer/index_batches.py
+++ b/mwmbl/indexer/index_batches.py
@ -16,6 +16,7 @@ from mwmbl.indexer.batch_cache import BatchCache
 from mwmbl.indexer.index import tokenize_document
 from mwmbl.indexer.indexdb import BatchStatus
 from mwmbl.tinysearchengine.indexer import Document, TinyIndex
+from mwmbl.utils import add_term_info, add_term_infos

 logger = getLogger(__name__)

@ -31,22 +32,20 @@ def run(batch_cache: BatchCache, index_path: str):

    def process(batches: Collection[HashedBatch]):
        with Database() as db:
-            nlp = spacy.load("en_core_web_sm")
            url_db = URLDatabase(db.connection)
-            index_batches(batches, index_path, nlp, url_db)
+            index_batches(batches, index_path, url_db)
            logger.info("Indexed pages")

    process_batch.run(batch_cache, BatchStatus.URLS_UPDATED, BatchStatus.INDEXED, process)


-def index_batches(batch_data: Collection[HashedBatch], index_path: str, nlp: Language, url_db: URLDatabase):
+def index_batches(batch_data: Collection[HashedBatch], index_path: str, url_db: URLDatabase):
    document_tuples = list(get_documents_from_batches(batch_data))
    urls = [url for title, url, extract in document_tuples]
-    logger.info(f"Got {len(urls)} document tuples")
    url_scores = url_db.get_url_scores(urls)
-    logger.info(f"Got {len(url_scores)} scores")
+    logger.info(f"Indexing {len(urls)} document tuples and {len(url_scores)} URL scores")
    documents = [Document(title, url, extract, url_scores.get(url, 1.0)) for title, url, extract in document_tuples]
-    page_documents = preprocess_documents(documents, index_path, nlp)
+    page_documents = preprocess_documents(documents, index_path)
    index_pages(index_path, page_documents)


@ -58,24 +57,27 @@ def index_pages(index_path, page_documents):
            seen_urls = set()
            seen_titles = set()
            sorted_documents = sorted(documents + existing_documents, key=lambda x: x.score, reverse=True)
-            for document in sorted_documents:
+            # TODO: for now we add the term here, until all the documents in the index have terms
+            sorted_documents_with_terms = add_term_infos(sorted_documents, indexer, page)
+            for document in sorted_documents_with_terms:
                if document.title in seen_titles or document.url in seen_urls:
                    continue
                new_documents.append(document)
                seen_urls.add(document.url)
                seen_titles.add(document.title)
+            logger.info(f"Storing {len(new_documents)} documents for page {page}, originally {len(existing_documents)}")
            indexer.store_in_page(page, new_documents)


-def preprocess_documents(documents, index_path, nlp):
+def preprocess_documents(documents, index_path):
    page_documents = defaultdict(list)
    with TinyIndex(Document, index_path, 'w') as indexer:
        for document in documents:
            tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
-            # logger.debug(f"Tokenized: {tokenized}")
-            page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
-            for page in page_indexes:
-                page_documents[page].append(document)
+            for token in tokenized.tokens:
+                page = indexer.get_key_page_index(token)
+                term_document = Document(document.title, document.url, document.extract, document.score, token)
+                page_documents[page].append(term_document)
    print(f"Preprocessed for {len(page_documents)} pages")
    return page_documents

--- a/mwmbl/indexer/update_urls.py
+++ b/mwmbl/indexer/update_urls.py
@ -86,7 +86,7 @@ def record_urls_in_database(batches: Collection[HashedBatch], new_item_queue: Qu
 def process_link(user_id_hash, crawled_page_domain, link, unknown_domain_multiplier, timestamp, url_scores, url_timestamps, url_users, is_extra: bool, blacklist_domains):
    parsed_link = urlparse(link)
    if is_domain_blacklisted(parsed_link.netloc, blacklist_domains):
-        logger.info(f"Excluding link for blacklisted domain: {parsed_link}")
+        logger.debug(f"Excluding link for blacklisted domain: {parsed_link}")
        return

    extra_multiplier = EXTRA_LINK_MULTIPLIER if is_extra else 1.0
--- a/mwmbl/platform/curate.py
+++ b/mwmbl/platform/curate.py
@ -1,3 +1,4 @@
+from logging import getLogger
 from typing import Any
 from urllib.parse import parse_qs

@ -9,11 +10,15 @@ from mwmbl.platform.data import CurateBegin, CurateMove, CurateDelete, CurateAdd
    make_curation_type
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 from mwmbl.tokenizer import tokenize
+from mwmbl.utils import add_term_info, add_term_infos

 RESULT_URL = "https://mwmbl.org/?q="
 MAX_CURATED_SCORE = 1_111_111.0


+logger = getLogger(__name__)
+
+
 def create_router(index_path: str) -> Router:
    router = Router(tags=["user"])

@ -58,20 +63,24 @@ def create_router(index_path: str) -> Router:
                raise ValueError(f"Should be one query value in the URL: {curation.url}")

            query = queries[0]
-            print("Query", query)
            tokens = tokenize(query)
-            print("Tokens", tokens)
            term = " ".join(tokens)
-            print("Key", term)

            documents = [
                Document(result.title, result.url, result.extract, MAX_CURATED_SCORE - i, term, result.curated)
                for i, result in enumerate(curation.results)
            ]
+
            page_index = indexer.get_key_page_index(term)
-            print("Page index", page_index)
-            print("Storing documents", documents)
-            indexer.store_in_page(page_index, documents)
+            existing_documents_no_terms = indexer.get_page(page_index)
+            existing_documents = add_term_infos(existing_documents_no_terms, indexer, page_index)
+            other_documents = [doc for doc in existing_documents if doc.term != term]
+            logger.info(f"Found {len(other_documents)} other documents for term {term} at page {page_index} "
+                        f"with terms { {doc.term for doc in other_documents} }")
+
+            all_documents = documents + other_documents
+            logger.info(f"Storing {len(all_documents)} documents at page {page_index}")
+            indexer.store_in_page(page_index, all_documents)

        return {"curation": "ok"}

--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@ -189,7 +189,6 @@ class TinyIndex(Generic[T]):
        except ZstdError:
            logger.exception(f"Error decompressing page data, content: {page_data}")
            return []
-        # logger.debug(f"Decompressed data: {decompressed_data}")
        return json.loads(decompressed_data.decode('utf8'))

    def store_in_page(self, page_index: int, values: list[T]):
--- a/mwmbl/utils.py
+++ b/mwmbl/utils.py
@ -1,5 +1,8 @@
 import re

+from mwmbl.indexer.index import tokenize_document
+from mwmbl.tinysearchengine.indexer import Document, TinyIndex
+
 DOMAIN_REGEX = re.compile(r".*://([^/]*)")


@ -17,3 +20,23 @@ def get_domain(url):
    if results is None or len(results.groups()) == 0:
        raise ValueError(f"Unable to parse domain from URL {url}")
    return results.group(1)
+
+
+def add_term_info(document: Document, index: TinyIndex, page_index: int):
+    tokenized = tokenize_document(document.url, document.title, document.extract, document.score)
+    for token in tokenized.tokens:
+        token_page_index = index.get_key_page_index(token)
+        if token_page_index == page_index:
+            return Document(document.title, document.url, document.extract, document.score, token)
+    raise ValueError("Could not find token in page index")
+
+
+def add_term_infos(documents: list[Document], index: TinyIndex, page_index: int):
+    for document in documents:
+        if document.term is not None:
+            yield document
+            continue
+        try:
+            yield add_term_info(document, index, page_index)
+        except ValueError:
+            continue