Add util script to send batch; add logging
parent 3c97fdb3a0
commit 93307ad1ec
8 changed files with 93 additions and 22 deletions
@@ -1,20 +1,50 @@
-from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
+import logging
+import sys
+
+import spacy
+
+from mwmbl.indexer.index import tokenize_document
+from mwmbl.indexer.paths import INDEX_PATH
+from mwmbl.tinysearchengine.indexer import TinyIndex, Document
+
+
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+
+nlp = spacy.load("en_core_web_sm")
+
+
+def store():
+    document = Document(
+        title='A nation in search of the new black | Theatre | The Guardian',
+        url='https://www.theguardian.com/stage/2007/nov/18/theatre',
+        extract="Topic-stuffed and talk-filled, Kwame Kwei-Armah's new play proves that issue-driven drama is (despite reports of its death) still being written and staged…",
+        score=1.0
+    )
+    with TinyIndex(Document, INDEX_PATH, 'w') as tiny_index:
+        tokenized = tokenize_document(document.url, document.title, document.extract, 1, nlp)
+        print("Tokenized", tokenized)
+        # for token in tokenized.tokens:
+        #     tiny_index.index(token, document)
 
 
 def get_items():
-    tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-    items = tiny_index.retrieve('soup')
-    if items:
-        for item in items:
-            print("Items", item)
+    with TinyIndex(Document, INDEX_PATH) as tiny_index:
+        items = tiny_index.retrieve('search')
+        if items:
+            for item in items:
+                print("Items", item)
 
 
 def run():
-    tiny_index = TinyIndex(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE)
-    for i in range(100):
-        tiny_index.get_page(i)
+    with TinyIndex(Document, INDEX_PATH) as tiny_index:
+        for i in range(100000):
+            page = tiny_index.get_page(i)
+            for item in page:
+                if ' search' in item.title:
+                    print("Page", i, item)
 
 
 if __name__ == '__main__':
-    run()
+    # store()
+    # run()
+    get_items()
analyse/send_batch.py (Normal file, 27 additions)
@@ -0,0 +1,27 @@
+"""
+Send a batch to a running instance.
+"""
+import requests
+
+from mwmbl.crawler.batch import Batch, Item, ItemContent
+
+
+URL = 'http://localhost:5000/crawler/batches/'
+
+
+def run():
+    batch = Batch(user_id='test_user_id111111111111111111111111', items=[Item(
+        url='https://www.theguardian.com/stage/2007/nov/18/theatre',
+        content=ItemContent(
+            title='A nation in search of the new black | Theatre | The Guardian',
+            extract="Topic-stuffed and talk-filled, Kwame Kwei-Armah's new play proves that issue-driven drama is (despite reports of its death) still being written and staged…",
+            links=[]),
+        timestamp=123456,
+        status=200,
+    )])
+    result = requests.post(URL, data=batch.json())
+    print("Result", result.content)
+
+
+if __name__ == '__main__':
+    run()
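For reference, a minimal usage sketch (not part of the commit): it assumes the mwmbl server is running locally on port 5000, as in the URL constant above, and that the repository root is on PYTHONPATH so that analyse is importable.

    # Hypothetical invocation of the new script from a Python shell.
    from analyse.send_batch import run

    run()  # POSTs the hard-coded test batch and prints the server's response body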
@@ -13,18 +13,18 @@ logger = getLogger(__name__)
 
 
 def run(index_path: str):
-    historical.run()
+    # historical.run()
     while True:
-        try:
-            retrieve_batches()
-        except Exception:
-            logger.exception("Error retrieving batches")
+        # try:
+        #     retrieve_batches()
+        # except Exception:
+        #     logger.exception("Error retrieving batches")
         try:
             run_preprocessing(index_path)
         except Exception:
             logger.exception("Error preprocessing")
-        try:
-            run_update(index_path)
-        except Exception:
-            logger.exception("Error running index update")
+        # try:
+        #     run_update(index_path)
+        # except Exception:
+        #     logger.exception("Error running index update")
         sleep(10)
@@ -277,7 +277,6 @@ def status():
 
 def queue_batch(batch: HashedBatch):
     # TODO: get the score from the URLs database
-    # TODO: also queue documents for batches sent through the API
     documents = [Document(item.content.title, item.url, item.content.extract, 1)
                  for item in batch.items if item.content is not None]
     with Database() as db:
@@ -2,6 +2,7 @@
 Preprocess local documents for indexing.
 """
 import traceback
+from logging import getLogger
 from time import sleep
 
 import spacy
@@ -12,6 +13,9 @@ from mwmbl.indexer.index import tokenize_document
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document
 
 
+logger = getLogger(__name__)
+
+
 def run(index_path):
     while True:
         try:
@@ -34,7 +38,9 @@ def run_preprocessing(index_path):
     with TinyIndex(Document, index_path, 'w') as indexer:
         for document in documents:
             tokenized = tokenize_document(document.url, document.title, document.extract, 1, nlp)
+            logger.debug(f"Tokenized: {tokenized}")
             page_indexes = [indexer.get_key_page_index(token) for token in tokenized.tokens]
+            logger.debug(f"Page indexes: {page_indexes}")
             index_db.queue_documents_for_page([(tokenized.url, i) for i in page_indexes])
 
 
@@ -30,6 +30,7 @@ def run_update(index_path):
         except ValueError:
             documents = documents[:len(documents)//2]
             if len(documents) == 0:
+                print("No more space")
                 break
             print(f"Not enough space, adding {len(documents)}")
         index_db.clear_queued_documents_for_page(i)
@@ -1,6 +1,7 @@
 import argparse
 import logging
 import os
+import sys
 from multiprocessing import Process
 
 import uvicorn
@@ -14,7 +15,7 @@ from mwmbl.tinysearchengine.completer import Completer
 from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE
 from mwmbl.tinysearchengine.rank import HeuristicRanker
 
-logging.basicConfig()
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
 
 
 def setup_args():
@@ -2,6 +2,7 @@ import json
 import os
 from dataclasses import astuple, dataclass, asdict
 from io import UnsupportedOperation
+from logging import getLogger
 from mmap import mmap, PROT_READ, PROT_WRITE
 from typing import TypeVar, Generic, Callable, List
 
@@ -16,6 +17,9 @@ NUM_PAGES = 5_120_000
 PAGE_SIZE = 4096
 
 
+logger = getLogger(__name__)
+
+
 @dataclass
 class Document:
     title: str
@@ -92,6 +96,7 @@ class TinyIndex(Generic[T]):
         self.page_size = metadata.page_size
         self.compressor = ZstdCompressor()
         self.decompressor = ZstdDecompressor()
+        logger.info(f"Loaded index with {self.num_pages} pages and {self.page_size} page size")
         self.index_file = None
         self.mmap = None
 
@@ -107,13 +112,14 @@
 
     def retrieve(self, key: str) -> List[T]:
         index = self.get_key_page_index(key)
+        logger.debug(f"Retrieving index {index}")
         return self.get_page(index)
 
     def get_key_page_index(self, key) -> int:
         key_hash = mmh3.hash(key, signed=False)
         return key_hash % self.num_pages
 
-    def get_page(self, i):
+    def get_page(self, i) -> list[T]:
         """
         Get the page at index i, decompress and deserialise it using JSON
         """
@@ -123,6 +129,7 @@
     def _get_page_tuples(self, i):
         page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
         decompressed_data = self.decompressor.decompress(page_data)
+        # logger.debug(f"Decompressed data: {decompressed_data}")
        return json.loads(decompressed_data.decode('utf8'))
 
     def index(self, key: str, value: T):
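As an aside, a minimal sketch (not part of the diff) of the key-to-page mapping that get_key_page_index implements above; it reuses the NUM_PAGES constant from this file, and the example token is arbitrary.

    # Hypothetical illustration of how a token is routed to an index page.
    import mmh3

    NUM_PAGES = 5_120_000                         # as defined earlier in this diff
    key_hash = mmh3.hash("search", signed=False)  # unsigned MurmurHash3 of the token
    page_index = key_hash % NUM_PAGES             # page holding documents for "search"
    print(page_index)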