From 5b89bbf05d2b67e9071265ae44a609e58e029d73 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 29 Jan 2022 08:26:42 +0000
Subject: [PATCH] Index Mwmbl crawled data

---
 mwmbl/indexer/fsqueue.py     |  9 ++++++++
 mwmbl/indexer/index.py       |  3 ++-
 mwmbl/indexer/index_crawl.py | 41 ++++++++++++++++++++++++++++++++++++
 mwmbl/indexer/paths.py       |  3 ++-
 4 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/mwmbl/indexer/fsqueue.py b/mwmbl/indexer/fsqueue.py
index d81d869..f05b2d4 100644
--- a/mwmbl/indexer/fsqueue.py
+++ b/mwmbl/indexer/fsqueue.py
@@ -58,6 +58,15 @@ class GzipJsonRowSerializer(Serializer):
         return [json.loads(line) for line in lines.strip().split('\n')]
 
 
+class GzipJsonBlobSerializer(Serializer):
+    def serialize(self, items: list[object]) -> bytes:
+        raise NotImplementedError("Serializer not needed - blob is generated by browser extension")
+
+    def deserialize(self, serialized_items: bytes) -> list[object]:
+        data = gzip.decompress(serialized_items).decode('utf8')
+        return json.loads(data)
+
+
 class FSQueue:
     def __init__(self, directory: Union[str, Path], name: str, serializer: Serializer):
         self.directory = str(directory)
diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py
index d0f0efe..d8aef72 100644
--- a/mwmbl/indexer/index.py
+++ b/mwmbl/indexer/index.py
@@ -24,7 +24,8 @@ def is_content_token(nlp, token):
     return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop
 
 
-def tokenize(nlp, cleaned_text):
+def tokenize(nlp, input_text):
+    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
     tokens = nlp.tokenizer(cleaned_text)
     content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
                       if is_content_token(nlp, token)]
diff --git a/mwmbl/indexer/index_crawl.py b/mwmbl/indexer/index_crawl.py
index bc4a409..3c30482 100644
--- a/mwmbl/indexer/index_crawl.py
+++ b/mwmbl/indexer/index_crawl.py
@@ -1,5 +1,46 @@
 """
 Index data crawled through the Mwmbl crawler.
""" +from logging import getLogger + +import spacy + +from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError +from mwmbl.indexer.index import index_titles_urls_and_extracts +from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR +from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, NUM_PAGES, PAGE_SIZE +logger = getLogger(__name__) + + +def index_mwmbl_craw_data(): + nlp = spacy.load("en_core_web_sm") + + with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer: + titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts() + index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH) + + +def get_mwmbl_crawl_titles_urls_and_extracts(): + input_queue = FSQueue(DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer()) + input_queue.unlock_all() + while True: + try: + next_item = input_queue.get() + except FSQueueError as e: + logger.exception(f'Error with item {e.item_id}') + input_queue.error(e.item_id) + continue + if next_item is None: + logger.info('Not more items to process, stopping') + break + item_id, item_data = next_item + logger.info(f'Processing item {item_id}') + for item in item_data['items']: + yield item['title'], item['url'], item['extract'] + input_queue.done(item_id) + + +if __name__ == '__main__': + index_mwmbl_craw_data() diff --git a/mwmbl/indexer/paths.py b/mwmbl/indexer/paths.py index 41e4672..75e81e8 100644 --- a/mwmbl/indexer/paths.py +++ b/mwmbl/indexer/paths.py @@ -5,6 +5,7 @@ HOME = os.getenv('HOME') DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch' COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv' +MWMBL_CRAWL_TERMS_PATH = DATA_DIR / 'mwmbl-craw-terms.csv' HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv') CRAWL_PREFIX = 'crawl_' @@ -19,6 +20,6 @@ DOMAINS_QUEUE_NAME = 'domains-queue-fs' DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs' DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz') -INDEX_PATH = Path(__file__).parent / 'data' / 'index.tinysearch' +INDEX_PATH = Path(__file__).parent.parent.parent / 'data' / 'index.tinysearch' TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json'