From 5b89bbf05d2b67e9071265ae44a609e58e029d73 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 29 Jan 2022 08:26:42 +0000
Subject: [PATCH] Index Mwmbl crawled data

---
 mwmbl/indexer/fsqueue.py     |  9 ++++++++
 mwmbl/indexer/index.py       |  3 ++-
 mwmbl/indexer/index_crawl.py | 41 ++++++++++++++++++++++++++++++++++++
 mwmbl/indexer/paths.py       |  3 ++-
 4 files changed, 54 insertions(+), 2 deletions(-)

diff --git a/mwmbl/indexer/fsqueue.py b/mwmbl/indexer/fsqueue.py
index d81d869..f05b2d4 100644
--- a/mwmbl/indexer/fsqueue.py
+++ b/mwmbl/indexer/fsqueue.py
@@ -58,6 +58,15 @@ class GzipJsonRowSerializer(Serializer):
         return [json.loads(line) for line in lines.strip().split('\n')]
 
 
+class GzipJsonBlobSerializer(Serializer):
+    def serialize(self, items: list[object]) -> bytes:
+        raise NotImplementedError("Serializer not needed - blob is generated by browser extension")
+
+    def deserialize(self, serialized_items: bytes) -> list[object]:
+        data = gzip.decompress(serialized_items).decode('utf8')
+        return json.loads(data)
+
+
 class FSQueue:
     def __init__(self, directory: Union[str, Path], name: str, serializer: Serializer):
         self.directory = str(directory)
diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py
index d0f0efe..d8aef72 100644
--- a/mwmbl/indexer/index.py
+++ b/mwmbl/indexer/index.py
@@ -24,7 +24,8 @@ def is_content_token(nlp, token):
     return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop
 
 
-def tokenize(nlp, cleaned_text):
+def tokenize(nlp, input_text):
+    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
     tokens = nlp.tokenizer(cleaned_text)
     content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
                       if is_content_token(nlp, token)]
diff --git a/mwmbl/indexer/index_crawl.py b/mwmbl/indexer/index_crawl.py
index bc4a409..3c30482 100644
--- a/mwmbl/indexer/index_crawl.py
+++ b/mwmbl/indexer/index_crawl.py
@@ -1,5 +1,46 @@
 """
 Index data crawled through the Mwmbl crawler.
""" +from logging import getLogger + +import spacy + +from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError +from mwmbl.indexer.index import index_titles_urls_and_extracts +from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR +from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, NUM_PAGES, PAGE_SIZE +logger = getLogger(__name__) + + +def index_mwmbl_craw_data(): + nlp = spacy.load("en_core_web_sm") + + with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer: + titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts() + index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH) + + +def get_mwmbl_crawl_titles_urls_and_extracts(): + input_queue = FSQueue(DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer()) + input_queue.unlock_all() + while True: + try: + next_item = input_queue.get() + except FSQueueError as e: + logger.exception(f'Error with item {e.item_id}') + input_queue.error(e.item_id) + continue + if next_item is None: + logger.info('Not more items to process, stopping') + break + item_id, item_data = next_item + logger.info(f'Processing item {item_id}') + for item in item_data['items']: + yield item['title'], item['url'], item['extract'] + input_queue.done(item_id) + + +if __name__ == '__main__': + index_mwmbl_craw_data() diff --git a/mwmbl/indexer/paths.py b/mwmbl/indexer/paths.py index 41e4672..75e81e8 100644 --- a/mwmbl/indexer/paths.py +++ b/mwmbl/indexer/paths.py @@ -5,6 +5,7 @@ HOME = os.getenv('HOME') DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch' COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv' +MWMBL_CRAWL_TERMS_PATH = DATA_DIR / 'mwmbl-craw-terms.csv' HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv') CRAWL_PREFIX = 'crawl_' @@ -19,6 +20,6 @@ DOMAINS_QUEUE_NAME = 'domains-queue-fs' DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs' DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz') -INDEX_PATH = Path(__file__).parent / 'data' / 'index.tinysearch' +INDEX_PATH = Path(__file__).parent.parent.parent / 'data' / 'index.tinysearch' TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json'