Index Mwmbl crawled data

Daoud Clarke 2022-01-29 08:26:42 +00:00
parent ef36513f64
commit 5b89bbf05d
4 changed files with 54 additions and 2 deletions

@@ -58,6 +58,15 @@ class GzipJsonRowSerializer(Serializer):
         return [json.loads(line) for line in lines.strip().split('\n')]


+class GzipJsonBlobSerializer(Serializer):
+    def serialize(self, items: list[object]) -> bytes:
+        raise NotImplementedError("Serializer not needed - blob is generated by browser extension")
+
+    def deserialize(self, serialized_items: bytes) -> list[object]:
+        data = gzip.decompress(serialized_items).decode('utf8')
+        return json.loads(data)
+
+
 class FSQueue:
     def __init__(self, directory: Union[str, Path], name: str, serializer: Serializer):
        self.directory = str(directory)
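
For context, a minimal sketch of round-tripping a blob through the new serializer, assuming the package from this commit is importable; the sample payload is illustrative, not taken from the browser extension:

    import gzip
    import json

    from mwmbl.indexer.fsqueue import GzipJsonBlobSerializer

    # Build a gzipped JSON blob of the kind deserialize() expects.
    payload = gzip.compress(json.dumps([{'url': 'https://example.com/'}]).encode('utf8'))

    items = GzipJsonBlobSerializer().deserialize(payload)
    print(items)  # [{'url': 'https://example.com/'}]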

@@ -24,7 +24,8 @@ def is_content_token(nlp, token):
     return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop


-def tokenize(nlp, cleaned_text):
+def tokenize(nlp, input_text):
+    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
     tokens = nlp.tokenizer(cleaned_text)
     content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
                       if is_content_token(nlp, token)]
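
The new cleaning line presumably guards against text that cannot be encoded as UTF-8 (for example lone surrogates left over from decoding crawled pages); encoding with errors='replace' swaps such characters for '?' before the text reaches spaCy's tokenizer. A small illustration with a made-up string:

    # A lone surrogate such as '\udce9' cannot be encoded as UTF-8.
    raw = 'caf\udce9 latte'
    cleaned = raw.encode('utf8', 'replace').decode('utf8')
    print(cleaned)  # 'caf? latte' - now safe to pass to nlp.tokenizer()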

@@ -1,5 +1,46 @@
"""
Index data crawled through the Mwmbl crawler.
"""
from logging import getLogger

import spacy

from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
from mwmbl.indexer.index import index_titles_urls_and_extracts
from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR
from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, NUM_PAGES, PAGE_SIZE


logger = getLogger(__name__)


def index_mwmbl_crawl_data():
    nlp = spacy.load("en_core_web_sm")
    with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
        titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH)


def get_mwmbl_crawl_titles_urls_and_extracts():
    input_queue = FSQueue(DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer())
    input_queue.unlock_all()
    while True:
        try:
            next_item = input_queue.get()
        except FSQueueError as e:
            logger.exception(f'Error with item {e.item_id}')
            input_queue.error(e.item_id)
            continue
        if next_item is None:
            logger.info('No more items to process, stopping')
            break
        item_id, item_data = next_item
        logger.info(f'Processing item {item_id}')
        for item in item_data['items']:
            yield item['title'], item['url'], item['extract']
        input_queue.done(item_id)


if __name__ == '__main__':
    index_mwmbl_crawl_data()
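
From the loop above, each dequeued item is an (item_id, item_data) pair, where item_data is a dict with an 'items' list of title/url/extract records. A sketch of that expected shape and how the generator flattens it; the sample values are illustrative:

    # Shape of one crawl blob after GzipJsonBlobSerializer.deserialize():
    item_data = {
        'items': [
            {'title': 'Example page', 'url': 'https://example.com/', 'extract': 'Some extracted text.'},
        ]
    }

    # What get_mwmbl_crawl_titles_urls_and_extracts() yields for it:
    triples = [(item['title'], item['url'], item['extract']) for item in item_data['items']]
    print(triples)  # [('Example page', 'https://example.com/', 'Some extracted text.')]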

@@ -5,6 +5,7 @@ HOME = os.getenv('HOME')
 DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
 COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv'
+MWMBL_CRAWL_TERMS_PATH = DATA_DIR / 'mwmbl-craw-terms.csv'
 HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
@@ -19,6 +20,6 @@ DOMAINS_QUEUE_NAME = 'domains-queue-fs'
 DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
 DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
-INDEX_PATH = Path(__file__).parent / 'data' / 'index.tinysearch'
+INDEX_PATH = Path(__file__).parent.parent.parent / 'data' / 'index.tinysearch'
 TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json'
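
Since paths.py lives at mwmbl/indexer/paths.py (per the imports in the new module above), the INDEX_PATH change moves the index file from the package directory to the repository root; a quick check of the two expressions:

    from pathlib import Path

    here = Path('mwmbl/indexer/paths.py')
    print(here.parent / 'data' / 'index.tinysearch')                # mwmbl/indexer/data/index.tinysearch (old)
    print(here.parent.parent.parent / 'data' / 'index.tinysearch')  # data/index.tinysearch at the repo root (new)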