From 25918e42ef7fbfb894acc62853e862f5c59dc443 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sun, 2 Jan 2022 20:06:13 +0000
Subject: [PATCH] Export URLs to sqlite for evaluation purposes

---
 analyse/export_urls.py        | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 mwmbl/indexer/paths.py        |  1 +
 mwmbl/tinysearchengine/app.py | 21 +++++++++++++--------
 3 files changed, 63 insertions(+), 8 deletions(-)
 create mode 100644 analyse/export_urls.py

diff --git a/analyse/export_urls.py b/analyse/export_urls.py
new file mode 100644
index 0000000..129db83
--- /dev/null
+++ b/analyse/export_urls.py
@@ -0,0 +1,49 @@
+"""
+Export the list of unique URLs to a SQLite file for analysis/evaluation.
+"""
+import sqlite3
+from contextlib import closing
+
+from mwmbl.indexer.paths import URLS_PATH
+from mwmbl.tinysearchengine.app import get_config_and_index
+
+
+def create_database():
+    """Create the urls table in the database at URLS_PATH if it is missing."""
+    # The sqlite3 connection context manager only commits or rolls back;
+    # closing() is what actually closes the connection afterwards.
+    with closing(sqlite3.connect(URLS_PATH)) as connection, connection:
+        # IF NOT EXISTS lets the export be re-run against an existing file.
+        connection.execute("""
+        CREATE TABLE IF NOT EXISTS urls (url TEXT PRIMARY KEY)
+        """)
+
+
+def get_url_batches():
+    """Yield the list of URLs stored in each index page that is not None."""
+    config, index = get_config_and_index()
+    for page_num in range(config.index_config.num_pages):
+        if page_num % 1000 == 0:
+            print("Processing page", page_num)
+        page = index.get_page(page_num)
+        if page is None:
+            continue
+        yield [url for title, url, extract in page]
+
+
+def run():
+    """Create the database, then insert the URLs from every index page."""
+    create_database()
+    url_batches = get_url_batches()
+
+    with closing(sqlite3.connect(URLS_PATH)) as connection, connection:
+        for url_batch in url_batches:
+            parameters = [(url,) for url in url_batch]
+            # OR IGNORE skips URLs already present (url is the primary key).
+            connection.executemany("""
+            INSERT OR IGNORE INTO urls VALUES (?)
+            """, parameters)
+
+
+if __name__ == '__main__':
+    run()
diff --git a/mwmbl/indexer/paths.py b/mwmbl/indexer/paths.py
index 7f02121..f9cf1e0 100644
--- a/mwmbl/indexer/paths.py
+++ b/mwmbl/indexer/paths.py
@@ -14,6 +14,7 @@
 TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
 WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
+URLS_PATH = os.path.join(DATA_DIR, 'urls.sqlite3')
 DOMAINS_QUEUE_NAME = 'domains-queue-fs'
 DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
 DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
diff --git a/mwmbl/tinysearchengine/app.py b/mwmbl/tinysearchengine/app.py
index 08f0734..e692284 100644
--- a/mwmbl/tinysearchengine/app.py
+++ b/mwmbl/tinysearchengine/app.py
@@ -26,14 +26,7 @@ def main():
     * Initialize a FastAPI app instance
     * Starts uvicorn server using app instance
     """
-    args = setup_args()
-    config = parse_config_file(config_filename=args.config)
-
-    # Initialize TinyIndex using index config params
-    tiny_index = TinyIndex(
-        item_factory=Document,
-        **config.index_config.dict()
-    )
+    config, tiny_index = get_config_and_index()
 
     # Initialize FastApi instance
     app = create_app.create(tiny_index)
@@ -42,5 +35,17 @@ def main():
     uvicorn.run(app, **config.server_config.dict())
 
 
+def get_config_and_index():
+    """Return the parsed config and a TinyIndex built from its index_config."""
+    args = setup_args()
+    config = parse_config_file(config_filename=args.config)
+    # Initialize TinyIndex using index config params
+    tiny_index = TinyIndex(
+        item_factory=Document,
+        **config.index_config.dict()
+    )
+    return config, tiny_index
+
+
 if __name__ == "__main__":
     main()