From 171fa645d2f8dc00eec452859357e514eb9fb018 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sun, 23 Jan 2022 22:04:30 +0000
Subject: [PATCH] Add script to export top domains

---
Reviewer notes (ignored by `git am`, which skips everything between the
`---` marker and the first `diff --git` line):

* analyse/export_top_domains.py is a new script whose single function,
  export_top_domains_to_json(), opens TOP_DOMAINS_JSON_PATH for writing and
  serialises DOMAINS into it with json.dump(..., indent=2).
* mwmbl/indexer/paths.py gains the TOP_DOMAINS_JSON_PATH constant, built as
  DATA_DIR / 'hn-top-domains.json'.
* NOTE(review): the line structure of this patch was restored from a
  whitespace-collapsed copy. The blank context line between DOMAINS_PATH and
  INDEX_PATH in the paths.py hunk is inferred from the hunk header counts
  (-20,3 +20,5 with two added lines needs three context lines) -- confirm
  against the original commit before applying.

 analyse/export_top_domains.py | 13 +++++++++++++
 mwmbl/indexer/paths.py        |  2 ++
 2 files changed, 15 insertions(+)
 create mode 100644 analyse/export_top_domains.py

diff --git a/analyse/export_top_domains.py b/analyse/export_top_domains.py
new file mode 100644
index 0000000..2794804
--- /dev/null
+++ b/analyse/export_top_domains.py
@@ -0,0 +1,13 @@
+import json
+
+from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
+from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS
+
+
+def export_top_domains_to_json():
+    with open(TOP_DOMAINS_JSON_PATH, 'w') as output_file:
+        json.dump(DOMAINS, output_file, indent=2)
+
+
+if __name__ == '__main__':
+    export_top_domains_to_json()
diff --git a/mwmbl/indexer/paths.py b/mwmbl/indexer/paths.py
index f9cf1e0..41e4672 100644
--- a/mwmbl/indexer/paths.py
+++ b/mwmbl/indexer/paths.py
@@ -20,3 +20,5 @@ DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
 DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
 
 INDEX_PATH = Path(__file__).parent / 'data' / 'index.tinysearch'
+
+TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json'