diff --git a/analyse/analyse_crawled_domains.py b/analyse/analyse_crawled_domains.py
new file mode 100644
index 0000000..45b3275
--- /dev/null
+++ b/analyse/analyse_crawled_domains.py
@@ -0,0 +1,42 @@
+"""
+See how many unique URLs and root domains we have crawled.
+"""
+import glob
+import gzip
+import json
+from urllib.parse import urlparse
+
+CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
+
+
+def get_urls():
+    for path in glob.glob(CRAWL_GLOB):
+        data = json.load(gzip.open(path))
+        for item in data['items']:
+            yield item['url']
+
+
+def analyse_urls(urls):
+    url_set = set()
+    domains = set()
+    count = 0
+    for url in urls:
+        count += 1
+        url_set.add(url)
+        parsed_url = urlparse(url)
+        path = parsed_url.path.strip('/')
+        if path == '':
+            domains.add(parsed_url.netloc)
+
+    print("Root pages crawled", sorted(domains))
+    print(f"Got {len(url_set)} URLs and {len(domains)} root pages from {count} items")
+
+
+def run():
+    urls = get_urls()
+    analyse_urls(urls)
+
+
+if __name__ == '__main__':
+    run()
+
diff --git a/mwmbl/indexer/index_crawl.py b/mwmbl/indexer/index_crawl.py
new file mode 100644
index 0000000..bc4a409
--- /dev/null
+++ b/mwmbl/indexer/index_crawl.py
@@ -0,0 +1,5 @@
+"""
+Index data crawled through the Mwmbl crawler.
+"""
+
+
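
For reviewers, a minimal sketch of the root-page heuristic that analyse_urls() relies on: a URL counts as a root page when its path is empty after stripping slashes. The synthetic batch below, and the {"items": [{"url": ...}]} record shape, are assumptions inferred from this diff rather than a documented crawler format.

"""
Sketch (not part of the diff): exercises the root-page heuristic from
analyse_urls() against a synthetic crawl batch. The gzipped-JSON batch
shape {"items": [{"url": ...}]} is assumed from the code above.
"""
import gzip
import json
import tempfile
from pathlib import Path
from urllib.parse import urlparse

batch = {"items": [
    {"url": "https://example.com/"},       # root page: path strips to ""
    {"url": "https://example.com/about"},  # not a root page
    {"url": "https://example.org"},        # root page: path is already ""
]}

with tempfile.TemporaryDirectory() as tmp:
    # Write one batch the same way the crawler output is read above.
    path = Path(tmp) / "batch.json.gz"
    with gzip.open(path, "wt") as f:
        json.dump(batch, f)

    # Re-read it as get_urls() does and apply the same root-page test.
    data = json.load(gzip.open(path))
    urls = [item["url"] for item in data["items"]]
    domains = {urlparse(u).netloc for u in urls
               if urlparse(u).path.strip("/") == ""}
    print(sorted(domains))  # ['example.com', 'example.org']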