From ef36513f642b0bbed3234ad5d725733cfa24c5cf Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 29 Jan 2022 07:06:53 +0000
Subject: [PATCH] Analyse the pages that are crawled most often

---
 analyse/analyse_crawled_domains.py | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/analyse/analyse_crawled_domains.py b/analyse/analyse_crawled_domains.py
index 45b3275..8641f61 100644
--- a/analyse/analyse_crawled_domains.py
+++ b/analyse/analyse_crawled_domains.py
@@ -4,6 +4,7 @@ See how many unique URLs and root domains we have crawled.
 import glob
 import gzip
 import json
+from collections import defaultdict, Counter
 from urllib.parse import urlparse
 
 CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
@@ -12,24 +13,34 @@ CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
 def get_urls():
     for path in glob.glob(CRAWL_GLOB):
         data = json.load(gzip.open(path))
+        user = data['user_id_hash']
         for item in data['items']:
-            yield item['url']
+            yield user, item['url']
 
 
 def analyse_urls(urls):
-    url_set = set()
+    url_set = defaultdict(list)
     domains = set()
-    count = 0
-    for url in urls:
-        count += 1
-        url_set.add(url)
+    for user, url in urls:
+        url_set[url].append(user)
+
         parsed_url = urlparse(url)
         path = parsed_url.path.strip('/')
         if path == '':
             domains.add(parsed_url.netloc)
 
+    count = sum(len(x) for x in url_set.values())
     print("Root pages crawled", sorted(domains))
+    find_worst_pages(url_set)
     print(f"Got {len(url_set)} URLs and {len(domains)} root pages from {count} items")
+    url_list_size = len(json.dumps(list(url_set.keys())))
+    print("Length of all URLs", url_list_size)
+
+
+def find_worst_pages(url_set):
+    worst = sorted(((len(users), url) for url, users in url_set.items()), reverse=True)[:50]
+    for count, url in worst:
+        print("Worst", count, url, Counter(url_set[url]))
 
 
 def run():
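
Note (not part of the patch): the sketch below reproduces the per-URL user counting and "worst pages" ranking that this change introduces, run against a small in-memory sample instead of the .json.gz crawl dumps. The keys 'user_id_hash', 'items' and 'url' are taken from the patch; the sample user hashes and URLs are invented for illustration.

# Self-contained sketch of the counting logic added in analyse_urls()/find_worst_pages().
from collections import Counter, defaultdict

# Two crawl batches in the shape get_urls() reads from the dumps (sample data only).
sample_batches = [
    {"user_id_hash": "user-a", "items": [{"url": "https://example.com/"},
                                         {"url": "https://example.com/page"}]},
    {"user_id_hash": "user-b", "items": [{"url": "https://example.com/page"}]},
]

# Map each URL to the users that crawled it, as the new analyse_urls() does.
url_users = defaultdict(list)
for batch in sample_batches:
    for item in batch["items"]:
        url_users[item["url"]].append(batch["user_id_hash"])

# Rank URLs by how often they were crawled, as find_worst_pages() does.
worst = sorted(((len(users), url) for url, users in url_users.items()), reverse=True)[:50]
for count, url in worst:
    print("Worst", count, url, Counter(url_users[url]))
# Expected output:
#   Worst 2 https://example.com/page Counter({'user-a': 1, 'user-b': 1})
#   Worst 1 https://example.com/ Counter({'user-a': 1})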