"""
Analyse crawl data to find the most popular links
"""
import glob
import gzip
import json
from collections import defaultdict

from analyse.analyse_crawled_domains import CRAWL_GLOB


def get_urls():
    """Yield ``(url, link)`` pairs from every crawl file matching CRAWL_GLOB.

    Each matching file is read as gzip-compressed JSON. The decoded document
    is indexed by ``'items'``; every item is indexed by ``'url'`` and
    ``'links'``, and one pair is yielded per entry in ``'links'``.

    Yields:
        Tuples of (the item's ``'url'`` value, one of its ``'links'`` entries).
    """
    for path in glob.glob(CRAWL_GLOB):
        # Parse inside the ``with`` so the gzip handle is closed before any
        # pair is yielded; otherwise one handle per crawl file stays open
        # until the garbage collector gets round to it.
        with gzip.open(path) as crawl_file:
            data = json.load(crawl_file)
        for item in data['items']:
            url = item['url']
            for link in item['links']:
                yield url, link


def collect_links(urls):
    """Group source URLs by the link they were paired with.

    Args:
        urls: Iterable of ``(url, link)`` pairs.

    Returns:
        A ``defaultdict(set)`` mapping each link to the set of distinct URLs
        it was paired with.
    """
    links = defaultdict(set)
    for url, link in urls:
        links[link].add(url)
    return links


def run():
    """Print the 100 links that are paired with the most distinct URLs."""
    url_links = get_urls()
    collected = collect_links(url_links)
    # Rank by the number of distinct source URLs, most popular first.
    top = sorted(
        collected.items(),
        key=lambda entry: len(entry[1]),
        reverse=True,
    )[:100]
    for link, sources in top:
        print("URL", link, len(sources))


if __name__ == '__main__':
    run()