Analyse links to find most popular ones

This commit is contained in:
Daoud Clarke 2022-02-02 19:47:38 +00:00
parent fe6ace93e6
commit d77b72d7df

37
analyse/top_links.py Normal file
View file

@@ -0,0 +1,37 @@
"""
Analyse crawl data to find the most popular links
"""
import glob
import gzip
import json
from collections import defaultdict
from analyse.analyse_crawled_domains import CRAWL_GLOB
def get_urls():
    """Yield (source_url, link) pairs from every crawl batch file.

    Scans all gzipped JSON crawl files matching CRAWL_GLOB and yields one
    pair for each outgoing link of each crawled page.

    Yields:
        tuple[str, str]: (url of the crawled page, link found on that page)
    """
    for path in glob.glob(CRAWL_GLOB):
        # Use a context manager so each gzip handle is closed promptly;
        # the original left every handle open until garbage collection.
        # 'rt' decodes to text, which is what json.load expects.
        with gzip.open(path, 'rt') as crawl_file:
            data = json.load(crawl_file)
        for item in data['items']:
            url = item['url']
            for link in item['links']:
                yield url, link
def collect_links(urls):
    """Group crawled pages by the link they point at.

    Args:
        urls: iterable of (source_url, link) pairs.

    Returns:
        defaultdict mapping each link to the set of source URLs
        that reference it.
    """
    sources_by_link = defaultdict(set)
    for source, target in urls:
        sources_by_link[target].add(source)
    return sources_by_link
def run():
    """Print the 100 most-linked-to URLs and their inbound-link counts."""
    collected = collect_links(get_urls())
    # Rank links by how many distinct pages reference them, most first.
    ranked = sorted(collected.items(), key=lambda entry: len(entry[1]), reverse=True)
    for link, sources in ranked[:100]:
        print("URL", link, len(sources))
# Script entry point: run the analysis when executed directly.
if __name__ == '__main__':
    run()