Merge pull request #39 from mwmbl/analyse-links

Analyse links
This commit is contained in:
Daoud Clarke 2022-02-03 19:33:52 +00:00 committed by GitHub
commit bdf0fd1797
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

39
analyse/top_links.py Normal file
View file

@ -0,0 +1,39 @@
"""
Analyse crawl data to find the most popular links
"""
import glob
import gzip
import json
from collections import defaultdict
from urllib.parse import urlparse
from analyse.analyse_crawled_domains import CRAWL_GLOB
def get_urls(crawl_glob=None):
    """
    Yield ``(domain, link)`` pairs from gzipped JSON crawl files.

    Each matching file is expected to decode to a JSON object with an
    ``'items'`` list; every item must have a ``'url'`` string and a
    ``'links'`` iterable.

    :param crawl_glob: glob pattern selecting the crawl files to read.
        Defaults to ``CRAWL_GLOB`` when omitted or ``None``.
    :return: a generator of ``(domain, link)`` tuples, where ``domain`` is
        the hostname of the item's own URL (``None`` if the URL has no
        hostname) and ``link`` is one entry of the item's ``'links'``.
    :raises OSError: if a matched file cannot be opened or is not valid gzip.
    :raises KeyError: if an expected key is missing from the JSON data.
    """
    pattern = CRAWL_GLOB if crawl_glob is None else crawl_glob
    for path in glob.glob(pattern):
        # Use a context manager so each file handle is closed as soon as it
        # has been parsed, instead of leaking one handle per crawl file.
        with gzip.open(path) as crawl_file:
            data = json.load(crawl_file)
        for item in data['items']:
            domain = urlparse(item['url']).hostname
            for link in item['links']:
                yield domain, link
def collect_links(urls):
    """
    Group the sources of each link.

    :param urls: iterable of ``(source, link)`` pairs.
    :return: a ``defaultdict`` mapping each link to the set of distinct
        sources that were paired with it.
    """
    sources_by_link = defaultdict(set)
    for source, target in urls:
        # Sets de-duplicate, so a source repeating the same link counts once.
        referrers = sources_by_link[target]
        referrers.add(source)
    return sources_by_link
def run():
    """
    Print the 1000 links that have the most distinct linking sources.

    Reads the crawl data via ``get_urls``, groups it with ``collect_links``
    and writes one ``URL <link> <count>`` line per link to stdout, ordered
    from the largest count to the smallest.
    """
    collected = collect_links(get_urls())

    def source_count(entry):
        # entry is a (link, sources) pair; rank by the number of sources.
        return len(entry[1])

    ranked = sorted(collected.items(), key=source_count, reverse=True)
    for link, sources in ranked[:1000]:
        print("URL", link, len(sources))
# Only run the analysis when executed as a script, not when imported.
if __name__ == "__main__":
    run()