Count unique domains instead of links

This commit is contained in:
Daoud Clarke 2022-02-02 20:09:59 +00:00
parent 26e90c6e57
commit 2fc999b402

View file

@@ -5,6 +5,7 @@ import glob
import gzip
import json
from collections import defaultdict
from urllib.parse import urlparse
from analyse.analyse_crawled_domains import CRAWL_GLOB
@@ -14,8 +15,9 @@ def get_urls():
data = json.load(gzip.open(path))
for item in data['items']:
url = item['url']
domain = urlparse(url).hostname
for link in item['links']:
-            yield url, link
+            yield domain, link
def collect_links(urls):
@@ -28,7 +30,7 @@ def collect_links(urls):
def run():
url_links = get_urls()
collected = collect_links(url_links)
-    top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:100]
+    top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:1000]
for url, items in top:
print("URL", url, len(items))