From 2fc999b4027978cd313f5abfd5ce1c7208b91cc9 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Wed, 2 Feb 2022 20:09:59 +0000 Subject: [PATCH] Count unique domains instead of links --- analyse/top_links.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/analyse/top_links.py b/analyse/top_links.py index 466d4a9..012c44f 100644 --- a/analyse/top_links.py +++ b/analyse/top_links.py @@ -5,6 +5,7 @@ import glob import gzip import json from collections import defaultdict +from urllib.parse import urlparse from analyse.analyse_crawled_domains import CRAWL_GLOB @@ -14,8 +15,9 @@ def get_urls(): data = json.load(gzip.open(path)) for item in data['items']: url = item['url'] + domain = urlparse(url).hostname for link in item['links']: - yield url, link + yield domain, link def collect_links(urls): @@ -28,7 +30,7 @@ def collect_links(urls): def run(): url_links = get_urls() collected = collect_links(url_links) - top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:100] + top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:1000] for url, items in top: print("URL", url, len(items))