From 2fc999b4027978cd313f5abfd5ce1c7208b91cc9 Mon Sep 17 00:00:00 2001
From: Daoud Clarke <daoud.clarke@gmail.com>
Date: Wed, 2 Feb 2022 20:09:59 +0000
Subject: [PATCH] Count unique domains instead of links; show top 1000

---
 analyse/top_links.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/analyse/top_links.py b/analyse/top_links.py
index 466d4a9..012c44f 100644
--- a/analyse/top_links.py
+++ b/analyse/top_links.py
@@ -5,6 +5,7 @@ import glob
 import gzip
 import json
 from collections import defaultdict
+from urllib.parse import urlparse
 
 from analyse.analyse_crawled_domains import CRAWL_GLOB
 
@@ -14,8 +15,9 @@ def get_urls():
         data = json.load(gzip.open(path))
         for item in data['items']:
             url = item['url']
+            domain = urlparse(url).hostname
             for link in item['links']:
-                yield url, link
+                yield domain, link
 
 
 def collect_links(urls):
@@ -28,7 +30,7 @@ def collect_links(urls):
 def run():
     url_links = get_urls()
     collected = collect_links(url_links)
-    top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:100]
+    top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:1000]
     for url, items in top:
         print("URL", url, len(items))