Count unique domains instead of links

This commit is contained in:
Daoud Clarke 2022-02-02 20:09:59 +00:00
parent 26e90c6e57
commit 2fc999b402

View file

@@ -5,6 +5,7 @@ import glob
import gzip
import json
from collections import defaultdict
from urllib.parse import urlparse
from analyse.analyse_crawled_domains import CRAWL_GLOB
@@ -14,8 +15,9 @@ def get_urls():
data = json.load(gzip.open(path))
for item in data['items']:
url = item['url']
domain = urlparse(url).hostname
for link in item['links']:
-            yield url, link
+            yield domain, link
def collect_links(urls):
@@ -28,7 +30,7 @@ def collect_links(urls):
def run():
url_links = get_urls()
collected = collect_links(url_links)
-    top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:100]
+    top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:1000]
for url, items in top:
print("URL", url, len(items))