瀏覽代碼

Analyse links to find most popular ones

Daoud Clarke 3 年之前
父節點
當前提交
d77b72d7df
共有 1 個文件被更改,包括 37 次插入 和 0 次删除
  1. 37 0
      analyse/top_links.py

+ 37 - 0
analyse/top_links.py

@@ -0,0 +1,37 @@
+"""
+Analyse crawl data to find the most popular links
+"""
+import glob
+import gzip
+import json
+from collections import defaultdict
+
+from analyse.analyse_crawled_domains import CRAWL_GLOB
+
+
+def get_urls():
+    """
+    Yield a (page URL, link) pair for every link recorded in the crawl data.
+
+    Each file matching CRAWL_GLOB is read as gzip-compressed JSON containing
+    an 'items' list, where every item has a 'url' and a list of 'links'.
+    """
+    for path in glob.glob(CRAWL_GLOB):
+        # Parse the whole file inside a context manager so the handle is
+        # closed before any pairs are yielded, instead of leaving one open
+        # file per crawl file for the garbage collector to clean up.
+        with gzip.open(path) as crawl_file:
+            data = json.load(crawl_file)
+        for item in data['items']:
+            url = item['url']
+            for link in item['links']:
+                yield url, link
+
+
+def collect_links(urls):
+    """
+    Group source URLs by the link they are paired with.
+
+    `urls` is an iterable of (source URL, link) pairs. Returns a defaultdict
+    mapping each link to the set of source URLs it was paired with.
+    """
+    sources_by_link = defaultdict(set)
+    for pair in urls:
+        source, target = pair
+        sources_by_link[target].add(source)
+    return sources_by_link
+
+
+def run():
+    """
+    Print the 100 links with the most distinct source URLs.
+
+    Each output line has the form "URL <link> <count>", ordered from the
+    highest count to the lowest.
+    """
+    link_sources = collect_links(get_urls())
+    ranked = list(link_sources.items())
+    # list.sort is stable, so links with equal counts keep their original order.
+    ranked.sort(key=lambda entry: len(entry[1]), reverse=True)
+    for link, sources in ranked[:100]:
+        print("URL", link, len(sources))
+
+
+# Run the analysis only when this module is executed as a script, not on import.
+if __name__ == '__main__':
+    run()