Analyse links to find most popular ones
This commit is contained in:
parent
fe6ace93e6
commit
d77b72d7df
1 changed file with 37 additions and 0 deletions
37
analyse/top_links.py
Normal file
37
analyse/top_links.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
"""
|
||||
Analyse crawl data to find the most popular links
|
||||
"""
|
||||
import glob
|
||||
import gzip
|
||||
import json
|
||||
from collections import defaultdict
|
||||
|
||||
from analyse.analyse_crawled_domains import CRAWL_GLOB
|
||||
|
||||
|
||||
def get_urls():
    """Yield (page_url, outgoing_link) pairs from every crawl archive.

    Iterates the gzipped JSON files matched by CRAWL_GLOB. Each archive is
    expected to contain an 'items' list, where every item carries the page's
    'url' and the list of 'links' found on that page.

    Yields:
        tuple[str, str]: (url of the crawled page, link found on that page)
    """
    for path in glob.glob(CRAWL_GLOB):
        # Use a context manager so the gzip handle is closed promptly; the
        # original passed gzip.open(path) straight to json.load and leaked
        # the file object. 'rt' decodes to text for json.load.
        with gzip.open(path, 'rt') as crawl_file:
            data = json.load(crawl_file)
        for item in data['items']:
            url = item['url']
            for link in item['links']:
                yield url, link
|
||||
|
||||
|
||||
def collect_links(urls):
    """Group referring page URLs by the link they point at.

    Args:
        urls: iterable of (page_url, link) pairs.

    Returns:
        defaultdict(set) mapping each link to the set of page URLs that
        reference it.
    """
    link_sources = defaultdict(set)
    for source_url, target_link in urls:
        link_sources[target_link].add(source_url)
    return link_sources
|
||||
|
||||
|
||||
def run(num_links=100):
    """Print the most popular links, ranked by distinct referring pages.

    Args:
        num_links: how many of the most-referenced links to print.
            Defaults to 100, matching the original hard-coded limit.
    """
    collected = collect_links(get_urls())
    # Rank links by the number of distinct pages that reference them,
    # most-referenced first.
    ranked = sorted(collected.items(), key=lambda entry: len(entry[1]), reverse=True)
    # Each entry is (link, set of referring page URLs) — the original loop
    # named these `url, items`, which misread the mapping's direction.
    for link, referrers in ranked[:num_links]:
        print("URL", link, len(referrers))
|
||||
|
||||
|
||||
# Script entry point: analyse the crawl data and print the top links.
if __name__ == '__main__':
    run()
|
Loading…
Add table
Reference in a new issue