Count unique domains instead of links
commit 2fc999b402
parent 26e90c6e57

1 changed file with 4 additions and 2 deletions
@@ -5,6 +5,7 @@ import glob
 import gzip
 import json
 from collections import defaultdict
+from urllib.parse import urlparse
 
 from analyse.analyse_crawled_domains import CRAWL_GLOB
 
@@ -14,8 +15,9 @@ def get_urls():
         data = json.load(gzip.open(path))
         for item in data['items']:
             url = item['url']
+            domain = urlparse(url).hostname
             for link in item['links']:
-                yield url, link
+                yield domain, link
 
 
 def collect_links(urls):
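A note on the new domain extraction: urlparse(url).hostname lowercases the host and strips any port, and it returns None when the URL carries no scheme, so any scheme-less URLs in the crawl data would be grouped under a single None key. A quick check:

    >>> from urllib.parse import urlparse
    >>> urlparse('https://Example.COM:8080/page').hostname
    'example.com'
    >>> print(urlparse('page.html').hostname)
    None
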
@@ -28,7 +30,7 @@ def collect_links(urls):
 
 def run():
     url_links = get_urls()
     collected = collect_links(url_links)
-    top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:100]
+    top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:1000]
     for url, items in top:
         print("URL", url, len(items))
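For context, a minimal sketch of the whole script after this commit. Only get_urls() and run() appear in the diff; collect_links() and the value of CRAWL_GLOB are assumptions here, reconstructed from how run() uses them.

    # Minimal sketch of the changed script; only get_urls() and run() appear
    # in the diff above. collect_links() and the CRAWL_GLOB pattern are
    # assumptions.
    import glob
    import gzip
    import json
    from collections import defaultdict
    from urllib.parse import urlparse

    # Assumption: CRAWL_GLOB matches gzipped JSON crawl batches. In the real
    # code it is imported from analyse.analyse_crawled_domains.
    CRAWL_GLOB = 'crawl/*.json.gz'


    def get_urls():
        for path in glob.glob(CRAWL_GLOB):
            data = json.load(gzip.open(path))
            for item in data['items']:
                url = item['url']
                # Key each outgoing link by the source domain rather than
                # the full source URL; this is what the commit changes.
                domain = urlparse(url).hostname
                for link in item['links']:
                    yield domain, link


    def collect_links(urls):
        # Assumed implementation: map each link to the set of domains that
        # reference it; the set deduplicates repeat links from one domain.
        collected = defaultdict(set)
        for domain, link in urls:
            collected[link].add(domain)
        return collected


    def run():
        url_links = get_urls()
        collected = collect_links(url_links)
        # Rank links by how many distinct source domains point at them.
        top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:1000]
        for url, items in top:
            print("URL", url, len(items))


    if __name__ == '__main__':
        run()

With sets as the assumed values, len(x[1]) in run() counts distinct referring domains rather than raw link occurrences, which matches the commit title.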