Analyse the pages that are crawled most often

Daoud Clarke 2022-01-29 07:06:53 +00:00
parent 70254ae160
commit ef36513f64


@@ -4,6 +4,7 @@ See how many unique URLs and root domains we have crawled.
 import glob
 import gzip
 import json
+from collections import defaultdict, Counter
 from urllib.parse import urlparse
 
 CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
@@ -12,24 +13,34 @@ CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
 def get_urls():
     for path in glob.glob(CRAWL_GLOB):
         data = json.load(gzip.open(path))
+        user = data['user_id_hash']
         for item in data['items']:
-            yield item['url']
+            yield user, item['url']
 
 
 def analyse_urls(urls):
-    url_set = set()
+    url_set = defaultdict(list)
     domains = set()
-    count = 0
-    for url in urls:
-        count += 1
-        url_set.add(url)
+    for user, url in urls:
+        url_set[url].append(user)
         parsed_url = urlparse(url)
         path = parsed_url.path.strip('/')
         if path == '':
             domains.add(parsed_url.netloc)
 
+    count = sum(len(x) for x in url_set.values())
     print("Root pages crawled", sorted(domains))
+    find_worst_pages(url_set)
     print(f"Got {len(url_set)} URLs and {len(domains)} root pages from {count} items")
 
     url_list_size = len(json.dumps(list(url_set.keys())))
     print("Length of all URLs", url_list_size)
+
+
+def find_worst_pages(url_set):
+    worst = sorted(((len(users), url) for url, users in url_set.items()), reverse=True)[:50]
+    for count, url in worst:
+        print("Worst", count, url, Counter(url_set[url]))
 
 
 def run():
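
A minimal sketch of the logic this commit adds, run on in-memory batches rather than the gzipped JSON files matched by CRAWL_GLOB. The batch shape (a user_id_hash key plus items carrying url fields) is assumed from the fields the script reads; real crawl batches may contain more keys.

# Sketch only: mirrors the updated get_urls / find_worst_pages behaviour
# on hypothetical hard-coded batches.
from collections import Counter, defaultdict

batches = [
    {"user_id_hash": "user-a", "items": [{"url": "https://example.com/"},
                                         {"url": "https://example.com/page"}]},
    {"user_id_hash": "user-b", "items": [{"url": "https://example.com/"}]},
    {"user_id_hash": "user-a", "items": [{"url": "https://example.com/"}]},
]

def get_urls(batches):
    # Yield (user, url) pairs, as the updated get_urls now does.
    for data in batches:
        user = data["user_id_hash"]
        for item in data["items"]:
            yield user, item["url"]

# Group the users that crawled each URL, then report the most-crawled
# URLs with a per-user breakdown.
url_users = defaultdict(list)
for user, url in get_urls(batches):
    url_users[url].append(user)

worst = sorted(((len(users), url) for url, users in url_users.items()), reverse=True)[:50]
for count, url in worst:
    print("Worst", count, url, Counter(url_users[url]))
# Worst 3 https://example.com/ Counter({'user-a': 2, 'user-b': 1})
# Worst 1 https://example.com/page Counter({'user-a': 1})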