Analyse the pages that are crawled most often
commit ef36513f64
parent 70254ae160
1 changed file with 17 additions and 6 deletions
@@ -4,6 +4,7 @@ See how many unique URLs and root domains we have crawled.
 import glob
 import gzip
 import json
+from collections import defaultdict, Counter
 from urllib.parse import urlparse
 
 CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
@@ -12,24 +13,34 @@ CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
 def get_urls():
     for path in glob.glob(CRAWL_GLOB):
         data = json.load(gzip.open(path))
+        user = data['user_id_hash']
         for item in data['items']:
-            yield item['url']
+            yield user, item['url']
 
 
 def analyse_urls(urls):
-    url_set = set()
+    url_set = defaultdict(list)
     domains = set()
-    count = 0
-    for url in urls:
-        count += 1
-        url_set.add(url)
+    for user, url in urls:
+        url_set[url].append(user)
+
         parsed_url = urlparse(url)
         path = parsed_url.path.strip('/')
         if path == '':
             domains.add(parsed_url.netloc)
 
+    count = sum(len(x) for x in url_set.values())
     print("Root pages crawled", sorted(domains))
+    find_worst_pages(url_set)
     print(f"Got {len(url_set)} URLs and {len(domains)} root pages from {count} items")
+    url_list_size = len(json.dumps(list(url_set.keys())))
+    print("Length of all URLs", url_list_size)
+
+
+def find_worst_pages(url_set):
+    worst = sorted(((len(users), url) for url, users in url_set.items()), reverse=True)[:50]
+    for count, url in worst:
+        print("Worst", count, url, Counter(url_set[url]))
 
 
 def run():
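The change groups crawled URLs by the users who fetched them so that over-crawled pages can be ranked. Below is a minimal, self-contained sketch of the same grouping-and-ranking technique; the (user, url) pairs and names like "user_a" are made up and stand in for the real get_urls() output.

from collections import Counter, defaultdict

# Made-up (user_id_hash, url) pairs standing in for the real crawl data
crawled = [
    ("user_a", "https://example.com/"),
    ("user_b", "https://example.com/"),
    ("user_a", "https://example.com/"),
    ("user_c", "https://example.org/about"),
]

# Group the users who crawled each URL, as the new analyse_urls does
url_set = defaultdict(list)
for user, url in crawled:
    url_set[url].append(user)

# Rank URLs by crawl count, mirroring find_worst_pages
worst = sorted(((len(users), url) for url, users in url_set.items()), reverse=True)[:50]
for count, url in worst:
    # Counter shows which users account for the repeated crawls
    print("Worst", count, url, Counter(url_set[url]))

Running this prints each of the most-crawled URLs with its total crawl count and a per-user Counter, the same shape of output that find_worst_pages produces for the real crawl files.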