@@ -4,6 +4,7 @@ See how many unique URLs and root domains we have crawled.
 import glob
 import gzip
 import json
+from collections import defaultdict, Counter
 from urllib.parse import urlparse

 CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
@@ -12,24 +13,34 @@ CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"
 def get_urls():
     for path in glob.glob(CRAWL_GLOB):
         data = json.load(gzip.open(path))
+        user = data['user_id_hash']
         for item in data['items']:
-            yield item['url']
+            yield user, item['url']


 def analyse_urls(urls):
-    url_set = set()
+    url_set = defaultdict(list)
     domains = set()
-    count = 0
-    for url in urls:
-        count += 1
-        url_set.add(url)
+    for user, url in urls:
+        url_set[url].append(user)
+
         parsed_url = urlparse(url)
         path = parsed_url.path.strip('/')
         if path == '':
             domains.add(parsed_url.netloc)

+    count = sum(len(x) for x in url_set.values())
     print("Root pages crawled", sorted(domains))
+    find_worst_pages(url_set)
     print(f"Got {len(url_set)} URLs and {len(domains)} root pages from {count} items")
+    url_list_size = len(json.dumps(list(url_set.keys())))
+    print("Length of all URLs", url_list_size)
+
+
+def find_worst_pages(url_set):
+    worst = sorted(((len(users), url) for url, users in url_set.items()), reverse=True)[:50]
+    for count, url in worst:
+        print("Worst", count, url, Counter(url_set[url]))


 def run():
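
A minimal sketch of the per-user aggregation the diff above introduces, run on synthetic in-memory batches rather than the real gzipped crawl files; the 'user_id_hash', 'items' and 'url' keys are the ones get_urls() reads, while the sample data and the iter_urls() helper are illustrative assumptions only:

from collections import defaultdict, Counter

# Hypothetical stand-ins for the decoded contents of two crawl batch files.
batches = [
    {"user_id_hash": "user-a", "items": [{"url": "https://example.com/"},
                                         {"url": "https://example.com/about"}]},
    {"user_id_hash": "user-b", "items": [{"url": "https://example.com/"}]},
]

def iter_urls(batches):
    # Mirrors the updated get_urls(): yield a (user, url) pair for every crawled item.
    for batch in batches:
        user = batch["user_id_hash"]
        for item in batch["items"]:
            yield user, item["url"]

# Map each URL to the list of users who crawled it, as analyse_urls() now does.
url_users = defaultdict(list)
for user, url in iter_urls(batches):
    url_users[url].append(user)

# Most-crawled URLs first, with a per-user breakdown, in the spirit of find_worst_pages().
for count, url in sorted(((len(users), url) for url, users in url_users.items()), reverse=True):
    print(count, url, Counter(url_users[url]))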