mwmbl/analyse/record_historical_batches.py
2022-06-09 22:24:54 +01:00

57 lines
1.3 KiB
Python

"""
Upload previously crawled batches to the historical batches API endpoint.
"""
import glob
import gzip
import json
from collections import defaultdict, Counter
from urllib.parse import urlparse
import requests
from mwmbl.indexer.paths import CRAWL_GLOB
# URL that run() POSTs each converted batch to, as a JSON body.
API_ENDPOINT = "http://95.216.215.29/batches/historical"
def total_num_batches():
    """Return the number of paths currently matching ``CRAWL_GLOB``."""
    matching_paths = glob.glob(CRAWL_GLOB)
    return len(matching_paths)
def get_batches():
    """Yield every stored batch, parsed from its gzipped JSON file.

    Paths matching ``CRAWL_GLOB`` are visited in sorted order. Each file is
    opened, parsed and closed before its contents are yielded.

    Yields:
        The object decoded from each file by ``json.load``.
    """
    for path in sorted(glob.glob(CRAWL_GLOB)):
        # Close the gzip handle deterministically; previously it was left
        # open until garbage collection, one leaked handle per batch file.
        with gzip.open(path) as batch_file:
            hashed_batch = json.load(batch_file)
        yield hashed_batch
def convert_item(item):
    """Reshape a stored item into the nested layout posted to the API.

    ``url`` and ``timestamp`` are copied to the top level, ``status`` is
    always set to 200, and ``title``, ``extract`` and ``links`` are grouped
    under ``content``. Any other keys on ``item`` are ignored.

    Raises:
        KeyError: if ``item`` lacks any of the keys read here.
    """
    url = item['url']
    timestamp = item['timestamp']
    content = {field: item[field] for field in ('title', 'extract', 'links')}
    return {
        'url': url,
        'status': 200,
        'timestamp': timestamp,
        'content': content,
    }
def run():
    """POST every stored batch to ``API_ENDPOINT`` and print each response.

    Each batch from ``get_batches`` is rebuilt with its ``user_id_hash`` and
    ``timestamp`` copied over and its items passed through ``convert_item``,
    then sent as a JSON body. The printed counter is the zero-based position
    of the batch alongside the total from ``total_num_batches``.
    """
    batch_count = total_num_batches()
    for index, stored in enumerate(get_batches()):
        user_id_hash = stored['user_id_hash']
        timestamp = stored['timestamp']
        converted_items = list(map(convert_item, stored['items']))
        payload = {
            'user_id_hash': user_id_hash,
            'timestamp': timestamp,
            'items': converted_items,
        }
        response = requests.post(API_ENDPOINT, json=payload)
        print(f"Response {index} of {batch_count}", response)
# Only start the upload when this file is executed as a script.
if __name__ == "__main__":
    run()