mwmbl/analyse/record_historical_batches.py

"""
See how many unique URLs and root domains we have crawled.
"""
import glob
import gzip
import json

import requests

from mwmbl.indexer import CRAWL_GLOB


API_ENDPOINT = "http://95.216.215.29/batches/historical"


def total_num_batches():
    """Count the stored crawl batch files matched by CRAWL_GLOB."""
    return len(glob.glob(CRAWL_GLOB))


def get_batches():
    """Yield each stored batch, loading gzipped JSON files in path order."""
    for path in sorted(glob.glob(CRAWL_GLOB)):
        with gzip.open(path) as batch_file:
            yield json.load(batch_file)
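
# Sketch of the on-disk hashed batch format implied by the keys read in
# get_batches() and convert_item() (field values are placeholders; any
# additional fields are unknown):
#
#     {
#         "user_id_hash": "...",
#         "timestamp": ...,
#         "items": [
#             {"url": "...", "timestamp": ..., "title": "...",
#              "extract": "...", "links": ["..."]}
#         ]
#     }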


def convert_item(item):
    """Convert a crawled item to the format expected by the batches API."""
    return {
        'url': item['url'],
        # The stored items carry no status field, so a successful fetch
        # (200) is assumed here.
        'status': 200,
        'timestamp': item['timestamp'],
        'content': {
            'title': item['title'],
            'extract': item['extract'],
            'links': item['links'],
        },
    }


def run():
    """Post every stored batch to the historical batches endpoint."""
    total_batches = total_num_batches()
    batches = get_batches()
    # Count from 1 so the final message reads "Response N of N".
    for i, hashed_batch in enumerate(batches, start=1):
        new_batch = {
            'user_id_hash': hashed_batch['user_id_hash'],
            'timestamp': hashed_batch['timestamp'],
            'items': [convert_item(item) for item in hashed_batch['items']],
        }
        response = requests.post(API_ENDPOINT, json=new_batch)
        print(f"Response {i} of {total_batches}", response)


if __name__ == '__main__':
    run()
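
# Usage sketch (module path inferred from the file location; assumes the
# crawl files and network access to API_ENDPOINT are available):
#
#     python -m mwmbl.analyse.record_historical_batches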