# mwmbl/analyse/record_historical_batches.py

"""
Read the stored batch files matching CRAWL_GLOB and POST each one, converted
to the API's item format, to the historical batches endpoint.
"""
import glob
import gzip
import json

import requests

from mwmbl.indexer import CRAWL_GLOB


# URL that run() POSTs each converted batch to.
API_ENDPOINT = "http://95.216.215.29/batches/historical"


def total_num_batches():
    """Return the number of paths that currently match CRAWL_GLOB."""
    matching_paths = glob.glob(CRAWL_GLOB)
    return len(matching_paths)


def get_batches():
    """Yield the parsed contents of every batch file matching CRAWL_GLOB.

    Files are visited in sorted path order. Each file is opened with gzip,
    decoded as JSON, and closed again before its contents are yielded, so
    no file handle stays open while the consumer processes a batch.

    Yields:
        The object decoded from each file's JSON content.
    """
    for path in sorted(glob.glob(CRAWL_GLOB)):
        # Use a context manager so the gzip handle is released promptly
        # instead of waiting for garbage collection.
        with gzip.open(path) as batch_file:
            hashed_batch = json.load(batch_file)
        yield hashed_batch


def convert_item(item):
    """Reshape a stored item into the nested form posted to the API.

    The result keeps ``url`` and ``timestamp`` at the top level, sets
    ``status`` to 200, and groups ``title``, ``extract`` and ``links``
    under a ``content`` mapping. Any other keys on ``item`` are ignored.

    Raises:
        KeyError: if ``item`` lacks any of the keys read here.
    """
    # Look the fields up in the same order as they appear in the result.
    url = item['url']
    timestamp = item['timestamp']
    content = {
        'title': item['title'],
        'extract': item['extract'],
        'links': item['links'],
    }
    return {
        'url': url,
        'status': 200,
        'timestamp': timestamp,
        'content': content,
    }


def run():
    """POST every stored batch to API_ENDPOINT, printing each response.

    Each batch from get_batches() is rebuilt with its ``user_id_hash``,
    its ``timestamp`` and its items passed through convert_item(), then
    sent as the JSON body of a POST request. The response object is
    printed together with a zero-based counter and the total file count.
    """
    total_batches = total_num_batches()
    for i, hashed_batch in enumerate(get_batches()):
        user_id_hash = hashed_batch['user_id_hash']
        batch_timestamp = hashed_batch['timestamp']
        converted_items = list(map(convert_item, hashed_batch['items']))
        new_batch = {
            'user_id_hash': user_id_hash,
            'timestamp': batch_timestamp,
            'items': converted_items,
        }
        response = requests.post(API_ENDPOINT, json=new_batch)
        print(f"Response {i} of {total_batches}", response)


# Only start the upload when this file is executed as a script.
if __name__ == '__main__':
    run()