diff --git a/analyse/record_historical_batches.py b/analyse/record_historical_batches.py
index 6c8c80e..4d8ccd3 100644
--- a/analyse/record_historical_batches.py
+++ b/analyse/record_historical_batches.py
@@ -12,21 +12,44 @@ import requests
 from mwmbl.indexer.paths import CRAWL_GLOB
 
 
-API_ENDPOINT = "http://localhost:8080/batches/historical"
+API_ENDPOINT = "http://95.216.215.29/batches/historical"
+
+
+def total_num_batches():
+    return len(glob.glob(CRAWL_GLOB))
 
 
 def get_batches():
-    for path in glob.glob(CRAWL_GLOB):
+    for path in sorted(glob.glob(CRAWL_GLOB)):
         hashed_batch = json.load(gzip.open(path))
         yield hashed_batch
 
 
+def convert_item(item):
+    return {
+        'url': item['url'],
+        'status': 200,
+        'timestamp': item['timestamp'],
+        'content': {
+            'title': item['title'],
+            'extract': item['extract'],
+            'links': item['links'],
+        }
+    }
+
+
+
 def run():
+    total_batches = total_num_batches()
     batches = get_batches()
-    for hashed_batch in batches:
-        print("Recording batch", hashed_batch)
-        response = requests.post(API_ENDPOINT, json=hashed_batch)
-        print("Response", response)
+    for i, hashed_batch in enumerate(batches):
+        new_batch = {
+            'user_id_hash': hashed_batch['user_id_hash'],
+            'timestamp': hashed_batch['timestamp'],
+            'items': [convert_item(item) for item in hashed_batch['items']]
+        }
+        response = requests.post(API_ENDPOINT, json=new_batch)
+        print(f"Response {i} of {total_batches}", response)
 
 
 if __name__ == '__main__':