mwmbl/analyse/record_historical_batches.py

"""
Read previously crawled batches from local gzip-compressed JSON files and
record each one by POSTing it to the historical batches API endpoint.
"""
import glob
import gzip
import json

import requests

from mwmbl.indexer import CRAWL_GLOB


# Endpoint that run() POSTs each converted historical batch to.
API_ENDPOINT = "http://95.216.215.29/batches/historical"


def total_num_batches():
    """Return the number of paths that currently match ``CRAWL_GLOB``."""
    matching_paths = glob.glob(CRAWL_GLOB)
    return len(matching_paths)


def get_batches():
    """
    Yield every stored batch, one at a time, in sorted path order.

    Each path matching ``CRAWL_GLOB`` is opened as a gzip file and parsed as
    JSON. The file is closed before the parsed batch is yielded, so no file
    handle stays open while the caller is processing a batch.
    """
    for path in sorted(glob.glob(CRAWL_GLOB)):
        # Use a context manager so the gzip handle is released deterministically
        # instead of being left for the garbage collector.
        with gzip.open(path) as batch_file:
            hashed_batch = json.load(batch_file)
        yield hashed_batch


def convert_item(item):
    """
    Reshape a stored item into the nested layout posted to the API.

    The ``url`` and ``timestamp`` values are copied to the top level, the
    ``title``, ``extract`` and ``links`` values are grouped under ``content``,
    and ``status`` is always set to 200. Any other keys in ``item`` are
    ignored; a missing required key raises ``KeyError``.
    """
    url = item['url']
    crawled_at = item['timestamp']
    content = {field: item[field] for field in ('title', 'extract', 'links')}
    return {'url': url, 'status': 200, 'timestamp': crawled_at, 'content': content}


def run():
    """
    POST every stored batch to ``API_ENDPOINT`` and print each response.

    For each batch yielded by ``get_batches()``, the items are converted with
    ``convert_item`` and sent as JSON together with the batch's
    ``user_id_hash`` and ``timestamp``. The printed progress counter is the
    zero-based batch index alongside the total from ``total_num_batches()``.
    """
    batch_count = total_num_batches()
    for index, stored_batch in enumerate(get_batches()):
        payload = {
            'user_id_hash': stored_batch['user_id_hash'],
            'timestamp': stored_batch['timestamp'],
            'items': list(map(convert_item, stored_batch['items'])),
        }
        reply = requests.post(API_ENDPOINT, json=payload)
        print(f"Response {index} of {batch_count}", reply)


# Only start the upload when this file is executed directly as a script.
if __name__ == '__main__':
    run()
Record historical batches via the API 2022-06-05 08:15:04 +00:00			`"""`
			`See how many unique URLs and root domains we have crawled.`
			`"""`
			`import glob`
			`import gzip`
			`import json`

			`import requests`

Rename django app to mwmbl 2023-10-10 12:51:06 +00:00			`from mwmbl.indexer import CRAWL_GLOB`
Record historical batches via the API 2022-06-05 08:15:04 +00:00

Use new server 2022-06-09 21:24:54 +00:00			`API_ENDPOINT = "http://95.216.215.29/batches/historical"`


			`def total_num_batches():`
			`return len(glob.glob(CRAWL_GLOB))`
Record historical batches via the API 2022-06-05 08:15:04 +00:00

			`def get_batches():`
Use new server 2022-06-09 21:24:54 +00:00			`for path in sorted(glob.glob(CRAWL_GLOB)):`
Record historical batches via the API 2022-06-05 08:15:04 +00:00			`hashed_batch = json.load(gzip.open(path))`
			`yield hashed_batch`


Use new server 2022-06-09 21:24:54 +00:00			`def convert_item(item):`
			`return {`
			`'url': item['url'],`
			`'status': 200,`
			`'timestamp': item['timestamp'],`
			`'content': {`
			`'title': item['title'],`
			`'extract': item['extract'],`
			`'links': item['links'],`
			`}`
			`}`



Record historical batches via the API 2022-06-05 08:15:04 +00:00			`def run():`
Use new server 2022-06-09 21:24:54 +00:00			`total_batches = total_num_batches()`
Record historical batches via the API 2022-06-05 08:15:04 +00:00			`batches = get_batches()`
Use new server 2022-06-09 21:24:54 +00:00			`for i, hashed_batch in enumerate(batches):`
			`new_batch = {`
			`'user_id_hash': hashed_batch['user_id_hash'],`
			`'timestamp': hashed_batch['timestamp'],`
			`'items': [convert_item(item) for item in hashed_batch['items']]`
			`}`
			`response = requests.post(API_ENDPOINT, json=new_batch)`
			`print(f"Response {i} of {total_batches}", response)`
Record historical batches via the API 2022-06-05 08:15:04 +00:00

			`if __name__ == '__main__':`
			`run()`