"""
Send locally stored historical crawl batches to the mwmbl API.
"""
import glob
import gzip
import json

import requests

from mwmbl.indexer import CRAWL_GLOB

API_ENDPOINT = "http://95.216.215.29/batches/historical"


def total_num_batches():
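    """Count the locally stored crawl batch files matching CRAWL_GLOB."""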
    return len(glob.glob(CRAWL_GLOB))


def get_batches():
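    """Yield each crawl batch, parsed from its gzipped JSON file in path order."""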
    for path in sorted(glob.glob(CRAWL_GLOB)):
        with gzip.open(path) as batch_file:
            hashed_batch = json.load(batch_file)
        yield hashed_batch


def convert_item(item):
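    """Convert a crawled item into the structure the batch API expects."""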
    return {
        'url': item['url'],
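        # Assume every historical item was fetched successfully (HTTP 200).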
        'status': 200,
        'timestamp': item['timestamp'],
        'content': {
            'title': item['title'],
            'extract': item['extract'],
            'links': item['links'],
        }
    }


def run():
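    """Convert each stored batch and POST it to the API endpoint."""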
    total_batches = total_num_batches()
    batches = get_batches()
    for i, hashed_batch in enumerate(batches):
        new_batch = {
            'user_id_hash': hashed_batch['user_id_hash'],
            'timestamp': hashed_batch['timestamp'],
            'items': [convert_item(item) for item in hashed_batch['items']]
        }
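        # Upload one batch per request and print the response as a progress indicator.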
        response = requests.post(API_ENDPOINT, json=new_batch)
        print(f"Response {i} of {total_batches}", response)


if __name__ == '__main__':
    run()