|
@@ -12,21 +12,44 @@ import requests
|
|
from mwmbl.indexer.paths import CRAWL_GLOB
|
|
from mwmbl.indexer.paths import CRAWL_GLOB
|
|
|
|
|
|
|
|
|
|
-API_ENDPOINT = "http://localhost:8080/batches/historical"
|
|
|
|
|
|
+API_ENDPOINT = "http://95.216.215.29/batches/historical"
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def total_num_batches():
    """Return the number of paths that currently match ``CRAWL_GLOB``."""
    matching_paths = glob.glob(CRAWL_GLOB)
    return len(matching_paths)
|
|
|
|
|
|
|
|
|
|
def get_batches():
    """Yield the decoded JSON content of every file matching ``CRAWL_GLOB``.

    Files are visited in sorted path order. Each file is opened with
    ``gzip.open`` and parsed with ``json.load``.

    Yields:
        The parsed JSON object stored in each gzip file.
    """
    for path in sorted(glob.glob(CRAWL_GLOB)):
        # Close the gzip handle before yielding: the consumer may take a long
        # time per batch, and leaving every file open leaks descriptors.
        with gzip.open(path) as batch_file:
            hashed_batch = json.load(batch_file)
        yield hashed_batch
|
|
|
|
|
|
|
|
|
|
|
|
def convert_item(item):
    """Reshape a flat item dict into the nested layout with a ``content`` part.

    Args:
        item: Mapping providing the keys ``url``, ``timestamp``, ``title``,
            ``extract`` and ``links``. Any other keys are ignored.

    Returns:
        A new dict with ``url``, ``status`` (always 200), ``timestamp`` and a
        ``content`` dict holding ``title``, ``extract`` and ``links``.

    Raises:
        KeyError: If one of the required keys is missing from ``item``.
    """
    url = item['url']
    timestamp = item['timestamp']
    content = {name: item[name] for name in ('title', 'extract', 'links')}
    return {
        'url': url,
        'status': 200,
        'timestamp': timestamp,
        'content': content,
    }
|
|
|
|
+
|
|
|
|
+
|
|
|
|
+
|
|
def run():
    """POST every batch from ``get_batches()`` to ``API_ENDPOINT``.

    Each batch is rebuilt before sending: ``user_id_hash`` and ``timestamp``
    are copied over and every entry of ``items`` is passed through
    ``convert_item``. The response object for each request is printed along
    with a progress counter.
    """
    total_batches = total_num_batches()
    batches = get_batches()
    # Count from 1 so the final line reads "N of N" rather than "N-1 of N".
    for i, hashed_batch in enumerate(batches, start=1):
        new_batch = {
            'user_id_hash': hashed_batch['user_id_hash'],
            'timestamp': hashed_batch['timestamp'],
            'items': [convert_item(item) for item in hashed_batch['items']],
        }
        response = requests.post(API_ENDPOINT, json=new_batch)
        print(f"Response {i} of {total_batches}", response)
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|