Browse Source

Use new server

Daoud Clarke 3 năm trước
mục cha
commit
14107acc75
1 tập tin đã thay đổi với 29 bổ sung và 6 xóa
  1. 29 6
      analyse/record_historical_batches.py

+ 29 - 6
analyse/record_historical_batches.py

@@ -12,21 +12,44 @@ import requests
 from mwmbl.indexer.paths import CRAWL_GLOB
 
 
-API_ENDPOINT = "http://localhost:8080/batches/historical"
+API_ENDPOINT = "http://95.216.215.29/batches/historical"
+
+
+def total_num_batches():
+    return len(glob.glob(CRAWL_GLOB))
 
 
 def get_batches():
-    for path in glob.glob(CRAWL_GLOB):
+    for path in sorted(glob.glob(CRAWL_GLOB)):
         hashed_batch = json.load(gzip.open(path))
         yield hashed_batch
 
 
+def convert_item(item):
+    return {
+        'url': item['url'],
+        'status': 200,
+        'timestamp': item['timestamp'],
+        'content': {
+            'title': item['title'],
+            'extract': item['extract'],
+            'links': item['links'],
+        }
+    }
+
+
+
 def run():
+    total_batches = total_num_batches()
     batches = get_batches()
-    for hashed_batch in batches:
-        print("Recording batch", hashed_batch)
-        response = requests.post(API_ENDPOINT, json=hashed_batch)
-        print("Response", response)
+    for i, hashed_batch in enumerate(batches):
+        new_batch = {
+            'user_id_hash': hashed_batch['user_id_hash'],
+            'timestamp': hashed_batch['timestamp'],
+            'items': [convert_item(item) for item in hashed_batch['items']]
+        }
+        response = requests.post(API_ENDPOINT, json=new_batch)
+        print(f"Response {i} of {total_batches}", response)
 
 
 if __name__ == '__main__':