Merge pull request #53 from mwmbl/record-historical-batches

Record historical batches
This commit is contained in:
Daoud Clarke 2022-06-16 22:09:12 +01:00 committed by GitHub
commit 7771657684
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 64 additions and 5 deletions

View file

@ -0,0 +1,57 @@
"""
Upload historical crawl batches stored on disk to the batches API.
"""
import glob
import gzip
import json
from collections import defaultdict, Counter
from urllib.parse import urlparse
import requests
from mwmbl.indexer.paths import CRAWL_GLOB
# URL that run() POSTs each converted batch to.
# NOTE(review): hard-coded IP address over plain HTTP — confirm this is the
# intended target before running the script.
API_ENDPOINT = "http://95.216.215.29/batches/historical"
def total_num_batches():
    """Count the paths on disk that match ``CRAWL_GLOB``."""
    matching_paths = glob.glob(CRAWL_GLOB)
    return len(matching_paths)
def get_batches():
    """
    Yield each stored batch, decoded from its gzip-compressed JSON file.

    Paths matching ``CRAWL_GLOB`` are visited in sorted order. Each file is
    opened, parsed and closed again before its contents are yielded.

    Yields:
        The decoded JSON document held in one batch file.
    """
    for path in sorted(glob.glob(CRAWL_GLOB)):
        # Use a context manager so the handle is released as soon as the
        # batch is parsed; otherwise one open file per batch lingers until
        # the garbage collector gets round to it.
        with gzip.open(path) as batch_file:
            hashed_batch = json.load(batch_file)
        yield hashed_batch
def convert_item(item):
    """
    Build the upload representation of a single crawled item.

    The result carries the item's ``url`` and ``timestamp``, a fixed
    ``status`` of 200, and a nested ``content`` dict holding the item's
    ``title``, ``extract`` and ``links``. Any other keys on ``item`` are
    ignored; a missing key raises ``KeyError``.
    """
    url = item['url']
    timestamp = item['timestamp']
    content = {field: item[field] for field in ('title', 'extract', 'links')}
    return {
        'url': url,
        'status': 200,
        'timestamp': timestamp,
        'content': content,
    }
def run(timeout=60):
    """
    Upload every stored batch to ``API_ENDPOINT``.

    Each batch from ``get_batches()`` is rebuilt with its ``user_id_hash``
    and ``timestamp`` copied over and its items passed through
    ``convert_item``, then POSTed as JSON. The response object is printed
    alongside a progress counter.

    Args:
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without one, requests waits indefinitely on a server that has
            stopped responding.

    Raises:
        requests.exceptions.RequestException: If a request fails or times
            out; the upload stops at that batch.
    """
    total_batches = total_num_batches()
    # Count from 1 so the final line reads "N of N" rather than "N-1 of N".
    for i, hashed_batch in enumerate(get_batches(), start=1):
        new_batch = {
            'user_id_hash': hashed_batch['user_id_hash'],
            'timestamp': hashed_batch['timestamp'],
            'items': [convert_item(item) for item in hashed_batch['items']],
        }
        response = requests.post(API_ENDPOINT, json=new_batch, timeout=timeout)
        print(f"Response {i} of {total_batches}", response)
if __name__ == "__main__":
    # Start the upload only when executed as a script, not on import.
    run()

11
poetry.lock generated
View file

@ -119,7 +119,7 @@ name = "certifi"
version = "2021.10.8"
description = "Python package for providing Mozilla's CA Bundle."
category = "main"
optional = true
optional = false
python-versions = "*"
[[package]]
@ -138,7 +138,7 @@ name = "charset-normalizer"
version = "2.0.12"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main"
optional = true
optional = false
python-versions = ">=3.5.0"
[package.extras]
@ -185,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0"
[package.source]
type = "url"
url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"
[[package]]
name = "fastapi"
version = "0.70.1"
@ -558,7 +559,7 @@ name = "requests"
version = "2.27.1"
description = "Python HTTP for Humans."
category = "main"
optional = true
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
[package.dependencies]
@ -861,7 +862,7 @@ name = "urllib3"
version = "1.26.9"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = true
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
[package.extras]
@ -924,7 +925,7 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx
[metadata]
lock-version = "1.1"
python-versions = ">=3.10,<3.11"
content-hash = "be01dba545a5f118cb4d1cd2726c0b9fadbba44684fcd4af2bef2e812e28da93"
content-hash = "82ce3efb1f7108ff006f0db654d1e070429dad98eb0648731effb15533d0ffed"
[metadata.files]
anyio = [

View file

@ -32,6 +32,7 @@ Levenshtein = {version= "==0.16.0", optional = true}
# en-core-web-sm requires a compatible version of spacy
spacy = {version= "==3.2.1", optional = true}
en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz", optional = true}
requests = "^2.27.1"
[tool.poetry.extras]