Record historical batches via the API

This commit is contained in:
Daoud Clarke 2022-06-05 09:15:04 +01:00
parent 617666e3b7
commit aaca8b2b6e
3 changed files with 41 additions and 5 deletions

View file

@ -0,0 +1,34 @@
"""
Send locally stored crawl batches to the API's historical batches endpoint so that they are recorded.
"""
import glob
import gzip
import json
from collections import defaultdict, Counter
from urllib.parse import urlparse
import requests
from mwmbl.indexer.paths import CRAWL_GLOB
API_ENDPOINT = "http://localhost:8080/batches/historical"
def get_batches():
    """
    Lazily yield every locally stored crawl batch as parsed JSON.

    Each path matching CRAWL_GLOB is opened as a gzip file and decoded with
    json.load. Batches are produced one file at a time, so only a single
    batch needs to be held by this generator at once.

    Yields:
        The decoded JSON content of each matching file.
    """
    for path in glob.glob(CRAWL_GLOB):
        # Open inside a context manager so the file handle is released as
        # soon as the batch is parsed, rather than leaking one handle per
        # file until garbage collection.
        with gzip.open(path) as batch_file:
            hashed_batch = json.load(batch_file)
        yield hashed_batch
def run():
    """
    Post each batch from get_batches() to API_ENDPOINT, one request per batch.

    The batch and the resulting response object are printed to stdout. The
    response status is not inspected, and requests.post is called without a
    timeout, so a server that never answers will block this loop.
    """
    for batch in get_batches():
        print("Recording batch", batch)
        result = requests.post(API_ENDPOINT, json=batch)
        print("Response", result)
# Only submit batches when executed as a script, not when imported.
if __name__ == "__main__":
    run()

11
poetry.lock generated
View file

@ -119,7 +119,7 @@ name = "certifi"
version = "2021.10.8"
description = "Python package for providing Mozilla's CA Bundle."
category = "main"
optional = true
optional = false
python-versions = "*"
[[package]]
@ -138,7 +138,7 @@ name = "charset-normalizer"
version = "2.0.12"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main"
optional = true
optional = false
python-versions = ">=3.5.0"
[package.extras]
@ -185,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0"
[package.source]
type = "url"
url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"
[[package]]
name = "fastapi"
version = "0.70.1"
@ -558,7 +559,7 @@ name = "requests"
version = "2.27.1"
description = "Python HTTP for Humans."
category = "main"
optional = true
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
[package.dependencies]
@ -861,7 +862,7 @@ name = "urllib3"
version = "1.26.9"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = true
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
[package.extras]
@ -924,7 +925,7 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx
[metadata]
lock-version = "1.1"
python-versions = ">=3.10,<3.11"
content-hash = "be01dba545a5f118cb4d1cd2726c0b9fadbba44684fcd4af2bef2e812e28da93"
content-hash = "82ce3efb1f7108ff006f0db654d1e070429dad98eb0648731effb15533d0ffed"
[metadata.files]
anyio = [

View file

@ -32,6 +32,7 @@ Levenshtein = {version= "==0.16.0", optional = true}
# en-core-web-sm requires a compatible version of spacy
spacy = {version= "==3.2.1", optional = true}
en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz", optional = true}
requests = "^2.27.1"
[tool.poetry.extras]