Record historical batches via the API

This commit is contained in:
Daoud Clarke 2022-06-05 09:15:04 +01:00
parent 617666e3b7
commit aaca8b2b6e
3 changed files with 41 additions and 5 deletions

View file

@ -0,0 +1,34 @@
"""
Record historical crawl batches by posting each stored batch to the batches API.
"""
import glob
import gzip
import json
from collections import defaultdict, Counter
from urllib.parse import urlparse
import requests
from mwmbl.indexer.paths import CRAWL_GLOB
# URL that run() sends each batch to with an HTTP POST.
API_ENDPOINT = "http://localhost:8080/batches/historical"
def get_batches(path_pattern=None):
    """
    Yield the parsed JSON content of every gzipped file matching a glob pattern.

    Args:
        path_pattern: glob pattern selecting the gzip-compressed JSON files to
            read. When omitted (or None), CRAWL_GLOB is used.

    Yields:
        The decoded JSON document of each matching file, one per file, in the
        order returned by glob.glob.
    """
    if path_pattern is None:
        path_pattern = CRAWL_GLOB
    for path in glob.glob(path_pattern):
        # Close each archive as soon as it has been parsed; otherwise one open
        # file handle per batch lingers until the garbage collector runs.
        with gzip.open(path) as batch_file:
            hashed_batch = json.load(batch_file)
        yield hashed_batch
def run():
    """
    Send every batch produced by get_batches() to API_ENDPOINT.

    Each batch is printed, submitted as the JSON body of a POST request, and
    the resulting response object is printed afterwards.
    """
    for batch in get_batches():
        print("Recording batch", batch)
        reply = requests.post(API_ENDPOINT, json=batch)
        print("Response", reply)
# Only start posting batches when executed as a script, not when imported.
if __name__ == "__main__":
    run()

11
poetry.lock generated
View file

@ -119,7 +119,7 @@ name = "certifi"
version = "2021.10.8" version = "2021.10.8"
description = "Python package for providing Mozilla's CA Bundle." description = "Python package for providing Mozilla's CA Bundle."
category = "main" category = "main"
optional = true optional = false
python-versions = "*" python-versions = "*"
[[package]] [[package]]
@ -138,7 +138,7 @@ name = "charset-normalizer"
version = "2.0.12" version = "2.0.12"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main" category = "main"
optional = true optional = false
python-versions = ">=3.5.0" python-versions = ">=3.5.0"
[package.extras] [package.extras]
@ -185,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0"
[package.source] [package.source]
type = "url" type = "url"
url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz" url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"
[[package]] [[package]]
name = "fastapi" name = "fastapi"
version = "0.70.1" version = "0.70.1"
@ -558,7 +559,7 @@ name = "requests"
version = "2.27.1" version = "2.27.1"
description = "Python HTTP for Humans." description = "Python HTTP for Humans."
category = "main" category = "main"
optional = true optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
[package.dependencies] [package.dependencies]
@ -861,7 +862,7 @@ name = "urllib3"
version = "1.26.9" version = "1.26.9"
description = "HTTP library with thread-safe connection pooling, file post, and more." description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main" category = "main"
optional = true optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
[package.extras] [package.extras]
@ -924,7 +925,7 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx
[metadata] [metadata]
lock-version = "1.1" lock-version = "1.1"
python-versions = ">=3.10,<3.11" python-versions = ">=3.10,<3.11"
content-hash = "be01dba545a5f118cb4d1cd2726c0b9fadbba44684fcd4af2bef2e812e28da93" content-hash = "82ce3efb1f7108ff006f0db654d1e070429dad98eb0648731effb15533d0ffed"
[metadata.files] [metadata.files]
anyio = [ anyio = [

View file

@ -32,6 +32,7 @@ Levenshtein = {version= "==0.16.0", optional = true}
# en-core-web-sm requires a compatible version of spacy # en-core-web-sm requires a compatible version of spacy
spacy = {version= "==3.2.1", optional = true} spacy = {version= "==3.2.1", optional = true}
en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz", optional = true} en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz", optional = true}
requests = "^2.27.1"
[tool.poetry.extras] [tool.poetry.extras]