Record historical batches via the API
This commit is contained in:
parent
617666e3b7
commit
aaca8b2b6e
3 changed files with 41 additions and 5 deletions
34
analyse/record_historical_batches.py
Normal file
34
analyse/record_historical_batches.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
"""
|
||||
Record historical crawl batches by posting each one to the batches API.
|
||||
"""
|
||||
import glob
|
||||
import gzip
|
||||
import json
|
||||
from collections import defaultdict, Counter
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
|
||||
from mwmbl.indexer.paths import CRAWL_GLOB
|
||||
|
||||
|
||||
# URL that run() POSTs each historical batch to (a server on localhost:8080).
API_ENDPOINT = "http://localhost:8080/batches/historical"
|
||||
|
||||
|
||||
def get_batches():
    """Yield the decoded JSON content of every gzipped file matching CRAWL_GLOB.

    Files are read lazily, one per iteration, so only a single batch is held
    in memory at a time.

    Yields:
        The object produced by ``json.load`` for each matching file.
    """
    for path in glob.glob(CRAWL_GLOB):
        # Use a context manager so the gzip handle is closed as soon as the
        # batch is parsed, rather than leaking one open file per batch.
        with gzip.open(path) as batch_file:
            hashed_batch = json.load(batch_file)
        yield hashed_batch
|
||||
|
||||
|
||||
def run():
    """Send every batch from get_batches() to API_ENDPOINT, one POST per batch.

    The batch and the resulting response object are printed for each request.
    """
    for batch in get_batches():
        print("Recording batch", batch)
        reply = requests.post(API_ENDPOINT, json=batch)
        print("Response", reply)
|
||||
|
||||
|
||||
# Script entry point: only record batches when executed directly.
if __name__ == "__main__":
    run()
|
||||
|
11
poetry.lock
generated
11
poetry.lock
generated
|
@ -119,7 +119,7 @@ name = "certifi"
|
|||
version = "2021.10.8"
|
||||
description = "Python package for providing Mozilla's CA Bundle."
|
||||
category = "main"
|
||||
optional = true
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
|
||||
[[package]]
|
||||
|
@ -138,7 +138,7 @@ name = "charset-normalizer"
|
|||
version = "2.0.12"
|
||||
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
|
||||
category = "main"
|
||||
optional = true
|
||||
optional = false
|
||||
python-versions = ">=3.5.0"
|
||||
|
||||
[package.extras]
|
||||
|
@ -185,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0"
|
|||
[package.source]
|
||||
type = "url"
|
||||
url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"
|
||||
|
||||
[[package]]
|
||||
name = "fastapi"
|
||||
version = "0.70.1"
|
||||
|
@ -558,7 +559,7 @@ name = "requests"
|
|||
version = "2.27.1"
|
||||
description = "Python HTTP for Humans."
|
||||
category = "main"
|
||||
optional = true
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
|
||||
|
||||
[package.dependencies]
|
||||
|
@ -861,7 +862,7 @@ name = "urllib3"
|
|||
version = "1.26.9"
|
||||
description = "HTTP library with thread-safe connection pooling, file post, and more."
|
||||
category = "main"
|
||||
optional = true
|
||||
optional = false
|
||||
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
|
||||
|
||||
[package.extras]
|
||||
|
@ -924,7 +925,7 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx
|
|||
[metadata]
|
||||
lock-version = "1.1"
|
||||
python-versions = ">=3.10,<3.11"
|
||||
content-hash = "be01dba545a5f118cb4d1cd2726c0b9fadbba44684fcd4af2bef2e812e28da93"
|
||||
content-hash = "82ce3efb1f7108ff006f0db654d1e070429dad98eb0648731effb15533d0ffed"
|
||||
|
||||
[metadata.files]
|
||||
anyio = [
|
||||
|
|
|
@ -32,6 +32,7 @@ Levenshtein = {version= "==0.16.0", optional = true}
|
|||
# en-core-web-sm requires a compatible version of spacy
|
||||
spacy = {version= "==3.2.1", optional = true}
|
||||
en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz", optional = true}
|
||||
requests = "^2.27.1"
|
||||
|
||||
|
||||
[tool.poetry.extras]
|
||||
|
|
Loading…
Reference in a new issue