diff --git a/analyse/record_historical_batches.py b/analyse/record_historical_batches.py
new file mode 100644
index 0000000..6c8c80e
--- /dev/null
+++ b/analyse/record_historical_batches.py
@@ -0,0 +1,37 @@
+"""
+Record historical crawl batches by posting each one to the batches API.
+"""
+import glob
+import gzip
+import json
+from collections import defaultdict, Counter
+from urllib.parse import urlparse
+
+import requests
+
+from mwmbl.indexer.paths import CRAWL_GLOB
+
+
+API_ENDPOINT = "http://localhost:8080/batches/historical"
+
+
+def get_batches():
+    """Yield the parsed JSON content of every gzipped file matching CRAWL_GLOB."""
+    for path in glob.glob(CRAWL_GLOB):
+        # Close each file as soon as it is parsed so handles do not pile up.
+        with gzip.open(path) as batch_file:
+            hashed_batch = json.load(batch_file)
+        yield hashed_batch
+
+
+def run():
+    """POST each batch to API_ENDPOINT, printing the batch and the response."""
+    for hashed_batch in get_batches():
+        print("Recording batch", hashed_batch)
+        response = requests.post(API_ENDPOINT, json=hashed_batch)
+        print("Response", response)
+
+
+if __name__ == '__main__':
+    run()
+
diff --git a/poetry.lock b/poetry.lock
index a02238d..a15a3b5 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -119,7 +119,7 @@ name = "certifi"
 version = "2021.10.8"
 description = "Python package for providing Mozilla's CA Bundle."
 category = "main"
-optional = true
+optional = false
 python-versions = "*"
 
 [[package]]
@@ -138,7 +138,7 @@ name = "charset-normalizer"
 version = "2.0.12"
 description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
 category = "main"
-optional = true
+optional = false
 python-versions = ">=3.5.0"
 
 [package.extras]
@@ -185,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0"
 [package.source]
 type = "url"
 url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"
+
 [[package]]
 name = "fastapi"
 version = "0.70.1"
@@ -558,7 +559,7 @@ name = "requests"
 version = "2.27.1"
 description = "Python HTTP for Humans."
category = "main" -optional = true +optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" [package.dependencies] @@ -861,7 +862,7 @@ name = "urllib3" version = "1.26.9" description = "HTTP library with thread-safe connection pooling, file post, and more." category = "main" -optional = true +optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" [package.extras] @@ -924,7 +925,7 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx [metadata] lock-version = "1.1" python-versions = ">=3.10,<3.11" -content-hash = "be01dba545a5f118cb4d1cd2726c0b9fadbba44684fcd4af2bef2e812e28da93" +content-hash = "82ce3efb1f7108ff006f0db654d1e070429dad98eb0648731effb15533d0ffed" [metadata.files] anyio = [ diff --git a/pyproject.toml b/pyproject.toml index 678e2b5..9cc4f1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,6 +32,7 @@ Levenshtein = {version= "==0.16.0", optional = true} # en-core-web-sm requires a compatible version of spacy spacy = {version= "==3.2.1", optional = true} en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz", optional = true} +requests = "^2.27.1" [tool.poetry.extras]