Parcourir la source

Record historical batches via the API

Daoud Clarke il y a 3 ans
Parent
commit
aaca8b2b6e
3 fichiers modifiés avec 41 ajouts et 5 suppressions
  1. 34 0
      analyse/record_historical_batches.py
  2. 6 5
      poetry.lock
  3. 1 0
      pyproject.toml

+ 34 - 0
analyse/record_historical_batches.py

@@ -0,0 +1,34 @@
+"""
+Record historical crawl batches by posting each one to the batches API.
+"""
+import glob
+import gzip
+import json
+from collections import defaultdict, Counter
+from urllib.parse import urlparse
+
+import requests
+
+from mwmbl.indexer.paths import CRAWL_GLOB
+
+
+# URL that each historical batch is POSTed to (a server on localhost, port 8080).
+API_ENDPOINT = "http://localhost:8080/batches/historical"
+
+
+def get_batches():
+    for path in glob.glob(CRAWL_GLOB):
+        hashed_batch = json.load(gzip.open(path))
+        yield hashed_batch
+
+
+def run():
+    batches = get_batches()
+    for hashed_batch in batches:
+        print("Recording batch", hashed_batch)
+        response = requests.post(API_ENDPOINT, json=hashed_batch)
+        print("Response", response)
+
+
+# Only start posting batches when this file is executed as a script.
+if __name__ == '__main__':
+    run()
+

+ 6 - 5
poetry.lock

@@ -119,7 +119,7 @@ name = "certifi"
 version = "2021.10.8"
 description = "Python package for providing Mozilla's CA Bundle."
 category = "main"
-optional = true
+optional = false
 python-versions = "*"
 
 [[package]]
@@ -138,7 +138,7 @@ name = "charset-normalizer"
 version = "2.0.12"
 description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
 category = "main"
-optional = true
+optional = false
 python-versions = ">=3.5.0"
 
 [package.extras]
@@ -185,6 +185,7 @@ spacy = ">=3.2.0,<3.3.0"
 [package.source]
 type = "url"
 url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"
+
 [[package]]
 name = "fastapi"
 version = "0.70.1"
@@ -558,7 +559,7 @@ name = "requests"
 version = "2.27.1"
 description = "Python HTTP for Humans."
 category = "main"
-optional = true
+optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
 
 [package.dependencies]
@@ -861,7 +862,7 @@ name = "urllib3"
 version = "1.26.9"
 description = "HTTP library with thread-safe connection pooling, file post, and more."
 category = "main"
-optional = true
+optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4"
 
 [package.extras]
@@ -924,7 +925,7 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx
 [metadata]
 lock-version = "1.1"
 python-versions = ">=3.10,<3.11"
-content-hash = "be01dba545a5f118cb4d1cd2726c0b9fadbba44684fcd4af2bef2e812e28da93"
+content-hash = "82ce3efb1f7108ff006f0db654d1e070429dad98eb0648731effb15533d0ffed"
 
 [metadata.files]
 anyio = [

+ 1 - 0
pyproject.toml

@@ -32,6 +32,7 @@ Levenshtein = {version= "==0.16.0", optional = true}
 # en-core-web-sm requires a compatible version of spacy
 spacy = {version= "==3.2.1", optional = true}
 en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz", optional = true}
+requests = "^2.27.1"
 
 
 [tool.poetry.extras]