Add script to process historical data

This commit is contained in:
Daoud Clarke 2022-06-18 15:31:35 +01:00
parent eb1c59990c
commit d400950689
4 changed files with 78 additions and 31 deletions

View file

@@ -0,0 +1,44 @@
from datetime import date, datetime
import spacy
from mwmbl.crawler.app import get_user_id_hashes_for_date, get_batches_for_date_and_user, get_batch_from_id, \
create_historical_batch, HashedBatch
from mwmbl.indexer.index import tokenize_document
from mwmbl.indexer.paths import INDEX_PATH
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
def run(index_path):
    """Index today's crawled batches into the tiny search index at *index_path*.

    Fetches every user that submitted batches today, pulls each of their
    batches, records them as historical data, then tokenizes and indexes
    every crawled page that has content. Timing of each stage is printed
    for rough profiling.
    """
    nlp = spacy.load("en_core_web_sm")
    today = str(date.today())
    user_ids = get_user_id_hashes_for_date(today)
    print("Users", user_ids)
    with TinyIndex(Document, index_path, 'w') as indexer:
        for user_id in user_ids:
            batch_listing = get_batches_for_date_and_user(today, user_id)
            print("Batches", batch_listing)
            for batch_id in batch_listing["batch_ids"]:
                start = datetime.now()
                raw_batch = get_batch_from_id(today, user_id, batch_id)
                get_batch_time = datetime.now()
                print("Get batch time", get_batch_time - start)
                batch = HashedBatch.parse_obj(raw_batch)
                print("Batch", batch)
                # Persist the batch as historical data before indexing it.
                create_historical_batch(batch)
                create_historical_time = datetime.now()
                print("Create historical time", create_historical_time - get_batch_time)
                for item in batch.items:
                    # Items crawled without retrievable content carry no text to index.
                    if item.content is None:
                        continue
                    # Score is fixed at 1 here; real link counts are unavailable in this path.
                    page = tokenize_document(item.url, item.content.title, item.content.extract, 1, nlp)
                    for token in page.tokens:
                        indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract, score=page.score))
                tokenize_time = datetime.now()
                print("Tokenize time", tokenize_time - create_historical_time)


if __name__ == '__main__':
    run(INDEX_PATH)

View file

@@ -45,19 +45,24 @@ def prepare_url_for_tokenizing(url: str):
# Yield one TokenizedDocument per (cleaned title, url, extract) triple,
# scoring each page by its link count (falling back to DEFAULT_SCORE).
# NOTE(review): this span is a diff hunk whose +/- markers were lost in
# rendering — it appears to contain BOTH the old inline tokenizing lines
# and the new delegating `yield tokenize_document(...)` line; the final
# code most likely keeps only the delegating call — confirm against the
# repository.
def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
title_tokens = tokenize(nlp, title_cleaned)
# Percent-decode the URL before tokenizing so encoded words become searchable.
prepared_url = prepare_url_for_tokenizing(unquote(url))
url_tokens = tokenize(nlp, prepared_url)
extract_tokens = tokenize(nlp, extract)
print("Extract tokens", extract_tokens)
# Union of the three token sets: any matching term finds the page.
tokens = title_tokens | url_tokens | extract_tokens
score = link_counts.get(url, DEFAULT_SCORE)
yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)
yield tokenize_document(url, title_cleaned, extract, score, nlp)
# Progress indicator for long runs.
if i % 1000 == 0:
print("Processed", i)
def tokenize_document(url, title_cleaned, extract, score, nlp):
    """Build a TokenizedDocument from a page's title, URL and extract.

    The URL is percent-decoded and normalized before tokenizing so that
    words embedded in it become searchable. The document's token list is
    the union of the tokens from all three sources.
    """
    decoded_url = prepare_url_for_tokenizing(unquote(url))
    all_tokens = (
        tokenize(nlp, title_cleaned)
        | tokenize(nlp, decoded_url)
        | tokenize(nlp, extract)
    )
    return TokenizedDocument(
        tokens=list(all_tokens), url=url, title=title_cleaned, extract=extract, score=score)
def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, link_counts, terms_path):
terms = Counter()
pages = get_pages(nlp, titles_urls_and_extracts, link_counts)

40
poetry.lock generated
View file

@ -68,7 +68,7 @@ name = "blis"
version = "0.7.7"
description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension."
category = "main"
optional = true
optional = false
python-versions = "*"
[package.dependencies]
@ -111,7 +111,7 @@ name = "catalogue"
version = "2.0.7"
description = "Super lightweight function registries for your library"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[[package]]
@ -168,7 +168,7 @@ name = "cymem"
version = "2.0.6"
description = "Manage calls to calloc/free through Cython"
category = "main"
optional = true
optional = false
python-versions = "*"
[[package]]
@ -232,7 +232,7 @@ name = "jinja2"
version = "3.1.2"
description = "A very fast and expressive template engine."
category = "main"
optional = true
optional = false
python-versions = ">=3.7"
[package.dependencies]
@ -273,7 +273,7 @@ name = "langcodes"
version = "3.3.0"
description = "Tools for labeling human languages with IETF language tags"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[package.extras]
@ -320,7 +320,7 @@ name = "markupsafe"
version = "2.1.1"
description = "Safely add untrusted strings to HTML/XML markup."
category = "main"
optional = true
optional = false
python-versions = ">=3.7"
[[package]]
@ -336,7 +336,7 @@ name = "murmurhash"
version = "1.0.7"
description = "Cython bindings for MurmurHash"
category = "main"
optional = true
optional = false
python-versions = "*"
[[package]]
@ -379,7 +379,7 @@ name = "pathy"
version = "0.6.1"
description = "pathlib.Path subclasses for local and cloud bucket storage"
category = "main"
optional = true
optional = false
python-versions = ">= 3.6"
[package.dependencies]
@ -409,7 +409,7 @@ name = "preshed"
version = "3.0.6"
description = "Cython hash table that trusts the keys are pre-hashed"
category = "main"
optional = true
optional = false
python-versions = "*"
[package.dependencies]
@ -637,7 +637,7 @@ name = "smart-open"
version = "5.2.1"
description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
category = "main"
optional = true
optional = false
python-versions = ">=3.6,<4.0"
[package.extras]
@ -670,7 +670,7 @@ name = "spacy"
version = "3.2.1"
description = "Industrial-strength Natural Language Processing (NLP) in Python"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[package.dependencies]
@ -721,7 +721,7 @@ name = "spacy-legacy"
version = "3.0.9"
description = "Legacy registered functions for spaCy backwards compatibility"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[[package]]
@ -729,7 +729,7 @@ name = "spacy-loggers"
version = "1.0.2"
description = "Logging utilities for SpaCy"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[package.dependencies]
@ -740,7 +740,7 @@ name = "srsly"
version = "2.4.3"
description = "Modern high-performance serialization utilities for Python"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[package.dependencies]
@ -765,7 +765,7 @@ name = "thinc"
version = "8.0.17"
description = "A refreshing functional take on deep learning, compatible with your favorite libraries"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[package.dependencies]
@ -820,7 +820,7 @@ name = "tqdm"
version = "4.64.0"
description = "Fast, Extensible Progress Meter"
category = "main"
optional = true
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
[package.dependencies]
@ -837,7 +837,7 @@ name = "typer"
version = "0.4.1"
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[package.dependencies]
@ -910,7 +910,7 @@ name = "wasabi"
version = "0.9.1"
description = "A lightweight console printing and formatting toolkit"
category = "main"
optional = true
optional = false
python-versions = "*"
[[package]]
@ -928,12 +928,12 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"]
[extras]
indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein", "spacy", "en-core-web-sm"]
indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein", "en-core-web-sm"]
[metadata]
lock-version = "1.1"
python-versions = ">=3.10,<3.11"
content-hash = "0782bf407e18e268c5a53aede8595a60b1a545e4ed3709e87191f2bb1678e92c"
content-hash = "cd17d671d8fa708240c3bf7bf5240bcf4bbcebea4538e5ad091bbcf9d10c5574"
[metadata.files]
anyio = [

View file

@ -17,6 +17,7 @@ pyyaml = "==6.0"
boto3 = "^1.20.37"
requests = "^2.27.1"
psycopg2-binary = "^2.9.3"
spacy = "==3.2.1"
# Optional dependencies do not get installed by default. Look under tool.poetry.extras section
# to see which extras to use.
@ -31,8 +32,7 @@ pyarrow = {version= "==6.0.0", optional = true}
pyspark = {version= "==3.2.0", optional = true}
Levenshtein = {version= "==0.16.0", optional = true}
# en-core-web-sm requires a compatible version of spacy
spacy = {version= "==3.2.1", optional = true}
en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz", optional = true}
en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"}
[tool.poetry.extras]
@ -48,8 +48,6 @@ indexer = [
"pyspark",
"Levenshtein",
# en-core-web-sm requires a compatible version of spacy
"spacy",
"en-core-web-sm",
]
[tool.poetry.dev-dependencies]