Add script to process historical data

This commit is contained in:
Daoud Clarke 2022-06-18 15:31:35 +01:00
parent eb1c59990c
commit d400950689
4 changed files with 78 additions and 31 deletions

View file

@@ -0,0 +1,44 @@
from datetime import date, datetime
import spacy
from mwmbl.crawler.app import get_user_id_hashes_for_date, get_batches_for_date_and_user, get_batch_from_id, \
create_historical_batch, HashedBatch
from mwmbl.indexer.index import tokenize_document
from mwmbl.indexer.paths import INDEX_PATH
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
def run(index_path):
    """Index today's crawled batches into the tiny search index at *index_path*.

    Fetches every user that submitted batches today, pulls each of their
    batches, records them as historical data, then tokenizes and indexes
    every crawled page that has content. Timing of each stage is printed
    for rough profiling.
    """
    nlp = spacy.load("en_core_web_sm")
    today = str(date.today())
    user_ids = get_user_id_hashes_for_date(today)
    print("Users", user_ids)
    with TinyIndex(Document, index_path, 'w') as indexer:
        for user_id in user_ids:
            batch_listing = get_batches_for_date_and_user(today, user_id)
            print("Batches", batch_listing)
            for batch_id in batch_listing["batch_ids"]:
                start = datetime.now()
                raw_batch = get_batch_from_id(today, user_id, batch_id)
                get_batch_time = datetime.now()
                print("Get batch time", get_batch_time - start)
                batch = HashedBatch.parse_obj(raw_batch)
                print("Batch", batch)
                # Persist the batch as historical data before indexing it.
                create_historical_batch(batch)
                create_historical_time = datetime.now()
                print("Create historical time", create_historical_time - get_batch_time)
                for item in batch.items:
                    # Items crawled without retrievable content carry no text to index.
                    if item.content is None:
                        continue
                    # Score is fixed at 1 here; real link counts are unavailable in this path.
                    page = tokenize_document(item.url, item.content.title, item.content.extract, 1, nlp)
                    for token in page.tokens:
                        indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract, score=page.score))
                tokenize_time = datetime.now()
                print("Tokenize time", tokenize_time - create_historical_time)


if __name__ == '__main__':
    run(INDEX_PATH)

View file

@@ -45,19 +45,24 @@ def prepare_url_for_tokenizing(url: str):
# Yield one TokenizedDocument per (cleaned title, url, extract) triple,
# scoring each page by its link count (falling back to DEFAULT_SCORE).
# NOTE(review): this span is a diff hunk whose +/- markers were lost in
# rendering — it appears to contain BOTH the old inline tokenizing lines
# and the new delegating `yield tokenize_document(...)` line; the final
# code most likely keeps only the delegating call — confirm against the
# repository.
def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]:
for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts):
title_tokens = tokenize(nlp, title_cleaned)
# Percent-decode the URL before tokenizing so encoded words become searchable.
prepared_url = prepare_url_for_tokenizing(unquote(url))
url_tokens = tokenize(nlp, prepared_url)
extract_tokens = tokenize(nlp, extract)
print("Extract tokens", extract_tokens)
# Union of the three token sets: any matching term finds the page.
tokens = title_tokens | url_tokens | extract_tokens
score = link_counts.get(url, DEFAULT_SCORE)
yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score)
yield tokenize_document(url, title_cleaned, extract, score, nlp)
# Progress indicator for long runs.
if i % 1000 == 0:
print("Processed", i)
def tokenize_document(url, title_cleaned, extract, score, nlp):
    """Build a TokenizedDocument from a page's title, URL and extract.

    The URL is percent-decoded and normalized before tokenizing so that
    words embedded in it become searchable. The document's token list is
    the union of the tokens from all three sources.
    """
    decoded_url = prepare_url_for_tokenizing(unquote(url))
    all_tokens = (
        tokenize(nlp, title_cleaned)
        | tokenize(nlp, decoded_url)
        | tokenize(nlp, extract)
    )
    return TokenizedDocument(
        tokens=list(all_tokens), url=url, title=title_cleaned, extract=extract, score=score)
def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, link_counts, terms_path):
terms = Counter()
pages = get_pages(nlp, titles_urls_and_extracts, link_counts)

40
poetry.lock generated
View file

@ -68,7 +68,7 @@ name = "blis"
version = "0.7.7"
description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension."
category = "main"
optional = true
optional = false
python-versions = "*"
[package.dependencies]
@ -111,7 +111,7 @@ name = "catalogue"
version = "2.0.7"
description = "Super lightweight function registries for your library"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[[package]]
@ -168,7 +168,7 @@ name = "cymem"
version = "2.0.6"
description = "Manage calls to calloc/free through Cython"
category = "main"
optional = true
optional = false
python-versions = "*"
[[package]]
@ -232,7 +232,7 @@ name = "jinja2"
version = "3.1.2"
description = "A very fast and expressive template engine."
category = "main"
optional = true
optional = false
python-versions = ">=3.7"
[package.dependencies]
@ -273,7 +273,7 @@ name = "langcodes"
version = "3.3.0"
description = "Tools for labeling human languages with IETF language tags"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[package.extras]
@ -320,7 +320,7 @@ name = "markupsafe"
version = "2.1.1"
description = "Safely add untrusted strings to HTML/XML markup."
category = "main"
optional = true
optional = false
python-versions = ">=3.7"
[[package]]
@ -336,7 +336,7 @@ name = "murmurhash"
version = "1.0.7"
description = "Cython bindings for MurmurHash"
category = "main"
optional = true
optional = false
python-versions = "*"
[[package]]
@ -379,7 +379,7 @@ name = "pathy"
version = "0.6.1"
description = "pathlib.Path subclasses for local and cloud bucket storage"
category = "main"
optional = true
optional = false
python-versions = ">= 3.6"
[package.dependencies]
@ -409,7 +409,7 @@ name = "preshed"
version = "3.0.6"
description = "Cython hash table that trusts the keys are pre-hashed"
category = "main"
optional = true
optional = false
python-versions = "*"
[package.dependencies]
@ -637,7 +637,7 @@ name = "smart-open"
version = "5.2.1"
description = "Utils for streaming large files (S3, HDFS, GCS, Azure Blob Storage, gzip, bz2...)"
category = "main"
optional = true
optional = false
python-versions = ">=3.6,<4.0"
[package.extras]
@ -670,7 +670,7 @@ name = "spacy"
version = "3.2.1"
description = "Industrial-strength Natural Language Processing (NLP) in Python"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[package.dependencies]
@ -721,7 +721,7 @@ name = "spacy-legacy"
version = "3.0.9"
description = "Legacy registered functions for spaCy backwards compatibility"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[[package]]
@ -729,7 +729,7 @@ name = "spacy-loggers"
version = "1.0.2"
description = "Logging utilities for SpaCy"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[package.dependencies]
@ -740,7 +740,7 @@ name = "srsly"
version = "2.4.3"
description = "Modern high-performance serialization utilities for Python"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[package.dependencies]
@ -765,7 +765,7 @@ name = "thinc"
version = "8.0.17"
description = "A refreshing functional take on deep learning, compatible with your favorite libraries"
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[package.dependencies]
@ -820,7 +820,7 @@ name = "tqdm"
version = "4.64.0"
description = "Fast, Extensible Progress Meter"
category = "main"
optional = true
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7"
[package.dependencies]
@ -837,7 +837,7 @@ name = "typer"
version = "0.4.1"
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
category = "main"
optional = true
optional = false
python-versions = ">=3.6"
[package.dependencies]
@ -910,7 +910,7 @@ name = "wasabi"
version = "0.9.1"
description = "A lightweight console printing and formatting toolkit"
category = "main"
optional = true
optional = false
python-versions = "*"
[[package]]
@ -928,12 +928,12 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"]
[extras]
indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein", "spacy", "en-core-web-sm"]
indexer = ["ujson", "warcio", "idna", "beautifulsoup4", "lxml", "jusText", "langdetect", "pyarrow", "pyspark", "Levenshtein", "en-core-web-sm"]
[metadata]
lock-version = "1.1"
python-versions = ">=3.10,<3.11"
content-hash = "0782bf407e18e268c5a53aede8595a60b1a545e4ed3709e87191f2bb1678e92c"
content-hash = "cd17d671d8fa708240c3bf7bf5240bcf4bbcebea4538e5ad091bbcf9d10c5574"
[metadata.files]
anyio = [

View file

@ -17,6 +17,7 @@ pyyaml = "==6.0"
boto3 = "^1.20.37"
requests = "^2.27.1"
psycopg2-binary = "^2.9.3"
spacy = "==3.2.1"
# Optional dependencies do not get installed by default. Look under tool.poetry.extras section
# to see which extras to use.
@ -31,8 +32,7 @@ pyarrow = {version= "==6.0.0", optional = true}
pyspark = {version= "==3.2.0", optional = true}
Levenshtein = {version= "==0.16.0", optional = true}
# en-core-web-sm requires a compatible version of spacy
spacy = {version= "==3.2.1", optional = true}
en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz", optional = true}
en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz"}
[tool.poetry.extras]
@ -48,8 +48,6 @@ indexer = [
"pyspark",
"Levenshtein",
# en-core-web-sm requires a compatible version of spacy
"spacy",
"en-core-web-sm",
]
[tool.poetry.dev-dependencies]