diff --git a/Dockerfile b/Dockerfile index 4ac39f6..bebe590 100644 --- a/Dockerfile +++ b/Dockerfile @@ -39,4 +39,5 @@ COPY data /app/data COPY config /app/config # Using the mwmbl-tinysearchengine binary/entrypoint which comes packaged with mwmbl +# TODO: fix the arguments for the recent changes CMD ["/venv/bin/mwmbl-tinysearchengine", "--config", "config/tinysearchengine.yaml"] diff --git a/analyse/analyse_crawled_domains.py b/analyse/analyse_crawled_domains.py index 8641f61..a3d36b1 100644 --- a/analyse/analyse_crawled_domains.py +++ b/analyse/analyse_crawled_domains.py @@ -7,7 +7,7 @@ import json from collections import defaultdict, Counter from urllib.parse import urlparse -CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz" +from mwmbl.indexer.paths import CRAWL_GLOB def get_urls(): diff --git a/analyse/make_curl.py b/analyse/make_curl.py deleted file mode 100644 index 465f990..0000000 --- a/analyse/make_curl.py +++ /dev/null @@ -1,29 +0,0 @@ -""" -Make a curl script for testing performance -""" -import os -from itertools import islice -from urllib.parse import quote - -from mwmbl.indexer.paths import DATA_DIR -from mwmbl.indexer.wiki import get_wiki_titles_and_urls - -URL_TEMPLATE = "http://localhost:8000/complete?q={}" -CURL_FILE = os.path.join(DATA_DIR, "urls.curl") - - -def get_urls(): - titles_and_urls = get_wiki_titles_and_urls() - for title, url in islice(titles_and_urls, 100): - query = quote(title.lower()) - yield URL_TEMPLATE.format(query) - - -def run(): - with open(CURL_FILE, 'wt') as output_file: - for url in get_urls(): - output_file.write(f'url="{url}"\n') - - -if __name__ == '__main__': - run() diff --git a/analyse/performance.py b/analyse/performance.py deleted file mode 100644 index 0bac7f9..0000000 --- a/analyse/performance.py +++ /dev/null @@ -1,120 +0,0 @@ -""" -Test the performance of the search in terms of compression and speed. -""" -import os -from datetime import datetime - -import numpy as np -from spacy.lang.en import English -from starlette.testclient import TestClient - -from mwmbl.tinysearchengine import create_app -from mwmbl.indexer.fsqueue import ZstdJsonSerializer -from mwmbl.indexer.index import index_titles_urls_and_extracts -from mwmbl.tinysearchengine.indexer import TinyIndex, TinyIndexer, Document -from mwmbl.indexer.paths import TEST_INDEX_PATH, DATA_DIR, TEST_TERMS_PATH - -NUM_DOCUMENTS = 30000 -NUM_PAGES_FOR_STATS = 10 -TEST_PAGE_SIZE = 512 -TEST_NUM_PAGES = 1024 -TEST_DATA_PATH = os.path.join(DATA_DIR, 'test-urls.zstd') -RECALL_AT_K = 3 - -NUM_QUERY_CHARS = 10 - - -def get_test_pages(): - serializer = ZstdJsonSerializer() - with open(TEST_DATA_PATH, 'rb') as data_file: - data = serializer.deserialize(data_file.read()) - return [(row['title'], row['url']) for row in data if row['title'] is not None] - - -def query_test(): - titles_and_urls = get_test_pages() - print(f"Got {len(titles_and_urls)} titles and URLs") - tiny_index = TinyIndex(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) - - app = create_app.create() - client = TestClient(app) - - start = datetime.now() - hits = 0 - count = 0 - for title, url in titles_and_urls: - query = title[:NUM_QUERY_CHARS] - result = client.get('/complete', params={'q': query}) - assert result.status_code == 200 - data = result.json() - - hit = False - if data: - for result in data[1][:RECALL_AT_K]: - if url in result: - hit = True - break - - if hit: - hits += 1 - else: - print("Miss", data, title, url, sep='\n') - - count += 1 - - end = datetime.now() - print(f"Hits: {hits} out of {count}") - print(f"Recall at {RECALL_AT_K}: {hits/count}") - print("Query time:", (end - start).total_seconds() / NUM_DOCUMENTS) - - -def page_stats(indexer: TinyIndexer): - pages_and_sizes = [] - for i in range(TEST_NUM_PAGES): - page = indexer.get_page(i) - if page is not None: - pages_and_sizes.append((len(page), page)) - big_page_sizes, big_pages = zip(*sorted(pages_and_sizes, reverse=True)[:NUM_PAGES_FOR_STATS]) - return np.mean(big_page_sizes), np.std(big_page_sizes), big_pages - - -def performance_test(): - nlp = English() - try: - os.remove(TEST_INDEX_PATH) - except FileNotFoundError: - print("No test index found, creating") - with TinyIndexer(Document, TEST_INDEX_PATH, TEST_NUM_PAGES, TEST_PAGE_SIZE) as indexer: - titles_and_urls = get_test_pages() - - start_time = datetime.now() - index_titles_urls_and_extracts(indexer, nlp, titles_and_urls, TEST_TERMS_PATH) - stop_time = datetime.now() - - index_time = (stop_time - start_time).total_seconds() - index_size = os.path.getsize(TEST_INDEX_PATH) - - page_size_mean, page_size_std, big_pages = page_stats(indexer) - - print("Indexed pages:", NUM_DOCUMENTS) - print("Index time:", index_time) - print("Index size:", index_size) - print("Mean docs per page:", page_size_mean) - print("Std err of docs per page:", page_size_std) - print("Big pages") - print_pages(big_pages) - # print("Num tokens", indexer.get_num_tokens()) - - query_test() - - -def print_pages(pages): - for page in pages: - print("Page", page) - for title, url in page: - print(title, url) - print() - - -if __name__ == '__main__': - performance_test() diff --git a/mwmbl/indexer/batch.py b/mwmbl/indexer/batch.py new file mode 100644 index 0000000..86887ac --- /dev/null +++ b/mwmbl/indexer/batch.py @@ -0,0 +1,10 @@ +from itertools import islice +from typing import Iterator + + +def grouper(n: int, iterator: Iterator): + while True: + chunk = tuple(islice(iterator, n)) + if not chunk: + return + yield chunk \ No newline at end of file diff --git a/mwmbl/indexer/bootstrap.sh b/mwmbl/indexer/bootstrap.sh deleted file mode 100644 index 6186fae..0000000 --- a/mwmbl/indexer/bootstrap.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -xe - -sudo python3 -m pip uninstall numpy -y -sudo python3 -m pip uninstall numpy -y -sudo python3 -m pip uninstall numpy -y - -sudo python3 -m pip install boto3==1.19.7 botocore==1.22.7 jusText==3.0.0 langdetect==1.0.9 \ - lxml==4.6.3 numpy==1.21.3 pandas==1.2.5 pyarrow==6.0.0 spacy==2.3.5 \ - warcio==1.7.4 zstandard==0.16.0 - -sudo python3 -m spacy download en_core_web_sm - -echo "========================" -echo "Normal python pip freeze" -python3 -m pip freeze diff --git a/mwmbl/indexer/crawl.py b/mwmbl/indexer/crawl.py deleted file mode 100644 index 11405d0..0000000 --- a/mwmbl/indexer/crawl.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -Crawl the web -""" -import gzip -import hashlib -import os -import sys -from traceback import print_tb, print_exc - -import pandas as pd -import requests - -from .paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX - - -def crawl(): - data = pd.read_csv(HN_TOP_PATH) - - for url in data['url']: - filename = hashlib.md5(url.encode('utf8')).hexdigest() - path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz") - if os.path.isfile(path): - print("Path already exists, skipping", url) - continue - - print("Fetching", url) - try: - html = fetch(url) - except Exception: - print_exc(file=sys.stderr) - print("Unable to fetch", url) - continue - - with gzip.open(path, 'wt') as output: - output.write(url + '\n') - output.write(html) - - -def fetch(url): - page_data = requests.get(url, timeout=10) - return page_data.text - - -if __name__ == '__main__': - crawl() diff --git a/mwmbl/indexer/dedupe.py b/mwmbl/indexer/dedupe.py new file mode 100644 index 0000000..5a09f4d --- /dev/null +++ b/mwmbl/indexer/dedupe.py @@ -0,0 +1,42 @@ +""" +Dedupe pages that have been crawled more than once and prepare them for indexing +""" +import glob +import gzip +import json + +from mwmbl.indexer.batch import grouper +from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer +from mwmbl.indexer.paths import CRAWL_GLOB, TINYSEARCH_DATA_DIR + +BATCH_SIZE = 100 + + +def get_deduped_pages(): + seen_urls = set() + for path in sorted(glob.glob(CRAWL_GLOB), reverse=True): + data = json.load(gzip.open(path)) + for item in data['items']: + url = item['url'] + if url in seen_urls: + continue + + seen_urls.add(url) + yield item + + +def queue_deduped_items(deduped_pages): + output_queue = FSQueue(TINYSEARCH_DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer()) + + for batch in grouper(BATCH_SIZE, deduped_pages): + data = {'items': batch} + output_queue.put(data) + + +def run(): + deduped_pages = get_deduped_pages() + queue_deduped_items(deduped_pages) + + +if __name__ == '__main__': + run() diff --git a/mwmbl/indexer/deploy.sh b/mwmbl/indexer/deploy.sh deleted file mode 100644 index b17ddb2..0000000 --- a/mwmbl/indexer/deploy.sh +++ /dev/null @@ -1,20 +0,0 @@ -cat hn-top-domains-filtered.py extract.py > runextract.py - -aws s3 cp runextract.py s3://tinysearch/code/ -aws s3 cp bootstrap.sh s3://tinysearch/code/ - - -aws emr create-cluster \ - --applications Name=Spark Name=Zeppelin \ - --ec2-attributes '{"InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"subnet-03c33360c68f73a48"}' \ - --service-role EMR_DefaultRole \ - --enable-debugging \ - --release-label emr-5.33.1 \ - --log-uri 's3n://tinysearch/pyspark-logs/' \ - --bootstrap-actions '{"Path": "s3://tinysearch/code/bootstrap.sh"}' \ - --steps '[{"Args":["spark-submit","--deploy-mode","cluster","s3n://tinysearch/code/runextract.py"],"Type":"CUSTOM_JAR","ActionOnFailure":"CONTINUE","Jar":"command-runner.jar","Properties":"","Name":"Spark application"}]' \ - --name 'TinySearch' \ - --instance-groups '[{"InstanceCount":2,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"CORE","InstanceType":"m4.large","Name":"Core Instance Group"},{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":1}]},"InstanceGroupType":"MASTER","InstanceType":"m4.large","Name":"Master Instance Group"}]' \ - --configurations '[{"Classification":"spark","Properties":{}}]' \ - --scale-down-behavior TERMINATE_AT_TASK_COMPLETION --region us-east-1 \ - --auto-terminate diff --git a/mwmbl/indexer/domains/domain_titles.py b/mwmbl/indexer/domains/domain_titles.py index 907367e..43cf520 100644 --- a/mwmbl/indexer/domains/domain_titles.py +++ b/mwmbl/indexer/domains/domain_titles.py @@ -9,7 +9,7 @@ import bs4 import requests from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer -from mwmbl.indexer.paths import DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME +from mwmbl.indexer.paths import TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME, DOMAINS_TITLES_QUEUE_NAME NUM_PROCESSES = 10 @@ -33,8 +33,8 @@ def get_redirect_no_cookies(url, max_redirects=5): def get_domain_titles(): - domains_queue = FSQueue(DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer()) - titles_queue = FSQueue(DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, ZstdJsonSerializer()) + domains_queue = FSQueue(TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer()) + titles_queue = FSQueue(TINYSEARCH_DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, ZstdJsonSerializer()) while True: items_id, items = domains_queue.get() titles = retrieve_titles(items) diff --git a/mwmbl/indexer/domains/queue_domains.py b/mwmbl/indexer/domains/queue_domains.py index 3eb7ac6..02df697 100644 --- a/mwmbl/indexer/domains/queue_domains.py +++ b/mwmbl/indexer/domains/queue_domains.py @@ -5,7 +5,7 @@ import csv import gzip from mwmbl.indexer.fsqueue import FSQueue, ZstdJsonSerializer -from mwmbl.indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, DATA_DIR +from mwmbl.indexer.paths import DOMAINS_PATH, DOMAINS_QUEUE_NAME, TINYSEARCH_DATA_DIR BATCH_SIZE = 250 @@ -18,7 +18,7 @@ def get_domains(): def queue_domains(): - queue = FSQueue(DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer()) + queue = FSQueue(TINYSEARCH_DATA_DIR, DOMAINS_QUEUE_NAME, ZstdJsonSerializer()) queued = 0 batch = [] for rank, domain in get_domains(): diff --git a/mwmbl/indexer/extract.py b/mwmbl/indexer/extract.py deleted file mode 100644 index a397e05..0000000 --- a/mwmbl/indexer/extract.py +++ /dev/null @@ -1,73 +0,0 @@ -""" -Extract content from HTML files and store it as compressed JSON -""" - -from urllib.parse import urlparse - -from pyspark.sql import SparkSession -from pyspark.sql.functions import col -from pyspark.sql.types import StructType, StructField, StringType, LongType, IntegerType - -RECORDS_PATH = 's3://tinysearch/outputs/records' -OUTPUT_PATH = 's3://tinysearch/outputs/index' - - -index_schema = StructType([ - StructField("term_hash", LongType(), False), - StructField("data", StringType(), False), - StructField("top", StringType(), False), -]) - - -output_schema = StructType([ - StructField("uri", StringType(), False), - StructField("title", StringType(), False), - StructField("extract", StringType(), False), -]) - - -record_schema = StructType([ - StructField("url", StringType(), False), - StructField("warc_filename", StringType(), False), - StructField("warc_record_offset", IntegerType(), False), - StructField("warc_record_length", IntegerType(), False), -]) - - -spark = SparkSession \ - .builder \ - .appName("Python Spark SQL basic example") \ - .config("spark.some.config.option", "some-value") \ - .getOrCreate() - - -def run(): - # sqlc = SQLContext(sparkContext=spark) - - df = spark.read.load('s3://commoncrawl/cc-index/table/cc-main/warc/') - df.createOrReplaceTempView('ccindex') - sqldf = spark.sql('''SELECT url, warc_filename, warc_record_offset, - warc_record_length - FROM ccindex - WHERE crawl = 'CC-MAIN-2021-43' - AND subset = 'warc' - ''') - sqldf = sqldf.sample(fraction=0.01) - sqldf = sqldf.filter(col('url_host_name').isin(list(DOMAINS.keys()))) - # print("Got rows", sqldf.take(10)) - # print("Num rows", sqldf.count()) - sqldf.write.option('compression', 'gzip').format('json').mode('overwrite').save(RECORDS_PATH) - - # warc_recs = sqldf.select("url", "warc_filename", "warc_record_offset", "warc_record_length").rdd - # rdd = warc_recs.mapPartitions(fetch_process_warc_records) - # output = sqlc.createDataFrame(rdd, schema=output_schema) - # output.write.option('compression', 'gzip').format('json').mode('overwrite').save(OUTPUT_PATH) - - -def get_domain_rating(url): - domain = urlparse(url).netloc - return DOMAINS.get(domain) - - -if __name__ == '__main__': - run() diff --git a/mwmbl/indexer/extract_local.py b/mwmbl/indexer/extract_local.py deleted file mode 100644 index b293f08..0000000 --- a/mwmbl/indexer/extract_local.py +++ /dev/null @@ -1,63 +0,0 @@ -import gzip -import json -import os -from glob import glob -from multiprocessing import Process, Lock - -from .extract_process import fetch_process_warc_records -from .fsqueue import FSQueue, GzipJsonRowSerializer -from .paths import DATA_DIR - -ARCHIVE_INFO_GLOB = 'outputs/records/*.gz' - -NUM_PROCESSES = 8 - - -def get_records(): - for path in glob(ARCHIVE_INFO_GLOB): - with gzip.open(path) as data_file: - for line in data_file: - yield json.loads(line) - - -def process(record): - print("Record", record) - return list(fetch_process_warc_records([record])) - - -def run(lock: Lock): - input_queue = FSQueue(DATA_DIR, 'records', GzipJsonRowSerializer()) - output_queue = FSQueue(DATA_DIR, 'search-items', GzipJsonRowSerializer()) - - while True: - with lock: - queue_item = input_queue.get() - if queue_item is None: - print("All finished, stopping:", os.getpid()) - break - item_id, records = queue_item - print("Got item: ", item_id, os.getpid()) - search_items = [] - for record in records: - search_items += list(fetch_process_warc_records([record])) - if search_items: - output_queue.put(search_items) - input_queue.done(item_id) - - -def run_multiprocessing(): - input_queue = FSQueue(DATA_DIR, 'records', GzipJsonRowSerializer()) - input_queue.unlock_all() - processes = [] - lock = Lock() - for i in range(NUM_PROCESSES): - new_process = Process(target=run, args=(lock,)) - new_process.start() - processes.append(new_process) - - for running_process in processes: - running_process.join() - - -if __name__ == '__main__': - run_multiprocessing() diff --git a/mwmbl/indexer/extract_process.py b/mwmbl/indexer/extract_process.py deleted file mode 100644 index 54eb247..0000000 --- a/mwmbl/indexer/extract_process.py +++ /dev/null @@ -1,137 +0,0 @@ -from io import BytesIO - -import boto3 -from justext import get_stoplist -from justext.core import LENGTH_LOW_DEFAULT, LENGTH_HIGH_DEFAULT, STOPWORDS_LOW_DEFAULT, STOPWORDS_HIGH_DEFAULT, \ - MAX_LINK_DENSITY_DEFAULT, MAX_HEADING_DISTANCE_DEFAULT, NO_HEADINGS_DEFAULT, DEFAULT_ENCODING, DEFAULT_ENC_ERRORS, \ - preprocessor, html_to_dom, ParagraphMaker, classify_paragraphs, revise_paragraph_classification -from langdetect import detect -from lxml.etree import ParserError -from warcio import ArchiveIterator - -MAX_URI_LENGTH = 150 -NUM_CHARS_TO_ANALYSE = 1000 -NUM_TITLE_CHARS = 65 -NUM_EXTRACT_CHARS = 155 - - -def fetch_process_warc_records(rows): - """Fetch all WARC records defined by filenames and offsets in rows, - parse the records and the contained HTML, split the text into words - and emit pairs """ - s3client = boto3.client('s3') - for row in rows: - warc_path = row['warc_filename'] - offset = int(row['warc_record_offset']) - length = int(row['warc_record_length']) - rangereq = 'bytes={}-{}'.format(offset, (offset+length-1)) - response = s3client.get_object(Bucket='commoncrawl', - Key=warc_path, - Range=rangereq) - record_stream = BytesIO(response["Body"].read()) - for record in ArchiveIterator(record_stream): - for result in process_record(record): - yield result - - -def is_html(record): - """Return true if (detected) MIME type of a record is HTML""" - html_types = ['text/html', 'application/xhtml+xml'] - if (('WARC-Identified-Payload-Type' in record.rec_headers) and - (record.rec_headers['WARC-Identified-Payload-Type'] in - html_types)): - return True - content_type = record.http_headers.get_header('content-type', None) - if content_type: - for html_type in html_types: - if html_type in content_type: - return True - return False - - -def justext(html_text, stoplist, length_low=LENGTH_LOW_DEFAULT, - length_high=LENGTH_HIGH_DEFAULT, stopwords_low=STOPWORDS_LOW_DEFAULT, - stopwords_high=STOPWORDS_HIGH_DEFAULT, max_link_density=MAX_LINK_DENSITY_DEFAULT, - max_heading_distance=MAX_HEADING_DISTANCE_DEFAULT, no_headings=NO_HEADINGS_DEFAULT, - encoding=None, default_encoding=DEFAULT_ENCODING, - enc_errors=DEFAULT_ENC_ERRORS, preprocessor=preprocessor): - """ - Converts an HTML page into a list of classified paragraphs. Each paragraph - is represented as instance of class ˙˙justext.paragraph.Paragraph˙˙. - """ - dom = html_to_dom(html_text, default_encoding, encoding, enc_errors) - print("Parsed HTML") - - try: - title = dom.find(".//title").text - except AttributeError: - title = None - - preprocessed_dom = preprocessor(dom) - - paragraphs = ParagraphMaker.make_paragraphs(preprocessed_dom) - print("Got paragraphs") - - classify_paragraphs(paragraphs, stoplist, length_low, length_high, - stopwords_low, stopwords_high, max_link_density, no_headings) - revise_paragraph_classification(paragraphs, max_heading_distance) - - return paragraphs, title - - -def process_record(record): - # print("Record", record.format, record.rec_type, record.rec_headers, record.raw_stream, - # record.http_headers, record.content_type, record.length) - - if record.rec_type != 'response': - # skip over WARC request or metadata records - return - if not is_html(record): - return - - uri = record.rec_headers.get_header('WARC-Target-URI') - if len(uri) > MAX_URI_LENGTH: - print("URI too long", len(uri)) - return - - # rating = get_domain_rating(uri) - # print("Rating", rating) - # if rating is None: - # return - - content = record.content_stream().read().strip() - # print("Content", uri, content[:100]) - - if not content: - return - - try: - all_paragraphs, full_title = justext(content, get_stoplist('English')) - except UnicodeDecodeError: - print("Unable to decode unicode") - return - except ParserError: - print("Unable to parse") - return - - if full_title is None: - print("Missing title") - return - - title = full_title[:NUM_TITLE_CHARS] + '…' \ - if len(full_title) > NUM_TITLE_CHARS else full_title - - text = '\n'.join([p.text for p in all_paragraphs - if not p.is_boilerplate])[:NUM_CHARS_TO_ANALYSE] - print("Paragraphs", text) - - if len(text) < NUM_EXTRACT_CHARS: - return - - language = detect(text) - print("Got language", language) - if language != 'en': - return - - extract = text[:NUM_EXTRACT_CHARS] - yield uri, title, extract \ No newline at end of file diff --git a/mwmbl/indexer/fsqueue.py b/mwmbl/indexer/fsqueue.py index f05b2d4..88787d9 100644 --- a/mwmbl/indexer/fsqueue.py +++ b/mwmbl/indexer/fsqueue.py @@ -7,7 +7,7 @@ import json import os from abc import ABC from enum import Enum -from typing import Union +from typing import Union, Any from uuid import uuid4 from pathlib import Path @@ -59,10 +59,10 @@ class GzipJsonRowSerializer(Serializer): class GzipJsonBlobSerializer(Serializer): - def serialize(self, items: list[object]) -> bytes: - raise NotImplementedError("Serializer not needed - blob is generated by browser extension") + def serialize(self, items: Any) -> bytes: + return gzip.compress(json.dumps(items).encode('utf8')) - def deserialize(self, serialized_items: bytes) -> list[object]: + def deserialize(self, serialized_items: bytes) -> Any: data = gzip.decompress(serialized_items).decode('utf8') return json.loads(data) diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py index 9368433..c772c55 100644 --- a/mwmbl/indexer/index.py +++ b/mwmbl/indexer/index.py @@ -2,16 +2,15 @@ Create a search index """ from collections import Counter -from itertools import islice -from typing import Iterator, Iterable +from typing import Iterable from urllib.parse import unquote import pandas as pd -# NUM_PAGES = 8192 -# PAGE_SIZE = 512 from mwmbl.tinysearchengine.indexer import Document, TokenizedDocument, TinyIndex +DEFAULT_SCORE = 0 + HTTP_START = 'http://' HTTPS_START = 'https://' BATCH_SIZE = 100 @@ -44,7 +43,7 @@ def prepare_url_for_tokenizing(url: str): return url -def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]: +def get_pages(nlp, titles_urls_and_extracts, link_counts) -> Iterable[TokenizedDocument]: for i, (title_cleaned, url, extract) in enumerate(titles_urls_and_extracts): title_tokens = tokenize(nlp, title_cleaned) prepared_url = prepare_url_for_tokenizing(unquote(url)) @@ -52,26 +51,19 @@ def get_pages(nlp, titles_urls_and_extracts) -> Iterable[TokenizedDocument]: extract_tokens = tokenize(nlp, extract) print("Extract tokens", extract_tokens) tokens = title_tokens | url_tokens | extract_tokens - yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract) + score = link_counts.get(url, DEFAULT_SCORE) + yield TokenizedDocument(tokens=list(tokens), url=url, title=title_cleaned, extract=extract, score=score) if i % 1000 == 0: print("Processed", i) -def grouper(n: int, iterator: Iterator): - while True: - chunk = tuple(islice(iterator, n)) - if not chunk: - return - yield chunk - - -def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, terms_path): +def index_titles_urls_and_extracts(indexer: TinyIndex, nlp, titles_urls_and_extracts, link_counts, terms_path): terms = Counter() - pages = get_pages(nlp, titles_urls_and_extracts) + pages = get_pages(nlp, titles_urls_and_extracts, link_counts) for page in pages: for token in page.tokens: - indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract)) + indexer.index(token, Document(url=page.url, title=page.title, extract=page.extract, score=page.score)) terms.update([t.lower() for t in page.tokens]) term_df = pd.DataFrame({ diff --git a/mwmbl/indexer/index_crawl.py b/mwmbl/indexer/index_crawl.py index 44e557e..a22f0d0 100644 --- a/mwmbl/indexer/index_crawl.py +++ b/mwmbl/indexer/index_crawl.py @@ -1,13 +1,14 @@ """ Index data crawled through the Mwmbl crawler. """ +import json from logging import getLogger import spacy from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError from mwmbl.indexer.index import index_titles_urls_and_extracts -from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR +from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, TINYSEARCH_DATA_DIR, LINK_COUNT_PATH from mwmbl.tinysearchengine.indexer import TinyIndex, Document, NUM_PAGES, PAGE_SIZE @@ -16,16 +17,16 @@ logger = getLogger(__name__) def index_mwmbl_crawl_data(): nlp = spacy.load("en_core_web_sm") + titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts() + link_counts = json.load(open(LINK_COUNT_PATH)) TinyIndex.create(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) - with TinyIndex(Document, INDEX_PATH, 'w') as indexer: - titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts() - index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH) + index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, link_counts, MWMBL_CRAWL_TERMS_PATH) def get_mwmbl_crawl_titles_urls_and_extracts(): - input_queue = FSQueue(DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer()) + input_queue = FSQueue(TINYSEARCH_DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer()) input_queue.unlock_all() while True: try: diff --git a/mwmbl/indexer/index_glob.py b/mwmbl/indexer/index_glob.py deleted file mode 100644 index 9bd8b96..0000000 --- a/mwmbl/indexer/index_glob.py +++ /dev/null @@ -1,47 +0,0 @@ -import gzip -from glob import glob - -import bs4 -from spacy.lang.en import English - -from .index import tokenize -from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE -from .paths import INDEX_PATH, CRAWL_GLOB - - -def run(): - # TODO: item_factory argument is unfilled. - indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) - indexer.create() - nlp = English() - for path in glob(CRAWL_GLOB): - print("Path", path) - with gzip.open(path, 'rt') as html_file: - url = html_file.readline().strip() - content = html_file.read() - - if indexer.document_indexed(url): - print("Page exists, skipping", url) - continue - - cleaned_text = clean(content) - try: - title = bs4.BeautifulSoup(content, features="lxml").find('title').string - except AttributeError: - title = cleaned_text[:80] - tokens = tokenize(nlp, cleaned_text) - print("URL", url) - print("Tokens", tokens) - print("Title", title) - indexer.index(tokens, url, title) - - -if __name__ == '__main__': - run() - - -def clean(content): - text = justext.justext(content, justext.get_stoplist("English")) - pars = [par.text for par in text if not par.is_boilerplate] - cleaned_text = ' '.join(pars) - return cleaned_text \ No newline at end of file diff --git a/mwmbl/indexer/index_queue.py b/mwmbl/indexer/index_queue.py deleted file mode 100644 index f048e28..0000000 --- a/mwmbl/indexer/index_queue.py +++ /dev/null @@ -1,31 +0,0 @@ -""" -Index items in the file-system queue -""" -from spacy.lang.en import English - -from .fsqueue import FSQueue, ZstdJsonSerializer -from .index import index_titles_urls_and_extracts -from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE -from .paths import DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, INDEX_PATH - - -def get_queue_items(): - titles_queue = FSQueue(DATA_DIR, DOMAINS_TITLES_QUEUE_NAME, ZstdJsonSerializer()) - titles_queue.unlock_all() - while True: - items_id, items = titles_queue.get() - for item in items: - if item['title'] is None: - continue - yield item['title'], item['url'] - - -def index_queue_items(): - nlp = English() - with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer: - titles_and_urls = get_queue_items() - index_titles_urls_and_extracts(indexer, nlp, titles_and_urls) - - -if __name__ == '__main__': - index_queue_items() diff --git a/mwmbl/indexer/indexcc.py b/mwmbl/indexer/indexcc.py deleted file mode 100644 index 4f68025..0000000 --- a/mwmbl/indexer/indexcc.py +++ /dev/null @@ -1,49 +0,0 @@ -""" -Index data downloaded from Common Crawl -""" -import logging -import sys -from logging import getLogger - -import spacy - -from .fsqueue import FSQueue, GzipJsonRowSerializer, FSQueueError -from .index import index_titles_urls_and_extracts -from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE, Document -from .paths import INDEX_PATH, DATA_DIR, COMMON_CRAWL_TERMS_PATH - - -logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) -logger = getLogger(__name__) - - -def index_common_craw_data(): - nlp = spacy.load("en_core_web_sm") - - with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer: - titles_urls_and_extracts = get_common_crawl_titles_urls_and_extracts() - index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, COMMON_CRAWL_TERMS_PATH) - - -def get_common_crawl_titles_urls_and_extracts(): - input_queue = FSQueue(DATA_DIR, 'search-items', GzipJsonRowSerializer()) - input_queue.unlock_all() - while True: - try: - next_item = input_queue.get() - except FSQueueError as e: - logger.exception(f'Error with item {e.item_id}') - input_queue.error(e.item_id) - continue - if next_item is None: - logger.info('Not more items to process, stopping') - break - item_id, items = next_item - logger.info(f'Processing item {item_id}') - for url, title, extract in items: - yield title, url, extract - input_queue.done(item_id) - - -if __name__ == '__main__': - index_common_craw_data() diff --git a/analyse/top_links.py b/mwmbl/indexer/links.py similarity index 74% rename from analyse/top_links.py rename to mwmbl/indexer/links.py index 012c44f..437c9b0 100644 --- a/analyse/top_links.py +++ b/mwmbl/indexer/links.py @@ -7,7 +7,7 @@ import json from collections import defaultdict from urllib.parse import urlparse -from analyse.analyse_crawled_domains import CRAWL_GLOB +from mwmbl.indexer.paths import CRAWL_GLOB, LINK_COUNT_PATH def get_urls(): @@ -30,9 +30,9 @@ def collect_links(urls): def run(): url_links = get_urls() collected = collect_links(url_links) - top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:1000] - for url, items in top: - print("URL", url, len(items)) + link_counts = {url: len(links) for url, links in collected.items()} + with open(LINK_COUNT_PATH, 'w') as output_file: + json.dump(link_counts, output_file, indent=2) if __name__ == '__main__': diff --git a/mwmbl/indexer/paths.py b/mwmbl/indexer/paths.py index c372021..5b02c41 100644 --- a/mwmbl/indexer/paths.py +++ b/mwmbl/indexer/paths.py @@ -3,24 +3,26 @@ from pathlib import Path HOME = os.getenv('HOME') -DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch' -COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv' +DATA_DIR = Path(os.environ['HOME']) / 'data' +TINYSEARCH_DATA_DIR = DATA_DIR / 'tinysearch' +COMMON_CRAWL_TERMS_PATH = TINYSEARCH_DATA_DIR / 'common-craw-terms.csv' -HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv') -CRAWL_PREFIX = 'crawl_' -CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*") -TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch') -TEST_TERMS_PATH = os.path.join(DATA_DIR, 'index-terms.csv') -WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2') -WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz') +TEST_INDEX_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'index-test.tinysearch') +TEST_TERMS_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'index-terms.csv') +WIKI_DATA_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2') +WIKI_TITLES_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'abstract-titles-sorted.txt.gz') -URLS_PATH = DATA_DIR / 'urls.sqlite3' +URLS_PATH = TINYSEARCH_DATA_DIR / 'urls.sqlite3' DOMAINS_QUEUE_NAME = 'domains-queue-fs' DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs' -DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz') +DOMAINS_PATH = os.path.join(TINYSEARCH_DATA_DIR, 'top10milliondomains.csv.gz') LOCAL_DATA_DIR = Path(__file__).parent.parent.parent / 'data' INDEX_PATH = LOCAL_DATA_DIR / 'index.tinysearch' MWMBL_CRAWL_TERMS_PATH = LOCAL_DATA_DIR / 'mwmbl-crawl-terms.csv' -TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json' +TOP_DOMAINS_JSON_PATH = TINYSEARCH_DATA_DIR / 'hn-top-domains.json' + +MWMBL_DATA_DIR = DATA_DIR / "mwmbl" +CRAWL_GLOB = str(MWMBL_DATA_DIR / "b2") + "/*/*/*/*/*/*.json.gz" +LINK_COUNT_PATH = MWMBL_DATA_DIR / 'crawl-counts.json' diff --git a/mwmbl/indexer/wiki.py b/mwmbl/indexer/wiki.py deleted file mode 100644 index 93ac1c7..0000000 --- a/mwmbl/indexer/wiki.py +++ /dev/null @@ -1,41 +0,0 @@ -""" -Index Wikipedia -""" -import gzip -import html -from urllib.parse import quote - -from spacy.lang.en import English - -from .index import index_titles_urls_and_extracts -from mwmbl.tinysearchengine.indexer import TinyIndexer, NUM_PAGES, PAGE_SIZE -from .paths import WIKI_TITLES_PATH, INDEX_PATH - -TEXT_TAGS = ['mediawiki', 'page', 'revision', 'text'] -TITLE_START = 'Wikipedia: ' -TITLE_END = '\n' - - -def index_wiki(): - nlp = English() - with TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer: - titles_and_urls = get_wiki_titles_and_urls() - index_titles_urls_and_extracts(indexer, nlp, titles_and_urls) - - -def get_wiki_titles_and_urls(): - start_len = len(TITLE_START) - end_len = len(TITLE_END) - with gzip.open(WIKI_TITLES_PATH, 'rt') as wiki_titles_file: - wiki_titles_file.readline() - for raw_title in wiki_titles_file: - assert raw_title.startswith(TITLE_START) - assert raw_title.endswith(TITLE_END) - title = raw_title[start_len:-end_len] - unescaped_title = html.unescape(title) - url = 'https://en.wikipedia.org/wiki/' + quote(unescaped_title.replace(' ', '_')) - yield unescaped_title, url - - -if __name__ == '__main__': - index_wiki() diff --git a/mwmbl/tinysearchengine/app.py b/mwmbl/tinysearchengine/app.py index 85078e3..d386adf 100644 --- a/mwmbl/tinysearchengine/app.py +++ b/mwmbl/tinysearchengine/app.py @@ -7,7 +7,7 @@ import uvicorn from mwmbl.tinysearchengine import create_app from mwmbl.tinysearchengine.completer import Completer from mwmbl.tinysearchengine.indexer import TinyIndex, Document -from mwmbl.tinysearchengine.rank import Ranker +from mwmbl.tinysearchengine.rank import HeuristicRanker logging.basicConfig() @@ -37,7 +37,7 @@ def main(): completer = Completer(terms) with TinyIndex(item_factory=Document, index_path=args.index) as tiny_index: - ranker = Ranker(tiny_index, completer) + ranker = HeuristicRanker(tiny_index, completer) # Initialize FastApi instance app = create_app.create(ranker) diff --git a/mwmbl/tinysearchengine/create_app.py b/mwmbl/tinysearchengine/create_app.py index 5d08e30..fc5e3db 100644 --- a/mwmbl/tinysearchengine/create_app.py +++ b/mwmbl/tinysearchengine/create_app.py @@ -10,7 +10,7 @@ from starlette.middleware.cors import CORSMiddleware from mwmbl.tinysearchengine.completer import Completer from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS from mwmbl.tinysearchengine.indexer import TinyIndex, Document -from mwmbl.tinysearchengine.rank import Ranker +from mwmbl.tinysearchengine.rank import HeuristicRanker logger = getLogger(__name__) @@ -18,7 +18,7 @@ logger = getLogger(__name__) SCORE_THRESHOLD = 0.25 -def create(ranker: Ranker): +def create(ranker: HeuristicRanker): app = FastAPI() # Allow CORS requests from any site diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py index 1beb1b0..e53b7a3 100644 --- a/mwmbl/tinysearchengine/indexer.py +++ b/mwmbl/tinysearchengine/indexer.py @@ -12,7 +12,7 @@ VERSION = 1 METADATA_CONSTANT = b'mwmbl-tiny-search' METADATA_SIZE = 4096 -NUM_PAGES = 76800 +NUM_PAGES = 128000 PAGE_SIZE = 4096 @@ -21,6 +21,7 @@ class Document: title: str url: str extract: str + score: float @dataclass diff --git a/mwmbl/tinysearchengine/ltr.py b/mwmbl/tinysearchengine/ltr.py new file mode 100644 index 0000000..77afbd8 --- /dev/null +++ b/mwmbl/tinysearchengine/ltr.py @@ -0,0 +1,53 @@ +""" +Learning to rank predictor +""" +from pandas import DataFrame, Series +from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin + +from mwmbl.tinysearchengine.rank import get_match_features, get_domain_score, score_match + + +class ThresholdPredictor(BaseEstimator, RegressorMixin): + def __init__(self, threshold: float, classifier: BaseEstimator): + self.threshold = threshold + self.classifier = classifier + + def fit(self, X, y) -> BaseEstimator: + y_thresholded = y > self.threshold + self.classifier.fit(X, y_thresholded) + return self + + def predict(self, X): + predictions = self.classifier.predict_proba(X) + if predictions.shape[1] == 2: + return predictions[:, 1] + return predictions + + +def get_match_features_as_series(item: Series): + terms = item['query'].lower().split() + features = {} + for part in ['title', 'extract', 'url']: + last_match_char, match_length, total_possible_match_length = get_match_features(terms, item[part], True, False) + features[f'last_match_char_{part}'] = last_match_char + features[f'match_length_{part}'] = match_length + features[f'total_possible_match_length_{part}'] = total_possible_match_length + # features[f'score_{part}'] = score_match(last_match_char, match_length, total_possible_match_length) + + features['num_terms'] = len(terms) + features['num_chars'] = len(' '.join(terms)) + features['domain_score'] = get_domain_score(item['url']) + features['item_score'] = item['score'] + return Series(features) + + +class FeatureExtractor(BaseEstimator, TransformerMixin): + def fit(self, X, y=None): + return self + + def transform(self, X: DataFrame, y=None): + features = X.apply(get_match_features_as_series, axis=1) + print("Features", features.columns) + return features + + diff --git a/mwmbl/tinysearchengine/ltr_rank.py b/mwmbl/tinysearchengine/ltr_rank.py new file mode 100644 index 0000000..b1bbe2d --- /dev/null +++ b/mwmbl/tinysearchengine/ltr_rank.py @@ -0,0 +1,35 @@ +import numpy as np +from pandas import DataFrame +from sklearn.base import BaseEstimator + +from mwmbl.tinysearchengine.completer import Completer +from mwmbl.tinysearchengine.indexer import Document, TinyIndex +from mwmbl.tinysearchengine.rank import Ranker, order_results + + +class LTRRanker(Ranker): + def __init__(self, model: BaseEstimator, tiny_index: TinyIndex, completer: Completer): + super().__init__(tiny_index, completer) + self.model = model + self.top_n = 20 + + def order_results(self, terms, pages: list[Document], is_complete): + if len(pages) == 0: + return [] + + top_pages = order_results(terms, pages, is_complete)[:self.top_n] + + query = ' '.join(terms) + data = { + 'query': [query] * len(top_pages), + 'url': [page.url for page in top_pages], + 'title': [page.title for page in top_pages], + 'extract': [page.extract for page in top_pages], + 'score': [page.score for page in top_pages], + } + + dataframe = DataFrame(data) + print("Ordering results", dataframe) + predictions = self.model.predict(dataframe) + indexes = np.argsort(predictions)[::-1] + return [top_pages[i] for i in indexes] diff --git a/mwmbl/tinysearchengine/rank.py b/mwmbl/tinysearchengine/rank.py index 8ef4942..b5d4807 100644 --- a/mwmbl/tinysearchengine/rank.py +++ b/mwmbl/tinysearchengine/rank.py @@ -1,4 +1,5 @@ import re +from abc import abstractmethod from logging import getLogger from operator import itemgetter from pathlib import Path @@ -14,27 +15,49 @@ from mwmbl.tinysearchengine.indexer import TinyIndex, Document logger = getLogger(__name__) -SCORE_THRESHOLD = 0.25 +SCORE_THRESHOLD = 0.0 -def _get_query_regex(terms, is_complete): +def _get_query_regex(terms, is_complete, is_url): if not terms: return '' + word_sep = r'\b' if is_url else '' if is_complete: - term_patterns = [rf'\b{term}\b' for term in terms] + term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms] else: - term_patterns = [rf'\b{term}\b' for term in terms[:-1]] + [rf'\b{terms[-1]}'] + term_patterns = [rf'{word_sep}{re.escape(term)}{word_sep}' for term in terms[:-1]] + [ + rf'{word_sep}{re.escape(terms[-1])}'] pattern = '|'.join(term_patterns) return pattern -def _score_result(terms, result: Document, is_complete: bool): - domain = urlparse(result.url).netloc - domain_score = DOMAINS.get(domain, 0.0) +def _score_result(terms, result: Document, is_complete: bool, max_score: float): + domain_score = get_domain_score(result.url) result_string = f"{result.title.strip()} {result.extract.strip()}" - query_regex = _get_query_regex(terms, is_complete) + last_match_char, match_length, total_possible_match_length = get_match_features( + terms, result_string, is_complete, False) + + match_score = score_match(last_match_char, match_length, total_possible_match_length) + score = 0.01 * domain_score + 0.99 * match_score + # score = (0.1 + 0.9*match_score) * (0.1 + 0.9*(result.score / max_score)) + # score = 0.01 * match_score + 0.99 * (result.score / max_score) + return score + + +def score_match(last_match_char, match_length, total_possible_match_length): + return (match_length + 1. / last_match_char) / (total_possible_match_length + 1) + + +def get_domain_score(url): + domain = urlparse(url).netloc + domain_score = DOMAINS.get(domain, 0.0) + return domain_score + + +def get_match_features(terms, result_string, is_complete, is_url): + query_regex = _get_query_regex(terms, is_complete, is_url) matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE)) match_strings = {x.group(0).lower() for x in matches} match_length = sum(len(x) for x in match_strings) @@ -48,12 +71,15 @@ def _score_result(terms, result: Document, is_complete: bool): seen_matches.add(value) total_possible_match_length = sum(len(x) for x in terms) - score = 0.1*domain_score + 0.9*(match_length + 1./last_match_char) / (total_possible_match_length + 1) - return score + return last_match_char, match_length, total_possible_match_length -def _order_results(terms: list[str], results: list[Document], is_complete: bool): - results_and_scores = [(_score_result(terms, result, is_complete), result) for result in results] +def order_results(terms: list[str], results: list[Document], is_complete: bool) -> list[Document]: + if len(results) == 0: + return [] + + max_score = max(result.score for result in results) + results_and_scores = [(_score_result(terms, result, is_complete, max_score), result) for result in results] ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True) filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD] return filtered_results @@ -64,11 +90,15 @@ class Ranker: self.tiny_index = tiny_index self.completer = completer + @abstractmethod + def order_results(self, terms, pages, is_complete): + pass + def search(self, s: str): - results, terms = self._get_results(s) + results, terms = self.get_results(s) is_complete = s.endswith(' ') - pattern = _get_query_regex(terms, is_complete) + pattern = _get_query_regex(terms, is_complete, False) formatted_results = [] for result in results: formatted_result = {} @@ -89,14 +119,14 @@ class Ranker: return formatted_results def complete(self, q: str): - ordered_results, terms = self._get_results(q) + ordered_results, terms = self.get_results(q) results = [item.title.replace("\n", "") + ' — ' + item.url.replace("\n", "") for item in ordered_results] if len(results) == 0: return [] return [q, results] - def _get_results(self, q): + def get_results(self, q): terms = [x.lower() for x in q.replace('.', ' ').split()] is_complete = q.endswith(' ') if len(terms) > 0 and not is_complete: @@ -115,5 +145,11 @@ class Ranker: pages.append(item) seen_items.add(item.title) - ordered_results = _order_results(terms, pages, is_complete) + ordered_results = self.order_results(terms, pages, is_complete) return ordered_results, terms + + +class HeuristicRanker(Ranker): + def order_results(self, terms, pages, is_complete): + return order_results(terms, pages, is_complete) + diff --git a/poetry.lock b/poetry.lock index cc3189d..a02238d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -65,7 +65,7 @@ lxml = ["lxml"] [[package]] name = "blis" -version = "0.7.5" +version = "0.7.6" description = "The Blis BLAS-like linear algebra library, as a self-contained C-extension." category = "main" optional = true @@ -135,7 +135,7 @@ pycparser = "*" [[package]] name = "charset-normalizer" -version = "2.0.11" +version = "2.0.12" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." category = "main" optional = true @@ -146,7 +146,7 @@ unicode_backport = ["unicodedata2"] [[package]] name = "click" -version = "8.0.3" +version = "8.0.4" description = "Composable command line interface toolkit" category = "main" optional = false @@ -185,7 +185,6 @@ spacy = ">=3.2.0,<3.3.0" [package.source] type = "url" url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0.tar.gz" - [[package]] name = "fastapi" version = "0.70.1" @@ -250,6 +249,14 @@ category = "main" optional = true python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" +[[package]] +name = "joblib" +version = "1.1.0" +description = "Lightweight pipelining with Python functions" +category = "main" +optional = false +python-versions = ">=3.6" + [[package]] name = "justext" version = "3.0.0" @@ -310,11 +317,11 @@ source = ["Cython (>=0.29.7)"] [[package]] name = "markupsafe" -version = "2.0.1" +version = "2.1.1" description = "Safely add untrusted strings to HTML/XML markup." category = "main" optional = true -python-versions = ">=3.6" +python-versions = ">=3.7" [[package]] name = "mmh3" @@ -334,11 +341,11 @@ python-versions = "*" [[package]] name = "numpy" -version = "1.21.1" +version = "1.22.3" description = "NumPy is the fundamental package for array computing with Python." category = "main" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" [[package]] name = "packaging" @@ -353,7 +360,7 @@ pyparsing = ">=2.0.2,<3.0.5 || >3.0.5" [[package]] name = "pandas" -version = "1.4.0" +version = "1.4.1" description = "Powerful data structures for data analysis, time series, and statistics" category = "main" optional = false @@ -489,11 +496,11 @@ sql = ["pandas (>=0.23.2)", "pyarrow (>=1.0.0)"] [[package]] name = "pytest" -version = "7.0.1" +version = "7.1.1" description = "pytest: simple powerful testing with Python" category = "dev" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" [package.dependencies] atomicwrites = {version = ">=1.0", markers = "sys_platform == \"win32\""} @@ -521,7 +528,7 @@ six = ">=1.5" [[package]] name = "pytz" -version = "2021.3" +version = "2022.1" description = "World timezone definitions, modern and historical" category = "main" optional = false @@ -566,7 +573,7 @@ use_chardet_on_py3 = ["chardet (>=3.0.2,<5)"] [[package]] name = "s3transfer" -version = "0.5.0" +version = "0.5.2" description = "An Amazon S3 Transfer Manager" category = "main" optional = true @@ -578,6 +585,37 @@ botocore = ">=1.12.36,<2.0a.0" [package.extras] crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"] +[[package]] +name = "scikit-learn" +version = "1.0.2" +description = "A set of python modules for machine learning and data mining" +category = "main" +optional = false +python-versions = ">=3.7" + +[package.dependencies] +joblib = ">=0.11" +numpy = ">=1.14.6" +scipy = ">=1.1.0" +threadpoolctl = ">=2.0.0" + +[package.extras] +benchmark = ["matplotlib (>=2.2.3)", "pandas (>=0.25.0)", "memory-profiler (>=0.57.0)"] +docs = ["matplotlib (>=2.2.3)", "scikit-image (>=0.14.5)", "pandas (>=0.25.0)", "seaborn (>=0.9.0)", "memory-profiler (>=0.57.0)", "sphinx (>=4.0.1)", "sphinx-gallery (>=0.7.0)", "numpydoc (>=1.0.0)", "Pillow (>=7.1.2)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] +examples = ["matplotlib (>=2.2.3)", "scikit-image (>=0.14.5)", "pandas (>=0.25.0)", "seaborn (>=0.9.0)"] +tests = ["matplotlib (>=2.2.3)", "scikit-image (>=0.14.5)", "pandas (>=0.25.0)", "pytest (>=5.0.1)", "pytest-cov (>=2.9.0)", "flake8 (>=3.8.2)", "black (>=21.6b0)", "mypy (>=0.770)", "pyamg (>=4.0.0)"] + +[[package]] +name = "scipy" +version = "1.8.0" +description = "SciPy: Scientific Library for Python" +category = "main" +optional = false +python-versions = ">=3.8,<3.11" + +[package.dependencies] +numpy = ">=1.17.3,<1.25.0" + [[package]] name = "six" version = "1.16.0" @@ -672,7 +710,7 @@ transformers = ["spacy-transformers (>=1.1.2,<1.2.0)"] [[package]] name = "spacy-legacy" -version = "3.0.8" +version = "3.0.9" description = "Legacy registered functions for spaCy backwards compatibility" category = "main" optional = true @@ -716,7 +754,7 @@ full = ["itsdangerous", "jinja2", "python-multipart", "pyyaml", "requests", "gra [[package]] name = "thinc" -version = "8.0.13" +version = "8.0.15" description = "A refreshing functional take on deep learning, compatible with your favorite libraries" category = "main" optional = true @@ -726,7 +764,7 @@ python-versions = ">=3.6" blis = ">=0.4.0,<0.8.0" catalogue = ">=2.0.4,<2.1.0" cymem = ">=2.0.2,<2.1.0" -murmurhash = ">=0.28.0,<1.1.0" +murmurhash = ">=1.0.2,<1.1.0" numpy = ">=1.15.0" preshed = ">=3.0.2,<3.1.0" pydantic = ">=1.7.4,<1.8 || >1.8,<1.8.1 || >1.8.1,<1.9.0" @@ -743,6 +781,7 @@ cuda111 = ["cupy-cuda111 (>=5.0.0b4)"] cuda112 = ["cupy-cuda112 (>=5.0.0b4)"] cuda113 = ["cupy-cuda113 (>=5.0.0b4)"] cuda114 = ["cupy-cuda114 (>=5.0.0b4)"] +cuda115 = ["cupy-cuda115 (>=5.0.0b4)"] cuda80 = ["cupy-cuda80 (>=5.0.0b4)"] cuda90 = ["cupy-cuda90 (>=5.0.0b4)"] cuda91 = ["cupy-cuda91 (>=5.0.0b4)"] @@ -750,7 +789,15 @@ cuda92 = ["cupy-cuda92 (>=5.0.0b4)"] datasets = ["ml-datasets (>=0.2.0,<0.3.0)"] mxnet = ["mxnet (>=1.5.1,<1.6.0)"] tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] -torch = ["torch (>=1.5.0)"] +torch = ["torch (>=1.6.0)"] + +[[package]] +name = "threadpoolctl" +version = "3.1.0" +description = "threadpoolctl" +category = "main" +optional = false +python-versions = ">=3.6" [[package]] name = "tomli" @@ -762,7 +809,7 @@ python-versions = ">=3.7" [[package]] name = "tqdm" -version = "4.62.3" +version = "4.63.0" description = "Fast, Extensible Progress Meter" category = "main" optional = true @@ -795,7 +842,7 @@ test = ["shellingham (>=1.3.0,<2.0.0)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov ( [[package]] name = "typing-extensions" -version = "4.0.1" +version = "4.1.1" description = "Backported and Experimental Type Hints for Python 3.6+" category = "main" optional = false @@ -811,14 +858,14 @@ python-versions = ">=3.6" [[package]] name = "urllib3" -version = "1.26.8" +version = "1.26.9" description = "HTTP library with thread-safe connection pooling, file post, and more." category = "main" optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <4" [package.extras] -brotli = ["brotlipy (>=0.6.0)"] +brotli = ["brotlicffi (>=0.8.0)", "brotli (>=1.0.9)", "brotlipy (>=0.6.0)"] secure = ["pyOpenSSL (>=0.14)", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "certifi", "ipaddress"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] @@ -876,8 +923,8 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx [metadata] lock-version = "1.1" -python-versions = "^3.10" -content-hash = "edb2d4bc50cb09ac5f7ba311d5238eb2deeab1d12f479067cc7239e3232bf6c9" +python-versions = ">=3.10,<3.11" +content-hash = "be01dba545a5f118cb4d1cd2726c0b9fadbba44684fcd4af2bef2e812e28da93" [metadata.files] anyio = [ @@ -901,22 +948,22 @@ beautifulsoup4 = [ {file = "beautifulsoup4-4.10.0.tar.gz", hash = "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891"}, ] blis = [ - {file = "blis-0.7.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5812a7c04561ae7332cf730f57d9f82cbd12c5f86a5bfad66ee244e51d06266d"}, - {file = "blis-0.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eecfce3d8fce61dede7b0ae0dffa461c22072437b6cde85587db0c1aa75b450"}, - {file = "blis-0.7.5-cp310-cp310-win_amd64.whl", hash = "sha256:0e476931f0d5703a21c77e7f69b8ebdeeea493fc7858a86f627ac2b376a12c8d"}, - {file = "blis-0.7.5-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:5966ddf3bce84aa7bb09ce4ca059309602fa63280a5d5e5365bb2a294bd5a138"}, - {file = "blis-0.7.5-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9034dabce4e42e3a1a7b99cc6de430484c8c369e51556ee8d47a53c085de681"}, - {file = "blis-0.7.5-cp36-cp36m-win_amd64.whl", hash = "sha256:730952f74adb0fa7dde9f1bc11249d5a64f3a3a9cf7dfa23b189a4b767bdf2d0"}, - {file = "blis-0.7.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2339cb19594134775bda8b86f23a893828fc7e8d63f09ba9a15f30b2b16c966c"}, - {file = "blis-0.7.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5023781272e0b2868be2f92017aa6836557990f1ca5ba2af5e9f5a0acf04fd8a"}, - {file = "blis-0.7.5-cp37-cp37m-win_amd64.whl", hash = "sha256:65ba723821cc57eb4227eb8dd05c57fff23d97f826d4325b316cd8a63aac8d6a"}, - {file = "blis-0.7.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ad4af690c37a5953d3aea660ad89b636bfbb80ca1470995554670ca2143f0cb2"}, - {file = "blis-0.7.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cf11c233ea5c2d30683e7c9641c5dc4cd76ed0f64755ba3321dfb8db39feb316"}, - {file = "blis-0.7.5-cp38-cp38-win_amd64.whl", hash = "sha256:31401da283ed42905f0fbf2f8b88ea424c6a911482426f84b5b88c54d382e4d1"}, - {file = "blis-0.7.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0c185979f8f528d634f5548b8cd84ab0366d340c27c039ad3937fab186c1c252"}, - {file = "blis-0.7.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8345bd04777557ef385e2f2d1f14a19d53b2ea9ca5fe107a2cdc50d7bafb8eb2"}, - {file = "blis-0.7.5-cp39-cp39-win_amd64.whl", hash = "sha256:66204a19e38986645940c887498c7b5520efb5bbc6526bf1b8a58f7d3eb37da0"}, - {file = "blis-0.7.5.tar.gz", hash = "sha256:833e01e9eaff4c01aa6e049bbc1e6acb9eca6ee513d7b35b5bf135d49705ad33"}, + {file = "blis-0.7.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:904532b38e8f93c97ba9639a0462f5a827e9a8e9fb0aaee441cbbf6d847a5bc0"}, + {file = "blis-0.7.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4bf5549f0b54359f2186e4f7d1e75136c4f313e17596f6ce2de601a033d9d44"}, + {file = "blis-0.7.6-cp310-cp310-win_amd64.whl", hash = "sha256:d0b4f2b76d81f28d1402bf69c775a9dd6ad37058ceb3ffed3da44cd951855cdf"}, + {file = "blis-0.7.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:58d051f900b0ff4cdfb0b2b3b28fede1d26f7af0cb920f48b89b8185f8d740e2"}, + {file = "blis-0.7.6-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc7eb12219796d18f65797d1fa402b36a03de974848c05767bc03ba0d72c512d"}, + {file = "blis-0.7.6-cp36-cp36m-win_amd64.whl", hash = "sha256:f036561d1739787e9d02e4106c340a79a519d635200c13463031f39b6e234c0b"}, + {file = "blis-0.7.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:d1f44badd1d0b1aa1d68e0990e285b3dcf320a6120709318b1cdc9b0204b8ae8"}, + {file = "blis-0.7.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7027bcdc73a80d313c1423d9b6dc381c7253f08eca7b8453a7a6ba2c49c202f7"}, + {file = "blis-0.7.6-cp37-cp37m-win_amd64.whl", hash = "sha256:dd416a08f099644bd229667d2acdbfedc50d709f8c88b4d32363eb7ab7962e1b"}, + {file = "blis-0.7.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dcf2bb2a3223683eee7ee348c647566daadc1642a775f36ab52a8b62e6ad6a3"}, + {file = "blis-0.7.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:66115b899052ded91fedfe3091295eb4da81dc2115e292ab4c5dd97d9e458e75"}, + {file = "blis-0.7.6-cp38-cp38-win_amd64.whl", hash = "sha256:1ef5b9fd08fe4efb5679d60d9e61f2ca2c36158c90ba01cbf9e46e8be473212f"}, + {file = "blis-0.7.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:762714b1d6901d628c53a11072db932aeeb01b6df2a394ec240f0a051e4e8e8e"}, + {file = "blis-0.7.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:43789dc60d81692d5db4978d3eba1c9fa02e4d1e9eadad11000244609d924521"}, + {file = "blis-0.7.6-cp39-cp39-win_amd64.whl", hash = "sha256:6eb5553a9905bbc63eebeba4bf555bbabb47d029aaed8b0a4f0490a199dccba7"}, + {file = "blis-0.7.6.tar.gz", hash = "sha256:fe97b10f68a1c7b54c0e54beada84e18f4efb68dd40c906fb8748f5114743ec6"}, ] boto3 = [ {file = "boto3-1.20.20-py3-none-any.whl", hash = "sha256:6c173ffaf0604e34d6865edf7a9a71e1b3e79bd441b8b465ca4b2d44f840806d"}, @@ -987,12 +1034,12 @@ cffi = [ {file = "cffi-1.15.0.tar.gz", hash = "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954"}, ] charset-normalizer = [ - {file = "charset-normalizer-2.0.11.tar.gz", hash = "sha256:98398a9d69ee80548c762ba991a4728bfc3836768ed226b3945908d1a688371c"}, - {file = "charset_normalizer-2.0.11-py3-none-any.whl", hash = "sha256:2842d8f5e82a1f6aa437380934d5e1cd4fcf2003b06fed6940769c164a480a45"}, + {file = "charset-normalizer-2.0.12.tar.gz", hash = "sha256:2857e29ff0d34db842cd7ca3230549d1a697f96ee6d3fb071cfa6c7393832597"}, + {file = "charset_normalizer-2.0.12-py3-none-any.whl", hash = "sha256:6881edbebdb17b39b4eaaa821b438bf6eddffb4468cf344f09f89def34a8b1df"}, ] click = [ - {file = "click-8.0.3-py3-none-any.whl", hash = "sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3"}, - {file = "click-8.0.3.tar.gz", hash = "sha256:410e932b050f5eed773c4cda94de75971c89cdb3155a72a0831139a79e5ecb5b"}, + {file = "click-8.0.4-py3-none-any.whl", hash = "sha256:6a7a62563bbfabfda3a38f3023a1db4a35978c0abd76f6c9605ecd6554d6d9b1"}, + {file = "click-8.0.4.tar.gz", hash = "sha256:8458d7b1287c5fb128c90e23381cf99dcde74beaf6c7ff6384ce84d6fe090adb"}, ] colorama = [ {file = "colorama-0.4.4-py2.py3-none-any.whl", hash = "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"}, @@ -1041,6 +1088,10 @@ jmespath = [ {file = "jmespath-0.10.0-py2.py3-none-any.whl", hash = "sha256:cdf6525904cc597730141d61b36f2e4b8ecc257c420fa2f4549bac2c2d0cb72f"}, {file = "jmespath-0.10.0.tar.gz", hash = "sha256:b85d0567b8666149a93172712e68920734333c0ce7e89b78b3e987f71e5ed4f9"}, ] +joblib = [ + {file = "joblib-1.1.0-py2.py3-none-any.whl", hash = "sha256:f21f109b3c7ff9d95f8387f752d0d9c34a02aa2f7060c2135f465da0e5160ff6"}, + {file = "joblib-1.1.0.tar.gz", hash = "sha256:4158fcecd13733f8be669be0683b96ebdbbd38d23559f54dca7205aea1bf1e35"}, +] justext = [ {file = "jusText-3.0.0-py2.py3-none-any.whl", hash = "sha256:86b48f5b1d99505acd072f5831def6cd3f1306043651c524a1c609e62e3544e4"}, {file = "jusText-3.0.0.tar.gz", hash = "sha256:7640e248218795f6be65f6c35fe697325a3280fcb4675d1525bcdff2b86faadf"}, @@ -1170,75 +1221,46 @@ lxml = [ {file = "lxml-4.6.4.tar.gz", hash = "sha256:daf9bd1fee31f1c7a5928b3e1059e09a8d683ea58fb3ffc773b6c88cb8d1399c"}, ] markupsafe = [ - {file = "MarkupSafe-2.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-win32.whl", hash = "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28"}, - {file = "MarkupSafe-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux1_i686.whl", hash = "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2010_i686.whl", hash = "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2010_x86_64.whl", hash = "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-win32.whl", hash = "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d"}, - {file = "MarkupSafe-2.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2010_i686.whl", hash = "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2010_x86_64.whl", hash = "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-win32.whl", hash = "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415"}, - {file = "MarkupSafe-2.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux1_i686.whl", hash = "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2010_i686.whl", hash = "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2010_x86_64.whl", hash = "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-win32.whl", hash = "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64"}, - {file = "MarkupSafe-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux1_i686.whl", hash = "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2010_i686.whl", hash = "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2010_x86_64.whl", hash = "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-win32.whl", hash = "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74"}, - {file = "MarkupSafe-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8"}, - {file = "MarkupSafe-2.0.1.tar.gz", hash = "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:86b1f75c4e7c2ac2ccdaec2b9022845dbb81880ca318bb7a0a01fbf7813e3812"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f121a1420d4e173a5d96e47e9a0c0dcff965afdf1626d28de1460815f7c4ee7a"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a49907dd8420c5685cfa064a1335b6754b74541bbb3706c259c02ed65b644b3e"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:10c1bfff05d95783da83491be968e8fe789263689c02724e0c691933c52994f5"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7bd98b796e2b6553da7225aeb61f447f80a1ca64f41d83612e6139ca5213aa4"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b09bf97215625a311f669476f44b8b318b075847b49316d3e28c08e41a7a573f"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:694deca8d702d5db21ec83983ce0bb4b26a578e71fbdbd4fdcd387daa90e4d5e"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:efc1913fd2ca4f334418481c7e595c00aad186563bbc1ec76067848c7ca0a933"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-win32.whl", hash = "sha256:4a33dea2b688b3190ee12bd7cfa29d39c9ed176bda40bfa11099a3ce5d3a7ac6"}, + {file = "MarkupSafe-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:dda30ba7e87fbbb7eab1ec9f58678558fd9a6b8b853530e176eabd064da81417"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:671cd1187ed5e62818414afe79ed29da836dde67166a9fac6d435873c44fdd02"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3799351e2336dc91ea70b034983ee71cf2f9533cdff7c14c90ea126bfd95d65a"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e72591e9ecd94d7feb70c1cbd7be7b3ebea3f548870aa91e2732960fa4d57a37"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6fbf47b5d3728c6aea2abb0589b5d30459e369baa772e0f37a0320185e87c980"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d5ee4f386140395a2c818d149221149c54849dfcfcb9f1debfe07a8b8bd63f9a"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bcb3ed405ed3222f9904899563d6fc492ff75cce56cba05e32eff40e6acbeaa3"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:e1c0b87e09fa55a220f058d1d49d3fb8df88fbfab58558f1198e08c1e1de842a"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-win32.whl", hash = "sha256:8dc1c72a69aa7e082593c4a203dcf94ddb74bb5c8a731e4e1eb68d031e8498ff"}, + {file = "MarkupSafe-2.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:97a68e6ada378df82bc9f16b800ab77cbf4b2fada0081794318520138c088e4a"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:e8c843bbcda3a2f1e3c2ab25913c80a3c5376cd00c6e8c4a86a89a28c8dc5452"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0212a68688482dc52b2d45013df70d169f542b7394fc744c02a57374a4207003"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e576a51ad59e4bfaac456023a78f6b5e6e7651dcd383bcc3e18d06f9b55d6d1"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b9fe39a2ccc108a4accc2676e77da025ce383c108593d65cc909add5c3bd601"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96e37a3dc86e80bf81758c152fe66dbf60ed5eca3d26305edf01892257049925"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6d0072fea50feec76a4c418096652f2c3238eaa014b2f94aeb1d56a66b41403f"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:089cf3dbf0cd6c100f02945abeb18484bd1ee57a079aefd52cffd17fba910b88"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:6a074d34ee7a5ce3effbc526b7083ec9731bb3cbf921bbe1d3005d4d2bdb3a63"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-win32.whl", hash = "sha256:421be9fbf0ffe9ffd7a378aafebbf6f4602d564d34be190fc19a193232fd12b1"}, + {file = "MarkupSafe-2.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:fc7b548b17d238737688817ab67deebb30e8073c95749d55538ed473130ec0c7"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e04e26803c9c3851c931eac40c695602c6295b8d432cbe78609649ad9bd2da8a"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b87db4360013327109564f0e591bd2a3b318547bcef31b468a92ee504d07ae4f"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:99a2a507ed3ac881b975a2976d59f38c19386d128e7a9a18b7df6fff1fd4c1d6"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:56442863ed2b06d19c37f94d999035e15ee982988920e12a5b4ba29b62ad1f77"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3ce11ee3f23f79dbd06fb3d63e2f6af7b12db1d46932fe7bd8afa259a5996603"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:33b74d289bd2f5e527beadcaa3f401e0df0a89927c1559c8566c066fa4248ab7"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:43093fb83d8343aac0b1baa75516da6092f58f41200907ef92448ecab8825135"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:8e3dcf21f367459434c18e71b2a9532d96547aef8a871872a5bd69a715c15f96"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-win32.whl", hash = "sha256:d4306c36ca495956b6d568d276ac11fdd9c30a36f1b6eb928070dc5360b22e1c"}, + {file = "MarkupSafe-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:46d00d6cfecdde84d40e572d63735ef81423ad31184100411e6e3388d405e247"}, + {file = "MarkupSafe-2.1.1.tar.gz", hash = "sha256:7f91197cc9e48f989d12e4e6fbc46495c446636dfc81b9ccf50bb0ec74b91d4b"}, ] mmh3 = [ {file = "mmh3-3.0.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:23912dde2ad4f701926948dd8e79a0e42b000f73962806f153931f52985e1e07"}, @@ -1287,61 +1309,53 @@ murmurhash = [ {file = "murmurhash-1.0.6.tar.gz", hash = "sha256:00a5252b569d3f914b5bd0bce72d2efe9c0fb91a9703556ea1b608b141c68f2d"}, ] numpy = [ - {file = "numpy-1.21.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:38e8648f9449a549a7dfe8d8755a5979b45b3538520d1e735637ef28e8c2dc50"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fd7d7409fa643a91d0a05c7554dd68aa9c9bb16e186f6ccfe40d6e003156e33a"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:a75b4498b1e93d8b700282dc8e655b8bd559c0904b3910b144646dbbbc03e062"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1412aa0aec3e00bc23fbb8664d76552b4efde98fb71f60737c83efbac24112f1"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e46ceaff65609b5399163de5893d8f2a82d3c77d5e56d976c8b5fb01faa6b671"}, - {file = "numpy-1.21.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:c6a2324085dd52f96498419ba95b5777e40b6bcbc20088fddb9e8cbb58885e8e"}, - {file = "numpy-1.21.1-cp37-cp37m-win32.whl", hash = "sha256:73101b2a1fef16602696d133db402a7e7586654682244344b8329cdcbbb82172"}, - {file = "numpy-1.21.1-cp37-cp37m-win_amd64.whl", hash = "sha256:7a708a79c9a9d26904d1cca8d383bf869edf6f8e7650d85dbc77b041e8c5a0f8"}, - {file = "numpy-1.21.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:95b995d0c413f5d0428b3f880e8fe1660ff9396dcd1f9eedbc311f37b5652e16"}, - {file = "numpy-1.21.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:635e6bd31c9fb3d475c8f44a089569070d10a9ef18ed13738b03049280281267"}, - {file = "numpy-1.21.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4a3d5fb89bfe21be2ef47c0614b9c9c707b7362386c9a3ff1feae63e0267ccb6"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8a326af80e86d0e9ce92bcc1e65c8ff88297de4fa14ee936cb2293d414c9ec63"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:791492091744b0fe390a6ce85cc1bf5149968ac7d5f0477288f78c89b385d9af"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0318c465786c1f63ac05d7c4dbcecd4d2d7e13f0959b01b534ea1e92202235c5"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9a513bd9c1551894ee3d31369f9b07460ef223694098cf27d399513415855b68"}, - {file = "numpy-1.21.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:91c6f5fc58df1e0a3cc0c3a717bb3308ff850abdaa6d2d802573ee2b11f674a8"}, - {file = "numpy-1.21.1-cp38-cp38-win32.whl", hash = "sha256:978010b68e17150db8765355d1ccdd450f9fc916824e8c4e35ee620590e234cd"}, - {file = "numpy-1.21.1-cp38-cp38-win_amd64.whl", hash = "sha256:9749a40a5b22333467f02fe11edc98f022133ee1bfa8ab99bda5e5437b831214"}, - {file = "numpy-1.21.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d7a4aeac3b94af92a9373d6e77b37691b86411f9745190d2c351f410ab3a791f"}, - {file = "numpy-1.21.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d9e7912a56108aba9b31df688a4c4f5cb0d9d3787386b87d504762b6754fbb1b"}, - {file = "numpy-1.21.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:25b40b98ebdd272bc3020935427a4530b7d60dfbe1ab9381a39147834e985eac"}, - {file = "numpy-1.21.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:8a92c5aea763d14ba9d6475803fc7904bda7decc2a0a68153f587ad82941fec1"}, - {file = "numpy-1.21.1-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:05a0f648eb28bae4bcb204e6fd14603de2908de982e761a2fc78efe0f19e96e1"}, - {file = "numpy-1.21.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f01f28075a92eede918b965e86e8f0ba7b7797a95aa8d35e1cc8821f5fc3ad6a"}, - {file = "numpy-1.21.1-cp39-cp39-win32.whl", hash = "sha256:88c0b89ad1cc24a5efbb99ff9ab5db0f9a86e9cc50240177a571fbe9c2860ac2"}, - {file = "numpy-1.21.1-cp39-cp39-win_amd64.whl", hash = "sha256:01721eefe70544d548425a07c80be8377096a54118070b8a62476866d5208e33"}, - {file = "numpy-1.21.1-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:2d4d1de6e6fb3d28781c73fbde702ac97f03d79e4ffd6598b880b2d95d62ead4"}, - {file = "numpy-1.21.1.zip", hash = "sha256:dff4af63638afcc57a3dfb9e4b26d434a7a602d225b42d746ea7fe2edf1342fd"}, + {file = "numpy-1.22.3-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:92bfa69cfbdf7dfc3040978ad09a48091143cffb778ec3b03fa170c494118d75"}, + {file = "numpy-1.22.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8251ed96f38b47b4295b1ae51631de7ffa8260b5b087808ef09a39a9d66c97ab"}, + {file = "numpy-1.22.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:48a3aecd3b997bf452a2dedb11f4e79bc5bfd21a1d4cc760e703c31d57c84b3e"}, + {file = "numpy-1.22.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a3bae1a2ed00e90b3ba5f7bd0a7c7999b55d609e0c54ceb2b076a25e345fa9f4"}, + {file = "numpy-1.22.3-cp310-cp310-win32.whl", hash = "sha256:f950f8845b480cffe522913d35567e29dd381b0dc7e4ce6a4a9f9156417d2430"}, + {file = "numpy-1.22.3-cp310-cp310-win_amd64.whl", hash = "sha256:08d9b008d0156c70dc392bb3ab3abb6e7a711383c3247b410b39962263576cd4"}, + {file = "numpy-1.22.3-cp38-cp38-macosx_10_14_x86_64.whl", hash = "sha256:201b4d0552831f7250a08d3b38de0d989d6f6e4658b709a02a73c524ccc6ffce"}, + {file = "numpy-1.22.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f8c1f39caad2c896bc0018f699882b345b2a63708008be29b1f355ebf6f933fe"}, + {file = "numpy-1.22.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:568dfd16224abddafb1cbcce2ff14f522abe037268514dd7e42c6776a1c3f8e5"}, + {file = "numpy-1.22.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca688e1b9b95d80250bca34b11a05e389b1420d00e87a0d12dc45f131f704a1"}, + {file = "numpy-1.22.3-cp38-cp38-win32.whl", hash = "sha256:e7927a589df200c5e23c57970bafbd0cd322459aa7b1ff73b7c2e84d6e3eae62"}, + {file = "numpy-1.22.3-cp38-cp38-win_amd64.whl", hash = "sha256:07a8c89a04997625236c5ecb7afe35a02af3896c8aa01890a849913a2309c676"}, + {file = "numpy-1.22.3-cp39-cp39-macosx_10_14_x86_64.whl", hash = "sha256:2c10a93606e0b4b95c9b04b77dc349b398fdfbda382d2a39ba5a822f669a0123"}, + {file = "numpy-1.22.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:fade0d4f4d292b6f39951b6836d7a3c7ef5b2347f3c420cd9820a1d90d794802"}, + {file = "numpy-1.22.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5bfb1bb598e8229c2d5d48db1860bcf4311337864ea3efdbe1171fb0c5da515d"}, + {file = "numpy-1.22.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:97098b95aa4e418529099c26558eeb8486e66bd1e53a6b606d684d0c3616b168"}, + {file = "numpy-1.22.3-cp39-cp39-win32.whl", hash = "sha256:fdf3c08bce27132395d3c3ba1503cac12e17282358cb4bddc25cc46b0aca07aa"}, + {file = "numpy-1.22.3-cp39-cp39-win_amd64.whl", hash = "sha256:639b54cdf6aa4f82fe37ebf70401bbb74b8508fddcf4797f9fe59615b8c5813a"}, + {file = "numpy-1.22.3-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c34ea7e9d13a70bf2ab64a2532fe149a9aced424cd05a2c4ba662fd989e3e45f"}, + {file = "numpy-1.22.3.zip", hash = "sha256:dbc7601a3b7472d559dc7b933b18b4b66f9aa7452c120e87dfb33d02008c8a18"}, ] packaging = [ {file = "packaging-21.3-py3-none-any.whl", hash = "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522"}, {file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"}, ] pandas = [ - {file = "pandas-1.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:de62cf699122dcef175988f0714678e59c453dc234c5b47b7136bfd7641e3c8c"}, - {file = "pandas-1.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:46a18572f3e1cb75db59d9461940e9ba7ee38967fa48dd58f4139197f6e32280"}, - {file = "pandas-1.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:73f7da2ccc38cc988b74e5400b430b7905db5f2c413ff215506bea034eaf832d"}, - {file = "pandas-1.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5229c95db3a907451dacebc551492db6f7d01743e49bbc862f4a6010c227d187"}, - {file = "pandas-1.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe454180ad31bbbe1e5d111b44443258730467f035e26b4e354655ab59405871"}, - {file = "pandas-1.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:784cca3f69cfd7f6bd7c7fdb44f2bbab17e6de55725e9ff36d6f382510dfefb5"}, - {file = "pandas-1.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:de8f8999864399529e8514a2e6bfe00fd161f0a667903655552ed12e583ae3cb"}, - {file = "pandas-1.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0f19504f2783526fb5b4de675ea69d68974e21c1624f4b92295d057a31d5ec5f"}, - {file = "pandas-1.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f045bb5c6bfaba536089573bf97d6b8ccc7159d951fe63904c395a5e486fbe14"}, - {file = "pandas-1.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5280d057ddae06fe4a3cd6aa79040b8c205cd6dd21743004cf8635f39ed01712"}, - {file = "pandas-1.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f3b74335390dda49f5d5089fab71958812bf56f42aa27663ee4c16d19f4f1c5"}, - {file = "pandas-1.4.0-cp38-cp38-win32.whl", hash = "sha256:51e5da3802aaee1aa4254108ffaf1129a15fb3810b7ce8da1ec217c655b418f5"}, - {file = "pandas-1.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:f103a5cdcd66cb18882ccdc18a130c31c3cfe3529732e7f10a8ab3559164819c"}, - {file = "pandas-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4a8d5a200f8685e7ea562b2f022c77ab7cb82c1ca5b240e6965faa6f84e5c1e9"}, - {file = "pandas-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b5af258c7b090cca7b742cf2bd67ad1919aa9e4e681007366c9edad2d6a3d42b"}, - {file = "pandas-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:156aac90dd7b303bf0b91bae96c0503212777f86c731e41929c571125d26c8e9"}, - {file = "pandas-1.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dad075089e17a72391de33021ad93720aff258c3c4b68c78e1cafce7e447045"}, - {file = "pandas-1.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d59c958d6b8f96fdf850c7821571782168d5acfe75ccf78cd8d1ac15fb921df"}, - {file = "pandas-1.4.0-cp39-cp39-win32.whl", hash = "sha256:55ec0e192eefa26d823fc25a1f213d6c304a3592915f368e360652994cdb8d9a"}, - {file = "pandas-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:23c04dab11f3c6359cfa7afa83d3d054a8f8c283d773451184d98119ef54da97"}, - {file = "pandas-1.4.0.tar.gz", hash = "sha256:cdd76254c7f0a1583bd4e4781fb450d0ebf392e10d3f12e92c95575942e37df5"}, + {file = "pandas-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3dfb32ed50122fe8c5e7f2b8d97387edd742cc78f9ec36f007ee126cd3720907"}, + {file = "pandas-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0259cd11e7e6125aaea3af823b80444f3adad6149ff4c97fef760093598b3e34"}, + {file = "pandas-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:96e9ece5759f9b47ae43794b6359bbc54805d76e573b161ae770c1ea59393106"}, + {file = "pandas-1.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:508c99debccd15790d526ce6b1624b97a5e1e4ca5b871319fb0ebfd46b8f4dad"}, + {file = "pandas-1.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6a7bbbb7950063bfc942f8794bc3e31697c020a14f1cd8905fc1d28ec674a01"}, + {file = "pandas-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:c614001129b2a5add5e3677c3a213a9e6fd376204cb8d17c04e84ff7dfc02a73"}, + {file = "pandas-1.4.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:4e1176f45981c8ccc8161bc036916c004ca51037a7ed73f2d2a9857e6dbe654f"}, + {file = "pandas-1.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bbb15ad79050e8b8d39ec40dd96a30cd09b886a2ae8848d0df1abba4d5502a67"}, + {file = "pandas-1.4.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:6d6ad1da00c7cc7d8dd1559a6ba59ba3973be6b15722d49738b2be0977eb8a0c"}, + {file = "pandas-1.4.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:358b0bc98a5ff067132d23bf7a2242ee95db9ea5b7bbc401cf79205f11502fd3"}, + {file = "pandas-1.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6105af6533f8b63a43ea9f08a2ede04e8f43e49daef0209ab0d30352bcf08bee"}, + {file = "pandas-1.4.1-cp38-cp38-win32.whl", hash = "sha256:04dd15d9db538470900c851498e532ef28d4e56bfe72c9523acb32042de43dfb"}, + {file = "pandas-1.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:1b384516dbb4e6aae30e3464c2e77c563da5980440fbdfbd0968e3942f8f9d70"}, + {file = "pandas-1.4.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:f02e85e6d832be37d7f16cf6ac8bb26b519ace3e5f3235564a91c7f658ab2a43"}, + {file = "pandas-1.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:0b1a13f647e4209ed7dbb5da3497891d0045da9785327530ab696417ef478f84"}, + {file = "pandas-1.4.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:19f7c632436b1b4f84615c3b127bbd7bc603db95e3d4332ed259dc815c9aaa26"}, + {file = "pandas-1.4.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7ea47ba1d6f359680130bd29af497333be6110de8f4c35b9211eec5a5a9630fa"}, + {file = "pandas-1.4.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e5a7a1e0ecaac652326af627a3eca84886da9e667d68286866d4e33f6547caf"}, + {file = "pandas-1.4.1-cp39-cp39-win32.whl", hash = "sha256:1d85d5f6be66dfd6d1d8d13b9535e342a2214260f1852654b19fa4d7b8d1218b"}, + {file = "pandas-1.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:3129a35d9dad1d80c234dd78f8f03141b914395d23f97cf92a366dcd19f8f8bf"}, + {file = "pandas-1.4.1.tar.gz", hash = "sha256:8db93ec98ac7cb5f8ac1420c10f5e3c43533153f253fe7fb6d891cf5aa2b80d2"}, ] pathy = [ {file = "pathy-0.6.1-py3-none-any.whl", hash = "sha256:25fd04cec6393661113086730ce69c789d121bea83ab1aa18452e8fd42faf29a"}, @@ -1451,16 +1465,16 @@ pyspark = [ {file = "pyspark-3.2.0.tar.gz", hash = "sha256:bfea06179edbfb4bc76a0f470bd3c38e12f00e1023e3ad0373558d07cff102ab"}, ] pytest = [ - {file = "pytest-7.0.1-py3-none-any.whl", hash = "sha256:9ce3ff477af913ecf6321fe337b93a2c0dcf2a0a1439c43f5452112c1e4280db"}, - {file = "pytest-7.0.1.tar.gz", hash = "sha256:e30905a0c131d3d94b89624a1cc5afec3e0ba2fbdb151867d8e0ebd49850f171"}, + {file = "pytest-7.1.1-py3-none-any.whl", hash = "sha256:92f723789a8fdd7180b6b06483874feca4c48a5c76968e03bb3e7f806a1869ea"}, + {file = "pytest-7.1.1.tar.gz", hash = "sha256:841132caef6b1ad17a9afde46dc4f6cfa59a05f9555aae5151f73bdf2820ca63"}, ] python-dateutil = [ {file = "python-dateutil-2.8.2.tar.gz", hash = "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86"}, {file = "python_dateutil-2.8.2-py2.py3-none-any.whl", hash = "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9"}, ] pytz = [ - {file = "pytz-2021.3-py2.py3-none-any.whl", hash = "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c"}, - {file = "pytz-2021.3.tar.gz", hash = "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326"}, + {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"}, + {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"}, ] pyyaml = [ {file = "PyYAML-6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53"}, @@ -1558,8 +1572,67 @@ requests = [ {file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"}, ] s3transfer = [ - {file = "s3transfer-0.5.0-py3-none-any.whl", hash = "sha256:9c1dc369814391a6bda20ebbf4b70a0f34630592c9aa520856bf384916af2803"}, - {file = "s3transfer-0.5.0.tar.gz", hash = "sha256:50ed823e1dc5868ad40c8dc92072f757aa0e653a192845c94a3b676f4a62da4c"}, + {file = "s3transfer-0.5.2-py3-none-any.whl", hash = "sha256:7a6f4c4d1fdb9a2b640244008e142cbc2cd3ae34b386584ef044dd0f27101971"}, + {file = "s3transfer-0.5.2.tar.gz", hash = "sha256:95c58c194ce657a5f4fb0b9e60a84968c808888aed628cd98ab8771fe1db98ed"}, +] +scikit-learn = [ + {file = "scikit-learn-1.0.2.tar.gz", hash = "sha256:b5870959a5484b614f26d31ca4c17524b1b0317522199dc985c3b4256e030767"}, + {file = "scikit_learn-1.0.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:da3c84694ff693b5b3194d8752ccf935a665b8b5edc33a283122f4273ca3e687"}, + {file = "scikit_learn-1.0.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:75307d9ea39236cad7eea87143155eea24d48f93f3a2f9389c817f7019f00705"}, + {file = "scikit_learn-1.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f14517e174bd7332f1cca2c959e704696a5e0ba246eb8763e6c24876d8710049"}, + {file = "scikit_learn-1.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9aac97e57c196206179f674f09bc6bffcd0284e2ba95b7fe0b402ac3f986023"}, + {file = "scikit_learn-1.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:d93d4c28370aea8a7cbf6015e8a669cd5d69f856cc2aa44e7a590fb805bb5583"}, + {file = "scikit_learn-1.0.2-cp37-cp37m-macosx_10_13_x86_64.whl", hash = "sha256:85260fb430b795d806251dd3bb05e6f48cdc777ac31f2bcf2bc8bbed3270a8f5"}, + {file = "scikit_learn-1.0.2-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:a053a6a527c87c5c4fa7bf1ab2556fa16d8345cf99b6c5a19030a4a7cd8fd2c0"}, + {file = "scikit_learn-1.0.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:245c9b5a67445f6f044411e16a93a554edc1efdcce94d3fc0bc6a4b9ac30b752"}, + {file = "scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:158faf30684c92a78e12da19c73feff9641a928a8024b4fa5ec11d583f3d8a87"}, + {file = "scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:08ef968f6b72033c16c479c966bf37ccd49b06ea91b765e1cc27afefe723920b"}, + {file = "scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:16455ace947d8d9e5391435c2977178d0ff03a261571e67f627c8fee0f9d431a"}, + {file = "scikit_learn-1.0.2-cp37-cp37m-win32.whl", hash = "sha256:2f3b453e0b149898577e301d27e098dfe1a36943f7bb0ad704d1e548efc3b448"}, + {file = "scikit_learn-1.0.2-cp37-cp37m-win_amd64.whl", hash = "sha256:46f431ec59dead665e1370314dbebc99ead05e1c0a9df42f22d6a0e00044820f"}, + {file = "scikit_learn-1.0.2-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:ff3fa8ea0e09e38677762afc6e14cad77b5e125b0ea70c9bba1992f02c93b028"}, + {file = "scikit_learn-1.0.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:9369b030e155f8188743eb4893ac17a27f81d28a884af460870c7c072f114243"}, + {file = "scikit_learn-1.0.2-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:7d6b2475f1c23a698b48515217eb26b45a6598c7b1840ba23b3c5acece658dbb"}, + {file = "scikit_learn-1.0.2-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:285db0352e635b9e3392b0b426bc48c3b485512d3b4ac3c7a44ec2a2ba061e66"}, + {file = "scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5cb33fe1dc6f73dc19e67b264dbb5dde2a0539b986435fdd78ed978c14654830"}, + {file = "scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b1391d1a6e2268485a63c3073111fe3ba6ec5145fc957481cfd0652be571226d"}, + {file = "scikit_learn-1.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc3744dabc56b50bec73624aeca02e0def06b03cb287de26836e730659c5d29c"}, + {file = "scikit_learn-1.0.2-cp38-cp38-win32.whl", hash = "sha256:a999c9f02ff9570c783069f1074f06fe7386ec65b84c983db5aeb8144356a355"}, + {file = "scikit_learn-1.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:7626a34eabbf370a638f32d1a3ad50526844ba58d63e3ab81ba91e2a7c6d037e"}, + {file = "scikit_learn-1.0.2-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:a90b60048f9ffdd962d2ad2fb16367a87ac34d76e02550968719eb7b5716fd10"}, + {file = "scikit_learn-1.0.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:7a93c1292799620df90348800d5ac06f3794c1316ca247525fa31169f6d25855"}, + {file = "scikit_learn-1.0.2-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:eabceab574f471de0b0eb3f2ecf2eee9f10b3106570481d007ed1c84ebf6d6a1"}, + {file = "scikit_learn-1.0.2-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:55f2f3a8414e14fbee03782f9fe16cca0f141d639d2b1c1a36779fa069e1db57"}, + {file = "scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:80095a1e4b93bd33261ef03b9bc86d6db649f988ea4dbcf7110d0cded8d7213d"}, + {file = "scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fa38a1b9b38ae1fad2863eff5e0d69608567453fdfc850c992e6e47eb764e846"}, + {file = "scikit_learn-1.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff746a69ff2ef25f62b36338c615dd15954ddc3ab8e73530237dd73235e76d62"}, + {file = "scikit_learn-1.0.2-cp39-cp39-win32.whl", hash = "sha256:e174242caecb11e4abf169342641778f68e1bfaba80cd18acd6bc84286b9a534"}, + {file = "scikit_learn-1.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:b54a62c6e318ddbfa7d22c383466d38d2ee770ebdb5ddb668d56a099f6eaf75f"}, +] +scipy = [ + {file = "scipy-1.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:87b01c7d5761e8a266a0fbdb9d88dcba0910d63c1c671bdb4d99d29f469e9e03"}, + {file = "scipy-1.8.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:ae3e327da323d82e918e593460e23babdce40d7ab21490ddf9fc06dec6b91a18"}, + {file = "scipy-1.8.0-cp310-cp310-macosx_12_0_universal2.macosx_10_9_x86_64.whl", hash = "sha256:16e09ef68b352d73befa8bcaf3ebe25d3941fe1a58c82909d5589856e6bc8174"}, + {file = "scipy-1.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c17a1878d00a5dd2797ccd73623ceca9d02375328f6218ee6d921e1325e61aff"}, + {file = "scipy-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:937d28722f13302febde29847bbe554b89073fbb924a30475e5ed7b028898b5f"}, + {file = "scipy-1.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:8f4d059a97b29c91afad46b1737274cb282357a305a80bdd9e8adf3b0ca6a3f0"}, + {file = "scipy-1.8.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:38aa39b6724cb65271e469013aeb6f2ce66fd44f093e241c28a9c6bc64fd79ed"}, + {file = "scipy-1.8.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:559a8a4c03a5ba9fe3232f39ed24f86457e4f3f6c0abbeae1fb945029f092720"}, + {file = "scipy-1.8.0-cp38-cp38-macosx_12_0_universal2.macosx_10_9_x86_64.whl", hash = "sha256:f4a6d3b9f9797eb2d43938ac2c5d96d02aed17ef170c8b38f11798717523ddba"}, + {file = "scipy-1.8.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:92b2c2af4183ed09afb595709a8ef5783b2baf7f41e26ece24e1329c109691a7"}, + {file = "scipy-1.8.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a279e27c7f4566ef18bab1b1e2c37d168e365080974758d107e7d237d3f0f484"}, + {file = "scipy-1.8.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad5be4039147c808e64f99c0e8a9641eb5d2fa079ff5894dcd8240e94e347af4"}, + {file = "scipy-1.8.0-cp38-cp38-win32.whl", hash = "sha256:3d9dd6c8b93a22bf9a3a52d1327aca7e092b1299fb3afc4f89e8eba381be7b59"}, + {file = "scipy-1.8.0-cp38-cp38-win_amd64.whl", hash = "sha256:5e73343c5e0d413c1f937302b2e04fb07872f5843041bcfd50699aef6e95e399"}, + {file = "scipy-1.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:de2e80ee1d925984c2504812a310841c241791c5279352be4707cdcd7c255039"}, + {file = "scipy-1.8.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:c2bae431d127bf0b1da81fc24e4bba0a84d058e3a96b9dd6475dfcb3c5e8761e"}, + {file = "scipy-1.8.0-cp39-cp39-macosx_12_0_universal2.macosx_10_9_x86_64.whl", hash = "sha256:723b9f878095ed994756fa4ee3060c450e2db0139c5ba248ee3f9628bd64e735"}, + {file = "scipy-1.8.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:011d4386b53b933142f58a652aa0f149c9b9242abd4f900b9f4ea5fbafc86b89"}, + {file = "scipy-1.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e6f0cd9c0bd374ef834ee1e0f0999678d49dcc400ea6209113d81528958f97c7"}, + {file = "scipy-1.8.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3720d0124aced49f6f2198a6900304411dbbeed12f56951d7c66ebef05e3df6"}, + {file = "scipy-1.8.0-cp39-cp39-win32.whl", hash = "sha256:3d573228c10a3a8c32b9037be982e6440e411b443a6267b067cac72f690b8d56"}, + {file = "scipy-1.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:bb7088e89cd751acf66195d2f00cf009a1ea113f3019664032d9075b1e727b6c"}, + {file = "scipy-1.8.0.tar.gz", hash = "sha256:31d4f2d6b724bc9a98e527b5849b8a7e589bf1ea630c33aa563eda912c9ff0bd"}, ] six = [ {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, @@ -1596,8 +1669,8 @@ spacy = [ {file = "spacy-3.2.1.tar.gz", hash = "sha256:f6ebac511627740a8ca2b117b91ef5515c8f0b2fb117a69ebe01d010dd4fc53c"}, ] spacy-legacy = [ - {file = "spacy-legacy-3.0.8.tar.gz", hash = "sha256:b4725c5c161f0685ab4fce3fc912bc68aefdb7e102ba9848e852bb5842256c2f"}, - {file = "spacy_legacy-3.0.8-py2.py3-none-any.whl", hash = "sha256:eb37a3540bb461b5fe9348d4976784f18a0e345982e41e2c5c7cd8229889e825"}, + {file = "spacy-legacy-3.0.9.tar.gz", hash = "sha256:4f7dcbc4e6c8e8cb4eadbb009f9c0a1a2a67442e0032c8d6776c9470c3759903"}, + {file = "spacy_legacy-3.0.9-py2.py3-none-any.whl", hash = "sha256:dfd58b0cc65b3596cb06f7b95e7bf4fff34668297c59eb179eb050db07b199df"}, ] spacy-loggers = [ {file = "spacy-loggers-1.0.1.tar.gz", hash = "sha256:17d0e249b2e6c6546c49fc6561a0a685f91a8edbf24a5b2b7759ead443c74654"}, @@ -1626,38 +1699,42 @@ starlette = [ {file = "starlette-0.16.0.tar.gz", hash = "sha256:e1904b5d0007aee24bdd3c43994be9b3b729f4f58e740200de1d623f8c3a8870"}, ] thinc = [ - {file = "thinc-8.0.13-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f818b9f012169a11beb3561c43dc52080588e50cf495733e492efab8b9b4135e"}, - {file = "thinc-8.0.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f520daf45b7f42a04363852df43be1b423ae42d9327709d74f6c3279b3f73778"}, - {file = "thinc-8.0.13-cp310-cp310-win_amd64.whl", hash = "sha256:2b217059c9e126220b77e7d6c9da56912c4e1eb4e8a11af14f17752e198e88cc"}, - {file = "thinc-8.0.13-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:0f956c693d180209075703072fd226a24408cbe80eb67bd3b6eea407f61cb283"}, - {file = "thinc-8.0.13-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a17d87469082b82c27b7d40dd86c793fc34c60f734209ee056cb02d7609f255b"}, - {file = "thinc-8.0.13-cp36-cp36m-win_amd64.whl", hash = "sha256:27ea64843d6af0f3de8c788ec2a00598a1e5b4d57aadb52845fa42e95e4038c2"}, - {file = "thinc-8.0.13-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:1f274bcaa781aaf1dba5eac7da7d88d9b0cb8c2fd7477647f0ca9d3221dfb958"}, - {file = "thinc-8.0.13-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d52a5621e1784af5c64af4cfa9b2924358ca07aafd99014c57a736cf032e42f7"}, - {file = "thinc-8.0.13-cp37-cp37m-win_amd64.whl", hash = "sha256:753f65e07860553551ed8806b934a74f26a4a50985d556ecd5c4ab50c29b3222"}, - {file = "thinc-8.0.13-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ffe0a4d74f2ba2819193a5d9179156256f44c69255d7ae286ce1861efcefbc64"}, - {file = "thinc-8.0.13-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b61f78f6f116d23438b034c3552804c9767c4165960b1d7e48f07b2e9a95afb0"}, - {file = "thinc-8.0.13-cp38-cp38-win_amd64.whl", hash = "sha256:ba576af211ad2b00af78ab3e24e689289b29af8a9e51619ad55fab86871d8652"}, - {file = "thinc-8.0.13-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:def8e96eddb5a098d07dcf8752266095e14a6cf5d056ff766e2cdc542eb63f02"}, - {file = "thinc-8.0.13-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce322b66053819654d0444877154a08ed01cf5b45c6b3c9763e59b78af4f6039"}, - {file = "thinc-8.0.13-cp39-cp39-win_amd64.whl", hash = "sha256:b3ae088f60d3dfe6a88c6be37548aae40023e46a718cffe3e43953b4f0ffc340"}, - {file = "thinc-8.0.13.tar.gz", hash = "sha256:47662a3ae33d445a77b6ea7b772444805c7bba8991f122e350daf72dedc8171a"}, + {file = "thinc-8.0.15-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0368c0b279492c0ed0b5b1bc79614e8a335ae1ccc3b1617de46f04eb74dc9a43"}, + {file = "thinc-8.0.15-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4276b64a8cd91197f30382c0874f59fa6c94ef533150d845b2f30998aae87cc"}, + {file = "thinc-8.0.15-cp310-cp310-win_amd64.whl", hash = "sha256:72cec290eb1b54ba6144b05d96f3247ea34eb41c66842961b05b408b93f2ba9b"}, + {file = "thinc-8.0.15-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:a4ee24a6505d63b6f0161f25d0f73f87ab569e0e1a9799a6baca97352788a91f"}, + {file = "thinc-8.0.15-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:376b196da6c69c8efaaf26fb99f6997543d80ea4bc5f4ab8600e9d1d521a7dc9"}, + {file = "thinc-8.0.15-cp36-cp36m-win_amd64.whl", hash = "sha256:bed92be72516b1511fecaf616ea31ff1c2e972a7ec4ad991c212f9b2f5c94183"}, + {file = "thinc-8.0.15-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:42641f021f4fdc47eaec4b9ff66246b153b9783ef24e2c266bf0f51eccd40db5"}, + {file = "thinc-8.0.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0557791e73865fa81f09623dd1f9b98b6d4ab80c63fca5f141530536516aac98"}, + {file = "thinc-8.0.15-cp37-cp37m-win_amd64.whl", hash = "sha256:f9ba4e4dac98e166950e004c87a0f57b8f8796ecd0e3b6973beb6febc20257ff"}, + {file = "thinc-8.0.15-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:489521ca3cca469d67432fc30f14c7c13c17320b179bf8e362319313feaafbb7"}, + {file = "thinc-8.0.15-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ddda1aa1432eef8bab5c83e4cf2020f1ed891771a6dd86729f1aa6078f25f2c"}, + {file = "thinc-8.0.15-cp38-cp38-win_amd64.whl", hash = "sha256:70781a0802fbb62a27217ccb80e744e80a5b43f9107ac596c5cd2dc9878ae258"}, + {file = "thinc-8.0.15-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a1f19dd9a7121d332d16446db39b4999abb4f040ce7c71bc86ea05664c86d361"}, + {file = "thinc-8.0.15-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ecd8eab82598b079e901f16567818dc955481326c01d84b819c3c05801b97e07"}, + {file = "thinc-8.0.15-cp39-cp39-win_amd64.whl", hash = "sha256:5d98e6b3bf220c1068442d09d7c34dd8e52bbdfa43ea32f773747c5909a1c011"}, + {file = "thinc-8.0.15.tar.gz", hash = "sha256:2e315020da85c3791e191fbf37c4a2433f57cf322e27380da0cd4de99d96053b"}, +] +threadpoolctl = [ + {file = "threadpoolctl-3.1.0-py3-none-any.whl", hash = "sha256:8b99adda265feb6773280df41eece7b2e6561b772d21ffd52e372f999024907b"}, + {file = "threadpoolctl-3.1.0.tar.gz", hash = "sha256:a335baacfaa4400ae1f0d8e3a58d6674d2f8828e3716bb2802c44955ad391380"}, ] tomli = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] tqdm = [ - {file = "tqdm-4.62.3-py2.py3-none-any.whl", hash = "sha256:8dd278a422499cd6b727e6ae4061c40b48fce8b76d1ccbf5d34fca9b7f925b0c"}, - {file = "tqdm-4.62.3.tar.gz", hash = "sha256:d359de7217506c9851b7869f3708d8ee53ed70a1b8edbba4dbcb47442592920d"}, + {file = "tqdm-4.63.0-py2.py3-none-any.whl", hash = "sha256:e643e071046f17139dea55b880dc9b33822ce21613b4a4f5ea57f202833dbc29"}, + {file = "tqdm-4.63.0.tar.gz", hash = "sha256:1d9835ede8e394bb8c9dcbffbca02d717217113adc679236873eeaac5bc0b3cd"}, ] typer = [ {file = "typer-0.4.0-py3-none-any.whl", hash = "sha256:d81169725140423d072df464cad1ff25ee154ef381aaf5b8225352ea187ca338"}, {file = "typer-0.4.0.tar.gz", hash = "sha256:63c3aeab0549750ffe40da79a1b524f60e08a2cbc3126c520ebf2eeaf507f5dd"}, ] typing-extensions = [ - {file = "typing_extensions-4.0.1-py3-none-any.whl", hash = "sha256:7f001e5ac290a0c0401508864c7ec868be4e701886d5b573a9528ed3973d9d3b"}, - {file = "typing_extensions-4.0.1.tar.gz", hash = "sha256:4ca091dea149f945ec56afb48dae714f21e8692ef22a395223bcd328961b6a0e"}, + {file = "typing_extensions-4.1.1-py3-none-any.whl", hash = "sha256:21c85e0fe4b9a155d0799430b0ad741cdce7e359660ccbd8b530613e8df88ce2"}, + {file = "typing_extensions-4.1.1.tar.gz", hash = "sha256:1a9462dcc3347a79b1f1c0271fbe79e844580bb598bafa1ed208b94da3cdcd42"}, ] ujson = [ {file = "ujson-4.3.0-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:3609e0514f6f721c6c9818b9374ec91b994e59fb193af2f924ca3f2f32009f1c"}, @@ -1706,8 +1783,8 @@ ujson = [ {file = "ujson-4.3.0.tar.gz", hash = "sha256:baee56eca35cb5fbe02c28bd9c0936be41a96fa5c0812d9d4b7edeb5c3d568a0"}, ] urllib3 = [ - {file = "urllib3-1.26.8-py2.py3-none-any.whl", hash = "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed"}, - {file = "urllib3-1.26.8.tar.gz", hash = "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"}, + {file = "urllib3-1.26.9-py2.py3-none-any.whl", hash = "sha256:44ece4d53fb1706f667c9bd1c648f5469a2ec925fcf3a776667042d645472c14"}, + {file = "urllib3-1.26.9.tar.gz", hash = "sha256:aabaf16477806a5e1dd19aa41f8c2b7950dd3c746362d7e3223dbe6de6ac448e"}, ] uvicorn = [ {file = "uvicorn-0.16.0-py3-none-any.whl", hash = "sha256:d8c839231f270adaa6d338d525e2652a0b4a5f4c2430b5c4ef6ae4d11776b0d2"}, diff --git a/pyproject.toml b/pyproject.toml index cd790c2..678e2b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,14 +5,16 @@ description = "" authors = ["Daoud Clarke "] [tool.poetry.dependencies] -python = "^3.10" -pandas = "^1.3.4" +python = ">=3.10,<3.11" +pandas = "^1.3.5" +scipy = "^1.8.0" +scikit-learn = "^1.0.2" zstandard = "^0.16.0" mmh3 = "^3.0.0" fastapi = "^0.70.1" uvicorn = "^0.16.0" -numpy = "==1.21.1" pyyaml = "==6.0" + # Optional dependencies do not get installed by default. Look under tool.poetry.extras section # to see which extras to use. botocore = {version= "==1.23.20", optional = true}