Merge branch 'mwmbl:master' into add-branding

Colin Espinas 2022-02-04 20:51:08 +01:00 committed by ColinEspinas
commit 95c9bcfe3b
23 changed files with 295 additions and 507 deletions


@@ -1,4 +1,4 @@
FROM python:3.9-slim-bullseye as base
FROM python:3.10.2-bullseye as base
ENV PYTHONFAULTHANDLER=1 \
PYTHONHASHSEED=random \


@@ -0,0 +1,53 @@
"""
See how many unique URLs and root domains we have crawled.
"""
import glob
import gzip
import json
from collections import defaultdict, Counter
from urllib.parse import urlparse

CRAWL_GLOB = "../../data/mwmbl/b2/*/*/*/*/*/*.json.gz"


def get_urls():
    for path in glob.glob(CRAWL_GLOB):
        data = json.load(gzip.open(path))
        user = data['user_id_hash']
        for item in data['items']:
            yield user, item['url']


def analyse_urls(urls):
    url_set = defaultdict(list)
    domains = set()
    for user, url in urls:
        url_set[url].append(user)
        parsed_url = urlparse(url)
        path = parsed_url.path.strip('/')
        if path == '':
            domains.add(parsed_url.netloc)

    count = sum(len(x) for x in url_set.values())
    print("Root pages crawled", sorted(domains))
    find_worst_pages(url_set)
    print(f"Got {len(url_set)} URLs and {len(domains)} root pages from {count} items")
    url_list_size = len(json.dumps(list(url_set.keys())))
    print("Length of all URLs", url_list_size)


def find_worst_pages(url_set):
    worst = sorted(((len(users), url) for url, users in url_set.items()), reverse=True)[:50]
    for count, url in worst:
        print("Worst", count, url, Counter(url_set[url]))


def run():
    urls = get_urls()
    analyse_urls(urls)


if __name__ == '__main__':
    run()
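
For reference, the crawl batches these analysis scripts read look roughly like the sketch below. It is inferred only from the fields the scripts in this commit access (user_id_hash, items[].url, items[].links, items[].title, items[].extract); the values are made up and real batches may carry more fields.

batch = {
    'user_id_hash': '1234abcd',
    'items': [
        {
            'url': 'https://example.com/',
            'title': 'Example Domain',
            'extract': 'This domain is for use in illustrative examples.',
            'links': ['https://example.com/more'],
        },
    ],
}

# Same access pattern as get_urls() above.
for item in batch['items']:
    print(batch['user_id_hash'], item['url'], len(item['links']))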


@@ -0,0 +1,13 @@
import json

from mwmbl.indexer.paths import TOP_DOMAINS_JSON_PATH
from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS


def export_top_domains_to_json():
    with open(TOP_DOMAINS_JSON_PATH, 'w') as output_file:
        json.dump(DOMAINS, output_file, indent=2)


if __name__ == '__main__':
    export_top_domains_to_json()

analyse/top_links.py (new file, 39 lines)

@@ -0,0 +1,39 @@
"""
Analyse crawl data to find the most popular links
"""
import glob
import gzip
import json
from collections import defaultdict
from urllib.parse import urlparse

from analyse.analyse_crawled_domains import CRAWL_GLOB


def get_urls():
    for path in glob.glob(CRAWL_GLOB):
        data = json.load(gzip.open(path))
        for item in data['items']:
            url = item['url']
            domain = urlparse(url).hostname
            for link in item['links']:
                yield domain, link


def collect_links(urls):
    links = defaultdict(set)
    for url, link in urls:
        links[link].add(url)
    return links


def run():
    url_links = get_urls()
    collected = collect_links(url_links)
    top = sorted(collected.items(), key=lambda x: len(x[1]), reverse=True)[:1000]
    for url, items in top:
        print("URL", url, len(items))


if __name__ == '__main__':
    run()


@@ -58,6 +58,15 @@ class GzipJsonRowSerializer(Serializer):
        return [json.loads(line) for line in lines.strip().split('\n')]


class GzipJsonBlobSerializer(Serializer):
    def serialize(self, items: list[object]) -> bytes:
        raise NotImplementedError("Serializer not needed - blob is generated by browser extension")

    def deserialize(self, serialized_items: bytes) -> list[object]:
        data = gzip.decompress(serialized_items).decode('utf8')
        return json.loads(data)


class FSQueue:
    def __init__(self, directory: Union[str, Path], name: str, serializer: Serializer):
        self.directory = str(directory)
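
A quick way to sanity-check the new blob serializer is to round-trip a gzip-compressed JSON document, which is the format the browser extension uploads. A minimal sketch, assuming the mwmbl package is importable:

import gzip
import json

from mwmbl.indexer.fsqueue import GzipJsonBlobSerializer

# Build a blob the way the extension would: JSON, UTF-8 encoded, then gzipped.
blob = gzip.compress(json.dumps({'user_id_hash': 'abc', 'items': []}).encode('utf8'))
print(GzipJsonBlobSerializer().deserialize(blob))  # {'user_id_hash': 'abc', 'items': []}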


@@ -12,8 +12,6 @@ import pandas as pd
# PAGE_SIZE = 512
from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
NUM_INITIAL_TOKENS = 50
HTTP_START = 'http://'
HTTPS_START = 'https://'
BATCH_SIZE = 100
@@ -24,10 +22,13 @@ def is_content_token(nlp, token):
    return (lexeme.is_alpha or lexeme.is_digit) and not token.is_stop
def tokenize(nlp, cleaned_text):
def tokenize(nlp, input_text):
    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
    tokens = nlp.tokenizer(cleaned_text)
    content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
                      if is_content_token(nlp, token)]
    if input_text.endswith('…'):
        # Discard the last two tokens since there will likely be a word cut in two
        tokens = tokens[:-2]
    content_tokens = [token for token in tokens if is_content_token(nlp, token)]
    lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
    return lowered
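
The reasoning behind the new truncation check: crawled extracts are cut off mid-word and end in an ellipsis, so the final token is usually a word fragment. A small illustration, assuming spaCy's blank English tokenizer splits the trailing ellipsis the same way the en_core_web_sm tokenizer used here does:

import spacy

nlp = spacy.blank('en')
text = 'Mwmbl is a non-profit, ad-free, free-libre and free-lunch search eng…'
tokens = list(nlp.tokenizer(text))
# The last two tokens are typically the cut-off fragment and the ellipsis,
# which is why tokenize() drops them for truncated extracts.
print([t.text for t in tokens[-2:]])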


@@ -0,0 +1,46 @@
"""
Index data crawled through the Mwmbl crawler.
"""
from logging import getLogger

import spacy

from mwmbl.indexer.fsqueue import FSQueue, GzipJsonBlobSerializer, FSQueueError
from mwmbl.indexer.index import index_titles_urls_and_extracts
from mwmbl.indexer.paths import INDEX_PATH, MWMBL_CRAWL_TERMS_PATH, DATA_DIR
from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, NUM_PAGES, PAGE_SIZE

logger = getLogger(__name__)


def index_mwmbl_crawl_data():
    nlp = spacy.load("en_core_web_sm")
    with TinyIndexer(Document, INDEX_PATH, NUM_PAGES, PAGE_SIZE) as indexer:
        titles_urls_and_extracts = get_mwmbl_crawl_titles_urls_and_extracts()
        index_titles_urls_and_extracts(indexer, nlp, titles_urls_and_extracts, MWMBL_CRAWL_TERMS_PATH)


def get_mwmbl_crawl_titles_urls_and_extracts():
    input_queue = FSQueue(DATA_DIR, 'mwmbl-search-items', GzipJsonBlobSerializer())
    input_queue.unlock_all()
    while True:
        try:
            next_item = input_queue.get()
        except FSQueueError as e:
            logger.exception(f'Error with item {e.item_id}')
            input_queue.error(e.item_id)
            continue
        if next_item is None:
            logger.info('No more items to process, stopping')
            break
        item_id, item_data = next_item
        logger.info(f'Processing item {item_id}')
        for item in item_data['items']:
            yield item['title'], item['url'], item['extract']
        input_queue.done(item_id)


if __name__ == '__main__':
    index_mwmbl_crawl_data()


@@ -19,4 +19,8 @@ DOMAINS_QUEUE_NAME = 'domains-queue-fs'
DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
INDEX_PATH = Path(__file__).parent / 'data' / 'index.tinysearch'
LOCAL_DATA_DIR = Path(__file__).parent.parent.parent / 'data'
INDEX_PATH = LOCAL_DATA_DIR / 'index.tinysearch'
MWMBL_CRAWL_TERMS_PATH = LOCAL_DATA_DIR / 'mwmbl-crawl-terms.csv'
TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json'


@@ -1,8 +1,11 @@
import logging
import argparse
import pandas as pd
import uvicorn
from mwmbl.tinysearchengine import create_app
from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.indexer import TinyIndex, NUM_PAGES, PAGE_SIZE, Document
from mwmbl.tinysearchengine.config import parse_config_file
@@ -28,8 +31,12 @@ def main():
    """
    config, tiny_index = get_config_and_index()

    # Load term data
    terms = pd.read_csv(config.terms_path)
    completer = Completer(terms)

    # Initialize FastApi instance
    app = create_app.create(tiny_index)
    app = create_app.create(tiny_index, completer)

    # Initialize uvicorn server using global app instance and server config params
    uvicorn.run(app, **config.server_config.dict())


@@ -0,0 +1,37 @@
from bisect import bisect_left, bisect_right
from datetime import datetime

import pandas as pd
from pandas import DataFrame


class Completer:
    def __init__(self, terms: DataFrame, num_matches: int = 3):
        terms_dict = terms.sort_values('term').set_index('term')['count'].to_dict()
        self.terms = list(terms_dict.keys())
        self.counts = list(terms_dict.values())
        self.num_matches = num_matches
        print("Terms", self.terms[:100], self.counts[:100])

    def complete(self, term) -> list[str]:
        term_length = len(term)
        start_index = bisect_left(self.terms, term, key=lambda x: x[:term_length])
        end_index = bisect_right(self.terms, term, key=lambda x: x[:term_length])

        matching_terms = zip(self.counts[start_index:end_index], self.terms[start_index:end_index])
        top_terms = sorted(matching_terms, reverse=True)[:self.num_matches]
        print("Top terms, counts", top_terms)
        if not top_terms:
            return []

        counts, terms = zip(*top_terms)
        return list(terms)


if __name__ == '__main__':
    data = pd.read_csv('data/mwmbl-crawl-terms.csv')
    completer = Completer(data)
    start = datetime.now()
    completer.complete('fa')
    end = datetime.now()
    print("Time", end - start)


@@ -18,6 +18,7 @@ class IndexConfigModel(BaseModel):
class ConfigModel(BaseModel):
    server_config: ServerConfigModel = Field(default_factory=ServerConfigModel)
    index_config: IndexConfigModel = Field(default_factory=IndexConfigModel)
    terms_path: StrictStr = "data/mwmbl-crawl-terms.csv"


def parse_config_file(config_filename: str) -> ConfigModel:


@@ -6,20 +6,18 @@ from urllib.parse import urlparse
from fastapi import FastAPI
from starlette.middleware.cors import CORSMiddleware
from starlette.responses import FileResponse
from starlette.staticfiles import StaticFiles

from mwmbl.tinysearchengine.completer import Completer
from mwmbl.tinysearchengine.hn_top_domains_filtered import DOMAINS
from mwmbl.tinysearchengine.indexer import TinyIndex, Document

logger = getLogger(__name__)

STATIC_FILES_PATH = Path(__file__).parent / 'static'
SCORE_THRESHOLD = 0.25


def create(tiny_index: TinyIndex):
def create(tiny_index: TinyIndex, completer: Completer):
    app = FastAPI()

    # Allow CORS requests from any site
@@ -33,9 +31,10 @@ def create(tiny_index: TinyIndex):
    def search(s: str):
        results, terms = get_results(s)

        is_complete = s.endswith(' ')
        pattern = get_query_regex(terms, is_complete)

        formatted_results = []
        for result in results:
            pattern = get_query_regex(terms)
            formatted_result = {}
            for content_type, content in [('title', result.title), ('extract', result.extract)]:
                matches = re.finditer(pattern, content, re.IGNORECASE)
@@ -53,17 +52,23 @@ def create(tiny_index: TinyIndex):
        logger.info("Return results: %r", formatted_results)
        return formatted_results

    def get_query_regex(terms):
        term_patterns = [rf'\b{term}\b' for term in terms]
    def get_query_regex(terms, is_complete):
        if not terms:
            return ''

        if is_complete:
            term_patterns = [rf'\b{term}\b' for term in terms]
        else:
            term_patterns = [rf'\b{term}\b' for term in terms[:-1]] + [rf'\b{terms[-1]}']
        pattern = '|'.join(term_patterns)
        return pattern

    def score_result(terms, result: Document):
    def score_result(terms, result: Document, is_complete: bool):
        domain = urlparse(result.url).netloc
        domain_score = DOMAINS.get(domain, 0.0)
        result_string = f"{result.title.strip()} {result.extract.strip()}"
        query_regex = get_query_regex(terms)
        query_regex = get_query_regex(terms, is_complete)
        matches = list(re.finditer(query_regex, result_string, flags=re.IGNORECASE))
        match_strings = {x.group(0).lower() for x in matches}
        match_length = sum(len(x) for x in match_strings)
@@ -80,8 +85,8 @@ def create(tiny_index: TinyIndex):
        score = 0.1*domain_score + 0.9*(match_length + 1./last_match_char) / (total_possible_match_length + 1)
        return score

    def order_results(terms: list[str], results: list[Document]):
        results_and_scores = [(score_result(terms, result), result) for result in results]
    def order_results(terms: list[str], results: list[Document], is_complete: bool):
        results_and_scores = [(score_result(terms, result, is_complete), result) for result in results]
        ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True)
        filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD]
        return filtered_results
@@ -97,9 +102,15 @@ def create(tiny_index: TinyIndex):
    def get_results(q):
        terms = [x.lower() for x in q.replace('.', ' ').split()]
        is_complete = q.endswith(' ')
        if len(terms) > 0 and not is_complete:
            retrieval_terms = terms[:-1] + completer.complete(terms[-1])
        else:
            retrieval_terms = terms

        pages = []
        seen_items = set()
        for term in terms:
        for term in retrieval_terms:
            items = tiny_index.retrieve(term)
            if items is not None:
                for item in items:
@@ -108,12 +119,6 @@ def create(tiny_index: TinyIndex):
                    pages.append(item)
                    seen_items.add(item.title)

        ordered_results = order_results(terms, pages)
        ordered_results = order_results(terms, pages, is_complete)
        return ordered_results, terms

    @app.get('/')
    def index():
        return FileResponse(STATIC_FILES_PATH / 'index.html')

    app.mount('/', StaticFiles(directory=STATIC_FILES_PATH), name="static")

    return app
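
To see what the completion-aware highlighting change does, here is the new get_query_regex restated on its own (copied from the hunk above) with a sample query whose last term is still being typed; the example query and text are made up.

import re

def get_query_regex(terms, is_complete):
    if not terms:
        return ''
    if is_complete:
        term_patterns = [rf'\b{term}\b' for term in terms]
    else:
        term_patterns = [rf'\b{term}\b' for term in terms[:-1]] + [rf'\b{terms[-1]}']
    pattern = '|'.join(term_patterns)
    return pattern

pattern = get_query_regex(['python', 'tut'], is_complete=False)
print(pattern)  # \bpython\b|\btut -- the last term matches as a prefix
print([m.group(0) for m in re.finditer(pattern, 'Python tutorial', re.IGNORECASE)])
# ['Python', 'tut']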


@@ -41,7 +41,6 @@ class TinyIndexBase(Generic[T]):
        page = self.get_page(index)
        if page is None:
            return []
        # print("REtrieve", self.index_path, page)
        return self.convert_items(page)

    def _get_key_page_index(self, key):
@@ -53,25 +52,21 @@ class TinyIndexBase(Generic[T]):
        Get the page at index i, decompress and deserialise it using JSON
        """
        page_data = self.mmap[i * self.page_size:(i + 1) * self.page_size]
        zeros = page_data.count(b'\x00\x00\x00\x00') * 4
        try:
            decompressed_data = self.decompressor.decompress(page_data)
        except ZstdError:
            return None
        results = json.loads(decompressed_data.decode('utf8'))
        # print(f"Num results: {len(results)}, num zeros: {zeros}")
        return results

    def convert_items(self, items) -> List[T]:
        converted = [self.item_factory(*item) for item in items]
        # print("Converted", items, converted)
        return converted


class TinyIndex(TinyIndexBase[T]):
    def __init__(self, item_factory: Callable[..., T], index_path, num_pages, page_size):
        super().__init__(item_factory, num_pages, page_size)
        # print("REtrieve path", index_path)
        self.index_path = index_path
        self.index_file = open(self.index_path, 'rb')
        self.mmap = mmap(self.index_file.fileno(), 0, prot=PROT_READ)


@@ -1,82 +0,0 @@
html {
font-family: Verdana, Geneva, sans-serif;
background: #dcdced;
}
body {
font-size: 1.2rem;
}
p {
width: 100%;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
margin: 3px;
}
div {
margin-top: 15px;
margin-bottom: 15px;
}
.url {
margin-top: 0px;
font-size: 1rem;
}
.container {
width: 100%;
max-width: 1024px;
margin: 0 auto;
}
#search {
display: block;
width: 100%;
outline: none;
font-size: inherit;
border: 2px solid #ccc;
border-width: 4px;
border-radius: 50px;
padding: 10px;
padding-left: 35px;
margin-top: 50px;
}
a {
text-decoration: none;
color: #555555;
}
div .result:hover {
background: #f0f0ff;
}
div .selected {
background: #f0f0ff;
}
div .result {
padding: 10px;
}
span .term {
font-weight: bold;
}
.title {
color: #1a0daa;
/* color: black; */
}
.extract {
}


@@ -1,22 +0,0 @@
<html>
<head>
<meta name="referrer" content="no-referrer">
<title>Stoatally Different</title>
<meta name="viewport" content="width=device-width, initial-scale=1">
<link href="/index.css" rel="stylesheet">
<link rel="search"
type="application/opensearchdescription+xml"
title="Stoatally Different"
href="http://localhost:8000/plugin.xml">
<script src="/index.js"></script>
</head>
<body>
<div class="container">
<form autocomplete="off" id="search-form">
<input type="search" id="search" name="s" value="" autofocus/>
</form>
<div id="results"></div>
</div>
</body>
</html>


@@ -1,167 +0,0 @@
ts = {
selected: null,
numItems: 0
};
window.onload = (event) => {
const searchInput = document.getElementById('search');
const length = searchInput.value.length;
searchInput.setSelectionRange(length, length);
searchInput.oninput = debounce(e => {
console.log("Key", e.key);
console.log(searchInput.value);
const encodedValue = encodeURIComponent(searchInput.value);
fetch('/search?s=' + encodedValue).then(response => {
clearResults();
console.log(response);
response.json().then(content => {
console.log(content);
for (const [i, element] of content.entries()) {
addResult(element.title, element.extract, element.url, i);
};
ts.selected = null;
ts.numItems = content.length;
});
});
});
// Handle moving the selected item up and down
document.addEventListener('keydown', (e) => {
console.log("Key press", e);
if (e.key == 'ArrowDown') {
selectNextItem();
e.preventDefault();
} else if (e.key == 'ArrowUp') {
selectPreviousItem();
e.preventDefault();
} else if (e.key == 'Enter') {
// const form = document.getElementById('search-form');
// form.submit();
// event.preventDefault();
}
});
// Handle pressing enter
const form = document.getElementById('search-form');
form.addEventListener("submit", event => {
event.preventDefault();
clickSelected();
});
searchInput.focus();
};
function debounce(callback, timeout = 100){
let timer;
return (...args) => {
clearTimeout(timer);
timer = setTimeout(() => { callback.apply(this, args); }, timeout);
};
}
function selectNextItem() {
if (ts.selected === null) {
ts.selected = 0;
} else if (ts.selected < ts.numItems -1) {
ts.selected++;
}
updateSelected();
}
function clickSelected() {
if (ts.selected !== null) {
const selectedResult = document.getElementById(ts.selected.toString());
selectedResult.click();
}
}
function selectPreviousItem() {
if (ts.selected === null) {
return;
} else if (ts.selected > 0) {
ts.selected--;
} else if (ts.selected == 0) {
ts.selected = null;
}
updateSelected();
}
function updateSelected() {
const results = document.querySelectorAll('.result');
results.forEach(child => {
child.classList.remove('selected');
});
if (ts.selected !== null) {
const selectedResult = document.getElementById(ts.selected.toString());
selectedResult.classList.add('selected');
}
}
function clearResults() {
const results = document.getElementById('results');
results.innerHTML = '';
}
function addResult(title, extract, url, id) {
const par = document.createElement("p");
const titleText = createBoldedSpan(title);
titleText.classList.add('title');
const extractText = createBoldedSpan(extract);
extractText.classList.add('extract');
par.appendChild(titleText);
separator = document.createTextNode(' - ')
par.appendChild(separator);
par.appendChild(extractText);
const div = document.createElement("div");
div.classList.add('result');
div.id = id.toString();
const urlPar = document.createElement("p");
const urlText = document.createTextNode(url);
urlPar.appendChild(urlText);
urlPar.classList.add('url');
div.appendChild(urlPar);
div.appendChild(par);
const link = document.createElement("a");
link.appendChild(div);
link.href = url;
const results = document.getElementById('results');
results.appendChild(link);
}
function createBoldedSpan(title) {
span = document.createElement('span');
title.forEach(element => {
text = document.createTextNode(element.value);
if (element.is_bold) {
b = document.createElement('span');
b.classList.add('term');
b.appendChild(text);
span.appendChild(b);
} else {
span.appendChild(text);
}
});
return span;
}


@@ -1,48 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<title>%s</title>
<link rel="search"
type="application/opensearchdescription+xml"
title="Stoatally Different"
href="https://stoatally-different.appspot.com/plugin.xml">
<!--BOOTSTRAP-->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/css/bootstrap.min.css" integrity="sha384-/Y6pD6FV/Vv2HJnA6t+vslU6fwYXjCFtcEpHbNJ0lyAFsXTsjBbfaDjzALeQsN6M" crossorigin="anonymous">
<link href="typeaheadjs.css" rel="stylesheet">
<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
</head>
<body>
%s
<div id="remote">
<form action="/search">
<input class="typeahead" type="search" placeholder="Search" name="s">
<!--<input type="search" name="s" />-->
</form>
</div>
<!--BOOTSTRAP-->
<script src="https://code.jquery.com/jquery-3.2.1.slim.min.js" integrity="sha384-KJ3o2DKtIkvYIK3UENzmM7KCkRr/rE9/Qpg6aAZGJwFDMVNA/GpGFF93hXpG5KkN" crossorigin="anonymous"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.11.0/umd/popper.min.js" integrity="sha384-b/U6ypiBEHpOf/4+1nzFpr53nxSS+GLCkfwBdFNTxtclqqenISfwAzpKaMNFNmj4" crossorigin="anonymous"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-beta/js/bootstrap.min.js" integrity="sha384-h0AbiXch4ZDo7tp9hKZ4TsHbi047NrKGLO3SEJAg45jXxnGIfYzk4Si90RDIqNm1" crossorigin="anonymous"></script>
<script src="typeahead.js"></script>
<script>
var bestPictures = new Bloodhound({
datumTokenizer: Bloodhound.tokenizers.obj.whitespace('value'),
queryTokenizer: Bloodhound.tokenizers.whitespace,
remote: {
url: '/complete?q=%%QUERY',
wildcard: '%%QUERY'
}
});
$('#remote .typeahead').typeahead(null, {
name: 'best-pictures',
display: 'value',
source: bestPictures
});
</script>
</body>
</html>

File diff suppressed because one or more lines are too long


@@ -1,18 +0,0 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Search results</title>
</head>
<body>
<div id="remote">
<form action="/search">
<input class="typeahead" type="search" placeholder="Search" name="s">
<!--<input type="search" name="s" />-->
</form>
</div>
</body>
</html>


@@ -1,58 +0,0 @@
span.twitter-typeahead .tt-menu {
cursor: pointer;
}
.dropdown-menu, span.twitter-typeahead .tt-menu {
position: absolute;
top: 100%;
left: 0;
z-index: 1000;
display: none;
float: left;
min-width: 160px;
padding: 5px 0;
margin: 2px 0 0;
font-size: 1rem;
color: #373a3c;
text-align: left;
list-style: none;
background-color: #fff;
background-clip: padding-box;
border: 1px solid rgba(0, 0, 0, 0.15);
border-radius: 0.25rem; }
span.twitter-typeahead .tt-suggestion {
display: block;
width: 100%;
padding: 3px 20px;
clear: both;
font-weight: normal;
line-height: 1.5;
color: #373a3c;
text-align: inherit;
white-space: nowrap;
background: none;
border: 0; }
span.twitter-typeahead .tt-suggestion:focus, .dropdown-item:hover, span.twitter-typeahead .tt-suggestion:hover {
color: #2b2d2f;
text-decoration: none;
background-color: #f5f5f5; }
span.twitter-typeahead .active.tt-suggestion, span.twitter-typeahead .tt-suggestion.tt-cursor, span.twitter-typeahead .active.tt-suggestion:focus, span.twitter-typeahead .tt-suggestion.tt-cursor:focus, span.twitter-typeahead .active.tt-suggestion:hover, span.twitter-typeahead .tt-suggestion.tt-cursor:hover {
color: #fff;
text-decoration: none;
background-color: #0275d8;
outline: 0; }
span.twitter-typeahead .disabled.tt-suggestion, span.twitter-typeahead .disabled.tt-suggestion:focus, span.twitter-typeahead .disabled.tt-suggestion:hover {
color: #818a91; }
span.twitter-typeahead .disabled.tt-suggestion:focus, span.twitter-typeahead .disabled.tt-suggestion:hover {
text-decoration: none;
cursor: not-allowed;
background-color: transparent;
background-image: none;
filter: "progid:DXImageTransform.Microsoft.gradient(enabled = false)"; }
span.twitter-typeahead {
width: 100%; }
.input-group span.twitter-typeahead {
display: block !important; }
.input-group span.twitter-typeahead .tt-menu {
top: 2.375rem !important; }

File diff suppressed because one or more lines are too long

poetry.lock (generated, 113 lines changed)

@@ -1,6 +1,6 @@
[[package]]
name = "anyio"
version = "3.4.0"
version = "3.5.0"
description = "High level compatibility layer for multiple asynchronous event loop implementations"
category = "main"
optional = false
=2.8"">
@@ -11,17 +11,17 @@ idna = ">=2.8"
sniffio = ">=1.1"
[package.extras]
doc = ["sphinx-rtd-theme", "sphinx-autodoc-typehints (>=1.2.0)"]
doc = ["packaging", "sphinx-rtd-theme", "sphinx-autodoc-typehints (>=1.2.0)"]
test = ["coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "pytest (>=6.0)", "pytest-mock (>=3.6.1)", "trustme", "contextlib2", "uvloop (<0.15)", "mock (>=4)", "uvloop (>=0.15)"]
trio = ["trio (>=0.16)"]
[[package]]
name = "asgiref"
version = "3.4.1"
version = "3.5.0"
description = "ASGI specs, helper code, and adapters"
category = "main"
optional = false
python-versions = ">=3.6"
python-versions = ">=3.7"
[package.extras]
tests = ["pytest", "pytest-asyncio", "mypy (>=0.800)"]
@ -113,7 +113,7 @@ pycparser = "*"
[[package]]
name = "charset-normalizer"
version = "2.0.9"
version = "2.0.11"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main"
optional = true
@ -183,7 +183,7 @@ test = ["pytest (>=6.2.4,<7.0.0)", "pytest-cov (>=2.12.0,<4.0.0)", "mypy (==0.91
[[package]]
name = "h11"
version = "0.12.0"
version = "0.13.0"
description = "A pure-Python, bring-your-own-I/O implementation of HTTP/1.1"
category = "main"
optional = false
=2.0.2,<3.0.5 || >3.0.5"">
@@ -322,24 +322,19 @@ pyparsing = ">=2.0.2,<3.0.5 || >3.0.5"
[[package]]
name = "pandas"
version = "1.3.5"
version = "1.4.0"
description = "Powerful data structures for data analysis, time series, and statistics"
category = "main"
optional = false
python-versions = ">=3.7.1"
python-versions = ">=3.8"
[package.dependencies]
numpy = [
{version = ">=1.17.3", markers = "platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
{version = ">=1.19.2", markers = "platform_machine == \"aarch64\" and python_version < \"3.10\""},
{version = ">=1.20.0", markers = "platform_machine == \"arm64\" and python_version < \"3.10\""},
{version = ">=1.21.0", markers = "python_version >= \"3.10\""},
]
python-dateutil = ">=2.7.3"
pytz = ">=2017.3"
numpy = {version = ">=1.21.0", markers = "python_version >= \"3.10\""}
python-dateutil = ">=2.8.1"
pytz = ">=2020.1"
[package.extras]
test = ["hypothesis (>=3.58)", "pytest (>=6.0)", "pytest-xdist"]
test = ["hypothesis (>=5.5.3)", "pytest (>=6.0)", "pytest-xdist (>=1.31)"]
[[package]]
name = "pathy"
@ -415,7 +410,7 @@ email = ["email-validator (>=1.0.3)"]
[[package]]
name = "pyparsing"
version = "3.0.6"
version = "3.0.7"
description = "Python parsing module"
category = "main"
optional = true
@ -481,7 +476,7 @@ full = ["numpy"]
[[package]]
name = "requests"
version = "2.26.0"
version = "2.27.1"
description = "Python HTTP for Humans."
category = "main"
optional = true
=3.6"">
@@ -736,7 +731,7 @@ python-versions = ">=3.6"
[[package]]
name = "urllib3"
version = "1.26.7"
version = "1.26.8"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = true
@ -801,17 +796,17 @@ indexer = ["botocore", "boto3", "ujson", "warcio", "idna", "beautifulsoup4", "lx
[metadata]
lock-version = "1.1"
python-versions = "^3.9"
content-hash = "b45f9def8dcadfaa6ce23560b51bdee7f81c335598f6cc84d11fd3d596e3da5b"
python-versions = "^3.10"
content-hash = "b5af8ce9887d0cf69297180fbb4040e1522e4a3135f8b651415afb35f86124ef"
[metadata.files]
anyio = [
{file = "anyio-3.4.0-py3-none-any.whl", hash = "sha256:2855a9423524abcdd652d942f8932fda1735210f77a6b392eafd9ff34d3fe020"},
{file = "anyio-3.4.0.tar.gz", hash = "sha256:24adc69309fb5779bc1e06158e143e0b6d2c56b302a3ac3de3083c705a6ed39d"},
{file = "anyio-3.5.0-py3-none-any.whl", hash = "sha256:b5fa16c5ff93fa1046f2eeb5bbff2dad4d3514d6cda61d02816dba34fa8c3c2e"},
{file = "anyio-3.5.0.tar.gz", hash = "sha256:a0aeffe2fb1fdf374a8e4b471444f0f3ac4fb9f5a5b542b48824475e0042a5a6"},
]
asgiref = [
{file = "asgiref-3.4.1-py3-none-any.whl", hash = "sha256:ffc141aa908e6f175673e7b1b3b7af4fdb0ecb738fc5c8b88f69f055c2415214"},
{file = "asgiref-3.4.1.tar.gz", hash = "sha256:4ef1ab46b484e3c706329cedeff284a5d40824200638503f5768edb6de7d58e9"},
{file = "asgiref-3.5.0-py3-none-any.whl", hash = "sha256:88d59c13d634dcffe0510be048210188edd79aeccb6a6c9028cdad6f31d730a9"},
{file = "asgiref-3.5.0.tar.gz", hash = "sha256:2f8abc20f7248433085eda803936d98992f1343ddb022065779f37c5da0181d0"},
]
beautifulsoup4 = [
{file = "beautifulsoup4-4.10.0-py3-none-any.whl", hash = "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf"},
@@ -904,8 +899,8 @@ cffi = [
{file = "cffi-1.15.0.tar.gz", hash = "sha256:920f0d66a896c2d99f0adbb391f990a84091179542c205fa53ce5787aff87954"},
]
charset-normalizer = [
{file = "charset-normalizer-2.0.9.tar.gz", hash = "sha256:b0b883e8e874edfdece9c28f314e3dd5badf067342e42fb162203335ae61aa2c"},
{file = "charset_normalizer-2.0.9-py3-none-any.whl", hash = "sha256:1eecaa09422db5be9e29d7fc65664e6c33bd06f9ced7838578ba40d58bdf3721"},
{file = "charset-normalizer-2.0.11.tar.gz", hash = "sha256:98398a9d69ee80548c762ba991a4728bfc3836768ed226b3945908d1a688371c"},
{file = "charset_normalizer-2.0.11-py3-none-any.whl", hash = "sha256:2842d8f5e82a1f6aa437380934d5e1cd4fcf2003b06fed6940769c164a480a45"},
]
click = [
{file = "click-8.0.3-py3-none-any.whl", hash = "sha256:353f466495adaeb40b6b5f592f9f91cb22372351c84caeb068132442a4518ef3"},
@@ -939,8 +934,8 @@ fastapi = [
{file = "fastapi-0.70.1.tar.gz", hash = "sha256:21d03979b5336375c66fa5d1f3126c6beca650d5d2166fbb78345a30d33c8d06"},
]
h11 = [
{file = "h11-0.12.0-py3-none-any.whl", hash = "sha256:36a3cb8c0a032f56e2da7084577878a035d3b61d104230d4bd49c0c6b555a9c6"},
{file = "h11-0.12.0.tar.gz", hash = "sha256:47222cb6067e4a307d535814917cd98fd0a57b6788ce715755fa2b6c28b56042"},
{file = "h11-0.13.0-py3-none-any.whl", hash = "sha256:8ddd78563b633ca55346c8cd41ec0af27d3c79931828beffb46ce70a379e7442"},
{file = "h11-0.13.0.tar.gz", hash = "sha256:70813c1135087a248a4d38cc0e1a0181ffab2188141a93eaf567940c3957ff06"},
]
idna = [
{file = "idna-3.3-py3-none-any.whl", hash = "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff"},
@@ -1234,31 +1229,27 @@ packaging = [
{file = "packaging-21.3.tar.gz", hash = "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb"},
]
pandas = [
{file = "pandas-1.3.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:62d5b5ce965bae78f12c1c0df0d387899dd4211ec0bdc52822373f13a3a022b9"},
{file = "pandas-1.3.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:adfeb11be2d54f275142c8ba9bf67acee771b7186a5745249c7d5a06c670136b"},
{file = "pandas-1.3.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:60a8c055d58873ad81cae290d974d13dd479b82cbb975c3e1fa2cf1920715296"},
{file = "pandas-1.3.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fd541ab09e1f80a2a1760032d665f6e032d8e44055d602d65eeea6e6e85498cb"},
{file = "pandas-1.3.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2651d75b9a167cc8cc572cf787ab512d16e316ae00ba81874b560586fa1325e0"},
{file = "pandas-1.3.5-cp310-cp310-win_amd64.whl", hash = "sha256:aaf183a615ad790801fa3cf2fa450e5b6d23a54684fe386f7e3208f8b9bfbef6"},
{file = "pandas-1.3.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:344295811e67f8200de2390093aeb3c8309f5648951b684d8db7eee7d1c81fb7"},
{file = "pandas-1.3.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:552020bf83b7f9033b57cbae65589c01e7ef1544416122da0c79140c93288f56"},
{file = "pandas-1.3.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5cce0c6bbeb266b0e39e35176ee615ce3585233092f685b6a82362523e59e5b4"},
{file = "pandas-1.3.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d28a3c65463fd0d0ba8bbb7696b23073efee0510783340a44b08f5e96ffce0c"},
{file = "pandas-1.3.5-cp37-cp37m-win32.whl", hash = "sha256:a62949c626dd0ef7de11de34b44c6475db76995c2064e2d99c6498c3dba7fe58"},
{file = "pandas-1.3.5-cp37-cp37m-win_amd64.whl", hash = "sha256:8025750767e138320b15ca16d70d5cdc1886e8f9cc56652d89735c016cd8aea6"},
{file = "pandas-1.3.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:fe95bae4e2d579812865db2212bb733144e34d0c6785c0685329e5b60fcb85dd"},
{file = "pandas-1.3.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5f261553a1e9c65b7a310302b9dbac31cf0049a51695c14ebe04e4bfd4a96f02"},
{file = "pandas-1.3.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8b6dbec5f3e6d5dc80dcfee250e0a2a652b3f28663492f7dab9a24416a48ac39"},
{file = "pandas-1.3.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d3bc49af96cd6285030a64779de5b3688633a07eb75c124b0747134a63f4c05f"},
{file = "pandas-1.3.5-cp38-cp38-win32.whl", hash = "sha256:b6b87b2fb39e6383ca28e2829cddef1d9fc9e27e55ad91ca9c435572cdba51bf"},
{file = "pandas-1.3.5-cp38-cp38-win_amd64.whl", hash = "sha256:a395692046fd8ce1edb4c6295c35184ae0c2bbe787ecbe384251da609e27edcb"},
{file = "pandas-1.3.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bd971a3f08b745a75a86c00b97f3007c2ea175951286cdda6abe543e687e5f2f"},
{file = "pandas-1.3.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:37f06b59e5bc05711a518aa10beaec10942188dccb48918bb5ae602ccbc9f1a0"},
{file = "pandas-1.3.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c21778a688d3712d35710501f8001cdbf96eb70a7c587a3d5613573299fdca6"},
{file = "pandas-1.3.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3345343206546545bc26a05b4602b6a24385b5ec7c75cb6059599e3d56831da2"},
{file = "pandas-1.3.5-cp39-cp39-win32.whl", hash = "sha256:c69406a2808ba6cf580c2255bcf260b3f214d2664a3a4197d0e640f573b46fd3"},
{file = "pandas-1.3.5-cp39-cp39-win_amd64.whl", hash = "sha256:32e1a26d5ade11b547721a72f9bfc4bd113396947606e00d5b4a5b79b3dcb006"},
{file = "pandas-1.3.5.tar.gz", hash = "sha256:1e4285f5de1012de20ca46b188ccf33521bff61ba5c5ebd78b4fb28e5416a9f1"},
{file = "pandas-1.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:de62cf699122dcef175988f0714678e59c453dc234c5b47b7136bfd7641e3c8c"},
{file = "pandas-1.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:46a18572f3e1cb75db59d9461940e9ba7ee38967fa48dd58f4139197f6e32280"},
{file = "pandas-1.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:73f7da2ccc38cc988b74e5400b430b7905db5f2c413ff215506bea034eaf832d"},
{file = "pandas-1.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5229c95db3a907451dacebc551492db6f7d01743e49bbc862f4a6010c227d187"},
{file = "pandas-1.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe454180ad31bbbe1e5d111b44443258730467f035e26b4e354655ab59405871"},
{file = "pandas-1.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:784cca3f69cfd7f6bd7c7fdb44f2bbab17e6de55725e9ff36d6f382510dfefb5"},
{file = "pandas-1.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:de8f8999864399529e8514a2e6bfe00fd161f0a667903655552ed12e583ae3cb"},
{file = "pandas-1.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:0f19504f2783526fb5b4de675ea69d68974e21c1624f4b92295d057a31d5ec5f"},
{file = "pandas-1.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f045bb5c6bfaba536089573bf97d6b8ccc7159d951fe63904c395a5e486fbe14"},
{file = "pandas-1.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5280d057ddae06fe4a3cd6aa79040b8c205cd6dd21743004cf8635f39ed01712"},
{file = "pandas-1.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f3b74335390dda49f5d5089fab71958812bf56f42aa27663ee4c16d19f4f1c5"},
{file = "pandas-1.4.0-cp38-cp38-win32.whl", hash = "sha256:51e5da3802aaee1aa4254108ffaf1129a15fb3810b7ce8da1ec217c655b418f5"},
{file = "pandas-1.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:f103a5cdcd66cb18882ccdc18a130c31c3cfe3529732e7f10a8ab3559164819c"},
{file = "pandas-1.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4a8d5a200f8685e7ea562b2f022c77ab7cb82c1ca5b240e6965faa6f84e5c1e9"},
{file = "pandas-1.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b5af258c7b090cca7b742cf2bd67ad1919aa9e4e681007366c9edad2d6a3d42b"},
{file = "pandas-1.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:156aac90dd7b303bf0b91bae96c0503212777f86c731e41929c571125d26c8e9"},
{file = "pandas-1.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2dad075089e17a72391de33021ad93720aff258c3c4b68c78e1cafce7e447045"},
{file = "pandas-1.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1d59c958d6b8f96fdf850c7821571782168d5acfe75ccf78cd8d1ac15fb921df"},
{file = "pandas-1.4.0-cp39-cp39-win32.whl", hash = "sha256:55ec0e192eefa26d823fc25a1f213d6c304a3592915f368e360652994cdb8d9a"},
{file = "pandas-1.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:23c04dab11f3c6359cfa7afa83d3d054a8f8c283d773451184d98119ef54da97"},
{file = "pandas-1.4.0.tar.gz", hash = "sha256:cdd76254c7f0a1583bd4e4781fb450d0ebf392e10d3f12e92c95575942e37df5"},
]
pathy = [
{file = "pathy-0.6.1-py3-none-any.whl", hash = "sha256:25fd04cec6393661113086730ce69c789d121bea83ab1aa18452e8fd42faf29a"},
@@ -1353,8 +1344,8 @@ pydantic = [
{file = "pydantic-1.8.2.tar.gz", hash = "sha256:26464e57ccaafe72b7ad156fdaa4e9b9ef051f69e175dbbb463283000c05ab7b"},
]
pyparsing = [
{file = "pyparsing-3.0.6-py3-none-any.whl", hash = "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4"},
{file = "pyparsing-3.0.6.tar.gz", hash = "sha256:d9bdec0013ef1eb5a84ab39a3b3868911598afa494f5faa038647101504e2b81"},
{file = "pyparsing-3.0.7-py3-none-any.whl", hash = "sha256:a6c06a88f252e6c322f65faf8f418b16213b51bdfaece0524c1c1bc30c63c484"},
{file = "pyparsing-3.0.7.tar.gz", hash = "sha256:18ee9022775d270c55187733956460083db60b37d0d0fb357445f3094eed3eea"},
]
pyspark = [
{file = "pyspark-3.2.0.tar.gz", hash = "sha256:bfea06179edbfb4bc76a0f470bd3c38e12f00e1023e3ad0373558d07cff102ab"},
@@ -1459,8 +1450,8 @@ rapidfuzz = [
{file = "rapidfuzz-1.8.3.tar.gz", hash = "sha256:e85fa8110dc1271b7f193f225e5c6c63be81c3cf1a48648d01ed5d55955fbc4c"},
]
requests = [
{file = "requests-2.26.0-py2.py3-none-any.whl", hash = "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24"},
{file = "requests-2.26.0.tar.gz", hash = "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7"},
{file = "requests-2.27.1-py2.py3-none-any.whl", hash = "sha256:f22fa1e554c9ddfd16e6e41ac79759e17be9e492b3587efa038054674760e72d"},
{file = "requests-2.27.1.tar.gz", hash = "sha256:68d7c56fd5a8999887728ef304a6d12edc7be74f1cfa47714fc8b414525c9a61"},
]
s3transfer = [
{file = "s3transfer-0.5.0-py3-none-any.whl", hash = "sha256:9c1dc369814391a6bda20ebbf4b70a0f34630592c9aa520856bf384916af2803"},
@@ -1607,8 +1598,8 @@ ujson = [
{file = "ujson-4.3.0.tar.gz", hash = "sha256:baee56eca35cb5fbe02c28bd9c0936be41a96fa5c0812d9d4b7edeb5c3d568a0"},
]
urllib3 = [
{file = "urllib3-1.26.7-py2.py3-none-any.whl", hash = "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844"},
{file = "urllib3-1.26.7.tar.gz", hash = "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece"},
{file = "urllib3-1.26.8-py2.py3-none-any.whl", hash = "sha256:000ca7f471a233c2251c6c7023ee85305721bfdf18621ebff4fd17a8653427ed"},
{file = "urllib3-1.26.8.tar.gz", hash = "sha256:0e7c33d9a63e7ddfcb86780aac87befc2fbddf46c58dbb487e0855f7ceec283c"},
]
uvicorn = [
{file = "uvicorn-0.16.0-py3-none-any.whl", hash = "sha256:d8c839231f270adaa6d338d525e2652a0b4a5f4c2430b5c4ef6ae4d11776b0d2"},


@ -5,7 +5,7 @@ description = ""
authors = ["Daoud Clarke <daoud.clarke@gmail.com>"]
[tool.poetry.dependencies]
python = "^3.9"
python = "^3.10"
pandas = "^1.3.4"
zstandard = "^0.16.0"
mmh3 = "^3.0.0"