From 3c75dd1a74ffc86308d7ffe98abe9ec1afeee217 Mon Sep 17 00:00:00 2001 From: Daoud Clarke Date: Sun, 30 Jan 2022 22:20:28 +0000 Subject: [PATCH] WIP: implement term completer --- mwmbl/indexer/paths.py | 5 +++-- mwmbl/tinysearchengine/completer.py | 27 +++++++++++++++++++++++++++ mwmbl/tinysearchengine/config.py | 1 + 3 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 mwmbl/tinysearchengine/completer.py diff --git a/mwmbl/indexer/paths.py b/mwmbl/indexer/paths.py index 75e81e8..c372021 100644 --- a/mwmbl/indexer/paths.py +++ b/mwmbl/indexer/paths.py @@ -5,7 +5,6 @@ HOME = os.getenv('HOME') DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch' COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv' -MWMBL_CRAWL_TERMS_PATH = DATA_DIR / 'mwmbl-craw-terms.csv' HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv') CRAWL_PREFIX = 'crawl_' @@ -20,6 +19,8 @@ DOMAINS_QUEUE_NAME = 'domains-queue-fs' DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs' DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz') -INDEX_PATH = Path(__file__).parent.parent.parent / 'data' / 'index.tinysearch' +LOCAL_DATA_DIR = Path(__file__).parent.parent.parent / 'data' +INDEX_PATH = LOCAL_DATA_DIR / 'index.tinysearch' +MWMBL_CRAWL_TERMS_PATH = LOCAL_DATA_DIR / 'mwmbl-crawl-terms.csv' TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json' diff --git a/mwmbl/tinysearchengine/completer.py b/mwmbl/tinysearchengine/completer.py new file mode 100644 index 0000000..eb88a80 --- /dev/null +++ b/mwmbl/tinysearchengine/completer.py @@ -0,0 +1,27 @@ +from bisect import bisect_left, bisect_right + +import pandas as pd +from pandas import DataFrame + + +class Completer: + def __init__(self, terms: DataFrame): + terms_dict = terms.sort_values('term').set_index('term')['count'].to_dict() + self.terms = list(terms_dict.keys()) + self.counts = list(terms_dict.values()) + print("Terms", self.terms[:100], self.counts[:100]) + + def complete(self, term): + term_length = len(term) + start = bisect_left(self.terms, term, key=lambda x: x[:term_length]) + end = bisect_right(self.terms, term, key=lambda x: x[:term_length]) + + print("Start", self.terms[start]) + print("End", self.terms[end]) + + + +if __name__ == '__main__': + data = pd.read_csv('data/mwmbl-crawl-terms.csv') + completer = Completer(data) + completer.complete('yo') diff --git a/mwmbl/tinysearchengine/config.py b/mwmbl/tinysearchengine/config.py index c506117..f0dd3d3 100644 --- a/mwmbl/tinysearchengine/config.py +++ b/mwmbl/tinysearchengine/config.py @@ -11,6 +11,7 @@ class ServerConfigModel(BaseModel): class IndexConfigModel(BaseModel): index_path: StrictStr = "data/index.tinysearch" + terms_path: StrictStr = "data/mwmbl-crawl-terms.csv" num_pages: StrictInt = 25600 page_size: StrictInt = 4096