From 3c75dd1a74ffc86308d7ffe98abe9ec1afeee217 Mon Sep 17 00:00:00 2001
From: Daoud Clarke <daoud.clarke@gmail.com>
Date: Sun, 30 Jan 2022 22:20:28 +0000
Subject: [PATCH] WIP: implement term completer

---
 mwmbl/indexer/paths.py              |  5 +++--
 mwmbl/tinysearchengine/completer.py | 27 +++++++++++++++++++++++++++
 mwmbl/tinysearchengine/config.py    |  1 +
 3 files changed, 31 insertions(+), 2 deletions(-)
 create mode 100644 mwmbl/tinysearchengine/completer.py

diff --git a/mwmbl/indexer/paths.py b/mwmbl/indexer/paths.py
index 75e81e8..c372021 100644
--- a/mwmbl/indexer/paths.py
+++ b/mwmbl/indexer/paths.py
@@ -5,7 +5,6 @@ HOME = os.getenv('HOME')
 
 DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
 COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv'
-MWMBL_CRAWL_TERMS_PATH = DATA_DIR / 'mwmbl-craw-terms.csv'
 
 HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
@@ -20,6 +19,8 @@ DOMAINS_QUEUE_NAME = 'domains-queue-fs'
 DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
 DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
 
-INDEX_PATH = Path(__file__).parent.parent.parent / 'data' / 'index.tinysearch'
+LOCAL_DATA_DIR = Path(__file__).parent.parent.parent / 'data'  # 'data' dir three parents up from this file
+INDEX_PATH = LOCAL_DATA_DIR / 'index.tinysearch'
+MWMBL_CRAWL_TERMS_PATH = LOCAL_DATA_DIR / 'mwmbl-crawl-terms.csv'  # in LOCAL_DATA_DIR, not DATA_DIR
 
 TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json'
diff --git a/mwmbl/tinysearchengine/completer.py b/mwmbl/tinysearchengine/completer.py
new file mode 100644
index 0000000..eb88a80
--- /dev/null
+++ b/mwmbl/tinysearchengine/completer.py
@@ -0,0 +1,68 @@
+from bisect import bisect_left, bisect_right
+
+import pandas as pd
+from pandas import DataFrame
+
+
+class Completer:
+    """Prefix completer over a sorted list of terms.
+
+    ``terms`` holds the distinct terms in sorted order and ``counts`` holds
+    the ``count`` value of each term at the same index.
+    """
+
+    def __init__(self, terms: DataFrame):
+        """Build the sorted term list and the matching count list.
+
+        Args:
+            terms: table with a ``term`` and a ``count`` column. Rows with a
+                missing term are skipped because a missing value cannot be
+                sliced during the prefix search. If a term occurs in several
+                rows, a single count is kept for it.
+        """
+        terms_dict = (
+            terms.dropna(subset=['term'])
+            .sort_values('term')
+            .set_index('term')['count']
+            .to_dict()
+        )
+        self.terms = list(terms_dict.keys())
+        self.counts = list(terms_dict.values())
+        print("Terms", self.terms[:100], self.counts[:100])
+
+    def complete(self, term):
+        """Return the stored terms that start with ``term``.
+
+        The term at the start of the matching range and the term just past
+        its end are also printed, with ``None`` standing in when that
+        position lies past the last term.
+
+        Args:
+            term: prefix to look up; the empty string matches every term.
+
+        Returns:
+            The matching terms in sorted order (empty when nothing matches).
+        """
+        term_length = len(term)
+
+        def prefix_of(candidate):
+            return candidate[:term_length]
+
+        # Cutting every term down to the prefix length keeps the list sorted,
+        # so all matches sit in one contiguous run. ``key=`` needs Python 3.10+.
+        start = bisect_left(self.terms, term, key=prefix_of)
+        end = bisect_right(self.terms, term, key=prefix_of)
+
+        # Either index may equal len(self.terms) (empty prefix, or the last
+        # term matches), so indexing with it unguarded raises IndexError.
+        num_terms = len(self.terms)
+        print("Start", self.terms[start] if start < num_terms else None)
+        print("End", self.terms[end] if end < num_terms else None)
+        return self.terms[start:end]
+
+
+if __name__ == '__main__':
+    # NOTE(review): read_csv's default NA handling turns terms such as "null"
+    # into missing values; pass keep_default_na=False if they should be kept.
+    data = pd.read_csv('data/mwmbl-crawl-terms.csv')
+    completer = Completer(data)
+    completer.complete('yo')
diff --git a/mwmbl/tinysearchengine/config.py b/mwmbl/tinysearchengine/config.py
index c506117..f0dd3d3 100644
--- a/mwmbl/tinysearchengine/config.py
+++ b/mwmbl/tinysearchengine/config.py
@@ -11,6 +11,7 @@ class ServerConfigModel(BaseModel):
 
 class IndexConfigModel(BaseModel):
     index_path: StrictStr = "data/index.tinysearch"
+    terms_path: StrictStr = "data/mwmbl-crawl-terms.csv"
     num_pages: StrictInt = 25600
     page_size: StrictInt = 4096