WIP: implement term completer

This commit is contained in:
Daoud Clarke 2022-01-30 22:20:28 +00:00
parent 01a21337a9
commit 3c75dd1a74
3 changed files with 31 additions and 2 deletions

View file

@ -5,7 +5,6 @@ HOME = os.getenv('HOME')
DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv'
MWMBL_CRAWL_TERMS_PATH = DATA_DIR / 'mwmbl-craw-terms.csv'
HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
CRAWL_PREFIX = 'crawl_'
@ -20,6 +19,8 @@ DOMAINS_QUEUE_NAME = 'domains-queue-fs'
DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
INDEX_PATH = Path(__file__).parent.parent.parent / 'data' / 'index.tinysearch'
LOCAL_DATA_DIR = Path(__file__).parent.parent.parent / 'data'
INDEX_PATH = LOCAL_DATA_DIR / 'index.tinysearch'
MWMBL_CRAWL_TERMS_PATH = LOCAL_DATA_DIR / 'mwmbl-crawl-terms.csv'
TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json'

View file

@ -0,0 +1,27 @@
from bisect import bisect_left, bisect_right
import pandas as pd
from pandas import DataFrame
class Completer:
def __init__(self, terms: DataFrame):
terms_dict = terms.sort_values('term').set_index('term')['count'].to_dict()
self.terms = list(terms_dict.keys())
self.counts = list(terms_dict.values())
print("Terms", self.terms[:100], self.counts[:100])
def complete(self, term):
term_length = len(term)
start = bisect_left(self.terms, term, key=lambda x: x[:term_length])
end = bisect_right(self.terms, term, key=lambda x: x[:term_length])
print("Start", self.terms[start])
print("End", self.terms[end])
if __name__ == '__main__':
    # Manual smoke run: load the crawl terms CSV and look up one prefix.
    crawl_terms = pd.read_csv('data/mwmbl-crawl-terms.csv')
    Completer(crawl_terms).complete('yo')

View file

@ -11,6 +11,7 @@ class ServerConfigModel(BaseModel):
class IndexConfigModel(BaseModel):
index_path: StrictStr = "data/index.tinysearch"
terms_path: StrictStr = "data/mwmbl-crawl-terms.csv"
num_pages: StrictInt = 25600
page_size: StrictInt = 4096