WIP: implement term completer
This commit is contained in:
parent
01a21337a9
commit
3c75dd1a74
3 changed files with 31 additions and 2 deletions
|
@ -5,7 +5,6 @@ HOME = os.getenv('HOME')
|
|||
|
||||
DATA_DIR = Path(os.environ['HOME']) / 'data' / 'tinysearch'
|
||||
COMMON_CRAWL_TERMS_PATH = DATA_DIR / 'common-craw-terms.csv'
|
||||
MWMBL_CRAWL_TERMS_PATH = DATA_DIR / 'mwmbl-craw-terms.csv'
|
||||
|
||||
HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
|
||||
CRAWL_PREFIX = 'crawl_'
|
||||
|
@ -20,6 +19,8 @@ DOMAINS_QUEUE_NAME = 'domains-queue-fs'
|
|||
DOMAINS_TITLES_QUEUE_NAME = 'domains-title-queue-fs'
|
||||
DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
|
||||
|
||||
INDEX_PATH = Path(__file__).parent.parent.parent / 'data' / 'index.tinysearch'
|
||||
LOCAL_DATA_DIR = Path(__file__).parent.parent.parent / 'data'
|
||||
INDEX_PATH = LOCAL_DATA_DIR / 'index.tinysearch'
|
||||
MWMBL_CRAWL_TERMS_PATH = LOCAL_DATA_DIR / 'mwmbl-crawl-terms.csv'
|
||||
|
||||
TOP_DOMAINS_JSON_PATH = DATA_DIR / 'hn-top-domains.json'
|
||||
|
|
27
mwmbl/tinysearchengine/completer.py
Normal file
27
mwmbl/tinysearchengine/completer.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
from bisect import bisect_left, bisect_right
|
||||
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
|
||||
|
||||
class Completer:
|
||||
def __init__(self, terms: DataFrame):
|
||||
terms_dict = terms.sort_values('term').set_index('term')['count'].to_dict()
|
||||
self.terms = list(terms_dict.keys())
|
||||
self.counts = list(terms_dict.values())
|
||||
print("Terms", self.terms[:100], self.counts[:100])
|
||||
|
||||
def complete(self, term):
|
||||
term_length = len(term)
|
||||
start = bisect_left(self.terms, term, key=lambda x: x[:term_length])
|
||||
end = bisect_right(self.terms, term, key=lambda x: x[:term_length])
|
||||
|
||||
print("Start", self.terms[start])
|
||||
print("End", self.terms[end])
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual smoke test: load the crawl terms CSV (path is relative to the
    # current working directory) and run a single prefix lookup.
    terms_frame = pd.read_csv('data/mwmbl-crawl-terms.csv')
    term_completer = Completer(terms_frame)
    term_completer.complete('yo')
|
|
@ -11,6 +11,7 @@ class ServerConfigModel(BaseModel):
|
|||
|
||||
class IndexConfigModel(BaseModel):
|
||||
index_path: StrictStr = "data/index.tinysearch"
|
||||
terms_path: StrictStr = "data/mwmbl-crawl-terms.csv"
|
||||
num_pages: StrictInt = 25600
|
||||
page_size: StrictInt = 4096
|
||||
|
||||
|
|
Loading…
Reference in a new issue