completer.py 1.2 KB

123456789101112131415161718192021222324252627282930313233343536
import heapq
import logging
from bisect import bisect_left, bisect_right
from pathlib import Path

import pandas as pd
# CSV of crawl terms loaded by Completer.get_terms(); located in the
# 'resources' directory two levels above this file's own directory entry
# (i.e. a sibling of this module's parent directory).
TERMS_PATH = Path(__file__).parent.parent / 'resources' / 'mwmbl-crawl-terms.csv'
  5. class Completer:
  6. def __init__(self, num_matches: int = 3):
  7. # Load term data
  8. terms = self.get_terms()
  9. terms_dict = terms.sort_values('term').set_index('term')['count'].to_dict()
  10. self.terms = list(terms_dict.keys())
  11. self.counts = list(terms_dict.values())
  12. self.num_matches = num_matches
  13. print("Terms", self.terms[:100], self.counts[:100])
  14. def get_terms(self):
  15. return pd.read_csv(TERMS_PATH)
  16. def complete(self, term) -> list[str]:
  17. term_length = len(term)
  18. start_index = bisect_left(self.terms, term, key=lambda x: x[:term_length])
  19. end_index = bisect_right(self.terms, term, key=lambda x: x[:term_length])
  20. matching_terms = zip(self.counts[start_index:end_index], self.terms[start_index:end_index])
  21. top_terms = sorted(matching_terms, reverse=True)[:self.num_matches]
  22. print("Top terms, counts", top_terms)
  23. if not top_terms:
  24. return []
  25. counts, terms = zip(*top_terms)
  26. return list(terms)