diff --git a/mwmbl/main.py b/mwmbl/main.py index 45f46a3..3b46cb0 100644 --- a/mwmbl/main.py +++ b/mwmbl/main.py @@ -16,9 +16,6 @@ from mwmbl.tinysearchengine.rank import HeuristicRanker logging.basicConfig() -TERMS_PATH = Path(__file__).parent.parent / 'resources' / 'mwmbl-crawl-terms.csv' - - def setup_args(): parser = argparse.ArgumentParser(description="mwmbl-tinysearchengine") parser.add_argument("--index", help="Path to the tinysearchengine index file", default="/data/index.tinysearch") @@ -42,9 +39,7 @@ def run(): except FileExistsError: print("Index already exists") - # Load term data - terms = pd.read_csv(TERMS_PATH) - completer = Completer(terms) + completer = Completer() with TinyIndex(item_factory=Document, index_path=args.index) as tiny_index: ranker = HeuristicRanker(tiny_index, completer) diff --git a/mwmbl/tinysearchengine/completer.py b/mwmbl/tinysearchengine/completer.py index 2336fb5..5ef52d5 100644 --- a/mwmbl/tinysearchengine/completer.py +++ b/mwmbl/tinysearchengine/completer.py @@ -1,12 +1,17 @@ from bisect import bisect_left, bisect_right -from datetime import datetime +from pathlib import Path import pandas as pd -from pandas import DataFrame + + +TERMS_PATH = Path(__file__).parent.parent.parent / 'resources' / 'mwmbl-crawl-terms.csv' class Completer: - def __init__(self, terms: DataFrame, num_matches: int = 3): + def __init__(self, num_matches: int = 3): + # Load term data + terms = pd.read_csv(TERMS_PATH) + terms_dict = terms.sort_values('term').set_index('term')['count'].to_dict() self.terms = list(terms_dict.keys()) self.counts = list(terms_dict.values()) @@ -26,12 +31,3 @@ class Completer: counts, terms = zip(*top_terms) return list(terms) - - -if __name__ == '__main__': - data = pd.read_csv('data/mwmbl-crawl-terms.csv') - completer = Completer(data) - start = datetime.now() - completer.complete('fa') - end = datetime.now() - print("Time", end - start)