diff --git a/pyproject.toml b/pyproject.toml index 43cd14d..c634fc9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,6 +6,16 @@ authors = ["Daoud Clarke "] [tool.poetry.dependencies] python = "^3.9" +pandas = "^1.3.4" +zstandard = "^0.16.0" +mmh3 = "^3.0.0" +fastapi = "^0.70.1" +uvicorn = "^0.16.0" + +# [tool.poetry.dependencies.en_core_web_sm] +# url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl" + +[tool.poetry.dev-dependencies] # botocore = "^1.23.20" # boto3 = "^1.20.20" # ujson = "^4.3.0" @@ -14,20 +24,10 @@ python = "^3.9" # beautifulsoup4 = "^4.10.0" # lxml = "^4.6.4" # jusText = "^3.0.0" -pandas = "^1.3.4" # pyspark = "^3.2.0" # langdetect = "^1.0.9" -zstandard = "^0.16.0" # spacy = "^3.2.1" -mmh3 = "^3.0.0" -fastapi = "^0.70.1" # Levenshtein = "^0.16.0" -uvicorn = "^0.16.0" - -# [tool.poetry.dependencies.en_core_web_sm] -# url= "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl" - -[tool.poetry.dev-dependencies] [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 679f600..0000000 --- a/setup.cfg +++ /dev/null @@ -1,24 +0,0 @@ -[metadata] -name = tiny-search-engine-daoudc -version = 0.0.1 -author = Daoud Clarke -author_email = daoud.clarke@gmail.com -description = Tiny Search Engine -long_description = file: README.md -long_description_content_type = text/markdown -# url = https://github.com/pypa/sampleproject -# project_urls = -# Bug Tracker = https://github.com/pypa/sampleproject/issues -# classifiers = -# Programming Language :: Python :: 3 -# License :: OSI Approved :: MIT License -# Operating System :: OS Independent - -[options] -package_dir = - = src -packages = find: -python_requires = >=3.9 - -[options.packages.find] -where = src diff --git a/tinysearchengine/create_app.py b/tinysearchengine/create_app.py index be8dd53..19c4a3b 100644 --- a/tinysearchengine/create_app.py +++ b/tinysearchengine/create_app.py @@ -63,16 +63,13 @@ def create(tiny_index: TinyIndex): last_match_char = match.span()[1] seen_matches.add(value) - # num_words = len(re.findall(r'\b\w+\b', result_string)) total_possible_match_length = sum(len(x) for x in terms) score = (match_length + 1./last_match_char) / (total_possible_match_length + 1) - # print("Score result", match_length, last_match_char, score, result.title) return score def order_results(terms: list[str], results: list[Document]): results_and_scores = [(score_result(terms, result), result) for result in results] ordered_results = sorted(results_and_scores, key=itemgetter(0), reverse=True) - # print("Ordered results", ordered_results) filtered_results = [result for score, result in ordered_results if score > SCORE_THRESHOLD] return filtered_results @@ -82,21 +79,15 @@ def create(tiny_index: TinyIndex): results = [item.title.replace("\n", "") + ' — ' + item.url.replace("\n", "") for item in ordered_results] if len(results) == 0: - # print("No results") return [] - # print("Results", results) return [q, results] - # TODO: why does 'leek and potato soup' result not get returned for 'potato soup' query? def get_results(q): terms = [x.lower() for x in q.replace('.', ' ').split()] - # completed = complete_term(terms[-1]) - # terms = terms[:-1] + [completed] pages = [] seen_items = set() for term in terms: items = tiny_index.retrieve(term) - print("Items", items) if items is not None: for item in items: if term in item.title.lower() or term in item.extract.lower():