Don't index partial words

This commit is contained in:
Daoud Clarke 2022-01-30 14:30:02 +00:00
parent 2ef8304919
commit 01a21337a9
2 changed files with 5 additions and 5 deletions

View file

@@ -12,8 +12,6 @@ import pandas as pd
# PAGE_SIZE = 512
from mwmbl.tinysearchengine.indexer import TinyIndexer, Document, TokenizedDocument
NUM_INITIAL_TOKENS = 50
HTTP_START = 'http://'
HTTPS_START = 'https://'
BATCH_SIZE = 100
@@ -27,8 +25,10 @@ def is_content_token(nlp, token):
def tokenize(nlp, input_text):
    """Tokenize *input_text* and return the set of lower-cased content tokens.

    The text is first re-encoded with 'replace' error handling to strip any
    characters that cannot round-trip through UTF-8, then tokenized with the
    supplied ``nlp`` tokenizer. Tokens failing ``is_content_token`` are dropped.

    :param nlp: a spaCy-like language object providing ``tokenizer`` and ``vocab``
    :param input_text: the raw text to tokenize
    :return: set of lower-cased token strings
    """
    cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
    tokens = nlp.tokenizer(cleaned_text)
    # Fix: the original condition was `input_text.endswith('')`, which is
    # always True (every string ends with the empty string), so the
    # truncation below fired for EVERY input. It should only fire when the
    # text was visibly cut short — presumably marked with a trailing
    # ellipsis by the upstream extractor (TODO confirm the marker character).
    if input_text.endswith('…'):
        # Discard the last two tokens since there will likely be a word cut in two
        tokens = tokens[:-2]
    content_tokens = [token for token in tokens if is_content_token(nlp, token)]
    lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
    return lowered

View file

@@ -31,9 +31,9 @@ def create(tiny_index: TinyIndex):
def search(s: str):
results, terms = get_results(s)
pattern = get_query_regex(terms)
formatted_results = []
for result in results:
pattern = get_query_regex(terms)
formatted_result = {}
for content_type, content in [('title', result.title), ('extract', result.extract)]:
matches = re.finditer(pattern, content, re.IGNORECASE)