diff --git a/mwmbl/indexer/index.py b/mwmbl/indexer/index.py index 67ae164..fb52fe1 100644 --- a/mwmbl/indexer/index.py +++ b/mwmbl/indexer/index.py @@ -37,7 +37,7 @@ STOPWORDS = set("0,1,2,3,4,5,6,7,8,9,a,A,about,above,across,after,again,against, def tokenize(input_text): cleaned_text = input_text.encode('utf8', 'replace').decode('utf8') - tokens = cleaned_text.lower().replace('.', ' ').replace(',', ' ').split() + tokens = cleaned_text.lower().split() # tokens = nlp.tokenizer(cleaned_text) if input_text.endswith('…'): # Discard the last two tokens since there will likely be a word cut in two