Ver Fonte

Don't replace full stops and commas

Daoud Clarke há 3 anos
pai
commit
578b705609
1 ficheiro alterado com 1 adição e 1 exclusão
  1. 1 1
      mwmbl/indexer/index.py

+ 1 - 1
mwmbl/indexer/index.py

@@ -37,7 +37,7 @@ STOPWORDS = set("0,1,2,3,4,5,6,7,8,9,a,A,about,above,across,after,again,against,
 
 
 def tokenize(input_text):
 def tokenize(input_text):
     cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
     cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
-    tokens = cleaned_text.lower().replace('.', ' ').replace(',', ' ').split()
+    tokens = cleaned_text.lower().split()
     # tokens = nlp.tokenizer(cleaned_text)
     # tokens = nlp.tokenizer(cleaned_text)
     if input_text.endswith('…'):
     if input_text.endswith('…'):
         # Discard the last two tokens since there will likely be a word cut in two
         # Discard the last two tokens since there will likely be a word cut in two