Clean unicode when formatting result
This commit is contained in:
parent
dec7c4853d
commit
b5b37629ce
4 changed files with 9 additions and 9 deletions
|
@ -1,11 +1,12 @@
|
|||
import re
|
||||
|
||||
from mwmbl.tokenizer import tokenize
|
||||
from mwmbl.tokenizer import tokenize, clean_unicode
|
||||
|
||||
|
||||
def format_result_with_pattern(pattern, result):
|
||||
formatted_result = {}
|
||||
for content_type, content in [('title', result.title), ('extract', result.extract)]:
|
||||
for content_type, content_raw in [('title', result.title), ('extract', result.extract)]:
|
||||
content = clean_unicode(content_raw)
|
||||
matches = re.finditer(pattern, content, re.IGNORECASE)
|
||||
all_spans = [0] + sum((list(m.span()) for m in matches), []) + [len(content)]
|
||||
content_result = []
|
||||
|
|
|
@ -89,8 +89,3 @@ def get_url_error_status(item: Item):
|
|||
elif item.error.name == 'RobotsDenied':
|
||||
return URLStatus.ERROR_ROBOTS_DENIED
|
||||
return URLStatus.ERROR_OTHER
|
||||
|
||||
|
||||
# TODO: clean unicode at some point
|
||||
def clean_unicode(s: str) -> str:
    """Return *s* with any characters that cannot survive a UTF-8
    round-trip (e.g. lone surrogates) silently removed."""
    utf8_bytes = s.encode('utf-8', 'ignore')
    return utf8_bytes.decode('utf-8')
|
|
@ -1,5 +1,5 @@
|
|||
def tokenize(input_text):
|
||||
cleaned_text = input_text.encode('utf8', 'replace').decode('utf8')
|
||||
cleaned_text = clean_unicode(input_text)
|
||||
tokens = cleaned_text.lower().split()
|
||||
if input_text.endswith('…'):
|
||||
# Discard the last two tokens since there will likely be a word cut in two
|
||||
|
@ -11,3 +11,7 @@ def get_bigrams(num_bigrams, tokens):
|
|||
num_bigrams = min(num_bigrams, len(tokens) - 1)
|
||||
bigrams = [f'{tokens[i]} {tokens[i + 1]}' for i in range(num_bigrams)]
|
||||
return bigrams
|
||||
|
||||
|
||||
def clean_unicode(s: str) -> str:
    """Drop any code points that are not encodable as UTF-8.

    Encoding with errors='ignore' discards un-encodable characters
    (such as unpaired surrogates) instead of raising, and decoding
    the result back yields a clean str.
    """
    cleaned = s.encode('utf-8', errors='ignore')
    return cleaned.decode('utf-8')
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
from mwmbl.indexer.index_batches import clean_unicode
|
||||
from mwmbl.tokenizer import clean_unicode
|
||||
|
||||
|
||||
def test_clean_unicode():
|
||||
|
|
Loading…
Add table
Reference in a new issue