Fix unicode encoding error
This commit is contained in:
parent
fee5cbb400
commit
680fe1ca0c
2 changed files with 21 additions and 1 deletion
|
@ -104,7 +104,8 @@ class IndexDatabase:
|
|||
"""
|
||||
|
||||
sorted_documents = sorted(documents, key=lambda x: x.url)
|
||||
data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
|
||||
data = [(document.url, clean_unicode(document.title), clean_unicode(document.extract),
|
||||
document.score, DocumentStatus.NEW.value)
|
||||
for document in sorted_documents]
|
||||
|
||||
print("Queueing documents", len(data))
|
||||
|
@ -176,3 +177,7 @@ class IndexDatabase:
|
|||
|
||||
with self.connection.cursor() as cursor:
|
||||
cursor.execute(sql, {'page_index': page_index})
|
||||
|
||||
|
||||
def clean_unicode(s: str) -> str:
    """Return *s* with every code point that cannot be UTF-8 encoded removed.

    The string is round-tripped through UTF-8 using the ``'ignore'`` error
    handler, so characters the codec rejects (for example a lone surrogate
    such as ``'\\ud83c'``) are silently dropped and all other text is
    returned unchanged.
    """
    # 'ignore' discards unencodable characters instead of raising
    # UnicodeEncodeError; decoding the result cannot fail because the bytes
    # were produced by the UTF-8 encoder itself.
    encoded = s.encode('utf-8', 'ignore')
    return encoded.decode('utf-8')
|
||||
|
|
15
test/test_indexdb.py
Normal file
15
test/test_indexdb.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
from mwmbl.database import Database
|
||||
from mwmbl.indexer.indexdb import IndexDatabase, clean_unicode
|
||||
from mwmbl.tinysearchengine.indexer import Document
|
||||
|
||||
|
||||
def test_bad_unicode_encoding():
    """Queue a document whose extract ends in a lone surrogate.

    The test has no explicit assertion: it passes as long as
    ``queue_documents`` completes without raising.
    """
    # NOTE(review): positional arguments look like (title, url, extract,
    # score) -- confirm against the Document definition.
    bad_doc = Document('Good title', 'https://goodurl.com', 'Bad extract text \ud83c', 1.0)
    with Database() as db:
        index_db = IndexDatabase(db.connection)
        index_db.queue_documents([bad_doc])
|
||||
|
||||
|
||||
def test_clean_unicode():
    """Check that clean_unicode drops a trailing lone surrogate."""
    result = clean_unicode('Bad extract text \ud83c')
    # Only the unencodable '\ud83c' is removed; the trailing space remains.
    assert result == 'Bad extract text '
|
Loading…
Add table
Reference in a new issue