From 680fe1ca0c9d9b8f3793bbcec5572e5fe843909d Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 16 Jul 2022 10:54:25 +0100
Subject: [PATCH] Fix unicode encoding error

---
 mwmbl/indexer/indexdb.py |  7 ++++++-
 test/test_indexdb.py     | 15 +++++++++++++++
 2 files changed, 21 insertions(+), 1 deletion(-)
 create mode 100644 test/test_indexdb.py

diff --git a/mwmbl/indexer/indexdb.py b/mwmbl/indexer/indexdb.py
index 9895d90..efc4fc1 100644
--- a/mwmbl/indexer/indexdb.py
+++ b/mwmbl/indexer/indexdb.py
@@ -104,7 +104,8 @@ class IndexDatabase:
         """
         sorted_documents = sorted(documents, key=lambda x: x.url)
 
-        data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
+        data = [(document.url, clean_unicode(document.title), clean_unicode(document.extract),
+                 document.score, DocumentStatus.NEW.value)
                 for document in sorted_documents]
 
         print("Queueing documents", len(data))
@@ -176,3 +177,7 @@ class IndexDatabase:
 
         with self.connection.cursor() as cursor:
             cursor.execute(sql, {'page_index': page_index})
+
+
+def clean_unicode(s: str) -> str:
+    return s.encode('utf-8', 'ignore').decode('utf-8')
diff --git a/test/test_indexdb.py b/test/test_indexdb.py
new file mode 100644
index 0000000..f08a46b
--- /dev/null
+++ b/test/test_indexdb.py
@@ -0,0 +1,15 @@
+from mwmbl.database import Database
+from mwmbl.indexer.indexdb import IndexDatabase, clean_unicode
+from mwmbl.tinysearchengine.indexer import Document
+
+
+def test_bad_unicode_encoding():
+    bad_doc = Document('Good title', 'https://goodurl.com', 'Bad extract text \ud83c', 1.0)
+    with Database() as db:
+        index_db = IndexDatabase(db.connection)
+        index_db.queue_documents([bad_doc])
+
+
+def test_clean_unicode():
+    result = clean_unicode('Bad extract text \ud83c')
+    assert result == 'Bad extract text '