Kaynağa Gözat

Fix unicode encoding error

Daoud Clarke 3 yıl önce
ebeveyn
işleme
680fe1ca0c
2 değiştirilmiş dosya ile 21 ekleme ve 1 silme
  1. + 6 - 1
      mwmbl/indexer/indexdb.py
  2. + 15 - 0
      test/test_indexdb.py

+ 6 - 1
mwmbl/indexer/indexdb.py

@@ -104,7 +104,8 @@ class IndexDatabase:
         """
         """
 
 
         sorted_documents = sorted(documents, key=lambda x: x.url)
         sorted_documents = sorted(documents, key=lambda x: x.url)
-        data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
+        data = [(document.url, clean_unicode(document.title), clean_unicode(document.extract),
+                 document.score, DocumentStatus.NEW.value)
                 for document in sorted_documents]
 
         print("Queueing documents", len(data))
@@ -176,3 +177,7 @@ class IndexDatabase:
 
         with self.connection.cursor() as cursor:
             cursor.execute(sql, {'page_index': page_index})
+
+
+def clean_unicode(s: str) -> str:
+    return s.encode('utf-8', 'ignore').decode('utf-8')

+ 15 - 0
test/test_indexdb.py

@@ -0,0 +1,15 @@
+from mwmbl.database import Database
+from mwmbl.indexer.indexdb import IndexDatabase, clean_unicode
+from mwmbl.tinysearchengine.indexer import Document
+
+
+def test_bad_unicode_encoding():
+    bad_doc = Document('Good title', 'https://goodurl.com', 'Bad extract text \ud83c', 1.0)
+    with Database() as db:
+        index_db = IndexDatabase(db.connection)
+        index_db.queue_documents([bad_doc])
+
+
+def test_clean_unicode():
+    result = clean_unicode('Bad extract text \ud83c')
+    assert result == 'Bad extract text '