3 yıl önce · 3c97fdb3a0
--- a/mwmbl/indexer/indexdb.py
+++ b/mwmbl/indexer/indexdb.py
@@ -104,7 +104,8 @@ class IndexDatabase:
 
				         """
			
 
				 
			
 
				         sorted_documents = sorted(documents, key=lambda x: x.url)
			
 
				-        data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
			
 
				+        data = [(document.url, clean_unicode(document.title), clean_unicode(document.extract),
			
 
				+                 document.score, DocumentStatus.NEW.value)
			
 
				                 for document in sorted_documents]
			
 
				 
			
 
				         print("Queueing documents", len(data))
			
@@ -176,3 +177,7 @@ class IndexDatabase:
 
				 
			
 
				         with self.connection.cursor() as cursor:
			
 
				             cursor.execute(sql, {'page_index': page_index})
			
 
				+
			
 
				+
			
 
				+def clean_unicode(s: str) -> str:
			
 
				+    return s.encode('utf-8', 'ignore').decode('utf-8')
			
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -12,7 +12,7 @@ VERSION = 1
 
				 METADATA_CONSTANT = b'mwmbl-tiny-search'
			
 
				 METADATA_SIZE = 4096
			
 
				 
			
 
				-NUM_PAGES = 512000
			
 
				+NUM_PAGES = 5_120_000
			
 
				 PAGE_SIZE = 4096
			
 
				 
			
 
				 
			
--- a/test/test_indexdb.py
+++ b/test/test_indexdb.py
@@ -0,0 +1,15 @@
 
				+from mwmbl.database import Database
			
 
				+from mwmbl.indexer.indexdb import IndexDatabase, clean_unicode
			
 
				+from mwmbl.tinysearchengine.indexer import Document
			
 
				+
			
 
				+
			
 
				+def test_bad_unicode_encoding():
			
 
				+    bad_doc = Document('Good title', 'https://goodurl.com', 'Bad extract text \ud83c', 1.0)
			
 
				+    with Database() as db:
			
 
				+        index_db = IndexDatabase(db.connection)
			
 
				+        index_db.queue_documents([bad_doc])
			
 
				+
			
 
				+
			
 
				+def test_clean_unicode():
			
 
				+    result = clean_unicode('Bad extract text \ud83c')
			
 
				+    assert result == 'Bad extract text '