diff --git a/mwmbl/indexer/indexdb.py b/mwmbl/indexer/indexdb.py
index 9895d90183a7f16fb788430cb5318e0e33864b34..efc4fc1eabedcd382cc5a6574a3d6f06660f8a92 100644
--- a/mwmbl/indexer/indexdb.py
+++ b/mwmbl/indexer/indexdb.py
@@ -104,7 +104,8 @@ class IndexDatabase:
         """
 
         sorted_documents = sorted(documents, key=lambda x: x.url)
-        data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
+        data = [(document.url, clean_unicode(document.title), clean_unicode(document.extract),
+                 document.score, DocumentStatus.NEW.value)
                 for document in sorted_documents]
 
         print("Queueing documents", len(data))
@@ -176,3 +177,7 @@ class IndexDatabase:
 
         with self.connection.cursor() as cursor:
             cursor.execute(sql, {'page_index': page_index})
+
+
+def clean_unicode(s: str) -> str:
+    return str(s.encode('utf-8', errors='ignore'), 'utf-8')  # 'ignore' drops lone surrogates
diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index 1dc8ba10b96c6dad3be651c5fbaada8aff1616d1..00ebd9e0034889f0d261a8bd16b08ea55c559aa2 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -12,7 +12,7 @@ VERSION = 1
 METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096
 
-NUM_PAGES = 512000
+NUM_PAGES = 5_120_000  # NOTE(review): presumably NUM_PAGES * PAGE_SIZE (~19.5 GiB) is the index size; confirm
 PAGE_SIZE = 4096
 
 
diff --git a/test/test_indexdb.py b/test/test_indexdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..f08a46bcf287df41dfdd3c9b616a6221b39eae4d
--- /dev/null
+++ b/test/test_indexdb.py
@@ -0,0 +1,15 @@
+from mwmbl.database import Database
+from mwmbl.indexer.indexdb import IndexDatabase, clean_unicode
+from mwmbl.tinysearchengine.indexer import Document
+
+
+def test_bad_unicode_encoding():
+    """Queueing a document whose extract holds a lone surrogate must not raise."""
+    bad_doc = Document('Good title', 'https://goodurl.com', 'Bad extract text \ud83c', 1.0)
+    with Database() as db:
+        IndexDatabase(db.connection).queue_documents([bad_doc])
+
+
+def test_clean_unicode():
+    """clean_unicode strips a lone surrogate and keeps the remaining text."""
+    assert clean_unicode('Bad extract text \ud83c') == 'Bad extract text '