Strip characters that cannot be encoded as UTF-8 (the tests use the lone
surrogate \ud83c) from document titles and extracts before they are queued,
raise NUM_PAGES from 512000 to 5_120_000, and add tests for queue_documents
and clean_unicode.

diff --git a/mwmbl/indexer/indexdb.py b/mwmbl/indexer/indexdb.py
index 9895d90..efc4fc1 100644
--- a/mwmbl/indexer/indexdb.py
+++ b/mwmbl/indexer/indexdb.py
@@ -104,7 +104,8 @@ class IndexDatabase:
         """
 
         sorted_documents = sorted(documents, key=lambda x: x.url)
-        data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
+        data = [(document.url, clean_unicode(document.title), clean_unicode(document.extract),
+                 document.score, DocumentStatus.NEW.value)
                 for document in sorted_documents]
 
         print("Queueing documents", len(data))
@@ -176,3 +177,7 @@ class IndexDatabase:
 
         with self.connection.cursor() as cursor:
             cursor.execute(sql, {'page_index': page_index})
+
+
+def clean_unicode(s: str) -> str:
+    return s.encode('utf-8', 'ignore').decode('utf-8')
diff --git a/mwmbl/tinysearchengine/indexer.py b/mwmbl/tinysearchengine/indexer.py
index 1dc8ba1..00ebd9e 100644
--- a/mwmbl/tinysearchengine/indexer.py
+++ b/mwmbl/tinysearchengine/indexer.py
@@ -12,7 +12,7 @@ VERSION = 1
 METADATA_CONSTANT = b'mwmbl-tiny-search'
 METADATA_SIZE = 4096
 
-NUM_PAGES = 512000
+NUM_PAGES = 5_120_000
 PAGE_SIZE = 4096
 
 
diff --git a/test/test_indexdb.py b/test/test_indexdb.py
new file mode 100644
index 0000000..f08a46b
--- /dev/null
+++ b/test/test_indexdb.py
@@ -0,0 +1,15 @@
+from mwmbl.database import Database
+from mwmbl.indexer.indexdb import IndexDatabase, clean_unicode
+from mwmbl.tinysearchengine.indexer import Document
+
+
+def test_bad_unicode_encoding():
+    bad_doc = Document('Good title', 'https://goodurl.com', 'Bad extract text \ud83c', 1.0)
+    with Database() as db:
+        index_db = IndexDatabase(db.connection)
+        index_db.queue_documents([bad_doc])
+
+
+def test_clean_unicode():
+    result = clean_unicode('Bad extract text \ud83c')
+    assert result == 'Bad extract text '