Merge pull request #66 from mwmbl/fix-unicode-encode-error

Fix unicode encode error; bigger index
This commit is contained in:
Daoud Clarke 2022-07-16 10:59:14 +01:00 committed by GitHub
commit 3c97fdb3a0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 22 additions and 2 deletions

View file

@ -104,7 +104,8 @@ class IndexDatabase:
"""
sorted_documents = sorted(documents, key=lambda x: x.url)
data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
data = [(document.url, clean_unicode(document.title), clean_unicode(document.extract),
document.score, DocumentStatus.NEW.value)
for document in sorted_documents]
print("Queueing documents", len(data))
@ -176,3 +177,7 @@ class IndexDatabase:
with self.connection.cursor() as cursor:
cursor.execute(sql, {'page_index': page_index})
def clean_unicode(s: str) -> str:
    """Return *s* with every code point that cannot be UTF-8 encoded removed.

    Lone surrogate code points (such as U+D83C appearing without its pair)
    are not encodable as UTF-8; they are dropped rather than raising
    ``UnicodeEncodeError``. Text that is already valid is returned unchanged.

    Args:
        s: The text to sanitise.

    Returns:
        A string that can be encoded as UTF-8 without errors.
    """
    # errors='ignore' skips unencodable code points instead of raising.
    utf8_bytes = s.encode('utf-8', errors='ignore')
    # The bytes were produced by the UTF-8 encoder, so decoding cannot fail.
    return utf8_bytes.decode('utf-8')

View file

@ -12,7 +12,7 @@ VERSION = 1
METADATA_CONSTANT = b'mwmbl-tiny-search'
METADATA_SIZE = 4096
NUM_PAGES = 512000
NUM_PAGES = 5_120_000
PAGE_SIZE = 4096

15
test/test_indexdb.py Normal file
View file

@ -0,0 +1,15 @@
from mwmbl.database import Database
from mwmbl.indexer.indexdb import IndexDatabase, clean_unicode
from mwmbl.tinysearchengine.indexer import Document
def test_bad_unicode_encoding():
    """Queue a document whose extract ends in a lone surrogate.

    The test has no explicit assertion: it passes when ``queue_documents``
    completes without raising.
    """
    # '\ud83c' is half of a surrogate pair and cannot be encoded as UTF-8.
    document = Document('Good title', 'https://goodurl.com', 'Bad extract text \ud83c', 1.0)
    with Database() as database:
        IndexDatabase(database.connection).queue_documents([document])
def test_clean_unicode():
    """Check that clean_unicode drops lone surrogates and keeps valid text."""
    # A trailing lone surrogate is removed.
    result = clean_unicode('Bad extract text \ud83c')
    assert result == 'Bad extract text '

    # A lone surrogate in the middle of the text is removed as well.
    assert clean_unicode('Bad \ud83cextract') == 'Bad extract'

    # Valid text, including non-BMP characters, must pass through unchanged.
    assert clean_unicode('Good extract \U0001F600') == 'Good extract \U0001F600'
    assert clean_unicode('') == ''