Merge pull request #66 from mwmbl/fix-unicode-encode-error
Fix unicode encode error; bigger index
This commit is contained in:
commit
3c97fdb3a0
3 changed files with 22 additions and 2 deletions
|
@ -104,7 +104,8 @@ class IndexDatabase:
|
|||
"""
|
||||
|
||||
sorted_documents = sorted(documents, key=lambda x: x.url)
|
||||
data = [(document.url, document.title, document.extract, document.score, DocumentStatus.NEW.value)
|
||||
data = [(document.url, clean_unicode(document.title), clean_unicode(document.extract),
|
||||
document.score, DocumentStatus.NEW.value)
|
||||
for document in sorted_documents]
|
||||
|
||||
print("Queueing documents", len(data))
|
||||
|
@ -176,3 +177,7 @@ class IndexDatabase:
|
|||
|
||||
with self.connection.cursor() as cursor:
|
||||
cursor.execute(sql, {'page_index': page_index})
|
||||
|
||||
|
||||
def clean_unicode(s: str) -> str:
    """Return *s* with every code point that cannot be encoded as UTF-8 removed.

    A lone surrogate (for example U+D83C with no matching low surrogate) is
    not encodable as UTF-8; such code points are dropped and all other text
    is returned unchanged.

    Args:
        s: The text to sanitise.

    Returns:
        A copy of ``s`` that round-trips through UTF-8 without error.
    """
    # errors='ignore' makes encode() skip unencodable code points instead of
    # raising UnicodeEncodeError; decoding the result yields a clean str.
    return s.encode('utf-8', errors='ignore').decode('utf-8')
|
||||
|
|
|
@ -12,7 +12,7 @@ VERSION = 1
|
|||
METADATA_CONSTANT = b'mwmbl-tiny-search'
|
||||
METADATA_SIZE = 4096
|
||||
|
||||
NUM_PAGES = 512000
|
||||
NUM_PAGES = 5_120_000
|
||||
PAGE_SIZE = 4096
|
||||
|
||||
|
||||
|
|
15
test/test_indexdb.py
Normal file
15
test/test_indexdb.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
from mwmbl.database import Database
|
||||
from mwmbl.indexer.indexdb import IndexDatabase, clean_unicode
|
||||
from mwmbl.tinysearchengine.indexer import Document
|
||||
|
||||
|
||||
def test_bad_unicode_encoding():
    """Queueing a document whose extract ends in a lone surrogate must not raise."""
    bad_doc = Document('Good title', 'https://goodurl.com', 'Bad extract text \ud83c', 1.0)
    with Database() as db:
        index_db = IndexDatabase(db.connection)
        # No explicit assertion: the test passes as long as this call
        # completes without raising.
        index_db.queue_documents([bad_doc])
|
||||
|
||||
|
||||
def test_clean_unicode():
    """clean_unicode drops a trailing lone surrogate and keeps the rest of the text."""
    result = clean_unicode('Bad extract text \ud83c')
    assert result == 'Bad extract text '
|
Loading…
Add table
Reference in a new issue