Update queued pages in the index

This commit is contained in:
Daoud Clarke 2022-06-20 23:35:44 +01:00
parent 4330551e0f
commit 30e1e19072
3 changed files with 71 additions and 9 deletions

View file

@ -57,11 +57,15 @@ class IndexDatabase:
)
"""
document_pages_index_sql = """
CREATE INDEX IF NOT EXISTS document_pages_page_index ON document_pages (page)
"""
with self.connection.cursor() as cursor:
cursor.execute(batches_sql)
print("Creating documents table")
cursor.execute(documents_sql)
cursor.execute(document_pages_sql)
cursor.execute(document_pages_index_sql)
def record_batches(self, batch_infos: list[BatchInfo]):
sql = """
@ -133,3 +137,24 @@ class IndexDatabase:
print("Queuing", urls_and_page_indexes)
with self.connection.cursor() as cursor:
execute_values(cursor, sql, urls_and_page_indexes)
def get_queued_documents_for_page(self, page_index: int) -> list[Document]:
    """
    Fetch the documents that are queued for a single index page.

    Joins ``document_pages`` to ``documents`` on ``url`` and keeps the rows
    whose ``page`` equals ``page_index``.

    :param page_index: value matched against ``document_pages.page``.
    :return: one ``Document`` per matching row (empty list if none match).
    """
    query = """
SELECT d.url, title, extract, score
FROM document_pages p INNER JOIN documents d ON p.url = d.url
WHERE p.page = %(page_index)s
"""
    params = {'page_index': page_index}
    with self.connection.cursor() as cursor:
        cursor.execute(query, params)
        rows = cursor.fetchall()

    documents = []
    for url, title, extract, score in rows:
        # The query yields url first, but Document is built with title first.
        documents.append(Document(title, url, extract, score))
    return documents
def clear_queued_documents_for_page(self, page_index: int):
    """
    Remove every ``document_pages`` row whose ``page`` equals ``page_index``.

    :param page_index: value matched against ``document_pages.page``.
    """
    delete_sql = """
DELETE FROM document_pages
WHERE page = %(page_index)s
"""
    bound_params = {'page_index': page_index}
    with self.connection.cursor() as cur:
        cur.execute(delete_sql, bound_params)

View file

@ -129,18 +129,18 @@ class TinyIndex(Generic[T]):
assert type(value) == self.item_factory, f"Can only index the specified type" \
f" ({self.item_factory.__name__})"
page_index = self.get_key_page_index(key)
self.add_to_page(page_index, value)
try:
self.add_to_page(page_index, [value])
except ValueError:
pass
def add_to_page(self, page_index: int, values: list[T]):
    """
    Append several items to one page and write the page back.

    The items are converted to tuples with ``dataclasses.astuple`` and added
    after whatever ``_get_page_tuples`` currently returns for the page (a
    missing page, reported as ``None``, is treated as empty).

    :param page_index: index of the page to extend.
    :param values: dataclass instances to append, in order.
    :raises ValueError: whatever ``_write_page`` raises is propagated
        unchanged; callers decide whether to retry with fewer items or to
        ignore the failure.
    """
    current_page = self._get_page_tuples(page_index)
    if current_page is None:
        current_page = []
    current_page += [astuple(value) for value in values]
    # No try/except here on purpose: swallowing the error would hide a
    # failed write from callers that need to react to it.
    self._write_page(current_page, page_index)
def _write_page(self, data, i):
"""

37
mwmbl/update_pages.py Normal file
View file

@ -0,0 +1,37 @@
"""
Iterate over each page in the index and update it based on what is in the index database.
"""
from mwmbl.database import Database
from mwmbl.indexdb import IndexDatabase
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
def run(index_path, max_attempts=3):
    """
    Move queued documents from the index database into the on-disk index.

    For every page of the index, the documents queued for that page are read
    from the database and added to the page. If adding raises ``ValueError``
    the list is cut to its first half and the write is retried, up to
    ``max_attempts`` writes in total. The queue for the page is then cleared.

    :param index_path: path of the index file, opened in ``'w'`` mode.
    :param max_attempts: maximum number of write attempts per page
        (defaults to 3).
    """
    with Database() as db:
        IndexDatabase(db.connection).create_tables()

    with TinyIndex(Document, index_path, 'w') as indexer:
        for page_index in range(indexer.num_pages):
            # A fresh database context is opened for each page.
            with Database() as db:
                index_db = IndexDatabase(db.connection)
                documents = index_db.get_queued_documents_for_page(page_index)
                if not documents:
                    continue
                print("Pages", len(documents))

                for _ in range(max_attempts):
                    try:
                        indexer.add_to_page(page_index, documents)
                        break
                    except ValueError:
                        # Presumably the page is full (see the message
                        # below): keep only the first half and try again.
                        documents = documents[:len(documents) // 2]
                        if not documents:
                            break
                        print(f"Not enough space, adding {len(documents)}")

                # NOTE(review): the queue is cleared even when every attempt
                # failed, so documents that never fit are dropped rather
                # than retried on a later run - confirm this is intended.
                index_db.clear_queued_documents_for_page(page_index)
# Script entry point: update the index at its default on-disk location.
if __name__ == '__main__':
    run(index_path='data/index.tinysearch')