Update queued pages in the index
This commit is contained in:
parent
4330551e0f
commit
30e1e19072
3 changed files with 71 additions and 9 deletions
|
@ -57,11 +57,15 @@ class IndexDatabase:
|
|||
)
|
||||
"""
|
||||
|
||||
document_pages_index_sql = """
|
||||
CREATE INDEX IF NOT EXISTS document_pages_page_index ON document_pages (page)
|
||||
"""
|
||||
|
||||
with self.connection.cursor() as cursor:
|
||||
cursor.execute(batches_sql)
|
||||
print("Creating documents table")
|
||||
cursor.execute(documents_sql)
|
||||
cursor.execute(document_pages_sql)
|
||||
cursor.execute(document_pages_index_sql)
|
||||
|
||||
def record_batches(self, batch_infos: list[BatchInfo]):
|
||||
sql = """
|
||||
|
@ -133,3 +137,24 @@ class IndexDatabase:
|
|||
print("Queuing", urls_and_page_indexes)
|
||||
with self.connection.cursor() as cursor:
|
||||
execute_values(cursor, sql, urls_and_page_indexes)
|
||||
|
||||
def get_queued_documents_for_page(self, page_index: int) -> list[Document]:
|
||||
sql = """
|
||||
SELECT d.url, title, extract, score
|
||||
FROM document_pages p INNER JOIN documents d ON p.url = d.url
|
||||
WHERE p.page = %(page_index)s
|
||||
"""
|
||||
|
||||
with self.connection.cursor() as cursor:
|
||||
cursor.execute(sql, {'page_index': page_index})
|
||||
results = cursor.fetchall()
|
||||
return [Document(title, url, extract, score) for url, title, extract, score in results]
|
||||
|
||||
def clear_queued_documents_for_page(self, page_index: int):
|
||||
sql = """
|
||||
DELETE FROM document_pages
|
||||
WHERE page = %(page_index)s
|
||||
"""
|
||||
|
||||
with self.connection.cursor() as cursor:
|
||||
cursor.execute(sql, {'page_index': page_index})
|
||||
|
|
|
@ -129,18 +129,18 @@ class TinyIndex(Generic[T]):
|
|||
assert type(value) == self.item_factory, f"Can only index the specified type" \
|
||||
f" ({self.item_factory.__name__})"
|
||||
page_index = self.get_key_page_index(key)
|
||||
self.add_to_page(page_index, value)
|
||||
try:
|
||||
self.add_to_page(page_index, [value])
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
def add_to_page(self, page_index: int, value: T):
|
||||
def add_to_page(self, page_index: int, values: list[T]):
|
||||
current_page = self._get_page_tuples(page_index)
|
||||
if current_page is None:
|
||||
current_page = []
|
||||
value_tuple = astuple(value)
|
||||
current_page.append(value_tuple)
|
||||
try:
|
||||
self._write_page(current_page, page_index)
|
||||
except ValueError:
|
||||
pass
|
||||
value_tuples = [astuple(value) for value in values]
|
||||
current_page += value_tuples
|
||||
self._write_page(current_page, page_index)
|
||||
|
||||
def _write_page(self, data, i):
|
||||
"""
|
||||
|
|
37
mwmbl/update_pages.py
Normal file
37
mwmbl/update_pages.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
"""
|
||||
Iterate over each page in the index and update it based on what is in the index database.
|
||||
"""
|
||||
from mwmbl.database import Database
|
||||
from mwmbl.indexdb import IndexDatabase
|
||||
from mwmbl.tinysearchengine.indexer import TinyIndex, Document
|
||||
|
||||
|
||||
def run(index_path):
    """Merge queued documents from the index database into the on-disk index.

    For each page of the index, fetch the documents queued for that page;
    if there are any, try to add them to the page. When the page is full
    (add_to_page raises ValueError), retry up to three times with half as
    many documents each time. Finally clear that page's queue.

    Args:
        index_path: path to the tinysearchengine index file.
    """
    with Database() as db:
        index_db = IndexDatabase(db.connection)
        index_db.create_tables()

    with TinyIndex(Document, index_path, 'w') as indexer:
        for i in range(indexer.num_pages):
            with Database() as db:
                index_db = IndexDatabase(db.connection)
                pages = index_db.get_queued_documents_for_page(i)
                if len(pages) > 0:
                    print("Pages", len(pages))
                else:
                    continue

                # Retry with a halved batch when the page overflows; the
                # loop variable is unused, so name it `_`.
                for _ in range(3):
                    try:
                        indexer.add_to_page(i, pages)
                        break
                    except ValueError:
                        pages = pages[:len(pages)//2]
                        if len(pages) == 0:
                            break
                        print(f"Not enough space, adding {len(pages)}")
                # NOTE(review): the queue is cleared even when no retry
                # succeeded, silently dropping those documents — confirm
                # this is intended.
                index_db.clear_queued_documents_for_page(i)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Default on-disk index location when run directly as a script.
    run('data/index.tinysearch')
|
Loading…
Add table
Reference in a new issue