2022-06-21 20:55:38 +00:00
|
|
|
"""
|
|
|
|
Count unique URLs in the index.
|
|
|
|
"""
|
2023-10-10 12:51:06 +00:00
|
|
|
from mwmbl.tinysearchengine import TinyIndex, Document
|
2022-06-21 20:55:38 +00:00
|
|
|
|
|
|
|
|
|
|
|
def run(index_path: str = 'data/index.tinysearch') -> int:
    """
    Count the unique URLs stored in a TinyIndex file.

    Visits every page of the index, gathers the ``url`` attribute of each
    document into a set, and prints progress while doing so.

    Args:
        index_path: Path of the index file to open. Defaults to the location
            that used to be hard-coded, so calling ``run()`` with no
            arguments behaves as before.

    Returns:
        The number of distinct URLs seen across all pages.
    """
    urls = set()
    with TinyIndex(Document, index_path) as index:
        for i in range(index.num_pages):
            print("Page", i)
            # A set keeps each URL once, however many documents carry it.
            urls.update(doc.url for doc in index.get_page(i))
            # NOTE(review): indentation was lost in the reviewed copy; this
            # is assumed to be a per-page running total - confirm it was not
            # meant to be printed once after the loop.
            print("URLs", len(urls))
    return len(urls)
|
|
|
|
|
|
|
|
|
|
|
|
# Run the count only when this file is executed as a script, not on import.
if __name__ == '__main__':
    run()
|