mwmbl/analyse/index_url_count.py

20 lines
442 B
Python
Raw Permalink Normal View History

"""
Count unique URLs in the index.
"""
# 2023-10-10 12:51:06 +00:00
from mwmbl.tinysearchengine import TinyIndex, Document
def run(index_path: str = 'data/index.tinysearch') -> int:
    """
    Count the unique URLs stored in a TinyIndex file.

    Visits every page of the index in order, printing a progress line per
    page, and gathers the ``url`` of each document into a set so repeated
    URLs are counted once. The total is printed and also returned.

    Args:
        index_path: Path of the index file to open. Defaults to
            'data/index.tinysearch', the path this script previously
            hard-coded, so calling ``run()`` with no arguments behaves as
            before.

    Returns:
        The number of distinct URLs found across all pages.
    """
    urls = set()
    with TinyIndex(Document, index_path) as index:
        for i in range(index.num_pages):
            print("Page", i)
            # Feed the generator straight into the set instead of building
            # a temporary per-page set and merging it.
            urls.update(doc.url for doc in index.get_page(i))

    url_count = len(urls)
    print("URLs", url_count)
    return url_count
if __name__ == '__main__':
    # Only perform the count when this file is executed as a script,
    # not when it is imported as a module.
    run()