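"""Ad-hoc analysis script for a TinyIndex: store a sample document, retrieve items, and print page-size statistics."""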
import logging
import sys

import numpy as np
import spacy

from analyse.index_local import EVALUATE_INDEX_PATH
from mwmbl.indexer.index import tokenize_document
from mwmbl.indexer.paths import INDEX_PATH
from mwmbl.tinysearchengine.indexer import TinyIndex, Document


logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
nlp = spacy.load("en_core_web_sm")


def store():
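    """Tokenize a sample Guardian article and print the result; writing the tokens to the index is left commented out."""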
    document = Document(
        title='A nation in search of the new black | Theatre | The Guardian',
        url='https://www.theguardian.com/stage/2007/nov/18/theatre',
        extract="Topic-stuffed and talk-filled, Kwame Kwei-Armah's new play proves that issue-driven drama is (despite reports of its death) still being written and staged…",
        score=1.0
    )
    with TinyIndex(Document, INDEX_PATH, 'w') as tiny_index:
        tokenized = tokenize_document(document.url, document.title, document.extract, 1, nlp)
        print("Tokenized", tokenized)
        # for token in tokenized.tokens:
        #
        #     tiny_index.index(token, document)


def get_items():
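    """Retrieve the items stored for the term 'wikipedia' and print each one."""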
    with TinyIndex(Document, INDEX_PATH) as tiny_index:
        items = tiny_index.retrieve('wikipedia')
        if items:
            for item in items:
                print("Items", item)


def run(index_path):
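    """Print page-size statistics (max, top 100 sizes, mean) over the non-empty pages of the index at index_path."""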
    with TinyIndex(Document, index_path) as tiny_index:
        sizes = {}
        for i in range(tiny_index.num_pages):
            page = tiny_index.get_page(i)
            if page:
                sizes[i] = len(page)
            if len(page) > 50:
                print("Page", len(page), page)
            # for item in page:
            #     if ' search' in item.title:
            #         print("Page", i, item)
        print("Max", max(sizes.values()))
        print("Top", sorted(sizes.values())[-100:])
        print("Mean", np.mean(list(sizes.values())))


if __name__ == '__main__':
    # store()
    run(EVALUATE_INDEX_PATH)
    # get_items()