# Source repository: 0ct0pu5/mwmbl

							import logging
import sys

import spacy

from mwmbl.indexer.index import tokenize_document
from mwmbl.indexer.paths import INDEX_PATH
from mwmbl.tinysearchengine.indexer import TinyIndex, Document


# Send log records at DEBUG level and above to standard output.
logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)

# spaCy pipeline shared by the helpers below; loaded once at import time.
nlp = spacy.load("en_core_web_sm")


def store():
    """Tokenize one hard-coded sample document and print the result.

    The index at ``INDEX_PATH`` is opened in ``'w'`` mode while the document
    is tokenized, but the loop that would call ``index`` for every token is
    commented out, so this function does not call ``index`` itself.
    """
    sample = Document(
        url='https://www.theguardian.com/stage/2007/nov/18/theatre',
        title='A nation in search of the new black | Theatre | The Guardian',
        extract="Topic-stuffed and talk-filled, Kwame Kwei-Armah's new play proves that issue-driven drama is (despite reports of its death) still being written and staged…",
        score=1.0
    )
    with TinyIndex(Document, INDEX_PATH, 'w') as writable_index:
        # NOTE(review): the literal 1 is presumably a score -- confirm against
        # the signature of tokenize_document.
        token_result = tokenize_document(
            sample.url, sample.title, sample.extract, 1, nlp
        )
        print("Tokenized", token_result)
        # Storing the tokens is currently disabled:
        # for token in token_result.tokens:
        #     writable_index.index(token, sample)


def get_items():
    """Retrieve the entries stored under 'wikipedia' and print each one.

    Nothing is printed when ``retrieve`` returns a falsy value.
    """
    with TinyIndex(Document, INDEX_PATH) as index_reader:
        matches = index_reader.retrieve('wikipedia')
        if not matches:
            return
        for match in matches:
            print("Items", match)


def run():
    """Scan pages 0..99999 of the index and print items mentioning ' search'.

    Each item whose ``title`` contains the substring ' search' (with the
    leading space) is printed together with the number of the page it was
    found on.
    """
    needle = ' search'
    with TinyIndex(Document, INDEX_PATH) as index_reader:
        for page_number in range(100000):
            for entry in index_reader.get_page(page_number):
                if needle not in entry.title:
                    continue
                print("Page", page_number, entry)


if __name__ == "__main__":
    # Alternative entry points, currently disabled:
    #   store()  -- tokenize the sample document
    #   run()    -- scan pages for titles containing ' search'
    get_items()