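"""Index crawled pages.

Each crawl file matched by CRAWL_GLOB is a gzip archive whose first line
is the page URL and whose remainder is the raw HTML. This script extracts
a title and cleaned text from the HTML, tokenizes the text with spaCy,
and adds each document to the index, skipping URLs already indexed.
"""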
import gzip
from glob import glob

import bs4
from spacy.lang.en import English

from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, clean, tokenize
from paths import INDEX_PATH, CRAWL_GLOB


def run():
    indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
    indexer.create_if_not_exists()
    nlp = English()
    for path in glob(CRAWL_GLOB):
        print("Path", path)
        # Each crawl file is gzipped text: the first line is the URL,
        # the rest is the raw HTML of the page.
        with gzip.open(path, 'rt') as html_file:
            url = html_file.readline().strip()
            content = html_file.read()

        if indexer.document_indexed(url):
            print("Page exists, skipping", url)
            continue

        cleaned_text = clean(content)
        # Prefer the HTML <title>; fall back to the first 80 characters
        # of the cleaned text when the page has no title element.
        try:
            title = bs4.BeautifulSoup(content, features="lxml").find('title').string
        except AttributeError:
            title = cleaned_text[:80]
        tokens = tokenize(nlp, cleaned_text)
        print("URL", url)
        print("Tokens", tokens)
        print("Title", title)
        indexer.index(tokens, url, title)


if __name__ == '__main__':
    run()