mwmbl/index_glob.py

39 lines
1 KiB
Python
Raw Normal View History

2021-05-19 20:48:03 +00:00
import gzip
from glob import glob
import bs4
from spacy.lang.en import English
from index import TinyIndexer, NUM_PAGES, PAGE_SIZE, clean, tokenize
from paths import INDEX_PATH, CRAWL_GLOB
def _extract_title(content, fallback):
    """Return the text of the page's <title>, or *fallback* if unusable.

    Beautiful Soup's ``Tag.string`` is ``None`` when the tag is empty or has
    more than one child, and ``find`` returns ``None`` when there is no
    ``<title>`` at all; both cases yield *fallback*.
    """
    title_tag = bs4.BeautifulSoup(content, features="lxml").find('title')
    if title_tag is None:
        return fallback
    title = title_tag.string
    return fallback if title is None else title


def run():
    """Index every crawled page matched by ``CRAWL_GLOB``.

    Opens (creating if needed) the ``TinyIndexer`` at ``INDEX_PATH``, then for
    each gzip file matched by the glob: reads the first line as the page URL
    and the remainder as the page content, skips URLs the indexer reports as
    already indexed, and otherwise cleans and tokenizes the content and adds
    the tokens to the index together with the URL and a title.
    """
    indexer = TinyIndexer(INDEX_PATH, NUM_PAGES, PAGE_SIZE)
    indexer.create_if_not_exists()
    nlp = English()

    for path in glob(CRAWL_GLOB):
        print("Path", path)
        # The first line of the file is taken as the URL; the rest is the
        # page content.
        with gzip.open(path, 'rt') as html_file:
            url = html_file.readline().strip()
            content = html_file.read()

        if indexer.document_indexed(url):
            print("Page exists, skipping", url)
            continue

        cleaned_text = clean(content)
        # Fall back to the first 80 characters of the cleaned text when the
        # page has no usable <title>, so a None title is never indexed.
        title = _extract_title(content, cleaned_text[:80])
        tokens = tokenize(nlp, cleaned_text)
        print("URL", url)
        print("Tokens", tokens)
        print("Title", title)
        indexer.index(tokens, url, title)
# Script entry point: run the indexer only when executed directly, not on import.
if __name__ == "__main__":
    run()