From b1bfe1cdd4574353cb9a7f6a34bb84fe086a4815 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 13 Mar 2021 20:54:15 +0000
Subject: [PATCH] Initial commit

---
 crawl.py | 36 ++++++++++++++++++++++++++++++++++++
 index.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 paths.py |  7 +++++++
 3 files changed, 88 insertions(+)
 create mode 100644 crawl.py
 create mode 100644 index.py
 create mode 100644 paths.py

diff --git a/crawl.py b/crawl.py
new file mode 100644
index 0000000..3ec81a4
--- /dev/null
+++ b/crawl.py
@@ -0,0 +1,36 @@
+"""
+Crawl the web
+"""
+import gzip
+import hashlib
+import os
+
+import pandas as pd
+import requests
+
+from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
+
+
+def crawl():
+    data = pd.read_csv(HN_TOP_PATH)
+
+    for url in data['url']:
+        # Store each page under the md5 of its URL so filenames are safe
+        filename = hashlib.md5(url.encode('utf8')).hexdigest()
+        path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
+        if os.path.isfile(path):
+            print("Path already exists, skipping")
+            continue
+
+        print("Fetching", url)
+        html = fetch(url)
+        with gzip.open(path, 'w') as output:
+            output.write(html.encode('utf8'))
+
+
+def fetch(url):
+    page_data = requests.get(url)
+    return page_data.text
+
+
+if __name__ == '__main__':
+    crawl()
diff --git a/index.py b/index.py
new file mode 100644
index 0000000..5e5fdea
--- /dev/null
+++ b/index.py
@@ -0,0 +1,45 @@
+"""
+Create a search index
+"""
+import gzip
+from glob import glob
+
+import justext
+from spacy.lang.en import English
+
+from paths import CRAWL_GLOB
+
+
+def is_content_token(nlp, token):
+    lexeme = nlp.vocab[token.orth]
+    return lexeme.is_alpha and not token.is_stop
+
+
+def tokenize(nlp, cleaned_text):
+    tokens = nlp.tokenizer(cleaned_text)
+    content_tokens = [token for token in tokens if is_content_token(nlp, token)]
+    lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
+    return lowered
+
+
+def clean(content):
+    text = justext.justext(content, justext.get_stoplist("English"))
+    pars = [par.text for par in text if not par.is_boilerplate]
+    cleaned_text = ' '.join(pars)
+    return cleaned_text
+
+
+def run():
+    nlp = English()
+    for path in glob(CRAWL_GLOB):
+        with gzip.open(path) as html_file:
+            content = html_file.read().decode("utf8")
+        cleaned_text = clean(content)
+        tokens = tokenize(nlp, cleaned_text)
+        print("Tokens", tokens)
+        # Just inspect the first crawled page for now
+        break
+
+
+if __name__ == '__main__':
+    run()
diff --git a/paths.py b/paths.py
new file mode 100644
index 0000000..1d6b65f
--- /dev/null
+++ b/paths.py
@@ -0,0 +1,7 @@
+import os
+
+HOME = os.getenv('HOME')
+DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
+HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
+CRAWL_PREFIX = 'crawl_'
+CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
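
crawl.py reads its URL list from hn-top.csv, but the patch does not show how that file is produced. The sketch below is one hypothetical way to generate it, assuming the public Hacker News Firebase API; the helper name fetch_top_story_urls and the limit parameter are illustrative, not part of the patch. All crawl.py actually requires is a CSV with a 'url' column at HN_TOP_PATH.

# Hypothetical helper, not part of the patch: build hn-top.csv for crawl.py.
# Assumes the public Hacker News Firebase API.
import os

import pandas as pd
import requests

from paths import DATA_DIR, HN_TOP_PATH

HN_API = "https://hacker-news.firebaseio.com/v0"


def fetch_top_story_urls(limit=100):
    # topstories.json returns up to 500 story IDs, best-ranked first
    story_ids = requests.get(f"{HN_API}/topstories.json").json()[:limit]
    urls = []
    for story_id in story_ids:
        item = requests.get(f"{HN_API}/item/{story_id}.json").json()
        # Ask HN and job posts may have no external URL; skip those
        if item and item.get('url'):
            urls.append(item['url'])
    return urls


if __name__ == '__main__':
    os.makedirs(DATA_DIR, exist_ok=True)  # crawl.py assumes this directory exists
    # crawl.py only reads the 'url' column, so that is all we write
    pd.DataFrame({'url': fetch_top_story_urls()}).to_csv(HN_TOP_PATH, index=False)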
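
The crawler names each stored page after the md5 hex digest of its URL, which keeps filenames filesystem-safe and makes the "already crawled" check in crawl() a plain isfile test. The same scheme can be reused to look up the cached copy of any URL; cached_html below is a hypothetical helper illustrating that, not something the patch defines.

# Hypothetical helper mirroring crawl.py's naming scheme (not in the patch)
import gzip
import hashlib
import os

from paths import DATA_DIR, CRAWL_PREFIX


def cached_html(url):
    # Same md5-of-URL filename that crawl() writes to
    filename = hashlib.md5(url.encode('utf8')).hexdigest()
    path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
    if not os.path.isfile(path):
        return None
    with gzip.open(path) as cached:
        return cached.read().decode('utf8')

Note that the digest is one-way: recovering the original URL from a stored file would require recording the URL alongside the page, which this commit does not yet do.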
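
index.py stops at printing the token set for the first crawled page: clean() strips boilerplate with jusText, and tokenize() keeps lowercased alphabetic non-stopword tokens via spaCy. A natural next step toward the "search index" the docstring promises is an inverted index from each token to the pages containing it. The sketch below shows that shape, reusing clean() and tokenize() from the patch; build_index and search are illustrative names, and the conjunctive all-terms-must-match query model is one choice among several.

# Sketch of a possible next step, not part of the patch: an in-memory
# inverted index over the crawled pages, reusing clean() and tokenize().
import gzip
from collections import defaultdict
from glob import glob

from spacy.lang.en import English

from index import clean, tokenize
from paths import CRAWL_GLOB


def build_index():
    nlp = English()
    index = defaultdict(set)
    for path in glob(CRAWL_GLOB):
        with gzip.open(path) as html_file:
            content = html_file.read().decode("utf8")
        for token in tokenize(nlp, clean(content)):
            index[token].add(path)
    return index


def search(index, query_tokens):
    # A page matches only if it contains every query token
    postings = [index.get(token, set()) for token in query_tokens]
    return set.intersection(*postings) if postings else set()

With this, search(build_index(), {'python', 'search'}) would return the paths of cached pages containing both terms.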