Initial commit
commit b1bfe1cdd4
3 changed files with 87 additions and 0 deletions
36  crawl.py  Normal file
@@ -0,0 +1,36 @@
"""
|
||||
Crawl the web
|
||||
"""
|
||||
import gzip
|
||||
import hashlib
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
import justext
|
||||
|
||||
from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
|
||||
|
||||
|
||||
def crawl():
|
||||
data = pd.read_csv(HN_TOP_PATH)
|
||||
|
||||
for url in data['url']:
|
||||
print("Fetching", url)
|
||||
html = fetch(url)
|
||||
filename = hashlib.md5(url.encode('utf8')).hexdigest()
|
||||
path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
|
||||
if os.path.isfile(path):
|
||||
print("Path already exists, skipping")
|
||||
|
||||
with gzip.open(path, 'w') as output:
|
||||
output.write(html.encode('utf8'))
|
||||
|
||||
|
||||
def fetch(url):
|
||||
page_data = requests.get(url)
|
||||
return page_data.text
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
crawl()
|
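A note on the input: crawl() assumes only that the CSV at HN_TOP_PATH has a url column, since data['url'] is the only field it reads. A minimal, purely hypothetical hn-top.csv would therefore look like:

url
https://example.com/
https://example.org/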
44  index.py  Normal file
@@ -0,0 +1,44 @@
"""
|
||||
Create a search index
|
||||
"""
|
||||
import gzip
|
||||
from glob import glob
|
||||
|
||||
import justext
|
||||
from spacy.lang.en import English
|
||||
|
||||
from paths import CRAWL_GLOB
|
||||
|
||||
|
||||
def is_content_token(nlp, token):
|
||||
lexeme = nlp.vocab[token.orth]
|
||||
return lexeme.is_alpha and not token.is_stop
|
||||
|
||||
|
||||
def tokenize(nlp, cleaned_text):
|
||||
tokens = nlp.tokenizer(cleaned_text)
|
||||
content_tokens = [token for token in tokens if is_content_token(nlp, token)]
|
||||
lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
|
||||
return lowered
|
||||
|
||||
|
||||
def clean(content):
|
||||
text = justext.justext(content, justext.get_stoplist("English"))
|
||||
pars = [par.text for par in text if not par.is_boilerplate]
|
||||
cleaned_text = ' '.join(pars)
|
||||
return cleaned_text
|
||||
|
||||
|
||||
def run():
|
||||
nlp = English()
|
||||
for path in glob(CRAWL_GLOB):
|
||||
with gzip.open(path) as html_file:
|
||||
content = html_file.read().decode("utf8")
|
||||
cleaned_text = clean(content)
|
||||
tokens = tokenize(nlp, cleaned_text)
|
||||
print("Tokens", tokens)
|
||||
break
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
run()
|
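run() currently stops after printing the token set of the first crawled page, so no index structure is built in this commit. As a rough sketch of where this seems to be heading, the per-page token sets could be accumulated into an in-memory inverted index. build_index below is a hypothetical helper, not part of the commit; it assumes the glob, gzip, clean, tokenize, and CRAWL_GLOB names from index.py above:

from collections import defaultdict

def build_index(nlp):
    index = defaultdict(set)  # token -> set of crawled-page paths containing it
    for path in glob(CRAWL_GLOB):
        with gzip.open(path) as html_file:
            content = html_file.read().decode("utf8")
        for token in tokenize(nlp, clean(content)):
            index[token].add(path)
    return index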
7  paths.py  Normal file
@@ -0,0 +1,7 @@
import os

HOME = os.getenv('HOME')
DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
CRAWL_PREFIX = 'crawl_'
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
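Reading the three files together, the intended workflow appears to be: create ~/data/tinysearch, place hn-top.csv there (presumably a dump of top Hacker News stories with a url column), run crawl.py to download and gzip each page, then run index.py to clean and tokenize the crawled pages. This reading is inferred from the code above, not stated anywhere in the commit.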