From b1bfe1cdd4574353cb9a7f6a34bb84fe086a4815 Mon Sep 17 00:00:00 2001
From: Daoud Clarke
Date: Sat, 13 Mar 2021 20:54:15 +0000
Subject: [PATCH] Initial commit

---
 crawl.py | 36 ++++++++++++++++++++++++++++++++++++
 index.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 paths.py |  7 +++++++
 3 files changed, 88 insertions(+)
 create mode 100644 crawl.py
 create mode 100644 index.py
 create mode 100644 paths.py

diff --git a/crawl.py b/crawl.py
new file mode 100644
index 0000000..3ec81a4
--- /dev/null
+++ b/crawl.py
@@ -0,0 +1,36 @@
+"""
+Crawl the web
+"""
+import gzip
+import hashlib
+import os
+
+import pandas as pd
+import requests
+
+from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
+
+
+def crawl():
+    data = pd.read_csv(HN_TOP_PATH)
+
+    for url in data['url']:
+        # Store each page under the md5 of its URL so filenames are safe
+        filename = hashlib.md5(url.encode('utf8')).hexdigest()
+        path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
+        if os.path.isfile(path):
+            print("Path already exists, skipping")
+            continue
+
+        print("Fetching", url)
+        html = fetch(url)
+        with gzip.open(path, 'w') as output:
+            output.write(html.encode('utf8'))
+
+
+def fetch(url):
+    page_data = requests.get(url)
+    return page_data.text
+
+
+if __name__ == '__main__':
+    crawl()
diff --git a/index.py b/index.py
new file mode 100644
index 0000000..5e5fdea
--- /dev/null
+++ b/index.py
@@ -0,0 +1,45 @@
+"""
+Create a search index
+"""
+import gzip
+from glob import glob
+
+import justext
+from spacy.lang.en import English
+
+from paths import CRAWL_GLOB
+
+
+def is_content_token(nlp, token):
+    lexeme = nlp.vocab[token.orth]
+    return lexeme.is_alpha and not token.is_stop
+
+
+def tokenize(nlp, cleaned_text):
+    tokens = nlp.tokenizer(cleaned_text)
+    content_tokens = [token for token in tokens if is_content_token(nlp, token)]
+    lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
+    return lowered
+
+
+def clean(content):
+    text = justext.justext(content, justext.get_stoplist("English"))
+    pars = [par.text for par in text if not par.is_boilerplate]
+    cleaned_text = ' '.join(pars)
+    return cleaned_text
+
+
+def run():
+    nlp = English()
+    for path in glob(CRAWL_GLOB):
+        with gzip.open(path) as html_file:
+            content = html_file.read().decode("utf8")
+        cleaned_text = clean(content)
+        tokens = tokenize(nlp, cleaned_text)
+        print("Tokens", tokens)
+        # Just inspect the first crawled page for now
+        break
+
+
+if __name__ == '__main__':
+    run()
diff --git a/paths.py b/paths.py
new file mode 100644
index 0000000..1d6b65f
--- /dev/null
+++ b/paths.py
@@ -0,0 +1,7 @@
+import os
+
+HOME = os.getenv('HOME')
+DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
+HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
+CRAWL_PREFIX = 'crawl_'
+CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
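
crawl.py reads its URL list from hn-top.csv, but the patch does not show how that file is produced. The sketch below is one hypothetical way to generate it, assuming the public Hacker News Firebase API; the helper name fetch_top_story_urls and the limit parameter are illustrative, not part of the patch. All crawl.py actually requires is a CSV with a 'url' column at HN_TOP_PATH.

# Hypothetical helper, not part of the patch: build hn-top.csv for crawl.py.
# Assumes the public Hacker News Firebase API.
import os

import pandas as pd
import requests

from paths import DATA_DIR, HN_TOP_PATH

HN_API = "https://hacker-news.firebaseio.com/v0"


def fetch_top_story_urls(limit=100):
    # topstories.json returns up to 500 story IDs, best-ranked first
    story_ids = requests.get(f"{HN_API}/topstories.json").json()[:limit]
    urls = []
    for story_id in story_ids:
        item = requests.get(f"{HN_API}/item/{story_id}.json").json()
        # Ask HN and job posts may have no external URL; skip those
        if item and item.get('url'):
            urls.append(item['url'])
    return urls


if __name__ == '__main__':
    os.makedirs(DATA_DIR, exist_ok=True)  # crawl.py assumes this directory exists
    # crawl.py only reads the 'url' column, so that is all we write
    pd.DataFrame({'url': fetch_top_story_urls()}).to_csv(HN_TOP_PATH, index=False)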
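
The crawler names each stored page after the md5 hex digest of its URL, which keeps filenames filesystem-safe and makes the "already crawled" check in crawl() a plain isfile test. The same scheme can be reused to look up the cached copy of any URL; cached_html below is a hypothetical helper illustrating that, not something the patch defines.

# Hypothetical helper mirroring crawl.py's naming scheme (not in the patch)
import gzip
import hashlib
import os

from paths import DATA_DIR, CRAWL_PREFIX


def cached_html(url):
    # Same md5-of-URL filename that crawl() writes to
    filename = hashlib.md5(url.encode('utf8')).hexdigest()
    path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
    if not os.path.isfile(path):
        return None
    with gzip.open(path) as cached:
        return cached.read().decode('utf8')

Note that the digest is one-way: recovering the original URL from a stored file would require recording the URL alongside the page, which this commit does not yet do.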
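
index.py stops at printing the token set for the first crawled page: clean() strips boilerplate with jusText, and tokenize() keeps lowercased alphabetic non-stopword tokens via spaCy. A natural next step toward the "search index" the docstring promises is an inverted index from each token to the pages containing it. The sketch below shows that shape, reusing clean() and tokenize() from the patch; build_index and search are illustrative names, and the conjunctive all-terms-must-match query model is one choice among several.

# Sketch of a possible next step, not part of the patch: an in-memory
# inverted index over the crawled pages, reusing clean() and tokenize().
import gzip
from collections import defaultdict
from glob import glob

from spacy.lang.en import English

from index import clean, tokenize
from paths import CRAWL_GLOB


def build_index():
    nlp = English()
    index = defaultdict(set)
    for path in glob(CRAWL_GLOB):
        with gzip.open(path) as html_file:
            content = html_file.read().decode("utf8")
        for token in tokenize(nlp, clean(content)):
            index[token].add(path)
    return index


def search(index, query_tokens):
    # A page matches only if it contains every query token
    postings = [index.get(token, set()) for token in query_tokens]
    return set.intersection(*postings) if postings else set()

With this, search(build_index(), {'python', 'search'}) would return the paths of cached pages containing both terms.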