Create index

commit 9815372297
parent b1bfe1cdd4

3 changed files with 90 additions and 11 deletions
crawl.py (21 changed lines)

@@ -4,10 +4,11 @@ Crawl the web
 import gzip
 import hashlib
 import os
+import sys
+from traceback import print_tb, print_exc
 
 import pandas as pd
 import requests
-import justext
 
 from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
 
@@ -16,15 +17,23 @@ def crawl():
     data = pd.read_csv(HN_TOP_PATH)
 
     for url in data['url']:
-        print("Fetching", url)
-        html = fetch(url)
         filename = hashlib.md5(url.encode('utf8')).hexdigest()
         path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
         if os.path.isfile(path):
-            print("Path already exists, skipping")
+            print("Path already exists, skipping", url)
+            continue
 
-        with gzip.open(path, 'w') as output:
-            output.write(html.encode('utf8'))
+        print("Fetching", url)
+        try:
+            html = fetch(url)
+        except Exception:
+            print_exc(file=sys.stderr)
+            print("Unable to fetch", url)
+            continue
+
+        with gzip.open(path, 'wt') as output:
+            output.write(url + '\n')
+            output.write(html)
 
 
 def fetch(url):
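For reference, the crawl output format introduced here stores the source URL on the first line of each gzipped file, with the page HTML following it (index.py below reads the files back the same way). A minimal sketch of reading one of these files, assuming the helper name read_crawl_file, which is not part of this commit:

import gzip


def read_crawl_file(path):
    # Hypothetical helper (not in this commit): the first line of a crawl file
    # is the URL written by crawl(), and the remainder is the page HTML.
    with gzip.open(path, 'rt') as crawl_file:
        url = crawl_file.readline().strip()
        html = crawl_file.read()
    return url, html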
index.py (79 changed lines)

@@ -2,12 +2,16 @@
 Create a search index
 """
 import gzip
+import sqlite3
 from glob import glob
 
+import bs4
 import justext
 from spacy.lang.en import English
 
-from paths import CRAWL_GLOB
+from paths import CRAWL_GLOB, INDEX_PATH
+
+NUM_INITIAL_TOKENS = 50
 
 
 def is_content_token(nlp, token):
@@ -17,7 +21,8 @@ def is_content_token(nlp, token):
 
 def tokenize(nlp, cleaned_text):
     tokens = nlp.tokenizer(cleaned_text)
-    content_tokens = [token for token in tokens if is_content_token(nlp, token)]
+    content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
+                      if is_content_token(nlp, token)]
     lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
     return lowered
 
@@ -29,15 +34,79 @@ def clean(content):
     return cleaned_text
 
 
+def index(tokens, url, title):
+    with sqlite3.connect(INDEX_PATH) as con:
+        con.execute("""
+            INSERT INTO pages (url, title)
+            VALUES (?, ?)
+        """, (url, title))
+
+        result = con.execute("""
+            SELECT last_insert_rowid()
+        """)
+        page_id = result.fetchone()[0]
+        print("Created page with id", page_id)
+
+        con.executemany("""
+            INSERT INTO terms (term, page_id)
+            VALUES (?, ?)
+        """, [(term, page_id) for term in tokens])
+
+
+def create_if_not_exists():
+    con = sqlite3.connect(INDEX_PATH)
+    con.execute("""
+        CREATE TABLE IF NOT EXISTS pages (
+            id INTEGER PRIMARY KEY,
+            url TEXT UNIQUE,
+            title TEXT
+        )
+    """)
+
+    con.execute("""
+        CREATE TABLE IF NOT EXISTS terms (
+            term TEXT,
+            page_id INTEGER
+        )
+    """)
+
+    con.execute("""
+        CREATE INDEX IF NOT EXISTS term_index ON terms (term)
+    """)
+
+
+def page_indexed(url):
+    con = sqlite3.connect(INDEX_PATH)
+    result = con.execute("""
+        SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
+    """, (url,))
+    value = result.fetchone()[0]
+    return value == 1
+
+
 def run():
+    create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
-        with gzip.open(path) as html_file:
-            content = html_file.read().decode("utf8")
+        print("Path", path)
+        with gzip.open(path, 'rt') as html_file:
+            url = html_file.readline().strip()
+            content = html_file.read()
+
+        if page_indexed(url):
+            print("Page exists, skipping", url)
+            continue
+
         cleaned_text = clean(content)
+        try:
+            title = bs4.BeautifulSoup(content, features="lxml").find('title').string
+        except AttributeError:
+            title = cleaned_text[:80]
         tokens = tokenize(nlp, cleaned_text)
+        print("URL", url)
         print("Tokens", tokens)
-        break
+        print("Title", title)
+        index(tokens, url, title)
 
 
 if __name__ == '__main__':
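This commit only writes the index; no query code is added yet. As a rough sketch of how the terms table and term_index created above could be used to look up pages for a single search term, assuming the function name search_term, which is not part of this commit:

import sqlite3

from paths import INDEX_PATH


def search_term(term):
    # Hypothetical lookup (not in this commit): uses term_index to find pages
    # whose indexed tokens contain the lower-cased term.
    con = sqlite3.connect(INDEX_PATH)
    result = con.execute("""
        SELECT pages.url, pages.title
        FROM terms JOIN pages ON terms.page_id = pages.id
        WHERE terms.term = ?
    """, (term.lower(),))
    return result.fetchall()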
paths.py (1 changed line)

@@ -5,3 +5,4 @@ DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
 HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
+INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')