Create index

Daoud Clarke 2021-03-13 22:21:50 +00:00
parent b1bfe1cdd4
commit 9815372297
3 changed files with 90 additions and 11 deletions

@@ -4,10 +4,11 @@ Crawl the web
 import gzip
 import hashlib
 import os
+import sys
+from traceback import print_tb, print_exc
 import pandas as pd
 import requests
-import justext
 from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
@@ -16,15 +17,23 @@ def crawl():
     data = pd.read_csv(HN_TOP_PATH)
     for url in data['url']:
-        print("Fetching", url)
-        html = fetch(url)
         filename = hashlib.md5(url.encode('utf8')).hexdigest()
         path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
         if os.path.isfile(path):
-            print("Path already exists, skipping")
+            print("Path already exists, skipping", url)
             continue
-        with gzip.open(path, 'w') as output:
-            output.write(html.encode('utf8'))
+
+        print("Fetching", url)
+        try:
+            html = fetch(url)
+        except Exception:
+            print_exc(file=sys.stderr)
+            print("Unable to fetch", url)
+            continue
+
+        with gzip.open(path, 'wt') as output:
+            output.write(url + '\n')
+            output.write(html)


 def fetch(url):
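The hunk stops at the signature of fetch(), whose body is untouched by this commit and not shown here. For orientation only, a minimal sketch of what a fetch helper built on the requests import above could look like; the timeout and error handling are assumptions, not the repository's code:

import requests


def fetch(url):
    # Hypothetical sketch: a plain GET returning the page body as text,
    # so crawl() can write it straight into the gzipped file.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return response.text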

@@ -2,12 +2,16 @@
 Create a search index
 """
 import gzip
+import sqlite3
 from glob import glob

+import bs4
 import justext
 from spacy.lang.en import English

-from paths import CRAWL_GLOB
+from paths import CRAWL_GLOB, INDEX_PATH

+NUM_INITIAL_TOKENS = 50


 def is_content_token(nlp, token):
@@ -17,7 +21,8 @@ def is_content_token(nlp, token):
 def tokenize(nlp, cleaned_text):
     tokens = nlp.tokenizer(cleaned_text)
-    content_tokens = [token for token in tokens if is_content_token(nlp, token)]
+    content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
+                      if is_content_token(nlp, token)]
     lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
     return lowered
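With this change only the first NUM_INITIAL_TOKENS (50) tokens of the cleaned text contribute terms to the index. A quick check of the new behaviour, assuming the module above is importable and using a made-up sentence:

from spacy.lang.en import English

# English() gives a lightweight tokenizer-only pipeline; tokenize() is the
# function changed above, returning lowercased content tokens taken from
# the first 50 tokens of the text.
nlp = English()
print(tokenize(nlp, "A tiny search engine indexes only the start of each page"))
# expected: a set like {'tiny', 'search', 'engine', ...}; exact contents
# depend on how is_content_token filters stop words and punctuation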
@@ -29,15 +34,79 @@ def clean(content):
     return cleaned_text


+def index(tokens, url, title):
+    with sqlite3.connect(INDEX_PATH) as con:
+        con.execute("""
+            INSERT INTO pages (url, title)
+            VALUES (?, ?)
+        """, (url, title))
+
+        result = con.execute("""
+            SELECT last_insert_rowid()
+        """)
+        page_id = result.fetchone()[0]
+        print("Created page with id", page_id)
+
+        con.executemany("""
+            INSERT INTO terms (term, page_id)
+            VALUES (?, ?)
+        """, [(term, page_id) for term in tokens])
+
+
+def create_if_not_exists():
+    con = sqlite3.connect(INDEX_PATH)
+    con.execute("""
+        CREATE TABLE IF NOT EXISTS pages (
+            id INTEGER PRIMARY KEY,
+            url TEXT UNIQUE,
+            title TEXT
+        )
+    """)
+
+    con.execute("""
+        CREATE TABLE IF NOT EXISTS terms (
+            term TEXT,
+            page_id INTEGER
+        )
+    """)
+
+    con.execute("""
+        CREATE INDEX IF NOT EXISTS term_index ON terms (term)
+    """)
+
+
+def page_indexed(url):
+    con = sqlite3.connect(INDEX_PATH)
+    result = con.execute("""
+        SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
+    """, (url,))
+    value = result.fetchone()[0]
+    return value == 1
+
+
 def run():
+    create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
-        with gzip.open(path) as html_file:
-            content = html_file.read().decode("utf8")
+        print("Path", path)
+        with gzip.open(path, 'rt') as html_file:
+            url = html_file.readline().strip()
+            content = html_file.read()
+
+        if page_indexed(url):
+            print("Page exists, skipping", url)
+            continue
+
         cleaned_text = clean(content)
+
+        try:
+            title = bs4.BeautifulSoup(content, features="lxml").find('title').string
+        except AttributeError:
+            title = cleaned_text[:80]
+
         tokens = tokenize(nlp, cleaned_text)
+        print("URL", url)
         print("Tokens", tokens)
-        break
+        print("Title", title)
+        index(tokens, url, title)


 if __name__ == '__main__':
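Nothing in this commit reads the index back yet. As a sanity check of the schema created above, a lookup helper could look like the following sketch; the function name, the lack of ranking, and the example term are assumptions, not part of the commit:

import sqlite3

from paths import INDEX_PATH


def search(term):
    # Hypothetical helper: find pages whose indexed terms contain `term`.
    # The WHERE clause can use the term_index created in create_if_not_exists().
    con = sqlite3.connect(INDEX_PATH)
    results = con.execute("""
        SELECT pages.url, pages.title
        FROM terms
        JOIN pages ON pages.id = terms.page_id
        WHERE terms.term = ?
    """, (term.lower(),))  # terms are stored lowercased by tokenize()
    return results.fetchall()


if __name__ == '__main__':
    for url, title in search("python"):
        print(url, title)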

@@ -5,3 +5,4 @@ DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
 HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
+INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')