Create index

commit 9815372297
parent b1bfe1cdd4

3 changed files with 90 additions and 11 deletions
crawl.py (21 changed lines)

@@ -4,10 +4,11 @@ Crawl the web
 import gzip
 import hashlib
 import os
+import sys
+from traceback import print_tb, print_exc
 
 import pandas as pd
 import requests
-import justext
 
 from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
 
@@ -16,15 +17,23 @@ def crawl():
     data = pd.read_csv(HN_TOP_PATH)
 
     for url in data['url']:
-        print("Fetching", url)
-        html = fetch(url)
         filename = hashlib.md5(url.encode('utf8')).hexdigest()
         path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
         if os.path.isfile(path):
-            print("Path already exists, skipping")
+            print("Path already exists, skipping", url)
+            continue
 
-        with gzip.open(path, 'w') as output:
-            output.write(html.encode('utf8'))
+        print("Fetching", url)
+        try:
+            html = fetch(url)
+        except Exception:
+            print_exc(file=sys.stderr)
+            print("Unable to fetch", url)
+            continue
+
+        with gzip.open(path, 'wt') as output:
+            output.write(url + '\n')
+            output.write(html)
 
 
 def fetch(url):
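For reference, the crawl output format introduced here stores the source URL on the first line of each gzipped file, with the page HTML following it (index.py below reads the files back the same way). A minimal sketch of reading one of these files, assuming the helper name read_crawl_file, which is not part of this commit:

import gzip


def read_crawl_file(path):
    # Hypothetical helper (not in this commit): the first line of a crawl file
    # is the URL written by crawl(), and the remainder is the page HTML.
    with gzip.open(path, 'rt') as crawl_file:
        url = crawl_file.readline().strip()
        html = crawl_file.read()
    return url, html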
index.py (79 changed lines)

@@ -2,12 +2,16 @@
 Create a search index
 """
 import gzip
+import sqlite3
 from glob import glob
 
+import bs4
 import justext
 from spacy.lang.en import English
 
-from paths import CRAWL_GLOB
+from paths import CRAWL_GLOB, INDEX_PATH
+
+NUM_INITIAL_TOKENS = 50
 
 
 def is_content_token(nlp, token):
@@ -17,7 +21,8 @@ def is_content_token(nlp, token):
 
 def tokenize(nlp, cleaned_text):
     tokens = nlp.tokenizer(cleaned_text)
-    content_tokens = [token for token in tokens if is_content_token(nlp, token)]
+    content_tokens = [token for token in tokens[:NUM_INITIAL_TOKENS]
+                      if is_content_token(nlp, token)]
     lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
     return lowered
 
@@ -29,15 +34,79 @@ def clean(content):
     return cleaned_text
 
 
+def index(tokens, url, title):
+    with sqlite3.connect(INDEX_PATH) as con:
+        con.execute("""
+            INSERT INTO pages (url, title)
+            VALUES (?, ?)
+        """, (url, title))
+
+        result = con.execute("""
+            SELECT last_insert_rowid()
+        """)
+        page_id = result.fetchone()[0]
+        print("Created page with id", page_id)
+
+        con.executemany("""
+            INSERT INTO terms (term, page_id)
+            VALUES (?, ?)
+        """, [(term, page_id) for term in tokens])
+
+
+def create_if_not_exists():
+    con = sqlite3.connect(INDEX_PATH)
+    con.execute("""
+        CREATE TABLE IF NOT EXISTS pages (
+            id INTEGER PRIMARY KEY,
+            url TEXT UNIQUE,
+            title TEXT
+        )
+    """)
+
+    con.execute("""
+        CREATE TABLE IF NOT EXISTS terms (
+            term TEXT,
+            page_id INTEGER
+        )
+    """)
+
+    con.execute("""
+        CREATE INDEX IF NOT EXISTS term_index ON terms (term)
+    """)
+
+
+def page_indexed(url):
+    con = sqlite3.connect(INDEX_PATH)
+    result = con.execute("""
+        SELECT EXISTS(SELECT 1 FROM pages WHERE url=?)
+    """, (url,))
+    value = result.fetchone()[0]
+    return value == 1
+
+
 def run():
+    create_if_not_exists()
     nlp = English()
     for path in glob(CRAWL_GLOB):
-        with gzip.open(path) as html_file:
-            content = html_file.read().decode("utf8")
+        print("Path", path)
+        with gzip.open(path, 'rt') as html_file:
+            url = html_file.readline().strip()
+            content = html_file.read()
+
+        if page_indexed(url):
+            print("Page exists, skipping", url)
+            continue
+
         cleaned_text = clean(content)
+        try:
+            title = bs4.BeautifulSoup(content, features="lxml").find('title').string
+        except AttributeError:
+            title = cleaned_text[:80]
         tokens = tokenize(nlp, cleaned_text)
+        print("URL", url)
         print("Tokens", tokens)
-        break
+        print("Title", title)
+        index(tokens, url, title)
 
 
 if __name__ == '__main__':
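This commit only writes the index; no query code is added yet. As a rough sketch of how the terms table and term_index created above could be used to look up pages for a single search term, assuming the function name search_term, which is not part of this commit:

import sqlite3

from paths import INDEX_PATH


def search_term(term):
    # Hypothetical lookup (not in this commit): uses term_index to find pages
    # whose indexed tokens contain the lower-cased term.
    con = sqlite3.connect(INDEX_PATH)
    result = con.execute("""
        SELECT pages.url, pages.title
        FROM terms JOIN pages ON terms.page_id = pages.id
        WHERE terms.term = ?
    """, (term.lower(),))
    return result.fetchall()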
paths.py (1 changed line)

@@ -5,3 +5,4 @@ DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
 HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
 CRAWL_PREFIX = 'crawl_'
 CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
+INDEX_PATH = os.path.join(DATA_DIR, 'index.sqlite3')