Initial commit
commit b1bfe1cdd4
3 changed files with 87 additions and 0 deletions
36  crawl.py  Normal file
@@ -0,0 +1,36 @@
"""
|
||||
Crawl the web
|
||||
"""
|
||||
import gzip
|
||||
import hashlib
|
||||
import os
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
import justext
|
||||
|
||||
from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
|
||||
|
||||
|
||||
def crawl():
|
||||
data = pd.read_csv(HN_TOP_PATH)
|
||||
|
||||
for url in data['url']:
|
||||
print("Fetching", url)
|
||||
html = fetch(url)
|
||||
filename = hashlib.md5(url.encode('utf8')).hexdigest()
|
||||
path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
|
||||
if os.path.isfile(path):
|
||||
print("Path already exists, skipping")
|
||||
|
||||
with gzip.open(path, 'w') as output:
|
||||
output.write(html.encode('utf8'))
|
||||
|
||||
|
||||
def fetch(url):
|
||||
page_data = requests.get(url)
|
||||
return page_data.text
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
crawl()
|
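A note on the input: crawl() assumes only that the CSV at HN_TOP_PATH has a url column, since data['url'] is the only field it reads. A minimal, purely hypothetical hn-top.csv would therefore look like:

url
https://example.com/
https://example.org/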
44  index.py  Normal file
@@ -0,0 +1,44 @@
"""
|
||||
Create a search index
|
||||
"""
|
||||
import gzip
|
||||
from glob import glob
|
||||
|
||||
import justext
|
||||
from spacy.lang.en import English
|
||||
|
||||
from paths import CRAWL_GLOB
|
||||
|
||||
|
||||
def is_content_token(nlp, token):
|
||||
lexeme = nlp.vocab[token.orth]
|
||||
return lexeme.is_alpha and not token.is_stop
|
||||
|
||||
|
||||
def tokenize(nlp, cleaned_text):
|
||||
tokens = nlp.tokenizer(cleaned_text)
|
||||
content_tokens = [token for token in tokens if is_content_token(nlp, token)]
|
||||
lowered = {nlp.vocab[token.orth].text.lower() for token in content_tokens}
|
||||
return lowered
|
||||
|
||||
|
||||
def clean(content):
|
||||
text = justext.justext(content, justext.get_stoplist("English"))
|
||||
pars = [par.text for par in text if not par.is_boilerplate]
|
||||
cleaned_text = ' '.join(pars)
|
||||
return cleaned_text
|
||||
|
||||
|
||||
def run():
|
||||
nlp = English()
|
||||
for path in glob(CRAWL_GLOB):
|
||||
with gzip.open(path) as html_file:
|
||||
content = html_file.read().decode("utf8")
|
||||
cleaned_text = clean(content)
|
||||
tokens = tokenize(nlp, cleaned_text)
|
||||
print("Tokens", tokens)
|
||||
break
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
run()
|
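run() currently stops after printing the token set of the first crawled page, so no index structure is built in this commit. As a rough sketch of where this seems to be heading, the per-page token sets could be accumulated into an in-memory inverted index. build_index below is a hypothetical helper, not part of the commit; it assumes the glob, gzip, clean, tokenize, and CRAWL_GLOB names from index.py above:

from collections import defaultdict

def build_index(nlp):
    index = defaultdict(set)  # token -> set of crawled-page paths containing it
    for path in glob(CRAWL_GLOB):
        with gzip.open(path) as html_file:
            content = html_file.read().decode("utf8")
        for token in tokenize(nlp, clean(content)):
            index[token].add(path)
    return index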
7  paths.py  Normal file
@@ -0,0 +1,7 @@
import os

HOME = os.getenv('HOME')
DATA_DIR = os.path.join(HOME, 'data', 'tinysearch')
HN_TOP_PATH = os.path.join(DATA_DIR, 'hn-top.csv')
CRAWL_PREFIX = 'crawl_'
CRAWL_GLOB = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}*")
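Reading the three files together, the intended workflow appears to be: create ~/data/tinysearch, place hn-top.csv there (presumably a dump of top Hacker News stories with a url column), run crawl.py to download and gzip each page, then run index.py to clean and tokenize the crawled pages. This reading is inferred from the code above, not stated anywhere in the commit.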