# mwmbl/crawl.py
"""
Crawl the web
"""
import gzip
import hashlib
import os
import sys
from traceback import print_tb, print_exc
import pandas as pd
import requests
from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
def crawl():
    """Fetch each HN top-story URL and store its gzipped HTML under DATA_DIR.

    Reads the URL list from the CSV at HN_TOP_PATH (expects a 'url' column).
    Each page is written to DATA_DIR/<CRAWL_PREFIX><md5-of-url>.html.gz with
    the source URL on the first line followed by the page HTML, so the file
    is self-describing. URLs whose output file already exists are skipped,
    making the crawl resumable; fetch failures are logged to stderr and
    skipped so one bad URL cannot stop the whole run.
    """
    data = pd.read_csv(HN_TOP_PATH)
    for url in data['url']:
        # Hash the URL so the filename is filesystem-safe and fixed-length.
        filename = hashlib.md5(url.encode('utf8')).hexdigest()
        # BUG FIX: the path must include the per-URL hash; previously a
        # constant placeholder meant every URL collided on one file and
        # everything after the first fetch was skipped as "already exists".
        path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
        if os.path.isfile(path):
            print("Path already exists, skipping", url)
            continue

        print("Fetching", url)
        try:
            html = fetch(url)
        except Exception:
            # Best-effort crawl: report the failure and move on.
            print_exc(file=sys.stderr)
            print("Unable to fetch", url)
            continue
        # 'wt' = text mode inside the gzip stream, matching str html/url.
        with gzip.open(path, 'wt') as output:
            output.write(url + '\n')
            output.write(html)
def fetch(url):
    """Download *url* over HTTP and return the response body as text."""
    # 10-second timeout so a single hung server cannot stall the crawl.
    response = requests.get(url, timeout=10)
    return response.text
# Script entry point: run the crawler when executed directly.
if __name__ == '__main__':
    crawl()