"""
|
|
|
|
Crawl the web
|
|
|
|
"""
|
|
|
|
import gzip
import hashlib
import os
import sys
from traceback import print_exc

import pandas as pd
import requests

from paths import DATA_DIR, HN_TOP_PATH, CRAWL_PREFIX
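
# The `paths` module is project-local and not shown in this file. A minimal
# sketch of what it might define, inferred from the import above (the concrete
# values here are illustrative assumptions, not taken from the source):
#
#     DATA_DIR = "data"                # directory where crawl output is stored
#     HN_TOP_PATH = "data/hn_top.csv"  # CSV of top stories with a `url` column
#     CRAWL_PREFIX = "crawl_"          # filename prefix for crawled pages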


def crawl():
    """Fetch every story URL listed in the top-stories CSV and save each page."""
    data = pd.read_csv(HN_TOP_PATH)

    for url in data['url']:
        # Name the output file after the URL's MD5 hash so the path is
        # filesystem-safe and stable across runs.
        filename = hashlib.md5(url.encode('utf8')).hexdigest()
        path = os.path.join(DATA_DIR, f"{CRAWL_PREFIX}{filename}.html.gz")
        if os.path.isfile(path):
            print("Path already exists, skipping", url)
            continue

        print("Fetching", url)
        try:
            html = fetch(url)
        except Exception:
            # Log the traceback but keep crawling the remaining URLs.
            print_exc(file=sys.stderr)
            print("Unable to fetch", url)
            continue

        # Store the source URL on the first line, then the raw HTML,
        # gzip-compressed ('wt' opens the gzip stream in text mode).
        with gzip.open(path, 'wt') as output:
            output.write(url + '\n')
            output.write(html)


def fetch(url):
    """Download a single page and return its HTML as text."""
    page_data = requests.get(url, timeout=10)
    return page_data.text


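# A read-back sketch for the files this script writes: each .html.gz stores
# the source URL on its first line, followed by the raw HTML, so a stored page
# can be recovered like this. The helper name `read_crawled_page` is
# illustrative and not part of the original script.
def read_crawled_page(path):
    with gzip.open(path, 'rt') as f:
        url = f.readline().rstrip('\n')
        html = f.read()
    return url, html

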
if __name__ == '__main__':
    crawl()