Use queues

Daoud Clarke 2021-04-25 08:55:15 +01:00
parent e76ce691d0
commit 61ce4bb832
5 changed files with 107 additions and 71 deletions

@@ -1,71 +0,0 @@
"""
Retrieve titles for each domain in the list of top domains
"""
import csv
import gzip
import os
from urllib.parse import urlsplit, urlunsplit
import bs4
import requests
from paths import DATA_DIR
DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
TITLES_PATH = os.path.join(DATA_DIR, 'top-domains-titles.sqlite')
def get_redirect_no_cookies(url, max_redirects=5):
if max_redirects == 0:
raise RecursionError("Too many redirects")
result = requests.get(url, allow_redirects=False, verify=False)
if result.status_code // 100 == 3:
location = result.headers['Location']
if not location.startswith('http'):
parsed_url = urlsplit(url)
location = urlunsplit(parsed_url[:2] + (location, '', ''))
return get_redirect_no_cookies(location, max_redirects=max_redirects - 1)
return result
def get_domain_titles():
with gzip.open(DOMAINS_PATH, 'rt') as domains_file:
reader = csv.reader(domains_file)
next(reader)
for rank, domain, _ in reader:
print("Domain", domain)
original_url = f"https://{domain}"
try:
result = get_redirect_no_cookies(original_url)
status = result.status_code
except RecursionError as e:
print("Error retrieving URL", str(e))
status = None
print("Status", status)
if status != 200:
title = None
else:
title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
title = title_tag.string if title_tag is not None else domain
print("Title", rank, domain, title)
yield dict(rank=rank, domain=domain, status=status, url=result.url, title=title)
def save_domain_titles(domain_titles):
with gzip.open(TITLES_PATH, 'wt') as titles_file:
writer = csv.DictWriter(titles_file, ['rank', 'domain', 'status', 'url', 'title'])
writer.writeheader()
for row in domain_titles:
writer.writerow(row)
titles_file.flush()
def run():
domain_titles = get_domain_titles()
save_domain_titles(domain_titles)
if __name__ == '__main__':
run()

domains/__init__.py  Normal file (0 additions)

domains/domain_titles.py  Normal file (73 additions)

@@ -0,0 +1,73 @@
"""
Retrieve titles for each domain in the list of top domains
"""
import csv
import gzip
import pickle
from urllib.parse import urlsplit, urlunsplit
import bs4
import requests
from persistqueue import SQLiteQueue, SQLiteAckQueue
from paths import DOMAINS_QUEUE_PATH, DOMAINS_TITLES_QUEUE_PATH
def get_redirect_no_cookies(url, max_redirects=5):
if max_redirects == 0:
raise RecursionError("Too many redirects")
result = requests.get(url, allow_redirects=False, verify=False)
if result.status_code // 100 == 3:
location = result.headers['Location']
if not location.startswith('http'):
parsed_url = urlsplit(url)
location = urlunsplit(parsed_url[:2] + (location, '', ''))
return get_redirect_no_cookies(location, max_redirects=max_redirects - 1)
return result
def get_domain_titles():
domains_queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
titles_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH)
while True:
item = domains_queue.get()
print("Item", item)
rank, domain = item
print("Domain", domain, rank, type(domain), type(rank))
status, title, url = retrieve_title(domain)
print("Title", type(title))
title_item = dict(rank=rank, domain=domain, status=status, url=url, title=title)
print("Title item", str(title_item))
print("Dump", pickle.dumps(title_item))
titles_queue.put(title_item)
domains_queue.ack(item)
def retrieve_title(domain):
original_url = f"https://{domain}"
try:
result = get_redirect_no_cookies(original_url)
status = result.status_code
url = result.url
except RecursionError as e:
print("Error retrieving URL", str(e))
status = None
url = None
print("Status", status)
if status != 200:
title = None
else:
title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
title = str(title_tag.string) if title_tag is not None else domain
print("Title", domain, title)
return status, title, url
def run():
get_domain_titles()
if __name__ == '__main__':
run()
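
Nothing in this commit consumes the titles queue yet: the worker above only fills DOMAINS_TITLES_QUEUE_PATH. A minimal consumer sketch under that assumption, reusing the same SQLiteAckQueue get/ack pattern and writing the same CSV columns the deleted save_domain_titles used, might look like the following. TITLES_CSV_PATH is a hypothetical output path, not defined in paths.py.

# Illustrative sketch, not part of this commit: drain the titles queue into a CSV file,
# mirroring the deleted save_domain_titles. TITLES_CSV_PATH is a hypothetical path.
import csv
import gzip

from persistqueue import SQLiteAckQueue

from paths import DOMAINS_TITLES_QUEUE_PATH

TITLES_CSV_PATH = 'top-domains-titles.csv.gz'  # hypothetical output location


def drain_titles():
    titles_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH)
    with gzip.open(TITLES_CSV_PATH, 'wt') as titles_file:
        writer = csv.DictWriter(titles_file, ['rank', 'domain', 'status', 'url', 'title'])
        writer.writeheader()
        while True:
            item = titles_queue.get()   # take the next title dict off the queue
            writer.writerow(item)
            titles_file.flush()
            titles_queue.ack(item)      # acknowledge only once the row is safely written


if __name__ == '__main__':
    drain_titles()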

domains/queue_domains.py  Normal file (30 additions)

@@ -0,0 +1,30 @@
"""
Add domains to the queue to be retrieved
"""
import csv
import gzip
from persistqueue import SQLiteQueue, SQLiteAckQueue
from paths import DOMAINS_QUEUE_PATH, DOMAINS_PATH
def get_domains():
reader = csv.reader(gzip.open(DOMAINS_PATH, 'rt'))
next(reader)
for rank, domain, _ in reader:
yield rank, domain
def queue_domains():
queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
queued = 0
for rank, domain in get_domains():
queue.put((rank, domain))
queued += 1
if queued % 1000 == 0:
print("Queued:", queued)
if __name__ == '__main__':
queue_domains()
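
Splitting the pipeline into a producer (queue_domains.py) and a consumer (domain_titles.py) lets the slow title fetching be parallelised and resumed: items taken from the SQLiteAckQueue with get() stay in its backing store until ack() is called, so work that was in flight when a worker died is not silently lost. Below is a hedged sketch of launching several workers with the standard library; it assumes the repository root is on the import path so that domains.domain_titles is importable, and whether several processes can safely share one SQLite-backed queue depends on SQLite locking behaviour.

# Illustrative sketch, not part of this commit: start several title workers in parallel.
# NUM_WORKERS is a hypothetical setting; domains.domain_titles.run loops forever.
from multiprocessing import Process

from domains.domain_titles import run

NUM_WORKERS = 4  # hypothetical number of parallel workers


if __name__ == '__main__':
    workers = [Process(target=run) for _ in range(NUM_WORKERS)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()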

paths.py

@@ -9,3 +9,7 @@ INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
DOMAINS_QUEUE_PATH = os.path.join(DATA_DIR, 'domains-queue')
DOMAINS_TITLES_QUEUE_PATH = os.path.join(DATA_DIR, 'domains-title-queue')
DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')