diff --git a/domains/domain_titles.py b/domains/domain_titles.py index 09fd8be..13c9291 100644 --- a/domains/domain_titles.py +++ b/domains/domain_titles.py @@ -2,6 +2,8 @@ Retrieve titles for each domain in the list of top domains """ import pickle +from multiprocessing import Process +from time import sleep from urllib.parse import urlsplit, urlunsplit import bs4 @@ -11,10 +13,17 @@ from persistqueue import SQLiteAckQueue from paths import DOMAINS_QUEUE_PATH, DOMAINS_TITLES_QUEUE_PATH +NUM_PROCESSES = 10 + + def get_redirect_no_cookies(url, max_redirects=5): if max_redirects == 0: raise RecursionError("Too many redirects") - result = requests.get(url, allow_redirects=False, verify=False) + try: + result = requests.get(url, allow_redirects=False, timeout=10) + except requests.exceptions.SSLError as e: + print("Unable to get with SSL", e) + result = requests.get(url, allow_redirects=False, verify=False, timeout=10) if result.status_code // 100 == 3: location = result.headers['Location'] if not location.startswith('http'): @@ -27,19 +36,20 @@ def get_redirect_no_cookies(url, max_redirects=5): def get_domain_titles(): domains_queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH) - titles_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH) + titles_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH, multithreading=True) while True: item = domains_queue.get() - print("Item", item) + # print("Item", item) rank, domain = item - print("Domain", domain, rank, type(domain), type(rank)) + print("Domain", domain, rank) status, title, url = retrieve_title(domain) - print("Title", type(title)) + # print("Title", type(title)) title_item = dict(rank=rank, domain=domain, status=status, url=url, title=title) - print("Title item", str(title_item)) - print("Dump", pickle.dumps(title_item)) + # print("Title item", str(title_item)) + # print("Dump", pickle.dumps(title_item)) titles_queue.put(title_item) domains_queue.ack(item) + print("Queued", titles_queue.size) def retrieve_title(domain): @@ -48,23 +58,27 @@ def retrieve_title(domain): result = get_redirect_no_cookies(original_url) status = result.status_code url = result.url - except (RecursionError, requests.exceptions.ConnectionError) as e: + except (RecursionError, requests.exceptions.ConnectionError, + requests.exceptions.ReadTimeout) as e: print("Error retrieving URL", str(e)) status = None url = None - print("Status", status) + # print("Status", status) if status != 200: title = None else: title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title') title = str(title_tag.string) if title_tag is not None else domain - print("Title", domain, title) + # print("Title", domain, title) return status, title, url def run(): - get_domain_titles() + for i in range(NUM_PROCESSES): + process = Process(target=get_domain_titles) + process.start() + sleep(3) if __name__ == '__main__':