Set multithreading=True (but it doesn't seem to help)

This commit is contained in:
Daoud Clarke 2021-05-03 08:37:30 +01:00
parent ba45d950ef
commit 7b4a3897b5

View file

@ -2,6 +2,8 @@
Retrieve titles for each domain in the list of top domains
"""
import pickle
from multiprocessing import Process
from time import sleep
from urllib.parse import urlsplit, urlunsplit
import bs4
@ -11,10 +13,17 @@ from persistqueue import SQLiteAckQueue
from paths import DOMAINS_QUEUE_PATH, DOMAINS_TITLES_QUEUE_PATH
# Number of worker processes that run() starts.
NUM_PROCESSES = 10
def get_redirect_no_cookies(url, max_redirects=5):
if max_redirects == 0:
raise RecursionError("Too many redirects")
result = requests.get(url, allow_redirects=False, verify=False)
try:
result = requests.get(url, allow_redirects=False, timeout=10)
except requests.exceptions.SSLError as e:
print("Unable to get with SSL", e)
result = requests.get(url, allow_redirects=False, verify=False, timeout=10)
if result.status_code // 100 == 3:
location = result.headers['Location']
if not location.startswith('http'):
@ -27,19 +36,20 @@ def get_redirect_no_cookies(url, max_redirects=5):
def get_domain_titles():
    """Consume the domains queue in a loop, queueing one title record per domain.

    Each item taken from the queue at ``DOMAINS_QUEUE_PATH`` is unpacked as a
    ``(rank, domain)`` pair. The domain is handed to ``retrieve_title`` and the
    resulting record (keys ``rank``, ``domain``, ``status``, ``url``,
    ``title``) is put on the queue at ``DOMAINS_TITLES_QUEUE_PATH``. The source
    item is acknowledged after its record has been queued.

    The loop has no exit condition; the function only stops if an exception
    propagates out of it.
    """
    domains_queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
    # Build the titles queue exactly once, with multithreading enabled, instead
    # of opening a default handle first and immediately replacing it.
    titles_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH, multithreading=True)

    while True:
        item = domains_queue.get()
        rank, domain = item
        print("Domain", domain, rank)

        status, title, url = retrieve_title(domain)
        title_item = dict(rank=rank, domain=domain, status=status, url=url, title=title)

        titles_queue.put(title_item)
        # Acknowledge the source item only after its record is on the titles queue.
        domains_queue.ack(item)
        print("Queued", titles_queue.size)
def retrieve_title(domain):
@ -48,23 +58,27 @@ def retrieve_title(domain):
result = get_redirect_no_cookies(original_url)
status = result.status_code
url = result.url
except (RecursionError, requests.exceptions.ConnectionError) as e:
except (RecursionError, requests.exceptions.ConnectionError,
requests.exceptions.ReadTimeout) as e:
print("Error retrieving URL", str(e))
status = None
url = None
print("Status", status)
# print("Status", status)
if status != 200:
title = None
else:
title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
title = str(title_tag.string) if title_tag is not None else domain
print("Title", domain, title)
# print("Title", domain, title)
return status, title, url
def run():
    """Start ``NUM_PROCESSES`` worker processes, each running ``get_domain_titles``.

    ``get_domain_titles`` is not called directly here: it loops without an
    exit condition, so an in-process call would block before any worker
    process was started.
    """
    for _ in range(NUM_PROCESSES):
        process = Process(target=get_domain_titles)
        process.start()
        # Wait three seconds after starting each worker before starting the next.
        sleep(3)
if __name__ == '__main__':