|
@@ -2,6 +2,8 @@
|
|
Retrieve titles for each domain in the list of top domains
|
|
Retrieve titles for each domain in the list of top domains
|
|
"""
|
|
"""
|
|
import pickle
|
|
import pickle
|
|
|
|
+from multiprocessing import Process
|
|
|
|
+from time import sleep
|
|
from urllib.parse import urlsplit, urlunsplit
|
|
from urllib.parse import urlsplit, urlunsplit
|
|
|
|
|
|
import bs4
|
|
import bs4
|
|
@@ -11,10 +13,17 @@ from persistqueue import SQLiteAckQueue
|
|
from paths import DOMAINS_QUEUE_PATH, DOMAINS_TITLES_QUEUE_PATH
|
|
from paths import DOMAINS_QUEUE_PATH, DOMAINS_TITLES_QUEUE_PATH
|
|
|
|
|
|
|
|
|
|
|
|
+NUM_PROCESSES = 10
|
|
|
|
+
|
|
|
|
+
|
|
def get_redirect_no_cookies(url, max_redirects=5):
|
|
def get_redirect_no_cookies(url, max_redirects=5):
|
|
if max_redirects == 0:
|
|
if max_redirects == 0:
|
|
raise RecursionError("Too many redirects")
|
|
raise RecursionError("Too many redirects")
|
|
- result = requests.get(url, allow_redirects=False, verify=False)
|
|
|
|
|
|
+ try:
|
|
|
|
+ result = requests.get(url, allow_redirects=False, timeout=10)
|
|
|
|
+ except requests.exceptions.SSLError as e:
|
|
|
|
+ print("Unable to get with SSL", e)
|
|
|
|
+ result = requests.get(url, allow_redirects=False, verify=False, timeout=10)
|
|
if result.status_code // 100 == 3:
|
|
if result.status_code // 100 == 3:
|
|
location = result.headers['Location']
|
|
location = result.headers['Location']
|
|
if not location.startswith('http'):
|
|
if not location.startswith('http'):
|
|
@@ -27,19 +36,20 @@ def get_redirect_no_cookies(url, max_redirects=5):
|
|
|
|
|
|
def get_domain_titles():
    """Consume ranked domains from the domains queue and enqueue their titles.

    Loops forever: each item taken from the queue at DOMAINS_QUEUE_PATH is
    unpacked as (rank, domain) and passed to retrieve_title. The outcome is
    put on the queue at DOMAINS_TITLES_QUEUE_PATH as a dict with the keys
    rank, domain, status, url and title, after which the source item is
    acknowledged.
    """
    source_queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
    output_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH, multithreading=True)
    while True:
        entry = source_queue.get()
        rank, domain = entry
        print("Domain", domain, rank)
        status, title, url = retrieve_title(domain)
        output_queue.put({
            'rank': rank,
            'domain': domain,
            'status': status,
            'url': url,
            'title': title,
        })
        # Acknowledge the source item only after its result has been queued.
        source_queue.ack(entry)
        print("Queued", output_queue.size)
|
|
|
|
|
|
|
|
|
|
def retrieve_title(domain):
|
|
def retrieve_title(domain):
|
|
@@ -48,23 +58,27 @@ def retrieve_title(domain):
|
|
result = get_redirect_no_cookies(original_url)
|
|
result = get_redirect_no_cookies(original_url)
|
|
status = result.status_code
|
|
status = result.status_code
|
|
url = result.url
|
|
url = result.url
|
|
- except (RecursionError, requests.exceptions.ConnectionError) as e:
|
|
|
|
|
|
+ except (RecursionError, requests.exceptions.ConnectionError,
|
|
|
|
+ requests.exceptions.ReadTimeout) as e:
|
|
print("Error retrieving URL", str(e))
|
|
print("Error retrieving URL", str(e))
|
|
status = None
|
|
status = None
|
|
url = None
|
|
url = None
|
|
|
|
|
|
- print("Status", status)
|
|
|
|
|
|
+ # print("Status", status)
|
|
if status != 200:
|
|
if status != 200:
|
|
title = None
|
|
title = None
|
|
else:
|
|
else:
|
|
title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
|
|
title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
|
|
title = str(title_tag.string) if title_tag is not None else domain
|
|
title = str(title_tag.string) if title_tag is not None else domain
|
|
- print("Title", domain, title)
|
|
|
|
|
|
+ # print("Title", domain, title)
|
|
return status, title, url
|
|
return status, title, url
|
|
|
|
|
|
|
|
|
|
def run():
    """Start NUM_PROCESSES worker processes, each running get_domain_titles.

    Sleeps for three seconds after starting each worker, so the workers come
    up one at a time rather than all at once.
    """
    for _ in range(NUM_PROCESSES):
        worker = Process(target=get_domain_titles)
        worker.start()
        sleep(3)
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
if __name__ == '__main__':
|