Procházet zdrojové kódy

Set multithreading=True (but it doesn't seem to help)

Daoud Clarke před 4 roky
rodič
revize
7b4a3897b5
1 změněný soubor s 25 přidáními a 11 odebráními
  1. 25 11
      domains/domain_titles.py

+ 25 - 11
domains/domain_titles.py

@@ -2,6 +2,8 @@
 Retrieve titles for each domain in the list of top domains
 """
 import pickle
+from multiprocessing import Process
+from time import sleep
 from urllib.parse import urlsplit, urlunsplit
 
 import bs4
@@ -11,10 +13,17 @@ from persistqueue import SQLiteAckQueue
 from paths import DOMAINS_QUEUE_PATH, DOMAINS_TITLES_QUEUE_PATH
 
 
+NUM_PROCESSES = 10
+
+
 def get_redirect_no_cookies(url, max_redirects=5):
     if max_redirects == 0:
         raise RecursionError("Too many redirects")
-    result = requests.get(url, allow_redirects=False, verify=False)
+    try:
+        result = requests.get(url, allow_redirects=False, timeout=10)
+    except requests.exceptions.SSLError as e:
+        print("Unable to get with SSL", e)
+        result = requests.get(url, allow_redirects=False, verify=False, timeout=10)
     if result.status_code // 100 == 3:
         location = result.headers['Location']
         if not location.startswith('http'):
@@ -27,19 +36,20 @@ def get_redirect_no_cookies(url, max_redirects=5):
 
 def get_domain_titles():
     domains_queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
-    titles_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH)
+    titles_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH, multithreading=True)
     while True:
         item = domains_queue.get()
-        print("Item", item)
+        # print("Item", item)
         rank, domain = item
-        print("Domain", domain, rank, type(domain), type(rank))
+        print("Domain", domain, rank)
         status, title, url = retrieve_title(domain)
-        print("Title", type(title))
+        # print("Title", type(title))
         title_item = dict(rank=rank, domain=domain, status=status, url=url, title=title)
-        print("Title item", str(title_item))
-        print("Dump", pickle.dumps(title_item))
+        # print("Title item", str(title_item))
+        # print("Dump", pickle.dumps(title_item))
         titles_queue.put(title_item)
         domains_queue.ack(item)
+        print("Queued", titles_queue.size)
 
 
 def retrieve_title(domain):
@@ -48,23 +58,27 @@ def retrieve_title(domain):
         result = get_redirect_no_cookies(original_url)
         status = result.status_code
         url = result.url
-    except (RecursionError, requests.exceptions.ConnectionError) as e:
+    except (RecursionError, requests.exceptions.ConnectionError,
+            requests.exceptions.ReadTimeout) as e:
         print("Error retrieving URL", str(e))
         status = None
         url = None
 
-    print("Status", status)
+    # print("Status", status)
     if status != 200:
         title = None
     else:
         title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
         title = str(title_tag.string) if title_tag is not None else domain
-        print("Title", domain, title)
+        # print("Title", domain, title)
     return status, title, url
 
 
 def run():
-    get_domain_titles()
+    for i in range(NUM_PROCESSES):
+        process = Process(target=get_domain_titles)
+        process.start()
+        sleep(3)
 
 
 if __name__ == '__main__':