Daoud Clarke 4 years ago
parent
commit
61ce4bb832
5 changed files with 107 additions and 71 deletions
  1. domain_titles.py (+0 -71)
  2. domains/__init__.py (+0 -0)
  3. domains/domain_titles.py (+73 -0)
  4. domains/queue_domains.py (+30 -0)
  5. paths.py (+4 -0)

+ 0 - 71
domain_titles.py

@@ -1,71 +0,0 @@
-"""
-Retrieve titles for each domain in the list of top domains
-"""
-import csv
-import gzip
-import os
-from urllib.parse import urlsplit, urlunsplit
-
-import bs4
-import requests
-
-from paths import DATA_DIR
-
-DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
-TITLES_PATH = os.path.join(DATA_DIR, 'top-domains-titles.sqlite')
-
-
-def get_redirect_no_cookies(url, max_redirects=5):
-    if max_redirects == 0:
-        raise RecursionError("Too many redirects")
-    result = requests.get(url, allow_redirects=False, verify=False)
-    if result.status_code // 100 == 3:
-        location = result.headers['Location']
-        if not location.startswith('http'):
-            parsed_url = urlsplit(url)
-            location = urlunsplit(parsed_url[:2] + (location, '', ''))
-
-        return get_redirect_no_cookies(location, max_redirects=max_redirects - 1)
-    return result
-
-
-def get_domain_titles():
-    with gzip.open(DOMAINS_PATH, 'rt') as domains_file:
-        reader = csv.reader(domains_file)
-        next(reader)
-        for rank, domain, _ in reader:
-            print("Domain", domain)
-            original_url = f"https://{domain}"
-            try:
-                result = get_redirect_no_cookies(original_url)
-                status = result.status_code
-            except RecursionError as e:
-                print("Error retrieving URL", str(e))
-                status = None
-            print("Status", status)
-
-            if status != 200:
-                title = None
-            else:
-                title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
-                title = title_tag.string if title_tag is not None else domain
-                print("Title", rank, domain, title)
-            yield dict(rank=rank, domain=domain, status=status, url=result.url, title=title)
-
-
-def save_domain_titles(domain_titles):
-    with gzip.open(TITLES_PATH, 'wt') as titles_file:
-        writer = csv.DictWriter(titles_file, ['rank', 'domain', 'status', 'url', 'title'])
-        writer.writeheader()
-        for row in domain_titles:
-            writer.writerow(row)
-            titles_file.flush()
-
-
-def run():
-    domain_titles = get_domain_titles()
-    save_domain_titles(domain_titles)
-
-
-if __name__ == '__main__':
-    run()

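The get_redirect_no_cookies helper deleted here reappears unchanged in domains/domain_titles.py below: it follows redirects by hand rather than letting requests do it, so no cookies picked up along the chain are re-sent, and it rebuilds relative Location headers against the original URL. A minimal, self-contained sketch of that resolution step; the URL is a stand-in, and whether a redirect actually occurs depends on the server:

    # Sketch of the relative-Location handling used by get_redirect_no_cookies;
    # the target URL and Location values are stand-ins, not from the commit.
    import requests
    from urllib.parse import urlsplit, urlunsplit

    def resolve_location(url, location):
        # A relative Location (e.g. "/login") has no scheme or host, so it is
        # rebuilt from the original URL's scheme and netloc.
        if location.startswith('http'):
            return location
        return urlunsplit(urlsplit(url)[:2] + (location, '', ''))

    response = requests.get('http://example.com', allow_redirects=False)
    if response.status_code // 100 == 3:
        print(resolve_location('http://example.com', response.headers['Location']))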
+ 0 - 0
domains/__init__.py


+ 73 - 0
domains/domain_titles.py

@@ -0,0 +1,73 @@
+"""
+Retrieve titles for each domain in the list of top domains
+"""
+import csv
+import gzip
+import pickle
+from urllib.parse import urlsplit, urlunsplit
+
+import bs4
+import requests
+from persistqueue import SQLiteQueue, SQLiteAckQueue
+
+from paths import DOMAINS_QUEUE_PATH, DOMAINS_TITLES_QUEUE_PATH
+
+
+def get_redirect_no_cookies(url, max_redirects=5):
+    if max_redirects == 0:
+        raise RecursionError("Too many redirects")
+    result = requests.get(url, allow_redirects=False, verify=False)
+    if result.status_code // 100 == 3:
+        location = result.headers['Location']
+        if not location.startswith('http'):
+            parsed_url = urlsplit(url)
+            location = urlunsplit(parsed_url[:2] + (location, '', ''))
+
+        return get_redirect_no_cookies(location, max_redirects=max_redirects - 1)
+    return result
+
+
+def get_domain_titles():
+    domains_queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
+    titles_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH)
+    while True:
+        item = domains_queue.get()
+        print("Item", item)
+        rank, domain = item
+        print("Domain", domain, rank, type(domain), type(rank))
+        status, title, url = retrieve_title(domain)
+        print("Title", type(title))
+        title_item = dict(rank=rank, domain=domain, status=status, url=url, title=title)
+        print("Title item", str(title_item))
+        print("Dump", pickle.dumps(title_item))
+        titles_queue.put(title_item)
+        domains_queue.ack(item)
+
+
+def retrieve_title(domain):
+    original_url = f"https://{domain}"
+    try:
+        result = get_redirect_no_cookies(original_url)
+        status = result.status_code
+        url = result.url
+    except RecursionError as e:
+        print("Error retrieving URL", str(e))
+        status = None
+        url = None
+
+    print("Status", status)
+    if status != 200:
+        title = None
+    else:
+        title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
+        title = str(title_tag.string) if title_tag is not None else domain
+        print("Title", domain, title)
+    return status, title, url
+
+
+def run():
+    get_domain_titles()
+
+
+if __name__ == '__main__':
+    run()

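The rewritten module swaps the single-pass CSV loop for a worker that drains a persistent queue, acknowledging each item only after its title has been pushed onto the output queue, so a crashed or restarted process resumes where it left off. A sketch of that get/process/ack cycle against a throwaway queue directory; process() is a hypothetical stand-in, and get() blocking on an empty queue is persistqueue's default behaviour:

    # Sketch of the get/process/ack cycle the worker above relies on.
    # The temp directory and process() are stand-ins for illustration.
    import tempfile
    from persistqueue import SQLiteAckQueue

    def process(item):
        return {'seen': item}  # hypothetical work in place of retrieve_title

    queue = SQLiteAckQueue(tempfile.mkdtemp())
    queue.put((1, 'example.com'))

    item = queue.get()      # blocks until an item is available
    try:
        process(item)
        queue.ack(item)     # only now is the item removed for good
    except Exception:
        queue.nack(item)    # return it to the queue so a later run can retry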
+ 30 - 0
domains/queue_domains.py

@@ -0,0 +1,30 @@
+"""
+Add domains to the queue to be retrieved
+"""
+import csv
+import gzip
+
+from persistqueue import SQLiteQueue, SQLiteAckQueue
+
+from paths import DOMAINS_QUEUE_PATH, DOMAINS_PATH
+
+
+def get_domains():
+    reader = csv.reader(gzip.open(DOMAINS_PATH, 'rt'))
+    next(reader)
+    for rank, domain, _ in reader:
+        yield rank, domain
+
+
+def queue_domains():
+    queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
+    queued = 0
+    for rank, domain in get_domains():
+        queue.put((rank, domain))
+        queued += 1
+        if queued % 1000 == 0:
+            print("Queued:", queued)
+
+
+if __name__ == '__main__':
+    queue_domains()

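On the producer side, get_domains is a generator, so the gzipped CSV is streamed row by row and queue.put runs as each row is parsed rather than after loading ten million domains into memory. A self-contained sketch of the same gzip-plus-csv streaming pattern against a throwaway file; the header column names are assumed, mirroring the three-column layout the loop above unpacks:

    # Sketch of the gzip+csv streaming used by get_domains(), run against a
    # throwaway file rather than the real top-10-million-domains list.
    import csv
    import gzip
    import os
    import tempfile

    path = os.path.join(tempfile.mkdtemp(), 'domains.csv.gz')
    with gzip.open(path, 'wt') as f:
        writer = csv.writer(f)
        writer.writerow(['Rank', 'Domain', 'Open Page Rank'])  # assumed header
        writer.writerow(['1', 'example.com', '10.0'])

    with gzip.open(path, 'rt') as f:
        reader = csv.reader(f)
        next(reader)                 # skip the header row, as queue_domains does
        for rank, domain, _ in reader:
            print(rank, domain)      # -> 1 example.com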
+ 4 - 0
paths.py

@@ -9,3 +9,7 @@ INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
 TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
 WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
+
+DOMAINS_QUEUE_PATH = os.path.join(DATA_DIR, 'domains-queue')
+DOMAINS_TITLES_QUEUE_PATH = os.path.join(DATA_DIR, 'domains-title-queue')
+DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
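Note that the two new queue paths name directories, not files: persistqueue creates the directory and the SQLite database inside it on first use, so nothing needs to exist under DATA_DIR beforehand. A quick sketch, with the database filename treated as an internal detail of the library:

    # Check that the queue directory is created on first use; the filename
    # inside it is persistqueue's implementation detail, shown only as "e.g.".
    import os
    from persistqueue import SQLiteAckQueue
    from paths import DOMAINS_QUEUE_PATH

    queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)   # creates the directory if missing
    print(os.listdir(DOMAINS_QUEUE_PATH))        # e.g. ['data.db']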