|
@@ -0,0 +1,73 @@
|
|
|
|
+"""
|
|
|
|
+Retrieve titles for each domain in the list of top domains
|
|
|
|
+"""
|
|
|
|
+import csv
|
|
|
|
+import gzip
|
|
|
|
+import pickle
|
|
|
|
+from urllib.parse import urlsplit, urlunsplit
|
|
|
|
+
|
|
|
|
+import bs4
|
|
|
|
+import requests
|
|
|
|
+from persistqueue import SQLiteQueue, SQLiteAckQueue
|
|
|
|
+
|
|
|
|
+from paths import DOMAINS_QUEUE_PATH, DOMAINS_TITLES_QUEUE_PATH
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def get_redirect_no_cookies(url, max_redirects=5, timeout=10):
    """Fetch *url* without replaying cookies, following redirects by hand.

    Redirects are followed manually (``allow_redirects=False``) so that
    cookies set along the redirect chain are never sent back.

    Args:
        url: Absolute URL to fetch.
        max_redirects: Remaining redirects allowed before giving up.
        timeout: Per-request timeout in seconds. Without one, a single
            unresponsive server would hang the crawl indefinitely.

    Returns:
        The final (non-3xx) ``requests.Response``.

    Raises:
        RecursionError: If the redirect chain exceeds ``max_redirects``.
        requests.RequestException: On network-level failures (including
            ``requests.Timeout`` once the timeout elapses).
    """
    if max_redirects == 0:
        raise RecursionError("Too many redirects")
    # NOTE(review): verify=False disables TLS certificate validation.
    # Presumably deliberate for a broad domain crawl where many sites have
    # broken certificates — confirm this is intended.
    result = requests.get(url, allow_redirects=False, verify=False,
                          timeout=timeout)
    if result.status_code // 100 == 3:
        location = result.headers['Location']
        if not location.startswith('http'):
            # Relative redirect target: resolve it against the scheme and
            # host of the URL we just fetched.
            parsed_url = urlsplit(url)
            location = urlunsplit(parsed_url[:2] + (location, '', ''))

        return get_redirect_no_cookies(location,
                                       max_redirects=max_redirects - 1,
                                       timeout=timeout)
    return result
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def get_domain_titles():
    """Worker loop: pull domains off the pending queue, look up each
    one's page title, and push the result onto the titles queue.

    Runs forever. An item is acknowledged only after its result has been
    enqueued, so work lost to a crash is re-delivered on restart.
    """
    pending = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
    completed = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH)
    while True:
        entry = pending.get()
        print("Item", entry)
        rank, domain = entry
        print("Domain", domain, rank, type(domain), type(rank))
        status, title, url = retrieve_title(domain)
        print("Title", type(title))
        record = {
            'rank': rank,
            'domain': domain,
            'status': status,
            'url': url,
            'title': title,
        }
        print("Title item", str(record))
        print("Dump", pickle.dumps(record))
        completed.put(record)
        # Ack only once the result is safely in the output queue.
        pending.ack(entry)
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def retrieve_title(domain):
    """Fetch the HTML ``<title>`` of *domain* over HTTPS.

    Args:
        domain: Bare domain name, e.g. ``"example.com"``.

    Returns:
        Tuple ``(status, title, url)`` where
          status: final HTTP status code, or ``None`` if retrieval failed;
          title: the page title, the domain itself when the page has no
              ``<title>`` tag, or ``None`` when status is not 200;
          url: final URL after redirects, or ``None`` on failure.
    """
    original_url = f"https://{domain}"
    try:
        result = get_redirect_no_cookies(original_url)
        status = result.status_code
        url = result.url
    # Fix: also catch requests.RequestException (DNS failure, refused
    # connection, TLS error, timeout, ...). Previously only RecursionError
    # was handled, so any network error escaped and killed the worker loop
    # in get_domain_titles with the queue item left un-acked.
    except (RecursionError, requests.RequestException) as e:
        print("Error retrieving URL", str(e))
        status = None
        url = None

    print("Status", status)
    if status != 200:
        title = None
    else:
        title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
        # Fall back to the domain name when the page lacks a <title> tag.
        title = str(title_tag.string) if title_tag is not None else domain
    print("Title", domain, title)
    return status, title, url
|
|
|
|
+
|
|
|
|
+
|
|
|
|
def run():
    """Entry point: process the domain queue until interrupted."""
    get_domain_titles()
|
|
|
|
+
|
|
|
|
+
|
|
|
|
# Allow running this module directly as a standalone worker script.
if __name__ == '__main__':
    run()
|