Daoud Clarke 4 years ago
parent
commit
61ce4bb832
5 changed files with 107 additions and 71 deletions
  1. domain_titles.py (+0 -71)
  2. domains/__init__.py (+0 -0)
  3. domains/domain_titles.py (+73 -0)
  4. domains/queue_domains.py (+30 -0)
  5. paths.py (+4 -0)

+ 0 - 71
domain_titles.py

@@ -1,71 +0,0 @@
-"""
-Retrieve titles for each domain in the list of top domains
-"""
-import csv
-import gzip
-import os
-from urllib.parse import urlsplit, urlunsplit
-
-import bs4
-import requests
-
-from paths import DATA_DIR
-
-DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
-TITLES_PATH = os.path.join(DATA_DIR, 'top-domains-titles.sqlite')
-
-
-def get_redirect_no_cookies(url, max_redirects=5):
-    if max_redirects == 0:
-        raise RecursionError("Too many redirects")
-    result = requests.get(url, allow_redirects=False, verify=False)
-    if result.status_code // 100 == 3:
-        location = result.headers['Location']
-        if not location.startswith('http'):
-            parsed_url = urlsplit(url)
-            location = urlunsplit(parsed_url[:2] + (location, '', ''))
-
-        return get_redirect_no_cookies(location, max_redirects=max_redirects - 1)
-    return result
-
-
-def get_domain_titles():
-    with gzip.open(DOMAINS_PATH, 'rt') as domains_file:
-        reader = csv.reader(domains_file)
-        next(reader)
-        for rank, domain, _ in reader:
-            print("Domain", domain)
-            original_url = f"https://{domain}"
-            try:
-                result = get_redirect_no_cookies(original_url)
-                status = result.status_code
-            except RecursionError as e:
-                print("Error retrieving URL", str(e))
-                status = None
-            print("Status", status)
-
-            if status != 200:
-                title = None
-            else:
-                title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
-                title = title_tag.string if title_tag is not None else domain
-                print("Title", rank, domain, title)
-            yield dict(rank=rank, domain=domain, status=status, url=result.url, title=title)
-
-
-def save_domain_titles(domain_titles):
-    with gzip.open(TITLES_PATH, 'wt') as titles_file:
-        writer = csv.DictWriter(titles_file, ['rank', 'domain', 'status', 'url', 'title'])
-        writer.writeheader()
-        for row in domain_titles:
-            writer.writerow(row)
-            titles_file.flush()
-
-
-def run():
-    domain_titles = get_domain_titles()
-    save_domain_titles(domain_titles)
-
-
-if __name__ == '__main__':
-    run()

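The get_redirect_no_cookies helper deleted here reappears unchanged in domains/domain_titles.py below: it follows redirects by hand rather than letting requests do it, so no cookies picked up along the chain are re-sent, and it rebuilds relative Location headers against the original URL. A minimal, self-contained sketch of that resolution step; the URL is a stand-in, and whether a redirect actually occurs depends on the server:

    # Sketch of the relative-Location handling used by get_redirect_no_cookies;
    # the target URL and Location values are stand-ins, not from the commit.
    import requests
    from urllib.parse import urlsplit, urlunsplit

    def resolve_location(url, location):
        # A relative Location (e.g. "/login") has no scheme or host, so it is
        # rebuilt from the original URL's scheme and netloc.
        if location.startswith('http'):
            return location
        return urlunsplit(urlsplit(url)[:2] + (location, '', ''))

    response = requests.get('http://example.com', allow_redirects=False)
    if response.status_code // 100 == 3:
        print(resolve_location('http://example.com', response.headers['Location']))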
+ 0 - 0
domains/__init__.py


+ 73 - 0
domains/domain_titles.py

@@ -0,0 +1,73 @@
+"""
+Retrieve titles for each domain in the list of top domains
+"""
+import csv
+import gzip
+import pickle
+from urllib.parse import urlsplit, urlunsplit
+
+import bs4
+import requests
+from persistqueue import SQLiteQueue, SQLiteAckQueue
+
+from paths import DOMAINS_QUEUE_PATH, DOMAINS_TITLES_QUEUE_PATH
+
+
+def get_redirect_no_cookies(url, max_redirects=5):
+    if max_redirects == 0:
+        raise RecursionError("Too many redirects")
+    result = requests.get(url, allow_redirects=False, verify=False)
+    if result.status_code // 100 == 3:
+        location = result.headers['Location']
+        if not location.startswith('http'):
+            parsed_url = urlsplit(url)
+            location = urlunsplit(parsed_url[:2] + (location, '', ''))
+
+        return get_redirect_no_cookies(location, max_redirects=max_redirects - 1)
+    return result
+
+
+def get_domain_titles():
+    domains_queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
+    titles_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH)
+    while True:
+        item = domains_queue.get()
+        print("Item", item)
+        rank, domain = item
+        print("Domain", domain, rank, type(domain), type(rank))
+        status, title, url = retrieve_title(domain)
+        print("Title", type(title))
+        title_item = dict(rank=rank, domain=domain, status=status, url=url, title=title)
+        print("Title item", str(title_item))
+        print("Dump", pickle.dumps(title_item))
+        titles_queue.put(title_item)
+        domains_queue.ack(item)
+
+
+def retrieve_title(domain):
+    original_url = f"https://{domain}"
+    try:
+        result = get_redirect_no_cookies(original_url)
+        status = result.status_code
+        url = result.url
+    except RecursionError as e:
+        print("Error retrieving URL", str(e))
+        status = None
+        url = None
+
+    print("Status", status)
+    if status != 200:
+        title = None
+    else:
+        title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
+        title = str(title_tag.string) if title_tag is not None else domain
+        print("Title", domain, title)
+    return status, title, url
+
+
+def run():
+    get_domain_titles()
+
+
+if __name__ == '__main__':
+    run()

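The rewritten module swaps the single-pass CSV loop for a worker that drains a persistent queue, acknowledging each item only after its title has been pushed onto the output queue, so a crashed or restarted process resumes where it left off. A sketch of that get/process/ack cycle against a throwaway queue directory; process() is a hypothetical stand-in, and get() blocking on an empty queue is persistqueue's default behaviour:

    # Sketch of the get/process/ack cycle the worker above relies on.
    # The temp directory and process() are stand-ins for illustration.
    import tempfile
    from persistqueue import SQLiteAckQueue

    def process(item):
        return {'seen': item}  # hypothetical work in place of retrieve_title

    queue = SQLiteAckQueue(tempfile.mkdtemp())
    queue.put((1, 'example.com'))

    item = queue.get()      # blocks until an item is available
    try:
        process(item)
        queue.ack(item)     # only now is the item removed for good
    except Exception:
        queue.nack(item)    # return it to the queue so a later run can retry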
+ 30 - 0
domains/queue_domains.py

@@ -0,0 +1,30 @@
+"""
+Add domains to the queue to be retrieved
+"""
+import csv
+import gzip
+
+from persistqueue import SQLiteQueue, SQLiteAckQueue
+
+from paths import DOMAINS_QUEUE_PATH, DOMAINS_PATH
+
+
+def get_domains():
+    reader = csv.reader(gzip.open(DOMAINS_PATH, 'rt'))
+    next(reader)
+    for rank, domain, _ in reader:
+        yield rank, domain
+
+
+def queue_domains():
+    queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
+    queued = 0
+    for rank, domain in get_domains():
+        queue.put((rank, domain))
+        queued += 1
+        if queued % 1000 == 0:
+            print("Queued:", queued)
+
+
+if __name__ == '__main__':
+    queue_domains()

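On the producer side, get_domains is a generator, so the gzipped CSV is streamed row by row and queue.put runs as each row is parsed rather than after loading ten million domains into memory. A self-contained sketch of the same gzip-plus-csv streaming pattern against a throwaway file; the header column names are assumed, mirroring the three-column layout the loop above unpacks:

    # Sketch of the gzip+csv streaming used by get_domains(), run against a
    # throwaway file rather than the real top-10-million-domains list.
    import csv
    import gzip
    import os
    import tempfile

    path = os.path.join(tempfile.mkdtemp(), 'domains.csv.gz')
    with gzip.open(path, 'wt') as f:
        writer = csv.writer(f)
        writer.writerow(['Rank', 'Domain', 'Open Page Rank'])  # assumed header
        writer.writerow(['1', 'example.com', '10.0'])

    with gzip.open(path, 'rt') as f:
        reader = csv.reader(f)
        next(reader)                 # skip the header row, as queue_domains does
        for rank, domain, _ in reader:
            print(rank, domain)      # -> 1 example.com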
+ 4 - 0
paths.py

@@ -9,3 +9,7 @@ INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
 TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
 WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
+
+DOMAINS_QUEUE_PATH = os.path.join(DATA_DIR, 'domains-queue')
+DOMAINS_TITLES_QUEUE_PATH = os.path.join(DATA_DIR, 'domains-title-queue')
+DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
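Note that the two new queue paths name directories, not files: persistqueue creates the directory and the SQLite database inside it on first use, so nothing needs to exist under DATA_DIR beforehand. A quick sketch, with the database filename treated as an internal detail of the library:

    # Check that the queue directory is created on first use; the filename
    # inside it is persistqueue's implementation detail, shown only as "e.g.".
    import os
    from persistqueue import SQLiteAckQueue
    from paths import DOMAINS_QUEUE_PATH

    queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)   # creates the directory if missing
    print(os.listdir(DOMAINS_QUEUE_PATH))        # e.g. ['data.db']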