diff --git a/domain_titles.py b/domain_titles.py
deleted file mode 100644
index 3ba4f9e..0000000
--- a/domain_titles.py
+++ /dev/null
@@ -1,71 +0,0 @@
-"""
-Retrieve titles for each domain in the list of top domains
-"""
-import csv
-import gzip
-import os
-from urllib.parse import urlsplit, urlunsplit
-
-import bs4
-import requests
-
-from paths import DATA_DIR
-
-DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
-TITLES_PATH = os.path.join(DATA_DIR, 'top-domains-titles.sqlite')
-
-
-def get_redirect_no_cookies(url, max_redirects=5):
-    if max_redirects == 0:
-        raise RecursionError("Too many redirects")
-    result = requests.get(url, allow_redirects=False, verify=False)
-    if result.status_code // 100 == 3:
-        location = result.headers['Location']
-        if not location.startswith('http'):
-            parsed_url = urlsplit(url)
-            location = urlunsplit(parsed_url[:2] + (location, '', ''))
-
-        return get_redirect_no_cookies(location, max_redirects=max_redirects - 1)
-    return result
-
-
-def get_domain_titles():
-    with gzip.open(DOMAINS_PATH, 'rt') as domains_file:
-        reader = csv.reader(domains_file)
-        next(reader)
-        for rank, domain, _ in reader:
-            print("Domain", domain)
-            original_url = f"https://{domain}"
-            try:
-                result = get_redirect_no_cookies(original_url)
-                status = result.status_code
-            except RecursionError as e:
-                print("Error retrieving URL", str(e))
-                status = None
-            print("Status", status)
-
-            if status != 200:
-                title = None
-            else:
-                title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
-                title = title_tag.string if title_tag is not None else domain
-            print("Title", rank, domain, title)
-            yield dict(rank=rank, domain=domain, status=status, url=result.url, title=title)
-
-
-def save_domain_titles(domain_titles):
-    with gzip.open(TITLES_PATH, 'wt') as titles_file:
-        writer = csv.DictWriter(titles_file, ['rank', 'domain', 'status', 'url', 'title'])
-        writer.writeheader()
-        for row in domain_titles:
-            writer.writerow(row)
-            titles_file.flush()
-
-
-def run():
-    domain_titles = get_domain_titles()
-    save_domain_titles(domain_titles)
-
-
-if __name__ == '__main__':
-    run()
diff --git a/domains/__init__.py b/domains/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/domains/domain_titles.py b/domains/domain_titles.py
new file mode 100644
index 0000000..29a866a
--- /dev/null
+++ b/domains/domain_titles.py
@@ -0,0 +1,73 @@
+"""
+Retrieve titles for each domain in the list of top domains
+"""
+import csv
+import gzip
+import pickle
+from urllib.parse import urlsplit, urlunsplit
+
+import bs4
+import requests
+from persistqueue import SQLiteQueue, SQLiteAckQueue
+
+from paths import DOMAINS_QUEUE_PATH, DOMAINS_TITLES_QUEUE_PATH
+
+
+def get_redirect_no_cookies(url, max_redirects=5):
+    if max_redirects == 0:
+        raise RecursionError("Too many redirects")
+    result = requests.get(url, allow_redirects=False, verify=False)
+    if result.status_code // 100 == 3:
+        location = result.headers['Location']
+        if not location.startswith('http'):
+            parsed_url = urlsplit(url)
+            location = urlunsplit(parsed_url[:2] + (location, '', ''))
+
+        return get_redirect_no_cookies(location, max_redirects=max_redirects - 1)
+    return result
+
+
+def get_domain_titles():
+    domains_queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
+    titles_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH)
+    while True:
+        item = domains_queue.get()
+        print("Item", item)
+        rank, domain = item
+        print("Domain", domain, rank, type(domain), type(rank))
+        status, title, url = retrieve_title(domain)
+        print("Title", type(title))
+        title_item = dict(rank=rank, domain=domain, status=status, url=url, title=title)
+        print("Title item", str(title_item))
+        print("Dump", pickle.dumps(title_item))
+        titles_queue.put(title_item)
+        domains_queue.ack(item)
+
+
+def retrieve_title(domain):
+    original_url = f"https://{domain}"
+    try:
+        result = get_redirect_no_cookies(original_url)
+        status = result.status_code
+        url = result.url
+    except RecursionError as e:
+        print("Error retrieving URL", str(e))
+        status = None
+        url = None
+
+    print("Status", status)
+    if status != 200:
+        title = None
+    else:
+        title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
+        title = str(title_tag.string) if title_tag is not None else domain
+    print("Title", domain, title)
+    return status, title, url
+
+
+def run():
+    get_domain_titles()
+
+
+if __name__ == '__main__':
+    run()
diff --git a/domains/queue_domains.py b/domains/queue_domains.py
new file mode 100644
index 0000000..ac627fb
--- /dev/null
+++ b/domains/queue_domains.py
@@ -0,0 +1,30 @@
+"""
+Add domains to the queue to be retrieved
+"""
+import csv
+import gzip
+
+from persistqueue import SQLiteQueue, SQLiteAckQueue
+
+from paths import DOMAINS_QUEUE_PATH, DOMAINS_PATH
+
+
+def get_domains():
+    reader = csv.reader(gzip.open(DOMAINS_PATH, 'rt'))
+    next(reader)
+    for rank, domain, _ in reader:
+        yield rank, domain
+
+
+def queue_domains():
+    queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
+    queued = 0
+    for rank, domain in get_domains():
+        queue.put((rank, domain))
+        queued += 1
+        if queued % 1000 == 0:
+            print("Queued:", queued)
+
+
+if __name__ == '__main__':
+    queue_domains()
diff --git a/paths.py b/paths.py
index 813ef5e..4c19dcc 100644
--- a/paths.py
+++ b/paths.py
@@ -9,3 +9,7 @@ INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
 TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
 WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
 WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')
+
+DOMAINS_QUEUE_PATH = os.path.join(DATA_DIR, 'domains-queue')
+DOMAINS_TITLES_QUEUE_PATH = os.path.join(DATA_DIR, 'domains-title-queue')
+DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')