Use queues
parent e76ce691d0
commit 61ce4bb832
5 changed files with 107 additions and 71 deletions
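This commit replaces the single CSV-driven script with a small two-stage pipeline: queue_domains.py reads the top-domains CSV and pushes (rank, domain) pairs onto a persistqueue SQLiteAckQueue, and domain_titles.py pops those pairs, fetches each domain's title, and pushes the result onto a second queue. For readers unfamiliar with persistqueue, here is a minimal sketch of the put/get/ack round trip the new scripts rely on; the temporary queue directory is only for illustration, the real scripts use the paths defined in paths.py.

import tempfile

from persistqueue import SQLiteAckQueue

# Throwaway queue directory just for this sketch; the real code uses
# DOMAINS_QUEUE_PATH and DOMAINS_TITLES_QUEUE_PATH from paths.py.
queue = SQLiteAckQueue(tempfile.mkdtemp())

queue.put(("1", "example.com"))   # producer side: enqueue a picklable tuple
item = queue.get()                # consumer side: fetch the next item
print(item)                       # ('1', 'example.com')
queue.ack(item)                   # mark the item done so it is not handed out again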
Deleted file (71 lines)

@@ -1,71 +0,0 @@
"""
Retrieve titles for each domain in the list of top domains
"""
import csv
import gzip
import os
from urllib.parse import urlsplit, urlunsplit

import bs4
import requests

from paths import DATA_DIR


DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
TITLES_PATH = os.path.join(DATA_DIR, 'top-domains-titles.sqlite')


def get_redirect_no_cookies(url, max_redirects=5):
    if max_redirects == 0:
        raise RecursionError("Too many redirects")
    result = requests.get(url, allow_redirects=False, verify=False)
    if result.status_code // 100 == 3:
        location = result.headers['Location']
        if not location.startswith('http'):
            parsed_url = urlsplit(url)
            location = urlunsplit(parsed_url[:2] + (location, '', ''))

        return get_redirect_no_cookies(location, max_redirects=max_redirects - 1)
    return result


def get_domain_titles():
    with gzip.open(DOMAINS_PATH, 'rt') as domains_file:
        reader = csv.reader(domains_file)
        next(reader)
        for rank, domain, _ in reader:
            print("Domain", domain)
            original_url = f"https://{domain}"
            try:
                result = get_redirect_no_cookies(original_url)
                status = result.status_code
            except RecursionError as e:
                print("Error retrieving URL", str(e))
                status = None
            print("Status", status)

            if status != 200:
                title = None
            else:
                title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
                title = title_tag.string if title_tag is not None else domain
            print("Title", rank, domain, title)
            yield dict(rank=rank, domain=domain, status=status, url=result.url, title=title)


def save_domain_titles(domain_titles):
    with gzip.open(TITLES_PATH, 'wt') as titles_file:
        writer = csv.DictWriter(titles_file, ['rank', 'domain', 'status', 'url', 'title'])
        writer.writeheader()
        for row in domain_titles:
            writer.writerow(row)
            titles_file.flush()


def run():
    domain_titles = get_domain_titles()
    save_domain_titles(domain_titles)


if __name__ == '__main__':
    run()
domains/__init__.py (Normal file, 0 lines)
domains/domain_titles.py (Normal file, 73 lines)
@@ -0,0 +1,73 @@
"""
Retrieve titles for each domain in the list of top domains
"""
import csv
import gzip
import pickle
from urllib.parse import urlsplit, urlunsplit

import bs4
import requests
from persistqueue import SQLiteQueue, SQLiteAckQueue

from paths import DOMAINS_QUEUE_PATH, DOMAINS_TITLES_QUEUE_PATH


def get_redirect_no_cookies(url, max_redirects=5):
    if max_redirects == 0:
        raise RecursionError("Too many redirects")
    result = requests.get(url, allow_redirects=False, verify=False)
    if result.status_code // 100 == 3:
        location = result.headers['Location']
        if not location.startswith('http'):
            parsed_url = urlsplit(url)
            location = urlunsplit(parsed_url[:2] + (location, '', ''))

        return get_redirect_no_cookies(location, max_redirects=max_redirects - 1)
    return result


def get_domain_titles():
    domains_queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
    titles_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH)
    while True:
        item = domains_queue.get()
        print("Item", item)
        rank, domain = item
        print("Domain", domain, rank, type(domain), type(rank))
        status, title, url = retrieve_title(domain)
        print("Title", type(title))
        title_item = dict(rank=rank, domain=domain, status=status, url=url, title=title)
        print("Title item", str(title_item))
        print("Dump", pickle.dumps(title_item))
        titles_queue.put(title_item)
        domains_queue.ack(item)


def retrieve_title(domain):
    original_url = f"https://{domain}"
    try:
        result = get_redirect_no_cookies(original_url)
        status = result.status_code
        url = result.url
    except RecursionError as e:
        print("Error retrieving URL", str(e))
        status = None
        url = None

    print("Status", status)
    if status != 200:
        title = None
    else:
        title_tag = bs4.BeautifulSoup(result.content, features="lxml").find('title')
        title = str(title_tag.string) if title_tag is not None else domain
    print("Title", domain, title)
    return status, title, url


def run():
    get_domain_titles()


if __name__ == '__main__':
    run()
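In get_domain_titles() above, an item is only acked after its result has been put on the titles queue, so a crash mid-item does not ack the domain. Any exception other than the RecursionError handled inside retrieve_title() will, however, stop the loop and leave that item un-acked. A hedged sketch of a more defensive loop body, reusing domains_queue, titles_queue and retrieve_title() from the file above and assuming persistqueue's nack() returns a fetched item to the queue:

while True:
    item = domains_queue.get()
    try:
        rank, domain = item
        status, title, url = retrieve_title(domain)
        titles_queue.put(dict(rank=rank, domain=domain, status=status, url=url, title=title))
    except Exception as e:
        print("Failed to process item", item, str(e))
        domains_queue.nack(item)   # assumption: nack() makes the item available again
    else:
        domains_queue.ack(item)    # only ack once the result is safely queued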
domains/queue_domains.py (Normal file, 30 lines)
@@ -0,0 +1,30 @@
"""
Add domains to the queue to be retrieved
"""
import csv
import gzip

from persistqueue import SQLiteQueue, SQLiteAckQueue

from paths import DOMAINS_QUEUE_PATH, DOMAINS_PATH


def get_domains():
    reader = csv.reader(gzip.open(DOMAINS_PATH, 'rt'))
    next(reader)
    for rank, domain, _ in reader:
        yield rank, domain


def queue_domains():
    queue = SQLiteAckQueue(DOMAINS_QUEUE_PATH)
    queued = 0
    for rank, domain in get_domains():
        queue.put((rank, domain))
        queued += 1
        if queued % 1000 == 0:
            print("Queued:", queued)


if __name__ == '__main__':
    queue_domains()
paths.py (4 lines changed)
@@ -9,3 +9,7 @@ INDEX_PATH = os.path.join(DATA_DIR, 'index.tinysearch')
TEST_INDEX_PATH = os.path.join(DATA_DIR, 'index-test.tinysearch')
WIKI_DATA_PATH = os.path.join(DATA_DIR, 'enwiki-20210301-pages-articles1.xml-p1p41242.bz2')
WIKI_TITLES_PATH = os.path.join(DATA_DIR, 'abstract-titles-sorted.txt.gz')

DOMAINS_QUEUE_PATH = os.path.join(DATA_DIR, 'domains-queue')
DOMAINS_TITLES_QUEUE_PATH = os.path.join(DATA_DIR, 'domains-title-queue')
DOMAINS_PATH = os.path.join(DATA_DIR, 'top10milliondomains.csv.gz')
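Nothing in this commit consumes DOMAINS_TITLES_QUEUE_PATH yet; the old CSV writer (save_domain_titles) was deleted along with the old script. As a rough sketch, a downstream consumer modelled on the removed writer might look like the following; the output file name and the function itself are hypothetical, not part of this commit.

# Hypothetical consumer of the titles queue, modelled on the removed
# save_domain_titles(); the output path is illustrative only.
import csv
import gzip
import os

from persistqueue import SQLiteAckQueue

from paths import DATA_DIR, DOMAINS_TITLES_QUEUE_PATH

TITLES_CSV_PATH = os.path.join(DATA_DIR, 'top-domains-titles.csv.gz')  # assumed name


def save_domain_titles():
    titles_queue = SQLiteAckQueue(DOMAINS_TITLES_QUEUE_PATH)
    with gzip.open(TITLES_CSV_PATH, 'wt') as titles_file:
        writer = csv.DictWriter(titles_file, ['rank', 'domain', 'status', 'url', 'title'])
        writer.writeheader()
        while True:
            row = titles_queue.get()    # waits for the next title item from the worker
            writer.writerow(row)
            titles_file.flush()
            titles_queue.ack(row)       # ack only after the row has been written


if __name__ == '__main__':
    save_domain_titles()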