infinity-decentralized/crawler.py
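"""Crawler for the infinity-decentralized search index.

Pulls uncrawled URLs for a set of accepted sites out of MongoDB, fetches each
page, extracts its outgoing links (honouring robots.txt), analyses new pages
with the url_analysis module, indexes the results into Elasticsearch, and
records crawl state back in MongoDB.
"""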

import os
import re
import sys
from random import choice
from urllib.parse import urlparse

import pymongo
import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
from func_timeout import func_timeout, FunctionTimedOut

import url_analysis
from robots_parser import RobotsParser
# Desktop User-Agent strings; one is chosen at random for each request.
desktop_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
]
def random_headers():
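    """Build request headers with a randomly chosen desktop User-Agent."""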
    return {'User-Agent': choice(desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
def get_links_from_page(url):
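    """Fetch ``url`` and return a de-duplicated list of the links in its <a> tags.

    Relative links are resolved against ``url``; mailto links, page fragments and
    javascript void(0) pseudo-links are skipped. Returns an empty list for image/PDF
    URLs or when the request fails.
    """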
    parsed_main_url = urlparse(url)
    links = []
    # Skip obvious binary/image URLs instead of downloading them.
    if url.endswith(('.jpg', '.jpeg', '.png', '.pdf')):
        return []
    try:
        page = requests.get(url, headers=random_headers(), timeout=30).text
    except Exception:
        return []
    soup = BeautifulSoup(page, 'html.parser')
    for link in soup.find_all('a'):
        try:
            href = link.get('href')
            if href is None:  # <a> tags without an href attribute
                continue
            link_name = str(href)
            if 'mailto:' in link_name:
                continue
            parsed_link = urlparse(link_name)
            if 'void(0)' in parsed_link.path:  # javascript:void(0) pseudo-links
                continue
            if parsed_link.fragment != '':  # Skipping page fragments
                continue
            if parsed_link.netloc != '':
                # Already an absolute link
                links.append(link_name)
            elif parsed_link.path != '':
                if parsed_link.path.startswith('/'):
                    # Root-relative link
                    links.append(parsed_main_url.scheme + '://' + parsed_main_url.netloc + parsed_link.path)
                elif parsed_main_url.path.endswith('/'):
                    # Path-relative link, resolved against the current page's path
                    links.append(parsed_main_url.scheme + '://' + parsed_main_url.netloc + parsed_main_url.path + parsed_link.path)
                else:
                    links.append(parsed_main_url.scheme + '://' + parsed_main_url.netloc + parsed_main_url.path + '/' + parsed_link.path)
        except Exception:
            continue
    return list(set(links))
def shorten_link(url):
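    """Strip a leading 'http://', 'https://' or '//' so URLs can be stored and compared consistently."""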
    if url.startswith('//'):
        return url[2:]
    if url.startswith('http://'):
        return url[7:]
    if url.startswith('https://'):
        return url[8:]
    return url
def crawler(index, accepted_sites, bad_strings_list=[]):
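    """Crawl the accepted sites in cycles of up to 20 uncrawled pages per site.

    ``index`` names both the MongoDB collection and the Elasticsearch index.
    Links containing any entry of ``bad_strings_list`` are skipped. Assumes
    MongoDB and Elasticsearch are reachable with their default local settings.
    The loop exits once a full cycle finds no uncrawled pages.
    """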
    client = pymongo.MongoClient()
    db = client['infinity_decentralized']
    col = db[index]
    es = Elasticsearch()
    # Drop empty entries (e.g. from a trailing newline in the site list).
    accepted_sites = [site for site in accepted_sites if site != '']
    # Seed the collection with the accepted sites if they aren't already there.
    for site in accepted_sites:
        if col.find_one({'url': shorten_link(site)}) is None:
            col.insert_one({'url': shorten_link(site), 'passed': False})
    while True:
        count = 0
        print('New cycle starting')
        for site_to_crawl in accepted_sites:
            print(shorten_link(site_to_crawl))
            # Up to 20 not-yet-crawled pages per site per cycle; re.escape keeps
            # dots in the domain from acting as regex wildcards.
            uncrawled_webpages = col.find(
                {'url': re.compile('^' + re.escape(shorten_link(site_to_crawl))), 'passed': False}).limit(20)
            for webpage in uncrawled_webpages:
                count += 1
                shortened_webpage_url = shorten_link(webpage['url'])
                full_webpage_url = 'https://' + shortened_webpage_url
                print('Count:', count)
                print('Webpage:', full_webpage_url)
                try:
                    # Hard 10-second cap on fetching the page and extracting links.
                    links = func_timeout(10, get_links_from_page, args=(full_webpage_url,))
                except FunctionTimedOut:
                    print('Error getting links (took too long)')
                    continue
                except Exception:
                    print('Error getting links')
                    continue
                print('Links:', links)
                robot_parser = RobotsParser('https://' + urlparse(full_webpage_url).netloc + '/robots.txt')
                for link in links:
                    try:
                        if len(link) > 500:
                            continue
                        parsed = urlparse(link)
                        if parsed.scheme == '':
                            # Protocol-relative links ('//example.com/...') default to https.
                            link = 'https:' + link
                        if robot_parser.can_crawl_path(parsed.path) is False:
                            continue
                    except Exception as e:
                        print(e)
                        continue
                    # Keep only links that belong to an accepted site and contain no banned string.
                    passed = False
                    for accepted_site in accepted_sites:
                        if link.startswith(accepted_site):
                            passed = True
                            break
                    for bad_string in bad_strings_list:
                        if bad_string in link:
                            passed = False
                            break
                    if not passed:
                        continue
                    shortened_link = shorten_link(link)
                    if col.find_one({'url': shortened_link}) is not None:
                        continue
                    # New link: analyse the page before indexing it.
                    url_analysis_result = url_analysis.get_page_information(link)
                    if url_analysis_result[0] is False:
                        # Bad/broken link: record it so it is never analysed again.
                        print('Analysis Failed: ', link)
                        col.update_one({'url': shortened_link},
                                       {'$set': {'uploaded': False, 'good_link': False, 'passed': True}},
                                       upsert=True)
                        continue
                    url_info = dict(url_analysis_result[1])
                    url_info['url'] = shortened_link
                    print('Link Info:', url_info)
                    es_response = es.index(index, url_info)
                    url_info['uploaded'] = False
                    url_info['good_link'] = True
                    url_info['passed'] = False
                    try:
                        if es_response['_shards']['successful'] > 0:
                            url_info['uploaded'] = True
                            print('SUCCESS')
                        else:
                            print('Submission FAILED!')
                    except Exception as e:
                        print(e)
                    col.update_one({'url': shortened_link}, {'$set': url_info}, upsert=True)
                # Mark the page itself as crawled.
                col.update_one({'url': shortened_webpage_url}, {'$set': {'passed': True}}, upsert=True)
        if count == 0:
            print('All of your links have been crawled! Exiting program')
            sys.exit(0)
if __name__ == '__main__':
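    # Record this process's PID in crawler_pid.txt, then crawl the sites
    # listed (one URL per line) in sites_to_crawl.txt.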
    index = 'infinity_decentralized'
    with open('crawler_pid.txt', 'w') as pid_file:
        pid_file.write(str(os.getpid()))
    with open('sites_to_crawl.txt', 'r') as sites_file:
        websites_to_crawl = sites_file.read().split('\n')
    crawler(index=index, accepted_sites=websites_to_crawl, bad_strings_list=[])