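"""Crawler for the infinity_decentralized index.

Summary (derived from the code below): the crawler repeatedly pulls uncrawled
pages from MongoDB, extracts their links with BeautifulSoup, filters them
against an accepted-sites list, robots.txt and a substring blacklist, analyses
new pages via the project-local url_analysis module, and indexes the results
in Elasticsearch.
"""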
import os
import re
from random import choice
from urllib.parse import urlparse

import pymongo
import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
from func_timeout import func_timeout, FunctionTimedOut

import url_analysis
from robots_parser import RobotsParser

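# Interfaces assumed for the two project-local modules imported above, inferred
# from how they are used further down in this file (check the modules themselves
# for the authoritative definitions):
#   url_analysis.get_page_information(url) -> (ok, info)
#       ok:   bool, False for bad/broken links
#       info: dict of page fields to index (exact keys depend on url_analysis)
#   RobotsParser(robots_txt_url).can_crawl_path(path) -> bool
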
# Desktop user-agent strings; one is picked at random for every request.
desktop_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0']

def random_headers():
    """Build request headers with a randomly chosen desktop user agent."""
    return {'User-Agent': choice(desktop_agents),
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}

def get_links_from_page(url):
    """Fetch a page and return the de-duplicated list of links found on it."""
    # Skip obvious binary/image URLs without downloading them.
    if url.endswith(('.jpg', '.jpeg', '.png', '.pdf')):
        return []

    parsed_main_url = urlparse(url)
    links = []

    try:
        page = requests.get(url, headers=random_headers(), timeout=30).text
    except Exception:
        return []

    soup = BeautifulSoup(page, 'html.parser')
    for link in soup.find_all('a'):
        try:
            link_name = link.get('href')

            if link_name is None or 'mailto:' in link_name:
                continue

            parsed_link = urlparse(link_name)

            if 'void(0)' in parsed_link.path:  # javascript:void(0) pseudo-links
                continue

            if parsed_link.fragment != '':  # Skipping page fragments
                continue

            if parsed_link.netloc != '':
                # Absolute link (possibly scheme-relative, e.g. '//host/path').
                links.append(link_name)
            elif parsed_link.path != '':
                # Relative link: rebuild an absolute URL from the page's scheme, host and path.
                if parsed_link.path.startswith('/'):
                    links.append(parsed_main_url.scheme + '://' + parsed_main_url.netloc + parsed_link.path)
                elif parsed_main_url.path.endswith('/'):
                    links.append(parsed_main_url.scheme + '://' + parsed_main_url.netloc
                                 + parsed_main_url.path + parsed_link.path)
                else:
                    links.append(parsed_main_url.scheme + '://' + parsed_main_url.netloc
                                 + parsed_main_url.path + '/' + parsed_link.path)

        except Exception:
            continue

    return list(set(links))

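# Note: the relative-link handling above reproduces the original manual string
# concatenation. urllib.parse.urljoin covers the same cases (plus things like
# '../' segments); a minimal sketch, should one ever want to swap it in:
#
#     from urllib.parse import urljoin
#     absolute_link = urljoin(url, link_name)
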
def shorten_link(url):
    """Strip the scheme ('//', 'http://' or 'https://') so URLs can be stored and compared consistently."""
    if url.startswith('//'):
        return url[2:]

    if url.startswith('http://'):
        return url[7:]

    if url.startswith('https://'):
        return url[8:]

    return url

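# Shape of the documents kept in the MongoDB collection, as inferred from the
# update calls inside crawler() below (not a formally defined schema):
#   url        str   URL with the scheme stripped (see shorten_link)
#   passed     bool  True once the page's own links have been extracted
#   good_link  bool  False if url_analysis flagged the page as bad/broken
#   uploaded   bool  True once the page was successfully indexed in Elasticsearch
#   ...plus whatever fields url_analysis.get_page_information returns.
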
def crawler(index, accepted_sites, bad_strings_list=[]):
    client = pymongo.MongoClient()
    db = client['infinity_decentralized']
    col = db[index]

    es = Elasticsearch()

    # Drop empty entries (e.g. from a trailing newline in the sites file).
    accepted_sites = [site for site in accepted_sites if site != '']

    # Add the accepted sites to the DB if they aren't already there.
    for site in accepted_sites:
        if col.find_one({'url': shorten_link(site)}) is None:
            col.insert_one({'url': shorten_link(site), 'passed': False})

    while True:
        count = 0
        print('New cycle starting')
        for site_to_crawl in accepted_sites:
            print(shorten_link(site_to_crawl))
            # Pull a small batch of pages under this site that haven't been crawled yet.
            # re.escape keeps dots in the URL from acting as regex wildcards.
            uncrawled_webpages = col.find(
                {'url': re.compile('^' + re.escape(shorten_link(site_to_crawl))), 'passed': False}).limit(20)
            for webpage in uncrawled_webpages:
                count += 1

                shortened_webpage_url = shorten_link(webpage['url'])
                full_webpage_url = 'https://' + shortened_webpage_url
                print('Count:', count)
                print('Webpage:', full_webpage_url)
                try:
                    # Cap link extraction for a single page at 10 seconds.
                    links = func_timeout(10, get_links_from_page, args=(full_webpage_url,))
                except FunctionTimedOut:
                    print('Error getting links (took too long)')
                    continue
                except Exception:
                    print('Error getting links')
                    continue

                print('Links:', links)

                robot_parser = RobotsParser('https://' + urlparse(full_webpage_url).netloc + '/robots.txt')
                for link in links:
                    try:
                        # Ignore absurdly long URLs.
                        if len(link) > 500:
                            continue

                        parsed = urlparse(link)

                        # Give scheme-relative links ('//host/path') an explicit scheme.
                        if parsed.scheme == '':
                            link = 'https:' + link

                        # Respect the site's robots.txt rules for this path.
                        if robot_parser.can_crawl_path(parsed.path) is False:
                            continue

                    except Exception as e:
                        print(e)
                        continue

                    # Only keep links that stay inside one of the accepted sites
                    # and that don't contain any blacklisted substring.
                    passed = False
                    for accepted_site in accepted_sites:
                        if link.startswith(accepted_site):
                            passed = True
                            break

                    for bad_string in bad_strings_list:
                        if bad_string in link:
                            passed = False
                            break

                    if passed is False:
                        continue

                    shortened_link = shorten_link(link)
                    # Skip links that are already recorded.
                    if col.find_one({'url': shortened_link}) is not None:
                        continue

                    # New link: fetch and analyse the target page.
                    url_analysis_result = url_analysis.get_page_information(link)

                    # If it is a bad/broken link, record it so it isn't retried.
                    if url_analysis_result[0] is False:
                        print('Analysis Failed: ', link)
                        col.update_one({'url': shortened_link},
                                       {'$set': {'uploaded': False, 'good_link': False, 'passed': True}},
                                       upsert=True)
                        continue

                    url_info = dict(url_analysis_result[1])
                    url_info['url'] = shortened_link

                    print('Link Info:', url_info)

                    # Push the page information to Elasticsearch.
                    es_response = es.index(index, url_info)

                    url_info['uploaded'] = False
                    url_info['good_link'] = True
                    url_info['passed'] = False

                    try:
                        if es_response['_shards']['successful'] > 0:
                            url_info['uploaded'] = True
                            print('SUCCESS')
                        else:
                            print('Submission FAILED!')
                    except Exception as e:
                        print(e)

                    # Upsert the link's record in MongoDB.
                    col.update_one({'url': shortened_link}, {'$set': url_info}, upsert=True)

                # Mark the page whose links we just processed as crawled.
                col.update_one({'url': shortened_webpage_url}, {'$set': {'passed': True}}, upsert=True)

        # No uncrawled pages were found this cycle: everything has been processed.
        if count == 0:
            print('All of your links have been crawled! Exiting program')
            exit(0)


if __name__ == '__main__':
    index = 'infinity_decentralized'

    # Record this process's PID so the crawler can be stopped externally.
    with open('crawler_pid.txt', 'w') as pid_file:
        pid_file.write(str(os.getpid()))

    # sites_to_crawl.txt is expected to contain one site URL per line.
    with open('sites_to_crawl.txt', 'r') as sites_file:
        websites_to_crawl = sites_file.read().split('\n')

    crawler(index=index, accepted_sites=websites_to_crawl, bad_strings_list=[])