浏览代码

Delete crawl-formSubmit.php

Zepher Ashe 2 年之前
父节点
当前提交
e03c08cd74
共有 1 个文件被更改,包括 0 次插入 和 232 次删除
  1. 0 232
      crawl-formSubmit.php

+ 0 - 232
crawl-formSubmit.php

@@ -1,232 +0,0 @@
-<?php
-include("config.php");
-include("classes/DomDocumentParser.php");
-
-// Crawl state shared with the functions below through `global`.
-$alreadyCrawled = [];     // URLs followLinks() has already passed to getDetails()
-$crawling = [];           // URLs queued by followLinks() to be followed next
-$alreadyFoundImages = []; // image URLs getDetails() has already processed
-
-
-/**
- * Checks whether a URL is already stored in the `sites` table.
- *
- * Uses the global `$con` connection; it is created outside this file
- * (presumably a PDO handle from config.php -- confirm).
- *
- * @param string $url URL to look up.
- * @return bool true when at least one row has this URL.
- */
-function linkExists($url)
-{
-	global $con;
-
-	// COUNT(*) always returns exactly one row holding the number of matches.
-	// This avoids rowCount(), whose result for SELECT statements is not
-	// guaranteed by PDO across database drivers.
-	$query = $con->prepare("SELECT COUNT(*) FROM sites WHERE url = :url");
-
-	$query->bindValue(":url", $url);
-	$query->execute();
-
-	// fetchColumn() yields false when nothing could be fetched; that casts to 0.
-	return (int) $query->fetchColumn() > 0;
-}
-
-/**
- * Checks whether an image URL is already stored in the `images` table.
- *
- * Uses the global `$con` connection; it is created outside this file
- * (presumably a PDO handle from config.php -- confirm).
- *
- * @param string $src Image URL to look up (compared against `imageUrl`).
- * @return bool true when at least one row has this image URL.
- */
-function imageExists($src)
-{
-	global $con;
-
-	// COUNT(*) always returns exactly one row holding the number of matches.
-	// This avoids rowCount(), whose result for SELECT statements is not
-	// guaranteed by PDO across database drivers.
-	$query = $con->prepare("SELECT COUNT(*) FROM images WHERE imageUrl = :src");
-
-	$query->bindValue(":src", $src);
-	$query->execute();
-
-	// fetchColumn() yields false when nothing could be fetched; that casts to 0.
-	return (int) $query->fetchColumn() > 0;
-}
-
-
-/**
- * Stores one crawled page in the `sites` table.
- *
- * @param string $url         Page URL.
- * @param string $title       Page title.
- * @param string $description Page description.
- * @param string $keywords    Page keywords.
- * @return bool Result of executing the INSERT statement.
- */
-function insertLink($url, $title, $description, $keywords)
-{
-	global $con;
-
-	$statement = $con->prepare("INSERT INTO sites(url, title, description, keywords)
-							VALUES(:url, :title, :description, :keywords)");
-
-	// Placeholder => value, bound one by one below.
-	$values = array(
-		":url" => $url,
-		":title" => $title,
-		":description" => $description,
-		":keywords" => $keywords,
-	);
-
-	foreach($values as $placeholder => $value)
-		$statement->bindValue($placeholder, $value);
-
-	return $statement->execute();
-}
-
-/**
- * Stores one image in the `images` table.
- *
- * @param string $url   URL of the page the image was found on (column `siteUrl`).
- * @param string $src   URL of the image itself (column `imageUrl`).
- * @param string $alt   Value of the image's alt attribute.
- * @param string $title Value of the image's title attribute.
- * @return bool Result of executing the INSERT statement.
- */
-function insertImage($url, $src, $alt, $title)
-{
-	global $con;
-
-	$statement = $con->prepare("INSERT INTO images(siteUrl, imageUrl, alt, title)
-							VALUES(:siteUrl, :imageUrl, :alt, :title)");
-
-	// Values handed to execute() are bound as strings, the same type
-	// bindParam() uses by default.
-	return $statement->execute(array(
-		":siteUrl" => $url,
-		":imageUrl" => $src,
-		":alt" => $alt,
-		":title" => $title,
-	));
-}
-
-/**
- * Converts a link found on a page into an absolute URL.
- *
- * Handles protocol-relative ("//host/x"), root-relative ("/x") and
- * path-relative ("x", "./x", "../x") links. Path-relative links are resolved
- * against the directory of the page they were found on, and "." / ".."
- * segments are collapsed. The port of the base URL is kept.
- *
- * @param string $src Link as written in the page (href/src attribute value).
- * @param string $url Absolute URL of the page the link was found on.
- * @return string Absolute URL. $src is returned unchanged when it already
- *                carries a scheme (http:, https:, mailto:, data:, ...) or
- *                when $url has no scheme/host to resolve against.
- */
-function createLink($src, $url)
-{
-	$base = parse_url($url);
-
-	// Without a scheme and a host there is nothing to resolve against.
-	if(!is_array($base) || !isset($base["scheme"], $base["host"]))
-		return $src;
-
-	$scheme = $base["scheme"]; // http
-	$root = $scheme . "://" . $base["host"]; // http://www.safesploit.com
-	if(isset($base["port"]))
-		$root .= ":" . $base["port"];
-
-	// Already absolute. Matching a full "scheme:" prefix (instead of the
-	// first four characters) keeps mailto:/tel:/data: links intact and stops
-	// relative names such as "httpdocs/a.png" from being taken as absolute.
-	if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $src))
-		return $src;
-
-	// Protocol-relative link: reuse the scheme of the page.
-	if(substr($src, 0, 2) == "//")
-		return $scheme . ":" . $src;
-
-	// Root-relative link.
-	if(substr($src, 0, 1) == "/")
-		return $root . $src;
-
-	$basePath = isset($base["path"]) ? $base["path"] : "/";
-
-	// Split off "?query" / "#fragment" so dots inside them are never treated
-	// as path segments.
-	$cut = strcspn($src, "?#");
-	$relativePath = (string) substr($src, 0, $cut);
-	$suffix = (string) substr($src, $cut);
-
-	if($relativePath === "")
-	{
-		// Empty, query-only or fragment-only link: it points at the page itself.
-		if(substr($suffix, 0, 1) != "?" && isset($base["query"]))
-			$suffix = "?" . $base["query"] . $suffix;
-
-		return $root . $basePath . $suffix;
-	}
-
-	// Directory of the page: everything up to and including the last "/".
-	$directory = substr($basePath, 0, strrpos($basePath, "/") + 1);
-
-	$segments = explode("/", ltrim($directory . $relativePath, "/"));
-	$last = end($segments);
-
-	$resolved = array();
-	foreach($segments as $segment)
-	{
-		if($segment === ".")
-			continue;
-
-		if($segment === "..")
-		{
-			// Going above the root is ignored, as browsers do.
-			array_pop($resolved);
-			continue;
-		}
-
-		$resolved[] = $segment;
-	}
-
-	$path = "/" . implode("/", $resolved);
-
-	// A link ending in "." or ".." names a directory, so keep the trailing slash.
-	if(($last === "." || $last === "..") && substr($path, -1) != "/")
-		$path .= "/";
-
-	return $root . $path . $suffix;
-}
-
-/**
- * Indexes one page: stores its title, description and keywords in `sites`
- * and its images in `images`, echoing a progress line for every step.
- *
- * Pages without a non-empty <title> are skipped entirely. Images that have
- * neither an alt nor a title attribute are skipped, and each image URL is
- * handled at most once per request (tracked in the global
- * $alreadyFoundImages).
- *
- * Everything echoed here originates from the crawled page, so it is
- * HTML-escaped before being printed.
- *
- * @param string $url Absolute URL of the page to index.
- * @return void
- */
-function getDetails($url)
-{
-	global $alreadyFoundImages;
-
-	// Escapes crawled text for safe output inside HTML text and attributes.
-	$esc = function ($text) {
-		return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8");
-	};
-
-	$parser = new DomDocumentParser($url);
-
-	$titleArray = $parser->getTitleTags();
-
-	if(sizeof($titleArray) == 0 || $titleArray->item(0) == NULL)
-		return;
-
-	//Replace linebreak
-	$title = $titleArray->item(0)->nodeValue;
-	$title = str_replace("\n", "", $title);
-
-	//Return if no <title>
-	if($title == "")
-		return;
-
-	$description = "";
-	$keywords = "";
-
-	$metasArray = $parser->getMetatags();
-
-	foreach($metasArray as $meta)
-	{
-		if($meta->getAttribute("name") == "description")
-			$description = $meta->getAttribute("content");
-
-		if($meta->getAttribute("name") == "keywords")
-			$keywords = $meta->getAttribute("content");
-	}
-
-	$description = str_replace("\n", "", $description);
-	$keywords = str_replace("\n", "", $keywords);
-
-	if(linkExists($url))
-		echo $esc($url) . " already exists<br>";
-	else if(insertLink($url, $title, $description, $keywords))
-		echo "SUCCESS: " . $esc($url) . "<br>";
-	else
-		echo "ERROR: Failed to insert " . $esc($url) . "<br>";
-
-	// Defaults keep the image debugging line below well-defined when the
-	// page has no <img> at all.
-	$src = "";
-	$alt = "";
-	$imageTitle = "";
-
-	$imageArray = $parser->getImages();
-	foreach($imageArray as $image)
-	{
-		$src = $image->getAttribute("src");
-		$alt = $image->getAttribute("alt");
-		// Kept separate from $title so the page title is not overwritten.
-		$imageTitle = $image->getAttribute("title");
-
-		if(!$imageTitle && !$alt)
-			continue;
-
-		$src = createLink($src, $url);
-
-		if(!in_array($src, $alreadyFoundImages))
-		{
-			$alreadyFoundImages[] = $src;
-
-			if(imageExists($src))
-				echo $esc($src) . " already exists<br>";
-			else if(insertImage($url, $src, $alt, $imageTitle))
-				echo "SUCCESS: " . $esc($src) . "<br>";
-			else
-				echo "ERROR: Failed to insert " . $esc($src) . "<br>";
-		}
-
-	}
-
-	echo "<b>URL:</b> " . $esc($url) . ", <b>Title:</b> " . $esc($title) . ", <b>Description:</b> " . $esc($description) . ", <b>keywords:</b> " . $esc($keywords) . "<br>"; //DEBUGGING sites
-	// Shows the last <img> seen on the page (empty values when there was none).
-	echo "<b>src:</b> <a href=\"" . $esc($src) . "\">" . $esc($src) . "</a>, <b>alt:</b> " . $esc($alt) . ", <b>title:</b> " . $esc($imageTitle) . ", <b>url:</b> " . $esc($url) . "<br>"; //DEBUGGING images
-}
-
-/**
- * Crawls outward from $url, breadth first.
- *
- * Every link found on a page is made absolute with createLink(). A link seen
- * for the first time is recorded in the global $alreadyCrawled, indexed with
- * getDetails() and appended to the global $crawling queue so that its own
- * links are followed later. The function returns once the queue is empty;
- * there is no page or depth limit.
- *
- * The queue is processed in a loop rather than by recursion. The recursive
- * form discarded the first discovered link without following it and visited
- * queued pages repeatedly.
- *
- * @param string $url Absolute URL of the page to start from.
- * @return void
- */
-function followLinks($url)
-{
-	global $alreadyCrawled;
-	global $crawling;
-
-	// Handle the requested page first, then whatever is queued behind it.
-	array_unshift($crawling, $url);
-
-	while(count($crawling) > 0)
-	{
-		$current = array_shift($crawling);
-
-		$parser = new DomDocumentParser($current);
-
-		$linkList = $parser->getLinks();
-
-		foreach($linkList as $link)
-		{
-			$href = $link->getAttribute("href");
-
-			// Filter hrefs: anchors without a target and links containing a fragment
-			if($href == "" || strpos($href, "#") !== false)
-				continue;
-			// ... and anything with a scheme other than http(s): javascript:, mailto:, tel:, ...
-			else if(preg_match('/^([a-z][a-z0-9+.\-]*):/i', $href, $match) && !in_array(strtolower($match[1]), array("http", "https")))
-				continue;
-
-			$href = createLink($href, $current);
-
-			if(!in_array($href, $alreadyCrawled))
-			{
-				$alreadyCrawled[] = $href;
-				$crawling[] = $href;
-
-				getDetails($href);
-			}
-
-			// The href comes from the crawled page, so escape it before printing.
-			echo (htmlspecialchars($href, ENT_QUOTES, "UTF-8") . "<br>"); //DEBUGGING
-		}
-	}
-}
-?>
-
-<!DOCTYPE html>
-<html>
-<head>
-	<title>doogleBot Crawler</title>
-
-	<link rel="icon" type="image/x-icon" href="assets/images/favicon/favicon.ico">
-	<link rel="shortcut icon" type="image/png" href="assets/images/favicon/favicon-32x32.png">
-	<link rel="apple-touch-icon" href="assets/images/favicon/apple-touch-icon.png">
-	<link rel="android-chrome-icon" type="image/png" href="assets/images/favicon/android-chrome-512x512.png">
-
-	<meta name="description" content="Search the web for sites and images.">
-	<meta name="keywords" content="Search engine, doogle, websites">
-	<meta name="author" content="Zepher Ashe">
-	<meta name="viewport" content="width=device-width, initial-scale=1.0">
-
-	<link rel="stylesheet" type="text/css" href="assets/css/style.css">
-</head>
-<body>
-	<div id="crawl-wrapper">
-		<!-- Posts the start URL back to this same script (crawl-formSubmit.php) as the "url" field; the PHP block at the end of the file reads $_POST['url'] and starts the crawl. -->
-		<form action="crawl-formSubmit.php" method="post" >
-			URL: <input type="text" name="url" required="required" id="crawl-input" value="">
-			<button type="submit">Crawl</button>
-		</form>
-	</div>
-</body>
-</html>
-
-<?php
-// Starts a crawl when the form above was submitted.
-// The URL is user input, so only a well-formed http(s) URL is handed to the
-// crawler. This rejects array input, values without a scheme or host, and
-// other schemes such as file:.
-if (isset($_POST['url']))
-{
-	$startUrl = is_string($_POST['url']) ? trim($_POST['url']) : "";
-	$startScheme = strtolower((string) parse_url($startUrl, PHP_URL_SCHEME));
-
-	if (filter_var($startUrl, FILTER_VALIDATE_URL) !== false && ($startScheme == "http" || $startScheme == "https"))
-		followLinks($startUrl);
-	else
-		echo "ERROR: Please enter a valid http(s) URL<br>";
-}
-?>