123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186 |
- <?php
- include("includes/config.php");
- include("includes/DomDocumentParser.php");
// Crawl state shared by the functions below through `global`.
$alreadyCrawled = [];      // URLs that have already been handed to getDetails()
$crawling = [];            // URLs queued to have their own links followed
$alreadyFoundImages = [];  // resolved image URLs that have already been stored
/**
 * Reports whether a page URL is already stored in the `sites` table.
 *
 * Uses the global $db connection, which is not defined in this file
 * (presumably a PDO instance created by includes/config.php -- confirm).
 *
 * @param string $url Absolute URL of the page to look up.
 * @return bool True when at least one row has exactly this URL.
 */
function linkExists($url) {
    global $db;

    // PDOStatement::rowCount() is only dependable for INSERT/UPDATE/DELETE;
    // several drivers report 0 for a SELECT. Fetching a row answers the same
    // question on every driver, and SELECT 1 avoids pulling whole rows.
    $query = $db->prepare("SELECT 1 FROM sites WHERE url = :url");
    $query->bindValue(":url", $url);
    $query->execute();

    $found = $query->fetchColumn() !== false;
    $query->closeCursor();

    return $found;
}
/**
 * Stores one crawled page as a new row in the `sites` table.
 *
 * Uses the global $db connection, which is not defined in this file.
 *
 * @param string $url         Page URL (column `url`).
 * @param string $title       Page title (column `title`).
 * @param string $description Meta description (column `description`).
 * @param string $keywords    Meta keywords (column `keywords`).
 * @return bool Result of executing the INSERT statement.
 */
function insertLink($url, $title, $description, $keywords) {
    global $db;

    $statement = $db->prepare("INSERT INTO sites(url, title, description, keywords)
                            VALUES(:url, :title, :description, :keywords)");

    // One entry per named placeholder in the statement above.
    $fields = [
        ":url"         => $url,
        ":title"       => $title,
        ":description" => $description,
        ":keywords"    => $keywords,
    ];
    foreach ($fields as $placeholder => $value) {
        $statement->bindValue($placeholder, $value);
    }

    return $statement->execute();
}
/**
 * Stores one image found on a crawled page as a new row in the `images` table.
 *
 * Uses the global $db connection, which is not defined in this file.
 *
 * @param string $url   URL of the page the image was found on (column `siteUrl`).
 * @param string $src   URL of the image itself (column `imageUrl`).
 * @param string $alt   The image's alt text (column `alt`).
 * @param string $title The image's title text (column `title`).
 * @return bool Result of executing the INSERT statement.
 */
function insertImage($url, $src, $alt, $title) {
    global $db;

    $statement = $db->prepare("INSERT INTO images(siteUrl, imageUrl, alt, title)
                            VALUES(:siteUrl, :imageUrl, :alt, :title)");

    // One entry per named placeholder in the statement above.
    $fields = [
        ":siteUrl"  => $url,
        ":imageUrl" => $src,
        ":alt"      => $alt,
        ":title"    => $title,
    ];
    foreach ($fields as $placeholder => $value) {
        $statement->bindValue($placeholder, $value);
    }

    return $statement->execute();
}
/**
 * Resolves a reference found on a page (href/src value) into an absolute URL.
 *
 * Supported forms:
 *   - absolute http(s) URLs            -> returned unchanged
 *   - protocol-relative  "//host/x"    -> given the page's scheme
 *   - root-relative      "/x"          -> page origin + path
 *   - document-relative  "x", "./x", "../x" -> resolved against the page's
 *                                         directory, with "." and ".." collapsed
 *   - query/fragment only "?a=1", "#t" -> applied to the page's own path
 *   - empty string                     -> the page URL itself
 *
 * References carrying any other scheme (mailto:, data:, file:, ...) are not
 * recognised and are resolved like relative paths, so the result is always
 * built on the scheme of $url. Callers that care should filter them first.
 *
 * @param string $src Reference as written in the attribute.
 * @param string $url Absolute URL of the page the reference was found on.
 * @return string Absolute URL; $src (trimmed) when $url has no scheme or host
 *                to resolve against.
 */
function createLink($src, $url) {
    $src = trim($src);

    // Parse the base once; bail out instead of emitting undefined-index notices.
    $base = parse_url($url);
    if (!is_array($base) || !isset($base["scheme"], $base["host"])) {
        return $src;
    }

    if ($src === "") {
        return $url;
    }

    // Require the "://" so relative names such as "httpd.html" are not
    // mistaken for absolute URLs.
    if (preg_match('#^https?://#i', $src)) {
        return $src;
    }

    $scheme = $base["scheme"];
    if (substr($src, 0, 2) === "//") {
        return $scheme . ":" . $src;
    }

    // Keep a non-default port: it is part of the origin.
    $origin = $scheme . "://" . $base["host"] . (isset($base["port"]) ? ":" . $base["port"] : "");
    $basePath = (isset($base["path"]) && $base["path"] !== "") ? $base["path"] : "/";

    // Split "path?query#fragment" so only the path part gets normalised.
    $pathLength = strcspn($src, "?#");
    $path = (string) substr($src, 0, $pathLength);
    $suffix = (string) substr($src, $pathLength);

    if ($path === "") {
        // Query- or fragment-only reference: it targets the page's own path.
        // A bare fragment also keeps the page's existing query string.
        $query = ($suffix[0] === "#" && isset($base["query"])) ? "?" . $base["query"] : "";
        return $origin . $basePath . $query . $suffix;
    }

    if ($path[0] !== "/") {
        // Document-relative: prepend the page's directory (everything up to
        // and including the last "/").
        $path = substr($basePath, 0, strrpos($basePath, "/") + 1) . $path;
    }

    // Collapse "." and ".." segments. A trailing "." or ".." names a directory,
    // so keep the trailing slash by appending an empty segment.
    $parts = explode("/", $path);
    $last = end($parts);
    if ($last === "." || $last === "..") {
        $parts[] = "";
    }

    $resolved = array();
    foreach ($parts as $part) {
        if ($part === ".") {
            continue;
        }
        if ($part === "..") {
            // $resolved[0] is the empty root marker; never climb above it.
            if (count($resolved) > 1) {
                array_pop($resolved);
            }
            continue;
        }
        $resolved[] = $part;
    }

    return $origin . implode("/", $resolved) . $suffix;
}
/**
 * Indexes a single page.
 *
 * Loads the page through DomDocumentParser, stores its title, meta description
 * and meta keywords via insertLink() unless linkExists() reports the URL is
 * already stored, then stores every image that has alt or title text via
 * insertImage(). Echoes one HTML status line for the page.
 *
 * Pages without a usable <title> are skipped entirely (nothing is stored or
 * echoed). Images are de-duplicated across the whole run through the global
 * $alreadyFoundImages list.
 *
 * DomDocumentParser is defined outside this file; its getTitleTags(),
 * getMetatags() and getImages() results are used here as DOM node lists
 * (presumably DOMNodeList -- confirm).
 *
 * @param string $url Absolute URL of the page to index.
 * @return void
 */
function getDetails($url) {
    global $alreadyFoundImages;

    $parser = new DomDocumentParser($url);

    $titleTags = $parser->getTitleTags();
    if (count($titleTags) == 0 || !$titleTags->item(0)) {
        return;
    }

    // Strip both CR and LF, and trim, so a whitespace-only title counts as empty.
    $pageTitle = trim(str_replace(array("\r", "\n"), "", (string) $titleTags->item(0)->nodeValue));
    if ($pageTitle === "") {
        return;
    }

    $description = "";
    $keywords = "";
    foreach ($parser->getMetatags() as $meta) {
        // Meta names are case-insensitive in HTML ("Description", "KEYWORDS", ...).
        $name = strtolower(trim($meta->getAttribute("name")));
        if ($name === "description") {
            $description = $meta->getAttribute("content");
        }
        elseif ($name === "keywords") {
            $keywords = $meta->getAttribute("content");
        }
    }
    $description = str_replace(array("\r", "\n"), "", $description);
    $keywords = str_replace(array("\r", "\n"), "", $keywords);

    // The URL comes from crawled pages, so escape it before echoing it as HTML.
    $safeUrl = htmlspecialchars($url, ENT_QUOTES, "UTF-8");
    if (linkExists($url)) {
        echo "$safeUrl already exists<br>";
    }
    else if (insertLink($url, $pageTitle, $description, $keywords)) {
        echo "SUCCESS: $safeUrl<br>";
    }
    else {
        echo "ERROR: Failed to insert $safeUrl<br>";
    }

    foreach ($parser->getImages() as $image) {
        $src = trim($image->getAttribute("src"));
        $alt = $image->getAttribute("alt");
        $imageTitle = $image->getAttribute("title");

        // An image without a source cannot be stored, and one without any
        // descriptive text is not worth storing.
        if ($src === "" || ($imageTitle === "" && $alt === "")) {
            continue;
        }

        $src = createLink($src, $url);
        if (!in_array($src, $alreadyFoundImages, true)) {
            $alreadyFoundImages[] = $src;
            insertImage($url, $src, $alt, $imageTitle);
        }
    }
}
/**
 * Crawls outward from a seed URL, indexing every newly discovered page.
 *
 * The seed is indexed with getDetails() and marked as crawled, then the global
 * $crawling list is drained as a first-in-first-out queue: each queued page is
 * loaded, and every link on it that has not been seen before is recorded in
 * $alreadyCrawled, indexed with getDetails() and queued so its own links are
 * followed later. Each page is therefore followed at most once, and the work
 * is done in a loop rather than by recursion.
 *
 * Links are skipped when they are empty or fragment-only, or when they use a
 * scheme other than http/https (javascript:, mailto:, tel:, ...). A fragment
 * on an otherwise normal link is stripped rather than causing the link to be
 * dropped.
 *
 * NOTE: there is no page limit or same-host restriction; the crawl only ends
 * when no unseen links remain. getDetails() builds its own DomDocumentParser,
 * so each page is loaded once for indexing and once more here.
 *
 * @param string $url Absolute URL to start crawling from.
 * @return void
 */
function followLinks($url) {
    global $alreadyCrawled;
    global $crawling;

    // Index the seed and mark it as seen so a link back to it is not crawled again.
    if (!in_array($url, $alreadyCrawled, true)) {
        $alreadyCrawled[] = $url;
        getDetails($url);
    }
    $crawling[] = $url;

    while (count($crawling) > 0) {
        $current = array_shift($crawling);
        $parser = new DomDocumentParser($current);

        foreach ($parser->getLinks() as $link) {
            $href = trim($link->getAttribute("href"));

            // Drop the fragment: "page.html#top" and "page.html" are the same document.
            $hashAt = strpos($href, "#");
            if ($hashAt !== false) {
                $href = (string) substr($href, 0, $hashAt);
            }
            if ($href === "") {
                continue;
            }

            // Anything with an explicit scheme other than http(s) is not crawlable.
            if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $href) && !preg_match('#^https?://#i', $href)) {
                continue;
            }

            $href = createLink($href, $current);
            if (in_array($href, $alreadyCrawled, true)) {
                continue;
            }

            $alreadyCrawled[] = $href;
            $crawling[] = $href;
            getDetails($href);
        }
    }
}
// Seed URL for the crawl; point this at whichever site should be indexed.
$startUrl = 'https://github.com/MusheAbdulHakim';
followLinks($startUrl);
- ?>
|