prepare("SELECT * FROM sites WHERE url = :url");
$query->bindParam(":url", $url);
$query->execute();
return $query->rowCount() != 0;
}
/**
 * Inserts one row into the `sites` table for a crawled page.
 *
 * Relies on the global $db handle (presumably a PDO connection - it is only
 * used here through prepare()/execute()).
 *
 * @param mixed $url         Value stored in the `url` column.
 * @param mixed $title       Value stored in the `title` column.
 * @param mixed $description Value stored in the `description` column.
 * @param mixed $keywords    Value stored in the `keywords` column.
 * @return bool Whatever the statement's execute() call reports.
 */
function insertLink($url, $title, $description, $keywords) {
    global $db;

    // The SQL text deliberately continues at column 0 on the next line so the
    // statement sent to the database contains no extra indentation.
    $statement = $db->prepare("INSERT INTO sites(url, title, description, keywords)
VALUES(:url, :title, :description, :keywords)");

    // Hand every placeholder its value in one go; values are bound as strings.
    $values = [
        ":url"         => $url,
        ":title"       => $title,
        ":description" => $description,
        ":keywords"    => $keywords,
    ];

    return $statement->execute($values);
}
/**
 * Inserts one row into the `images` table for an image found on a page.
 *
 * Relies on the global $db handle (presumably a PDO connection - it is only
 * used here through prepare()/execute()).
 *
 * @param mixed $url   Value stored in the `siteUrl` column (the page the image was found on).
 * @param mixed $src   Value stored in the `imageUrl` column.
 * @param mixed $alt   Value stored in the `alt` column.
 * @param mixed $title Value stored in the `title` column.
 * @return bool Whatever the statement's execute() call reports.
 */
function insertImage($url, $src, $alt, $title) {
    global $db;

    // The SQL text deliberately continues at column 0 on the next line so the
    // statement sent to the database contains no extra indentation.
    $statement = $db->prepare("INSERT INTO images(siteUrl, imageUrl, alt, title)
VALUES(:siteUrl, :imageUrl, :alt, :title)");

    // Hand every placeholder its value in one go; values are bound as strings.
    $values = [
        ":siteUrl"  => $url,
        ":imageUrl" => $src,
        ":alt"      => $alt,
        ":title"    => $title,
    ];

    return $statement->execute($values);
}
/**
 * Collapses "." and ".." segments of an absolute URL path, following the
 * idea of RFC 3986 section 5.2.4. ".." never climbs above the root, and a
 * trailing "." or ".." leaves a trailing slash behind.
 *
 * @param string $path Path beginning with "/".
 * @return string The path with dot segments removed.
 */
function createLinkRemoveDotSegments($path) {
    $segments = explode("/", $path);
    $lastIndex = count($segments) - 1;
    $output = [];

    foreach($segments as $index => $segment) {
        if($segment === "." || $segment === "..") {
            // $output[0] is the empty string in front of the leading "/";
            // keeping it is what stops ".." from escaping the root.
            if($segment === ".." && count($output) > 1) {
                array_pop($output);
            }
            if($index === $lastIndex) {
                $output[] = "";
            }
            continue;
        }
        $output[] = $segment;
    }

    return implode("/", $output);
}

/**
 * Resolves a reference found in a page (href/src value) into an absolute URL,
 * using the URL of the page it was found on as the base.
 *
 * Handled forms:
 *  - "//host/path"        -> inherits the base scheme
 *  - "scheme:..."         -> returned untouched (http, https, mailto, data, ...)
 *  - "/path"              -> same scheme, host and port as the base
 *  - "./x", "../x", "x"   -> resolved against the directory of the base path
 *  - "", "?query", "#f"   -> resolved against the base document itself
 *
 * If $url has no scheme or host there is nothing to resolve against and $src
 * is returned unchanged.
 *
 * @param string $src Reference to resolve.
 * @param string $url Absolute URL of the page containing the reference.
 * @return string The resolved URL.
 */
function createLink($src, $url) {
    // Parse the base once instead of once per component.
    $base = parse_url($url);
    if(!is_array($base) || !isset($base["scheme"], $base["host"])) {
        return $src;
    }

    $scheme = $base["scheme"];
    $origin = $scheme . "://" . $base["host"];
    if(isset($base["port"])) {
        // Keep a non-default port so links stay on the same server.
        $origin .= ":" . $base["port"];
    }
    $basePath = (isset($base["path"]) && $base["path"] !== "") ? $base["path"] : "/";

    // Protocol-relative reference.
    if(substr($src, 0, 2) === "//") {
        return $scheme . ":" . $src;
    }

    // Anything that carries its own scheme is already absolute. Testing for a
    // real "scheme:" prefix (rather than the letters "http") keeps relative
    // paths such as "httpdocs/a.html" from being mistaken for absolute URLs
    // and stops "mailto:" / "tel:" / "data:" from being glued onto the host.
    if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $src)) {
        return $src;
    }

    // Separate the path from any query/fragment so that dot-segment removal
    // only ever touches the path.
    $cut = strcspn($src, "?#");
    $refPath = (string) substr($src, 0, $cut);
    $suffix = (string) substr($src, $cut);

    if($refPath === "") {
        // Empty, query-only or fragment-only reference: same document. The
        // base query is kept unless the reference supplies its own.
        $query = "";
        if(isset($base["query"]) && ($suffix === "" || $suffix[0] === "#")) {
            $query = "?" . $base["query"];
        }
        return $origin . $basePath . $query . $suffix;
    }

    if($refPath[0] !== "/") {
        // Relative path: merge with the base path's directory (everything up
        // to and including its last "/").
        $refPath = substr($basePath, 0, strrpos($basePath, "/") + 1) . $refPath;
    }

    return $origin . createLinkRemoveDotSegments($refPath) . $suffix;
}
/**
 * Indexes a single page: records its title, description and keywords through
 * insertLink(), then records every captioned image on it through insertImage().
 *
 * Pages without a <title> element, or whose title is empty once newlines are
 * removed, are skipped entirely. A page that linkExists() already reports is
 * not inserted again, but its images are still processed.
 *
 * Uses the global $alreadyFoundImages list to avoid storing the same image
 * URL twice during one run (assumed to be initialised as an array elsewhere
 * in the file - TODO confirm).
 *
 * @param string $url Absolute URL of the page to index.
 * @return void
 */
function getDetails($url) {
    global $alreadyFoundImages;

    $parser = new DomDocumentParser($url);

    $titleArray = $parser->getTitleTags();
    if(sizeof($titleArray) == 0 || $titleArray->item(0) === null) {
        return;
    }

    $title = str_replace("\n", "", $titleArray->item(0)->nodeValue);
    if($title === "") {
        return;
    }

    $description = "";
    $keywords = "";

    foreach($parser->getMetatags() as $meta) {
        // Standard metadata names are matched ASCII case-insensitively in
        // HTML, so name="Description" / name="KEYWORDS" must be picked up too.
        $name = strtolower($meta->getAttribute("name"));

        if($name === "description") {
            $description = $meta->getAttribute("content");
        }
        else if($name === "keywords") {
            $keywords = $meta->getAttribute("content");
        }
    }

    $description = str_replace("\n", "", $description);
    $keywords = str_replace("\n", "", $keywords);

    // The status messages below end with a literal line break inside the
    // string, which is why each closing quote sits at column 0.
    if(linkExists($url)) {
        echo "$url already exists
";
    }
    else if(insertLink($url, $title, $description, $keywords)) {
        echo "SUCCESS: $url
";
    }
    else {
        echo "ERROR: Failed to insert $url
";
    }

    foreach($parser->getImages() as $image) {
        $src = $image->getAttribute("src");
        $alt = $image->getAttribute("alt");
        // Separate variable so the page title above is not overwritten.
        $imageTitle = $image->getAttribute("title");

        // Only images that carry some descriptive text are stored. Compared
        // against "" explicitly so that a caption of "0" still counts.
        if($imageTitle === "" && $alt === "") {
            continue;
        }

        // An <img> without a src has nothing to resolve; skipping it avoids
        // storing the page or site address as if it were an image.
        if($src === "") {
            continue;
        }

        $src = createLink($src, $url);

        if(!in_array($src, $alreadyFoundImages, true)) {
            $alreadyFoundImages[] = $src;
            insertImage($url, $src, $alt, $imageTitle);
        }
    }
}
/**
 * Crawls outward from $url: every page taken from the work queue is parsed
 * for links, each link not seen before is indexed with getDetails() and then
 * queued so that its own links are followed in turn.
 *
 * The crawl is driven by a loop over the shared $crawling queue rather than
 * by recursion, so the call stack stays flat however many pages are found,
 * each queued page is fetched for link extraction once, and no discovered
 * link is dropped from the queue without being followed.
 *
 * Globals:
 *  - $alreadyCrawled: list of URLs that have already been discovered.
 *  - $crawling:       FIFO queue of URLs whose links still need following.
 * Both are created here if they are not arrays yet.
 *
 * The starting URL itself is not passed to getDetails() by this function; it
 * is only indexed if some crawled page links back to it.
 *
 * @param string $url Absolute URL at which the crawl starts.
 * @return void
 */
function followLinks($url) {
    global $alreadyCrawled;
    global $crawling;

    if(!is_array($alreadyCrawled)) {
        $alreadyCrawled = [];
    }
    if(!is_array($crawling)) {
        $crawling = [];
    }

    $crawling[] = $url;

    while(count($crawling) > 0) {
        $current = array_shift($crawling);

        $parser = new DomDocumentParser($current);

        foreach($parser->getLinks() as $link) {
            $href = $link->getAttribute("href");

            // Anchors without an href, and links containing a fragment, are
            // not followed.
            if($href === "" || strpos($href, "#") !== false) {
                continue;
            }

            // Skip anything with a scheme other than http/https
            // (javascript:, mailto:, tel:, data:, ...), in any letter case.
            if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href) && !preg_match('/^https?:/i', $href)) {
                continue;
            }

            // Resolve relative to the page the link was found on.
            $href = createLink($href, $current);

            if(in_array($href, $alreadyCrawled, true)) {
                continue;
            }

            $alreadyCrawled[] = $href;
            $crawling[] = $href;

            getDetails($href);
        }
    }
}
// Entry point of the script: the crawl starts from $startUrl.
// Change this value to the URL of the website you want to crawl.
$startUrl = "https://github.com/MusheAbdulHakim";
followLinks($startUrl);
?>