crawl.php 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186
  1. <?php
  2. include("includes/config.php");
  3. include("includes/DomDocumentParser.php");
  4. $alreadyCrawled = array();
  5. $crawling = array();
  6. $alreadyFoundImages = array();
  7. function linkExists($url) {
  8. global $db;
  9. $query = $db->prepare("SELECT * FROM sites WHERE url = :url");
  10. $query->bindParam(":url", $url);
  11. $query->execute();
  12. return $query->rowCount() != 0;
  13. }
  14. function insertLink($url, $title, $description, $keywords) {
  15. global $db;
  16. $query = $db->prepare("INSERT INTO sites(url, title, description, keywords)
  17. VALUES(:url, :title, :description, :keywords)");
  18. $query->bindParam(":url", $url);
  19. $query->bindParam(":title", $title);
  20. $query->bindParam(":description", $description);
  21. $query->bindParam(":keywords", $keywords);
  22. return $query->execute();
  23. }
  24. function insertImage($url, $src, $alt, $title) {
  25. global $db;
  26. $query = $db->prepare("INSERT INTO images(siteUrl, imageUrl, alt, title)
  27. VALUES(:siteUrl, :imageUrl, :alt, :title)");
  28. $query->bindParam(":siteUrl", $url);
  29. $query->bindParam(":imageUrl", $src);
  30. $query->bindParam(":alt", $alt);
  31. $query->bindParam(":title", $title);
  32. return $query->execute();
  33. }
  34. function createLink($src, $url) {
  35. $scheme = parse_url($url)["scheme"]; // http
  36. $host = parse_url($url)["host"]; // www.reecekenney.com
  37. if(substr($src, 0, 2) == "//") {
  38. $src = $scheme . ":" . $src;
  39. }
  40. else if(substr($src, 0, 1) == "/") {
  41. $src = $scheme . "://" . $host . $src;
  42. }
  43. else if(substr($src, 0, 2) == "./") {
  44. $src = $scheme . "://" . $host . dirname(parse_url($url)["path"]) . substr($src, 1);
  45. }
  46. else if(substr($src, 0, 3) == "../") {
  47. $src = $scheme . "://" . $host . "/" . $src;
  48. }
  49. else if(substr($src, 0, 5) != "https" && substr($src, 0, 4) != "http") {
  50. $src = $scheme . "://" . $host . "/" . $src;
  51. }
  52. return $src;
  53. }
  54. function getDetails($url) {
  55. global $alreadyFoundImages;
  56. $parser = new DomDocumentParser($url);
  57. $titleArray = $parser->getTitleTags();
  58. if(sizeof($titleArray) == 0 || $titleArray->item(0) == NULL) {
  59. return;
  60. }
  61. $title = $titleArray->item(0)->nodeValue;
  62. $title = str_replace("\n", "", $title);
  63. if($title == "") {
  64. return;
  65. }
  66. $description = "";
  67. $keywords = "";
  68. $metasArray = $parser->getMetatags();
  69. foreach($metasArray as $meta) {
  70. if($meta->getAttribute("name") == "description") {
  71. $description = $meta->getAttribute("content");
  72. }
  73. if($meta->getAttribute("name") == "keywords") {
  74. $keywords = $meta->getAttribute("content");
  75. }
  76. }
  77. $description = str_replace("\n", "", $description);
  78. $keywords = str_replace("\n", "", $keywords);
  79. if(linkExists($url)) {
  80. echo "$url already exists<br>";
  81. }
  82. else if(insertLink($url, $title, $description, $keywords)) {
  83. echo "SUCCESS: $url<br>";
  84. }
  85. else {
  86. echo "ERROR: Failed to insert $url<br>";
  87. }
  88. $imageArray = $parser->getImages();
  89. foreach($imageArray as $image) {
  90. $src = $image->getAttribute("src");
  91. $alt = $image->getAttribute("alt");
  92. $title = $image->getAttribute("title");
  93. if(!$title && !$alt) {
  94. continue;
  95. }
  96. $src = createLink($src, $url);
  97. if(!in_array($src, $alreadyFoundImages)) {
  98. $alreadyFoundImages[] = $src;
  99. insertImage($url, $src, $alt, $title);
  100. }
  101. }
  102. }
  103. function followLinks($url) {
  104. global $alreadyCrawled;
  105. global $crawling;
  106. $parser = new DomDocumentParser($url);
  107. $linkList = $parser->getLinks();
  108. foreach($linkList as $link) {
  109. $href = $link->getAttribute("href");
  110. if(strpos($href, "#") !== false) {
  111. continue;
  112. }
  113. else if(substr($href, 0, 11) == "javascript:") {
  114. continue;
  115. }
  116. $href = createLink($href, $url);
  117. if(!in_array($href, $alreadyCrawled)) {
  118. $alreadyCrawled[] = $href;
  119. $crawling[] = $href;
  120. getDetails($href);
  121. }
  122. }
  123. array_shift($crawling);
  124. foreach($crawling as $site) {
  125. followLinks($site);
  126. }
  127. }
  128. //you can change this to the url of the website you want to crawl
  129. $startUrl = "https://github.com/MusheAbdulHakim";
  130. followLinks($startUrl);
  131. ?>