Crawler.php 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208
  1. <?php
  2. class Crawler
  3. {
  4. private $con;
  5. public function __construct($con)
  6. {
  7. $this->con = $con;
  8. }
  9. function linkExists($url)
  10. {
  11. global $con;
  12. $query = $con->prepare("SELECT * FROM sites WHERE url = :url");
  13. $query->bindParam(":url", $url);
  14. $query->execute();
  15. return $query->rowCount() != 0;
  16. }
  17. function imageExists($src)
  18. {
  19. global $con;
  20. $query = $con->prepare("SELECT * FROM images WHERE imageUrl = :src");
  21. $query->bindParam(":src", $src);
  22. $query->execute();
  23. return $query->rowCount() != 0;
  24. }
  25. function insertLink($url, $title, $description, $keywords)
  26. {
  27. global $con;
  28. $query = $con->prepare("INSERT INTO sites(url, title, description, keywords)
  29. VALUES(:url, :title, :description, :keywords)");
  30. $query->bindParam(":url", $url);
  31. $query->bindParam(":title", $title);
  32. $query->bindParam(":description", $description);
  33. $query->bindParam(":keywords", $keywords);
  34. return $query->execute();
  35. }
  36. function insertImage($url, $src, $alt, $title)
  37. {
  38. global $con;
  39. $query = $con->prepare("INSERT INTO images(siteUrl, imageUrl, alt, title)
  40. VALUES(:siteUrl, :imageUrl, :alt, :title)");
  41. $query->bindParam(":siteUrl", $url);
  42. $query->bindParam(":imageUrl", $src);
  43. $query->bindParam(":alt", $alt);
  44. $query->bindParam(":title", $title);
  45. return $query->execute();
  46. }
  47. /* Converts relative link to absolute link */
  48. function createLink($src, $url)
  49. {
  50. $scheme = parse_url($url)["scheme"]; // http
  51. $host = parse_url($url)["host"]; // www.safesploit.com
  52. if(substr($src, 0, 2) == "//")
  53. $src = $scheme . ":" . $src;
  54. else if(substr($src, 0, 1) == "/")
  55. $src = $scheme . "://" . $host . $src;
  56. else if(substr($src, 0, 2) == "./")
  57. $src = $scheme . "://" . $host . dirname(parse_url($url)["path"]) . substr($src, 1);
  58. else if(substr($src, 0, 3) == "../")
  59. $src = $scheme . "://" . $host . "/" . $src;
  60. else if(substr($src, 0, 5) != "https" && substr($src, 0, 4) != "http")
  61. $src = $scheme . "://" . $host . "/" . $src;
  62. return $src;
  63. }
  64. function getDetails($url)
  65. {
  66. global $alreadyFoundImages;
  67. $parser = new DomDocumentParser($url);
  68. $titleArray = $parser->getTitleTags();
  69. if(sizeof($titleArray) == 0 || $titleArray->item(0) == NULL)
  70. return;
  71. //Replace linebreak
  72. $title = $titleArray->item(0)->nodeValue;
  73. $title = str_replace("\n", "", $title);
  74. //Return if no <title>
  75. if($title == "")
  76. return;
  77. $description = "";
  78. $keywords = "";
  79. $metasArray = $parser->getMetatags();
  80. foreach($metasArray as $meta)
  81. {
  82. if($meta->getAttribute("name") == "description")
  83. $description = $meta->getAttribute("content");
  84. if($meta->getAttribute("name") == "keywords")
  85. $keywords = $meta->getAttribute("content");
  86. }
  87. $description = str_replace("\n", "", $description);
  88. $keywords = str_replace("\n", "", $keywords);
  89. //Non-ASCII char encoding
  90. // $title = json_encode($title);
  91. // $description = json_encode($description);
  92. // $keywords = json_encode($keywords);
  93. if(linkExists($url))
  94. echo "$url already exists<br>";
  95. else if(insertLink($url, $title, $description, $keywords))
  96. echo "SUCCESS: $url<br>";
  97. else
  98. echo "ERROR: Failed to insert $url<br>";
  99. $imageArray = $parser->getImages();
  100. foreach($imageArray as $image)
  101. {
  102. $src = $image->getAttribute("src");
  103. $alt = $image->getAttribute("alt");
  104. $title = $image->getAttribute("title");
  105. if(!$title && !$alt)
  106. continue;
  107. $src = createLink($src, $url);
  108. if(!in_array($src, $alreadyFoundImages))
  109. {
  110. $alreadyFoundImages[] = $src;
  111. if(imageExists($src))
  112. echo "$src already exists<br>";
  113. else if(insertImage($url, $src, $alt, $title))
  114. echo "SUCCESS: $src<br>";
  115. else
  116. echo "ERROR: Failed to insert $src<br>";
  117. }
  118. }
  119. echo "<b>URL:</b> $url, <b>Title:</b> $title, <b>Description:</b> $description, <b>keywords:</b> $keywords<br>"; //DEBUGGING sites
  120. echo "<b>src:</b> <a href=$src>$src</a>, <b>alt:</b> $alt, <b>title:</b> $title, <b>url:</b> $url<br>"; //DEBUGGING images
  121. }
  122. function followLinks($url)
  123. {
  124. global $alreadyCrawled;
  125. global $crawling;
  126. $parser = new DomDocumentParser($url);
  127. $linkList = $parser->getLinks();
  128. foreach($linkList as $link)
  129. {
  130. $href = $link->getAttribute("href");
  131. // Filter hrefs
  132. if(strpos($href, "#") !== false)
  133. continue;
  134. else if(substr($href, 0, 11) == "javascript:")
  135. continue;
  136. $href = createLink($href, $url);
  137. if(!in_array($href, $alreadyCrawled))
  138. {
  139. $alreadyCrawled[] = $href;
  140. $crawling[] = $href;
  141. getDetails($href);
  142. }
  143. //else return; //DEBUGGING
  144. echo ($href . "<br>"); //DEBUGGING
  145. }
  146. array_shift($crawling);
  147. foreach($crawling as $site)
  148. followLinks($site);
  149. }
  150. }
  151. ?>