f4b2cb4298
The first version of the theme
171 lines
5.1 KiB
PHP
171 lines
5.1 KiB
PHP
<?php
|
|
|
|
class SitemapGenerator
|
|
{
|
|
// Config file with crawler/sitemap options
|
|
private $config;
|
|
|
|
// Array containing all scanned pages
|
|
private $scanned;
|
|
|
|
// The base of the given site url
|
|
// EXAMPLE: https://student-laptop.nl
|
|
private $site_url_base;
|
|
|
|
// File where sitemap is written to.
|
|
private $sitemap_file;
|
|
|
|
// Constructor sets the given file for internal use
|
|
public function __construct($conf)
|
|
{
|
|
// Setup class variables using the config
|
|
$this->config = $conf;
|
|
$this->scanned = [];
|
|
$this->site_url_base = parse_url($this->config['SITE_URL'])['scheme'] . "://" . parse_url($this->config['SITE_URL'])['host'];
|
|
$this->sitemap_file = fopen($this->config['SAVE_LOC'], "w");
|
|
}
|
|
|
|
public function GenerateSitemap()
|
|
{
|
|
// Call the recursive crawl function with the start url.
|
|
$this->crawlPage($this->config['SITE_URL']);
|
|
|
|
// Generate a sitemap with the scanned pages.
|
|
$this->generateFile($this->scanned);
|
|
}
|
|
|
|
// Get the html content of a page and return it as a dom object
|
|
private function getHtml($url)
|
|
{
|
|
// Get html from the given page
|
|
$curl = curl_init();
|
|
curl_setopt($curl, CURLOPT_URL, $url);
|
|
curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
|
|
$html = curl_exec($curl);
|
|
curl_close($curl);
|
|
|
|
//Load the html and store it into a DOM object
|
|
$dom = new DOMDocument();
|
|
@$dom->loadHTML($html);
|
|
|
|
return $dom;
|
|
}
|
|
|
|
// Recursive function that crawls a page's anchor tags and store them in the scanned array.
|
|
private function crawlPage($page_url)
|
|
{
|
|
$url = filter_var($page_url, FILTER_SANITIZE_URL);
|
|
|
|
// Check if the url is invalid or if the page is already scanned;
|
|
if (in_array($url, $this->scanned) || !filter_var($page_url, FILTER_VALIDATE_URL)) {
|
|
return;
|
|
}
|
|
|
|
// Add the page url to the scanned array
|
|
array_push($this->scanned, $page_url);
|
|
|
|
// Get the html content from the
|
|
$html = $this->getHtml($url);
|
|
$anchors = $html->getElementsByTagName('a');
|
|
|
|
// Loop through all anchor tags on the page
|
|
foreach ($anchors as $a) {
|
|
$next_url = $a->getAttribute('href');
|
|
|
|
// Check if there is a anchor ID set in the config.
|
|
if ($this->config['CRAWL_ANCHORS_WITH_ID'] != "") {
|
|
// Check if the id is set and matches the config setting, else it will move on to the next anchor
|
|
if ($a->getAttribute('id') != "" || $a->getAttribute('id') == $this->config['CRAWL_ANCHORS_WITH_ID']) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Split page url into base and extra parameters
|
|
$base_page_url = explode("?", $page_url)[0];
|
|
|
|
if (!$this->config['ALLOW_ELEMENT_LINKS']) {
|
|
// Skip the url if it starts with a # or is equal to root.
|
|
if (substr($next_url, 0, 1) == "#" || $next_url == "/") {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Check if the given url is external, if yes it will skip the iteration
|
|
// This code will only run if you set ALLOW_EXTERNAL_LINKS to false in the config.
|
|
if (!$this->config['ALLOW_EXTERNAL_LINKS']) {
|
|
$parsed_url = parse_url($next_url);
|
|
if (isset($parsed_url['host'])) {
|
|
if ($parsed_url['host'] != parse_url($this->config['SITE_URL'])['host']) {
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Check if the link is absolute or relative.
|
|
if (substr($next_url, 0, 7) != "http://" && substr($next_url, 0, 8) != "https://") {
|
|
$next_url = $this->convertRelativeToAbsolute($base_page_url, $next_url);
|
|
}
|
|
|
|
// Check if the next link contains any of the pages to skip. If true, the loop will move on to the next iteration.
|
|
$found = false;
|
|
foreach ($this->config['KEYWORDS_TO_SKIP'] as $skip) {
|
|
if (strpos($next_url, $skip) || $next_url === $skip) {
|
|
$found = true;
|
|
}
|
|
}
|
|
|
|
// Call the function again with the new URL
|
|
if (!$found) {
|
|
$this->crawlPage($next_url);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Convert a relative link to a absolute link
|
|
// Example: Relative /articles
|
|
// Absolute https://student-laptop.nl/articles
|
|
private function convertRelativeToAbsolute($page_base_url, $link)
|
|
{
|
|
$first_character = substr($link, 0, 1);
|
|
if ($first_character == "?" || $first_character == "#") {
|
|
return $page_base_url . $link;
|
|
} else if ($first_character != "/") {
|
|
return $this->site_url_base . "/" . $link;
|
|
} else {
|
|
return $this->site_url_base . $link;
|
|
}
|
|
}
|
|
|
|
// Function to generate a Sitemap with the given pages array where the script has run through
|
|
private function generateFile($pages)
|
|
{
|
|
$xml = '<?xml version="1.0" encoding="UTF-8"?>
|
|
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
<!-- ' . count($pages) . ' total pages-->
|
|
<!-- PHP-sitemap-generator by https://github.com/tristangoossens -->';
|
|
|
|
|
|
// Print the amount of pages
|
|
echo count($pages);
|
|
|
|
foreach ($pages as $page) {
|
|
$xml .= "<url><loc>" . $page . "</loc>
|
|
<lastmod>" . $this->config['LAST_UPDATED'] . "</lastmod>
|
|
<changefreq>" . $this->config['CHANGE_FREQUENCY'] . "</changefreq>
|
|
<priority>" . $this->config['PRIORITY'] . "</priority></url>";
|
|
}
|
|
|
|
$xml .= "</urlset>";
|
|
$xml = str_replace('&', '&', $xml);
|
|
|
|
// Format string to XML
|
|
$dom = new DOMDocument;
|
|
$dom->preserveWhiteSpace = FALSE;
|
|
$dom->loadXML($xml);
|
|
$dom->formatOutput = TRUE;
|
|
|
|
// Write XML to file and close it
|
|
fwrite($this->sitemap_file, $dom->saveXML());
|
|
fclose($this->sitemap_file);
|
|
}
|
|
}
|