Compare commits
No commits in common. "main" and "v1.1.2-beta" have entirely different histories.
main
...
v1.1.2-beta
6 changed files with 30 additions and 258 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,2 +0,0 @@
|
|||
|
||||
config.php
|
29
README.md
29
README.md
|
@ -21,14 +21,11 @@ Written primarily in OOP style PHP with the intent of better understanding OOP a
|
|||
- Organises search results by clicks/visits
|
||||
- Pagination system at the bottom of the search page
|
||||
- Shows 'results found' for search term
|
||||
- Supports non-latin characters (UTF-8)
|
||||
|
||||
# Table of Contents
|
||||
|
||||
- [Setup and Usage](#setup-and-usage)
|
||||
- [Docker](#docker)
|
||||
- [Server Setup](#server-setup)
|
||||
- [PHP Dependencies](#php-dependencies)
|
||||
- [Connecting PHP to MySQL Server](#connecting-php-to-mysql-server)
|
||||
- [Crawling Websites to Populate Images and Sites tables](#crawling-websites-to-populate-images-and-sites-tables)
|
||||
- [Programming Logic](#programming-logic)
|
||||
|
@ -47,30 +44,6 @@ Written primarily in OOP style PHP with the intent of better understanding OOP a
|
|||
|
||||
# Setup and Usage
|
||||
|
||||
Two methods of setup are discussed.
|
||||
- Docker (Easiest)
|
||||
- Server Setup
|
||||
|
||||
## Docker
|
||||
|
||||
Docker configuration files are available at [doogle-docker](https://github.com/safesploit/doogle-docker).
|
||||
|
||||
This assumes you already have [Docker](https://www.docker.com/) v3.9 (or greater) installed and configured.
|
||||
|
||||
git clone https://github.com/safesploit/doogle-docker.git
|
||||
cd doogle-docker
|
||||
sh build.sh
|
||||
|
||||
<p align="center">
|
||||
<img width="857" alt="Screenshot 2023-02-22 at 21 11 33" src="https://user-images.githubusercontent.com/10171446/220760089-71baee5a-19ce-43e6-9cd5-35ce9e143400.png">
|
||||
<img width="857" alt="image" src="https://user-images.githubusercontent.com/10171446/220760298-65e0b64e-3724-4e8e-b9ec-a86ba20d58c8.png">
|
||||
|
||||
Doogle is now accessible via [localhost:8000](http://localhost:8000).
|
||||
|
||||
For debugging, phpMyAdmin is also available at [localhost:8001](http://localhost:8001).
|
||||
|
||||
</p>
|
||||
|
||||
## Server Setup
|
||||
|
||||
v1.0.0-beta.1 is supported and tested in PHP 7.4, 8.0 and 8.1.
|
||||
|
@ -121,7 +94,7 @@ In the file 'doogle-tables-no-data.sql' the database will be created as 'doogle'
|
|||
|
||||
### Form-based crawl
|
||||
|
||||
In your browser, go to where the file is hosted, e.g. http://localhost/crawl.php
|
||||
In your browser, go to where the file is hosted, e.g. http://localhost/crawl-formSubmit.php
|
||||
|
||||
Paste the URL into the input field and press the Crawl button.
|
||||
|
||||
|
|
|
@ -1,208 +0,0 @@
|
|||
<?php
|
||||
/**
 * Crawls web pages and records the sites and images it finds in the database.
 *
 * The class writes to the `sites` and `images` tables through the connection
 * passed to the constructor, and reads pages through DomDocumentParser
 * (defined elsewhere), using its getTitleTags(), getMetatags(), getImages()
 * and getLinks() methods.
 *
 * All crawl state (visited URLs, pending queue, seen images) is kept on the
 * instance, so the class no longer depends on global variables or on global
 * helper functions with the same names as its methods.
 */
class Crawler
{
    /** @var PDO Connection used for every query (expected to be PDO or compatible). */
    private $con;

    /** @var array<string, bool> Absolute URLs already discovered, stored as keys for O(1) lookup. */
    private $alreadyCrawled = [];

    /** @var string[] Absolute URLs whose links still have to be followed (FIFO). */
    private $crawling = [];

    /** @var array<string, bool> Absolute image URLs already handled, stored as keys. */
    private $alreadyFoundImages = [];

    /**
     * @param PDO $con Database connection used to look up and insert rows.
     */
    public function __construct($con)
    {
        $this->con = $con;
    }

    /**
     * Tells whether a site with this URL is already stored.
     *
     * @param string $url Absolute site URL.
     * @return bool True when a matching row exists in `sites`.
     */
    public function linkExists($url): bool
    {
        return $this->rowExists("SELECT 1 FROM sites WHERE url = :value LIMIT 1", $url);
    }

    /**
     * Tells whether an image with this URL is already stored.
     *
     * @param string $src Absolute image URL.
     * @return bool True when a matching row exists in `images`.
     */
    public function imageExists($src): bool
    {
        return $this->rowExists("SELECT 1 FROM images WHERE imageUrl = :value LIMIT 1", $src);
    }

    /**
     * Inserts a site row.
     *
     * @param string $url
     * @param string $title
     * @param string $description
     * @param string $keywords
     * @return bool Result of executing the INSERT statement.
     */
    public function insertLink($url, $title, $description, $keywords): bool
    {
        $query = $this->con->prepare(
            "INSERT INTO sites(url, title, description, keywords)
             VALUES(:url, :title, :description, :keywords)"
        );

        return $query->execute([
            ":url" => $url,
            ":title" => $title,
            ":description" => $description,
            ":keywords" => $keywords,
        ]);
    }

    /**
     * Inserts an image row.
     *
     * @param string $url   URL of the page the image was found on.
     * @param string $src   Absolute image URL.
     * @param string $alt   Image alt text.
     * @param string $title Image title attribute.
     * @return bool Result of executing the INSERT statement.
     */
    public function insertImage($url, $src, $alt, $title): bool
    {
        $query = $this->con->prepare(
            "INSERT INTO images(siteUrl, imageUrl, alt, title)
             VALUES(:siteUrl, :imageUrl, :alt, :title)"
        );

        return $query->execute([
            ":siteUrl" => $url,
            ":imageUrl" => $src,
            ":alt" => $alt,
            ":title" => $title,
        ]);
    }

    /**
     * Converts a relative link to an absolute link.
     *
     * Handled forms, in order: protocol-relative (`//host/x`), root-relative
     * (`/x`), current-directory (`./x`), and anything that does not start with
     * "http" (prefixed with the site root). Links starting with "http" are
     * returned unchanged.
     *
     * When the base URL has no scheme or host there is nothing to resolve
     * against, so $src is returned unchanged instead of raising warnings.
     *
     * NOTE(review): `../x` and bare relative paths (`img/x.png`) are still
     * anchored at the site root rather than the page's directory, exactly as
     * before; confirm whether full RFC 3986 resolution is wanted.
     *
     * @param string $src Link as found in the page.
     * @param string $url URL of the page that contains the link.
     * @return string Absolute URL, or $src unchanged when it cannot be resolved.
     */
    public function createLink($src, $url): string
    {
        $src = (string) $src;
        $base = parse_url((string) $url);

        if (!is_array($base) || !isset($base["scheme"], $base["host"])) {
            return $src;
        }

        // Keep an explicit port (e.g. localhost:8000); it is part of the origin.
        $origin = $base["scheme"] . "://" . $base["host"];
        if (isset($base["port"])) {
            $origin .= ":" . $base["port"];
        }

        if (strncmp($src, "//", 2) === 0) {
            return $base["scheme"] . ":" . $src;
        }

        if (strncmp($src, "/", 1) === 0) {
            return $origin . $src;
        }

        if (strncmp($src, "./", 2) === 0) {
            // Directory of the page = path up to and including its last slash.
            // This avoids the "host//file" result for pages at the site root.
            $path = $base["path"] ?? "/";
            $lastSlash = strrpos($path, "/");
            $directory = $lastSlash === false ? "/" : substr($path, 0, $lastSlash + 1);

            return $origin . $directory . substr($src, 2);
        }

        // Covers "../x" and every other link that is not already http(s).
        if (strncmp($src, "http", 4) !== 0) {
            return $origin . "/" . $src;
        }

        return $src;
    }

    /**
     * Reads a page, stores it as a site, and stores the images found on it.
     *
     * Returns without doing anything when the page has no <title> element or
     * the title is empty. Images without both alt and title are skipped.
     * Progress and debugging lines are echoed as HTML; every crawled value is
     * escaped because it comes from untrusted pages.
     *
     * @param string $url Absolute URL of the page to index.
     * @return void
     */
    public function getDetails($url): void
    {
        $parser = new DomDocumentParser($url);

        $titleNode = $parser->getTitleTags()->item(0);
        if ($titleNode === null) {
            return;
        }

        // Replace linebreak
        $title = str_replace("\n", "", (string) $titleNode->nodeValue);

        // Return if no <title>
        if ($title === "") {
            return;
        }

        $description = "";
        $keywords = "";

        foreach ($parser->getMetatags() as $meta) {
            $name = $meta->getAttribute("name");

            if ($name === "description") {
                $description = $meta->getAttribute("content");
            } elseif ($name === "keywords") {
                $keywords = $meta->getAttribute("content");
            }
        }

        $description = str_replace("\n", "", $description);
        $keywords = str_replace("\n", "", $keywords);

        $safeUrl = self::escape($url);

        if ($this->linkExists($url)) {
            echo "$safeUrl already exists<br>";
        } elseif ($this->insertLink($url, $title, $description, $keywords)) {
            echo "SUCCESS: $safeUrl<br>";
        } else {
            echo "ERROR: Failed to insert $safeUrl<br>";
        }

        // Last image that passed the alt/title filter, kept only for the debug line below.
        $lastImage = null;

        foreach ($parser->getImages() as $image) {
            $alt = $image->getAttribute("alt");
            // Separate name so the page title above is not overwritten.
            $imageTitle = $image->getAttribute("title");

            if ($imageTitle === "" && $alt === "") {
                continue;
            }

            $src = $this->createLink($image->getAttribute("src"), $url);
            $lastImage = [$src, $alt, $imageTitle];

            if (isset($this->alreadyFoundImages[$src])) {
                continue;
            }
            $this->alreadyFoundImages[$src] = true;

            $safeSrc = self::escape($src);

            if ($this->imageExists($src)) {
                echo "$safeSrc already exists<br>";
            } elseif ($this->insertImage($url, $src, $alt, $imageTitle)) {
                echo "SUCCESS: $safeSrc<br>";
            } else {
                echo "ERROR: Failed to insert $safeSrc<br>";
            }
        }

        echo "<b>URL:</b> $safeUrl, <b>Title:</b> " . self::escape($title)
            . ", <b>Description:</b> " . self::escape($description)
            . ", <b>keywords:</b> " . self::escape($keywords) . "<br>"; //DEBUGGING sites

        if ($lastImage !== null) {
            list($src, $alt, $imageTitle) = $lastImage;
            $safeSrc = self::escape($src);

            echo "<b>src:</b> <a href=\"$safeSrc\">$safeSrc</a>, <b>alt:</b> " . self::escape($alt)
                . ", <b>title:</b> " . self::escape($imageTitle)
                . ", <b>url:</b> $safeUrl<br>"; //DEBUGGING images
        }
    }

    /**
     * Crawls outward from a start page.
     *
     * Every link not seen before is indexed with getDetails() and queued; the
     * queue is then drained page by page. The previous recursive version
     * shared one queue across all recursion levels, which skipped some pages,
     * re-fetched others, and nested without bound; an explicit FIFO queue
     * visits each discovered page once.
     *
     * As before, the start URL itself is not indexed unless another page
     * links back to it.
     *
     * @param string $url Absolute URL to start crawling from.
     * @return void
     */
    public function followLinks($url): void
    {
        $this->crawling[] = $url;

        while ($this->crawling !== []) {
            $this->followPage(array_shift($this->crawling));
        }
    }

    /** Indexes and queues every new link found on a single page. */
    private function followPage($url): void
    {
        $parser = new DomDocumentParser($url);

        foreach ($parser->getLinks() as $link) {
            $href = $link->getAttribute("href");

            // Filter hrefs: skip in-page anchors and javascript: pseudo-links.
            if (strpos($href, "#") !== false || strncmp($href, "javascript:", 11) === 0) {
                continue;
            }

            $href = $this->createLink($href, $url);

            if (!isset($this->alreadyCrawled[$href])) {
                $this->alreadyCrawled[$href] = true;
                $this->crawling[] = $href;

                $this->getDetails($href);
            }

            echo self::escape($href) . "<br>"; //DEBUGGING
        }
    }

    /** Runs a single-parameter SELECT and reports whether it returned a row. */
    private function rowExists(string $sql, $value): bool
    {
        $query = $this->con->prepare($sql);
        $query->bindValue(":value", $value);
        $query->execute();

        // fetchColumn() yields false when there is no row; unlike rowCount(),
        // this does not depend on driver support for counting SELECT results.
        return $query->fetchColumn() !== false;
    }

    /** Escapes a crawled value for safe inclusion in HTML text or attributes. */
    private static function escape($text): string
    {
        return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8");
    }
}
|
||||
?>
|
|
@ -2,9 +2,9 @@
|
|||
ob_start();
|
||||
|
||||
$dbname = "doogle";
|
||||
$dbhost = "mysql_db";
|
||||
$dbuser = "doogle";
|
||||
$dbpass = "PASSWORD_HERE";
|
||||
$dbhost = "192.168.5.240";
|
||||
$dbuser = "root";
|
||||
$dbpass = "";
|
||||
|
||||
try
|
||||
{
|
||||
|
@ -15,4 +15,4 @@ catch(PDOExeption $e)
|
|||
{
|
||||
echo "Connection failed: " . $e->getMessage();
|
||||
}
|
||||
?>
|
||||
?>
|
14
crawl.php
14
crawl.php
|
@ -1,6 +1,5 @@
|
|||
<?php
|
||||
include("config.php");
|
||||
include("classes/Crawler.php");
|
||||
include("classes/DomDocumentParser.php");
|
||||
|
||||
if(isset($_SESSION['loggedin']))
|
||||
|
@ -125,6 +124,11 @@ function getDetails($url)
|
|||
$description = str_replace("\n", "", $description);
|
||||
$keywords = str_replace("\n", "", $keywords);
|
||||
|
||||
//Non-ASCII char encoding
|
||||
// $title = json_encode($title);
|
||||
// $description = json_encode($description);
|
||||
// $keywords = json_encode($keywords);
|
||||
|
||||
if(linkExists($url))
|
||||
echo "$url already exists<br>";
|
||||
else if(insertLink($url, $title, $description, $keywords))
|
||||
|
@ -241,9 +245,13 @@ function followLinks($url)
|
|||
<?php
|
||||
if (isset($_POST['url']))
|
||||
{
|
||||
$crawlerObj = new Crawler($con);
|
||||
$startUrl = $_POST['url'];
|
||||
// $crawlerObj->followLinks($startUrl);
|
||||
followLinks($startUrl);
|
||||
// $url = "https://pogoda.wp.pl/";
|
||||
// $title = "Pogoda WP.pl - na dziś, na jutro, długoterminowa dla Polski, Europy i Świata";
|
||||
// $description = "Prognoza pogody na dziś, jutro i najbliższe dni w WP.pl. Sprawdź jaka pogoda czeka Cię w ciągu najbliższych dni!";
|
||||
// $keywords = "";
|
||||
// insertLink($url, $title, $description, $keywords);
|
||||
|
||||
}
|
||||
?>
|
||||
|
|
|
@ -1,4 +1,11 @@
|
|||
-- phpMyAdmin SQL Dump - No Data
|
||||
-- version 5.1.1
|
||||
-- https://www.phpmyadmin.net/
|
||||
--
|
||||
-- Host: 192.168.5.240
|
||||
-- Generation Time: Apr 24, 2022 at 09:25 AM
|
||||
-- Server version: 8.0.28-0ubuntu0.20.04.3
|
||||
-- PHP Version: 7.4.24
|
||||
|
||||
SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
|
||||
SET AUTOCOMMIT = 0;
|
||||
|
@ -11,16 +18,10 @@ SET time_zone = "+00:00";
|
|||
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
|
||||
/*!40101 SET NAMES utf8mb4 */;
|
||||
|
||||
--
|
||||
-- User Creation: `doogle`
|
||||
--
|
||||
CREATE USER IF NOT EXISTS 'doogle'@'%' IDENTIFIED BY 'PASSWORD_HERE';
|
||||
GRANT SELECT, INSERT, UPDATE ON `doogle`.* TO 'doogle'@'%';
|
||||
|
||||
--
|
||||
-- Database: `doogle`
|
||||
--
|
||||
CREATE DATABASE IF NOT EXISTS `doogle` DEFAULT CHARACTER SET utf8mb4;
|
||||
CREATE DATABASE IF NOT EXISTS `doogle` DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci;
|
||||
USE `doogle`;
|
||||
|
||||
-- --------------------------------------------------------
|
||||
|
@ -29,7 +30,7 @@ USE `doogle`;
|
|||
-- Table structure for table `images`
|
||||
--
|
||||
|
||||
CREATE TABLE IF NOT EXISTS `images` (
|
||||
CREATE TABLE `images` (
|
||||
`id` int(11) NOT NULL,
|
||||
`siteUrl` varchar(512) NOT NULL,
|
||||
`imageUrl` varchar(512) NOT NULL,
|
||||
|
@ -37,7 +38,7 @@ CREATE TABLE IF NOT EXISTS `images` (
|
|||
`title` varchar(512) NOT NULL,
|
||||
`clicks` int(11) NOT NULL DEFAULT '0',
|
||||
`broken` tinyint(4) NOT NULL DEFAULT '0'
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
|
||||
|
||||
-- --------------------------------------------------------
|
||||
|
||||
|
@ -45,14 +46,14 @@ CREATE TABLE IF NOT EXISTS `images` (
|
|||
-- Table structure for table `sites`
|
||||
--
|
||||
|
||||
CREATE TABLE IF NOT EXISTS `sites` (
|
||||
CREATE TABLE `sites` (
|
||||
`id` int(11) NOT NULL,
|
||||
`url` varchar(512) NOT NULL,
|
||||
`title` varchar(512) NOT NULL,
|
||||
`description` varchar(512) NOT NULL,
|
||||
`keywords` varchar(512) NOT NULL,
|
||||
`clicks` int(11) NOT NULL DEFAULT '0'
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
|
||||
|
||||
-- --------------------------------------------------------
|
||||
|
||||
|
@ -60,12 +61,12 @@ CREATE TABLE IF NOT EXISTS `sites` (
|
|||
-- Table structure for table `users`
|
||||
--
|
||||
|
||||
CREATE TABLE IF NOT EXISTS `users` (
|
||||
CREATE TABLE `users` (
|
||||
`id` int(11) NOT NULL,
|
||||
`username` varchar(100) NOT NULL,
|
||||
`email` varchar(150) NOT NULL,
|
||||
`password` varchar(255) NOT NULL
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
|
||||
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
|
||||
|
||||
|
||||
--
|
||||
|
|
Loading…
Add table
Reference in a new issue