Compare commits

..

No commits in common. "main" and "v1.1.2-beta" have entirely different histories.

6 changed files with 30 additions and 258 deletions

2
.gitignore vendored
View file

@ -1,2 +0,0 @@
config.php

View file

@ -21,14 +21,11 @@ Written primarily in OOP style PHP with the intent of better understanding OOP a
- Organises search results by clicks/visits
- Pagination system at the bottom of the search page
- Shows 'results found' for search term
- Supports non-latin characters (UTF-8)
# Table of Contents
- [Setup and Usage](#setup-and-usage)
- [Docker](#docker)
- [Server Setup](#server-setup)
- [PHP Dependencies](#php-dependencies)
- [Connecting PHP to MySQL Server](#connecting-php-to-mysql-server)
- [Crawling Websites to Populate Images and Sites tables](#crawling-websites-to-populate-images-and-sites-tables)
- [Programming Logic](#programming-logic)
@ -47,30 +44,6 @@ Written primarily in OOP style PHP with the intent of better understanding OOP a
# Setup and Usage
Two methods of setup are discussed.
- Docker (Easiest)
- Server Setup
## Docker
Docker configuration files are available at [doogle-docker](https://github.com/safesploit/doogle-docker).
This presumes you already have [Docker](https://www.docker.com/) v3.9 (or greater) installed and configured.
git clone https://github.com/safesploit/doogle-docker.git
cd doogle-docker
sh build.sh
<p align="center">
<img width="857" alt="Screenshot 2023-02-22 at 21 11 33" src="https://user-images.githubusercontent.com/10171446/220760089-71baee5a-19ce-43e6-9cd5-35ce9e143400.png">
<img width="857" alt="image" src="https://user-images.githubusercontent.com/10171446/220760298-65e0b64e-3724-4e8e-b9ec-a86ba20d58c8.png">
Doogle is now accessible via [localhost:8000](http://localhost:8000).
For debugging, phpMyAdmin is also available at [localhost:8001](http://localhost:8001).
</p>
## Server Setup
v1.0.0-beta.1 is supported and tested in PHP 7.4, 8.0 and 8.1.
@ -121,7 +94,7 @@ In the file 'doogle-tables-no-data.sql' the database will be created as 'doogle'
### Form-based crawl
In your browser, go to where the file is hosted, e.g. http://localhost/crawl.php
In your browser, go to where the file is hosted, e.g. http://localhost/crawl-formSubmit.php
Paste the URL into the input field and press the Crawl button.

View file

@ -1,208 +0,0 @@
<?php
/**
 * Crawls web pages and records the pages and images it finds.
 *
 * Pages are fetched and parsed through DomDocumentParser (defined elsewhere in
 * the project). Page rows go to the `sites` table and image rows to the
 * `images` table, using the connection handed to the constructor.
 *
 * Crawl state is still shared through the globals $alreadyCrawled, $crawling
 * and $alreadyFoundImages so existing scripts that read or seed those arrays
 * keep working; each method initialises them to an empty array when unset.
 */
class Crawler
{
    /**
     * Database connection used for every query in this class.
     * Expected to be a PDO instance (the queries rely on prepare()/execute()).
     */
    private $con;

    /**
     * @param PDO $con Open database connection.
     */
    public function __construct($con)
    {
        $this->con = $con;
    }

    /**
     * Tells whether a page URL is already stored in `sites`.
     *
     * @param string $url Absolute page URL.
     * @return bool True when at least one matching row exists.
     */
    public function linkExists($url)
    {
        // rowCount() is not reliable for SELECT on every PDO driver, so ask for
        // a single row and test whether one came back.
        $query = $this->con->prepare("SELECT 1 FROM sites WHERE url = :url LIMIT 1");
        $query->execute([":url" => $url]);

        return $query->fetchColumn() !== false;
    }

    /**
     * Tells whether an image URL is already stored in `images`.
     *
     * @param string $src Absolute image URL.
     * @return bool True when at least one matching row exists.
     */
    public function imageExists($src)
    {
        $query = $this->con->prepare("SELECT 1 FROM images WHERE imageUrl = :src LIMIT 1");
        $query->execute([":src" => $src]);

        return $query->fetchColumn() !== false;
    }

    /**
     * Inserts a page into `sites`.
     *
     * @param string $url
     * @param string $title
     * @param string $description
     * @param string $keywords
     * @return bool Result of PDOStatement::execute().
     */
    public function insertLink($url, $title, $description, $keywords)
    {
        $query = $this->con->prepare(
            "INSERT INTO sites(url, title, description, keywords)
            VALUES(:url, :title, :description, :keywords)"
        );

        return $query->execute([
            ":url" => $url,
            ":title" => $title,
            ":description" => $description,
            ":keywords" => $keywords,
        ]);
    }

    /**
     * Inserts an image into `images`.
     *
     * @param string $url   URL of the page the image was found on.
     * @param string $src   Absolute image URL.
     * @param string $alt   Image alt text.
     * @param string $title Image title attribute.
     * @return bool Result of PDOStatement::execute().
     */
    public function insertImage($url, $src, $alt, $title)
    {
        $query = $this->con->prepare(
            "INSERT INTO images(siteUrl, imageUrl, alt, title)
            VALUES(:siteUrl, :imageUrl, :alt, :title)"
        );

        return $query->execute([
            ":siteUrl" => $url,
            ":imageUrl" => $src,
            ":alt" => $alt,
            ":title" => $title,
        ]);
    }

    /**
     * Converts a link found on a page into an absolute URL.
     *
     * Handles protocol-relative ("//host/x"), root-relative ("/x") and
     * document-relative ("x", "./x", "../x") references. References that
     * already carry a scheme (http:, https:, mailto:, data:, ...) are returned
     * unchanged, as is any reference when $url itself cannot be parsed.
     *
     * @param string $src Link exactly as written in the page.
     * @param string $url Absolute URL of the page the link was found on.
     * @return string Absolute URL, or $src unchanged when it cannot be resolved.
     */
    public function createLink($src, $url)
    {
        $base = parse_url($url);
        if (!is_array($base) || !isset($base["scheme"], $base["host"])) {
            // No usable base to resolve against.
            return $src;
        }

        // A reference that names its own scheme is already absolute.
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $src) === 1) {
            return $src;
        }

        $scheme = $base["scheme"];
        $origin = $scheme . "://" . $base["host"];
        if (isset($base["port"])) {
            // Keep non-default ports, otherwise links point at the wrong server.
            $origin .= ":" . $base["port"];
        }

        if (substr($src, 0, 2) === "//") {
            return $scheme . ":" . $src;
        }

        if (substr($src, 0, 1) === "/") {
            return $origin . $src;
        }

        $basePath = isset($base["path"]) ? $base["path"] : "/";

        return $origin . $this->resolveRelativePath($basePath, $src);
    }

    /**
     * Fetches a page and stores its title, meta description/keywords and images.
     *
     * Pages without a non-empty <title> are skipped. Images that have neither
     * an alt nor a title attribute, or that do not resolve to an http(s) URL,
     * are skipped as well. Progress is echoed as HTML.
     *
     * @param string $url Absolute URL of the page to index.
     * @return void
     */
    public function getDetails($url)
    {
        global $alreadyFoundImages;
        if (!is_array($alreadyFoundImages)) {
            $alreadyFoundImages = [];
        }

        $parser = new DomDocumentParser($url);

        $titleArray = $parser->getTitleTags();
        if (count($titleArray) === 0 || $titleArray->item(0) === null) {
            return;
        }

        // Titles are stored on one line.
        $title = str_replace("\n", "", (string) $titleArray->item(0)->nodeValue);
        if ($title === "") {
            return;
        }

        $description = "";
        $keywords = "";
        foreach ($parser->getMetatags() as $meta) {
            if ($meta->getAttribute("name") == "description") {
                $description = $meta->getAttribute("content");
            }
            if ($meta->getAttribute("name") == "keywords") {
                $keywords = $meta->getAttribute("content");
            }
        }
        $description = str_replace("\n", "", $description);
        $keywords = str_replace("\n", "", $keywords);

        $safeUrl = $this->escape($url);
        if ($this->linkExists($url)) {
            echo "$safeUrl already exists<br>";
        } elseif ($this->insertLink($url, $title, $description, $keywords)) {
            echo "SUCCESS: $safeUrl<br>";
        } else {
            echo "ERROR: Failed to insert $safeUrl<br>";
        }

        foreach ($parser->getImages() as $image) {
            // Image fields get their own names so the page's $title survives the loop.
            $imageAlt = $image->getAttribute("alt");
            $imageTitle = $image->getAttribute("title");
            if ($imageTitle === "" && $imageAlt === "") {
                continue;
            }

            $imageSrc = $this->createLink($image->getAttribute("src"), $url);
            if (!$this->isHttpUrl($imageSrc) || in_array($imageSrc, $alreadyFoundImages, true)) {
                continue;
            }
            $alreadyFoundImages[] = $imageSrc;

            $safeSrc = $this->escape($imageSrc);
            if ($this->imageExists($imageSrc)) {
                echo "$safeSrc already exists<br>";
            } elseif ($this->insertImage($url, $imageSrc, $imageAlt, $imageTitle)) {
                echo "SUCCESS: $safeSrc<br>";
            } else {
                echo "ERROR: Failed to insert $safeSrc<br>";
            }

            echo "<b>src:</b> <a href=\"$safeSrc\">$safeSrc</a>, <b>alt:</b> " . $this->escape($imageAlt)
                . ", <b>title:</b> " . $this->escape($imageTitle) . ", <b>url:</b> $safeUrl<br>"; //DEBUGGING images
        }

        echo "<b>URL:</b> $safeUrl, <b>Title:</b> " . $this->escape($title)
            . ", <b>Description:</b> " . $this->escape($description)
            . ", <b>keywords:</b> " . $this->escape($keywords) . "<br>"; //DEBUGGING sites
    }

    /**
     * Crawls outward from a start page, indexing every newly discovered link.
     *
     * Works through the shared $crawling queue iteratively (no recursion), so
     * each discovered page is fetched for its links once and deep sites cannot
     * exhaust the call stack. Links containing "#", "javascript:" links and
     * links that do not resolve to http(s) are ignored.
     *
     * @param string $url Absolute URL of the page to start from.
     * @return void
     */
    public function followLinks($url)
    {
        global $alreadyCrawled;
        global $crawling;
        if (!is_array($alreadyCrawled)) {
            $alreadyCrawled = [];
        }
        if (!is_array($crawling)) {
            $crawling = [];
        }

        // Pages whose links were already expanded during this call.
        $followed = [];
        $current = $url;

        while ($current !== null) {
            if (!isset($followed[$current])) {
                $followed[$current] = true;

                $parser = new DomDocumentParser($current);
                foreach ($parser->getLinks() as $link) {
                    $href = $link->getAttribute("href");

                    // Filter hrefs
                    if (strpos($href, "#") !== false || substr($href, 0, 11) === "javascript:") {
                        continue;
                    }

                    $href = $this->createLink($href, $current);
                    if (!$this->isHttpUrl($href)) {
                        continue;
                    }

                    if (!in_array($href, $alreadyCrawled, true)) {
                        $alreadyCrawled[] = $href;
                        $crawling[] = $href;
                        $this->getDetails($href);
                    }

                    echo $this->escape($href) . "<br>"; //DEBUGGING
                }
            }

            // array_shift() yields null once the queue is empty, ending the crawl.
            $current = array_shift($crawling);
        }
    }

    /**
     * Resolves a document-relative reference against the path of its base page.
     *
     * @param string $basePath Path component of the base URL (starts with "/").
     * @param string $relative Reference without scheme and without leading "/".
     * @return string Absolute path, including any query string or fragment.
     */
    private function resolveRelativePath($basePath, $relative)
    {
        // Split off query/fragment so "/" and ".." inside them are left alone.
        $pathLength = strcspn($relative, "?#");
        $relativePath = (string) substr($relative, 0, $pathLength);
        $suffix = (string) substr($relative, $pathLength);

        if ($relativePath === "") {
            // Empty or query-only reference: same document path as the base.
            return $basePath . $suffix;
        }

        // Directory of the base document: everything up to its last "/".
        $lastSlash = strrpos($basePath, "/");
        $directory = $lastSlash === false ? "/" : substr($basePath, 0, $lastSlash + 1);

        $segments = [];
        foreach (explode("/", $directory . $relativePath) as $segment) {
            if ($segment === "..") {
                // Popping an empty stack is a no-op, so ".." never climbs above the root.
                array_pop($segments);
            } elseif ($segment !== "." && $segment !== "") {
                $segments[] = $segment;
            }
        }

        $resolved = "/" . implode("/", $segments);

        // References ending in "/", "." or ".." name a directory: keep the trailing slash.
        $namesDirectory = preg_match('#(^|/)\.{0,2}$#', $relativePath) === 1;
        if ($namesDirectory && $segments !== []) {
            $resolved .= "/";
        }

        return $resolved . $suffix;
    }

    /**
     * Tells whether a URL uses the http or https scheme.
     *
     * @param string $url
     * @return bool
     */
    private function isHttpUrl($url)
    {
        return preg_match('#^https?://#i', (string) $url) === 1;
    }

    /**
     * Escapes crawled (untrusted) text for safe inclusion in HTML output.
     *
     * @param string $text
     * @return string
     */
    private function escape($text)
    {
        return htmlspecialchars((string) $text, ENT_QUOTES | ENT_SUBSTITUTE, "UTF-8");
    }
}
?>

View file

@ -2,9 +2,9 @@
// Start output buffering before anything else in this file produces output.
ob_start();
// Database connection settings.
// NOTE(review): this is a merged diff view, so $dbhost, $dbuser and $dbpass each
// appear twice (one value per compared revision). The page does not mark which
// value belongs to which side; only one set exists in any real config.php.
$dbname = "doogle";
// Variant A: named host, dedicated "doogle" account, placeholder password.
$dbhost = "mysql_db";
$dbuser = "doogle";
$dbpass = "PASSWORD_HERE";
// Variant B: fixed LAN address, root account, empty password.
// NOTE(review): root with no password is unsafe outside a throwaway dev box — confirm
// this variant is never deployed.
$dbhost = "192.168.5.240";
$dbuser = "root";
$dbpass = "";
try
{
@ -15,4 +15,4 @@ catch(PDOExeption $e)
{
echo "Connection failed: " . $e->getMessage();
}
?>
?>

View file

@ -1,6 +1,5 @@
<?php
include("config.php");
include("classes/Crawler.php");
include("classes/DomDocumentParser.php");
if(isset($_SESSION['loggedin']))
@ -125,6 +124,11 @@ function getDetails($url)
$description = str_replace("\n", "", $description);
$keywords = str_replace("\n", "", $keywords);
//Non-ASCII char encoding
// $title = json_encode($title);
// $description = json_encode($description);
// $keywords = json_encode($keywords);
if(linkExists($url))
echo "$url already exists<br>";
else if(insertLink($url, $title, $description, $keywords))
@ -241,9 +245,13 @@ function followLinks($url)
<?php
// Start a crawl when the form has posted a 'url' field.
// NOTE(review): merged diff view — lines from both compared revisions are shown together.
if (isset($_POST['url']))
{
// A Crawler is constructed with the shared connection, but the method call that
// would use it is commented out below; the free function followLinks() runs instead.
$crawlerObj = new Crawler($con);
// The posted URL is used as-is; no validation is visible in this fragment.
$startUrl = $_POST['url'];
// $crawlerObj->followLinks($startUrl);
followLinks($startUrl);
// Commented-out manual insert; presumably a one-off check that non-ASCII (Polish)
// text can be stored — TODO confirm and remove if no longer needed.
// $url = "https://pogoda.wp.pl/";
// $title = "Pogoda WP.pl - na dziś, na jutro, długoterminowa dla Polski, Europy i Świata";
// $description = "Prognoza pogody na dziś, jutro i najbliższe dni w WP.pl. Sprawdź jaka pogoda czeka Cię w ciągu najbliższych dni!";
// $keywords = "";
// insertLink($url, $title, $description, $keywords);
}
?>

View file

@ -1,4 +1,11 @@
-- phpMyAdmin SQL Dump - No Data
-- version 5.1.1
-- https://www.phpmyadmin.net/
--
-- Host: 192.168.5.240
-- Generation Time: Apr 24, 2022 at 09:25 AM
-- Server version: 8.0.28-0ubuntu0.20.04.3
-- PHP Version: 7.4.24
SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
SET AUTOCOMMIT = 0;
@ -11,16 +18,10 @@ SET time_zone = "+00:00";
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8mb4 */;
--
-- User Creation: `doogle`
--
CREATE USER IF NOT EXISTS 'doogle'@'%' IDENTIFIED BY 'PASSWORD_HERE';
GRANT SELECT, INSERT, UPDATE ON `doogle`.* TO 'doogle'@'%';
--
-- Database: `doogle`
--
CREATE DATABASE IF NOT EXISTS `doogle` DEFAULT CHARACTER SET utf8mb4;
CREATE DATABASE IF NOT EXISTS `doogle` DEFAULT CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci;
USE `doogle`;
-- --------------------------------------------------------
@ -29,7 +30,7 @@ USE `doogle`;
-- Table structure for table `images`
--
CREATE TABLE IF NOT EXISTS `images` (
CREATE TABLE `images` (
`id` int(11) NOT NULL,
`siteUrl` varchar(512) NOT NULL,
`imageUrl` varchar(512) NOT NULL,
@ -37,7 +38,7 @@ CREATE TABLE IF NOT EXISTS `images` (
`title` varchar(512) NOT NULL,
`clicks` int(11) NOT NULL DEFAULT '0',
`broken` tinyint(4) NOT NULL DEFAULT '0'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-- --------------------------------------------------------
@ -45,14 +46,14 @@ CREATE TABLE IF NOT EXISTS `images` (
-- Table structure for table `sites`
--
-- Crawled pages: one row per URL with its title, meta description and keywords.
-- NOTE(review): merged diff view — the two CREATE TABLE lines and the two closing
-- ENGINE lines come from different revisions; together they are not one valid statement.
CREATE TABLE IF NOT EXISTS `sites` (
CREATE TABLE `sites` (
-- No PRIMARY KEY / AUTO_INCREMENT is declared here; presumably added by a later
-- ALTER TABLE in the dump (not visible in this chunk).
`id` int(11) NOT NULL,
`url` varchar(512) NOT NULL,
`title` varchar(512) NOT NULL,
`description` varchar(512) NOT NULL,
`keywords` varchar(512) NOT NULL,
-- Visit counter, starts at 0.
`clicks` int(11) NOT NULL DEFAULT '0'
-- utf8mb4 variant: can hold any Unicode text.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- latin1 variant: cannot hold characters outside Latin-1, which conflicts with the
-- README's "Supports non-latin characters (UTF-8)" feature.
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
-- --------------------------------------------------------
@ -60,12 +61,12 @@ CREATE TABLE IF NOT EXISTS `sites` (
-- Table structure for table `users`
--
-- User accounts: username, email and password columns.
-- NOTE(review): merged diff view — the two CREATE TABLE lines and the two closing
-- ENGINE lines come from different revisions; together they are not one valid statement.
CREATE TABLE IF NOT EXISTS `users` (
CREATE TABLE `users` (
`id` int(11) NOT NULL,
`username` varchar(100) NOT NULL,
`email` varchar(150) NOT NULL,
-- Presumably stores a password hash rather than plain text — verify against the login code.
`password` varchar(255) NOT NULL
-- utf8mb4 variant: can hold any Unicode text.
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
-- latin1 variant: limited to Latin-1 characters.
) ENGINE=InnoDB DEFAULT CHARSET=latin1;
--