wiby/c/checkrobots.h
2023-08-10 00:10:24 -04:00

257 lines
7.5 KiB
C
Executable file

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
//#include </usr/include/curl/curl.h> //RHEL/Rocky
//#include </usr/include/curl/easy.h> //RHEL/Rocky
//#include </usr/include/x86_64-linux-gnu/curl/curl.h> //ubuntu 20/22
//#include </usr/include/x86_64-linux-gnu/curl/easy.h> //ubuntu 20/22
//gcc checkrobots.c -o checkrobots -lcurl
#define rwindow_len 100
FILE *robotsfile;
char *robotsfilestr,robotsurl[1011],rwindow[rwindow_len];
//char rURLpath[] = "/dumpop/";
size_t write_data_checkrobots(void *ptr, size_t size, size_t nmemb, FILE *stream) {
size_t written = fwrite(ptr, size, nmemb, stream);
return written;
}
int locateInRWindow(char *window, char *birdLower, char *birdUpper, int length);
//int main(int argc, char **argv)
int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath)
{
if(rURLprefix[0]==0 || rDomain[0]==0 || rURLpath[0]==0)
return 1;
if(strlen(rDomain)>253)
return 0;
if(strlen(rURLpath)>500)
return 0;
memset(rwindow,'?',rwindow_len);
// rwindow[rwindow_len]=0;
curl_global_init(CURL_GLOBAL_DEFAULT);
CURL *curl;
FILE *fp;
CURLcode res;
curl = curl_easy_init();
memset(robotsurl,0,1011);
strcpy(robotsurl,rURLprefix);
strcat(robotsurl,rDomain);
strcat(robotsurl,"/robots.txt");
char outfilename[300];
memset(outfilename,0,300);
strcpy(outfilename,"robots/");
strcat(outfilename,rDomain);
strcat(outfilename,".txt");
long fsize=0,response_code_checkrobots=0;
char *finalURL_checkrobots = NULL;
int foundfile=0,alloced=0;
char rb,rwb;
printf("\nChecking robots.txt: ");
//open robots.txt file and load into memory, or download it if it doesn't exist
if(robotsfile = fopen(outfilename, "rb")){
fseek(robotsfile, 0, SEEK_END);
fsize = ftell(robotsfile);
fseek(robotsfile, 0, SEEK_SET); /* same as rewind(f); */
robotsfilestr = malloc(fsize + 1);
alloced=1;
if(fread(robotsfilestr, 1, fsize, robotsfile)){}
fclose(robotsfile);
robotsfilestr[fsize] = 0;
//printf("%ld",fsize);
foundfile=1;
}else if (curl) {
printf("Downloading... ");
if(fp = fopen(outfilename,"wb")){
//set curl options
curl_easy_setopt(curl, CURLOPT_URL, robotsurl);// set URL to get here
curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; WebCrawler; SearchEngine)");
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data_checkrobots);// send all data to this function //
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);// write the page body to this file handle
curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);//allow redirects
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 30L);
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 15L);
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);//max num of redirects
curl_easy_setopt(curl, CURLOPT_MAXFILESIZE, 1000000L);//don't download if over 1MB
curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);//0 or 1 to verify ssl
res = curl_easy_perform(curl);// get it!
curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_URL, &finalURL_checkrobots);
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code_checkrobots);
//curl_easy_cleanup(curl);// always cleanup (done further down)
fclose(fp);
if(response_code_checkrobots!=200){
fp = fopen(outfilename,"wb");
fclose(fp);
}
}else{
curl_easy_cleanup(curl);
curl_global_cleanup();
printf("\nFailed to create file: %s - proceeding anyway.",outfilename);
return 1;
}
}
if(response_code_checkrobots==200 && foundfile==0){
robotsfile = fopen(outfilename, "rb");
fseek(robotsfile, 0, SEEK_END);
fsize = ftell(robotsfile);
fseek(robotsfile, 0, SEEK_SET); // same as rewind(f);
robotsfilestr = malloc(fsize + 1);
alloced=1;
if(fread(robotsfilestr, 1, fsize, robotsfile)){}
fclose(robotsfile);
robotsfilestr[fsize] = 0;
//printf("%ld",fsize);
}
//parse the robots.txt file
if(response_code_checkrobots==200 || foundfile==1 && fsize > 11){
int foundUserAgent=0,foundDisallow=0,foundAllow=0,comment=0,match=0;
int k=0,lenurlpath=strlen(rURLpath),rwupdated=0,result=1;
for(int i=0;i<fsize;i++){
rb = robotsfilestr[i];
//use a rolling window of 100 bytes to detect elements, ignore space/null/tab
if(rb != 32 && rb != 0 && rb != 9){
for(int j=0;j<rwindow_len-1;j++){
rwindow[j] = rwindow[j+1];
}
rwindow[rwindow_len-1] = rwb = rb;
rwupdated=1;
}
if(rwb==35){
comment=1;
}
if(rwb==10 || rwb==13){
comment=0;
}
if(comment==0){
//get my specific user-agent
//change this to something else if you want
//robots.txt file would need to call this ahead of the '*' user-agent or else will get ignored.
if(foundUserAgent==0 && locateInRWindow(rwindow,"user-agent:wibybot","USER-AGENT:WIBYBOT",18)==1){
foundUserAgent=1;
//printf("\nfound user agent!");
}
//get universal user-agent //change this to something else if you want
if(foundUserAgent==0 && locateInRWindow(rwindow,"user-agent:*","USER-AGENT:*",12)==1){
foundUserAgent=1;
//printf("\nfound user agent!");
}
//if another user-agent detected after, end loop
if(foundUserAgent==1 && locateInRWindow(rwindow,"user-agent:","USER-AGENT:",11)==1){
break;
}
//end if 'Disallow: /'
if(foundUserAgent==1 && locateInRWindow(rwindow,"disallow:/\n","DISALLOW:/\n",11)==1){
result=0;
}
if(foundUserAgent==1 && locateInRWindow(rwindow,"disallow:/\r","DISALLOW:/\r",11)==1){
result=0;
}
if(i==fsize-1 && foundUserAgent==1 && locateInRWindow(rwindow,"disallow:/","DISALLOW:/",10)==1){
result=0;
}
//check if path is disallowed in url
if(rwupdated==1 && foundDisallow==1){
if(rwb!=10 && rwb!=13){
//get path
if(k<lenurlpath && rwb==rURLpath[k])
match=1;
if(k<lenurlpath && rwb!=rURLpath[k])
match=0;
if(k>=lenurlpath)
match=0;
k++;
}
if((i==fsize-1 && match==1) || ((rwb==10 || rwb==13) && match==1)){
result=0;
foundDisallow=0;
}
if(match==0)
foundDisallow=k=0;
}
//check if path is allowed in url
if(rwupdated==1 && foundAllow==1){
if(rwb!=10 && rwb!=13){
//get path
if(k<lenurlpath && rwb==rURLpath[k])
match=1;
if(k<lenurlpath && rwb!=rURLpath[k])
match=0;
if(k>=lenurlpath)
match=0;
k++;
}
if((i==fsize-1 && match==1) || ((rwb==10 || rwb==13) && match==1)){
printf("Permitted.");
curl_easy_cleanup(curl);
curl_global_cleanup();
if(alloced==1)
free(robotsfilestr);
return 1;
}
if(match==0)
foundAllow=k=0;
}
if(foundUserAgent==1 && rwupdated && locateInRWindow(rwindow,"disallow:","DISALLOW:",9)==1){
foundDisallow=1;
foundAllow=0;
k=0;
//printf("\nfound disallow");
}
if(foundUserAgent==1 && rwupdated && locateInRWindow(rwindow,"\nallow:","\nALLOW:",7)==1){
foundDisallow=0;
foundAllow=1;
k=0;
//printf("\nfound allow");
}
}
rwupdated=0;
}
if(result==0){
printf("Denied.");
curl_easy_cleanup(curl);
curl_global_cleanup();
if(alloced==1)
free(robotsfilestr);
return 0;
}else{
printf("Permitted.");
curl_easy_cleanup(curl);
curl_global_cleanup();
if(alloced==1)
free(robotsfilestr);
return 1;
}
}
printf("Permitted.");
curl_easy_cleanup(curl);
if(alloced==1)
free(robotsfilestr);
return 1;
}
int locateInRWindow(char *window, char *birdLower, char *birdUpper, int length)
{
int start = rwindow_len-length;
for(int i=0;i<length;i++){
if(window[start] != birdLower[i] && window[start] != birdUpper[i]){
return 0;
}
start++;
}
return 1;
}