// checkrobots: robots.txt permission check built on libcurl (the gcc command further down links with -lcurl).
//
// checkrobots(rURLprefix, rDomain, rURLpath)
//   Returns 1 when fetching the path is treated as permitted and 0 when it is denied.
//   - Any empty argument returns 1 at once; a domain longer than 253 or a path longer than 500 characters returns 0.
//   - The request URL is rURLprefix + rDomain + /robots.txt; the local copy lives at robots/DOMAIN.txt.
//   - If the local copy opens, it is read into robotsfilestr and foundfile is set; otherwise the file is
//     downloaded with libcurl (redirects followed, at most 5; 60 s total and 55 s connect timeouts;
//     size cap 1000000; SSL peer verification switched off).
//   - When the response code is not 200 the local file is reopened in wb mode and closed, which leaves it empty.
//   - If the local file cannot be created, a message is printed and 1 is returned.
//   - Parsing runs when the response code is 200, or when a local copy was found and fsize is above 11
//     (the && binds tighter than the || in that condition).
//   - A completed match in the allow branch prints Permitted. and returns 1 immediately; if result was set to 0
//     (done in what appears to be the disallow branch) the function prints Denied. and returns 0;
//     every other path prints Permitted. and returns 1.
//
// write_data_checkrobots(ptr, size, nmemb, stream)
//   libcurl write callback: hands the buffer to fwrite on stream and returns the value fwrite returned.
//
// locateInRWindow(window, birdLower, birdUpper, length)
//   Called with the lower-case and upper-case spelling of a directive plus its length; callers treat a
//   return value of 1 as found. Only the prototype and the first statements of the definition are visible here.
//
// NOTE(review): as seen in this review the text looks damaged by extraction - the #include targets are missing,
//   the head of the main parse loop and the start of both path comparisons are cut (for(int i=0;i=lenurlpath),
//   if(k=lenurlpath)), and locateInRWindow stops after its for header. Confirm against the checked-in file.
// NOTE(review): rwindow is declared with rwindow_len elements, yet rwindow[rwindow_len]=0 writes index
//   rwindow_len, one element past the end of the array.
// NOTE(review): the length of rURLprefix is not checked before it is copied into robotsurl[1011].
// NOTE(review): the results of malloc, of the rb fopen after a 200 response, and of the wb fopen used to empty
//   the file are used without a NULL check.
// NOTE(review): no free of robotsfilestr is visible, and curl_easy_cleanup is only visible on the download
//   path (not when the local copy is found or when the file cannot be created) - part of the function is
//   not visible, so confirm.
// NOTE(review): the write callback returns an item count from fwrite while libcurl compares the return value
//   with a byte count; the two agree only when size is 1 - confirm against the libcurl documentation.
#include #include #include //#include //ubuntu 16 //#include //ubuntu 16 #include //ubuntu 20 #include //ubuntu 20 //gcc checkrobots.c -o checkrobots -lcurl #define rwindow_len 100 FILE *robotsfile; char *robotsfilestr,robotsurl[1011],rwindow[rwindow_len]; //char rURLpath[] = "/dumpop/"; size_t write_data_checkrobots(void *ptr, size_t size, size_t nmemb, FILE *stream) { size_t written = fwrite(ptr, size, nmemb, stream); return written; } int locateInRWindow(char *window, char *birdLower, char *birdUpper, int length); //int main(int argc, char **argv) int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath) { if(rURLprefix[0]==0 || rDomain[0]==0 || rURLpath[0]==0) return 1; if(strlen(rDomain)>253) return 0; if(strlen(rURLpath)>500) return 0; memset(rwindow,'?',rwindow_len); rwindow[rwindow_len]=0; //curl_global_init(CURL_GLOBAL_ALL); CURL *curl; FILE *fp; CURLcode res; memset(robotsurl,0,1011); strcpy(robotsurl,rURLprefix); strcat(robotsurl,rDomain); strcat(robotsurl,"/robots.txt"); char outfilename[300]; memset(outfilename,0,300); strcpy(outfilename,"robots/"); strcat(outfilename,rDomain); strcat(outfilename,".txt"); curl = curl_easy_init(); long fsize=0,response_code_checkrobots=0; char *finalURL_checkrobots = NULL; int foundfile=0; char rb,rwb; printf("\nChecking robots.txt: "); //open robots.txt file and load into memory, or download it if it doesn't exist if(robotsfile = fopen(outfilename, "rb")){ fseek(robotsfile, 0, SEEK_END); fsize = ftell(robotsfile); fseek(robotsfile, 0, SEEK_SET); /* same as rewind(f); */ robotsfilestr = malloc(fsize + 1); if(fread(robotsfilestr, 1, fsize, robotsfile)){} fclose(robotsfile); robotsfilestr[fsize] = 0; //printf("%ld",fsize); foundfile=1; }else if (curl) { printf("Downloading... 
"); if(fp = fopen(outfilename,"wb")){ //set curl options curl_easy_setopt(curl, CURLOPT_URL, robotsurl);// set URL to get here curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; compatible; WebCrawler; SearchEngine)"); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data_checkrobots);// send all data to this function // curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);// write the page body to this file handle curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);//allow redirects curl_easy_setopt(curl, CURLOPT_TIMEOUT, 60L); curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 55L); curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);//max num of redirects curl_easy_setopt(curl, CURLOPT_MAXFILESIZE, 1000000L);//don't download if over 1MB curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);//0 or 1 to verify ssl res = curl_easy_perform(curl);// get it! curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_URL, &finalURL_checkrobots); curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code_checkrobots); curl_easy_cleanup(curl);// always cleanup fclose(fp); if(response_code_checkrobots!=200){ fp = fopen(outfilename,"wb"); fclose(fp); } }else{ printf("\nFailed to create file: %s - proceeding anyway.",outfilename); return 1; } } if(response_code_checkrobots==200 && foundfile==0){ robotsfile = fopen(outfilename, "rb"); fseek(robotsfile, 0, SEEK_END); fsize = ftell(robotsfile); fseek(robotsfile, 0, SEEK_SET); // same as rewind(f); robotsfilestr = malloc(fsize + 1); if(fread(robotsfilestr, 1, fsize, robotsfile)){} fclose(robotsfile); robotsfilestr[fsize] = 0; //printf("%ld",fsize); } //parse the robots.txt file if(response_code_checkrobots==200 || foundfile==1 && fsize > 11){ int foundUserAgent=0,foundDisallow=0,foundAllow=0,comment=0,match=0; int k=0,lenurlpath=strlen(rURLpath),rwupdated=0,result=1; for(int i=0;i=lenurlpath) match=0; k++; } if((i==fsize-1 && match==1) || ((rwb==10 || rwb==13) && match==1)){ result=0; foundDisallow=0; } if(match==0) foundDisallow=k=0; } 
//check if path is allowed in url if(rwupdated==1 && foundAllow==1){ if(rwb!=10 && rwb!=13){ //get path if(k=lenurlpath) match=0; k++; } if((i==fsize-1 && match==1) || ((rwb==10 || rwb==13) && match==1)){ printf("Permitted."); return 1; } if(match==0) foundAllow=k=0; } if(foundUserAgent==1 && rwupdated && locateInRWindow(rwindow,"disallow:","DISALLOW:",9)==1){ foundDisallow=1; foundAllow=0; k=0; //printf("\nfound disallow"); } if(foundUserAgent==1 && rwupdated && locateInRWindow(rwindow,"\nallow:","\nALLOW:",7)==1){ foundDisallow=0; foundAllow=1; k=0; //printf("\nfound allow"); } } rwupdated=0; } if(result==0){ printf("Denied."); return 0; }else{ printf("Permitted."); return 1; } } printf("Permitted."); return 1; } int locateInRWindow(char *window, char *birdLower, char *birdUpper, int length) { int start = rwindow_len-length; for(int i=0;i