diff --git a/c/checkrobots.h b/c/checkrobots.h index f1f1e80..3d69df1 100755 --- a/c/checkrobots.h +++ b/c/checkrobots.h @@ -32,10 +32,11 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath) memset(rwindow,'?',rwindow_len); // rwindow[rwindow_len]=0; - //curl_global_init(CURL_GLOBAL_ALL); + curl_global_init(CURL_GLOBAL_DEFAULT); CURL *curl; FILE *fp; CURLcode res; + curl = curl_easy_init(); memset(robotsurl,0,1011); strcpy(robotsurl,rURLprefix); strcat(robotsurl,rDomain); @@ -45,10 +46,9 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath) strcpy(outfilename,"robots/"); strcat(outfilename,rDomain); strcat(outfilename,".txt"); - curl = curl_easy_init(); long fsize=0,response_code_checkrobots=0; char *finalURL_checkrobots = NULL; - int foundfile=0; + int foundfile=0,alloced=0; char rb,rwb; printf("\nChecking robots.txt: "); @@ -59,6 +59,7 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath) fseek(robotsfile, 0, SEEK_SET); /* same as rewind(f); */ robotsfilestr = malloc(fsize + 1); + alloced=1; if(fread(robotsfilestr, 1, fsize, robotsfile)){} fclose(robotsfile); @@ -71,7 +72,7 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath) if(fp = fopen(outfilename,"wb")){ //set curl options curl_easy_setopt(curl, CURLOPT_URL, robotsurl);// set URL to get here - curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; compatible; WebCrawler; SearchEngine)"); + curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; Wibybot; https://wiby.me/)"); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data_checkrobots);// send all data to this function // curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);// write the page body to this file handle curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);//allow redirects @@ -83,13 +84,15 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath) res = curl_easy_perform(curl);// get it! curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_URL, &finalURL_checkrobots); curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code_checkrobots); - curl_easy_cleanup(curl);// always cleanup + //curl_easy_cleanup(curl);// always cleanup (done further down) fclose(fp); if(response_code_checkrobots!=200){ fp = fopen(outfilename,"wb"); fclose(fp); } }else{ + curl_easy_cleanup(curl); + curl_global_cleanup(); printf("\nFailed to create file: %s - proceeding anyway.",outfilename); return 1; } @@ -101,6 +104,7 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath) fseek(robotsfile, 0, SEEK_SET); // same as rewind(f); robotsfilestr = malloc(fsize + 1); + alloced=1; if(fread(robotsfilestr, 1, fsize, robotsfile)){} fclose(robotsfile); @@ -190,6 +194,10 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath) } if((i==fsize-1 && match==1) || ((rwb==10 || rwb==13) && match==1)){ printf("Permitted."); + curl_easy_cleanup(curl); + curl_global_cleanup(); + if(alloced==1) + free(robotsfilestr); return 1; } if(match==0) @@ -214,13 +222,24 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath) if(result==0){ printf("Denied."); + curl_easy_cleanup(curl); + curl_global_cleanup(); + if(alloced==1) + free(robotsfilestr); return 0; }else{ printf("Permitted."); + curl_easy_cleanup(curl); + curl_global_cleanup(); + if(alloced==1) + free(robotsfilestr); return 1; } } printf("Permitted."); + curl_easy_cleanup(curl); + if(alloced==1) + free(robotsfilestr); return 1; } diff --git a/c/cr.c b/c/cr.c index a2062da..bc16066 100755 --- a/c/cr.c +++ b/c/cr.c @@ -23,9 +23,9 @@ char *shardfilestr; void finish_with_error(MYSQL *con) { - fprintf(stderr, "%s\n", mysql_error(con)); - mysql_close(con); - exit(1); + fprintf(stderr, "%s\n", mysql_error(con)); + mysql_close(con); + exit(1); } int isnum(char *source){ int sourcelength = strlen(source); @@ -37,8 +37,8 @@ int isnum(char *source){ return 1; } size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) { - size_t written = fwrite(ptr, size, nmemb, stream); - return written; + size_t written = fwrite(ptr, size, nmemb, stream); + return written; } int main(int argc, char **argv) @@ -60,7 +60,8 @@ int main(int argc, char **argv) //check if there are shards to include int nShards=0,fsize=0,shardnum=0; - char shardc, numshards[20], shardnumstr[20]; + char numshards[20], shardnumstr[20]; + memset(numshards,0,20); memset(shardnumstr,0,20); sprintf(shardnumstr,"0"); if(shardfile = fopen("shards", "r")){ @@ -72,16 +73,15 @@ int main(int argc, char **argv) if(fread(shardfilestr, 1, fsize, shardfile)){} shardfilestr[fsize] = 0; for(int i=0;i 47 && shardfilestr[i] < 58){ + numshards[i]=shardfilestr[i]; } } //check if its a number if(isnum(numshards)==1){ nShards = atoi(numshards); }else{ - printf("\nThe shard file contains gibberish: '%s'. Fix this to continue.",shardfilestr); + printf("\nThe shard file must contain a number. Indicate the number of available shards you are using or set it to 0 if you aren't.\n\n"); exit(0); } free(shardfilestr); @@ -91,8 +91,8 @@ int main(int argc, char **argv) } fclose(shardfile); }else{ - printf("\nWarning: 'shards' file is missing. Create the file and indicate the number of available shards you are using or set it to 0 if you aren't.\n\n"); - } + printf("\nWarning: 'shards' file is missing. Create the file and indicate the number of available shards you are using or set it to 0 if you aren't.\n\n"); + } if(nShards > 0){ srand(time(NULL)); shardnum = (rand() % nShards); @@ -104,7 +104,7 @@ int main(int argc, char **argv) { //printf("MySQL client version: %s\n", mysql_get_client_info()); int alreadydone = 0, permitted=1; - //allocates or initialises a MYSQL object + //allocates or initialises a MYSQL object MYSQL *con = mysql_init(NULL); @@ -121,13 +121,13 @@ int main(int argc, char **argv) if (mysql_query(con, "SET CHARSET utf8;")) { - finish_with_error(con); + finish_with_error(con); } - + if(id_assigned == 0){ if (mysql_query(con, "SELECT id, url, worksafe, approver, surprise, updatable, task, crawl_tree, crawl_family, crawl_depth, crawl_pages, crawl_type, crawl_repeat, force_rules FROM indexqueue limit 1;")) { - finish_with_error(con); + finish_with_error(con); } }else{ char indexqueuequery[2001]; @@ -137,13 +137,13 @@ int main(int argc, char **argv) strcat(indexqueuequery,"' LIMIT 1;"); if (mysql_query(con, indexqueuequery)) { - finish_with_error(con); + finish_with_error(con); } } //We get the result set using the mysql_store_result() function. The MYSQL_RES is a structure for holding a result set MYSQL_RES *result = mysql_store_result(con); - + if(result == NULL) { finish_with_error(con); @@ -154,17 +154,17 @@ int main(int argc, char **argv) //We fetch the rows and print them to the screen. /*MYSQL_ROW row; - while (row = mysql_fetch_row(result)) - { - for(int i=0; i 0) strcat(prefix,"://"); else if(https > 0) strcat(prefix,"s://"); else if(httpwww > 0) strcat(prefix,"://www."); @@ -321,7 +321,7 @@ int main(int argc, char **argv) if (mysql_query(con, checkurl)) { - finish_with_error(con); + finish_with_error(con); } //We get the result set using the mysql_store_result() function. The MYSQL_RES is a structure for holding a result set @@ -331,7 +331,7 @@ int main(int argc, char **argv) { finish_with_error(con); } - + //grab the first entry (fifo) printf("Found ID "); row = mysql_fetch_row(resulturlcheck); @@ -342,7 +342,7 @@ int main(int argc, char **argv) char *fault; char *dburl; char *shard; - + //Catalog the previous crawl attempts (to see if they are all for the same page - which would be a bad sign) previousID[4] = previousID[3]; previousID[3] = previousID[2]; @@ -405,13 +405,13 @@ int main(int argc, char **argv) }else{ sanity = 1; } - + }else{ sanity = 1; } //printf("\n\n%ld, %ld, %ld, %ld, %ld\n",previousID[0],previousID[1],previousID[2],previousID[3],previousID[4]); - + //see if the server will accept http only connections on older browsers, change url to HTTP only: char urlHTTP[strlen(url)+100]; memset(urlHTTP,0,strlen(url)+100); @@ -429,7 +429,7 @@ int main(int argc, char **argv) printf("\nAttempt HTTP connection: %s",urlHTTP); printf("\nDownloading page... "); //===============do the curl (download the webpage)===================== - //curl_global_init(CURL_GLOBAL_ALL); + curl_global_init(CURL_GLOBAL_DEFAULT); CURL *curl; FILE *fp; CURLcode res; @@ -467,7 +467,7 @@ int main(int argc, char **argv) } //curl_easy_cleanup(curl); //cleanup moved further down because finalURL is needed at insert - + //get file size fseek(fp, 0L, SEEK_END); size = ftell(fp); @@ -495,7 +495,7 @@ int main(int argc, char **argv) else if(http > 0 || httpwww > 0){ httpAllow[0] = '1'; } - + //Remove the prefix from the final URL, to store into url_noprefix //find out if its http or https or http://www. or https://www. httpwww=httpswww=http=https=0; @@ -541,7 +541,7 @@ int main(int argc, char **argv) strcat(doublecheckurl,"';"); if (mysql_query(con, doublecheckurl)) { - finish_with_error(con); + finish_with_error(con); } resulturlcheck = mysql_store_result(con); if(resulturlcheck == NULL) @@ -586,7 +586,7 @@ int main(int argc, char **argv) if(alreadydone==0 && id_assigned==1 && idexistsalready==1){ if (mysql_query(con, "use wibytemp;")) { - finish_with_error(con); + finish_with_error(con); } memset(idReserve,0,100); strcpy(idReserve,"INSERT into reserve_id (id) VALUES ("); @@ -605,7 +605,7 @@ int main(int argc, char **argv) //back to wiby database if (mysql_query(con, "use wiby;")) { - finish_with_error(con); + finish_with_error(con); } updatereserve=1; if(alreadydone==0){ @@ -646,7 +646,7 @@ int main(int argc, char **argv) //query db if (mysql_query(con, checkurl)) { - finish_with_error(con); + finish_with_error(con); } MYSQL_RES *resulturlcheck = mysql_store_result(con); if(resulturlcheck == NULL) @@ -719,18 +719,18 @@ int main(int argc, char **argv) windexupdate = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+1001,sizeof(char)); windexRandUpdate = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+1001,sizeof(char)); titlecheckinsert = (char*)calloc(finalURLsize+titlesize+1001,sizeof(char)); - + /*if(title == NULL || keywords == NULL || description == NULL || page == NULL || windexinsert == NULL || windexupdate == NULL) - { - printf("\nError allocating memory for webpage"); - //cleanup sql stuff - mysql_free_result(resulturlcheck); - mysql_free_result(result); - mysql_close(con); - exit(0); + { + printf("\nError allocating memory for webpage"); + //cleanup sql stuff + mysql_free_result(resulturlcheck); + mysql_free_result(result); + mysql_close(con); + exit(0); }*/ - + //Check if this is a new page: check if the title found in windex is the same as the parsed title. If not, put the page back into review. int dbtitlesize = 0,titlecheckTitleSize = 0, dbNoTitle=0,extrapos=0; if(idexistsalready==1) @@ -742,12 +742,12 @@ int main(int argc, char **argv) if (mysql_query(con, "use wibytemp;")) { - finish_with_error(con); + finish_with_error(con); } //set charset based on crawled page charset tag if (mysql_query(con, mysqlcharset)) { - finish_with_error(con); + finish_with_error(con); } //insert title into wibytemp for comparison strcpy(titlecheckinsert,"INSERT INTO titlecheck (url,title) VALUES ('"); @@ -757,11 +757,11 @@ int main(int argc, char **argv) strcat(titlecheckinsert,"');"); if (mysql_query(con, titlecheckinsert)) { - finish_with_error(con); + finish_with_error(con); } if (mysql_query(con, "SET CHARSET utf8;")) { - finish_with_error(con); + finish_with_error(con); } //now read back the title from the database char checktitle[finalURLsize+dbtitlesize+1000]; @@ -771,14 +771,14 @@ int main(int argc, char **argv) //query db if (mysql_query(con, checktitle)) { - finish_with_error(con); + finish_with_error(con); } MYSQL_RES *resulttitlecheck = mysql_store_result(con); if(resulttitlecheck == NULL) { finish_with_error(con); } - + //grab the first entry (fifo) MYSQL_ROW rowTitleCheck = mysql_fetch_row(resulttitlecheck); char *titlecheckTitle; @@ -793,13 +793,13 @@ int main(int argc, char **argv) strcat(titlecheckremove,finalURL);strcat(titlecheckremove,"';"); if (mysql_query(con, titlecheckremove)) { - finish_with_error(con); + finish_with_error(con); } //back to wiby database if (mysql_query(con, "use wiby;")) { - finish_with_error(con); + finish_with_error(con); } //check if original dburl is now getting redirected from finalurl (should be sent to review) @@ -868,7 +868,7 @@ int main(int argc, char **argv) dbNoTitle=0; } } - + //if((dbNoTitle == 0 && dbtitlesize != (titlesize-extrapos)) || (dbNoTitle == 1 && titlesize > 0 && emptytitle == 0)) //previous, before db wibytemp titlecheck method if((dbNoTitle == 0 && dbtitlesize != titlecheckTitleSize) || (dbNoTitle == 1 && titlesize > 0 && emptytitle == 0) || (URL_is_dbtitle == 1 && dbtitlesize != titlecheckTitleSize && titlesize > 0 && emptytitle == 0)) { @@ -886,12 +886,12 @@ int main(int argc, char **argv) if (mysql_query(con, mysqlcharset))//set charset based on page charset tag { - finish_with_error(con); + finish_with_error(con); } //strcpy(windexinsert,"INSERT INTO windex (url,title,tags,description,body,worksafe,enable,date,approver,surprise,updatable) VALUES ('"); strcpy(windexinsert,"INSERT INTO windex (url,url_noprefix,title,description,body,worksafe,enable,date,approver,surprise,http,updatable,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,shard) VALUES ('"); - + strcpy(windexupdate,"UPDATE windex SET url = '"); int copiedRandom = 0; @@ -914,12 +914,12 @@ int main(int argc, char **argv) if (mysql_query(con, "SELECT id, shard, url_noprefix FROM windex WHERE enable = 1 ORDER BY rand() LIMIT 1;")) { - finish_with_error(con); + finish_with_error(con); } resultRandID = mysql_store_result(con); if (resultRandID==NULL) { - finish_with_error(con); + finish_with_error(con); } MYSQL_ROW row = mysql_fetch_row(resultRandID); if(row != NULL){ @@ -933,7 +933,7 @@ int main(int argc, char **argv) if(row != NULL && id_assigned==1){ if (mysql_query(con, "use wibytemp;")) { - finish_with_error(con); + finish_with_error(con); } memset(randomreserve,0,100); strcpy(randomreserve,"INSERT into reserve_id (id) VALUES ("); @@ -952,7 +952,7 @@ int main(int argc, char **argv) //back to wiby database if (mysql_query(con, "use wiby;")) { - finish_with_error(con); + finish_with_error(con); } } @@ -1013,7 +1013,7 @@ int main(int argc, char **argv) strcat(windexinsert,")"); if (mysql_query(con, windexinsert)) { - finish_with_error(con); + finish_with_error(con); } //insert into the shard table for the new row @@ -1023,30 +1023,30 @@ int main(int argc, char **argv) strcat(windexinsert,shardnumstr); strcat(windexinsert," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = LAST_INSERT_ID();"); /*//get the last ID - MYSQL_RES *resultIDnum; - char *lastIDnum; + MYSQL_RES *resultIDnum; + char *lastIDnum; - if (mysql_query(con, "SELECT LAST_INSERT_ID() FROM windex limit 1")) - { - finish_with_error(con); - } - MYSQL_ROW rowLastID = mysql_fetch_row(resultIDnum); - if(rowLastID != NULL){ - lastIDnum = rowLastID[0]; - } + if (mysql_query(con, "SELECT LAST_INSERT_ID() FROM windex limit 1")) + { + finish_with_error(con); + } + MYSQL_ROW rowLastID = mysql_fetch_row(resultIDnum); + if(rowLastID != NULL){ + lastIDnum = rowLastID[0]; + } - strcpy(shardinsert,"INSERT INTO ws"); - strcat(shardinsert,shardnumstr); - strcat(shardinsert," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = "); - strcat(shardinsert,lastIDnum); - if (mysql_query(con, shardinsert)) - { - finish_with_error(con); - } - mysql_free_result(resultIDnum); */ + strcpy(shardinsert,"INSERT INTO ws"); + strcat(shardinsert,shardnumstr); + strcat(shardinsert," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = "); + strcat(shardinsert,lastIDnum); + if (mysql_query(con, shardinsert)) + { + finish_with_error(con); + } + mysql_free_result(resultIDnum); */ if (mysql_query(con, windexinsert)) { - finish_with_error(con); + finish_with_error(con); } } } @@ -1056,7 +1056,7 @@ int main(int argc, char **argv) strcat(windexRandUpdate,randID); if (mysql_query(con, windexRandUpdate)) { - finish_with_error(con); + finish_with_error(con); } if(nShards>0){//Also copy that new row into a new row of the same ID in the round-robin assigned shard table //update the shard id in windex @@ -1066,7 +1066,7 @@ int main(int argc, char **argv) strcat(windexRandUpdate," WHERE id = LAST_INSERT_ID()"); if (mysql_query(con, windexRandUpdate)) { - finish_with_error(con); + finish_with_error(con); } //insert that row into the next shard memset(windexRandUpdate,0,strlen(windexRandUpdate)); @@ -1075,7 +1075,7 @@ int main(int argc, char **argv) strcat(windexRandUpdate," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = LAST_INSERT_ID()"); if (mysql_query(con, windexRandUpdate)) { - finish_with_error(con); + finish_with_error(con); } //Overwrite the randomly selected row with the contents of the newly crawled webpage @@ -1133,9 +1133,9 @@ int main(int argc, char **argv) strcat(windexRandUpdate,randID); if (mysql_query(con, windexRandUpdate)) { - finish_with_error(con); + finish_with_error(con); } - + //Finally, update the corresponding shard table row if(randshard != 0){ memset(windexRandUpdate,0,strlen(windexRandUpdate)); @@ -1194,7 +1194,7 @@ int main(int argc, char **argv) strcat(windexRandUpdate,randID); if (mysql_query(con, windexRandUpdate)) { - finish_with_error(con); + finish_with_error(con); } } } @@ -1263,7 +1263,7 @@ int main(int argc, char **argv) strcat(windexupdate,idexistsvalue);//will be same as randID if a new page is replacing that row if (mysql_query(con, windexupdate)) { - finish_with_error(con); + finish_with_error(con); } //update shard @@ -1317,7 +1317,7 @@ int main(int argc, char **argv) strcat(windexupdate,idexistsvalue);//will be same as randID if a new page is replacing that row if (mysql_query(con, windexupdate)) { - finish_with_error(con); + finish_with_error(con); } } } @@ -1326,7 +1326,7 @@ int main(int argc, char **argv) if(id_assigned==1 && idexistsalready==0 && reserveFail==0){ if (mysql_query(con, "use wibytemp;")) { - finish_with_error(con); + finish_with_error(con); } memset(randomreserve,0,100); strcpy(randomreserve,"DELETE FROM reserve_id where id = "); @@ -1339,14 +1339,14 @@ int main(int argc, char **argv) //back to wiby database if (mysql_query(con, "use wiby;")) { - finish_with_error(con); + finish_with_error(con); } } //unreserve ID if doing an update if(id_assigned==1 && updatereserve==1){ if (mysql_query(con, "use wibytemp;")) { - finish_with_error(con); + finish_with_error(con); } memset(idReserve,0,100); strcpy(idReserve,"DELETE FROM reserve_id where id = "); @@ -1359,14 +1359,14 @@ int main(int argc, char **argv) //back to wiby database if (mysql_query(con, "use wiby;")) { - finish_with_error(con); + finish_with_error(con); } } //free result if(idexistsalready == 0){ mysql_free_result(resultRandID); } - + //===================remove the entry from the indexqueue=============== //printf("\nRemoving from queue..."); char sqlqueryremove[200]; @@ -1375,9 +1375,9 @@ int main(int argc, char **argv) strcat(sqlqueryremove,id);strcat(sqlqueryremove,";"); if (mysql_query(con, sqlqueryremove)) { - finish_with_error(con); + finish_with_error(con); } - + printf("\n\nSuccess!"); } //clear page from memory @@ -1405,10 +1405,10 @@ int main(int argc, char **argv) memset(sqlqueryremove,0,200); strcpy(sqlqueryremove,"DELETE FROM indexqueue WHERE id="); strcat(sqlqueryremove,id);strcat(sqlqueryremove,";"); - + if (mysql_query(con, sqlqueryremove)) { - finish_with_error(con); + finish_with_error(con); } if(alreadydone==0){ if(idexistsalready == 1 && fault[0] == '1') @@ -1427,7 +1427,7 @@ int main(int argc, char **argv) strcat(sqlqueryremove,idexistsvalue); if (mysql_query(con, sqlqueryremove)) { - finish_with_error(con); + finish_with_error(con); } if(nShards > 0 && shard != 0){ memset(sqlqueryremove,0,200); @@ -1437,7 +1437,7 @@ int main(int argc, char **argv) strcat(sqlqueryremove,idexistsvalue); if (mysql_query(con, sqlqueryremove)) { - finish_with_error(con); + finish_with_error(con); } } if(crawl_family == 0 || (crawl_family != 0 && crawl_family[0] =='0')){ @@ -1448,7 +1448,7 @@ int main(int argc, char **argv) strcat(sqlqueryreview,worksafe);strcat(sqlqueryreview,");"); if (mysql_query(con, sqlqueryreview)) { - finish_with_error(con); + finish_with_error(con); } } } @@ -1461,7 +1461,7 @@ int main(int argc, char **argv) strcat(sqlqueryfault,idexistsvalue); if (mysql_query(con, sqlqueryfault)) { - finish_with_error(con); + finish_with_error(con); } if(nShards>0 && shard != 0){ memset(sqlqueryfault,0,450); @@ -1471,7 +1471,7 @@ int main(int argc, char **argv) strcat(sqlqueryfault,idexistsvalue); if (mysql_query(con, sqlqueryfault)) { - finish_with_error(con); + finish_with_error(con); } } } @@ -1481,16 +1481,16 @@ int main(int argc, char **argv) fputs ("\r\n",abandoned); fclose(abandoned); } - } + } - //check if link crawling is specified - //make sure duplicates don't get crawled more than once - //check db if its already indexed too - do this at beginning instead? + //check if link crawling is specified + //make sure duplicates don't get crawled more than once + //check db if its already indexed too - do this at beginning instead? - //crawl links if refresh is from link crawler, or from regular refresh while crawl_repeat is on, or during manual submission when appropriate limits are set + //crawl links if refresh is from link crawler, or from regular refresh while crawl_repeat is on, or during manual submission when appropriate limits are set }else if(nofollow==0 && getURLs==1 && alreadydone==0){ //cycle through url list, then construct an sql string around it, then insert it to indexqueue; - + //force crawl depth of 1 during a refresh if crawl_repeat is set if(crawl_repeat != 0 && crawl_repeat[0]=='1' && task != 0 && task[0]=='1'){ n_crawl_depth=1; @@ -1514,7 +1514,7 @@ int main(int argc, char **argv) while(urlListShuffled[loopcount]!=0){ switch(urlListShuffled[loopcount]){ case '\n' ://see if url can be indexed, if so, add to sql insert statement - + urlparse(url_fromlist); //check if internal or external url @@ -1643,7 +1643,7 @@ int main(int argc, char **argv) } strcat(url_insert,")"); } - + memset(url_fromlist,0,url_fromlist_arraylen); elementnum=0; loopcount++; @@ -1662,12 +1662,14 @@ int main(int argc, char **argv) //insert into db if (mysql_query(con, url_insert)) { - finish_with_error(con); + finish_with_error(con); } } } - if (curl) + if (curl){ curl_easy_cleanup(curl);// cleanup curl (finalURL used at inserts, thats why we cleanup and the end here + curl_global_cleanup(); + } }else{ if(alreadydone == 0){ printf("\nPage was flagged as unable to crawl or banned."); @@ -1681,7 +1683,7 @@ int main(int argc, char **argv) strcat(sqlqueryremove,id); if (mysql_query(con, sqlqueryremove)) { - finish_with_error(con); + finish_with_error(con); } if(idexistsalready==1 && permitted==0){ printf(" Removing from index..."); @@ -1691,7 +1693,7 @@ int main(int argc, char **argv) strcat(sqlqueryremove," AND updatable != '0'"); if (mysql_query(con, sqlqueryremove)) { - finish_with_error(con); + finish_with_error(con); } if(nShards>0 && shard != 0){ memset(sqlqueryremove,0,200); @@ -1702,7 +1704,7 @@ int main(int argc, char **argv) strcat(sqlqueryremove," AND updatable != '0'"); if (mysql_query(con, sqlqueryremove)) { - finish_with_error(con); + finish_with_error(con); } } } @@ -1721,8 +1723,7 @@ int main(int argc, char **argv) shardnum=0; sprintf(shardnumstr,"%d",shardnum); } - - printf(" Awaiting next page in queue...\n\n"); + printf(" Awaiting next page in queue...\n\n"); } //cleanup more sql stuff mysql_free_result(result); @@ -1731,5 +1732,6 @@ int main(int argc, char **argv) if(empty==1) sleep(5);//sleep 5 seconds } - exit(0); + exit(0); } +