Add files via upload

This commit is contained in:
wibyweb 2023-03-28 22:58:53 -04:00 committed by GitHub
parent 26b48720bb
commit 25a8e227ee
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 153 additions and 132 deletions

View file

@ -32,10 +32,11 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath)
memset(rwindow,'?',rwindow_len);
// rwindow[rwindow_len]=0;
//curl_global_init(CURL_GLOBAL_ALL);
curl_global_init(CURL_GLOBAL_DEFAULT);
CURL *curl;
FILE *fp;
CURLcode res;
curl = curl_easy_init();
memset(robotsurl,0,1011);
strcpy(robotsurl,rURLprefix);
strcat(robotsurl,rDomain);
@ -45,10 +46,9 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath)
strcpy(outfilename,"robots/");
strcat(outfilename,rDomain);
strcat(outfilename,".txt");
curl = curl_easy_init();
long fsize=0,response_code_checkrobots=0;
char *finalURL_checkrobots = NULL;
int foundfile=0;
int foundfile=0,alloced=0;
char rb,rwb;
printf("\nChecking robots.txt: ");
@ -59,6 +59,7 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath)
fseek(robotsfile, 0, SEEK_SET); /* same as rewind(f); */
robotsfilestr = malloc(fsize + 1);
alloced=1;
if(fread(robotsfilestr, 1, fsize, robotsfile)){}
fclose(robotsfile);
@ -71,7 +72,7 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath)
if(fp = fopen(outfilename,"wb")){
//set curl options
curl_easy_setopt(curl, CURLOPT_URL, robotsurl);// set URL to get here
curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; compatible; WebCrawler; SearchEngine)");
curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; Wibybot; https://wiby.me/)");
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data_checkrobots);// send all data to this function //
curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);// write the page body to this file handle
curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);//allow redirects
@ -83,13 +84,15 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath)
res = curl_easy_perform(curl);// get it!
curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_URL, &finalURL_checkrobots);
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code_checkrobots);
curl_easy_cleanup(curl);// always cleanup
//curl_easy_cleanup(curl);// always cleanup (done further down)
fclose(fp);
if(response_code_checkrobots!=200){
fp = fopen(outfilename,"wb");
fclose(fp);
}
}else{
curl_easy_cleanup(curl);
curl_global_cleanup();
printf("\nFailed to create file: %s - proceeding anyway.",outfilename);
return 1;
}
@ -101,6 +104,7 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath)
fseek(robotsfile, 0, SEEK_SET); // same as rewind(f);
robotsfilestr = malloc(fsize + 1);
alloced=1;
if(fread(robotsfilestr, 1, fsize, robotsfile)){}
fclose(robotsfile);
@ -190,6 +194,10 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath)
}
if((i==fsize-1 && match==1) || ((rwb==10 || rwb==13) && match==1)){
printf("Permitted.");
curl_easy_cleanup(curl);
curl_global_cleanup();
if(alloced==1)
free(robotsfilestr);
return 1;
}
if(match==0)
@ -214,13 +222,24 @@ int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath)
if(result==0){
printf("Denied.");
curl_easy_cleanup(curl);
curl_global_cleanup();
if(alloced==1)
free(robotsfilestr);
return 0;
}else{
printf("Permitted.");
curl_easy_cleanup(curl);
curl_global_cleanup();
if(alloced==1)
free(robotsfilestr);
return 1;
}
}
printf("Permitted.");
curl_easy_cleanup(curl);
if(alloced==1)
free(robotsfilestr);
return 1;
}

256
c/cr.c
View file

@ -23,9 +23,9 @@ char *shardfilestr;
/*
 * Fatal-error handler for MySQL calls.
 *
 * Prints the error message reported by mysql_error() for `con` to stderr,
 * closes the connection with mysql_close(), and terminates the process
 * with exit status 1. This function does not return.
 *
 * con: the MySQL connection handle the failing call was issued on.
 */
void finish_with_error(MYSQL *con)
{
    fprintf(stderr, "%s\n", mysql_error(con));
    mysql_close(con);
    exit(1);
}
int isnum(char *source){
int sourcelength = strlen(source);
@ -37,8 +37,8 @@ int isnum(char *source){
return 1;
}
/*
 * Write `nmemb` items of `size` bytes each from `ptr` to `stream`.
 *
 * Returns the number of items fwrite() reports as written, which is less
 * than `nmemb` if a write error occurred.
 *
 * NOTE(review): the signature matches a libcurl write callback and it is
 * presumably registered via CURLOPT_WRITEFUNCTION - confirm at the call site.
 */
size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) {
    return fwrite(ptr, size, nmemb, stream);
}
int main(int argc, char **argv)
@ -60,7 +60,8 @@ int main(int argc, char **argv)
//check if there are shards to include
int nShards=0,fsize=0,shardnum=0;
char shardc, numshards[20], shardnumstr[20];
char numshards[20], shardnumstr[20];
memset(numshards,0,20);
memset(shardnumstr,0,20);
sprintf(shardnumstr,"0");
if(shardfile = fopen("shards", "r")){
@ -72,16 +73,15 @@ int main(int argc, char **argv)
if(fread(shardfilestr, 1, fsize, shardfile)){}
shardfilestr[fsize] = 0;
for(int i=0;i<fsize;i++){
shardc = shardfilestr[i];
if(shardc != 10 && shardc != 13){
numshards[i]=shardc;
if(shardfilestr[i] > 47 && shardfilestr[i] < 58){
numshards[i]=shardfilestr[i];
}
}
//check if it's a number
if(isnum(numshards)==1){
nShards = atoi(numshards);
}else{
printf("\nThe shard file contains gibberish: '%s'. Fix this to continue.",shardfilestr);
printf("\nThe shard file must contain a number. Indicate the number of available shards you are using or set it to 0 if you aren't.\n\n");
exit(0);
}
free(shardfilestr);
@ -91,8 +91,8 @@ int main(int argc, char **argv)
}
fclose(shardfile);
}else{
printf("\nWarning: 'shards' file is missing. Create the file and indicate the number of available shards you are using or set it to 0 if you aren't.\n\n");
}
printf("\nWarning: 'shards' file is missing. Create the file and indicate the number of available shards you are using or set it to 0 if you aren't.\n\n");
}
if(nShards > 0){
srand(time(NULL));
shardnum = (rand() % nShards);
@ -104,7 +104,7 @@ int main(int argc, char **argv)
{
//printf("MySQL client version: %s\n", mysql_get_client_info());
int alreadydone = 0, permitted=1;
//allocates or initialises a MYSQL object
//allocates or initialises a MYSQL object
MYSQL *con = mysql_init(NULL);
@ -121,13 +121,13 @@ int main(int argc, char **argv)
if (mysql_query(con, "SET CHARSET utf8;"))
{
finish_with_error(con);
finish_with_error(con);
}
if(id_assigned == 0){
if (mysql_query(con, "SELECT id, url, worksafe, approver, surprise, updatable, task, crawl_tree, crawl_family, crawl_depth, crawl_pages, crawl_type, crawl_repeat, force_rules FROM indexqueue limit 1;"))
{
finish_with_error(con);
finish_with_error(con);
}
}else{
char indexqueuequery[2001];
@ -137,13 +137,13 @@ int main(int argc, char **argv)
strcat(indexqueuequery,"' LIMIT 1;");
if (mysql_query(con, indexqueuequery))
{
finish_with_error(con);
finish_with_error(con);
}
}
//We get the result set using the mysql_store_result() function. The MYSQL_RES is a structure for holding a result set
MYSQL_RES *result = mysql_store_result(con);
if(result == NULL)
{
finish_with_error(con);
@ -154,17 +154,17 @@ int main(int argc, char **argv)
//We fetch the rows and print them to the screen.
/*MYSQL_ROW row;
while (row = mysql_fetch_row(result))
{
for(int i=0; i<num_fields; i++)
{
printf("%s ", row[i] ? row[i] : "NULL");
}
printf("\n");
}*/
while (row = mysql_fetch_row(result))
{
for(int i=0; i<num_fields; i++)
{
printf("%s ", row[i] ? row[i] : "NULL");
}
printf("\n");
}*/
MYSQL_ROW row = mysql_fetch_row(result);
int empty=0;
if(row == NULL){
//printf("\nQueue is empty\n");
@ -181,9 +181,9 @@ int main(int argc, char **argv)
printf("-----------------------------------------------------------------------------------\nFetching:");
//grab the first entry (fifo)
/*for(int i=0; i<num_fields; i++)
{
printf("%s ", row[i] ? row[i] : "NULL");
}*/
{
printf("%s ", row[i] ? row[i] : "NULL");
}*/
//Store data in first row into variables
char *url = row[1];
@ -201,7 +201,7 @@ int main(int argc, char **argv)
char *crawl_type = row[11];
char *crawl_repeat = row[12];
char *force_rules = row[13];
//convert crawl depth, pages to int
int n_crawl_depth=0, n_crawl_pages=0;
if(crawl_depth!=0){
@ -242,7 +242,7 @@ int main(int argc, char **argv)
}
//set the prefix
if(http > 0) strcat(prefix,"://");
else if(https > 0) strcat(prefix,"s://");
else if(httpwww > 0) strcat(prefix,"://www.");
@ -321,7 +321,7 @@ int main(int argc, char **argv)
if (mysql_query(con, checkurl))
{
finish_with_error(con);
finish_with_error(con);
}
//We get the result set using the mysql_store_result() function. The MYSQL_RES is a structure for holding a result set
@ -331,7 +331,7 @@ int main(int argc, char **argv)
{
finish_with_error(con);
}
//grab the first entry (fifo)
printf("Found ID ");
row = mysql_fetch_row(resulturlcheck);
@ -342,7 +342,7 @@ int main(int argc, char **argv)
char *fault;
char *dburl;
char *shard;
//Catalog the previous crawl attempts (to see if they are all for the same page - which would be a bad sign)
previousID[4] = previousID[3];
previousID[3] = previousID[2];
@ -405,13 +405,13 @@ int main(int argc, char **argv)
}else{
sanity = 1;
}
}else{
sanity = 1;
}
//printf("\n\n%ld, %ld, %ld, %ld, %ld\n",previousID[0],previousID[1],previousID[2],previousID[3],previousID[4]);
//see if the server will accept http only connections on older browsers, change url to HTTP only:
char urlHTTP[strlen(url)+100];
memset(urlHTTP,0,strlen(url)+100);
@ -429,7 +429,7 @@ int main(int argc, char **argv)
printf("\nAttempt HTTP connection: %s",urlHTTP);
printf("\nDownloading page... ");
//===============do the curl (download the webpage)=====================
//curl_global_init(CURL_GLOBAL_ALL);
curl_global_init(CURL_GLOBAL_DEFAULT);
CURL *curl;
FILE *fp;
CURLcode res;
@ -467,7 +467,7 @@ int main(int argc, char **argv)
}
//curl_easy_cleanup(curl); //cleanup moved further down because finalURL is needed at insert
//get file size
fseek(fp, 0L, SEEK_END);
size = ftell(fp);
@ -495,7 +495,7 @@ int main(int argc, char **argv)
else if(http > 0 || httpwww > 0){
httpAllow[0] = '1';
}
//Remove the prefix from the final URL, to store into url_noprefix
//find out if it's http or https or http://www. or https://www.
httpwww=httpswww=http=https=0;
@ -541,7 +541,7 @@ int main(int argc, char **argv)
strcat(doublecheckurl,"';");
if (mysql_query(con, doublecheckurl))
{
finish_with_error(con);
finish_with_error(con);
}
resulturlcheck = mysql_store_result(con);
if(resulturlcheck == NULL)
@ -586,7 +586,7 @@ int main(int argc, char **argv)
if(alreadydone==0 && id_assigned==1 && idexistsalready==1){
if (mysql_query(con, "use wibytemp;"))
{
finish_with_error(con);
finish_with_error(con);
}
memset(idReserve,0,100);
strcpy(idReserve,"INSERT into reserve_id (id) VALUES (");
@ -605,7 +605,7 @@ int main(int argc, char **argv)
//back to wiby database
if (mysql_query(con, "use wiby;"))
{
finish_with_error(con);
finish_with_error(con);
}
updatereserve=1;
if(alreadydone==0){
@ -646,7 +646,7 @@ int main(int argc, char **argv)
//query db
if (mysql_query(con, checkurl))
{
finish_with_error(con);
finish_with_error(con);
}
MYSQL_RES *resulturlcheck = mysql_store_result(con);
if(resulturlcheck == NULL)
@ -719,18 +719,18 @@ int main(int argc, char **argv)
windexupdate = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+1001,sizeof(char));
windexRandUpdate = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+1001,sizeof(char));
titlecheckinsert = (char*)calloc(finalURLsize+titlesize+1001,sizeof(char));
/*if(title == NULL || keywords == NULL || description == NULL || page == NULL || windexinsert == NULL || windexupdate == NULL)
{
printf("\nError allocating memory for webpage");
//cleanup sql stuff
mysql_free_result(resulturlcheck);
mysql_free_result(result);
mysql_close(con);
exit(0);
{
printf("\nError allocating memory for webpage");
//cleanup sql stuff
mysql_free_result(resulturlcheck);
mysql_free_result(result);
mysql_close(con);
exit(0);
}*/
//Check if this is a new page: check if the title found in windex is the same as the parsed title. If not, put the page back into review.
int dbtitlesize = 0,titlecheckTitleSize = 0, dbNoTitle=0,extrapos=0;
if(idexistsalready==1)
@ -742,12 +742,12 @@ int main(int argc, char **argv)
if (mysql_query(con, "use wibytemp;"))
{
finish_with_error(con);
finish_with_error(con);
}
//set charset based on crawled page charset tag
if (mysql_query(con, mysqlcharset))
{
finish_with_error(con);
finish_with_error(con);
}
//insert title into wibytemp for comparison
strcpy(titlecheckinsert,"INSERT INTO titlecheck (url,title) VALUES ('");
@ -757,11 +757,11 @@ int main(int argc, char **argv)
strcat(titlecheckinsert,"');");
if (mysql_query(con, titlecheckinsert))
{
finish_with_error(con);
finish_with_error(con);
}
if (mysql_query(con, "SET CHARSET utf8;"))
{
finish_with_error(con);
finish_with_error(con);
}
//now read back the title from the database
char checktitle[finalURLsize+dbtitlesize+1000];
@ -771,14 +771,14 @@ int main(int argc, char **argv)
//query db
if (mysql_query(con, checktitle))
{
finish_with_error(con);
finish_with_error(con);
}
MYSQL_RES *resulttitlecheck = mysql_store_result(con);
if(resulttitlecheck == NULL)
{
finish_with_error(con);
}
//grab the first entry (fifo)
MYSQL_ROW rowTitleCheck = mysql_fetch_row(resulttitlecheck);
char *titlecheckTitle;
@ -793,13 +793,13 @@ int main(int argc, char **argv)
strcat(titlecheckremove,finalURL);strcat(titlecheckremove,"';");
if (mysql_query(con, titlecheckremove))
{
finish_with_error(con);
finish_with_error(con);
}
//back to wiby database
if (mysql_query(con, "use wiby;"))
{
finish_with_error(con);
finish_with_error(con);
}
//check if original dburl is now getting redirected from finalurl (should be sent to review)
@ -868,7 +868,7 @@ int main(int argc, char **argv)
dbNoTitle=0;
}
}
//if((dbNoTitle == 0 && dbtitlesize != (titlesize-extrapos)) || (dbNoTitle == 1 && titlesize > 0 && emptytitle == 0)) //previous, before db wibytemp titlecheck method
if((dbNoTitle == 0 && dbtitlesize != titlecheckTitleSize) || (dbNoTitle == 1 && titlesize > 0 && emptytitle == 0) || (URL_is_dbtitle == 1 && dbtitlesize != titlecheckTitleSize && titlesize > 0 && emptytitle == 0))
{
@ -886,12 +886,12 @@ int main(int argc, char **argv)
if (mysql_query(con, mysqlcharset))//set charset based on page charset tag
{
finish_with_error(con);
finish_with_error(con);
}
//strcpy(windexinsert,"INSERT INTO windex (url,title,tags,description,body,worksafe,enable,date,approver,surprise,updatable) VALUES ('");
strcpy(windexinsert,"INSERT INTO windex (url,url_noprefix,title,description,body,worksafe,enable,date,approver,surprise,http,updatable,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,shard) VALUES ('");
strcpy(windexupdate,"UPDATE windex SET url = '");
int copiedRandom = 0;
@ -914,12 +914,12 @@ int main(int argc, char **argv)
if (mysql_query(con, "SELECT id, shard, url_noprefix FROM windex WHERE enable = 1 ORDER BY rand() LIMIT 1;"))
{
finish_with_error(con);
finish_with_error(con);
}
resultRandID = mysql_store_result(con);
if (resultRandID==NULL)
{
finish_with_error(con);
finish_with_error(con);
}
MYSQL_ROW row = mysql_fetch_row(resultRandID);
if(row != NULL){
@ -933,7 +933,7 @@ int main(int argc, char **argv)
if(row != NULL && id_assigned==1){
if (mysql_query(con, "use wibytemp;"))
{
finish_with_error(con);
finish_with_error(con);
}
memset(randomreserve,0,100);
strcpy(randomreserve,"INSERT into reserve_id (id) VALUES (");
@ -952,7 +952,7 @@ int main(int argc, char **argv)
//back to wiby database
if (mysql_query(con, "use wiby;"))
{
finish_with_error(con);
finish_with_error(con);
}
}
@ -1013,7 +1013,7 @@ int main(int argc, char **argv)
strcat(windexinsert,")");
if (mysql_query(con, windexinsert))
{
finish_with_error(con);
finish_with_error(con);
}
//insert into the shard table for the new row
@ -1023,30 +1023,30 @@ int main(int argc, char **argv)
strcat(windexinsert,shardnumstr);
strcat(windexinsert," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = LAST_INSERT_ID();");
/*//get the last ID
MYSQL_RES *resultIDnum;
char *lastIDnum;
MYSQL_RES *resultIDnum;
char *lastIDnum;
if (mysql_query(con, "SELECT LAST_INSERT_ID() FROM windex limit 1"))
{
finish_with_error(con);
}
MYSQL_ROW rowLastID = mysql_fetch_row(resultIDnum);
if(rowLastID != NULL){
lastIDnum = rowLastID[0];
}
if (mysql_query(con, "SELECT LAST_INSERT_ID() FROM windex limit 1"))
{
finish_with_error(con);
}
MYSQL_ROW rowLastID = mysql_fetch_row(resultIDnum);
if(rowLastID != NULL){
lastIDnum = rowLastID[0];
}
strcpy(shardinsert,"INSERT INTO ws");
strcat(shardinsert,shardnumstr);
strcat(shardinsert," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = ");
strcat(shardinsert,lastIDnum);
if (mysql_query(con, shardinsert))
{
finish_with_error(con);
}
mysql_free_result(resultIDnum); */
strcpy(shardinsert,"INSERT INTO ws");
strcat(shardinsert,shardnumstr);
strcat(shardinsert," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = ");
strcat(shardinsert,lastIDnum);
if (mysql_query(con, shardinsert))
{
finish_with_error(con);
}
mysql_free_result(resultIDnum); */
if (mysql_query(con, windexinsert))
{
finish_with_error(con);
finish_with_error(con);
}
}
}
@ -1056,7 +1056,7 @@ int main(int argc, char **argv)
strcat(windexRandUpdate,randID);
if (mysql_query(con, windexRandUpdate))
{
finish_with_error(con);
finish_with_error(con);
}
if(nShards>0){//Also copy that new row into a new row of the same ID in the round-robin assigned shard table
//update the shard id in windex
@ -1066,7 +1066,7 @@ int main(int argc, char **argv)
strcat(windexRandUpdate," WHERE id = LAST_INSERT_ID()");
if (mysql_query(con, windexRandUpdate))
{
finish_with_error(con);
finish_with_error(con);
}
//insert that row into the next shard
memset(windexRandUpdate,0,strlen(windexRandUpdate));
@ -1075,7 +1075,7 @@ int main(int argc, char **argv)
strcat(windexRandUpdate," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = LAST_INSERT_ID()");
if (mysql_query(con, windexRandUpdate))
{
finish_with_error(con);
finish_with_error(con);
}
//Overwrite the randomly selected row with the contents of the newly crawled webpage
@ -1133,9 +1133,9 @@ int main(int argc, char **argv)
strcat(windexRandUpdate,randID);
if (mysql_query(con, windexRandUpdate))
{
finish_with_error(con);
finish_with_error(con);
}
//Finally, update the corresponding shard table row
if(randshard != 0){
memset(windexRandUpdate,0,strlen(windexRandUpdate));
@ -1194,7 +1194,7 @@ int main(int argc, char **argv)
strcat(windexRandUpdate,randID);
if (mysql_query(con, windexRandUpdate))
{
finish_with_error(con);
finish_with_error(con);
}
}
}
@ -1263,7 +1263,7 @@ int main(int argc, char **argv)
strcat(windexupdate,idexistsvalue);//will be same as randID if a new page is replacing that row
if (mysql_query(con, windexupdate))
{
finish_with_error(con);
finish_with_error(con);
}
//update shard
@ -1317,7 +1317,7 @@ int main(int argc, char **argv)
strcat(windexupdate,idexistsvalue);//will be same as randID if a new page is replacing that row
if (mysql_query(con, windexupdate))
{
finish_with_error(con);
finish_with_error(con);
}
}
}
@ -1326,7 +1326,7 @@ int main(int argc, char **argv)
if(id_assigned==1 && idexistsalready==0 && reserveFail==0){
if (mysql_query(con, "use wibytemp;"))
{
finish_with_error(con);
finish_with_error(con);
}
memset(randomreserve,0,100);
strcpy(randomreserve,"DELETE FROM reserve_id where id = ");
@ -1339,14 +1339,14 @@ int main(int argc, char **argv)
//back to wiby database
if (mysql_query(con, "use wiby;"))
{
finish_with_error(con);
finish_with_error(con);
}
}
//unreserve ID if doing an update
if(id_assigned==1 && updatereserve==1){
if (mysql_query(con, "use wibytemp;"))
{
finish_with_error(con);
finish_with_error(con);
}
memset(idReserve,0,100);
strcpy(idReserve,"DELETE FROM reserve_id where id = ");
@ -1359,14 +1359,14 @@ int main(int argc, char **argv)
//back to wiby database
if (mysql_query(con, "use wiby;"))
{
finish_with_error(con);
finish_with_error(con);
}
}
//free result
if(idexistsalready == 0){
mysql_free_result(resultRandID);
}
//===================remove the entry from the indexqueue===============
//printf("\nRemoving from queue...");
char sqlqueryremove[200];
@ -1375,9 +1375,9 @@ int main(int argc, char **argv)
strcat(sqlqueryremove,id);strcat(sqlqueryremove,";");
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
finish_with_error(con);
}
printf("\n\nSuccess!");
}
//clear page from memory
@ -1405,10 +1405,10 @@ int main(int argc, char **argv)
memset(sqlqueryremove,0,200);
strcpy(sqlqueryremove,"DELETE FROM indexqueue WHERE id=");
strcat(sqlqueryremove,id);strcat(sqlqueryremove,";");
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
finish_with_error(con);
}
if(alreadydone==0){
if(idexistsalready == 1 && fault[0] == '1')
@ -1427,7 +1427,7 @@ int main(int argc, char **argv)
strcat(sqlqueryremove,idexistsvalue);
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
finish_with_error(con);
}
if(nShards > 0 && shard != 0){
memset(sqlqueryremove,0,200);
@ -1437,7 +1437,7 @@ int main(int argc, char **argv)
strcat(sqlqueryremove,idexistsvalue);
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
finish_with_error(con);
}
}
if(crawl_family == 0 || (crawl_family != 0 && crawl_family[0] =='0')){
@ -1448,7 +1448,7 @@ int main(int argc, char **argv)
strcat(sqlqueryreview,worksafe);strcat(sqlqueryreview,");");
if (mysql_query(con, sqlqueryreview))
{
finish_with_error(con);
finish_with_error(con);
}
}
}
@ -1461,7 +1461,7 @@ int main(int argc, char **argv)
strcat(sqlqueryfault,idexistsvalue);
if (mysql_query(con, sqlqueryfault))
{
finish_with_error(con);
finish_with_error(con);
}
if(nShards>0 && shard != 0){
memset(sqlqueryfault,0,450);
@ -1471,7 +1471,7 @@ int main(int argc, char **argv)
strcat(sqlqueryfault,idexistsvalue);
if (mysql_query(con, sqlqueryfault))
{
finish_with_error(con);
finish_with_error(con);
}
}
}
@ -1481,16 +1481,16 @@ int main(int argc, char **argv)
fputs ("\r\n",abandoned);
fclose(abandoned);
}
}
}
//check if link crawling is specified
//make sure duplicates don't get crawled more than once
//check db if its already indexed too - do this at beginning instead?
//check if link crawling is specified
//make sure duplicates don't get crawled more than once
//check db if its already indexed too - do this at beginning instead?
//crawl links if refresh is from link crawler, or from regular refresh while crawl_repeat is on, or during manual submission when appropriate limits are set
//crawl links if refresh is from link crawler, or from regular refresh while crawl_repeat is on, or during manual submission when appropriate limits are set
}else if(nofollow==0 && getURLs==1 && alreadydone==0){
//cycle through url list, then construct an sql string around it, then insert it to indexqueue;
//force crawl depth of 1 during a refresh if crawl_repeat is set
if(crawl_repeat != 0 && crawl_repeat[0]=='1' && task != 0 && task[0]=='1'){
n_crawl_depth=1;
@ -1514,7 +1514,7 @@ int main(int argc, char **argv)
while(urlListShuffled[loopcount]!=0){
switch(urlListShuffled[loopcount]){
case '\n' ://see if url can be indexed, if so, add to sql insert statement
urlparse(url_fromlist);
//check if internal or external url
@ -1643,7 +1643,7 @@ int main(int argc, char **argv)
}
strcat(url_insert,")");
}
memset(url_fromlist,0,url_fromlist_arraylen);
elementnum=0;
loopcount++;
@ -1662,12 +1662,14 @@ int main(int argc, char **argv)
//insert into db
if (mysql_query(con, url_insert))
{
finish_with_error(con);
finish_with_error(con);
}
}
}
if (curl)
if (curl){
curl_easy_cleanup(curl);// cleanup curl (finalURL used at inserts, thats why we cleanup and the end here
curl_global_cleanup();
}
}else{
if(alreadydone == 0){
printf("\nPage was flagged as unable to crawl or banned.");
@ -1681,7 +1683,7 @@ int main(int argc, char **argv)
strcat(sqlqueryremove,id);
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
finish_with_error(con);
}
if(idexistsalready==1 && permitted==0){
printf(" Removing from index...");
@ -1691,7 +1693,7 @@ int main(int argc, char **argv)
strcat(sqlqueryremove," AND updatable != '0'");
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
finish_with_error(con);
}
if(nShards>0 && shard != 0){
memset(sqlqueryremove,0,200);
@ -1702,7 +1704,7 @@ int main(int argc, char **argv)
strcat(sqlqueryremove," AND updatable != '0'");
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
finish_with_error(con);
}
}
}
@ -1721,8 +1723,7 @@ int main(int argc, char **argv)
shardnum=0;
sprintf(shardnumstr,"%d",shardnum);
}
printf(" Awaiting next page in queue...\n\n");
printf(" Awaiting next page in queue...\n\n");
}
//cleanup more sql stuff
mysql_free_result(result);
@ -1731,5 +1732,6 @@ int main(int argc, char **argv)
if(empty==1)
sleep(5);//sleep 5 seconds
}
exit(0);
exit(0);
}