cr.c

  1. //Wiby Web Crawler
  2. #include </usr/include/mysql/mysql.h>
  3. #include <stdlib.h>
  4. #include <stdio.h>
  5. #include <strings.h> //strcasecmp()
#include <string.h> //strcpy(), strcat(), strlen(), memset(), memcpy()
  6. //#include </usr/include/curl/curl.h> //RHEL/Rocky
  7. //#include </usr/include/curl/easy.h> //RHEL/Rocky
  8. #include </usr/include/x86_64-linux-gnu/curl/curl.h> //ubuntu 20/22
  9. #include </usr/include/x86_64-linux-gnu/curl/easy.h> //ubuntu 20/22
  10. #include "htmlparse.h"
  11. #include "urlparse.h"
  12. #include "checkrobots.h"
  13. #include <unistd.h>
#include <time.h> //time() used to seed rand()
  14. #define url_fromlist_arraylen 102400
  15. #define url_insert_arraylen 1024000
  16. char /**title, *keywords, *description, *page,*/ *windexinsert, *windexupdate, *windexRandUpdate, *titlecheckinsert, /**shardinsert,*/ correctedURL[1001], urlPath_finalURL[1001], folderPath_finalURL[1001], urlPrefix_finalURL[1001], urlNPNP_finalURL[1001], strDepth[101], url_fromlist[url_fromlist_arraylen], url_insert[url_insert_arraylen], previousfail[5][1001];
  17. FILE *shardfile;
  18. char *shardfilestr;
  19. void finish_with_error(MYSQL *con)
  20. {
  21. fprintf(stderr, "%s\n", mysql_error(con));
  22. mysql_close(con);
  23. exit(1);
  24. }
  25. int isnum(char *source){
  26. int sourcelength = strlen(source);
  27. for(int i=0;i < sourcelength; i++){
  28. if(source[i] < 48 || source[i] > 57){
  29. return 0;
  30. }
  31. }
  32. return 1;
  33. }
  34. size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) {
  35. size_t written = fwrite(ptr, size, nmemb, stream);
  36. return written;
  37. }
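//write_data() is the libcurl write callback: each chunk of the HTTP response body is
//appended to the FILE* handed in through CURLOPT_WRITEDATA (the page.out file opened below).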
  38. int main(int argc, char **argv)
  39. {
  40. int id_assigned=0,sleeptime=1;
  41. if(argc >= 2 && isnum(argv[1])==1){
  42. if(argv[1][0] == 48){
  43. printf("When assigning IDs, you must start at 1. Cannot set an ID of 0.\n");
  44. exit(0);
  45. }
  46. id_assigned=1;
  47. }
  48. if((argc >= 2 && isnum(argv[1])==0) || (argc >= 3 && isnum(argv[2])==0) || argc > 3){
  49. printf("\nWiby Web Crawler\n\nUsage: cr Crawler_ID Sleep_Time(s)\n\nEach page in the indexqueue may be assigned a crawler ID. IDs are assigned when you tell the Refresh Scheduler the total number of crawlers you are running, and when you set the variable '$num_crawlers' inside review.php and graveyard.php (line 73) to that same number. The scheduler assigns each page a crawler ID, in round-robin order, within the range of that total.\n\nExample: If you want two crawlers running, specify the first with an ID of 1 and the second with an ID of 2. Run them in separate folders, and provide a symlink to the 'robots' folder and 'shards' file in each. Each crawler will crawl pages in the indexqueue with its corresponding ID.\n\nYou can also run without assigning an ID, in which case the crawler ignores the ID assignments. So if you have only one crawler running, assigning an ID is optional unless you need to change the sleep time (then just use an ID of 1). Don't run multiple crawlers without assigning IDs.\n\nSpecify the total number of shard tables you wish to use in the 'shards' file. The crawler will round-robin insert/update rows in these tables (ws0 to wsX) along with the main 'windex' table. The default is 4.\n\nThe Sleep_Time is 1 second by default but can be set to 0 or higher; it applies when crawling hyperlinks and inserts a delay between each link that is crawled. This delay is not used between individual pages that were submitted by people.\n\n");
  50. exit(0);
  51. }
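//Illustrative invocations (hypothetical paths): "./cr" runs one unassigned crawler,
//"./cr 1" runs as crawler ID 1, "./cr 2 5" runs as crawler ID 2 with a 5 second sleep
//between hyperlink crawls.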
  52. if(argc >= 3){
  53. sleeptime = atoi(argv[2]);
  54. }
  55. long int previousID[5] = {0, 1, 2, 3, 4};
  56. int sanity = 1;
  57. for(int i=0;i<5;i++){
  58. previousfail[i][0]=0;
  59. }
  60. //check if there are shards to include
  61. int nShards=0,fsize=0,shardnum=0;
  62. char numshards[20], shardnumstr[20];
  63. memset(numshards,0,20);
  64. memset(shardnumstr,0,20);
  65. sprintf(shardnumstr,"0");
  66. if(shardfile = fopen("shards", "r")){
  67. fseek(shardfile, 0, SEEK_END);
  68. fsize = ftell(shardfile);
  69. fseek(shardfile, 0, SEEK_SET);
  70. if(fsize > 0 && fsize < 11){
  71. shardfilestr = malloc(fsize + 1);
  72. if(fread(shardfilestr, 1, fsize, shardfile)){}
  73. shardfilestr[fsize] = 0;
  74. for(int i=0;i<fsize;i++){
  75. if(shardfilestr[i] != 13 && shardfilestr[i] != 10){
  76. numshards[i]=shardfilestr[i];
  77. }
  78. }
  79. //check if its a number
  80. if(isnum(numshards)==1){
  81. nShards = atoi(numshards);
  82. }else{
  83. printf("The shard file must contain a number. Indicate the number of available shards you are using or set it to 0 if you aren't.\n\n");
  84. exit(0);
  85. }
  86. free(shardfilestr);
  87. }
  88. if(fsize>10 || fsize<1){
  89. printf("\nTotal number of shards is not specified or too large.\n");
  90. exit(0);
  91. }
  92. fclose(shardfile);
  93. }else{
  94. printf("\nWarning: 'shards' file is missing. Create the file and indicate the number of available shards you are using or set it to 0 if you aren't.\n\n");
  95. }
  96. if(nShards > 0){
  97. srand(time(NULL));
  98. shardnum = (rand() % nShards);
  99. memset(shardnumstr,0,20);
  100. sprintf(shardnumstr,"%d",shardnum);
  101. }
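//A random starting shard number (0 to nShards-1) is picked here; it is stored in each new
//row's 'shard' column and selects which wsN table receives a mirrored copy (see the
//INSERT/UPDATE statements further down).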
  102. while(1)
  103. {
  104. //printf("MySQL client version: %s\n", mysql_get_client_info());
  105. int alreadydone = 0, permitted=1;
  106. //allocates or initialises a MYSQL object
  107. MYSQL *con = mysql_init(NULL);
  108. if (con == NULL)
  109. {
  110. finish_with_error(con);
  111. }
  112. //establish a connection to the database. We provide connection handler, host name, user name and password parameters to the function. The other four parameters are the database name, port number, unix socket and finally the client flag
  113. if (mysql_real_connect(con, "localhost", "crawler", "seekout", "wiby", 0, NULL, 0) == NULL)
  114. {
  115. finish_with_error(con);
  116. }
  117. if (mysql_query(con, "SET CHARSET utf8;"))
  118. {
  119. finish_with_error(con);
  120. }
  121. if(id_assigned == 0){
  122. if (mysql_query(con, "SELECT id, url, worksafe, approver, surprise, updatable, task, crawl_tree, crawl_family, crawl_depth, crawl_pages, crawl_type, crawl_repeat, force_rules FROM indexqueue limit 1;"))
  123. {
  124. finish_with_error(con);
  125. }
  126. }else{
  127. char indexqueuequery[2001];
  128. memset(indexqueuequery,0,2001);
  129. strcpy(indexqueuequery,"SELECT id, url, worksafe, approver, surprise, updatable, task, crawl_tree, crawl_family, crawl_depth, crawl_pages, crawl_type, crawl_repeat, force_rules FROM indexqueue WHERE crawler_id = '");
  130. strcat(indexqueuequery,argv[1]);
  131. strcat(indexqueuequery,"' LIMIT 1;");
  132. if (mysql_query(con, indexqueuequery))
  133. {
  134. finish_with_error(con);
  135. }
  136. }
  137. //We get the result set using the mysql_store_result() function. The MYSQL_RES is a structure for holding a result set
  138. MYSQL_RES *result = mysql_store_result(con);
  139. if(result == NULL)
  140. {
  141. finish_with_error(con);
  142. }
  143. //get the number of fields (columns) in the table
  144. //int num_fields = mysql_num_fields(result);
  145. //We fetch the rows and print them to the screen.
  146. /*MYSQL_ROW row;
  147. while (row = mysql_fetch_row(result))
  148. {
  149. for(int i=0; i<num_fields; i++)
  150. {
  151. printf("%s ", row[i] ? row[i] : "NULL");
  152. }
  153. printf("\n");
  154. }*/
  155. MYSQL_ROW row = mysql_fetch_row(result);
  156. int empty=0;
  157. if(row == NULL){
  158. //printf("\nQueue is empty\n");
  159. empty=1;
  160. }else{
  161. //convert shardnum to string
  162. if(nShards > 0){
  163. sprintf(shardnumstr,"%d",shardnum);
  164. //itoa(shardnum,shardnumstr,10);
  165. }
  166. if(id_assigned == 0){
  167. printf("-----------------------------------------------------------------------------------\nFetching:");
  168. }else{
  169. printf("-----------------------------------------------------------------------------------\ncr%s Fetching:",argv[1]);
  170. }
  171. //grab the first entry (fifo)
  172. /*for(int i=0; i<num_fields; i++)
  173. {
  174. printf("%s ", row[i] ? row[i] : "NULL");
  175. }*/
  176. //Store data in first row into variables
  177. char *url = row[1];
  178. char *id = row[0];
  179. char *worksafe = row[2];
  180. char *approver = row[3];
  181. char *surprise = row[4];
  182. char *updatable = row[5];
  183. char *task = row[6];
  184. char *crawl_tree = row[7];
  185. char *crawl_family = row[8];
  186. char *crawl_depth = row[9];
  187. char *crawl_pages = row[10];
  188. char *crawl_type = row[11];
  189. char *crawl_repeat = row[12];
  190. char *force_rules = row[13];
  191. //convert crawl depth, pages to int
  192. int n_crawl_depth=0, n_crawl_pages=0;
  193. if(crawl_depth!=0){
  194. n_crawl_depth = atoi(crawl_depth);
  195. }
  196. if(crawl_pages!=0){
  197. n_crawl_pages = atoi(crawl_pages);
  198. }
  199. printf("\nURL: %s\nID: %s | Worksafe: %s | Surprise: %s | Approver: %s | Updatable: %s | Task: %s\n", url, id, worksafe, surprise, approver, updatable, task);
  200. printf("Tree: %s | Family: %s | Depth: %s | Pages: %s | Type: %s | Repeat: %s | Rules: %s\n",crawl_tree,crawl_family,crawl_depth,crawl_pages,crawl_type,crawl_repeat,force_rules);
  201. //===================check if url already indexed, ====================
  202. //find out if its http or https or http://www. or https://www.
  203. int httpwww=0, httpswww=0, http=0, https=0;
  204. char prefix[14];
  205. memset(prefix,0,14);
  206. strcpy(prefix,"http");
  207. int urlsize = strlen(url);
  208. if(urlsize > 4){
  209. if(url[4]==':' && (url[3]=='p' || url[3]=='P'))
  210. http = 7;
  211. }
  212. if(urlsize > 5){
  213. if(url[5]==':' && (url[4]=='s' || url[4]=='S'))
  214. https = 8;
  215. }
  216. if(urlsize > 11){
  217. if((url[7]=='w' || url[7]=='W') && (url[8]=='w' || url[8]=='W') && ((url[9]=='w' || url[9]=='W') || url[9]=='1' || url[9]=='2' || url[9]=='3') && url[10]=='.' ){
  218. httpwww = 11;
  219. http = https = 0;
  220. }
  221. if(url[7]=='/' && (url[8]=='w' || url[8]=='W') && (url[9]=='w' || url[9]=='W') && ((url[10]=='w' || url[10]=='W') || url[10]=='1' || url[10]=='2' || url[10]=='3') && url[11]=='.' ){
  222. httpswww = 12;
  223. http = https = 0;
  224. }
  225. }
  226. //set the prefix
  227. if(http > 0) strcat(prefix,"://");
  228. else if(https > 0) strcat(prefix,"s://");
  229. else if(httpwww > 0) strcat(prefix,"://www.");
  230. else if(httpswww > 0) strcat(prefix,"s://www.");
  231. int prefixsize = httpswww+httpwww+https+http;
  232. char urlnoprefix[urlsize-prefixsize+2];//+2 so the memset below cannot write past the end
  233. char urlnopathnoprefix[urlsize-prefixsize+2];
  234. memset(urlnoprefix,0,urlsize-prefixsize+2);
  235. memset(urlnopathnoprefix,0,urlsize-prefixsize+2);
  236. int urlcount=0,urlnoprefixcount=0,urlnopathnoprefix_done=0;
  237. //store the url without prefix to urlnoprefix
  238. while(urlcount < urlsize+1)
  239. {
  240. if(urlcount>prefixsize-1)
  241. {
  242. urlnoprefix[urlnoprefixcount]=url[urlcount];
  243. //get urlnopath
  244. if(url[urlcount] != '/' && urlnopathnoprefix_done==0){
  245. urlnopathnoprefix[urlnoprefixcount]=url[urlcount];
  246. }else{
  247. urlnopathnoprefix_done=1;
  248. }
  249. urlnoprefixcount++;
  250. }
  251. urlcount++;
  252. }
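//Illustrative example (hypothetical URL): for "https://www.example.com/a/b.html" the prefix
//size is 12, urlnoprefix becomes "example.com/a/b.html" and urlnopathnoprefix becomes "example.com".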
  253. //check for '/' at end of url. it may be already indexed without that so we need to account for it.
  254. //int urlnoprefixlength = strlen(urlnoprefix);
  255. int slashfound = 0;
  256. char urlnoprefixnoslash[urlnoprefixcount];
  257. memset(urlnoprefixnoslash,0,urlnoprefixcount);
  258. if(urlnoprefix[urlnoprefixcount-1] == '/')
  259. {
  260. strncpy(urlnoprefixnoslash,urlnoprefix,urlnoprefixcount-1);
  261. slashfound = 1;
  262. }
  263. //printf("\nurlnoprefix: %s\n",urlnoprefix);
  264. printf("Checking if page already exists in index... ");
  265. int idexistsalready = 0, checkurlsize = urlnoprefixcount*24+1000;
  266. char *idexistsvalue;
  267. char checkurl[checkurlsize];
  268. memset(checkurl,0,checkurlsize);
  269. if(task == 0 || task[0] == '2'){//index request did not come from refresh scheduler, or is an autocrawl url
  270. //strcpy(checkurl,"SELECT id,updatable,title,enable,fault,url FROM windex WHERE url = 'http://"); //replace this with a simple check for url_noprefix column match
  271. strcpy(checkurl,"SELECT id,updatable,title,enable,fault,url,url_noprefix,shard FROM windex WHERE url_noprefix = '");
  272. if(slashfound==0)
  273. {
  274. strcat(checkurl,urlnoprefix);
  275. strcat(checkurl,"' OR url_noprefix = '");
  276. strcat(checkurl,urlnoprefix);strcat(checkurl,"/");
  277. strcat(checkurl,"' OR url_noprefix = '");
  278. strcat(checkurl,urlnoprefix);strcat(checkurl,"/index.html");
  279. strcat(checkurl,"' OR url_noprefix = '");strcat(checkurl,urlnoprefix);strcat(checkurl,"/index.htm");
  280. strcat(checkurl,"';");
  281. }
  282. else
  283. {
  284. strcat(checkurl,urlnoprefix);
  285. strcat(checkurl,"' OR url_noprefix = '");
  286. strcat(checkurl,urlnoprefixnoslash);
  287. strcat(checkurl,"' OR url_noprefix = '");
  288. strcat(checkurl,urlnoprefix);strcat(checkurl,"index.html");
  289. strcat(checkurl,"' OR url_noprefix = '");
  290. strcat(checkurl,urlnoprefix);strcat(checkurl,"index.htm");
  291. strcat(checkurl,"';");
  292. }
  293. }else{
  294. strcpy(checkurl,"SELECT id,updatable,title,enable,fault,url,url_noprefix,shard FROM windex WHERE url = '");
  295. strcat(checkurl,url);
  296. strcat(checkurl,"';");
  297. }
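//Illustrative shape of the query built above (hypothetical URL, no trailing slash):
//SELECT id,updatable,title,enable,fault,url,url_noprefix,shard FROM windex WHERE
//url_noprefix = 'example.com/a' OR url_noprefix = 'example.com/a/' OR
//url_noprefix = 'example.com/a/index.html' OR url_noprefix = 'example.com/a/index.htm';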
  298. if (mysql_query(con, checkurl))
  299. {
  300. finish_with_error(con);
  301. }
  302. //We get the result set using the mysql_store_result() function. The MYSQL_RES is a structure for holding a result set
  303. MYSQL_RES *resulturlcheck = mysql_store_result(con);
  304. if(resulturlcheck == NULL)
  305. {
  306. finish_with_error(con);
  307. }
  308. //grab the first entry (fifo)
  309. printf("Found ID ");
  310. row = mysql_fetch_row(resulturlcheck);
  311. char updatedefault[] = "1";
  312. char *updatableOldDBval = updatedefault;
  313. char *enableOldDBval = updatedefault;
  314. char *dbtitle;
  315. char *fault;
  316. char *dburl;
  317. char *dburl_noprefix;
  318. char *shard;
  319. //Catalog the previous crawl attempts (to see if they are all for the same page - which would be a bad sign)
  320. previousID[4] = previousID[3];
  321. previousID[3] = previousID[2];
  322. previousID[2] = previousID[1];
  323. previousID[1] = previousID[0];
  324. if(row == NULL)
  325. {
  326. printf("null");
  327. previousID[0] = -1;
  328. }else {
  329. printf("%s",row[0]);
  330. idexistsalready = 1;
  331. idexistsvalue = row[0];
  332. previousID[0] = atoi(row[0]);
  333. updatableOldDBval = row[1];
  334. dbtitle = row[2];
  335. enableOldDBval = row[3];
  336. fault = row[4];
  337. dburl=row[5];
  338. dburl_noprefix=row[6];
  339. shard=row[7];
  340. if(task != 0 && task[0]=='2')
  341. alreadydone=1;
  342. }
  343. //Log duplicate rows (they shouldn't exist)
  344. int num_rows = mysql_num_rows(resulturlcheck);
  345. if(num_rows > 1){
  346. FILE *duplicates = fopen("duplicates.txt", "a");
  347. fputs (dburl,duplicates);
  348. fputs ("\r\n",duplicates);
  349. fclose(duplicates);
  350. }
  351. //check robots.txt file for this domain
  352. urlparse(url);
  353. //if(task != 0 && task[0]=='2'){ //enable this statement if you only want to check robots.txt when crawling through hyperlinks, but not on human submissions
  354. permitted = checkrobots(prefix,rootdomain,urlPath); //comment this line out if you want to completely disable checking robots.txt
  355. //}
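//checkrobots() (from checkrobots.h) is treated by the code below as returning 1 when
//robots.txt permits fetching this prefix/domain/path and 0 when it does not.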
  356. //Does this crawl attempt, along with the last 4 have the same ID? There is possibly a duplicate db entry, or some other problem.
  357. if(previousID[0] != -1 && alreadydone==0){
  358. if(previousID[0] == previousID[4] && previousID[0] == previousID[3] && previousID[0] == previousID[2] && previousID[0] == previousID[1]){
  359. sanity = 0;
  360. printf("\nWARNING: Last 5 crawl attempts are all for the same page. Will not continue crawling in this situation. Is the same page being submitted over and over? Also, duplicate table entries of the same URL in windex can cause this behavior. Check the database, and duplicates.txt");
  361. exit(0);
  362. }else{
  363. sanity = 1;
  364. }
  365. }else{
  366. sanity = 1;
  367. }
  368. int failedcrawl=0;
  369. if(task != 0 && task[0]=='2' && alreadydone==0 && permitted==1){
  370. //see if url failed to crawl last time (when link crawling)
  371. //as it might come up multiple times during crawl of website, should avoid recrawling it
  372. //will also check the database if this check passes
  373. for(int i=0;i<5;i++){
  374. if(strcasecmp(previousfail[i], urlnoprefix)==0){
  375. sanity=0;
  376. failedcrawl=1;
  377. break;
  378. }
  379. }
  380. if(sanity==1 && sleeptime > 0)
  381. sleep(sleeptime);//do link crawling slowly, 1 second is default unless specified
  382. }
  383. //if crawling through hyperlinks, doublecheck that this hyperlink hasn't been crawled recently, even if it was redirected elsewhere or failed
  384. int alreadylogged = 0;
  385. if(failedcrawl==0 && task !=0 && task[0]=='2' && alreadydone == 0){
  386. if (mysql_query(con, "use wibytemp"))
  387. {
  388. finish_with_error(con);
  389. }
  390. memset(checkurl,0,checkurlsize);
  391. strcpy(checkurl,"SELECT id FROM crawled WHERE url_noprefix = '");
  392. if(slashfound==0)
  393. {
  394. strcat(checkurl,urlnoprefix);
  395. strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR OR url_noprefix = '");
  396. strcat(checkurl,urlnoprefix);strcat(checkurl,"/");
  397. strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR OR url_noprefix = '");
  398. strcat(checkurl,urlnoprefix);strcat(checkurl,"/index.html");
  399. strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR OR url_noprefix = '");strcat(checkurl,urlnoprefix);strcat(checkurl,"/index.htm");
  400. strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR");
  401. }else{
  402. strcat(checkurl,urlnoprefix);
  403. strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR OR url_noprefix = '");
  404. strcat(checkurl,urlnoprefixnoslash);
  405. strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR OR url_noprefix = '");
  406. strcat(checkurl,urlnoprefix);strcat(checkurl,"index.html");
  407. strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR OR url_noprefix = '");
  408. strcat(checkurl,urlnoprefix);strcat(checkurl,"index.htm");
  409. strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR");
  410. }
  411. //query db
  412. if (mysql_query(con, checkurl))
  413. {
  414. finish_with_error(con);
  415. }
  416. MYSQL_RES *resultcrawledurlcheck = mysql_store_result(con);
  417. if(resultcrawledurlcheck == NULL)
  418. {
  419. finish_with_error(con);
  420. }
  421. //grab the first entry (fifo)
  422. MYSQL_ROW rowCrawledURLCheck = mysql_fetch_row(resultcrawledurlcheck);
  423. if(rowCrawledURLCheck != NULL)
  424. {
  425. sanity=0;
  426. alreadylogged = 1;
  427. printf("\nThis hyperlink was crawled recently. It cannot be crawled again for at least 12 hours.");
  428. }
  429. mysql_free_result(resultcrawledurlcheck);
  430. if (mysql_query(con, "use wiby"))
  431. {
  432. finish_with_error(con);
  433. }
  434. }
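//wibytemp.crawled acts as a short-term log of hyperlink-crawled URLs: anything recorded
//there within the last 12 hours is skipped, and each URL fetched during link crawling is
//logged into it right after the download further down.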
  435. //printf("\n\n%ld, %ld, %ld, %ld, %ld\n",previousID[0],previousID[1],previousID[2],previousID[3],previousID[4]);
  436. //see if the server will accept http only connections on older browsers, change url to HTTP only:
  437. char urlHTTP[strlen(url)+100];
  438. memset(urlHTTP,0,strlen(url)+100);
  439. strcpy(urlHTTP,"http");
  440. if(http > 0 || https > 0){
  441. strcat(urlHTTP,"://");
  442. }else if(httpwww > 0 || httpswww > 0){
  443. strcat(urlHTTP,"://www.");
  444. }
  445. strcat(urlHTTP,urlnoprefix);
  446. if(updatableOldDBval[0] != '0' && enableOldDBval[0] != '0' && sanity == 1 && alreadydone==0 && permitted==1)
  447. {
  448. printf("\nAttempt HTTP connection: %s",urlHTTP);
  449. printf("\nDownloading page... ");
  450. //===============do the curl (download the webpage)=====================
  451. curl_global_init(CURL_GLOBAL_DEFAULT);
  452. CURL *curl;
  453. FILE *fp;
  454. CURLcode res;
  455. char outfilename[FILENAME_MAX] = "page.out";
  456. curl = curl_easy_init();
  457. long size=0;
  458. char *finalURL = NULL;
  459. long response_code = 0;
  460. int finalURLsize = 0,skipurl=0;
  461. if (curl) {
  462. fp = fopen(outfilename,"wb");
  463. //Get file size
  464. //fseek(fp, 0L, SEEK_END);
  465. //size = ftell(fp);
  466. //set curl options
  467. curl_easy_setopt(curl, CURLOPT_URL, urlHTTP);// set URL to get here
  468. curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; WebCrawler; SearchEngine)");
  469. curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);// send all data to this function //
  470. curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);// write the page body to this file handle
  471. curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);//allow redirects
  472. curl_easy_setopt(curl, CURLOPT_TIMEOUT, 60L);
  473. curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 20L);
  474. curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);//max num of redirects
  475. curl_easy_setopt(curl, CURLOPT_MAXFILESIZE, 5000000L);//don't download if over 5MB
  476. curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);//0 or 1 to verify ssl
  477. //curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);//set verbose
  478. res = curl_easy_perform(curl);// get it!
  479. //if(res == CURLE_OK) {//get final redirect url //-- don't check for this, causes segfault if "transfer closed with outstanding read data remaining"
  480. curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_URL, &finalURL);
  481. curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code);
  482. if(finalURL){
  483. printf("Effective URL: %s\nResponse: %ld, ", finalURL,response_code);
  484. finalURLsize = strlen(finalURL);
  485. }
  486. //curl_easy_cleanup(curl); //cleanup moved further down because finalURL is needed at insert
  487. //get file size
  488. fseek(fp, 0L, SEEK_END);
  489. size = ftell(fp);
  490. fclose(fp);
  491. }
  492. //if effective URL contains ':443', CURL will fail to download this page on next update. Remove :443 from finalURL.
  493. char *ptr_substring = NULL;
  494. int substringpos=0;
  495. if(finalURLsize > 3)
  496. ptr_substring = strstr(finalURL,":443");
  497. if(ptr_substring != NULL && skipurl == 0 && finalURLsize<=500){
  498. substringpos = ptr_substring - finalURL;
  499. int poscount = substringpos;
  500. memcpy(correctedURL,finalURL,substringpos);//copy before substring
  501. while(1){//copy after substring
  502. correctedURL[poscount] = finalURL[poscount+4];
  503. if(finalURL[poscount+4] == 0)
  504. break;
  505. poscount++;
  506. }
  507. finalURL = correctedURL;
  508. finalURLsize = strlen(finalURL);
  509. printf("\nSetting final URL as: %s\n", finalURL);
  510. }
  511. int finalURLcount=0;
  512. while(finalURL != NULL && finalURL[finalURLcount]!=0){//finalURL stays NULL if curl_easy_init() failed
  513. if(finalURL[finalURLcount]=='\''){
  514. skipurl=1;
  515. printf("\nURL contains single-quote. Skipping.");
  516. }
  517. finalURLcount++;
  518. }
  519. //when crawling through hyperlinks, log that the url was accessed, use the original url, not finalURL
  520. if(skipurl==0 && task != 0 && task[0]=='2'){
  521. if (mysql_query(con, "use wibytemp"))
  522. {
  523. finish_with_error(con);
  524. }
  525. char sqlquerylogurl[2000];
  526. memset(sqlquerylogurl,0,2000);
  527. strcpy(sqlquerylogurl,"INSERT INTO crawled (url_noprefix) VALUES('");
  528. strcat(sqlquerylogurl,urlnoprefix);
  529. strcat(sqlquerylogurl,"')");
  530. if (mysql_query(con, sqlquerylogurl))
  531. {
  532. finish_with_error(con);
  533. }
  534. if (mysql_query(con, "use wiby"))
  535. {
  536. finish_with_error(con);
  537. }
  538. }
  539. if(finalURLsize>500){
  540. skipurl=1;
  541. printf("\nURL is too long");
  542. }
  543. if(task != 0 && task[0]=='2' && canCrawl(finalURLsize,finalURL)==0){
  544. printf("\nEffective URL failed crawl rules.");
  545. skipurl=1;
  546. }
  547. char finalURLnoprefix[finalURLsize-prefixsize+100];
  548. char httpAllow[] = "0";
  549. memset(finalURLnoprefix,0,finalURLsize-prefixsize+100);
  550. int updatereserve=0;
  551. char idReserve[200];
  552. if(skipurl==0){
  553. //see if server permitted an http connection
  554. if(finalURL != NULL){
  555. if(finalURL[4]==':')
  556. httpAllow[0] = '1';
  557. }
  558. else if(http > 0 || httpwww > 0){
  559. httpAllow[0] = '1';
  560. }
  561. //Remove the prefix from the final URL, to store into url_noprefix
  562. //find out if its http or https or http://www. or https://www.
  563. httpwww=httpswww=http=https=0;
  564. if(finalURLsize > 4){
  565. if(finalURL[4]==':')
  566. http = 7;
  567. if(finalURL[4]=='s' || finalURL[4]=='S')
  568. https = 8;
  569. }
  570. if(finalURLsize > 11){
  571. if((finalURL[7]=='w' || finalURL[7]=='W') && (finalURL[8]=='w' || finalURL[8]=='W') && (finalURL[9]=='w' || finalURL[9]=='W') && finalURL[10]=='.' ){
  572. httpwww = 11;
  573. http = https = 0;
  574. }
  575. if(finalURL[7]=='/' && (finalURL[8]=='w' || finalURL[8]=='W') && (finalURL[9]=='w' || finalURL[9]=='W') && (finalURL[10]=='w' || finalURL[10]=='W') && finalURL[11]=='.' ){
  576. httpswww = 12;
  577. http = https = 0;
  578. }
  579. }
  580. int finalURL_prefixsize = httpswww+httpwww+https+http, finalurlnoprefixcount = 0;
  581. urlcount=0;
  582. //store the final url without prefix to finalURLnoprefix
  583. while(finalURL[urlcount] != 0){
  584. if(urlcount>finalURL_prefixsize-1)
  585. {
  586. finalURLnoprefix[finalurlnoprefixcount]=finalURL[urlcount];
  587. finalurlnoprefixcount++;
  588. }
  589. urlcount++;
  590. }
  591. //Double check that the URL is in fact not in the DB, by also searching for the effective URL from libcurl and its url in the table
  592. int foundindoublecheck=0;
  593. if(idexistsalready == 0){
  594. mysql_free_result(resulturlcheck);
  595. char doublecheckurl[finalURLsize+100];
  596. memset(doublecheckurl,0,finalURLsize+100);
  597. strcpy(doublecheckurl,"SELECT id,updatable,title,enable,fault,url,url_noprefix,shard FROM windex WHERE url = '");
  598. strcat(doublecheckurl,finalURL);
  599. strcat(doublecheckurl,"';");
  600. if (mysql_query(con, doublecheckurl))
  601. {
  602. finish_with_error(con);
  603. }
  604. resulturlcheck = mysql_store_result(con);
  605. if(resulturlcheck == NULL)
  606. {
  607. finish_with_error(con);
  608. }
  609. row = mysql_fetch_row(resulturlcheck);
  610. if(row != NULL)
  611. {
  612. printf("\nDoublechecked effective URL in windex, found ID %s\n",row[0]);
  613. idexistsalready = 1;
  614. idexistsvalue = row[0];
  615. previousID[0] = atoi(row[0]);
  616. updatableOldDBval = row[1];
  617. dbtitle = row[2];
  618. enableOldDBval = row[3];
  619. fault = row[4];
  620. dburl=row[5];
  621. dburl_noprefix=row[6];
  622. shard=row[7];
  623. if((task != 0 && task[0]=='2') || updatableOldDBval[0] == '0')
  624. alreadydone=1;
  625. foundindoublecheck=1;
  626. }
  627. //Log duplicate rows (they shouldn't exist)
  628. num_rows = mysql_num_rows(resulturlcheck);
  629. if(num_rows > 1){
  630. FILE *duplicates = fopen("duplicates.txt", "a");
  631. fputs (dburl,duplicates);
  632. fputs ("\r\n",duplicates);
  633. fclose(duplicates);
  634. }
  635. //Does this crawl attempt, along with the last 4 have the same ID? There is possibly a duplicate db entry, or some other problem.
  636. if(previousID[0] != -1){
  637. if(previousID[0] == previousID[4] && previousID[0] == previousID[3] && previousID[0] == previousID[2] && previousID[0] == previousID[1]){
  638. printf("\nWARNING: Last 5 crawl attempts are all for the same page. Will not continue crawling in this situation. Is the same page being submitted over and over? Also, duplicate table entries of the same URL in windex can cause this behavior. Check the database, and duplicates.txt");
  639. exit(0);
  640. }
  641. }
  642. }
  643. //if doing an update when using multiple crawlers, reserve the id and verify the URL is still associated with it
  644. if(alreadydone==0 && id_assigned==1 && idexistsalready==1){
  645. if (mysql_query(con, "use wibytemp;"))
  646. {
  647. finish_with_error(con);
  648. }
  649. memset(idReserve,0,200);
  650. strcpy(idReserve,"INSERT into reserve_id (id, crawler_id) VALUES (");
  651. strcat(idReserve,idexistsvalue);
  652. strcat(idReserve,",");
  653. strcat(idReserve,argv[1]);
  654. strcat(idReserve,");");
  655. if(mysql_query(con, idReserve))
  656. {
  657. printf("\nID is already reserved, will try again. Clearing old reservations...");
  658. memset(idReserve,0,200);
  659. strcpy(idReserve,"DELETE FROM reserve_id WHERE time < NOW() - INTERVAL 10 MINUTE OR crawler_id = ");
  660. strcat(idReserve,argv[1]);
  661. if(mysql_query(con, idReserve)){
  662. finish_with_error(con);
  663. }else{
  664. printf(" Done.");
  665. }
  666. alreadydone=1;
  667. }
  668. //back to wiby database
  669. if (mysql_query(con, "use wiby;"))
  670. {
  671. finish_with_error(con);
  672. }
  673. updatereserve=1;
  674. //check that the url being updated is still assigned to that ID
  675. memset(checkurl,0,checkurlsize);
  676. if(task != 0 && task[0] == '1'){
  677. strcpy(checkurl,"SELECT id FROM windex WHERE url = '");
  678. strcat(checkurl,url);
  679. strcat(checkurl,"';");
  680. }else{
  681. if(foundindoublecheck==0){
  682. strcpy(checkurl,"SELECT id FROM windex WHERE url_noprefix = '");
  683. if(slashfound==0)
  684. {
  685. strcat(checkurl,urlnoprefix);
  686. strcat(checkurl,"' OR url_noprefix = '");
  687. strcat(checkurl,urlnoprefix);strcat(checkurl,"/");
  688. strcat(checkurl,"' OR url_noprefix = '");
  689. strcat(checkurl,urlnoprefix);strcat(checkurl,"/index.html");
  690. strcat(checkurl,"' OR url_noprefix = '");strcat(checkurl,urlnoprefix);strcat(checkurl,"/index.htm");
  691. strcat(checkurl,"';");
  692. }else{
  693. strcat(checkurl,urlnoprefix);
  694. strcat(checkurl,"' OR url_noprefix = '");
  695. strcat(checkurl,urlnoprefixnoslash);
  696. strcat(checkurl,"' OR url_noprefix = '");
  697. strcat(checkurl,urlnoprefix);strcat(checkurl,"index.html");
  698. strcat(checkurl,"' OR url_noprefix = '");
  699. strcat(checkurl,urlnoprefix);strcat(checkurl,"index.htm");
  700. strcat(checkurl,"';");
  701. }
  702. }else{
  703. strcpy(checkurl,"SELECT id FROM windex WHERE url = '");
  704. strcat(checkurl,finalURL);
  705. strcat(checkurl,"';");
  706. }
  707. }
  708. //query db
  709. if (mysql_query(con, checkurl))
  710. {
  711. finish_with_error(con);
  712. }
  713. MYSQL_RES *resulturlreservecheck = mysql_store_result(con);
  714. if(resulturlreservecheck == NULL)
  715. {
  716. finish_with_error(con);
  717. }
  718. //grab the first entry (fifo)
  719. char *URLcheckID = 0;//initialize so the check below is safe when no row was returned
  720. MYSQL_ROW rowURLCheck = mysql_fetch_row(resulturlreservecheck);
  721. if(rowURLCheck != NULL)
  722. {
  723. URLcheckID = rowURLCheck[0];
  724. }
  725. if(URLcheckID != 0 && atoi(URLcheckID) != atoi(idexistsvalue)){
  726. printf("\nID was already reserved, will try again later.");
  727. alreadydone=1;
  728. }
  729. mysql_free_result(resulturlreservecheck);
  730. }
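//Reservation protocol for multiple crawlers with assigned IDs: the row id is inserted into
//wibytemp.reserve_id; if the insert fails another crawler holds the reservation, so this
//update is deferred and reservations older than 10 minutes (or left by this crawler) are purged.
//The URL is then re-checked to confirm it is still associated with the reserved id.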
  731. }
  732. //=====================Extract text from HTML file=======================
  733. if(size < 5000000 && skipurl==0 && alreadydone==0)
  734. {
  735. //switch on/off hyperlink collecting (if crawling through hyperlinks, or from regular refresh while crawl_repeat is on, or during manual submission when appropriate limits are set)
  736. if((task != 0 && task[0]=='2' && (n_crawl_depth > 0 || n_crawl_depth < 0) && (n_crawl_pages > 0 || n_crawl_pages < 0)) || (task==0 && (n_crawl_depth > 0 || n_crawl_depth < 0) && (n_crawl_pages > 0 || n_crawl_pages < 0)) || (task != 0 && task[0]=='1' && crawl_repeat != 0 && crawl_repeat[0]=='1' && (n_crawl_pages > 0 || n_crawl_pages < 0))){
  737. getURLs=1;
  738. }else{
  739. getURLs=0;
  740. }
  741. htmlparse();
  742. //need the finalURL path info also
  743. urlparse(finalURL);
  744. memset(urlPath_finalURL,0,1001);
  745. strcpy(urlPath_finalURL,urlPath);
  746. memset(folderPath_finalURL,0,1001);
  747. strcpy(folderPath_finalURL,folderPath);
  748. memset(urlPrefix_finalURL,0,1001);
  749. strcpy(urlPrefix_finalURL,prefix_fromlist);
  750. memset(urlNPNP_finalURL,0,1001);
  751. strcpy(urlNPNP_finalURL,urlnopathnoprefix_fromlist);
  752. if(urlPrefix_finalURL[0]==0 || urlNPNP_finalURL[0]==0 || urlPath_finalURL[0]==0)
  753. noindex = 1;
  754. }else{
  755. noindex = 1;
  756. }
  757. //check if rules are enforced (only for pages that are autocrawled)
  758. if(force_rules != 0 && force_rules[0]=='1' && task != 0 && task[0]=='2' && noindex == 0 && response_code == 200){
  759. if(num_scripts > 2 || num_stylesheets > 2){
  760. noindex = 1;
  761. printf("\nFailed rule check");
  762. }
  763. }
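//Forced rule check (auto-crawled pages only): the page is not indexed if it contains more
//than two scripts or more than two stylesheets, using counts presumably gathered by htmlparse().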
  764. int skip = 0, titlechanged = 0, escape = 0, escapetotal = 0, redirected = 0;
  765. //Check if noindex and size
  766. //if(((noindex == 0 /*&& bodysize < 1900000*/ && bodysize > 10) || (noindex == 0 /*&& bodysize < 1900000*/ && descriptionsize > 10)) && response_code == 200 && alreadydone==0)
  767. if((emptytitle == 0 || descriptionsize > 0 || bodysize > 0) && response_code == 200 && alreadydone==0 && noindex == 0)
  768. {
  769. //=================Allocate memory for the parsed text from htmlparse()
  770. //title = (char*)calloc(titlesize+1,sizeof(char));
  771. //keywords = (char*)calloc(keywordssize+1,sizeof(char));
  772. //description = (char*)calloc(descriptionsize+1,sizeof(char));
  773. //page = (char*)calloc(bodysize+1,sizeof(char));
  774. windexinsert = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+3001,sizeof(char));
  775. //shardinsert = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+3001,sizeof(char));
  776. windexupdate = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+3001,sizeof(char));
  777. windexRandUpdate = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+3001,sizeof(char));
  778. titlecheckinsert = (char*)calloc(finalURLsize+titlesize+1001,sizeof(char));
  779. /*if(title == NULL || keywords == NULL || description == NULL || page == NULL || windexinsert == NULL || windexupdate == NULL)
  780. {
  781. printf("\nError allocating memory for webpage");
  782. //cleanup sql stuff
  783. mysql_free_result(resulturlcheck);
  784. mysql_free_result(result);
  785. mysql_close(con);
  786. exit(0);
  787. }*/
  788. //Check if this is a new page: check if the title found in windex is the same as the parsed title. If not, put the page back into review.
  789. int dbtitlesize = 0,titlecheckTitleSize = 0, dbNoTitle=0,extrapos=0;
  790. if(idexistsalready==1)
  791. {
  792. //going to insert the crawled title into a "titlecheck" table with the url for reference, then we're going to read back the
  793. //title and count the number of bytes vs what was read from dbtitlesize to determine if title changed
  794. //this is because bytes read from db must be the same charset as what is crawled to get a proper count
  795. //unsupported charsets can end up truncating data, giving incorrect title check, this method avoids that issue
  796. if (mysql_query(con, "use wibytemp;"))
  797. {
  798. finish_with_error(con);
  799. }
  800. //set charset based on crawled page charset tag
  801. if (mysql_query(con, mysqlcharset))
  802. {
  803. finish_with_error(con);
  804. }
  805. //insert title into wibytemp for comparison
  806. strcpy(titlecheckinsert,"INSERT INTO titlecheck (url,title) VALUES ('");
  807. strcat(titlecheckinsert,finalURL);
  808. strcat(titlecheckinsert,"','");
  809. strcat(titlecheckinsert,title);
  810. strcat(titlecheckinsert,"');");
  811. if (mysql_query(con, titlecheckinsert))
  812. {
  813. finish_with_error(con);
  814. }
  815. if (mysql_query(con, "SET CHARSET utf8;"))
  816. {
  817. finish_with_error(con);
  818. }
  819. //now read back the title from the database
  820. char checktitle[finalURLsize+dbtitlesize+1000];
  821. memset(checktitle,0,finalURLsize+dbtitlesize+1000);
  822. strcpy(checktitle,"SELECT title FROM titlecheck WHERE url = '");
  823. strcat(checktitle,finalURL);strcat(checktitle,"' ORDER BY id DESC;");
  824. //query db
  825. if (mysql_query(con, checktitle))
  826. {
  827. finish_with_error(con);
  828. }
  829. MYSQL_RES *resulttitlecheck = mysql_store_result(con);
  830. if(resulttitlecheck == NULL)
  831. {
  832. finish_with_error(con);
  833. }
  834. //grab the first entry (fifo)
  835. MYSQL_ROW rowTitleCheck = mysql_fetch_row(resulttitlecheck);
  836. char *titlecheckTitle;
  837. int titlecheckTitleSize = 0;
  838. titlecheckTitle = rowTitleCheck[0];
  839. //printf("\n %s",rowTitleCheck[0]);
  840. //delete the entry from the table
  841. char titlecheckremove[finalURLsize+1000];
  842. memset(titlecheckremove,0,finalURLsize+1000);
  843. strcpy(titlecheckremove,"DELETE FROM titlecheck WHERE url ='");
  844. strcat(titlecheckremove,finalURL);strcat(titlecheckremove,"';");
  845. if (mysql_query(con, titlecheckremove))
  846. {
  847. finish_with_error(con);
  848. }
  849. //back to wiby database
  850. if (mysql_query(con, "use wiby;"))
  851. {
  852. finish_with_error(con);
  853. }
  854. //check if original dburl is now getting redirected from finalurl (should be sent to review)
  855. int finalURLnoprefix_size = strlen(finalURLnoprefix), dburl_noprefix_size = strlen(dburl_noprefix);
  856. if(finalURLnoprefix_size != dburl_noprefix_size){
  857. redirected = 1;
  858. printf("\nIndexed page is being redirected.");
  859. }else{
  860. for(int i=0;i<finalURLnoprefix_size;i++){
  861. if(dburl_noprefix[i] != finalURLnoprefix[i]){
  862. redirected = 1;
  863. printf("\nIndexed page is being redirected.");
  864. break;
  865. }
  866. }
  867. }
  868. while(titlecheckTitle[titlecheckTitleSize]!='\0')//get size of title in titlecheck
  869. {
  870. titlecheckTitleSize++;
  871. }
  872. //printf("\n%d",titlecheckTitleSize);
  873. dbtitlesize = 0,dbNoTitle=0,extrapos=0;
  874. while(dbtitle[dbtitlesize]!='\0')//get size of old title in db
  875. {
  876. dbtitlesize++;
  877. }
  878. //printf("\n%d",dbtitlesize);
  879. //check if dbtitle matches url - If no title exists, URL's smaller than 111 chars will be used as titles, otherwise, "Untitled" will be used.
  880. int URL_is_dbtitle = 1;
  881. dbNoTitle=1;
  882. if(dbtitlesize==finalURLsize){
  883. for(int i=0;i<finalURLsize;i++){
  884. if(dbtitle[i] != finalURL[i]){
  885. URL_is_dbtitle = dbNoTitle = 0;
  886. break;
  887. }
  888. }
  889. }else{
  890. URL_is_dbtitle = dbNoTitle = 0;
  891. }
  892. if(dbtitlesize == 8 && URL_is_dbtitle == 0)//check if old title in db is "Untitled"
  893. {
  894. if(dbtitle[0]=='U' && dbtitle[1]=='n' && dbtitle[2]=='t' && dbtitle[3]=='i' && dbtitle[4]=='t' && dbtitle[5]=='l' && dbtitle[6]=='e' && dbtitle[7]=='d')
  895. dbNoTitle=1;
  896. if(titlesize == 8 && emptytitle == 0){//off chance the title is actually called "Untitled".
  897. if(title[0]=='U' && title[1]=='n' && title[2]=='t' && title[3]=='i' && title[4]=='t' && title[5]=='l' && title[6]=='e' && title[7]=='d')
  898. dbNoTitle=0;
  899. }
  900. }
  901. //if((dbNoTitle == 0 && dbtitlesize != (titlesize-extrapos)) || (dbNoTitle == 1 && titlesize > 0 && emptytitle == 0)) //previous, before db wibytemp titlecheck method
  902. if((dbNoTitle == 0 && dbtitlesize != titlecheckTitleSize) || (dbNoTitle == 1 && titlesize > 0 && emptytitle == 0) || (URL_is_dbtitle == 1 && dbtitlesize != titlecheckTitleSize && titlesize > 0 && emptytitle == 0))
  903. {
  904. titlechanged = 1;
  905. }
  906. //printf("\n|%s|\n%d\n%d\n%d\n%d\n%d",dbtitle,titlesize,dbtitlesize,extrapos,dbNoTitle,titlechanged);
  907. //cleanup some sql stuff
  908. mysql_free_result(resulttitlecheck);
  909. }
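//Summary of the block above: the parsed title is round-tripped through wibytemp.titlecheck
//(in the page's own charset) so its byte length can be compared fairly against the stored
//title, and the effective URL is compared against the stored url_noprefix. A changed title
//or a redirect flags the page (titlechanged/redirected) so it is not silently updated and
//can be sent back to review.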
  910. if(titlechanged == 0 && redirected == 0)
  911. {
  912. //====================Load the parsed text into windex!==================
  913. if (mysql_query(con, mysqlcharset))//set charset based on page charset tag
  914. {
  915. finish_with_error(con);
  916. }
  917. //strcpy(windexinsert,"INSERT INTO windex (url,title,tags,description,body,worksafe,enable,date,approver,surprise,updatable) VALUES ('");
  918. strcpy(windexinsert,"INSERT INTO windex (url,url_noprefix,title,description,body,worksafe,enable,date,approver,surprise,http,updatable,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,shard) VALUES ('");
  919. strcpy(windexupdate,"UPDATE windex SET url = '");
  920. int copiedRandom = 0;
  921. int reserveFail = 0;
  922. char randomreserve[100];
  923. char *randID;
  924. char *randshard;
  925. MYSQL_RES *resultRandID;
  926. if(idexistsalready == 0){//Insert new entry
  927. //For search topics to be evenly discovered by all replicas or duplicate connections assigned to a specific search section, new rows must be scattered randomly across the database instead of sequentially:
  928. //Existing rows will be randomly selected and copied (inserted) into a new row at the bottom, and the new page will take the ID number of the old one through an update.
  929. //select id from windex where enable = 1 order by rand() limit 1;
  930. //insert into windex (url,title,tags,description,body,surprise,http,updatable,worksafe,enable,date,updated,approver,fault) select url,title,tags,description,body,surprise,http,updatable,worksafe,enable,date,updated,approver,fault from windex where id = 1338;
  931. //the corresponding shard table will also be updated with the same ID and contents, which can be offloaded to another replica.
  932. printf("\nInserting into index... ");
  933. if (mysql_query(con, "SELECT id, shard FROM windex WHERE enable = 1 ORDER BY rand() LIMIT 1;"))
  934. {
  935. finish_with_error(con);
  936. }
  937. resultRandID = mysql_store_result(con);
  938. if (resultRandID==NULL)
  939. {
  940. finish_with_error(con);
  941. }
  942. MYSQL_ROW row = mysql_fetch_row(resultRandID);
  943. if(row != NULL){
  944. randID = row[0];
  945. idexistsvalue = row[0];
  946. randshard = row[1];
  947. }
  948. //reserve the randomly selected ID when running more than one crawler
  949. if(row != NULL && id_assigned==1){
  950. if (mysql_query(con, "use wibytemp;"))
  951. {
  952. finish_with_error(con);
  953. }
  954. memset(randomreserve,0,100);
  955. strcpy(randomreserve,"INSERT into reserve_id (id) VALUES (");
  956. strcat(randomreserve,randID);
  957. strcat(randomreserve,");");
  958. if (mysql_query(con, randomreserve))
  959. {
  960. printf("\nID is already reserved. Clearing old reservations...");
  961. if(mysql_query(con, "DELETE FROM reserve_id WHERE time < NOW() - INTERVAL 10 MINUTE")){
  962. finish_with_error(con);
  963. }else{
  964. printf(" Done.");
  965. }
  966. reserveFail=1;//if error: more than one crawler attempted to reserve the same randomly selected ID
  967. }
  968. //back to wiby database
  969. if (mysql_query(con, "use wiby;"))
  970. {
  971. finish_with_error(con);
  972. }
  973. }
  974. if(row == NULL || reserveFail==1){//if no rows in db yet or fails to reserve an ID
  975. strcat(windexinsert,finalURL);strcat(windexinsert,"','");
  976. strcat(windexinsert,finalURLnoprefix);strcat(windexinsert,"','");
  977. //strcat(windexinsert,prefix);strcat(windexinsert,"','");
  978. if(titlesize > 0 && emptytitle == 0) {
  979. strcat(windexinsert,title);
  980. }
  981. else {
  982. if(finalURLsize < 111){
  983. strcat(windexinsert,finalURL);
  984. }
  985. else{
  986. strcat(windexinsert,"Untitled");
  987. }
  988. }
  989. strcat(windexinsert,"','");
  990. //if(tagsize > 0) {strcat(windexinsert,keywords);}
  991. //strcat(windexinsert,"','");
  992. if(descriptionsize > 0) {strcat(windexinsert,description);}
  993. strcat(windexinsert,"','");
  994. if(bodysize > 0) {strcat(windexinsert,body);}
  995. strcat(windexinsert,"',");
  996. strcat(windexinsert,worksafe);
  997. strcat(windexinsert,",1,now(),'");
  998. strcat(windexinsert,approver);
  999. strcat(windexinsert,"',");
  1000. strcat(windexinsert,surprise);
  1001. strcat(windexinsert,",");
  1002. strcat(windexinsert,httpAllow);
  1003. strcat(windexinsert,",");
  1004. strcat(windexinsert,updatable);
  1005. if(task != 0 && task[0]=='2'){//came from link crawling
  1006. strcat(windexinsert,",'");
  1007. strcat(windexinsert,crawl_tree);
  1008. strcat(windexinsert,"','");
  1009. strcat(windexinsert,crawl_family);
  1010. strcat(windexinsert,"',");
  1011. strcat(windexinsert,crawl_pages);
  1012. strcat(windexinsert,",");
  1013. strcat(windexinsert,crawl_type);
  1014. strcat(windexinsert,",");
  1015. strcat(windexinsert,"0");
  1016. strcat(windexinsert,",");//next value in the VALUES list is force_rules
  1017. strcat(windexinsert,force_rules);
  1018. }else{
  1019. strcat(windexinsert,",");
  1020. strcat(windexinsert,"NULL,");
  1021. strcat(windexinsert,"NULL,");
  1022. strcat(windexinsert,crawl_pages);
  1023. strcat(windexinsert,",");
  1024. strcat(windexinsert,crawl_type);
  1025. strcat(windexinsert,",");
  1026. strcat(windexinsert,crawl_repeat);
  1027. strcat(windexinsert,",");//next value in the VALUES list is force_rules
  1028. strcat(windexinsert,force_rules);
  1029. }
  1030. strcat(windexinsert,",");
  1031. strcat(windexinsert,shardnumstr);
  1032. strcat(windexinsert,")");
  1033. if (mysql_query(con, windexinsert))
  1034. {
  1035. finish_with_error(con);
  1036. }
  1037. //insert into the shard table for the new row
  1038. if(nShards>0){
  1039. memset(windexinsert,0,strlen(windexinsert));
  1040. strcpy(windexinsert,"INSERT INTO ws");
  1041. strcat(windexinsert,shardnumstr);
  1042. strcat(windexinsert," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = LAST_INSERT_ID();");
  1043. /*//get the last ID
  1044. MYSQL_RES *resultIDnum;
  1045. char *lastIDnum;
  1046. if (mysql_query(con, "SELECT LAST_INSERT_ID() FROM windex limit 1"))
  1047. {
  1048. finish_with_error(con);
  1049. }
  1050. MYSQL_ROW rowLastID = mysql_fetch_row(resultIDnum);
  1051. if(rowLastID != NULL){
  1052. lastIDnum = rowLastID[0];
  1053. }
  1054. strcpy(shardinsert,"INSERT INTO ws");
  1055. strcat(shardinsert,shardnumstr);
  1056. strcat(shardinsert," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = ");
  1057. strcat(shardinsert,lastIDnum);
  1058. if (mysql_query(con, shardinsert))
  1059. {
  1060. finish_with_error(con);
  1061. }
  1062. mysql_free_result(resultIDnum); */
  1063. if (mysql_query(con, windexinsert))
  1064. {
  1065. finish_with_error(con);
  1066. }
  1067. }
  1068. }
  1069. else{
  1070. //copy contents of randomly selected row to a new row in windex.
  1071. strcpy(windexRandUpdate,"INSERT INTO windex (url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = ");
  1072. strcat(windexRandUpdate,randID);
  1073. if (mysql_query(con, windexRandUpdate))
  1074. {
  1075. finish_with_error(con);
  1076. }
  1077. if(nShards>0){
  1078. //Also copy that new row into a new row in the round-robin assigned shard table
  1079. //update the shard id in windex
  1080. memset(windexRandUpdate,0,strlen(windexRandUpdate));
  1081. strcpy(windexRandUpdate,"UPDATE windex set shard = ");
  1082. strcat(windexRandUpdate,shardnumstr);
  1083. strcat(windexRandUpdate," WHERE id = LAST_INSERT_ID()");
  1084. if (mysql_query(con, windexRandUpdate))
  1085. {
  1086. finish_with_error(con);
  1087. }
  1088. //insert that row into the next shard
  1089. memset(windexRandUpdate,0,strlen(windexRandUpdate));
  1090. strcpy(windexRandUpdate,"INSERT INTO ws");
  1091. strcat(windexRandUpdate,shardnumstr);
  1092. strcat(windexRandUpdate," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = LAST_INSERT_ID()");
  1093. if (mysql_query(con, windexRandUpdate))
  1094. {
  1095. finish_with_error(con);
  1096. }
//Overwrite the randomly selected row with the contents of the newly crawled webpage
memset(windexRandUpdate,0,strlen(windexRandUpdate));
strcpy(windexRandUpdate,"UPDATE windex SET url = '");
strcat(windexRandUpdate,finalURL);
strcat(windexRandUpdate,"', url_noprefix = '");
strcat(windexRandUpdate,finalURLnoprefix);
strcat(windexRandUpdate,"', title = '");
if(titlesize > 0 && emptytitle == 0){
strcat(windexRandUpdate,title);
}
else{
if(finalURLsize < 111){
strcat(windexRandUpdate,finalURL);
}
else{
strcat(windexRandUpdate,"Untitled");
}
}
strcat(windexRandUpdate,"', tags = NULL, description = '");
strcat(windexRandUpdate,description);
strcat(windexRandUpdate,"', body = '");
strcat(windexRandUpdate,body);
strcat(windexRandUpdate,"', worksafe = ");
strcat(windexRandUpdate,worksafe);
strcat(windexRandUpdate,", approver = '");
strcat(windexRandUpdate,approver);
strcat(windexRandUpdate,"', surprise = ");
strcat(windexRandUpdate,surprise);
strcat(windexRandUpdate,", http = ");
strcat(windexRandUpdate,httpAllow);
strcat(windexRandUpdate,", updatable = ");
strcat(windexRandUpdate,updatable);
if(task==0){//didn't come from refresh or link crawling
strcat(windexRandUpdate,", crawl_tree = NULL");
strcat(windexRandUpdate,", crawl_family = NULL");
strcat(windexRandUpdate,", crawl_pages = ");
strcat(windexRandUpdate,crawl_pages);
strcat(windexRandUpdate,", crawl_type = ");
strcat(windexRandUpdate,crawl_type);
strcat(windexRandUpdate,", crawl_repeat = ");
strcat(windexRandUpdate,crawl_repeat);
strcat(windexRandUpdate,", force_rules = ");
strcat(windexRandUpdate,force_rules);
}else if(task != 0 && task[0]=='2'){//came from link crawling
strcat(windexRandUpdate,", crawl_tree = '");
strcat(windexRandUpdate,crawl_tree);
strcat(windexRandUpdate,"', crawl_family ='");
strcat(windexRandUpdate,crawl_family);
strcat(windexRandUpdate,"', crawl_pages = ");
strcat(windexRandUpdate,crawl_pages);
strcat(windexRandUpdate,", crawl_type = ");
strcat(windexRandUpdate,crawl_type);
strcat(windexRandUpdate,", crawl_repeat = ");
strcat(windexRandUpdate,"0");
strcat(windexRandUpdate,", force_rules = ");
strcat(windexRandUpdate,force_rules);
}
strcat(windexRandUpdate,", updated = CURRENT_TIMESTAMP, date = now(), fault = 0 WHERE id = ");
strcat(windexRandUpdate,randID);
if (mysql_query(con, windexRandUpdate))
{
finish_with_error(con);
}
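/* The page fields (title, description, body, etc.) are concatenated straight into the SQL string here,
   so they are presumably already escaped/sanitized earlier in the crawler. If that were not the case,
   something along these lines would be needed before appending each field (illustrative sketch only,
   not part of the original flow):

       char escaped_body[strlen(body)*2 + 1];                  // worst-case growth per MySQL docs
       mysql_real_escape_string(con, escaped_body, body, strlen(body));
       strcat(windexRandUpdate, escaped_body);
*/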
//Finally, update the corresponding shard table row
if(randshard != 0){
memset(windexRandUpdate,0,strlen(windexRandUpdate));
strcpy(windexRandUpdate,"UPDATE ws");
strcat(windexRandUpdate,randshard);
strcat(windexRandUpdate," SET url = '");
strcat(windexRandUpdate,finalURL);
strcat(windexRandUpdate,"', url_noprefix = '");
strcat(windexRandUpdate,finalURLnoprefix);
strcat(windexRandUpdate,"', title = '");
if(titlesize > 0 && emptytitle == 0){
strcat(windexRandUpdate,title);
}
else{
if(finalURLsize < 111){
strcat(windexRandUpdate,finalURL);
}
else{
strcat(windexRandUpdate,"Untitled");
}
}
strcat(windexRandUpdate,"', tags = NULL, description = '");
strcat(windexRandUpdate,description);
strcat(windexRandUpdate,"', body = '");
strcat(windexRandUpdate,body);
strcat(windexRandUpdate,"', worksafe = ");
strcat(windexRandUpdate,worksafe);
strcat(windexRandUpdate,", approver = '");
strcat(windexRandUpdate,approver);
strcat(windexRandUpdate,"', surprise = ");
strcat(windexRandUpdate,surprise);
strcat(windexRandUpdate,", http = ");
strcat(windexRandUpdate,httpAllow);
strcat(windexRandUpdate,", updatable = ");
strcat(windexRandUpdate,updatable);
if(task==0){//didn't come from refresh or link crawling
strcat(windexRandUpdate,", crawl_tree = NULL");
strcat(windexRandUpdate,", crawl_family = NULL");
strcat(windexRandUpdate,", crawl_pages = ");
strcat(windexRandUpdate,crawl_pages);
strcat(windexRandUpdate,", crawl_type = ");
strcat(windexRandUpdate,crawl_type);
strcat(windexRandUpdate,", crawl_repeat = ");
strcat(windexRandUpdate,crawl_repeat);
strcat(windexRandUpdate,", force_rules = ");
strcat(windexRandUpdate,force_rules);
}else if(task != 0 && task[0]=='2'){//came from link crawling
strcat(windexRandUpdate,", crawl_tree = '");
strcat(windexRandUpdate,crawl_tree);
strcat(windexRandUpdate,"', crawl_family ='");
strcat(windexRandUpdate,crawl_family);
strcat(windexRandUpdate,"', crawl_pages = ");
strcat(windexRandUpdate,crawl_pages);
strcat(windexRandUpdate,", crawl_type = ");
strcat(windexRandUpdate,crawl_type);
strcat(windexRandUpdate,", crawl_repeat = ");
strcat(windexRandUpdate,"0");
strcat(windexRandUpdate,", force_rules = ");
strcat(windexRandUpdate,force_rules);
}
strcat(windexRandUpdate,", updated = CURRENT_TIMESTAMP, date = now(), fault = 0 WHERE id = ");
strcat(windexRandUpdate,randID);
if (mysql_query(con, windexRandUpdate))
{
finish_with_error(con);
}
}
}
copiedRandom = 1;
}
}
if(idexistsalready == 1 || (copiedRandom == 1 && nShards == 0)){ //update an existing entry or a new entry with no shard listed in row
if(idexistsalready == 1)
printf("\nUpdating index... ");
strcat(windexupdate,finalURL);
strcat(windexupdate,"', url_noprefix = '");
strcat(windexupdate,finalURLnoprefix);
strcat(windexupdate,"', title = '");
if(titlesize > 0 && emptytitle == 0){
strcat(windexupdate,title);
}
else{
if(finalURLsize < 111){
strcat(windexupdate,finalURL);
}
else{
strcat(windexupdate,"Untitled");
}
}
if(copiedRandom == 0)//normal update
strcat(windexupdate,"', description = '");
else{
strcat(windexupdate,"', tags = NULL, description = '");
}
strcat(windexupdate,description);
strcat(windexupdate,"', body = '");
strcat(windexupdate,body);
strcat(windexupdate,"', worksafe = ");
strcat(windexupdate,worksafe);
strcat(windexupdate,", approver = '");
strcat(windexupdate,approver);
strcat(windexupdate,"', surprise = ");
strcat(windexupdate,surprise);
strcat(windexupdate,", http = ");
strcat(windexupdate,httpAllow);
strcat(windexupdate,", updatable = ");
strcat(windexupdate,updatable);
if(task==0){//didn't come from refresh or link crawling
if(idexistsalready == 0){
strcat(windexupdate,", crawl_tree = NULL");
strcat(windexupdate,", crawl_family = NULL");
}
strcat(windexupdate,", crawl_pages = ");
strcat(windexupdate,crawl_pages);
strcat(windexupdate,", crawl_type = ");
strcat(windexupdate,crawl_type);
strcat(windexupdate,", crawl_repeat = ");
strcat(windexupdate,crawl_repeat);
strcat(windexupdate,", force_rules = ");
strcat(windexupdate,force_rules);
}else if(task != 0 && task[0]=='2' && idexistsalready == 0){//came from link crawling
strcat(windexupdate,", crawl_tree = '");
strcat(windexupdate,crawl_tree);
strcat(windexupdate,"', crawl_family ='");
strcat(windexupdate,crawl_family);
strcat(windexupdate,"', crawl_pages = ");
strcat(windexupdate,crawl_pages);
strcat(windexupdate,", crawl_type = ");
strcat(windexupdate,crawl_type);
strcat(windexupdate,", crawl_repeat = ");
strcat(windexupdate,"0");
strcat(windexupdate,", force_rules = ");
strcat(windexupdate,force_rules);
}
if(copiedRandom == 0)//normal update
strcat(windexupdate,", updated = CURRENT_TIMESTAMP, fault = 0 WHERE id = ");
else
strcat(windexupdate,", updated = CURRENT_TIMESTAMP, date = now(), fault = 0 WHERE id = ");
strcat(windexupdate,idexistsvalue);//will be same as randID if a new page is replacing that row
if (mysql_query(con, windexupdate))
{
finish_with_error(con);
}
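/* Note: the date column is refreshed (date = now()) only when copiedRandom == 1, i.e. when a newly
   crawled page is replacing a randomly selected row; a normal refresh of an existing entry keeps its
   original date and only bumps the updated timestamp. */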
//update shard
if(nShards>0 && idexistsalready == 1 && shard != 0){
memset(windexupdate,0,strlen(windexupdate));
strcpy(windexupdate,"UPDATE ws");
strcat(windexupdate,shard);
strcat(windexupdate," SET url = '");
strcat(windexupdate,finalURL);
strcat(windexupdate,"', url_noprefix = '");
strcat(windexupdate,finalURLnoprefix);
strcat(windexupdate,"', title = '");
if(titlesize > 0 && emptytitle == 0){
strcat(windexupdate,title);
}
else{
if(finalURLsize < 111){
strcat(windexupdate,finalURL);
}
else{
strcat(windexupdate,"Untitled");
}
}
if(copiedRandom == 0)//normal update
strcat(windexupdate,"', description = '");
else{
strcat(windexupdate,"', tags = NULL, description = '");
}
strcat(windexupdate,description);
strcat(windexupdate,"', body = '");
strcat(windexupdate,body);
strcat(windexupdate,"', worksafe = ");
strcat(windexupdate,worksafe);
strcat(windexupdate,", approver = '");
strcat(windexupdate,approver);
strcat(windexupdate,"', surprise = ");
strcat(windexupdate,surprise);
strcat(windexupdate,", http = ");
strcat(windexupdate,httpAllow);
strcat(windexupdate,", updatable = ");
strcat(windexupdate,updatable);
if(task==0){//didn't come from refresh or link crawling
strcat(windexupdate,", crawl_pages = ");
strcat(windexupdate,crawl_pages);
strcat(windexupdate,", crawl_type = ");
strcat(windexupdate,crawl_type);
strcat(windexupdate,", crawl_repeat = ");
strcat(windexupdate,crawl_repeat);
strcat(windexupdate,", force_rules = ");
strcat(windexupdate,force_rules);
}
strcat(windexupdate,", updated = CURRENT_TIMESTAMP, fault = 0 WHERE id = ");
strcat(windexupdate,idexistsvalue);//will be same as randID if a new page is replacing that row
if (mysql_query(con, windexupdate))
{
finish_with_error(con);
}
}
}
//unreserve randomly selected ID
if(id_assigned==1 && idexistsalready==0 && reserveFail==0){
if (mysql_query(con, "use wibytemp;"))
{
finish_with_error(con);
}
memset(randomreserve,0,100);
strcpy(randomreserve,"DELETE FROM reserve_id where id = ");
strcat(randomreserve,randID);
strcat(randomreserve,";");
if (mysql_query(con, randomreserve))
{
finish_with_error(con);
}
//back to wiby database
if (mysql_query(con, "use wiby;"))
{
finish_with_error(con);
}
}
//unreserve ID if doing an update
if(id_assigned==1 && updatereserve==1){
if (mysql_query(con, "use wibytemp;"))
{
finish_with_error(con);
}
memset(idReserve,0,200);
strcpy(idReserve,"DELETE FROM reserve_id where id = ");
strcat(idReserve,idexistsvalue);
strcat(idReserve,";");
if(mysql_query(con, idReserve))
{
finish_with_error(con);
}
//back to wiby database
if (mysql_query(con, "use wiby;"))
{
finish_with_error(con);
}
}
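/* Both unreserve blocks above release the id from wibytemp.reserve_id, which appears to act as a
   short-lived lock so that concurrent crawler instances don't claim the same windex id; the connection
   is switched back to the wiby database afterwards. */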
//free result
if(idexistsalready == 0){
mysql_free_result(resultRandID);
}
//===================remove the entry from the indexqueue===============
//printf("\nRemoving from queue...");
char sqlqueryremove[200];
memset(sqlqueryremove,0,200);
strcpy(sqlqueryremove,"DELETE FROM indexqueue WHERE id=");
strcat(sqlqueryremove,id);
strcat(sqlqueryremove,";");
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
}
printf("\n\nSuccess!");
}
//clear page from memory
free(windexinsert); free(windexupdate); free(titlecheckinsert); free(windexRandUpdate); //free(shardinsert);
}else{
skip = 1;
}
if((skip == 1 || titlechanged == 1 || redirected == 1)){
//from skip check: if(((noindex == 0 && bodysize < 1900000 && bodysize > 10) || (noindex == 0 && bodysize < 1900000 && descriptionsize > 10)) && response_code == 200 && alreadydone==0)
//printf("\nnoindex: %d\nbodysize: %ld\ndescriptionsize %ld\nresponse_code: %d\nalreadydone: %d\nskip: %d\ntitlechanged: %d\nredirected: %d",noindex,bodysize,descriptionsize,response_code,alreadydone,skip,titlechanged,redirected);
if(skip == 1){
printf("\nDoesn't want to be indexed, size too big, 404, already done, failed rules, or security issue.");
//log previous failed link crawls
strcpy(previousfail[4],previousfail[3]);
strcpy(previousfail[3],previousfail[2]);
strcpy(previousfail[2],previousfail[1]);
strcpy(previousfail[1],previousfail[0]);
strcpy(previousfail[0],urlnoprefix);
}
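/* previousfail[] works as a small five-slot history of the most recently failed link-crawl URLs
   (newest in slot 0); it is presumably consulted earlier in the crawler so the same failing links are
   not queued over and over. */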
printf("\nRemoving from queue...");
char sqlqueryremove[200];
memset(sqlqueryremove,0,200);
strcpy(sqlqueryremove,"DELETE FROM indexqueue WHERE id=");
strcat(sqlqueryremove,id);
strcat(sqlqueryremove,";");
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
}
if(alreadydone==0){
if(idexistsalready == 1 && fault[0] == '1')
{
if(crawl_family != 0 && crawl_family[0] !='0'){
printf("\nPage may no longer exist. Originated from link crawling. Removing from the index.");
FILE *abandoned = fopen("abandoned.txt", "a");
fputs (url,abandoned);
fputs ("\r\n",abandoned);
fclose(abandoned);
}else{
printf("\nPage may no longer exist. Moving to review.");
}
memset(sqlqueryremove,0,200);
strcpy(sqlqueryremove,"DELETE FROM windex WHERE id =");
strcat(sqlqueryremove,idexistsvalue);
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
}
if(nShards > 0 && shard != 0){
memset(sqlqueryremove,0,200);
strcpy(sqlqueryremove,"DELETE FROM ws");
strcat(sqlqueryremove,shard);
strcat(sqlqueryremove," WHERE id = ");
strcat(sqlqueryremove,idexistsvalue);
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
}
}
if(crawl_family == 0 || (crawl_family != 0 && crawl_family[0] =='0')){
char sqlqueryreview[1001];
memset(sqlqueryreview,0,1001);
strcpy(sqlqueryreview,"INSERT INTO reviewqueue (url,worksafe) VALUES ('");
strcat(sqlqueryreview,url);strcat(sqlqueryreview,"',");
strcat(sqlqueryreview,worksafe);strcat(sqlqueryreview,");");
if (mysql_query(con, sqlqueryreview))
{
finish_with_error(con);
}
}
}
else if(idexistsalready == 1 && fault[0] != '1')//mark that there is a fault with the page, crawler will throw it back into review if it happens again
{
printf("\nFault found. Will try again later.");
char sqlqueryfault[450];
memset(sqlqueryfault,0,450);
strcpy(sqlqueryfault,"UPDATE windex SET updated = CURRENT_TIMESTAMP, fault = 1 WHERE id = ");
strcat(sqlqueryfault,idexistsvalue);
if (mysql_query(con, sqlqueryfault))
{
finish_with_error(con);
}
if(nShards>0 && shard != 0){
memset(sqlqueryfault,0,450);
strcpy(sqlqueryfault,"UPDATE ws");
strcat(sqlqueryfault,shard);
strcat(sqlqueryfault," SET updated = CURRENT_TIMESTAMP, fault = 1 WHERE id = ");
strcat(sqlqueryfault,idexistsvalue);
if (mysql_query(con, sqlqueryfault))
{
finish_with_error(con);
}
}
}
else
{
FILE *abandoned = fopen("abandoned.txt", "a");
fputs (url,abandoned);
fputs ("\r\n",abandoned);
fclose(abandoned);
}
}
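/* Fault handling summary: the first failed recrawl of an already-indexed page only sets fault = 1
   (in windex and its shard table) so the page gets retried later; if it fails again, the row is deleted
   and the URL is either appended to abandoned.txt (when it has a crawl_family, i.e. it appears to have
   come from link crawling) or inserted into reviewqueue for manual review. URLs that were never indexed
   go straight to abandoned.txt. */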
//check if link crawling is specified
//make sure duplicates don't get crawled more than once
//check db if it's already indexed too - do this at beginning instead?
//crawl links if crawling through hyperlinks, or from regular refresh while crawl_repeat is on, or during manual submission when appropriate limits are set
}else if(nofollow==0 && getURLs==1 && alreadydone==0){
//cycle through url list, then construct an sql string around it, then insert it to indexqueue;
//force crawl depth of 1 during a refresh if crawl_repeat is set
if(crawl_repeat != 0 && crawl_repeat[0]=='1' && task != 0 && task[0]=='1'){
n_crawl_depth=1;
}
if(n_crawl_depth>0)//below 0 = unlimited depth
n_crawl_depth--;
memset(strDepth,0,101);
sprintf(strDepth,"%d",n_crawl_depth);
//itoa(n_crawl_depth,strDepth,10);
memset(url_fromlist,0,url_fromlist_arraylen);
memset(url_insert,0,url_insert_arraylen);
int loopcount=0,elementnum=0,urls=0;
if(id_assigned == 1){
strcpy(url_insert,"INSERT INTO indexqueue (url,worksafe,approver,surprise,task,crawl_tree,crawl_family,crawl_depth,crawl_pages,crawl_type,crawl_repeat,force_rules,crawler_id) VALUES (");
}else{
strcpy(url_insert,"INSERT INTO indexqueue (url,worksafe,approver,surprise,task,crawl_tree,crawl_family,crawl_depth,crawl_pages,crawl_type,crawl_repeat,force_rules) VALUES (");
}
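/* The loop below walks the shuffled link list one character at a time and builds a single multi-row
   INSERT for the indexqueue, roughly of this shape (illustrative values only):

       INSERT INTO indexqueue (url,worksafe,approver,...) VALUES
           ('http://example.com/a',1,'admin',...), ('http://example.com/b',1,'admin',...);

   The crawler_id column (taken from argv[1]) is included only when this crawler instance was started
   with an explicit id (id_assigned == 1). */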
while(urlListShuffled[loopcount]!=0){
switch(urlListShuffled[loopcount]){
case '\n' ://see if url can be indexed, if so, add to sql insert statement
if(strlen(url_fromlist) < 500){
urlparse(url_fromlist);
//check if internal or external url
int isinternal=1;
if(rootdomain[0]!=0){
isinternal=0;
}else if(url_fromlist[4]==':' || url_fromlist[5]==':'){
isinternal=0;
}else if((url_fromlist[0]=='w' || url_fromlist[0]=='W') && (url_fromlist[1]=='w' || url_fromlist[1]=='W') && (url_fromlist[2]=='w' || url_fromlist[2]=='W') && url_fromlist[3]=='.'){
isinternal=0;
}
int urlNPNP_finalURL_len=strlen(urlNPNP_finalURL);
int isabsolute=0;
if(isinternal==0 && urlNPNP_finalURL_len==strlen(urlnopathnoprefix_fromlist)){
isinternal=isabsolute=1;
for(int q=0;q<urlNPNP_finalURL_len;q++){
if(urlnopathnoprefix_fromlist[q]!=urlNPNP_finalURL[q]){
isinternal=isabsolute=0;
break;
}
}
}
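/* Internal vs. external detection (as implemented above): a link counts as external when urlparse()
   found a root domain, when it carries a scheme (':' at position 4 or 5), or when it starts with
   "www."; it is reclassified as internal (and marked absolute) when its host matches the host of the
   page that was just crawled. */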
if(isinternal==1 && ((crawl_type != 0 && crawl_type[0] != '2') || crawl_type == 0)){//is internal link
if(url_fromlist[0]=='/' && url_fromlist[1] != '.'){//can't handle '..' otherwise append to insert
urls++;
if(urls>1){
strcat(url_insert,", (");
}
strcat(url_insert,"'");
strcat(url_insert,urlPrefix_finalURL);
strcat(url_insert,urlNPNP_finalURL);
strcat(url_insert,url_fromlist);
strcat(url_insert,"',");
strcat(url_insert,worksafe);
strcat(url_insert,",'");
strcat(url_insert,approver);
strcat(url_insert,"',0,2,'");
if(task==0){
strcat(url_insert,url);
}else{
strcat(url_insert,crawl_tree);
}
strcat(url_insert,"','");
strcat(url_insert,finalURL);
strcat(url_insert,"',");
strcat(url_insert,strDepth);
strcat(url_insert,",");
strcat(url_insert,crawl_pages);
strcat(url_insert,",");
strcat(url_insert,crawl_type);
strcat(url_insert,",");
strcat(url_insert,"0");
strcat(url_insert,",");
strcat(url_insert,force_rules);
if(id_assigned == 1){
strcat(url_insert,",");
strcat(url_insert,argv[1]);
}
strcat(url_insert,")");
}else if(url_fromlist[0] != '/' && url_fromlist[0] != '.'){
urls++;
if(urls>1){
strcat(url_insert,", (");
}
strcat(url_insert,"'");
if(isabsolute==0){
strcat(url_insert,urlPrefix_finalURL);
strcat(url_insert,urlNPNP_finalURL);
strcat(url_insert,folderPath_finalURL);
strcat(url_insert,urlcopy);//scrubbed index.html
}else{
strcat(url_insert,urlcopy);
}
strcat(url_insert,"',");
strcat(url_insert,worksafe);
strcat(url_insert,",'");
strcat(url_insert,approver);
strcat(url_insert,"',0,2,'");
if(task==0){
strcat(url_insert,url);
}else{
strcat(url_insert,crawl_tree);
}
strcat(url_insert,"','");
strcat(url_insert,finalURL);
strcat(url_insert,"',");
strcat(url_insert,strDepth);
strcat(url_insert,",");
strcat(url_insert,crawl_pages);
strcat(url_insert,",");
strcat(url_insert,crawl_type);
strcat(url_insert,",");
strcat(url_insert,"0");
strcat(url_insert,",");
strcat(url_insert,force_rules);
if(id_assigned == 1){
strcat(url_insert,",");
strcat(url_insert,argv[1]);
}
strcat(url_insert,")");
}
}else if(isinternal==0 && crawl_type != 0 && crawl_type[0] != '0'){//is external link
if(url_fromlist[0] != '.'){
urls++;
if(urls>1){
strcat(url_insert,", (");
}
strcat(url_insert,"'");
strcat(url_insert,prefix_fromlist);
strcat(url_insert,rootdomain);
strcat(url_insert,urlPath);
strcat(url_insert,"',");
strcat(url_insert,worksafe);
strcat(url_insert,",'");
strcat(url_insert,approver);
strcat(url_insert,"',0,2,'");
if(task==0){
strcat(url_insert,url);
}else{
strcat(url_insert,crawl_tree);
}
strcat(url_insert,"','");
strcat(url_insert,finalURL);
strcat(url_insert,"',");
strcat(url_insert,strDepth);
strcat(url_insert,",");
strcat(url_insert,crawl_pages);
strcat(url_insert,",");
strcat(url_insert,crawl_type);
strcat(url_insert,",");
strcat(url_insert,"0");
strcat(url_insert,",");
strcat(url_insert,force_rules);
if(id_assigned == 1){
strcat(url_insert,",");
strcat(url_insert,argv[1]);
}
strcat(url_insert,")");
}
}
}
memset(url_fromlist,0,url_fromlist_arraylen);
elementnum=0;
loopcount++;
//intentional fall-through: the character right after this newline is consumed below in the same pass
default :
if(elementnum<url_fromlist_arraylen-1){//bounds-check the destination buffer (the original checked loopcount, which stopped copying once the overall list position passed the buffer size)
url_fromlist[elementnum]=urlListShuffled[loopcount];
}
elementnum++;
loopcount++;
}
if(n_crawl_pages == urls || strlen(url_insert)>(url_insert_arraylen-10000))
break;
}
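/* The loop stops early once the per-page link budget is reached (n_crawl_pages == urls) or the INSERT
   statement gets within 10000 bytes of the url_insert buffer size, leaving headroom for the final
   value group. */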
if(urls>0){
strcat(url_insert,";");
//insert into db
if (mysql_query(con, url_insert))
{
finish_with_error(con);
}
}
}
if (curl){
curl_easy_cleanup(curl);//cleanup curl (finalURL is used in the inserts above, which is why cleanup happens at the end here)
curl_global_cleanup();
}
}else{
if(alreadydone == 0){
printf("\nPage was flagged as unable to crawl or banned.");
}else if(idexistsalready==1){
printf("\nPage is already indexed.");
}
printf("\nRemoving from queue...");
char sqlqueryremove[200];
memset(sqlqueryremove,0,200);
strcpy(sqlqueryremove,"DELETE FROM indexqueue WHERE id=");
strcat(sqlqueryremove,id);
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
}
if(idexistsalready==1 && permitted==0){
printf(" Removing from index...");
memset(sqlqueryremove,0,200);
strcpy(sqlqueryremove,"DELETE FROM windex WHERE id=");
strcat(sqlqueryremove,idexistsvalue);
strcat(sqlqueryremove," AND updatable != '0'");
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
}
if(nShards>0 && shard != 0){
memset(sqlqueryremove,0,200);
strcpy(sqlqueryremove,"DELETE FROM ws");
strcat(sqlqueryremove,shard);
strcat(sqlqueryremove," WHERE id=");
strcat(sqlqueryremove,idexistsvalue);
strcat(sqlqueryremove," AND updatable != '0'");
if (mysql_query(con, sqlqueryremove))
{
finish_with_error(con);
}
}
}
FILE *abandoned = fopen("abandoned.txt", "a");
fputs (url,abandoned);
fputs ("\r\n",abandoned);
fclose(abandoned);
}
//cleanup more sql stuff
mysql_free_result(resulturlcheck);
//rotate shard for next insert
if(nShards > 0){
shardnum++;
if(shardnum == nShards)
shardnum=0;
sprintf(shardnumstr,"%d",shardnum);
}
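/* The rotation above cycles shardnum through 0..nShards-1 so that successive new pages are spread
   evenly (round-robin) across the ws<N> shard tables. */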
printf(" Awaiting next page in queue...\n\n");
}
//cleanup more sql stuff
mysql_free_result(result);
mysql_close(con);
if(empty==1)
sleep(5);//sleep 5 seconds
}
exit(0);
}