// htmlparse.h
// Wiby HTML Parser
// Separates text from an HTML file.
// Remember to also set sql_mode = "NO_BACKSLASH_ESCAPES" in my.cnf
  4. #include <stdio.h>
  5. #include <stdlib.h>
  6. #include <string.h>
  7. #include <time.h>
  8. #define window_len 100
  9. #define charset_len 100
  10. #define mysqlcharset_len 100
  11. #define title_len 152
  12. #define keywords_len 1024
  13. #define description_len 182
  14. #define robots_len 100
  15. #define body_len 81920
  16. #define urlList_len 102400
  17. #define strURL_len 102400
  18. FILE *bodyfile,*titlefile, *keywordsfile, *descriptionfile, *noindexfile, *nofollowfile, *charsetfile, *urlfile, *shuffledurlfile;
  19. static char filename[] = "page.out";
  20. char window[window_len],windowWithSpaces[window_len],charset[charset_len+1],mysqlcharset[mysqlcharset_len+1],title[title_len+1],keywords[keywords_len+1],description[description_len+1],robots[robots_len+1],body[body_len+1];
  21. char urlList[urlList_len+1],strURL[strURL_len+1],urlListShuffled[urlList_len+1],urlListHoldShuffled[urlList_len+1];
  22. int titlefound=0,charsetfound=0,descriptionfound=0,keywordsfound=0,robotsfound=0,nofollow=0,noindex=0,scriptfound=0,stylefound=0,urlFound=0,urlTagFound=0,numURL=0,emptytitle=1,spaces=0,seeded=0,num_stylesheets=0,num_scripts=0,getURLs=1;
  23. long charsetsize=0,titlesize=0,keywordssize=0,descriptionsize=0,robotssize=0,bodysize=0;
  24. int matchMySQLcharset(int html_charset_length, char *html_charset, int html_match_length, char *html_lowercase_match, char *html_uppercase_match);
  25. int locateInWindow(char *window, char *birdLower, char *birdUpper, int length);
  26. int locateInURL(char *url, char *birdLower, char *birdUpper, int length, int urlSize);
  27. int canCrawl(int urlSize);
  28. void shuffleURLs(int iterations, long urlListSize);
  29. void sqlsafe();
  30. void charset2mysql();
  31. FILE *f;
  32. char *fileStr;
  33. char c;
  34. void htmlparse(){
  35. long urlListSize=0;
  36. numURL=0;
  37. int intag=0,incomment=0,inscript=0,instyle=0,inlink=0,putspace=0,spacecount=0;
  38. int urlSize=0,dqcount=0;
  39. titlefound=charsetfound=descriptionfound=keywordsfound=robotsfound=nofollow=noindex=scriptfound=stylefound=0;
  40. charsetsize=titlesize=keywordssize=descriptionsize=robotssize=bodysize=0;
  41. memset(window,'#',window_len);
  42. // window[window_len]=0;
  43. memset(windowWithSpaces,'#',window_len);
  44. // windowWithSpaces[window_len]=0;
  45. memset(charset,0,charset_len+1);
  46. memset(mysqlcharset,0,mysqlcharset_len+1);
  47. memset(title,0,title_len+1);
  48. memset(keywords,0,keywords_len+1);
  49. memset(description,0,description_len+1);
  50. memset(robots,0,robots_len+1);
  51. memset(body,0,body_len+1);
  52. memset(urlList,0,urlList_len+1);
  53. memset(strURL,0,strURL_len+1);
  54. memset(urlListShuffled,0,urlList_len+1);
  55. memset(urlListHoldShuffled,0,urlList_len+1);
  56. printf("Parsing HTML... ");
  57. //open html file and load into memory
  58. f = fopen(filename, "rb");
  59. fseek(f, 0, SEEK_END);
  60. long fsize = ftell(f);
  61. fseek(f, 0, SEEK_SET); /* same as rewind(f); */
  62. fileStr = malloc(fsize + 1);
  63. if(fread(fileStr, 1, fsize, f)){};
  64. fclose(f);
  65. fileStr[fsize] = 0;
  66. //Locate the charset, title, description, keywords, robots, body
  67. //must accomodate human error in markup
  68. //must double all single quotes for mysql safety
  69. //dont allow extra whitespace, ignore cr/lf/tabs
  70. //complete it all in one pass
  71. for(int i=0;i<fsize;i++){
  72. c = fileStr[i];
  73. int skipchar = 0;
  74. if(c== 10 || c == 13 || c == 14 || c == 15 || c == 127 || c == 0 || c == 9){
  75. skipchar = 1;
  76. }
  77. //use a rolling window of 100 bytes to detect elements, ignore lf/cr/so/si/space/null/tab
  78. if(skipchar == 0 && c != 32){
  79. for(int j=0;j<window_len-1;j++){
  80. window[j] = window[j+1];
  81. }
  82. window[window_len-1] = c;
  83. }
  84. //use a rolling window of 100 bytes to detect elements, but permit space, ignore lf/cr/null/tab
  85. if(skipchar == 0){
  86. for(int j=0;j<window_len-1;j++){
  87. windowWithSpaces[j] = windowWithSpaces[j+1];
  88. }
  89. windowWithSpaces[window_len-1] = c;
  90. }
  91. //Get Title
  92. if(titlefound == 2){
  93. if(titlesize < (title_len-2) && skipchar == 0){
  94. title[titlesize]=c;
  95. titlesize++;
  96. if(c == 39){//check for single quotes and double them up for sql safety
  97. title[titlesize]=c;
  98. titlesize++;
  99. }
  100. if(c != 32 && skipchar == 0){//some titles are just a bunch of spaces or garbage, need to check for that
  101. emptytitle = 0;
  102. }
  103. }
  104. if(locateInWindow(window,"</title>","</TITLE>",8)==1){
  105. titlefound = 3;
  106. //remove </title> from end of title by inserting null at location of <
  107. titlesize -= 8;
  108. title[titlesize] = 0;
  109. //printf("\n%s",title);
  110. }
  111. }
  112. if(titlefound == 1 && c=='>')//in case of this situation: <title some_nonsense>
  113. titlefound=2;
  114. if(titlefound == 0 && locateInWindow(window,"<title","<TITLE",6)==1){
  115. titlefound = 1;
  116. }
  117. //Get Charset
  118. if(charsetfound == 1){
  119. if(c == '>' || c == '/'){
  120. charsetfound = 2;
  121. //printf("\n%s",charset);
  122. }
  123. if(charsetfound == 1 && charsetsize < charset_len && c != '"' && c != '\'' && skipchar == 0){
  124. charset[charsetsize]=c;
  125. charsetsize++;
  126. }
  127. }
  128. if(charsetfound == 0 && locateInWindow(window,"charset=","CHARSET=",8)==1){
  129. charsetfound = 1;
  130. }
  131. //Get Description
  132. if(descriptionfound == 1){
  133. if(c == '>' || c == '/'){
  134. descriptionfound = 2;
  135. //printf("\n%s",description);
  136. }
  137. if(descriptionfound == 1 && descriptionsize < (description_len-2) && c != '"' && skipchar == 0){
  138. description[descriptionsize]=c;
  139. descriptionsize++;
  140. if(c == 39){//check for single quotes and double them up for sql safety
  141. description[descriptionsize]=c;
  142. descriptionsize++;
  143. }
  144. }
  145. }
  146. if(descriptionfound == 0 && locateInWindow(window,"description\"content=","DESCRIPTION\"CONTENT=",20)==1){
  147. descriptionfound = 1;
  148. }
  149. //Get Keywords
  150. if(keywordsfound == 1){
  151. if(c == '>' || c == '/'){
  152. keywordsfound = 2;
  153. //printf("\n%s",keywords);
  154. }
  155. if(keywordsfound == 1 && keywordssize < (keywords_len-2) && c != '"' && skipchar == 0){
  156. keywords[keywordssize]=c;
  157. keywordssize++;
  158. if(c == 39){//check for single quotes and double them up for sql safety
  159. keywords[keywordssize]=c;
  160. keywordssize++;
  161. }
  162. }
  163. }
  164. if(keywordsfound == 0 && locateInWindow(window,"keywords\"content=","KEYWORDS\"CONTENT=",17)==1){
  165. keywordsfound = 1;
  166. }
  167. //Get Robots (nofollow, noindex)
  168. if(robotsfound == 1){
  169. if(c == '>' || c == '/'){
  170. robotsfound = 2;
  171. //printf("\n%s",robots);
  172. if(locateInWindow(window,"nofollow","NOFOLLOW",8)==1)
  173. nofollow=1;
  174. if(locateInWindow(window,"noindex","NOINDEX",7)==1 || locateInWindow(window,"none","NONE",4)==1)
  175. noindex=nofollow=1;
  176. }
  177. if(robotsfound == 1 && robotssize < robots_len && c != '"' && c != '\'' && skipchar == 0){
  178. robots[robotssize]=c;
  179. robotssize++;
  180. }
  181. }
  182. if(robotsfound == 0 && locateInWindow(window,"robots\"content=","ROBOTS\"CONTENT=",15)==1){
  183. robotsfound = 1;
  184. }
  185. if(titlefound != 2){
  186. //Ignore between scripts, styles, and remove all tags, repeated spaces, tabs, cr, lf, null, add a space at end of every tag
  187. if(c=='<'){
  188. intag = 1;
  189. }else if(c=='>'){
  190. intag = 0;
  191. putspace = 1;
  192. }
  193. if(locateInWindow(window,"<!--","<!--",4)==1){
  194. incomment = 1;
  195. }else if(locateInWindow(window,"-->","-->",3)==1){
  196. incomment = 0;
  197. }
  198. if(locateInWindow(window,"<script","<SCRIPT",7)==1){
  199. inscript = 1;
  200. num_scripts++;
  201. }else if(locateInWindow(window,"</script>","</SCRIPT>",9)==1){
  202. inscript = 0;
  203. }
  204. if(locateInWindow(window,"<style","<STYLE",6)==1){
  205. instyle = 1;
  206. }else if(locateInWindow(window,"</style>","</STYLE>",8)==1){
  207. instyle = 0;
  208. }
  209. if(locateInWindow(window,"<link","<LINK",5)==1){
  210. inlink = 1;
  211. }else if(inlink==1 && locateInWindow(window,">",">",1)==1){
  212. inlink = 0;
  213. }
  214. if(inlink==1){
  215. if(locateInWindow(window,".css",".CSS",4)==1)
  216. num_stylesheets++;
  217. }
  218. //Get Body
  219. //exclude remaining tags, comments, scripts, styles, cr, lf, null, tab, add a space after a '>' but only allow one
  220. if(intag == 0 && incomment == 0 && inscript == 0 && instyle == 0 && inlink == 0 && skipchar == 0 && bodysize < (body_len-2)){
  221. if(putspace == 1){
  222. if(spacecount == 0){
  223. body[bodysize]=32;
  224. bodysize++;
  225. }
  226. spacecount++;
  227. putspace=0;
  228. }else{
  229. if(c==32)
  230. spacecount++;
  231. else spacecount = 0;
  232. if(spacecount < 2){
  233. body[bodysize]=c;
  234. bodysize++;
  235. if(c == 39){//check for single quotes and double them up for sql safety
  236. body[bodysize]=c;
  237. bodysize++;
  238. }
  239. }
  240. }
  241. }
  242. }
  243. //Get URL's
  244. if(getURLs==1){
  245. if(urlFound == 1 && incomment==0 && instyle==0 && inscript==0 && inlink == 0){
  246. if(c=='"' || c=='\'')
  247. dqcount++;
  248. if((c == '#' && urlSize==0) || (dqcount == 2 && urlSize == 0) || (c == ' ' && urlSize == 0))
  249. urlFound=urlTagFound=dqcount=0;
  250. if((c == '>' || c == ' ') && urlFound == 1){
  251. if(canCrawl(urlSize)==0 || (urlSize+urlListSize) >= (urlList_len-1)){
  252. memset(strURL,0,strURL_len+1);
  253. }else{
  254. strcat(urlList,strURL);
  255. strcat(urlList,"\n");
  256. urlListSize+=urlSize+1;
  257. memset(strURL,0,strURL_len+1);
  258. numURL++;
  259. }
  260. urlFound = urlTagFound = urlSize = dqcount = 0;
  261. }
  262. if(urlFound == 1 && urlListSize < (urlList_len-2) && c != '"' && c != '\'' && urlSize < (strURL_len-2)){
  263. strURL[urlSize]=window[window_len-1];
  264. urlSize++;
  265. }
  266. if(urlSize==11){
  267. if(locateInWindow(window,"javascript:","JAVASCRIPT:",11)==1){
  268. urlFound=urlTagFound=urlSize=dqcount=0;
  269. memset(strURL,0,strURL_len+1);
  270. }
  271. }
  272. }
  273. if(urlFound == 0 && urlTagFound == 0 && incomment == 0 && instyle == 0 && inscript == 0 && inlink == 0 && locateInWindow(windowWithSpaces,"<a ","<A ",3)==1){//sometimes there is something between "<a" and "href"
  274. urlTagFound = 1;
  275. }
  276. if(urlFound == 0 && urlTagFound == 1 && incomment == 0 && instyle == 0 && inscript == 0 && inlink == 0 && locateInWindow(window,"href=","HREF=",5)==1){
  277. urlFound = 1;
  278. }
  279. }
  280. }
  281. //Convert charset to mysql equivalent
  282. charset2mysql();
  283. //print body to file
  284. /* bodyfile = fopen("body.txt","wb");
  285. fputs(body,bodyfile);
  286. fclose(bodyfile);
  287. //print title to file
  288. titlefile = fopen("title.txt","wb");
  289. fputs(title,titlefile);
  290. fclose(titlefile);
  291. //print keywords to file
  292. keywordsfile = fopen("keywords.txt","wb");
  293. fputs(keywords,keywordsfile);
  294. fclose(keywordsfile);
  295. //print description to file
  296. descriptionfile = fopen("description.txt","wb");
  297. fputs(description,descriptionfile);
  298. fclose(descriptionfile);
  299. //print charset to file
  300. charsetfile = fopen("charset.txt","wb");
  301. fputs(mysqlcharset,charsetfile);
  302. fclose(charsetfile);
  303. //print noindex to file
  304. noindexfile = fopen("noindex.txt","wb");
  305. if(noindex==1)
  306. fputs("noindex",noindexfile);
  307. fclose(noindexfile);
  308. //print nofollow to file
  309. nofollowfile = fopen("nofollow.txt","wb");
  310. if(nofollow==1)
  311. fputs("nofollow",nofollowfile);
  312. fclose(nofollowfile);*/
  313. if(getURLs==1){
  314. //shuffle order of collected URLs list
  315. shuffleURLs(10,urlListSize);
  316. //printf("\n%s",urlList);
  317. //print URLs to file
  318. /* urlfile = fopen("url.txt","wb");
  319. fputs(urlList,urlfile);
  320. fclose(urlfile);
  321. //print shuffled URLs to file
  322. shuffledurlfile = fopen("urlshuffled.txt","wb");
  323. fputs(urlListShuffled,shuffledurlfile);
  324. fclose(shuffledurlfile);*/
  325. }
  326. free(fileStr);
  327. printf("\nbody: %ld, title: %ld, charset: %ld, description: %ld, keywords: %ld, noindex: %d, nofollow: %d",bodysize,titlesize,charsetsize,descriptionsize,keywordssize,noindex,nofollow);
  328. }
  329. void shuffleURLs(int iterations, long urlListSize)
  330. {
  331. if(seeded==0){
  332. srand(time(NULL));
  333. seeded=1;
  334. }
  335. int r1,r2,r1to2;
  336. int urlCount,i,j,k,l;
  337. if(numURL>2){
  338. strcpy(urlListHoldShuffled,urlList);
  339. for(int loops=0;loops<iterations;loops++){
  340. r1 = r1to2 = (rand() % numURL) + 1;
  341. r2 = (rand() % numURL) + 1;
  342. if(r1>r2){
  343. r1=r2;
  344. r2=r1to2;
  345. }
  346. if(r1==r2){
  347. continue;
  348. }
  349. urlCount=i=j=k=l=0;
  350. //skip to url number r1
  351. while(urlCount < r1 /*&& i<urlList_len*/){
  352. if(urlListHoldShuffled[i]=='\n')
  353. urlCount++;
  354. i++;
  355. }
  356. j=i;
  357. //copy to urlListShuffled starting at j until reaching r2 location
  358. while(urlCount<r2 /*&& j<urlList_len*/){
  359. urlListShuffled[k]=urlListHoldShuffled[j];
  360. if(urlListHoldShuffled[j]=='\n')
  361. urlCount++;
  362. j++;
  363. k++;
  364. }
  365. //concat url's before i
  366. while(l<i /*&& k<urlList_len*/){
  367. urlListShuffled[k]=urlListHoldShuffled[l];
  368. l++;
  369. k++;
  370. }
  371. //concat url's after k
  372. while(k<urlListSize /*&& k<urlList_len*/){
  373. urlListShuffled[k]=urlListHoldShuffled[k];
  374. k++;
  375. }
  376. strcpy(urlListHoldShuffled,urlListShuffled);
  377. }
  378. }else{
  379. strcpy(urlListShuffled,urlList);
  380. }
  381. }
  382. void charset2mysql()
  383. {
  384. //if no charset specified, use utf8
  385. if(charsetsize == 0){
  386. strcpy(mysqlcharset,"SET CHARSET utf8;");
  387. printf("No Charset found. %s",mysqlcharset);
  388. }
  389. else{ //else, match charset with a proper mysql charset
  390. if(matchMySQLcharset(charsetsize,charset,5,"utf-8","UTF-8")==1){
  391. strcpy(mysqlcharset,"SET CHARSET utf8mb4;");
  392. printf("%s",mysqlcharset);
  393. }
  394. else if(matchMySQLcharset(charsetsize,charset,6,"latin1","LATIN1")==1){
  395. strcpy(mysqlcharset,"SET CHARSET latin1;");
  396. printf("%s",mysqlcharset);
  397. }
  398. else if(matchMySQLcharset(charsetsize,charset,9,"shift-jis","SHIFT-JIS")==1){
  399. strcpy(mysqlcharset,"SET CHARSET cp932;");
  400. printf("%s",mysqlcharset);
  401. }
  402. else if(matchMySQLcharset(charsetsize,charset,6,"x-sjis","X-SJIS")==1){
  403. strcpy(mysqlcharset,"SET CHARSET cp932;");
  404. printf("%s",mysqlcharset);
  405. }
  406. else if(matchMySQLcharset(charsetsize,charset,10,"iso-8859-1","ISO-8859-1")==1){
  407. strcpy(mysqlcharset,"SET CHARSET latin1;");
  408. printf("%s",mysqlcharset);
  409. }
  410. else if(matchMySQLcharset(charsetsize,charset,12,"windows-1252","WINDOWS-1252")==1){
  411. strcpy(mysqlcharset,"SET CHARSET latin1;");
  412. printf("%s",mysqlcharset);
  413. }
  414. else if(matchMySQLcharset(charsetsize,charset,12,"windows-1251","WINDOWS-1251")==1){
  415. strcpy(mysqlcharset,"SET CHARSET cp1251;");
  416. printf("%s",mysqlcharset);
  417. }
  418. else if(matchMySQLcharset(charsetsize,charset,6,"koi8-r","KOI8-R")==1){
  419. strcpy(mysqlcharset,"SET CHARSET cp1251;");
  420. printf("%s",mysqlcharset);
  421. }
  422. else if(matchMySQLcharset(charsetsize,charset,6,"euc-kr","EUC-KR")==1){
  423. strcpy(mysqlcharset,"SET CHARSET euckr;");
  424. printf("%s",mysqlcharset);
  425. }
  426. else if(matchMySQLcharset(charsetsize,charset,4,"big5","BIG5")==1){
  427. strcpy(mysqlcharset,"SET CHARSET big5;");
  428. printf("%s",mysqlcharset);
  429. }
  430. else{
  431. strcpy(mysqlcharset,"SET CHARSET utf8;");
  432. printf("Charset mismatch. %s",mysqlcharset);
  433. }
  434. }
  435. }
  436. int matchMySQLcharset(int html_charset_length, char *html_charset, int html_match_length, char *html_lowercase_match, char *html_uppercase_match)
  437. {
  438. int match = 0;
  439. int i=0;
  440. for(;i<html_match_length;i++){
  441. if(i > html_charset_length){
  442. return 0;
  443. }
  444. if(html_charset[i] != 95 && html_charset[i] != 45 && html_lowercase_match[i] != 95 && html_lowercase_match[i] != 45){ // _ or -
  445. if(html_lowercase_match[i] != html_charset[i] && html_uppercase_match[i] != html_charset[i]){
  446. return 0;
  447. }
  448. }
  449. match = 1;
  450. }
  451. return match;
  452. }
  453. int locateInWindow(char *window, char *birdLower, char *birdUpper, int length)
  454. {
  455. int start = window_len-length;
  456. for(int i=0;i<length;i++){
  457. if(window[start] != birdLower[i] && window[start] != birdUpper[i]){
  458. return 0;
  459. }
  460. start++;
  461. }
  462. return 1;
  463. }
  464. int locateInURL(char *url, char *birdLower, char *birdUpper, int length, int urlSize)
  465. {
  466. long start = urlSize-length;
  467. if(urlSize >= length){
  468. for(int i=0;i<length;i++){
  469. if(url[start] != birdLower[i] && window[start] != birdUpper[i]){
  470. return 0;
  471. }
  472. start++;
  473. }
  474. return 1;
  475. }else{
  476. return 0;
  477. }
  478. }
  479. //Check if url can be indexed (allow relative links for html and txt files. Removing this check will add to the queue everything listed including external links.
  480. int canCrawl(int urlSize){
  481. int numDots=0,numSlash=0;
  482. int slashpos=0,dotspos=0;
  483. int extfound=0,extlocation=0,prefixfound=0;
  484. for(int i=0;i<urlSize;i++){
  485. if(urlSize>5 && strURL[i]==':' && i>3){
  486. if((strURL[0]!='h' && strURL[0]!='H') || (strURL[1]!='t' && strURL[1]!='T') || (strURL[2]!='t' && strURL[2]!='T') || (strURL[3]!='p' && strURL[3]!='P') || (strURL[4]!='s' && strURL[4]!='S' && strURL[4]!=':') || (strURL[5]!=':' && strURL[5]!='/'))
  487. return 0;
  488. prefixfound=1;
  489. }
  490. if(strURL[i]=='?' || strURL[i]=='\\'){
  491. return 0;
  492. }
  493. if(strURL[i]=='.'){
  494. numDots++;
  495. }
  496. if(strURL[i]=='/'){
  497. numSlash++;
  498. }
  499. if(strURL[i]=='.' ){
  500. extfound=1;
  501. extlocation=i;
  502. }
  503. if(strURL[i]=='/' && extfound==1 && i>extlocation){
  504. extfound=0;
  505. }
  506. if(prefixfound==1 && numSlash-2<=0){
  507. extfound=0;
  508. }
  509. }
  510. if(numDots == 0){
  511. return 1;
  512. }
  513. //restrict file extensions to these
  514. if(extfound==1 && (locateInURL(strURL,".html",".HTML",5,urlSize)==1 || locateInURL(strURL,".htm",".HTM",4,urlSize)==1 || locateInURL(strURL,".txt",".TXT",4,urlSize)==1 || locateInURL(strURL,".php",".PHP",4,urlSize)==1 || locateInURL(strURL,".asp",".ASP",4,urlSize)==1)){
  515. return 1;
  516. }
  517. if(extfound==0 )
  518. return 1;
  519. return 0;
  520. }