//Wiby HTML Parser //Separates text from an HTML file //Remember to also set sql_mode = "NO_BACKSLASH_ESCAPES" in my.cnf #include #include #include #include #define window_len 100 #define charset_len 100 #define mysqlcharset_len 100 #define title_len 144 #define keywords_len 1024 #define description_len 182 #define robots_len 100 #define body_len 81920 #define urlList_len 102400 #define strURL_len 102400 FILE *bodyfile,*titlefile, *keywordsfile, *descriptionfile, *noindexfile, *nofollowfile, *charsetfile, *urlfile, *shuffledurlfile; static char filename[] = "page.out"; char window[window_len],windowWithSpaces[window_len],charset[charset_len+1],mysqlcharset[mysqlcharset_len+1],title[title_len+1],keywords[keywords_len+1],description[description_len+1],robots[robots_len+1],body[body_len+1]; char urlList[urlList_len+1],strURL[strURL_len+1],urlListShuffled[urlList_len+1],urlListHoldShuffled[urlList_len+1]; char title_filtered[title_len+1], body_filtered[body_len+1], description_filtered[description_len+1]; int titlefound=0,charsetfound=0,descriptionfound=0,keywordsfound=0,robotsfound=0,nofollow=0,noindex=0,scriptfound=0,stylefound=0,urlFound=0,urlTagFound=0,numURL=0,emptytitle=1,spaces=0,seeded=0,num_stylesheets=0,num_scripts=0,getURLs=1; long charsetsize=0,titlesize=0,keywordssize=0,descriptionsize=0,robotssize=0,bodysize=0; int matchMySQLcharset(int html_charset_length, char *html_charset, int html_match_length, char *html_lowercase_match, char *html_uppercase_match); int locateInWindow(char *window, char *birdLower, char *birdUpper, int length); int locateInURL(char *url, char *birdLower, char *birdUpper, int length, int urlSize); int canCrawl(int urlSize, char *urltocheck); void shuffleURLs(int iterations, long urlListSize); void sqlsafe(); void charset2mysql(); void filtervars(); FILE *f; char *fileStr; char c; void htmlparse(){ long urlListSize=0; numURL=0; int intag=0,incomment=0,inscript=0,instyle=0,inlink=0,putspace=0,spacecount=0,foundbr=0; int urlSize=0,dqcount=0; titlefound=charsetfound=descriptionfound=keywordsfound=robotsfound=nofollow=noindex=scriptfound=stylefound=num_stylesheets=num_scripts=0; charsetsize=titlesize=keywordssize=descriptionsize=robotssize=bodysize=0; memset(window,'#',window_len); // window[window_len]=0; memset(windowWithSpaces,'#',window_len); // windowWithSpaces[window_len]=0; memset(charset,0,charset_len+1); memset(mysqlcharset,0,mysqlcharset_len+1); memset(title,0,title_len+1); memset(keywords,0,keywords_len+1); memset(description,0,description_len+1); memset(robots,0,robots_len+1); memset(body,0,body_len+1); memset(urlList,0,urlList_len+1); memset(strURL,0,strURL_len+1); memset(urlListShuffled,0,urlList_len+1); memset(urlListHoldShuffled,0,urlList_len+1); memset(title_filtered,0,title_len+1); memset(body_filtered,0,body_len+1); memset(description_filtered,0,description_len+1); printf("Parsing HTML... "); //open html file and load into memory f = fopen(filename, "rb"); fseek(f, 0, SEEK_END); long fsize = ftell(f); fseek(f, 0, SEEK_SET); /* same as rewind(f); */ fileStr = malloc(fsize + 1); if(fread(fileStr, 1, fsize, f)){}; fclose(f); fileStr[fsize] = 0; //Locate the charset, title, description, keywords, robots, body //must accomodate human error in markup //must double all single quotes for mysql safety //dont allow extra whitespace, ignore cr/lf/tabs //complete it all in one pass for(int i=0;i= (title_len-2)) titlefound=3; } if(locateInWindow(window,"","",8)==1 && titlefound!=3){ titlefound = 3; //remove from end of title by inserting null at location of < titlesize -= 8; if(titlesize < 0){ //avoids this: titlesize = 0; emptytitle = 1; } title[titlesize] = 0; //printf("\n%s",title); } } if(titlefound == 1 && c=='>')//in case of this situation: titlefound = 2; if(titlefound == 0 && locateInWindow(window,"<title","<TITLE",6)==1){ titlefound = 1; } //Get Charset if(charsetfound == 1){ if(c == '>' || c == '/'){ charsetfound = 2; //printf("\n%s",charset); } if(charsetfound == 1 && charsetsize < charset_len && c != '"' && c != '\'' && skipchar == 0){ charset[charsetsize]=c; charsetsize++; } } if(charsetfound == 0 && locateInWindow(window,"charset=","CHARSET=",8)==1){ charsetfound = 1; } //Get Description if(descriptionfound == 1){ if(c == '>' || c == '/'){ descriptionfound = 2; //printf("\n%s",description); } if(descriptionfound == 1 && descriptionsize < (description_len-2) && c != '"' && skipchar == 0){ description[descriptionsize]=c; descriptionsize++; if(c == 39){//check for single quotes and double them up for sql safety description[descriptionsize]=c; descriptionsize++; } } } if(descriptionfound == 0 && locateInWindow(window,"description\"content=","DESCRIPTION\"CONTENT=",20)==1){ descriptionfound = 1; } //Get Keywords if(keywordsfound == 1){ if(c == '>' || c == '/'){ keywordsfound = 2; //printf("\n%s",keywords); } if(keywordsfound == 1 && keywordssize < (keywords_len-2) && c != '"' && skipchar == 0){ keywords[keywordssize]=c; keywordssize++; if(c == 39){//check for single quotes and double them up for sql safety keywords[keywordssize]=c; keywordssize++; } } } if(keywordsfound == 0 && locateInWindow(window,"keywords\"content=","KEYWORDS\"CONTENT=",17)==1){ keywordsfound = 1; } //Get Robots (nofollow, noindex) if(robotsfound == 1){ if(c == '>' || c == '/'){ robotsfound = 2; //printf("\n%s",robots); if(locateInWindow(window,"nofollow","NOFOLLOW",8)==1) nofollow=1; if(locateInWindow(window,"noindex","NOINDEX",7)==1 || locateInWindow(window,"none","NONE",4)==1) noindex=nofollow=1; } if(robotsfound == 1 && robotssize < robots_len && c != '"' && c != '\'' && skipchar == 0){ robots[robotssize]=c; robotssize++; } } if(robotsfound == 0 && locateInWindow(window,"robots\"content=","ROBOTS\"CONTENT=",15)==1){ robotsfound = 1; } if(titlefound != 2){ //Ignore between scripts, styles, and remove all tags, repeated spaces, tabs, cr, lf, null, add a space at end of every tag if(c=='<'){ intag = 1; }else if(c=='>'){ intag = 0; putspace = 1; } if(locateInWindow(window,"<!--","<!--",4)==1){ incomment = 1; }else if(locateInWindow(window,"-->","-->",3)==1){ incomment = 0; } if(locateInWindow(window,"<script","<SCRIPT",7)==1 && c != ' ' && skipchar == 0){ inscript = 1; num_scripts++; }else if(locateInWindow(window,"</script>","</SCRIPT>",9)==1){ inscript = 0; } if(locateInWindow(window,"<style","<STYLE",6)==1 && c != ' ' && skipchar == 0){ instyle = 1; num_stylesheets++; }else if(locateInWindow(window,"</style>","</STYLE>",8)==1){ instyle = 0; } if(locateInWindow(window,"<link","<LINK",5)==1){ inlink = 1; }else if(inlink==1 && locateInWindow(window,">",">",1)==1){ inlink = 0; } if(inlink==1){ if(locateInWindow(window,".css",".CSS",4)==1 && c != ' ' && skipchar == 0) num_stylesheets++; } //Get Body //exclude remaining tags, comments, scripts, styles, cr, lf, null, tab, add a space after a '>' but only allow one if(intag == 0 && incomment == 0 && inscript == 0 && instyle == 0 && inlink == 0 && skipchar == 0 && bodysize < (body_len-2)){ if(putspace == 1){ if(spacecount == 0){ body[bodysize]=32; bodysize++; } spacecount++; putspace=0; }else{ if(c==32) spacecount++; else spacecount = 0; if(spacecount < 2){ body[bodysize]=c; bodysize++; if(c == 39){//check for single quotes and double them up for sql safety body[bodysize]=c; bodysize++; } } } } } //Get URL's if(getURLs==1){ if(urlFound == 1 && incomment==0 && instyle==0 && inscript==0 && inlink == 0){ if(c=='"' || c=='\'') dqcount++; if((c == '#' && urlSize==0) || (dqcount == 2 && urlSize == 0) || (c == ' ' && urlSize == 0)) urlFound=urlTagFound=dqcount=0; if((c == '>' || c == ' ') && urlFound == 1){ if(canCrawl(urlSize,strURL)==0 || (urlSize+urlListSize) >= (urlList_len-1)){ memset(strURL,0,strURL_len+1); }else{ strcat(urlList,strURL); strcat(urlList,"\n"); urlListSize+=urlSize+1; memset(strURL,0,strURL_len+1); numURL++; } urlFound = urlTagFound = urlSize = dqcount = 0; } if(urlFound == 1 && urlListSize < (urlList_len-2) && c != '"' && c != '\'' && urlSize < (strURL_len-2)){ strURL[urlSize]=window[window_len-1]; urlSize++; } if(urlSize==11){ if(locateInWindow(window,"javascript:","JAVASCRIPT:",11)==1){ urlFound=urlTagFound=urlSize=dqcount=0; memset(strURL,0,strURL_len+1); } } } if(urlFound == 0 && urlTagFound == 0 && incomment == 0 && instyle == 0 && inscript == 0 && inlink == 0 && locateInWindow(windowWithSpaces,"<a ","<A ",3)==1){//sometimes there is something between "<a" and "href" urlTagFound = 1; } if(urlFound == 0 && incomment == 0 && instyle == 0 && inscript == 0 && inlink == 0 && (locateInWindow(window,"ahref=","AHREF=",6)==1 || (urlTagFound == 1 && locateInWindow(window,"href=","HREF=",5)==1))){ urlFound = 1; } } } //Convert charset to mysql equivalent charset2mysql(); //Filter additional characters *if* required filtervars(); //print body to file /* bodyfile = fopen("body.txt","wb"); fputs(body,bodyfile); fclose(bodyfile); //print title to file titlefile = fopen("title.txt","wb"); fputs(title,titlefile); fclose(titlefile); //print keywords to file keywordsfile = fopen("keywords.txt","wb"); fputs(keywords,keywordsfile); fclose(keywordsfile); //print description to file descriptionfile = fopen("description.txt","wb"); fputs(description,descriptionfile); fclose(descriptionfile); //print charset to file charsetfile = fopen("charset.txt","wb"); fputs(mysqlcharset,charsetfile); fclose(charsetfile); //print noindex to file noindexfile = fopen("noindex.txt","wb"); if(noindex==1) fputs("noindex",noindexfile); fclose(noindexfile); //print nofollow to file nofollowfile = fopen("nofollow.txt","wb"); if(nofollow==1) fputs("nofollow",nofollowfile); fclose(nofollowfile);*/ if(getURLs==1){ //shuffle order of collected URLs list shuffleURLs(10,urlListSize); //printf("\n%s",urlList); /*//print URLs to file urlfile = fopen("url.txt","wb"); fputs(urlList,urlfile); fclose(urlfile); //print shuffled URLs to file shuffledurlfile = fopen("urlshuffled.txt","wb"); fputs(urlListShuffled,shuffledurlfile); fclose(shuffledurlfile); */ } free(fileStr); printf("\nbody: %ld, title: %ld, charset: %ld, description: %ld, keywords: %ld, noindex: %d, nofollow: %d",bodysize,titlesize,charsetsize,descriptionsize,keywordssize,noindex,nofollow); } void shuffleURLs(int iterations, long urlListSize) { if(seeded==0){ srand(time(NULL)); seeded=1; } int r1,r2,r1to2; int urlCount,i,j,k,l; if(numURL>2){ strcpy(urlListHoldShuffled,urlList); for(int loops=0;loops<iterations;loops++){ r1 = r1to2 = (rand() % numURL) + 1; r2 = (rand() % numURL) + 1; if(r1>r2){ r1=r2; r2=r1to2; } if(r1==r2){ continue; } urlCount=i=j=k=l=0; //skip to url number r1 while(urlCount < r1 /*&& i<urlList_len*/){ if(urlListHoldShuffled[i]=='\n') urlCount++; i++; } j=i; //copy to urlListShuffled starting at j until reaching r2 location while(urlCount<r2 /*&& j<urlList_len*/){ urlListShuffled[k]=urlListHoldShuffled[j]; if(urlListHoldShuffled[j]=='\n') urlCount++; j++; k++; } //concat url's before i while(l<i /*&& k<urlList_len*/){ urlListShuffled[k]=urlListHoldShuffled[l]; l++; k++; } //concat url's after k while(k<urlListSize /*&& k<urlList_len*/){ urlListShuffled[k]=urlListHoldShuffled[k]; k++; } strcpy(urlListHoldShuffled,urlListShuffled); } }else{ strcpy(urlListShuffled,urlList); } } void charset2mysql() { //if no charset specified, use utf8 if(charsetsize == 0){ strcpy(mysqlcharset,"SET CHARSET utf8;"); printf("No Charset found. %s",mysqlcharset); } else{ //else, match charset with a proper mysql charset if(matchMySQLcharset(charsetsize,charset,5,"utf-8","UTF-8")==1){ strcpy(mysqlcharset,"SET CHARSET utf8mb4;"); printf("%s",mysqlcharset); } else if(matchMySQLcharset(charsetsize,charset,6,"latin1","LATIN1")==1){ strcpy(mysqlcharset,"SET CHARSET latin1;"); printf("%s",mysqlcharset); } else if(matchMySQLcharset(charsetsize,charset,9,"shift-jis","SHIFT-JIS")==1){ strcpy(mysqlcharset,"SET CHARSET cp932;"); printf("%s",mysqlcharset); } else if(matchMySQLcharset(charsetsize,charset,6,"x-sjis","X-SJIS")==1){ strcpy(mysqlcharset,"SET CHARSET cp932;"); printf("%s",mysqlcharset); } else if(matchMySQLcharset(charsetsize,charset,10,"iso-8859-1","ISO-8859-1")==1){ strcpy(mysqlcharset,"SET CHARSET latin1;"); printf("%s",mysqlcharset); } else if(matchMySQLcharset(charsetsize,charset,12,"windows-1252","WINDOWS-1252")==1){ strcpy(mysqlcharset,"SET CHARSET latin1;"); printf("%s",mysqlcharset); } else if(matchMySQLcharset(charsetsize,charset,12,"windows-1251","WINDOWS-1251")==1){ strcpy(mysqlcharset,"SET CHARSET cp1251;"); printf("%s",mysqlcharset); } else if(matchMySQLcharset(charsetsize,charset,12,"windows-1250","WINDOWS-1250")==1){ strcpy(mysqlcharset,"SET CHARSET cp1250;"); printf("%s",mysqlcharset); } else if(matchMySQLcharset(charsetsize,charset,6,"koi8-r","KOI8-R")==1){ strcpy(mysqlcharset,"SET CHARSET cp1251;"); printf("%s",mysqlcharset); } else if(matchMySQLcharset(charsetsize,charset,6,"euc-kr","EUC-KR")==1){ strcpy(mysqlcharset,"SET CHARSET euckr;"); printf("%s",mysqlcharset); } else if(matchMySQLcharset(charsetsize,charset,4,"big5","BIG5")==1){ strcpy(mysqlcharset,"SET CHARSET big5;"); printf("%s",mysqlcharset); } else{ strcpy(mysqlcharset,"SET CHARSET utf8;"); printf("Charset mismatch. %s",mysqlcharset); } } } int matchMySQLcharset(int html_charset_length, char *html_charset, int html_match_length, char *html_lowercase_match, char *html_uppercase_match) { int match = 0; int i=0; for(;i<html_match_length;i++){ if(i > html_charset_length){ return 0; } if(html_charset[i] != 95 && html_charset[i] != 45 && html_lowercase_match[i] != 95 && html_lowercase_match[i] != 45){ // _ or - if(html_lowercase_match[i] != html_charset[i] && html_uppercase_match[i] != html_charset[i]){ return 0; } } match = 1; } return match; } int locateInWindow(char *window, char *birdLower, char *birdUpper, int length) { int start = window_len-length; for(int i=0;i<length;i++){ if(window[start] != birdLower[i] && window[start] != birdUpper[i]){ return 0; } start++; } return 1; } int locateInURL(char *url, char *birdLower, char *birdUpper, int length, int urlSize) { long start = urlSize-length; if(urlSize >= length){ for(int i=0;i<length;i++){ if(url[start] != birdLower[i] && url[start] != birdUpper[i]){ return 0; } start++; } return 1; }else{ return 0; } } //Check if url can be indexed (allow relative links for html and txt files. Removing this check will add to the queue everything listed including external links. int canCrawl(int urlSize, char *urltocheck){ int numDots=0,numSlash=0; int slashpos=0,dotspos=0; int extfound=0,extlocation=0,prefixfound=0; for(int i=0;i<urlSize;i++){ if(urlSize>5 && urltocheck[i]==':' && i>3){ if((urltocheck[0]!='h' && urltocheck[0]!='H') || (urltocheck[1]!='t' && urltocheck[1]!='T') || (urltocheck[2]!='t' && urltocheck[2]!='T') || (urltocheck[3]!='p' && urltocheck[3]!='P') || (urltocheck[4]!='s' && urltocheck[4]!='S' && urltocheck[4]!=':') || (urltocheck[5]!=':' && urltocheck[5]!='/')) return 0; prefixfound=1; } if(urltocheck[i]=='?' || urltocheck[i]=='\\' || urltocheck[i] == '"' || urltocheck[i] == '\'' || urltocheck[i] == ' '){ return 0; } if(urltocheck[i]=='.'){ numDots++; } if(urltocheck[i]=='/'){ numSlash++; } if(urltocheck[i]=='.' ){ extfound=1; extlocation=i; } if(urltocheck[i]=='/' && extfound==1 && i>extlocation){ extfound=0; } if(prefixfound==1 && numSlash-2<=0){ extfound=0; } } if(numDots == 0){ return 1; } //restrict file extensions to these if(extfound==1 && (locateInURL(urltocheck,".html",".HTML",5,urlSize)==1 || locateInURL(urltocheck,".htm",".HTM",4,urlSize)==1 || locateInURL(urltocheck,".txt",".TXT",4,urlSize)==1 || locateInURL(urltocheck,".php",".PHP",4,urlSize)==1 || locateInURL(urltocheck,".asp",".ASP",4,urlSize)==1 || locateInURL(urltocheck,".xhtml",".XHTML",6,urlSize)==1 || locateInURL(urltocheck,".shtml",".SHTML",6,urlSize)==1)){ return 1; } if(extfound==0 ) return 1; return 0; } void filtervars(){ //Creates a copy of title, description, body variables with single-quotes filtered out //will be used for the shard tables, but not on the primary 'windex' table //allows a more restrictive query to be used. Is agnostic to searches containing single-quotes as a compromise //filter title int j=0; for(int i=0;i<titlesize;i++){ if(title[i]!=39){ title_filtered[j]=title[i]; j++; } } //filter description j=0; for(int i=0;i<descriptionsize;i++){ if(description[i]!=39){ description_filtered[j]=description[i]; j++; } } //filter body j=0; for(int i=0;i<bodysize;i++){ if(body[i]!=39){ body_filtered[j]=body[i]; j++; } } }