123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619 |
- //Wiby HTML Parser
- //Separates text from an HTML file
- //Remember to also set sql_mode = "NO_BACKSLASH_ESCAPES" in my.cnf
- #include <stdio.h>
- #include <stdlib.h>
- #include <string.h>
- #include <time.h>
//Capacities (in bytes, not counting the terminating NUL) of the parser's fixed-size buffers
#define window_len 100
#define charset_len 100
#define mysqlcharset_len 100
#define title_len 152
#define keywords_len 1024
#define description_len 182
#define robots_len 100
#define body_len 81920
#define urlList_len 102400
#define strURL_len 102400

//File handles used only by the optional debug dumps of each extracted field
FILE *bodyfile;
FILE *titlefile;
FILE *keywordsfile;
FILE *descriptionfile;
FILE *noindexfile;
FILE *nofollowfile;
FILE *charsetfile;
FILE *urlfile;
FILE *shuffledurlfile;

//Name of the downloaded page that htmlparse() reads
static char filename[] = "page.out";

//Rolling windows over the most recent input bytes (with and without spaces)
char window[window_len];
char windowWithSpaces[window_len];

//Extracted fields; single quotes are doubled in title, keywords, description and body
char charset[charset_len+1];
char mysqlcharset[mysqlcharset_len+1];
char title[title_len+1];
char keywords[keywords_len+1];
char description[description_len+1];
char robots[robots_len+1];
char body[body_len+1];

//Collected links: newline separated list, the URL currently being read, and shuffle work areas
char urlList[urlList_len+1];
char strURL[strURL_len+1];
char urlListShuffled[urlList_len+1];
char urlListHoldShuffled[urlList_len+1];

//Copies of title, body and description with every single quote removed (see filtervars)
char title_filtered[title_len+1];
char body_filtered[body_len+1];
char description_filtered[description_len+1];

//Parser state flags and counters
int titlefound=0;
int charsetfound=0;
int descriptionfound=0;
int keywordsfound=0;
int robotsfound=0;
int nofollow=0;
int noindex=0;
int scriptfound=0;
int stylefound=0;
int urlFound=0;
int urlTagFound=0;
int numURL=0;
int emptytitle=1;
int spaces=0;
int seeded=0;
int num_stylesheets=0;
int num_scripts=0;
int getURLs=1;

//Number of bytes currently stored in each extracted field
long charsetsize=0;
long titlesize=0;
long keywordssize=0;
long descriptionsize=0;
long robotssize=0;
long bodysize=0;

//Helpers defined further down (sqlsafe is declared here but not defined in this part of the file)
int matchMySQLcharset(int html_charset_length, char *html_charset, int html_match_length, char *html_lowercase_match, char *html_uppercase_match);
int locateInWindow(char *window, char *birdLower, char *birdUpper, int length);
int locateInURL(char *url, char *birdLower, char *birdUpper, int length, int urlSize);
int canCrawl(int urlSize, char *urltocheck);
void shuffleURLs(int iterations, long urlListSize);
void sqlsafe();
void charset2mysql();
void filtervars();

//Input file handle, its in-memory contents, and the byte currently being examined
FILE *f;
char *fileStr;
char c;
- void htmlparse(){
- long urlListSize=0;
- numURL=0;
- int intag=0,incomment=0,inscript=0,instyle=0,inlink=0,putspace=0,spacecount=0;
- int urlSize=0,dqcount=0;
- titlefound=charsetfound=descriptionfound=keywordsfound=robotsfound=nofollow=noindex=scriptfound=stylefound=num_stylesheets=num_scripts=0;
- charsetsize=titlesize=keywordssize=descriptionsize=robotssize=bodysize=0;
- memset(window,'#',window_len);
- // window[window_len]=0;
- memset(windowWithSpaces,'#',window_len);
- // windowWithSpaces[window_len]=0;
- memset(charset,0,charset_len+1);
- memset(mysqlcharset,0,mysqlcharset_len+1);
- memset(title,0,title_len+1);
- memset(keywords,0,keywords_len+1);
- memset(description,0,description_len+1);
- memset(robots,0,robots_len+1);
- memset(body,0,body_len+1);
- memset(urlList,0,urlList_len+1);
- memset(strURL,0,strURL_len+1);
- memset(urlListShuffled,0,urlList_len+1);
- memset(urlListHoldShuffled,0,urlList_len+1);
- memset(title_filtered,0,title_len+1);
- memset(body_filtered,0,body_len+1);
- memset(description_filtered,0,description_len+1);
- printf("Parsing HTML... ");
- //open html file and load into memory
- f = fopen(filename, "rb");
- fseek(f, 0, SEEK_END);
- long fsize = ftell(f);
- fseek(f, 0, SEEK_SET); /* same as rewind(f); */
- fileStr = malloc(fsize + 1);
- if(fread(fileStr, 1, fsize, f)){};
- fclose(f);
- fileStr[fsize] = 0;
-
- //Locate the charset, title, description, keywords, robots, body
- //must accomodate human error in markup
- //must double all single quotes for mysql safety
- //dont allow extra whitespace, ignore cr/lf/tabs
- //complete it all in one pass
-
- for(int i=0;i<fsize;i++){
- c = fileStr[i];
- int skipchar = 0;
- if(c== 10 || c == 13 || c == 14 || c == 15 || c == 127 || c == 0 || c == 9){
- skipchar = 1;
- }
-
- //use a rolling window of 100 bytes to detect elements, ignore lf/cr/so/si/space/null/tab
- if(skipchar == 0 && c != 32){
- for(int j=0;j<window_len-1;j++){
- window[j] = window[j+1];
- }
- window[window_len-1] = c;
- }
- //use a rolling window of 100 bytes to detect elements, but permit space, ignore lf/cr/null/tab
- if(skipchar == 0){
- for(int j=0;j<window_len-1;j++){
- windowWithSpaces[j] = windowWithSpaces[j+1];
- }
- windowWithSpaces[window_len-1] = c;
- }
-
- //Get Title
- if(titlefound == 2){
- if(titlesize < (title_len-2) && skipchar == 0){
- title[titlesize]=c;
- titlesize++;
- if(c == 39){//check for single quotes and double them up for sql safety
- title[titlesize]=c;
- titlesize++;
- }
- if(c != 32 && skipchar == 0){//some titles are just a bunch of spaces or garbage, need to check for that
- emptytitle = 0;
- }
- }
- if(locateInWindow(window,"</title>","</TITLE>",8)==1){
- titlefound = 3;
- //remove </title> from end of title by inserting null at location of <
- titlesize -= 8;
- title[titlesize] = 0;
- //printf("\n%s",title);
- }
- }
- if(titlefound == 1 && c=='>')//in case of this situation: <title some_nonsense>
- titlefound=2;
- if(titlefound == 0 && locateInWindow(window,"<title","<TITLE",6)==1){
- titlefound = 1;
- }
-
- //Get Charset
- if(charsetfound == 1){
- if(c == '>' || c == '/'){
- charsetfound = 2;
- //printf("\n%s",charset);
- }
- if(charsetfound == 1 && charsetsize < charset_len && c != '"' && c != '\'' && skipchar == 0){
- charset[charsetsize]=c;
- charsetsize++;
- }
- }
- if(charsetfound == 0 && locateInWindow(window,"charset=","CHARSET=",8)==1){
- charsetfound = 1;
- }
-
- //Get Description
- if(descriptionfound == 1){
- if(c == '>' || c == '/'){
- descriptionfound = 2;
- //printf("\n%s",description);
- }
- if(descriptionfound == 1 && descriptionsize < (description_len-2) && c != '"' && skipchar == 0){
- description[descriptionsize]=c;
- descriptionsize++;
- if(c == 39){//check for single quotes and double them up for sql safety
- description[descriptionsize]=c;
- descriptionsize++;
- }
- }
- }
- if(descriptionfound == 0 && locateInWindow(window,"description\"content=","DESCRIPTION\"CONTENT=",20)==1){
- descriptionfound = 1;
- }
-
- //Get Keywords
- if(keywordsfound == 1){
- if(c == '>' || c == '/'){
- keywordsfound = 2;
- //printf("\n%s",keywords);
- }
- if(keywordsfound == 1 && keywordssize < (keywords_len-2) && c != '"' && skipchar == 0){
- keywords[keywordssize]=c;
- keywordssize++;
- if(c == 39){//check for single quotes and double them up for sql safety
- keywords[keywordssize]=c;
- keywordssize++;
- }
- }
- }
- if(keywordsfound == 0 && locateInWindow(window,"keywords\"content=","KEYWORDS\"CONTENT=",17)==1){
- keywordsfound = 1;
- }
-
- //Get Robots (nofollow, noindex)
- if(robotsfound == 1){
- if(c == '>' || c == '/'){
- robotsfound = 2;
- //printf("\n%s",robots);
- if(locateInWindow(window,"nofollow","NOFOLLOW",8)==1)
- nofollow=1;
- if(locateInWindow(window,"noindex","NOINDEX",7)==1 || locateInWindow(window,"none","NONE",4)==1)
- noindex=nofollow=1;
- }
- if(robotsfound == 1 && robotssize < robots_len && c != '"' && c != '\'' && skipchar == 0){
- robots[robotssize]=c;
- robotssize++;
- }
- }
- if(robotsfound == 0 && locateInWindow(window,"robots\"content=","ROBOTS\"CONTENT=",15)==1){
- robotsfound = 1;
- }
-
- if(titlefound != 2){
- //Ignore between scripts, styles, and remove all tags, repeated spaces, tabs, cr, lf, null, add a space at end of every tag
- if(c=='<'){
- intag = 1;
- }else if(c=='>'){
- intag = 0;
- putspace = 1;
- }
-
- if(locateInWindow(window,"<!--","<!--",4)==1){
- incomment = 1;
- }else if(locateInWindow(window,"-->","-->",3)==1){
- incomment = 0;
- }
-
- if(locateInWindow(window,"<script","<SCRIPT",7)==1 && c != ' ' && skipchar == 0){
- inscript = 1;
- num_scripts++;
- }else if(locateInWindow(window,"</script>","</SCRIPT>",9)==1){
- inscript = 0;
- }
-
- if(locateInWindow(window,"<style","<STYLE",6)==1 && c != ' ' && skipchar == 0){
- instyle = 1;
- num_stylesheets++;
- }else if(locateInWindow(window,"</style>","</STYLE>",8)==1){
- instyle = 0;
- }
-
- if(locateInWindow(window,"<link","<LINK",5)==1){
- inlink = 1;
- }else if(inlink==1 && locateInWindow(window,">",">",1)==1){
- inlink = 0;
- }
- if(inlink==1){
- if(locateInWindow(window,".css",".CSS",4)==1 && c != ' ' && skipchar == 0)
- num_stylesheets++;
- }
- //Get Body
- //exclude remaining tags, comments, scripts, styles, cr, lf, null, tab, add a space after a '>' but only allow one
- if(intag == 0 && incomment == 0 && inscript == 0 && instyle == 0 && inlink == 0 && skipchar == 0 && bodysize < (body_len-2)){
- if(putspace == 1){
- if(spacecount == 0){
- body[bodysize]=32;
- bodysize++;
- }
- spacecount++;
- putspace=0;
- }else{
- if(c==32)
- spacecount++;
- else spacecount = 0;
-
- if(spacecount < 2){
- body[bodysize]=c;
- bodysize++;
-
- if(c == 39){//check for single quotes and double them up for sql safety
- body[bodysize]=c;
- bodysize++;
- }
- }
- }
- }
- }
- //Get URL's
- if(getURLs==1){
- if(urlFound == 1 && incomment==0 && instyle==0 && inscript==0 && inlink == 0){
- if(c=='"' || c=='\'')
- dqcount++;
- if((c == '#' && urlSize==0) || (dqcount == 2 && urlSize == 0) || (c == ' ' && urlSize == 0))
- urlFound=urlTagFound=dqcount=0;
- if((c == '>' || c == ' ') && urlFound == 1){
- if(canCrawl(urlSize,strURL)==0 || (urlSize+urlListSize) >= (urlList_len-1)){
- memset(strURL,0,strURL_len+1);
- }else{
- strcat(urlList,strURL);
- strcat(urlList,"\n");
- urlListSize+=urlSize+1;
- memset(strURL,0,strURL_len+1);
- numURL++;
- }
- urlFound = urlTagFound = urlSize = dqcount = 0;
- }
- if(urlFound == 1 && urlListSize < (urlList_len-2) && c != '"' && c != '\'' && urlSize < (strURL_len-2)){
- strURL[urlSize]=window[window_len-1];
- urlSize++;
- }
- if(urlSize==11){
- if(locateInWindow(window,"javascript:","JAVASCRIPT:",11)==1){
- urlFound=urlTagFound=urlSize=dqcount=0;
- memset(strURL,0,strURL_len+1);
- }
- }
- }
- if(urlFound == 0 && urlTagFound == 0 && incomment == 0 && instyle == 0 && inscript == 0 && inlink == 0 && locateInWindow(windowWithSpaces,"<a ","<A ",3)==1){//sometimes there is something between "<a" and "href"
- urlTagFound = 1;
- }
- if(urlFound == 0 && incomment == 0 && instyle == 0 && inscript == 0 && inlink == 0 && (locateInWindow(window,"ahref=","AHREF=",6)==1) || (urlTagFound == 1 && locateInWindow(window,"href=","HREF=",5)==1)){
- urlFound = 1;
- }
- }
- }
-
- //Convert charset to mysql equivalent
- charset2mysql();
-
- //Filter additional characters *if* required
- filtervars();
-
- //print body to file
- /* bodyfile = fopen("body.txt","wb");
- fputs(body,bodyfile);
- fclose(bodyfile);
- //print title to file
- titlefile = fopen("title.txt","wb");
- fputs(title,titlefile);
- fclose(titlefile);
-
- //print keywords to file
- keywordsfile = fopen("keywords.txt","wb");
- fputs(keywords,keywordsfile);
- fclose(keywordsfile);
-
- //print description to file
- descriptionfile = fopen("description.txt","wb");
- fputs(description,descriptionfile);
- fclose(descriptionfile);
-
- //print charset to file
- charsetfile = fopen("charset.txt","wb");
- fputs(mysqlcharset,charsetfile);
- fclose(charsetfile);
-
- //print noindex to file
- noindexfile = fopen("noindex.txt","wb");
- if(noindex==1)
- fputs("noindex",noindexfile);
- fclose(noindexfile);
- //print nofollow to file
- nofollowfile = fopen("nofollow.txt","wb");
- if(nofollow==1)
- fputs("nofollow",nofollowfile);
- fclose(nofollowfile);*/
-
- if(getURLs==1){
- //shuffle order of collected URLs list
- shuffleURLs(10,urlListSize);
- //printf("\n%s",urlList);
-
- /*//print URLs to file
- urlfile = fopen("url.txt","wb");
- fputs(urlList,urlfile);
- fclose(urlfile);
-
- //print shuffled URLs to file
- shuffledurlfile = fopen("urlshuffled.txt","wb");
- fputs(urlListShuffled,shuffledurlfile);
- fclose(shuffledurlfile); */
- }
-
- free(fileStr);
- printf("\nbody: %ld, title: %ld, charset: %ld, description: %ld, keywords: %ld, noindex: %d, nofollow: %d",bodysize,titlesize,charsetsize,descriptionsize,keywordssize,noindex,nofollow);
- }
- void shuffleURLs(int iterations, long urlListSize)
- {
- if(seeded==0){
- srand(time(NULL));
- seeded=1;
- }
-
- int r1,r2,r1to2;
- int urlCount,i,j,k,l;
- if(numURL>2){
- strcpy(urlListHoldShuffled,urlList);
- for(int loops=0;loops<iterations;loops++){
- r1 = r1to2 = (rand() % numURL) + 1;
- r2 = (rand() % numURL) + 1;
- if(r1>r2){
- r1=r2;
- r2=r1to2;
- }
- if(r1==r2){
- continue;
- }
- urlCount=i=j=k=l=0;
-
- //skip to url number r1
- while(urlCount < r1 /*&& i<urlList_len*/){
- if(urlListHoldShuffled[i]=='\n')
- urlCount++;
- i++;
- }
- j=i;
- //copy to urlListShuffled starting at j until reaching r2 location
- while(urlCount<r2 /*&& j<urlList_len*/){
- urlListShuffled[k]=urlListHoldShuffled[j];
- if(urlListHoldShuffled[j]=='\n')
- urlCount++;
- j++;
- k++;
- }
- //concat url's before i
- while(l<i /*&& k<urlList_len*/){
- urlListShuffled[k]=urlListHoldShuffled[l];
- l++;
- k++;
- }
- //concat url's after k
- while(k<urlListSize /*&& k<urlList_len*/){
- urlListShuffled[k]=urlListHoldShuffled[k];
- k++;
- }
- strcpy(urlListHoldShuffled,urlListShuffled);
- }
- }else{
- strcpy(urlListShuffled,urlList);
- }
-
- }
- void charset2mysql()
- {
- //if no charset specified, use utf8
- if(charsetsize == 0){
- strcpy(mysqlcharset,"SET CHARSET utf8;");
- printf("No Charset found. %s",mysqlcharset);
- }
- else{ //else, match charset with a proper mysql charset
-
- if(matchMySQLcharset(charsetsize,charset,5,"utf-8","UTF-8")==1){
- strcpy(mysqlcharset,"SET CHARSET utf8mb4;");
- printf("%s",mysqlcharset);
- }
- else if(matchMySQLcharset(charsetsize,charset,6,"latin1","LATIN1")==1){
- strcpy(mysqlcharset,"SET CHARSET latin1;");
- printf("%s",mysqlcharset);
- }
- else if(matchMySQLcharset(charsetsize,charset,9,"shift-jis","SHIFT-JIS")==1){
- strcpy(mysqlcharset,"SET CHARSET cp932;");
- printf("%s",mysqlcharset);
- }
- else if(matchMySQLcharset(charsetsize,charset,6,"x-sjis","X-SJIS")==1){
- strcpy(mysqlcharset,"SET CHARSET cp932;");
- printf("%s",mysqlcharset);
- }
- else if(matchMySQLcharset(charsetsize,charset,10,"iso-8859-1","ISO-8859-1")==1){
- strcpy(mysqlcharset,"SET CHARSET latin1;");
- printf("%s",mysqlcharset);
- }
- else if(matchMySQLcharset(charsetsize,charset,12,"windows-1252","WINDOWS-1252")==1){
- strcpy(mysqlcharset,"SET CHARSET latin1;");
- printf("%s",mysqlcharset);
- }
- else if(matchMySQLcharset(charsetsize,charset,12,"windows-1251","WINDOWS-1251")==1){
- strcpy(mysqlcharset,"SET CHARSET cp1251;");
- printf("%s",mysqlcharset);
- }
- else if(matchMySQLcharset(charsetsize,charset,6,"koi8-r","KOI8-R")==1){
- strcpy(mysqlcharset,"SET CHARSET cp1251;");
- printf("%s",mysqlcharset);
- }
- else if(matchMySQLcharset(charsetsize,charset,6,"euc-kr","EUC-KR")==1){
- strcpy(mysqlcharset,"SET CHARSET euckr;");
- printf("%s",mysqlcharset);
- }
- else if(matchMySQLcharset(charsetsize,charset,4,"big5","BIG5")==1){
- strcpy(mysqlcharset,"SET CHARSET big5;");
- printf("%s",mysqlcharset);
- }
- else{
- strcpy(mysqlcharset,"SET CHARSET utf8;");
- printf("Charset mismatch. %s",mysqlcharset);
- }
- }
- }
/*
 * matchMySQLcharset - test whether a charset name starts with a known pattern.
 *
 * html_charset_length:   number of valid bytes in html_charset.
 * html_charset:          charset text taken from the page.
 * html_match_length:     length of the pattern.
 * html_lowercase_match / html_uppercase_match: the pattern in both spellings.
 *
 * Returns 1 when the first html_match_length bytes of html_charset match the pattern,
 * where each byte may match either spelling. A position is accepted without comparison
 * when either side holds '-' or '_' there (so "utf_8" matches "utf-8").
 * Returns 0 otherwise, including when the charset is shorter than the pattern or the
 * pattern is empty.
 */
int matchMySQLcharset(int html_charset_length, char *html_charset, int html_match_length, char *html_lowercase_match, char *html_uppercase_match)
{
    for(int i=0;i<html_match_length;i++){
        //index html_charset_length is already past the valid bytes, so stop before reading it
        if(i >= html_charset_length){
            return 0;
        }
        char have = html_charset[i];
        char want = html_lowercase_match[i];
        int separator = (have == '_' || have == '-' || want == '_' || want == '-');
        if(!separator && have != want && have != html_uppercase_match[i]){
            return 0;
        }
    }
    return html_match_length > 0;
}
- int locateInWindow(char *window, char *birdLower, char *birdUpper, int length)
- {
- int start = window_len-length;
- for(int i=0;i<length;i++){
- if(window[start] != birdLower[i] && window[start] != birdUpper[i]){
- return 0;
- }
- start++;
- }
- return 1;
- }
/*
 * locateInURL - test whether a URL ends with a pattern.
 *
 * url:       URL text; only the first urlSize bytes are considered.
 * birdLower / birdUpper: the pattern in lowercase and uppercase spelling.
 * length:    number of pattern bytes to compare.
 * urlSize:   number of bytes in url.
 *
 * Returns 1 when each of the last `length` bytes of the URL equals the byte at the same
 * position in either spelling, 0 otherwise (including when the URL is shorter than the
 * pattern).
 */
int locateInURL(char *url, char *birdLower, char *birdUpper, int length, int urlSize)
{
    if(urlSize < length){
        return 0;
    }
    long start = (long)urlSize - length;
    for(int i=0;i<length;i++){
        //both spellings are compared against the URL itself
        if(url[start+i] != birdLower[i] && url[start+i] != birdUpper[i]){
            return 0;
        }
    }
    return 1;
}
/*
 * canCrawl - decide whether a collected link may be added to the crawl list.
 *
 * urlSize:    number of bytes in urltocheck.
 * urltocheck: the link text.
 *
 * Returns 0 when
 *   - the link is longer than 5 bytes, has a ':' after index 3 and does not start with
 *     an http / https style prefix, or
 *   - it contains '?', '\', a quote or a space, or
 *   - its last path segment has a file extension that is not in the allowed list.
 * Returns 1 otherwise (links without any dot are always accepted).
 * Removing this check would queue everything listed, including external links.
 */
int canCrawl(int urlSize, char *urltocheck){
    //file extensions that may be crawled, in the order they are tested
    static char *extLower[] = {".html",".htm",".txt",".php",".asp",".xhtml",".shtml"};
    static char *extUpper[] = {".HTML",".HTM",".TXT",".PHP",".ASP",".XHTML",".SHTML"};
    static const int extLen[] = {5,4,4,4,4,6,6};
    const int extCount = (int)(sizeof extLen / sizeof extLen[0]);

    int dots=0,slashes=0;
    int sawExt=0,extAt=0,sawScheme=0;

    for(int pos=0;pos<urlSize;pos++){
        char ch = urltocheck[pos];

        //a ':' this far in must belong to an http:/ or https: style prefix
        if(ch==':' && pos>3 && urlSize>5){
            int httpLike =
                   (urltocheck[0]=='h' || urltocheck[0]=='H')
                && (urltocheck[1]=='t' || urltocheck[1]=='T')
                && (urltocheck[2]=='t' || urltocheck[2]=='T')
                && (urltocheck[3]=='p' || urltocheck[3]=='P')
                && (urltocheck[4]=='s' || urltocheck[4]=='S' || urltocheck[4]==':')
                && (urltocheck[5]==':' || urltocheck[5]=='/');
            if(!httpLike){
                return 0;
            }
            sawScheme=1;
        }

        switch(ch){
            case '?':
            case '\\':
            case '"':
            case '\'':
            case ' ':
                return 0;
            case '.':
                //remember the most recent dot as a possible extension
                dots++;
                sawExt=1;
                extAt=pos;
                break;
            case '/':
                //a slash after the dot means the dot was part of a directory or host name
                slashes++;
                if(sawExt==1 && pos>extAt){
                    sawExt=0;
                }
                break;
            default:
                break;
        }

        //with a scheme prefix, dots seen before a third slash are not treated as an extension
        if(sawScheme==1 && slashes<=2){
            sawExt=0;
        }
    }

    if(dots==0){
        return 1;
    }
    if(sawExt==0){
        return 1;
    }
    //restrict file extensions to these
    for(int e=0;e<extCount;e++){
        if(locateInURL(urltocheck,extLower[e],extUpper[e],extLen[e],urlSize)==1){
            return 1;
        }
    }
    return 0;
}
- void filtervars(){
- //Creates a copy of title, description, body variables with single-qutoes filtered out
- //will be used for the shard tables, but not on the primary 'windex' table
- //allows a more restrictive query to be used. Is agnostic to searches containing single-quotes as a compromise
- //filter title
- int j=0;
- for(int i=0;i<titlesize;i++){
- if(title[i]!=39){
- title_filtered[j]=title[i];
- j++;
- }
- }
-
- //filter description
- j=0;
- for(int i=0;i<descriptionsize;i++){
- if(description[i]!=39){
- description_filtered[j]=description[i];
- j++;
- }
- }
-
- //filter body
- j=0;
- for(int i=0;i<bodysize;i++){
- if(body[i]!=39){
- body_filtered[j]=body[i];
- j++;
- }
- }
- }
|