//Wiby HTML Parser
//Separates text from an HTML file
//Remember to also set sql_mode = "NO_BACKSLASH_ESCAPES" in my.cnf
//NOTE(review): the header names on these four lines were missing; the first three are
//restored from the calls this file makes (printf/fopen, malloc, memset/strcat/strcpy).
//<time.h> is an assumption (seeding of the URL shuffle) - confirm against the original.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
//---------------------------------------------------------------------------
//Buffer capacities.
//window and windowWithSpaces are declared with exactly window_len bytes;
//every other buffer below is declared with one byte more than its *_len.
//---------------------------------------------------------------------------
//size of the two sliding match windows
#define window_len 100
//capacity of charset
#define charset_len 100
//capacity of mysqlcharset
#define mysqlcharset_len 100
//capacity of title and title_filtered
#define title_len 144
//capacity of keywords
#define keywords_len 1024
//capacity of description and description_filtered
#define description_len 182
//capacity of robots
#define robots_len 100
//capacity of body and body_filtered
#define body_len 81920
//capacity of urlList, urlListShuffled and urlListHoldShuffled
#define urlList_len 102400
//capacity of strURL (the URL currently being collected)
#define strURL_len 102400

//---------------------------------------------------------------------------
//File handles.
//NOTE(review): none of these are opened in the visible part of the file;
//presumably one output stream per extracted field - confirm against the caller.
//---------------------------------------------------------------------------
FILE *bodyfile;
FILE *titlefile;
FILE *keywordsfile;
FILE *descriptionfile;
FILE *noindexfile;
FILE *nofollowfile;
FILE *charsetfile;
FILE *urlfile;
FILE *shuffledurlfile;

//name of the HTML file that htmlparse() opens and loads
static char filename[] = "page.out";

//---------------------------------------------------------------------------
//Sliding windows over the most recent input characters (no room for a NUL).
//---------------------------------------------------------------------------
char window[window_len];
char windowWithSpaces[window_len];

//---------------------------------------------------------------------------
//Extracted fields, each with one spare byte beyond its capacity.
//---------------------------------------------------------------------------
char charset[charset_len+1];
char mysqlcharset[mysqlcharset_len+1];
char title[title_len+1];
char keywords[keywords_len+1];
char description[description_len+1];
char robots[robots_len+1];
char body[body_len+1];

//---------------------------------------------------------------------------
//URL collection: newline-separated list, the URL in progress, and the two
//buffers used while shuffling the list.
//---------------------------------------------------------------------------
char urlList[urlList_len+1];
char strURL[strURL_len+1];
char urlListShuffled[urlList_len+1];
char urlListHoldShuffled[urlList_len+1];

//---------------------------------------------------------------------------
//Copies of title/body/description with single quotes filtered out
//(filled in by filtervars()).
//---------------------------------------------------------------------------
char title_filtered[title_len+1];
char body_filtered[body_len+1];
char description_filtered[description_len+1];

//---------------------------------------------------------------------------
//Parser state.
//The *found flags for charset/description/keywords/robots move through
//0 (not seen) -> 1 (capturing the value) -> 2 (finished);
//titlefound additionally uses 3 for "title complete".
//---------------------------------------------------------------------------
int titlefound=0;
int charsetfound=0;
int descriptionfound=0;
int keywordsfound=0;
int robotsfound=0;
int nofollow=0;
int noindex=0;
int scriptfound=0;
int stylefound=0;
int urlFound=0;
int urlTagFound=0;
int numURL=0;
int emptytitle=1;
int spaces=0;
int seeded=0;
int num_stylesheets=0;
int num_scripts=0;
int getURLs=1;

//number of bytes currently stored in the corresponding field buffer
long charsetsize=0;
long titlesize=0;
long keywordssize=0;
long descriptionsize=0;
long robotssize=0;
long bodysize=0;

//---------------------------------------------------------------------------
//Forward declarations of the helpers defined further down.
//---------------------------------------------------------------------------
int matchMySQLcharset(int html_charset_length,
                      char *html_charset,
                      int html_match_length,
                      char *html_lowercase_match,
                      char *html_uppercase_match);
int locateInWindow(char *window, char *birdLower, char *birdUpper, int length);
int locateInURL(char *url, char *birdLower, char *birdUpper, int length, int urlSize);
int canCrawl(int urlSize, char *urltocheck);
void shuffleURLs(int iterations, long urlListSize);
void sqlsafe();
void charset2mysql();
void filtervars();

//---------------------------------------------------------------------------
//Input file state used by htmlparse().
//---------------------------------------------------------------------------
//handle for the input file
FILE *f;
//heap buffer holding the whole input file, NUL-terminated by htmlparse()
char *fileStr;
//character currently being examined by the parser
char c;
void htmlparse(){
long urlListSize=0;
numURL=0;
int intag=0,incomment=0,inscript=0,instyle=0,inlink=0,putspace=0,spacecount=0,foundbr=0;
int urlSize=0,dqcount=0;
titlefound=charsetfound=descriptionfound=keywordsfound=robotsfound=nofollow=noindex=scriptfound=stylefound=num_stylesheets=num_scripts=0;
charsetsize=titlesize=keywordssize=descriptionsize=robotssize=bodysize=0;
memset(window,'#',window_len);
// window[window_len]=0;
memset(windowWithSpaces,'#',window_len);
// windowWithSpaces[window_len]=0;
memset(charset,0,charset_len+1);
memset(mysqlcharset,0,mysqlcharset_len+1);
memset(title,0,title_len+1);
memset(keywords,0,keywords_len+1);
memset(description,0,description_len+1);
memset(robots,0,robots_len+1);
memset(body,0,body_len+1);
memset(urlList,0,urlList_len+1);
memset(strURL,0,strURL_len+1);
memset(urlListShuffled,0,urlList_len+1);
memset(urlListHoldShuffled,0,urlList_len+1);
memset(title_filtered,0,title_len+1);
memset(body_filtered,0,body_len+1);
memset(description_filtered,0,description_len+1);
printf("Parsing HTML... ");
//open html file and load into memory
f = fopen(filename, "rb");
fseek(f, 0, SEEK_END);
long fsize = ftell(f);
fseek(f, 0, SEEK_SET); /* same as rewind(f); */
fileStr = malloc(fsize + 1);
if(fread(fileStr, 1, fsize, f)){};
fclose(f);
fileStr[fsize] = 0;
//Locate the charset, title, description, keywords, robots, body
//must accommodate human error in markup
//must double all single quotes for mysql safety
//don't allow extra whitespace, ignore cr/lf/tabs
//complete it all in one pass
for(int i=0;i= (title_len-2))
titlefound=3;
}
if(locateInWindow(window,"","",8)==1 && titlefound!=3){
titlefound = 3;
//remove </title> from end of title by inserting null at location of <
titlesize -= 8;
if(titlesize < 0){ //avoids this:
titlesize = 0;
emptytitle = 1;
}
title[titlesize] = 0;
//printf("\n%s",title);
}
}
if(titlefound == 1 && c=='>')//in case of this situation:
titlefound = 2;
if(titlefound == 0 && locateInWindow(window,"' || c == '/'){
charsetfound = 2;
//printf("\n%s",charset);
}
if(charsetfound == 1 && charsetsize < charset_len && c != '"' && c != '\'' && skipchar == 0){
charset[charsetsize]=c;
charsetsize++;
}
}
if(charsetfound == 0 && locateInWindow(window,"charset=","CHARSET=",8)==1){
charsetfound = 1;
}
//Get Description
if(descriptionfound == 1){
if(c == '>' || c == '/'){
descriptionfound = 2;
//printf("\n%s",description);
}
if(descriptionfound == 1 && descriptionsize < (description_len-2) && c != '"' && skipchar == 0){
description[descriptionsize]=c;
descriptionsize++;
if(c == 39){//check for single quotes and double them up for sql safety
description[descriptionsize]=c;
descriptionsize++;
}
}
}
if(descriptionfound == 0 && locateInWindow(window,"description\"content=","DESCRIPTION\"CONTENT=",20)==1){
descriptionfound = 1;
}
//Get Keywords
if(keywordsfound == 1){
if(c == '>' || c == '/'){
keywordsfound = 2;
//printf("\n%s",keywords);
}
if(keywordsfound == 1 && keywordssize < (keywords_len-2) && c != '"' && skipchar == 0){
keywords[keywordssize]=c;
keywordssize++;
if(c == 39){//check for single quotes and double them up for sql safety
keywords[keywordssize]=c;
keywordssize++;
}
}
}
if(keywordsfound == 0 && locateInWindow(window,"keywords\"content=","KEYWORDS\"CONTENT=",17)==1){
keywordsfound = 1;
}
//Get Robots (nofollow, noindex)
if(robotsfound == 1){
if(c == '>' || c == '/'){
robotsfound = 2;
//printf("\n%s",robots);
if(locateInWindow(window,"nofollow","NOFOLLOW",8)==1)
nofollow=1;
if(locateInWindow(window,"noindex","NOINDEX",7)==1 || locateInWindow(window,"none","NONE",4)==1)
noindex=nofollow=1;
}
if(robotsfound == 1 && robotssize < robots_len && c != '"' && c != '\'' && skipchar == 0){
robots[robotssize]=c;
robotssize++;
}
}
if(robotsfound == 0 && locateInWindow(window,"robots\"content=","ROBOTS\"CONTENT=",15)==1){
robotsfound = 1;
}
if(titlefound != 2){
//Ignore between scripts, styles, and remove all tags, repeated spaces, tabs, cr, lf, null, add a space at end of every tag
if(c=='<'){
intag = 1;
}else if(c=='>'){
intag = 0;
putspace = 1;
}
if(locateInWindow(window,"","-->",3)==1){
incomment = 0;
}
if(locateInWindow(window,"",9)==1){
inscript = 0;
}
if(locateInWindow(window,"",8)==1){
instyle = 0;
}
if(locateInWindow(window,"",">",1)==1){
inlink = 0;
}
if(inlink==1){
if(locateInWindow(window,".css",".CSS",4)==1 && c != ' ' && skipchar == 0)
num_stylesheets++;
}
//Get Body
//exclude remaining tags, comments, scripts, styles, cr, lf, null, tab, add a space after a '>' but only allow one
if(intag == 0 && incomment == 0 && inscript == 0 && instyle == 0 && inlink == 0 && skipchar == 0 && bodysize < (body_len-2)){
if(putspace == 1){
if(spacecount == 0){
body[bodysize]=32;
bodysize++;
}
spacecount++;
putspace=0;
}else{
if(c==32)
spacecount++;
else spacecount = 0;
if(spacecount < 2){
body[bodysize]=c;
bodysize++;
if(c == 39){//check for single quotes and double them up for sql safety
body[bodysize]=c;
bodysize++;
}
}
}
}
}
//Get URLs
if(getURLs==1){
if(urlFound == 1 && incomment==0 && instyle==0 && inscript==0 && inlink == 0){
if(c=='"' || c=='\'')
dqcount++;
if((c == '#' && urlSize==0) || (dqcount == 2 && urlSize == 0) || (c == ' ' && urlSize == 0))
urlFound=urlTagFound=dqcount=0;
if((c == '>' || c == ' ') && urlFound == 1){
if(canCrawl(urlSize,strURL)==0 || (urlSize+urlListSize) >= (urlList_len-1)){
memset(strURL,0,strURL_len+1);
}else{
strcat(urlList,strURL);
strcat(urlList,"\n");
urlListSize+=urlSize+1;
memset(strURL,0,strURL_len+1);
numURL++;
}
urlFound = urlTagFound = urlSize = dqcount = 0;
}
if(urlFound == 1 && urlListSize < (urlList_len-2) && c != '"' && c != '\'' && urlSize < (strURL_len-2)){
strURL[urlSize]=window[window_len-1];
urlSize++;
}
if(urlSize==11){
if(locateInWindow(window,"javascript:","JAVASCRIPT:",11)==1){
urlFound=urlTagFound=urlSize=dqcount=0;
memset(strURL,0,strURL_len+1);
}
}
}
if(urlFound == 0 && urlTagFound == 0 && incomment == 0 && instyle == 0 && inscript == 0 && inlink == 0 && locateInWindow(windowWithSpaces,"2){
strcpy(urlListHoldShuffled,urlList);
for(int loops=0;loopsr2){
r1=r2;
r2=r1to2;
}
if(r1==r2){
continue;
}
urlCount=i=j=k=l=0;
//skip to url number r1
while(urlCount < r1 /*&& i html_charset_length){
return 0;
}
if(html_charset[i] != 95 && html_charset[i] != 45 && html_lowercase_match[i] != 95 && html_lowercase_match[i] != 45){ // _ or -
if(html_lowercase_match[i] != html_charset[i] && html_uppercase_match[i] != html_charset[i]){
return 0;
}
}
match = 1;
}
return match;
}
int locateInWindow(char *window, char *birdLower, char *birdUpper, int length)
{
int start = window_len-length;
for(int i=0;i= length){
for(int i=0;i5 && urltocheck[i]==':' && i>3){
if((urltocheck[0]!='h' && urltocheck[0]!='H') || (urltocheck[1]!='t' && urltocheck[1]!='T') || (urltocheck[2]!='t' && urltocheck[2]!='T') || (urltocheck[3]!='p' && urltocheck[3]!='P') || (urltocheck[4]!='s' && urltocheck[4]!='S' && urltocheck[4]!=':') || (urltocheck[5]!=':' && urltocheck[5]!='/'))
return 0;
prefixfound=1;
}
if(urltocheck[i]=='?' || urltocheck[i]=='\\' || urltocheck[i] == '"' || urltocheck[i] == '\'' || urltocheck[i] == ' '){
return 0;
}
if(urltocheck[i]=='.'){
numDots++;
}
if(urltocheck[i]=='/'){
numSlash++;
}
if(urltocheck[i]=='.' ){
extfound=1;
extlocation=i;
}
if(urltocheck[i]=='/' && extfound==1 && i>extlocation){
extfound=0;
}
if(prefixfound==1 && numSlash-2<=0){
extfound=0;
}
}
if(numDots == 0){
return 1;
}
//restrict file extensions to these
if(extfound==1 && (locateInURL(urltocheck,".html",".HTML",5,urlSize)==1 || locateInURL(urltocheck,".htm",".HTM",4,urlSize)==1 || locateInURL(urltocheck,".txt",".TXT",4,urlSize)==1 || locateInURL(urltocheck,".php",".PHP",4,urlSize)==1 || locateInURL(urltocheck,".asp",".ASP",4,urlSize)==1 || locateInURL(urltocheck,".xhtml",".XHTML",6,urlSize)==1 || locateInURL(urltocheck,".shtml",".SHTML",6,urlSize)==1)){
return 1;
}
if(extfound==0 )
return 1;
return 0;
}
void filtervars(){
//Creates a copy of title, description, body variables with single-quotes filtered out
//will be used for the shard tables, but not on the primary 'windex' table
//allows a more restrictive query to be used. Is agnostic to searches containing single-quotes as a compromise
//filter title
int j=0;
for(int i=0;i