Pārlūkot izejas kodu

Add files via upload

wibyweb 1 gadu atpakaļ
vecāks
revīzija
552bcce204
2 mainītis faili ar 47 papildinājumiem un 6 dzēšanām
  1. 6 6
      c/cr.c
  2. 41 0
      c/htmlparse.h

+ 6 - 6
c/cr.c

@@ -1257,7 +1257,7 @@ int main(int argc, char **argv)
 										strcat(windexRandUpdate,finalURLnoprefix);
 										strcat(windexRandUpdate,finalURLnoprefix);
 										strcat(windexRandUpdate,"', title = '");
 										strcat(windexRandUpdate,"', title = '");
 										if(titlesize > 0 && emptytitle == 0){
 										if(titlesize > 0 && emptytitle == 0){
-											strcat(windexRandUpdate,title);
+											strcat(windexRandUpdate,title_filtered);
 										}
 										}
 										else{
 										else{
 											if(finalURLsize < 111){
 											if(finalURLsize < 111){
@@ -1268,9 +1268,9 @@ int main(int argc, char **argv)
 											}
 											}
 										}
 										}
 										strcat(windexRandUpdate,"', tags = NULL, description = '");
 										strcat(windexRandUpdate,"', tags = NULL, description = '");
-										strcat(windexRandUpdate,description);
+										strcat(windexRandUpdate,description_filtered);
 										strcat(windexRandUpdate,"', body = '");
 										strcat(windexRandUpdate,"', body = '");
-										strcat(windexRandUpdate,body);	
+										strcat(windexRandUpdate,body_filtered);	
 										strcat(windexRandUpdate,"', worksafe = ");
 										strcat(windexRandUpdate,"', worksafe = ");
 										strcat(windexRandUpdate,worksafe);
 										strcat(windexRandUpdate,worksafe);
 										strcat(windexRandUpdate,", approver = '");
 										strcat(windexRandUpdate,", approver = '");
@@ -1404,7 +1404,7 @@ int main(int argc, char **argv)
 								strcat(windexupdate,finalURLnoprefix);
 								strcat(windexupdate,finalURLnoprefix);
 								strcat(windexupdate,"', title = '");
 								strcat(windexupdate,"', title = '");
 								if(titlesize > 0 && emptytitle == 0){
 								if(titlesize > 0 && emptytitle == 0){
-									strcat(windexupdate,title);
+									strcat(windexupdate,title_filtered);
 								}
 								}
 								else{
 								else{
 									if(finalURLsize < 111){
 									if(finalURLsize < 111){
@@ -1419,9 +1419,9 @@ int main(int argc, char **argv)
 								else{
 								else{
 									strcat(windexupdate,"', tags = NULL, description = '");
 									strcat(windexupdate,"', tags = NULL, description = '");
 								}
 								}
-								strcat(windexupdate,description);
+								strcat(windexupdate,description_filtered);
 								strcat(windexupdate,"', body = '");
 								strcat(windexupdate,"', body = '");
-								strcat(windexupdate,body);	
+								strcat(windexupdate,body_filtered);	
 								strcat(windexupdate,"', worksafe = ");
 								strcat(windexupdate,"', worksafe = ");
 								strcat(windexupdate,worksafe);
 								strcat(windexupdate,worksafe);
 								//strcat(windexupdate,", approver = '");
 								//strcat(windexupdate,", approver = '");

+ 41 - 0
c/htmlparse.h

@@ -24,6 +24,7 @@ static char filename[] = "page.out";
 
 
 char window[window_len],windowWithSpaces[window_len],charset[charset_len+1],mysqlcharset[mysqlcharset_len+1],title[title_len+1],keywords[keywords_len+1],description[description_len+1],robots[robots_len+1],body[body_len+1];
 char window[window_len],windowWithSpaces[window_len],charset[charset_len+1],mysqlcharset[mysqlcharset_len+1],title[title_len+1],keywords[keywords_len+1],description[description_len+1],robots[robots_len+1],body[body_len+1];
 char urlList[urlList_len+1],strURL[strURL_len+1],urlListShuffled[urlList_len+1],urlListHoldShuffled[urlList_len+1];
 char urlList[urlList_len+1],strURL[strURL_len+1],urlListShuffled[urlList_len+1],urlListHoldShuffled[urlList_len+1];
+char title_filtered[title_len+1], body_filtered[body_len+1], description_filtered[description_len+1];
 int titlefound=0,charsetfound=0,descriptionfound=0,keywordsfound=0,robotsfound=0,nofollow=0,noindex=0,scriptfound=0,stylefound=0,urlFound=0,urlTagFound=0,numURL=0,emptytitle=1,spaces=0,seeded=0,num_stylesheets=0,num_scripts=0,getURLs=1;
 int titlefound=0,charsetfound=0,descriptionfound=0,keywordsfound=0,robotsfound=0,nofollow=0,noindex=0,scriptfound=0,stylefound=0,urlFound=0,urlTagFound=0,numURL=0,emptytitle=1,spaces=0,seeded=0,num_stylesheets=0,num_scripts=0,getURLs=1;
 long charsetsize=0,titlesize=0,keywordssize=0,descriptionsize=0,robotssize=0,bodysize=0;
 long charsetsize=0,titlesize=0,keywordssize=0,descriptionsize=0,robotssize=0,bodysize=0;
 
 
@@ -34,6 +35,7 @@ int canCrawl(int urlSize, char *urltocheck);
 void shuffleURLs(int iterations, long urlListSize);
 void shuffleURLs(int iterations, long urlListSize);
 void sqlsafe();
 void sqlsafe();
 void charset2mysql();
 void charset2mysql();
+void filtervars();
 
 
 FILE *f;
 FILE *f;
 char *fileStr;
 char *fileStr;
@@ -62,6 +64,9 @@ void htmlparse(){
 	memset(strURL,0,strURL_len+1);
 	memset(strURL,0,strURL_len+1);
 	memset(urlListShuffled,0,urlList_len+1);
 	memset(urlListShuffled,0,urlList_len+1);
 	memset(urlListHoldShuffled,0,urlList_len+1);
 	memset(urlListHoldShuffled,0,urlList_len+1);
+	memset(title_filtered,0,title_len+1);
+	memset(body_filtered,0,body_len+1);
+	memset(description_filtered,0,description_len+1);	
 	printf("Parsing HTML... ");
 	printf("Parsing HTML... ");
 
 
 	//open html file and load into memory
 	//open html file and load into memory
@@ -312,6 +317,9 @@ void htmlparse(){
 	//Convert charset to mysql equivalent
 	//Convert charset to mysql equivalent
 	charset2mysql();
 	charset2mysql();
 	
 	
+	//Filter additional characters *if* required
+	filtervars();	
+	
 	//print body to file
 	//print body to file
 /*	bodyfile = fopen("body.txt","wb");
 /*	bodyfile = fopen("body.txt","wb");
 	fputs(body,bodyfile);
 	fputs(body,bodyfile);
@@ -576,3 +584,36 @@ int canCrawl(int urlSize, char *urltocheck){
 		return 1;
 		return 1;
 	return 0;
 	return 0;
 }
 }
+
+void filtervars(){
+	//Creates a copy of the title, description and body variables with single-quotes filtered out.
+	//will be used for the shard tables, but not on the primary 'windex' table
+	//allows a more restrictive query to be used. As a compromise, searches become agnostic to single-quotes.
+
+	//filter title (nothing here writes a terminator: the copy is NUL-terminated only because htmlparse() zeroes the *_filtered buffers before calling this)
+	int j=0; //write index into the filtered copy; never exceeds the read index i
+	for(int i=0;i<titlesize;i++){
+			if(title[i]!=39){ //39 is the ASCII code of the single-quote character
+				title_filtered[j]=title[i];
+				j++;
+			}
+	}
+	
+	//filter description (same single-quote removal, using descriptionsize as the input length)
+	j=0;
+	for(int i=0;i<descriptionsize;i++){
+			if(description[i]!=39){
+				description_filtered[j]=description[i];
+				j++;
+			}
+	}
+	
+	//filter body (same single-quote removal, using bodysize as the input length)
+	j=0;
+	for(int i=0;i<bodysize;i++){
+			if(body[i]!=39){
+				body_filtered[j]=body[i];
+				j++;
+			}
+	}
+}