Add files via upload

This commit is contained in:
wibyweb 2023-09-23 14:15:42 -04:00 committed by GitHub
parent a72ff77e9d
commit 552bcce204
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 47 additions and 6 deletions

12
c/cr.c
View file

@ -1257,7 +1257,7 @@ int main(int argc, char **argv)
strcat(windexRandUpdate,finalURLnoprefix);
strcat(windexRandUpdate,"', title = '");
if(titlesize > 0 && emptytitle == 0){
strcat(windexRandUpdate,title);
strcat(windexRandUpdate,title_filtered);
}
else{
if(finalURLsize < 111){
@ -1268,9 +1268,9 @@ int main(int argc, char **argv)
}
}
strcat(windexRandUpdate,"', tags = NULL, description = '");
strcat(windexRandUpdate,description);
strcat(windexRandUpdate,description_filtered);
strcat(windexRandUpdate,"', body = '");
strcat(windexRandUpdate,body);
strcat(windexRandUpdate,body_filtered);
strcat(windexRandUpdate,"', worksafe = ");
strcat(windexRandUpdate,worksafe);
strcat(windexRandUpdate,", approver = '");
@ -1404,7 +1404,7 @@ int main(int argc, char **argv)
strcat(windexupdate,finalURLnoprefix);
strcat(windexupdate,"', title = '");
if(titlesize > 0 && emptytitle == 0){
strcat(windexupdate,title);
strcat(windexupdate,title_filtered);
}
else{
if(finalURLsize < 111){
@ -1419,9 +1419,9 @@ int main(int argc, char **argv)
else{
strcat(windexupdate,"', tags = NULL, description = '");
}
strcat(windexupdate,description);
strcat(windexupdate,description_filtered);
strcat(windexupdate,"', body = '");
strcat(windexupdate,body);
strcat(windexupdate,body_filtered);
strcat(windexupdate,"', worksafe = ");
strcat(windexupdate,worksafe);
//strcat(windexupdate,", approver = '");

View file

@ -24,6 +24,7 @@ static char filename[] = "page.out";
char window[window_len],windowWithSpaces[window_len],charset[charset_len+1],mysqlcharset[mysqlcharset_len+1],title[title_len+1],keywords[keywords_len+1],description[description_len+1],robots[robots_len+1],body[body_len+1];
char urlList[urlList_len+1],strURL[strURL_len+1],urlListShuffled[urlList_len+1],urlListHoldShuffled[urlList_len+1];
char title_filtered[title_len+1], body_filtered[body_len+1], description_filtered[description_len+1];
int titlefound=0,charsetfound=0,descriptionfound=0,keywordsfound=0,robotsfound=0,nofollow=0,noindex=0,scriptfound=0,stylefound=0,urlFound=0,urlTagFound=0,numURL=0,emptytitle=1,spaces=0,seeded=0,num_stylesheets=0,num_scripts=0,getURLs=1;
long charsetsize=0,titlesize=0,keywordssize=0,descriptionsize=0,robotssize=0,bodysize=0;
@ -34,6 +35,7 @@ int canCrawl(int urlSize, char *urltocheck);
void shuffleURLs(int iterations, long urlListSize);
void sqlsafe();
void charset2mysql();
void filtervars();
FILE *f;
char *fileStr;
@ -62,6 +64,9 @@ void htmlparse(){
memset(strURL,0,strURL_len+1);
memset(urlListShuffled,0,urlList_len+1);
memset(urlListHoldShuffled,0,urlList_len+1);
memset(title_filtered,0,title_len+1);
memset(body_filtered,0,body_len+1);
memset(description_filtered,0,description_len+1);
printf("Parsing HTML... ");
//open html file and load into memory
@ -312,6 +317,9 @@ void htmlparse(){
//Convert charset to mysql equivalent
charset2mysql();
//Filter additional characters *if* required
filtervars();
//print body to file
/* bodyfile = fopen("body.txt","wb");
fputs(body,bodyfile);
@ -576,3 +584,36 @@ int canCrawl(int urlSize, char *urltocheck){
return 1;
return 0;
}
/*
 * Copies up to `len` bytes from `src` into `dst`, skipping every single-quote
 * character (ASCII 39).
 *
 * dst - destination buffer
 * cap - total size of `dst` in bytes, including room for the terminator
 * src - source bytes; need not be NUL-terminated, only `len` bytes are read
 * len - number of bytes to read from `src`
 *
 * At most `cap - 1` bytes are stored and `dst` is always NUL-terminated when
 * `cap` is positive, so the result does not depend on the caller having
 * zeroed `dst` beforehand.
 */
static void copy_without_single_quotes(char *dst, long cap, const char *src, long len)
{
    long out = 0;

    if (cap <= 0) {
        return;
    }
    for (long in = 0; in < len && out < cap - 1; in++) {
        if (src[in] != '\'') {
            dst[out++] = src[in];
        }
    }
    dst[out] = '\0';
}

/*
 * Creates copies of the title, description and body variables with
 * single-quotes filtered out, stored in title_filtered, description_filtered
 * and body_filtered.
 *
 * The filtered copies will be used for the shard tables, but not on the
 * primary 'windex' table. This allows a more restrictive query to be used;
 * as a compromise, it is agnostic to searches containing single-quotes.
 *
 * Reads titlesize, descriptionsize and bodysize as the number of bytes to
 * scan in the corresponding source buffer.
 */
void filtervars(){
    copy_without_single_quotes(title_filtered, (long)sizeof title_filtered,
                               title, titlesize);
    copy_without_single_quotes(description_filtered, (long)sizeof description_filtered,
                               description, descriptionsize);
    copy_without_single_quotes(body_filtered, (long)sizeof body_filtered,
                               body, bodysize);
}