From ae06ca73ade5ba9769686aeef36488dab4e49dad Mon Sep 17 00:00:00 2001 From: Wiby Date: Thu, 7 Jul 2022 23:48:28 -0400 Subject: [PATCH] Initial commit. --- README.md | 4 + c/abandoned.txt | 0 c/checkrobots.h | 238 + c/cr.c | 1388 ++ c/htmlparse.h | 574 + c/rs.c | 226 + c/rt.c | 306 + c/servers_example.csv | 4 + c/urlparse.h | 296 + db/wiby.sql | 214 + db/wibytemp.sql | 99 + etc/nginx/sites-available/default_example | 270 + go/core/1core.go | 976 + go/core/core.go | 1172 ++ go/core/coreassets/error.html.go | 20 + go/core/coreassets/form.html.go | 45 + go/core/coreassets/json/json.html.go | 19 + go/core/coreassets/json/results.json.go | 13 + go/core/coreassets/results.html.go | 32 + go/core/coreassets/settings/agree.html.go | 33 + go/core/coreassets/settings/gohome.html | 9 + go/core/coreassets/settings/settings.html.go | 75 + go/core/coreassets/surprise.html.go | 10 + html/about/button.gif | Bin 0 -> 2150 bytes html/about/guide.html | 533 + html/about/index.html | 24 + html/about/mug.gif | Bin 0 -> 1465 bytes html/about/pp.html | 12 + html/about/wiby.gif | Bin 0 -> 2150 bytes html/about/wiby.org.gif | Bin 0 -> 2310 bytes html/about/wibyplex.gif | Bin 0 -> 25753 bytes html/accounts/accounts.html.php | 25 + html/accounts/accounts.php | 249 + html/accounts/admin.html.php | 42 + html/accounts/error.html.php | 25 + html/accounts/guardian.html.php | 25 + html/accounts/index.php | 96 + html/accounts/login.html.php | 31 + html/ban/ban.html.php | 25 + html/ban/ban.php | 65 + html/ban/error.html.php | 25 + html/ban/form.html.php | 22 + html/ban/index.php | 94 + html/ban/login.html.php | 31 + html/bulksubmit/bulksubmit.php | 118 + html/bulksubmit/error.html.php | 25 + html/bulksubmit/form.html.php | 59 + html/bulksubmit/index.php | 95 + html/bulksubmit/login.html.php | 31 + html/bulksubmit/submit.html.php | 28 + html/error.html.php | 25 + html/favicon.ico | Bin 0 -> 894 bytes html/feedback/error.html.php | 25 + html/feedback/form.html.php | 40 + html/feedback/index.php | 73 + html/feedback/submit.html.php | 36 + html/form.html.php | 45 + html/grave/error.html.php | 25 + html/grave/graveyard.php | 195 + html/grave/graveyardqueue.html.php | 64 + html/grave/index.php | 95 + html/grave/login.html.php | 31 + html/hash/hashmake.php | 4 + html/index.php | 450 + html/insert/error.html.php | 25 + html/insert/form.html.php | 91 + html/insert/index.php | 94 + html/insert/insert.html.php | 25 + html/insert/insert.php | 102 + html/insert/login.html.php | 31 + html/json/error.html.php | 25 + html/json/form.html.php | 19 + html/json/index.php | 460 + html/json/results.json.php | 22 + html/opensearch.xml | 14 + html/readf/error.html.php | 25 + html/readf/feedback.php | 110 + html/readf/form.html.php | 47 + html/readf/index.php | 96 + html/readf/login.html.php | 31 + html/results.html.php | 41 + html/review/error.html.php | 25 + html/review/index.php | 95 + html/review/login.html.php | 31 + html/review/review.php | 239 + html/review/reviewqueue.html.php | 66 + html/securimage/AHGBold.ttf | Bin 0 -> 144556 bytes html/securimage/LICENSE.txt | 25 + html/securimage/README.FONT.txt | 12 + html/securimage/README.md | 244 + html/securimage/README.txt | 222 + html/securimage/WavFile.php | 1913 ++ html/securimage/audio/.htaccess | 11 + html/securimage/audio/en/0.wav | Bin 0 -> 12694 bytes html/securimage/audio/en/1.wav | Bin 0 -> 11870 bytes html/securimage/audio/en/10.wav | Bin 0 -> 11172 bytes html/securimage/audio/en/11.wav | Bin 0 -> 13328 bytes html/securimage/audio/en/12.wav | Bin 0 -> 13454 bytes 
html/securimage/audio/en/13.wav | Bin 0 -> 18098 bytes html/securimage/audio/en/14.wav | Bin 0 -> 18040 bytes html/securimage/audio/en/15.wav | Bin 0 -> 18168 bytes html/securimage/audio/en/16.wav | Bin 0 -> 20872 bytes html/securimage/audio/en/17.wav | Bin 0 -> 20784 bytes html/securimage/audio/en/18.wav | Bin 0 -> 15962 bytes html/securimage/audio/en/19.wav | Bin 0 -> 17454 bytes html/securimage/audio/en/2.wav | Bin 0 -> 10114 bytes html/securimage/audio/en/20.wav | Bin 0 -> 11946 bytes html/securimage/audio/en/3.wav | Bin 0 -> 11456 bytes html/securimage/audio/en/4.wav | Bin 0 -> 11416 bytes html/securimage/audio/en/5.wav | Bin 0 -> 13852 bytes html/securimage/audio/en/6.wav | Bin 0 -> 14096 bytes html/securimage/audio/en/7.wav | Bin 0 -> 14180 bytes html/securimage/audio/en/8.wav | Bin 0 -> 10908 bytes html/securimage/audio/en/9.wav | Bin 0 -> 12882 bytes html/securimage/audio/en/A.wav | Bin 0 -> 10180 bytes html/securimage/audio/en/B.wav | Bin 0 -> 10528 bytes html/securimage/audio/en/C.wav | Bin 0 -> 13208 bytes html/securimage/audio/en/D.wav | Bin 0 -> 10236 bytes html/securimage/audio/en/E.wav | Bin 0 -> 12024 bytes html/securimage/audio/en/F.wav | Bin 0 -> 11372 bytes html/securimage/audio/en/G.wav | Bin 0 -> 11584 bytes html/securimage/audio/en/H.wav | Bin 0 -> 10960 bytes html/securimage/audio/en/I.wav | Bin 0 -> 10892 bytes html/securimage/audio/en/J.wav | Bin 0 -> 12474 bytes html/securimage/audio/en/K.wav | Bin 0 -> 12486 bytes html/securimage/audio/en/L.wav | Bin 0 -> 10588 bytes html/securimage/audio/en/M.wav | Bin 0 -> 9758 bytes html/securimage/audio/en/MINUS.wav | Bin 0 -> 22254 bytes html/securimage/audio/en/N.wav | Bin 0 -> 10076 bytes html/securimage/audio/en/O.wav | Bin 0 -> 9980 bytes html/securimage/audio/en/P.wav | Bin 0 -> 10320 bytes html/securimage/audio/en/PLUS.wav | Bin 0 -> 23026 bytes html/securimage/audio/en/Q.wav | Bin 0 -> 11060 bytes html/securimage/audio/en/R.wav | Bin 0 -> 10010 bytes html/securimage/audio/en/S.wav | Bin 0 -> 12802 bytes html/securimage/audio/en/T.wav | Bin 0 -> 10820 bytes html/securimage/audio/en/TIMES.wav | Bin 0 -> 21804 bytes html/securimage/audio/en/U.wav | Bin 0 -> 10176 bytes html/securimage/audio/en/V.wav | Bin 0 -> 12036 bytes html/securimage/audio/en/W.wav | Bin 0 -> 16570 bytes html/securimage/audio/en/X.wav | Bin 0 -> 12124 bytes html/securimage/audio/en/Y.wav | Bin 0 -> 13044 bytes html/securimage/audio/en/Z.wav | Bin 0 -> 12138 bytes html/securimage/audio/en/error.wav | Bin 0 -> 288044 bytes html/securimage/audio/noise/check-point-1.wav | Bin 0 -> 1515428 bytes .../audio/noise/crowd-talking-1.wav | Bin 0 -> 1274002 bytes .../audio/noise/crowd-talking-6.wav | Bin 0 -> 1985106 bytes .../audio/noise/crowd-talking-7.wav | Bin 0 -> 1301458 bytes .../securimage/audio/noise/kids-playing-1.wav | Bin 0 -> 1169154 bytes html/securimage/backgrounds/bg3.jpg | Bin 0 -> 15854 bytes html/securimage/backgrounds/bg4.jpg | Bin 0 -> 16438 bytes html/securimage/backgrounds/bg5.jpg | Bin 0 -> 48746 bytes html/securimage/backgrounds/bg6.png | Bin 0 -> 42880 bytes html/securimage/captcha.html | 136 + html/securimage/config.inc.php.SAMPLE | 87 + html/securimage/database/.htaccess | 11 + html/securimage/database/index.html | 1 + html/securimage/database/securimage.sq3 | Bin 0 -> 4096 bytes html/securimage/example_form.ajax.php | 205 + html/securimage/example_form.php | 232 + html/securimage/images/audio_icon.png | Bin 0 -> 1684 bytes html/securimage/images/loading.png | Bin 0 -> 1136 bytes html/securimage/images/refresh.png | Bin 0 -> 4835 
bytes html/securimage/securimage.css | 41 + html/securimage/securimage.js | 252 + html/securimage/securimage.php | 3770 ++++ html/securimage/securimage_play.php | 70 + html/securimage/securimage_play.swf | Bin 0 -> 7833 bytes html/securimage/securimage_show.php | 79 + html/securimage/words/words.txt | 15457 ++++++++++++++++ html/settings/agree.html.php | 33 + html/settings/error.html.php | 25 + html/settings/form.html.php | 77 + html/settings/gohome.html | 9 + html/settings/index.php | 48 + html/styles.css | 16 + html/submit/error.html.php | 25 + html/submit/form.html.php | 68 + html/submit/index.php | 91 + html/submit/submit.html.php | 28 + html/surprise/error.html.php | 25 + html/surprise/index.php | 66 + 182 files changed, 33804 insertions(+) create mode 100755 README.md create mode 100755 c/abandoned.txt create mode 100755 c/checkrobots.h create mode 100755 c/cr.c create mode 100755 c/htmlparse.h create mode 100755 c/rs.c create mode 100755 c/rt.c create mode 100755 c/servers_example.csv create mode 100755 c/urlparse.h create mode 100755 db/wiby.sql create mode 100755 db/wibytemp.sql create mode 100755 etc/nginx/sites-available/default_example create mode 100755 go/core/1core.go create mode 100755 go/core/core.go create mode 100755 go/core/coreassets/error.html.go create mode 100755 go/core/coreassets/form.html.go create mode 100755 go/core/coreassets/json/json.html.go create mode 100755 go/core/coreassets/json/results.json.go create mode 100755 go/core/coreassets/results.html.go create mode 100755 go/core/coreassets/settings/agree.html.go create mode 100755 go/core/coreassets/settings/gohome.html create mode 100755 go/core/coreassets/settings/settings.html.go create mode 100755 go/core/coreassets/surprise.html.go create mode 100755 html/about/button.gif create mode 100755 html/about/guide.html create mode 100755 html/about/index.html create mode 100755 html/about/mug.gif create mode 100755 html/about/pp.html create mode 100755 html/about/wiby.gif create mode 100755 html/about/wiby.org.gif create mode 100755 html/about/wibyplex.gif create mode 100755 html/accounts/accounts.html.php create mode 100755 html/accounts/accounts.php create mode 100755 html/accounts/admin.html.php create mode 100755 html/accounts/error.html.php create mode 100755 html/accounts/guardian.html.php create mode 100755 html/accounts/index.php create mode 100755 html/accounts/login.html.php create mode 100755 html/ban/ban.html.php create mode 100755 html/ban/ban.php create mode 100755 html/ban/error.html.php create mode 100755 html/ban/form.html.php create mode 100755 html/ban/index.php create mode 100755 html/ban/login.html.php create mode 100755 html/bulksubmit/bulksubmit.php create mode 100755 html/bulksubmit/error.html.php create mode 100755 html/bulksubmit/form.html.php create mode 100755 html/bulksubmit/index.php create mode 100755 html/bulksubmit/login.html.php create mode 100755 html/bulksubmit/submit.html.php create mode 100755 html/error.html.php create mode 100755 html/favicon.ico create mode 100755 html/feedback/error.html.php create mode 100755 html/feedback/form.html.php create mode 100755 html/feedback/index.php create mode 100755 html/feedback/submit.html.php create mode 100755 html/form.html.php create mode 100755 html/grave/error.html.php create mode 100755 html/grave/graveyard.php create mode 100755 html/grave/graveyardqueue.html.php create mode 100755 html/grave/index.php create mode 100755 html/grave/login.html.php create mode 100755 html/hash/hashmake.php create mode 100755 html/index.php 
create mode 100755 html/insert/error.html.php create mode 100755 html/insert/form.html.php create mode 100755 html/insert/index.php create mode 100755 html/insert/insert.html.php create mode 100755 html/insert/insert.php create mode 100755 html/insert/login.html.php create mode 100755 html/json/error.html.php create mode 100755 html/json/form.html.php create mode 100755 html/json/index.php create mode 100755 html/json/results.json.php create mode 100755 html/opensearch.xml create mode 100755 html/readf/error.html.php create mode 100755 html/readf/feedback.php create mode 100755 html/readf/form.html.php create mode 100755 html/readf/index.php create mode 100755 html/readf/login.html.php create mode 100755 html/results.html.php create mode 100755 html/review/error.html.php create mode 100755 html/review/index.php create mode 100755 html/review/login.html.php create mode 100755 html/review/review.php create mode 100755 html/review/reviewqueue.html.php create mode 100755 html/securimage/AHGBold.ttf create mode 100755 html/securimage/LICENSE.txt create mode 100755 html/securimage/README.FONT.txt create mode 100755 html/securimage/README.md create mode 100755 html/securimage/README.txt create mode 100755 html/securimage/WavFile.php create mode 100755 html/securimage/audio/.htaccess create mode 100755 html/securimage/audio/en/0.wav create mode 100755 html/securimage/audio/en/1.wav create mode 100755 html/securimage/audio/en/10.wav create mode 100755 html/securimage/audio/en/11.wav create mode 100755 html/securimage/audio/en/12.wav create mode 100755 html/securimage/audio/en/13.wav create mode 100755 html/securimage/audio/en/14.wav create mode 100755 html/securimage/audio/en/15.wav create mode 100755 html/securimage/audio/en/16.wav create mode 100755 html/securimage/audio/en/17.wav create mode 100755 html/securimage/audio/en/18.wav create mode 100755 html/securimage/audio/en/19.wav create mode 100755 html/securimage/audio/en/2.wav create mode 100755 html/securimage/audio/en/20.wav create mode 100755 html/securimage/audio/en/3.wav create mode 100755 html/securimage/audio/en/4.wav create mode 100755 html/securimage/audio/en/5.wav create mode 100755 html/securimage/audio/en/6.wav create mode 100755 html/securimage/audio/en/7.wav create mode 100755 html/securimage/audio/en/8.wav create mode 100755 html/securimage/audio/en/9.wav create mode 100755 html/securimage/audio/en/A.wav create mode 100755 html/securimage/audio/en/B.wav create mode 100755 html/securimage/audio/en/C.wav create mode 100755 html/securimage/audio/en/D.wav create mode 100755 html/securimage/audio/en/E.wav create mode 100755 html/securimage/audio/en/F.wav create mode 100755 html/securimage/audio/en/G.wav create mode 100755 html/securimage/audio/en/H.wav create mode 100755 html/securimage/audio/en/I.wav create mode 100755 html/securimage/audio/en/J.wav create mode 100755 html/securimage/audio/en/K.wav create mode 100755 html/securimage/audio/en/L.wav create mode 100755 html/securimage/audio/en/M.wav create mode 100755 html/securimage/audio/en/MINUS.wav create mode 100755 html/securimage/audio/en/N.wav create mode 100755 html/securimage/audio/en/O.wav create mode 100755 html/securimage/audio/en/P.wav create mode 100755 html/securimage/audio/en/PLUS.wav create mode 100755 html/securimage/audio/en/Q.wav create mode 100755 html/securimage/audio/en/R.wav create mode 100755 html/securimage/audio/en/S.wav create mode 100755 html/securimage/audio/en/T.wav create mode 100755 html/securimage/audio/en/TIMES.wav create mode 100755 
html/securimage/audio/en/U.wav create mode 100755 html/securimage/audio/en/V.wav create mode 100755 html/securimage/audio/en/W.wav create mode 100755 html/securimage/audio/en/X.wav create mode 100755 html/securimage/audio/en/Y.wav create mode 100755 html/securimage/audio/en/Z.wav create mode 100755 html/securimage/audio/en/error.wav create mode 100755 html/securimage/audio/noise/check-point-1.wav create mode 100755 html/securimage/audio/noise/crowd-talking-1.wav create mode 100755 html/securimage/audio/noise/crowd-talking-6.wav create mode 100755 html/securimage/audio/noise/crowd-talking-7.wav create mode 100755 html/securimage/audio/noise/kids-playing-1.wav create mode 100755 html/securimage/backgrounds/bg3.jpg create mode 100755 html/securimage/backgrounds/bg4.jpg create mode 100755 html/securimage/backgrounds/bg5.jpg create mode 100755 html/securimage/backgrounds/bg6.png create mode 100755 html/securimage/captcha.html create mode 100755 html/securimage/config.inc.php.SAMPLE create mode 100755 html/securimage/database/.htaccess create mode 100755 html/securimage/database/index.html create mode 100755 html/securimage/database/securimage.sq3 create mode 100755 html/securimage/example_form.ajax.php create mode 100755 html/securimage/example_form.php create mode 100755 html/securimage/images/audio_icon.png create mode 100755 html/securimage/images/loading.png create mode 100755 html/securimage/images/refresh.png create mode 100755 html/securimage/securimage.css create mode 100755 html/securimage/securimage.js create mode 100755 html/securimage/securimage.php create mode 100755 html/securimage/securimage_play.php create mode 100755 html/securimage/securimage_play.swf create mode 100755 html/securimage/securimage_show.php create mode 100755 html/securimage/words/words.txt create mode 100755 html/settings/agree.html.php create mode 100755 html/settings/error.html.php create mode 100755 html/settings/form.html.php create mode 100755 html/settings/gohome.html create mode 100755 html/settings/index.php create mode 100755 html/styles.css create mode 100755 html/submit/error.html.php create mode 100755 html/submit/form.html.php create mode 100755 html/submit/index.php create mode 100755 html/submit/submit.html.php create mode 100755 html/surprise/error.html.php create mode 100755 html/surprise/index.php diff --git a/README.md b/README.md new file mode 100755 index 0000000..0fd5af0 --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +These are the source files for the Wiby search engine. 
+
+Refer to the installation guide located in /html/about/guide.html
+You can also access it at http://wiby.me/about/guide.html
\ No newline at end of file
diff --git a/c/abandoned.txt b/c/abandoned.txt
new file mode 100755
index 0000000..e69de29
diff --git a/c/checkrobots.h b/c/checkrobots.h
new file mode 100755
index 0000000..2288e25
--- /dev/null
+++ b/c/checkrobots.h
@@ -0,0 +1,238 @@
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+//#include <curl/curl.h> //ubuntu 16
+//#include <curl/easy.h> //ubuntu 16
+#include <curl/curl.h> //ubuntu 20
+#include <curl/easy.h> //ubuntu 20
+
+//gcc checkrobots.c -o checkrobots -lcurl
+
+#define rwindow_len 100
+FILE *robotsfile;
+char *robotsfilestr,robotsurl[1011],rwindow[rwindow_len+1];//+1 leaves room for the null terminator written in checkrobots()
+//char rURLpath[] = "/dumpop/";
+
+size_t write_data_checkrobots(void *ptr, size_t size, size_t nmemb, FILE *stream) {
+	size_t written = fwrite(ptr, size, nmemb, stream);
+	return written;
+}
+int locateInRWindow(char *window, char *birdLower, char *birdUpper, int length);
+
+//int main(int argc, char **argv)
+int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath)
+{
+	if(rURLprefix[0]==0 || rDomain[0]==0 || rURLpath[0]==0)
+		return 1;
+	if(strlen(rDomain)>253)
+		return 0;
+	if(strlen(rURLpath)>500)
+		return 0;
+
+	memset(rwindow,'?',rwindow_len);
+	rwindow[rwindow_len]=0;
+
+	//curl_global_init(CURL_GLOBAL_ALL);
+	CURL *curl;
+	FILE *fp;
+	CURLcode res;
+	memset(robotsurl,0,1011);
+	strcpy(robotsurl,rURLprefix);
+	strcat(robotsurl,rDomain);
+	strcat(robotsurl,"/robots.txt");
+	char outfilename[300];
+	memset(outfilename,0,300);
+	strcpy(outfilename,"robots/");
+	strcat(outfilename,rDomain);
+	strcat(outfilename,".txt");
+	curl = curl_easy_init();
+	long fsize=0,response_code_checkrobots=0;
+	char *finalURL_checkrobots = NULL;
+	int foundfile=0;
+	char rb,rwb;
+	printf("\nChecking robots.txt: ");
+
+	//open robots.txt file and load into memory, or download it if it doesn't exist
+	if(robotsfile = fopen(outfilename, "rb")){
+		fseek(robotsfile, 0, SEEK_END);
+		fsize = ftell(robotsfile);
+		fseek(robotsfile, 0, SEEK_SET); /* same as rewind(f); */
+
+		robotsfilestr = malloc(fsize + 1);
+		if(fread(robotsfilestr, 1, fsize, robotsfile)){}
+		fclose(robotsfile);
+
+		robotsfilestr[fsize] = 0;
+		//printf("%ld",fsize);
+
+		foundfile=1;
+	}else if (curl) {
+		printf("Downloading... ");
+		if(fp = fopen(outfilename,"wb")){
+			//set curl options
+			curl_easy_setopt(curl, CURLOPT_URL, robotsurl);// set URL to get here
+			curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; compatible; WebCrawler; SearchEngine)");
+			curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data_checkrobots);// send all data to this function //
+			curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);// write the page body to this file handle
+			curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);//allow redirects
+			curl_easy_setopt(curl, CURLOPT_TIMEOUT, 60L);
+			curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 55L);
+			curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);//max num of redirects
+			curl_easy_setopt(curl, CURLOPT_MAXFILESIZE, 1000000L);//don't download if over 1MB
+			curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);//0 or 1 to verify ssl
+			res = curl_easy_perform(curl);// get it!
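+			//The transfer result is handled next: curl_easy_getinfo() reads back the
+			//effective URL and HTTP response code, and on anything other than a 200 the
+			//cached robots/<domain>.txt is truncated to an empty file, so a missing or
+			//unreachable robots.txt is treated as "everything permitted" on this and
+			//later visits to the domain.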
+ curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_URL, &finalURL_checkrobots); + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code_checkrobots); + curl_easy_cleanup(curl);// always cleanup + fclose(fp); + if(response_code_checkrobots!=200){ + fp = fopen(outfilename,"wb"); + fclose(fp); + } + }else{ + printf("\nFailed to create file: %s - proceeding anyway.",outfilename); + return 1; + } + } + if(response_code_checkrobots==200 && foundfile==0){ + robotsfile = fopen(outfilename, "rb"); + fseek(robotsfile, 0, SEEK_END); + fsize = ftell(robotsfile); + fseek(robotsfile, 0, SEEK_SET); // same as rewind(f); + + robotsfilestr = malloc(fsize + 1); + if(fread(robotsfilestr, 1, fsize, robotsfile)){} + fclose(robotsfile); + + robotsfilestr[fsize] = 0; + //printf("%ld",fsize); + } + //parse the robots.txt file + if(response_code_checkrobots==200 || foundfile==1 && fsize > 11){ + int foundUserAgent=0,foundDisallow=0,foundAllow=0,comment=0,match=0; + int k=0,lenurlpath=strlen(rURLpath),rwupdated=0,result=1; + for(int i=0;i=lenurlpath) + match=0; + k++; + } + if((i==fsize-1 && match==1) || ((rwb==10 || rwb==13) && match==1)){ + result=0; + foundDisallow=0; + } + if(match==0) + foundDisallow=k=0; + } + //check if path is allowed in url + if(rwupdated==1 && foundAllow==1){ + if(rwb!=10 && rwb!=13){ + //get path + if(k=lenurlpath) + match=0; + k++; + } + if((i==fsize-1 && match==1) || ((rwb==10 || rwb==13) && match==1)){ + printf("Permitted."); + return 1; + } + if(match==0) + foundAllow=k=0; + } + + if(foundUserAgent==1 && rwupdated && locateInRWindow(rwindow,"disallow:","DISALLOW:",9)==1){ + foundDisallow=1; + foundAllow=0; + k=0; + //printf("\nfound disallow"); + } + if(foundUserAgent==1 && rwupdated && locateInRWindow(rwindow,"\nallow:","\nALLOW:",7)==1){ + foundDisallow=0; + foundAllow=1; + k=0; + //printf("\nfound allow"); + } + } + rwupdated=0; + } + + if(result==0){ + printf("Denied."); + return 0; + }else{ + printf("Permitted."); + return 1; + } + } + printf("Permitted."); + return 1; +} + + +int locateInRWindow(char *window, char *birdLower, char *birdUpper, int length) +{ + int start = rwindow_len-length; + for(int i=0;i +#include +#include +#include +//#include //ubuntu 16 +//#include //ubuntu 16 +#include //ubuntu 20 +#include //ubuntu 20 +#include "htmlparse.h" +#include "urlparse.h" +#include "checkrobots.h" +#include + +#define url_fromlist_arraylen 102400 +#define url_insert_arraylen 1024000 + +char /**title, *keywords, *description, *page,*/ *windexinsert, *windexupdate, *titlecheckinsert, urlPath_finalURL[1001], folderPath_finalURL[1001], urlPrefix_finalURL[1001], urlNPNP_finalURL[1001], strDepth[101], url_fromlist[url_fromlist_arraylen], url_insert[url_insert_arraylen], previousfail[5][1001]; + +void finish_with_error(MYSQL *con) +{ + fprintf(stderr, "%s\n", mysql_error(con)); + mysql_close(con); + exit(1); +} +int isnum(char *source){ + int sourcelength = strlen(source); + for(int i=0;i < sourcelength; i++){ + if(source[i] < 48 || source[i] > 57){ + return 0; + } + } + return 1; +} +size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) { + size_t written = fwrite(ptr, size, nmemb, stream); + return written; +} + +int main(int argc, char **argv) +{ + int id_assigned=0; + if(argc == 2 && isnum(argv[1])==1){ + id_assigned=1; + }else if(argc >= 2){ + printf("\nWiby Web Crawler\n\nUsage: cr Crawler_ID\n\nThe indexqueue may have each page assigned a crawler ID. The ID is assigned when you specify to the Refresh Scheduler the total number of crawlers you are running. 
The scheduler will assign pages in round-robin order a crawler ID within the range of that total.\n\nExample: If you have two crawlers running, then you should specify the first with an ID of 1, and the second with and ID of 2. Each crawler will crawl pages in the indexqueue with its corresponding ID.\n\nYou can also not assign an ID, and in that case the crawler will ignore the ID assignments. So if you have only one crawler running, assigning an ID is optional. Don't run multiple crawlers without assigning ID's.\n\n"); + exit(0); + } + + long int previousID[5] = {0, 1, 2, 3, 4}; + int sanity = 1; + + for(int i=0;i<5;i++){ + previousfail[i][0]=0; + } + + while(1) + { + //printf("MySQL client version: %s\n", mysql_get_client_info()); + int alreadydone = 0, permitted=1; + //allocates or initialises a MYSQL object + MYSQL *con = mysql_init(NULL); + + if (con == NULL) + { + finish_with_error(con); + } + + //establish a connection to the database. We provide connection handler, host name, user name and password parameters to the function. The other four parameters are the database name, port number, unix socket and finally the client flag + if (mysql_real_connect(con, "localhost", "crawler", "seekout", NULL, 0, NULL, 0) == NULL) + { + finish_with_error(con); + } + + if (mysql_query(con, "use wiby;")) + { + finish_with_error(con); + } + + if (mysql_query(con, "SET CHARSET utf8;")) + { + finish_with_error(con); + } + + if(id_assigned == 0){ + if (mysql_query(con, "SELECT id, url, worksafe, approver, surprise, updatable, task, crawl_tree, crawl_family, crawl_depth, crawl_pages, crawl_type, crawl_repeat, force_rules FROM indexqueue limit 1;")) + { + finish_with_error(con); + } + }else{ + char indexqueuequery[2001]; + memset(indexqueuequery,0,2001); + strcpy(indexqueuequery,"SELECT id, url, worksafe, approver, surprise, updatable, task, crawl_tree, crawl_family, crawl_depth, crawl_pages, crawl_type, crawl_repeat, force_rules FROM indexqueue WHERE crawler_id = '"); + strcat(indexqueuequery,argv[1]); + strcat(indexqueuequery,"' LIMIT 1;"); + if (mysql_query(con, indexqueuequery)) + { + finish_with_error(con); + } + } + + //We get the result set using the mysql_store_result() function. The MYSQL_RES is a structure for holding a result set + MYSQL_RES *result = mysql_store_result(con); + + if(result == NULL) + { + finish_with_error(con); + } + + //get the number of fields (columns) in the table + //int num_fields = mysql_num_fields(result); + + //We fetch the rows and print them to the screen. + /*MYSQL_ROW row; + while (row = mysql_fetch_row(result)) + { + for(int i=0; i 4){ + if(url[4]==':' && (url[3]=='p' || url[3]=='P')) + http = 7; + } + if(urlsize > 5){ + if(url[5]==':' && (url[4]=='s' || url[4]=='S')) + https = 8; + } + if(urlsize > 11){ + if((url[7]=='w' || url[7]=='W') && (url[8]=='w' || url[8]=='W') && ((url[9]=='w' || url[9]=='W') || url[9]=='1' || url[9]=='2' || url[9]=='3') && url[10]=='.' ){ + httpwww = 11; + http = https = 0; + } + if(url[7]=='/' && (url[8]=='w' || url[8]=='W') && (url[9]=='w' || url[9]=='W') && ((url[9]=='w' || url[9]=='W') || url[9]=='1' || url[9]=='2' || url[9]=='3') && url[11]=='.' 
){ + httpswww = 12; + http = https = 0; + } + } + + //set the prefix + + if(http > 0) strcat(prefix,"://"); + else if(https > 0) strcat(prefix,"s://"); + else if(httpwww > 0) strcat(prefix,"://www."); + else if(httpswww > 0) strcat(prefix,"s://www."); + + int prefixsize = httpswww+httpwww+https+http; + char urlnoprefix[urlsize-prefixsize+1]; + char urlnopathnoprefix[urlsize-prefixsize+1]; + memset(urlnoprefix,0,urlsize-prefixsize+2); + memset(urlnopathnoprefix,0,urlsize-prefixsize+2); + int urlcount=0,urlnoprefixcount=0,urlnopathnoprefix_done=0; + + //store the url without prefix to urlnoprefix + while(urlcount < urlsize+1) + { + if(urlcount>prefixsize-1) + { + urlnoprefix[urlnoprefixcount]=url[urlcount]; + //get urlnopath + if(url[urlcount] != '/' && urlnopathnoprefix_done==0){ + urlnopathnoprefix[urlnoprefixcount]=url[urlcount]; + }else{ + urlnopathnoprefix_done=1; + } + urlnoprefixcount++; + } + urlcount++; + } + + //check for '/' at end of url. it may be already indexed without that so we need to account for it. + //int urlnoprefixlength = strlen(urlnoprefix); + int slashfound = 0; + char urlnoprefixnoslash[urlnoprefixcount]; + memset(urlnoprefixnoslash,0,urlnoprefixcount); + if(urlnoprefix[urlnoprefixcount-1] == '/') + { + strncpy(urlnoprefixnoslash,urlnoprefix,urlnoprefixcount-1); + slashfound = 1; + } + //printf("\nurlnoprefix: %s\n",urlnoprefix); + + printf("Checking if page already exists in index... "); + int idexistsalready = 0; + char *idexistsvalue; + char checkurl[urlnoprefixcount*24+1000]; + memset(checkurl,0,urlnoprefixcount*24+1000); + if(task == 0 || task[0] == '2'){//index request did not come from refresh scheduler, or is an autocrawl url + //strcpy(checkurl,"SELECT id,updatable,title,enable,fault,url FROM windex WHERE url = 'http://"); //replace this with a simple check for url_noprefix column match + strcpy(checkurl,"SELECT id,updatable,title,enable,fault,url FROM windex WHERE url_noprefix = '"); + if(slashfound==0) + { + strcat(checkurl,urlnoprefix); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"/"); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"/index.html"); + strcat(checkurl,"' OR url_noprefix = '/index.htm"); + strcat(checkurl,"';"); + } + else + { + strcat(checkurl,urlnoprefix); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefixnoslash); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"index.html"); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"index.htm"); + strcat(checkurl,"';"); + } + }else{ + strcpy(checkurl,"SELECT id,updatable,title,enable,fault,url FROM windex WHERE url = '"); + strcat(checkurl,url); + strcat(checkurl,"';"); + } + + if (mysql_query(con, checkurl)) + { + finish_with_error(con); + } + + //We get the result set using the mysql_store_result() function. 
The MYSQL_RES is a structure for holding a result set + MYSQL_RES *resulturlcheck = mysql_store_result(con); + + if(resulturlcheck == NULL) + { + finish_with_error(con); + } + + //grab the first entry (fifo) + printf("Found ID "); + row = mysql_fetch_row(resulturlcheck); + char updatedefault[] = "1"; + char *updatableOldDBval = updatedefault; + char *enableOldDBval = updatedefault; + char *dbtitle; + char *fault; + char *dburl; + + //Catalog the previous crawl attempts (to see if they are all for the same page - which would be a bad sign) + previousID[4] = previousID[3]; + previousID[3] = previousID[2]; + previousID[2] = previousID[1]; + previousID[1] = previousID[0]; + + if(row == NULL) + { + printf("null"); + previousID[0] = -1; + } + else { + printf("%s",row[0]); + idexistsalready = 1; + idexistsvalue = row[0]; + previousID[0] = atoi(row[0]); + updatableOldDBval = row[1]; + dbtitle = row[2]; + enableOldDBval = row[3]; + fault = row[4]; + dburl=row[5]; + if(task != 0 && task[0]=='2') + alreadydone=1; + } + + //Log duplicate rows (they shouldn't exist) + int num_rows = mysql_num_rows(resulturlcheck); + if(num_rows > 1){ + FILE *duplicates = fopen("duplicates.txt", "a"); + fputs (dburl,duplicates); + fputs ("\r\n",duplicates); + fclose(duplicates); + } + + //check robots.txt file for this domain + urlparse(url); + permitted = checkrobots(prefix,rootdomain,urlPath); + + int failedcrawl=0; + if(task != 0 && task[0]=='2' && alreadydone==0 && permitted==1){ + //see if url failed to crawl last time (when link crawling) + //as it might come up multiple times during crawl of website, should avoid recrawling it + for(int i=0;i<5;i++){ + if(strcasecmp(previousfail[i], urlnoprefix)==0){ + sanity=0; + failedcrawl=1; + break; + } + } + if(sanity==1) + sleep(1);//do link crawling slowly + } + + //Does this crawl attempt, along with the last 4 have the same ID? There is possibly a duplicate db entry, or some other problem. + if(previousID[0] != -1 && alreadydone==0 && failedcrawl==0){ + if(previousID[0] == previousID[4] && previousID[0] == previousID[3] && previousID[0] == previousID[2] && previousID[0] == previousID[1]){ + sanity = 0; + printf("\nWARNING: Last 5 crawl attempts are all for the same page. Will not continue crawling in this situation. Is the same page being submitted over and over? Also, duplicate table entries of the same URL in windex can cause this behavior. Check duplicates.txt"); + }else{ + sanity = 1; + } + + }else{ + sanity = 1; + } + + //printf("\n\n%ld, %ld, %ld, %ld, %ld\n",previousID[0],previousID[1],previousID[2],previousID[3],previousID[4]); + + //see if the server will accept http only connections on older browsers, change url to HTTP only: + char urlHTTP[strlen(url)+100]; + memset(urlHTTP,0,strlen(url)+100); + strcpy(urlHTTP,"http"); + if(http > 0 || https > 0){ + strcat(urlHTTP,"://"); + } + else if(httpwww > 0 || httpswww > 0){ + strcat(urlHTTP,"://www."); + } + strcat(urlHTTP,urlnoprefix); + + if(updatableOldDBval[0] != '0' && enableOldDBval[0] != '0' && sanity == 1 && alreadydone==0 && permitted==1) + { + printf("\nAttempt HTTP connection: %s",urlHTTP); + printf("\nDownloading page... 
"); + //===============do the curl (download the webpage)===================== + //curl_global_init(CURL_GLOBAL_ALL); + CURL *curl; + FILE *fp; + CURLcode res; + char outfilename[FILENAME_MAX] = "page.out"; + curl = curl_easy_init(); + long size=0; + char *finalURL = NULL; + long response_code; + int finalURLsize = 0,urltoolong=0; + if (curl) { + fp = fopen(outfilename,"wb"); + //Get file size + //fseek(fp, 0L, SEEK_END); + //size = ftell(fp); + //set curl options + curl_easy_setopt(curl, CURLOPT_URL, urlHTTP);// set URL to get here + curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; WebCrawler; SearchEngine)"); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);// send all data to this function // + curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);// write the page body to this file handle + curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);//allow redirects + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 60L); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 55L); + curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);//max num of redirects + curl_easy_setopt(curl, CURLOPT_MAXFILESIZE, 5000000L);//don't download if over 5MB + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);//0 or 1 to verify ssl + //curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);//set verbose + res = curl_easy_perform(curl);// get it! + //if(res == CURLE_OK) {//get final redirect url //-- don't check for this, causes segfault if "transfer closed with outstanding read data remaining" + curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_URL, &finalURL); + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); + + if(finalURL){ + printf("Effective URL: %s\nResponse: %ld, ", finalURL,response_code); + finalURLsize = strlen(finalURL); + } + + //curl_easy_cleanup(curl); //cleanup moved further down because finalURL is needed at insert + + //get file size + fseek(fp, 0L, SEEK_END); + size = ftell(fp); + + fclose(fp); + } + + if(finalURLsize>500){ + urltoolong=1; + printf("\nURL is too long"); + } + + char finalURLnoprefix[finalURLsize-prefixsize+100]; + char httpAllow[] = "0"; + memset(finalURLnoprefix,0,finalURLsize-prefixsize+100); + int updatereserve=0; + char idReserve[100]; + + if(urltoolong==0){ + //see if server permitted an http connection + if(finalURL != NULL){ + if(finalURL[4]==':') + httpAllow[0] = '1'; + } + else if(http > 0 || httpwww > 0){ + httpAllow[0] = '1'; + } + + //Remove the prefix from the final URL, to store into url_noprefix + //find out if its http or https or http://www. or https://www. + httpwww=httpswww=http=https=0; + + if(finalURLsize > 4){ + if(finalURL[4]==':') + http = 7; + if(finalURL[4]=='s' || finalURL[4]=='S') + https = 8; + } + if(finalURLsize > 11){ + if((finalURL[7]=='w' || finalURL[7]=='W') && (finalURL[8]=='w' || finalURL[8]=='W') && (finalURL[9]=='w' || finalURL[9]=='W') && finalURL[10]=='.' ){ + httpwww = 11; + http = https = 0; + } + if(finalURL[7]=='/' && (finalURL[8]=='w' || finalURL[8]=='W') && (finalURL[9]=='w' || finalURL[9]=='W') && (finalURL[10]=='w' || finalURL[10]=='W') && finalURL[11]=='.' 
){ + httpswww = 12; + http = https = 0; + } + } + + prefixsize = httpswww+httpwww+https+http; + urlcount=urlnoprefixcount=0; + + //store the url without prefix to urlnoprefix + while(finalURL[urlcount] != 0){ + if(urlcount>prefixsize-1) + { + finalURLnoprefix[urlnoprefixcount]=finalURL[urlcount]; + urlnoprefixcount++; + } + urlcount++; + } + + //Double check that the URL is in fact not in the DB, by also searching for the effective URL from libcurl and its url in the table + int foundindoublecheck=0; + if(idexistsalready == 0){ + mysql_free_result(resulturlcheck); + char doublecheckurl[finalURLsize+100]; + memset(doublecheckurl,0,finalURLsize+100); + strcpy(doublecheckurl,"SELECT id,updatable,title,enable,fault,url FROM windex WHERE url = '"); + strcat(doublecheckurl,finalURL); + strcat(doublecheckurl,"';"); + if (mysql_query(con, doublecheckurl)) + { + finish_with_error(con); + } + resulturlcheck = mysql_store_result(con); + if(resulturlcheck == NULL) + { + finish_with_error(con); + } + row = mysql_fetch_row(resulturlcheck); + if(row != NULL) + { + printf("\nDoublechecked effective URL in windex, found ID %s",row[0]); + idexistsalready = 1; + idexistsvalue = row[0]; + previousID[0] = atoi(row[0]); + updatableOldDBval = row[1]; + dbtitle = row[2]; + enableOldDBval = row[3]; + fault = row[4]; + dburl=row[5]; + if(task != 0 && task[0]=='2') + alreadydone=1; + foundindoublecheck=1; + } + //Log duplicate rows (they shouldn't exist) + num_rows = mysql_num_rows(resulturlcheck); + if(num_rows > 1){ + FILE *duplicates = fopen("duplicates.txt", "a"); + fputs (dburl,duplicates); + fputs ("\r\n",duplicates); + fclose(duplicates); + } + //Does this crawl attempt, along with the last 4 have the same ID? There is possibly a duplicate db entry, or some other problem. + if(previousID[0] != -1){ + if(previousID[0] == previousID[4] && previousID[0] == previousID[3] && previousID[0] == previousID[2] && previousID[0] == previousID[1]){ + printf("\nWARNING: Last 5 crawl attempts are all for the same page. Will not continue crawling in this situation. Is the same page being submitted over and over? Also, duplicate table entries of the same URL in windex can cause this behavior. Check duplicates.txt"); + exit(0); + } + } + } + + //if doing an update when using multiple crawlers, reserve the id and verify the URL is still associated with it + if(alreadydone==0 && id_assigned==1 && idexistsalready==1){ + if (mysql_query(con, "use wibytemp;")) + { + finish_with_error(con); + } + memset(idReserve,0,100); + strcpy(idReserve,"INSERT into reserve_id (id) VALUES ("); + strcat(idReserve,idexistsvalue); + strcat(idReserve,");"); + if(mysql_query(con, idReserve)) + { + printf("\nID is already reserved, will try again later. 
Clearing old reservations..."); + if(mysql_query(con, "DELETE FROM reserve_id WHERE time < NOW() - INTERVAL 10 MINUTE")){ + finish_with_error(con); + }else{ + printf(" Done."); + } + alreadydone=1; + } + //back to wiby database + if (mysql_query(con, "use wiby;")) + { + finish_with_error(con); + } + updatereserve=1; + if(alreadydone==0){ + //check that the url being updated is still assigned to that ID + memset(checkurl,0,urlnoprefixcount*24+1000); + if(task != 0 && task[0] == '1'){ + strcpy(checkurl,"SELECT id FROM windex WHERE url = '"); + strcat(checkurl,url); + strcat(checkurl,"';"); + }else{ + if(foundindoublecheck==0){ + strcpy(checkurl,"SELECT id FROM windex WHERE url_noprefix = '"); + if(slashfound==0) + { + strcat(checkurl,urlnoprefix); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"/"); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"/index.html"); + strcat(checkurl,"' OR url_noprefix = '/index.htm"); + strcat(checkurl,"';"); + }else{ + strcat(checkurl,urlnoprefix); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefixnoslash); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"index.html"); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"index.htm"); + strcat(checkurl,"';"); + } + }else{ + strcpy(checkurl,"SELECT id FROM windex WHERE url = '"); + strcat(checkurl,finalURL); + strcat(checkurl,"';"); + } + } + //query db + if (mysql_query(con, checkurl)) + { + finish_with_error(con); + } + MYSQL_RES *resulturlcheck = mysql_store_result(con); + if(resulturlcheck == NULL) + { + finish_with_error(con); + } + //grab the first entry (fifo) + char *URLcheckID; + MYSQL_ROW rowURLCheck = mysql_fetch_row(resulturlcheck); + if(rowURLCheck != NULL) + { + URLcheckID = rowURLCheck[0]; + } + if(URLcheckID != 0 && atoi(URLcheckID) != atoi(idexistsvalue)){ + printf("\nID was already reserved, will try again later."); + alreadydone=1; + } + } + } + } + //=====================Extract text from HTML file======================= + if(size < 5000000 && urltoolong==0 && alreadydone==0) + { + //switch on/off hyperlink collecting (if refresh is from link crawler, or from regular refresh while crawl_repeat is on, or during manual submission when appropriate limits are set) + if((task != 0 && task[0]=='2' && (n_crawl_depth > 0 || n_crawl_depth < 0) && (n_crawl_pages > 0 || n_crawl_pages < 0)) || (task==0 && (n_crawl_depth > 0 || n_crawl_depth < 0) && (n_crawl_pages > 0 || n_crawl_pages < 0)) || (task != 0 && task[0]=='1' && crawl_repeat != 0 && crawl_repeat[0]=='1' && (n_crawl_pages > 0 || n_crawl_pages < 0))){ + getURLs=1; + }else{ + getURLs=0; + } + + htmlparse(); + + //need the finalURL path info also + urlparse(finalURL); + memset(urlPath_finalURL,0,1001); + strcpy(urlPath_finalURL,urlPath); + memset(folderPath_finalURL,0,1001); + strcpy(folderPath_finalURL,folderPath); + memset(urlPrefix_finalURL,0,1001); + strcpy(urlPrefix_finalURL,prefix_fromlist); + memset(urlNPNP_finalURL,0,1001); + strcpy(urlNPNP_finalURL,urlnopathnoprefix_fromlist); + + if(urlPrefix_finalURL[0]==0 || urlNPNP_finalURL[0]==0 || urlPath_finalURL[0]==0) + noindex = 1; + + }else{ + noindex = 1; + } + + //check if rules are enforced (only for pages that are autocrawled) + if(force_rules != 0 && force_rules[0]=='1' && task != 0 && task[0]=='2' && noindex == 0){ + if(num_scripts > 2 || num_stylesheets > 1) + noindex = 1; + printf("\nFailed rule 
check"); + } + + int skip = 0, titlechanged = 0, escape = 0, escapetotal = 0, redirected = 0; + //Check if noindex and size + if(((noindex == 0 /*&& bodysize < 1900000*/ && bodysize > 10) || (noindex == 0 /*&& bodysize < 1900000*/ && descriptionsize > 10)) && response_code == 200 && alreadydone==0) + { + //=================Allocate memory for the parsed text from htmlparse() + //title = (char*)calloc(titlesize+1,sizeof(char)); + //keywords = (char*)calloc(keywordssize+1,sizeof(char)); + //description = (char*)calloc(descriptionsize+1,sizeof(char)); + //page = (char*)calloc(bodysize+1,sizeof(char)); + windexinsert = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+1001,sizeof(char)); + windexupdate = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+1001,sizeof(char)); + titlecheckinsert = (char*)calloc(finalURLsize+titlesize+1001,sizeof(char)); + + /*if(title == NULL || keywords == NULL || description == NULL || page == NULL || windexinsert == NULL || windexupdate == NULL) + { + printf("\nError allocating memory for webpage"); + //cleanup sql stuff + mysql_free_result(resulturlcheck); + mysql_free_result(result); + mysql_close(con); + exit(0); + }*/ + + + //Check if this is a new page: check if the title found in windex is the same as the parsed title. If not, put the page back into review. + int dbtitlesize = 0,titlecheckTitleSize = 0, dbNoTitle=0,extrapos=0; + if(idexistsalready==1) + { + //going to insert the crawled title into a "titlecheck" table with the url for reference, then we're going to read back the + //title and count the number of bytes vs what was read from dbtitlesize to determine if title changed + //this is because bytes read from db must be the same charset as what is crawled to get a proper count + //unsupported charsets can end up truncating data, giving incorrect title check, this method avoids that issue + + if (mysql_query(con, "use wibytemp;")) + { + finish_with_error(con); + } + //set charset based on crawled page charset tag + if (mysql_query(con, mysqlcharset)) + { + finish_with_error(con); + } + //insert title into wibytemp for comparison + strcpy(titlecheckinsert,"INSERT INTO titlecheck (url,title) VALUES ('"); + strcat(titlecheckinsert,finalURL); + strcat(titlecheckinsert,"','"); + strcat(titlecheckinsert,title); + strcat(titlecheckinsert,"');"); + if (mysql_query(con, titlecheckinsert)) + { + finish_with_error(con); + } + if (mysql_query(con, "SET CHARSET utf8;")) + { + finish_with_error(con); + } + //now read back the title from the database + char checktitle[finalURLsize+dbtitlesize+1000]; + memset(checktitle,0,finalURLsize+dbtitlesize+1000); + strcpy(checktitle,"SELECT title FROM titlecheck WHERE url = '"); + strcat(checktitle,finalURL);strcat(checktitle,"' ORDER BY id DESC;"); + //query db + if (mysql_query(con, checktitle)) + { + finish_with_error(con); + } + MYSQL_RES *resulttitlecheck = mysql_store_result(con); + if(resulttitlecheck == NULL) + { + finish_with_error(con); + } + + //grab the first entry (fifo) + MYSQL_ROW rowTitleCheck = mysql_fetch_row(resulttitlecheck); + char *titlecheckTitle; + int titlecheckTitleSize = 0; + titlecheckTitle = rowTitleCheck[0]; + //printf("\n %s",rowTitleCheck[0]); + + //delete the entry from the table + char titlecheckremove[finalURLsize+1000]; + memset(titlecheckremove,0,finalURLsize+1000); + strcpy(titlecheckremove,"DELETE FROM titlecheck WHERE url ='"); + strcat(titlecheckremove,finalURL);strcat(titlecheckremove,"';"); + if 
(mysql_query(con, titlecheckremove)) + { + finish_with_error(con); + } + + //back to wiby database + if (mysql_query(con, "use wiby;")) + { + finish_with_error(con); + } + + //check if original dburl is now getting redirected from finalurl (should be sent to review) + int finalUrlsize_noprefix, dburlsize_noprefix = 0, finalURL_prefixsize = 0, dburl_prefixsize = 0,dburlsize=strlen(dburl); + if(finalURL[4] == ':'){//if its just a switch from http to https, ignore + finalUrlsize_noprefix = finalURLsize - 7; + finalURL_prefixsize = 7; + }else{ + finalUrlsize_noprefix = finalURLsize - 8; + finalURL_prefixsize = 8; + } + if(dburl[4] == ':'){ + dburlsize_noprefix = dburlsize - 7; + dburl_prefixsize = 7; + }else{ + dburlsize_noprefix = dburlsize - 8; + dburl_prefixsize = 8; + } + if(finalURLsize-finalURL_prefixsize != dburlsize-dburl_prefixsize){ + redirected = 1; + printf("\nIndexed page is being redirected."); + }else{ + for(int i=0;i 0 && emptytitle == 0)) //previous, before db wibytemp titlecheck method + if((dbNoTitle == 0 && dbtitlesize != titlecheckTitleSize) || (dbNoTitle == 1 && titlesize > 0 && emptytitle == 0) || (URL_is_dbtitle == 1 && dbtitlesize != titlecheckTitleSize && titlesize > 0 && emptytitle == 0)) + { + titlechanged = 1; + } + //printf("\n|%s|\n%d\n%d\n%d\n%d\n%d",dbtitle,titlesize,dbtitlesize,extrapos,dbNoTitle,titlechanged); + + //cleanup some sql stuff + mysql_free_result(resulttitlecheck); + } + + if(titlechanged == 0 && redirected == 0) + { + //====================Load the parsed text into windex!================== + + if (mysql_query(con, mysqlcharset))//set charset based on page charset tag + { + finish_with_error(con); + } + + //strcpy(windexinsert,"INSERT INTO windex (url,title,tags,description,body,worksafe,enable,date,approver,surprise,updatable) VALUES ('"); + strcpy(windexinsert,"INSERT INTO windex (url,url_noprefix,title,description,body,worksafe,enable,date,approver,surprise,http,updatable,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat) VALUES ('"); + + strcpy(windexupdate,"UPDATE windex SET url = '"); + + int copiedRandom = 0; + int reserveFail = 0; + char randomreserve[100]; + char *randID; + MYSQL_RES *resultRandID; + if(idexistsalready == 0){//Insert new entry + //For search topics to be evenly discovered by all read-sharded replication servers, new rows must be scattered randomly across the database insead of sequental: + //Existing rows will be randomly selected and copied (inserted) into a new row at the bottom, and the new page will take the ID number of the old one through an update. + //select id from windex where enable = 1 order by rand() limit 1; + //insert into windex (url,title,tags,description,body,surprise,http,updatable,worksafe,enable,date,updated,approver,fault) select url,title,tags,description,body,surprise,http,updatable,worksafe,enable,date,updated,approver,fault from windex where id = 1338; + printf("\nInserting into index... 
"); + char windexRandUpdate[1500]; + memset (windexRandUpdate,0,1500); + if (mysql_query(con, "SELECT id FROM windex WHERE enable = 1 AND tags IS NULL ORDER BY rand() LIMIT 1;")) //must make sure tags are null, since tags are manually added by admin, not crawler, giving that row special visibility + { + finish_with_error(con); + } + resultRandID = mysql_store_result(con); + if (resultRandID==NULL) + { + finish_with_error(con); + } + MYSQL_ROW row = mysql_fetch_row(resultRandID); + if(row != NULL){ + randID = row[0]; + idexistsvalue = row[0]; + } + + //reserve the randomly selected ID when running more than one crawler + if(row != NULL && id_assigned==1){ + if (mysql_query(con, "use wibytemp;")) + { + finish_with_error(con); + } + memset(randomreserve,0,100); + strcpy(randomreserve,"INSERT into reserve_id (id) VALUES ("); + strcat(randomreserve,randID); + strcat(randomreserve,");"); + if (mysql_query(con, randomreserve)) + { + printf("\nID is already reserved. Clearing old reservations..."); + if(mysql_query(con, "DELETE FROM reserve_id WHERE time < NOW() - INTERVAL 10 MINUTE")){ + finish_with_error(con); + }else{ + printf(" Done."); + } + reserveFail=1;//if error: more than one crawler attempted to reserve the same randomly selected ID + } + //back to wiby database + if (mysql_query(con, "use wiby;")) + { + finish_with_error(con); + } + } + + if(row == NULL || reserveFail==1){//if no rows in db yet or fails to reserve an ID (this was the code for idexistsalready == 0 condition before randomization strategy was adopted. + strcat(windexinsert,finalURL);strcat(windexinsert,"','"); + strcat(windexinsert,finalURLnoprefix);strcat(windexinsert,"','"); + //strcat(windexinsert,prefix);strcat(windexinsert,"','"); + if(titlesize > 0 && emptytitle == 0) { + strcat(windexinsert,title); + } + else { + if(finalURLsize < 111){ + strcat(windexinsert,finalURL); + } + else{ + strcat(windexinsert,"Untitled"); + } + } + strcat(windexinsert,"','"); + //if(tagsize > 0) {strcat(windexinsert,keywords);} + //strcat(windexinsert,"','"); + if(descriptionsize > 0) {strcat(windexinsert,description);} + strcat(windexinsert,"','"); + if(bodysize > 0) {strcat(windexinsert,body);} + strcat(windexinsert,"',"); + strcat(windexinsert,worksafe); + strcat(windexinsert,",1,now(),'"); + strcat(windexinsert,approver); + strcat(windexinsert,"',"); + strcat(windexinsert,surprise); + strcat(windexinsert,","); + strcat(windexinsert,httpAllow); + strcat(windexinsert,","); + strcat(windexinsert,updatable); + if(task != 0 && task[0]=='2'){//came from link crawling + strcat(windexinsert,",'"); + strcat(windexinsert,crawl_tree); + strcat(windexinsert,"','"); + strcat(windexinsert,crawl_family); + strcat(windexinsert,"',"); + strcat(windexinsert,crawl_pages); + strcat(windexinsert,","); + strcat(windexinsert,crawl_type); + strcat(windexinsert,","); + strcat(windexinsert,"0"); + }else{ + strcat(windexinsert,","); + strcat(windexinsert,"NULL,"); + strcat(windexinsert,"NULL,"); + strcat(windexinsert,crawl_pages); + strcat(windexinsert,","); + strcat(windexinsert,crawl_type); + strcat(windexinsert,","); + strcat(windexinsert,crawl_repeat); + } + + strcat(windexinsert,");"); + if (mysql_query(con, windexinsert)) + { + finish_with_error(con); + } + } + else{ + strcpy(windexRandUpdate,"INSERT INTO windex (url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault) SELECT 
url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault FROM windex WHERE id = "); + strcat(windexRandUpdate,randID); + //printf("\n%s",windexRandUpdate); + if (mysql_query(con, windexRandUpdate))//will copy random row to bottom, old row will be overwritten with new indexed page, ensures random distribution of content. + { + finish_with_error(con); + } + copiedRandom = 1; + } + } + if(idexistsalready == 1 || copiedRandom == 1){ //update existing entry + + printf("\nUpdating index... "); + strcat(windexupdate,finalURL); + strcat(windexupdate,"', url_noprefix = '"); + strcat(windexupdate,finalURLnoprefix); + strcat(windexupdate,"', title = '"); + if(titlesize > 0 && emptytitle == 0){ + strcat(windexupdate,title); + } + else{ + if(finalURLsize < 111){ + strcat(windexupdate,finalURL); + } + else{ + strcat(windexupdate,"Untitled"); + } + } + //strcat(windexupdate,"', tags = '"); + //strcat(windexupdate,keywords); + strcat(windexupdate,"', description = '"); + strcat(windexupdate,description); + strcat(windexupdate,"', body = '"); + strcat(windexupdate,body); + strcat(windexupdate,"', worksafe = "); + strcat(windexupdate,worksafe); + strcat(windexupdate,", approver = '"); + strcat(windexupdate,approver); + strcat(windexupdate,"', surprise = "); + strcat(windexupdate,surprise); + strcat(windexupdate,", http = "); + strcat(windexupdate,httpAllow); + strcat(windexupdate,", updatable = "); + strcat(windexupdate,updatable); + if(task==0){//didn't come from refresh or link crawling + strcat(windexupdate,", crawl_pages = "); + strcat(windexupdate,crawl_pages); + strcat(windexupdate,", crawl_type = "); + strcat(windexupdate,crawl_type); + strcat(windexupdate,", crawl_repeat = "); + strcat(windexupdate,crawl_repeat); + } + if(copiedRandom == 0)//normal update + strcat(windexupdate,", updated = CURRENT_TIMESTAMP, fault = 0 WHERE id = "); + else + strcat(windexupdate,", updated = CURRENT_TIMESTAMP, date = now(), fault = 0 WHERE id = "); + strcat(windexupdate,idexistsvalue); + strcat(windexupdate,";"); + if (mysql_query(con, windexupdate)) + { + finish_with_error(con); + } + } + //unreserve randomly selected ID + if(id_assigned==1 && idexistsalready==0 && reserveFail==0){ + if (mysql_query(con, "use wibytemp;")) + { + finish_with_error(con); + } + memset(randomreserve,0,100); + strcpy(randomreserve,"DELETE FROM reserve_id where id = "); + strcat(randomreserve,randID); + strcat(randomreserve,";"); + if (mysql_query(con, randomreserve)) + { + finish_with_error(con); + } + //back to wiby database + if (mysql_query(con, "use wiby;")) + { + finish_with_error(con); + } + } + //unreserve ID if doing an update + if(id_assigned==1 && updatereserve==1){ + if (mysql_query(con, "use wibytemp;")) + { + finish_with_error(con); + } + memset(idReserve,0,100); + strcpy(idReserve,"DELETE FROM reserve_id where id = "); + strcat(idReserve,idexistsvalue); + strcat(idReserve,";"); + if(mysql_query(con, idReserve)) + { + finish_with_error(con); + } + //back to wiby database + if (mysql_query(con, "use wiby;")) + { + finish_with_error(con); + } + } + //free result + if(idexistsalready == 0){ + mysql_free_result(resultRandID); + } + + //===================remove the entry from the indexqueue=============== + //printf("\nRemoving from queue..."); + char sqlqueryremove[200]; + memset(sqlqueryremove,0,200); + strcpy(sqlqueryremove,"DELETE FROM indexqueue WHERE id="); + 
strcat(sqlqueryremove,id);strcat(sqlqueryremove,";"); + if (mysql_query(con, sqlqueryremove)) + { + finish_with_error(con); + } + + printf("\n\nSuccess!"); + } + //clear page from memory + /*free(title); free(keywords); free(description); free(page);*/ free(windexinsert); free(windexupdate); free(titlecheckinsert); + } + else + { + skip = 1; + } + + if((skip == 1 || titlechanged == 1 || redirected == 1)){ + //from skip check: if(((noindex == 0 && bodysize < 1900000 && bodysize > 10) || (noindex == 0 && bodysize < 1900000 && descriptionsize > 10)) && response_code == 200 && alreadydone==0) + //printf("\nnoindex: %d\nbodysize: %ld\ndescriptionsize %ld\nresponse_code: %d\nalreadydone: %d\nskip: %d\ntitlechanged: %d\nredirected: %d",noindex,bodysize,descriptionsize,response_code,alreadydone,skip,titlechanged,redirected); + if(skip == 1){ + printf("\nDoesn't want to be indexed, size too big, 404, already done, failed rules, or security issue."); + //log previous failed link crawls + strcpy(previousfail[4],previousfail[3]); + strcpy(previousfail[3],previousfail[2]); + strcpy(previousfail[2],previousfail[1]); + strcpy(previousfail[1],previousfail[0]); + strcpy(previousfail[0],urlnoprefix); + } + printf("\nRemoving from queue..."); + char sqlqueryremove[200]; + memset(sqlqueryremove,0,200); + strcpy(sqlqueryremove,"DELETE FROM indexqueue WHERE id="); + strcat(sqlqueryremove,id);strcat(sqlqueryremove,";"); + + if (mysql_query(con, sqlqueryremove)) + { + finish_with_error(con); + } + if(alreadydone==0){ + if(idexistsalready == 1 && fault[0] == '1') + { + if(crawl_family != 0 && crawl_family[0] !='0'){ + printf("\nPage may no longer exist. Originated from link crawling. Removing from the index."); + FILE *abandoned = fopen("abandoned.txt", "a"); + fputs (url,abandoned); + fputs ("\r\n",abandoned); + fclose(abandoned); + }else{ + printf("\nPage may no longer exist. Moving to review."); + } + memset(sqlqueryremove,0,200); + strcpy(sqlqueryremove,"DELETE FROM windex WHERE id="); + strcat(sqlqueryremove,idexistsvalue);strcat(sqlqueryremove,";"); + if (mysql_query(con, sqlqueryremove)) + { + finish_with_error(con); + } + if(crawl_family == 0 || (crawl_family != 0 && crawl_family[0] =='0')){ + char sqlqueryreview[1001]; + memset(sqlqueryreview,0,1001); + strcpy(sqlqueryreview,"INSERT INTO reviewqueue (url,worksafe) VALUES ('"); + strcat(sqlqueryreview,url);strcat(sqlqueryreview,"',"); + strcat(sqlqueryreview,worksafe);strcat(sqlqueryreview,");"); + if (mysql_query(con, sqlqueryreview)) + { + finish_with_error(con); + } + } + } + else if(idexistsalready == 1 && fault[0] != '1')//mark that there is a fault with the page, crawler will throw it back into review if it happens again + { + printf("\nFault found. Will try again later."); + char sqlqueryfault[250]; + memset(sqlqueryfault,0,250); + strcpy(sqlqueryfault,"UPDATE windex SET updated = CURRENT_TIMESTAMP, fault = 1 WHERE id = "); + strcat(sqlqueryfault,idexistsvalue);strcat(sqlqueryfault,";"); + if (mysql_query(con, sqlqueryfault)) + { + finish_with_error(con); + } + } + else{ + FILE *abandoned = fopen("abandoned.txt", "a"); + fputs (url,abandoned); + fputs ("\r\n",abandoned); + fclose(abandoned); + } + } + + //check if link crawling is specified + //make sure duplicates don't get crawled more than once + //check db if its already indexed too - do this at beginning instead? 
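+
+	//The link-crawling branch below turns the list of URLs collected while parsing the page
+	//into one multi-row INSERT into indexqueue: each newline-separated URL is appended as
+	//another value tuple, relative links are rebuilt against the final URL's prefix, domain
+	//and folder path, external links are only queued when crawl_type allows them, and
+	//crawl_depth is decremented before being passed along to the queued rows.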
+ + //crawl links if refresh is from link crawler, or from regular refresh while crawl_repeat is on, or during manual submission when appropriate limits are set + }else if(nofollow==0 && getURLs==1 && alreadydone==0){ + //cycle through url list, then construct an sql string around it, then insert it to indexqueue; + + //force crawl depth of 1 during a refresh if crawl_repeat is set + if(crawl_repeat != 0 && crawl_repeat[0]=='1' && task != 0 && task[0]=='1'){ + n_crawl_depth=1; + } + + if(n_crawl_depth>0)//below 0 = unlimited depth + n_crawl_depth--; + + memset(strDepth,0,101); + sprintf(strDepth,"%d",n_crawl_depth); + //itoa(n_crawl_depth,strDepth,10); + + memset(url_fromlist,0,url_fromlist_arraylen); + memset(url_insert,0,url_insert_arraylen); + int loopcount=0,elementnum=0,urls=0; + if(id_assigned == 1){ + strcpy(url_insert,"INSERT INTO indexqueue (url,worksafe,approver,surprise,task,crawl_tree,crawl_family,crawl_depth,crawl_pages,crawl_type,crawl_repeat,crawler_id) VALUES ("); + }else{ + strcpy(url_insert,"INSERT INTO indexqueue (url,worksafe,approver,surprise,task,crawl_tree,crawl_family,crawl_depth,crawl_pages,crawl_type,crawl_repeat) VALUES ("); + } + while(urlListShuffled[loopcount]!=0){ + switch(urlListShuffled[loopcount]){ + case '\n' ://see if url can be indexed, if so, add to sql insert statement + + urlparse(url_fromlist); + + //check if internal or external url + int isinternal=1; + if(rootdomain[0]!=0){ + isinternal=0; + }else if(url_fromlist[4]==':' || url_fromlist[5]==':'){ + isinternal=0; + }else if((url_fromlist[0]=='w' || url_fromlist[0]=='W') && (url_fromlist[1]=='w' || url_fromlist[1]=='W') && (url_fromlist[2]=='w' || url_fromlist[2]=='W') && url_fromlist[3]=='.'){ + isinternal=0; + } + int urlNPNP_finalURL_len=strlen(urlNPNP_finalURL); + int isabsolute=0; + if(isinternal==0 && urlNPNP_finalURL_len==strlen(urlnopathnoprefix_fromlist)){ + isinternal=isabsolute=1; + for(int q=0;q1){ + strcat(url_insert,", ("); + } + if(url_fromlist[0]=='/' && url_fromlist[1] != '.'){//can't handle '..' 
otherwise append to insert + strcat(url_insert,"'"); + strcat(url_insert,urlPrefix_finalURL); + strcat(url_insert,urlNPNP_finalURL); + strcat(url_insert,url_fromlist); + strcat(url_insert,"',"); + strcat(url_insert,worksafe); + strcat(url_insert,",'"); + strcat(url_insert,approver); + strcat(url_insert,"',0,2,'"); + if(task==0){ + strcat(url_insert,url); + }else{ + strcat(url_insert,crawl_tree); + } + strcat(url_insert,"','"); + strcat(url_insert,finalURL); + strcat(url_insert,"',"); + strcat(url_insert,strDepth); + strcat(url_insert,","); + strcat(url_insert,crawl_pages); + strcat(url_insert,","); + strcat(url_insert,crawl_type); + strcat(url_insert,","); + strcat(url_insert,"0"); + if(id_assigned == 1){ + strcat(url_insert,","); + strcat(url_insert,argv[1]); + } + strcat(url_insert,")"); + }else if(url_fromlist[0] != '/' && url_fromlist[0] != '.'){ + strcat(url_insert,"'"); + if(isabsolute==0){ + strcat(url_insert,urlPrefix_finalURL); + strcat(url_insert,urlNPNP_finalURL); + strcat(url_insert,folderPath_finalURL); + strcat(url_insert,urlcopy);//scrubed index.html + }else{ + strcat(url_insert,urlcopy); + } + strcat(url_insert,"',"); + strcat(url_insert,worksafe); + strcat(url_insert,",'"); + strcat(url_insert,approver); + strcat(url_insert,"',0,2,'"); + if(task==0){ + strcat(url_insert,url); + }else{ + strcat(url_insert,crawl_tree); + } + strcat(url_insert,"','"); + strcat(url_insert,finalURL); + strcat(url_insert,"',"); + strcat(url_insert,strDepth); + strcat(url_insert,","); + strcat(url_insert,crawl_pages); + strcat(url_insert,","); + strcat(url_insert,crawl_type); + strcat(url_insert,","); + strcat(url_insert,"0"); + if(id_assigned == 1){ + strcat(url_insert,","); + strcat(url_insert,argv[1]); + } + strcat(url_insert,")"); + } + }else if(isinternal==0 && crawl_type != 0 && crawl_type[0] != '0'){//is external link + urls++; + if(urls>1){ + strcat(url_insert,", ("); + } + strcat(url_insert,"'"); + strcat(url_insert,rootdomain); + strcat(url_insert,urlPath); + strcat(url_insert,"',"); + strcat(url_insert,worksafe); + strcat(url_insert,",'"); + strcat(url_insert,approver); + strcat(url_insert,"',0,2,'"); + if(task==0){ + strcat(url_insert,url); + }else{ + strcat(url_insert,crawl_tree); + } + strcat(url_insert,"','"); + strcat(url_insert,finalURL); + strcat(url_insert,"',"); + strcat(url_insert,strDepth); + strcat(url_insert,","); + strcat(url_insert,crawl_pages); + strcat(url_insert,","); + strcat(url_insert,crawl_type); + strcat(url_insert,","); + strcat(url_insert,"0"); + if(id_assigned == 1){ + strcat(url_insert,","); + strcat(url_insert,argv[1]); + } + strcat(url_insert,")"); + } + + memset(url_fromlist,0,url_fromlist_arraylen); + elementnum=0; + loopcount++; + default : + if(loopcount(url_insert_arraylen-10000)) + break; + } + if(urls>0){ + strcat(url_insert,";"); + //insert into db + if (mysql_query(con, url_insert)) + { + finish_with_error(con); + } + } + } + if (curl) + curl_easy_cleanup(curl);// cleanup curl (finalURL used at inserts, thats why we cleanup and the end here + }else{ + if(alreadydone == 0){ + printf("\nPage was flagged as unable to crawl or banned."); + }else if(idexistsalready==1){ + printf("\nPage is already indexed."); + } + printf("\nRemoving from queue..."); + char sqlqueryremove[200]; + memset(sqlqueryremove,0,200); + strcpy(sqlqueryremove,"DELETE FROM indexqueue WHERE id="); + strcat(sqlqueryremove,id); + if (mysql_query(con, sqlqueryremove)) + { + finish_with_error(con); + } + if(idexistsalready==1 && permitted==0){ + printf(" Removing from index..."); + 
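/*
 * Note on the removal below: the DELETE is built with a trailing
 * "AND updatable != '0'", so a page whose updatable flag is 0 (locked against
 * refresh) is never auto-removed here even when it fails the permission check.
 * With a hypothetical id the finished statement reads:
 *
 *   DELETE FROM windex WHERE id=42 AND updatable != '0';
 */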
memset(sqlqueryremove,0,200); + strcpy(sqlqueryremove,"DELETE FROM windex WHERE id="); + strcat(sqlqueryremove,idexistsvalue);strcat(sqlqueryremove," AND updatable != '0'"); + if (mysql_query(con, sqlqueryremove)) + { + finish_with_error(con); + } + } + FILE *abandoned = fopen("abandoned.txt", "a"); + fputs (url,abandoned); + fputs ("\r\n",abandoned); + fclose(abandoned); + } + //cleanup more sql stuff + mysql_free_result(resulturlcheck); + printf(" Awaiting next page in queue...\n\n"); + } + //cleanup more sql stuff + mysql_free_result(result); + mysql_close(con); + + if(empty==1) + sleep(5);//sleep 5 seconds + } + exit(0); +} diff --git a/c/htmlparse.h b/c/htmlparse.h new file mode 100755 index 0000000..856b257 --- /dev/null +++ b/c/htmlparse.h @@ -0,0 +1,574 @@ +//HTMLparse +//Separates text from an HTML file +//Remember to also set sql_mode = "NO_BACKSLASH_ESCAPES" in my.cnf + +#include +#include +#include +#include + +#define window_len 100 +#define charset_len 100 +#define mysqlcharset_len 100 +#define title_len 152 +#define keywords_len 1024 +#define description_len 182 +#define robots_len 100 +#define body_len 81920 +#define urlList_len 102400 +#define strURL_len 102400 + +FILE *bodyfile,*titlefile, *keywordsfile, *descriptionfile, *noindexfile, *nofollowfile, *charsetfile, *urlfile, *shuffledurlfile; + +static char filename[] = "page.out"; + +char window[window_len],windowWithSpaces[window_len],charset[charset_len+1],mysqlcharset[mysqlcharset_len+1],title[title_len+1],keywords[keywords_len+1],description[description_len+1],robots[robots_len+1],body[body_len+1]; +char urlList[urlList_len+1],strURL[strURL_len+1],urlListShuffled[urlList_len+1],urlListHoldShuffled[urlList_len+1]; +int titlefound=0,charsetfound=0,descriptionfound=0,keywordsfound=0,robotsfound=0,nofollow=0,noindex=0,scriptfound=0,stylefound=0,urlFound=0,urlTagFound=0,numURL=0,emptytitle=1,spaces=0,seeded=0,num_stylesheets=0,num_scripts=0,getURLs=1; +long charsetsize=0,titlesize=0,keywordssize=0,descriptionsize=0,robotssize=0,bodysize=0; + +int matchMySQLcharset(int html_charset_length, char *html_charset, int html_match_length, char *html_lowercase_match, char *html_uppercase_match); +int locateInWindow(char *window, char *birdLower, char *birdUpper, int length); +int locateInURL(char *url, char *birdLower, char *birdUpper, int length, int urlSize); +int canCrawl(int urlSize); +void shuffleURLs(int iterations, long urlListSize); +void sqlsafe(); +void charset2mysql(); + +FILE *f; +char *fileStr; +char c; + +void htmlparse(){ + long urlListSize=0; + numURL=0; + int intag=0,incomment=0,inscript=0,instyle=0,inlink=0,putspace=0,spacecount=0; + int urlSize=0,dqcount=0; + titlefound=charsetfound=descriptionfound=keywordsfound=robotsfound=nofollow=noindex=scriptfound=stylefound=0; + charsetsize=titlesize=keywordssize=descriptionsize=robotssize=bodysize=0; + + memset(window,'#',window_len); + window[window_len]=0; + memset(windowWithSpaces,'#',window_len); + windowWithSpaces[window_len]=0; + memset(charset,0,charset_len+1); + memset(mysqlcharset,0,mysqlcharset_len+1); + memset(title,0,title_len+1); + memset(keywords,0,keywords_len+1); + memset(description,0,description_len+1); + memset(robots,0,robots_len+1); + memset(body,0,body_len+1); + memset(urlList,0,urlList_len+1); + memset(strURL,0,strURL_len+1); + memset(urlListShuffled,0,urlList_len+1); + memset(urlListHoldShuffled,0,urlList_len+1); + printf("Parsing HTML... 
"); + + //open html file and load into memory + f = fopen(filename, "rb"); + fseek(f, 0, SEEK_END); + long fsize = ftell(f); + fseek(f, 0, SEEK_SET); /* same as rewind(f); */ + + fileStr = malloc(fsize + 1); + if(fread(fileStr, 1, fsize, f)){}; + fclose(f); + + fileStr[fsize] = 0; + + //Locate the charset, title, description, keywords, robots, body + //must accomodate human error in markup + //must double all single quotes for mysql safety + //dont allow extra whitespace, ignore cr/lf/tabs + //complete it all in one pass + + for(int i=0;i","",8)==1){ + titlefound = 3; + //remove from end of title by inserting null at location of < + titlesize -= 8; + title[titlesize] = 0; + //printf("\n%s",title); + } + } + if(titlefound == 1 && c=='>')//in case of this situation: + titlefound=2; + if(titlefound == 0 && locateInWindow(window,"<title","<TITLE",6)==1){ + titlefound = 1; + } + + //Get Charset + if(charsetfound == 1){ + if(c == '>' || c == '/'){ + charsetfound = 2; + //printf("\n%s",charset); + } + if(charsetfound == 1 && charsetsize < charset_len && c != '"' && c != '\''){ + charset[charsetsize]=c; + charsetsize++; + } + } + if(charsetfound == 0 && locateInWindow(window,"charset=","CHARSET=",8)==1){ + charsetfound = 1; + } + + //Get Description + if(descriptionfound == 1){ + if(c == '>' || c == '/'){ + descriptionfound = 2; + //printf("\n%s",description); + } + if(descriptionfound == 1 && descriptionsize < (description_len-2) && c != '"'){ + description[descriptionsize]=c; + descriptionsize++; + if(c == 39){//check for single quotes and double them up for sql safety + description[descriptionsize]=c; + descriptionsize++; + } + } + } + if(descriptionfound == 0 && locateInWindow(window,"description\"content=","DESCRIPTION\"CONTENT=",20)==1){ + descriptionfound = 1; + } + + //Get Keywords + if(keywordsfound == 1){ + if(c == '>' || c == '/'){ + keywordsfound = 2; + //printf("\n%s",keywords); + } + if(keywordsfound == 1 && keywordssize < (keywords_len-2) && c != '"'){ + keywords[keywordssize]=c; + keywordssize++; + if(c == 39){//check for single quotes and double them up for sql safety + keywords[keywordssize]=c; + keywordssize++; + } + } + } + if(keywordsfound == 0 && locateInWindow(window,"keywords\"content=","KEYWORDS\"CONTENT=",17)==1){ + keywordsfound = 1; + } + + //Get Robots (nofollow, noindex) + if(robotsfound == 1){ + if(c == '>' || c == '/'){ + robotsfound = 2; + //printf("\n%s",robots); + if(locateInWindow(window,"nofollow","NOFOLLOW",8)==1) + nofollow=1; + if(locateInWindow(window,"noindex","NOINDEX",7)==1 || locateInWindow(window,"none","NONE",4)==1) + noindex=nofollow=1; + } + if(robotsfound == 1 && robotssize < robots_len && c != '"' && c != '\''){ + robots[robotssize]=c; + robotssize++; + } + } + if(robotsfound == 0 && locateInWindow(window,"robots\"content=","ROBOTS\"CONTENT=",15)==1){ + robotsfound = 1; + } + + + if(titlefound != 1){ + //Ignore between scripts, styles, and remove all tags, repeated spaces, tabs, cr, lf, null, add a space at end of every tag + if(c=='<'){ + intag = 1; + }else if(c=='>'){ + intag = 0; + putspace = 1; + } + + if(locateInWindow(window,"<!--","<!--",4)==1){ + incomment = 1; + }else if(locateInWindow(window,"-->","-->",3)==1){ + incomment = 0; + } + + if(locateInWindow(window,"<script","<SCRIPT",7)==1){ + inscript = 1; + num_scripts++; + }else if(locateInWindow(window,"</script>","</SCRIPT>",9)==1){ + inscript = 0; + } + + if(locateInWindow(window,"<style","<STYLE",6)==1){ + instyle = 1; + }else if(locateInWindow(window,"</style>","</STYLE>",8)==1){ + 
instyle = 0; + } + + if(locateInWindow(window,"<link","<LINK",5)==1){ + inlink = 1; + }else if(inlink==1 && locateInWindow(window,">",">",1)==1){ + inlink = 0; + } + if(inlink==1){ + if(locateInWindow(window,".css",".CSS",4)==1) + num_stylesheets++; + } + + //Get Body + //exclude remaining tags, comments, scripts, styles, cr, lf, null, tab, add a space after a '>' but only allow one + if(intag == 0 && incomment == 0 && inscript == 0 && instyle == 0 && inlink == 0 && c!= 13 && c != 10 && c != 0 && c != 9 && bodysize < (body_len-2)){ + if(putspace == 1){ + if(spacecount == 0){ + body[bodysize]=32; + bodysize++; + } + spacecount++; + putspace=0; + }else{ + if(c==32) + spacecount++; + else spacecount = 0; + + if(spacecount < 2){ + body[bodysize]=c; + bodysize++; + + if(c == 39){//check for single quotes and double them up for sql safety + body[bodysize]=c; + bodysize++; + } + } + } + } + } + + //Get URL's + if(getURLs==1){ + if(urlFound == 1 && incomment==0 && instyle==0 && inscript==0 && inlink == 0){ + if(c=='"' || c=='\'') + dqcount++; + if((c == '#' && urlSize==0) || (dqcount == 2 && urlSize == 0) || (c == ' ' && urlSize == 0)) + urlFound=urlTagFound=dqcount=0; + if((c == '>' || c == ' ') && urlFound == 1){ + if(canCrawl(urlSize)==0 || (urlSize+urlListSize) >= (urlList_len-1)){ + memset(strURL,0,strURL_len+1); + }else{ + strcat(urlList,strURL); + strcat(urlList,"\n"); + urlListSize+=urlSize+1; + memset(strURL,0,strURL_len+1); + numURL++; + } + urlFound = urlTagFound = urlSize = dqcount = 0; + } + if(urlFound == 1 && urlListSize < (urlList_len-2) && c != '"' && c != '\'' && urlSize < (strURL_len-2)){ + strURL[urlSize]=window[window_len-1]; + urlSize++; + } + if(urlSize==11){ + if(locateInWindow(window,"javascript:","JAVASCRIPT:",11)==1){ + urlFound=urlTagFound=urlSize=dqcount=0; + memset(strURL,0,strURL_len+1); + } + } + } + if(urlFound == 0 && urlTagFound == 0 && incomment == 0 && instyle == 0 && inscript == 0 && inlink == 0 && locateInWindow(windowWithSpaces,"<a ","<A ",3)==1){//sometimes there is something between "<a" and "href" + urlTagFound = 1; + } + if(urlFound == 0 && urlTagFound == 1 && incomment == 0 && instyle == 0 && inscript == 0 && inlink == 0 && locateInWindow(window,"href=","HREF=",5)==1){ + urlFound = 1; + } + } + } + + //Convert charset to mysql equivalent + charset2mysql(); + + //print body to file +/* bodyfile = fopen("body.txt","wb"); + fputs(body,bodyfile); + fclose(bodyfile); + + //print title to file + titlefile = fopen("title.txt","wb"); + fputs(title,titlefile); + fclose(titlefile); + + //print keywords to file + keywordsfile = fopen("keywords.txt","wb"); + fputs(keywords,keywordsfile); + fclose(keywordsfile); + + //print description to file + descriptionfile = fopen("description.txt","wb"); + fputs(description,descriptionfile); + fclose(descriptionfile); + + //print charset to file + charsetfile = fopen("charset.txt","wb"); + fputs(mysqlcharset,charsetfile); + fclose(charsetfile); + + //print noindex to file + noindexfile = fopen("noindex.txt","wb"); + if(noindex==1) + fputs("noindex",noindexfile); + fclose(noindexfile); + + //print nofollow to file + nofollowfile = fopen("nofollow.txt","wb"); + if(nofollow==1) + fputs("nofollow",nofollowfile); + fclose(nofollowfile);*/ + + if(getURLs==1){ + //shuffle order of collected URLs list + shuffleURLs(10,urlListSize); + //printf("\n%s",urlList); + + //print URLs to file +/* urlfile = fopen("url.txt","wb"); + fputs(urlList,urlfile); + fclose(urlfile); + + //print shuffled URLs to file + shuffledurlfile = 
fopen("urlshuffled.txt","wb"); + fputs(urlListShuffled,shuffledurlfile); + fclose(shuffledurlfile);*/ + } + + free(fileStr); + + printf("\nbody: %ld, title: %ld, charset: %ld, description: %ld, keywords: %ld, noindex: %d, nofollow: %d",bodysize,titlesize,charsetsize,descriptionsize,keywordssize,noindex,nofollow); +} + +void shuffleURLs(int iterations, long urlListSize) +{ + if(seeded==0){ + srand(time(NULL)); + seeded=1; + } + + int r1,r2,r1to2; + int urlCount,i,j,k,l; + + if(numURL>2){ + strcpy(urlListHoldShuffled,urlList); + for(int loops=0;loops<iterations;loops++){ + r1 = r1to2 = (rand() % numURL) + 1; + r2 = (rand() % numURL) + 1; + + if(r1>r2){ + r1=r2; + r2=r1to2; + } + if(r1==r2){ + continue; + } + + urlCount=i=j=k=l=0; + + //skip to url number r1 + while(urlCount < r1 /*&& i<urlList_len*/){ + if(urlListHoldShuffled[i]=='\n') + urlCount++; + i++; + } + j=i; + //copy to urlListShuffled starting at j until reaching r2 location + while(urlCount<r2 /*&& j<urlList_len*/){ + urlListShuffled[k]=urlListHoldShuffled[j]; + if(urlListHoldShuffled[j]=='\n') + urlCount++; + j++; + k++; + } + //concat url's before i + while(l<i /*&& k<urlList_len*/){ + urlListShuffled[k]=urlListHoldShuffled[l]; + l++; + k++; + } + //concat url's after k + while(k<urlListSize /*&& k<urlList_len*/){ + urlListShuffled[k]=urlListHoldShuffled[k]; + k++; + } + strcpy(urlListHoldShuffled,urlListShuffled); + } + }else{ + strcpy(urlListShuffled,urlList); + } + +} + +void charset2mysql() +{ + //if no charset specified, use utf8 + if(charsetsize == 0){ + strcpy(mysqlcharset,"SET CHARSET utf8;"); + printf("No Charset found. %s",mysqlcharset); + } + else{ //else, match charset with a proper mysql charset + + if(matchMySQLcharset(charsetsize,charset,5,"utf-8","UTF-8")==1){ + strcpy(mysqlcharset,"SET CHARSET utf8mb4;"); + printf("%s",mysqlcharset); + } + else if(matchMySQLcharset(charsetsize,charset,6,"latin1","LATIN1")==1){ + strcpy(mysqlcharset,"SET CHARSET latin1;"); + printf("%s",mysqlcharset); + } + else if(matchMySQLcharset(charsetsize,charset,9,"shift-jis","SHIFT-JIS")==1){ + strcpy(mysqlcharset,"SET CHARSET cp932;"); + printf("%s",mysqlcharset); + } + else if(matchMySQLcharset(charsetsize,charset,6,"x-sjis","X-SJIS")==1){ + strcpy(mysqlcharset,"SET CHARSET cp932;"); + printf("%s",mysqlcharset); + } + else if(matchMySQLcharset(charsetsize,charset,10,"iso-8859-1","ISO-8859-1")==1){ + strcpy(mysqlcharset,"SET CHARSET latin1;"); + printf("%s",mysqlcharset); + } + else if(matchMySQLcharset(charsetsize,charset,12,"windows-1252","WINDOWS-1252")==1){ + strcpy(mysqlcharset,"SET CHARSET latin1;"); + printf("%s",mysqlcharset); + } + else if(matchMySQLcharset(charsetsize,charset,12,"windows-1251","WINDOWS-1251")==1){ + strcpy(mysqlcharset,"SET CHARSET cp1251;"); + printf("%s",mysqlcharset); + } + else if(matchMySQLcharset(charsetsize,charset,6,"koi8-r","KOI8-R")==1){ + strcpy(mysqlcharset,"SET CHARSET cp1251;"); + printf("%s",mysqlcharset); + } + else if(matchMySQLcharset(charsetsize,charset,6,"euc-kr","EUC-KR")==1){ + strcpy(mysqlcharset,"SET CHARSET euckr;"); + printf("%s",mysqlcharset); + } + else if(matchMySQLcharset(charsetsize,charset,4,"big5","BIG5")==1){ + strcpy(mysqlcharset,"SET CHARSET big5;"); + printf("%s",mysqlcharset); + } + else{ + strcpy(mysqlcharset,"SET CHARSET utf8;"); + printf("Charset mismatch. 
%s",mysqlcharset); + } + } +} + +int matchMySQLcharset(int html_charset_length, char *html_charset, int html_match_length, char *html_lowercase_match, char *html_uppercase_match) +{ + int match = 0; + int i=0; + for(;i<html_match_length;i++){ + if(i > html_charset_length){ + return 0; + } + if(html_charset[i] != 95 && html_charset[i] != 45 && html_lowercase_match[i] != 95 && html_lowercase_match[i] != 45){ // _ or - + if(html_lowercase_match[i] != html_charset[i] && html_uppercase_match[i] != html_charset[i]){ + return 0; + } + } + match = 1; + } + return match; +} + +int locateInWindow(char *window, char *birdLower, char *birdUpper, int length) +{ + int start = window_len-length; + for(int i=0;i<length;i++){ + if(window[start] != birdLower[i] && window[start] != birdUpper[i]){ + return 0; + } + start++; + } + return 1; +} + +int locateInURL(char *url, char *birdLower, char *birdUpper, int length, int urlSize) +{ + long start = urlSize-length; + if(urlSize >= length){ + for(int i=0;i<length;i++){ + if(url[start] != birdLower[i] && window[start] != birdUpper[i]){ + return 0; + } + start++; + } + return 1; + }else{ + return 0; + } +} + +//Check if url can be indexed (allow relative links for html and txt files. Removing this check will add to the queue everything listed including external links. +int canCrawl(int urlSize){ + int numDots=0,numSlash=0; + int slashpos=0,dotspos=0; + int extfound=0,extlocation=0,prefixfound=0; + + for(int i=0;i<urlSize;i++){ + if(urlSize>5 && strURL[i]==':' && i>3){ + if((strURL[0]!='h' && strURL[0]!='H') || (strURL[1]!='t' && strURL[1]!='T') || (strURL[2]!='t' && strURL[2]!='T') || (strURL[3]!='p' && strURL[3]!='P') || (strURL[4]!='s' && strURL[4]!='S' && strURL[4]!=':') || (strURL[5]!=':' && strURL[5]!='/')) + return 0; + prefixfound=1; + } + if(strURL[i]=='?' || strURL[i]=='\\'){ + return 0; + } + if(strURL[i]=='.'){ + numDots++; + } + if(strURL[i]=='/'){ + numSlash++; + } + if(strURL[i]=='.' ){ + extfound=1; + extlocation=i; + } + if(strURL[i]=='/' && extfound==1 && i>extlocation){ + extfound=0; + } + if(prefixfound==1 && numSlash-2<=0){ + extfound=0; + } + } + if(numDots == 0){ + return 1; + } + + //restrict file extensions to these + if(extfound==1 && (locateInURL(strURL,".html",".HTML",5,urlSize)==1 || locateInURL(strURL,".htm",".HTM",4,urlSize)==1 || locateInURL(strURL,".txt",".TXT",4,urlSize)==1 || locateInURL(strURL,".php",".PHP",4,urlSize)==1 || locateInURL(strURL,".asp",".ASP",4,urlSize)==1)){ + return 1; + } + if(extfound==0 ) + return 1; + return 0; +} diff --git a/c/rs.c b/c/rs.c new file mode 100755 index 0000000..9f9df3a --- /dev/null +++ b/c/rs.c @@ -0,0 +1,226 @@ +//wiby refresh scheduler + +#include <mysql.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <ctype.h> + +void finish_with_error(MYSQL *con) +{ + fprintf(stderr, "%s\n", mysql_error(con)); + mysql_close(con); + exit(1); +} +void help(){ + printf("\nWiby Refresh Scheduler\n\nUsage: re Batch_Limit Total_Crawlers\n\nThe refresh scheduler finds pages that need to be refreshed and adds them to the indexqueue to be crawled. It will wait for the batch to complete before adding more.\n\nThere are two arguments you can set, the max number of pages to grab for each batch, and the total number of crawlers running.\n\nIf you set no arguments, it assumes you have one crawler running with an unassigned ID and will set a limit of one page per batch, rechecking if it finishes every 5 seconds. 
This slow paced default is fine for an index of 100k pages or so and will not use much CPU.\n\nIf you have two crawlers running and a batch limit of 100 pages, this is how you would run the scheduler:\n\n./re 100 2\n\nIn that example, each crawler will be assigned 50 pages. Once all 100 have been crawled, another batch will be assigned.\n\nYou can also specify only a batch limit and omit the total number of crawlers, it will then assume one crawler with an unassigned ID by default.\n\nIf you do not specify the number of crawlers, do not assign a number (ID) to the crawler that you have running and do not run more than one crawler.\n\nThe program will sleep for 60 seconds if there are no stale pages found.\n\n"); + exit(0); +} +int isnum(char *source){ + int sourcelength = strlen(source); + for(int i=0;i < sourcelength; i++){ + if(source[i] < 48 || source[i] > 57){ + return 0; + } + } + return 1; +} + +int main(int argc, char **argv) +{ + int wait_batch = 0,n_lim=1,num_cr=0,cr_count=1; + char lim[100] = "1"; + + if(argc == 3 && isnum(argv[2])==1 && isnum(argv[1])==1){ + num_cr = atoi(argv[2]); + n_lim = atoi(argv[1]); + }else if(argc == 2 && isnum(argv[1])==1){ + n_lim = atoi(argv[1]); + }else if(argc > 1){ + help(); + } + if(n_lim > 0 && argc > 1){ + strcpy(lim,argv[1]); + } + + while(1) + { + //allocates or initialises a MYSQL object + MYSQL *con = mysql_init(NULL); + + if (con == NULL) + { + finish_with_error(con); + } + + //establish a connection to the database. We provide connection handler, host name, user name and password parameters to the function. The other four parameters are the database name, port number, unix socket and finally the client flag + if (mysql_real_connect(con, "localhost", "crawler", "seekout", NULL, 0, NULL, 0) == NULL) + { + finish_with_error(con); + } + + if (mysql_query(con, "use wiby")) + { + finish_with_error(con); + } + + //check if indexqueue has rows from a previous batch sent by the scheduler (should not insert more until it's empty) + if (mysql_query(con, "SELECT id FROM indexqueue WHERE task = 1")) + { + finish_with_error(con); + } + + //We get the result set using the mysql_store_result() function. 
The MYSQL_RES is a structure for holding a result set + MYSQL_RES *result = mysql_store_result(con); + + if(result == NULL) + { + finish_with_error(con); + } + + int num_rows = 0; + int re_rows = mysql_num_rows(result); + mysql_free_result(result); + + if(re_rows > 0){ + mysql_close(con); + if(wait_batch == 0){ + printf("\nWaiting for batch to complete...\n\n"); + } + wait_batch = 1; + }else{ + wait_batch = 0; + char querywindex[1000]; + memset(querywindex,0,1000); + strcpy(querywindex,"SELECT id,url,worksafe,approver,surprise,updatable,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules FROM windex WHERE (CASE WHEN updatable = 1 THEN updated < NOW() - INTERVAL 1 WEEK WHEN updatable = 2 THEN updated < NOW() - INTERVAL 1 DAY WHEN updatable = 3 THEN updated < NOW() - INTERVAL 12 HOUR WHEN updatable = 4 THEN updated < NOW() - INTERVAL 6 HOUR WHEN updatable = 5 THEN updated < NOW() - INTERVAL 3 HOUR WHEN updatable = 6 THEN updated < NOW() - INTERVAL 1 HOUR END) AND updatable != 0 AND enable = 1 LIMIT "); + strcat(querywindex,lim); + strcat(querywindex,";"); + //printf("\n%s",querywindex); + + //Get aging windex entries + if (mysql_query(con,querywindex)) + { + finish_with_error(con); + } + + result = mysql_store_result(con); + + if(result == NULL) + { + finish_with_error(con); + } + + //get the number of fields (columns) in the table + //int num_fields = mysql_num_fields(result); + num_rows = mysql_num_rows(result); + + MYSQL_ROW row; + + while(row = mysql_fetch_row(result)){ + printf("----------------------------------------------------------\nRefresh:"); + + //Store data in first row into variables + char *id = row[0]; + char *url = row[1]; + char *worksafe = row[2]; + char *approver = row[3]; + char *surprise = row[4]; + char *updatable = row[5]; + char *crawl_tree = row[6]; + char *crawl_family = row[7]; + char *crawl_pages = row[8]; + char *crawl_type = row[9]; + char *crawl_repeat = row[10]; + char *force_rules = row[11]; + + char str_cr_count[100]; + memset(str_cr_count,0,100); + sprintf(str_cr_count,"%d",cr_count); + + printf("\nURL: %s\nID: %s\nWorksafe: %s\nSurprise: %s\nApprover: %s\nUpdatable: %s", url, id, worksafe, surprise, approver, updatable); + if(num_cr > 0){ + printf("\nCrawler ID: %d",cr_count); + }else{ + printf("\nCrawler ID: (null)"); + } + + char sqlqueryinsertindexqueue[2000]; + memset(sqlqueryinsertindexqueue,0,2000); + if(num_cr == 0){ + strcpy(sqlqueryinsertindexqueue,"INSERT INTO indexqueue (url,worksafe,approver,surprise,updatable,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,task) VALUES ('"); + }else{ + strcpy(sqlqueryinsertindexqueue,"INSERT INTO indexqueue (url,worksafe,approver,surprise,updatable,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,task,crawler_id) VALUES ('"); + } + strcat(sqlqueryinsertindexqueue,url);strcat(sqlqueryinsertindexqueue,"','"); + strcat(sqlqueryinsertindexqueue,worksafe);strcat(sqlqueryinsertindexqueue,"','"); + strcat(sqlqueryinsertindexqueue,approver);strcat(sqlqueryinsertindexqueue,"','"); + strcat(sqlqueryinsertindexqueue,surprise);strcat(sqlqueryinsertindexqueue,"','"); + strcat(sqlqueryinsertindexqueue,updatable);strcat(sqlqueryinsertindexqueue,"',"); + if(crawl_tree != NULL){ + strcat(sqlqueryinsertindexqueue,"'");strcat(sqlqueryinsertindexqueue,crawl_tree);strcat(sqlqueryinsertindexqueue,"',"); + }else{ + strcat(sqlqueryinsertindexqueue,"NULL");strcat(sqlqueryinsertindexqueue,","); + } + if(crawl_family != NULL){ + 
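/*
 * Editor's note: nullable windex columns (crawl_tree, crawl_family, ...) are
 * copied into the INSERT verbatim when present and written as NULL or '0'
 * otherwise, as in the branches around this point. For reference, the CASE in
 * the staleness query above maps the updatable level to a refresh interval:
 *
 *   1 -> 1 week, 2 -> 1 day, 3 -> 12 hours, 4 -> 6 hours, 5 -> 3 hours, 6 -> 1 hour
 *
 * (updatable = 0 is excluded entirely, so those pages are never re-queued).
 */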
strcat(sqlqueryinsertindexqueue,"'");strcat(sqlqueryinsertindexqueue,crawl_family);strcat(sqlqueryinsertindexqueue,"','"); + }else{ + strcat(sqlqueryinsertindexqueue,"NULL");strcat(sqlqueryinsertindexqueue,",'"); + } + if(crawl_pages != NULL){ + strcat(sqlqueryinsertindexqueue,crawl_pages);strcat(sqlqueryinsertindexqueue,"','"); + }else{ + strcat(sqlqueryinsertindexqueue,"0");strcat(sqlqueryinsertindexqueue,"','"); + } + if(crawl_type != NULL){ + strcat(sqlqueryinsertindexqueue,crawl_type);strcat(sqlqueryinsertindexqueue,"','"); + }else{ + strcat(sqlqueryinsertindexqueue,"0");strcat(sqlqueryinsertindexqueue,"','"); + } + if(crawl_repeat != NULL){ + strcat(sqlqueryinsertindexqueue,crawl_repeat);strcat(sqlqueryinsertindexqueue,"','"); + }else{ + strcat(sqlqueryinsertindexqueue,"0");strcat(sqlqueryinsertindexqueue,"','"); + } + if(force_rules != NULL){ + strcat(sqlqueryinsertindexqueue,force_rules);strcat(sqlqueryinsertindexqueue,"','1"); + }else{ + strcat(sqlqueryinsertindexqueue,"0");strcat(sqlqueryinsertindexqueue,"','1"); + } + if(num_cr > 0){ + strcat(sqlqueryinsertindexqueue,"','");strcat(sqlqueryinsertindexqueue,str_cr_count); + } + strcat(sqlqueryinsertindexqueue,"');"); + + printf("\nInserting into indexqueue...\n"); + if(mysql_query(con,sqlqueryinsertindexqueue)) + { + finish_with_error(con); + } + + //Assign to crawlers in round robin fashion if user indicated more than one crawler. + if(cr_count < num_cr && num_cr > 0){ + cr_count++; + }else if(num_cr > 0){ + cr_count=1; + } + } + + //cleanup sql stuff + mysql_free_result(result); + mysql_close(con); + + if(num_rows > 0){ + printf("\nAwaiting next set of pages...\n\n"); + } + } + + sleep(5);//sleep 5 seconds + + if(num_rows==0 && re_rows == 0)//sleep if no rows were found + sleep(60);//sleep 60 seconds + } + + exit(0); +} + diff --git a/c/rt.c b/c/rt.c new file mode 100755 index 0000000..484ea28 --- /dev/null +++ b/c/rt.c @@ -0,0 +1,306 @@ +//Wiby slave replication server tracker +//Admin creates file 'servers.csv' containing only IP and database name, one per line +//Tracker will check status of slave databases by attempting to connect to all listed every few seconds +//Tracker will create a copy of this file called 'res.csv' and display only the confirmed online servers +//as well as ID ranges divided across all servers so each has the same number of rows. 
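/*
 * Illustrative example (editor's sketch): servers.csv holds one "IP,dbname"
 * pair per line, as in the bundled servers_example.csv:
 *
 *   192.168.0.101,wiby
 *   192.168.0.102,wiby
 *
 * For every slave that answers, res.csv gains a line of the form
 * "IP,dbname,startID,endID", e.g.:
 *
 *   192.168.0.101,wiby,0,512344
 *   192.168.0.102,wiby,512345,1024683
 *
 * (the ID values here are hypothetical; they are recomputed from the live row
 * count on every pass).
 */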
+ +#include <mysql.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <sys/time.h> + +FILE *servers; +FILE *error; +FILE *res; +int c,d; +char ip[1000][100]; +char db[1000][100]; +char ipOK[1000][100]; +char dbOK[1000][100]; +char startID[1000][100]; +char endID[1000][100]; +char firstOnlineServerIP[100]; +char firstOnlineServerDB[100]; +char *resfiletext; +char totalRows[50]; +char lastID[50]; +char strSQL[200]; + +struct timeval stop, start; + +void handle_error(MYSQL *con) +{ + error = fopen("rtlog", "a"); + printf("%s\n", mysql_error(con)); + fprintf(error, "%s\n", mysql_error(con)); + fclose(error); + mysql_close(con); +} + +int main(int argc, char **argv) +{ + int timetest=0,reportinit=0,running=0; + printf("\nStarting Replication Tracker:\n\nConnection Latency\n--------------------------------\n"); + while(1) + { + long bytecount=0; + int serverCount=0, onlineServers=0, i=0, ipcnt=0, dbcnt=0, errcount=0, foundfirst=0,timeout=5,ignore = 0; + int ipORdb = 0; //0 = ip, 1 = space + servers = fopen("servers.csv", "rb"); + if (servers==NULL) + { + printf("Error opening 'servers.csv' file.\n"); + exit(0); + } + //parse server list + while((c = fgetc(servers)) != EOF) + { + if(c == 35)//check if line is commented out (#) + ignore = 1; + if(c != 10 && c != 13 && c != 32 && c != 44 && ipORdb == 0 && ignore == 0){//if no cr/lf, commas, spaces, or comments, gather ip + ip[serverCount][i] = c; + ipcnt++; + } + if(c==44 && ignore == 0){//if comma detected, switch to gather db name + ipORdb = 1; + i = -1; + } + if(c != 10 && c != 13 && c != 32 && c != 44 && ipORdb == 1 && ignore == 0){//if no cr/lf, commas, spaces, or comments, gather db + db[serverCount][i] = c; + dbcnt++; + } + if(c == 10){//count replication slaves + ipORdb = 0; + ip[serverCount][ipcnt] = 0;//null terminate string + db[serverCount][dbcnt] = 0; + if(ipcnt && dbcnt > 0) + serverCount++; + ipcnt = dbcnt = 0; + i = -1; + ignore = 0; + } + if(c != 13){ + i++; + bytecount++; + } + d=c; + } + if(i>0 && d != 10) + serverCount++; + fclose(servers); + + //Allocate bytes for the res file text +// resfiletext = (char*)calloc(bytecount+1000+(i*50),sizeof(char)); + char resfiletext[10000]; + memset(resfiletext,0,10000); + + //conect to each listed server and verify it works + for (i=0;i<serverCount;i++){ + int err = 0; + MYSQL *con = mysql_init(NULL); + if (con == NULL) + { + handle_error(con); + exit(0); + } + mysql_options(con,MYSQL_OPT_CONNECT_TIMEOUT,&timeout); + if(timetest==0){ + gettimeofday(&start, NULL); + } + if (mysql_real_connect(con, ip[i], "remote_guest", "d0gemuchw0w", db[i], 0, NULL, 0) == NULL) + { + handle_error(con); + err=1; + } + if(timetest==0){ + gettimeofday(&stop, NULL); + printf("%s %s | %lums", ip[i], db[i], ((stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec)/1000); + if(err==1) + printf(" (Fail)"); + printf("\n"); + } + if(err==0){//append successful connection info to res string + strcpy(ipOK[onlineServers],ip[i]); + strcpy(dbOK[onlineServers],db[i]); + onlineServers++; + mysql_close(con); + } + } + timetest=1; + + //get more database info needed for distributed queries + //-------------------------------------------------------------------------------------------------------------------- + + // connect to first available slave server and get info needed for all available slaves to handle a distributed query + int initialinfo = 0, nRows=0; + for (i=0;i<onlineServers;i++){ + int err = 0, startIDint=0; + long long int numrows=0; + MYSQL *con = 
mysql_init(NULL); + if (con == NULL) + { + handle_error(con); + exit(0); + } + mysql_options(con,MYSQL_OPT_CONNECT_TIMEOUT,&timeout); + if (mysql_real_connect(con, ipOK[0], "remote_guest", "d0gemuchw0w", dbOK[0], 0, NULL, 0) == NULL) //connect to the same server each iteration + { + handle_error(con); + err=1; + } + if(err==0){ + if(i==0){//get initial info + + //Get total number of rows + if (mysql_query(con, "SELECT COUNT(id) FROM windex;")) + { + handle_error(con); + } + MYSQL_RES *result = mysql_store_result(con); + if(result == NULL) + { + handle_error(con); + exit(0); + } + MYSQL_ROW row = mysql_fetch_row(result); + nRows = atoi(row[0]); + + //free old result data or else you'll get a memory leak + mysql_free_result(result); + + //Get the last row id number + if (mysql_query(con, "SELECT id FROM windex ORDER BY id DESC LIMIT 1;")) + { + handle_error(con); + } + result = mysql_store_result(con); + if(result == NULL) + { + handle_error(con); + exit(0); + } + row = mysql_fetch_row(result); + memset(lastID, 0, 50); + strcpy(lastID,row[0]); + + //free old result data or else you'll get a memory leak + mysql_free_result(result); + + if(reportinit==0) + printf("\nCurrent ID Ranges (Rows: %d)\n--------------------------------",nRows); + } + + //Get id of last row of the % of the db you want to search (depending on # of slaves) + numrows = (nRows / onlineServers * i) + (nRows / onlineServers) - 1; + //printf("\n%lld",numrows);fflush(stdout); + sprintf(totalRows, "%lld", numrows);//convert int to string + strcpy(strSQL,"SELECT id FROM windex ORDER BY id LIMIT "); + strcat(strSQL,totalRows); + strcat(strSQL,",1;"); + //SELECT id FROM windex ORDER BY id LIMIT n-1,1; + if (mysql_query(con, strSQL)) + { + handle_error(con); + } + MYSQL_RES *result2 = mysql_store_result(con); + if(result2 == NULL) + { + handle_error(con); + exit(0); + } + MYSQL_ROW row = mysql_fetch_row(result2); + + //store endID and startID + if(i+1 != onlineServers) + strcpy(endID[i],row[0]); + else + strcpy(endID[i],lastID); + //strcpy(endID[i],row[0]); + + if(i==0){ + strcpy(startID[i],"0"); + }else{ + startIDint = atoi(endID[i-1])+1; + sprintf(startID[i], "%d", startIDint); + } + if(reportinit==0){ + printf("\n%s %s | %s %s",ipOK[i],dbOK[i],startID[i],endID[i]); + if(i+1 == onlineServers) + printf("\n\n"); + fflush(stdout); + } + + //free old result data or else you'll get a memory leak + mysql_free_result(result2); + mysql_close(con); + + //update res file + if(i>0) + strcat(resfiletext,"\n"); + strcat(resfiletext,ipOK[i]); + strcat(resfiletext,","); + strcat(resfiletext,dbOK[i]); + strcat(resfiletext,","); + strcat(resfiletext,startID[i]); + strcat(resfiletext,","); + strcat(resfiletext,endID[i]); + } + } + //-------------------------------------------------------------------------------------------------------------------- + + //get resfiletext length + long resfiletextlen = strlen(resfiletext); + res = fopen("res.csv","rb"); + if (res==NULL) + { + printf("Error opening 'res.csv' file. Will create a new one.\n"); + res = fopen("res","w+"); + if (res==NULL) + { + printf("Error creating 'res.csv' file.\n"); + exit(0); + } + } + //Get file size + fseek(res, 0L, SEEK_END); + bytecount = ftell(res); + rewind(res); + + //check if res file is different from resfiletext string. 
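/*
 * Worked example (editor's addition): with nRows = 300000 and three online
 * slaves, nRows / onlineServers = 100000, so the boundary row offsets computed
 * above are 99999, 199999 and 299999, each resolved to an id with
 *
 *   SELECT id FROM windex ORDER BY id LIMIT 99999,1;   -- and so on
 *
 * Server 0 then covers ids 0..endID[0], server 1 covers endID[0]+1..endID[1],
 * and the last server is pinned to the real last id so no rows are missed when
 * nRows does not divide evenly.
 */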
+ i=0; + int changed=0; + if(bytecount == resfiletextlen){ + while((c = fgetc(res)) != EOF) + { + if(c != resfiletext[i]){ + changed = 1; + } + i++; + } + fclose(res); + }else{ + changed = 1; + } + + reportinit = 1; + //store available servers in res file + if(changed == 1){ + res = fopen("res.csv", "w"); + fprintf(res, "%s", resfiletext); + fclose(res); + reportinit = 0; + } + if(running == 0){ + printf("Running\n"); + fflush(stdout); + running = 1; + } + + //fflush(stdout); + //free(resfiletext); + sleep(5); + } +} + diff --git a/c/servers_example.csv b/c/servers_example.csv new file mode 100755 index 0000000..5f6ff66 --- /dev/null +++ b/c/servers_example.csv @@ -0,0 +1,4 @@ +192.168.0.101,wiby +192.168.0.102,wiby +192.168.0.103,wiby +192.168.0.104,wiby \ No newline at end of file diff --git a/c/urlparse.h b/c/urlparse.h new file mode 100755 index 0000000..4a08441 --- /dev/null +++ b/c/urlparse.h @@ -0,0 +1,296 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +//char url[] = "index.htm\0"; +char urlcopy[1000]; +char domain[1000]; +char tldlist[] = "co.uk,co.jp\0"; +char buffer[1000]; +char rootdomain[1000]; +char urlPath[1000]; +char folderPath[1000]; +char urlnopathnoprefix_fromlist[1000]; +char urlnoprefix_fromlist[10000]; +char prefix_fromlist[14]; +int prefixsize_fromlist=0; +int checkDomain(char *domain, char *substrLower, char *substrUpper, int domainLen, int substrLen); + +void urlparse(char* url){ +//int main(int argc, char *argv[]) { + int foundDot=0,foundDotInPath=0,foundSlash=0,foundColon=0,slashPos=0,lastSlashPos=0,folderPathLength=0,isFile=0,pathlen=0; + int rootdomaincount=0; + int isIPv4=1,isIPv6=1; + memset(buffer,0,1000); + memset(urlcopy,0,1000); + memset(domain,0,1000); + memset(rootdomain,0,1000); + memset(urlPath,0,1000); + memset(folderPath,0,1000); + memset(urlnoprefix_fromlist,0,1000); + memset(urlnopathnoprefix_fromlist,0,1000); + + //find out if its http or https or http://www. or https://www. + int httpwww=0, httpswww=0, http=0, https=0; + //char prefix[12]; + memset(prefix_fromlist,0,14); + strcpy(prefix_fromlist,"http"); + int urlsize = strlen(url); + + if(urlsize<998){ + + //copy url (variable from crawler) + strcpy(urlcopy,url); + + //truncate any "index.html" files and just use the directory path + if(urlsize == 10){ + if(checkDomain(urlcopy,"index.html","INDEX.HTML",urlsize,10)==1){ + urlcopy[0]=0; + urlsize=0; + } + }else if(urlsize == 9){ + if(checkDomain(urlcopy,"index.htm","INDEX.HTM",urlsize,9)==1){ + urlcopy[0]=0; + urlsize=0; + } + } + if(urlsize > 10){ + if(checkDomain(urlcopy,"/index.html","/INDEX.HTML",urlsize,11)==1){ + urlcopy[urlsize-10]=0; + urlsize-=10; + } + } + if(urlsize > 9){ + if(checkDomain(urlcopy,"/index.htm","/INDEX.HTM",urlsize,10)==1){ + urlcopy[urlsize-9]=0; + urlsize-=9; + } + } + + if(urlsize > 4){ + if(url[4]==':' && (url[3]=='p' || url[3]=='P')) + http = 7; + } + if(urlsize > 5){ + if(url[5]==':' && (url[4]=='s' || url[4]=='S')) + https = 8; + } + if(urlsize > 11){ + if((url[7]=='w' || url[7]=='W') && (url[8]=='w' || url[8]=='W') && ((url[9]=='w' || url[9]=='W') || url[9]=='1' || url[9]=='2' || url[9]=='3') && url[10]=='.' ){ + httpwww = 11; + http = https = 0; + } + if(url[7]=='/' && (url[8]=='w' || url[8]=='W') && (url[9]=='w' || url[9]=='W') && ((url[9]=='w' || url[9]=='W') || url[9]=='1' || url[9]=='2' || url[9]=='3') && url[11]=='.' 
){ + httpswww = 12; + http = https = 0; + } + } + + //set the prefix + if(http > 0) strcat(prefix_fromlist,"://"); + else if(https > 0) strcat(prefix_fromlist,"s://"); + else if(httpwww > 0) strcat(prefix_fromlist,"://www."); + else if(httpswww > 0) strcat(prefix_fromlist,"s://www."); + + int prefixsize_fromlist = httpswww+httpwww+https+http; + //char urlnoprefix[urlsize-prefixsize+1]; + //memset(urlnoprefix,0,urlsize-prefixsize+1); + + int urlcount=0,urlnoprefixcount=0,urlnopathnoprefix_done=0,urlnopathnoprefix_len=0; + + //if no prefix, see if it might be a domain + int noprebutisdomain=0; + if(prefixsize_fromlist==0){ + memset(prefix_fromlist,0,14); + while(urlcount < urlsize+1) + { + if(urlcopy[urlcount]=='.' && urlcount>0) + { + noprebutisdomain=1; + break; + } + if(urlcopy[urlcount]=='/') + { + noprebutisdomain=0; + break; + } + urlcount++; + } + } + + //store the url without prefix to urlnoprefix + urlcount=0; + if(prefixsize_fromlist!=0 || noprebutisdomain==1){ + while(urlcount < urlsize) + { + if(urlcount>prefixsize_fromlist-1) + { + urlnoprefix_fromlist[urlnoprefixcount]=urlcopy[urlcount]; + + //get urlnopath + if(urlcopy[urlcount] != '/' && urlnopathnoprefix_done==0){ + urlnopathnoprefix_fromlist[urlnoprefixcount]=urlcopy[urlcount]; + urlnopathnoprefix_len++; + }else{ + urlnopathnoprefix_done=1; + } + urlnoprefixcount++; + } + urlcount++; + } + } + + //check for file extension like html/htm/txt if no prefix in url + if(noprebutisdomain==1 && urlsize>4){ + if(checkDomain(urlnopathnoprefix_fromlist,".html",".HTML",urlnopathnoprefix_len,5)==1 || checkDomain(urlnopathnoprefix_fromlist,".htm",".HTM",urlnopathnoprefix_len,4)==1 || checkDomain(urlnopathnoprefix_fromlist,".txt",".txt",urlnopathnoprefix_len,4)==1){ + memset(domain,0,1000); + memset(urlnoprefix_fromlist,0,1000); + memset(urlnopathnoprefix_fromlist,0,1000); + urlnoprefixcount=0; + } + } + + //get domain name + int lenurl=strlen(urlnoprefix_fromlist); + int numDots=0; + int i=0; + for(i;i<lenurl;i++){ + + //to get folder path, locate final slash position + if(urlnoprefix_fromlist[i]=='/') + lastSlashPos=i; + + //Null terminate hostname at first slash + if(urlnoprefix_fromlist[i]!='/') + domain[i]=urlnoprefix_fromlist[i]; + if(urlnoprefix_fromlist[i]=='.' 
&& foundSlash==0) + numDots++; + + //get path after hostname + if(urlnoprefix_fromlist[i]=='/' && foundSlash==0){ + foundSlash=1; + slashPos=i-1; + pathlen++; + } + if(foundSlash==1){ + urlPath[i-slashPos-1]=urlnoprefix_fromlist[i]; + pathlen++; + if(urlnoprefix_fromlist[i]=='.') + foundDotInPath=1; + } + + if(urlnoprefix_fromlist[i]==':') + foundColon=1; + + //Check if hostname is an IPv4 address + if(((urlnoprefix_fromlist[i]<48 && urlnoprefix_fromlist[i] != '.') || (urlnoprefix_fromlist[i]>57)) && foundSlash==0) + isIPv4=0; + //Check if hostname is an IPv6 address + if(((urlnoprefix_fromlist[i]<48 && urlnoprefix_fromlist[i] > 57) || (urlnoprefix_fromlist[i]<65 && urlnoprefix_fromlist[i]>70) || (urlnoprefix_fromlist[i]<97 && urlnoprefix_fromlist[i]>102)) && foundSlash==0) + isIPv6=0; + } + + if(foundColon==0) + isIPv6=0; + + if(isIPv6==1)//if ipv6, force it into working + numDots=1; + + if(foundDotInPath==0 && pathlen>1){ + //urlPath[pathlen-1]='/'; + //pathlen++; + //urlnoprefix[lenurl]='/'; + //lenurl++; + lastSlashPos=lenurl; + } + + + //get folder path + folderPathLength=lastSlashPos-slashPos; + for(i=0;i<folderPathLength;i++){ + folderPath[i]=urlnoprefix_fromlist[i+slashPos+1]; + } + if(numDots==0 && isIPv6==0){ + memset(urlPath,0,1000); + memset(folderPath,0,1000); + strcpy(urlPath,urlnoprefix_fromlist); + strcpy(folderPath,urlnoprefix_fromlist); + } + + if(folderPathLength>2 && folderPath[i-2] != 0 && folderPath[i-2] != '/') + folderPath[i-1]='/'; + + if(urlPath[0]==0) + urlPath[0]='/'; + if(folderPath[0]==0) + folderPath[0]='/'; + + int lendomain=strlen(domain); + //get tld + int lentldlist=strlen(tldlist); + int foundDoubleDotTLD=0, k=0, dotcount=0, firstSlash=0; + for(i=0;i<=lentldlist;i++){ + if(tldlist[i] != ',' && tldlist[i] != 0){ + buffer[k]=tldlist[i]; + k++; + }else if(foundDoubleDotTLD==0 && (tldlist[i] == ',' || tldlist[i] == 0)){ + if(strstr(urlnoprefix_fromlist,buffer)!=NULL) + foundDoubleDotTLD=1; + if(numDots <=2 && foundDoubleDotTLD==1) + strcpy(rootdomain,domain); + if(numDots > 2 && foundDoubleDotTLD==1){ + int j=0; + for(j;j<lenurl;j++){ + if(foundDot==1){ + if(urlnoprefix_fromlist[j]=='/') + firstSlash=1; + if(firstSlash==0){ + rootdomain[rootdomaincount]=urlnoprefix_fromlist[j]; + rootdomaincount++; + } + } + if(urlnoprefix_fromlist[j]=='.') + foundDot=1; + } + } + if (tldlist[i] == ','){ + memset(buffer,0,1000); + k=0; + } + }else if(foundDoubleDotTLD==1){ + break; + } + } + + if(foundDoubleDotTLD==0){ + foundDot=rootdomaincount=0; + if(numDots==1){ + strcpy(rootdomain,domain); + }else if(numDots>1){ + //skip text before first dot + for(i=0;i<lendomain;i++){ + if(foundDot==1 || isIPv4==1){ + rootdomain[rootdomaincount]=domain[i]; + rootdomaincount++; + } + if(domain[i]=='.') + foundDot=1; + } + } + } + +// printf("\nURL: %s\nHostname: %s\nPath: %s\nURL nopathnopre: %s\nFolder Path: %s\nURL_noprefix: %s\nPrefix: %s\nPrefix Size: %d",url,rootdomain,urlPath,urlnopathnoprefix_fromlist,folderPath,urlnoprefix_fromlist,prefix_fromlist,prefixsize_fromlist); + } +// return 0; +} + +int checkDomain(char *domain, char *substrLower, char *substrUpper, int domainLen, int substrLen){ + int j=0; + for(int i=domainLen-substrLen;i<domainLen;i++){ + if(domain[i]!=substrLower[j] && domain[i]!=substrUpper[j]){ + return 0; + } + j++; + } + return 1; +} diff --git a/db/wiby.sql b/db/wiby.sql new file mode 100755 index 0000000..ef35ceb --- /dev/null +++ b/db/wiby.sql @@ -0,0 +1,214 @@ +-- MySQL dump 10.13 Distrib 8.0.18, for Linux (x86_64) +-- +-- Host: localhost Database: wiby 
+-- ------------------------------------------------------ +-- Server version 8.0.18 + +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; +/*!50503 SET NAMES utf8mb4 */; +/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; +/*!40103 SET TIME_ZONE='+00:00' */; +/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; +/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; + +-- +-- Table structure for table `accounts` +-- + +DROP TABLE IF EXISTS `accounts`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `accounts` ( + `name` varchar(50) NOT NULL, + `hash` text, + `level` text, + `attempts` int(11) DEFAULT '0', + `updated` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + PRIMARY KEY (`name`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `accounts` +-- + +LOCK TABLES `accounts` WRITE; +/*!40000 ALTER TABLE `accounts` DISABLE KEYS */; +/*!40000 ALTER TABLE `accounts` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `feedback` +-- + +DROP TABLE IF EXISTS `feedback`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `feedback` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `message` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci, + `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `feedback` +-- + +LOCK TABLES `feedback` WRITE; +/*!40000 ALTER TABLE `feedback` DISABLE KEYS */; +/*!40000 ALTER TABLE `feedback` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `graveyard` +-- + +DROP TABLE IF EXISTS `graveyard`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `graveyard` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text, + `worksafe` tinyint(1) DEFAULT NULL, + `reserved` text, + `reservetime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `graveyard` +-- + +LOCK TABLES `graveyard` WRITE; +/*!40000 ALTER TABLE `graveyard` DISABLE KEYS */; +/*!40000 ALTER TABLE `graveyard` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `indexqueue` +-- + +DROP TABLE IF EXISTS `indexqueue`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `indexqueue` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text CHARACTER SET latin1 COLLATE latin1_swedish_ci, + `worksafe` tinyint(1) DEFAULT NULL, + `approver` text CHARACTER SET latin1 COLLATE latin1_swedish_ci, + `surprise` tinyint(1) DEFAULT NULL, + `updatable` int(11) DEFAULT '1', + `task` tinyint(4) DEFAULT NULL, + `crawl_tree` text, + `crawl_family` text, + `crawl_depth` int(11) DEFAULT NULL, + 
`crawl_pages` int(11) DEFAULT NULL, + `crawl_type` int(11) DEFAULT NULL, + `crawl_repeat` tinyint(4) DEFAULT NULL, + `force_rules` tinyint(1) DEFAULT NULL, + `crawler_id` int(11) DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `indexqueue` +-- + +LOCK TABLES `indexqueue` WRITE; +/*!40000 ALTER TABLE `indexqueue` DISABLE KEYS */; +/*!40000 ALTER TABLE `indexqueue` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `reviewqueue` +-- + +DROP TABLE IF EXISTS `reviewqueue`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `reviewqueue` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text, + `worksafe` tinyint(1) DEFAULT NULL, + `reserved` text, + `reservetime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + `time` datetime DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=latin1; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `reviewqueue` +-- + +LOCK TABLES `reviewqueue` WRITE; +/*!40000 ALTER TABLE `reviewqueue` DISABLE KEYS */; +/*!40000 ALTER TABLE `reviewqueue` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `windex` +-- + +DROP TABLE IF EXISTS `windex`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `windex` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci, + `url_noprefix` text, + `title` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci, + `tags` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci, + `description` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci, + `body` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci, + `language` text, + `surprise` tinyint(1) DEFAULT NULL, + `http` tinyint(1) DEFAULT NULL, + `updatable` int(11) DEFAULT '1', + `worksafe` tinyint(1) DEFAULT NULL, + `crawl_tree` text, + `crawl_family` text, + `crawl_pages` int(11) DEFAULT NULL, + `crawl_type` int(11) DEFAULT NULL, + `crawl_repeat` tinyint(1) DEFAULT NULL, + `force_rules` tinyint(1) DEFAULT NULL, + `enable` tinyint(1) DEFAULT NULL, + `date` datetime NOT NULL DEFAULT '0000-00-00 00:00:00', + `updated` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + `approver` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci, + `fault` tinyint(1) DEFAULT '0', + PRIMARY KEY (`id`), + FULLTEXT KEY `main` (`tags`,`title`,`body`,`description`,`url`), + FULLTEXT KEY `title` (`title`), + FULLTEXT KEY `url` (`url`), + FULLTEXT KEY `url_noprefix` (`url_noprefix`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `windex` +-- + +LOCK TABLES `windex` WRITE; +/*!40000 ALTER TABLE `windex` DISABLE KEYS */; +/*!40000 ALTER TABLE `windex` ENABLE KEYS */; +UNLOCK TABLES; +/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; + +/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; +/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; +/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; +/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; +/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; +/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; + +-- Dump completed on 
2022-07-05 1:23:45 diff --git a/db/wibytemp.sql b/db/wibytemp.sql new file mode 100755 index 0000000..b73780f --- /dev/null +++ b/db/wibytemp.sql @@ -0,0 +1,99 @@ +-- MySQL dump 10.13 Distrib 8.0.18, for Linux (x86_64) +-- +-- Host: localhost Database: wibytemp +-- ------------------------------------------------------ +-- Server version 8.0.18 + +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; +/*!50503 SET NAMES utf8mb4 */; +/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; +/*!40103 SET TIME_ZONE='+00:00' */; +/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; +/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; + +-- +-- Table structure for table `rejected` +-- + +DROP TABLE IF EXISTS `rejected`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `rejected` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text, + `user` text, + `date` datetime DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `rejected` +-- + +LOCK TABLES `rejected` WRITE; +/*!40000 ALTER TABLE `rejected` DISABLE KEYS */; +/*!40000 ALTER TABLE `rejected` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `reserve_id` +-- + +DROP TABLE IF EXISTS `reserve_id`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `reserve_id` ( + `id` bigint(20) NOT NULL, + `time` datetime DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `reserve_id` +-- + +LOCK TABLES `reserve_id` WRITE; +/*!40000 ALTER TABLE `reserve_id` DISABLE KEYS */; +/*!40000 ALTER TABLE `reserve_id` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `titlecheck` +-- + +DROP TABLE IF EXISTS `titlecheck`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `titlecheck` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text, + `title` text, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `titlecheck` +-- + +LOCK TABLES `titlecheck` WRITE; +/*!40000 ALTER TABLE `titlecheck` DISABLE KEYS */; +/*!40000 ALTER TABLE `titlecheck` ENABLE KEYS */; +UNLOCK TABLES; +/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; + +/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; +/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; +/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; +/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; +/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; +/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; + +-- Dump completed on 2022-07-05 1:23:55 diff --git a/etc/nginx/sites-available/default_example b/etc/nginx/sites-available/default_example new file mode 100755 index 
0000000..08a1d81 --- /dev/null +++ b/etc/nginx/sites-available/default_example @@ -0,0 +1,270 @@ +## +# You should look at the following URL's in order to grasp a solid understanding +# of Nginx configuration files in order to fully unleash the power of Nginx. +# https://www.nginx.com/resources/wiki/start/ +# https://www.nginx.com/resources/wiki/start/topics/tutorials/config_pitfalls/ +# https://wiki.debian.org/Nginx/DirectoryStructure +# +# In most cases, administrators will remove this file from sites-enabled/ and +# leave it as reference inside of sites-available where it will continue to be +# updated by the nginx packaging team. +# +# This file will automatically load configuration files provided by other +# applications, such as Drupal or Wordpress. These applications will be made +# available underneath a path with that package name, such as /drupal8. +# +# Please see /usr/share/doc/nginx-doc/examples/ for more detailed examples. +## + +# Default server configuration +# + +fastcgi_cache_path /etc/nginx/phpcache levels=1:2 max_size=1g keys_zone=MYAPP:100m inactive=5m; +fastcgi_cache_key "$scheme$request_method$host$request_uri"; + +proxy_cache_path /etc/nginx/cache levels=1:2 keys_zone=main_cache:100m max_size=1g inactive=5m; +proxy_cache_key "$scheme$request_method$host$request_uri$cookie_ws"; + +#server { #redirect http to https +# listen 80 default_server; +# listen [::]:80 default_server ipv6only=on; +# server_name wiby.me; +# return 301 https://$host$request_uri; +#} +upstream remote_core { +# server 10.8.0.101:8080; +# server 10.8.0.102:8080; +# server 10.8.0.103:8080; +# server 10.8.0.104:8080; +# server 127.0.0.1:8080 backup; + server 127.0.0.1:8080; +} +server { #handles http requests. Allows for legacy browsers or else redirects to https + listen 80 default_server; +# listen [::]:80 default_server ipv6only=off; #this prevented nginx from starting on my vps, said port was in use + server_name wiby.me www.wiby.me; + + if ( $http_user_agent ~ (Chrome)) { #redirect to https for old chrome devices + return 301 https://$host$request_uri; + } + + + root /var/www/html; + + # Add index.php to the list if you are using PHP + index index.php index.html index.htm; + + #comment all "core app" location entries to revert wiby search to php + + location = / { #core app + proxy_cache main_cache; + proxy_cache_valid 5m; + proxy_cache_bypass $no_cache; + proxy_no_cache $no_cache; + try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + #proxy_pass http://127.0.0.1:8080/; + proxy_pass http://remote_core/; + } + location /settings/ { #core app + try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + proxy_pass http://127.0.0.1:8080/settings/; + #proxy_pass http://remote_core/settings/; + } + location = /json/ { #core app + proxy_cache main_cache; + proxy_cache_valid 5m; + proxy_cache_bypass $no_cache; + proxy_no_cache $no_cache; + try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + #proxy_pass http://127.0.0.1:8080/json/; + proxy_pass http://remote_core/json/; + } + location = /surprise/ { #core app + # try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + proxy_pass http://127.0.0.1:8080/surprise/; + #proxy_pass http://remote_core/surprise/; + } + + location / { + # First attempt to serve request as file, then + # as directory, then fall back to displaying a 404. 
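+	# Note on the microcache: the proxy_cache_bypass/proxy_no_cache and
+	# fastcgi_cache_bypass/fastcgi_no_cache directives in this file all key off the
+	# $no_cache variable, which is only set by the "Don't cache the following URLs"
+	# if-blocks near the end of this server block (admin pages such as /review/ and
+	# /insert/). The proxied core app and PHP responses are otherwise cached for 5m.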
+ try_files $uri $uri/ =404; + } + + # pass PHP scripts to FastCGI server + # + location ~ \.php$ { + # try_files $url = 404; + fastcgi_split_path_info ^(.+\.php)(/.+)$; + include snippets/fastcgi-php.conf; + include fastcgi_params; + # + # # With php-fpm (or other unix sockets): + fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name; + fastcgi_pass unix:/var/run/php/php7.4-fpm.sock; + # fastcgi_index index.php; + # # With php-cgi (or other tcp sockets): + # fastcgi_pass 127.0.0.1:9000; + # #for microcaching + fastcgi_cache MYAPP; + fastcgi_cache_valid 5m; + fastcgi_cache_bypass $no_cache; + fastcgi_no_cache $no_cache; + } + + # deny access to .htaccess files, if Apache's document root + + # concurs with nginx's one + # + #location ~ /\.ht { + # deny all; + #} + + # Don't cache the following URLs + if ($request_uri ~* "/(review/|index.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|login.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|inndexqueue.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|review.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|error.html.php)") { set $no_cache 1; } + + if ($request_uri ~* "/(insert/|index.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|login.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|form.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|insert.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|error.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|insert.html.php)") { set $no_cache 1; } + + if ($request_uri ~* "/(surprise/|index.php)") { set $no_cache 1; } + + if ($request_uri ~* "/(submit/|index.php)") { set $no_cache 1; } + if ($request_uri ~* "/(submit/|form.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(submit/|submit.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(submit/|error.html.php)") { set $no_cache 1; } +} + +server { + #listen 80 default_server; #comment this out if you dont want http + #listen [::]:80 default_server; #comment this out if you dont want http + + # SSL configuration + # + listen 443 ssl default_server; + #listen [::]:443 ssl default_server; #nginx wasnt starting on my vps with this + + ssl_certificate /etc/nginx/ssl/YOUR_SSL_CERT.crt; + ssl_certificate_key /etc/nginx/ssl/YOUR_SSL_KEY.key; + # + # Note: You should disable gzip for SSL traffic. + # See: https://bugs.debian.org/773332 + # + # Read up on ssl_ciphers to ensure a secure configuration. + # See: https://bugs.debian.org/765782 + # + # Self signed certs generated by the ssl-cert package + # Don't use them in a production server! 
+ # + # include snippets/snakeoil.conf; + + root /var/www/html; + + # Add index.php to the list if you are using PHP + index index.php index.html index.htm; + + server_name wiby.me www.wiby.me; + + #comment all "core app" location entries to revert wiby search to php + + location = / { #core app + proxy_cache main_cache; + proxy_cache_valid 5m; + proxy_cache_bypass $no_cache; + proxy_no_cache $no_cache; + try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + #proxy_pass http://127.0.0.1:8080/; + proxy_pass http://remote_core/; + } + location /settings/ { #core app + try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + proxy_pass http://127.0.0.1:8080/settings/; + #proxy_pass http://remote_core/settings/; + } + location = /json/ { #core app + proxy_cache main_cache; + proxy_cache_valid 5m; + proxy_cache_bypass $no_cache; + proxy_no_cache $no_cache; + try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + #proxy_pass http://127.0.0.1:8080/json/; + proxy_pass http://remote_core/json/; + } + location = /surprise/ { #core app + # try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + proxy_pass http://127.0.0.1:8080/surprise/; + #proxy_pass http://remote_core/surprise/; + } + + location / { + # First attempt to serve request as file, then + # as directory, then fall back to displaying a 404. + try_files $uri $uri/ =404; + } + + # pass PHP scripts to FastCGI server + # + location ~ \.php$ { + # try_files $url = 404; + fastcgi_split_path_info ^(.+\.php)(/.+)$; + include snippets/fastcgi-php.conf; + include fastcgi_params; + # + # # With php-fpm (or other unix sockets): + fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name; + fastcgi_pass unix:/var/run/php/php7.4-fpm.sock; + # fastcgi_index index.php; + # # With php-cgi (or other tcp sockets): + # fastcgi_pass 127.0.0.1:9000; + # #for microcaching + fastcgi_cache MYAPP; + fastcgi_cache_valid 5m; + fastcgi_cache_bypass $no_cache; + fastcgi_no_cache $no_cache; + } + + + # deny access to .htaccess files, if Apache's document root + + # concurs with nginx's one + # + #location ~ /\.ht { + # deny all; + #} + + # Don't cache the following URLs + if ($request_uri ~* "/(review/|index.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|login.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|inndexqueue.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|review.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|error.html.php)") { set $no_cache 1; } + + if ($request_uri ~* "/(insert/|index.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|login.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|form.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|insert.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|error.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|insert.html.php)") { set $no_cache 1; } + + if ($request_uri ~* "/(surprise/|index.php)") { set $no_cache 1; } + + if ($request_uri ~* "/(submit/|index.php)") { set $no_cache 1; } + if ($request_uri ~* "/(submit/|form.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(submit/|submit.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(submit/|error.html.php)") { set $no_cache 1; } +} \ No newline at end of file diff --git a/go/core/1core.go b/go/core/1core.go new file mode 100755 index 0000000..e6e71d1 --- /dev/null +++ b/go/core/1core.go @@ -0,0 +1,976 @@ +package main + +import ( + 
"database/sql" + _ "github.com/go-sql-driver/mysql" +// "fmt" + "html" + "html/template" + "log" + "net/http" + "net/url" + "strconv" + "strings" + "unicode/utf8" + // "time" +) + +type indexPage struct{} +type errorReport struct{ Error string } +type surpriseURL struct{ Url string } +type settingsPage struct{ Worksafe, FilterHTTPS bool } +type MySQLResults struct{ Id, Url, Title, Description, Body string } +type PageData struct { + DBResults []MySQLResults + Query, Totalcount string + FindMore bool +} + +func main() { + http.HandleFunc("/", handler) + http.HandleFunc("/json", handler) + http.HandleFunc("/json/", handler) + http.HandleFunc("/surprise", surprise) + http.HandleFunc("/surprise/", surprise) + http.HandleFunc("/settings/", settings) + http.HandleFunc("/settings", settings) + log.Fatal(http.ListenAndServe("localhost:8080", nil)) +} + +//https://golang.org/pkg/net/http/#Request +func handler(w http.ResponseWriter, r *http.Request) { + //fmt.Fprintf(w, "%s %s \n", r.Method, r.URL) + //fmt.Fprintf(w, "%s \n", r.URL.RawQuery) + + //check if worksafe+https cookie enabled. + filterHTTPS := false + worksafe := true + worksafeHTTPSCookie, err := r.Cookie("ws") + if err != nil { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "0" { + worksafe = false + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "1" { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "2" { + worksafe = false + filterHTTPS = true + } else if worksafeHTTPSCookie.Value == "3" { + worksafe = true + filterHTTPS = true + } + + //setup for error report + error := errorReport{} + + //Get the raw query + m, _ := url.ParseQuery(r.URL.RawQuery) + //Get the query parameters (q and o) + //fmt.Fprintf(w,"%s\n%s\n", m["q"][0], m["o"][0]) + + json := false + if strings.Contains(r.URL.Path, "/json") { + json = true + if _, ok := m["nsfw"]; ok { //check if &nsfw added to json url + worksafe = false + } + } + + query := "" + queryNoQuotes := "" + queryNoQuotes_SQLsafe := "" + + offset := "0" + + //Check if query and offset params exist + if _, ok := m["q"]; ok { + query = strings.Replace(m["q"][0], "'", "''", -1) + queryNoQuotes = m["q"][0] + } + if _, ok := m["o"]; ok { + offset = strings.Replace(m["o"][0], "'", "''", -1) + } + + lim := "12" + + if query == "" { //what do if no query found? + //load index if no query detected + if r.URL.Path == "/" { + p := indexPage{} + t, _ := template.ParseFiles("coreassets/form.html.go") + t.Execute(w, p) + } else if strings.Contains(r.URL.Path, "/json") { //load json info page if json selected + p := indexPage{} + t, _ := template.ParseFiles("coreassets/json/json.html.go") + t.Execute(w, p) + } else { + p := indexPage{} + t, _ := template.ParseFiles("coreassets/form.html.go") + t.Execute(w, p) + } + } else { + + //Make sure offset is a number + offsetInt, err := strconv.Atoi(offset) + if err != nil { + offset = "0" + offsetInt = 0 + } + //Convert lim to number also + limInt, _ := strconv.Atoi(lim) + + //get some details from the raw query + var additions string + querylen := len(query) + + //see if a search redirect (! 
or &) is used for a different search engine + if json == false && (strings.Contains(m["q"][0],"!") || strings.Contains(m["q"][0],"&")){ + searchredirect(w, r, m["q"][0]) + } + + //phone users + if query[querylen-1] == ' '{ + query = query[:querylen-1] + queryNoQuotes = queryNoQuotes[:len(queryNoQuotes)-1] + querylen = len(query) + } + + //check if user wants to limit search to a specific website + sitePos := -1 + siteEnd := 0 + siteURL := "" + if strings.Index(strings.ToLower(query), "site:") > -1 { + //get url user wants to search and remove it from the query stringre + sitePos = strings.Index(strings.ToLower(query), "site:") + siteEnd = strings.Index(query[sitePos:], " ") + //fmt.Printf("\n%d\n%d\n",sitePos,siteEnd) + if siteEnd > -1 && sitePos > 1 { //site is not last part of query + siteURL = query[sitePos+5 : siteEnd+sitePos] + query = query[:sitePos-1] + query[siteEnd+sitePos:] + queryNoQuotes = queryNoQuotes[:sitePos-1] + queryNoQuotes[siteEnd+sitePos:] + additions = additions + "AND url LIKE '%" + siteURL + "%' " + } else if siteEnd > -1 && sitePos == 0 { //site is at beginning + siteURL = query[sitePos+5 : siteEnd] + query = query[siteEnd+1:] + queryNoQuotes = queryNoQuotes[siteEnd+1:] + additions = additions + "AND url LIKE '%" + siteURL + "%' " + } else if siteEnd < 0 && sitePos > 1 { //site is at end + siteURL = query[sitePos+5:] + query = query[:sitePos-1] + queryNoQuotes = queryNoQuotes[:sitePos-1] + additions = additions + "AND url LIKE '%" + siteURL + "%' " + }else if querylen > 5{ + query = query[5:] + } + querylen = len(query) + } + //fmt.Printf("Addition: \n%s\nQuery: '%s'\n",additions,query) + + //see if user uses -https flag (instead of cookie settings option) + if querylen > 7 && strings.ToLower(query[querylen-7:querylen]) == " -https" { + filterHTTPS = true + query = query[0 : querylen-7] + querylen = len(query) + } + + //check if user wants to search within a time window (day,week,month) + option := "" + //fmt.Printf("\n'%s'\n",query) + location := strings.Index(query, " !") + if location == -1 { + location = strings.Index(query, " &") + } + if location > -1 && strings.Index(query[location+1:querylen], " ") == -1 { //option is at end of query + option = query[location+2 : querylen] + query = query[:location] + queryNoQuotes = queryNoQuotes[:location] + querylen = len(query) + }else if querylen > 0 && (query[0] == '!' || query[0] == '&') && strings.Index(query, " ") > -1{ //option is at start of query + option = query[1:strings.Index(query, " ")] + query = query[strings.Index(query, " ")+1:] + queryNoQuotes = queryNoQuotes[strings.Index(queryNoQuotes, " ")+1:] + querylen = len(query) + } + option = strings.ToLower(option) + if option != "" { + if option == "td" { //day + additions = additions + "AND date > NOW() - INTERVAL 1 DAY " + } else if option == "tw" { //week + additions = additions + "AND date > NOW() - INTERVAL 7 DAY " + } else if option == "tm" { //month + additions = additions + "AND date > NOW() - INTERVAL 30 DAY " + } else if option == "ty" { //year + additions = additions + "AND date > NOW() - INTERVAL 365 DAY " + } + } + + //check if worksafe and filterHTTPS flags set + if worksafe == true { + additions = additions + "AND worksafe = '1' " + } + if filterHTTPS == true { + additions = additions + "AND http = '1' " + } + + //if query is just 1 or 2 letters, help make it work. 
Also CIA :D + if len(query) < 3 || query == "cia" || query == "CIA" { + queryfix := " " + query + " *" + query = queryfix + queryNoQuotes = queryfix + } + + //search if query has quotes and remove them (so we can find the longest word in the query) + exactMatch := false + //queryNoQuotes := query + if strings.Contains(query, "\"") { + exactMatch = true + queryNoQuotes = strings.TrimLeft(queryNoQuotes, "\"") + getlastquote := strings.Split(queryNoQuotes, "\"") + queryNoQuotes = getlastquote[0] + //fmt.Printf("%s \n", queryNoQuotes) + } + + //Prepare to find longest word in query + words := strings.Split(queryNoQuotes, " ") + longestWordLength := 0 + longestWord := "" + wordcount := 0 + longestwordelementnum := 0 + queryNoQuotesOrFlags := "" + requiredword := "" + //queryNoFlags := "" + //first remove any flags inside var queryNoQuotes, also grab any required words (+ prefix) + if strings.Contains(queryNoQuotes, "-") || strings.Contains(queryNoQuotes, "+") { + for i, wordNoFlags := range words { + if i > 0 && strings.HasPrefix(wordNoFlags, "-") == false && strings.HasPrefix(wordNoFlags, "+") == false { //add a space after + queryNoQuotesOrFlags += " " + } + if strings.HasPrefix(wordNoFlags, "-") == false && strings.HasPrefix(wordNoFlags, "+") == false { + queryNoQuotesOrFlags += wordNoFlags + } + if strings.HasPrefix(wordNoFlags, "+") == true && len(wordNoFlags) > 1 { //get requiredword + requiredword = wordNoFlags[1:len(wordNoFlags)] + } + } + queryNoQuotes = queryNoQuotesOrFlags + } + //now find longest word + words = strings.Split(queryNoQuotes, " ") + if exactMatch == false { + for _, word := range words { + if len(word) > longestWordLength { + longestWordLength = len(word) + longestWord = word + longestwordelementnum = wordcount + } + wordcount++ + } + } + + //remove the '*' if contained anywhere in queryNoQuotes + if strings.Contains(queryNoQuotes, "*") && exactMatch == false { + queryNoQuotes = strings.Replace(queryNoQuotes, "*", "", -1) + } + + //get sql safe querynoquotes + queryNoQuotes_SQLsafe = strings.Replace(queryNoQuotes, "'", "''", -1) + + //fmt.Printf("\nquery: %s\nquerynoquotes: %s\nquerynoquotes_sqlsafe: %s\n",query,queryNoQuotes,queryNoQuotes_SQLsafe) + //fmt.Fprintf(w,"%s\n%s\n", query,offset) + //fmt.Printf("hai\n") + + //get copy of original query because we might have to modify it further + queryOriginal := query + + tRes := MySQLResults{} + var res = PageData{} + + //init the db and set charset + db, err := sql.Open("mysql", "guest:qwer@/wiby?charset=utf8mb4") + if err != nil { + p := indexPage{} + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, p) + } + defer db.Close() + + // Open doesn't open a connection. Validate DSN data: + err = db.Ping() + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + //Check if query is a url. 
+ urlDetected := false + isURL := "" + if strings.Index(query, " ") == -1 && strings.Index(query, "\"") == -1 && strings.Index(query, ".") > -1 { //note this will also flag on file extensions + if len(query) > 6 && (query[0:7] == "http://" || query[0:7] == "HTTP://") { + query = query[7:] + } else if len(query) > 7 && (query[0:8] == "https://" || query[0:8] == "HTTPS://") { + query = query[8:] + } + if len(queryNoQuotes_SQLsafe) > 6 && (queryNoQuotes_SQLsafe[0:7] == "http://" || queryNoQuotes_SQLsafe[0:7] == "HTTP://") { + queryNoQuotes_SQLsafe = queryNoQuotes_SQLsafe[7:] + } else if len(queryNoQuotes_SQLsafe) > 7 && (queryNoQuotes_SQLsafe[0:8] == "https://" || queryNoQuotes_SQLsafe[0:8] == "HTTPS://") { + queryNoQuotes_SQLsafe = queryNoQuotes_SQLsafe[8:] + } + query = "\"" + query + "\"" + urlDetected = true + isURL = "WHEN LOCATE('" + queryNoQuotes_SQLsafe + "',url)>0 THEN 25" + } + + //Check if query contains a hyphenated word. Will wrap quotes around hyphenated words that aren't part of a string which is already wraped in quotes. + if (strings.Contains(queryNoQuotes_SQLsafe, "-") || strings.Contains(queryNoQuotes_SQLsafe, "+")) && urlDetected == false { + if query == "c++" || query == "C++" { //shitty but works for now + query = "c++ programming" + } + hyphenwords := strings.Split(query, " ") + query = "" + quotes := 0 + for i, word := range hyphenwords { + if strings.Contains(word, "\"") { + quotes++ + } + if ((strings.Contains(word, "-") && word[0] != '-') || (strings.Contains(word, "+") && word[0] != '+')) && quotes%2 == 0 { //if hyphen or plus exists, not a flag, not wrapped in quotes already + word = "\"" + word + "\"" + } + if i > 0 { + query += " " + } + query += word + } + } + //fmt.Printf(">%s<\n", query) + + //perform full text search FOR InnoDB STORAGE ENGINE or MyISAM + var sqlQuery, id, url, title, description, body string + + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', tags)>0 THEN 30 " + isURL + " WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 AND Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 20 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 THEN 16 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', body)>0 THEN 15 WHEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) END DESC LIMIT " + lim + " OFFSET " + offset + "" + //sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', tags)>0 THEN 30 " + isURL + " WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 AND Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 20 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 THEN 16 WHEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', body)>0 THEN 15 END DESC LIMIT " + lim + " OFFSET " + offset + "" + //sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', tags)>0 THEN 30 " + isURL + " WHEN LOCATE('" 
+ queryNoQuotes_SQLsafe + "', title)>0 AND Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 20 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 THEN 16 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', body)>0 THEN 15 WHEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 14 END DESC LIMIT " + lim + " OFFSET " + offset + "" + + rows, err := db.Query(sqlQuery) + + if err != nil { + res.Totalcount = strconv.Itoa(0) + res.Query = m["q"][0] //get original unsafe query + if json { + w.Header().Set("Content-Type", "application/json") + t, _ := template.ParseFiles("coreassets/json/results.json.go") + t.Execute(w, res) + } else { + t, _ := template.ParseFiles("coreassets/results.html.go") + t.Execute(w, res) + } + //p := indexPage{} + //t, _ := template.ParseFiles("coreassets/form.html.go") + //t.Execute(w, p) + return + } + + if urlDetected == true { + query = queryOriginal + } + + count := 0 + + for rows.Next() { + count++ + //this will get set if position of longest word of query is found within body + pos := -1 + + err := rows.Scan(&id, &url, &title, &description, &body) + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + //find query inside body of page + if exactMatch == false { + /* //remove the '*' if contained anywhere in query + if strings.Contains(queryNoQuotes,"*"){ + queryNoQuotes = strings.Replace(queryNoQuotes, "*", "", -1) + } */ + + if len(requiredword) > 0 { //search for position of required word if any, else search for position of whole query + pos = strings.Index(strings.ToLower(body), strings.ToLower(requiredword)) + } else if pos == -1 { + pos = strings.Index(strings.ToLower(body), strings.ToLower(queryNoQuotes)) + } + + if pos == -1 { //prepare to find position of longest query word (or required word) within body + //remove the '*' at the end of the longest word if present + if strings.Contains(longestWord, "*") { + longestWord = strings.Replace(longestWord, "*", "", -1) + } + //search within body for position of longest query word. + pos = strings.Index(strings.ToLower(body), strings.ToLower(longestWord)) + //not found?, set position to a different word, make sure there's no wildcard on it + if pos == -1 && wordcount > 1 { + if longestwordelementnum > 0 { + words[0] = strings.Replace(words[0], "*", "", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(words[0])) + } + if longestwordelementnum == 0 { + words[1] = strings.Replace(words[1], "*", "", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(words[1])) + } + } + } + } else { //if exact match, find position of query within body + pos = strings.Index(strings.ToLower(body), strings.ToLower(queryNoQuotes)) + } + + //still not found?, set position to 0 + if pos == -1 { + pos = 0 + } + + //Adjust position for runes within body + pos = utf8.RuneCountInString(body[:pos]) + + starttext := 0 + //ballpark := 0 + ballparktext := "" + + //figure out how much preceding text to use + if pos < 32 { + starttext = 0 + } else if pos > 25 { + starttext = pos - 25 + } else if pos > 20 { + starttext = pos - 15 + } + + //total length of the ballpark + textlength := 180 + + //populate the ballpark + if pos >= 0 { + ballparktext = substr(body, starttext, starttext+textlength) + } //else{ ballpark = 0}//looks unused + + //find position of nearest Period + //foundPeriod := true + posPeriod := strings.Index(ballparktext, ". 
") + starttext + 1 + + //find position of nearest Space + //foundSpace := true + posSpace := strings.Index(ballparktext, " ") + starttext + + //if longest word in query is after a period+space within ballpark, reset starttext to that point + if (pos - starttext) > posPeriod { + starttext = posPeriod + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else if pos > posSpace { //else if longest word in query is after a space within ballpark, reset starttext to that point + //else if(pos-starttext) > posSpace//else if longest word in query is after a space within ballpark, reset starttext to that point + starttext = posSpace + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else //else just set the bodymatch to the ballparktext + { + //populate the bodymatch + if (pos - starttext) >= 0 { + body = ballparktext + } else { + body = "" + } + } + + tRes.Id = id + tRes.Url = url + tRes.Title = html.UnescapeString(title) + tRes.Description = html.UnescapeString(description) + tRes.Body = html.UnescapeString(body) + if json == true { + tRes.Title = JSONRealEscapeString(tRes.Title) + tRes.Description = JSONRealEscapeString(tRes.Description) + tRes.Body = JSONRealEscapeString(tRes.Body) + } + res.DBResults = append(res.DBResults, tRes) + } + defer rows.Close() + rows.Close() + //================================================================================================================================ + //no results found (count==0), so do a wildcard search (repeat the above process) + addWildcard := false + if count == 0 && offset == "0" && urlDetected == false && exactMatch == false { + addWildcard = true + query = strings.Replace(query, "\"", "", -1) //remove some things innodb gets fussy over + query = strings.Replace(query, "*", "", -1) + query = strings.Replace(query, "'", "", -1) + queryNoQuotes_SQLsafe = strings.Replace(queryNoQuotes_SQLsafe, "\"", "", -1) + queryNoQuotes_SQLsafe = strings.Replace(queryNoQuotes_SQLsafe, "*", "", -1) + queryNoQuotes_SQLsafe = strings.Replace(queryNoQuotes_SQLsafe, "'", "", -1) + query = query + "*" + + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', tags)>0 THEN 30 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 AND Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 20 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 THEN 16 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', body)>0 THEN 15 WHEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) END DESC LIMIT " + lim + " OFFSET " + offset + "" + rows2, err := db.Query(sqlQuery) + if err != nil { + res.Totalcount = strconv.Itoa(0) + res.Query = m["q"][0] //get original unsafe query + if json { + w.Header().Set("Content-Type", "application/json") + t, _ := template.ParseFiles("coreassets/json/results.json.go") + t.Execute(w, res) + } else { + t, _ := template.ParseFiles("coreassets/results.html.go") + t.Execute(w, res) + } + //p := indexPage{} + //t, _ := template.ParseFiles("coreassets/form.html.go") + //t.Execute(w, p) + return + } + + for rows2.Next() { + count++ + //this will get set if position of longest word of query is found within body + pos := -1 + 
+ err := rows2.Scan(&id, &url, &title, &description, &body) + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + //find query inside body of page + if exactMatch == false { + //remove the '*' if contained anywhere in query + /*if strings.Contains(queryNoQuotes,"*"){ + queryNoQuotes = strings.Replace(queryNoQuotes, "*", "", -1) + }*/ + if len(requiredword) > 0 { //search for position of required word if any, else search for position of whole query + pos = strings.Index(strings.ToLower(body), strings.ToLower(requiredword)) + } else if pos == -1 { + pos = strings.Index(strings.ToLower(body), strings.ToLower(queryNoQuotes)) + } + if pos == -1 { //Not found? prepare to find position of longest query word within body + //remove the '*' at the end of the longest word if present + if strings.Contains(longestWord, "*") { + longestWord = strings.Replace(longestWord, "*", "", -1) + } + //search within body for position of longest query word. + pos = strings.Index(strings.ToLower(body), strings.ToLower(longestWord)) + //not found?, set position to a different word, make sure there's no wildcard on it + if pos == -1 && wordcount > 1 { + if longestwordelementnum > 0 { + words[0] = strings.Replace(words[0], "*", "", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(words[0])) + } + if longestwordelementnum == 0 { + words[1] = strings.Replace(words[1], "*", "", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(words[1])) + } + } + } + + } else { //if exact match, find position of query within body + pos = strings.Index(strings.ToLower(body), strings.ToLower(queryNoQuotes)) + } + //still not found?, set position to 0 + if pos == -1 { + pos = 0 + } + + //Adjust position for runes within body + pos = utf8.RuneCountInString(body[:pos]) + + starttext := 0 + //ballpark := 0 + ballparktext := "" + + //figure out how much preceding text to use + if pos < 32 { + starttext = 0 + } else if pos > 25 { + starttext = pos - 25 + } else if pos > 20 { + starttext = pos - 15 + } + + //total length of the ballpark + textlength := 180 + + //populate the ballpark + if pos >= 0 { + ballparktext = substr(body, starttext, starttext+textlength) + } //else{ ballpark = 0}//looks unused + + //find position of nearest Period + //foundPeriod := true + posPeriod := strings.Index(ballparktext, ". 
") + starttext + 1 + + //find position of nearest Space + //foundSpace := true + posSpace := strings.Index(ballparktext, " ") + starttext + + //if longest word in query is after a period+space within ballpark, reset starttext to that point + if (pos - starttext) > posPeriod { + starttext = posPeriod + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else if pos > posSpace { //else if longest word in query is after a space within ballpark, reset starttext to that point + //else if(pos-starttext) > posSpace//else if longest word in query is after a space within ballpark, reset starttext to that point + starttext = posSpace + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else //else just set the bodymatch to the ballparktext + { + //populate the bodymatch + if (pos - starttext) >= 0 { + body = ballparktext + } else { + body = "" + } + } + + tRes.Id = id + tRes.Url = url + tRes.Title = html.UnescapeString(title) + tRes.Description = html.UnescapeString(description) + tRes.Body = html.UnescapeString(body) + if json == true { + tRes.Title = JSONRealEscapeString(tRes.Title) + tRes.Description = JSONRealEscapeString(tRes.Description) + tRes.Body = JSONRealEscapeString(tRes.Body) + } + res.DBResults = append(res.DBResults, tRes) + } + defer rows2.Close() + rows2.Close() + } + //======================================================================================================================= + //http://go-database-sql.org/retrieving.html + + //Close DB + db.Close() + + //If results = lim, allow the find more link + if count >= limInt && addWildcard == false { + res.FindMore = true + } else { + res.FindMore = false + } + + totalCountInt := count + offsetInt + res.Totalcount = strconv.Itoa(totalCountInt) + res.Query = m["q"][0] //get original unsafe query + + if json { + w.Header().Set("Content-Type", "application/json") + t, _ := template.ParseFiles("coreassets/json/results.json.go") + t.Execute(w, res) + } else { + t, _ := template.ParseFiles("coreassets/results.html.go") + t.Execute(w, res) + } + + } +} + +func settings(w http.ResponseWriter, r *http.Request) { + //setup for error report + error := errorReport{} + + //check if worksafe (adult content) cookie enabled. 
+ filterHTTPS := false + worksafe := true + worksafewasoff := false + worksafeHTTPSCookie, err := r.Cookie("ws") + if err != nil { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "0" { + worksafe = false + filterHTTPS = false + worksafewasoff = true + } else if worksafeHTTPSCookie.Value == "1" { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "2" { + worksafe = false + filterHTTPS = true + worksafewasoff = true + } else if worksafeHTTPSCookie.Value == "3" { + worksafe = true + filterHTTPS = true + } + + //check if and what is the user posting + switch r.Method { + case "POST": + if err := r.ParseForm(); err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + worksafebox := r.Form.Get("worksafe") + agreecheck := r.Form.Get("agree") + agreesubmit := r.Form.Get("agreesubmit") + httpsbox := r.Form.Get("filterHTTPS") + + //if user agrees to terms to disable adult content, set cookie and return to index + if agreecheck == "on" { + worksafe = false + //expiration := time.Now().Add(365 * 24 * time.Hour) + if filterHTTPS == false { + cookie := http.Cookie{Name: "ws", Value: "0", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "2", Path: "/"} + http.SetCookie(w, &cookie) + } + p := indexPage{} + t, _ := template.ParseFiles("coreassets/settings/gohome.html") + t.Execute(w, p) + //else if worksafebox is checked, return to index with worksafe on + } else if worksafebox == "on" || agreesubmit == "on" { + //expiration := time.Now().Add(365 * 24 * time.Hour) + if httpsbox != "on" { + cookie := http.Cookie{Name: "ws", Value: "1", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "3", Path: "/"} + http.SetCookie(w, &cookie) + } + p := indexPage{} + t, _ := template.ParseFiles("coreassets/settings/gohome.html") + t.Execute(w, p) + //else if worksafebox unchecked and no cookie, go to content agreement section + } else if worksafebox != "on" && worksafewasoff == false && agreesubmit != "on" { + p := indexPage{} + if httpsbox == "on" { + cookie := http.Cookie{Name: "ws", Value: "3", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "1", Path: "/"} + http.SetCookie(w, &cookie) + } + t, _ := template.ParseFiles("coreassets/settings/agree.html.go") + t.Execute(w, p) + //else if worksafebox unchecked and cookie alredy agreed, go back to index + } else if worksafebox != "on" && worksafewasoff == true { + if httpsbox == "on" { + cookie := http.Cookie{Name: "ws", Value: "2", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "0", Path: "/"} + http.SetCookie(w, &cookie) + } + p := indexPage{} + t, _ := template.ParseFiles("coreassets/settings/gohome.html") + t.Execute(w, p) + } + default: + //load the settings page if no post value + settingspage := settingsPage{} + settingspage.Worksafe = worksafe + settingspage.FilterHTTPS = filterHTTPS + t, _ := template.ParseFiles("coreassets/settings/settings.html.go") + t.Execute(w, settingspage) + } +} + +func surprise(w http.ResponseWriter, r *http.Request) { + surprise := surpriseURL{} + + //check if worksafe+HTTPS cookie enabled. 
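+	//Only the HTTPS filter matters here (cookie values 2 and 3); the surprise page
+	//is always restricted to worksafe = 1 AND surprise = 1 and picks a single random
+	//row with ORDER BY rand() LIMIT 1.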
+ filterHTTPS := false + worksafeHTTPSCookie, err := r.Cookie("ws") + if err != nil { + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "2" { + filterHTTPS = true + } else if worksafeHTTPSCookie.Value == "3" { + filterHTTPS = true + } + + //setup for error report + error := errorReport{} + + //init the db and set charset + db, err := sql.Open("mysql", "guest:qwer@/wiby?charset=utf8mb4") + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + defer db.Close() + // Open doesn't open a connection. Validate DSN data: + err = db.Ping() + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + //grab a random page + var sqlQuery string + if filterHTTPS == false { + sqlQuery = "select url from windex where worksafe = 1 and surprise = 1 order by rand() limit 1" + } else { + sqlQuery = "select url from windex where worksafe = 1 and surprise = 1 and http = 1 order by rand() limit 1" + } + rows, err := db.Query(sqlQuery) + + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + var url string + for rows.Next() { + err := rows.Scan(&url) + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + surprise.Url = url + } + defer rows.Close() + rows.Close() + db.Close() + t, _ := template.ParseFiles("coreassets/surprise.html.go") + t.Execute(w, surprise) +} + +func MysqlRealEscapeString(value string) string { + replace := map[string]string{"\\": "\\\\", "'": `\'`, "\\0": "\\\\0", "\n": "\\n", "\r": "\\r", `"`: `\"`, "\x1a": "\\Z"} + + for b, a := range replace { + value = strings.Replace(value, b, a, -1) + } + + return value +} +func JSONRealEscapeString(value string) string { + replace := map[string]string{"\\": "\\\\", "\t": "\\t", "\b": "\\b", "\n": "\\n", "\r": "\\r", "\f": "\\f" /*, `"`:`\"`*/} + + for b, a := range replace { + value = strings.Replace(value, b, a, -1) + } + + return value +} +func substr(s string, start int, end int) string { + start_str_idx := 0 + i := 0 + for j := range s { + if i == start { + start_str_idx = j + } + if i == end { + return s[start_str_idx:j] + } + i++ + } + return s[start_str_idx:] +} + +func searchredirect(w http.ResponseWriter, r *http.Request, query string) { + //separate actual query from search redirect + actualquery := "" + redirect := "" + lenquery := len(query) + if strings.Index(query," ") > -1{ + location := strings.Index(query, " !") + if location == -1 { + location = strings.Index(query, " &") + } + if location > -1 && strings.Index(query[location+1:lenquery], " ") == -1 { //redirect is at end of query + redirect = query[location+2 : lenquery] + actualquery = query[:location] + } else if (strings.Index(query, "!") == 0 || strings.Index(query, "&") == 0){ //redirect is at start of query + redirect = query[1:strings.Index(query, " ")] + actualquery = query[strings.Index(query, " ")+1:] + //fmt.Printf("\nRedirect: %s\nquery: %s\n",redirect,actualquery) + } + redirect = strings.ToLower(redirect) + }else if (query[0] == '!' 
|| query[0] == '&') && lenquery > 1{ + redirect = query[1:] + } + if redirect != "" { + //determine which search engine to redirect + if redirect == "g" { //if google text search + http.Redirect(w, r, "http://google.com/search?q="+actualquery, http.StatusSeeOther) + } else if redirect == "b" { //if bing text search + http.Redirect(w, r, "http://bing.com/search?q="+actualquery, http.StatusSeeOther) + } else if redirect == "gi" { //if google image search + http.Redirect(w, r, "http://www.google.com/search?tbm=isch&q="+actualquery, http.StatusSeeOther) + } else if redirect == "bi" { //if bing image search + http.Redirect(w, r, "http://www.bing.com/images/search?q="+actualquery, http.StatusSeeOther) + } else if redirect == "gv" { //if google video search + http.Redirect(w, r, "http://www.google.com/search?tbm=vid&q="+actualquery, http.StatusSeeOther) + } else if redirect == "bv" { //if bing video search + http.Redirect(w, r, "http://www.bing.com/videos/search?q="+actualquery, http.StatusSeeOther) + } else if redirect == "gm" { //if google maps search + http.Redirect(w, r, "http://www.google.com/maps/search/"+actualquery, http.StatusSeeOther) + } else if redirect == "bm" { //if bing maps search + http.Redirect(w, r, "http://www.bing.com/maps?q="+actualquery, http.StatusSeeOther) + }/* else { + http.Redirect(w, r, "/?q="+actualquery, http.StatusSeeOther) + }*/ + } +} + +/*func caseInsenstiveContains(fullstring, substring string) bool { + return strings.Contains(strings.ToLower(fullstring), strings.ToLower(substring)) +}*/ + +/* +A QueryString is, by definition, in the URL. You can access the URL of the request using req.URL (doc). The URL object has a Query() method (doc) that returns a Values type, which is simply a map[string][]string of the QueryString parameters. + +If what you're looking for is the POST data as submitted by an HTML form, then this is (usually) a key-value pair in the request body. You're correct in your answer that you can call ParseForm() and then use req.Form field to get the map of key-value pairs, but you can also call FormValue(key) to get the value of a specific key. This calls ParseForm() if required, and gets values regardless of how they were sent (i.e. in query string or in the request body). + +req.URL.RawQuery returns everything after the ? on a GET request, if that helps. 
+*/ + +/*import ( + "net/http" +) + +func main() { + http.Handle("/", http.StripPrefix("/", http.FileServer(http.Dir("./")))) + if err := http.ListenAndServe(":8080", nil); err != nil { + panic(err) + } +}*/ + +/*func handler(w http.ResponseWriter, r *http.Request) { + fmt.Fprintf(w, "%s %s %s \n", r.Method, r.URL, r.Proto) + //Iterate over all header fields + for k, v := range r.Header { + fmt.Fprintf(w, "Header field %q, Value %q\n", k, v) + } + + fmt.Fprintf(w, "Host = %q\n", r.Host) + fmt.Fprintf(w, "RemoteAddr= %q\n", r.RemoteAddr) + //Get value for a specified token + fmt.Fprintf(w, "\n\nFinding value of \"Accept\" %q", r.Header["Accept"]) +}*/ diff --git a/go/core/core.go b/go/core/core.go new file mode 100755 index 0000000..8d975f5 --- /dev/null +++ b/go/core/core.go @@ -0,0 +1,1172 @@ +package main + +import ( + "database/sql" + "fmt" + _ "github.com/go-sql-driver/mysql" + "html" + "html/template" + "io/ioutil" + "log" + "net/http" + "net/url" + "strconv" + "strings" + "unicode/utf8" + // "sync" + // "time" +) + +type indexPage struct{} +type errorReport struct{ Error string } +type surpriseURL struct{ Url string } +type settingsPage struct{ Worksafe, FilterHTTPS bool } +type MySQLResults struct{ Id, Url, Title, Description, Body string } +type PageData struct { + DBResults []MySQLResults + Query, Totalcount string + FindMore bool +} + +func main() { + http.HandleFunc("/", handler) + http.HandleFunc("/json", handler) + http.HandleFunc("/json/", handler) + http.HandleFunc("/surprise", surprise) + http.HandleFunc("/surprise/", surprise) + http.HandleFunc("/settings/", settings) + http.HandleFunc("/settings", settings) + log.Fatal(http.ListenAndServe("0.0.0.0:8080", nil)) //set IP to 127.0.0.1 if reverse proxy is on the same machine +} + +//https://golang.org/pkg/net/http/#Request +func handler(w http.ResponseWriter, r *http.Request) { + //fmt.Fprintf(w, "%s %s \n", r.Method, r.URL) + //fmt.Fprintf(w, "%s \n", r.URL.RawQuery) + + //check if worksafe+https cookie enabled. + filterHTTPS := false + worksafe := true + worksafeHTTPSCookie, err := r.Cookie("ws") + if err != nil { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "0" { + worksafe = false + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "1" { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "2" { + worksafe = false + filterHTTPS = true + } else if worksafeHTTPSCookie.Value == "3" { + worksafe = true + filterHTTPS = true + } + + //setup for error report + error := errorReport{} + + //Get the raw query + m, _ := url.ParseQuery(r.URL.RawQuery) + //Get the query parameters (q and o) + //fmt.Fprintf(w,"%s\n%s\n", m["q"][0], m["o"][0]) + + json := false + if strings.Contains(r.URL.Path, "/json") { + json = true + if _, ok := m["nsfw"]; ok { //check if &nsfw added to json url + worksafe = false + } + } + + query := "" + queryNoQuotes := "" + queryNoQuotes_SQLsafe := "" + + offset := "0" + + //Check if query and offset params exist + if _, ok := m["q"]; ok { + query = strings.Replace(m["q"][0], "'", "''", -1) + queryNoQuotes = m["q"][0] + } + if _, ok := m["o"]; ok { + offset = strings.Replace(m["o"][0], "'", "''", -1) + } + + lim := "12" + // limDistributedInt := + + if query == "" { //what do if no query found? 
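+		//With no q parameter the handler just renders a template: the search form for
+		//normal requests, or the JSON usage page when the /json path was requested.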
+ //load index if no query detected + if r.URL.Path == "/" { + p := indexPage{} + t, _ := template.ParseFiles("coreassets/form.html.go") + t.Execute(w, p) + } else if strings.Contains(r.URL.Path, "/json") { //load json info page if json selected + p := indexPage{} + t, _ := template.ParseFiles("coreassets/json/json.html.go") + t.Execute(w, p) + } else { + p := indexPage{} + t, _ := template.ParseFiles("coreassets/form.html.go") + t.Execute(w, p) + } + } else { + + //Make sure offset is a number + offsetInt, err := strconv.Atoi(offset) + if err != nil { + offset = "0" + offsetInt = 0 + } + //Convert lim to number also + limInt, _ := strconv.Atoi(lim) + + //get some details from the raw query + var additions string + querylen := len(query) + + //see if a search redirect (! or &) is used for a different search engine + if json == false && (strings.Contains(m["q"][0],"!") || strings.Contains(m["q"][0],"&")){ + searchredirect(w, r, m["q"][0]) + } + + //phone users + if query[querylen-1] == ' '{ + query = query[:querylen-1] + queryNoQuotes = queryNoQuotes[:len(queryNoQuotes)-1] + querylen = len(query) + } + + //check if user wants to limit search to a specific website + sitePos := -1 + siteEnd := 0 + siteURL := "" + if strings.Index(strings.ToLower(query), "site:") > -1 { + //get url user wants to search and remove it from the query string + sitePos = strings.Index(strings.ToLower(query), "site:") + siteEnd = strings.Index(query[sitePos:], " ") + //fmt.Printf("\n%d\n%d\n",sitePos,siteEnd) + if siteEnd > -1 && sitePos > 1 { //site is not last part of query + siteURL = query[sitePos+5 : siteEnd+sitePos] + query = query[:sitePos-1] + query[siteEnd+sitePos:] + queryNoQuotes = queryNoQuotes[:sitePos-1] + queryNoQuotes[siteEnd+sitePos:] + additions = additions + "AND url LIKE '%" + siteURL + "%' " + } else if siteEnd > -1 && sitePos == 0 { //site is at beginning + siteURL = query[sitePos+5 : siteEnd] + query = query[siteEnd+1:] + queryNoQuotes = queryNoQuotes[siteEnd+1:] + additions = additions + "AND url LIKE '%" + siteURL + "%' " + } else if siteEnd < 0 && sitePos > 1 { //site is at end + siteURL = query[sitePos+5:] + query = query[:sitePos-1] + queryNoQuotes = queryNoQuotes[:sitePos-1] + additions = additions + "AND url LIKE '%" + siteURL + "%' " + }else if querylen > 5{ + query = query[5:] + } + querylen = len(query) + } + //fmt.Printf("Addition: \n%s\nQuery: '%s'\n",additions,query) + + //see if user uses -https flag (instead of cookie settings option) + if querylen > 7 && strings.ToLower(query[querylen-7:querylen]) == " -https" { + filterHTTPS = true + query = query[0 : querylen-7] + querylen = len(query) + } + + //check if user wants to search within a time window (day,week,month) + option := "" + //fmt.Printf("\n'%s'\n",query) + location := strings.Index(query, " !") + if location == -1 { + location = strings.Index(query, " &") + } + if location > -1 && strings.Index(query[location+1:querylen], " ") == -1 { //option is at end of query + option = query[location+2 : querylen] + query = query[:location] + queryNoQuotes = queryNoQuotes[:location] + querylen = len(query) + }else if querylen > 0 && (query[0] == '!' 
|| query[0] == '&') && strings.Index(query, " ") > -1{ //option is at start of query + option = query[1:strings.Index(query, " ")] + query = query[strings.Index(query, " ")+1:] + queryNoQuotes = queryNoQuotes[strings.Index(queryNoQuotes, " ")+1:] + querylen = len(query) + } + option = strings.ToLower(option) + if option != "" { + if option == "td" { //day + additions = additions + "AND date > NOW() - INTERVAL 1 DAY " + } else if option == "tw" { //week + additions = additions + "AND date > NOW() - INTERVAL 7 DAY " + } else if option == "tm" { //month + additions = additions + "AND date > NOW() - INTERVAL 30 DAY " + } else if option == "ty" { //year + additions = additions + "AND date > NOW() - INTERVAL 365 DAY " + } + } + + //check if worksafe and filterHTTPS flags set + if worksafe == true { + additions = additions + "AND worksafe = '1' " + } + if filterHTTPS == true { + additions = additions + "AND http = '1' " + } + + //if query is just 1 or 2 letters, help make it work. Also CIA :D + oneletterquery := 0 + if len(query) < 3 || query == "cia" || query == "CIA" { + queryfix := " " + query + " *" + query = queryfix + queryNoQuotes = queryfix + if len(query) == 1 { + oneletterquery = 1 + } + } + + //search if query has quotes and remove them (so we can find the longest word in the query) + exactMatch := false + //queryNoQuotes := query + if strings.Contains(query, "\"") { + exactMatch = true + queryNoQuotes = strings.TrimLeft(queryNoQuotes, "\"") + getlastquote := strings.Split(queryNoQuotes, "\"") + queryNoQuotes = getlastquote[0] + //fmt.Printf("%s \n", queryNoQuotes) + } + + //Prepare to find longest word in query + words := strings.Split(queryNoQuotes, " ") + longestWordLength := 0 + longestWord := "" + wordcount := 0 + longestwordelementnum := 0 + queryNoQuotesOrFlags := "" + requiredword := "" + //queryNoFlags := "" + //first remove any flags inside var queryNoQuotes, also grab any required words (+ prefix) + if strings.Contains(queryNoQuotes, "-") || strings.Contains(queryNoQuotes, "+") { + for i, wordNoFlags := range words { + if i > 0 && strings.HasPrefix(wordNoFlags, "-") == false && strings.HasPrefix(wordNoFlags, "+") == false { + queryNoQuotesOrFlags += " " + } + if strings.HasPrefix(wordNoFlags, "-") == false && strings.HasPrefix(wordNoFlags, "+") == false { + queryNoQuotesOrFlags += wordNoFlags + } + if strings.HasPrefix(wordNoFlags, "+") == true && len(wordNoFlags) > 1 { //get requiredword + requiredword = wordNoFlags[1:len(wordNoFlags)] + } + } + queryNoQuotes = queryNoQuotesOrFlags + } + //now find longest word + words = strings.Split(queryNoQuotes, " ") + if exactMatch == false { + for _, word := range words { + if len(word) > longestWordLength { + longestWordLength = len(word) + longestWord = word + longestwordelementnum = wordcount + } + wordcount++ + } + } + + //remove the '*' if contained anywhere in queryNoQuotes + if strings.Contains(queryNoQuotes, "*") && exactMatch == false { + queryNoQuotes = strings.Replace(queryNoQuotes, "*", "", -1) + } + + //get sql safe querynoquotes + queryNoQuotes_SQLsafe = strings.Replace(queryNoQuotes, "'", "''", -1) + + //fmt.Fprintf(w,"%s\n%s\n", query,offset) + //fmt.Printf("hai\n") + + //get copy of original query because we might have to modify it somewhat + queryOriginal := query + + tRes := MySQLResults{} + var res = PageData{} + + //Check if query is a url + urlDetected := false + isURL := "" + if strings.Index(query, " ") == -1 && strings.Index(query, "\"") == -1 && strings.Index(query, ".") > -1 { //note this will also flag on 
file extensions + if len(query) > 6 && (query[0:7] == "http://" || query[0:7] == "HTTP://") { + query = query[7:] + } else if len(query) > 7 && (query[0:8] == "https://" || query[0:8] == "HTTPS://") { + query = query[8:] + } + if len(queryNoQuotes_SQLsafe) > 6 && (queryNoQuotes_SQLsafe[0:7] == "http://" || queryNoQuotes_SQLsafe[0:7] == "HTTP://") { + queryNoQuotes_SQLsafe = queryNoQuotes_SQLsafe[7:] + } else if len(queryNoQuotes_SQLsafe) > 7 && (queryNoQuotes_SQLsafe[0:8] == "https://" || queryNoQuotes_SQLsafe[0:8] == "HTTPS://") { + queryNoQuotes_SQLsafe = queryNoQuotes_SQLsafe[8:] + } + query = "\"" + query + "\"" + urlDetected = true + isURL = "WHEN LOCATE('" + queryNoQuotes_SQLsafe + "',url)>0 THEN 25" + } + + //Check if query contains a hyphenated word. Will wrap quotes around hyphenated words that aren't part of a string which is already wraped in quotes. + if (strings.Contains(queryNoQuotes_SQLsafe, "-") || strings.Contains(queryNoQuotes_SQLsafe, "+")) && urlDetected == false { + if query == "c++" || query == "C++" { //shitty but works for now + query = "c++ programming" + } + hyphenwords := strings.Split(query, " ") + query = "" + quotes := 0 + for i, word := range hyphenwords { + if strings.Contains(word, "\"") { + quotes++ + } + if ((strings.Contains(word, "-") && word[0] != '-') || (strings.Contains(word, "+") && word[0] != '+')) && quotes%2 == 0 { //if hyphen or plus exists, not a flag, not wrapped in quotes already + word = "\"" + word + "\"" + } + if i > 0 { + query += " " + } + query += word + } + } + //fmt.Printf(">%s<\n", query) + + //perform full text search FOR InnoDB or MyISAM + var sqlQuery, id, url, title, description, body, idList string + rangeOffset := 0 + serverCount := 0 + var servers []string + numServers := 0 + //parse res.csv + noservers := 0 + repLim, _ := strconv.Atoi(lim) + repOffset, _ := strconv.Atoi(offset) + repLimStr := "" + repOffsetStr := "" + noresults := 0 + repsearchfail := 0 + var idListChans []chan string + + resourceFile, err := ioutil.ReadFile("res.csv") + if err != nil { + noservers = 1 + } else { + if len(resourceFile) < 2 { + noservers = 1 + } + } + + //this switches off use of concurrent slaves to process a one word query. Should remove this if the database grows significantly larger + if strings.Contains(query, " ") == false && oneletterquery == 0 { + noservers = 1 + } + + if noservers == 0 { + //send query to go routines. + resourceFilestring := string(resourceFile) + //just in case user is messing around res.csv with a text editor and the editor ads a line feed to the end of the file + if len(resourceFilestring) > 0 && resourceFilestring[len(resourceFilestring)-1] == byte('\n') { + resourceFilestring = resourceFilestring[0 : len(resourceFilestring)-1] + } + servers = strings.Split(resourceFilestring, "\n") + numServers = len(servers) + + //numServers must divide evenly into lim, or lim must divide evenly into numservers + //if they do not, automatically adjust numServers until they divide evenly + + //calculate number of servers to use based on lim size + if limInt > numServers { + for limInt%numServers > 0 { + numServers -= 1 + } + } else if numServers > limInt { + for numServers%limInt > 0 { + numServers -= 1 + } + } + + //calculate limit and offset on distributed servers. 
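+		//Each replica returns an equal share of the page: repLim = limInt/numServers
+		//(or 1 when there are more replicas than the limit), repOffset = offsetInt/numServers,
+		//and rangeOffset is the leftover offset applied to the final query on the master.
+		//E.g. (illustrative numbers only) with lim=12, 4 replicas and offset=24:
+		//repLim=3, repOffset=6, rangeOffset = 24 - 6*4 = 0.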
+ if numServers < limInt { + repLim = limInt / numServers + } else { + repLim = 1 + } + repOffset = offsetInt / numServers + + //calculate rangeOffset (offset for the range of returned results, important if numServers > 2*lim) + rangeOffset = offsetInt - (repOffset * numServers) + + repLimStr = strconv.Itoa(repLim) + repOffsetStr = strconv.Itoa(repOffset) + + //create a channel for each available server + for i := 0; i < numServers; i++ { + idListChans = append(idListChans, make(chan string)) + } + for _, server := range servers { + serverSettings := strings.Split(server, ",") + if len(serverSettings) == 4 { //if line contains all 4 settings + //ip, database, startID, endID + //create SQL connection string //db, err := sql.Open("mysql", "remote_guest:d0gemuchw0w@tcp(10.8.0.102:3306)/wiby?charset=utf8mb4") + serverIP := serverSettings[0] + serverDB := serverSettings[1] + startID := serverSettings[2] + endID := serverSettings[3] + sqlString := "remote_guest:d0gemuchw0w@tcp(" + serverIP + ":3306)/" + serverDB + "?charset=utf8mb4" + // fmt.Printf("%s %s %s %d\n",sqlString,startID,endID,numServers) + + //send special distributed query, only need ID returned and also search between startID/endID + sqlQuery = "SELECT id FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND id BETWEEN " + startID + " AND " + endID + " AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', tags)>0 THEN 30 " + isURL + " WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 AND Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 20 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 THEN 16 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', body)>0 THEN 15 WHEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) END DESC LIMIT " + repLimStr + " OFFSET " + repOffsetStr + "" + go distributedQuery(sqlString, sqlQuery, startID, endID, idListChans[serverCount]) + serverCount++ + } + } + for i := 0; i < serverCount; i++ { + //wait for channels to complete and collect results + idList += <-idListChans[i] + } + if len(idList) > 0 { + switch strings.Contains(idList, "e") { + case true: + repsearchfail = 1 + default: + idList = idList[1:len(idList)] //trim the first comma in the list + } + } else { + noresults = 1 + } + //fmt.Printf("\nChan: %s",idList) + } + + //if all went well with replication servers, send query to master containing idList and use the rangeOffset + if numServers == serverCount && numServers > 0 && repsearchfail == 0 { + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND id IN (" + idList + ") AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', tags)>0 THEN 30 " + isURL + " WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 AND Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 20 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 THEN 16 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', body)>0 THEN 15 WHEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) END DESC LIMIT " + lim + " OFFSET " + strconv.Itoa(rangeOffset) + "" + } else { //else, if no replication servers or there was some sort of error, just search the database locally instead + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE Match(tags, body, 
description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', tags)>0 THEN 30 " + isURL + " WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 AND Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 20 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 THEN 16 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', body)>0 THEN 15 WHEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) END DESC LIMIT " + lim + " OFFSET " + offset + "" + } + + //init the db and set charset + + //create SQL connection string + db, err := sql.Open("mysql", "guest:qwer@/wiby?charset=utf8mb4") + if err != nil { + p := indexPage{} + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, p) + } + defer db.Close() + + // If Open doesn't open a connection. Validate DSN data: + err = db.Ping() + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + count := 0 + countResults := 0 + switch noresults { //if noresults == 1, no results were found during search on active replication servers + case 0: + // Send the query + rows, err := db.Query(sqlQuery) + if err != nil { + fmt.Printf("\n%s", err) + res.Totalcount = strconv.Itoa(0) + res.Query = m["q"][0] //get original unsafe query + if json { + w.Header().Set("Content-Type", "application/json") + t, _ := template.ParseFiles("coreassets/json/results.json.go") + t.Execute(w, res) + } else { + t, _ := template.ParseFiles("coreassets/results.html.go") + t.Execute(w, res) + } + //p := indexPage{} + //t, _ := template.ParseFiles("coreassets/form.html.go") + //t.Execute(w, p) + return + } + + if urlDetected == true { + query = queryOriginal + } + + for rows.Next() { + count++ + countResults++ + + //this will get set if position of longest word of query is found within body + pos := -1 + + err := rows.Scan(&id, &url, &title, &description, &body) + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + //find query inside body of page + if exactMatch == false { + //remove the '*' if contained anywhere in query + /*if strings.Contains(queryNoQuotes,"*"){ + queryNoQuotes = strings.Replace(queryNoQuotes, "*", "", -1) + } */ + + if len(requiredword) > 0 { //search for position of required word if any, else search for position of whole query + pos = strings.Index(strings.ToLower(body), strings.ToLower(requiredword)) + } else if pos == -1 { + pos = strings.Index(strings.ToLower(body), strings.ToLower(queryNoQuotes)) + } + + if pos == -1 { //prepare to find position of longest query word (or required word) within body + //remove the '*' at the end of the longest word if present + if strings.Contains(longestWord, "*") { + longestWord = strings.Replace(longestWord, "*", "", -1) + } + //search within body for position of longest query word. 
+ pos = strings.Index(strings.ToLower(body), strings.ToLower(longestWord)) + //not found?, set position to a different word, make sure there's no wildcard on it + if pos == -1 && wordcount > 1 { + if longestwordelementnum > 0 { + words[0] = strings.Replace(words[0], "*", "", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(words[0])) + } + if longestwordelementnum == 0 { + words[1] = strings.Replace(words[1], "*", "", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(words[1])) + } + } + } + } else { //if exact match, find position of query within body + pos = strings.Index(strings.ToLower(body), strings.ToLower(queryNoQuotes)) + } + //still not found?, set position to 0 + if pos == -1 { + pos = 0 + } + + //Adjust position for runes within body + pos = utf8.RuneCountInString(body[:pos]) + + starttext := 0 + //ballpark := 0 + ballparktext := "" + + //figure out how much preceding text to use + if pos < 32 { + starttext = 0 + } else if pos > 25 { + starttext = pos - 25 + } else if pos > 20 { + starttext = pos - 15 + } + + //total length of the ballpark + textlength := 180 + + //populate the ballpark + if pos >= 0 { + ballparktext = substr(body, starttext, starttext+textlength) + } //else{ ballpark = 0}//looks unused + + //find position of nearest Period + //foundPeriod := true + posPeriod := strings.Index(ballparktext, ". ") + starttext + 1 + + //find position of nearest Space + //foundSpace := true + posSpace := strings.Index(ballparktext, " ") + starttext + + //if longest word in query is after a period+space within ballpark, reset starttext to that point + if (pos - starttext) > posPeriod { + starttext = posPeriod + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else if pos > posSpace { //else if longest word in query is after a space within ballpark, reset starttext to that point + //else if(pos-starttext) > posSpace//else if longest word in query is after a space within ballpark, reset starttext to that point + starttext = posSpace + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else //else just set the bodymatch to the ballparktext + { + //populate the bodymatch + if (pos - starttext) >= 0 { + body = ballparktext + } else { + body = "" + } + } + + tRes.Id = id + tRes.Url = url + tRes.Title = html.UnescapeString(title) + tRes.Description = html.UnescapeString(description) + tRes.Body = html.UnescapeString(body) + if json == true { + tRes.Title = JSONRealEscapeString(tRes.Title) + tRes.Description = JSONRealEscapeString(tRes.Description) + tRes.Body = JSONRealEscapeString(tRes.Body) + } + res.DBResults = append(res.DBResults, tRes) + } + defer rows.Close() + rows.Close() + if count > 0 { //new search method may cause less than the limit of row results per page even if there are more results to come, so we force a full count + count = limInt + } + } //end switch + + //================================================================================================================================ + //no results found (count==0), so do a wildcard search (repeat the above process) + addWildcard := false + if count == 0 && offset == "0" && urlDetected == false && exactMatch == false { + addWildcard = true + query = strings.Replace(query, "\"", "", -1) //remove some things innodb gets fussy over + query = strings.Replace(query, "*", "", -1) + query = strings.Replace(query, "'", 
"", -1) + queryNoQuotes_SQLsafe = strings.Replace(queryNoQuotes_SQLsafe, "\"", "", -1) + queryNoQuotes_SQLsafe = strings.Replace(queryNoQuotes_SQLsafe, "*", "", -1) + queryNoQuotes_SQLsafe = strings.Replace(queryNoQuotes_SQLsafe, "'", "", -1) + query = query + "*" + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', tags)>0 THEN 30 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 AND Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 20 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 THEN 16 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', body)>0 THEN 15 WHEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) END DESC LIMIT " + lim + " OFFSET " + offset + "" + + if repsearchfail == 0 && noservers == 0 { + serverCount = 0 + idList = "" + for _, server := range servers { + serverSettings := strings.Split(server, ",") + if len(serverSettings) == 4 { //if line contains all 4 settings + //ip, database, startID, endID + //create SQL connection string //db, err := sql.Open("mysql", "remote_guest:d0gemuchw0w@tcp(10.8.0.102:3306)/wiby?charset=utf8mb4") + serverIP := serverSettings[0] + serverDB := serverSettings[1] + startID := serverSettings[2] + endID := serverSettings[3] + sqlString := "remote_guest:d0gemuchw0w@tcp(" + serverIP + ":3306)/" + serverDB + "?charset=utf8mb4" + //fmt.Printf("%s %s %s %d\n",sqlString,startID,endID,numServers) + + //send special distributed query, only need ID returned and also search between startID/endID + sqlQuery = "SELECT id FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND id BETWEEN " + startID + " AND " + endID + " AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', tags)>0 THEN 30 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 AND Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 20 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 THEN 16 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', body)>0 THEN 15 WHEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) END DESC LIMIT " + repLimStr + " OFFSET " + repOffsetStr + "" + go distributedQuery(sqlString, sqlQuery, startID, endID, idListChans[serverCount]) + serverCount++ + } + } + for i := 0; i < serverCount; i++ { + //wait for channels to complete and collect results + idList += <-idListChans[i] + } + if len(idList) > 0 { + switch strings.Contains(idList, "e") { + case true: + repsearchfail = 1 + default: + idList = idList[1:len(idList)] //trim the first comma in the list + } + } else { + noresults = 1 + } + //if all went well with replication servers, send query to master containing idList and use the rangeOffset + if numServers == serverCount && numServers > 0 && repsearchfail == 0 { + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND id IN (" + idList + ") AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', tags)>0 THEN 30 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 AND Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 20 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 THEN 16 WHEN LOCATE('" + 
queryNoQuotes_SQLsafe + "', body)>0 THEN 15 WHEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) END DESC LIMIT " + lim + " OFFSET " + strconv.Itoa(rangeOffset) + "" + } else { //else, if no replication servers or there was some sort of error, just search the database locally instead + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', tags)>0 THEN 30 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 AND Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 20 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', title)>0 THEN 16 WHEN LOCATE('" + queryNoQuotes_SQLsafe + "', body)>0 THEN 15 WHEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) END DESC LIMIT " + lim + " OFFSET " + offset + "" + } + } + + rows2, err := db.Query(sqlQuery) + if err != nil { + res.Totalcount = strconv.Itoa(0) + res.Query = m["q"][0] //get original unsafe query + if json { + w.Header().Set("Content-Type", "application/json") + t, _ := template.ParseFiles("coreassets/json/results.json.go") + t.Execute(w, res) + } else { + t, _ := template.ParseFiles("coreassets/results.html.go") + t.Execute(w, res) + } + + return + } + + for rows2.Next() { + count++ + //this will get set if position of longest word of query is found within body + pos := -1 + + err := rows2.Scan(&id, &url, &title, &description, &body) + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + //find query inside body of page + if exactMatch == false { + + if len(requiredword) > 0 { //search for position of required word if any, else search for position of whole query + pos = strings.Index(strings.ToLower(body), strings.ToLower(requiredword)) + } else if pos == -1 { + pos = strings.Index(strings.ToLower(body), strings.ToLower(queryNoQuotes)) + } + + if pos == -1 { //prepare to find position of longest query word (or required word) within body + //remove the '*' at the end of the longest word if present + if strings.Contains(longestWord, "*") { + longestWord = strings.Replace(longestWord, "*", "", -1) + } + //search within body for position of longest query word. 
+ pos = strings.Index(strings.ToLower(body), strings.ToLower(longestWord)) + //not found?, set position to a different word, make sure there's no wildcard on it + if pos == -1 && wordcount > 1 { + if longestwordelementnum > 0 { + words[0] = strings.Replace(words[0], "*", "", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(words[0])) + } + if longestwordelementnum == 0 { + words[1] = strings.Replace(words[1], "*", "", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(words[1])) + } + } + } + } else { //if exact match, find position of query within body + pos = strings.Index(strings.ToLower(body), strings.ToLower(queryNoQuotes)) + } + //still not found?, set position to 0 + if pos == -1 { + pos = 0 + } + + //Adjust position for runes within body + pos = utf8.RuneCountInString(body[:pos]) + + starttext := 0 + //ballpark := 0 + ballparktext := "" + + //figure out how much preceding text to use + if pos < 32 { + starttext = 0 + } else if pos > 25 { + starttext = pos - 25 + } else if pos > 20 { + starttext = pos - 15 + } + + //total length of the ballpark + textlength := 180 + + //populate the ballpark + if pos >= 0 { + ballparktext = substr(body, starttext, starttext+textlength) + } //else{ ballpark = 0}//looks unused + + //find position of nearest Period + //foundPeriod := true + posPeriod := strings.Index(ballparktext, ". ") + starttext + 1 + + //find position of nearest Space + //foundSpace := true + posSpace := strings.Index(ballparktext, " ") + starttext + + //if longest word in query is after a period+space within ballpark, reset starttext to that point + if (pos - starttext) > posPeriod { + starttext = posPeriod + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else if pos > posSpace { //else if longest word in query is after a space within ballpark, reset starttext to that point + //else if(pos-starttext) > posSpace//else if longest word in query is after a space within ballpark, reset starttext to that point + starttext = posSpace + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else //else just set the bodymatch to the ballparktext + { + //populate the bodymatch + if (pos - starttext) >= 0 { + body = ballparktext + } else { + body = "" + } + } + + tRes.Id = id + tRes.Url = url + tRes.Title = html.UnescapeString(title) + tRes.Description = html.UnescapeString(description) + tRes.Body = html.UnescapeString(body) + if json == true { + tRes.Title = JSONRealEscapeString(tRes.Title) + tRes.Description = JSONRealEscapeString(tRes.Description) + tRes.Body = JSONRealEscapeString(tRes.Body) + } + res.DBResults = append(res.DBResults, tRes) + } + defer rows2.Close() + rows2.Close() + } + //======================================================================================================================= + //http://go-database-sql.org/retrieving.html + + //Close DB + db.Close() + + //Allow the find more link + if (countResults >= limInt && addWildcard == false) || addWildcard == false && countResults > 2 { + res.FindMore = true + } else { + res.FindMore = false + } + totalCountInt := count + offsetInt + res.Totalcount = strconv.Itoa(totalCountInt) + res.Query = m["q"][0] //get original unsafe query + + if json { + w.Header().Set("Content-Type", "application/json") + t, _ := template.ParseFiles("coreassets/json/results.json.go") + t.Execute(w, res) + } else { + t, _ := 
template.ParseFiles("coreassets/results.html.go") + t.Execute(w, res) + } + } +} + +func settings(w http.ResponseWriter, r *http.Request) { + //setup for error report + error := errorReport{} + + //check if worksafe (adult content) cookie enabled. + filterHTTPS := false + worksafe := true + worksafewasoff := false + worksafeHTTPSCookie, err := r.Cookie("ws") + if err != nil { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "0" { + worksafe = false + filterHTTPS = false + worksafewasoff = true + } else if worksafeHTTPSCookie.Value == "1" { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "2" { + worksafe = false + filterHTTPS = true + worksafewasoff = true + } else if worksafeHTTPSCookie.Value == "3" { + worksafe = true + filterHTTPS = true + } + + //check if and what is the user posting + switch r.Method { + case "POST": + if err := r.ParseForm(); err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + worksafebox := r.Form.Get("worksafe") + agreecheck := r.Form.Get("agree") + agreesubmit := r.Form.Get("agreesubmit") + httpsbox := r.Form.Get("filterHTTPS") + + //if user agrees to terms to disable adult content, set cookie and return to index + if agreecheck == "on" { + worksafe = false + //expiration := time.Now().Add(365 * 24 * time.Hour) + if filterHTTPS == false { + cookie := http.Cookie{Name: "ws", Value: "0", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "2", Path: "/"} + http.SetCookie(w, &cookie) + } + p := indexPage{} + t, _ := template.ParseFiles("coreassets/settings/gohome.html") + t.Execute(w, p) + //else if worksafebox is checked, return to index with worksafe on + } else if worksafebox == "on" || agreesubmit == "on" { + //expiration := time.Now().Add(365 * 24 * time.Hour) + if httpsbox != "on" { + cookie := http.Cookie{Name: "ws", Value: "1", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "3", Path: "/"} + http.SetCookie(w, &cookie) + } + p := indexPage{} + t, _ := template.ParseFiles("coreassets/settings/gohome.html") + t.Execute(w, p) + //else if worksafebox unchecked and no cookie, go to content agreement section + } else if worksafebox != "on" && worksafewasoff == false && agreesubmit != "on" { + p := indexPage{} + if httpsbox == "on" { + cookie := http.Cookie{Name: "ws", Value: "3", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "1", Path: "/"} + http.SetCookie(w, &cookie) + } + t, _ := template.ParseFiles("coreassets/settings/agree.html.go") + t.Execute(w, p) + //else if worksafebox unchecked and cookie alredy agreed, go back to index + } else if worksafebox != "on" && worksafewasoff == true { + if httpsbox == "on" { + cookie := http.Cookie{Name: "ws", Value: "2", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "0", Path: "/"} + http.SetCookie(w, &cookie) + } + p := indexPage{} + t, _ := template.ParseFiles("coreassets/settings/gohome.html") + t.Execute(w, p) + } + default: + //load the settings page if no post value + settingspage := settingsPage{} + settingspage.Worksafe = worksafe + settingspage.FilterHTTPS = filterHTTPS + t, _ := template.ParseFiles("coreassets/settings/settings.html.go") + t.Execute(w, settingspage) + } +} + +func surprise(w http.ResponseWriter, r *http.Request) { + surprise := surpriseURL{} + + //check if worksafe+HTTPS cookie 
enabled. + filterHTTPS := false + worksafeHTTPSCookie, err := r.Cookie("ws") + if err != nil { + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "2" { + filterHTTPS = true + } else if worksafeHTTPSCookie.Value == "3" { + filterHTTPS = true + } + + //setup for error report + error := errorReport{} + + //init the db and set charset + db, err := sql.Open("mysql", "guest:qwer@/wiby?charset=utf8mb4") + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + defer db.Close() + // Open doesn't open a connection. Validate DSN data: + err = db.Ping() + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + //grab a random page + var sqlQuery string + if filterHTTPS == false { + sqlQuery = "select url from windex where worksafe = 1 and surprise = 1 order by rand() limit 1" + } else { + sqlQuery = "select url from windex where worksafe = 1 and surprise = 1 and http = 1 order by rand() limit 1" + } + rows, err := db.Query(sqlQuery) + + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + var url string + for rows.Next() { + err := rows.Scan(&url) + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + surprise.Url = url + } + defer rows.Close() + rows.Close() + db.Close() + t, _ := template.ParseFiles("coreassets/surprise.html.go") + t.Execute(w, surprise) +} + +func MysqlRealEscapeString(value string) string { + replace := map[string]string{"\\": "\\\\", "'": `\'`, "\\0": "\\\\0", "\n": "\\n", "\r": "\\r", `"`: `\"`, "\x1a": "\\Z"} + + for b, a := range replace { + value = strings.Replace(value, b, a, -1) + } + + return value +} +func JSONRealEscapeString(value string) string { + replace := map[string]string{"\\": "\\\\", "\t": "\\t", "\b": "\\b", "\n": "\\n", "\r": "\\r", "\f": "\\f" /*, `"`:`\"`*/} + + for b, a := range replace { + value = strings.Replace(value, b, a, -1) + } + + return value +} + +func substr(s string, start int, end int) string { + start_str_idx := 0 + i := 0 + for j := range s { + if i == start { + start_str_idx = j + } + if i == end { + return s[start_str_idx:j] + } + i++ + } + return s[start_str_idx:] +} + +func searchredirect(w http.ResponseWriter, r *http.Request, query string) { + //separate actual query from search redirect + actualquery := "" + redirect := "" + lenquery := len(query) + if strings.Index(query," ") > -1{ + location := strings.Index(query, " !") + if location == -1 { + location = strings.Index(query, " &") + } + if location > -1 && strings.Index(query[location+1:lenquery], " ") == -1 { //redirect is at end of query + redirect = query[location+2 : lenquery] + actualquery = query[:location] + } else if (strings.Index(query, "!") == 0 || strings.Index(query, "&") == 0){ //redirect is at start of query + redirect = query[1:strings.Index(query, " ")] + actualquery = query[strings.Index(query, " ")+1:] + //fmt.Printf("\nRedirect: %s\nquery: %s\n",redirect,actualquery) + } + redirect = strings.ToLower(redirect) + }else if (query[0] == '!' 
|| query[0] == '&') && lenquery > 1{ + redirect = query[1:] + } + if redirect != "" { + //determine which search engine to redirect + if redirect == "g" { //if google text search + http.Redirect(w, r, "http://google.com/search?q="+actualquery, http.StatusSeeOther) + } else if redirect == "b" { //if bing text search + http.Redirect(w, r, "http://bing.com/search?q="+actualquery, http.StatusSeeOther) + } else if redirect == "gi" { //if google image search + http.Redirect(w, r, "http://www.google.com/search?tbm=isch&q="+actualquery, http.StatusSeeOther) + } else if redirect == "bi" { //if bing image search + http.Redirect(w, r, "http://www.bing.com/images/search?q="+actualquery, http.StatusSeeOther) + } else if redirect == "gv" { //if google video search + http.Redirect(w, r, "http://www.google.com/search?tbm=vid&q="+actualquery, http.StatusSeeOther) + } else if redirect == "bv" { //if bing video search + http.Redirect(w, r, "http://www.bing.com/videos/search?q="+actualquery, http.StatusSeeOther) + } else if redirect == "gm" { //if google maps search + http.Redirect(w, r, "http://www.google.com/maps/search/"+actualquery, http.StatusSeeOther) + } else if redirect == "bm" { //if bing maps search + http.Redirect(w, r, "http://www.bing.com/maps?q="+actualquery, http.StatusSeeOther) + }/* else { + http.Redirect(w, r, "/?q="+actualquery, http.StatusSeeOther) + }*/ + } +} + +func distributedQuery(con string, sqlQuery string, startID string, endID string, idListChan chan<- string) { + var id string + var idList string + count := 0 + //defer wg.Done() + //init the db + db, err := sql.Open("mysql", con) + if err != nil { + idList = idList + "e" //will look for this when channels are processed + } + defer db.Close() + // If Open doesn't open a connection. Validate DSN data: + err = db.Ping() + if err != nil { + } + + // Send the query + rows, err := db.Query(sqlQuery) + if err == nil { + for rows.Next() { + err := rows.Scan(&id) + if err != nil { + } + //idString = idstring + "id = " + id + " or " + idList += "," + id + count++ + } + } else { + idList = idList + "e" //will look for this when channels are processed + fmt.Printf("%s", err) + } + //fmt.Printf("%s\n", idList) + idListChan <- idList +} + +/*func caseInsenstiveContains(fullstring, substring string) bool { + return strings.Contains(strings.ToLower(fullstring), strings.ToLower(substring)) +}*/ + +/* +A QueryString is, by definition, in the URL. You can access the URL of the request using req.URL (doc). The URL object has a Query() method (doc) that returns a Values type, which is simply a map[string][]string of the QueryString parameters. + +If what you're looking for is the POST data as submitted by an HTML form, then this is (usually) a key-value pair in the request body. You're correct in your answer that you can call ParseForm() and then use req.Form field to get the map of key-value pairs, but you can also call FormValue(key) to get the value of a specific key. This calls ParseForm() if required, and gets values regardless of how they were sent (i.e. in query string or in the request body). + +req.URL.RawQuery returns everything after the ? on a GET request, if that helps. 
+*/ + +/*import ( + "net/http" +) + +func main() { + http.Handle("/", http.StripPrefix("/", http.FileServer(http.Dir("./")))) + if err := http.ListenAndServe(":8080", nil); err != nil { + panic(err) + } +}*/ + +/*func handler(w http.ResponseWriter, r *http.Request) { + fmt.Fprintf(w, "%s %s %s \n", r.Method, r.URL, r.Proto) + //Iterate over all header fields + for k, v := range r.Header { + fmt.Fprintf(w, "Header field %q, Value %q\n", k, v) + } + + fmt.Fprintf(w, "Host = %q\n", r.Host) + fmt.Fprintf(w, "RemoteAddr= %q\n", r.RemoteAddr) + //Get value for a specified token + fmt.Fprintf(w, "\n\nFinding value of \"Accept\" %q", r.Header["Accept"]) +}*/ diff --git a/go/core/coreassets/error.html.go b/go/core/coreassets/error.html.go new file mode 100755 index 0000000..334b411 --- /dev/null +++ b/go/core/coreassets/error.html.go @@ -0,0 +1,20 @@ +<!DOCTYPE html> + +<html> + + <head> + + <title>Wiby Error + + + + + + +

Wiby kaputnik :( ...

+ + + + diff --git a/go/core/coreassets/form.html.go b/go/core/coreassets/form.html.go new file mode 100755 index 0000000..e0e1897 --- /dev/null +++ b/go/core/coreassets/form.html.go @@ -0,0 +1,45 @@ + + + + Put your title + + + + + + + + +

Name


+
+
+ + +

+ +
+
+
+
+
+
+
+               .n.                     |
+              /___\          _.---.  \ _ /
+              [|||]         (_._ ) )--;_) =-
+              [___]          '---'.__,' \
+              }-=-{                    |
+              |-" |
+              |.-"|                p
+       ~^=~^~-|_.-|~^-~^~ ~^~ -^~^~|\ ~^-~^~-
+       ^   .=.| _.|__  ^       ~  /| \
+        ~ /:. \" _|_/\    ~      /_|__\  ^
+       .-/::.  |   |""|-._    ^   ~~~~
+         `===-'-----'""`  '-.              ~
+      jgs               __.-'      ^
+        
+
+
Privacy | About +
+ + diff --git a/go/core/coreassets/json/json.html.go b/go/core/coreassets/json/json.html.go new file mode 100755 index 0000000..6528a21 --- /dev/null +++ b/go/core/coreassets/json/json.html.go @@ -0,0 +1,19 @@ + + + + JSON API + + + + +

Using JSON API

+

Use https://domain/json/ to get a JSON output of search results.

+ Example: https://domain/json/?q=test outputs results for the query 'test'.

+ Append the parameter &o=NUM to get the next page of results.
+ To determine the value of NUM, look for the presence of NextOffset at the end of the JSON data.

+ Example: https://domain/json/?q=test&o=12 +

Terms of Use: +
1. Set terms here. +

+ + diff --git a/go/core/coreassets/json/results.json.go b/go/core/coreassets/json/results.json.go new file mode 100755 index 0000000..ba05ac6 --- /dev/null +++ b/go/core/coreassets/json/results.json.go @@ -0,0 +1,13 @@ +[ +{{range $i, $e:=.DBResults}}{{if $i}}, +{{end}} { + "URL": "{{.Url}}", + "Title": "{{.Title}}", + "Snippet": "{{.Body}}", + "Description": "{{.Description}}" + }{{end}}{{if .FindMore }}, + { + "NextOffset": "{{.Totalcount}}" + } +{{end}} +] diff --git a/go/core/coreassets/results.html.go b/go/core/coreassets/results.html.go new file mode 100755 index 0000000..786081f --- /dev/null +++ b/go/core/coreassets/results.html.go @@ -0,0 +1,32 @@ + + + + + {{.Query}} + + + + +
+
+ name   + + +
+

+
+


+ + {{range .DBResults}} +
+ {{ printf "%.150s" .Title}}

{{.Url}}

{{printf "%.180s" .Body}}
{{printf "%.180s" .Description}}

+
+ {{end}} + + {{if .FindMore }} +


Find more...
+ {{else}} +


That's everything I could find.
Help make me smarter by submitting a page.

+ {{end}} + + diff --git a/go/core/coreassets/settings/agree.html.go b/go/core/coreassets/settings/agree.html.go new file mode 100755 index 0000000..ff53e3d --- /dev/null +++ b/go/core/coreassets/settings/agree.html.go @@ -0,0 +1,33 @@ + + + + + + + Adult Content Agreement + + + + + + + + +

Adult Content Agreement

+ + You have indicated that you do not want adult content filtered.
+ By clicking agree, you accept that you will not freak out over what could end up displayed in the search results.
+ We try to ensure content that is illegal does not get stored into the index. + If you are 18 years of age or older and agree to the terms, check the box and press Submit. +

+

+
+
I agree to the terms and conditions (check and submit)
+

+ Return to Wiby search + +
+ + + + diff --git a/go/core/coreassets/settings/gohome.html b/go/core/coreassets/settings/gohome.html new file mode 100755 index 0000000..033b9c6 --- /dev/null +++ b/go/core/coreassets/settings/gohome.html @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/go/core/coreassets/settings/settings.html.go b/go/core/coreassets/settings/settings.html.go new file mode 100755 index 0000000..95c2d75 --- /dev/null +++ b/go/core/coreassets/settings/settings.html.go @@ -0,0 +1,75 @@ + + + + + Settings + + + + + + +
+
+ + {{ if .Worksafe }} + + {{else}} + + {{end}}  

+ + {{ if .FilterHTTPS }} + + {{else}} + + {{end}} +

*for old browsers


+
+ +

+ About +
+




Search Options:

+ + cats +tabby (results must contain the word tabby)
+ cats -tabby (results must not contain the word tabby)
+ "I love you" (use quotes to find an exact match)
+ join* (asterisk finds matches such as join, joins, joining)
+
+ !td tornado (find within the frame of one day)
+ !tw tornado (find within the frame of one week)
+ !tm tornado (find within the frame of one month)
+ !ty tornado (find within the frame of one year)
+
+ site:URL Lorem ipsum (limit search within a domain or URL)
+
+
+

Redirect Options:
+
+ !g Paris (Google Text Search)
+ !gi Paris (Google Images)
+ !gv Paris (Google Videos)
+ !gm Paris (Google Maps)
+
+ !b Paris (Bing Text Search)
+ !bi Paris (Bing Images)
+ !bv Paris (Bing Videos)
+ !bm Paris (Bing Maps)
+
+ You may also use '&' in place of '!'. +

+
+ + + + + + + diff --git a/go/core/coreassets/surprise.html.go b/go/core/coreassets/surprise.html.go new file mode 100755 index 0000000..73460c1 --- /dev/null +++ b/go/core/coreassets/surprise.html.go @@ -0,0 +1,10 @@ + + + + + + + + You asked for it! + + diff --git a/html/about/button.gif b/html/about/button.gif new file mode 100755 index 0000000000000000000000000000000000000000..87a6a757743b0e99df6d3ef6afee72a1e98f80be GIT binary patch literal 2150 zcmeHI?N`!y9{nMpNNPCwtU01&mgOkp8Z~D4ESjqjG@4J4BB8NrrIo4iQ}7{|(8fYE z)1TxRqN7ZaIcECBAv2w$He;jiIx9}hS;yRlDL1WY*=GKReYy9=eerp5@45G!&$-dG zD9V0L67(VT8iG(Ll&Y7G005)F!6UqAPHBNed9d&{0s{%6$vi8bn3!4}Hg&3pB6&3i zgF&fOHh1Kvew+yo)dqz#Jf!tEQ$*>VeUrWD$z-BsoXsqib!v2=1}J(45dk=oSKrw?B#8$q z03;}I=t!=t226Z2qf`&|_VgwQBlX+1%Vr;EN@#?^C?k zHEe?4`^JO)(J2!(6AFd8uu6t>0EsWq2tZH&XdL(A>gp=%bbUfbVI;p6=^5(i9fJOk z|3w4DP0SxMdw%ENf5HN1f)w|hWBS^fZN1+3JpWkRHd;mSa)_8qY}b`THe|QknHFQ) zOR0WYlzdT)XbYYEyph0-zFhiHH()6d!1g}%(@AnO@s*WzIIk6J^q zT~E@Nyx~}DDkc60g0q%E48C*gNq4;ZsbJK@+!1eOXV8<`Lfq|yfrsy0`&CJ*!^yX@ zLmTl7)DD}-9o(|MRp8<;y*of^u4LXVlA* zc~0ln8?uT^JHF#zkZ9%#_fU-xhOso}L|=#(JEmjGPJGHFZFr;65F=VTH#iL#s8F3w^MQZbclAg+~i(r2@%hZj}acDniUS7v4D)Ky) zSJ-clp0@oQg$XG!n%nO6&rd6gx>^^_+v&98>dM%qjb!@W-w&l33M!q^=QkX9HV>?Y z;xv|(5IW@^GPlSu3T8)aLU2?e_6)y|lM=OkPR^`3Qv@?v)m7JSF!W2$pkp!X=Dd*r zQ+^hYSPVqTy7Mvze)Nj179T=ohcz8m^BWwgHuDkNTgmJ$=B-$+yCr0{Fi)ILUy?J` zm$|cad5+MWkz|}@x@KZh()cIWjfQT!RCvUrh2u?P5buwZgsS6gM;`iQ%}HKdu>PYA z--X#-U$Dhgen<43=??RUv4xR!(@#^dm!?nMwZ~nrpH|*5pPT&}a!e$IlP(4T-8eZv zTh!e`gpc!GC}gufGoSpg@0HH~yX`JIkZ6x>|B1UdzX6`qfgO-DI7)zO=8LVY*b(-H z9s$*ktVS4Z_8+ad!FFpn6(S`PS!(((3u2E94(0%4&A_6vEUgHp7LbA1cnDcs?l)BX zMl4&1y`?kC2+=_L_I}bZof2#5HPL*4f1R&V#Zm*4%ZbistrdPvdSY$?f zq&XV(7u|`}6Z)>5vMQT5hOBLrdmh!Yes5_>!B9?HSj<;d1yyFs*!8;usbu{5Hf_K% ztCF_wWBiOiJMdqkNRA*p+?7;aN%-r;ZhU_DS0zGBbaflf1H+MCIxEwnZtZ#bW0L;Y z5p_>$_lfi`<8a~Ugr1n=m4Vmwu5bKNiv~7MI`V!V?vG&o_H;!wsmiU>gKh@nqJu9b zdERN`-hee;)>mbF>G!5wL1}KDLxPU zwi4LTXAYH-dzb5K%P`NJYx{)zg4=8*k6@g80OkYA70k&u6P=Tg?_>o1v|*RL`*~m3 z{xDEmHmSG5T0U|N)Yp@INOngzD~XWp)i=txnG26 zuUsF$W?OCkkvQ(guMu~2hnDMXehS`ci3j0RWe!^GZ + + +Build your own Search Engine + + + + + +
+

Build Your Own Search Engine

+
(Wiby Install Guide)
+
+

+Overview +
+Installation +
+Controlling +
+Scaling + +

Overview

+July 7, 2022. Wiby is a search engine for the World Wide Web. I am ready to give it away under the GPLv2 license now that it has the features I want. +
+
+It includes a web interface allowing guardians to control where, how far, and how often it crawls websites and follows hyperlinks. The search index is stored inside of a MySQL full-text index. +
+
+Fast queries are maintained by concurrently reading different sections of the index across multiple replication servers, returning a list of top results from each replica, +then searching the combined list to ensure correct ordering. Replicas that fail are automatically excluded; new replicas are easy to include. +As new pages are crawled, they are stored randomly across the index, ensuring each replica can obtain relevant results.
+
+It is not meant to index the entire web and then sort it with a ranking algorithm. +It prefers to seed its index through human submissions made by guests, or by the guardian(s) of the search engine. +
+
+The software is designed for anyone with some extra computers (even a Pi) to host their own search engine catering to whatever niche matters to them. The search engine includes a simple API +for meta search engines to harness.
+
+I hope this will enable anyone with a love of computers to cheaply build and maintain a search engine of their own. +I hope it can cultivate free and independent search engines, ensuring accessibility of ideas and information across the World Wide Web. +
+
+
+
+       Web Traffic
+            |
+            |
++-----------+-----------+
+| Reverse Proxy (nginx) |
++-----------+-----------+
+            |
+            |
++-----------+-----------+
+|  Wiby Core Server(s)  |+-----------------+----------------------------+
+|(PHP or Golang version)|                  |                            |
++-----------+-----------+       +----------+----------+       +---------+---------+
+            |                   |Replication Databases|+-----+|Replication Tracker|
+            |                   +----------+----------+       +-------------------+
++-----------+-----------+                  |
+|    Primary Database   |+-----------------+
+|   (MySQL or MariaDB)  |
++----+-------------+----+
+     |             |  
+     |             |  
++----+-----+  +----+----+
+|   Web    |  | Refresh |
+|Crawler(s)|  |Scheduler|
++----------+  +---------+
+
+
+
+

Installation

+I can only provide manual install instructions at this time. +
+
+Note that while the software is functionally complete, it is still in beta. Anticipate that some bugs will be discovered now that the source is released. +Ensure that you isolate the search engine from your other important services, and if you are running parts of it out of your home, keep the servers +on a separate VLAN. Continue this practice even when the software reaches "1.0".
+
+If you have created a "LAMP", or rather a "LEMP" server before, this isn't much more complicated. If you've never done that, I suggest you find a "LEMP" tutorial. +

+

Build a LEMP server

+Digital Ocean tutorials are usually pretty good so here's a link to one. +
+
+For the sake of simplicity, assume all instructions are for Ubuntu 20.04. If you are on a different distro, modify the install steps accordingly to suit your distro. +
+If you don't have a physical server, you can rent computing space by looking for a "VPS provider". This virtual computer will be your reverse proxy, and if you want, it can host everything else too. +
+
+

Install the following additional packages:

+
apt install build-essential php-gd libcurl4-openssl-dev libmysqlclient-dev mysql-server golang git
+go get -u github.com/go-sql-driver/mysql
+ +
+

Get Wiby Source Files

+Download the source directly from Wiby here, or from GitHub. The source is released under the GPLv2 license. Copy the source files for Wiby to your server. +
+
+ +

Compile the crawler (cr), refresh scheduler (rs), replication tracker (rt):

+
+gcc cr.c -o cr -I/usr/include/mysql -lmysqlclient -lcurl -std=c99 -O3
+gcc rs.c -o rs -I/usr/include/mysql -lmysqlclient -std=c99 -O3
+gcc rt.c -o rt -I/usr/include/mysql -lmysqlclient -std=c99 -O3
+
+If you get any compile errors, they are likely due to the include paths of the MySQL or libcurl header files. +This can happen if you are not using Ubuntu 20. You may have to locate the correct paths for curl.h, easy.h, and mysql.h.
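+
+If you need to hunt down those paths, the dev packages installed earlier ship helper tools that print them (a hedged example; output will vary by distro). Adjust the -I flags in the gcc commands to match:
+
+mysql_config --include\n
+curl-config --cflags\n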
+
+

Build the core server application:

+
+Inside the go folder:
+
+go build core.go
+go build 1core.go
+
+If you eventually plan to use replication servers, or you are using a computer with a lot of available cores, you can use 'core'. Alternatively, if you don't intend to have a large index and do not plan on adding replication servers, you can use '1core'. +You can also use index.php in the root of the www directory and not use the Go version at all, though the PHP version +is used mainly for prototyping.
+
+

Build the database:

+Make sure these lines are inside of /etc/mysql/my.cnf, then restart mysql +
+[client]
+default-character-set=utf8mb4
+
+[mysql]
+default-character-set = utf8mb4
+
+[mysqld]
+max_connections = 2000
+ft_min_word_len=2
+sql_mode = "NO_BACKSLASH_ESCAPES"
+character-set-server = utf8mb4
+collation-server = utf8mb4_0900_ai_ci
+skip-character-set-client-handshake
+default-authentication-plugin=mysql_native_password
+wait_timeout = 800
+
+#memory use settings, you should adjust this based on your hardware
+innodb_buffer_pool_size = 1342177280
+innodb_buffer_pool_instances = 2
+
+
+Login to MySQL and type: +
+create database wiby;
+create database wibytemp;
+
+Import the wiby and wibytemp database files: +
+mysql -u root -p wiby < wiby.sql
+mysql -u root -p wibytemp < wibytemp.sql
+
+Login to MySQL, create the following accounts and give them the correct access: +
+create user 'guest'@'localhost' identified by 'qwer';
+create user 'approver'@'localhost' identified by 'foobar';
+create user 'crawler'@'localhost' identified by 'seekout';
+use wiby;
+grant select on accounts to 'approver'@'localhost';
+grant select on reviewqueue to 'approver'@'localhost';
+grant insert on indexqueue to 'approver'@'localhost';
+grant delete on reviewqueue to 'approver'@'localhost';
+grant update on reviewqueue to 'approver'@'localhost';
+grant select on indexqueue to 'crawler'@'localhost';
+grant insert on windex to 'crawler'@'localhost';
+grant insert on indexqueue to 'crawler'@'localhost';
+grant update on windex to 'crawler'@'localhost';
+grant delete on indexqueue to 'crawler'@'localhost';
+grant delete on windex to 'crawler'@'localhost';
+grant select on windex to 'crawler'@'localhost';
+grant insert on reviewqueue to 'crawler'@'localhost';
+grant select on windex to 'guest'@'localhost';
+grant insert on reviewqueue to 'guest'@'localhost';
+grant insert on feedback to 'guest'@'localhost';
+grant select on feedback to 'approver'@'localhost';
+grant delete on feedback to 'approver'@'localhost';
+grant insert on graveyard to 'approver'@'localhost';
+grant update on graveyard to 'approver'@'localhost';
+grant delete on graveyard to 'approver'@'localhost';
+grant select on graveyard to 'approver'@'localhost';
+grant update on accounts to 'approver'@'localhost';
+grant insert on accounts to 'approver'@'localhost';
+grant delete on accounts to 'approver'@'localhost';
+use wibytemp;
+grant select on titlecheck to 'crawler'@'localhost';
+grant insert on titlecheck to 'crawler'@'localhost';
+grant delete on titlecheck to 'crawler'@'localhost';
+grant select on rejected to 'approver'@'localhost';
+grant insert on rejected to 'approver'@'localhost';
+grant delete on rejected to 'approver'@'localhost';
+grant select on reserve_id to 'crawler'@'localhost';
+grant insert on reserve_id to 'crawler'@'localhost';
+grant delete on reserve_id to 'crawler'@'localhost';
+FLUSH PRIVILEGES;
+
+

Copy the HTML files and PHP scripts to your web server

+
Copy the contents of the the html directory into the nginx html directory (/var/www/html)
+ +

Configure nginx for Wiby

+In /etc/nginx/, create a directory called 'phpcache', and another one called 'cache'. +
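+
+For example:
+
+mkdir /etc/nginx/phpcache /etc/nginx/cache\n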
+Instead of going through every detail, I will provide a template that you can try out as your default nginx config; it is included with the source code under etc/nginx/sites-available/.
+
+You should learn nginx configuration on your own; this template is just to assist. +If you are using only the PHP version, comment out all "core app" location entries to revert Wiby search to the PHP-only version.
+Make sure ssl_certificate and ssl_certificate_key have the path for your SSL files instead of the example paths. If you don't want to use SSL, just remove the server {} configuration for SSL connections (on port 443). +
+
+

Start the Refresh Scheduler

+This program (rs) will make sure all pages indexed are refreshed at least once per week (or sooner, depending on how you assign updates to an individual website). +You may want to run this on startup; the easiest way to set that is with a cron job (crontab -e). Run './rs -h' to get more parameters.
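+
+For example, a crontab entry along these lines (the install path is only a placeholder) starts rs at boot from its own directory:
+
+@reboot cd /home/user/wiby/c && ./rs\n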
+
+

Start the Crawler

+It is best to run the crawler in a screen session so that you can monitor its output. You can have more than one crawler running as long as you keep them in separate directories, include a symlink to the same robots folder, and also set the correct parameters on each. +To view the parameters, type './cr -h'. Without any parameters set, you can only run one crawler (which is probably all you need anyway). +
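+
+As a minimal example, this starts one crawler detached in a screen session (run it from the directory containing cr and the robots folder; reattach later with 'screen -r crawler'):
+
+screen -dmS crawler ./cr\n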
+
+Note that you may need to change the crawler's user-agent if you have issues indexing some websites. Pages that fail to index are noted inside of abandoned.txt. +
+
+Make sure the robots folder exists. robots.txt files are stored in the robots folder and are downloaded once and then referenced from that folder on future updates. Clear this folder every few weeks to ensure robots.txt files get refreshed from time to time. +You can turn off checking for robots.txt files by commenting out the line calling the "checkrobots" function inside of cr.c. +
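+
+If you would rather clear the robots folder automatically, a cron entry like this (the path is only a placeholder) empties it once a month:
+
+@monthly rm -f /home/user/wiby/c/robots/*\n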
+
+If crawling through hyperlinks on a page, the following file types are accepted: html, htm, txt, php, asp. Links containing parameters are ignored. These limitations do not apply to pages directly submitted by people. +
+
+

Start the core server

+'1core' is fine to get started if you have a smaller index. Use 'core' if you intend to scale computer resources as the index grows. You don't necessarily have to run this if you would prefer to use the simple index.php version. +You may want to run the core server on startup with a cron job. +
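+
+A crontab entry similar to the one suggested for rs works here as well (the path is a placeholder; core reads res.csv and its coreassets folder relative to the working directory, so cd into it first):
+
+@reboot cd /home/user/wiby/go/core && ./core\n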
+
+

Set Administrator Password for the Web Interface

+There is no default web login; you will have to set this manually the first time:
+Rename the /html/hash folder to something private.
+
+Edit html/private_folder_name/hashmake.php and change 'secretpassword' to your preferred admin password. 
+
+Access /private_folder_name/hashmake.php from your browser and copy down the hash.
+
+After you have copied it down, delete or remove hashmake.php from your web server folder so that the hash cannot be discovered.
+
+Login to MySQL and create the account: +
+use wiby;
+INSERT INTO accounts (name,hash,level) VALUES('your_username','your_password_hash','admin');
+
+You can now access /accounts/ from your browser and log in to create and manage all accounts for administrators and guardians of the search engine.
+
+admin - Can access all web forms for the search engine including the /accounts/ page to create and delete accounts. +
+
+guardian - Can access all forms except the /accounts/ form. The main role of a guardian is to gatekeep the index of the search engine. +
+
+
+
+

Controlling the Search Engine

+
+There are several forms to control the search engine. There is no central form linking everything together, just a collection of different folders that you can rename if you want. +
+
+

/submit/

This public facing form allows users of the search engine to submit websites for indexing, provided they comply with your submission criteria, which you can modify on /submit/form.html.php. +
+
+

/accounts/

+This is the account management page. Admins have options to create, lock, change account type, delete, and reset passwords. Guardians have the option to change their password. +
+
+

/review/

This is the most important form, intended for you to verify website submissions meet your criteria. Up to 10 pages are assigned to each guardian or admin that accesses the form. The pages will remain assigned to that account for up to 30 minutes. +From here you can control how much, how deep, and how often the web crawler will access each submission. Here is an example of the available options for a website submission: +
+
+url_that_was_submitted +
+[Worksafe] + [Surprise] + [Skip] + [Bury] + [Deny] + [Updatable] +
+ [Crawl: Depth + Pages + Type + Enforce Rules + Repeat] +
+
+Explanation of the above options: +
+
+Worksafe - Indicates if the website is safe for work. Set by the user who submitted the website; however, you can change it based on your own determination.
+
+Surprise - Checking this box will put it in the "surprise me" feature, where users get redirected to random websites when they click "surprise me". Note that this feature won't show NSFW websites even if they are set to surprise. +
+
+Skip - Selecting this option will skip indexing the page and it will reappear on the review form after you submit the rest of the pages for crawling. +
+
+Bury - Selecting this will move the page to a graveyard (/grave/), a holding place with the same options as /review/ for websites that might have stopped working but that you suspect may come back online. The crawler will detect this automatically and send the page back into review. When you click on the link and see a 404, you can be assured the crawler sent it back to review after failing two update cycles. This also happens if the title of the page changes. The crawler will only do this for pages directly submitted by people. This courtesy is not given to websites that are automatically crawled but then fail to work later on. For those sites, after two failed update cycles, the page will be removed.
+
+Deny - Select this to drop the page from being indexed. If the page does not meet your submission criteria, this would be the option to remove it from the queue. +
+
+Updatable - The update cycle for the web crawler to return to the page. This only applies to pages submitted by people; pages found by link crawling always go on a one-week update cycle.
+
+------------------- Crawl ------------------- +
+The options listed below control how the crawler indexes hyperlinks on the website. By default, the crawler does not index any hyperlinks; it will only index the page that is submitted.
+
+Depth - How many layers of links to crawl through. You must set at least a depth of 1 if you want to crawl any hyperlinks. Setting a negative value = no limit. Be careful about that. +
+
+Pages - How many pages to crawl on each link layer (depth). They will be randomly selected. You must set at least 1 if you want to crawl any hyperlinks. Setting a negative value = no limit. Be careful about that. +
+
+Type - Indicates if you want to only crawl links local to the website, or links external to the website, or both. +
+
+Enforce rules - This is a blunt tool that checks if pages have more than two scripts and/or CSS files. If the limit is exceeded, the page will not be indexed. I don't use it and prefer to manually check based on more forgiving criteria.
+
+Repeat - While the crawler will always return to update each page in the index, it won't crawl through hyperlinks again unless you tell it to. Even so, it only crawls hyperlinks on the page at a depth of 1 when repeat is selected.
+
+

/ban/

+You can delete or ban individual URLs from the index with this form. It's pretty simple, as I don't use it much. You can't delete an entire domain with it; for that, you can build your own query in the MySQL console.
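+
+For example, a query along these lines removes a whole domain (example.com is a placeholder; run the SELECT first to confirm what will be deleted):
+
+use wiby;\n
+SELECT url FROM windex WHERE url LIKE '%example.com/%';\n
+DELETE FROM windex WHERE url LIKE '%example.com/%';\n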
+
+

/bulksubmit/

+Admins/Guardians can import a list of URLs into the review queue with this form. +
+
+

/feedback/

+Users can submit feedback for you with this form. +
+
+

/readf/

+Where admin accounts can read feedback submitted by users. +
+
+

/grave/

+It has the same features as /review/. Websites that you don't yet want to index but don't want to forget about are stored inside /grave/ by selecting 'bury' from inside /review/. For pages submitted directly by people, the web crawler will move 404'd pages, or pages where the title has changed, back to /review/ after two update cycles +where the page does not return to normal. So after a few weeks you may notice dead pages appearing in /review/; you can decide to drop the page or to bury it, in which case it will be moved to /grave/. The page might go back to normal at some point, and you can check /grave/ to see if it resurrects.
+
+

/insert/

+This was the first form created back in late 2016 to populate the Wiby index and see if the search engine could even work as a proof of concept. It was meant to manually enter pages into the index as no crawler existed yet. +It is still useful if you want to manually index a page that refuses to permit the crawler to access it. In that case, set updatable to 0. +
+
+

Additional Notes

+If you want to force a website to appear at the top rank for a specific single-word query (like "weather"), you can do so by adding "weather" to the tags column for the target url in the windex table. Use this sparingly. +There is no form to do this on an existing website; you will have to update the row in MySQL manually.
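+
+A sketch of that manual update (the URL is a placeholder, and this assumes the page has no existing tags you want to keep):
+
+use wiby;\n
+UPDATE windex SET tags = 'weather' WHERE url = 'http://example.com/weather.html';\n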
+
+
+
+

Scaling the Search Engine

+
+You can help ensure sub-second search queries as your index grows by building MySQL replica servers on a local network close to each other, running the core application AND the replication tracker (rt) on one or more replica servers, and pointing your reverse proxy to use them. +Edit the servers.csv file for rt to indicate all available replica servers. If you have a machine with a huge amount of resources and cores, entering multiple duplicate entries for the same server inside servers.csv (e.g. one for each core) also works.
+
+The core application checks the replication tracker (rt) output to determine if any replicas are online; it will initiate a connection to those replicas and task each one with searching a different section of the index, +drastically speeding up searches, especially for multi-word queries. By default, single-word queries will not initiate multiple connections across replicas. To enable that on single-word queries, comment out the IF statement +on line 365 and rebuild the core application.
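+
+For reference, the rt output that core reads (res.csv) holds one line per available replica with four comma-separated fields: replica IP, database name, and the start and end IDs of the slice of the index that replica will search. The values below are purely illustrative:
+
+10.8.0.101,wiby,1,500000\n
+10.8.0.102,wiby,500001,1000000\n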
+
+The reverse proxy and replica servers can be connected through a VPN such as WireGuard or OpenVPN; however, the IPs in servers.csv should be the local IPs of the LAN +that the replicas are all connected on. Here is a tutorial for setting up MySQL replicas. Full instructions below:
+
+On the primary server, add these lines to my.cnf under [mysqld], but only once you have a VPN to reach your replicas. Replace my.vpn.ip with your own.
+#setting up replication below
+bind-address = 127.0.0.1,my.vpn.ip
+server-id = 1
+log_bin = /var/log/mysql/mysql-bin.log
+binlog_do_db = wiby
+binlog_format = mixed
+
+In MySQL on the primary server, create a user for replica access: +
+create user 'slave_user'@'%' identified by 'd0gemuchw0w';
+GRANT REPLICATION SLAVE ON *.* TO 'slave_user'@'%';
+FLUSH PRIVILEGES;
+
+On the replica server, use the following my.cnf configuration, setting server-id to a unique value for each replica, then restart MySQL:
+[client]
+default-character-set=utf8mb4
+
+[mysql]
+default-character-set = utf8mb4
+
+[mysqld]
+max_connections = 2000
+ft_min_word_len=2
+sql_mode = "NO_BACKSLASH_ESCAPES"
+#character-set-client-handshake = FALSE
+character-set-server = utf8mb4
+collation-server = utf8mb4_0900_ai_ci
+skip-character-set-client-handshake
+default-authentication-plugin=mysql_native_password
+wait_timeout = 800
+
+#memory use settings, you should adjust this based on your hardware
+innodb_buffer_pool_size = 1342177280
+innodb_buffer_pool_instances = 2
+
+#setting up replication below
+bind-address = 0.0.0.0
+server-id = 2
+relay_log_info_repository = TABLE
+relay_log_recovery = ON
+sync_binlog=1
+
+Make sure only VPN and VLAN addresses can reach your replicas. The bind address of 0.0.0.0 can be replaced with '127.0.0.1,replica.vpn.ip' which is safer but also more crash prone if the VPN address is not available on startup. +
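+
+One way to do that on Ubuntu is with ufw (a hedged example; 10.8.0.0/24 stands in for your own VPN subnet):
+
+ufw allow from 10.8.0.0/24 to any port 3306 proto tcp\n
+ufw deny 3306/tcp\n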
+
+To export the database to the replica server, on the primary server, stop the web crawler and hide any web forms that can accept new data, then open MySQL and do the following. +
+USE wiby;
+FLUSH TABLES WITH READ LOCK;
+SHOW MASTER STATUS;
+
++------------------+----------+--------------+------------------+-------------------+
+| File             | Position | Binlog_Do_DB | Binlog_Ignore_DB | Executed_Gtid_Set |
++------------------+----------+--------------+------------------+-------------------+
+| mysql-bin.000055 | 15871269 | wiby         |                  |                   |
++------------------+----------+--------------+------------------+-------------------+
+
+Keep the above session window open (or run it in a screen session). +
+Copy down the information from that table. In a separate session window, export the database: +
+mysqldump -u root -p wiby > wiby.sql
+
+Once you have exported the database and recorded what you need, you can unlock the tables, and resume as normal. On the session window displaying the master status: +
+UNLOCK TABLES;
+
+You can now close that window if you want. +
+On the replica server, import the database: +
+mysql -u root -p wiby < wiby.sql
+
+Login to MySQL and type the following but replace the IP, MASTER_LOG_FILE, and MASTER_LOG_POS with yours from the table: +
+CHANGE MASTER TO MASTER_HOST='primary.server.ip',MASTER_USER='slave_user', MASTER_PASSWORD='d0gemuchw0w', MASTER_LOG_FILE='mysql-bin.000055', MASTER_LOG_POS=15871269;
+START SLAVE;
+
+To verify that the replica is synchronized, type the following on the replica in MySQL:
+SHOW SLAVE STATUS\G
+
+Make sure that: +
+Slave_IO_Running: Yes
+Slave_SQL_Running: Yes
+
+In MySQL on the replica: +
+use wiby;
+create user 'remote_guest'@'%' identified by 'd0gemuchw0w';
+grant select on windex to 'remote_guest'@'%';
+create user 'guest'@'localhost' identified by 'qwer';
+grant select on windex to 'guest'@'localhost';
+FLUSH PRIVILEGES;
+
+
+

Load Balancing

+You should run the core application on one or more of your replicas and have nginx send traffic to it; this way you can reduce the burden on your VPS. The replication tracker (rt) must run in the same directory on the same server +as the core application (not required for 1core).
+
+Add the replica server's VPN address/port to the upstream remote_core {} block in the default nginx config (see the provided example template). You can use the VPS as a backup instead by adding 'backup' to its address (e.g. server 127.0.0.1:8080 backup;)
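+
+A minimal sketch of that upstream block (the replica's VPN address is a placeholder):
+
+upstream remote_core {\n
+    server 10.8.0.101:8080;\n
+    server 127.0.0.1:8080 backup;\n
+}\n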
+
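+A minimal sketch of that upstream block, assuming the replica's VPN address is 10.8.0.2 and core listens on port 8080 (use whatever port your template actually sets):
+
+upstream remote_core {
+        server 10.8.0.2:8080;
+        server 127.0.0.1:8080 backup;
+}
+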

+
+
+
+
diff --git a/html/about/index.html b/html/about/index.html
new file mode 100755
index 0000000..d388a71
--- /dev/null
+++ b/html/about/index.html
@@ -0,0 +1,24 @@
+
+
+
+About
+
+
+
+
+

Heading

+

+Write your about section. +
+
+Send Feedback

+

+

Additional Features: +

Developers can connect their applications using the JSON output available at /json. +
+
+Privacy Policy +

+
+ + diff --git a/html/about/mug.gif b/html/about/mug.gif new file mode 100755 index 0000000000000000000000000000000000000000..55c4f27c3aaed3e20bd02a41345979791aa872be GIT binary patch literal 1465 zcmV;q1xETuNk%w1VH*G;0QUd@5fc=9iIGW9QtrjEIa7ISlB*~+LxpZyjIF)3u(oJV zN+c{beXq<}Z+Vuv%O*Tp(WH+;KSH;PbuUk7IXOP)3(t@?sX=`#A8z8f_ z!SC+xSV>1WU3^7PXQgy!G&VdZGCVp#O-yES;o#w~uC`}yhnT$1yQZZ}X@$3ibwpf# z!i;!|yVf^ig+FVKPf1S{78!VO0cxpjJPneUMj
  • zJw6>GD1>2Mdau-7bD2?oqIOtQHC}-~WQ1u~O@n7yj&Et#v#aXp>4&)1u7!EfpN6)3 zX??QQ){KcjHauNUN?T7=mci6HWrx_HjLXHvkiONKb!$OALqA1Ta)hF!qpL7QU4wvz zRB4N4XKwJyy=RQ1K3HUqmaJHMi|D?q(a+Psz{B0CnnhV=adLSuNo!SWdxT|OH&$%i+}xv$%aMQc(8OF~aD}qUba{|VT5yS} zxWAHlb99H3Us=nPgTa)Ex4X;Jn1eD;b9Q!qUuS1UMoOf9aIJf9ifmyfJWi^pv1NLP z)2fupu&Y~jl~Pq(G)iFc%ef*hJ9}keIaz}qDKl1HUhUb_kY`vUGfACqXTNiCkyuuU zb#%eY*K|=xH#IkMgpuaDq-9A)OjK6Y)7v^gNjyhX!J&&jXN#G5VR3DDxr=*UL`J1| zYr(Io1 zfepLDCo@<_)#`O{tJ0}Bad4elXRUAup6EP`3!rx|ENXG6;es63ni_Nh1kCdgi)Q1c z>m)lH;7ssfYX|Ji(@b^K=cT74_v$qOSDaH4tR1%FbKJhy0si6|UHbOm1D9jP0t#F> zaa^C`B1b%r8M?fBH^GhL%vp|4fY%ZK4lyql>v^vctOE@FcJ2^r=1pBa-_+~3r;vxR zCWn~(|MZPB4`YojK@DLCEVGOpt-yyGeDMju%sjtsg+NTyP=ne5*2K{YG0ViY4pdVu zM%0BDLIzkI;C(ocHGfT4T`Zd{B+e#>Y+{Kbr{Dq|cyWNJ23*#twplDOBpHJuOe)gA zdZn$?3N8uGqs9TLl}LjkB5;|YDfZM);|R0Qn~EHf#}QK;5dvIS zKDdm46RN@HI4uB$0R~45>eh&AtRulX3Nq6tRX>@~LJch(QK+RySRkxybSyj!J(}D?-#)3#J9*|Ij2dE9HAy~+b;{h{@jWcQxTGXKH zvNc$#hNcJ}kfx(kwGc!QwJ1x%vLv9h&OG}eIF(|>wjf9zXiUH?31ZZ+hco0o^TbKO2h_ZP3t{mA_}@>6Owm9PlyECd5*7UL?_`L|`(IFsG=Tv;`p!d5J}>;> TOl+kc2 + + +Privacy Policy + + + +

    Privacy Policy

    +Include your privacy policy here. +

    + + diff --git a/html/about/wiby.gif b/html/about/wiby.gif new file mode 100755 index 0000000000000000000000000000000000000000..87a6a757743b0e99df6d3ef6afee72a1e98f80be GIT binary patch literal 2150 zcmeHI?N`!y9{nMpNNPCwtU01&mgOkp8Z~D4ESjqjG@4J4BB8NrrIo4iQ}7{|(8fYE z)1TxRqN7ZaIcECBAv2w$He;jiIx9}hS;yRlDL1WY*=GKReYy9=eerp5@45G!&$-dG zD9V0L67(VT8iG(Ll&Y7G005)F!6UqAPHBNed9d&{0s{%6$vi8bn3!4}Hg&3pB6&3i zgF&fOHh1Kvew+yo)dqz#Jf!tEQ$*>VeUrWD$z-BsoXsqib!v2=1}J(45dk=oSKrw?B#8$q z03;}I=t!=t226Z2qf`&|_VgwQBlX+1%Vr;EN@#?^C?k zHEe?4`^JO)(J2!(6AFd8uu6t>0EsWq2tZH&XdL(A>gp=%bbUfbVI;p6=^5(i9fJOk z|3w4DP0SxMdw%ENf5HN1f)w|hWBS^fZN1+3JpWkRHd;mSa)_8qY}b`THe|QknHFQ) zOR0WYlzdT)XbYYEyph0-zFhiHH()6d!1g}%(@AnO@s*WzIIk6J^q zT~E@Nyx~}DDkc60g0q%E48C*gNq4;ZsbJK@+!1eOXV8<`Lfq|yfrsy0`&CJ*!^yX@ zLmTl7)DD}-9o(|MRp8<;y*of^u4LXVlA* zc~0ln8?uT^JHF#zkZ9%#_fU-xhOso}L|=#(JEmjGPJGHFZFr;65F=VTH#iL#s8F3w^MQZbclAg+~i(r2@%hZj}acDniUS7v4D)Ky) zSJ-clp0@oQg$XG!n%nO6&rd6gx>^^_+v&98>dM%qjb!@W-w&l33M!q^=QkX9HV>?Y z;xv|(5IW@^GPlSu3T8)aLU2?e_6)y|lM=OkPR^`3Qv@?v)m7JSF!W2$pkp!X=Dd*r zQ+^hYSPVqTy7Mvze)Nj179T=ohcz8m^BWwgHuDkNTgmJ$=B-$+yCr0{Fi)ILUy?J` zm$|cad5+MWkz|}@x@KZh()cIWjfQT!RCvUrh2u?P5buwZgsS6gM;`iQ%}HKdu>PYA z--X#-U$Dhgen<43=??RUv4xR!(@#^dm!?nMwZ~nrpH|*5pPT&}a!e$IlP(4T-8eZv zTh!e`gpc!GC}gufGoSpg@0HH~yX`JIkZ6x>|B1UdzX6`qfgO-DI7)zO=8LVY*b(-H z9s$*ktVS4Z_8+ad!FFpn6(S`PS!(((3u2E94(0%4&A_6vEUgHp7LbA1cnDcs?l)BX zMl4&1y`?kC2+=_L_I}bZof2#5HPL*4f1R&V#Zm*4%ZbistrdPvdSY$?f zq&XV(7u|`}6Z)>5vMQT5hOBLrdmh!Yes5_>!B9?HSj<;d1yyFs*!8;usbu{5Hf_K% ztCF_wWBiOiJMdqkNRA*p+?7;aN%-r;ZhU_DS0zGBbaflf1H+MCIxEwnZtZ#bW0L;Y z5p_>$_lfi`<8a~Ugr1n=m4Vmwu5bKNiv~7MI`V!V?vG&o_H;!wsmiU>gKh@nqJu9b zdERN`-hee;)>mbF>G!5wL1}KDLxPU zwi4LTXAYH-dzb5K%P`NJYx{)zg4=8*k6@g80OkYA70k&u6P=Tg?_>o1v|*RL`*~m3 z{xDEmHmSG5T0U|N)Yp@INOngzD~XWp)i=txnG26 zuUsF$W?OCkkvQ(guMu~2hnDMXehS`ci3j0RWe!^GZ?j}VTRSFh9aj~A`~S@k+jKT4@S(`Nv;gBOgJUdm2FH-cGfryOKo?C zwcIt1@7LjdkRwj&91iL@weTqIINIr`oWAUybN+z)7u?qmuOD8|-=2@hD}-`nzh4pq z+6ujaAQ*<#nkYE{To{BT73K(4ToRCTfl%HamRdBS)g8=jOU~nujZHLZ`xX}$BQvk4 zR4T3$kT!L`di9E%1juOuX{AxF?%YM9^J~>$!Wfk!;(F<{-Gj@^%VC*frKUIRvy24p z4H~ORl)Ea0r?@ozgXuZ02YC1H9Y_LEtSh40CJ79Q(47w51^nZ73%iE(LZ+~kb^gG-W;T*U8&cutSl=ueTU-DWEM-+ znLteiiRUG2Mu`vx$rL55SgMt)Bh!kyJKGnR^l~{6W&#RJFroXP@|x=I8`eG6iM+;) zMxz`CP3^-#02D2~v@9__qqs@aM@h^Bxns$N5->P=D48!O0f_)mQn~tmw%Ov3435nV3~5w7HigDlZ_U0-ywPC1}qCEuDjUz5Wm_V{L8iFiRAfm@B-Y zO}Zc*`JV=y)=dBAJOApxK4FeBLfud_ z+$D8|AT*L+n@79jVYI-YN>xFf>DF)^?;*6tF*(IuvN!*)p}hK~|B z2S+#)ZJmOg-NY&PLVfoZr0l=yLM?_M)K9l{BS>EdvQ#|bG4IiKKh_z z*tue!N7Tqi&Z+7pdI#HPca_KVGEJMjsCRui2|Sp+M{nW3_{Fk}sxWf&xdrpBRc>i6{|JBx zj_5zqPxjkk>5Uv}T)$;k!L@KCXZVh#CmQ-We=Uk1#9K_rv%46bFy%n2`*hzm2!Z+r z%CRZ49i_)pTDjPDY~+ZkMvD9c15&Gy+KWqWq{4a!R*-c*%;@ATH>LY*8-vd#>? zZ*HG>jPaO8I_o6e7?ZjWB9!gAIb&D5pqjDdbr?%WicCtt5p4d$o13kb# z0M3mazod(rvw2EdG~h;$3=FJ~%N;IFcMm4hF^YTHM5oqV{b}aK%l<_1WJLz*dWn1X zKwaK=wjnjty)lIu!d$Gzl5tLuxh<#MW~{PMd$JR4=}zr5U_ulMJ?V6v833JaF)NgXp?|d5#3c*DG!O}A>1l0VjFUHHkB_Gz$G}ic)N1T16xVx?^t!7D1w9*tG#+}1! 
z{vh7*A{XH3?~yeP=kJGRZ+^1%H>|{G2Z4BVGc@-c?=WeG`Dd&fVsZh`56qS&URXPt zik;M$?z-&Gt1YWA-g||sGfPL7HD5VCgOzpryxbjPA1rQoUL6Q+KNT&5*G0doiX^-2 zEV4plx(Qys&QCXr3}Ja+D>_?u{!;wYXIzWpFD-s`97V^)#OkgOgano$v2DrY`4#8* z+uT~X2yZcS!G}1mTE`MIzma6})ff3HUt%3R4)yej6nFb$1iO4T__Na<>MV zwc&PfA`iT?#W3~~F{e*dbd4)L?Zmg5$ph;vv%lGB`y>RqTC!`NRbr2R#dlSKwBI~p zHbhUgY(A^D8m-*A`cgI)xS>s2h#uM}IFBR+Jh488LRg{jyidB?%!`J?r;%T|z%C@; zOROQd!)AU@sd71v3pIK6FHr+MPrwoRX&8GvYS;c4ECCCgaWCX8N6C9)o&z+Yy41Dh z-$%{g-nF|$Z%99#R$*$>?EFA>zxp-a+LOK`T*vC2t07y|?&i*W8w7`rW>|S)gJrK( zoS^Y2gsC^tzUPTccH^Wh5+Vg9_|qhfPK_%#Y9eu@zBAjr#4Zy}5Sb41>e@vWHSVz& zh?hHhs<3<0GmbyAobI`^n%JnDDE$3}kJ#|c5~j2E{ZK&XU(~ha6snnay^IIWi`uKg dTPdz#vv-fZZms$5*2#Tiv-b*oTVWVF@L$uGu-^ax literal 0 HcmV?d00001 diff --git a/html/about/wibyplex.gif b/html/about/wibyplex.gif new file mode 100755 index 0000000000000000000000000000000000000000..cd5a5d129e7b007aced5ea69b55a53f429911d50 GIT binary patch literal 25753 zcmWifd0b5G`^V2R`(D#B?UPbzowQ9VX4+Rmh~k+P4JDI`l;xb6X+a3_gfNPp5C&ni zaHdTWrX)RtDIuh1?0s$X>v#S*uk+{qdflJ<{#@7l`kaWUaK3*^5ikgR2LK~H<%E3A zUm+eTDI46agNsvRvdv5b?5vK2x_H^yW2*v|hDFX`*@b$$1VZ>X?MK851xVV$efy79 zu21o5-{oab<8 ze4dpkDWzypWM$@G0md8)0(*fSZJryiI>BdC3}2nQXn+1np{r*_#*!r#<_<>2%e?Gk zU8bvZq7!_)iWYfw$v3VK@`(=(n&aZ+Vqv*0De9qYQH;I$Snk4;xhuCVoO5NVcYCe8 zBragqjpy3G{n(_mUk|Gv7sM19@Pv-_z0VbRLYfRNn~5?cFWLC?tn5O? zjy3kw;e?el>=P#*+&C8P|0*X+|7Y^hZ~eQ|8ye*4V~009TsQyq^U;rocLq?6-Zd0m zD9X`2==JehF5g=6_~6k(@yd4bSX8Y0(oyt@Bfg7Bh^} zVm#MRGt)h)3!f3v;ThDmJMZ7~Wl1wF8}iGro~?sVR-%71_7*txPiS;nH4Ei0ckmA^kCev%QfXO_>2ow+Xy zqmIRRjFhdi{T~ka|Ht+JU;lq5h$KC5aM&U;-c&6kv7Id=nSZsF(5#~6k=ZLdig6y% z?ddGL@EuI+K8v@KQ{4x?1Q#1^{v@h!GO?h1k@nq7S6Fv8xCARnE!FGKzq@ne zfMeLcIX)xCOB>eTQ!Tz|BwT3!#CV&>3VGPkYpy$PL)J0JUHxaS>axd4Qd`&EX58CL zyjeE$%lWs5^A^IzPp@CiI5p`+du|>l5~6o#$D3kr5XrP=n{!qxt}$^h#U0-gt2wzwM}+~X=G*FGTixxw;8NV>0Z;?az(Y#{Coo} zNBmVCm$P7+1GDedMMvUh02?~z->{U~ zXLidg_FnQ?wkUO^hL+F&H*DUK=x}}Q$nlvYtOo}^bZ@Sy`hZ7_x6WJfv07SmtcIKS zrXp&d5i4@j&c$Jo-S^P2a^vCb`LiR2st>OiG`rS*qiCK{STo`H<+Cluh~$W|0h3YD zc-Y3T^}YqROV`5Rd;YJMI%7ZauQS@!<<%X#3F?oXajP#e9w(l@9oHq2_v{F34Wt_z z;LoP6w4WBujJ)J7cD}IW{V~Udv+T}$+`qoadPU3ANB+OG%7d3yEs6-vJDzv#-|h2P zCJoE)d9MDlbiwh{=MOD*cXhuJxi!)#WY$t2!}`oadHCJg_wU-7e<=t%(^NPj@z@&B z=y+}0i`rMdY)RX46ZW5V8Q-5pd-X9u+ZA zlvmp{5H+@h_t4R6>&NGm#Y-BV2c{Ot-Onz58}IwmQbWkdIMlcLOh@I{Eq0E;Z2QnF zuCveZ@{AH*7uOvKY5Hp{>+|HRvl&Z1_vQyaNs0)`+y>{sKd$ZWrR`IEIpJ(f_kXM-2IbZQUv(!f(ysnvfPrWPW+K*#fe&kty)0|DL*cFWFK=59@ zyW5+5q0{|d;1b^>#@X=5JKD9zF|gFyo&?O+D;MBSbBkt}oGO{woNB(W9dc z(SsxHtMJdxE-nv&n=b%LVc)q8%iA9xA3fD<(RyjY0WfB1a;1r-7TKLu)mMRC4FbCi zVnYJ&Xn*xt$u#Y;%6Db^b#^Ad>c(9ga+l)KR_HSr{OkByQxqcs0!`*U| zo-6u^?=D>`>6Psdyz=VWn{tdQOg6bu*{Xj~JNatQ&~UfI#?ObTR{zbO*1*kI>yl`rDYH;!6yx;D(rd$V=_10=1#Bw)rwki$5okQ?vuTilWO^YC$xtGUyR z%u2i>9_=Rv0@u%)=bTOd_^P@Nss8uin{!Xr>@bXKyO}rRjO%~9ih*kz8OrYWl%pR6 z>1`YTxsV^`*;rP>%<|zfU8A!eE=)f?b8Vwu-~QoR%u`xW#LgfG)xh@uWu8eY-r(+V zVIJ3Z*zTx%dsv5oGq8H0wEKcq27Ul`o0y{%M!6g&X~TZ0q{J00qyVSIS!D%fcS@WiQf6*W*9isSN_1y*guuiEe+R}kEmzZ(4ddw$CIr!fx(6I;x? 
zvf+(AHqR%EadTz^WJ3(@7AwF*wo1vu5K~-X=pq98C@x5Ol}HtMvXl{2N<3gAz=+m> zEXeojszR{{+T?_=B6%n?6w@2}u!0W_(_}c>zsx}H<@Kv2hyn0F0D107-ct{P{FM}H zew)*Mh+u;V$p(9nl^gMgOcPtJH_j?y-cMm#)dsPz2m46I@Ml6=2t3k zHJoqWwV;#@)I42A>L9CjlWeuN&}>meLn<=iIV6JKC=L9^Y8FgyD8*TEaso3?IgHJ9 zp`0lVw>P{zc;_rPG=hJb-R0k`Zg**jG$r_#3UP+toH)4g(Ws&)pFbilJqj>TP`M#9 z&C;e{^Q2Ua;le?JZL)|OGS)`zG(b$8Y>nbrQZhxf!*NhH=s|=zwVpefWg|F|1|=7> zKvo*aSr4djN0j)3M3Lpl1p!ek+OdbIWmF!A6W^VxKJ!xW&hYY~>2LS#j0|0Ay>^Kt zOMda}$aQbO#;!Q$Tw6xAcBQo!f97@y>ys6?`q;eyA}3HJwF(6|HI>DBU_4uD6{{tC zaIQ?h5cqz1c`zwSC7b>lp!tpgoLN8{d9eucQZ(Ufv@n-PN!SEAasjR%=n+pyLg;-y zjM9$;=PT~L8pk7HHWI0w=p?R@MLy}vOxd*265N!<2T&ji-8N*w;s|FtrYo9TE-y} zc|8|53%~~}Gu!KsIkZ$OYE#x0$Cg26AkAS)q2P77UbAjw?&{ zDXp3DEV584Rc#rGr@L|(>u=i-1tFyhGN#9vQY{FI=y)ZZ#AA49SG|cRuvwr3K%1u7 zJevi^BE)Y1w4R&EmOvp|cxgUVQ(nx~fEFyEhDY$1zzsSATS<)*Pz@Bg?}}Y)1#X!G zuA>XTRg{g_OdIH%cGo-apWD_mtEY`9O2SwS4*=ju7={LTw<6~D(g2T>G?!9pxegfN z5_okBtm7d=fOeBhf_3&kmcSl7moZ1+p(Z~`vfV?6V{4#uPPrji<|cr!6@VTpa4dxA zh!7v@Ku1wlu!wL-2N^4=hFvRrB}8^S(?UQM38~Isq5shMA)WPKDN%pTFBAVXZb zxLPn+hdT*CAtExZG}B6xGyQS)-@cS@n;-m+9V?TPpxtp z)1O=8_2ihNWamSLe7(|%&Letr&71|$wqh`PCupZQ75Bpe8%}p-0sksU*UK4YUoE;= z)b{}JUcg*CKg4+@CtHC3S3(2tP;f`4>f2KLO^kQNBu5r}h(#H8z@0yXh$o?-A6s|V z;zZvFZW7`q3GByXtcr&P5+G1tj*9~5Q9NiLg>DC&nX9dn0Jt>*IF1XY#Dn2nBA