Update htmlparse.h

This commit is contained in:
wibyweb 2022-10-17 23:59:26 -04:00 committed by GitHub
parent 7382848607
commit 6147404a86
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -85,15 +85,15 @@ void htmlparse(){
for(int i=0;i<fsize;i++){
c = fileStr[i];
//use a rolling window of 100 bytes to detect elements, ignore control characters and spaces
if(c != 127 && c > 15 && c !=32){
//use a rolling window of 100 bytes to detect elements, ignore lf/cr/so/si/del/space/null/tab
if(c!= 10 && c != 13 && c != 14 && c != 15 && c != 127 && c != 32 && c != 0 && c != 9){
for(int j=0;j<window_len-1;j++){
window[j] = window[j+1];
}
window[window_len-1] = c;
}
//use a rolling window of 100 bytes to detect elements, but permit space, ignore control characters
if(c != 127 && c > 15){
//use a rolling window of 100 bytes to detect elements, but permit space, ignore lf/cr/so/si/del/null/tab
if(c!= 10 && c != 13 && c != 14 && c != 15 && c != 127 && c != 0 && c != 9){
for(int j=0;j<window_len-1;j++){
windowWithSpaces[j] = windowWithSpaces[j+1];
}
@ -102,14 +102,14 @@ void htmlparse(){
//Get Title
if(titlefound == 2){
if(titlesize < (title_len-2) && c != 127 && c > 15){
if(titlesize < (title_len-2) && c!= 10 && c != 13 && c != 14 && c != 15 && c != 127 && c != 0 && c != 9){
title[titlesize]=c;
titlesize++;
if(c == 39){//check for single quotes and double them up for sql safety
title[titlesize]=c;
titlesize++;
}
if(c != 127 && c > 32){//some titles are just a bunch of spaces or garbage, need to check for that
if(c != 32 && c != 12 && c != 13 && c != 14 && c != 15 && c != 127 && c != 10 && c != 9){//some titles are just a bunch of spaces or garbage, need to check for that
emptytitle = 0;
}
}
@ -239,8 +239,8 @@ void htmlparse(){
}
//Get Body
//exclude remaining tags, comments, scripts, styles, control characters, add a space after a '>' but only allow one
if(intag == 0 && incomment == 0 && inscript == 0 && instyle == 0 && inlink == 0 && c != 127 && c > 15 && bodysize < (body_len-2)){
//exclude remaining tags, comments, scripts, styles, and cr/lf/so/si/del/null/tab; add a space after a '>' but only allow one
if(intag == 0 && incomment == 0 && inscript == 0 && instyle == 0 && inlink == 0 && c!= 13 && c != 14 && c != 15 && c != 127 && c != 10 && c != 0 && c != 9 && bodysize < (body_len-2)){
if(putspace == 1){
if(spacecount == 0){
body[bodysize]=32;