add code documentation for the tokenizer (#9354)

Also clean up/simplify the code a bit.
This commit is contained in:
Pentarctagon 2024-09-23 13:29:43 -05:00 committed by GitHub
parent feef53d4ab
commit bd8322a40b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 227 additions and 111 deletions

View file

@ -27,7 +27,6 @@
#include "log.hpp"
#include "serialization/preprocessor.hpp"
#include "serialization/string_utils.hpp"
#include "serialization/tokenizer.hpp"
#include "serialization/validator.hpp"
#include "wesconfig.h"
@ -128,10 +127,10 @@ void parser::operator()()
tok_.next_token();
switch(tok_.current_token().type) {
case token::LF:
case token::NEWLINE:
continue;
case '[':
case token::OPEN_BRACKET:
parse_element();
break;
@ -195,7 +194,7 @@ void parser::parse_element()
case token::STRING: // [element]
elname = tok_.current_token().value;
if(tok_.next_token().type != ']') {
if(tok_.next_token().type != token::CLOSE_BRACKET) {
error(_("Unterminated [element] tag"));
}
@ -210,14 +209,14 @@ void parser::parse_element()
break;
case '+': // [+element]
case token::PLUS: // [+element]
if(tok_.next_token().type != token::STRING) {
error(_("Invalid tag name"));
}
elname = tok_.current_token().value;
if(tok_.next_token().type != ']') {
if(tok_.next_token().type != token::CLOSE_BRACKET) {
error(_("Unterminated [+element] tag"));
}
@ -240,14 +239,14 @@ void parser::parse_element()
elements.emplace(current_element, elname, tok_.get_start_line(), tok_.get_file());
break;
case '/': // [/element]
case token::SLASH: // [/element]
if(tok_.next_token().type != token::STRING) {
error(_("Invalid closing tag name"));
}
elname = tok_.current_token().value;
if(tok_.next_token().type != ']') {
if(tok_.next_token().type != token::CLOSE_BRACKET) {
error(_("Unterminated closing tag"));
}
@ -290,7 +289,7 @@ void parser::parse_variable()
std::vector<std::string> variables;
variables.emplace_back();
while(tok_.current_token().type != '=') {
while(tok_.current_token().type != token::EQUALS) {
switch(tok_.current_token().type) {
case token::STRING:
if(!variables.back().empty()) {
@ -300,7 +299,7 @@ void parser::parse_variable()
variables.back() += tok_.current_token().value;
break;
case ',':
case token::COMMA:
if(variables.back().empty()) {
error(_("Empty variable name"));
} else {
@ -332,7 +331,7 @@ void parser::parse_variable()
assert(curvar != variables.end());
switch(tok_.current_token().type) {
case ',':
case token::COMMA:
if((curvar + 1) != variables.end()) {
if(buffer.translatable()) {
cfg[*curvar] = t_string(buffer);
@ -352,7 +351,7 @@ void parser::parse_variable()
break;
case '_':
case token::UNDERSCORE:
tok_.next_token();
switch(tok_.current_token().type) {
@ -370,14 +369,14 @@ void parser::parse_variable()
break;
case token::END:
case token::LF:
case token::NEWLINE:
buffer += "_";
goto finish;
}
break;
case '+':
case token::PLUS:
ignore_next_newlines = true;
continue;
@ -400,7 +399,7 @@ void parser::parse_variable()
error(_("Unterminated quoted string"));
break;
case token::LF:
case token::NEWLINE:
if(ignore_next_newlines) {
continue;
}

View file

@ -19,6 +19,7 @@
#include "config.hpp"
#include "configr_assign.hpp"
#include "serialization/tokenizer.hpp"
class abstract_validator;

View file

@ -52,8 +52,6 @@ static bool encode_filename = true;
static std::string preprocessor_error_detail_prefix = "\n ";
static const char OUTPUT_SEPARATOR = '\xFE';
// get filename associated to this code
static std::string get_filename(const std::string& file_code)
{
@ -482,11 +480,11 @@ void preprocessor_streambuf::restore_old_preprocessor()
preprocessor* current = this->current();
if(!current->old_location_.empty()) {
buffer_ << OUTPUT_SEPARATOR << "line " << current->old_linenum_ << ' ' << current->old_location_ << '\n';
buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << current->old_linenum_ << ' ' << current->old_location_ << '\n';
}
if(!current->old_textdomain_.empty() && textdomain_ != current->old_textdomain_) {
buffer_ << OUTPUT_SEPARATOR << "textdomain " << current->old_textdomain_ << '\n';
buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << current->old_textdomain_ << '\n';
}
location_ = current->old_location_;
@ -866,10 +864,10 @@ preprocessor_data::preprocessor_data(preprocessor_streambuf& t,
t.location_ = s.str();
t.linenum_ = linenum;
t.buffer_ << OUTPUT_SEPARATOR << "line " << linenum << ' ' << t.location_ << '\n';
t.buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << linenum << ' ' << t.location_ << '\n';
if(t.textdomain_ != domain) {
t.buffer_ << OUTPUT_SEPARATOR << "textdomain " << domain << '\n';
t.buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << domain << '\n';
t.textdomain_ = domain;
}
@ -899,8 +897,8 @@ void preprocessor_data::push_token(token_desc::token_type t)
std::ostringstream s;
if(!skipping_ && slowpath_) {
s << OUTPUT_SEPARATOR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
<< OUTPUT_SEPARATOR << "textdomain " << parent_.textdomain_ << '\n';
s << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
<< INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << parent_.textdomain_ << '\n';
}
strings_.push_back(s.str());
@ -1050,7 +1048,7 @@ void preprocessor_data::put(char c)
if(diff <= parent_.location_.size() + 11) {
parent_.buffer_ << std::string(diff, '\n');
} else {
parent_.buffer_ << OUTPUT_SEPARATOR << "line " << parent_.linenum_ << ' ' << parent_.location_ << '\n';
parent_.buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << parent_.linenum_ << ' ' << parent_.location_ << '\n';
}
}
@ -1128,7 +1126,7 @@ bool preprocessor_data::get_chunk()
++linenum_;
}
if(c == OUTPUT_SEPARATOR) {
if(c == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
std::string buffer(1, c);
while(true) {
@ -1491,7 +1489,7 @@ bool preprocessor_data::get_chunk()
std::string symbol = strings_[token.stack_pos];
std::string::size_type pos;
while((pos = symbol.find(OUTPUT_SEPARATOR)) != std::string::npos) {
while((pos = symbol.find(INLINED_PREPROCESS_DIRECTIVE_CHAR)) != std::string::npos) {
std::string::iterator b = symbol.begin(); // invalidated at each iteration
symbol.erase(b + pos, b + symbol.find('\n', pos + 1) + 1);
}
@ -1521,8 +1519,8 @@ bool preprocessor_data::get_chunk()
}
std::ostringstream v;
v << arg->second << OUTPUT_SEPARATOR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
<< OUTPUT_SEPARATOR << "textdomain " << parent_.textdomain_ << '\n';
v << arg->second << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
<< INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << parent_.textdomain_ << '\n';
pop_token();
put(v.str());
@ -1672,8 +1670,8 @@ bool preprocessor_data::get_chunk()
} else if(!skipping_) {
if(token.type == token_desc::token_type::macro_space) {
std::ostringstream s;
s << OUTPUT_SEPARATOR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
<< OUTPUT_SEPARATOR << "textdomain " << parent_.textdomain_ << '\n';
s << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
<< INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << parent_.textdomain_ << '\n';
strings_.push_back(s.str());
token.type = token_desc::token_type::macro_chunk;

View file

@ -26,9 +26,9 @@ tokenizer::tokenizer(std::istream& in) :
token_(),
in_(in)
{
for (int c = 0; c < 128; ++c)
for (int c = 0; c < END_STANDARD_ASCII; ++c)
{
int t = 0;
character_type t = TOK_NONE;
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
t = TOK_ALPHA;
} else if (c >= '0' && c <= '9') {
@ -39,7 +39,7 @@ tokenizer::tokenizer(std::istream& in) :
char_types_[c] = t;
}
in_.stream().exceptions(std::ios_base::badbit);
next_char_fast();
next_char_skip_cr();
}
tokenizer::~tokenizer()
@ -59,92 +59,130 @@ const token &tokenizer::next_token()
while(true)
{
while (is_space(current_)) {
next_char_fast();
next_char_skip_cr();
}
if (current_ != 254)
if (current_ != INLINED_PREPROCESS_DIRECTIVE_CHAR)
break;
skip_comment();
// skip the line end
next_char_fast();
next_char_skip_cr();
}
if (current_ == '#')
// skip comments on their own line
if (current_ == token::POUND)
skip_comment();
// set the line number the next token will start on
startlineno_ = lineno_;
switch(current_) {
// we reached the end of the file being read
case EOF:
token_.type = token::END;
break;
case '<':
if (peek_char() != '<') {
// handle open/closed angle brackets
// most commonly used for enclosing lua code
// more generally is used to indicate the preprocessor should skip over a particular block of text
case token::LEFT_ANGLE_BRACKET:
// if there aren't double left angle brackets, there is no extra handling needed - this is just a regular left angle bracket
if (peek_char() != token::LEFT_ANGLE_BRACKET) {
token_.type = token::MISC;
token_.value += current_;
break;
}
// else, treat this like a quoted string
token_.type = token::QSTRING;
next_char_fast();
next_char_skip_cr();
// keep getting characters and appending them to the current token's value until either the file ends or double right angle brackets are found
// finding the end of the file first is an error since double left angle brackets must always be closed by double right angle brackets
for (;;) {
next_char();
if (current_ == EOF) {
token_.type = token::UNTERMINATED_QSTRING;
break;
}
if (current_ == '>' && peek_char() == '>') {
next_char_fast();
} else if (current_ == token::RIGHT_ANGLE_BRACKET && peek_char() == token::RIGHT_ANGLE_BRACKET) {
next_char_skip_cr();
break;
}
token_.value += current_;
}
break;
case '"':
// very similar to the double left+right angle bracket handling
// the main difference is the need to handle INLINED_PREPROCESS_DIRECTIVE_CHAR since double quotes don't affect the preprocessor
case token::DOUBLE_QUOTE:
token_.type = token::QSTRING;
for (;;) {
next_char();
if (current_ == EOF) {
token_.type = token::UNTERMINATED_QSTRING;
break;
} else if (current_ == token::DOUBLE_QUOTE) {
if (peek_char() != token::DOUBLE_QUOTE) {
break;
} else {
next_char_skip_cr();
}
}
if (current_ == '"') {
if (peek_char() != '"') break;
next_char_fast();
}
if (current_ == 254) {
// ignore this line and decrement the current line number
if (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
skip_comment();
--lineno_;
continue;
}
token_.value += current_;
}
break;
case '[': case ']': case '/': case '\n': case '=': case ',': case '+':
token_.type = token::token_type(current_);
// tag name delimiters
case token::OPEN_BRACKET:
case token::CLOSE_BRACKET:
// closing tag
case token::SLASH:
case token::NEWLINE:
case token::EQUALS:
// handles multiple attributes on the same line
// ie: x,y = 5,5
case token::COMMA:
// tag merge aka node append, or string concatenation
case token::PLUS:
token_.type = static_cast<token::token_type>(current_);
token_.value = current_;
break;
case '_':
// when in front of a QSTRING, indicates that the string is translatable
case token::UNDERSCORE:
// this check seems off - there are certainly other non-alphanumeric characters that shouldn't mean anything - but it looks like the parser handles those cases
if (!is_alnum(peek_char())) {
token_.type = token::token_type(current_);
token_.type = token::UNDERSCORE;
token_.value = current_;
break;
}
[[fallthrough]];
// everything else
default:
if (is_alnum(current_) || current_ == '$') {
// if alphanumeric (regular text) or the dollar sign (variable)
// not quite sure how this works with non-ascii text particularly since the parser doesn't reference token_type::MISC
// but maybe the default handling does what's needed
if (is_alnum(current_) || current_ == token::DOLLAR) {
token_.type = token::STRING;
do {
token_.value += current_;
next_char_fast();
while (current_ == 254) {
next_char_skip_cr();
while (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
skip_comment();
next_char_fast();
next_char_skip_cr();
}
} while (is_alnum(current_) || current_ == '$');
} while (is_alnum(current_) || current_ == token::DOLLAR);
} else {
token_.type = token::MISC;
token_.value += current_;
@ -153,59 +191,91 @@ const token &tokenizer::next_token()
return token_;
}
if (current_ != EOF)
// if this isn't the end of the file, get the next character in preparation for the next call to this method
if (current_ != EOF) {
next_char();
}
return token_;
}
bool tokenizer::skip_command(char const *cmd)
{
// check that the character match the provided text, else return false
for (; *cmd; ++cmd) {
next_char_fast();
if (current_ != *cmd) return false;
next_char_skip_cr();
if (current_ != *cmd) {
return false;
}
}
next_char_fast();
if (!is_space(current_)) return false;
next_char_fast();
// check that it's followed by a space, else return false
next_char_skip_cr();
if (!is_space(current_)) {
return false;
}
next_char_skip_cr();
return true;
}
void tokenizer::skip_comment()
{
next_char_fast();
if (current_ == '\n' || current_ == EOF) return;
// nothing to do if the line ends or the file ends
next_char_skip_cr();
if (current_ == token::NEWLINE || current_ == EOF) {
return;
}
// used to point to either textdomain_ or file_, and populate that field with the value following the respective command
std::string *dst = nullptr;
// if this is a #textdomain, point to textdomain_
if (current_ == 't')
{
if (!skip_command("extdomain")) goto fail;
if (!skip_command("extdomain")) {
goto not_a_command;
}
dst = &textdomain_;
}
// else if this is a #line, determine the line number and then point to file_
else if (current_ == 'l')
{
if (!skip_command("ine")) goto fail;
if (!skip_command("ine")) {
goto not_a_command;
}
lineno_ = 0;
while (is_num(current_)) {
// ie if the line number is 587
// (0 * 10) + 5 = 5
// (5 * 10) + 8 = 58
// (58 * 10) + 7 = 587
lineno_ = lineno_ * 10 + (current_ - '0');
next_char_fast();
next_char_skip_cr();
}
if (!is_space(current_)) goto fail;
next_char_fast();
if (!is_space(current_)) {
goto not_a_command;
}
next_char_skip_cr();
dst = &file_;
}
// else this turned out to not be a #textdomain or a #line, then this is a normal comment so just read off characters until finding the next line or the end of the file
else
{
fail:
while (current_ != '\n' && current_ != EOF) {
next_char_fast();
not_a_command:
while (current_ != token::NEWLINE && current_ != EOF) {
next_char_skip_cr();
}
return;
}
// clear the current value of either textdomain_ or file_ and populate it with the new value
dst->clear();
while (current_ != '\n' && current_ != EOF) {
while (current_ != token::NEWLINE && current_ != EOF) {
*dst += current_;
next_char_fast();
next_char_skip_cr();
}
}

View file

@ -20,9 +20,23 @@
#include "buffered_istream.hpp"
#include <array>
#include <istream>
#include <string>
// Marker byte injected by the preprocessor so the tokenizer can recognize
// inlined #line and #textdomain directives in the preprocessed stream.
// 0xFE (254) never occurs in valid UTF-8 text, so it cannot collide with real
// input; this use of an illegal utf8 character was added in a76be7ef1e921dabacd99f16ef440bf9673b8d98
constexpr unsigned char INLINED_PREPROCESS_DIRECTIVE_CHAR = 254;
// normal (7-bit) ascii covers 0-127, so lookup tables only need that range
// extended ascii is from 128-255, none of which need any special handling
constexpr int END_STANDARD_ASCII = 128;
/**
* contains the current text being parsed as well as the token_type of what's being parsed.
* multi-character token types will have a value that's a string with zero or more characters in it.
* single character token types are a single character with special meaning for a config
*/
struct token
{
token() :
@ -30,14 +44,23 @@ struct token
value()
{}
/**
* used for a token's type field
*/
enum token_type
{
// multi-character
/** unquoted text */
STRING,
/** quoted string, contained within double quotes or by less than/greater than symbols */
QSTRING,
/** reached end of file without finding the closing character for a QSTRING */
UNTERMINATED_QSTRING,
/** any characters that don't have special meaning */
MISC,
LF = '\n',
// single characters
NEWLINE = '\n',
EQUALS = '=',
COMMA = ',',
PLUS = '+',
@ -45,20 +68,42 @@ struct token
OPEN_BRACKET = '[',
CLOSE_BRACKET = ']',
UNDERSCORE = '_',
END
/** set when EOF is returned by the input stream */
END = 256
};
/**
 * Named constants for raw characters the tokenizer inspects while scanning.
 * Unlike the token_type values, these are never stored in a token's type
 * field; they only give the bare character literals readable names.
 */
enum source_chars
{
POUND = '#',
LEFT_ANGLE_BRACKET = '<',
RIGHT_ANGLE_BRACKET = '>',
DOUBLE_QUOTE = '"',
DOLLAR = '$',
};
token_type type;
/** the token's value, can be either a single character or multiple characters */
std::string value;
};
/** Abstract baseclass for the tokenizer. */
/**
* class responsible for parsing the provided text into tokens and tracking information about the current token.
* can also track the previous token when built with the DEBUG_TOKENIZER compiler define.
* does not otherwise keep track of the processing history.
*/
class tokenizer
{
public:
tokenizer(std::istream& in);
~tokenizer();
/**
* Reads characters off of @a in_ to return the next token type and its value.
*/
const token &next_token();
const token &current_token() const
@ -94,75 +139,78 @@ private:
int lineno_;
int startlineno_;
/**
* increments the line number if the current character is a newline
* set current_ to the next character that's not `\r`
*/
void next_char()
{
if (current_ == '\n')
if (current_ == token::NEWLINE)
++lineno_;
next_char_fast();
next_char_skip_cr();
}
void next_char_fast()
/**
* set current_ to the next character
* skip the `\r` in the `\r\n` Windows-style line endings
* the test_cvs_2018_1999023_2.cfg file also uses `\r\n` line endings for some reason - otherwise that check isn't needed on non-Windows platforms since `\r` characters are removed from cfg files on upload
*/
void next_char_skip_cr()
{
do {
current_ = in_.get();
if(current_ == '\r') {
current_ = in_.get();
} while (current_ == '\r');
#if 0
// TODO: disabled until the campaign server is fixed
if(in_.good()) {
current_ = in_.get();
if (current_ == '\r')
{
// we assume that there is only one '\r'
if(in_.good()) {
current_ = in_.get();
} else {
current_ = EOF;
}
}
} else {
current_ = EOF;
}
#endif
}
}
/**
 * Look at the next character in the input without consuming it;
 * the stream's read position is left unchanged.
 */
int peek_char()
{
const int upcoming = in_.peek();
return upcoming;
}
enum
/**
* the different types of characters while parsing
* TOK_NONE is also the default for anything beyond standard ascii
*/
enum character_type
{
TOK_NONE = 0,
TOK_SPACE = 1,
TOK_NUMERIC = 2,
TOK_ALPHA = 4
TOK_ALPHA = 3
};
int char_type(unsigned c) const
character_type char_type(unsigned c) const
{
return c < 128 ? char_types_[c] : 0;
return c < END_STANDARD_ASCII ? char_types_[c] : TOK_NONE;
}
bool is_space(int c) const
{
return (char_type(c) & TOK_SPACE) == TOK_SPACE;
return char_type(c) == TOK_SPACE;
}
bool is_num(int c) const
{
return (char_type(c) & TOK_NUMERIC) == TOK_NUMERIC;
return char_type(c) == TOK_NUMERIC;
}
bool is_alnum(int c) const
{
return (char_type(c) & (TOK_ALPHA | TOK_NUMERIC)) != TOK_NONE;
return char_type(c) > TOK_SPACE;
}
/**
* handles skipping over comments (inline and on a separate line) as well as the special processing needed for \#textdomain and \#line
*/
void skip_comment();
/**
* Returns true if the next characters are the one from @a cmd
* followed by a space. Skips all the matching characters.
* Returns true if the next characters are the one from @a cmd followed by a space. Skips all the matching characters.
* Currently only used by \#textdomain (specified by the WML) and \#line (added by the preprocessor)
*/
bool skip_command(char const *cmd);
@ -173,5 +221,5 @@ private:
token previous_token_;
#endif
buffered_istream in_;
char char_types_[128];
std::array<character_type, END_STANDARD_ASCII> char_types_;
};