add code documentation for the tokenizer (#9354)

Also clean up/simplify the code a bit.
This commit is contained in:
Pentarctagon 2024-09-23 13:29:43 -05:00 committed by GitHub
parent feef53d4ab
commit bd8322a40b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 227 additions and 111 deletions

View file

@ -27,7 +27,6 @@
#include "log.hpp"
#include "serialization/preprocessor.hpp"
#include "serialization/string_utils.hpp"
#include "serialization/tokenizer.hpp"
#include "serialization/validator.hpp"
#include "wesconfig.h"
@ -128,10 +127,10 @@ void parser::operator()()
tok_.next_token();
switch(tok_.current_token().type) {
case token::LF:
case token::NEWLINE:
continue;
case '[':
case token::OPEN_BRACKET:
parse_element();
break;
@ -195,7 +194,7 @@ void parser::parse_element()
case token::STRING: // [element]
elname = tok_.current_token().value;
if(tok_.next_token().type != ']') {
if(tok_.next_token().type != token::CLOSE_BRACKET) {
error(_("Unterminated [element] tag"));
}
@ -210,14 +209,14 @@ void parser::parse_element()
break;
case '+': // [+element]
case token::PLUS: // [+element]
if(tok_.next_token().type != token::STRING) {
error(_("Invalid tag name"));
}
elname = tok_.current_token().value;
if(tok_.next_token().type != ']') {
if(tok_.next_token().type != token::CLOSE_BRACKET) {
error(_("Unterminated [+element] tag"));
}
@ -240,14 +239,14 @@ void parser::parse_element()
elements.emplace(current_element, elname, tok_.get_start_line(), tok_.get_file());
break;
case '/': // [/element]
case token::SLASH: // [/element]
if(tok_.next_token().type != token::STRING) {
error(_("Invalid closing tag name"));
}
elname = tok_.current_token().value;
if(tok_.next_token().type != ']') {
if(tok_.next_token().type != token::CLOSE_BRACKET) {
error(_("Unterminated closing tag"));
}
@ -290,7 +289,7 @@ void parser::parse_variable()
std::vector<std::string> variables;
variables.emplace_back();
while(tok_.current_token().type != '=') {
while(tok_.current_token().type != token::EQUALS) {
switch(tok_.current_token().type) {
case token::STRING:
if(!variables.back().empty()) {
@ -300,7 +299,7 @@ void parser::parse_variable()
variables.back() += tok_.current_token().value;
break;
case ',':
case token::COMMA:
if(variables.back().empty()) {
error(_("Empty variable name"));
} else {
@ -332,7 +331,7 @@ void parser::parse_variable()
assert(curvar != variables.end());
switch(tok_.current_token().type) {
case ',':
case token::COMMA:
if((curvar + 1) != variables.end()) {
if(buffer.translatable()) {
cfg[*curvar] = t_string(buffer);
@ -352,7 +351,7 @@ void parser::parse_variable()
break;
case '_':
case token::UNDERSCORE:
tok_.next_token();
switch(tok_.current_token().type) {
@ -370,14 +369,14 @@ void parser::parse_variable()
break;
case token::END:
case token::LF:
case token::NEWLINE:
buffer += "_";
goto finish;
}
break;
case '+':
case token::PLUS:
ignore_next_newlines = true;
continue;
@ -400,7 +399,7 @@ void parser::parse_variable()
error(_("Unterminated quoted string"));
break;
case token::LF:
case token::NEWLINE:
if(ignore_next_newlines) {
continue;
}

View file

@ -19,6 +19,7 @@
#include "config.hpp"
#include "configr_assign.hpp"
#include "serialization/tokenizer.hpp"
class abstract_validator;

View file

@ -52,8 +52,6 @@ static bool encode_filename = true;
static std::string preprocessor_error_detail_prefix = "\n ";
static const char OUTPUT_SEPARATOR = '\xFE';
// get filename associated to this code
static std::string get_filename(const std::string& file_code)
{
@ -482,11 +480,11 @@ void preprocessor_streambuf::restore_old_preprocessor()
preprocessor* current = this->current();
if(!current->old_location_.empty()) {
buffer_ << OUTPUT_SEPARATOR << "line " << current->old_linenum_ << ' ' << current->old_location_ << '\n';
buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << current->old_linenum_ << ' ' << current->old_location_ << '\n';
}
if(!current->old_textdomain_.empty() && textdomain_ != current->old_textdomain_) {
buffer_ << OUTPUT_SEPARATOR << "textdomain " << current->old_textdomain_ << '\n';
buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << current->old_textdomain_ << '\n';
}
location_ = current->old_location_;
@ -866,10 +864,10 @@ preprocessor_data::preprocessor_data(preprocessor_streambuf& t,
t.location_ = s.str();
t.linenum_ = linenum;
t.buffer_ << OUTPUT_SEPARATOR << "line " << linenum << ' ' << t.location_ << '\n';
t.buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << linenum << ' ' << t.location_ << '\n';
if(t.textdomain_ != domain) {
t.buffer_ << OUTPUT_SEPARATOR << "textdomain " << domain << '\n';
t.buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << domain << '\n';
t.textdomain_ = domain;
}
@ -899,8 +897,8 @@ void preprocessor_data::push_token(token_desc::token_type t)
std::ostringstream s;
if(!skipping_ && slowpath_) {
s << OUTPUT_SEPARATOR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
<< OUTPUT_SEPARATOR << "textdomain " << parent_.textdomain_ << '\n';
s << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
<< INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << parent_.textdomain_ << '\n';
}
strings_.push_back(s.str());
@ -1050,7 +1048,7 @@ void preprocessor_data::put(char c)
if(diff <= parent_.location_.size() + 11) {
parent_.buffer_ << std::string(diff, '\n');
} else {
parent_.buffer_ << OUTPUT_SEPARATOR << "line " << parent_.linenum_ << ' ' << parent_.location_ << '\n';
parent_.buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << parent_.linenum_ << ' ' << parent_.location_ << '\n';
}
}
@ -1128,7 +1126,7 @@ bool preprocessor_data::get_chunk()
++linenum_;
}
if(c == OUTPUT_SEPARATOR) {
if(c == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
std::string buffer(1, c);
while(true) {
@ -1491,7 +1489,7 @@ bool preprocessor_data::get_chunk()
std::string symbol = strings_[token.stack_pos];
std::string::size_type pos;
while((pos = symbol.find(OUTPUT_SEPARATOR)) != std::string::npos) {
while((pos = symbol.find(INLINED_PREPROCESS_DIRECTIVE_CHAR)) != std::string::npos) {
std::string::iterator b = symbol.begin(); // invalidated at each iteration
symbol.erase(b + pos, b + symbol.find('\n', pos + 1) + 1);
}
@ -1521,8 +1519,8 @@ bool preprocessor_data::get_chunk()
}
std::ostringstream v;
v << arg->second << OUTPUT_SEPARATOR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
<< OUTPUT_SEPARATOR << "textdomain " << parent_.textdomain_ << '\n';
v << arg->second << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
<< INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << parent_.textdomain_ << '\n';
pop_token();
put(v.str());
@ -1672,8 +1670,8 @@ bool preprocessor_data::get_chunk()
} else if(!skipping_) {
if(token.type == token_desc::token_type::macro_space) {
std::ostringstream s;
s << OUTPUT_SEPARATOR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
<< OUTPUT_SEPARATOR << "textdomain " << parent_.textdomain_ << '\n';
s << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
<< INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << parent_.textdomain_ << '\n';
strings_.push_back(s.str());
token.type = token_desc::token_type::macro_chunk;

View file

@ -26,9 +26,9 @@ tokenizer::tokenizer(std::istream& in) :
token_(),
in_(in)
{
for (int c = 0; c < 128; ++c)
for (int c = 0; c < END_STANDARD_ASCII; ++c)
{
int t = 0;
character_type t = TOK_NONE;
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
t = TOK_ALPHA;
} else if (c >= '0' && c <= '9') {
@ -39,7 +39,7 @@ tokenizer::tokenizer(std::istream& in) :
char_types_[c] = t;
}
in_.stream().exceptions(std::ios_base::badbit);
next_char_fast();
next_char_skip_cr();
}
tokenizer::~tokenizer()
@ -59,92 +59,130 @@ const token &tokenizer::next_token()
while(true)
{
while (is_space(current_)) {
next_char_fast();
next_char_skip_cr();
}
if (current_ != 254)
if (current_ != INLINED_PREPROCESS_DIRECTIVE_CHAR)
break;
skip_comment();
// skip the line end
next_char_fast();
next_char_skip_cr();
}
if (current_ == '#')
// skip comments on their own line
if (current_ == token::POUND)
skip_comment();
// set the line number the next token will start on
startlineno_ = lineno_;
switch(current_) {
// we reached the end of the file being read
case EOF:
token_.type = token::END;
break;
case '<':
if (peek_char() != '<') {
// handle open/closed angle brackets
// most commonly used for enclosing lua code
// more generally is used to indicate the preprocessor should skip over a particular block of text
case token::LEFT_ANGLE_BRACKET:
// if there aren't double left angle brackets, there is no extra handling needed - this is just a regular left angle bracket
if (peek_char() != token::LEFT_ANGLE_BRACKET) {
token_.type = token::MISC;
token_.value += current_;
break;
}
// else, treat this like a quoted string
token_.type = token::QSTRING;
next_char_fast();
next_char_skip_cr();
// keep getting characters and appending them to the current token's value until either the file ends or double right angle brackets are found
// finding the end of the file first is an error since double left angle brackets must always be closed by double right angle brackets
for (;;) {
next_char();
if (current_ == EOF) {
token_.type = token::UNTERMINATED_QSTRING;
break;
}
if (current_ == '>' && peek_char() == '>') {
next_char_fast();
} else if (current_ == token::RIGHT_ANGLE_BRACKET && peek_char() == token::RIGHT_ANGLE_BRACKET) {
next_char_skip_cr();
break;
}
token_.value += current_;
}
break;
case '"':
// very similar to the double left+right angle bracket handling
// the main difference is the need to handle INLINED_PREPROCESS_DIRECTIVE_CHAR since double quotes don't affect the preprocessor
case token::DOUBLE_QUOTE:
token_.type = token::QSTRING;
for (;;) {
next_char();
if (current_ == EOF) {
token_.type = token::UNTERMINATED_QSTRING;
break;
} else if (current_ == token::DOUBLE_QUOTE) {
if (peek_char() != token::DOUBLE_QUOTE) {
break;
} else {
next_char_skip_cr();
}
}
if (current_ == '"') {
if (peek_char() != '"') break;
next_char_fast();
}
if (current_ == 254) {
// ignore this line and decrement the current line number
if (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
skip_comment();
--lineno_;
continue;
}
token_.value += current_;
}
break;
case '[': case ']': case '/': case '\n': case '=': case ',': case '+':
token_.type = token::token_type(current_);
// tag name delimiters
case token::OPEN_BRACKET:
case token::CLOSE_BRACKET:
// closing tag
case token::SLASH:
case token::NEWLINE:
case token::EQUALS:
// handles multiple attributes on the same line
// ie: x,y = 5,5
case token::COMMA:
// tag merge aka node append, or string concatenation
case token::PLUS:
token_.type = static_cast<token::token_type>(current_);
token_.value = current_;
break;
case '_':
// when in front of a QSTRING, indicates that the string is translatable
case token::UNDERSCORE:
// this check seems off - there are certainly other non-alphanumeric characters that shouldn't mean anything - but it looks like the parser handles those cases
if (!is_alnum(peek_char())) {
token_.type = token::token_type(current_);
token_.type = token::UNDERSCORE;
token_.value = current_;
break;
}
[[fallthrough]];
// everything else
default:
if (is_alnum(current_) || current_ == '$') {
// if alphanumeric (regular text) or the dollar sign (variable)
// not quite sure how this works with non-ascii text particularly since the parser doesn't reference token_type::MISC
// but maybe the default handling does what's needed
if (is_alnum(current_) || current_ == token::DOLLAR) {
token_.type = token::STRING;
do {
token_.value += current_;
next_char_fast();
while (current_ == 254) {
next_char_skip_cr();
while (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
skip_comment();
next_char_fast();
next_char_skip_cr();
}
} while (is_alnum(current_) || current_ == '$');
} while (is_alnum(current_) || current_ == token::DOLLAR);
} else {
token_.type = token::MISC;
token_.value += current_;
@ -153,59 +191,91 @@ const token &tokenizer::next_token()
return token_;
}
if (current_ != EOF)
// if this isn't the end of the file, get the next character in preparation for the next call to this method
if (current_ != EOF) {
next_char();
}
return token_;
}
bool tokenizer::skip_command(char const *cmd)
{
// check that the character match the provided text, else return false
for (; *cmd; ++cmd) {
next_char_fast();
if (current_ != *cmd) return false;
next_char_skip_cr();
if (current_ != *cmd) {
return false;
}
}
next_char_fast();
if (!is_space(current_)) return false;
next_char_fast();
// check that it's followed by a space, else return false
next_char_skip_cr();
if (!is_space(current_)) {
return false;
}
next_char_skip_cr();
return true;
}
void tokenizer::skip_comment()
{
next_char_fast();
if (current_ == '\n' || current_ == EOF) return;
// nothing to do if the line ends or the file ends
next_char_skip_cr();
if (current_ == token::NEWLINE || current_ == EOF) {
return;
}
// used to point to either textdomain_ or file_, and populate that field with the value following the respective command
std::string *dst = nullptr;
// if this is a #textdomain, point to textdomain_
if (current_ == 't')
{
if (!skip_command("extdomain")) goto fail;
if (!skip_command("extdomain")) {
goto not_a_command;
}
dst = &textdomain_;
}
// else if this is a #line, determine the line number and then point to file_
else if (current_ == 'l')
{
if (!skip_command("ine")) goto fail;
if (!skip_command("ine")) {
goto not_a_command;
}
lineno_ = 0;
while (is_num(current_)) {
// ie if the line number is 587
// (0 * 10) + 5 = 5
// (5 * 10) + 8 = 58
// (58 * 10) + 7 = 587
lineno_ = lineno_ * 10 + (current_ - '0');
next_char_fast();
next_char_skip_cr();
}
if (!is_space(current_)) goto fail;
next_char_fast();
if (!is_space(current_)) {
goto not_a_command;
}
next_char_skip_cr();
dst = &file_;
}
// else this turned out to not be a #textdomain or a #line, then this is a normal comment so just read off characters until finding the next line or the end of the file
else
{
fail:
while (current_ != '\n' && current_ != EOF) {
next_char_fast();
not_a_command:
while (current_ != token::NEWLINE && current_ != EOF) {
next_char_skip_cr();
}
return;
}
// clear the current value of either textdomain_ or file_ and populate it with the new value
dst->clear();
while (current_ != '\n' && current_ != EOF) {
while (current_ != token::NEWLINE && current_ != EOF) {
*dst += current_;
next_char_fast();
next_char_skip_cr();
}
}

View file

@ -20,9 +20,23 @@
#include "buffered_istream.hpp"
#include <array>
#include <istream>
#include <string>
// Marker byte injected by the preprocessor so the tokenizer can recognize
// inlined #line and #textdomain directives in the preprocessed stream.
// 0xFE (254) never occurs in valid UTF-8 text, so it cannot collide with real
// input; this use of an illegal utf8 character was added in a76be7ef1e921dabacd99f16ef440bf9673b8d98
constexpr unsigned char INLINED_PREPROCESS_DIRECTIVE_CHAR = 254;
// normal (7-bit) ascii covers 0-127, so lookup tables only need that range
// extended ascii is from 128-255, none of which need any special handling
constexpr int END_STANDARD_ASCII = 128;
/**
* contains the current text being parsed as well as the token_type of what's being parsed.
* multi-character token types will have a value that's a string with zero or more characters in it.
* single character token types are a single character with special meaning for a config
*/
struct token
{
token() :
@ -30,14 +44,23 @@ struct token
value()
{}
/**
* used for a token's type field
*/
enum token_type
{
// multi-character
/** unquoted text */
STRING,
/** quoted string, contained within double quotes or by less than/greater than symbols */
QSTRING,
/** reached end of file without finding the closing character for a QSTRING */
UNTERMINATED_QSTRING,
/** any characters that don't have special meaning */
MISC,
LF = '\n',
// single characters
NEWLINE = '\n',
EQUALS = '=',
COMMA = ',',
PLUS = '+',
@ -45,20 +68,42 @@ struct token
OPEN_BRACKET = '[',
CLOSE_BRACKET = ']',
UNDERSCORE = '_',
END
/** set when EOF is returned by the input stream */
END = 256
};
/**
 * Named constants for raw characters the tokenizer inspects while scanning.
 * Unlike the token_type values, these are never stored in a token's type
 * field; they only give the bare character literals readable names.
 */
enum source_chars
{
POUND = '#',
LEFT_ANGLE_BRACKET = '<',
RIGHT_ANGLE_BRACKET = '>',
DOUBLE_QUOTE = '"',
DOLLAR = '$',
};
token_type type;
/** the token's value, can be either a single character or multiple characters */
std::string value;
};
/** Abstract baseclass for the tokenizer. */
/**
* class responsible for parsing the provided text into tokens and tracking information about the current token.
* can also track the previous token when built with the DEBUG_TOKENIZER compiler define.
* does not otherwise keep track of the processing history.
*/
class tokenizer
{
public:
tokenizer(std::istream& in);
~tokenizer();
/**
* Reads characters off of @a in_ to return the next token type and its value.
*/
const token &next_token();
const token &current_token() const
@ -94,75 +139,78 @@ private:
int lineno_;
int startlineno_;
/**
* increments the line number if the current character is a newline
* set current_ to the next character that's not `\r`
*/
void next_char()
{
if (current_ == '\n')
if (current_ == token::NEWLINE)
++lineno_;
next_char_fast();
next_char_skip_cr();
}
void next_char_fast()
/**
* set current_ to the next character
* skip the `\r` in the `\r\n` Windows-style line endings
* the test_cvs_2018_1999023_2.cfg file also uses `\r\n` line endings for some reason - otherwise that check isn't needed on non-Windows platforms since `\r` characters are removed from cfg files on upload
*/
void next_char_skip_cr()
{
do {
current_ = in_.get();
if(current_ == '\r') {
current_ = in_.get();
} while (current_ == '\r');
#if 0
// TODO: disabled until the campaign server is fixed
if(in_.good()) {
current_ = in_.get();
if (current_ == '\r')
{
// we assume that there is only one '\r'
if(in_.good()) {
current_ = in_.get();
} else {
current_ = EOF;
}
}
} else {
current_ = EOF;
}
#endif
}
}
/**
 * Look at the next character in the input without consuming it;
 * the stream's read position is left unchanged.
 */
int peek_char()
{
const int upcoming = in_.peek();
return upcoming;
}
enum
/**
* the different types of characters while parsing
* TOK_NONE is also the default for anything beyond standard ascii
*/
enum character_type
{
TOK_NONE = 0,
TOK_SPACE = 1,
TOK_NUMERIC = 2,
TOK_ALPHA = 4
TOK_ALPHA = 3
};
int char_type(unsigned c) const
character_type char_type(unsigned c) const
{
return c < 128 ? char_types_[c] : 0;
return c < END_STANDARD_ASCII ? char_types_[c] : TOK_NONE;
}
bool is_space(int c) const
{
return (char_type(c) & TOK_SPACE) == TOK_SPACE;
return char_type(c) == TOK_SPACE;
}
bool is_num(int c) const
{
return (char_type(c) & TOK_NUMERIC) == TOK_NUMERIC;
return char_type(c) == TOK_NUMERIC;
}
bool is_alnum(int c) const
{
return (char_type(c) & (TOK_ALPHA | TOK_NUMERIC)) != TOK_NONE;
return char_type(c) > TOK_SPACE;
}
/**
* handles skipping over comments (inline and on a separate line) as well as the special processing needed for \#textdomain and \#line
*/
void skip_comment();
/**
* Returns true if the next characters are the one from @a cmd
* followed by a space. Skips all the matching characters.
* Returns true if the next characters are the one from @a cmd followed by a space. Skips all the matching characters.
* Currently only used by \#textdomain (specified by the WML) and \#line (added by the preprocessor)
*/
bool skip_command(char const *cmd);
@ -173,5 +221,5 @@ private:
token previous_token_;
#endif
buffered_istream in_;
char char_types_[128];
std::array<character_type, END_STANDARD_ASCII> char_types_;
};