add code documentation for the tokenizer (#9354)
Also clean up/simplify the code a bit.
This commit is contained in:
parent
feef53d4ab
commit
bd8322a40b
5 changed files with 227 additions and 111 deletions
|
@ -27,7 +27,6 @@
|
|||
#include "log.hpp"
|
||||
#include "serialization/preprocessor.hpp"
|
||||
#include "serialization/string_utils.hpp"
|
||||
#include "serialization/tokenizer.hpp"
|
||||
#include "serialization/validator.hpp"
|
||||
#include "wesconfig.h"
|
||||
|
||||
|
@ -128,10 +127,10 @@ void parser::operator()()
|
|||
tok_.next_token();
|
||||
|
||||
switch(tok_.current_token().type) {
|
||||
case token::LF:
|
||||
case token::NEWLINE:
|
||||
continue;
|
||||
|
||||
case '[':
|
||||
case token::OPEN_BRACKET:
|
||||
parse_element();
|
||||
break;
|
||||
|
||||
|
@ -195,7 +194,7 @@ void parser::parse_element()
|
|||
case token::STRING: // [element]
|
||||
elname = tok_.current_token().value;
|
||||
|
||||
if(tok_.next_token().type != ']') {
|
||||
if(tok_.next_token().type != token::CLOSE_BRACKET) {
|
||||
error(_("Unterminated [element] tag"));
|
||||
}
|
||||
|
||||
|
@ -210,14 +209,14 @@ void parser::parse_element()
|
|||
|
||||
break;
|
||||
|
||||
case '+': // [+element]
|
||||
case token::PLUS: // [+element]
|
||||
if(tok_.next_token().type != token::STRING) {
|
||||
error(_("Invalid tag name"));
|
||||
}
|
||||
|
||||
elname = tok_.current_token().value;
|
||||
|
||||
if(tok_.next_token().type != ']') {
|
||||
if(tok_.next_token().type != token::CLOSE_BRACKET) {
|
||||
error(_("Unterminated [+element] tag"));
|
||||
}
|
||||
|
||||
|
@ -240,14 +239,14 @@ void parser::parse_element()
|
|||
elements.emplace(current_element, elname, tok_.get_start_line(), tok_.get_file());
|
||||
break;
|
||||
|
||||
case '/': // [/element]
|
||||
case token::SLASH: // [/element]
|
||||
if(tok_.next_token().type != token::STRING) {
|
||||
error(_("Invalid closing tag name"));
|
||||
}
|
||||
|
||||
elname = tok_.current_token().value;
|
||||
|
||||
if(tok_.next_token().type != ']') {
|
||||
if(tok_.next_token().type != token::CLOSE_BRACKET) {
|
||||
error(_("Unterminated closing tag"));
|
||||
}
|
||||
|
||||
|
@ -290,7 +289,7 @@ void parser::parse_variable()
|
|||
std::vector<std::string> variables;
|
||||
variables.emplace_back();
|
||||
|
||||
while(tok_.current_token().type != '=') {
|
||||
while(tok_.current_token().type != token::EQUALS) {
|
||||
switch(tok_.current_token().type) {
|
||||
case token::STRING:
|
||||
if(!variables.back().empty()) {
|
||||
|
@ -300,7 +299,7 @@ void parser::parse_variable()
|
|||
variables.back() += tok_.current_token().value;
|
||||
break;
|
||||
|
||||
case ',':
|
||||
case token::COMMA:
|
||||
if(variables.back().empty()) {
|
||||
error(_("Empty variable name"));
|
||||
} else {
|
||||
|
@ -332,7 +331,7 @@ void parser::parse_variable()
|
|||
assert(curvar != variables.end());
|
||||
|
||||
switch(tok_.current_token().type) {
|
||||
case ',':
|
||||
case token::COMMA:
|
||||
if((curvar + 1) != variables.end()) {
|
||||
if(buffer.translatable()) {
|
||||
cfg[*curvar] = t_string(buffer);
|
||||
|
@ -352,7 +351,7 @@ void parser::parse_variable()
|
|||
|
||||
break;
|
||||
|
||||
case '_':
|
||||
case token::UNDERSCORE:
|
||||
tok_.next_token();
|
||||
|
||||
switch(tok_.current_token().type) {
|
||||
|
@ -370,14 +369,14 @@ void parser::parse_variable()
|
|||
break;
|
||||
|
||||
case token::END:
|
||||
case token::LF:
|
||||
case token::NEWLINE:
|
||||
buffer += "_";
|
||||
goto finish;
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case '+':
|
||||
case token::PLUS:
|
||||
ignore_next_newlines = true;
|
||||
continue;
|
||||
|
||||
|
@ -400,7 +399,7 @@ void parser::parse_variable()
|
|||
error(_("Unterminated quoted string"));
|
||||
break;
|
||||
|
||||
case token::LF:
|
||||
case token::NEWLINE:
|
||||
if(ignore_next_newlines) {
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
|
||||
#include "config.hpp"
|
||||
#include "configr_assign.hpp"
|
||||
#include "serialization/tokenizer.hpp"
|
||||
|
||||
class abstract_validator;
|
||||
|
||||
|
|
|
@ -52,8 +52,6 @@ static bool encode_filename = true;
|
|||
|
||||
static std::string preprocessor_error_detail_prefix = "\n ";
|
||||
|
||||
static const char OUTPUT_SEPARATOR = '\xFE';
|
||||
|
||||
// get filename associated to this code
|
||||
static std::string get_filename(const std::string& file_code)
|
||||
{
|
||||
|
@ -482,11 +480,11 @@ void preprocessor_streambuf::restore_old_preprocessor()
|
|||
preprocessor* current = this->current();
|
||||
|
||||
if(!current->old_location_.empty()) {
|
||||
buffer_ << OUTPUT_SEPARATOR << "line " << current->old_linenum_ << ' ' << current->old_location_ << '\n';
|
||||
buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << current->old_linenum_ << ' ' << current->old_location_ << '\n';
|
||||
}
|
||||
|
||||
if(!current->old_textdomain_.empty() && textdomain_ != current->old_textdomain_) {
|
||||
buffer_ << OUTPUT_SEPARATOR << "textdomain " << current->old_textdomain_ << '\n';
|
||||
buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << current->old_textdomain_ << '\n';
|
||||
}
|
||||
|
||||
location_ = current->old_location_;
|
||||
|
@ -866,10 +864,10 @@ preprocessor_data::preprocessor_data(preprocessor_streambuf& t,
|
|||
t.location_ = s.str();
|
||||
t.linenum_ = linenum;
|
||||
|
||||
t.buffer_ << OUTPUT_SEPARATOR << "line " << linenum << ' ' << t.location_ << '\n';
|
||||
t.buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << linenum << ' ' << t.location_ << '\n';
|
||||
|
||||
if(t.textdomain_ != domain) {
|
||||
t.buffer_ << OUTPUT_SEPARATOR << "textdomain " << domain << '\n';
|
||||
t.buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << domain << '\n';
|
||||
t.textdomain_ = domain;
|
||||
}
|
||||
|
||||
|
@ -899,8 +897,8 @@ void preprocessor_data::push_token(token_desc::token_type t)
|
|||
|
||||
std::ostringstream s;
|
||||
if(!skipping_ && slowpath_) {
|
||||
s << OUTPUT_SEPARATOR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
|
||||
<< OUTPUT_SEPARATOR << "textdomain " << parent_.textdomain_ << '\n';
|
||||
s << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
|
||||
<< INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << parent_.textdomain_ << '\n';
|
||||
}
|
||||
|
||||
strings_.push_back(s.str());
|
||||
|
@ -1050,7 +1048,7 @@ void preprocessor_data::put(char c)
|
|||
if(diff <= parent_.location_.size() + 11) {
|
||||
parent_.buffer_ << std::string(diff, '\n');
|
||||
} else {
|
||||
parent_.buffer_ << OUTPUT_SEPARATOR << "line " << parent_.linenum_ << ' ' << parent_.location_ << '\n';
|
||||
parent_.buffer_ << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << parent_.linenum_ << ' ' << parent_.location_ << '\n';
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1128,7 +1126,7 @@ bool preprocessor_data::get_chunk()
|
|||
++linenum_;
|
||||
}
|
||||
|
||||
if(c == OUTPUT_SEPARATOR) {
|
||||
if(c == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
|
||||
std::string buffer(1, c);
|
||||
|
||||
while(true) {
|
||||
|
@ -1491,7 +1489,7 @@ bool preprocessor_data::get_chunk()
|
|||
|
||||
std::string symbol = strings_[token.stack_pos];
|
||||
std::string::size_type pos;
|
||||
while((pos = symbol.find(OUTPUT_SEPARATOR)) != std::string::npos) {
|
||||
while((pos = symbol.find(INLINED_PREPROCESS_DIRECTIVE_CHAR)) != std::string::npos) {
|
||||
std::string::iterator b = symbol.begin(); // invalidated at each iteration
|
||||
symbol.erase(b + pos, b + symbol.find('\n', pos + 1) + 1);
|
||||
}
|
||||
|
@ -1521,8 +1519,8 @@ bool preprocessor_data::get_chunk()
|
|||
}
|
||||
|
||||
std::ostringstream v;
|
||||
v << arg->second << OUTPUT_SEPARATOR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
|
||||
<< OUTPUT_SEPARATOR << "textdomain " << parent_.textdomain_ << '\n';
|
||||
v << arg->second << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
|
||||
<< INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << parent_.textdomain_ << '\n';
|
||||
|
||||
pop_token();
|
||||
put(v.str());
|
||||
|
@ -1672,8 +1670,8 @@ bool preprocessor_data::get_chunk()
|
|||
} else if(!skipping_) {
|
||||
if(token.type == token_desc::token_type::macro_space) {
|
||||
std::ostringstream s;
|
||||
s << OUTPUT_SEPARATOR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
|
||||
<< OUTPUT_SEPARATOR << "textdomain " << parent_.textdomain_ << '\n';
|
||||
s << INLINED_PREPROCESS_DIRECTIVE_CHAR << "line " << linenum_ << ' ' << parent_.location_ << "\n"
|
||||
<< INLINED_PREPROCESS_DIRECTIVE_CHAR << "textdomain " << parent_.textdomain_ << '\n';
|
||||
|
||||
strings_.push_back(s.str());
|
||||
token.type = token_desc::token_type::macro_chunk;
|
||||
|
|
|
@ -26,9 +26,9 @@ tokenizer::tokenizer(std::istream& in) :
|
|||
token_(),
|
||||
in_(in)
|
||||
{
|
||||
for (int c = 0; c < 128; ++c)
|
||||
for (int c = 0; c < END_STANDARD_ASCII; ++c)
|
||||
{
|
||||
int t = 0;
|
||||
character_type t = TOK_NONE;
|
||||
if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
|
||||
t = TOK_ALPHA;
|
||||
} else if (c >= '0' && c <= '9') {
|
||||
|
@ -39,7 +39,7 @@ tokenizer::tokenizer(std::istream& in) :
|
|||
char_types_[c] = t;
|
||||
}
|
||||
in_.stream().exceptions(std::ios_base::badbit);
|
||||
next_char_fast();
|
||||
next_char_skip_cr();
|
||||
}
|
||||
|
||||
tokenizer::~tokenizer()
|
||||
|
@ -59,92 +59,130 @@ const token &tokenizer::next_token()
|
|||
while(true)
|
||||
{
|
||||
while (is_space(current_)) {
|
||||
next_char_fast();
|
||||
next_char_skip_cr();
|
||||
}
|
||||
if (current_ != 254)
|
||||
if (current_ != INLINED_PREPROCESS_DIRECTIVE_CHAR)
|
||||
break;
|
||||
skip_comment();
|
||||
// skip the line end
|
||||
next_char_fast();
|
||||
next_char_skip_cr();
|
||||
}
|
||||
|
||||
if (current_ == '#')
|
||||
// skip comments on their own line
|
||||
if (current_ == token::POUND)
|
||||
skip_comment();
|
||||
|
||||
// set the line number the next token will start on
|
||||
startlineno_ = lineno_;
|
||||
|
||||
switch(current_) {
|
||||
// we reached the end of the file being read
|
||||
case EOF:
|
||||
token_.type = token::END;
|
||||
break;
|
||||
|
||||
case '<':
|
||||
if (peek_char() != '<') {
|
||||
// handle open/closed angle brackets
|
||||
// most commonly used for enclosing lua code
|
||||
// more generally is used to indicate the preprocessor should skip over a particular block of text
|
||||
case token::LEFT_ANGLE_BRACKET:
|
||||
// if there aren't double left angle brackets, there is no extra handling needed - this is just a regular left angle bracket
|
||||
if (peek_char() != token::LEFT_ANGLE_BRACKET) {
|
||||
token_.type = token::MISC;
|
||||
token_.value += current_;
|
||||
break;
|
||||
}
|
||||
|
||||
// else, treat this like a quoted string
|
||||
token_.type = token::QSTRING;
|
||||
next_char_fast();
|
||||
next_char_skip_cr();
|
||||
|
||||
// keep getting characters and appending them to the current token's value until either the file ends or double right angle brackets are found
|
||||
// finding the end of the file first is an error since double left angle brackets must always be closed by double right angle brackets
|
||||
for (;;) {
|
||||
next_char();
|
||||
if (current_ == EOF) {
|
||||
token_.type = token::UNTERMINATED_QSTRING;
|
||||
break;
|
||||
}
|
||||
if (current_ == '>' && peek_char() == '>') {
|
||||
next_char_fast();
|
||||
} else if (current_ == token::RIGHT_ANGLE_BRACKET && peek_char() == token::RIGHT_ANGLE_BRACKET) {
|
||||
next_char_skip_cr();
|
||||
break;
|
||||
}
|
||||
token_.value += current_;
|
||||
}
|
||||
break;
|
||||
|
||||
case '"':
|
||||
// very similar to the double left+right angle bracket handling
|
||||
// the main difference is the need to handle INLINED_PREPROCESS_DIRECTIVE_CHAR since double quotes don't affect the preprocessor
|
||||
case token::DOUBLE_QUOTE:
|
||||
token_.type = token::QSTRING;
|
||||
|
||||
for (;;) {
|
||||
next_char();
|
||||
if (current_ == EOF) {
|
||||
token_.type = token::UNTERMINATED_QSTRING;
|
||||
break;
|
||||
} else if (current_ == token::DOUBLE_QUOTE) {
|
||||
if (peek_char() != token::DOUBLE_QUOTE) {
|
||||
break;
|
||||
} else {
|
||||
next_char_skip_cr();
|
||||
}
|
||||
}
|
||||
if (current_ == '"') {
|
||||
if (peek_char() != '"') break;
|
||||
next_char_fast();
|
||||
}
|
||||
if (current_ == 254) {
|
||||
|
||||
// ignore this line and decrement the current line number
|
||||
if (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
|
||||
skip_comment();
|
||||
--lineno_;
|
||||
continue;
|
||||
}
|
||||
|
||||
token_.value += current_;
|
||||
}
|
||||
break;
|
||||
|
||||
case '[': case ']': case '/': case '\n': case '=': case ',': case '+':
|
||||
token_.type = token::token_type(current_);
|
||||
// tag name delimiters
|
||||
case token::OPEN_BRACKET:
|
||||
case token::CLOSE_BRACKET:
|
||||
// closing tag
|
||||
case token::SLASH:
|
||||
case token::NEWLINE:
|
||||
case token::EQUALS:
|
||||
// handles multiple attributes on the same line
|
||||
// ie: x,y = 5,5
|
||||
case token::COMMA:
|
||||
// tag merge aka node append, or string concatenation
|
||||
case token::PLUS:
|
||||
token_.type = static_cast<token::token_type>(current_);
|
||||
token_.value = current_;
|
||||
break;
|
||||
|
||||
case '_':
|
||||
// when in front of a QSTRING, indicates that the string is translatable
|
||||
case token::UNDERSCORE:
|
||||
// this check seems off - there are certainly other non-alphanumeric characters that shouldn't mean anything - but it looks like the parser handles those cases
|
||||
if (!is_alnum(peek_char())) {
|
||||
token_.type = token::token_type(current_);
|
||||
token_.type = token::UNDERSCORE;
|
||||
token_.value = current_;
|
||||
break;
|
||||
}
|
||||
[[fallthrough]];
|
||||
|
||||
// everything else
|
||||
default:
|
||||
if (is_alnum(current_) || current_ == '$') {
|
||||
// if alphanumeric (regular text) or the dollar sign (variable)
|
||||
// not quite sure how this works with non-ascii text particularly since the parser doesn't reference token_type::MISC
|
||||
// but maybe the default handling does what's needed
|
||||
if (is_alnum(current_) || current_ == token::DOLLAR) {
|
||||
token_.type = token::STRING;
|
||||
|
||||
do {
|
||||
token_.value += current_;
|
||||
next_char_fast();
|
||||
while (current_ == 254) {
|
||||
next_char_skip_cr();
|
||||
|
||||
while (current_ == INLINED_PREPROCESS_DIRECTIVE_CHAR) {
|
||||
skip_comment();
|
||||
next_char_fast();
|
||||
next_char_skip_cr();
|
||||
}
|
||||
} while (is_alnum(current_) || current_ == '$');
|
||||
} while (is_alnum(current_) || current_ == token::DOLLAR);
|
||||
} else {
|
||||
token_.type = token::MISC;
|
||||
token_.value += current_;
|
||||
|
@ -153,59 +191,91 @@ const token &tokenizer::next_token()
|
|||
return token_;
|
||||
}
|
||||
|
||||
if (current_ != EOF)
|
||||
// if this isn't the end of the file, get the next character in preparation for the next call to this method
|
||||
if (current_ != EOF) {
|
||||
next_char();
|
||||
}
|
||||
|
||||
return token_;
|
||||
}
|
||||
|
||||
bool tokenizer::skip_command(char const *cmd)
|
||||
{
|
||||
// check that the characters match the provided text, else return false
|
||||
for (; *cmd; ++cmd) {
|
||||
next_char_fast();
|
||||
if (current_ != *cmd) return false;
|
||||
next_char_skip_cr();
|
||||
if (current_ != *cmd) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
next_char_fast();
|
||||
if (!is_space(current_)) return false;
|
||||
next_char_fast();
|
||||
|
||||
// check that it's followed by a space, else return false
|
||||
next_char_skip_cr();
|
||||
if (!is_space(current_)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
next_char_skip_cr();
|
||||
return true;
|
||||
}
|
||||
|
||||
void tokenizer::skip_comment()
|
||||
{
|
||||
next_char_fast();
|
||||
if (current_ == '\n' || current_ == EOF) return;
|
||||
// nothing to do if the line ends or the file ends
|
||||
next_char_skip_cr();
|
||||
if (current_ == token::NEWLINE || current_ == EOF) {
|
||||
return;
|
||||
}
|
||||
|
||||
// used to point to either textdomain_ or file_, and populate that field with the value following the respective command
|
||||
std::string *dst = nullptr;
|
||||
|
||||
// if this is a #textdomain, point to textdomain_
|
||||
if (current_ == 't')
|
||||
{
|
||||
if (!skip_command("extdomain")) goto fail;
|
||||
if (!skip_command("extdomain")) {
|
||||
goto not_a_command;
|
||||
}
|
||||
dst = &textdomain_;
|
||||
}
|
||||
// else if this is a #line, determine the line number and then point to file_
|
||||
else if (current_ == 'l')
|
||||
{
|
||||
if (!skip_command("ine")) goto fail;
|
||||
if (!skip_command("ine")) {
|
||||
goto not_a_command;
|
||||
}
|
||||
|
||||
lineno_ = 0;
|
||||
while (is_num(current_)) {
|
||||
// ie if the line number is 587
|
||||
// (0 * 10) + 5 = 5
|
||||
// (5 * 10) + 8 = 58
|
||||
// (58 * 10) + 7 = 587
|
||||
lineno_ = lineno_ * 10 + (current_ - '0');
|
||||
next_char_fast();
|
||||
next_char_skip_cr();
|
||||
}
|
||||
if (!is_space(current_)) goto fail;
|
||||
next_char_fast();
|
||||
|
||||
if (!is_space(current_)) {
|
||||
goto not_a_command;
|
||||
}
|
||||
|
||||
next_char_skip_cr();
|
||||
dst = &file_;
|
||||
}
|
||||
// else this turned out to not be a #textdomain or a #line, then this is a normal comment so just read off characters until finding the next line or the end of the file
|
||||
else
|
||||
{
|
||||
fail:
|
||||
while (current_ != '\n' && current_ != EOF) {
|
||||
next_char_fast();
|
||||
not_a_command:
|
||||
while (current_ != token::NEWLINE && current_ != EOF) {
|
||||
next_char_skip_cr();
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// clear the current value of either textdomain_ or file_ and populate it with the new value
|
||||
dst->clear();
|
||||
while (current_ != '\n' && current_ != EOF) {
|
||||
while (current_ != token::NEWLINE && current_ != EOF) {
|
||||
*dst += current_;
|
||||
next_char_fast();
|
||||
next_char_skip_cr();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -20,9 +20,23 @@
|
|||
|
||||
#include "buffered_istream.hpp"
|
||||
|
||||
#include <array>
|
||||
#include <istream>
|
||||
#include <string>
|
||||
|
||||
// use of illegal utf8 character for this purpose was added in a76be7ef1e921dabacd99f16ef440bf9673b8d98
|
||||
// added by the preprocessor to allow special handling for #line and #textdomain commands
|
||||
constexpr unsigned char INLINED_PREPROCESS_DIRECTIVE_CHAR = 254;
|
||||
|
||||
// normal ascii is 0-127
|
||||
// extended ascii is from 128-255, none of which need any special handling
|
||||
constexpr int END_STANDARD_ASCII = 128;
|
||||
|
||||
/**
|
||||
* contains the current text being parsed as well as the token_type of what's being parsed.
|
||||
* multi-character token types will have a value that's a string with zero or more characters in it.
|
||||
* single character token types are a single character with special meaning for a config
|
||||
*/
|
||||
struct token
|
||||
{
|
||||
token() :
|
||||
|
@ -30,14 +44,23 @@ struct token
|
|||
value()
|
||||
{}
|
||||
|
||||
/**
|
||||
* used for a token's type field
|
||||
*/
|
||||
enum token_type
|
||||
{
|
||||
// multi-character
|
||||
/** unquoted text */
|
||||
STRING,
|
||||
/** quoted string, contained within double quotes or by less than/greater than symbols */
|
||||
QSTRING,
|
||||
/** reached end of file without finding the closing character for a QSTRING */
|
||||
UNTERMINATED_QSTRING,
|
||||
/** any characters that don't have special meaning */
|
||||
MISC,
|
||||
|
||||
LF = '\n',
|
||||
// single characters
|
||||
NEWLINE = '\n',
|
||||
EQUALS = '=',
|
||||
COMMA = ',',
|
||||
PLUS = '+',
|
||||
|
@ -45,20 +68,42 @@ struct token
|
|||
OPEN_BRACKET = '[',
|
||||
CLOSE_BRACKET = ']',
|
||||
UNDERSCORE = '_',
|
||||
END
|
||||
|
||||
/** set when EOF is returned by the input stream */
|
||||
END = 256
|
||||
};
|
||||
|
||||
/**
|
||||
* not used for a token's type field
|
||||
*/
|
||||
enum source_chars
|
||||
{
|
||||
POUND = '#',
|
||||
LEFT_ANGLE_BRACKET = '<',
|
||||
RIGHT_ANGLE_BRACKET = '>',
|
||||
DOUBLE_QUOTE = '"',
|
||||
DOLLAR = '$',
|
||||
};
|
||||
|
||||
token_type type;
|
||||
/** the token's value, can be either a single character or multiple characters */
|
||||
std::string value;
|
||||
};
|
||||
|
||||
/** Abstract baseclass for the tokenizer. */
|
||||
/**
|
||||
* class responsible for parsing the provided text into tokens and tracking information about the current token.
|
||||
* can also track the previous token when built with the DEBUG_TOKENIZER compiler define.
|
||||
* does not otherwise keep track of the processing history.
|
||||
*/
|
||||
class tokenizer
|
||||
{
|
||||
public:
|
||||
tokenizer(std::istream& in);
|
||||
~tokenizer();
|
||||
|
||||
/**
|
||||
* Reads characters off of @a in_ to return the next token type and its value.
|
||||
*/
|
||||
const token &next_token();
|
||||
|
||||
const token &current_token() const
|
||||
|
@ -94,75 +139,78 @@ private:
|
|||
int lineno_;
|
||||
int startlineno_;
|
||||
|
||||
/**
|
||||
* increments the line number if the current character is a newline
|
||||
* set current_ to the next character that's not `\r`
|
||||
*/
|
||||
void next_char()
|
||||
{
|
||||
if (current_ == '\n')
|
||||
if (current_ == token::NEWLINE)
|
||||
++lineno_;
|
||||
next_char_fast();
|
||||
next_char_skip_cr();
|
||||
}
|
||||
|
||||
void next_char_fast()
|
||||
/**
|
||||
* set current_ to the next character
|
||||
* skip the `\r` in the `\r\n` Windows-style line endings
|
||||
* the test_cvs_2018_1999023_2.cfg file also uses `\r\n` line endings for some reason - otherwise that check isn't needed on non-Windows platforms since `\r` characters are removed from cfg files on upload
|
||||
*/
|
||||
void next_char_skip_cr()
|
||||
{
|
||||
do {
|
||||
current_ = in_.get();
|
||||
if(current_ == '\r') {
|
||||
current_ = in_.get();
|
||||
} while (current_ == '\r');
|
||||
#if 0
|
||||
// TODO: disabled until the campaign server is fixed
|
||||
if(in_.good()) {
|
||||
current_ = in_.get();
|
||||
if (current_ == '\r')
|
||||
{
|
||||
// we assume that there is only one '\r'
|
||||
if(in_.good()) {
|
||||
current_ = in_.get();
|
||||
} else {
|
||||
current_ = EOF;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
current_ = EOF;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* return the next character without incrementing the current position in the istream
|
||||
*/
|
||||
int peek_char()
|
||||
{
|
||||
return in_.peek();
|
||||
}
|
||||
|
||||
enum
|
||||
/**
|
||||
* the different types of characters while parsing
|
||||
* TOK_NONE is also the default for anything beyond standard ascii
|
||||
*/
|
||||
enum character_type
|
||||
{
|
||||
TOK_NONE = 0,
|
||||
TOK_SPACE = 1,
|
||||
TOK_NUMERIC = 2,
|
||||
TOK_ALPHA = 4
|
||||
TOK_ALPHA = 3
|
||||
};
|
||||
|
||||
int char_type(unsigned c) const
|
||||
character_type char_type(unsigned c) const
|
||||
{
|
||||
return c < 128 ? char_types_[c] : 0;
|
||||
return c < END_STANDARD_ASCII ? char_types_[c] : TOK_NONE;
|
||||
}
|
||||
|
||||
bool is_space(int c) const
|
||||
{
|
||||
return (char_type(c) & TOK_SPACE) == TOK_SPACE;
|
||||
return char_type(c) == TOK_SPACE;
|
||||
}
|
||||
|
||||
bool is_num(int c) const
|
||||
{
|
||||
return (char_type(c) & TOK_NUMERIC) == TOK_NUMERIC;
|
||||
return char_type(c) == TOK_NUMERIC;
|
||||
}
|
||||
|
||||
bool is_alnum(int c) const
|
||||
{
|
||||
return (char_type(c) & (TOK_ALPHA | TOK_NUMERIC)) != TOK_NONE;
|
||||
return char_type(c) > TOK_SPACE;
|
||||
}
|
||||
|
||||
/**
|
||||
* handles skipping over comments (inline and on a separate line) as well as the special processing needed for \#textdomain and \#line
|
||||
*/
|
||||
void skip_comment();
|
||||
|
||||
/**
|
||||
* Returns true if the next characters are the one from @a cmd
|
||||
* followed by a space. Skips all the matching characters.
|
||||
* Returns true if the next characters are the one from @a cmd followed by a space. Skips all the matching characters.
|
||||
* Currently only used by \#textdomain (specified by the WML) and \#line (added by the preprocessor)
|
||||
*/
|
||||
bool skip_command(char const *cmd);
|
||||
|
||||
|
@ -173,5 +221,5 @@ private:
|
|||
token previous_token_;
|
||||
#endif
|
||||
buffered_istream in_;
|
||||
char char_types_[128];
|
||||
std::array<character_type, END_STANDARD_ASCII> char_types_;
|
||||
};
|
||||
|
|
Loading…
Add table
Reference in a new issue