Improved tokenizer for formula language

This commit is contained in:
Bartek Waresiak 2009-11-08 14:10:07 +00:00
parent f54d91fa26
commit bcece30f2e
2 changed files with 232 additions and 48 deletions

View file

@ -13,8 +13,6 @@
#include <sstream>
#include <boost/regex.hpp>
#include "foreach.hpp"
#include "formula_tokenizer.hpp"
@ -23,51 +21,7 @@ namespace formula_tokenizer
namespace {
using boost::regex;
// One entry of the regex-driven tokenizer table: a pattern together with the
// token type reported when that pattern matches.
struct token_type {
// Regular expression tried against the remaining input.
regex re;
// Token type assigned to the text matched by `re`.
TOKEN_TYPE type;
};
// Table of all recognised token patterns.
// Every pattern is anchored with '^' so it can only match at the current
// position. get_token() walks this array from first to last entry and returns
// on the first match, so the order of the entries is the matching priority:
//  - word operators and keywords come before the generic identifier pattern,
//    otherwise they would be reported as identifiers;
//  - the decimal pattern comes before the integer pattern, otherwise only the
//    integer part of a decimal would be consumed;
//  - "->" comes before the symbolic operators, and the '-' operator carries a
//    look-ahead that rejects a following '>'.
token_type token_types[] = { { regex("^(not\\b|and\\b|or\\b|d(?=[^a-zA-Z])|where\\b)"), TOKEN_OPERATOR },
{ regex("^\\n"), TOKEN_EOL },
{ regex("^([ ]|\\t|\\r|\\f)"), TOKEN_WHITESPACE },
{ regex("^\\("), TOKEN_LPARENS },
{ regex("^\\)"), TOKEN_RPARENS },
{ regex("^\\["), TOKEN_LSQUARE },
{ regex("^\\]"), TOKEN_RSQUARE },
{ regex("^->"), TOKEN_POINTER },
// string literal: single-quoted, no embedded single quote
{ regex("^'[^']*'"), TOKEN_STRING_LITERAL },
{ regex("^fai\\b"), TOKEN_KEYWORD },
{ regex("^faiend\\b"), TOKEN_KEYWORD },
{ regex("^functions\\b"), TOKEN_KEYWORD },
{ regex("^def\\b"), TOKEN_KEYWORD },
{ regex("^[a-zA-Z_]+"), TOKEN_IDENTIFIER },
{ regex("^\\d+\\.\\d+"), TOKEN_DECIMAL },
{ regex("^\\d+"), TOKEN_INTEGER },
{ regex("^(\\*|\\+|-(?=[^>])|\\^|%|/|<=|>=|<|>|!=|=|\\.\\+|\\.-|\\.\\*|\\./|\\.)"), TOKEN_OPERATOR },
{ regex("^,"), TOKEN_COMMA },
{ regex("^;"), TOKEN_SEMICOLON },
// comment: text enclosed between two '#' characters (non-greedy)
{ regex("^#.*?#"), TOKEN_COMMENT }
};
}
token get_token(iterator& i1, iterator i2) {
foreach(const token_type& t, token_types) {
boost::smatch match;
if(boost::regex_search(i1, i2, match, t.re, boost::match_single_line)) {
token res;
res.type = t.type;
res.begin = i1;
i1 = res.end = i1 + match.length();
return res;
}
}
void raise_exception(iterator& i1, iterator i2, std::string str) {
std::ostringstream expr;
while( (i1 != i2) && (*i1 != '\n') ) {
if( (*i1 != '\t') )
@ -75,7 +29,228 @@ token get_token(iterator& i1, iterator i2) {
++i1;
}
throw token_error("Unrecognized token", expr.str() );
if( str.empty() )
throw token_error("Unrecognized token", expr.str() );
else
throw token_error(str, expr.str() );
}
}
token get_token(iterator& i1, iterator i2) {
iterator it = i1;
if( *i1 >= 'A' ) {
//current character is >= 'A', limit search to the upper-half of the ASCII table
// check if we parse now TOKEN_IDENTIFIER or TOKEN_OPERATOR/KEYWORD based on string
if( *i1 <= 'Z' || ( *i1 >= 'a' && *it <= 'z' ) || *i1 == '_' ) {
while( i1 != i2 && ( ( *i1 >= 'a' && *i1 <= 'z' ) || *i1 == '_' || ( *i1 >= 'A' && *i1 <= 'Z' ) ) )
++i1;
int diff = i1 - it;
TOKEN_TYPE t = TOKEN_IDENTIFIER;
//check if this string matches any keyword or an operator
//possible opearators and keywords:
// d, or, def, and, not, fai, where, faiend, functions
if( diff == 1 ) {
if( *it == 'd' )
t = TOKEN_OPERATOR;
} else if( diff == 2 ) {
if( *it == 'o' && *(it+1) == 'r' )
t = TOKEN_OPERATOR;
} else if( diff == 3 ) {
if( *it == 'd' ) { //def
if( *(it+1) == 'e' && *(it+2) == 'f' )
t = TOKEN_KEYWORD;
} else if( *it == 'a' ) { //and
if( *(it+1) == 'n' && *(it+2) == 'd' )
t = TOKEN_OPERATOR;
} else if( *it == 'n' ) { //not
if( *(it+1) == 'o' && *(it+2) == 't' )
t = TOKEN_OPERATOR;
} else if( *it == 'f' ) { //fai
if( *(it+1) == 'a' && *(it+2) == 'i' )
t = TOKEN_KEYWORD;
}
} else if( diff == 5 ) {
std::string s(it, i1);
if( s == "where" )
t = TOKEN_OPERATOR;
} else if( diff == 6 ) {
std::string s(it, i1);
if( s == "faiend" )
t = TOKEN_KEYWORD;
} else if( diff == 9 ) {
std::string s(it, i1);
if( s == "functions" )
t = TOKEN_KEYWORD;
}
return token( it, i1, t);
} else {
//at this point only 3 chars left to check:
if( *i1 == '[' )
return token( it, ++i1, TOKEN_LSQUARE );
if( *i1 == ']' )
return token( it, ++i1, TOKEN_RSQUARE );
if( *i1 == '^' )
return token( it, ++i1, TOKEN_OPERATOR );
}
} else {
//limit search to the lower-half of the ASCII table
//start by checking for whitespaces/end of line char
if( *i1 <= ' ' ) {
if( *i1 == '\n' ) {
return token( it, ++i1, TOKEN_EOL);
} else {
while( i1 != i2 && *i1 <= ' ' && *i1 != '\n' )
++i1;
return token( it, i1, TOKEN_WHITESPACE );
}
//try to further limit number of characters that we need to check:
} else if ( *i1 >= '0' ){
//current character is between '0' and '@'
if( *i1 <= '9' ) {
//we parse integer or decimal number
++i1;
bool dot = false;
while( i1 != i2 ) {
if( *i1 >= '0' && *i1 <= '9' ) {
//do nothing
} else {
//look for '.' in case of decimal numer
if( *i1 == '.' ) {
//allow only one dot in such expression
if( !dot )
dot = true;
else
raise_exception(it, i2, "Multiple dots near decimal expression");
} else
break;
}
++i1;
}
if( dot )
return token( it, i1, TOKEN_DECIMAL );
else
return token( it, i1, TOKEN_INTEGER );
} else {
//current character is between ':' and '@'
//possible tokens at this point that we are intersted with:
// ; < = > <= >=
if( *i1 == ';' ) {
return token( it, ++i1, TOKEN_SEMICOLON);
} else if( *i1 == '=' ) {
return token( it, ++i1, TOKEN_OPERATOR);
} else if( *i1 == '<' ) {
++i1;
if( i1 != i2 ) {
if( *i1 == '=' )
return token( it, ++i1, TOKEN_OPERATOR);
else
return token( it, i1, TOKEN_OPERATOR);
} else
return token( it, i1, TOKEN_OPERATOR);
} else if( *i1 == '>' ) {
++i1;
if( i1 != i2 ) {
if( *i1 == '=' )
return token( it, ++i1, TOKEN_OPERATOR);
else
return token( it, i1, TOKEN_OPERATOR);
} else
return token( it, i1, TOKEN_OPERATOR);
}
}
//current character is between '!' and '/'
} else if ( *i1 == ',' ) {
return token( it, ++i1, TOKEN_COMMA);
} else if ( *i1 == '.' ) {
++i1;
if( i1 != i2 ) {
if( *i1 == '+' || *i1 == '-' || *i1 == '*' || *i1 == '/')
return token( it, ++i1, TOKEN_OPERATOR );
else
return token( it, i1, TOKEN_OPERATOR );
} else {
return token( it, i1, TOKEN_OPERATOR);
}
} else if ( *i1 == '(' ) {
return token( it, ++i1, TOKEN_LPARENS);
} else if ( *i1 == ')' ) {
return token( it, ++i1, TOKEN_RPARENS);
} else if ( *i1 == '\'' ) {
++i1;
while( i1 != i2 && *i1 != '\'' )
++i1;
if( i1 != i2 ) {
return token( it, ++i1, TOKEN_STRING_LITERAL );
} else {
raise_exception(it, i2, "Missing closing ' for formula string");
}
} else if ( *i1 == '#' ) {
++i1;
while( i1 != i2 && *i1 != '#' )
++i1;
if( i1 != i2 ) {
return token( it, ++i1, TOKEN_COMMENT );
} else {
raise_exception(it, i2, "Missing closing # for formula comment");
}
} else if ( *i1 == '+' ) {
return token( it, ++i1, TOKEN_OPERATOR);
} else if ( *i1 == '-' ) {
++i1;
if( i1 != i2 ) {
if( *i1 == '>' )
return token( it, ++i1, TOKEN_POINTER );
else
return token( it, i1, TOKEN_OPERATOR );
} else {
return token( it, i1, TOKEN_OPERATOR);
}
} else if ( *i1 == '*' ) {
return token( it, ++i1, TOKEN_OPERATOR);
} else if ( *i1 == '/' ) {
return token( it, ++i1, TOKEN_OPERATOR);
} else if ( *i1 == '%' ) {
return token( it, ++i1, TOKEN_OPERATOR);
} else if ( *i1 == '!' ) {
++i1;
if( *i1 == '=' )
return token( it, ++i1, TOKEN_OPERATOR);
else
raise_exception(it, i2, std::string() );
}
}
raise_exception(it, i2, std::string() );
}
}

View file

@ -40,6 +40,15 @@ struct token {
{
}
// Builds a token of the given type covering the character range [i1, i2).
// line_number starts at 1 and filename is value-initialised; this constructor
// does not receive any position information.
// NOTE(review): i1 is taken by non-const reference but is only copied into
// `begin` - callers must therefore pass an lvalue; confirm whether a
// by-value/const parameter was intended.
token(iterator& i1, iterator i2, TOKEN_TYPE type) :
type(type),
begin(i1),
end(i2),
line_number(1),
filename()
{
}
TOKEN_TYPE type;
iterator begin, end;
int line_number;