Improved tokenizer for formula language

2009-11-08 14:10:07 +00:00 · 2009-11-08 14:10:07 +00:00 · bcece30f2e
commit bcece30f2e
parent f54d91fa26
2 changed files with 232 additions and 48 deletions
--- a/src/formula_tokenizer.cpp
+++ b/src/formula_tokenizer.cpp
@ -13,8 +13,6 @@

 #include <sstream>

-#include <boost/regex.hpp>
-
 #include "foreach.hpp"
 #include "formula_tokenizer.hpp"

@ -23,51 +21,7 @@ namespace formula_tokenizer

 namespace {

-using boost::regex;
-
-struct token_type {
-	regex re;
-	TOKEN_TYPE type;
-};
-
-//create the array with list of possible tokens
-token_type token_types[] = {	{ regex("^(not\\b|and\\b|or\\b|d(?=[^a-zA-Z])|where\\b)"), TOKEN_OPERATOR },
-				{ regex("^\\n"),               TOKEN_EOL },
-				{ regex("^([ ]|\\t|\\r|\\f)"), TOKEN_WHITESPACE },
-				{ regex("^\\("),               TOKEN_LPARENS },
-				{ regex("^\\)"),               TOKEN_RPARENS },
-				{ regex("^\\["),               TOKEN_LSQUARE },
-				{ regex("^\\]"),               TOKEN_RSQUARE },
-				{ regex("^->"),                TOKEN_POINTER },
-				{ regex("^'[^']*'"),           TOKEN_STRING_LITERAL },
-				{ regex("^fai\\b"),            TOKEN_KEYWORD },
-				{ regex("^faiend\\b"),         TOKEN_KEYWORD },
-				{ regex("^functions\\b"),      TOKEN_KEYWORD },
-				{ regex("^def\\b"),            TOKEN_KEYWORD },
-				{ regex("^[a-zA-Z_]+"),        TOKEN_IDENTIFIER },
-				{ regex("^\\d+\\.\\d+"),       TOKEN_DECIMAL },
-				{ regex("^\\d+"),              TOKEN_INTEGER },
-{ regex("^(\\*|\\+|-(?=[^>])|\\^|%|/|<=|>=|<|>|!=|=|\\.\\+|\\.-|\\.\\*|\\./|\\.)"), TOKEN_OPERATOR },
-				{ regex("^,"),                 TOKEN_COMMA },
-				{ regex("^;"),                 TOKEN_SEMICOLON },
-				{ regex("^#.*?#"),             TOKEN_COMMENT }
-};
-
-}
-
-token get_token(iterator& i1, iterator i2) {
-	foreach(const token_type& t, token_types) {
-		boost::smatch match;
-		if(boost::regex_search(i1, i2, match, t.re, boost::match_single_line)) {
-			token res;
-			res.type = t.type;
-			res.begin = i1;
-			i1 = res.end = i1 + match.length();
-
-			return res;
-		}
-	}
-
+void raise_exception(iterator& i1, iterator i2, std::string str) {
 	std::ostringstream expr;
 	while( (i1 != i2) && (*i1 != '\n') ) {
 		if( (*i1 != '\t') )
@ -75,7 +29,228 @@ token get_token(iterator& i1, iterator i2) {
 		++i1;
 	}

-	throw token_error("Unrecognized token", expr.str() );
+	if( str.empty() )
+		throw token_error("Unrecognized token", expr.str() );
+	else
+		throw token_error(str, expr.str() );
+}
+
+}
+
+//Reads a single token starting at i1 from the input range [i1, i2).
+//
+//On success i1 is left just past the last consumed character and the
+//returned token covers exactly the characters that were consumed.
+//
+//Throws token_error (through raise_exception) when the range is empty, the
+//next character does not start any known token, a string literal or comment
+//is not terminated, or a number contains more than one dot. In that case i1
+//may already have been advanced.
+token get_token(iterator& i1, iterator i2) {
+
+	iterator it = i1;
+
+	//an empty range holds no token; report it before dereferencing i1
+	if( i1 == i2 )
+		raise_exception(it, i2, std::string() );
+
+	if( *i1 >= 'A' ) {
+		//current character is >= 'A', limit search to the upper-half of the ASCII table
+
+		//check whether we are parsing a TOKEN_IDENTIFIER or a TOKEN_OPERATOR/KEYWORD spelled as a word
+		if( *i1 <= 'Z' || ( *i1 >= 'a' && *i1 <= 'z' ) || *i1 == '_' ) {
+
+			//consume the whole run of letters and underscores
+			while( i1 != i2 && ( ( *i1 >= 'a' && *i1 <= 'z' ) || *i1 == '_' || ( *i1 >= 'A' && *i1 <= 'Z' ) ) )
+				++i1;
+
+			int diff = i1 - it;
+			TOKEN_TYPE t = TOKEN_IDENTIFIER;
+
+			//check if this string matches any keyword or an operator
+			//possible operators and keywords:
+			// d, or, def, and, not, fai, where, faiend, functions
+			//the length is tested first so that most identifiers are
+			//rejected without comparing any characters
+			if( diff == 1 ) {
+				if( *it == 'd' )
+					t = TOKEN_OPERATOR;
+			} else if( diff == 2 ) {
+				if( *it == 'o' && *(it+1) == 'r' )
+					t = TOKEN_OPERATOR;
+			} else if( diff == 3 ) {
+				if( *it == 'd' ) { //def
+					if( *(it+1) == 'e' && *(it+2) == 'f' )
+						t = TOKEN_KEYWORD;
+				} else if( *it == 'a' ) { //and
+					if( *(it+1) == 'n' && *(it+2) == 'd' )
+						t = TOKEN_OPERATOR;
+				} else if( *it == 'n' ) { //not
+					if( *(it+1) == 'o' && *(it+2) == 't' )
+						t = TOKEN_OPERATOR;
+				} else if( *it == 'f' ) { //fai
+					if( *(it+1) == 'a' && *(it+2) == 'i' )
+						t = TOKEN_KEYWORD;
+				}
+			} else if( diff == 5 || diff == 6 || diff == 9 ) {
+				//the remaining candidates are long enough to compare as strings
+				const std::string s(it, i1);
+				if( s == "where" )
+					t = TOKEN_OPERATOR;
+				else if( s == "faiend" || s == "functions" )
+					t = TOKEN_KEYWORD;
+			}
+
+			return token( it, i1, t);
+		} else {
+			//at this point only 3 chars left to check:
+			if( *i1 == '[' )
+				return token( it, ++i1, TOKEN_LSQUARE );
+
+			if( *i1 == ']' )
+				return token( it, ++i1, TOKEN_RSQUARE );
+
+			if( *i1 == '^' )
+				return token( it, ++i1, TOKEN_OPERATOR );
+
+		}
+	} else {
+		//limit search to the lower-half of the ASCII table
+		//start by checking for whitespaces/end of line char
+		if( *i1 <= ' ' ) {
+			//NOTE(review): where plain char is signed, bytes >= 0x80 also
+			//compare <= ' ' and are tokenized as whitespace here - confirm
+			//that this is intended
+			if( *i1 == '\n' ) {
+				return token( it, ++i1, TOKEN_EOL);
+			} else {
+
+				//a run of whitespace forms one token and stops before a newline
+				while( i1 != i2 && *i1 <= ' ' && *i1 != '\n' )
+					++i1;
+
+				return token( it, i1, TOKEN_WHITESPACE );
+			}
+		//try to further limit number of characters that we need to check:
+		} else if ( *i1 >= '0' ){
+			//current character is between '0' and '@'
+			if( *i1 <= '9' ) {
+				//we parse integer or decimal number
+				++i1;
+				bool dot = false;
+
+				while( i1 != i2 ) {
+					if( *i1 >= '0' && *i1 <= '9' ) {
+						//do nothing
+					} else {
+						//look for '.' in case of decimal number
+						if( *i1 == '.' ) {
+							//allow only one dot in such expression
+							if( !dot )
+								dot = true;
+							else
+								raise_exception(it, i2, "Multiple dots near decimal expression");
+						} else
+							break;
+					}
+					++i1;
+				}
+
+				if( dot )
+					return token( it, i1, TOKEN_DECIMAL );
+				else
+					return token( it, i1, TOKEN_INTEGER );
+
+			} else {
+				//current character is between ':' and '@'
+				//possible tokens at this point that we are interested in:
+				// ; < = > <= >=
+
+				if( *i1 == ';' ) {
+					return token( it, ++i1, TOKEN_SEMICOLON);
+				} else if( *i1 == '=' ) {
+					return token( it, ++i1, TOKEN_OPERATOR);
+				} else if( *i1 == '<' || *i1 == '>' ) {
+					++i1;
+					//"<=" and ">=" are single operators
+					if( i1 != i2 && *i1 == '=' )
+						++i1;
+					return token( it, i1, TOKEN_OPERATOR);
+				}
+			}
+		//current character is between '!' and '/'
+		} else if ( *i1 == ',' ) {
+			return token( it, ++i1, TOKEN_COMMA);
+
+		} else if ( *i1 == '.' ) {
+			++i1;
+
+			//".+", ".-", ".*" and "./" are single operators, a lone '.' is one too
+			if( i1 != i2 && ( *i1 == '+' || *i1 == '-' || *i1 == '*' || *i1 == '/' ) )
+				++i1;
+
+			return token( it, i1, TOKEN_OPERATOR );
+
+		} else if ( *i1 == '(' ) {
+			return token( it, ++i1, TOKEN_LPARENS);
+
+		} else if ( *i1 == ')' ) {
+			return token( it, ++i1, TOKEN_RPARENS);
+
+		} else if ( *i1 == '\'' ) {
+			//string literal: everything up to and including the next '
+			++i1;
+			while( i1 != i2 && *i1 != '\'' )
+				++i1;
+
+			if( i1 != i2 ) {
+				return token( it, ++i1, TOKEN_STRING_LITERAL );
+			} else {
+				raise_exception(it, i2, "Missing closing ' for formula string");
+			}
+
+		} else if ( *i1 == '#' ) {
+			//comment: everything up to and including the next #
+			++i1;
+			while( i1 != i2 && *i1 != '#' )
+				++i1;
+
+			if( i1 != i2 ) {
+				return token( it, ++i1, TOKEN_COMMENT );
+			} else {
+				raise_exception(it, i2, "Missing closing # for formula comment");
+			}
+
+		} else if ( *i1 == '-' ) {
+			++i1;
+
+			//"->" is a pointer token, a lone '-' is an operator
+			if( i1 != i2 && *i1 == '>' )
+				return token( it, ++i1, TOKEN_POINTER );
+
+			return token( it, i1, TOKEN_OPERATOR );
+
+		} else if ( *i1 == '+' || *i1 == '*' || *i1 == '/' || *i1 == '%' ) {
+			//single-character operators
+			return token( it, ++i1, TOKEN_OPERATOR);
+
+		} else if ( *i1 == '!' ) {
+			++i1;
+			//'!' is only valid as part of "!="; test the bound first so that
+			//a trailing '!' is reported instead of reading past the end
+			if( i1 != i2 && *i1 == '=' )
+				return token( it, ++i1, TOKEN_OPERATOR);
+			else
+				raise_exception(it, i2, std::string() );
+		}
+	}
+	//nothing matched; raise_exception throws on every path, so control
+	//never reaches the end of the function
+	raise_exception(it, i2, std::string() );
 }

 }
--- a/src/formula_tokenizer.hpp
+++ b/src/formula_tokenizer.hpp
@ -40,6 +40,15 @@ struct token {
 	{
 	}

+	//Builds a token of the given type with begin set to i1 and end set to i2.
+	//line_number is initialized to 1 and filename is left empty.
+	//i1 is only copied, so it is taken by const reference; this also lets
+	//callers pass const iterators and temporaries.
+	token(const iterator& i1, iterator i2, TOKEN_TYPE type) :
+		type(type),
+		begin(i1),
+		end(i2),
+		line_number(1),
+		filename()
+	{
+	}
+
 	TOKEN_TYPE type;
 	iterator begin, end;
 	int line_number;