%{
/*
 * Reentrant flex scanner (prefix "raw_yy") producing tokens for the bison
 * parser declared in parser.hh.  Token values are passed through yylval and
 * source offsets through yylloc (bison-bridge / bison-locations).
 *
 * The scanner's "extra" slot (a long) holds the running offset of the next
 * unread character; YY_USER_ACTION turns it into [start, end) locations.
 */
#include "lexer_helper.hh"
#include "option.hh"
#include "parser.hh"
#include "syntax.hh"

/*
 * NOTE(review): the system header names were missing from the reviewed
 * source; this list is reconstructed from the identifiers used below
 * (UCHAR_MAX, UINT16_MAX, strtol, strdup, std::string, U8_APPEND_UNSAFE).
 * Confirm against the build.
 */
#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#include <unicode/utf8.h>
using namespace std;

/*
 * Runs before every rule action: the token starts at the offset stored in the
 * scanner's extra slot, ends yyleng characters later, and the slot is advanced
 * to that end.  The trailing ';' is intentional: flex pastes YY_USER_ACTION
 * directly in front of the action code without adding one.
 */
#define YY_USER_ACTION                      \
  do {                                      \
    yylloc->start = yyget_extra(yyscanner); \
    yylloc->end = yylloc->start + yyleng;   \
    yyset_extra(yylloc->end, yyscanner);    \
  } while (0);

/* tmp_bracket accumulates the raw text of a braced code block (IN_CODE),
 * including any string literals nested inside it. */
/* tmp_str accumulates the decoded contents of the string literal being read;
 * tmp_str_pos remembers the offset of its opening quote. */
static string tmp_bracket, tmp_str;
static long tmp_str_pos;
/* When true, ';' is returned as the '\n' token and real newlines are skipped.
 * Toggled by the "semicolon" / "nosemicolon" keywords. */
static bool semicolon;

/* Each invalid_* helper stores a formatted message in yylval->errmsg and
 * returns INVALID_CHARACTER so a rule can simply `return helper(...)`. */

static int invalid_escape(YYSTYPE* yylval, const char* text)
{
  yylval->errmsg = aprintf("invalid \\-escape: %s", text);
  return INVALID_CHARACTER;
}

static int invalid_escape_octonary(YYSTYPE* yylval, const char* text)
{
  yylval->errmsg = aprintf("invalid number after \\-escape: %s", text);
  return INVALID_CHARACTER;
}

static int invalid_escape_x(YYSTYPE* yylval, const char* text)
{
  yylval->errmsg = aprintf("invalid number after \\x-escape: %s", text);
  return INVALID_CHARACTER;
}

static int invalid_escape_u(YYSTYPE* yylval, const char* text)
{
  yylval->errmsg = aprintf("invalid number after \\u-escape: %s", text);
  return INVALID_CHARACTER;
}

static int invalid_escape_U(YYSTYPE* yylval, const char* text)
{
  yylval->errmsg = aprintf("invalid number after \\U-escape: %s", text);
  return INVALID_CHARACTER;
}

/* Stores "missing <token_end> at end of file" in yylval->errmsg.  The caller
 * is responsible for returning INVALID_CHARACTER. */
static void unexpected_eof(YYSTYPE* yylval, const char* token_end)
{
  yylval->errmsg = aprintf("missing %s at end of file", token_end);
}

/* Stores "missing <token_end> at end of line" in yylval->errmsg.  The caller
 * is responsible for returning INVALID_CHARACTER. */
static void unexpected_newline(YYSTYPE* yylval, const char* token_end)
{
  yylval->errmsg = aprintf("missing %s at end of line", token_end);
}

static int unexpected_codepoint(YYSTYPE* yylval)
{
  yylval->errmsg = aprintf("cannot use Unicode codepoints");
  return INVALID_CHARACTER;
}

/* End-of-input hook: resets semicolon mode and reports that there is no
 * further input (return value 1). */
extern "C" int raw_yywrap(yyscan_t yyscanner)
{
  semicolon = false;
  return 1;
}
%}

%option yywrap noinput
%option reentrant
%option extra-type="long"
%option bison-bridge bison-locations
%option prefix="raw_yy"
%option stack

%x EXPECT_CODE
%x AFTER_ACTION_OP
%x AFTER_EXPORT
%x IN_BRACE
%x IN_CODE
%x IN_COMMENT
%x IN_BRACKET
%x IN_BRACKET_FIRST
%x IN_LINE_COMMENT
%s IN_PAREN
%x IN_Q_STRING
%x IN_QQ_STRING

D [0-9]
H [0-9A-Fa-f]
L [a-zA-Z_\x80-\xff]

%%

"::"               return COLONCOLON;
".."               return DOTDOT;
"&&"               return AMPERAMPER;
";"                if (semicolon) return '\n';
[-~!&*=+,.?|{}:]   return yytext[0];

"action"           { yy_push_state(EXPECT_CODE, yyscanner); return ACTION; }
"as"               return AS;
"c++"              { yy_push_state(EXPECT_CODE, yyscanner); return CPP; }
"epsilon"          return EPSILON;
"export"           { yy_push_state(AFTER_EXPORT, yyscanner); return EXPORT; }
"import"           return IMPORT;
"intact"           return INTACT;
"semicolon"        semicolon = true;
"nosemicolon"      semicolon = false;
{L}({L}|{D})*      { yylval->str = new string(yytext); return IDENT; }
{D}+               { yylval->integer = strtol(yytext, NULL, 10); return INTEGER; }

"#define"          return PREPROCESS_DEFINE;
"#"                yy_push_state(IN_LINE_COMMENT, yyscanner);
"//"               yy_push_state(IN_LINE_COMMENT, yyscanner);

<IN_LINE_COMMENT>{
  "\n" {
    /* Hand the newline back to the enclosing state and rewind the running
       offset by one so the newline is not counted twice. */
    yy_pop_state(yyscanner);
    unput('\n');
    yyset_extra(yylloc->end-1, yyscanner);
  }
  <<EOF>>  yy_pop_state(yyscanner);
  .        {}
}

"/*"               yy_push_state(IN_COMMENT, yyscanner);
<IN_COMMENT>{
  "*/"     yy_pop_state(yyscanner);
  <<EOF>>  yy_pop_state(yyscanner);
  .|\n     {}
}

"("                { yy_push_state(IN_PAREN, yyscanner); return '('; }
")" {
  if (YY_START != IN_PAREN) {
    /* NOTE(review): this branch is a ")" seen outside IN_PAREN, yet the
       message produced is "missing ) at end of line" -- confirm wording. */
    unexpected_newline(yylval, ")");
    return INVALID_CHARACTER;
  }
  yy_pop_state(yyscanner);
  return ')';
}

"["                { yy_push_state(IN_BRACKET_FIRST, yyscanner); return '['; }

<IN_BRACKET_FIRST>{
  "^"  { BEGIN IN_BRACKET; return '^'; }
  [^-\\\]\n] {
    yy_pop_state(yyscanner);
    yy_push_state(IN_BRACKET, yyscanner);
    yylval->integer = yytext[0];
    return CHAR;
  }
  "-" {
    yylval->integer = '-';
    return CHAR;
  }
}

  /* NOTE(review): scope reconstructed as IN_BRACKET only -- confirm. */
<IN_BRACKET>{
  "]" {
    yy_pop_state(yyscanner);
    if (YY_START == INITIAL || YY_START == IN_PAREN)
      return ']';
  }
  [^-\\\]\n]  { yylval->integer = yytext[0]; return CHAR; }
}

<IN_BRACKET,IN_BRACKET_FIRST>{
  \\[0-7]+ {
    BEGIN IN_BRACKET;
    long c = strtol(yytext+1, NULL, 8);
    if (UCHAR_MAX < c)
      return invalid_escape_octonary(yylval, yytext);
    yylval->integer = c;
    return CHAR;
  }
  \\u[0-9a-fA-F]+ {
    BEGIN IN_BRACKET;
    if (opt_bytes)
      return unexpected_codepoint(yylval);
    long c = strtol(yytext+2, NULL, 16);
    if (UINT16_MAX < c)
      return invalid_escape_u(yylval, yytext);
    yylval->integer = c;
    return CHAR;
  }
  \\U[0-9a-fA-F]+ {
    BEGIN IN_BRACKET;
    if (opt_bytes)
      return unexpected_codepoint(yylval);
    long c = strtol(yytext+2, NULL, 16);
    if (MAX_CODEPOINT < c)
      return invalid_escape_U(yylval, yytext);
    yylval->integer = c;
    return CHAR;
  }
  \\x[0-9a-fA-F]+ {
    BEGIN IN_BRACKET;
    long c = strtol(yytext+2, NULL, 16);
    if (UCHAR_MAX < c)
      return invalid_escape_x(yylval, yytext);
    yylval->integer = c;
    return CHAR;
  }
  \\a   { BEGIN IN_BRACKET; yylval->integer = '\a'; return CHAR; }
  \\b   { BEGIN IN_BRACKET; yylval->integer = '\b'; return CHAR; }
  \\f   { BEGIN IN_BRACKET; yylval->integer = '\f'; return CHAR; }
  \\n   { BEGIN IN_BRACKET; yylval->integer = '\n'; return CHAR; }
  \\r   { BEGIN IN_BRACKET; yylval->integer = '\r'; return CHAR; }
  \\t   { BEGIN IN_BRACKET; yylval->integer = '\t'; return CHAR; }
  \\v   { BEGIN IN_BRACKET; yylval->integer = '\v'; return CHAR; }
  \\.   { BEGIN IN_BRACKET; yylval->integer = yytext[1]; return CHAR; }
  -     { BEGIN IN_BRACKET; return '-'; }
  "\n"  { unexpected_newline(yylval, "]"); return INVALID_CHARACTER; }
  <<EOF>> {
    /* Leave the bracket state and report the error; without the return the
       message stored in yylval->errmsg would never reach the parser. */
    yy_pop_state(yyscanner);
    unexpected_eof(yylval, "]");
    return INVALID_CHARACTER;
  }
}

<AFTER_EXPORT>{
  /* optional 'BRACED_CODE' to specify extra parameters */
  "intact"       { yy_pop_state(yyscanner); return INTACT; }
  {L}({L}|{D})*  { yy_pop_state(yyscanner); yylval->str = new string(yytext); return IDENT; }
  "{"            { BEGIN IN_CODE; tmp_bracket.clear(); }
  [ \t\n]        {}
  <<EOF>>        yy_pop_state(yyscanner);
}

<EXPECT_CODE>{
  {L}({L}|{D})*  { yylval->str = new string(yytext); return IDENT; }
  "{"            { BEGIN IN_CODE; tmp_bracket.clear(); }
  [ \t\n]        {}
  <<EOF>>        yy_pop_state(yyscanner);
}

[>@%$]             { yy_push_state(AFTER_ACTION_OP, yyscanner); return yytext[0]; }
<AFTER_ACTION_OP>{
  -?{D}+         { yylval->integer = strtol(yytext, NULL, 10); return INTEGER; }
  {L}({L}|{D})*  { yy_pop_state(yyscanner); yylval->str = new string(yytext); return IDENT; }
  "{"            { BEGIN IN_CODE; tmp_bracket.clear(); }
  [ \t\n]+       {}
  <<EOF>>        yy_pop_state(yyscanner);
  .              { yylval->errmsg = strdup("invalid character"); return INVALID_CHARACTER; }
}

<IN_CODE>{
  "'"   { tmp_bracket += '\''; yy_push_state(IN_Q_STRING, yyscanner); }
  "\""  { tmp_bracket += '"'; yy_push_state(IN_QQ_STRING, yyscanner); }
  "{"   { tmp_bracket += '{'; yy_push_state(IN_CODE, yyscanner); }
  "}" {
    /* Only the outermost closing brace ends the block; inner ones are part
       of the collected text. */
    yy_pop_state(yyscanner);
    if (YY_START == INITIAL || YY_START == IN_PAREN) {
      yylval->str = new string(tmp_bracket);
      return BRACED_CODE;
    } else
      tmp_bracket += '}';
  }
  .|"\n"  tmp_bracket += yytext[0];
  <<EOF>> {
    yy_pop_state(yyscanner);
    unexpected_eof(yylval, "}");
    return INVALID_CHARACTER;
  }
}

'     { tmp_str.clear(); tmp_str_pos = yylloc->start; yy_push_state(IN_Q_STRING, yyscanner); }
"\""  { tmp_str.clear(); tmp_str_pos = yylloc->start; yy_push_state(IN_QQ_STRING, yyscanner); }

<IN_Q_STRING>{
  ' {
    /* A top-level string becomes a STRING_LITERAL spanning from its opening
       quote; a string nested in braced code is just appended to that code. */
    yy_pop_state(yyscanner);
    if (YY_START == INITIAL || YY_START == IN_PAREN) {
      yylval->str = new string(tmp_str);
      yylloc->start = tmp_str_pos;
      return STRING_LITERAL;
    }
    tmp_bracket += yytext;
  }
  <<EOF>> {
    yy_pop_state(yyscanner);
    unexpected_eof(yylval, "'");
    return INVALID_CHARACTER;
  }
}

<IN_QQ_STRING>{
  "\"" {
    yy_pop_state(yyscanner);
    if (YY_START == INITIAL || YY_START == IN_PAREN) {
      yylval->str = new string(tmp_str);
      yylloc->start = tmp_str_pos;
      return STRING_LITERAL;
    }
    tmp_bracket += yytext;
  }
  <<EOF>> {
    yy_pop_state(yyscanner);
    unexpected_eof(yylval, "\"");
    return INVALID_CHARACTER;
  }
}

<IN_Q_STRING,IN_QQ_STRING>{
  /* Every rule here appends the decoded character(s) to tmp_str and the raw
     source text to tmp_bracket. */
  \\[0-7]+ {
    long c = strtol(yytext+1, NULL, 8);
    if (UCHAR_MAX < c)
      return invalid_escape_octonary(yylval, yytext);
    tmp_str.push_back(c);
    tmp_bracket += yytext;
  }
  \\x[0-9a-fA-F]+ {
    long c = strtol(yytext+2, NULL, 16);
    if (UCHAR_MAX < c)
      return invalid_escape_x(yylval, yytext);
    tmp_str.push_back(c);
    tmp_bracket += yytext;
  }
  \\u[0-9a-fA-F]+ {
    char s[4];
    long c = strtol(yytext+2, NULL, 16), len = 0;
    if (UINT16_MAX < c)
      return invalid_escape_u(yylval, yytext);
    U8_APPEND_UNSAFE(s, len, c);
    tmp_str.insert(tmp_str.end(), s, s+len);
    tmp_bracket += yytext;
  }
  \\U[0-9a-fA-F]+ {
    char s[4];
    long c = strtol(yytext+2, NULL, 16), len = 0;
    if (MAX_CODEPOINT < c)
      return invalid_escape_U(yylval, yytext);
    U8_APPEND_UNSAFE(s, len, c);
    tmp_str.insert(tmp_str.end(), s, s+len);
    tmp_bracket += yytext;
  }
  \\a   { tmp_str += '\a'; tmp_bracket += yytext; }
  \\b   { tmp_str += '\b'; tmp_bracket += yytext; }
  \\f   { tmp_str += '\f'; tmp_bracket += yytext; }
  \\n   { tmp_str += '\n'; tmp_bracket += yytext; }
  \\r   { tmp_str += '\r'; tmp_bracket += yytext; }
  \\t   { tmp_str += '\t'; tmp_bracket += yytext; }
  \\v   { tmp_str += '\v'; tmp_bracket += yytext; }
  \\[\n\"\'?\\]  { tmp_str += yytext[1]; tmp_bracket += yytext; }
  \\.   return invalid_escape(yylval, yytext);
  .|\n  { tmp_str += yytext[0]; tmp_bracket += yytext[0]; }
}

\\\n     {}
"\n"     if (YY_START == INITIAL && ! semicolon) return '\n';
[ \t]+   {}
. {
  yylval->errmsg = strdup("invalid character");
  return INVALID_CHARACTER;
}