SafeLine/yanshi/src/lexer.l
2023-07-20 15:19:03 +08:00

326 lines
9.2 KiB
Text

%{
#include "lexer_helper.hh"
#include "option.hh"
#include "parser.hh"
#include "syntax.hh"
#include <limits.h>
#include <stdlib.h>
#include <string>
#include <string.h>
#include <unicode/utf8.h>
using namespace std;
// Location bookkeeping hook; flex runs YY_USER_ACTION before each matched
// rule's action. The running input offset is kept in the scanner's "extra"
// slot (declared as a long via %option extra-type): the token starts at the
// stored offset, ends yyleng characters later, and the end becomes the new
// stored offset.
// NOTE(review): the semicolon after `while (0)` looks deliberate -- the
// generated scanner expands this macro without a trailing ';'. Confirm before
// "tidying" it away.
#define YY_USER_ACTION \
do { \
yylloc->start = yyget_extra(yyscanner); \
yylloc->end = yylloc->start + yyleng; \
yyset_extra(yylloc->end, yyscanner); \
} while (0);
// Scratch state shared by the rule actions below.
//   tmp_bracket - raw text accumulated for a braced code block (BRACED_CODE).
//   tmp_str     - decoded contents accumulated for a STRING_LITERAL.
//   tmp_str_pos - offset of the opening quote, restored into yylloc->start
//                 when the STRING_LITERAL token is returned.
//   semicolon   - set by the "semicolon" keyword, cleared by "nosemicolon" and
//                 by raw_yywrap; selects whether ';' or a newline produces the
//                 '\n' token.
// NOTE(review): these are file-scope statics although the scanner is declared
// %option reentrant, so two scanners running at once would share them.
static string tmp_bracket, tmp_str;
static long tmp_str_pos;
static bool semicolon;
/**
 * Report a backslash escape that the string rules do not recognise.
 *
 * Stores "invalid \-escape: <text>" in yylval->errmsg, where text is the
 * escape sequence exactly as it was matched.
 *
 * The return type is int (it used to be long) so that it agrees with the
 * sibling invalid_escape_* helpers; the value is returned directly from rule
 * actions as a token code.
 *
 * @return INVALID_CHARACTER.
 */
static int invalid_escape(YYSTYPE* yylval, const char* text)
{
  yylval->errmsg = aprintf("invalid \\-escape: %s", text);
  return INVALID_CHARACTER;
}
/**
 * Report an octal escape that the calling rule rejected (the rules use it
 * when the value exceeds UCHAR_MAX).
 *
 * Stores "invalid number after \-escape: <text>" in yylval->errmsg.
 *
 * @return INVALID_CHARACTER, so an action can `return` the call directly.
 */
static int invalid_escape_octonary(YYSTYPE *yylval, const char *text)
{
    yylval->errmsg = aprintf("invalid number after \\-escape: %s", text);
    return INVALID_CHARACTER;
}
/**
 * Report a \x escape that the calling rule rejected (the rules use it when
 * the value exceeds UCHAR_MAX).
 *
 * Stores "invalid number after \x-escape: <text>" in yylval->errmsg.
 *
 * @return INVALID_CHARACTER, so an action can `return` the call directly.
 */
static int invalid_escape_x(YYSTYPE *yylval, const char *text)
{
    yylval->errmsg = aprintf("invalid number after \\x-escape: %s", text);
    return INVALID_CHARACTER;
}
/**
 * Report a \u escape that the calling rule rejected (the rules use it when
 * the value exceeds UINT16_MAX).
 *
 * Stores "invalid number after \u-escape: <text>" in yylval->errmsg.
 *
 * @return INVALID_CHARACTER, so an action can `return` the call directly.
 */
static int invalid_escape_u(YYSTYPE *yylval, const char *text)
{
    yylval->errmsg = aprintf("invalid number after \\u-escape: %s", text);
    return INVALID_CHARACTER;
}
/**
 * Report a \U escape that the calling rule rejected (the rules use it when
 * the value exceeds MAX_CODEPOINT).
 *
 * Stores "invalid number after \U-escape: <text>" in yylval->errmsg.
 *
 * @return INVALID_CHARACTER, so an action can `return` the call directly.
 */
static int invalid_escape_U(YYSTYPE *yylval, const char *text)
{
    yylval->errmsg = aprintf("invalid number after \\U-escape: %s", text);
    return INVALID_CHARACTER;
}
/**
 * Build the diagnostic for a construct left open at end of input.
 *
 * Stores "missing <token_end> at end of file" in yylval->errmsg, where
 * token_end is the closing delimiter that was expected. Nothing is returned;
 * the calling action decides which token, if any, to report.
 */
static void unexpected_eof(YYSTYPE *yylval, const char *token_end)
{
    yylval->errmsg = aprintf("missing %s at end of file", token_end);
}
/**
 * Build the diagnostic for a construct left open at end of line.
 *
 * Stores "missing <token_end> at end of line" in yylval->errmsg, where
 * token_end is the closing delimiter that was expected. Nothing is returned;
 * the calling action decides which token, if any, to report.
 */
static void unexpected_newline(YYSTYPE *yylval, const char *token_end)
{
    yylval->errmsg = aprintf("missing %s at end of line", token_end);
}
/**
 * Report a Unicode code point escape where none is allowed; the bracket
 * rules call this for \u and \U escapes while opt_bytes is set.
 *
 * Stores "cannot use Unicode codepoints" in yylval->errmsg.
 *
 * @return INVALID_CHARACTER, so an action can `return` the call directly.
 */
static int unexpected_codepoint(YYSTYPE *yylval)
{
    yylval->errmsg = aprintf("cannot use Unicode codepoints");
    return INVALID_CHARACTER;
}
/**
 * End-of-input hook invoked by the generated scanner (%option yywrap with the
 * raw_yy prefix).
 *
 * Switches "semicolon" mode back off and returns 1, which tells flex there is
 * no further input to continue with.
 */
extern "C" int raw_yywrap(yyscan_t yyscanner)
{
    (void)yyscanner; /* the handle is not needed here */
    semicolon = false;
    return 1;
}
%}
/* Scanner options. yywrap: call the wrap hook (raw_yywrap) at end of input.
   noinput: do not generate the input() helper. */
%option yywrap noinput
/* Reentrant API: scanner state is carried in a yyscan_t handle. */
%option reentrant
/* Type of the per-scanner "extra" slot; YY_USER_ACTION stores the running
   input offset there. */
%option extra-type="long"
/* The scanner function receives yylval and yylloc pointers from bison. */
%option bison-bridge bison-locations
/* Generated symbols are prefixed raw_yy instead of yy. */
%option prefix="raw_yy"
/* Enables yy_push_state / yy_pop_state. */
%option stack
/* Start conditions. %x states are exclusive: only rules that name them are
   active. IN_PAREN is the one inclusive (%s) state, so rules written without a
   start condition stay active inside parentheses. */
%x EXPECT_CODE
%x AFTER_ACTION_OP
%x AFTER_EXPORT
%x IN_BRACE
%x IN_CODE
%x IN_COMMENT
%x IN_BRACKET
%x IN_BRACKET_FIRST
%x IN_LINE_COMMENT
%s IN_PAREN
%x IN_Q_STRING
%x IN_QQ_STRING
/* Named patterns: D = decimal digit, H = hexadecimal digit, L = identifier
   character other than a digit (ASCII letter, underscore, or any byte
   0x80-0xff). H is not referenced by the rules in this file as shown; they
   spell the hex class out. */
D [0-9]
H [0-9A-Fa-f]
L [a-zA-Z_\x80-\xff]
%%
"::"                 return COLONCOLON;
  /* Rules without a start-condition prefix: active in INITIAL and in the
     inclusive IN_PAREN state. Multi-character operators win over the
     single-character class below because flex prefers the longest match. */
".."                 return DOTDOT;
"&&"                 return AMPERAMPER;
  /* ';' is reported as the statement terminator '\n' only in "semicolon"
     mode; otherwise the action returns nothing and the ';' is skipped. */
";"                  if (semicolon) return '\n';
[-~!&*=+,.?|{}:]     return yytext[0];
  /* Keywords. They are listed before the identifier rule so that, on a tie in
     length, the keyword rule is chosen. "action" and "c++" may be followed by
     a braced code block, "export" by its own optional arguments. */
"action"             yy_push_state(EXPECT_CODE, yyscanner); return ACTION;
"as"                 return AS;
"c++"                yy_push_state(EXPECT_CODE, yyscanner); return CPP;
"epsilon"            return EPSILON;
"export"             yy_push_state(AFTER_EXPORT, yyscanner); return EXPORT;
"import"             return IMPORT;
"intact"             return INTACT;
  /* Mode switches: they change how ';' and newlines are tokenised and return
     no token themselves. */
"semicolon"          semicolon = true;
"nosemicolon"        semicolon = false;
{L}({L}|{D})*        yylval->str = new string(yytext); return IDENT;
  /* strtol instead of atol: atol has undefined behaviour when the digits do
     not fit in a long, whereas strtol is defined to return LONG_MAX. */
{D}+                 yylval->integer = strtol(yytext, NULL, 10); return INTEGER;
  /* "#define" is longer than "#", so it is preferred over the line-comment
     rule that follows. */
"#define"            return PREPROCESS_DEFINE;
"#"                  yy_push_state(IN_LINE_COMMENT, yyscanner);
"//"                 yy_push_state(IN_LINE_COMMENT, yyscanner);
<IN_LINE_COMMENT>{
  /* Body of a '#' or '//' comment. The terminating newline is not consumed
     here: the state is popped, the newline is pushed back so the enclosing
     state's own rule handles it, and the stored input offset is rewound by
     one so the pushed-back character is not counted twice. */
"\n" yy_pop_state(yyscanner); unput('\n'); yyset_extra(yylloc->end-1, yyscanner);
  /* End of input simply ends the comment. */
<<EOF>> yy_pop_state(yyscanner);
  /* Any other character is discarded. */
. {}
}
"/*" yy_push_state(IN_COMMENT, yyscanner);
<IN_COMMENT>{
  /* Block comment body. The first closing star-slash pops the state, so
     block comments do not nest. */
"*/" yy_pop_state(yyscanner);
  /* End of input inside the comment pops the state without any diagnostic. */
<<EOF>> yy_pop_state(yyscanner);
  /* Everything else, newlines included, is discarded. */
.|\n {}
}
"(" yy_push_state(IN_PAREN, yyscanner); return '(';
")" {
    /* This rule has no start-condition prefix, so it fires in INITIAL and in
       IN_PAREN. Outside IN_PAREN there is no open '(' to close: report the
       stray ')' as such. (The previous text, "missing ) at end of line",
       described a different error.) errmsg is filled with strdup, as the
       "invalid character" rules do. */
    if (YY_START != IN_PAREN) {
      yylval->errmsg = strdup("unmatched )");
      return INVALID_CHARACTER;
    }
    yy_pop_state(yyscanner);
    return ')';
  }
"[" yy_push_state(IN_BRACKET_FIRST, yyscanner); return '[';
<IN_BRACKET_FIRST>{
  /* First position after '['. Here '^' is the negation marker and '-' is an
     ordinary character. Every rule leaves this state, so the special
     treatment applies to exactly one position. */
  "^"          BEGIN IN_BRACKET; return '^';
  /* ']' right after '[': close the bracket. This state is exclusive and had
     no rule for ']', so the character fell through to flex's default rule
     (echoed to yyout) and the scanner stayed inside the bracket. */
  "]"          yy_pop_state(yyscanner); return ']';
  /* A leading '-' is a literal. It must also switch to IN_BRACKET; otherwise
     a following '-' or '^' would still be scanned as if it were first. */
  "-"          BEGIN IN_BRACKET; yylval->integer = '-'; return CHAR;
  /* Any other literal byte. The cast keeps bytes >= 0x80 in 0..255 even where
     plain char is signed, matching what the numeric escapes produce. */
  [^-\\\]\n]   BEGIN IN_BRACKET; yylval->integer = (unsigned char)yytext[0]; return CHAR;
}
<IN_BRACKET>{
  /* Inside a bracket expression, past the first position. */
  "]" {
      /* Leave the bracket; ']' is reported only when that returns the scanner
         to INITIAL or IN_PAREN. */
      yy_pop_state(yyscanner);
      if (YY_START == INITIAL || YY_START == IN_PAREN)
        return ']';
    }
  /* Literal byte. The cast keeps bytes >= 0x80 in 0..255 even where plain
     char is signed, matching what the numeric escapes produce. */
  [^-\\\]\n]   yylval->integer = (unsigned char)yytext[0]; return CHAR;
}
<IN_BRACKET_FIRST,IN_BRACKET>{
  /* Escapes and punctuation shared by both bracket states. Each rule that
     consumes an element switches to IN_BRACKET first (also on the error
     paths), because the first position is over once anything was read. */
  \\[0-7]+ {
      /* Octal escape; the value must fit in one byte. */
      BEGIN IN_BRACKET;
      long c = strtol(yytext+1, NULL, 8);
      if (UCHAR_MAX < c)
        return invalid_escape_octonary(yylval, yytext);
      yylval->integer = c;
      return CHAR;
    }
  \\u{H}+ {
      /* Code point escape limited to UINT16_MAX; rejected when opt_bytes is
         set. */
      BEGIN IN_BRACKET;
      if (opt_bytes)
        return unexpected_codepoint(yylval);
      long c = strtol(yytext+2, NULL, 16);
      if (UINT16_MAX < c)
        return invalid_escape_u(yylval, yytext);
      yylval->integer = c;
      return CHAR;
    }
  \\U{H}+ {
      /* Code point escape limited to MAX_CODEPOINT; rejected when opt_bytes
         is set. */
      BEGIN IN_BRACKET;
      if (opt_bytes)
        return unexpected_codepoint(yylval);
      long c = strtol(yytext+2, NULL, 16);
      if (MAX_CODEPOINT < c)
        return invalid_escape_U(yylval, yytext);
      yylval->integer = c;
      return CHAR;
    }
  \\x{H}+ {
      /* Hexadecimal byte escape; the value must fit in one byte. */
      BEGIN IN_BRACKET;
      long c = strtol(yytext+2, NULL, 16);
      if (UCHAR_MAX < c)
        return invalid_escape_x(yylval, yytext);
      yylval->integer = c;
      return CHAR;
    }
  \\a      BEGIN IN_BRACKET; yylval->integer = '\a'; return CHAR;
  \\b      BEGIN IN_BRACKET; yylval->integer = '\b'; return CHAR;
  \\f      BEGIN IN_BRACKET; yylval->integer = '\f'; return CHAR;
  \\n      BEGIN IN_BRACKET; yylval->integer = '\n'; return CHAR;
  \\r      BEGIN IN_BRACKET; yylval->integer = '\r'; return CHAR;
  \\t      BEGIN IN_BRACKET; yylval->integer = '\t'; return CHAR;
  \\v      BEGIN IN_BRACKET; yylval->integer = '\v'; return CHAR;
  /* Any other escaped character stands for itself. The cast keeps bytes
     >= 0x80 in 0..255 even where plain char is signed. */
  \\.      BEGIN IN_BRACKET; yylval->integer = (unsigned char)yytext[1]; return CHAR;
  /* Range operator. In IN_BRACKET_FIRST the state-specific '-' rule appears
     earlier in the file and takes precedence over this one. */
  "-"      BEGIN IN_BRACKET; return '-';
  /* A bracket expression may not span lines; the state is left unchanged. */
  "\n"     unexpected_newline(yylval, "]"); return INVALID_CHARACTER;
  /* Unterminated bracket at end of input. The error token is returned so the
     message built by unexpected_eof is delivered instead of being dropped. */
  <<EOF>>  yy_pop_state(yyscanner); unexpected_eof(yylval, "]"); return INVALID_CHARACTER;
}
<AFTER_EXPORT>{
  /* optional 'BRACED_CODE' to specify extra parameters */
  /* Entered after the "export" keyword. "intact" or an identifier ends the
     state; '{' replaces it with IN_CODE and starts a fresh code buffer. */
  "intact"        yy_pop_state(yyscanner); return INTACT;
  {L}({L}|{D})*   yy_pop_state(yyscanner); yylval->str = new string(yytext); return IDENT;
  "{"             BEGIN IN_CODE; tmp_bracket.clear();
  [ \t\n]         {}
  <<EOF>>         yy_pop_state(yyscanner);
  /* Catch-all, as in AFTER_ACTION_OP. This state is exclusive, so without it
     any other character fell through to flex's default rule and was echoed to
     yyout instead of being diagnosed. */
  .               yylval->errmsg = strdup("invalid character"); return INVALID_CHARACTER;
}
<EXPECT_CODE>{
  /* Entered after "action" or "c++". Identifiers are returned without leaving
     the state; '{' replaces it with IN_CODE and starts a fresh code buffer. */
  {L}({L}|{D})*   yylval->str = new string(yytext); return IDENT;
  "{"             BEGIN IN_CODE; tmp_bracket.clear();
  [ \t\n]         {}
  <<EOF>>         yy_pop_state(yyscanner);
  /* Catch-all, as in AFTER_ACTION_OP. This state is exclusive, so without it
     any other character fell through to flex's default rule and was echoed to
     yyout instead of being diagnosed. */
  .               yylval->errmsg = strdup("invalid character"); return INVALID_CHARACTER;
}
[>@%$] yy_push_state(AFTER_ACTION_OP, yyscanner); return yytext[0];
<AFTER_ACTION_OP>{
  /* What may follow one of the operators > @ % $: an optional signed integer
     (the state is kept), then either an identifier (the state is popped) or
     an inline '{' code block (the state is replaced by IN_CODE). */
  /* strtol instead of atol: atol has undefined behaviour when the digits do
     not fit in a long, whereas strtol is defined to saturate. */
  -?{D}+          yylval->integer = strtol(yytext, NULL, 10); return INTEGER;
  {L}({L}|{D})*   yy_pop_state(yyscanner); yylval->str = new string(yytext); return IDENT;
  "{"             BEGIN IN_CODE; tmp_bracket.clear();
  [ \t\n]+        {}
  <<EOF>>         yy_pop_state(yyscanner);
  .               yylval->errmsg = strdup("invalid character"); return INVALID_CHARACTER;
}
<IN_CODE>{
  /* Body of a braced code block; its raw text is collected in tmp_bracket.
     Quotes push a string state so braces inside string literals are not
     counted, and each nested '{' pushes IN_CODE again. */
  "'"      { tmp_bracket += '\''; yy_push_state(IN_Q_STRING, yyscanner); }
  "\""     { tmp_bracket += '"'; yy_push_state(IN_QQ_STRING, yyscanner); }
  "{"      { tmp_bracket += '{'; yy_push_state(IN_CODE, yyscanner); }
  "}" {
      /* The outermost '}' (the pop lands in INITIAL or IN_PAREN) ends the
         block and is not part of the collected text; an inner '}' is. */
      yy_pop_state(yyscanner);
      if (YY_START == INITIAL || YY_START == IN_PAREN) {
        yylval->str = new string(tmp_bracket);
        return BRACED_CODE;
      } else
        tmp_bracket += '}';
    }
  .|"\n"   tmp_bracket += yytext[0];
  /* Unterminated block at end of input. The error token is returned so the
     message built by unexpected_eof is delivered instead of being dropped
     (and overwritten, one per nesting level, without being freed). */
  <<EOF>>  yy_pop_state(yyscanner); unexpected_eof(yylval, "}"); return INVALID_CHARACTER;
}
' tmp_str.clear(); tmp_str_pos = yylloc->start; yy_push_state(IN_Q_STRING, yyscanner);
  /* Opening quote of a string literal (single-quoted above, double-quoted
     below): reset the decoded-text buffer, remember where the literal starts
     so the STRING_LITERAL location can begin at the quote, and enter the
     matching string state. */
"\"" tmp_str.clear(); tmp_str_pos = yylloc->start; yy_push_state(IN_QQ_STRING, yyscanner);
<IN_Q_STRING>{
  /* Closing quote of a single-quoted string. */
  "'" {
      yy_pop_state(yyscanner);
      if (YY_START == INITIAL || YY_START == IN_PAREN) {
        /* A literal of its own: hand over the decoded text and make the
           location start at the opening quote. */
        yylval->str = new string(tmp_str);
        yylloc->start = tmp_str_pos;
        return STRING_LITERAL;
      }
      /* Otherwise the string was part of a code block: keep the quote in the
         collected code text. */
      tmp_bracket += yytext;
    }
  /* Unterminated string at end of input. The error token is returned so the
     message built by unexpected_eof is delivered instead of being dropped. */
  <<EOF>>  yy_pop_state(yyscanner); unexpected_eof(yylval, "'"); return INVALID_CHARACTER;
}
<IN_QQ_STRING>{
  /* Closing quote of a double-quoted string. */
  "\"" {
      yy_pop_state(yyscanner);
      if (YY_START == INITIAL || YY_START == IN_PAREN) {
        /* A literal of its own: hand over the decoded text and make the
           location start at the opening quote. */
        yylval->str = new string(tmp_str);
        yylloc->start = tmp_str_pos;
        return STRING_LITERAL;
      }
      /* Otherwise the string was part of a code block: keep the quote in the
         collected code text. */
      tmp_bracket += yytext;
    }
  /* Unterminated string at end of input. The error token is returned so the
     message built by unexpected_eof is delivered instead of being dropped. */
  <<EOF>>  yy_pop_state(yyscanner); unexpected_eof(yylval, "\""); return INVALID_CHARACTER;
}
<IN_Q_STRING,IN_QQ_STRING>{
  /* Contents shared by both string states. Each rule appends the decoded
     byte(s) to tmp_str and the text exactly as written to tmp_bracket; which
     of the two buffers is used afterwards is decided by the closing-quote
     rules. */
\\[0-7]+ {
  /* Octal escape; values above UCHAR_MAX are rejected. */
long c = strtol(yytext+1, NULL, 8);
if (UCHAR_MAX < c)
return invalid_escape_octonary(yylval, yytext);
tmp_str.push_back(c);
tmp_bracket += yytext;
}
\\x[0-9a-fA-F]+ {
  /* Hexadecimal byte escape; values above UCHAR_MAX are rejected. */
long c = strtol(yytext+2, NULL, 16);
if (UCHAR_MAX < c)
return invalid_escape_x(yylval, yytext);
tmp_str.push_back(c);
tmp_bracket += yytext;
}
\\u[0-9a-fA-F]+ {
  /* Code point escape up to UINT16_MAX, appended to tmp_str as the bytes
     written by U8_APPEND_UNSAFE; len counts the bytes written into s. */
char s[4];
long c = strtol(yytext+2, NULL, 16), len = 0;
if (UINT16_MAX < c)
return invalid_escape_u(yylval, yytext);
U8_APPEND_UNSAFE(s, len, c);
tmp_str.insert(tmp_str.end(), s, s+len);
tmp_bracket += yytext;
}
\\U[0-9a-fA-F]+ {
  /* Code point escape up to MAX_CODEPOINT, appended to tmp_str as the bytes
     written by U8_APPEND_UNSAFE; len counts the bytes written into s. */
char s[4];
long c = strtol(yytext+2, NULL, 16), len = 0;
if (MAX_CODEPOINT < c)
return invalid_escape_U(yylval, yytext);
U8_APPEND_UNSAFE(s, len, c);
tmp_str.insert(tmp_str.end(), s, s+len);
tmp_bracket += yytext;
}
  /* Single-letter control-character escapes. */
\\a tmp_str += '\a'; tmp_bracket += yytext;
\\b tmp_str += '\b'; tmp_bracket += yytext;
\\f tmp_str += '\f'; tmp_bracket += yytext;
\\n tmp_str += '\n'; tmp_bracket += yytext;
\\r tmp_str += '\r'; tmp_bracket += yytext;
\\t tmp_str += '\t'; tmp_bracket += yytext;
\\v tmp_str += '\v'; tmp_bracket += yytext;
  /* Backslash before a newline, either quote, '?' or another backslash yields
     that character itself (so an escaped newline stays in the string). */
\\[\n\"\'?\\] tmp_str += yytext[1]; tmp_bracket += yytext;
  /* Any other escape is an error. */
\\. return invalid_escape(yylval, yytext);
  /* Ordinary character, raw newlines included. */
.|\n tmp_str += yytext[0]; tmp_bracket += yytext[0];
}
\\\n {}
  /* Above: a backslash directly before a newline is skipped, so no '\n'
     token is produced for that line break. */
  /* A newline is reported as the '\n' token only in the INITIAL state and
     only when "semicolon" mode is off; otherwise it is skipped. */
"\n" if (YY_START == INITIAL && ! semicolon) return '\n';
  /* Blanks between tokens are skipped. */
[ \t]+ {}
  /* Anything not matched by an earlier rule is an error. */
. { yylval->errmsg = strdup("invalid character"); return INVALID_CHARACTER; }