RegexParser.cpp 81 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. * Copyright (c) 2020-2021, the SerenityOS developers.
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include "RegexParser.h"
  8. #include "RegexDebug.h"
  9. #include <AK/CharacterTypes.h>
  10. #include <AK/String.h>
  11. #include <AK/StringBuilder.h>
  12. #include <AK/StringUtils.h>
  13. #include <AK/Utf16View.h>
  14. #include <LibUnicode/CharacterTypes.h>
  15. namespace regex {
  16. static constexpr size_t s_maximum_repetition_count = 1024 * 1024;
  17. static constexpr u64 s_ecma262_maximum_repetition_count = (1ull << 53) - 1;
  18. static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv;
  19. static constexpr auto s_decimal_characters = "0123456789"sv;
  20. static constexpr StringView identity_escape_characters(bool unicode, bool browser_extended)
  21. {
  22. if (unicode)
  23. return "^$\\.*+?()[]{}|/"sv;
  24. if (browser_extended)
  25. return "^$\\.*+?()[|"sv;
  26. return "^$\\.*+?()[]{}|"sv;
  27. }
  28. ALWAYS_INLINE bool Parser::set_error(Error error)
  29. {
  30. if (m_parser_state.error == Error::NoError) {
  31. m_parser_state.error = error;
  32. m_parser_state.error_token = m_parser_state.current_token;
  33. }
  34. return false; // always return false, that eases the API usage (return set_error(...)) :^)
  35. }
  36. ALWAYS_INLINE bool Parser::done() const
  37. {
  38. return match(TokenType::Eof);
  39. }
  40. ALWAYS_INLINE bool Parser::match(TokenType type) const
  41. {
  42. return m_parser_state.current_token.type() == type;
  43. }
  44. ALWAYS_INLINE bool Parser::match(char ch) const
  45. {
  46. return m_parser_state.current_token.type() == TokenType::Char && m_parser_state.current_token.value().length() == 1 && m_parser_state.current_token.value()[0] == ch;
  47. }
  48. ALWAYS_INLINE Token Parser::consume()
  49. {
  50. auto old_token = m_parser_state.current_token;
  51. m_parser_state.current_token = m_parser_state.lexer.next();
  52. return old_token;
  53. }
  54. ALWAYS_INLINE Token Parser::consume(TokenType type, Error error)
  55. {
  56. if (m_parser_state.current_token.type() != type) {
  57. set_error(error);
  58. dbgln_if(REGEX_DEBUG, "[PARSER] Error: Unexpected token {}. Expected: {}", m_parser_state.current_token.name(), Token::name(type));
  59. }
  60. return consume();
  61. }
  62. ALWAYS_INLINE bool Parser::consume(String const& str)
  63. {
  64. size_t potentially_go_back { 1 };
  65. for (auto ch : str) {
  66. if (match(TokenType::Char)) {
  67. if (m_parser_state.current_token.value()[0] != ch) {
  68. m_parser_state.lexer.back(potentially_go_back);
  69. m_parser_state.current_token = m_parser_state.lexer.next();
  70. return false;
  71. }
  72. } else {
  73. m_parser_state.lexer.back(potentially_go_back);
  74. m_parser_state.current_token = m_parser_state.lexer.next();
  75. return false;
  76. }
  77. consume(TokenType::Char, Error::NoError);
  78. ++potentially_go_back;
  79. }
  80. return true;
  81. }
  82. ALWAYS_INLINE bool Parser::try_skip(StringView str)
  83. {
  84. if (str.starts_with(m_parser_state.current_token.value()))
  85. str = str.substring_view(m_parser_state.current_token.value().length(), str.length() - m_parser_state.current_token.value().length());
  86. else
  87. return false;
  88. size_t potentially_go_back { 0 };
  89. for (auto ch : str) {
  90. if (!m_parser_state.lexer.try_skip(ch)) {
  91. m_parser_state.lexer.back(potentially_go_back);
  92. return false;
  93. }
  94. ++potentially_go_back;
  95. }
  96. m_parser_state.current_token = m_parser_state.lexer.next();
  97. return true;
  98. }
  99. ALWAYS_INLINE bool Parser::lookahead_any(StringView str)
  100. {
  101. for (auto ch : str) {
  102. if (match(ch))
  103. return true;
  104. }
  105. return false;
  106. }
  107. ALWAYS_INLINE char Parser::skip()
  108. {
  109. char ch;
  110. if (m_parser_state.current_token.value().length() == 1) {
  111. ch = m_parser_state.current_token.value()[0];
  112. } else {
  113. m_parser_state.lexer.back(m_parser_state.current_token.value().length());
  114. ch = m_parser_state.lexer.skip();
  115. }
  116. m_parser_state.current_token = m_parser_state.lexer.next();
  117. return ch;
  118. }
  119. ALWAYS_INLINE void Parser::back(size_t count)
  120. {
  121. m_parser_state.lexer.back(count);
  122. m_parser_state.current_token = m_parser_state.lexer.next();
  123. }
  124. ALWAYS_INLINE void Parser::reset()
  125. {
  126. m_parser_state.bytecode.clear();
  127. m_parser_state.lexer.reset();
  128. m_parser_state.current_token = m_parser_state.lexer.next();
  129. m_parser_state.error = Error::NoError;
  130. m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) };
  131. m_parser_state.capture_group_minimum_lengths.clear();
  132. m_parser_state.capture_groups_count = 0;
  133. m_parser_state.named_capture_groups_count = 0;
  134. m_parser_state.named_capture_groups.clear();
  135. }
  136. Parser::Result Parser::parse(Optional<AllOptions> regex_options)
  137. {
  138. reset();
  139. if (regex_options.has_value())
  140. m_parser_state.regex_options = regex_options.value();
  141. if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum))
  142. consume(TokenType::Eof, Error::InvalidPattern);
  143. else
  144. set_error(Error::InvalidPattern);
  145. dbgln_if(REGEX_DEBUG, "[PARSER] Produced bytecode with {} entries (opcodes + arguments)", m_parser_state.bytecode.size());
  146. return {
  147. move(m_parser_state.bytecode),
  148. move(m_parser_state.capture_groups_count),
  149. move(m_parser_state.named_capture_groups_count),
  150. move(m_parser_state.match_length_minimum),
  151. move(m_parser_state.error),
  152. move(m_parser_state.error_token)
  153. };
  154. }
  155. ALWAYS_INLINE bool Parser::match_ordinary_characters()
  156. {
  157. // NOTE: This method must not be called during bracket and repetition parsing!
  158. // FIXME: Add assertion for that?
  159. auto type = m_parser_state.current_token.type();
  160. return (type == TokenType::Char
  161. || type == TokenType::Comma
  162. || type == TokenType::Slash
  163. || type == TokenType::EqualSign
  164. || type == TokenType::HyphenMinus
  165. || type == TokenType::Colon);
  166. }
  167. // =============================
  168. // Abstract Posix Parser
  169. // =============================
  170. ALWAYS_INLINE bool AbstractPosixParser::parse_bracket_expression(Vector<CompareTypeAndValuePair>& values, size_t& match_length_minimum)
  171. {
  172. for (; !done();) {
  173. if (match(TokenType::HyphenMinus)) {
  174. consume();
  175. if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
  176. // first in the bracket expression
  177. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  178. } else if (match(TokenType::RightBracket)) {
  179. // Last in the bracket expression
  180. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  181. } else if (values.last().type == CharacterCompareType::Char) {
  182. values.append({ CharacterCompareType::RangeExpressionDummy, 0 });
  183. if (done())
  184. return set_error(Error::MismatchingBracket);
  185. if (match(TokenType::HyphenMinus)) {
  186. consume();
  187. // Valid range, add ordinary character
  188. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  189. }
  190. } else {
  191. return set_error(Error::InvalidRange);
  192. }
  193. } else if (match(TokenType::Circumflex)) {
  194. auto t = consume();
  195. if (values.is_empty())
  196. values.append({ CharacterCompareType::Inverse, 0 });
  197. else
  198. values.append({ CharacterCompareType::Char, (ByteCodeValueType)*t.value().characters_without_null_termination() });
  199. } else if (match(TokenType::LeftBracket)) {
  200. consume();
  201. if (match(TokenType::Period)) {
  202. consume();
  203. // FIXME: Parse collating element, this is needed when we have locale support
  204. // This could have impact on length parameter, I guess.
  205. set_error(Error::InvalidCollationElement);
  206. consume(TokenType::Period, Error::InvalidCollationElement);
  207. consume(TokenType::RightBracket, Error::MismatchingBracket);
  208. } else if (match(TokenType::EqualSign)) {
  209. consume();
  210. // FIXME: Parse collating element, this is needed when we have locale support
  211. // This could have impact on length parameter, I guess.
  212. set_error(Error::InvalidCollationElement);
  213. consume(TokenType::EqualSign, Error::InvalidCollationElement);
  214. consume(TokenType::RightBracket, Error::MismatchingBracket);
  215. } else if (match(TokenType::Colon)) {
  216. consume();
  217. CharClass ch_class;
  218. // parse character class
  219. if (match(TokenType::Char)) {
  220. if (consume("alnum"))
  221. ch_class = CharClass::Alnum;
  222. else if (consume("alpha"))
  223. ch_class = CharClass::Alpha;
  224. else if (consume("blank"))
  225. ch_class = CharClass::Blank;
  226. else if (consume("cntrl"))
  227. ch_class = CharClass::Cntrl;
  228. else if (consume("digit"))
  229. ch_class = CharClass::Digit;
  230. else if (consume("graph"))
  231. ch_class = CharClass::Graph;
  232. else if (consume("lower"))
  233. ch_class = CharClass::Lower;
  234. else if (consume("print"))
  235. ch_class = CharClass::Print;
  236. else if (consume("punct"))
  237. ch_class = CharClass::Punct;
  238. else if (consume("space"))
  239. ch_class = CharClass::Space;
  240. else if (consume("upper"))
  241. ch_class = CharClass::Upper;
  242. else if (consume("xdigit"))
  243. ch_class = CharClass::Xdigit;
  244. else
  245. return set_error(Error::InvalidCharacterClass);
  246. values.append({ CharacterCompareType::CharClass, (ByteCodeValueType)ch_class });
  247. } else
  248. return set_error(Error::InvalidCharacterClass);
  249. // FIXME: we do not support locale specific character classes until locales are implemented
  250. consume(TokenType::Colon, Error::InvalidCharacterClass);
  251. consume(TokenType::RightBracket, Error::MismatchingBracket);
  252. } else {
  253. return set_error(Error::MismatchingBracket);
  254. }
  255. } else if (match(TokenType::RightBracket)) {
  256. if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
  257. // handle bracket as ordinary character
  258. values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() });
  259. } else {
  260. // closing bracket expression
  261. break;
  262. }
  263. } else {
  264. values.append({ CharacterCompareType::Char, (ByteCodeValueType)skip() });
  265. }
  266. // check if range expression has to be completed...
  267. if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) {
  268. if (values.last().type != CharacterCompareType::Char)
  269. return set_error(Error::InvalidRange);
  270. auto value2 = values.take_last();
  271. values.take_last(); // RangeExpressionDummy
  272. auto value1 = values.take_last();
  273. values.append({ CharacterCompareType::CharRange, static_cast<ByteCodeValueType>(CharRange { (u32)value1.value, (u32)value2.value }) });
  274. }
  275. }
  276. if (!values.is_empty()) {
  277. match_length_minimum = 1;
  278. if (values.first().type == CharacterCompareType::Inverse)
  279. match_length_minimum = 0;
  280. }
  281. return true;
  282. }
  283. // =============================
  284. // PosixBasic Parser
  285. // =============================
  286. bool PosixBasicParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
  287. {
  288. return parse_root(stack, match_length_minimum);
  289. }
  290. bool PosixBasicParser::parse_root(ByteCode& bytecode, size_t& match_length_minimum)
  291. {
  292. // basic_reg_exp : L_ANCHOR? RE_expression R_ANCHOR?
  293. if (match(TokenType::Circumflex)) {
  294. consume();
  295. bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin);
  296. }
  297. if (!parse_re_expression(bytecode, match_length_minimum))
  298. return false;
  299. if (match(TokenType::Dollar)) {
  300. consume();
  301. bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd);
  302. }
  303. return !has_error();
  304. }
  305. bool PosixBasicParser::parse_re_expression(ByteCode& bytecode, size_t& match_length_minimum)
  306. {
  307. // RE_expression : RE_expression? simple_RE
  308. while (!done()) {
  309. if (!parse_simple_re(bytecode, match_length_minimum))
  310. break;
  311. }
  312. return !has_error();
  313. }
  314. bool PosixBasicParser::parse_simple_re(ByteCode& bytecode, size_t& match_length_minimum)
  315. {
  316. // simple_RE : nondupl_RE RE_dupl_symbol?
  317. ByteCode simple_re_bytecode;
  318. size_t re_match_length_minimum = 0;
  319. if (!parse_nonduplicating_re(simple_re_bytecode, re_match_length_minimum))
  320. return false;
  321. // RE_dupl_symbol : '*' | Back_open_brace DUP_COUNT (',' DUP_COUNT?)? Back_close_brace
  322. if (match(TokenType::Asterisk)) {
  323. consume();
  324. ByteCode::transform_bytecode_repetition_any(simple_re_bytecode, true);
  325. } else if (try_skip("\\{")) {
  326. auto read_number = [&]() -> Optional<size_t> {
  327. if (!match(TokenType::Char))
  328. return {};
  329. size_t value = 0;
  330. while (match(TokenType::Char)) {
  331. auto c = m_parser_state.current_token.value().substring_view(0, 1);
  332. auto c_value = c.to_uint();
  333. if (!c_value.has_value())
  334. break;
  335. value *= 10;
  336. value += *c_value;
  337. consume();
  338. }
  339. return value;
  340. };
  341. size_t min_limit;
  342. Optional<size_t> max_limit;
  343. if (auto limit = read_number(); !limit.has_value())
  344. return set_error(Error::InvalidRepetitionMarker);
  345. else
  346. min_limit = *limit;
  347. if (match(TokenType::Comma)) {
  348. consume();
  349. max_limit = read_number();
  350. }
  351. if (!try_skip("\\}"))
  352. return set_error(Error::MismatchingBrace);
  353. if (max_limit.value_or(min_limit) < min_limit)
  354. return set_error(Error::InvalidBraceContent);
  355. if (min_limit > s_maximum_repetition_count || (max_limit.has_value() && *max_limit > s_maximum_repetition_count))
  356. return set_error(Error::InvalidBraceContent);
  357. auto repetition_mark_id = m_parser_state.repetition_mark_count++;
  358. ByteCode::transform_bytecode_repetition_min_max(simple_re_bytecode, min_limit, max_limit, repetition_mark_id, true);
  359. match_length_minimum += re_match_length_minimum * min_limit;
  360. } else {
  361. match_length_minimum += re_match_length_minimum;
  362. }
  363. bytecode.extend(move(simple_re_bytecode));
  364. return true;
  365. }
  366. bool PosixBasicParser::parse_nonduplicating_re(ByteCode& bytecode, size_t& match_length_minimum)
  367. {
  368. // nondupl_RE : one_char_or_coll_elem_RE | Back_open_paren RE_expression Back_close_paren | BACKREF
  369. if (try_skip("\\(")) {
  370. ByteCode capture_bytecode;
  371. size_t capture_length_minimum = 0;
  372. auto capture_group_index = ++m_parser_state.capture_groups_count;
  373. if (!parse_re_expression(capture_bytecode, capture_length_minimum))
  374. return false;
  375. if (!try_skip("\\)"))
  376. return set_error(Error::MismatchingParen);
  377. match_length_minimum += capture_length_minimum;
  378. if (capture_group_index <= number_of_addressable_capture_groups) {
  379. m_capture_group_minimum_lengths[capture_group_index - 1] = capture_length_minimum;
  380. m_capture_group_seen[capture_group_index - 1] = true;
  381. bytecode.insert_bytecode_group_capture_left(capture_group_index);
  382. }
  383. bytecode.extend(capture_bytecode);
  384. if (capture_group_index <= number_of_addressable_capture_groups)
  385. bytecode.insert_bytecode_group_capture_right(capture_group_index);
  386. return true;
  387. }
  388. for (size_t i = 1; i < 10; ++i) {
  389. char backref_name[2] { '\\', '0' };
  390. backref_name[1] += i;
  391. if (try_skip({ backref_name, 2 })) {
  392. if (!m_capture_group_seen[i - 1])
  393. return set_error(Error::InvalidNumber);
  394. match_length_minimum += m_capture_group_minimum_lengths[i - 1];
  395. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)i } });
  396. return true;
  397. }
  398. }
  399. return parse_one_char_or_collation_element(bytecode, match_length_minimum);
  400. }
  401. bool PosixBasicParser::parse_one_char_or_collation_element(ByteCode& bytecode, size_t& match_length_minimum)
  402. {
  403. // one_char_or_coll_elem_RE : ORD_CHAR | QUOTED_CHAR | '.' | bracket_expression
  404. if (match(TokenType::Period)) {
  405. consume();
  406. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
  407. match_length_minimum += 1;
  408. return true;
  409. }
  410. // None of these are special in BRE.
  411. if (match(TokenType::Char) || match(TokenType::Questionmark) || match(TokenType::RightParen) || match(TokenType::HyphenMinus)
  412. || match(TokenType::Circumflex) || match(TokenType::RightCurly) || match(TokenType::Comma) || match(TokenType::Colon)
  413. || match(TokenType::Dollar) || match(TokenType::EqualSign) || match(TokenType::LeftCurly) || match(TokenType::LeftParen)
  414. || match(TokenType::Pipe) || match(TokenType::Slash) || match(TokenType::RightBracket) || match(TokenType::RightParen)) {
  415. auto ch = consume().value()[0];
  416. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
  417. match_length_minimum += 1;
  418. return true;
  419. }
  420. if (match(TokenType::EscapeSequence)) {
  421. if (m_parser_state.current_token.value().is_one_of("\\)"sv, "\\}"sv, "\\("sv, "\\{"sv))
  422. return false;
  423. auto ch = consume().value()[1];
  424. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
  425. match_length_minimum += 1;
  426. return true;
  427. }
  428. Vector<CompareTypeAndValuePair> values;
  429. size_t bracket_minimum_length = 0;
  430. if (match(TokenType::LeftBracket)) {
  431. consume();
  432. if (!AbstractPosixParser::parse_bracket_expression(values, bracket_minimum_length))
  433. return false;
  434. consume(TokenType::RightBracket, Error::MismatchingBracket);
  435. if (!has_error())
  436. bytecode.insert_bytecode_compare_values(move(values));
  437. match_length_minimum += bracket_minimum_length;
  438. return !has_error();
  439. }
  440. return set_error(Error::InvalidPattern);
  441. }
  442. // =============================
  443. // PosixExtended Parser
  444. // =============================
  445. bool PosixExtendedParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
  446. {
  447. return parse_root(stack, match_length_minimum);
  448. }
  449. ALWAYS_INLINE bool PosixExtendedParser::match_repetition_symbol()
  450. {
  451. auto type = m_parser_state.current_token.type();
  452. return (type == TokenType::Asterisk
  453. || type == TokenType::Plus
  454. || type == TokenType::Questionmark
  455. || type == TokenType::LeftCurly);
  456. }
  457. ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& bytecode_to_repeat, size_t& match_length_minimum)
  458. {
  459. if (match(TokenType::LeftCurly)) {
  460. consume();
  461. StringBuilder number_builder;
  462. while (match(TokenType::Char)) {
  463. number_builder.append(consume().value());
  464. }
  465. auto maybe_minimum = number_builder.build().to_uint();
  466. if (!maybe_minimum.has_value())
  467. return set_error(Error::InvalidBraceContent);
  468. auto minimum = maybe_minimum.value();
  469. match_length_minimum *= minimum;
  470. if (minimum > s_maximum_repetition_count)
  471. return set_error(Error::InvalidBraceContent);
  472. if (match(TokenType::Comma)) {
  473. consume();
  474. } else {
  475. auto repetition_mark_id = m_parser_state.repetition_mark_count++;
  476. ByteCode bytecode;
  477. bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, repetition_mark_id);
  478. bytecode_to_repeat = move(bytecode);
  479. consume(TokenType::RightCurly, Error::MismatchingBrace);
  480. return !has_error();
  481. }
  482. Optional<u32> maybe_maximum {};
  483. number_builder.clear();
  484. while (match(TokenType::Char)) {
  485. number_builder.append(consume().value());
  486. }
  487. if (!number_builder.is_empty()) {
  488. auto value = number_builder.build().to_uint();
  489. if (!value.has_value() || minimum > value.value() || *value > s_maximum_repetition_count)
  490. return set_error(Error::InvalidBraceContent);
  491. maybe_maximum = value.value();
  492. }
  493. auto repetition_mark_id = m_parser_state.repetition_mark_count++;
  494. ByteCode::transform_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum, repetition_mark_id);
  495. consume(TokenType::RightCurly, Error::MismatchingBrace);
  496. return !has_error();
  497. } else if (match(TokenType::Plus)) {
  498. consume();
  499. bool nongreedy = match(TokenType::Questionmark);
  500. if (nongreedy)
  501. consume();
  502. // Note: don't touch match_length_minimum, it's already correct
  503. ByteCode::transform_bytecode_repetition_min_one(bytecode_to_repeat, !nongreedy);
  504. return !has_error();
  505. } else if (match(TokenType::Asterisk)) {
  506. consume();
  507. match_length_minimum = 0;
  508. bool nongreedy = match(TokenType::Questionmark);
  509. if (nongreedy)
  510. consume();
  511. ByteCode::transform_bytecode_repetition_any(bytecode_to_repeat, !nongreedy);
  512. return !has_error();
  513. } else if (match(TokenType::Questionmark)) {
  514. consume();
  515. match_length_minimum = 0;
  516. bool nongreedy = match(TokenType::Questionmark);
  517. if (nongreedy)
  518. consume();
  519. ByteCode::transform_bytecode_repetition_zero_or_one(bytecode_to_repeat, !nongreedy);
  520. return !has_error();
  521. }
  522. return false;
  523. }
  524. ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack, size_t& match_length_minimum)
  525. {
  526. Vector<CompareTypeAndValuePair> values;
  527. if (!AbstractPosixParser::parse_bracket_expression(values, match_length_minimum))
  528. return false;
  529. if (!has_error())
  530. stack.insert_bytecode_compare_values(move(values));
  531. return !has_error();
  532. }
  533. ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, size_t& match_length_minimum)
  534. {
  535. ByteCode bytecode;
  536. size_t length = 0;
  537. bool should_parse_repetition_symbol { false };
  538. for (;;) {
  539. if (match_ordinary_characters()) {
  540. Token start_token = m_parser_state.current_token;
  541. Token last_token = m_parser_state.current_token;
  542. for (;;) {
  543. if (!match_ordinary_characters())
  544. break;
  545. ++length;
  546. last_token = consume();
  547. }
  548. if (length > 1) {
  549. // last character is inserted into 'bytecode' for duplication symbol handling
  550. auto new_length = length - ((match_repetition_symbol() && length > 1) ? 1 : 0);
  551. stack.insert_bytecode_compare_string({ start_token.value().characters_without_null_termination(), new_length });
  552. }
  553. if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol
  554. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)last_token.value().characters_without_null_termination()[0] } });
  555. should_parse_repetition_symbol = true;
  556. break;
  557. }
  558. if (match_repetition_symbol())
  559. return set_error(Error::InvalidRepetitionMarker);
  560. if (match(TokenType::Period)) {
  561. length = 1;
  562. consume();
  563. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
  564. should_parse_repetition_symbol = true;
  565. break;
  566. }
  567. if (match(TokenType::EscapeSequence)) {
  568. length = 1;
  569. Token t = consume();
  570. dbgln_if(REGEX_DEBUG, "[PARSER] EscapeSequence with substring {}", t.value());
  571. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u32)t.value().characters_without_null_termination()[1] } });
  572. should_parse_repetition_symbol = true;
  573. break;
  574. }
  575. if (match(TokenType::LeftBracket)) {
  576. consume();
  577. ByteCode sub_ops;
  578. if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size())
  579. return set_error(Error::InvalidBracketContent);
  580. bytecode.extend(move(sub_ops));
  581. consume(TokenType::RightBracket, Error::MismatchingBracket);
  582. should_parse_repetition_symbol = true;
  583. break;
  584. }
  585. if (match(TokenType::RightBracket)) {
  586. return set_error(Error::MismatchingBracket);
  587. }
  588. if (match(TokenType::RightCurly)) {
  589. return set_error(Error::MismatchingBrace);
  590. }
  591. if (match(TokenType::Circumflex)) {
  592. consume();
  593. bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin);
  594. break;
  595. }
  596. if (match(TokenType::Dollar)) {
  597. consume();
  598. bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd);
  599. break;
  600. }
  601. if (match(TokenType::RightParen))
  602. return false;
  603. if (match(TokenType::LeftParen)) {
  604. enum GroupMode {
  605. Normal,
  606. Lookahead,
  607. NegativeLookahead,
  608. Lookbehind,
  609. NegativeLookbehind,
  610. } group_mode { Normal };
  611. consume();
  612. Optional<StringView> capture_group_name;
  613. bool prevent_capture_group = false;
  614. if (match(TokenType::Questionmark)) {
  615. consume();
  616. if (match(TokenType::Colon)) {
  617. consume();
  618. prevent_capture_group = true;
  619. } else if (consume("<")) { // named capturing group
  620. Token start_token = m_parser_state.current_token;
  621. Token last_token = m_parser_state.current_token;
  622. size_t capture_group_name_length = 0;
  623. for (;;) {
  624. if (!match_ordinary_characters())
  625. return set_error(Error::InvalidNameForCaptureGroup);
  626. if (match(TokenType::Char) && m_parser_state.current_token.value()[0] == '>') {
  627. consume();
  628. break;
  629. }
  630. ++capture_group_name_length;
  631. last_token = consume();
  632. }
  633. capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length);
  634. } else if (match(TokenType::EqualSign)) { // positive lookahead
  635. consume();
  636. group_mode = Lookahead;
  637. } else if (consume("!")) { // negative lookahead
  638. group_mode = NegativeLookahead;
  639. } else if (consume("<")) {
  640. if (match(TokenType::EqualSign)) { // positive lookbehind
  641. consume();
  642. group_mode = Lookbehind;
  643. }
  644. if (consume("!")) // negative lookbehind
  645. group_mode = NegativeLookbehind;
  646. } else {
  647. return set_error(Error::InvalidRepetitionMarker);
  648. }
  649. }
  650. if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group))
  651. bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count);
  652. ByteCode capture_group_bytecode;
  653. if (!parse_root(capture_group_bytecode, length))
  654. return set_error(Error::InvalidPattern);
  655. switch (group_mode) {
  656. case Normal:
  657. bytecode.extend(move(capture_group_bytecode));
  658. break;
  659. case Lookahead:
  660. bytecode.insert_bytecode_lookaround(move(capture_group_bytecode), ByteCode::LookAroundType::LookAhead, length);
  661. break;
  662. case NegativeLookahead:
  663. bytecode.insert_bytecode_lookaround(move(capture_group_bytecode), ByteCode::LookAroundType::NegatedLookAhead, length);
  664. break;
  665. case Lookbehind:
  666. bytecode.insert_bytecode_lookaround(move(capture_group_bytecode), ByteCode::LookAroundType::LookBehind, length);
  667. break;
  668. case NegativeLookbehind:
  669. bytecode.insert_bytecode_lookaround(move(capture_group_bytecode), ByteCode::LookAroundType::NegatedLookBehind, length);
  670. break;
  671. }
  672. consume(TokenType::RightParen, Error::MismatchingParen);
  673. if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
  674. if (capture_group_name.has_value()) {
  675. bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count, capture_group_name.value());
  676. ++m_parser_state.named_capture_groups_count;
  677. } else {
  678. bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count);
  679. }
  680. ++m_parser_state.capture_groups_count;
  681. }
  682. should_parse_repetition_symbol = true;
  683. break;
  684. }
  685. return false;
  686. }
  687. if (match_repetition_symbol()) {
  688. if (should_parse_repetition_symbol)
  689. parse_repetition_symbol(bytecode, length);
  690. else
  691. return set_error(Error::InvalidRepetitionMarker);
  692. }
  693. stack.extend(move(bytecode));
  694. match_length_minimum += length;
  695. return true;
  696. }
  697. bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minimum)
  698. {
  699. ByteCode bytecode_left;
  700. size_t match_length_minimum_left { 0 };
  701. if (match_repetition_symbol())
  702. return set_error(Error::InvalidRepetitionMarker);
  703. for (;;) {
  704. if (!parse_sub_expression(bytecode_left, match_length_minimum_left))
  705. break;
  706. if (match(TokenType::Pipe)) {
  707. consume();
  708. ByteCode bytecode_right;
  709. size_t match_length_minimum_right { 0 };
  710. if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty())
  711. return set_error(Error::InvalidPattern);
  712. ByteCode new_bytecode;
  713. new_bytecode.insert_bytecode_alternation(move(bytecode_left), move(bytecode_right));
  714. bytecode_left = move(new_bytecode);
  715. match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left);
  716. }
  717. }
  718. if (bytecode_left.is_empty())
  719. set_error(Error::EmptySubExpression);
  720. stack.extend(move(bytecode_left));
  721. match_length_minimum = match_length_minimum_left;
  722. return !has_error();
  723. }
  724. // =============================
  725. // ECMA262 Parser
  726. // =============================
  727. bool ECMA262Parser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
  728. {
  729. if (m_parser_state.regex_options.has_flag_set(AllFlags::Unicode)) {
  730. return parse_pattern(stack, match_length_minimum, true, true);
  731. } else {
  732. ByteCode new_stack;
  733. size_t new_match_length = 0;
  734. auto res = parse_pattern(new_stack, new_match_length, false, false);
  735. if (m_parser_state.named_capture_groups_count > 0) {
  736. reset();
  737. return parse_pattern(stack, match_length_minimum, false, true);
  738. }
  739. if (!res)
  740. return false;
  741. stack.extend(new_stack);
  742. match_length_minimum = new_match_length;
  743. return res;
  744. }
  745. }
  746. bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  747. {
  748. return parse_disjunction(stack, match_length_minimum, unicode, named);
  749. }
  750. bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  751. {
  752. ByteCode left_alternative_stack;
  753. size_t left_alternative_min_length = 0;
  754. auto alt_ok = parse_alternative(left_alternative_stack, left_alternative_min_length, unicode, named);
  755. if (!alt_ok)
  756. return false;
  757. if (!match(TokenType::Pipe)) {
  758. stack.extend(left_alternative_stack);
  759. match_length_minimum = left_alternative_min_length;
  760. return alt_ok;
  761. }
  762. consume();
  763. ByteCode right_alternative_stack;
  764. size_t right_alternative_min_length = 0;
  765. auto continuation_ok = parse_disjunction(right_alternative_stack, right_alternative_min_length, unicode, named);
  766. if (!continuation_ok)
  767. return false;
  768. stack.insert_bytecode_alternation(move(left_alternative_stack), move(right_alternative_stack));
  769. match_length_minimum = min(left_alternative_min_length, right_alternative_min_length);
  770. return continuation_ok;
  771. }
  772. bool ECMA262Parser::parse_alternative(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  773. {
  774. for (;;) {
  775. if (match(TokenType::Eof))
  776. return true;
  777. if (parse_term(stack, match_length_minimum, unicode, named))
  778. continue;
  779. return !has_error();
  780. }
  781. }
  782. bool ECMA262Parser::parse_term(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  783. {
  784. if (parse_assertion(stack, match_length_minimum, unicode, named))
  785. return true;
  786. ByteCode atom_stack;
  787. size_t minimum_atom_length = 0;
  788. auto parse_with_quantifier = [&] {
  789. bool did_parse_one = false;
  790. if (m_should_use_browser_extended_grammar)
  791. did_parse_one = parse_extended_atom(atom_stack, minimum_atom_length, named);
  792. if (!did_parse_one)
  793. did_parse_one = parse_atom(atom_stack, minimum_atom_length, unicode, named);
  794. if (!did_parse_one)
  795. return false;
  796. VERIFY(did_parse_one);
  797. if (!parse_quantifier(atom_stack, minimum_atom_length, unicode, named))
  798. return false;
  799. return true;
  800. };
  801. if (!parse_with_quantifier())
  802. return false;
  803. stack.extend(move(atom_stack));
  804. match_length_minimum += minimum_atom_length;
  805. return true;
  806. }
  807. bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& match_length_minimum, bool unicode, bool named)
  808. {
  809. if (match(TokenType::Circumflex)) {
  810. consume();
  811. stack.empend((ByteCodeValueType)OpCodeId::CheckBegin);
  812. return true;
  813. }
  814. if (match(TokenType::Dollar)) {
  815. consume();
  816. stack.empend((ByteCodeValueType)OpCodeId::CheckEnd);
  817. return true;
  818. }
  819. if (try_skip("\\b")) {
  820. stack.insert_bytecode_check_boundary(BoundaryCheckType::Word);
  821. return true;
  822. }
  823. if (try_skip("\\B")) {
  824. stack.insert_bytecode_check_boundary(BoundaryCheckType::NonWord);
  825. return true;
  826. }
  827. if (match(TokenType::LeftParen)) {
  828. if (!try_skip("(?"))
  829. return false;
  830. if (done()) {
  831. set_error(Error::InvalidCaptureGroup);
  832. return false;
  833. }
  834. ByteCode assertion_stack;
  835. size_t length_dummy = 0;
  836. bool should_parse_forward_assertion = m_should_use_browser_extended_grammar ? unicode : true;
  837. if (should_parse_forward_assertion && try_skip("=")) {
  838. if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
  839. return false;
  840. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead);
  841. return true;
  842. }
  843. if (should_parse_forward_assertion && try_skip("!")) {
  844. if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
  845. return false;
  846. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
  847. return true;
  848. }
  849. if (m_should_use_browser_extended_grammar) {
  850. if (!unicode) {
  851. if (parse_quantifiable_assertion(assertion_stack, match_length_minimum, named)) {
  852. stack.extend(move(assertion_stack));
  853. return true;
  854. }
  855. }
  856. }
  857. if (try_skip("<=")) {
  858. if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
  859. return false;
  860. // FIXME: Somehow ensure that this assertion regexp has a fixed length.
  861. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookBehind, length_dummy);
  862. return true;
  863. }
  864. if (try_skip("<!")) {
  865. if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
  866. return false;
  867. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookBehind, length_dummy);
  868. return true;
  869. }
  870. // If none of these matched, put the '(?' back.
  871. m_parser_state.lexer.back(3);
  872. m_parser_state.current_token = m_parser_state.lexer.next();
  873. return false;
  874. }
  875. return false;
  876. }
  877. bool ECMA262Parser::parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named)
  878. {
  879. auto disjunction_ok = parse_disjunction(bytecode_stack, length, unicode, named);
  880. if (!disjunction_ok)
  881. return false;
  882. consume(TokenType::RightParen, Error::MismatchingParen);
  883. return true;
  884. }
  885. bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, bool named)
  886. {
  887. VERIFY(m_should_use_browser_extended_grammar);
  888. ByteCode assertion_stack;
  889. size_t match_length_minimum = 0;
  890. if (try_skip("=")) {
  891. if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named))
  892. return false;
  893. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead);
  894. return true;
  895. }
  896. if (try_skip("!")) {
  897. if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named))
  898. return false;
  899. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
  900. return true;
  901. }
  902. return false;
  903. }
  904. StringView ECMA262Parser::read_digits_as_string(ReadDigitsInitialZeroState initial_zero, bool hex, int max_count, int min_count)
  905. {
  906. if (!match(TokenType::Char))
  907. return {};
  908. if (initial_zero == ReadDigitsInitialZeroState::Disallow && m_parser_state.current_token.value() == "0")
  909. return {};
  910. int count = 0;
  911. size_t offset = 0;
  912. auto start_token = m_parser_state.current_token;
  913. while (match(TokenType::Char)) {
  914. auto& c = m_parser_state.current_token.value();
  915. if (max_count > 0 && count >= max_count)
  916. break;
  917. if (hex && !AK::StringUtils::convert_to_uint_from_hex(c).has_value())
  918. break;
  919. if (!hex && !c.to_uint().has_value())
  920. break;
  921. offset += consume().value().length();
  922. ++count;
  923. }
  924. if (count < min_count)
  925. return {};
  926. return StringView { start_token.value().characters_without_null_termination(), offset };
  927. }
  928. Optional<unsigned> ECMA262Parser::read_digits(ECMA262Parser::ReadDigitsInitialZeroState initial_zero, bool hex, int max_count, int min_count)
  929. {
  930. auto str = read_digits_as_string(initial_zero, hex, max_count, min_count);
  931. if (str.is_empty())
  932. return {};
  933. if (hex)
  934. return AK::StringUtils::convert_to_uint_from_hex(str);
  935. return str.to_uint();
  936. }
  937. bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool)
  938. {
  939. enum class Repetition {
  940. OneOrMore,
  941. ZeroOrMore,
  942. Optional,
  943. Explicit,
  944. None,
  945. } repetition_mark { Repetition::None };
  946. bool ungreedy = false;
  947. Optional<u64> repeat_min, repeat_max;
  948. if (match(TokenType::Asterisk)) {
  949. consume();
  950. repetition_mark = Repetition::ZeroOrMore;
  951. } else if (match(TokenType::Plus)) {
  952. consume();
  953. repetition_mark = Repetition::OneOrMore;
  954. } else if (match(TokenType::Questionmark)) {
  955. consume();
  956. repetition_mark = Repetition::Optional;
  957. } else if (match(TokenType::LeftCurly)) {
  958. repetition_mark = Repetition::Explicit;
  959. if (!parse_interval_quantifier(repeat_min, repeat_max)) {
  960. if (unicode) {
  961. // Invalid interval quantifiers are disallowed in Unicode mod - they must be esacped with '\{'.
  962. set_error(Error::InvalidPattern);
  963. }
  964. return !has_error();
  965. }
  966. } else {
  967. return true;
  968. }
  969. if (match(TokenType::Questionmark)) {
  970. consume();
  971. ungreedy = true;
  972. }
  973. switch (repetition_mark) {
  974. case Repetition::OneOrMore:
  975. ByteCode::transform_bytecode_repetition_min_one(stack, !ungreedy);
  976. break;
  977. case Repetition::ZeroOrMore:
  978. ByteCode::transform_bytecode_repetition_any(stack, !ungreedy);
  979. match_length_minimum = 0;
  980. break;
  981. case Repetition::Optional:
  982. ByteCode::transform_bytecode_repetition_zero_or_one(stack, !ungreedy);
  983. match_length_minimum = 0;
  984. break;
  985. case Repetition::Explicit: {
  986. auto repetition_mark_id = m_parser_state.repetition_mark_count++;
  987. ByteCode::transform_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, repetition_mark_id, !ungreedy);
  988. match_length_minimum *= repeat_min.value();
  989. break;
  990. }
  991. case Repetition::None:
  992. VERIFY_NOT_REACHED();
  993. }
  994. return true;
  995. }
  996. bool ECMA262Parser::parse_interval_quantifier(Optional<u64>& repeat_min, Optional<u64>& repeat_max)
  997. {
  998. VERIFY(match(TokenType::LeftCurly));
  999. consume();
  1000. auto chars_consumed = 1;
  1001. auto low_bound_string = read_digits_as_string();
  1002. chars_consumed += low_bound_string.length();
  1003. auto low_bound = low_bound_string.to_uint<u64>();
  1004. if (!low_bound.has_value()) {
  1005. if (!m_should_use_browser_extended_grammar && done())
  1006. return set_error(Error::MismatchingBrace);
  1007. back(chars_consumed + !done());
  1008. return false;
  1009. }
  1010. repeat_min = low_bound.value();
  1011. if (match(TokenType::Comma)) {
  1012. consume();
  1013. ++chars_consumed;
  1014. auto high_bound_string = read_digits_as_string();
  1015. auto high_bound = high_bound_string.to_uint<u64>();
  1016. if (high_bound.has_value()) {
  1017. repeat_max = high_bound.value();
  1018. chars_consumed += high_bound_string.length();
  1019. }
  1020. } else {
  1021. repeat_max = repeat_min;
  1022. }
  1023. if (!match(TokenType::RightCurly)) {
  1024. if (!m_should_use_browser_extended_grammar && done())
  1025. return set_error(Error::MismatchingBrace);
  1026. back(chars_consumed + !done());
  1027. return false;
  1028. }
  1029. consume();
  1030. ++chars_consumed;
  1031. if (repeat_max.has_value()) {
  1032. if (repeat_min.value() > repeat_max.value())
  1033. set_error(Error::InvalidBraceContent);
  1034. }
  1035. if ((*repeat_min > s_ecma262_maximum_repetition_count) || (repeat_max.has_value() && (*repeat_max > s_ecma262_maximum_repetition_count)))
  1036. return set_error(Error::InvalidBraceContent);
  1037. return true;
  1038. }
  1039. bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  1040. {
  1041. if (match(TokenType::EscapeSequence)) {
  1042. // Also part of AtomEscape.
  1043. auto token = consume();
  1044. match_length_minimum += 1;
  1045. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[1] } });
  1046. return true;
  1047. }
  1048. if (try_skip("\\")) {
  1049. // AtomEscape.
  1050. return parse_atom_escape(stack, match_length_minimum, unicode, named);
  1051. }
  1052. if (match(TokenType::LeftBracket)) {
  1053. // Character class.
  1054. return parse_character_class(stack, match_length_minimum, unicode, named);
  1055. }
  1056. if (match(TokenType::LeftParen)) {
  1057. // Non-capturing group, or a capture group.
  1058. return parse_capture_group(stack, match_length_minimum, unicode, named);
  1059. }
  1060. if (match(TokenType::Period)) {
  1061. consume();
  1062. match_length_minimum += 1;
  1063. stack.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
  1064. return true;
  1065. }
  1066. if (match(TokenType::Circumflex) || match(TokenType::Dollar) || match(TokenType::RightParen)
  1067. || match(TokenType::Pipe) || match(TokenType::Plus) || match(TokenType::Asterisk)
  1068. || match(TokenType::Questionmark)) {
  1069. return false;
  1070. }
  1071. if (match(TokenType::RightBracket) || match(TokenType::RightCurly) || match(TokenType::LeftCurly)) {
  1072. if (unicode)
  1073. return set_error(Error::InvalidPattern);
  1074. if (m_should_use_browser_extended_grammar) {
  1075. auto token = consume();
  1076. match_length_minimum += 1;
  1077. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
  1078. return true;
  1079. } else {
  1080. return false;
  1081. }
  1082. }
  1083. if (match_ordinary_characters()) {
  1084. auto token = consume().value();
  1085. match_length_minimum += 1;
  1086. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token[0] } });
  1087. return true;
  1088. }
  1089. set_error(Error::InvalidPattern);
  1090. return false;
  1091. }
  1092. bool ECMA262Parser::parse_extended_atom(ByteCode&, size_t&, bool)
  1093. {
  1094. // Note: This includes only rules *not* present in parse_atom()
  1095. VERIFY(m_should_use_browser_extended_grammar);
  1096. if (parse_invalid_braced_quantifier())
  1097. return true; // FAIL FAIL FAIL
  1098. return false;
  1099. }
  1100. bool ECMA262Parser::parse_invalid_braced_quantifier()
  1101. {
  1102. if (!match(TokenType::LeftCurly))
  1103. return false;
  1104. consume();
  1105. size_t chars_consumed = 1;
  1106. auto low_bound = read_digits_as_string();
  1107. StringView high_bound;
  1108. if (low_bound.is_empty()) {
  1109. back(chars_consumed + !done());
  1110. return false;
  1111. }
  1112. chars_consumed += low_bound.length();
  1113. if (match(TokenType::Comma)) {
  1114. consume();
  1115. ++chars_consumed;
  1116. high_bound = read_digits_as_string();
  1117. chars_consumed += high_bound.length();
  1118. }
  1119. if (!match(TokenType::RightCurly)) {
  1120. back(chars_consumed + !done());
  1121. return false;
  1122. }
  1123. consume();
  1124. set_error(Error::InvalidPattern);
  1125. return true;
  1126. }
  1127. bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  1128. {
  1129. if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow); !escape_str.is_empty()) {
  1130. if (auto escape = escape_str.to_uint(); escape.has_value()) {
  1131. // See if this is a "back"-reference (we've already parsed the group it refers to)
  1132. auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
  1133. if (maybe_length.has_value()) {
  1134. match_length_minimum += maybe_length.value();
  1135. stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
  1136. return true;
  1137. }
  1138. // It's not a pattern seen before, so we have to see if it's a valid reference to a future group.
  1139. if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) {
  1140. // This refers to a future group, and it will _always_ be matching an empty string
  1141. // So just match nothing and move on.
  1142. return true;
  1143. }
  1144. if (!m_should_use_browser_extended_grammar) {
  1145. set_error(Error::InvalidNumber);
  1146. return false;
  1147. }
  1148. }
  1149. // If not, put the characters back.
  1150. back(escape_str.length());
  1151. }
  1152. // CharacterEscape > ControlEscape
  1153. if (try_skip("f")) {
  1154. match_length_minimum += 1;
  1155. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } });
  1156. return true;
  1157. }
  1158. if (try_skip("n")) {
  1159. match_length_minimum += 1;
  1160. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } });
  1161. return true;
  1162. }
  1163. if (try_skip("r")) {
  1164. match_length_minimum += 1;
  1165. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } });
  1166. return true;
  1167. }
  1168. if (try_skip("t")) {
  1169. match_length_minimum += 1;
  1170. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } });
  1171. return true;
  1172. }
  1173. if (try_skip("v")) {
  1174. match_length_minimum += 1;
  1175. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } });
  1176. return true;
  1177. }
  1178. // CharacterEscape > ControlLetter
  1179. if (try_skip("c")) {
  1180. for (auto c : s_alphabetic_characters) {
  1181. if (try_skip({ &c, 1 })) {
  1182. match_length_minimum += 1;
  1183. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } });
  1184. return true;
  1185. }
  1186. }
  1187. if (unicode) {
  1188. set_error(Error::InvalidPattern);
  1189. return false;
  1190. }
  1191. if (m_should_use_browser_extended_grammar) {
  1192. back(1 + !done());
  1193. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\\' } });
  1194. match_length_minimum += 1;
  1195. return true;
  1196. }
  1197. // Allow '\c' in non-unicode mode, just matches 'c'.
  1198. match_length_minimum += 1;
  1199. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } });
  1200. return true;
  1201. }
  1202. // '\0'
  1203. if (try_skip("0")) {
  1204. if (!lookahead_any(s_decimal_characters)) {
  1205. match_length_minimum += 1;
  1206. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
  1207. return true;
  1208. }
  1209. back();
  1210. }
  1211. // LegacyOctalEscapeSequence
  1212. if (m_should_use_browser_extended_grammar) {
  1213. if (!unicode) {
  1214. if (auto escape = parse_legacy_octal_escape(); escape.has_value()) {
  1215. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)escape.value() } });
  1216. match_length_minimum += 1;
  1217. return true;
  1218. }
  1219. }
  1220. }
  1221. // HexEscape
  1222. if (try_skip("x")) {
  1223. if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) {
  1224. match_length_minimum += 1;
  1225. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
  1226. return true;
  1227. } else if (!unicode) {
  1228. // '\x' is allowed in non-unicode mode, just matches 'x'.
  1229. match_length_minimum += 1;
  1230. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } });
  1231. return true;
  1232. } else {
  1233. set_error(Error::InvalidPattern);
  1234. return false;
  1235. }
  1236. }
  1237. if (try_skip("u")) {
  1238. if (match(TokenType::LeftCurly)) {
  1239. consume();
  1240. if (!unicode) {
  1241. // FIXME: In non-Unicode mode, this should be parsed as a repetition symbol (repeating the 'u').
  1242. TODO();
  1243. }
  1244. auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 6);
  1245. if (code_point.has_value() && is_unicode(*code_point) && match(TokenType::RightCurly)) {
  1246. consume();
  1247. match_length_minimum += 1;
  1248. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
  1249. return true;
  1250. }
  1251. set_error(Error::InvalidPattern);
  1252. return false;
  1253. }
  1254. if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4, 4); code_point.has_value()) {
  1255. // In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be
  1256. // rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit,
  1257. // but doesn't form a valid surrogate pair, insert bytecode for both code units individually.
  1258. Optional<u32> low_surrogate;
  1259. if (unicode && Utf16View::is_high_surrogate(*code_point) && try_skip("\\u")) {
  1260. low_surrogate = read_digits(ReadDigitsInitialZeroState::Allow, true, 4);
  1261. if (!low_surrogate.has_value()) {
  1262. set_error(Error::InvalidPattern);
  1263. return false;
  1264. }
  1265. if (Utf16View::is_low_surrogate(*low_surrogate)) {
  1266. *code_point = Utf16View::decode_surrogate_pair(*code_point, *low_surrogate);
  1267. low_surrogate.clear();
  1268. }
  1269. }
  1270. match_length_minimum += 1;
  1271. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
  1272. if (low_surrogate.has_value()) {
  1273. match_length_minimum += 1;
  1274. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)low_surrogate.value() } });
  1275. }
  1276. return true;
  1277. } else if (!unicode) {
  1278. // '\u' is allowed in non-unicode mode, just matches 'u'.
  1279. match_length_minimum += 1;
  1280. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'u' } });
  1281. return true;
  1282. } else {
  1283. set_error(Error::InvalidPattern);
  1284. return false;
  1285. }
  1286. }
  1287. // IdentityEscape
  1288. for (auto ch : identity_escape_characters(unicode, m_should_use_browser_extended_grammar)) {
  1289. if (try_skip({ &ch, 1 })) {
  1290. match_length_minimum += 1;
  1291. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
  1292. return true;
  1293. }
  1294. }
  1295. if (unicode) {
  1296. if (try_skip("/")) {
  1297. match_length_minimum += 1;
  1298. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'/' } });
  1299. return true;
  1300. }
  1301. }
  1302. if (named && try_skip("k")) {
  1303. auto name = read_capture_group_specifier(true);
  1304. if (name.is_empty()) {
  1305. set_error(Error::InvalidNameForCaptureGroup);
  1306. return false;
  1307. }
  1308. auto maybe_capture_group = m_parser_state.named_capture_groups.get(name);
  1309. if (!maybe_capture_group.has_value()) {
  1310. set_error(Error::InvalidNameForCaptureGroup);
  1311. return false;
  1312. }
  1313. match_length_minimum += maybe_capture_group->minimum_length;
  1314. stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)maybe_capture_group->group_index } });
  1315. return true;
  1316. }
  1317. if (unicode) {
  1318. PropertyEscape property { Empty {} };
  1319. bool negated = false;
  1320. if (parse_unicode_property_escape(property, negated)) {
  1321. Vector<CompareTypeAndValuePair> compares;
  1322. if (negated)
  1323. compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
  1324. property.visit(
  1325. [&](Unicode::Property property) {
  1326. compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property });
  1327. },
  1328. [&](Unicode::GeneralCategory general_category) {
  1329. compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category });
  1330. },
  1331. [&](Script script) {
  1332. if (script.is_extension)
  1333. compares.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)script.script });
  1334. else
  1335. compares.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)script.script });
  1336. },
  1337. [](Empty&) { VERIFY_NOT_REACHED(); });
  1338. stack.insert_bytecode_compare_values(move(compares));
  1339. match_length_minimum += 1;
  1340. return true;
  1341. }
  1342. }
  1343. if (done())
  1344. return set_error(Error::InvalidTrailingEscape);
  1345. bool negate = false;
  1346. auto ch = parse_character_class_escape(negate);
  1347. if (!ch.has_value()) {
  1348. if (!unicode) {
  1349. // Allow all SourceCharacter's as escapes here.
  1350. auto token = consume();
  1351. match_length_minimum += 1;
  1352. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
  1353. return true;
  1354. }
  1355. set_error(Error::InvalidCharacterClass);
  1356. return false;
  1357. }
  1358. Vector<CompareTypeAndValuePair> compares;
  1359. if (negate)
  1360. compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
  1361. compares.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)ch.value() });
  1362. match_length_minimum += 1;
  1363. stack.insert_bytecode_compare_values(move(compares));
  1364. return true;
  1365. }
  1366. Optional<u8> ECMA262Parser::parse_legacy_octal_escape()
  1367. {
  1368. constexpr auto all_octal_digits = "01234567";
  1369. auto read_octal_digit = [&](auto start, auto end, bool should_ensure_no_following_octal_digit) -> Optional<u8> {
  1370. for (char c = '0' + start; c <= '0' + end; ++c) {
  1371. if (try_skip({ &c, 1 })) {
  1372. if (!should_ensure_no_following_octal_digit || !lookahead_any(all_octal_digits))
  1373. return c - '0';
  1374. back(2);
  1375. return {};
  1376. }
  1377. }
  1378. return {};
  1379. };
  1380. // OctalDigit(1)
  1381. if (auto digit = read_octal_digit(0, 7, true); digit.has_value()) {
  1382. return digit.value();
  1383. }
  1384. // OctalDigit(2)
  1385. if (auto left_digit = read_octal_digit(0, 3, false); left_digit.has_value()) {
  1386. if (auto right_digit = read_octal_digit(0, 7, true); right_digit.has_value()) {
  1387. return left_digit.value() * 8 + right_digit.value();
  1388. }
  1389. back(2);
  1390. }
  1391. // OctalDigit(2)
  1392. if (auto left_digit = read_octal_digit(4, 7, false); left_digit.has_value()) {
  1393. if (auto right_digit = read_octal_digit(0, 7, false); right_digit.has_value()) {
  1394. return left_digit.value() * 8 + right_digit.value();
  1395. }
  1396. back(2);
  1397. }
  1398. // OctalDigit(3)
  1399. if (auto left_digit = read_octal_digit(0, 3, false); left_digit.has_value()) {
  1400. size_t chars_consumed = 1;
  1401. if (auto mid_digit = read_octal_digit(0, 7, false); mid_digit.has_value()) {
  1402. ++chars_consumed;
  1403. if (auto right_digit = read_octal_digit(0, 7, false); right_digit.has_value()) {
  1404. return left_digit.value() * 64 + mid_digit.value() * 8 + right_digit.value();
  1405. }
  1406. }
  1407. back(chars_consumed);
  1408. }
  1409. return {};
  1410. }
  1411. Optional<CharClass> ECMA262Parser::parse_character_class_escape(bool& negate, bool expect_backslash)
  1412. {
  1413. if (expect_backslash && !try_skip("\\"))
  1414. return {};
  1415. // CharacterClassEscape
  1416. CharClass ch_class;
  1417. if (try_skip("d")) {
  1418. ch_class = CharClass::Digit;
  1419. } else if (try_skip("D")) {
  1420. ch_class = CharClass::Digit;
  1421. negate = true;
  1422. } else if (try_skip("s")) {
  1423. ch_class = CharClass::Space;
  1424. } else if (try_skip("S")) {
  1425. ch_class = CharClass::Space;
  1426. negate = true;
  1427. } else if (try_skip("w")) {
  1428. ch_class = CharClass::Word;
  1429. } else if (try_skip("W")) {
  1430. ch_class = CharClass::Word;
  1431. negate = true;
  1432. } else {
  1433. return {};
  1434. }
  1435. return ch_class;
  1436. }
  1437. bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool)
  1438. {
  1439. consume(TokenType::LeftBracket, Error::InvalidPattern);
  1440. Vector<CompareTypeAndValuePair> compares;
  1441. if (match(TokenType::Circumflex)) {
  1442. // Negated charclass
  1443. consume();
  1444. compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
  1445. }
  1446. if (match(TokenType::RightBracket)) {
  1447. consume();
  1448. // Should only have at most an 'Inverse'
  1449. VERIFY(compares.size() <= 1);
  1450. stack.insert_bytecode_compare_values(move(compares));
  1451. return true;
  1452. }
  1453. if (!parse_nonempty_class_ranges(compares, unicode))
  1454. return false;
  1455. match_length_minimum += 1;
  1456. stack.insert_bytecode_compare_values(move(compares));
  1457. return true;
  1458. }
  1459. struct CharClassRangeElement {
  1460. union {
  1461. CharClass character_class;
  1462. u32 code_point { 0 };
  1463. Unicode::Property property;
  1464. Unicode::GeneralCategory general_category;
  1465. Unicode::Script script;
  1466. };
  1467. bool is_negated { false };
  1468. bool is_character_class { false };
  1469. bool is_property { false };
  1470. bool is_general_category { false };
  1471. bool is_script { false };
  1472. bool is_script_extension { false };
  1473. };
  1474. bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
  1475. {
  1476. auto read_class_atom_no_dash = [&]() -> Optional<CharClassRangeElement> {
  1477. if (match(TokenType::EscapeSequence)) {
  1478. auto token = consume().value();
  1479. return { CharClassRangeElement { .code_point = (u32)token[1], .is_character_class = false } };
  1480. }
  1481. if (try_skip("\\")) {
  1482. if (done()) {
  1483. set_error(Error::InvalidTrailingEscape);
  1484. return {};
  1485. }
  1486. if (try_skip("f"))
  1487. return { CharClassRangeElement { .code_point = '\f', .is_character_class = false } };
  1488. if (try_skip("n"))
  1489. return { CharClassRangeElement { .code_point = '\n', .is_character_class = false } };
  1490. if (try_skip("r"))
  1491. return { CharClassRangeElement { .code_point = '\r', .is_character_class = false } };
  1492. if (try_skip("t"))
  1493. return { CharClassRangeElement { .code_point = '\t', .is_character_class = false } };
  1494. if (try_skip("v"))
  1495. return { CharClassRangeElement { .code_point = '\v', .is_character_class = false } };
  1496. if (try_skip("b"))
  1497. return { CharClassRangeElement { .code_point = '\b', .is_character_class = false } };
  1498. if (try_skip("/"))
  1499. return { CharClassRangeElement { .code_point = '/', .is_character_class = false } };
  1500. // CharacterEscape > ControlLetter
  1501. if (try_skip("c")) {
  1502. for (auto c : s_alphabetic_characters) {
  1503. if (try_skip({ &c, 1 })) {
  1504. return { CharClassRangeElement { .code_point = (u32)(c % 32), .is_character_class = false } };
  1505. }
  1506. }
  1507. if (unicode) {
  1508. set_error(Error::InvalidPattern);
  1509. return {};
  1510. }
  1511. if (m_should_use_browser_extended_grammar) {
  1512. for (auto c = '0'; c <= '9'; ++c) {
  1513. if (try_skip({ &c, 1 }))
  1514. return { CharClassRangeElement { .code_point = (u32)(c % 32), .is_character_class = false } };
  1515. }
  1516. if (try_skip("_"))
  1517. return { CharClassRangeElement { .code_point = (u32)('_' % 32), .is_character_class = false } };
  1518. back(1 + !done());
  1519. return { CharClassRangeElement { .code_point = '\\', .is_character_class = false } };
  1520. }
  1521. }
  1522. // '\0'
  1523. if (try_skip("0")) {
  1524. if (!lookahead_any(s_decimal_characters))
  1525. return { CharClassRangeElement { .code_point = 0, .is_character_class = false } };
  1526. back();
  1527. }
  1528. // LegacyOctalEscapeSequence
  1529. if (m_should_use_browser_extended_grammar && !unicode) {
  1530. if (auto escape = parse_legacy_octal_escape(); escape.has_value())
  1531. return { CharClassRangeElement { .code_point = escape.value(), .is_character_class = false } };
  1532. }
  1533. // HexEscape
  1534. if (try_skip("x")) {
  1535. if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) {
  1536. return { CharClassRangeElement { .code_point = hex_escape.value(), .is_character_class = false } };
  1537. } else if (!unicode) {
  1538. // '\x' is allowed in non-unicode mode, just matches 'x'.
  1539. return { CharClassRangeElement { .code_point = 'x', .is_character_class = false } };
  1540. } else {
  1541. set_error(Error::InvalidPattern);
  1542. return {};
  1543. }
  1544. }
  1545. if (try_skip("u")) {
  1546. if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4, 4); code_point.has_value()) {
  1547. // FIXME: While code point ranges are supported, code point matches as "Char" are not!
  1548. return { CharClassRangeElement { .code_point = code_point.value(), .is_character_class = false } };
  1549. } else if (!unicode) {
  1550. // '\u' is allowed in non-unicode mode, just matches 'u'.
  1551. return { CharClassRangeElement { .code_point = 'u', .is_character_class = false } };
  1552. } else {
  1553. set_error(Error::InvalidPattern);
  1554. return {};
  1555. }
  1556. }
  1557. // IdentityEscape
  1558. for (auto ch : identity_escape_characters(unicode, m_should_use_browser_extended_grammar)) {
  1559. if (try_skip({ &ch, 1 }))
  1560. return { CharClassRangeElement { .code_point = (u32)ch, .is_character_class = false } };
  1561. }
  1562. if (unicode) {
  1563. if (try_skip("-"))
  1564. return { CharClassRangeElement { .code_point = '-', .is_character_class = false } };
  1565. PropertyEscape property { Empty {} };
  1566. bool negated = false;
  1567. if (parse_unicode_property_escape(property, negated)) {
  1568. return property.visit(
  1569. [&](Unicode::Property property) {
  1570. return CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property = true };
  1571. },
  1572. [&](Unicode::GeneralCategory general_category) {
  1573. return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true };
  1574. },
  1575. [&](Script script) {
  1576. if (script.is_extension)
  1577. return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script_extension = true };
  1578. else
  1579. return CharClassRangeElement { .script = script.script, .is_negated = negated, .is_character_class = true, .is_script = true };
  1580. },
  1581. [](Empty&) -> CharClassRangeElement { VERIFY_NOT_REACHED(); });
  1582. }
  1583. }
  1584. if (try_skip("d"))
  1585. return { CharClassRangeElement { .character_class = CharClass::Digit, .is_character_class = true } };
  1586. if (try_skip("s"))
  1587. return { CharClassRangeElement { .character_class = CharClass::Space, .is_character_class = true } };
  1588. if (try_skip("w"))
  1589. return { CharClassRangeElement { .character_class = CharClass::Word, .is_character_class = true } };
  1590. if (try_skip("D"))
  1591. return { CharClassRangeElement { .character_class = CharClass::Digit, .is_negated = true, .is_character_class = true } };
  1592. if (try_skip("S"))
  1593. return { CharClassRangeElement { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
  1594. if (try_skip("W"))
  1595. return { CharClassRangeElement { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
  1596. if (!unicode) {
  1597. // Any unrecognised escape is allowed in non-unicode mode.
  1598. return { CharClassRangeElement { .code_point = (u32)skip(), .is_character_class = false } };
  1599. }
  1600. set_error(Error::InvalidPattern);
  1601. return {};
  1602. }
  1603. if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
  1604. return {};
  1605. // Allow any (other) SourceCharacter.
  1606. return { CharClassRangeElement { .code_point = (u32)skip(), .is_character_class = false } };
  1607. };
  1608. auto read_class_atom = [&]() -> Optional<CharClassRangeElement> {
  1609. if (match(TokenType::HyphenMinus)) {
  1610. consume();
  1611. return { CharClassRangeElement { .code_point = '-', .is_character_class = false } };
  1612. }
  1613. return read_class_atom_no_dash();
  1614. };
  1615. auto empend_atom = [&](auto& atom) {
  1616. if (atom.is_character_class) {
  1617. if (atom.is_negated)
  1618. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 });
  1619. if (atom.is_property)
  1620. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) });
  1621. else if (atom.is_general_category)
  1622. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
  1623. else if (atom.is_script)
  1624. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Script, (ByteCodeValueType)(atom.script) });
  1625. else if (atom.is_script_extension)
  1626. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::ScriptExtension, (ByteCodeValueType)(atom.script) });
  1627. else
  1628. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
  1629. } else {
  1630. VERIFY(!atom.is_negated);
  1631. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, atom.code_point });
  1632. }
  1633. };
  1634. while (!match(TokenType::RightBracket)) {
  1635. if (match(TokenType::Eof)) {
  1636. set_error(Error::MismatchingBracket);
  1637. return false;
  1638. }
  1639. auto first_atom = read_class_atom();
  1640. if (!first_atom.has_value())
  1641. return false;
  1642. if (match(TokenType::HyphenMinus)) {
  1643. consume();
  1644. if (match(TokenType::RightBracket)) {
  1645. // Allow '-' as the last element in a charclass, even after an atom.
  1646. m_parser_state.lexer.back(2); // -]
  1647. m_parser_state.current_token = m_parser_state.lexer.next();
  1648. goto read_as_single_atom;
  1649. }
  1650. auto second_atom = read_class_atom();
  1651. if (!second_atom.has_value())
  1652. return false;
  1653. if (first_atom.value().is_character_class || second_atom.value().is_character_class) {
  1654. if (m_should_use_browser_extended_grammar) {
  1655. if (unicode) {
  1656. set_error(Error::InvalidRange);
  1657. return false;
  1658. }
  1659. // CharacterRangeOrUnion > !Unicode > CharClass
  1660. empend_atom(*first_atom);
  1661. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)'-' });
  1662. empend_atom(*second_atom);
  1663. continue;
  1664. } else {
  1665. set_error(Error::InvalidRange);
  1666. return false;
  1667. }
  1668. }
  1669. if (first_atom.value().code_point > second_atom.value().code_point) {
  1670. set_error(Error::InvalidRange);
  1671. return false;
  1672. }
  1673. VERIFY(!first_atom.value().is_negated);
  1674. VERIFY(!second_atom.value().is_negated);
  1675. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharRange, CharRange { first_atom.value().code_point, second_atom.value().code_point } });
  1676. continue;
  1677. }
  1678. read_as_single_atom:;
  1679. auto atom = first_atom.value();
  1680. empend_atom(atom);
  1681. }
  1682. consume(TokenType::RightBracket, Error::MismatchingBracket);
  1683. return true;
  1684. }
  1685. bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool& negated)
  1686. {
  1687. negated = false;
  1688. if (try_skip("p"))
  1689. negated = false;
  1690. else if (try_skip("P"))
  1691. negated = true;
  1692. else
  1693. return false;
  1694. auto parsed_property = read_unicode_property_escape();
  1695. if (!parsed_property.has_value()) {
  1696. set_error(Error::InvalidNameForProperty);
  1697. return false;
  1698. }
  1699. property = move(*parsed_property);
  1700. return property.visit(
  1701. [this](Unicode::Property property) {
  1702. if (!Unicode::is_ecma262_property(property)) {
  1703. set_error(Error::InvalidNameForProperty);
  1704. return false;
  1705. }
  1706. return true;
  1707. },
  1708. [](Unicode::GeneralCategory) { return true; },
  1709. [](Script) { return true; },
  1710. [](Empty&) -> bool { VERIFY_NOT_REACHED(); });
  1711. }
  1712. StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
  1713. {
  1714. if (take_starting_angle_bracket && !consume("<"))
  1715. return {};
  1716. auto start_token = m_parser_state.current_token;
  1717. size_t offset = 0;
  1718. while (match(TokenType::Char) || match(TokenType::Dollar)) {
  1719. auto c = m_parser_state.current_token.value();
  1720. if (c == ">")
  1721. break;
  1722. offset += consume().value().length();
  1723. }
  1724. StringView name { start_token.value().characters_without_null_termination(), offset };
  1725. if (!consume(">") || name.is_empty())
  1726. set_error(Error::InvalidNameForCaptureGroup);
  1727. return name;
  1728. }
  1729. Optional<ECMA262Parser::PropertyEscape> ECMA262Parser::read_unicode_property_escape()
  1730. {
  1731. consume(TokenType::LeftCurly, Error::InvalidPattern);
  1732. // Note: clang-format is disabled here because it doesn't handle templated lambdas yet.
  1733. // clang-format off
  1734. auto read_until = [&]<typename... Ts>(Ts&&... terminators) {
  1735. auto start_token = m_parser_state.current_token;
  1736. size_t offset = 0;
  1737. while (match(TokenType::Char)) {
  1738. if (m_parser_state.current_token.value().is_one_of(forward<Ts>(terminators)...))
  1739. break;
  1740. offset += consume().value().length();
  1741. }
  1742. return StringView { start_token.value().characters_without_null_termination(), offset };
  1743. };
  1744. // clang-format on
  1745. StringView property_type;
  1746. StringView property_name = read_until("="sv, "}"sv);
  1747. if (try_skip("="sv)) {
  1748. if (property_name.is_empty())
  1749. return {};
  1750. property_type = property_name;
  1751. property_name = read_until("}"sv);
  1752. }
  1753. consume(TokenType::RightCurly, Error::InvalidPattern);
  1754. if (property_type.is_empty()) {
  1755. if (auto property = Unicode::property_from_string(property_name); property.has_value())
  1756. return { *property };
  1757. if (auto general_category = Unicode::general_category_from_string(property_name); general_category.has_value())
  1758. return { *general_category };
  1759. } else if ((property_type == "General_Category"sv) || (property_type == "gc"sv)) {
  1760. if (auto general_category = Unicode::general_category_from_string(property_name); general_category.has_value())
  1761. return { *general_category };
  1762. } else if ((property_type == "Script"sv) || (property_type == "sc"sv)) {
  1763. if (auto script = Unicode::script_from_string(property_name); script.has_value())
  1764. return Script { *script, false };
  1765. } else if ((property_type == "Script_Extensions"sv) || (property_type == "scx"sv)) {
  1766. if (auto script = Unicode::script_from_string(property_name); script.has_value())
  1767. return Script { *script, true };
  1768. }
  1769. return {};
  1770. }
  1771. bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  1772. {
  1773. consume(TokenType::LeftParen, Error::InvalidPattern);
  1774. auto enter_capture_group_scope = [&] {
  1775. m_capture_groups_in_scope.empend();
  1776. };
  1777. auto exit_capture_group_scope = [&] {
  1778. auto last = m_capture_groups_in_scope.take_last();
  1779. m_capture_groups_in_scope.last().extend(move(last));
  1780. };
  1781. auto register_capture_group_in_current_scope = [&](auto identifier) {
  1782. m_capture_groups_in_scope.last().empend(identifier);
  1783. };
  1784. auto clear_all_capture_groups_in_scope = [&] {
  1785. for (auto& index : m_capture_groups_in_scope.last())
  1786. stack.insert_bytecode_clear_capture_group(index);
  1787. };
  1788. if (match(TokenType::Questionmark)) {
  1789. // Non-capturing group or group with specifier.
  1790. consume();
  1791. if (match(TokenType::Colon)) {
  1792. consume();
  1793. ByteCode noncapture_group_bytecode;
  1794. size_t length = 0;
  1795. enter_capture_group_scope();
  1796. if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
  1797. return set_error(Error::InvalidPattern);
  1798. clear_all_capture_groups_in_scope();
  1799. exit_capture_group_scope();
  1800. consume(TokenType::RightParen, Error::MismatchingParen);
  1801. stack.extend(move(noncapture_group_bytecode));
  1802. match_length_minimum += length;
  1803. return true;
  1804. }
  1805. if (consume("<")) {
  1806. ++m_parser_state.named_capture_groups_count;
  1807. auto group_index = ++m_parser_state.capture_groups_count; // Named capture groups count as normal capture groups too.
  1808. auto name = read_capture_group_specifier();
  1809. if (name.is_empty()) {
  1810. set_error(Error::InvalidNameForCaptureGroup);
  1811. return false;
  1812. }
  1813. ByteCode capture_group_bytecode;
  1814. size_t length = 0;
  1815. enter_capture_group_scope();
  1816. if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
  1817. return set_error(Error::InvalidPattern);
  1818. clear_all_capture_groups_in_scope();
  1819. exit_capture_group_scope();
  1820. register_capture_group_in_current_scope(group_index);
  1821. consume(TokenType::RightParen, Error::MismatchingParen);
  1822. stack.insert_bytecode_group_capture_left(group_index);
  1823. stack.extend(move(capture_group_bytecode));
  1824. stack.insert_bytecode_group_capture_right(group_index, name);
  1825. match_length_minimum += length;
  1826. m_parser_state.capture_group_minimum_lengths.set(group_index, length);
  1827. m_parser_state.named_capture_groups.set(name, { group_index, length });
  1828. return true;
  1829. }
  1830. set_error(Error::InvalidCaptureGroup);
  1831. return false;
  1832. }
  1833. auto group_index = ++m_parser_state.capture_groups_count;
  1834. enter_capture_group_scope();
  1835. ByteCode capture_group_bytecode;
  1836. size_t length = 0;
  1837. if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
  1838. return set_error(Error::InvalidPattern);
  1839. clear_all_capture_groups_in_scope();
  1840. exit_capture_group_scope();
  1841. register_capture_group_in_current_scope(group_index);
  1842. stack.insert_bytecode_group_capture_left(group_index);
  1843. stack.extend(move(capture_group_bytecode));
  1844. m_parser_state.capture_group_minimum_lengths.set(group_index, length);
  1845. consume(TokenType::RightParen, Error::MismatchingParen);
  1846. stack.insert_bytecode_group_capture_right(group_index);
  1847. match_length_minimum += length;
  1848. return true;
  1849. }
  1850. size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis()
  1851. {
  1852. if (m_total_number_of_capturing_parenthesis.has_value())
  1853. return m_total_number_of_capturing_parenthesis.value();
  1854. GenericLexer lexer { m_parser_state.lexer.source() };
  1855. size_t count = 0;
  1856. while (!lexer.is_eof()) {
  1857. switch (lexer.peek()) {
  1858. case '\\':
  1859. lexer.consume(2);
  1860. continue;
  1861. case '[':
  1862. while (!lexer.is_eof()) {
  1863. if (lexer.consume_specific('\\'))
  1864. lexer.consume();
  1865. else if (lexer.consume_specific(']'))
  1866. break;
  1867. lexer.consume();
  1868. }
  1869. break;
  1870. case '(':
  1871. if (lexer.consume_specific('?')) {
  1872. // non-capturing group '(?:', lookaround '(?<='/'(?<!', or named capture '(?<'
  1873. if (!lexer.consume_specific('<'))
  1874. break;
  1875. if (lexer.next_is(is_any_of("=!")))
  1876. break;
  1877. ++count;
  1878. } else {
  1879. ++count;
  1880. }
  1881. break;
  1882. }
  1883. lexer.consume();
  1884. }
  1885. m_total_number_of_capturing_parenthesis = count;
  1886. return count;
  1887. }
  1888. }