RegexParser.cpp 62 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. * Copyright (c) 2020-2021, the SerenityOS developers.
  4. * All rights reserved.
  5. *
  6. * Redistribution and use in source and binary forms, with or without
  7. * modification, are permitted provided that the following conditions are met:
  8. *
  9. * 1. Redistributions of source code must retain the above copyright notice, this
  10. * list of conditions and the following disclaimer.
  11. *
  12. * 2. Redistributions in binary form must reproduce the above copyright notice,
  13. * this list of conditions and the following disclaimer in the documentation
  14. * and/or other materials provided with the distribution.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  20. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. #include "RegexParser.h"
  28. #include "RegexDebug.h"
  29. #include <AK/String.h>
  30. #include <AK/StringBuilder.h>
  31. #include <AK/StringUtils.h>
  32. namespace regex {
  33. ALWAYS_INLINE bool Parser::set_error(Error error)
  34. {
  35. if (m_parser_state.error == Error::NoError) {
  36. m_parser_state.error = error;
  37. m_parser_state.error_token = m_parser_state.current_token;
  38. }
  39. return false; // always return false, that eases the API usage (return set_error(...)) :^)
  40. }
  41. ALWAYS_INLINE bool Parser::done() const
  42. {
  43. return match(TokenType::Eof);
  44. }
  45. ALWAYS_INLINE bool Parser::match(TokenType type) const
  46. {
  47. return m_parser_state.current_token.type() == type;
  48. }
  49. ALWAYS_INLINE bool Parser::match(char ch) const
  50. {
  51. return m_parser_state.current_token.type() == TokenType::Char && m_parser_state.current_token.value().length() == 1 && m_parser_state.current_token.value()[0] == ch;
  52. }
  53. ALWAYS_INLINE Token Parser::consume()
  54. {
  55. auto old_token = m_parser_state.current_token;
  56. m_parser_state.current_token = m_parser_state.lexer.next();
  57. return old_token;
  58. }
  59. ALWAYS_INLINE Token Parser::consume(TokenType type, Error error)
  60. {
  61. if (m_parser_state.current_token.type() != type) {
  62. set_error(error);
  63. dbgln("[PARSER] Error: Unexpected token {}. Expected: {}", m_parser_state.current_token.name(), Token::name(type));
  64. }
  65. return consume();
  66. }
  67. ALWAYS_INLINE bool Parser::consume(const String& str)
  68. {
  69. size_t potentially_go_back { 1 };
  70. for (auto ch : str) {
  71. if (match(TokenType::Char)) {
  72. if (m_parser_state.current_token.value()[0] != ch) {
  73. m_parser_state.lexer.back(potentially_go_back);
  74. m_parser_state.current_token = m_parser_state.lexer.next();
  75. return false;
  76. }
  77. } else {
  78. m_parser_state.lexer.back(potentially_go_back);
  79. m_parser_state.current_token = m_parser_state.lexer.next();
  80. return false;
  81. }
  82. consume(TokenType::Char, Error::NoError);
  83. ++potentially_go_back;
  84. }
  85. return true;
  86. }
  87. ALWAYS_INLINE bool Parser::try_skip(StringView str)
  88. {
  89. if (str.starts_with(m_parser_state.current_token.value()))
  90. str = str.substring_view(m_parser_state.current_token.value().length(), str.length() - m_parser_state.current_token.value().length());
  91. else
  92. return false;
  93. size_t potentially_go_back { 0 };
  94. for (auto ch : str) {
  95. if (!m_parser_state.lexer.try_skip(ch)) {
  96. m_parser_state.lexer.back(potentially_go_back);
  97. return false;
  98. }
  99. ++potentially_go_back;
  100. }
  101. m_parser_state.current_token = m_parser_state.lexer.next();
  102. return true;
  103. }
  104. ALWAYS_INLINE bool Parser::lookahead_any(StringView str)
  105. {
  106. for (auto ch : str) {
  107. if (match(ch))
  108. return true;
  109. }
  110. return false;
  111. }
  112. ALWAYS_INLINE char Parser::skip()
  113. {
  114. char ch;
  115. if (m_parser_state.current_token.value().length() == 1) {
  116. ch = m_parser_state.current_token.value()[0];
  117. } else {
  118. m_parser_state.lexer.back(m_parser_state.current_token.value().length());
  119. ch = m_parser_state.lexer.skip();
  120. }
  121. m_parser_state.current_token = m_parser_state.lexer.next();
  122. return ch;
  123. }
  124. ALWAYS_INLINE void Parser::back(size_t count)
  125. {
  126. m_parser_state.lexer.back(count);
  127. m_parser_state.current_token = m_parser_state.lexer.next();
  128. }
  129. ALWAYS_INLINE void Parser::reset()
  130. {
  131. m_parser_state.bytecode.clear();
  132. m_parser_state.lexer.reset();
  133. m_parser_state.current_token = m_parser_state.lexer.next();
  134. m_parser_state.error = Error::NoError;
  135. m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) };
  136. m_parser_state.capture_group_minimum_lengths.clear();
  137. m_parser_state.capture_groups_count = 0;
  138. m_parser_state.named_capture_groups_count = 0;
  139. m_parser_state.named_capture_group_minimum_lengths.clear();
  140. m_parser_state.named_capture_groups.clear();
  141. }
  142. Parser::Result Parser::parse(Optional<AllOptions> regex_options)
  143. {
  144. reset();
  145. if (regex_options.has_value())
  146. m_parser_state.regex_options = regex_options.value();
  147. if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum))
  148. consume(TokenType::Eof, Error::InvalidPattern);
  149. else
  150. set_error(Error::InvalidPattern);
  151. #if REGEX_DEBUG
  152. fprintf(stderr, "[PARSER] Produced bytecode with %lu entries (opcodes + arguments)\n", m_parser_state.bytecode.size());
  153. #endif
  154. return {
  155. move(m_parser_state.bytecode),
  156. move(m_parser_state.capture_groups_count),
  157. move(m_parser_state.named_capture_groups_count),
  158. move(m_parser_state.match_length_minimum),
  159. move(m_parser_state.error),
  160. move(m_parser_state.error_token)
  161. };
  162. }
  163. ALWAYS_INLINE bool Parser::match_ordinary_characters()
  164. {
  165. // NOTE: This method must not be called during bracket and repetition parsing!
  166. // FIXME: Add assertion for that?
  167. auto type = m_parser_state.current_token.type();
  168. return (type == TokenType::Char
  169. || type == TokenType::Comma
  170. || type == TokenType::Slash
  171. || type == TokenType::EqualSign
  172. || type == TokenType::HyphenMinus
  173. || type == TokenType::Colon);
  174. }
  175. // =============================
  176. // PosixExtended Parser
  177. // =============================
  178. bool PosixExtendedParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
  179. {
  180. return parse_root(stack, match_length_minimum);
  181. }
  182. ALWAYS_INLINE bool PosixExtendedParser::match_repetition_symbol()
  183. {
  184. auto type = m_parser_state.current_token.type();
  185. return (type == TokenType::Asterisk
  186. || type == TokenType::Plus
  187. || type == TokenType::Questionmark
  188. || type == TokenType::LeftCurly);
  189. }
  190. ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& bytecode_to_repeat, size_t& match_length_minimum)
  191. {
  192. if (match(TokenType::LeftCurly)) {
  193. consume();
  194. StringBuilder number_builder;
  195. while (match(TokenType::Char)) {
  196. number_builder.append(consume().value());
  197. }
  198. auto maybe_minimum = number_builder.build().to_uint();
  199. if (!maybe_minimum.has_value())
  200. return set_error(Error::InvalidBraceContent);
  201. auto minimum = maybe_minimum.value();
  202. match_length_minimum *= minimum;
  203. if (match(TokenType::Comma)) {
  204. consume();
  205. } else {
  206. ByteCode bytecode;
  207. bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
  208. bytecode_to_repeat = move(bytecode);
  209. consume(TokenType::RightCurly, Error::MismatchingBrace);
  210. return !has_error();
  211. }
  212. Optional<size_t> maybe_maximum {};
  213. number_builder.clear();
  214. while (match(TokenType::Char)) {
  215. number_builder.append(consume().value());
  216. }
  217. if (!number_builder.is_empty()) {
  218. auto value = number_builder.build().to_uint();
  219. if (!value.has_value() || minimum > value.value())
  220. return set_error(Error::InvalidBraceContent);
  221. maybe_maximum = value.value();
  222. }
  223. bytecode_to_repeat.insert_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum);
  224. consume(TokenType::RightCurly, Error::MismatchingBrace);
  225. return !has_error();
  226. } else if (match(TokenType::Plus)) {
  227. consume();
  228. bool nongreedy = match(TokenType::Questionmark);
  229. if (nongreedy)
  230. consume();
  231. // Note: don't touch match_length_minimum, it's already correct
  232. bytecode_to_repeat.insert_bytecode_repetition_min_one(bytecode_to_repeat, !nongreedy);
  233. return !has_error();
  234. } else if (match(TokenType::Asterisk)) {
  235. consume();
  236. match_length_minimum = 0;
  237. bool nongreedy = match(TokenType::Questionmark);
  238. if (nongreedy)
  239. consume();
  240. bytecode_to_repeat.insert_bytecode_repetition_any(bytecode_to_repeat, !nongreedy);
  241. return !has_error();
  242. } else if (match(TokenType::Questionmark)) {
  243. consume();
  244. match_length_minimum = 0;
  245. bool nongreedy = match(TokenType::Questionmark);
  246. if (nongreedy)
  247. consume();
  248. bytecode_to_repeat.insert_bytecode_repetition_zero_or_one(bytecode_to_repeat, !nongreedy);
  249. return !has_error();
  250. }
  251. return false;
  252. }
  253. ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack, size_t& match_length_minimum)
  254. {
  255. Vector<CompareTypeAndValuePair> values;
  256. for (;;) {
  257. if (match(TokenType::HyphenMinus)) {
  258. consume();
  259. if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
  260. // first in the bracket expression
  261. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  262. } else if (match(TokenType::RightBracket)) {
  263. // Last in the bracket expression
  264. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  265. } else if (values.last().type == CharacterCompareType::Char) {
  266. values.append({ CharacterCompareType::RangeExpressionDummy, 0 });
  267. if (match(TokenType::HyphenMinus)) {
  268. consume();
  269. // Valid range, add ordinary character
  270. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  271. }
  272. } else {
  273. return set_error(Error::InvalidRange);
  274. }
  275. } else if (match(TokenType::Circumflex)) {
  276. auto t = consume();
  277. if (values.is_empty())
  278. values.append({ CharacterCompareType::Inverse, 0 });
  279. else
  280. values.append({ CharacterCompareType::Char, (ByteCodeValueType)*t.value().characters_without_null_termination() });
  281. } else if (match(TokenType::LeftBracket)) {
  282. consume();
  283. if (match(TokenType::Period)) {
  284. consume();
  285. // FIXME: Parse collating element, this is needed when we have locale support
  286. // This could have impact on length parameter, I guess.
  287. VERIFY_NOT_REACHED();
  288. consume(TokenType::Period, Error::InvalidCollationElement);
  289. consume(TokenType::RightBracket, Error::MismatchingBracket);
  290. } else if (match(TokenType::EqualSign)) {
  291. consume();
  292. // FIXME: Parse collating element, this is needed when we have locale support
  293. // This could have impact on length parameter, I guess.
  294. VERIFY_NOT_REACHED();
  295. consume(TokenType::EqualSign, Error::InvalidCollationElement);
  296. consume(TokenType::RightBracket, Error::MismatchingBracket);
  297. } else if (match(TokenType::Colon)) {
  298. consume();
  299. CharClass ch_class;
  300. // parse character class
  301. if (match(TokenType::Char)) {
  302. if (consume("alnum"))
  303. ch_class = CharClass::Alnum;
  304. else if (consume("alpha"))
  305. ch_class = CharClass::Alpha;
  306. else if (consume("blank"))
  307. ch_class = CharClass::Blank;
  308. else if (consume("cntrl"))
  309. ch_class = CharClass::Cntrl;
  310. else if (consume("digit"))
  311. ch_class = CharClass::Digit;
  312. else if (consume("graph"))
  313. ch_class = CharClass::Graph;
  314. else if (consume("lower"))
  315. ch_class = CharClass::Lower;
  316. else if (consume("print"))
  317. ch_class = CharClass::Print;
  318. else if (consume("punct"))
  319. ch_class = CharClass::Punct;
  320. else if (consume("space"))
  321. ch_class = CharClass::Space;
  322. else if (consume("upper"))
  323. ch_class = CharClass::Upper;
  324. else if (consume("xdigit"))
  325. ch_class = CharClass::Xdigit;
  326. else
  327. return set_error(Error::InvalidCharacterClass);
  328. values.append({ CharacterCompareType::CharClass, (ByteCodeValueType)ch_class });
  329. } else
  330. return set_error(Error::InvalidCharacterClass);
  331. // FIXME: we do not support locale specific character classes until locales are implemented
  332. consume(TokenType::Colon, Error::InvalidCharacterClass);
  333. consume(TokenType::RightBracket, Error::MismatchingBracket);
  334. } else {
  335. return set_error(Error::MismatchingBracket);
  336. }
  337. } else if (match(TokenType::RightBracket)) {
  338. if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
  339. // handle bracket as ordinary character
  340. values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() });
  341. } else {
  342. // closing bracket expression
  343. break;
  344. }
  345. } else {
  346. values.append({ CharacterCompareType::Char, (ByteCodeValueType)skip() });
  347. }
  348. // check if range expression has to be completed...
  349. if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) {
  350. if (values.last().type != CharacterCompareType::Char)
  351. return set_error(Error::InvalidRange);
  352. auto value2 = values.take_last();
  353. values.take_last(); // RangeExpressionDummy
  354. auto value1 = values.take_last();
  355. values.append({ CharacterCompareType::CharRange, static_cast<ByteCodeValueType>(CharRange { (u32)value1.value, (u32)value2.value }) });
  356. }
  357. }
  358. if (values.size())
  359. match_length_minimum = 1;
  360. if (values.first().type == CharacterCompareType::Inverse)
  361. match_length_minimum = 0;
  362. stack.insert_bytecode_compare_values(move(values));
  363. return !has_error();
  364. }
  365. ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, size_t& match_length_minimum)
  366. {
  367. ByteCode bytecode;
  368. size_t length = 0;
  369. bool should_parse_repetition_symbol { false };
  370. for (;;) {
  371. if (match_ordinary_characters()) {
  372. Token start_token = m_parser_state.current_token;
  373. Token last_token = m_parser_state.current_token;
  374. for (;;) {
  375. if (!match_ordinary_characters())
  376. break;
  377. ++length;
  378. last_token = consume();
  379. }
  380. if (length > 1) {
  381. // last character is inserted into 'bytecode' for duplication symbol handling
  382. auto new_length = length - ((match_repetition_symbol() && length > 1) ? 1 : 0);
  383. stack.insert_bytecode_compare_string({ start_token.value().characters_without_null_termination(), new_length });
  384. }
  385. if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol
  386. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)last_token.value().characters_without_null_termination()[0] } });
  387. should_parse_repetition_symbol = true;
  388. break;
  389. }
  390. if (match_repetition_symbol())
  391. return set_error(Error::InvalidRepetitionMarker);
  392. if (match(TokenType::Period)) {
  393. length = 1;
  394. consume();
  395. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
  396. should_parse_repetition_symbol = true;
  397. break;
  398. }
  399. if (match(TokenType::EscapeSequence)) {
  400. length = 1;
  401. Token t = consume();
  402. #if REGEX_DEBUG
  403. printf("[PARSER] EscapeSequence with substring %s\n", String(t.value()).characters());
  404. #endif
  405. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u32)t.value().characters_without_null_termination()[1] } });
  406. should_parse_repetition_symbol = true;
  407. break;
  408. }
  409. if (match(TokenType::LeftBracket)) {
  410. consume();
  411. ByteCode sub_ops;
  412. if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size())
  413. return set_error(Error::InvalidBracketContent);
  414. bytecode.append(move(sub_ops));
  415. consume(TokenType::RightBracket, Error::MismatchingBracket);
  416. should_parse_repetition_symbol = true;
  417. break;
  418. }
  419. if (match(TokenType::RightBracket)) {
  420. return set_error(Error::MismatchingBracket);
  421. }
  422. if (match(TokenType::RightCurly)) {
  423. return set_error(Error::MismatchingBrace);
  424. }
  425. if (match(TokenType::Circumflex)) {
  426. consume();
  427. bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin);
  428. break;
  429. }
  430. if (match(TokenType::Dollar)) {
  431. consume();
  432. bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd);
  433. break;
  434. }
  435. if (match(TokenType::RightParen))
  436. return false;
  437. if (match(TokenType::LeftParen)) {
  438. consume();
  439. Optional<StringView> capture_group_name;
  440. bool prevent_capture_group = false;
  441. if (match(TokenType::Questionmark)) {
  442. consume();
  443. if (match(TokenType::Colon)) {
  444. consume();
  445. prevent_capture_group = true;
  446. } else if (consume("<")) { // named capturing group
  447. Token start_token = m_parser_state.current_token;
  448. Token last_token = m_parser_state.current_token;
  449. size_t capture_group_name_length = 0;
  450. for (;;) {
  451. if (!match_ordinary_characters())
  452. return set_error(Error::InvalidNameForCaptureGroup);
  453. if (match(TokenType::Char) && m_parser_state.current_token.value()[0] == '>') {
  454. consume();
  455. break;
  456. }
  457. ++capture_group_name_length;
  458. last_token = consume();
  459. }
  460. capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length);
  461. } else if (match(TokenType::EqualSign)) { // positive lookahead
  462. consume();
  463. VERIFY_NOT_REACHED();
  464. } else if (consume("!")) { // negative lookahead
  465. VERIFY_NOT_REACHED();
  466. } else if (consume("<")) {
  467. if (match(TokenType::EqualSign)) { // positive lookbehind
  468. consume();
  469. VERIFY_NOT_REACHED();
  470. }
  471. if (consume("!")) // negative lookbehind
  472. VERIFY_NOT_REACHED();
  473. } else {
  474. return set_error(Error::InvalidRepetitionMarker);
  475. }
  476. }
  477. if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
  478. if (capture_group_name.has_value())
  479. bytecode.insert_bytecode_group_capture_left(capture_group_name.value());
  480. else
  481. bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count);
  482. }
  483. ByteCode capture_group_bytecode;
  484. if (!parse_root(capture_group_bytecode, length))
  485. return set_error(Error::InvalidPattern);
  486. bytecode.append(move(capture_group_bytecode));
  487. consume(TokenType::RightParen, Error::MismatchingParen);
  488. if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
  489. if (capture_group_name.has_value()) {
  490. bytecode.insert_bytecode_group_capture_right(capture_group_name.value());
  491. ++m_parser_state.named_capture_groups_count;
  492. } else {
  493. bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count);
  494. ++m_parser_state.capture_groups_count;
  495. }
  496. }
  497. should_parse_repetition_symbol = true;
  498. break;
  499. }
  500. return false;
  501. }
  502. if (match_repetition_symbol()) {
  503. if (should_parse_repetition_symbol)
  504. parse_repetition_symbol(bytecode, length);
  505. else
  506. return set_error(Error::InvalidRepetitionMarker);
  507. }
  508. stack.append(move(bytecode));
  509. match_length_minimum += length;
  510. return true;
  511. }
  512. bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minimum)
  513. {
  514. ByteCode bytecode_left;
  515. size_t match_length_minimum_left { 0 };
  516. if (match_repetition_symbol())
  517. return set_error(Error::InvalidRepetitionMarker);
  518. for (;;) {
  519. if (!parse_sub_expression(bytecode_left, match_length_minimum_left))
  520. break;
  521. if (match(TokenType::Pipe)) {
  522. consume();
  523. ByteCode bytecode_right;
  524. size_t match_length_minimum_right { 0 };
  525. if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty())
  526. return set_error(Error::InvalidPattern);
  527. ByteCode new_bytecode;
  528. new_bytecode.insert_bytecode_alternation(move(bytecode_left), move(bytecode_right));
  529. bytecode_left = move(new_bytecode);
  530. match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left);
  531. }
  532. }
  533. if (bytecode_left.is_empty())
  534. set_error(Error::EmptySubExpression);
  535. stack.append(move(bytecode_left));
  536. match_length_minimum = match_length_minimum_left;
  537. return !has_error();
  538. }
  539. // =============================
  540. // ECMA262 Parser
  541. // =============================
  542. bool ECMA262Parser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
  543. {
  544. if (m_parser_state.regex_options.has_flag_set(AllFlags::Unicode)) {
  545. return parse_pattern(stack, match_length_minimum, true, true);
  546. } else {
  547. ByteCode new_stack;
  548. size_t new_match_length = 0;
  549. auto res = parse_pattern(new_stack, new_match_length, false, false);
  550. if (m_parser_state.named_capture_groups_count > 0) {
  551. reset();
  552. return parse_pattern(stack, match_length_minimum, false, true);
  553. }
  554. if (!res)
  555. return false;
  556. stack.append(new_stack);
  557. match_length_minimum = new_match_length;
  558. return res;
  559. }
  560. }
  561. bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  562. {
  563. return parse_disjunction(stack, match_length_minimum, unicode, named);
  564. }
  565. bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  566. {
  567. ByteCode left_alternative_stack;
  568. size_t left_alternative_min_length = 0;
  569. auto alt_ok = parse_alternative(left_alternative_stack, left_alternative_min_length, unicode, named);
  570. if (!alt_ok)
  571. return false;
  572. if (!match(TokenType::Pipe)) {
  573. stack.append(left_alternative_stack);
  574. match_length_minimum = left_alternative_min_length;
  575. return alt_ok;
  576. }
  577. consume();
  578. ByteCode right_alternative_stack;
  579. size_t right_alternative_min_length = 0;
  580. auto continuation_ok = parse_disjunction(right_alternative_stack, right_alternative_min_length, unicode, named);
  581. if (!continuation_ok)
  582. return false;
  583. stack.insert_bytecode_alternation(move(left_alternative_stack), move(right_alternative_stack));
  584. match_length_minimum = min(left_alternative_min_length, right_alternative_min_length);
  585. return continuation_ok;
  586. }
  587. bool ECMA262Parser::parse_alternative(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  588. {
  589. for (;;) {
  590. if (match(TokenType::Eof))
  591. return true;
  592. if (parse_term(stack, match_length_minimum, unicode, named))
  593. continue;
  594. return !has_error();
  595. }
  596. }
  597. bool ECMA262Parser::parse_term(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  598. {
  599. if (parse_assertion(stack, match_length_minimum, unicode, named))
  600. return true;
  601. ByteCode atom_stack;
  602. size_t minimum_atom_length = 0;
  603. auto parse_with_quantifier = [&] {
  604. bool did_parse_one = false;
  605. if (m_should_use_browser_extended_grammar)
  606. did_parse_one = parse_extended_atom(atom_stack, minimum_atom_length, named);
  607. if (!did_parse_one)
  608. did_parse_one = parse_atom(atom_stack, minimum_atom_length, unicode, named);
  609. if (!did_parse_one)
  610. return false;
  611. VERIFY(did_parse_one);
  612. if (!parse_quantifier(atom_stack, minimum_atom_length, unicode, named))
  613. return false;
  614. return true;
  615. };
  616. if (!parse_with_quantifier())
  617. return false;
  618. stack.append(move(atom_stack));
  619. match_length_minimum += minimum_atom_length;
  620. return true;
  621. }
  622. bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& match_length_minimum, bool unicode, bool named)
  623. {
  624. if (match(TokenType::Circumflex)) {
  625. consume();
  626. stack.empend((ByteCodeValueType)OpCodeId::CheckBegin);
  627. return true;
  628. }
  629. if (match(TokenType::Dollar)) {
  630. consume();
  631. stack.empend((ByteCodeValueType)OpCodeId::CheckEnd);
  632. return true;
  633. }
  634. if (try_skip("\\b")) {
  635. stack.insert_bytecode_check_boundary(BoundaryCheckType::Word);
  636. return true;
  637. }
  638. if (try_skip("\\B")) {
  639. stack.insert_bytecode_check_boundary(BoundaryCheckType::NonWord);
  640. return true;
  641. }
  642. if (match(TokenType::LeftParen)) {
  643. if (!try_skip("(?"))
  644. return false;
  645. if (done()) {
  646. set_error(Error::InvalidCaptureGroup);
  647. return false;
  648. }
  649. ByteCode assertion_stack;
  650. size_t length_dummy = 0;
  651. bool should_parse_forward_assertion = m_should_use_browser_extended_grammar ? unicode : true;
  652. if (should_parse_forward_assertion && try_skip("=")) {
  653. if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
  654. return false;
  655. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead);
  656. return true;
  657. }
  658. if (should_parse_forward_assertion && try_skip("!")) {
  659. if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
  660. return false;
  661. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
  662. return true;
  663. }
  664. if (m_should_use_browser_extended_grammar) {
  665. if (!unicode) {
  666. if (parse_quantifiable_assertion(assertion_stack, match_length_minimum, named)) {
  667. stack.append(move(assertion_stack));
  668. return true;
  669. }
  670. }
  671. }
  672. if (try_skip("<=")) {
  673. if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
  674. return false;
  675. // FIXME: Somehow ensure that this assertion regexp has a fixed length.
  676. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookBehind, length_dummy);
  677. return true;
  678. }
  679. if (try_skip("<!")) {
  680. if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
  681. return false;
  682. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookBehind, length_dummy);
  683. return true;
  684. }
  685. // If none of these matched, put the '(?' back.
  686. m_parser_state.lexer.back(3);
  687. m_parser_state.current_token = m_parser_state.lexer.next();
  688. return false;
  689. }
  690. return false;
  691. }
  692. bool ECMA262Parser::parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named)
  693. {
  694. auto disjunction_ok = parse_disjunction(bytecode_stack, length, unicode, named);
  695. if (!disjunction_ok)
  696. return false;
  697. consume(TokenType::RightParen, Error::MismatchingParen);
  698. return true;
  699. }
  700. bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, bool named)
  701. {
  702. VERIFY(m_should_use_browser_extended_grammar);
  703. ByteCode assertion_stack;
  704. size_t match_length_minimum = 0;
  705. if (try_skip("=")) {
  706. if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named))
  707. return false;
  708. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead);
  709. return true;
  710. }
  711. if (try_skip("!")) {
  712. if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named))
  713. return false;
  714. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
  715. return true;
  716. }
  717. return false;
  718. }
  719. StringView ECMA262Parser::read_digits_as_string(ReadDigitsInitialZeroState initial_zero, bool hex, int max_count)
  720. {
  721. if (!match(TokenType::Char))
  722. return {};
  723. if (initial_zero == ReadDigitsInitialZeroState::Disallow && m_parser_state.current_token.value() == "0")
  724. return {};
  725. int count = 0;
  726. size_t offset = 0;
  727. auto start_token = m_parser_state.current_token;
  728. while (match(TokenType::Char)) {
  729. auto& c = m_parser_state.current_token.value();
  730. if (max_count > 0 && count >= max_count)
  731. break;
  732. if (hex && !AK::StringUtils::convert_to_uint_from_hex(c).has_value())
  733. break;
  734. if (!hex && !c.to_uint().has_value())
  735. break;
  736. offset += consume().value().length();
  737. ++count;
  738. }
  739. return StringView { start_token.value().characters_without_null_termination(), offset };
  740. }
  741. Optional<unsigned> ECMA262Parser::read_digits(ECMA262Parser::ReadDigitsInitialZeroState initial_zero, bool hex, int max_count)
  742. {
  743. auto str = read_digits_as_string(initial_zero, hex, max_count);
  744. if (str.is_empty())
  745. return {};
  746. if (hex)
  747. return AK::StringUtils::convert_to_uint_from_hex(str);
  748. return str.to_uint();
  749. }
  750. bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool, bool)
  751. {
  752. enum class Repetition {
  753. OneOrMore,
  754. ZeroOrMore,
  755. Optional,
  756. Explicit,
  757. None,
  758. } repetition_mark { Repetition::None };
  759. bool ungreedy = false;
  760. Optional<size_t> repeat_min, repeat_max;
  761. if (match(TokenType::Asterisk)) {
  762. consume();
  763. repetition_mark = Repetition::ZeroOrMore;
  764. } else if (match(TokenType::Plus)) {
  765. consume();
  766. repetition_mark = Repetition::OneOrMore;
  767. } else if (match(TokenType::Questionmark)) {
  768. consume();
  769. repetition_mark = Repetition::Optional;
  770. } else if (match(TokenType::LeftCurly)) {
  771. consume();
  772. auto chars_consumed = 1;
  773. repetition_mark = Repetition::Explicit;
  774. auto low_bound_string = read_digits_as_string();
  775. chars_consumed += low_bound_string.length();
  776. auto low_bound = low_bound_string.to_uint();
  777. if (!low_bound.has_value()) {
  778. back(chars_consumed + 1);
  779. return true;
  780. }
  781. repeat_min = low_bound.value();
  782. if (match(TokenType::Comma)) {
  783. consume();
  784. ++chars_consumed;
  785. auto high_bound_string = read_digits_as_string();
  786. auto high_bound = high_bound_string.to_uint();
  787. if (high_bound.has_value()) {
  788. repeat_max = high_bound.value();
  789. chars_consumed += high_bound_string.length();
  790. }
  791. } else {
  792. repeat_max = repeat_min;
  793. }
  794. if (!match(TokenType::RightCurly)) {
  795. back(chars_consumed + 1);
  796. return true;
  797. }
  798. consume();
  799. ++chars_consumed;
  800. if (repeat_max.has_value()) {
  801. if (repeat_min.value() > repeat_max.value())
  802. set_error(Error::InvalidBraceContent);
  803. }
  804. } else {
  805. return true;
  806. }
  807. if (match(TokenType::Questionmark)) {
  808. consume();
  809. ungreedy = true;
  810. }
  811. ByteCode new_bytecode;
  812. switch (repetition_mark) {
  813. case Repetition::OneOrMore:
  814. new_bytecode.insert_bytecode_repetition_min_one(stack, !ungreedy);
  815. break;
  816. case Repetition::ZeroOrMore:
  817. new_bytecode.insert_bytecode_repetition_any(stack, !ungreedy);
  818. match_length_minimum = 0;
  819. break;
  820. case Repetition::Optional:
  821. new_bytecode.insert_bytecode_repetition_zero_or_one(stack, !ungreedy);
  822. match_length_minimum = 0;
  823. break;
  824. case Repetition::Explicit:
  825. new_bytecode.insert_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max, !ungreedy);
  826. match_length_minimum *= repeat_min.value();
  827. break;
  828. case Repetition::None:
  829. VERIFY_NOT_REACHED();
  830. }
  831. return true;
  832. }
  833. bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  834. {
  835. if (match(TokenType::EscapeSequence)) {
  836. // Also part of AtomEscape.
  837. auto token = consume();
  838. match_length_minimum += 1;
  839. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[1] } });
  840. return true;
  841. }
  842. if (try_skip("\\")) {
  843. // AtomEscape.
  844. return parse_atom_escape(stack, match_length_minimum, unicode, named);
  845. }
  846. if (match(TokenType::LeftBracket)) {
  847. // Character class.
  848. return parse_character_class(stack, match_length_minimum, unicode && !m_should_use_browser_extended_grammar, named);
  849. }
  850. if (match(TokenType::LeftParen)) {
  851. // Non-capturing group, or a capture group.
  852. return parse_capture_group(stack, match_length_minimum, unicode && !m_should_use_browser_extended_grammar, named);
  853. }
  854. if (match(TokenType::Period)) {
  855. consume();
  856. match_length_minimum += 1;
  857. stack.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
  858. return true;
  859. }
  860. if (match(TokenType::Circumflex) || match(TokenType::Dollar) || match(TokenType::RightParen)
  861. || match(TokenType::Pipe) || match(TokenType::Plus) || match(TokenType::Asterisk)
  862. || match(TokenType::Questionmark)) {
  863. return false;
  864. }
  865. if (match(TokenType::RightBracket) || match(TokenType::RightCurly) || match(TokenType::LeftCurly)) {
  866. if (m_should_use_browser_extended_grammar) {
  867. auto token = consume();
  868. match_length_minimum += 1;
  869. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
  870. return true;
  871. } else {
  872. return false;
  873. }
  874. }
  875. if (match_ordinary_characters()) {
  876. auto token = consume().value();
  877. match_length_minimum += 1;
  878. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token[0] } });
  879. return true;
  880. }
  881. set_error(Error::InvalidPattern);
  882. return false;
  883. }
  884. bool ECMA262Parser::parse_extended_atom(ByteCode&, size_t&, bool)
  885. {
  886. // Note: This includes only rules *not* present in parse_atom()
  887. VERIFY(m_should_use_browser_extended_grammar);
  888. if (parse_invalid_braced_quantifier())
  889. return true; // FAIL FAIL FAIL
  890. return false;
  891. }
  892. bool ECMA262Parser::parse_invalid_braced_quantifier()
  893. {
  894. if (!match(TokenType::LeftCurly))
  895. return false;
  896. consume();
  897. size_t chars_consumed = 1;
  898. auto low_bound = read_digits_as_string();
  899. StringView high_bound;
  900. if (low_bound.is_empty()) {
  901. back(chars_consumed + 1);
  902. return false;
  903. }
  904. chars_consumed += low_bound.length();
  905. if (match(TokenType::Comma)) {
  906. consume();
  907. ++chars_consumed;
  908. high_bound = read_digits_as_string();
  909. chars_consumed += high_bound.length();
  910. }
  911. if (!match(TokenType::RightCurly)) {
  912. back(chars_consumed + 1);
  913. return false;
  914. }
  915. consume();
  916. set_error(Error::InvalidPattern);
  917. return true;
  918. }
  919. bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  920. {
  921. if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow); !escape_str.is_empty()) {
  922. if (auto escape = escape_str.to_uint(); escape.has_value()) {
  923. // See if this is a "back"-reference (we've already parsed the group it refers to)
  924. auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
  925. if (maybe_length.has_value()) {
  926. match_length_minimum += maybe_length.value();
  927. stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
  928. return true;
  929. }
  930. // It's not a pattern seen before, so we have to see if it's a valid reference to a future group.
  931. if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) {
  932. // This refers to a future group, and it will _always_ be matching an empty string
  933. // So just match nothing and move on.
  934. return true;
  935. }
  936. if (!m_should_use_browser_extended_grammar) {
  937. set_error(Error::InvalidNumber);
  938. return false;
  939. }
  940. }
  941. // If not, put the characters back.
  942. back(escape_str.length());
  943. }
  944. // CharacterEscape > ControlEscape
  945. if (try_skip("f")) {
  946. match_length_minimum += 1;
  947. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } });
  948. return true;
  949. }
  950. if (try_skip("n")) {
  951. match_length_minimum += 1;
  952. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } });
  953. return true;
  954. }
  955. if (try_skip("r")) {
  956. match_length_minimum += 1;
  957. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } });
  958. return true;
  959. }
  960. if (try_skip("t")) {
  961. match_length_minimum += 1;
  962. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } });
  963. return true;
  964. }
  965. if (try_skip("v")) {
  966. match_length_minimum += 1;
  967. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } });
  968. return true;
  969. }
  970. // CharacterEscape > ControlLetter
  971. if (try_skip("c")) {
  972. for (auto c = 'A'; c <= 'z'; ++c) {
  973. if (try_skip({ &c, 1 })) {
  974. match_length_minimum += 1;
  975. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } });
  976. return true;
  977. }
  978. }
  979. if (m_should_use_browser_extended_grammar) {
  980. back(2);
  981. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\\' } });
  982. match_length_minimum += 1;
  983. return true;
  984. }
  985. if (unicode) {
  986. set_error(Error::InvalidPattern);
  987. return false;
  988. }
  989. // Allow '\c' in non-unicode mode, just matches 'c'.
  990. match_length_minimum += 1;
  991. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } });
  992. return true;
  993. }
  994. // LegacyOctalEscapeSequence
  995. if (m_should_use_browser_extended_grammar) {
  996. if (!unicode) {
  997. if (auto escape = parse_legacy_octal_escape(); escape.has_value()) {
  998. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)escape.value() } });
  999. match_length_minimum += 1;
  1000. return true;
  1001. }
  1002. }
  1003. }
  1004. // '\0'
  1005. if (try_skip("0")) {
  1006. match_length_minimum += 1;
  1007. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
  1008. return true;
  1009. }
  1010. // HexEscape
  1011. if (try_skip("x")) {
  1012. if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2); hex_escape.has_value()) {
  1013. match_length_minimum += 1;
  1014. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
  1015. return true;
  1016. } else if (!unicode) {
  1017. // '\x' is allowed in non-unicode mode, just matches 'x'.
  1018. match_length_minimum += 1;
  1019. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } });
  1020. return true;
  1021. } else {
  1022. set_error(Error::InvalidPattern);
  1023. return false;
  1024. }
  1025. }
  1026. if (try_skip("u")) {
  1027. if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) {
  1028. // FIXME: The minimum length depends on the mode - should be utf8-length in u8 mode.
  1029. match_length_minimum += 1;
  1030. StringBuilder builder;
  1031. builder.append_code_point(code_point.value());
  1032. // FIXME: This isn't actually correct for ECMAScript.
  1033. auto u8_encoded = builder.string_view();
  1034. stack.insert_bytecode_compare_string(u8_encoded);
  1035. return true;
  1036. } else if (!unicode) {
  1037. // '\u' is allowed in non-unicode mode, just matches 'u'.
  1038. match_length_minimum += 1;
  1039. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'u' } });
  1040. return true;
  1041. } else {
  1042. set_error(Error::InvalidPattern);
  1043. return false;
  1044. }
  1045. }
  1046. // IdentityEscape
  1047. auto source_characters = m_should_use_browser_extended_grammar ? "^$\\.*+?()[|"sv : "^$\\.*+?()[]{}|"sv;
  1048. for (auto ch : source_characters) {
  1049. if (try_skip({ &ch, 1 })) {
  1050. match_length_minimum += 1;
  1051. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
  1052. return true;
  1053. }
  1054. }
  1055. if (unicode) {
  1056. if (try_skip("/")) {
  1057. match_length_minimum += 1;
  1058. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'/' } });
  1059. return true;
  1060. }
  1061. }
  1062. if (named && try_skip("k")) {
  1063. auto name = read_capture_group_specifier(true);
  1064. if (name.is_empty()) {
  1065. set_error(Error::InvalidNameForCaptureGroup);
  1066. return false;
  1067. }
  1068. auto maybe_length = m_parser_state.named_capture_group_minimum_lengths.get(name);
  1069. if (!maybe_length.has_value()) {
  1070. set_error(Error::InvalidNameForCaptureGroup);
  1071. return false;
  1072. }
  1073. match_length_minimum += maybe_length.value();
  1074. stack.insert_bytecode_compare_named_reference(name);
  1075. return true;
  1076. }
  1077. if (unicode) {
  1078. if (try_skip("p{")) {
  1079. // FIXME: Implement this path, Unicode property match.
  1080. TODO();
  1081. }
  1082. if (try_skip("P{")) {
  1083. // FIXME: Implement this path, Unicode property match.
  1084. TODO();
  1085. }
  1086. }
  1087. if (done())
  1088. return set_error(Error::InvalidTrailingEscape);
  1089. bool negate = false;
  1090. auto ch = parse_character_class_escape(negate);
  1091. if (!ch.has_value()) {
  1092. if (!unicode) {
  1093. // Allow all SourceCharacter's as escapes here.
  1094. auto token = consume();
  1095. match_length_minimum += 1;
  1096. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
  1097. return true;
  1098. }
  1099. set_error(Error::InvalidCharacterClass);
  1100. return false;
  1101. }
  1102. Vector<CompareTypeAndValuePair> compares;
  1103. if (negate)
  1104. compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
  1105. compares.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)ch.value() });
  1106. match_length_minimum += 1;
  1107. stack.insert_bytecode_compare_values(move(compares));
  1108. return true;
  1109. }
  1110. Optional<u8> ECMA262Parser::parse_legacy_octal_escape()
  1111. {
  1112. constexpr auto all_octal_digits = "01234567";
  1113. auto read_octal_digit = [&](auto start, auto end, bool should_ensure_no_following_octal_digit) -> Optional<u8> {
  1114. for (char c = '0' + start; c <= '0' + end; ++c) {
  1115. if (try_skip({ &c, 1 })) {
  1116. if (!should_ensure_no_following_octal_digit || !lookahead_any(all_octal_digits))
  1117. return c - '0';
  1118. back(2);
  1119. return {};
  1120. }
  1121. }
  1122. return {};
  1123. };
  1124. // OctalDigit(1)
  1125. if (auto digit = read_octal_digit(0, 7, true); digit.has_value()) {
  1126. return digit.value();
  1127. }
  1128. // OctalDigit(2)
  1129. if (auto left_digit = read_octal_digit(0, 3, false); left_digit.has_value()) {
  1130. if (auto right_digit = read_octal_digit(0, 7, true); right_digit.has_value()) {
  1131. return left_digit.value() * 8 + right_digit.value();
  1132. }
  1133. back(2);
  1134. }
  1135. // OctalDigit(2)
  1136. if (auto left_digit = read_octal_digit(4, 7, false); left_digit.has_value()) {
  1137. if (auto right_digit = read_octal_digit(0, 7, false); right_digit.has_value()) {
  1138. return left_digit.value() * 8 + right_digit.value();
  1139. }
  1140. back(2);
  1141. }
  1142. // OctalDigit(3)
  1143. if (auto left_digit = read_octal_digit(0, 3, false); left_digit.has_value()) {
  1144. size_t chars_consumed = 1;
  1145. if (auto mid_digit = read_octal_digit(0, 7, false); mid_digit.has_value()) {
  1146. ++chars_consumed;
  1147. if (auto right_digit = read_octal_digit(0, 7, false); right_digit.has_value()) {
  1148. return left_digit.value() * 64 + mid_digit.value() * 8 + right_digit.value();
  1149. }
  1150. }
  1151. back(chars_consumed);
  1152. }
  1153. return {};
  1154. }
  1155. Optional<CharClass> ECMA262Parser::parse_character_class_escape(bool& negate, bool expect_backslash)
  1156. {
  1157. if (expect_backslash && !try_skip("\\"))
  1158. return {};
  1159. // CharacterClassEscape
  1160. CharClass ch_class;
  1161. if (try_skip("d")) {
  1162. ch_class = CharClass::Digit;
  1163. } else if (try_skip("D")) {
  1164. ch_class = CharClass::Digit;
  1165. negate = true;
  1166. } else if (try_skip("s")) {
  1167. ch_class = CharClass::Space;
  1168. } else if (try_skip("S")) {
  1169. ch_class = CharClass::Space;
  1170. negate = true;
  1171. } else if (try_skip("w")) {
  1172. ch_class = CharClass::Word;
  1173. } else if (try_skip("W")) {
  1174. ch_class = CharClass::Word;
  1175. negate = true;
  1176. } else {
  1177. return {};
  1178. }
  1179. return ch_class;
  1180. }
  1181. bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool)
  1182. {
  1183. consume(TokenType::LeftBracket, Error::InvalidPattern);
  1184. Vector<CompareTypeAndValuePair> compares;
  1185. if (match(TokenType::Circumflex)) {
  1186. // Negated charclass
  1187. consume();
  1188. compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
  1189. }
  1190. if (match(TokenType::RightBracket)) {
  1191. consume();
  1192. return true;
  1193. }
  1194. if (!parse_nonempty_class_ranges(compares, unicode))
  1195. return false;
  1196. match_length_minimum += 1;
  1197. stack.insert_bytecode_compare_values(move(compares));
  1198. return true;
  1199. }
  1200. struct CharClassRangeElement {
  1201. union {
  1202. CharClass character_class;
  1203. u32 code_point { 0 };
  1204. };
  1205. bool is_negated { false };
  1206. bool is_character_class { false };
  1207. };
  1208. bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
  1209. {
  1210. auto read_class_atom_no_dash = [&]() -> Optional<CharClassRangeElement> {
  1211. if (match(TokenType::EscapeSequence)) {
  1212. auto token = consume().value();
  1213. return { { .code_point = (u32)token[1], .is_character_class = false } };
  1214. }
  1215. if (try_skip("\\")) {
  1216. if (done()) {
  1217. set_error(Error::InvalidTrailingEscape);
  1218. return {};
  1219. }
  1220. if (try_skip("f"))
  1221. return { { .code_point = '\f', .is_character_class = false } };
  1222. if (try_skip("n"))
  1223. return { { .code_point = '\n', .is_character_class = false } };
  1224. if (try_skip("r"))
  1225. return { { .code_point = '\r', .is_character_class = false } };
  1226. if (try_skip("t"))
  1227. return { { .code_point = '\t', .is_character_class = false } };
  1228. if (try_skip("v"))
  1229. return { { .code_point = '\v', .is_character_class = false } };
  1230. if (try_skip("b"))
  1231. return { { .code_point = '\b', .is_character_class = false } };
  1232. if (try_skip("/"))
  1233. return { { .code_point = '/', .is_character_class = false } };
  1234. // CharacterEscape > ControlLetter
  1235. if (try_skip("c")) {
  1236. for (auto c = 'A'; c <= 'z'; ++c) {
  1237. if (try_skip({ &c, 1 }))
  1238. return { { .code_point = (u32)(c % 32), .is_character_class = false } };
  1239. }
  1240. if (m_should_use_browser_extended_grammar) {
  1241. for (auto c = '0'; c <= '9'; ++c) {
  1242. if (try_skip({ &c, 1 }))
  1243. return { { .code_point = (u32)(c % 32), .is_character_class = false } };
  1244. }
  1245. if (try_skip("_"))
  1246. return { { .code_point = (u32)('_' % 32), .is_character_class = false } };
  1247. back(2);
  1248. return { { .code_point = '\\', .is_character_class = false } };
  1249. }
  1250. }
  1251. // '\0'
  1252. if (try_skip("0"))
  1253. return { { .code_point = 0, .is_character_class = false } };
  1254. // HexEscape
  1255. if (try_skip("x")) {
  1256. if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2); hex_escape.has_value()) {
  1257. return { { .code_point = hex_escape.value(), .is_character_class = false } };
  1258. } else if (!unicode) {
  1259. // '\x' is allowed in non-unicode mode, just matches 'x'.
  1260. return { { .code_point = 'x', .is_character_class = false } };
  1261. } else {
  1262. set_error(Error::InvalidPattern);
  1263. return {};
  1264. }
  1265. }
  1266. if (try_skip("u")) {
  1267. if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) {
  1268. // FIXME: While codepoint ranges are supported, codepoint matches as "Char" are not!
  1269. return { { .code_point = code_point.value(), .is_character_class = false } };
  1270. } else if (!unicode) {
  1271. // '\u' is allowed in non-unicode mode, just matches 'u'.
  1272. return { { .code_point = 'u', .is_character_class = false } };
  1273. } else {
  1274. set_error(Error::InvalidPattern);
  1275. return {};
  1276. }
  1277. }
  1278. if (unicode) {
  1279. if (try_skip("-"))
  1280. return { { .code_point = '-', .is_character_class = false } };
  1281. }
  1282. if (try_skip("p{") || try_skip("P{")) {
  1283. // FIXME: Implement these; unicode properties.
  1284. TODO();
  1285. }
  1286. if (try_skip("d"))
  1287. return { { .character_class = CharClass::Digit, .is_character_class = true } };
  1288. if (try_skip("s"))
  1289. return { { .character_class = CharClass::Space, .is_character_class = true } };
  1290. if (try_skip("w"))
  1291. return { { .character_class = CharClass::Word, .is_character_class = true } };
  1292. if (try_skip("D"))
  1293. return { { .character_class = CharClass::Digit, .is_negated = true, .is_character_class = true } };
  1294. if (try_skip("S"))
  1295. return { { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
  1296. if (try_skip("W"))
  1297. return { { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
  1298. if (!unicode) {
  1299. // Any unrecognised escape is allowed in non-unicode mode.
  1300. return { { .code_point = (u32)skip(), .is_character_class = false } };
  1301. }
  1302. }
  1303. if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
  1304. return {};
  1305. // Allow any (other) SourceCharacter.
  1306. return { { .code_point = (u32)skip(), .is_character_class = false } };
  1307. };
  1308. auto read_class_atom = [&]() -> Optional<CharClassRangeElement> {
  1309. if (match(TokenType::HyphenMinus)) {
  1310. consume();
  1311. return { { .code_point = '-', .is_character_class = false } };
  1312. }
  1313. return read_class_atom_no_dash();
  1314. };
  1315. while (!match(TokenType::RightBracket)) {
  1316. if (match(TokenType::Eof)) {
  1317. set_error(Error::MismatchingBracket);
  1318. return false;
  1319. }
  1320. auto first_atom = read_class_atom();
  1321. if (!first_atom.has_value())
  1322. return false;
  1323. if (match(TokenType::HyphenMinus)) {
  1324. consume();
  1325. if (match(TokenType::RightBracket)) {
  1326. // Allow '-' as the last element in a charclass, even after an atom.
  1327. m_parser_state.lexer.back(2); // -]
  1328. m_parser_state.current_token = m_parser_state.lexer.next();
  1329. goto read_as_single_atom;
  1330. }
  1331. auto second_atom = read_class_atom();
  1332. if (!second_atom.has_value())
  1333. return false;
  1334. if (first_atom.value().is_character_class || second_atom.value().is_character_class) {
  1335. if (m_should_use_browser_extended_grammar) {
  1336. if (unicode) {
  1337. set_error(Error::InvalidRange);
  1338. return false;
  1339. }
  1340. // CharacterRangeOrUnion > !Unicode > CharClass
  1341. if (first_atom->is_character_class)
  1342. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)first_atom->character_class });
  1343. else
  1344. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)first_atom->code_point });
  1345. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)'-' });
  1346. if (second_atom->is_character_class)
  1347. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)second_atom->character_class });
  1348. else
  1349. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)second_atom->code_point });
  1350. continue;
  1351. } else {
  1352. set_error(Error::InvalidRange);
  1353. return false;
  1354. }
  1355. }
  1356. if (first_atom.value().code_point > second_atom.value().code_point) {
  1357. set_error(Error::InvalidRange);
  1358. return false;
  1359. }
  1360. VERIFY(!first_atom.value().is_negated);
  1361. VERIFY(!second_atom.value().is_negated);
  1362. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharRange, CharRange { first_atom.value().code_point, second_atom.value().code_point } });
  1363. continue;
  1364. }
  1365. read_as_single_atom:;
  1366. auto atom = first_atom.value();
  1367. if (atom.is_character_class) {
  1368. if (atom.is_negated)
  1369. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 });
  1370. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)first_atom.value().character_class });
  1371. } else {
  1372. VERIFY(!atom.is_negated);
  1373. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, first_atom.value().code_point });
  1374. }
  1375. }
  1376. consume(TokenType::RightBracket, Error::MismatchingBracket);
  1377. return true;
  1378. }
  1379. StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
  1380. {
  1381. if (take_starting_angle_bracket && !consume("<"))
  1382. return {};
  1383. auto start_token = m_parser_state.current_token;
  1384. size_t offset = 0;
  1385. while (match(TokenType::Char)) {
  1386. auto c = m_parser_state.current_token.value();
  1387. if (c == ">")
  1388. break;
  1389. offset += consume().value().length();
  1390. }
  1391. StringView name { start_token.value().characters_without_null_termination(), offset };
  1392. if (!consume(">") || name.is_empty())
  1393. set_error(Error::InvalidNameForCaptureGroup);
  1394. return name;
  1395. }
  1396. bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  1397. {
  1398. consume(TokenType::LeftParen, Error::InvalidPattern);
  1399. if (match(TokenType::Questionmark)) {
  1400. // Non-capturing group or group with specifier.
  1401. consume();
  1402. if (match(TokenType::Colon)) {
  1403. consume();
  1404. ByteCode noncapture_group_bytecode;
  1405. size_t length = 0;
  1406. if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
  1407. return set_error(Error::InvalidPattern);
  1408. consume(TokenType::RightParen, Error::MismatchingParen);
  1409. stack.append(move(noncapture_group_bytecode));
  1410. match_length_minimum += length;
  1411. return true;
  1412. }
  1413. if (consume("<")) {
  1414. ++m_parser_state.named_capture_groups_count;
  1415. auto group_index = ++m_parser_state.capture_groups_count; // Named capture groups count as normal capture groups too.
  1416. auto name = read_capture_group_specifier();
  1417. if (name.is_empty()) {
  1418. set_error(Error::InvalidNameForCaptureGroup);
  1419. return false;
  1420. }
  1421. ByteCode capture_group_bytecode;
  1422. size_t length = 0;
  1423. if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
  1424. return set_error(Error::InvalidPattern);
  1425. consume(TokenType::RightParen, Error::MismatchingParen);
  1426. stack.insert_bytecode_group_capture_left(name);
  1427. stack.insert_bytecode_group_capture_left(group_index);
  1428. stack.append(move(capture_group_bytecode));
  1429. stack.insert_bytecode_group_capture_right(name);
  1430. stack.insert_bytecode_group_capture_right(group_index);
  1431. match_length_minimum += length;
  1432. m_parser_state.named_capture_group_minimum_lengths.set(name, length);
  1433. m_parser_state.capture_group_minimum_lengths.set(group_index, length);
  1434. return true;
  1435. }
  1436. set_error(Error::InvalidCaptureGroup);
  1437. return false;
  1438. }
  1439. auto group_index = ++m_parser_state.capture_groups_count;
  1440. stack.insert_bytecode_group_capture_left(group_index);
  1441. ByteCode capture_group_bytecode;
  1442. size_t length = 0;
  1443. if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
  1444. return set_error(Error::InvalidPattern);
  1445. stack.append(move(capture_group_bytecode));
  1446. m_parser_state.capture_group_minimum_lengths.set(group_index, length);
  1447. consume(TokenType::RightParen, Error::MismatchingParen);
  1448. stack.insert_bytecode_group_capture_right(group_index);
  1449. match_length_minimum += length;
  1450. return true;
  1451. }
  1452. size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis()
  1453. {
  1454. if (m_total_number_of_capturing_parenthesis.has_value())
  1455. return m_total_number_of_capturing_parenthesis.value();
  1456. GenericLexer lexer { m_parser_state.lexer.source() };
  1457. size_t count = 0;
  1458. while (!lexer.is_eof()) {
  1459. switch (lexer.peek()) {
  1460. case '\\':
  1461. lexer.consume(2);
  1462. continue;
  1463. case '[':
  1464. while (!lexer.is_eof()) {
  1465. if (lexer.consume_specific('\\'))
  1466. lexer.consume();
  1467. else if (lexer.consume_specific(']'))
  1468. break;
  1469. lexer.consume();
  1470. }
  1471. break;
  1472. case '(':
  1473. if (lexer.consume_specific('?')) {
  1474. // non-capturing group '(?:', lookaround '(?<='/'(?<!', or named capture '(?<'
  1475. if (!lexer.consume_specific('<'))
  1476. break;
  1477. if (lexer.next_is(is_any_of("=!")))
  1478. break;
  1479. ++count;
  1480. } else {
  1481. ++count;
  1482. }
  1483. break;
  1484. }
  1485. lexer.consume();
  1486. }
  1487. m_total_number_of_capturing_parenthesis = count;
  1488. return count;
  1489. }
  1490. }