RegexParser.cpp 47 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include "RegexParser.h"
  27. #include "RegexDebug.h"
  28. #include <AK/String.h>
  29. #include <AK/StringBuilder.h>
  30. #include <AK/StringUtils.h>
  31. namespace regex {
  32. ALWAYS_INLINE bool Parser::set_error(Error error)
  33. {
  34. if (m_parser_state.error == Error::NoError) {
  35. m_parser_state.error = error;
  36. m_parser_state.error_token = m_parser_state.current_token;
  37. }
  38. return false; // always return false, that eases the API usage (return set_error(...)) :^)
  39. }
  40. ALWAYS_INLINE bool Parser::done() const
  41. {
  42. return match(TokenType::Eof);
  43. }
  44. ALWAYS_INLINE bool Parser::match(TokenType type) const
  45. {
  46. return m_parser_state.current_token.type() == type;
  47. }
  48. ALWAYS_INLINE Token Parser::consume()
  49. {
  50. auto old_token = m_parser_state.current_token;
  51. m_parser_state.current_token = m_parser_state.lexer.next();
  52. return old_token;
  53. }
  54. ALWAYS_INLINE Token Parser::consume(TokenType type, Error error)
  55. {
  56. if (m_parser_state.current_token.type() != type) {
  57. set_error(error);
  58. dbg() << "[PARSER] Error: Unexpected token " << m_parser_state.current_token.name() << ". Expected: " << Token::name(type);
  59. }
  60. return consume();
  61. }
  62. ALWAYS_INLINE bool Parser::consume(const String& str)
  63. {
  64. size_t potentially_go_back { 1 };
  65. for (auto ch : str) {
  66. if (match(TokenType::Char)) {
  67. if (m_parser_state.current_token.value()[0] != ch) {
  68. m_parser_state.lexer.back(potentially_go_back);
  69. m_parser_state.current_token = m_parser_state.lexer.next();
  70. return false;
  71. }
  72. } else {
  73. m_parser_state.lexer.back(potentially_go_back);
  74. m_parser_state.current_token = m_parser_state.lexer.next();
  75. return false;
  76. }
  77. consume(TokenType::Char, Error::NoError);
  78. ++potentially_go_back;
  79. }
  80. return true;
  81. }
  82. ALWAYS_INLINE bool Parser::try_skip(StringView str)
  83. {
  84. if (str.starts_with(m_parser_state.current_token.value()))
  85. str = str.substring_view(m_parser_state.current_token.value().length(), str.length() - m_parser_state.current_token.value().length());
  86. else
  87. return false;
  88. size_t potentially_go_back { 0 };
  89. for (auto ch : str) {
  90. if (!m_parser_state.lexer.try_skip(ch)) {
  91. m_parser_state.lexer.back(potentially_go_back);
  92. return false;
  93. }
  94. ++potentially_go_back;
  95. }
  96. m_parser_state.current_token = m_parser_state.lexer.next();
  97. return true;
  98. }
  99. ALWAYS_INLINE void Parser::reset()
  100. {
  101. m_parser_state.bytecode.clear();
  102. m_parser_state.lexer.reset();
  103. m_parser_state.current_token = m_parser_state.lexer.next();
  104. m_parser_state.error = Error::NoError;
  105. m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) };
  106. m_parser_state.regex_options = {};
  107. }
  108. Parser::Result Parser::parse(Optional<AllOptions> regex_options)
  109. {
  110. reset();
  111. if (regex_options.has_value())
  112. m_parser_state.regex_options = regex_options.value();
  113. if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum))
  114. consume(TokenType::Eof, Error::InvalidPattern);
  115. else
  116. set_error(Error::InvalidPattern);
  117. #ifdef REGEX_DEBUG
  118. fprintf(stderr, "[PARSER] Produced bytecode with %lu entries (opcodes + arguments)\n", m_parser_state.bytecode.size());
  119. #endif
  120. return {
  121. move(m_parser_state.bytecode),
  122. move(m_parser_state.capture_groups_count),
  123. move(m_parser_state.named_capture_groups_count),
  124. move(m_parser_state.match_length_minimum),
  125. move(m_parser_state.error),
  126. move(m_parser_state.error_token)
  127. };
  128. }
  129. // =============================
  130. // PosixExtended Parser
  131. // =============================
  132. bool PosixExtendedParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
  133. {
  134. return parse_root(stack, match_length_minimum);
  135. }
  136. ALWAYS_INLINE bool PosixExtendedParser::match_repetition_symbol()
  137. {
  138. auto type = m_parser_state.current_token.type();
  139. return (type == TokenType::Asterisk
  140. || type == TokenType::Plus
  141. || type == TokenType::Questionmark
  142. || type == TokenType::LeftCurly);
  143. }
  144. ALWAYS_INLINE bool PosixExtendedParser::match_ordinary_characters()
  145. {
  146. // NOTE: This method must not be called during bracket and repetition parsing!
  147. // FIXME: Add assertion for that?
  148. auto type = m_parser_state.current_token.type();
  149. return (type == TokenType::Char
  150. || type == TokenType::Comma
  151. || type == TokenType::Slash
  152. || type == TokenType::EqualSign
  153. || type == TokenType::HyphenMinus
  154. || type == TokenType::Colon);
  155. }
  156. ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& bytecode_to_repeat, size_t& match_length_minimum)
  157. {
  158. if (match(TokenType::LeftCurly)) {
  159. consume();
  160. StringBuilder number_builder;
  161. while (match(TokenType::Char)) {
  162. number_builder.append(consume().value());
  163. }
  164. auto maybe_minimum = number_builder.build().to_uint();
  165. if (!maybe_minimum.has_value())
  166. return set_error(Error::InvalidBraceContent);
  167. auto minimum = maybe_minimum.value();
  168. match_length_minimum *= minimum;
  169. if (match(TokenType::Comma)) {
  170. consume();
  171. } else {
  172. ByteCode bytecode;
  173. bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
  174. bytecode_to_repeat = move(bytecode);
  175. consume(TokenType::RightCurly, Error::MismatchingBrace);
  176. return !has_error();
  177. }
  178. Optional<size_t> maybe_maximum {};
  179. number_builder.clear();
  180. while (match(TokenType::Char)) {
  181. number_builder.append(consume().value());
  182. }
  183. if (!number_builder.is_empty()) {
  184. auto value = number_builder.build().to_uint();
  185. if (!value.has_value() || minimum > value.value())
  186. return set_error(Error::InvalidBraceContent);
  187. maybe_maximum = value.value();
  188. }
  189. bytecode_to_repeat.insert_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum);
  190. consume(TokenType::RightCurly, Error::MismatchingBrace);
  191. return !has_error();
  192. } else if (match(TokenType::Plus)) {
  193. consume();
  194. bool nongreedy = match(TokenType::Questionmark);
  195. if (nongreedy)
  196. consume();
  197. // Note: dont touch match_length_minimum, it's already correct
  198. bytecode_to_repeat.insert_bytecode_repetition_min_one(bytecode_to_repeat, !nongreedy);
  199. return !has_error();
  200. } else if (match(TokenType::Asterisk)) {
  201. consume();
  202. match_length_minimum = 0;
  203. bool nongreedy = match(TokenType::Questionmark);
  204. if (nongreedy)
  205. consume();
  206. bytecode_to_repeat.insert_bytecode_repetition_any(bytecode_to_repeat, !nongreedy);
  207. return !has_error();
  208. } else if (match(TokenType::Questionmark)) {
  209. consume();
  210. match_length_minimum = 0;
  211. bool nongreedy = match(TokenType::Questionmark);
  212. if (nongreedy)
  213. consume();
  214. bytecode_to_repeat.insert_bytecode_repetition_zero_or_one(bytecode_to_repeat, !nongreedy);
  215. return !has_error();
  216. }
  217. return false;
  218. }
  219. ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack, size_t& match_length_minimum)
  220. {
  221. Vector<CompareTypeAndValuePair> values;
  222. for (;;) {
  223. if (match(TokenType::HyphenMinus)) {
  224. consume();
  225. if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
  226. // first in the bracket expression
  227. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  228. } else if (match(TokenType::RightBracket)) {
  229. // Last in the bracket expression
  230. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  231. } else if (values.last().type == CharacterCompareType::Char) {
  232. values.append({ CharacterCompareType::RangeExpressionDummy, 0 });
  233. if (match(TokenType::HyphenMinus)) {
  234. consume();
  235. // Valid range, add ordinary character
  236. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  237. }
  238. } else {
  239. return set_error(Error::InvalidRange);
  240. }
  241. } else if (match(TokenType::Char) || match(TokenType::Period) || match(TokenType::Asterisk) || match(TokenType::EscapeSequence) || match(TokenType::Plus)) {
  242. values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() });
  243. } else if (match(TokenType::Circumflex)) {
  244. auto t = consume();
  245. if (values.is_empty())
  246. values.append({ CharacterCompareType::Inverse, 0 });
  247. else
  248. values.append({ CharacterCompareType::Char, (ByteCodeValueType)*t.value().characters_without_null_termination() });
  249. } else if (match(TokenType::LeftBracket)) {
  250. consume();
  251. if (match(TokenType::Period)) {
  252. consume();
  253. // FIXME: Parse collating element, this is needed when we have locale support
  254. // This could have impact on length parameter, I guess.
  255. ASSERT_NOT_REACHED();
  256. consume(TokenType::Period, Error::InvalidCollationElement);
  257. consume(TokenType::RightBracket, Error::MismatchingBracket);
  258. } else if (match(TokenType::EqualSign)) {
  259. consume();
  260. // FIXME: Parse collating element, this is needed when we have locale support
  261. // This could have impact on length parameter, I guess.
  262. ASSERT_NOT_REACHED();
  263. consume(TokenType::EqualSign, Error::InvalidCollationElement);
  264. consume(TokenType::RightBracket, Error::MismatchingBracket);
  265. } else if (match(TokenType::Colon)) {
  266. consume();
  267. CharClass ch_class;
  268. // parse character class
  269. if (match(TokenType::Char)) {
  270. if (consume("alnum"))
  271. ch_class = CharClass::Alnum;
  272. else if (consume("alpha"))
  273. ch_class = CharClass::Alpha;
  274. else if (consume("blank"))
  275. ch_class = CharClass::Blank;
  276. else if (consume("cntrl"))
  277. ch_class = CharClass::Cntrl;
  278. else if (consume("digit"))
  279. ch_class = CharClass::Digit;
  280. else if (consume("graph"))
  281. ch_class = CharClass::Graph;
  282. else if (consume("lower"))
  283. ch_class = CharClass::Lower;
  284. else if (consume("print"))
  285. ch_class = CharClass::Print;
  286. else if (consume("punct"))
  287. ch_class = CharClass::Punct;
  288. else if (consume("space"))
  289. ch_class = CharClass::Space;
  290. else if (consume("upper"))
  291. ch_class = CharClass::Upper;
  292. else if (consume("xdigit"))
  293. ch_class = CharClass::Xdigit;
  294. else
  295. return set_error(Error::InvalidCharacterClass);
  296. values.append({ CharacterCompareType::CharClass, (ByteCodeValueType)ch_class });
  297. } else
  298. return set_error(Error::InvalidCharacterClass);
  299. // FIXME: we do not support locale specific character classes until locales are implemented
  300. consume(TokenType::Colon, Error::InvalidCharacterClass);
  301. consume(TokenType::RightBracket, Error::MismatchingBracket);
  302. } else {
  303. return set_error(Error::MismatchingBracket);
  304. }
  305. } else if (match(TokenType::RightBracket)) {
  306. if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
  307. // handle bracket as ordinary character
  308. values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() });
  309. } else {
  310. // closing bracket expression
  311. break;
  312. }
  313. } else
  314. // nothing matched, this is a failure, as at least the closing bracket must match...
  315. return set_error(Error::MismatchingBracket);
  316. // check if range expression has to be completed...
  317. if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) {
  318. if (values.last().type != CharacterCompareType::Char)
  319. return set_error(Error::InvalidRange);
  320. auto value2 = values.take_last();
  321. values.take_last(); // RangeExpressionDummy
  322. auto value1 = values.take_last();
  323. values.append({ CharacterCompareType::CharRange, static_cast<ByteCodeValueType>(CharRange { (u32)value1.value, (u32)value2.value }) });
  324. }
  325. }
  326. if (values.size())
  327. match_length_minimum = 1;
  328. if (values.first().type == CharacterCompareType::Inverse)
  329. match_length_minimum = 0;
  330. stack.insert_bytecode_compare_values(move(values));
  331. return !has_error();
  332. }
  333. ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, size_t& match_length_minimum)
  334. {
  335. ByteCode bytecode;
  336. size_t length = 0;
  337. bool should_parse_repetition_symbol { false };
  338. for (;;) {
  339. if (match_ordinary_characters()) {
  340. Token start_token = m_parser_state.current_token;
  341. Token last_token = m_parser_state.current_token;
  342. for (;;) {
  343. if (!match_ordinary_characters())
  344. break;
  345. ++length;
  346. last_token = consume();
  347. }
  348. if (length > 1) {
  349. // last character is inserted into 'bytecode' for duplication symbol handling
  350. auto new_length = length - ((match_repetition_symbol() && length > 1) ? 1 : 0);
  351. stack.insert_bytecode_compare_string(start_token.value(), new_length);
  352. }
  353. if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol
  354. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)last_token.value().characters_without_null_termination()[0] } });
  355. should_parse_repetition_symbol = true;
  356. break;
  357. }
  358. if (match_repetition_symbol())
  359. return set_error(Error::InvalidRepetitionMarker);
  360. if (match(TokenType::Period)) {
  361. length = 1;
  362. consume();
  363. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
  364. should_parse_repetition_symbol = true;
  365. break;
  366. }
  367. if (match(TokenType::EscapeSequence)) {
  368. length = 1;
  369. Token t = consume();
  370. #ifdef REGEX_DEBUG
  371. printf("[PARSER] EscapeSequence with substring %s\n", String(t.value()).characters());
  372. #endif
  373. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u32)t.value().characters_without_null_termination()[1] } });
  374. should_parse_repetition_symbol = true;
  375. break;
  376. }
  377. if (match(TokenType::LeftBracket)) {
  378. consume();
  379. ByteCode sub_ops;
  380. if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size())
  381. return set_error(Error::InvalidBracketContent);
  382. bytecode.append(move(sub_ops));
  383. consume(TokenType::RightBracket, Error::MismatchingBracket);
  384. should_parse_repetition_symbol = true;
  385. break;
  386. }
  387. if (match(TokenType::RightBracket)) {
  388. return set_error(Error::MismatchingBracket);
  389. }
  390. if (match(TokenType::RightCurly)) {
  391. return set_error(Error::MismatchingBrace);
  392. }
  393. if (match(TokenType::Circumflex)) {
  394. consume();
  395. bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin);
  396. break;
  397. }
  398. if (match(TokenType::Dollar)) {
  399. consume();
  400. bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd);
  401. break;
  402. }
  403. if (match(TokenType::RightParen))
  404. return false;
  405. if (match(TokenType::LeftParen)) {
  406. consume();
  407. Optional<StringView> capture_group_name;
  408. bool prevent_capture_group = false;
  409. if (match(TokenType::Questionmark)) {
  410. consume();
  411. if (match(TokenType::Colon)) {
  412. consume();
  413. prevent_capture_group = true;
  414. } else if (consume("<")) { // named capturing group
  415. Token start_token = m_parser_state.current_token;
  416. Token last_token = m_parser_state.current_token;
  417. size_t capture_group_name_length = 0;
  418. for (;;) {
  419. if (!match_ordinary_characters())
  420. return set_error(Error::InvalidNameForCaptureGroup);
  421. if (match(TokenType::Char) && m_parser_state.current_token.value()[0] == '>') {
  422. consume();
  423. break;
  424. }
  425. ++capture_group_name_length;
  426. last_token = consume();
  427. }
  428. capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length);
  429. } else if (match(TokenType::EqualSign)) { // positive lookahead
  430. consume();
  431. ASSERT_NOT_REACHED();
  432. } else if (consume("!")) { // negative lookahead
  433. ASSERT_NOT_REACHED();
  434. } else if (consume("<")) {
  435. if (match(TokenType::EqualSign)) { // positive lookbehind
  436. consume();
  437. ASSERT_NOT_REACHED();
  438. }
  439. if (consume("!")) // negative lookbehind
  440. ASSERT_NOT_REACHED();
  441. } else {
  442. return set_error(Error::InvalidRepetitionMarker);
  443. }
  444. }
  445. if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
  446. if (capture_group_name.has_value())
  447. bytecode.insert_bytecode_group_capture_left(capture_group_name.value());
  448. else
  449. bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count);
  450. }
  451. ByteCode capture_group_bytecode;
  452. if (!parse_root(capture_group_bytecode, length))
  453. return set_error(Error::InvalidPattern);
  454. bytecode.append(move(capture_group_bytecode));
  455. consume(TokenType::RightParen, Error::MismatchingParen);
  456. if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
  457. if (capture_group_name.has_value()) {
  458. bytecode.insert_bytecode_group_capture_right(capture_group_name.value());
  459. ++m_parser_state.named_capture_groups_count;
  460. } else {
  461. bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count);
  462. ++m_parser_state.capture_groups_count;
  463. }
  464. }
  465. should_parse_repetition_symbol = true;
  466. break;
  467. }
  468. return false;
  469. }
  470. if (match_repetition_symbol()) {
  471. if (should_parse_repetition_symbol)
  472. parse_repetition_symbol(bytecode, length);
  473. else
  474. return set_error(Error::InvalidRepetitionMarker);
  475. }
  476. stack.append(move(bytecode));
  477. match_length_minimum += length;
  478. return true;
  479. }
  480. bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minimum)
  481. {
  482. ByteCode bytecode_left;
  483. size_t match_length_minimum_left { 0 };
  484. if (match_repetition_symbol())
  485. return set_error(Error::InvalidRepetitionMarker);
  486. for (;;) {
  487. if (!parse_sub_expression(bytecode_left, match_length_minimum_left))
  488. break;
  489. if (match(TokenType::Pipe)) {
  490. consume();
  491. ByteCode bytecode_right;
  492. size_t match_length_minimum_right { 0 };
  493. if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty())
  494. return set_error(Error::InvalidPattern);
  495. ByteCode new_bytecode;
  496. new_bytecode.insert_bytecode_alternation(move(bytecode_left), move(bytecode_right));
  497. bytecode_left = move(new_bytecode);
  498. match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left);
  499. }
  500. }
  501. if (bytecode_left.is_empty())
  502. set_error(Error::EmptySubExpression);
  503. stack.append(move(bytecode_left));
  504. match_length_minimum = match_length_minimum_left;
  505. return !has_error();
  506. }
  507. // =============================
  508. // ECMA262 Parser
  509. // =============================
  510. bool ECMA262Parser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
  511. {
  512. if (m_parser_state.regex_options & AllFlags::Unicode) {
  513. return parse_pattern(stack, match_length_minimum, true, true);
  514. } else {
  515. ByteCode new_stack;
  516. size_t new_match_length = 0;
  517. auto res = parse_pattern(new_stack, new_match_length, false, false);
  518. if (m_parser_state.named_capture_groups_count > 0) {
  519. reset();
  520. return parse_pattern(stack, match_length_minimum, false, true);
  521. }
  522. if (!res)
  523. return false;
  524. stack.append(new_stack);
  525. match_length_minimum = new_match_length;
  526. return res;
  527. }
  528. }
  529. bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  530. {
  531. return parse_disjunction(stack, match_length_minimum, unicode, named);
  532. }
  533. bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  534. {
  535. ByteCode left_alternative_stack;
  536. size_t left_alternative_min_length = 0;
  537. auto alt_ok = parse_alternative(left_alternative_stack, left_alternative_min_length, unicode, named);
  538. if (!alt_ok)
  539. return false;
  540. if (!match(TokenType::Pipe)) {
  541. stack.append(left_alternative_stack);
  542. match_length_minimum = left_alternative_min_length;
  543. return alt_ok;
  544. }
  545. consume();
  546. ByteCode right_alternative_stack;
  547. size_t right_alternative_min_length = 0;
  548. auto continuation_ok = parse_disjunction(right_alternative_stack, right_alternative_min_length, unicode, named);
  549. if (!continuation_ok)
  550. return false;
  551. stack.insert_bytecode_alternation(move(left_alternative_stack), move(right_alternative_stack));
  552. match_length_minimum = min(left_alternative_min_length, right_alternative_min_length);
  553. return continuation_ok;
  554. }
  555. bool ECMA262Parser::parse_alternative(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  556. {
  557. for (;;) {
  558. if (match(TokenType::Eof))
  559. return true;
  560. if (parse_term(stack, match_length_minimum, unicode, named))
  561. continue;
  562. return !has_error();
  563. }
  564. }
  565. bool ECMA262Parser::parse_term(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  566. {
  567. if (parse_assertion(stack, match_length_minimum, unicode, named))
  568. return true;
  569. ByteCode atom_stack;
  570. size_t minimum_atom_length = 0;
  571. if (!parse_atom(atom_stack, minimum_atom_length, unicode, named))
  572. return false;
  573. if (!parse_quantifier(atom_stack, minimum_atom_length, unicode, named))
  574. return false;
  575. stack.append(move(atom_stack));
  576. match_length_minimum += minimum_atom_length;
  577. return true;
  578. }
  579. bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& match_length_minimum, bool unicode, bool named)
  580. {
  581. if (match(TokenType::Circumflex)) {
  582. consume();
  583. stack.empend((ByteCodeValueType)OpCodeId::CheckBegin);
  584. return true;
  585. }
  586. if (match(TokenType::Dollar)) {
  587. consume();
  588. stack.empend((ByteCodeValueType)OpCodeId::CheckEnd);
  589. return true;
  590. }
  591. if (try_skip("\\b")) {
  592. stack.insert_bytecode_check_boundary(BoundaryCheckType::Word);
  593. return true;
  594. }
  595. if (try_skip("\\B")) {
  596. stack.insert_bytecode_check_boundary(BoundaryCheckType::NonWord);
  597. return true;
  598. }
  599. if (match(TokenType::LeftParen)) {
  600. if (!try_skip("(?"))
  601. return false;
  602. ByteCode assertion_stack;
  603. size_t length_dummy = 0;
  604. auto parse_inner_disjunction = [&] {
  605. auto disjunction_ok = parse_disjunction(assertion_stack, length_dummy, unicode, named);
  606. if (!disjunction_ok)
  607. return false;
  608. consume(TokenType::RightParen, Error::MismatchingParen);
  609. return true;
  610. };
  611. if (try_skip("=")) {
  612. if (!parse_inner_disjunction())
  613. return false;
  614. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead);
  615. return true;
  616. }
  617. if (try_skip("!")) {
  618. if (!parse_inner_disjunction())
  619. return false;
  620. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
  621. return true;
  622. }
  623. if (try_skip("<=")) {
  624. if (!parse_inner_disjunction())
  625. return false;
  626. // FIXME: Somehow ensure that this assertion regexp has a fixed length.
  627. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookBehind, length_dummy);
  628. return true;
  629. }
  630. if (try_skip("<!")) {
  631. if (!parse_inner_disjunction())
  632. return false;
  633. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookBehind, length_dummy);
  634. return true;
  635. }
  636. // If none of these matched, put the '(?' back.
  637. m_parser_state.lexer.back(3);
  638. m_parser_state.current_token = m_parser_state.lexer.next();
  639. return false;
  640. }
  641. return false;
  642. }
  643. Optional<unsigned> ECMA262Parser::read_digits(ECMA262Parser::ReadDigitsInitialZeroState initial_zero, ECMA262Parser::ReadDigitFollowPolicy follow_policy, bool hex, int max_count)
  644. {
  645. if (!match(TokenType::Char))
  646. return {};
  647. if (initial_zero != ReadDigitsInitialZeroState::Allow) {
  648. auto has_initial_zero = m_parser_state.current_token.value() == "0";
  649. if (initial_zero == ReadDigitsInitialZeroState::Disallow && has_initial_zero)
  650. return {};
  651. if (initial_zero == ReadDigitsInitialZeroState::Require && !has_initial_zero)
  652. return {};
  653. }
  654. int count = 0;
  655. size_t offset = 0;
  656. while (match(TokenType::Char)) {
  657. auto c = m_parser_state.current_token.value();
  658. if (follow_policy == ReadDigitFollowPolicy::DisallowDigit) {
  659. if (hex && AK::StringUtils::convert_to_uint_from_hex(c).has_value())
  660. break;
  661. if (!hex && c.to_uint().has_value())
  662. break;
  663. }
  664. if (follow_policy == ReadDigitFollowPolicy::DisallowNonDigit) {
  665. if (hex && !AK::StringUtils::convert_to_uint_from_hex(c).has_value())
  666. break;
  667. if (!hex && !c.to_uint().has_value())
  668. break;
  669. }
  670. if (max_count > 0 && count >= max_count)
  671. break;
  672. offset += consume().value().length();
  673. ++count;
  674. }
  675. auto str = m_parser_state.lexer.slice_back(offset);
  676. if (hex)
  677. return AK::StringUtils::convert_to_uint_from_hex(str);
  678. return str.to_uint();
  679. }
  680. bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool, bool)
  681. {
  682. enum class Repetition {
  683. OneOrMore,
  684. ZeroOrMore,
  685. Optional,
  686. Explicit,
  687. None,
  688. } repetition_mark { Repetition::None };
  689. bool ungreedy = false;
  690. Optional<size_t> repeat_min, repeat_max;
  691. if (match(TokenType::Asterisk)) {
  692. consume();
  693. repetition_mark = Repetition::ZeroOrMore;
  694. } else if (match(TokenType::Plus)) {
  695. consume();
  696. repetition_mark = Repetition::OneOrMore;
  697. } else if (match(TokenType::Questionmark)) {
  698. consume();
  699. repetition_mark = Repetition::Optional;
  700. } else if (match(TokenType::LeftCurly)) {
  701. consume();
  702. repetition_mark = Repetition::Explicit;
  703. auto low_bound = read_digits();
  704. if (!low_bound.has_value()) {
  705. set_error(Error::InvalidBraceContent);
  706. return false;
  707. }
  708. repeat_min = low_bound.value();
  709. if (match(TokenType::Comma)) {
  710. consume();
  711. auto high_bound = read_digits();
  712. if (!high_bound.has_value()) {
  713. set_error(Error::InvalidBraceContent);
  714. return false;
  715. }
  716. repeat_max = high_bound.value();
  717. }
  718. if (!match(TokenType::RightCurly)) {
  719. set_error(Error::MismatchingBrace);
  720. return false;
  721. }
  722. consume();
  723. if (repeat_max.has_value()) {
  724. if (repeat_min.value() > repeat_max.value())
  725. set_error(Error::InvalidBraceContent);
  726. }
  727. } else {
  728. return true;
  729. }
  730. if (match(TokenType::Questionmark)) {
  731. if (repetition_mark == Repetition::Explicit) {
  732. set_error(Error::InvalidRepetitionMarker);
  733. return false;
  734. }
  735. consume();
  736. ungreedy = true;
  737. }
  738. ByteCode new_bytecode;
  739. switch (repetition_mark) {
  740. case Repetition::OneOrMore:
  741. new_bytecode.insert_bytecode_repetition_min_one(stack, !ungreedy);
  742. break;
  743. case Repetition::ZeroOrMore:
  744. new_bytecode.insert_bytecode_repetition_any(stack, !ungreedy);
  745. match_length_minimum = 0;
  746. break;
  747. case Repetition::Optional:
  748. new_bytecode.insert_bytecode_repetition_zero_or_one(stack, !ungreedy);
  749. match_length_minimum = 0;
  750. break;
  751. case Repetition::Explicit:
  752. new_bytecode.insert_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max);
  753. match_length_minimum *= repeat_min.value();
  754. break;
  755. case Repetition::None:
  756. ASSERT_NOT_REACHED();
  757. }
  758. return true;
  759. }
  760. bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  761. {
  762. if (try_skip("\\")) {
  763. // AtomEscape.
  764. return parse_atom_escape(stack, match_length_minimum, unicode, named);
  765. }
  766. if (match(TokenType::LeftBracket)) {
  767. // Character class.
  768. return parse_character_class(stack, match_length_minimum, unicode, named);
  769. }
  770. if (match(TokenType::LeftParen)) {
  771. // Non-capturing group, or a capture group.
  772. return parse_capture_group(stack, match_length_minimum, unicode, named);
  773. }
  774. if (match(TokenType::Period)) {
  775. consume();
  776. match_length_minimum += 1;
  777. stack.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
  778. return true;
  779. }
  780. if (match(TokenType::Circumflex) || match(TokenType::Dollar) || match(TokenType::RightBracket)
  781. || match(TokenType::RightCurly) || match(TokenType::RightParen) || match(TokenType::Pipe)
  782. || match(TokenType::Plus) || match(TokenType::Asterisk) || match(TokenType::Questionmark)) {
  783. return false;
  784. }
  785. if (match(TokenType::Char)) {
  786. auto token = consume().value();
  787. match_length_minimum += 1;
  788. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token[0] } });
  789. return true;
  790. }
  791. set_error(Error::InvalidPattern);
  792. return false;
  793. }
  794. bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  795. {
  796. if (auto escape = read_digits(ReadDigitsInitialZeroState::Disallow, ReadDigitFollowPolicy::DisallowNonDigit); escape.has_value()) {
  797. auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
  798. if (!maybe_length.has_value()) {
  799. set_error(Error::InvalidNumber);
  800. return false;
  801. }
  802. match_length_minimum += maybe_length.value();
  803. stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
  804. return true;
  805. }
  806. // CharacterEscape > ControlEscape
  807. if (try_skip("f")) {
  808. match_length_minimum += 1;
  809. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } });
  810. return true;
  811. }
  812. if (try_skip("n")) {
  813. match_length_minimum += 1;
  814. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } });
  815. return true;
  816. }
  817. if (try_skip("r")) {
  818. match_length_minimum += 1;
  819. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } });
  820. return true;
  821. }
  822. if (try_skip("t")) {
  823. match_length_minimum += 1;
  824. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } });
  825. return true;
  826. }
  827. if (try_skip("v")) {
  828. match_length_minimum += 1;
  829. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } });
  830. return true;
  831. }
  832. // CharacterEscape > ControlLetter
  833. if (try_skip("c")) {
  834. for (auto c = 'A'; c <= 'z'; ++c) {
  835. if (try_skip({ &c, 1 })) {
  836. match_length_minimum += 1;
  837. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c & 0x3f) } });
  838. return true;
  839. }
  840. }
  841. }
  842. // '\0'
  843. if (read_digits(ReadDigitsInitialZeroState::Require, ReadDigitFollowPolicy::DisallowDigit).has_value()) {
  844. match_length_minimum += 1;
  845. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
  846. return true;
  847. }
  848. // HexEscape
  849. if (try_skip("x")) {
  850. if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value()) {
  851. match_length_minimum += 1;
  852. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
  853. return true;
  854. }
  855. }
  856. if (try_skip("u")) {
  857. // FIXME: Implement this path, unicode escape sequence.
  858. TODO();
  859. }
  860. // IdentityEscape
  861. if (match(TokenType::EscapeSequence)) {
  862. match_length_minimum += 1;
  863. auto token = consume().value();
  864. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token[token.length() - 1] } });
  865. return true;
  866. }
  867. if (named && try_skip("k")) {
  868. auto name = read_capture_group_specifier(true);
  869. if (name.is_empty()) {
  870. set_error(Error::InvalidNameForCaptureGroup);
  871. return false;
  872. }
  873. auto maybe_length = m_parser_state.named_capture_group_minimum_lengths.get(name);
  874. if (!maybe_length.has_value()) {
  875. set_error(Error::InvalidNameForCaptureGroup);
  876. return false;
  877. }
  878. match_length_minimum += maybe_length.value();
  879. stack.insert_bytecode_compare_named_reference(name, name.length());
  880. return true;
  881. }
  882. if (unicode) {
  883. if (try_skip("p{")) {
  884. // FIXME: Implement this path, Unicode property match.
  885. TODO();
  886. }
  887. if (try_skip("P{")) {
  888. // FIXME: Implement this path, Unicode property match.
  889. TODO();
  890. }
  891. }
  892. bool negate = false;
  893. auto ch = parse_character_class_escape(negate);
  894. if (!ch.has_value()) {
  895. set_error(Error::InvalidCharacterClass);
  896. return false;
  897. }
  898. Vector<CompareTypeAndValuePair> compares;
  899. if (negate)
  900. compares.empend(CharacterCompareType::Inverse, 0);
  901. compares.empend(CharacterCompareType::CharClass, (ByteCodeValueType)ch.value());
  902. match_length_minimum += 1;
  903. stack.insert_bytecode_compare_values(move(compares));
  904. return true;
  905. }
  906. Optional<CharClass> ECMA262Parser::parse_character_class_escape(bool& negate, bool expect_backslash)
  907. {
  908. if (expect_backslash && !try_skip("\\"))
  909. return {};
  910. // CharacterClassEscape
  911. CharClass ch_class;
  912. if (try_skip("d")) {
  913. ch_class = CharClass::Digit;
  914. } else if (try_skip("D")) {
  915. ch_class = CharClass::Digit;
  916. negate = true;
  917. } else if (try_skip("s")) {
  918. ch_class = CharClass::Space;
  919. } else if (try_skip("S")) {
  920. ch_class = CharClass::Space;
  921. negate = true;
  922. } else if (try_skip("w")) {
  923. ch_class = CharClass::Word;
  924. } else if (try_skip("W")) {
  925. ch_class = CharClass::Word;
  926. negate = true;
  927. } else {
  928. return {};
  929. }
  930. return ch_class;
  931. }
  932. bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool)
  933. {
  934. consume(TokenType::LeftBracket, Error::InvalidPattern);
  935. Vector<CompareTypeAndValuePair> compares;
  936. if (match(TokenType::Circumflex)) {
  937. // Negated charclass
  938. consume();
  939. compares.empend(CharacterCompareType::Inverse, 0);
  940. }
  941. if (match(TokenType::RightBracket)) {
  942. consume();
  943. return true;
  944. }
  945. if (!parse_nonempty_class_ranges(compares, unicode))
  946. return false;
  947. match_length_minimum += 1;
  948. stack.insert_bytecode_compare_values(move(compares));
  949. return true;
  950. }
  951. struct CharClassRangeElement {
  952. union {
  953. CharClass character_class;
  954. u32 code_point { 0 };
  955. };
  956. bool is_negated { false };
  957. bool is_character_class { false };
  958. };
  959. bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
  960. {
  961. auto read_class_atom_no_dash = [&]() -> Optional<CharClassRangeElement> {
  962. if (match(TokenType::EscapeSequence)) {
  963. auto token = consume().value();
  964. return { { .code_point = (u32)token[1], .is_character_class = false } };
  965. }
  966. if (try_skip("\\")) {
  967. if (try_skip("f"))
  968. return { { .code_point = '\f', .is_character_class = false } };
  969. if (try_skip("n"))
  970. return { { .code_point = '\n', .is_character_class = false } };
  971. if (try_skip("r"))
  972. return { { .code_point = '\r', .is_character_class = false } };
  973. if (try_skip("t"))
  974. return { { .code_point = '\t', .is_character_class = false } };
  975. if (try_skip("v"))
  976. return { { .code_point = '\v', .is_character_class = false } };
  977. if (try_skip("b"))
  978. return { { .code_point = '\b', .is_character_class = false } };
  979. // CharacterEscape > ControlLetter
  980. if (try_skip("c")) {
  981. for (auto c = 'A'; c <= 'z'; ++c) {
  982. if (try_skip({ &c, 1 }))
  983. return { { .code_point = (u32)(c & 0x3f), .is_character_class = false } };
  984. }
  985. }
  986. // '\0'
  987. if (read_digits(ReadDigitsInitialZeroState::Require, ReadDigitFollowPolicy::DisallowDigit).has_value())
  988. return { { .code_point = 0, .is_character_class = false } };
  989. // HexEscape
  990. if (try_skip("x")) {
  991. if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value())
  992. return { { .code_point = hex_escape.value(), .is_character_class = false } };
  993. }
  994. if (try_skip("u")) {
  995. // FIXME: Implement this path, unicode escape sequence.
  996. TODO();
  997. }
  998. if (unicode) {
  999. if (try_skip("-"))
  1000. return { { .code_point = '-', .is_character_class = false } };
  1001. }
  1002. if (try_skip("p{") || try_skip("P{")) {
  1003. // FIXME: Implement these; unicode properties.
  1004. TODO();
  1005. }
  1006. if (try_skip("d"))
  1007. return { { .character_class = CharClass::Digit, .is_character_class = true } };
  1008. if (try_skip("s"))
  1009. return { { .character_class = CharClass::Space, .is_character_class = true } };
  1010. if (try_skip("w"))
  1011. return { { .character_class = CharClass::Word, .is_character_class = true } };
  1012. if (try_skip("D"))
  1013. return { { .character_class = CharClass::Digit, .is_negated = true, .is_character_class = true } };
  1014. if (try_skip("S"))
  1015. return { { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
  1016. if (try_skip("W"))
  1017. return { { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
  1018. }
  1019. if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
  1020. return {};
  1021. auto token = consume(TokenType::Char, Error::InvalidCharacterClass);
  1022. return { { .code_point = (u32)token.value()[0], .is_character_class = false } };
  1023. };
  1024. auto read_class_atom = [&]() -> Optional<CharClassRangeElement> {
  1025. if (match(TokenType::HyphenMinus)) {
  1026. consume();
  1027. return { { .code_point = '-', .is_character_class = false } };
  1028. }
  1029. return read_class_atom_no_dash();
  1030. };
  1031. while (!match(TokenType::RightBracket)) {
  1032. if (match(TokenType::Eof)) {
  1033. set_error(Error::MismatchingBracket);
  1034. return false;
  1035. }
  1036. auto first_atom = read_class_atom();
  1037. if (!first_atom.has_value())
  1038. return false;
  1039. if (match(TokenType::HyphenMinus)) {
  1040. consume();
  1041. auto second_atom = read_class_atom();
  1042. if (!second_atom.has_value())
  1043. return false;
  1044. if (first_atom.value().is_character_class || second_atom.value().is_character_class) {
  1045. set_error(Error::InvalidRange);
  1046. return false;
  1047. }
  1048. if (first_atom.value().code_point > second_atom.value().code_point) {
  1049. set_error(Error::InvalidRange);
  1050. return false;
  1051. }
  1052. ASSERT(!first_atom.value().is_negated);
  1053. ASSERT(!second_atom.value().is_negated);
  1054. ranges.empend(CharacterCompareType::CharRange, CharRange { first_atom.value().code_point, second_atom.value().code_point });
  1055. continue;
  1056. }
  1057. auto atom = first_atom.value();
  1058. if (atom.is_character_class) {
  1059. if (atom.is_negated)
  1060. ranges.empend(CharacterCompareType::TemporaryInverse, 0);
  1061. ranges.empend(CharacterCompareType::CharClass, (ByteCodeValueType)first_atom.value().character_class);
  1062. } else {
  1063. ASSERT(!atom.is_negated);
  1064. ranges.empend(CharacterCompareType::Char, first_atom.value().code_point);
  1065. }
  1066. }
  1067. consume(TokenType::RightBracket, Error::MismatchingBracket);
  1068. return true;
  1069. }
  1070. StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
  1071. {
  1072. if (take_starting_angle_bracket && !consume("<"))
  1073. return {};
  1074. size_t offset = 0;
  1075. while (match(TokenType::Char)) {
  1076. auto c = m_parser_state.current_token.value();
  1077. if (c == ">")
  1078. break;
  1079. offset += consume().value().length();
  1080. }
  1081. auto name = m_parser_state.lexer.slice_back(offset);
  1082. if (!consume(">") || name.is_empty())
  1083. set_error(Error::InvalidNameForCaptureGroup);
  1084. return name;
  1085. }
  1086. bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  1087. {
  1088. consume(TokenType::LeftParen, Error::InvalidPattern);
  1089. if (match(TokenType::Questionmark)) {
  1090. // Non-capturing group or group with specifier.
  1091. consume();
  1092. if (match(TokenType::Colon)) {
  1093. consume();
  1094. ByteCode noncapture_group_bytecode;
  1095. size_t length = 0;
  1096. if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
  1097. return set_error(Error::InvalidPattern);
  1098. consume(TokenType::RightParen, Error::MismatchingParen);
  1099. stack.append(move(noncapture_group_bytecode));
  1100. match_length_minimum += length;
  1101. return true;
  1102. }
  1103. if (consume("<")) {
  1104. ++m_parser_state.named_capture_groups_count;
  1105. auto name = read_capture_group_specifier();
  1106. if (name.is_empty()) {
  1107. set_error(Error::InvalidNameForCaptureGroup);
  1108. return false;
  1109. }
  1110. ByteCode capture_group_bytecode;
  1111. size_t length = 0;
  1112. if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
  1113. return set_error(Error::InvalidPattern);
  1114. consume(TokenType::RightParen, Error::MismatchingParen);
  1115. stack.insert_bytecode_group_capture_left(name);
  1116. stack.append(move(capture_group_bytecode));
  1117. stack.insert_bytecode_group_capture_right(name);
  1118. match_length_minimum += length;
  1119. m_parser_state.named_capture_group_minimum_lengths.set(name, length);
  1120. return true;
  1121. }
  1122. set_error(Error::InvalidCaptureGroup);
  1123. return false;
  1124. }
  1125. auto group_index = ++m_parser_state.capture_groups_count;
  1126. stack.insert_bytecode_group_capture_left(group_index);
  1127. ByteCode capture_group_bytecode;
  1128. size_t length = 0;
  1129. if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
  1130. return set_error(Error::InvalidPattern);
  1131. stack.append(move(capture_group_bytecode));
  1132. m_parser_state.capture_group_minimum_lengths.set(group_index, length);
  1133. consume(TokenType::RightParen, Error::MismatchingParen);
  1134. stack.insert_bytecode_group_capture_right(group_index);
  1135. match_length_minimum += length;
  1136. return true;
  1137. }
  1138. }