RegexParser.cpp 62 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. * Copyright (c) 2020-2021, the SerenityOS developers.
  4. * All rights reserved.
  5. *
  6. * Redistribution and use in source and binary forms, with or without
  7. * modification, are permitted provided that the following conditions are met:
  8. *
  9. * 1. Redistributions of source code must retain the above copyright notice, this
  10. * list of conditions and the following disclaimer.
  11. *
  12. * 2. Redistributions in binary form must reproduce the above copyright notice,
  13. * this list of conditions and the following disclaimer in the documentation
  14. * and/or other materials provided with the distribution.
  15. *
  16. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  17. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  18. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  19. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  20. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  21. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  22. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  23. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  24. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  25. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  26. */
  27. #include "RegexParser.h"
  28. #include "RegexDebug.h"
  29. #include <AK/String.h>
  30. #include <AK/StringBuilder.h>
  31. #include <AK/StringUtils.h>
  32. namespace regex {
  33. ALWAYS_INLINE bool Parser::set_error(Error error)
  34. {
  35. if (m_parser_state.error == Error::NoError) {
  36. m_parser_state.error = error;
  37. m_parser_state.error_token = m_parser_state.current_token;
  38. }
  39. return false; // always return false, that eases the API usage (return set_error(...)) :^)
  40. }
  41. ALWAYS_INLINE bool Parser::done() const
  42. {
  43. return match(TokenType::Eof);
  44. }
  45. ALWAYS_INLINE bool Parser::match(TokenType type) const
  46. {
  47. return m_parser_state.current_token.type() == type;
  48. }
  49. ALWAYS_INLINE bool Parser::match(char ch) const
  50. {
  51. return m_parser_state.current_token.type() == TokenType::Char && m_parser_state.current_token.value().length() == 1 && m_parser_state.current_token.value()[0] == ch;
  52. }
  53. ALWAYS_INLINE Token Parser::consume()
  54. {
  55. auto old_token = m_parser_state.current_token;
  56. m_parser_state.current_token = m_parser_state.lexer.next();
  57. return old_token;
  58. }
  59. ALWAYS_INLINE Token Parser::consume(TokenType type, Error error)
  60. {
  61. if (m_parser_state.current_token.type() != type) {
  62. set_error(error);
  63. dbgln("[PARSER] Error: Unexpected token {}. Expected: {}", m_parser_state.current_token.name(), Token::name(type));
  64. }
  65. return consume();
  66. }
  67. ALWAYS_INLINE bool Parser::consume(const String& str)
  68. {
  69. size_t potentially_go_back { 1 };
  70. for (auto ch : str) {
  71. if (match(TokenType::Char)) {
  72. if (m_parser_state.current_token.value()[0] != ch) {
  73. m_parser_state.lexer.back(potentially_go_back);
  74. m_parser_state.current_token = m_parser_state.lexer.next();
  75. return false;
  76. }
  77. } else {
  78. m_parser_state.lexer.back(potentially_go_back);
  79. m_parser_state.current_token = m_parser_state.lexer.next();
  80. return false;
  81. }
  82. consume(TokenType::Char, Error::NoError);
  83. ++potentially_go_back;
  84. }
  85. return true;
  86. }
  87. ALWAYS_INLINE bool Parser::try_skip(StringView str)
  88. {
  89. if (str.starts_with(m_parser_state.current_token.value()))
  90. str = str.substring_view(m_parser_state.current_token.value().length(), str.length() - m_parser_state.current_token.value().length());
  91. else
  92. return false;
  93. size_t potentially_go_back { 0 };
  94. for (auto ch : str) {
  95. if (!m_parser_state.lexer.try_skip(ch)) {
  96. m_parser_state.lexer.back(potentially_go_back);
  97. return false;
  98. }
  99. ++potentially_go_back;
  100. }
  101. m_parser_state.current_token = m_parser_state.lexer.next();
  102. return true;
  103. }
  104. ALWAYS_INLINE bool Parser::lookahead_any(StringView str)
  105. {
  106. for (auto ch : str) {
  107. if (match(ch))
  108. return true;
  109. }
  110. return false;
  111. }
  112. ALWAYS_INLINE char Parser::skip()
  113. {
  114. char ch;
  115. if (m_parser_state.current_token.value().length() == 1) {
  116. ch = m_parser_state.current_token.value()[0];
  117. } else {
  118. m_parser_state.lexer.back(m_parser_state.current_token.value().length());
  119. ch = m_parser_state.lexer.skip();
  120. }
  121. m_parser_state.current_token = m_parser_state.lexer.next();
  122. return ch;
  123. }
  124. ALWAYS_INLINE void Parser::back(size_t count)
  125. {
  126. m_parser_state.lexer.back(count);
  127. m_parser_state.current_token = m_parser_state.lexer.next();
  128. }
  129. ALWAYS_INLINE void Parser::reset()
  130. {
  131. m_parser_state.bytecode.clear();
  132. m_parser_state.lexer.reset();
  133. m_parser_state.current_token = m_parser_state.lexer.next();
  134. m_parser_state.error = Error::NoError;
  135. m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) };
  136. m_parser_state.capture_group_minimum_lengths.clear();
  137. m_parser_state.capture_groups_count = 0;
  138. m_parser_state.named_capture_groups_count = 0;
  139. m_parser_state.named_capture_group_minimum_lengths.clear();
  140. m_parser_state.named_capture_groups.clear();
  141. }
  142. Parser::Result Parser::parse(Optional<AllOptions> regex_options)
  143. {
  144. reset();
  145. if (regex_options.has_value())
  146. m_parser_state.regex_options = regex_options.value();
  147. if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum))
  148. consume(TokenType::Eof, Error::InvalidPattern);
  149. else
  150. set_error(Error::InvalidPattern);
  151. #if REGEX_DEBUG
  152. fprintf(stderr, "[PARSER] Produced bytecode with %lu entries (opcodes + arguments)\n", m_parser_state.bytecode.size());
  153. #endif
  154. return {
  155. move(m_parser_state.bytecode),
  156. move(m_parser_state.capture_groups_count),
  157. move(m_parser_state.named_capture_groups_count),
  158. move(m_parser_state.match_length_minimum),
  159. move(m_parser_state.error),
  160. move(m_parser_state.error_token)
  161. };
  162. }
  163. ALWAYS_INLINE bool Parser::match_ordinary_characters()
  164. {
  165. // NOTE: This method must not be called during bracket and repetition parsing!
  166. // FIXME: Add assertion for that?
  167. auto type = m_parser_state.current_token.type();
  168. return (type == TokenType::Char
  169. || type == TokenType::Comma
  170. || type == TokenType::Slash
  171. || type == TokenType::EqualSign
  172. || type == TokenType::HyphenMinus
  173. || type == TokenType::Colon);
  174. }
  175. // =============================
  176. // PosixExtended Parser
  177. // =============================
  178. bool PosixExtendedParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
  179. {
  180. return parse_root(stack, match_length_minimum);
  181. }
  182. ALWAYS_INLINE bool PosixExtendedParser::match_repetition_symbol()
  183. {
  184. auto type = m_parser_state.current_token.type();
  185. return (type == TokenType::Asterisk
  186. || type == TokenType::Plus
  187. || type == TokenType::Questionmark
  188. || type == TokenType::LeftCurly);
  189. }
  190. ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& bytecode_to_repeat, size_t& match_length_minimum)
  191. {
  192. if (match(TokenType::LeftCurly)) {
  193. consume();
  194. StringBuilder number_builder;
  195. while (match(TokenType::Char)) {
  196. number_builder.append(consume().value());
  197. }
  198. auto maybe_minimum = number_builder.build().to_uint();
  199. if (!maybe_minimum.has_value())
  200. return set_error(Error::InvalidBraceContent);
  201. auto minimum = maybe_minimum.value();
  202. match_length_minimum *= minimum;
  203. if (match(TokenType::Comma)) {
  204. consume();
  205. } else {
  206. ByteCode bytecode;
  207. bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
  208. bytecode_to_repeat = move(bytecode);
  209. consume(TokenType::RightCurly, Error::MismatchingBrace);
  210. return !has_error();
  211. }
  212. Optional<size_t> maybe_maximum {};
  213. number_builder.clear();
  214. while (match(TokenType::Char)) {
  215. number_builder.append(consume().value());
  216. }
  217. if (!number_builder.is_empty()) {
  218. auto value = number_builder.build().to_uint();
  219. if (!value.has_value() || minimum > value.value())
  220. return set_error(Error::InvalidBraceContent);
  221. maybe_maximum = value.value();
  222. }
  223. bytecode_to_repeat.insert_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum);
  224. consume(TokenType::RightCurly, Error::MismatchingBrace);
  225. return !has_error();
  226. } else if (match(TokenType::Plus)) {
  227. consume();
  228. bool nongreedy = match(TokenType::Questionmark);
  229. if (nongreedy)
  230. consume();
  231. // Note: don't touch match_length_minimum, it's already correct
  232. bytecode_to_repeat.insert_bytecode_repetition_min_one(bytecode_to_repeat, !nongreedy);
  233. return !has_error();
  234. } else if (match(TokenType::Asterisk)) {
  235. consume();
  236. match_length_minimum = 0;
  237. bool nongreedy = match(TokenType::Questionmark);
  238. if (nongreedy)
  239. consume();
  240. bytecode_to_repeat.insert_bytecode_repetition_any(bytecode_to_repeat, !nongreedy);
  241. return !has_error();
  242. } else if (match(TokenType::Questionmark)) {
  243. consume();
  244. match_length_minimum = 0;
  245. bool nongreedy = match(TokenType::Questionmark);
  246. if (nongreedy)
  247. consume();
  248. bytecode_to_repeat.insert_bytecode_repetition_zero_or_one(bytecode_to_repeat, !nongreedy);
  249. return !has_error();
  250. }
  251. return false;
  252. }
  253. ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack, size_t& match_length_minimum)
  254. {
  255. Vector<CompareTypeAndValuePair> values;
  256. for (;;) {
  257. if (match(TokenType::HyphenMinus)) {
  258. consume();
  259. if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
  260. // first in the bracket expression
  261. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  262. } else if (match(TokenType::RightBracket)) {
  263. // Last in the bracket expression
  264. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  265. } else if (values.last().type == CharacterCompareType::Char) {
  266. values.append({ CharacterCompareType::RangeExpressionDummy, 0 });
  267. if (match(TokenType::HyphenMinus)) {
  268. consume();
  269. // Valid range, add ordinary character
  270. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  271. }
  272. } else {
  273. return set_error(Error::InvalidRange);
  274. }
  275. } else if (match(TokenType::Circumflex)) {
  276. auto t = consume();
  277. if (values.is_empty())
  278. values.append({ CharacterCompareType::Inverse, 0 });
  279. else
  280. values.append({ CharacterCompareType::Char, (ByteCodeValueType)*t.value().characters_without_null_termination() });
  281. } else if (match(TokenType::LeftBracket)) {
  282. consume();
  283. if (match(TokenType::Period)) {
  284. consume();
  285. // FIXME: Parse collating element, this is needed when we have locale support
  286. // This could have impact on length parameter, I guess.
  287. VERIFY_NOT_REACHED();
  288. consume(TokenType::Period, Error::InvalidCollationElement);
  289. consume(TokenType::RightBracket, Error::MismatchingBracket);
  290. } else if (match(TokenType::EqualSign)) {
  291. consume();
  292. // FIXME: Parse collating element, this is needed when we have locale support
  293. // This could have impact on length parameter, I guess.
  294. VERIFY_NOT_REACHED();
  295. consume(TokenType::EqualSign, Error::InvalidCollationElement);
  296. consume(TokenType::RightBracket, Error::MismatchingBracket);
  297. } else if (match(TokenType::Colon)) {
  298. consume();
  299. CharClass ch_class;
  300. // parse character class
  301. if (match(TokenType::Char)) {
  302. if (consume("alnum"))
  303. ch_class = CharClass::Alnum;
  304. else if (consume("alpha"))
  305. ch_class = CharClass::Alpha;
  306. else if (consume("blank"))
  307. ch_class = CharClass::Blank;
  308. else if (consume("cntrl"))
  309. ch_class = CharClass::Cntrl;
  310. else if (consume("digit"))
  311. ch_class = CharClass::Digit;
  312. else if (consume("graph"))
  313. ch_class = CharClass::Graph;
  314. else if (consume("lower"))
  315. ch_class = CharClass::Lower;
  316. else if (consume("print"))
  317. ch_class = CharClass::Print;
  318. else if (consume("punct"))
  319. ch_class = CharClass::Punct;
  320. else if (consume("space"))
  321. ch_class = CharClass::Space;
  322. else if (consume("upper"))
  323. ch_class = CharClass::Upper;
  324. else if (consume("xdigit"))
  325. ch_class = CharClass::Xdigit;
  326. else
  327. return set_error(Error::InvalidCharacterClass);
  328. values.append({ CharacterCompareType::CharClass, (ByteCodeValueType)ch_class });
  329. } else
  330. return set_error(Error::InvalidCharacterClass);
  331. // FIXME: we do not support locale specific character classes until locales are implemented
  332. consume(TokenType::Colon, Error::InvalidCharacterClass);
  333. consume(TokenType::RightBracket, Error::MismatchingBracket);
  334. } else {
  335. return set_error(Error::MismatchingBracket);
  336. }
  337. } else if (match(TokenType::RightBracket)) {
  338. if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
  339. // handle bracket as ordinary character
  340. values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() });
  341. } else {
  342. // closing bracket expression
  343. break;
  344. }
  345. } else {
  346. values.append({ CharacterCompareType::Char, (ByteCodeValueType)skip() });
  347. }
  348. // check if range expression has to be completed...
  349. if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) {
  350. if (values.last().type != CharacterCompareType::Char)
  351. return set_error(Error::InvalidRange);
  352. auto value2 = values.take_last();
  353. values.take_last(); // RangeExpressionDummy
  354. auto value1 = values.take_last();
  355. values.append({ CharacterCompareType::CharRange, static_cast<ByteCodeValueType>(CharRange { (u32)value1.value, (u32)value2.value }) });
  356. }
  357. }
  358. if (values.size())
  359. match_length_minimum = 1;
  360. if (values.first().type == CharacterCompareType::Inverse)
  361. match_length_minimum = 0;
  362. stack.insert_bytecode_compare_values(move(values));
  363. return !has_error();
  364. }
  365. ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, size_t& match_length_minimum)
  366. {
  367. ByteCode bytecode;
  368. size_t length = 0;
  369. bool should_parse_repetition_symbol { false };
  370. for (;;) {
  371. if (match_ordinary_characters()) {
  372. Token start_token = m_parser_state.current_token;
  373. Token last_token = m_parser_state.current_token;
  374. for (;;) {
  375. if (!match_ordinary_characters())
  376. break;
  377. ++length;
  378. last_token = consume();
  379. }
  380. if (length > 1) {
  381. // last character is inserted into 'bytecode' for duplication symbol handling
  382. auto new_length = length - ((match_repetition_symbol() && length > 1) ? 1 : 0);
  383. stack.insert_bytecode_compare_string({ start_token.value().characters_without_null_termination(), new_length });
  384. }
  385. if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol
  386. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)last_token.value().characters_without_null_termination()[0] } });
  387. should_parse_repetition_symbol = true;
  388. break;
  389. }
  390. if (match_repetition_symbol())
  391. return set_error(Error::InvalidRepetitionMarker);
  392. if (match(TokenType::Period)) {
  393. length = 1;
  394. consume();
  395. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
  396. should_parse_repetition_symbol = true;
  397. break;
  398. }
  399. if (match(TokenType::EscapeSequence)) {
  400. length = 1;
  401. Token t = consume();
  402. #if REGEX_DEBUG
  403. printf("[PARSER] EscapeSequence with substring %s\n", String(t.value()).characters());
  404. #endif
  405. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u32)t.value().characters_without_null_termination()[1] } });
  406. should_parse_repetition_symbol = true;
  407. break;
  408. }
  409. if (match(TokenType::LeftBracket)) {
  410. consume();
  411. ByteCode sub_ops;
  412. if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size())
  413. return set_error(Error::InvalidBracketContent);
  414. bytecode.append(move(sub_ops));
  415. consume(TokenType::RightBracket, Error::MismatchingBracket);
  416. should_parse_repetition_symbol = true;
  417. break;
  418. }
  419. if (match(TokenType::RightBracket)) {
  420. return set_error(Error::MismatchingBracket);
  421. }
  422. if (match(TokenType::RightCurly)) {
  423. return set_error(Error::MismatchingBrace);
  424. }
  425. if (match(TokenType::Circumflex)) {
  426. consume();
  427. bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin);
  428. break;
  429. }
  430. if (match(TokenType::Dollar)) {
  431. consume();
  432. bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd);
  433. break;
  434. }
  435. if (match(TokenType::RightParen))
  436. return false;
  437. if (match(TokenType::LeftParen)) {
  438. consume();
  439. Optional<StringView> capture_group_name;
  440. bool prevent_capture_group = false;
  441. if (match(TokenType::Questionmark)) {
  442. consume();
  443. if (match(TokenType::Colon)) {
  444. consume();
  445. prevent_capture_group = true;
  446. } else if (consume("<")) { // named capturing group
  447. Token start_token = m_parser_state.current_token;
  448. Token last_token = m_parser_state.current_token;
  449. size_t capture_group_name_length = 0;
  450. for (;;) {
  451. if (!match_ordinary_characters())
  452. return set_error(Error::InvalidNameForCaptureGroup);
  453. if (match(TokenType::Char) && m_parser_state.current_token.value()[0] == '>') {
  454. consume();
  455. break;
  456. }
  457. ++capture_group_name_length;
  458. last_token = consume();
  459. }
  460. capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length);
  461. } else if (match(TokenType::EqualSign)) { // positive lookahead
  462. consume();
  463. VERIFY_NOT_REACHED();
  464. } else if (consume("!")) { // negative lookahead
  465. VERIFY_NOT_REACHED();
  466. } else if (consume("<")) {
  467. if (match(TokenType::EqualSign)) { // positive lookbehind
  468. consume();
  469. VERIFY_NOT_REACHED();
  470. }
  471. if (consume("!")) // negative lookbehind
  472. VERIFY_NOT_REACHED();
  473. } else {
  474. return set_error(Error::InvalidRepetitionMarker);
  475. }
  476. }
  477. if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
  478. if (capture_group_name.has_value())
  479. bytecode.insert_bytecode_group_capture_left(capture_group_name.value());
  480. else
  481. bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count);
  482. }
  483. ByteCode capture_group_bytecode;
  484. if (!parse_root(capture_group_bytecode, length))
  485. return set_error(Error::InvalidPattern);
  486. bytecode.append(move(capture_group_bytecode));
  487. consume(TokenType::RightParen, Error::MismatchingParen);
  488. if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
  489. if (capture_group_name.has_value()) {
  490. bytecode.insert_bytecode_group_capture_right(capture_group_name.value());
  491. ++m_parser_state.named_capture_groups_count;
  492. } else {
  493. bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count);
  494. ++m_parser_state.capture_groups_count;
  495. }
  496. }
  497. should_parse_repetition_symbol = true;
  498. break;
  499. }
  500. return false;
  501. }
  502. if (match_repetition_symbol()) {
  503. if (should_parse_repetition_symbol)
  504. parse_repetition_symbol(bytecode, length);
  505. else
  506. return set_error(Error::InvalidRepetitionMarker);
  507. }
  508. stack.append(move(bytecode));
  509. match_length_minimum += length;
  510. return true;
  511. }
  512. bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minimum)
  513. {
  514. ByteCode bytecode_left;
  515. size_t match_length_minimum_left { 0 };
  516. if (match_repetition_symbol())
  517. return set_error(Error::InvalidRepetitionMarker);
  518. for (;;) {
  519. if (!parse_sub_expression(bytecode_left, match_length_minimum_left))
  520. break;
  521. if (match(TokenType::Pipe)) {
  522. consume();
  523. ByteCode bytecode_right;
  524. size_t match_length_minimum_right { 0 };
  525. if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty())
  526. return set_error(Error::InvalidPattern);
  527. ByteCode new_bytecode;
  528. new_bytecode.insert_bytecode_alternation(move(bytecode_left), move(bytecode_right));
  529. bytecode_left = move(new_bytecode);
  530. match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left);
  531. }
  532. }
  533. if (bytecode_left.is_empty())
  534. set_error(Error::EmptySubExpression);
  535. stack.append(move(bytecode_left));
  536. match_length_minimum = match_length_minimum_left;
  537. return !has_error();
  538. }
  539. // =============================
  540. // ECMA262 Parser
  541. // =============================
  542. bool ECMA262Parser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
  543. {
  544. if (m_parser_state.regex_options.has_flag_set(AllFlags::Unicode)) {
  545. return parse_pattern(stack, match_length_minimum, true, true);
  546. } else {
  547. ByteCode new_stack;
  548. size_t new_match_length = 0;
  549. auto res = parse_pattern(new_stack, new_match_length, false, false);
  550. if (m_parser_state.named_capture_groups_count > 0) {
  551. reset();
  552. return parse_pattern(stack, match_length_minimum, false, true);
  553. }
  554. if (!res)
  555. return false;
  556. stack.append(new_stack);
  557. match_length_minimum = new_match_length;
  558. return res;
  559. }
  560. }
  561. bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  562. {
  563. return parse_disjunction(stack, match_length_minimum, unicode, named);
  564. }
  565. bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  566. {
  567. ByteCode left_alternative_stack;
  568. size_t left_alternative_min_length = 0;
  569. auto alt_ok = parse_alternative(left_alternative_stack, left_alternative_min_length, unicode, named);
  570. if (!alt_ok)
  571. return false;
  572. if (!match(TokenType::Pipe)) {
  573. stack.append(left_alternative_stack);
  574. match_length_minimum = left_alternative_min_length;
  575. return alt_ok;
  576. }
  577. consume();
  578. ByteCode right_alternative_stack;
  579. size_t right_alternative_min_length = 0;
  580. auto continuation_ok = parse_disjunction(right_alternative_stack, right_alternative_min_length, unicode, named);
  581. if (!continuation_ok)
  582. return false;
  583. stack.insert_bytecode_alternation(move(left_alternative_stack), move(right_alternative_stack));
  584. match_length_minimum = min(left_alternative_min_length, right_alternative_min_length);
  585. return continuation_ok;
  586. }
  587. bool ECMA262Parser::parse_alternative(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  588. {
  589. for (;;) {
  590. if (match(TokenType::Eof))
  591. return true;
  592. if (parse_term(stack, match_length_minimum, unicode, named))
  593. continue;
  594. return !has_error();
  595. }
  596. }
  597. bool ECMA262Parser::parse_term(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  598. {
  599. if (parse_assertion(stack, match_length_minimum, unicode, named))
  600. return true;
  601. ByteCode atom_stack;
  602. size_t minimum_atom_length = 0;
  603. auto parse_with_quantifier = [&] {
  604. bool did_parse_one = false;
  605. if (m_should_use_browser_extended_grammar)
  606. did_parse_one = parse_extended_atom(atom_stack, minimum_atom_length, named);
  607. if (!did_parse_one)
  608. did_parse_one = parse_atom(atom_stack, minimum_atom_length, unicode, named);
  609. if (!did_parse_one)
  610. return false;
  611. VERIFY(did_parse_one);
  612. if (!parse_quantifier(atom_stack, minimum_atom_length, unicode, named))
  613. return false;
  614. return true;
  615. };
  616. if (!parse_with_quantifier())
  617. return false;
  618. stack.append(move(atom_stack));
  619. match_length_minimum += minimum_atom_length;
  620. return true;
  621. }
  622. bool ECMA262Parser::parse_assertion(ByteCode& stack, [[maybe_unused]] size_t& match_length_minimum, bool unicode, bool named)
  623. {
  624. if (match(TokenType::Circumflex)) {
  625. consume();
  626. stack.empend((ByteCodeValueType)OpCodeId::CheckBegin);
  627. return true;
  628. }
  629. if (match(TokenType::Dollar)) {
  630. consume();
  631. stack.empend((ByteCodeValueType)OpCodeId::CheckEnd);
  632. return true;
  633. }
  634. if (try_skip("\\b")) {
  635. stack.insert_bytecode_check_boundary(BoundaryCheckType::Word);
  636. return true;
  637. }
  638. if (try_skip("\\B")) {
  639. stack.insert_bytecode_check_boundary(BoundaryCheckType::NonWord);
  640. return true;
  641. }
  642. if (match(TokenType::LeftParen)) {
  643. if (!try_skip("(?"))
  644. return false;
  645. if (done()) {
  646. set_error(Error::InvalidCaptureGroup);
  647. return false;
  648. }
  649. ByteCode assertion_stack;
  650. size_t length_dummy = 0;
  651. bool should_parse_forward_assertion = m_should_use_browser_extended_grammar ? unicode : true;
  652. if (should_parse_forward_assertion && try_skip("=")) {
  653. if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
  654. return false;
  655. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead);
  656. return true;
  657. }
  658. if (should_parse_forward_assertion && try_skip("!")) {
  659. if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
  660. return false;
  661. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
  662. return true;
  663. }
  664. if (m_should_use_browser_extended_grammar) {
  665. if (!unicode) {
  666. if (parse_quantifiable_assertion(assertion_stack, match_length_minimum, named)) {
  667. stack.append(move(assertion_stack));
  668. return true;
  669. }
  670. }
  671. }
  672. if (try_skip("<=")) {
  673. if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
  674. return false;
  675. // FIXME: Somehow ensure that this assertion regexp has a fixed length.
  676. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookBehind, length_dummy);
  677. return true;
  678. }
  679. if (try_skip("<!")) {
  680. if (!parse_inner_disjunction(assertion_stack, length_dummy, unicode, named))
  681. return false;
  682. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookBehind, length_dummy);
  683. return true;
  684. }
  685. // If none of these matched, put the '(?' back.
  686. m_parser_state.lexer.back(3);
  687. m_parser_state.current_token = m_parser_state.lexer.next();
  688. return false;
  689. }
  690. return false;
  691. }
  692. bool ECMA262Parser::parse_inner_disjunction(ByteCode& bytecode_stack, size_t& length, bool unicode, bool named)
  693. {
  694. auto disjunction_ok = parse_disjunction(bytecode_stack, length, unicode, named);
  695. if (!disjunction_ok)
  696. return false;
  697. consume(TokenType::RightParen, Error::MismatchingParen);
  698. return true;
  699. }
  700. bool ECMA262Parser::parse_quantifiable_assertion(ByteCode& stack, size_t&, bool named)
  701. {
  702. VERIFY(m_should_use_browser_extended_grammar);
  703. ByteCode assertion_stack;
  704. size_t match_length_minimum = 0;
  705. if (try_skip("=")) {
  706. if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named))
  707. return false;
  708. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::LookAhead);
  709. return true;
  710. }
  711. if (try_skip("!")) {
  712. if (!parse_inner_disjunction(assertion_stack, match_length_minimum, false, named))
  713. return false;
  714. stack.insert_bytecode_lookaround(move(assertion_stack), ByteCode::LookAroundType::NegatedLookAhead);
  715. return true;
  716. }
  717. return false;
  718. }
  719. StringView ECMA262Parser::read_digits_as_string(ReadDigitsInitialZeroState initial_zero, ReadDigitFollowPolicy follow_policy, bool hex, int max_count)
  720. {
  721. if (!match(TokenType::Char))
  722. return {};
  723. if (initial_zero != ReadDigitsInitialZeroState::Allow) {
  724. auto has_initial_zero = m_parser_state.current_token.value() == "0";
  725. if (initial_zero == ReadDigitsInitialZeroState::Disallow && has_initial_zero)
  726. return {};
  727. if (initial_zero == ReadDigitsInitialZeroState::Require && !has_initial_zero)
  728. return {};
  729. }
  730. int count = 0;
  731. size_t offset = 0;
  732. auto start_token = m_parser_state.current_token;
  733. while (match(TokenType::Char)) {
  734. auto c = m_parser_state.current_token.value();
  735. if (follow_policy == ReadDigitFollowPolicy::DisallowDigit) {
  736. if (hex && AK::StringUtils::convert_to_uint_from_hex(c).has_value())
  737. break;
  738. if (!hex && c.to_uint().has_value())
  739. break;
  740. }
  741. if (follow_policy == ReadDigitFollowPolicy::DisallowNonDigit) {
  742. if (hex && !AK::StringUtils::convert_to_uint_from_hex(c).has_value())
  743. break;
  744. if (!hex && !c.to_uint().has_value())
  745. break;
  746. }
  747. if (max_count > 0 && count >= max_count)
  748. break;
  749. offset += consume().value().length();
  750. ++count;
  751. }
  752. return StringView { start_token.value().characters_without_null_termination(), offset };
  753. }
  754. Optional<unsigned> ECMA262Parser::read_digits(ECMA262Parser::ReadDigitsInitialZeroState initial_zero, ECMA262Parser::ReadDigitFollowPolicy follow_policy, bool hex, int max_count)
  755. {
  756. auto str = read_digits_as_string(initial_zero, follow_policy, hex, max_count);
  757. if (str.is_empty())
  758. return {};
  759. if (hex)
  760. return AK::StringUtils::convert_to_uint_from_hex(str);
  761. return str.to_uint();
  762. }
  763. bool ECMA262Parser::parse_quantifier(ByteCode& stack, size_t& match_length_minimum, bool, bool)
  764. {
  765. enum class Repetition {
  766. OneOrMore,
  767. ZeroOrMore,
  768. Optional,
  769. Explicit,
  770. None,
  771. } repetition_mark { Repetition::None };
  772. bool ungreedy = false;
  773. Optional<size_t> repeat_min, repeat_max;
  774. if (match(TokenType::Asterisk)) {
  775. consume();
  776. repetition_mark = Repetition::ZeroOrMore;
  777. } else if (match(TokenType::Plus)) {
  778. consume();
  779. repetition_mark = Repetition::OneOrMore;
  780. } else if (match(TokenType::Questionmark)) {
  781. consume();
  782. repetition_mark = Repetition::Optional;
  783. } else if (match(TokenType::LeftCurly)) {
  784. consume();
  785. repetition_mark = Repetition::Explicit;
  786. auto low_bound = read_digits();
  787. if (!low_bound.has_value()) {
  788. set_error(Error::InvalidBraceContent);
  789. return false;
  790. }
  791. repeat_min = low_bound.value();
  792. if (match(TokenType::Comma)) {
  793. consume();
  794. auto high_bound = read_digits();
  795. if (high_bound.has_value())
  796. repeat_max = high_bound.value();
  797. } else {
  798. repeat_max = repeat_min;
  799. }
  800. if (!match(TokenType::RightCurly)) {
  801. set_error(Error::MismatchingBrace);
  802. return false;
  803. }
  804. consume();
  805. if (repeat_max.has_value()) {
  806. if (repeat_min.value() > repeat_max.value())
  807. set_error(Error::InvalidBraceContent);
  808. }
  809. } else {
  810. return true;
  811. }
  812. if (match(TokenType::Questionmark)) {
  813. if (repetition_mark == Repetition::Explicit) {
  814. set_error(Error::InvalidRepetitionMarker);
  815. return false;
  816. }
  817. consume();
  818. ungreedy = true;
  819. }
  820. ByteCode new_bytecode;
  821. switch (repetition_mark) {
  822. case Repetition::OneOrMore:
  823. new_bytecode.insert_bytecode_repetition_min_one(stack, !ungreedy);
  824. break;
  825. case Repetition::ZeroOrMore:
  826. new_bytecode.insert_bytecode_repetition_any(stack, !ungreedy);
  827. match_length_minimum = 0;
  828. break;
  829. case Repetition::Optional:
  830. new_bytecode.insert_bytecode_repetition_zero_or_one(stack, !ungreedy);
  831. match_length_minimum = 0;
  832. break;
  833. case Repetition::Explicit:
  834. new_bytecode.insert_bytecode_repetition_min_max(stack, repeat_min.value(), repeat_max);
  835. match_length_minimum *= repeat_min.value();
  836. break;
  837. case Repetition::None:
  838. VERIFY_NOT_REACHED();
  839. }
  840. return true;
  841. }
  842. bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  843. {
  844. if (match(TokenType::EscapeSequence)) {
  845. // Also part of AtomEscape.
  846. auto token = consume();
  847. match_length_minimum += 1;
  848. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[1] } });
  849. return true;
  850. }
  851. if (try_skip("\\")) {
  852. // AtomEscape.
  853. return parse_atom_escape(stack, match_length_minimum, unicode, named);
  854. }
  855. if (match(TokenType::LeftBracket)) {
  856. // Character class.
  857. return parse_character_class(stack, match_length_minimum, unicode && !m_should_use_browser_extended_grammar, named);
  858. }
  859. if (match(TokenType::LeftParen)) {
  860. // Non-capturing group, or a capture group.
  861. return parse_capture_group(stack, match_length_minimum, unicode && !m_should_use_browser_extended_grammar, named);
  862. }
  863. if (match(TokenType::Period)) {
  864. consume();
  865. match_length_minimum += 1;
  866. stack.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
  867. return true;
  868. }
  869. if (match(TokenType::Circumflex) || match(TokenType::Dollar) || match(TokenType::RightParen)
  870. || match(TokenType::Pipe) || match(TokenType::Plus) || match(TokenType::Asterisk)
  871. || match(TokenType::Questionmark)) {
  872. return false;
  873. }
  874. if (match(TokenType::RightBracket) || match(TokenType::RightCurly) || match(TokenType::LeftCurly)) {
  875. if (m_should_use_browser_extended_grammar) {
  876. auto token = consume();
  877. match_length_minimum += 1;
  878. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
  879. return true;
  880. } else {
  881. return false;
  882. }
  883. }
  884. if (match_ordinary_characters()) {
  885. auto token = consume().value();
  886. match_length_minimum += 1;
  887. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token[0] } });
  888. return true;
  889. }
  890. set_error(Error::InvalidPattern);
  891. return false;
  892. }
  893. bool ECMA262Parser::parse_extended_atom(ByteCode&, size_t&, bool)
  894. {
  895. // Note: This includes only rules *not* present in parse_atom()
  896. VERIFY(m_should_use_browser_extended_grammar);
  897. if (parse_invalid_braced_quantifier())
  898. return true; // FAIL FAIL FAIL
  899. return false;
  900. }
  901. bool ECMA262Parser::parse_invalid_braced_quantifier()
  902. {
  903. if (!match(TokenType::LeftCurly))
  904. return false;
  905. consume();
  906. size_t chars_consumed = 1;
  907. auto low_bound = read_digits_as_string();
  908. StringView high_bound;
  909. if (low_bound.is_empty()) {
  910. back(chars_consumed + 1);
  911. return false;
  912. }
  913. chars_consumed += low_bound.length();
  914. if (match(TokenType::Comma)) {
  915. consume();
  916. ++chars_consumed;
  917. high_bound = read_digits_as_string();
  918. chars_consumed += high_bound.length();
  919. }
  920. if (!match(TokenType::RightCurly)) {
  921. back(chars_consumed + 1);
  922. return false;
  923. }
  924. consume();
  925. set_error(Error::InvalidPattern);
  926. return true;
  927. }
  928. bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  929. {
  930. if (auto escape_str = read_digits_as_string(ReadDigitsInitialZeroState::Disallow, ReadDigitFollowPolicy::DisallowNonDigit); !escape_str.is_empty()) {
  931. if (auto escape = escape_str.to_uint(); escape.has_value()) {
  932. // See if this is a "back"-reference (we've already parsed the group it refers to)
  933. auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(escape.value());
  934. if (maybe_length.has_value()) {
  935. match_length_minimum += maybe_length.value();
  936. stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)escape.value() } });
  937. return true;
  938. }
  939. // It's not a pattern seen before, so we have to see if it's a valid reference to a future group.
  940. if (escape.value() <= ensure_total_number_of_capturing_parenthesis()) {
  941. // This refers to a future group, and it will _always_ be matching an empty string
  942. // So just match nothing and move on.
  943. return true;
  944. }
  945. if (!m_should_use_browser_extended_grammar) {
  946. set_error(Error::InvalidNumber);
  947. return false;
  948. }
  949. }
  950. // If not, put the characters back.
  951. back(escape_str.length());
  952. }
  953. // CharacterEscape > ControlEscape
  954. if (try_skip("f")) {
  955. match_length_minimum += 1;
  956. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\f' } });
  957. return true;
  958. }
  959. if (try_skip("n")) {
  960. match_length_minimum += 1;
  961. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\n' } });
  962. return true;
  963. }
  964. if (try_skip("r")) {
  965. match_length_minimum += 1;
  966. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\r' } });
  967. return true;
  968. }
  969. if (try_skip("t")) {
  970. match_length_minimum += 1;
  971. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\t' } });
  972. return true;
  973. }
  974. if (try_skip("v")) {
  975. match_length_minimum += 1;
  976. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\v' } });
  977. return true;
  978. }
  979. // CharacterEscape > ControlLetter
  980. if (try_skip("c")) {
  981. for (auto c = 'A'; c <= 'z'; ++c) {
  982. if (try_skip({ &c, 1 })) {
  983. match_length_minimum += 1;
  984. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)(c % 32) } });
  985. return true;
  986. }
  987. }
  988. if (m_should_use_browser_extended_grammar) {
  989. back(2);
  990. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'\\' } });
  991. match_length_minimum += 1;
  992. return true;
  993. }
  994. if (unicode) {
  995. set_error(Error::InvalidPattern);
  996. return false;
  997. }
  998. // Allow '\c' in non-unicode mode, just matches 'c'.
  999. match_length_minimum += 1;
  1000. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'c' } });
  1001. return true;
  1002. }
  1003. // LegacyOctalEscapeSequence
  1004. if (m_should_use_browser_extended_grammar) {
  1005. if (!unicode) {
  1006. if (auto escape = parse_legacy_octal_escape(); escape.has_value()) {
  1007. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)escape.value() } });
  1008. match_length_minimum += 1;
  1009. return true;
  1010. }
  1011. }
  1012. }
  1013. // '\0'
  1014. if (read_digits(ReadDigitsInitialZeroState::Require, ReadDigitFollowPolicy::DisallowDigit).has_value()) {
  1015. match_length_minimum += 1;
  1016. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
  1017. return true;
  1018. }
  1019. // HexEscape
  1020. if (try_skip("x")) {
  1021. if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value()) {
  1022. match_length_minimum += 1;
  1023. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)hex_escape.value() } });
  1024. return true;
  1025. } else if (!unicode) {
  1026. // '\x' is allowed in non-unicode mode, just matches 'x'.
  1027. match_length_minimum += 1;
  1028. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'x' } });
  1029. return true;
  1030. } else {
  1031. set_error(Error::InvalidPattern);
  1032. return false;
  1033. }
  1034. }
  1035. if (try_skip("u")) {
  1036. if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 4); code_point.has_value()) {
  1037. // FIXME: The minimum length depends on the mode - should be utf8-length in u8 mode.
  1038. match_length_minimum += 1;
  1039. StringBuilder builder;
  1040. builder.append_code_point(code_point.value());
  1041. // FIXME: This isn't actually correct for ECMAScript.
  1042. auto u8_encoded = builder.string_view();
  1043. stack.insert_bytecode_compare_string(u8_encoded);
  1044. return true;
  1045. } else if (!unicode) {
  1046. // '\u' is allowed in non-unicode mode, just matches 'u'.
  1047. match_length_minimum += 1;
  1048. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'u' } });
  1049. return true;
  1050. } else {
  1051. set_error(Error::InvalidPattern);
  1052. return false;
  1053. }
  1054. }
  1055. // IdentityEscape
  1056. auto source_characters = m_should_use_browser_extended_grammar ? "^$\\.*+?()[|"sv : "^$\\.*+?()[]{}|"sv;
  1057. for (auto ch : source_characters) {
  1058. if (try_skip({ &ch, 1 })) {
  1059. match_length_minimum += 1;
  1060. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)ch } });
  1061. return true;
  1062. }
  1063. }
  1064. if (unicode) {
  1065. if (try_skip("/")) {
  1066. match_length_minimum += 1;
  1067. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)'/' } });
  1068. return true;
  1069. }
  1070. }
  1071. if (named && try_skip("k")) {
  1072. auto name = read_capture_group_specifier(true);
  1073. if (name.is_empty()) {
  1074. set_error(Error::InvalidNameForCaptureGroup);
  1075. return false;
  1076. }
  1077. auto maybe_length = m_parser_state.named_capture_group_minimum_lengths.get(name);
  1078. if (!maybe_length.has_value()) {
  1079. set_error(Error::InvalidNameForCaptureGroup);
  1080. return false;
  1081. }
  1082. match_length_minimum += maybe_length.value();
  1083. stack.insert_bytecode_compare_named_reference(name);
  1084. return true;
  1085. }
  1086. if (unicode) {
  1087. if (try_skip("p{")) {
  1088. // FIXME: Implement this path, Unicode property match.
  1089. TODO();
  1090. }
  1091. if (try_skip("P{")) {
  1092. // FIXME: Implement this path, Unicode property match.
  1093. TODO();
  1094. }
  1095. }
  1096. if (done())
  1097. return set_error(Error::InvalidTrailingEscape);
  1098. bool negate = false;
  1099. auto ch = parse_character_class_escape(negate);
  1100. if (!ch.has_value()) {
  1101. if (!unicode) {
  1102. // Allow all SourceCharacter's as escapes here.
  1103. auto token = consume();
  1104. match_length_minimum += 1;
  1105. stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
  1106. return true;
  1107. }
  1108. set_error(Error::InvalidCharacterClass);
  1109. return false;
  1110. }
  1111. Vector<CompareTypeAndValuePair> compares;
  1112. if (negate)
  1113. compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
  1114. compares.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)ch.value() });
  1115. match_length_minimum += 1;
  1116. stack.insert_bytecode_compare_values(move(compares));
  1117. return true;
  1118. }
  1119. Optional<u8> ECMA262Parser::parse_legacy_octal_escape()
  1120. {
  1121. constexpr auto all_octal_digits = "01234567";
  1122. auto read_octal_digit = [&](auto start, auto end, bool should_ensure_no_following_octal_digit) -> Optional<u8> {
  1123. for (char c = '0' + start; c <= '0' + end; ++c) {
  1124. if (try_skip({ &c, 1 })) {
  1125. if (!should_ensure_no_following_octal_digit || !lookahead_any(all_octal_digits))
  1126. return c - '0';
  1127. back(2);
  1128. return {};
  1129. }
  1130. }
  1131. return {};
  1132. };
  1133. // OctalDigit(1)
  1134. if (auto digit = read_octal_digit(0, 7, true); digit.has_value()) {
  1135. return digit.value();
  1136. }
  1137. // OctalDigit(2)
  1138. if (auto left_digit = read_octal_digit(0, 3, false); left_digit.has_value()) {
  1139. if (auto right_digit = read_octal_digit(0, 7, true); right_digit.has_value()) {
  1140. return left_digit.value() * 8 + right_digit.value();
  1141. }
  1142. back(2);
  1143. }
  1144. // OctalDigit(2)
  1145. if (auto left_digit = read_octal_digit(4, 7, false); left_digit.has_value()) {
  1146. if (auto right_digit = read_octal_digit(0, 7, false); right_digit.has_value()) {
  1147. return left_digit.value() * 8 + right_digit.value();
  1148. }
  1149. back(2);
  1150. }
  1151. // OctalDigit(3)
  1152. if (auto left_digit = read_octal_digit(0, 3, false); left_digit.has_value()) {
  1153. size_t chars_consumed = 1;
  1154. if (auto mid_digit = read_octal_digit(0, 7, false); mid_digit.has_value()) {
  1155. ++chars_consumed;
  1156. if (auto right_digit = read_octal_digit(0, 7, false); right_digit.has_value()) {
  1157. return left_digit.value() * 64 + mid_digit.value() * 8 + right_digit.value();
  1158. }
  1159. }
  1160. back(chars_consumed);
  1161. }
  1162. return {};
  1163. }
  1164. Optional<CharClass> ECMA262Parser::parse_character_class_escape(bool& negate, bool expect_backslash)
  1165. {
  1166. if (expect_backslash && !try_skip("\\"))
  1167. return {};
  1168. // CharacterClassEscape
  1169. CharClass ch_class;
  1170. if (try_skip("d")) {
  1171. ch_class = CharClass::Digit;
  1172. } else if (try_skip("D")) {
  1173. ch_class = CharClass::Digit;
  1174. negate = true;
  1175. } else if (try_skip("s")) {
  1176. ch_class = CharClass::Space;
  1177. } else if (try_skip("S")) {
  1178. ch_class = CharClass::Space;
  1179. negate = true;
  1180. } else if (try_skip("w")) {
  1181. ch_class = CharClass::Word;
  1182. } else if (try_skip("W")) {
  1183. ch_class = CharClass::Word;
  1184. negate = true;
  1185. } else {
  1186. return {};
  1187. }
  1188. return ch_class;
  1189. }
  1190. bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool)
  1191. {
  1192. consume(TokenType::LeftBracket, Error::InvalidPattern);
  1193. Vector<CompareTypeAndValuePair> compares;
  1194. if (match(TokenType::Circumflex)) {
  1195. // Negated charclass
  1196. consume();
  1197. compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 });
  1198. }
  1199. if (match(TokenType::RightBracket)) {
  1200. consume();
  1201. return true;
  1202. }
  1203. if (!parse_nonempty_class_ranges(compares, unicode))
  1204. return false;
  1205. match_length_minimum += 1;
  1206. stack.insert_bytecode_compare_values(move(compares));
  1207. return true;
  1208. }
  1209. struct CharClassRangeElement {
  1210. union {
  1211. CharClass character_class;
  1212. u32 code_point { 0 };
  1213. };
  1214. bool is_negated { false };
  1215. bool is_character_class { false };
  1216. };
  1217. bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
  1218. {
  1219. auto read_class_atom_no_dash = [&]() -> Optional<CharClassRangeElement> {
  1220. if (match(TokenType::EscapeSequence)) {
  1221. auto token = consume().value();
  1222. return { { .code_point = (u32)token[1], .is_character_class = false } };
  1223. }
  1224. if (try_skip("\\")) {
  1225. if (done()) {
  1226. set_error(Error::InvalidTrailingEscape);
  1227. return {};
  1228. }
  1229. if (try_skip("f"))
  1230. return { { .code_point = '\f', .is_character_class = false } };
  1231. if (try_skip("n"))
  1232. return { { .code_point = '\n', .is_character_class = false } };
  1233. if (try_skip("r"))
  1234. return { { .code_point = '\r', .is_character_class = false } };
  1235. if (try_skip("t"))
  1236. return { { .code_point = '\t', .is_character_class = false } };
  1237. if (try_skip("v"))
  1238. return { { .code_point = '\v', .is_character_class = false } };
  1239. if (try_skip("b"))
  1240. return { { .code_point = '\b', .is_character_class = false } };
  1241. if (try_skip("/"))
  1242. return { { .code_point = '/', .is_character_class = false } };
  1243. // CharacterEscape > ControlLetter
  1244. if (try_skip("c")) {
  1245. for (auto c = 'A'; c <= 'z'; ++c) {
  1246. if (try_skip({ &c, 1 }))
  1247. return { { .code_point = (u32)(c % 32), .is_character_class = false } };
  1248. }
  1249. if (m_should_use_browser_extended_grammar) {
  1250. for (auto c = '0'; c <= '9'; ++c) {
  1251. if (try_skip({ &c, 1 }))
  1252. return { { .code_point = (u32)(c % 32), .is_character_class = false } };
  1253. }
  1254. if (try_skip("_"))
  1255. return { { .code_point = (u32)('_' % 32), .is_character_class = false } };
  1256. back(2);
  1257. return { { .code_point = '\\', .is_character_class = false } };
  1258. }
  1259. }
  1260. // '\0'
  1261. if (read_digits(ReadDigitsInitialZeroState::Require, ReadDigitFollowPolicy::DisallowDigit).has_value())
  1262. return { { .code_point = 0, .is_character_class = false } };
  1263. // HexEscape
  1264. if (try_skip("x")) {
  1265. if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 2); hex_escape.has_value()) {
  1266. return { { .code_point = hex_escape.value(), .is_character_class = false } };
  1267. } else if (!unicode) {
  1268. // '\x' is allowed in non-unicode mode, just matches 'x'.
  1269. return { { .code_point = 'x', .is_character_class = false } };
  1270. } else {
  1271. set_error(Error::InvalidPattern);
  1272. return {};
  1273. }
  1274. }
  1275. if (try_skip("u")) {
  1276. if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, ReadDigitFollowPolicy::Any, true, 4); code_point.has_value()) {
  1277. // FIXME: While codepoint ranges are supported, codepoint matches as "Char" are not!
  1278. return { { .code_point = code_point.value(), .is_character_class = false } };
  1279. } else if (!unicode) {
  1280. // '\u' is allowed in non-unicode mode, just matches 'u'.
  1281. return { { .code_point = 'u', .is_character_class = false } };
  1282. } else {
  1283. set_error(Error::InvalidPattern);
  1284. return {};
  1285. }
  1286. }
  1287. if (unicode) {
  1288. if (try_skip("-"))
  1289. return { { .code_point = '-', .is_character_class = false } };
  1290. }
  1291. if (try_skip("p{") || try_skip("P{")) {
  1292. // FIXME: Implement these; unicode properties.
  1293. TODO();
  1294. }
  1295. if (try_skip("d"))
  1296. return { { .character_class = CharClass::Digit, .is_character_class = true } };
  1297. if (try_skip("s"))
  1298. return { { .character_class = CharClass::Space, .is_character_class = true } };
  1299. if (try_skip("w"))
  1300. return { { .character_class = CharClass::Word, .is_character_class = true } };
  1301. if (try_skip("D"))
  1302. return { { .character_class = CharClass::Digit, .is_negated = true, .is_character_class = true } };
  1303. if (try_skip("S"))
  1304. return { { .character_class = CharClass::Space, .is_negated = true, .is_character_class = true } };
  1305. if (try_skip("W"))
  1306. return { { .character_class = CharClass::Word, .is_negated = true, .is_character_class = true } };
  1307. if (!unicode) {
  1308. // Any unrecognised escape is allowed in non-unicode mode.
  1309. return { { .code_point = (u32)skip(), .is_character_class = false } };
  1310. }
  1311. }
  1312. if (match(TokenType::RightBracket) || match(TokenType::HyphenMinus))
  1313. return {};
  1314. // Allow any (other) SourceCharacter.
  1315. return { { .code_point = (u32)skip(), .is_character_class = false } };
  1316. };
  1317. auto read_class_atom = [&]() -> Optional<CharClassRangeElement> {
  1318. if (match(TokenType::HyphenMinus)) {
  1319. consume();
  1320. return { { .code_point = '-', .is_character_class = false } };
  1321. }
  1322. return read_class_atom_no_dash();
  1323. };
  1324. while (!match(TokenType::RightBracket)) {
  1325. if (match(TokenType::Eof)) {
  1326. set_error(Error::MismatchingBracket);
  1327. return false;
  1328. }
  1329. auto first_atom = read_class_atom();
  1330. if (!first_atom.has_value())
  1331. return false;
  1332. if (match(TokenType::HyphenMinus)) {
  1333. consume();
  1334. if (match(TokenType::RightBracket)) {
  1335. // Allow '-' as the last element in a charclass, even after an atom.
  1336. m_parser_state.lexer.back(2); // -]
  1337. m_parser_state.current_token = m_parser_state.lexer.next();
  1338. goto read_as_single_atom;
  1339. }
  1340. auto second_atom = read_class_atom();
  1341. if (!second_atom.has_value())
  1342. return false;
  1343. if (first_atom.value().is_character_class || second_atom.value().is_character_class) {
  1344. if (m_should_use_browser_extended_grammar) {
  1345. if (unicode) {
  1346. set_error(Error::InvalidRange);
  1347. return false;
  1348. }
  1349. // CharacterRangeOrUnion > !Unicode > CharClass
  1350. if (first_atom->is_character_class)
  1351. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)first_atom->character_class });
  1352. else
  1353. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)first_atom->code_point });
  1354. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)'-' });
  1355. if (second_atom->is_character_class)
  1356. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)second_atom->character_class });
  1357. else
  1358. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, (ByteCodeValueType)second_atom->code_point });
  1359. continue;
  1360. } else {
  1361. set_error(Error::InvalidRange);
  1362. return false;
  1363. }
  1364. }
  1365. if (first_atom.value().code_point > second_atom.value().code_point) {
  1366. set_error(Error::InvalidRange);
  1367. return false;
  1368. }
  1369. VERIFY(!first_atom.value().is_negated);
  1370. VERIFY(!second_atom.value().is_negated);
  1371. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharRange, CharRange { first_atom.value().code_point, second_atom.value().code_point } });
  1372. continue;
  1373. }
  1374. read_as_single_atom:;
  1375. auto atom = first_atom.value();
  1376. if (atom.is_character_class) {
  1377. if (atom.is_negated)
  1378. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 });
  1379. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)first_atom.value().character_class });
  1380. } else {
  1381. VERIFY(!atom.is_negated);
  1382. ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Char, first_atom.value().code_point });
  1383. }
  1384. }
  1385. consume(TokenType::RightBracket, Error::MismatchingBracket);
  1386. return true;
  1387. }
  1388. StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
  1389. {
  1390. if (take_starting_angle_bracket && !consume("<"))
  1391. return {};
  1392. auto start_token = m_parser_state.current_token;
  1393. size_t offset = 0;
  1394. while (match(TokenType::Char)) {
  1395. auto c = m_parser_state.current_token.value();
  1396. if (c == ">")
  1397. break;
  1398. offset += consume().value().length();
  1399. }
  1400. StringView name { start_token.value().characters_without_null_termination(), offset };
  1401. if (!consume(">") || name.is_empty())
  1402. set_error(Error::InvalidNameForCaptureGroup);
  1403. return name;
  1404. }
  1405. bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
  1406. {
  1407. consume(TokenType::LeftParen, Error::InvalidPattern);
  1408. if (match(TokenType::Questionmark)) {
  1409. // Non-capturing group or group with specifier.
  1410. consume();
  1411. if (match(TokenType::Colon)) {
  1412. consume();
  1413. ByteCode noncapture_group_bytecode;
  1414. size_t length = 0;
  1415. if (!parse_disjunction(noncapture_group_bytecode, length, unicode, named))
  1416. return set_error(Error::InvalidPattern);
  1417. consume(TokenType::RightParen, Error::MismatchingParen);
  1418. stack.append(move(noncapture_group_bytecode));
  1419. match_length_minimum += length;
  1420. return true;
  1421. }
  1422. if (consume("<")) {
  1423. ++m_parser_state.named_capture_groups_count;
  1424. auto name = read_capture_group_specifier();
  1425. if (name.is_empty()) {
  1426. set_error(Error::InvalidNameForCaptureGroup);
  1427. return false;
  1428. }
  1429. ByteCode capture_group_bytecode;
  1430. size_t length = 0;
  1431. if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
  1432. return set_error(Error::InvalidPattern);
  1433. consume(TokenType::RightParen, Error::MismatchingParen);
  1434. stack.insert_bytecode_group_capture_left(name);
  1435. stack.append(move(capture_group_bytecode));
  1436. stack.insert_bytecode_group_capture_right(name);
  1437. match_length_minimum += length;
  1438. m_parser_state.named_capture_group_minimum_lengths.set(name, length);
  1439. return true;
  1440. }
  1441. set_error(Error::InvalidCaptureGroup);
  1442. return false;
  1443. }
  1444. auto group_index = ++m_parser_state.capture_groups_count;
  1445. stack.insert_bytecode_group_capture_left(group_index);
  1446. ByteCode capture_group_bytecode;
  1447. size_t length = 0;
  1448. if (!parse_disjunction(capture_group_bytecode, length, unicode, named))
  1449. return set_error(Error::InvalidPattern);
  1450. stack.append(move(capture_group_bytecode));
  1451. m_parser_state.capture_group_minimum_lengths.set(group_index, length);
  1452. consume(TokenType::RightParen, Error::MismatchingParen);
  1453. stack.insert_bytecode_group_capture_right(group_index);
  1454. match_length_minimum += length;
  1455. return true;
  1456. }
  1457. size_t ECMA262Parser::ensure_total_number_of_capturing_parenthesis()
  1458. {
  1459. if (m_total_number_of_capturing_parenthesis.has_value())
  1460. return m_total_number_of_capturing_parenthesis.value();
  1461. GenericLexer lexer { m_parser_state.lexer.source() };
  1462. size_t count = 0;
  1463. while (!lexer.is_eof()) {
  1464. switch (lexer.peek()) {
  1465. case '\\':
  1466. lexer.consume(2);
  1467. continue;
  1468. case '[':
  1469. while (!lexer.is_eof()) {
  1470. if (lexer.consume_specific('\\'))
  1471. lexer.consume();
  1472. else if (lexer.consume_specific(']'))
  1473. break;
  1474. lexer.consume();
  1475. }
  1476. break;
  1477. case '(':
  1478. if (lexer.consume_specific('?')) {
  1479. // non-capturing group '(?:', lookaround '(?<='/'(?<!', or named capture '(?<'
  1480. if (!lexer.consume_specific('<'))
  1481. break;
  1482. if (lexer.next_is(is_any_of("=!")))
  1483. break;
  1484. ++count;
  1485. } else {
  1486. ++count;
  1487. }
  1488. break;
  1489. }
  1490. lexer.consume();
  1491. }
  1492. m_total_number_of_capturing_parenthesis = count;
  1493. return count;
  1494. }
  1495. }