RegexParser.cpp 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include "RegexParser.h"
  27. #include "RegexDebug.h"
  28. #include <AK/String.h>
  29. #include <AK/StringBuilder.h>
  30. #include <cstdio>
  31. namespace regex {
  32. ALWAYS_INLINE bool Parser::set_error(Error error)
  33. {
  34. if (m_parser_state.error == Error::NoError) {
  35. m_parser_state.error = error;
  36. m_parser_state.error_token = m_parser_state.current_token;
  37. }
  38. return false; // always return false, that eases the API usage (return set_error(...)) :^)
  39. }
  40. ALWAYS_INLINE bool Parser::done() const
  41. {
  42. return match(TokenType::Eof);
  43. }
  44. ALWAYS_INLINE bool Parser::match(TokenType type) const
  45. {
  46. return m_parser_state.current_token.type() == type;
  47. }
  48. ALWAYS_INLINE Token Parser::consume()
  49. {
  50. auto old_token = m_parser_state.current_token;
  51. m_parser_state.current_token = m_parser_state.lexer.next();
  52. return old_token;
  53. }
  54. ALWAYS_INLINE Token Parser::consume(TokenType type, Error error)
  55. {
  56. if (m_parser_state.current_token.type() != type) {
  57. set_error(error);
  58. dbg() << "[PARSER] Error: Unexpected token " << m_parser_state.current_token.name() << ". Expected: " << Token::name(type);
  59. }
  60. return consume();
  61. }
  62. ALWAYS_INLINE bool Parser::consume(const String& str)
  63. {
  64. size_t potentially_go_back { 1 };
  65. for (auto ch : str) {
  66. if (match(TokenType::Char)) {
  67. if (m_parser_state.current_token.value()[0] != ch) {
  68. m_parser_state.lexer.back(potentially_go_back);
  69. m_parser_state.current_token = m_parser_state.lexer.next();
  70. return false;
  71. }
  72. } else {
  73. m_parser_state.lexer.back(potentially_go_back);
  74. m_parser_state.current_token = m_parser_state.lexer.next();
  75. return false;
  76. }
  77. consume(TokenType::Char, Error::NoError);
  78. ++potentially_go_back;
  79. }
  80. return true;
  81. }
  82. ALWAYS_INLINE void Parser::reset()
  83. {
  84. m_parser_state.bytecode.clear();
  85. m_parser_state.lexer.reset();
  86. m_parser_state.current_token = m_parser_state.lexer.next();
  87. m_parser_state.error = Error::NoError;
  88. m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) };
  89. m_parser_state.regex_options = {};
  90. }
  91. Parser::Result Parser::parse(Optional<AllOptions> regex_options)
  92. {
  93. reset();
  94. if (regex_options.has_value())
  95. m_parser_state.regex_options = regex_options.value();
  96. if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum))
  97. consume(TokenType::Eof, Error::InvalidPattern);
  98. else
  99. set_error(Error::InvalidPattern);
  100. #ifdef REGEX_DEBUG
  101. fprintf(stderr, "[PARSER] Produced bytecode with %lu entries (opcodes + arguments)\n", m_parser_state.bytecode.size());
  102. #endif
  103. return {
  104. move(m_parser_state.bytecode),
  105. move(m_parser_state.capture_groups_count),
  106. move(m_parser_state.named_capture_groups_count),
  107. move(m_parser_state.match_length_minimum),
  108. move(m_parser_state.error),
  109. move(m_parser_state.error_token)
  110. };
  111. }
  112. // =============================
  113. // PosixExtended Parser
  114. // =============================
  115. bool PosixExtendedParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
  116. {
  117. return parse_root(stack, match_length_minimum);
  118. }
  119. ALWAYS_INLINE bool PosixExtendedParser::match_repetition_symbol()
  120. {
  121. auto type = m_parser_state.current_token.type();
  122. return (type == TokenType::Asterisk
  123. || type == TokenType::Plus
  124. || type == TokenType::Questionmark
  125. || type == TokenType::LeftCurly);
  126. }
  127. ALWAYS_INLINE bool PosixExtendedParser::match_ordinary_characters()
  128. {
  129. // NOTE: This method must not be called during bracket and repetition parsing!
  130. // FIXME: Add assertion for that?
  131. auto type = m_parser_state.current_token.type();
  132. return (type == TokenType::Char
  133. || type == TokenType::Comma
  134. || type == TokenType::Slash
  135. || type == TokenType::EqualSign
  136. || type == TokenType::HyphenMinus
  137. || type == TokenType::Colon);
  138. }
  139. ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& bytecode_to_repeat, size_t& match_length_minimum)
  140. {
  141. if (match(TokenType::LeftCurly)) {
  142. consume();
  143. StringBuilder number_builder;
  144. while (match(TokenType::Char)) {
  145. number_builder.append(consume().value());
  146. }
  147. auto maybe_minimum = number_builder.build().to_uint();
  148. if (!maybe_minimum.has_value())
  149. return set_error(Error::InvalidBraceContent);
  150. auto minimum = maybe_minimum.value();
  151. match_length_minimum *= minimum;
  152. if (match(TokenType::Comma)) {
  153. consume();
  154. } else {
  155. ByteCode bytecode;
  156. bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
  157. bytecode_to_repeat = move(bytecode);
  158. consume(TokenType::RightCurly, Error::MismatchingBrace);
  159. return !has_error();
  160. }
  161. Optional<size_t> maybe_maximum {};
  162. number_builder.clear();
  163. while (match(TokenType::Char)) {
  164. number_builder.append(consume().value());
  165. }
  166. if (!number_builder.is_empty()) {
  167. auto value = number_builder.build().to_uint();
  168. if (!value.has_value() || minimum > value.value())
  169. return set_error(Error::InvalidBraceContent);
  170. maybe_maximum = value.value();
  171. }
  172. bytecode_to_repeat.insert_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum);
  173. consume(TokenType::RightCurly, Error::MismatchingBrace);
  174. return !has_error();
  175. } else if (match(TokenType::Plus)) {
  176. consume();
  177. bool nongreedy = match(TokenType::Questionmark);
  178. if (nongreedy)
  179. consume();
  180. // Note: dont touch match_length_minimum, it's already correct
  181. bytecode_to_repeat.insert_bytecode_repetition_min_one(bytecode_to_repeat, !nongreedy);
  182. return !has_error();
  183. } else if (match(TokenType::Asterisk)) {
  184. consume();
  185. match_length_minimum = 0;
  186. bool nongreedy = match(TokenType::Questionmark);
  187. if (nongreedy)
  188. consume();
  189. bytecode_to_repeat.insert_bytecode_repetition_any(bytecode_to_repeat, !nongreedy);
  190. return !has_error();
  191. } else if (match(TokenType::Questionmark)) {
  192. consume();
  193. match_length_minimum = 0;
  194. bool nongreedy = match(TokenType::Questionmark);
  195. if (nongreedy)
  196. consume();
  197. bytecode_to_repeat.insert_bytecode_repetition_zero_or_one(bytecode_to_repeat, !nongreedy);
  198. return !has_error();
  199. }
  200. return false;
  201. }
  202. ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack, size_t& match_length_minimum)
  203. {
  204. Vector<CompareTypeAndValuePair> values;
  205. for (;;) {
  206. if (match(TokenType::HyphenMinus)) {
  207. consume();
  208. if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
  209. // first in the bracket expression
  210. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  211. } else if (match(TokenType::RightBracket)) {
  212. // Last in the bracket expression
  213. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  214. } else if (values.last().type == CharacterCompareType::Char) {
  215. values.append({ CharacterCompareType::RangeExpressionDummy, 0 });
  216. if (match(TokenType::HyphenMinus)) {
  217. consume();
  218. // Valid range, add ordinary character
  219. values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
  220. }
  221. } else {
  222. return set_error(Error::InvalidRange);
  223. }
  224. } else if (match(TokenType::Char) || match(TokenType::Period) || match(TokenType::Asterisk) || match(TokenType::EscapeSequence) || match(TokenType::Plus)) {
  225. values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() });
  226. } else if (match(TokenType::Circumflex)) {
  227. auto t = consume();
  228. if (values.is_empty())
  229. values.append({ CharacterCompareType::Inverse, 0 });
  230. else
  231. values.append({ CharacterCompareType::Char, (ByteCodeValueType)*t.value().characters_without_null_termination() });
  232. } else if (match(TokenType::LeftBracket)) {
  233. consume();
  234. if (match(TokenType::Period)) {
  235. consume();
  236. // FIXME: Parse collating element, this is needed when we have locale support
  237. // This could have impact on length parameter, I guess.
  238. ASSERT_NOT_REACHED();
  239. consume(TokenType::Period, Error::InvalidCollationElement);
  240. consume(TokenType::RightBracket, Error::MismatchingBracket);
  241. } else if (match(TokenType::EqualSign)) {
  242. consume();
  243. // FIXME: Parse collating element, this is needed when we have locale support
  244. // This could have impact on length parameter, I guess.
  245. ASSERT_NOT_REACHED();
  246. consume(TokenType::EqualSign, Error::InvalidCollationElement);
  247. consume(TokenType::RightBracket, Error::MismatchingBracket);
  248. } else if (match(TokenType::Colon)) {
  249. consume();
  250. CharClass ch_class;
  251. // parse character class
  252. if (match(TokenType::Char)) {
  253. if (consume("alnum"))
  254. ch_class = CharClass::Alnum;
  255. else if (consume("alpha"))
  256. ch_class = CharClass::Alpha;
  257. else if (consume("blank"))
  258. ch_class = CharClass::Blank;
  259. else if (consume("cntrl"))
  260. ch_class = CharClass::Cntrl;
  261. else if (consume("digit"))
  262. ch_class = CharClass::Digit;
  263. else if (consume("graph"))
  264. ch_class = CharClass::Graph;
  265. else if (consume("lower"))
  266. ch_class = CharClass::Lower;
  267. else if (consume("print"))
  268. ch_class = CharClass::Print;
  269. else if (consume("punct"))
  270. ch_class = CharClass::Punct;
  271. else if (consume("space"))
  272. ch_class = CharClass::Space;
  273. else if (consume("upper"))
  274. ch_class = CharClass::Upper;
  275. else if (consume("xdigit"))
  276. ch_class = CharClass::Xdigit;
  277. else
  278. return set_error(Error::InvalidCharacterClass);
  279. values.append({ CharacterCompareType::CharClass, (ByteCodeValueType)ch_class });
  280. } else
  281. return set_error(Error::InvalidCharacterClass);
  282. // FIXME: we do not support locale specific character classes until locales are implemented
  283. consume(TokenType::Colon, Error::InvalidCharacterClass);
  284. consume(TokenType::RightBracket, Error::MismatchingBracket);
  285. } else {
  286. return set_error(Error::MismatchingBracket);
  287. }
  288. } else if (match(TokenType::RightBracket)) {
  289. if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
  290. // handle bracket as ordinary character
  291. values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() });
  292. } else {
  293. // closing bracket expression
  294. break;
  295. }
  296. } else
  297. // nothing matched, this is a failure, as at least the closing bracket must match...
  298. return set_error(Error::MismatchingBracket);
  299. // check if range expression has to be completed...
  300. if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) {
  301. if (values.last().type != CharacterCompareType::Char)
  302. return set_error(Error::InvalidRange);
  303. auto value2 = values.take_last();
  304. values.take_last(); // RangeExpressionDummy
  305. auto value1 = values.take_last();
  306. values.append({ CharacterCompareType::CharRange, static_cast<ByteCodeValueType>(CharRange { (u32)value1.value, (u32)value2.value }) });
  307. }
  308. }
  309. if (values.size())
  310. match_length_minimum = 1;
  311. if (values.first().type == CharacterCompareType::Inverse)
  312. match_length_minimum = 0;
  313. stack.insert_bytecode_compare_values(move(values));
  314. return !has_error();
  315. }
  316. ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, size_t& match_length_minimum)
  317. {
  318. ByteCode bytecode;
  319. size_t length = 0;
  320. bool should_parse_repetition_symbol { false };
  321. for (;;) {
  322. if (match_ordinary_characters()) {
  323. Token start_token = m_parser_state.current_token;
  324. Token last_token = m_parser_state.current_token;
  325. for (;;) {
  326. if (!match_ordinary_characters())
  327. break;
  328. ++length;
  329. last_token = consume();
  330. }
  331. if (length > 1) {
  332. // last character is inserted into 'bytecode' for duplication symbol handling
  333. auto new_length = length - ((match_repetition_symbol() && length > 1) ? 1 : 0);
  334. stack.insert_bytecode_compare_string(start_token.value(), new_length);
  335. }
  336. if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol
  337. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)last_token.value().characters_without_null_termination()[0] } });
  338. should_parse_repetition_symbol = true;
  339. break;
  340. }
  341. if (match_repetition_symbol())
  342. return set_error(Error::InvalidRepetitionMarker);
  343. if (match(TokenType::Period)) {
  344. length = 1;
  345. consume();
  346. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
  347. should_parse_repetition_symbol = true;
  348. break;
  349. }
  350. if (match(TokenType::EscapeSequence)) {
  351. length = 1;
  352. Token t = consume();
  353. #ifdef REGEX_DEBUG
  354. printf("[PARSER] EscapeSequence with substring %s\n", String(t.value()).characters());
  355. #endif
  356. bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u32)t.value().characters_without_null_termination()[1] } });
  357. should_parse_repetition_symbol = true;
  358. break;
  359. }
  360. if (match(TokenType::LeftBracket)) {
  361. consume();
  362. ByteCode sub_ops;
  363. if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size())
  364. return set_error(Error::InvalidBracketContent);
  365. bytecode.append(move(sub_ops));
  366. consume(TokenType::RightBracket, Error::MismatchingBracket);
  367. should_parse_repetition_symbol = true;
  368. break;
  369. }
  370. if (match(TokenType::RightBracket)) {
  371. return set_error(Error::MismatchingBracket);
  372. }
  373. if (match(TokenType::RightCurly)) {
  374. return set_error(Error::MismatchingBrace);
  375. }
  376. if (match(TokenType::Circumflex)) {
  377. consume();
  378. bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin);
  379. break;
  380. }
  381. if (match(TokenType::Dollar)) {
  382. consume();
  383. bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd);
  384. break;
  385. }
  386. if (match(TokenType::RightParen))
  387. return false;
  388. if (match(TokenType::LeftParen)) {
  389. consume();
  390. Optional<StringView> capture_group_name;
  391. bool prevent_capture_group = false;
  392. if (match(TokenType::Questionmark)) {
  393. consume();
  394. if (match(TokenType::Colon)) {
  395. consume();
  396. prevent_capture_group = true;
  397. } else if (consume("<")) { // named capturing group
  398. Token start_token = m_parser_state.current_token;
  399. Token last_token = m_parser_state.current_token;
  400. size_t capture_group_name_length = 0;
  401. for (;;) {
  402. if (!match_ordinary_characters())
  403. return set_error(Error::InvalidNameForCaptureGroup);
  404. if (match(TokenType::Char) && m_parser_state.current_token.value()[0] == '>') {
  405. consume();
  406. break;
  407. }
  408. ++capture_group_name_length;
  409. last_token = consume();
  410. }
  411. capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length);
  412. } else if (match(TokenType::EqualSign)) { // positive lookahead
  413. consume();
  414. ASSERT_NOT_REACHED();
  415. } else if (consume("!")) { // negative lookahead
  416. ASSERT_NOT_REACHED();
  417. } else if (consume("<")) {
  418. if (match(TokenType::EqualSign)) { // positive lookbehind
  419. consume();
  420. ASSERT_NOT_REACHED();
  421. }
  422. if (consume("!")) // negative lookbehind
  423. ASSERT_NOT_REACHED();
  424. } else {
  425. return set_error(Error::InvalidRepetitionMarker);
  426. }
  427. }
  428. if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
  429. if (capture_group_name.has_value())
  430. bytecode.insert_bytecode_group_capture_left(capture_group_name.value());
  431. else
  432. bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count);
  433. }
  434. ByteCode capture_group_bytecode;
  435. if (!parse_root(capture_group_bytecode, length))
  436. return set_error(Error::InvalidPattern);
  437. bytecode.append(move(capture_group_bytecode));
  438. consume(TokenType::RightParen, Error::MismatchingParen);
  439. if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
  440. if (capture_group_name.has_value()) {
  441. bytecode.insert_bytecode_group_capture_right(capture_group_name.value());
  442. ++m_parser_state.named_capture_groups_count;
  443. } else {
  444. bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count);
  445. ++m_parser_state.capture_groups_count;
  446. }
  447. }
  448. should_parse_repetition_symbol = true;
  449. break;
  450. }
  451. return false;
  452. }
  453. if (match_repetition_symbol()) {
  454. if (should_parse_repetition_symbol)
  455. parse_repetition_symbol(bytecode, length);
  456. else
  457. return set_error(Error::InvalidRepetitionMarker);
  458. }
  459. stack.append(move(bytecode));
  460. match_length_minimum += length;
  461. return true;
  462. }
  463. bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minimum)
  464. {
  465. ByteCode bytecode_left;
  466. size_t match_length_minimum_left { 0 };
  467. if (match_repetition_symbol())
  468. return set_error(Error::InvalidRepetitionMarker);
  469. for (;;) {
  470. if (!parse_sub_expression(bytecode_left, match_length_minimum_left))
  471. break;
  472. if (match(TokenType::Pipe)) {
  473. consume();
  474. ByteCode bytecode_right;
  475. size_t match_length_minimum_right { 0 };
  476. if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty())
  477. return set_error(Error::InvalidPattern);
  478. ByteCode new_bytecode;
  479. new_bytecode.insert_bytecode_alternation(move(bytecode_left), move(bytecode_right));
  480. bytecode_left = move(new_bytecode);
  481. match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left);
  482. }
  483. }
  484. if (bytecode_left.is_empty())
  485. set_error(Error::EmptySubExpression);
  486. stack.append(move(bytecode_left));
  487. match_length_minimum = match_length_minimum_left;
  488. return !has_error();
  489. }
  490. }