RegexByteCode.h 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexMatch.h"
  8. #include "RegexOptions.h"
  9. #include <AK/Format.h>
  10. #include <AK/Forward.h>
  11. #include <AK/HashMap.h>
  12. #include <AK/NonnullOwnPtr.h>
  13. #include <AK/OwnPtr.h>
  14. #include <AK/Traits.h>
  15. #include <AK/TypeCasts.h>
  16. #include <AK/Types.h>
  17. #include <AK/Vector.h>
  18. #include <LibUnicode/Forward.h>
  19. namespace regex {
  20. using ByteCodeValueType = u64;
  21. #define ENUMERATE_OPCODES \
  22. __ENUMERATE_OPCODE(Compare) \
  23. __ENUMERATE_OPCODE(Jump) \
  24. __ENUMERATE_OPCODE(JumpNonEmpty) \
  25. __ENUMERATE_OPCODE(ForkJump) \
  26. __ENUMERATE_OPCODE(ForkStay) \
  27. __ENUMERATE_OPCODE(FailForks) \
  28. __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
  29. __ENUMERATE_OPCODE(SaveRightCaptureGroup) \
  30. __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
  31. __ENUMERATE_OPCODE(CheckBegin) \
  32. __ENUMERATE_OPCODE(CheckEnd) \
  33. __ENUMERATE_OPCODE(CheckBoundary) \
  34. __ENUMERATE_OPCODE(Save) \
  35. __ENUMERATE_OPCODE(Restore) \
  36. __ENUMERATE_OPCODE(GoBack) \
  37. __ENUMERATE_OPCODE(ClearCaptureGroup) \
  38. __ENUMERATE_OPCODE(Repeat) \
  39. __ENUMERATE_OPCODE(ResetRepeat) \
  40. __ENUMERATE_OPCODE(Checkpoint) \
  41. __ENUMERATE_OPCODE(Exit)
  42. // clang-format off
  43. enum class OpCodeId : ByteCodeValueType {
  44. #define __ENUMERATE_OPCODE(x) x,
  45. ENUMERATE_OPCODES
  46. #undef __ENUMERATE_OPCODE
  47. First = Compare,
  48. Last = Exit,
  49. };
  50. // clang-format on
  51. #define ENUMERATE_CHARACTER_COMPARE_TYPES \
  52. __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
  53. __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
  54. __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \
  55. __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
  56. __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
  57. __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
  58. __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
  59. __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
  60. __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
  61. __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
  62. __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
  63. __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
  64. __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \
  65. __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
  66. enum class CharacterCompareType : ByteCodeValueType {
  67. #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
  68. ENUMERATE_CHARACTER_COMPARE_TYPES
  69. #undef __ENUMERATE_CHARACTER_COMPARE_TYPE
  70. };
  71. #define ENUMERATE_CHARACTER_CLASSES \
  72. __ENUMERATE_CHARACTER_CLASS(Alnum) \
  73. __ENUMERATE_CHARACTER_CLASS(Cntrl) \
  74. __ENUMERATE_CHARACTER_CLASS(Lower) \
  75. __ENUMERATE_CHARACTER_CLASS(Space) \
  76. __ENUMERATE_CHARACTER_CLASS(Alpha) \
  77. __ENUMERATE_CHARACTER_CLASS(Digit) \
  78. __ENUMERATE_CHARACTER_CLASS(Print) \
  79. __ENUMERATE_CHARACTER_CLASS(Upper) \
  80. __ENUMERATE_CHARACTER_CLASS(Blank) \
  81. __ENUMERATE_CHARACTER_CLASS(Graph) \
  82. __ENUMERATE_CHARACTER_CLASS(Punct) \
  83. __ENUMERATE_CHARACTER_CLASS(Word) \
  84. __ENUMERATE_CHARACTER_CLASS(Xdigit)
  85. enum class CharClass : ByteCodeValueType {
  86. #define __ENUMERATE_CHARACTER_CLASS(x) x,
  87. ENUMERATE_CHARACTER_CLASSES
  88. #undef __ENUMERATE_CHARACTER_CLASS
  89. };
  90. #define ENUMERATE_BOUNDARY_CHECK_TYPES \
  91. __ENUMERATE_BOUNDARY_CHECK_TYPE(Word) \
  92. __ENUMERATE_BOUNDARY_CHECK_TYPE(NonWord)
  93. enum class BoundaryCheckType : ByteCodeValueType {
  94. #define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) x,
  95. ENUMERATE_BOUNDARY_CHECK_TYPES
  96. #undef __ENUMERATE_BOUNDARY_CHECK_TYPE
  97. };
  98. struct CharRange {
  99. u32 const from;
  100. u32 const to;
  101. CharRange(u64 value)
  102. : from(value >> 32)
  103. , to(value & 0xffffffff)
  104. {
  105. }
  106. CharRange(u32 from, u32 to)
  107. : from(from)
  108. , to(to)
  109. {
  110. }
  111. operator ByteCodeValueType() const { return ((u64)from << 32) | to; }
  112. };
  113. struct CompareTypeAndValuePair {
  114. CharacterCompareType type;
  115. ByteCodeValueType value;
  116. };
  117. class OpCode;
  118. class ByteCode : public Vector<ByteCodeValueType> {
  119. public:
  120. ByteCode()
  121. {
  122. ensure_opcodes_initialized();
  123. }
  124. ByteCode(ByteCode const&) = default;
  125. virtual ~ByteCode() = default;
  126. ByteCode& operator=(ByteCode&&) = default;
  127. void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
  128. {
  129. ByteCode bytecode;
  130. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
  131. bytecode.empend(pairs.size()); // number of arguments
  132. ByteCode arguments;
  133. for (auto& value : pairs) {
  134. VERIFY(value.type != CharacterCompareType::RangeExpressionDummy);
  135. VERIFY(value.type != CharacterCompareType::Undefined);
  136. VERIFY(value.type != CharacterCompareType::String);
  137. arguments.append((ByteCodeValueType)value.type);
  138. if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse)
  139. arguments.append(move(value.value));
  140. }
  141. bytecode.empend(arguments.size()); // size of arguments
  142. bytecode.extend(move(arguments));
  143. extend(move(bytecode));
  144. }
  145. void insert_bytecode_check_boundary(BoundaryCheckType type)
  146. {
  147. ByteCode bytecode;
  148. bytecode.empend((ByteCodeValueType)OpCodeId::CheckBoundary);
  149. bytecode.empend((ByteCodeValueType)type);
  150. extend(move(bytecode));
  151. }
  152. void insert_bytecode_clear_capture_group(size_t index)
  153. {
  154. empend(static_cast<ByteCodeValueType>(OpCodeId::ClearCaptureGroup));
  155. empend(index);
  156. }
  157. void insert_bytecode_compare_string(StringView view)
  158. {
  159. ByteCode bytecode;
  160. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
  161. bytecode.empend(static_cast<u64>(1)); // number of arguments
  162. ByteCode arguments;
  163. arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::String));
  164. arguments.insert_string(view);
  165. bytecode.empend(arguments.size()); // size of arguments
  166. bytecode.extend(move(arguments));
  167. extend(move(bytecode));
  168. }
  169. void insert_bytecode_group_capture_left(size_t capture_groups_count)
  170. {
  171. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup));
  172. empend(capture_groups_count);
  173. }
  174. void insert_bytecode_group_capture_right(size_t capture_groups_count)
  175. {
  176. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup));
  177. empend(capture_groups_count);
  178. }
  179. void insert_bytecode_group_capture_right(size_t capture_groups_count, StringView const& name)
  180. {
  181. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup));
  182. empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
  183. empend(name.length());
  184. empend(capture_groups_count);
  185. }
  186. enum class LookAroundType {
  187. LookAhead,
  188. LookBehind,
  189. NegatedLookAhead,
  190. NegatedLookBehind,
  191. };
  192. void insert_bytecode_lookaround(ByteCode&& lookaround_body, LookAroundType type, size_t match_length = 0)
  193. {
  194. // FIXME: The save stack will grow infinitely with repeated failures
  195. // as we do not discard that on failure (we don't necessarily know how many to pop with the current architecture).
  196. switch (type) {
  197. case LookAroundType::LookAhead: {
  198. // SAVE
  199. // REGEXP BODY
  200. // RESTORE
  201. empend((ByteCodeValueType)OpCodeId::Save);
  202. extend(move(lookaround_body));
  203. empend((ByteCodeValueType)OpCodeId::Restore);
  204. return;
  205. }
  206. case LookAroundType::NegatedLookAhead: {
  207. // JUMP _A
  208. // LABEL _L
  209. // REGEXP BODY
  210. // FAIL 2
  211. // LABEL _A
  212. // SAVE
  213. // FORKJUMP _L
  214. // RESTORE
  215. auto body_length = lookaround_body.size();
  216. empend((ByteCodeValueType)OpCodeId::Jump);
  217. empend((ByteCodeValueType)body_length + 2); // JUMP to label _A
  218. extend(move(lookaround_body));
  219. empend((ByteCodeValueType)OpCodeId::FailForks);
  220. empend((ByteCodeValueType)2); // Fail two forks
  221. empend((ByteCodeValueType)OpCodeId::Save);
  222. empend((ByteCodeValueType)OpCodeId::ForkJump);
  223. empend((ByteCodeValueType) - (body_length + 5)); // JUMP to label _L
  224. empend((ByteCodeValueType)OpCodeId::Restore);
  225. return;
  226. }
  227. case LookAroundType::LookBehind:
  228. // SAVE
  229. // GOBACK match_length(BODY)
  230. // REGEXP BODY
  231. // RESTORE
  232. empend((ByteCodeValueType)OpCodeId::Save);
  233. empend((ByteCodeValueType)OpCodeId::GoBack);
  234. empend((ByteCodeValueType)match_length);
  235. extend(move(lookaround_body));
  236. empend((ByteCodeValueType)OpCodeId::Restore);
  237. return;
  238. case LookAroundType::NegatedLookBehind: {
  239. // JUMP _A
  240. // LABEL _L
  241. // GOBACK match_length(BODY)
  242. // REGEXP BODY
  243. // FAIL 2
  244. // LABEL _A
  245. // SAVE
  246. // FORKJUMP _L
  247. // RESTORE
  248. auto body_length = lookaround_body.size();
  249. empend((ByteCodeValueType)OpCodeId::Jump);
  250. empend((ByteCodeValueType)body_length + 4); // JUMP to label _A
  251. empend((ByteCodeValueType)OpCodeId::GoBack);
  252. empend((ByteCodeValueType)match_length);
  253. extend(move(lookaround_body));
  254. empend((ByteCodeValueType)OpCodeId::FailForks);
  255. empend((ByteCodeValueType)2); // Fail two forks
  256. empend((ByteCodeValueType)OpCodeId::Save);
  257. empend((ByteCodeValueType)OpCodeId::ForkJump);
  258. empend((ByteCodeValueType) - (body_length + 7)); // JUMP to label _L
  259. empend((ByteCodeValueType)OpCodeId::Restore);
  260. return;
  261. }
  262. }
  263. VERIFY_NOT_REACHED();
  264. }
  265. void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right)
  266. {
  267. // FORKJUMP _ALT
  268. // REGEXP ALT2
  269. // JUMP _END
  270. // LABEL _ALT
  271. // REGEXP ALT1
  272. // LABEL _END
  273. ByteCode byte_code;
  274. empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  275. empend(right.size() + 2); // Jump to the _ALT label
  276. extend(right);
  277. empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
  278. empend(left.size()); // Jump to the _END label
  279. // LABEL _ALT = bytecode.size() + 2
  280. extend(left);
  281. // LABEL _END = alterantive_bytecode.size
  282. }
  283. template<typename T>
  284. static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, T minimum, Optional<T> maximum, size_t min_repetition_mark_id, size_t max_repetition_mark_id, bool greedy = true) requires(IsIntegral<T>)
  285. {
  286. ByteCode new_bytecode;
  287. new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, min_repetition_mark_id);
  288. if (maximum.has_value()) {
  289. // (REPEAT REGEXP MIN)
  290. // LABEL _MAX_LOOP |
  291. // FORK END |
  292. // REGEXP |
  293. // REPEAT _MAX_LOOP MAX-MIN | if max > min
  294. // FORK END |
  295. // REGEXP |
  296. // LABEL END |
  297. // RESET _MAX_LOOP |
  298. auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump);
  299. if (maximum.value() > minimum) {
  300. new_bytecode.empend(jump_kind);
  301. new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
  302. auto pre_loop_fork_jump_index = new_bytecode.size();
  303. new_bytecode.extend(bytecode_to_repeat);
  304. auto repetitions = maximum.value() - minimum;
  305. auto fork_jump_address = new_bytecode.size();
  306. if (repetitions > 1) {
  307. new_bytecode.empend((ByteCodeValueType)OpCodeId::Repeat);
  308. new_bytecode.empend(bytecode_to_repeat.size() + 2);
  309. new_bytecode.empend(static_cast<ByteCodeValueType>(repetitions - 1));
  310. new_bytecode.empend(max_repetition_mark_id);
  311. new_bytecode.empend(jump_kind);
  312. new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
  313. auto post_loop_fork_jump_index = new_bytecode.size();
  314. new_bytecode.extend(bytecode_to_repeat);
  315. fork_jump_address = new_bytecode.size();
  316. new_bytecode[post_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - post_loop_fork_jump_index);
  317. new_bytecode.empend((ByteCodeValueType)OpCodeId::ResetRepeat);
  318. new_bytecode.empend((ByteCodeValueType)max_repetition_mark_id);
  319. }
  320. new_bytecode[pre_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - pre_loop_fork_jump_index);
  321. }
  322. } else {
  323. // no maximum value set, repeat finding if possible:
  324. // (REPEAT REGEXP MIN)
  325. // LABEL _START
  326. // CHECKPOINT _C
  327. // REGEXP
  328. // JUMP_NONEMPTY _C _START FORK
  329. // Note: This is only safe because REPEAT will leave one iteration outside (see repetition_n)
  330. new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)OpCodeId::Checkpoint);
  331. auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkJump : OpCodeId::ForkStay);
  332. new_bytecode.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty);
  333. new_bytecode.empend(-bytecode_to_repeat.size() - 4 - 1); // Jump to the last iteration
  334. new_bytecode.empend(-bytecode_to_repeat.size() - 4 - 1); // if _C is not empty.
  335. new_bytecode.empend(jump_kind);
  336. }
  337. bytecode_to_repeat = move(new_bytecode);
  338. }
  339. template<typename T>
  340. void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, T n, size_t repetition_mark_id) requires(IsIntegral<T>)
  341. {
  342. // LABEL _LOOP
  343. // REGEXP
  344. // REPEAT _LOOP N-1
  345. // REGEXP
  346. if (n == 0)
  347. return;
  348. // Note: this bytecode layout allows callers to repeat the last REGEXP instruction without the
  349. // REPEAT instruction forcing another loop.
  350. extend(bytecode_to_repeat);
  351. if (n > 1) {
  352. empend(static_cast<ByteCodeValueType>(OpCodeId::Repeat));
  353. empend(bytecode_to_repeat.size());
  354. empend(static_cast<ByteCodeValueType>(n - 1));
  355. empend(repetition_mark_id);
  356. extend(bytecode_to_repeat);
  357. }
  358. }
  359. static void transform_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy)
  360. {
  361. // LABEL _START = -bytecode_to_repeat.size()
  362. // CHECKPOINT _C
  363. // REGEXP
  364. // JUMP_NONEMPTY _C _START FORKSTAY (FORKJUMP -> Greedy)
  365. bytecode_to_repeat.prepend((ByteCodeValueType)OpCodeId::Checkpoint);
  366. bytecode_to_repeat.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty);
  367. bytecode_to_repeat.empend(-bytecode_to_repeat.size() - 3); // Jump to the _START label...
  368. bytecode_to_repeat.empend(-bytecode_to_repeat.size() - 2); // ...if _C is not empty
  369. if (greedy)
  370. bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  371. else
  372. bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  373. }
  374. static void transform_bytecode_repetition_any(ByteCode& bytecode_to_repeat, bool greedy)
  375. {
  376. // LABEL _START
  377. // FORKJUMP _END (FORKSTAY -> Greedy)
  378. // CHECKPOINT _C
  379. // REGEXP
  380. // JUMP_NONEMPTY _C _START JUMP
  381. // LABEL _END
  382. // LABEL _START = m_bytes.size();
  383. ByteCode bytecode;
  384. if (greedy)
  385. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  386. else
  387. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  388. bytecode.empend(bytecode_to_repeat.size() + 1 + 4); // Jump to the _END label
  389. auto c_label = bytecode.size();
  390. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Checkpoint));
  391. bytecode.extend(bytecode_to_repeat);
  392. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::JumpNonEmpty));
  393. bytecode.empend(-bytecode.size() - 3); // Jump(...) to the _START label...
  394. bytecode.empend(c_label - bytecode.size() - 2); // ...only if _C passes.
  395. bytecode.empend((ByteCodeValueType)OpCodeId::Jump);
  396. // LABEL _END = bytecode.size()
  397. bytecode_to_repeat = move(bytecode);
  398. }
  399. static void transform_bytecode_repetition_zero_or_one(ByteCode& bytecode_to_repeat, bool greedy)
  400. {
  401. // FORKJUMP _END (FORKSTAY -> Greedy)
  402. // REGEXP
  403. // LABEL _END
  404. ByteCode bytecode;
  405. if (greedy)
  406. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  407. else
  408. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  409. bytecode.empend(bytecode_to_repeat.size()); // Jump to the _END label
  410. for (auto& op : bytecode_to_repeat)
  411. bytecode.append(move(op));
  412. // LABEL _END = bytecode.size()
  413. bytecode_to_repeat = move(bytecode);
  414. }
  415. OpCode& get_opcode(MatchState& state) const;
  416. private:
  417. void insert_string(StringView const& view)
  418. {
  419. empend((ByteCodeValueType)view.length());
  420. for (size_t i = 0; i < view.length(); ++i)
  421. empend((ByteCodeValueType)view[i]);
  422. }
  423. void ensure_opcodes_initialized();
  424. ALWAYS_INLINE OpCode& get_opcode_by_id(OpCodeId id) const;
  425. static OwnPtr<OpCode> s_opcodes[(size_t)OpCodeId::Last + 1];
  426. static bool s_opcodes_initialized;
  427. };
  428. #define ENUMERATE_EXECUTION_RESULTS \
  429. __ENUMERATE_EXECUTION_RESULT(Continue) \
  430. __ENUMERATE_EXECUTION_RESULT(Fork_PrioHigh) \
  431. __ENUMERATE_EXECUTION_RESULT(Fork_PrioLow) \
  432. __ENUMERATE_EXECUTION_RESULT(Failed) \
  433. __ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForks) \
  434. __ENUMERATE_EXECUTION_RESULT(Succeeded)
  435. enum class ExecutionResult : u8 {
  436. #define __ENUMERATE_EXECUTION_RESULT(x) x,
  437. ENUMERATE_EXECUTION_RESULTS
  438. #undef __ENUMERATE_EXECUTION_RESULT
  439. };
  440. char const* execution_result_name(ExecutionResult result);
  441. char const* opcode_id_name(OpCodeId opcode_id);
  442. char const* boundary_check_type_name(BoundaryCheckType);
  443. char const* character_compare_type_name(CharacterCompareType result);
  444. char const* execution_result_name(ExecutionResult result);
  445. class OpCode {
  446. public:
  447. OpCode() = default;
  448. virtual ~OpCode() = default;
  449. virtual OpCodeId opcode_id() const = 0;
  450. virtual size_t size() const = 0;
  451. virtual ExecutionResult execute(MatchInput const& input, MatchState& state) const = 0;
  452. ALWAYS_INLINE ByteCodeValueType argument(size_t offset) const
  453. {
  454. VERIFY(state().instruction_position + offset <= m_bytecode->size());
  455. return m_bytecode->at(state().instruction_position + 1 + offset);
  456. }
  457. ALWAYS_INLINE char const* name() const;
  458. static char const* name(OpCodeId const);
  459. ALWAYS_INLINE void set_state(MatchState& state) { m_state = &state; }
  460. ALWAYS_INLINE void set_bytecode(ByteCode& bytecode) { m_bytecode = &bytecode; }
  461. ALWAYS_INLINE MatchState const& state() const
  462. {
  463. VERIFY(m_state);
  464. return *m_state;
  465. }
  466. String const to_string() const
  467. {
  468. return String::formatted("[{:#02X}] {}", (int)opcode_id(), name(opcode_id()));
  469. }
  470. virtual String const arguments_string() const = 0;
  471. ALWAYS_INLINE ByteCode const& bytecode() const { return *m_bytecode; }
  472. protected:
  473. ByteCode* m_bytecode { nullptr };
  474. MatchState* m_state { nullptr };
  475. };
  476. class OpCode_Exit final : public OpCode {
  477. public:
  478. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  479. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Exit; }
  480. ALWAYS_INLINE size_t size() const override { return 1; }
  481. String const arguments_string() const override { return ""; }
  482. };
  483. class OpCode_FailForks final : public OpCode {
  484. public:
  485. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  486. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::FailForks; }
  487. ALWAYS_INLINE size_t size() const override { return 2; }
  488. ALWAYS_INLINE size_t count() const { return argument(0); }
  489. String const arguments_string() const override { return String::formatted("count={}", count()); }
  490. };
  491. class OpCode_Save final : public OpCode {
  492. public:
  493. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  494. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Save; }
  495. ALWAYS_INLINE size_t size() const override { return 1; }
  496. String const arguments_string() const override { return ""; }
  497. };
  498. class OpCode_Restore final : public OpCode {
  499. public:
  500. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  501. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Restore; }
  502. ALWAYS_INLINE size_t size() const override { return 1; }
  503. String const arguments_string() const override { return ""; }
  504. };
  505. class OpCode_GoBack final : public OpCode {
  506. public:
  507. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  508. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::GoBack; }
  509. ALWAYS_INLINE size_t size() const override { return 2; }
  510. ALWAYS_INLINE size_t count() const { return argument(0); }
  511. String const arguments_string() const override { return String::formatted("count={}", count()); }
  512. };
  513. class OpCode_Jump final : public OpCode {
  514. public:
  515. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  516. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Jump; }
  517. ALWAYS_INLINE size_t size() const override { return 2; }
  518. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  519. String const arguments_string() const override
  520. {
  521. return String::formatted("offset={} [&{}]", offset(), state().instruction_position + size() + offset());
  522. }
  523. };
  524. class OpCode_ForkJump final : public OpCode {
  525. public:
  526. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  527. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkJump; }
  528. ALWAYS_INLINE size_t size() const override { return 2; }
  529. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  530. String const arguments_string() const override
  531. {
  532. return String::formatted("offset={} [&{}], sp: {}", offset(), state().instruction_position + size() + offset(), state().string_position);
  533. }
  534. };
  535. class OpCode_ForkStay final : public OpCode {
  536. public:
  537. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  538. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkStay; }
  539. ALWAYS_INLINE size_t size() const override { return 2; }
  540. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  541. String const arguments_string() const override
  542. {
  543. return String::formatted("offset={} [&{}], sp: {}", offset(), state().instruction_position + size() + offset(), state().string_position);
  544. }
  545. };
  546. class OpCode_CheckBegin final : public OpCode {
  547. public:
  548. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  549. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBegin; }
  550. ALWAYS_INLINE size_t size() const override { return 1; }
  551. String const arguments_string() const override { return ""; }
  552. };
  553. class OpCode_CheckEnd final : public OpCode {
  554. public:
  555. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  556. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckEnd; }
  557. ALWAYS_INLINE size_t size() const override { return 1; }
  558. String const arguments_string() const override { return ""; }
  559. };
  560. class OpCode_CheckBoundary final : public OpCode {
  561. public:
  562. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  563. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBoundary; }
  564. ALWAYS_INLINE size_t size() const override { return 2; }
  565. ALWAYS_INLINE size_t arguments_count() const { return 1; }
  566. ALWAYS_INLINE BoundaryCheckType type() const { return static_cast<BoundaryCheckType>(argument(0)); }
  567. String const arguments_string() const override { return String::formatted("kind={} ({})", (long unsigned int)argument(0), boundary_check_type_name(type())); }
  568. };
  569. class OpCode_ClearCaptureGroup final : public OpCode {
  570. public:
  571. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  572. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearCaptureGroup; }
  573. ALWAYS_INLINE size_t size() const override { return 2; }
  574. ALWAYS_INLINE size_t id() const { return argument(0); }
  575. String const arguments_string() const override { return String::formatted("id={}", id()); }
  576. };
  577. class OpCode_SaveLeftCaptureGroup final : public OpCode {
  578. public:
  579. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  580. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftCaptureGroup; }
  581. ALWAYS_INLINE size_t size() const override { return 2; }
  582. ALWAYS_INLINE size_t id() const { return argument(0); }
  583. String const arguments_string() const override { return String::formatted("id={}", id()); }
  584. };
  585. class OpCode_SaveRightCaptureGroup final : public OpCode {
  586. public:
  587. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  588. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightCaptureGroup; }
  589. ALWAYS_INLINE size_t size() const override { return 2; }
  590. ALWAYS_INLINE size_t id() const { return argument(0); }
  591. String const arguments_string() const override { return String::formatted("id={}", id()); }
  592. };
  593. class OpCode_SaveRightNamedCaptureGroup final : public OpCode {
  594. public:
  595. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  596. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; }
  597. ALWAYS_INLINE size_t size() const override { return 4; }
  598. ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
  599. ALWAYS_INLINE size_t length() const { return argument(1); }
  600. ALWAYS_INLINE size_t id() const { return argument(2); }
  601. String const arguments_string() const override
  602. {
  603. return String::formatted("name={}, length={}", name(), length());
  604. }
  605. };
  606. class OpCode_Compare final : public OpCode {
  607. public:
  608. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  609. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Compare; }
  610. ALWAYS_INLINE size_t size() const override { return arguments_size() + 3; }
  611. ALWAYS_INLINE size_t arguments_count() const { return argument(0); }
  612. ALWAYS_INLINE size_t arguments_size() const { return argument(1); }
  613. String const arguments_string() const override;
  614. Vector<String> const variable_arguments_to_string(Optional<MatchInput> input = {}) const;
  615. private:
  616. ALWAYS_INLINE static void compare_char(MatchInput const& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched);
  617. ALWAYS_INLINE static bool compare_string(MatchInput const& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match);
  618. ALWAYS_INLINE static void compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
  619. ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
  620. ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
  621. ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
  622. ALWAYS_INLINE static void compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
  623. ALWAYS_INLINE static void compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
  624. };
  625. class OpCode_Repeat : public OpCode {
  626. public:
  627. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  628. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Repeat; }
  629. ALWAYS_INLINE size_t size() const override { return 4; }
  630. ALWAYS_INLINE size_t offset() const { return argument(0); }
  631. ALWAYS_INLINE u64 count() const { return argument(1); }
  632. ALWAYS_INLINE size_t id() const { return argument(2); }
  633. String const arguments_string() const override
  634. {
  635. auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0;
  636. return String::formatted("offset={} count={} id={} rep={}, sp: {}", offset(), count() + 1, id(), reps + 1, state().string_position);
  637. }
  638. };
  639. class OpCode_ResetRepeat : public OpCode {
  640. public:
  641. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  642. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ResetRepeat; }
  643. ALWAYS_INLINE size_t size() const override { return 2; }
  644. ALWAYS_INLINE size_t id() const { return argument(0); }
  645. String const arguments_string() const override
  646. {
  647. auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0;
  648. return String::formatted("id={} rep={}", id(), reps + 1);
  649. }
  650. };
  651. class OpCode_Checkpoint final : public OpCode {
  652. public:
  653. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  654. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Checkpoint; }
  655. ALWAYS_INLINE size_t size() const override { return 1; }
  656. String const arguments_string() const override { return ""; }
  657. };
  658. class OpCode_JumpNonEmpty final : public OpCode {
  659. public:
  660. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  661. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::JumpNonEmpty; }
  662. ALWAYS_INLINE size_t size() const override { return 4; }
  663. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  664. ALWAYS_INLINE ssize_t checkpoint() const { return argument(1); }
  665. ALWAYS_INLINE OpCodeId form() const { return (OpCodeId)argument(2); }
  666. String const arguments_string() const override
  667. {
  668. return String::formatted("{} offset={} [&{}], cp={} [&{}]",
  669. opcode_id_name(form()),
  670. offset(), state().instruction_position + size() + offset(),
  671. checkpoint(), state().instruction_position + size() + checkpoint());
  672. }
  673. };
  674. template<typename T>
  675. bool is(OpCode const&);
  676. template<typename T>
  677. ALWAYS_INLINE bool is(OpCode const&)
  678. {
  679. return false;
  680. }
  681. template<typename T>
  682. ALWAYS_INLINE bool is(OpCode const* opcode)
  683. {
  684. return is<T>(*opcode);
  685. }
  686. template<>
  687. ALWAYS_INLINE bool is<OpCode_ForkStay>(OpCode const& opcode)
  688. {
  689. return opcode.opcode_id() == OpCodeId::ForkStay;
  690. }
  691. template<>
  692. ALWAYS_INLINE bool is<OpCode_Exit>(OpCode const& opcode)
  693. {
  694. return opcode.opcode_id() == OpCodeId::Exit;
  695. }
  696. template<>
  697. ALWAYS_INLINE bool is<OpCode_Compare>(OpCode const& opcode)
  698. {
  699. return opcode.opcode_id() == OpCodeId::Compare;
  700. }
  701. template<typename T>
  702. ALWAYS_INLINE T const& to(OpCode const& opcode)
  703. {
  704. return verify_cast<T>(opcode);
  705. }
  706. template<typename T>
  707. ALWAYS_INLINE T* to(OpCode* opcode)
  708. {
  709. return verify_cast<T>(opcode);
  710. }
  711. template<typename T>
  712. ALWAYS_INLINE T const* to(OpCode const* opcode)
  713. {
  714. return verify_cast<T>(opcode);
  715. }
  716. template<typename T>
  717. ALWAYS_INLINE T& to(OpCode& opcode)
  718. {
  719. return verify_cast<T>(opcode);
  720. }
  721. }