RegexByteCode.h 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexBytecodeStreamOptimizer.h"
  8. #include "RegexMatch.h"
  9. #include "RegexOptions.h"
  10. #include <AK/DisjointChunks.h>
  11. #include <AK/Format.h>
  12. #include <AK/Forward.h>
  13. #include <AK/HashMap.h>
  14. #include <AK/NonnullOwnPtr.h>
  15. #include <AK/OwnPtr.h>
  16. #include <AK/Traits.h>
  17. #include <AK/TypeCasts.h>
  18. #include <AK/Types.h>
  19. #include <AK/Vector.h>
  20. #include <LibUnicode/Forward.h>
  21. namespace regex {
  22. using ByteCodeValueType = u64;
  23. #define ENUMERATE_OPCODES \
  24. __ENUMERATE_OPCODE(Compare) \
  25. __ENUMERATE_OPCODE(Jump) \
  26. __ENUMERATE_OPCODE(JumpNonEmpty) \
  27. __ENUMERATE_OPCODE(ForkJump) \
  28. __ENUMERATE_OPCODE(ForkStay) \
  29. __ENUMERATE_OPCODE(ForkReplaceJump) \
  30. __ENUMERATE_OPCODE(ForkReplaceStay) \
  31. __ENUMERATE_OPCODE(FailForks) \
  32. __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
  33. __ENUMERATE_OPCODE(SaveRightCaptureGroup) \
  34. __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
  35. __ENUMERATE_OPCODE(CheckBegin) \
  36. __ENUMERATE_OPCODE(CheckEnd) \
  37. __ENUMERATE_OPCODE(CheckBoundary) \
  38. __ENUMERATE_OPCODE(Save) \
  39. __ENUMERATE_OPCODE(Restore) \
  40. __ENUMERATE_OPCODE(GoBack) \
  41. __ENUMERATE_OPCODE(ClearCaptureGroup) \
  42. __ENUMERATE_OPCODE(Repeat) \
  43. __ENUMERATE_OPCODE(ResetRepeat) \
  44. __ENUMERATE_OPCODE(Checkpoint) \
  45. __ENUMERATE_OPCODE(Exit)
  46. // clang-format off
  47. enum class OpCodeId : ByteCodeValueType {
  48. #define __ENUMERATE_OPCODE(x) x,
  49. ENUMERATE_OPCODES
  50. #undef __ENUMERATE_OPCODE
  51. First = Compare,
  52. Last = Exit,
  53. };
  54. // clang-format on
  55. #define ENUMERATE_CHARACTER_COMPARE_TYPES \
  56. __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
  57. __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
  58. __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \
  59. __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
  60. __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
  61. __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
  62. __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
  63. __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
  64. __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
  65. __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
  66. __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
  67. __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
  68. __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \
  69. __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) \
  70. __ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable)
  71. enum class CharacterCompareType : ByteCodeValueType {
  72. #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
  73. ENUMERATE_CHARACTER_COMPARE_TYPES
  74. #undef __ENUMERATE_CHARACTER_COMPARE_TYPE
  75. };
  76. #define ENUMERATE_CHARACTER_CLASSES \
  77. __ENUMERATE_CHARACTER_CLASS(Alnum) \
  78. __ENUMERATE_CHARACTER_CLASS(Cntrl) \
  79. __ENUMERATE_CHARACTER_CLASS(Lower) \
  80. __ENUMERATE_CHARACTER_CLASS(Space) \
  81. __ENUMERATE_CHARACTER_CLASS(Alpha) \
  82. __ENUMERATE_CHARACTER_CLASS(Digit) \
  83. __ENUMERATE_CHARACTER_CLASS(Print) \
  84. __ENUMERATE_CHARACTER_CLASS(Upper) \
  85. __ENUMERATE_CHARACTER_CLASS(Blank) \
  86. __ENUMERATE_CHARACTER_CLASS(Graph) \
  87. __ENUMERATE_CHARACTER_CLASS(Punct) \
  88. __ENUMERATE_CHARACTER_CLASS(Word) \
  89. __ENUMERATE_CHARACTER_CLASS(Xdigit)
  90. enum class CharClass : ByteCodeValueType {
  91. #define __ENUMERATE_CHARACTER_CLASS(x) x,
  92. ENUMERATE_CHARACTER_CLASSES
  93. #undef __ENUMERATE_CHARACTER_CLASS
  94. };
  95. #define ENUMERATE_BOUNDARY_CHECK_TYPES \
  96. __ENUMERATE_BOUNDARY_CHECK_TYPE(Word) \
  97. __ENUMERATE_BOUNDARY_CHECK_TYPE(NonWord)
  98. enum class BoundaryCheckType : ByteCodeValueType {
  99. #define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) x,
  100. ENUMERATE_BOUNDARY_CHECK_TYPES
  101. #undef __ENUMERATE_BOUNDARY_CHECK_TYPE
  102. };
  103. struct CharRange {
  104. u32 const from;
  105. u32 const to;
  106. CharRange(u64 value)
  107. : from(value >> 32)
  108. , to(value & 0xffffffff)
  109. {
  110. }
  111. CharRange(u32 from, u32 to)
  112. : from(from)
  113. , to(to)
  114. {
  115. }
  116. operator ByteCodeValueType() const { return ((u64)from << 32) | to; }
  117. };
  118. struct CompareTypeAndValuePair {
  119. CharacterCompareType type;
  120. ByteCodeValueType value;
  121. };
  122. class OpCode;
  123. class ByteCode : public DisjointChunks<ByteCodeValueType> {
  124. using Base = DisjointChunks<ByteCodeValueType>;
  125. public:
  126. ByteCode()
  127. {
  128. ensure_opcodes_initialized();
  129. }
  130. ByteCode(ByteCode const&) = default;
  131. virtual ~ByteCode() = default;
  132. ByteCode& operator=(ByteCode&&) = default;
  133. ByteCode& operator=(Base&& value)
  134. {
  135. static_cast<Base&>(*this) = move(value);
  136. return *this;
  137. }
  138. template<typename... Args>
  139. void empend(Args&&... args)
  140. {
  141. if (is_empty())
  142. Base::append({});
  143. Base::last_chunk().empend(forward<Args>(args)...);
  144. }
  145. template<typename T>
  146. void append(T&& value)
  147. {
  148. if (is_empty())
  149. Base::append({});
  150. Base::last_chunk().append(forward<T>(value));
  151. }
  152. template<typename T>
  153. void prepend(T&& value)
  154. {
  155. if (is_empty())
  156. return append(forward<T>(value));
  157. Base::first_chunk().prepend(forward<T>(value));
  158. }
  159. void last_chunk() const = delete;
  160. void first_chunk() const = delete;
  161. void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
  162. {
  163. Optimizer::append_character_class(*this, move(pairs));
  164. }
  165. void insert_bytecode_check_boundary(BoundaryCheckType type)
  166. {
  167. ByteCode bytecode;
  168. bytecode.empend((ByteCodeValueType)OpCodeId::CheckBoundary);
  169. bytecode.empend((ByteCodeValueType)type);
  170. extend(move(bytecode));
  171. }
  172. void insert_bytecode_clear_capture_group(size_t index)
  173. {
  174. empend(static_cast<ByteCodeValueType>(OpCodeId::ClearCaptureGroup));
  175. empend(index);
  176. }
  177. void insert_bytecode_compare_string(StringView view)
  178. {
  179. ByteCode bytecode;
  180. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
  181. bytecode.empend(static_cast<u64>(1)); // number of arguments
  182. ByteCode arguments;
  183. arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::String));
  184. arguments.insert_string(view);
  185. bytecode.empend(arguments.size()); // size of arguments
  186. bytecode.extend(move(arguments));
  187. extend(move(bytecode));
  188. }
  189. void insert_bytecode_group_capture_left(size_t capture_groups_count)
  190. {
  191. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup));
  192. empend(capture_groups_count);
  193. }
  194. void insert_bytecode_group_capture_right(size_t capture_groups_count)
  195. {
  196. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup));
  197. empend(capture_groups_count);
  198. }
  199. void insert_bytecode_group_capture_right(size_t capture_groups_count, StringView const& name)
  200. {
  201. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup));
  202. empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
  203. empend(name.length());
  204. empend(capture_groups_count);
  205. }
  206. enum class LookAroundType {
  207. LookAhead,
  208. LookBehind,
  209. NegatedLookAhead,
  210. NegatedLookBehind,
  211. };
  212. void insert_bytecode_lookaround(ByteCode&& lookaround_body, LookAroundType type, size_t match_length = 0)
  213. {
  214. // FIXME: The save stack will grow infinitely with repeated failures
  215. // as we do not discard that on failure (we don't necessarily know how many to pop with the current architecture).
  216. switch (type) {
  217. case LookAroundType::LookAhead: {
  218. // SAVE
  219. // REGEXP BODY
  220. // RESTORE
  221. empend((ByteCodeValueType)OpCodeId::Save);
  222. extend(move(lookaround_body));
  223. empend((ByteCodeValueType)OpCodeId::Restore);
  224. return;
  225. }
  226. case LookAroundType::NegatedLookAhead: {
  227. // JUMP _A
  228. // LABEL _L
  229. // REGEXP BODY
  230. // FAIL 2
  231. // LABEL _A
  232. // SAVE
  233. // FORKJUMP _L
  234. // RESTORE
  235. auto body_length = lookaround_body.size();
  236. empend((ByteCodeValueType)OpCodeId::Jump);
  237. empend((ByteCodeValueType)body_length + 2); // JUMP to label _A
  238. extend(move(lookaround_body));
  239. empend((ByteCodeValueType)OpCodeId::FailForks);
  240. empend((ByteCodeValueType)2); // Fail two forks
  241. empend((ByteCodeValueType)OpCodeId::Save);
  242. empend((ByteCodeValueType)OpCodeId::ForkJump);
  243. empend((ByteCodeValueType) - (body_length + 5)); // JUMP to label _L
  244. empend((ByteCodeValueType)OpCodeId::Restore);
  245. return;
  246. }
  247. case LookAroundType::LookBehind:
  248. // SAVE
  249. // GOBACK match_length(BODY)
  250. // REGEXP BODY
  251. // RESTORE
  252. empend((ByteCodeValueType)OpCodeId::Save);
  253. empend((ByteCodeValueType)OpCodeId::GoBack);
  254. empend((ByteCodeValueType)match_length);
  255. extend(move(lookaround_body));
  256. empend((ByteCodeValueType)OpCodeId::Restore);
  257. return;
  258. case LookAroundType::NegatedLookBehind: {
  259. // JUMP _A
  260. // LABEL _L
  261. // GOBACK match_length(BODY)
  262. // REGEXP BODY
  263. // FAIL 2
  264. // LABEL _A
  265. // SAVE
  266. // FORKJUMP _L
  267. // RESTORE
  268. auto body_length = lookaround_body.size();
  269. empend((ByteCodeValueType)OpCodeId::Jump);
  270. empend((ByteCodeValueType)body_length + 4); // JUMP to label _A
  271. empend((ByteCodeValueType)OpCodeId::GoBack);
  272. empend((ByteCodeValueType)match_length);
  273. extend(move(lookaround_body));
  274. empend((ByteCodeValueType)OpCodeId::FailForks);
  275. empend((ByteCodeValueType)2); // Fail two forks
  276. empend((ByteCodeValueType)OpCodeId::Save);
  277. empend((ByteCodeValueType)OpCodeId::ForkJump);
  278. empend((ByteCodeValueType) - (body_length + 7)); // JUMP to label _L
  279. empend((ByteCodeValueType)OpCodeId::Restore);
  280. return;
  281. }
  282. }
  283. VERIFY_NOT_REACHED();
  284. }
  285. void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right)
  286. {
  287. // FORKJUMP _ALT
  288. // REGEXP ALT2
  289. // JUMP _END
  290. // LABEL _ALT
  291. // REGEXP ALT1
  292. // LABEL _END
  293. // Optimisation: Eliminate extra work by unifying common pre-and-postfix exprs.
  294. Optimizer::append_alternation(*this, move(left), move(right));
  295. }
  296. template<typename T>
  297. static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, T minimum, Optional<T> maximum, size_t min_repetition_mark_id, size_t max_repetition_mark_id, bool greedy = true) requires(IsIntegral<T>)
  298. {
  299. ByteCode new_bytecode;
  300. new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, min_repetition_mark_id);
  301. if (maximum.has_value()) {
  302. // (REPEAT REGEXP MIN)
  303. // LABEL _MAX_LOOP |
  304. // FORK END |
  305. // REGEXP |
  306. // REPEAT _MAX_LOOP MAX-MIN | if max > min
  307. // FORK END |
  308. // REGEXP |
  309. // LABEL END |
  310. // RESET _MAX_LOOP |
  311. auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump);
  312. if (maximum.value() > minimum) {
  313. new_bytecode.empend(jump_kind);
  314. new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
  315. auto pre_loop_fork_jump_index = new_bytecode.size();
  316. new_bytecode.extend(bytecode_to_repeat);
  317. auto repetitions = maximum.value() - minimum;
  318. auto fork_jump_address = new_bytecode.size();
  319. if (repetitions > 1) {
  320. new_bytecode.empend((ByteCodeValueType)OpCodeId::Repeat);
  321. new_bytecode.empend(bytecode_to_repeat.size() + 2);
  322. new_bytecode.empend(static_cast<ByteCodeValueType>(repetitions - 1));
  323. new_bytecode.empend(max_repetition_mark_id);
  324. new_bytecode.empend(jump_kind);
  325. new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
  326. auto post_loop_fork_jump_index = new_bytecode.size();
  327. new_bytecode.extend(bytecode_to_repeat);
  328. fork_jump_address = new_bytecode.size();
  329. new_bytecode[post_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - post_loop_fork_jump_index);
  330. new_bytecode.empend((ByteCodeValueType)OpCodeId::ResetRepeat);
  331. new_bytecode.empend((ByteCodeValueType)max_repetition_mark_id);
  332. }
  333. new_bytecode[pre_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - pre_loop_fork_jump_index);
  334. }
  335. } else {
  336. // no maximum value set, repeat finding if possible:
  337. // (REPEAT REGEXP MIN)
  338. // LABEL _START
  339. // CHECKPOINT _C
  340. // REGEXP
  341. // JUMP_NONEMPTY _C _START FORK
  342. // Note: This is only safe because REPEAT will leave one iteration outside (see repetition_n)
  343. new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)OpCodeId::Checkpoint);
  344. auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkJump : OpCodeId::ForkStay);
  345. new_bytecode.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty);
  346. new_bytecode.empend(-bytecode_to_repeat.size() - 4 - 1); // Jump to the last iteration
  347. new_bytecode.empend(-bytecode_to_repeat.size() - 4 - 1); // if _C is not empty.
  348. new_bytecode.empend(jump_kind);
  349. }
  350. bytecode_to_repeat = move(new_bytecode);
  351. }
  352. template<typename T>
  353. void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, T n, size_t repetition_mark_id) requires(IsIntegral<T>)
  354. {
  355. // LABEL _LOOP
  356. // REGEXP
  357. // REPEAT _LOOP N-1
  358. // REGEXP
  359. if (n == 0)
  360. return;
  361. // Note: this bytecode layout allows callers to repeat the last REGEXP instruction without the
  362. // REPEAT instruction forcing another loop.
  363. extend(bytecode_to_repeat);
  364. if (n > 1) {
  365. empend(static_cast<ByteCodeValueType>(OpCodeId::Repeat));
  366. empend(bytecode_to_repeat.size());
  367. empend(static_cast<ByteCodeValueType>(n - 1));
  368. empend(repetition_mark_id);
  369. extend(bytecode_to_repeat);
  370. }
  371. }
  372. static void transform_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy)
  373. {
  374. // LABEL _START = -bytecode_to_repeat.size()
  375. // CHECKPOINT _C
  376. // REGEXP
  377. // JUMP_NONEMPTY _C _START FORKSTAY (FORKJUMP -> Greedy)
  378. bytecode_to_repeat.prepend((ByteCodeValueType)OpCodeId::Checkpoint);
  379. bytecode_to_repeat.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty);
  380. bytecode_to_repeat.empend(-bytecode_to_repeat.size() - 3); // Jump to the _START label...
  381. bytecode_to_repeat.empend(-bytecode_to_repeat.size() - 2); // ...if _C is not empty
  382. if (greedy)
  383. bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  384. else
  385. bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  386. }
  387. static void transform_bytecode_repetition_any(ByteCode& bytecode_to_repeat, bool greedy)
  388. {
  389. // LABEL _START
  390. // FORKJUMP _END (FORKSTAY -> Greedy)
  391. // CHECKPOINT _C
  392. // REGEXP
  393. // JUMP_NONEMPTY _C _START JUMP
  394. // LABEL _END
  395. // LABEL _START = m_bytes.size();
  396. ByteCode bytecode;
  397. if (greedy)
  398. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  399. else
  400. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  401. bytecode.empend(bytecode_to_repeat.size() + 1 + 4); // Jump to the _END label
  402. auto c_label = bytecode.size();
  403. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Checkpoint));
  404. bytecode.extend(bytecode_to_repeat);
  405. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::JumpNonEmpty));
  406. bytecode.empend(-bytecode.size() - 3); // Jump(...) to the _START label...
  407. bytecode.empend(c_label - bytecode.size() - 2); // ...only if _C passes.
  408. bytecode.empend((ByteCodeValueType)OpCodeId::Jump);
  409. // LABEL _END = bytecode.size()
  410. bytecode_to_repeat = move(bytecode);
  411. }
  412. static void transform_bytecode_repetition_zero_or_one(ByteCode& bytecode_to_repeat, bool greedy)
  413. {
  414. // FORKJUMP _END (FORKSTAY -> Greedy)
  415. // REGEXP
  416. // LABEL _END
  417. ByteCode bytecode;
  418. if (greedy)
  419. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  420. else
  421. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  422. bytecode.empend(bytecode_to_repeat.size()); // Jump to the _END label
  423. bytecode.extend(move(bytecode_to_repeat));
  424. // LABEL _END = bytecode.size()
  425. bytecode_to_repeat = move(bytecode);
  426. }
  427. OpCode& get_opcode(MatchState& state) const;
  428. private:
  429. void insert_string(StringView const& view)
  430. {
  431. empend((ByteCodeValueType)view.length());
  432. for (size_t i = 0; i < view.length(); ++i)
  433. empend((ByteCodeValueType)view[i]);
  434. }
  435. void ensure_opcodes_initialized();
  436. ALWAYS_INLINE OpCode& get_opcode_by_id(OpCodeId id) const;
  437. static OwnPtr<OpCode> s_opcodes[(size_t)OpCodeId::Last + 1];
  438. static bool s_opcodes_initialized;
  439. };
  440. #define ENUMERATE_EXECUTION_RESULTS \
  441. __ENUMERATE_EXECUTION_RESULT(Continue) \
  442. __ENUMERATE_EXECUTION_RESULT(Fork_PrioHigh) \
  443. __ENUMERATE_EXECUTION_RESULT(Fork_PrioLow) \
  444. __ENUMERATE_EXECUTION_RESULT(Failed) \
  445. __ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForks) \
  446. __ENUMERATE_EXECUTION_RESULT(Succeeded)
  447. enum class ExecutionResult : u8 {
  448. #define __ENUMERATE_EXECUTION_RESULT(x) x,
  449. ENUMERATE_EXECUTION_RESULTS
  450. #undef __ENUMERATE_EXECUTION_RESULT
  451. };
  452. char const* execution_result_name(ExecutionResult result);
  453. char const* opcode_id_name(OpCodeId opcode_id);
  454. char const* boundary_check_type_name(BoundaryCheckType);
  455. char const* character_compare_type_name(CharacterCompareType result);
  456. char const* execution_result_name(ExecutionResult result);
  457. class OpCode {
  458. public:
  459. OpCode() = default;
  460. virtual ~OpCode() = default;
  461. virtual OpCodeId opcode_id() const = 0;
  462. virtual size_t size() const = 0;
  463. virtual ExecutionResult execute(MatchInput const& input, MatchState& state) const = 0;
  464. ALWAYS_INLINE ByteCodeValueType argument(size_t offset) const
  465. {
  466. VERIFY(state().instruction_position + offset <= m_bytecode->size());
  467. return m_bytecode->at(state().instruction_position + 1 + offset);
  468. }
  469. ALWAYS_INLINE char const* name() const;
  470. static char const* name(OpCodeId const);
  471. ALWAYS_INLINE void set_state(MatchState& state) { m_state = &state; }
  472. ALWAYS_INLINE void set_bytecode(ByteCode& bytecode) { m_bytecode = &bytecode; }
  473. ALWAYS_INLINE MatchState const& state() const
  474. {
  475. VERIFY(m_state);
  476. return *m_state;
  477. }
  478. String const to_string() const
  479. {
  480. return String::formatted("[{:#02X}] {}", (int)opcode_id(), name(opcode_id()));
  481. }
  482. virtual String const arguments_string() const = 0;
  483. ALWAYS_INLINE ByteCode const& bytecode() const { return *m_bytecode; }
  484. protected:
  485. ByteCode* m_bytecode { nullptr };
  486. MatchState* m_state { nullptr };
  487. };
  488. class OpCode_Exit final : public OpCode {
  489. public:
  490. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  491. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Exit; }
  492. ALWAYS_INLINE size_t size() const override { return 1; }
  493. String const arguments_string() const override { return ""; }
  494. };
  495. class OpCode_FailForks final : public OpCode {
  496. public:
  497. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  498. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::FailForks; }
  499. ALWAYS_INLINE size_t size() const override { return 2; }
  500. ALWAYS_INLINE size_t count() const { return argument(0); }
  501. String const arguments_string() const override { return String::formatted("count={}", count()); }
  502. };
  503. class OpCode_Save final : public OpCode {
  504. public:
  505. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  506. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Save; }
  507. ALWAYS_INLINE size_t size() const override { return 1; }
  508. String const arguments_string() const override { return ""; }
  509. };
  510. class OpCode_Restore final : public OpCode {
  511. public:
  512. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  513. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Restore; }
  514. ALWAYS_INLINE size_t size() const override { return 1; }
  515. String const arguments_string() const override { return ""; }
  516. };
  517. class OpCode_GoBack final : public OpCode {
  518. public:
  519. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  520. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::GoBack; }
  521. ALWAYS_INLINE size_t size() const override { return 2; }
  522. ALWAYS_INLINE size_t count() const { return argument(0); }
  523. String const arguments_string() const override { return String::formatted("count={}", count()); }
  524. };
  525. class OpCode_Jump final : public OpCode {
  526. public:
  527. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  528. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Jump; }
  529. ALWAYS_INLINE size_t size() const override { return 2; }
  530. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  531. String const arguments_string() const override
  532. {
  533. return String::formatted("offset={} [&{}]", offset(), state().instruction_position + size() + offset());
  534. }
  535. };
  536. class OpCode_ForkJump : public OpCode {
  537. public:
  538. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  539. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkJump; }
  540. ALWAYS_INLINE size_t size() const override { return 2; }
  541. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  542. String const arguments_string() const override
  543. {
  544. return String::formatted("offset={} [&{}], sp: {}", offset(), state().instruction_position + size() + offset(), state().string_position);
  545. }
  546. };
  547. class OpCode_ForkReplaceJump final : public OpCode_ForkJump {
  548. public:
  549. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  550. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkReplaceJump; }
  551. };
  552. class OpCode_ForkStay : public OpCode {
  553. public:
  554. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  555. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkStay; }
  556. ALWAYS_INLINE size_t size() const override { return 2; }
  557. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  558. String const arguments_string() const override
  559. {
  560. return String::formatted("offset={} [&{}], sp: {}", offset(), state().instruction_position + size() + offset(), state().string_position);
  561. }
  562. };
  563. class OpCode_ForkReplaceStay final : public OpCode_ForkStay {
  564. public:
  565. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  566. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkReplaceStay; }
  567. };
  568. class OpCode_CheckBegin final : public OpCode {
  569. public:
  570. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  571. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBegin; }
  572. ALWAYS_INLINE size_t size() const override { return 1; }
  573. String const arguments_string() const override { return ""; }
  574. };
  575. class OpCode_CheckEnd final : public OpCode {
  576. public:
  577. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  578. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckEnd; }
  579. ALWAYS_INLINE size_t size() const override { return 1; }
  580. String const arguments_string() const override { return ""; }
  581. };
  582. class OpCode_CheckBoundary final : public OpCode {
  583. public:
  584. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  585. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBoundary; }
  586. ALWAYS_INLINE size_t size() const override { return 2; }
  587. ALWAYS_INLINE size_t arguments_count() const { return 1; }
  588. ALWAYS_INLINE BoundaryCheckType type() const { return static_cast<BoundaryCheckType>(argument(0)); }
  589. String const arguments_string() const override { return String::formatted("kind={} ({})", (long unsigned int)argument(0), boundary_check_type_name(type())); }
  590. };
  591. class OpCode_ClearCaptureGroup final : public OpCode {
  592. public:
  593. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  594. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearCaptureGroup; }
  595. ALWAYS_INLINE size_t size() const override { return 2; }
  596. ALWAYS_INLINE size_t id() const { return argument(0); }
  597. String const arguments_string() const override { return String::formatted("id={}", id()); }
  598. };
  599. class OpCode_SaveLeftCaptureGroup final : public OpCode {
  600. public:
  601. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  602. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftCaptureGroup; }
  603. ALWAYS_INLINE size_t size() const override { return 2; }
  604. ALWAYS_INLINE size_t id() const { return argument(0); }
  605. String const arguments_string() const override { return String::formatted("id={}", id()); }
  606. };
  607. class OpCode_SaveRightCaptureGroup final : public OpCode {
  608. public:
  609. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  610. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightCaptureGroup; }
  611. ALWAYS_INLINE size_t size() const override { return 2; }
  612. ALWAYS_INLINE size_t id() const { return argument(0); }
  613. String const arguments_string() const override { return String::formatted("id={}", id()); }
  614. };
  615. class OpCode_SaveRightNamedCaptureGroup final : public OpCode {
  616. public:
  617. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  618. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; }
  619. ALWAYS_INLINE size_t size() const override { return 4; }
  620. ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
  621. ALWAYS_INLINE size_t length() const { return argument(1); }
  622. ALWAYS_INLINE size_t id() const { return argument(2); }
  623. String const arguments_string() const override
  624. {
  625. return String::formatted("name={}, length={}", name(), length());
  626. }
  627. };
  628. class OpCode_Compare final : public OpCode {
  629. public:
  630. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  631. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Compare; }
  632. ALWAYS_INLINE size_t size() const override { return arguments_size() + 3; }
  633. ALWAYS_INLINE size_t arguments_count() const { return argument(0); }
  634. ALWAYS_INLINE size_t arguments_size() const { return argument(1); }
  635. String const arguments_string() const override;
  636. Vector<String> const variable_arguments_to_string(Optional<MatchInput> input = {}) const;
  637. Vector<CompareTypeAndValuePair> flat_compares() const;
  638. private:
  639. ALWAYS_INLINE static void compare_char(MatchInput const& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched);
  640. ALWAYS_INLINE static bool compare_string(MatchInput const& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match);
  641. ALWAYS_INLINE static void compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
  642. ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
  643. ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
  644. ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
  645. ALWAYS_INLINE static void compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
  646. ALWAYS_INLINE static void compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
  647. };
  648. class OpCode_Repeat : public OpCode {
  649. public:
  650. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  651. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Repeat; }
  652. ALWAYS_INLINE size_t size() const override { return 4; }
  653. ALWAYS_INLINE size_t offset() const { return argument(0); }
  654. ALWAYS_INLINE u64 count() const { return argument(1); }
  655. ALWAYS_INLINE size_t id() const { return argument(2); }
  656. String const arguments_string() const override
  657. {
  658. auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0;
  659. return String::formatted("offset={} count={} id={} rep={}, sp: {}", offset(), count() + 1, id(), reps + 1, state().string_position);
  660. }
  661. };
  662. class OpCode_ResetRepeat : public OpCode {
  663. public:
  664. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  665. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ResetRepeat; }
  666. ALWAYS_INLINE size_t size() const override { return 2; }
  667. ALWAYS_INLINE size_t id() const { return argument(0); }
  668. String const arguments_string() const override
  669. {
  670. auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0;
  671. return String::formatted("id={} rep={}", id(), reps + 1);
  672. }
  673. };
  674. class OpCode_Checkpoint final : public OpCode {
  675. public:
  676. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  677. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Checkpoint; }
  678. ALWAYS_INLINE size_t size() const override { return 1; }
  679. String const arguments_string() const override { return ""; }
  680. };
  681. class OpCode_JumpNonEmpty final : public OpCode {
  682. public:
  683. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  684. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::JumpNonEmpty; }
  685. ALWAYS_INLINE size_t size() const override { return 4; }
  686. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  687. ALWAYS_INLINE ssize_t checkpoint() const { return argument(1); }
  688. ALWAYS_INLINE OpCodeId form() const { return (OpCodeId)argument(2); }
  689. String const arguments_string() const override
  690. {
  691. return String::formatted("{} offset={} [&{}], cp={} [&{}]",
  692. opcode_id_name(form()),
  693. offset(), state().instruction_position + size() + offset(),
  694. checkpoint(), state().instruction_position + size() + checkpoint());
  695. }
  696. };
  697. template<typename T>
  698. bool is(OpCode const&);
  699. template<typename T>
  700. ALWAYS_INLINE bool is(OpCode const&)
  701. {
  702. return false;
  703. }
  704. template<typename T>
  705. ALWAYS_INLINE bool is(OpCode const* opcode)
  706. {
  707. return is<T>(*opcode);
  708. }
  709. template<>
  710. ALWAYS_INLINE bool is<OpCode_ForkStay>(OpCode const& opcode)
  711. {
  712. return opcode.opcode_id() == OpCodeId::ForkStay;
  713. }
  714. template<>
  715. ALWAYS_INLINE bool is<OpCode_Exit>(OpCode const& opcode)
  716. {
  717. return opcode.opcode_id() == OpCodeId::Exit;
  718. }
  719. template<>
  720. ALWAYS_INLINE bool is<OpCode_Compare>(OpCode const& opcode)
  721. {
  722. return opcode.opcode_id() == OpCodeId::Compare;
  723. }
  724. template<typename T>
  725. ALWAYS_INLINE T const& to(OpCode const& opcode)
  726. {
  727. return verify_cast<T>(opcode);
  728. }
  729. template<typename T>
  730. ALWAYS_INLINE T* to(OpCode* opcode)
  731. {
  732. return verify_cast<T>(opcode);
  733. }
  734. template<typename T>
  735. ALWAYS_INLINE T const* to(OpCode const* opcode)
  736. {
  737. return verify_cast<T>(opcode);
  738. }
  739. template<typename T>
  740. ALWAYS_INLINE T& to(OpCode& opcode)
  741. {
  742. return verify_cast<T>(opcode);
  743. }
  744. }