RegexByteCode.h 35 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #pragma once
  7. #include "RegexBytecodeStreamOptimizer.h"
  8. #include "RegexMatch.h"
  9. #include "RegexOptions.h"
  10. #include <AK/Concepts.h>
  11. #include <AK/DisjointChunks.h>
  12. #include <AK/Forward.h>
  13. #include <AK/HashMap.h>
  14. #include <AK/NonnullOwnPtr.h>
  15. #include <AK/OwnPtr.h>
  16. #include <AK/Traits.h>
  17. #include <AK/TypeCasts.h>
  18. #include <AK/Types.h>
  19. #include <AK/Vector.h>
  20. #include <LibUnicode/Forward.h>
  21. namespace regex {
  22. using ByteCodeValueType = u64;
  23. #define ENUMERATE_OPCODES \
  24. __ENUMERATE_OPCODE(Compare) \
  25. __ENUMERATE_OPCODE(Jump) \
  26. __ENUMERATE_OPCODE(JumpNonEmpty) \
  27. __ENUMERATE_OPCODE(ForkJump) \
  28. __ENUMERATE_OPCODE(ForkStay) \
  29. __ENUMERATE_OPCODE(ForkReplaceJump) \
  30. __ENUMERATE_OPCODE(ForkReplaceStay) \
  31. __ENUMERATE_OPCODE(FailForks) \
  32. __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
  33. __ENUMERATE_OPCODE(SaveRightCaptureGroup) \
  34. __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
  35. __ENUMERATE_OPCODE(CheckBegin) \
  36. __ENUMERATE_OPCODE(CheckEnd) \
  37. __ENUMERATE_OPCODE(CheckBoundary) \
  38. __ENUMERATE_OPCODE(Save) \
  39. __ENUMERATE_OPCODE(Restore) \
  40. __ENUMERATE_OPCODE(GoBack) \
  41. __ENUMERATE_OPCODE(ClearCaptureGroup) \
  42. __ENUMERATE_OPCODE(Repeat) \
  43. __ENUMERATE_OPCODE(ResetRepeat) \
  44. __ENUMERATE_OPCODE(Checkpoint) \
  45. __ENUMERATE_OPCODE(Exit)
  46. // clang-format off
  47. enum class OpCodeId : ByteCodeValueType {
  48. #define __ENUMERATE_OPCODE(x) x,
  49. ENUMERATE_OPCODES
  50. #undef __ENUMERATE_OPCODE
  51. First = Compare,
  52. Last = Exit,
  53. };
  54. // clang-format on
  55. #define ENUMERATE_CHARACTER_COMPARE_TYPES \
  56. __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
  57. __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
  58. __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \
  59. __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
  60. __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
  61. __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
  62. __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
  63. __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
  64. __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
  65. __ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
  66. __ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
  67. __ENUMERATE_CHARACTER_COMPARE_TYPE(Script) \
  68. __ENUMERATE_CHARACTER_COMPARE_TYPE(ScriptExtension) \
  69. __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy) \
  70. __ENUMERATE_CHARACTER_COMPARE_TYPE(LookupTable) \
  71. __ENUMERATE_CHARACTER_COMPARE_TYPE(And) \
  72. __ENUMERATE_CHARACTER_COMPARE_TYPE(Or) \
  73. __ENUMERATE_CHARACTER_COMPARE_TYPE(EndAndOr)
  74. enum class CharacterCompareType : ByteCodeValueType {
  75. #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
  76. ENUMERATE_CHARACTER_COMPARE_TYPES
  77. #undef __ENUMERATE_CHARACTER_COMPARE_TYPE
  78. };
  79. #define ENUMERATE_CHARACTER_CLASSES \
  80. __ENUMERATE_CHARACTER_CLASS(Alnum) \
  81. __ENUMERATE_CHARACTER_CLASS(Cntrl) \
  82. __ENUMERATE_CHARACTER_CLASS(Lower) \
  83. __ENUMERATE_CHARACTER_CLASS(Space) \
  84. __ENUMERATE_CHARACTER_CLASS(Alpha) \
  85. __ENUMERATE_CHARACTER_CLASS(Digit) \
  86. __ENUMERATE_CHARACTER_CLASS(Print) \
  87. __ENUMERATE_CHARACTER_CLASS(Upper) \
  88. __ENUMERATE_CHARACTER_CLASS(Blank) \
  89. __ENUMERATE_CHARACTER_CLASS(Graph) \
  90. __ENUMERATE_CHARACTER_CLASS(Punct) \
  91. __ENUMERATE_CHARACTER_CLASS(Word) \
  92. __ENUMERATE_CHARACTER_CLASS(Xdigit)
  93. enum class CharClass : ByteCodeValueType {
  94. #define __ENUMERATE_CHARACTER_CLASS(x) x,
  95. ENUMERATE_CHARACTER_CLASSES
  96. #undef __ENUMERATE_CHARACTER_CLASS
  97. };
  98. #define ENUMERATE_BOUNDARY_CHECK_TYPES \
  99. __ENUMERATE_BOUNDARY_CHECK_TYPE(Word) \
  100. __ENUMERATE_BOUNDARY_CHECK_TYPE(NonWord)
  101. enum class BoundaryCheckType : ByteCodeValueType {
  102. #define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) x,
  103. ENUMERATE_BOUNDARY_CHECK_TYPES
  104. #undef __ENUMERATE_BOUNDARY_CHECK_TYPE
  105. };
  106. struct CharRange {
  107. u32 const from;
  108. u32 const to;
  109. CharRange(u64 value)
  110. : from(value >> 32)
  111. , to(value & 0xffffffff)
  112. {
  113. }
  114. CharRange(u32 from, u32 to)
  115. : from(from)
  116. , to(to)
  117. {
  118. }
  119. operator ByteCodeValueType() const { return ((u64)from << 32) | to; }
  120. };
  121. struct CompareTypeAndValuePair {
  122. CharacterCompareType type;
  123. ByteCodeValueType value;
  124. };
  125. class OpCode;
  126. class ByteCode : public DisjointChunks<ByteCodeValueType> {
  127. using Base = DisjointChunks<ByteCodeValueType>;
  128. public:
  129. ByteCode()
  130. {
  131. ensure_opcodes_initialized();
  132. }
  133. ByteCode(ByteCode const&) = default;
  134. virtual ~ByteCode() = default;
  135. ByteCode& operator=(ByteCode&&) = default;
  136. ByteCode& operator=(Base&& value)
  137. {
  138. static_cast<Base&>(*this) = move(value);
  139. return *this;
  140. }
  141. template<typename... Args>
  142. void empend(Args&&... args)
  143. {
  144. if (is_empty())
  145. Base::append({});
  146. Base::last_chunk().empend(forward<Args>(args)...);
  147. }
  148. template<typename T>
  149. void append(T&& value)
  150. {
  151. if (is_empty())
  152. Base::append({});
  153. Base::last_chunk().append(forward<T>(value));
  154. }
  155. template<typename T>
  156. void prepend(T&& value)
  157. {
  158. if (is_empty())
  159. return append(forward<T>(value));
  160. Base::first_chunk().prepend(forward<T>(value));
  161. }
  162. void append(Span<ByteCodeValueType const> value)
  163. {
  164. if (is_empty())
  165. Base::append({});
  166. auto& last = Base::last_chunk();
  167. last.ensure_capacity(value.size());
  168. for (auto v : value)
  169. last.unchecked_append(v);
  170. }
  171. void ensure_capacity(size_t capacity)
  172. {
  173. if (is_empty())
  174. Base::append({});
  175. Base::last_chunk().ensure_capacity(capacity);
  176. }
  177. void last_chunk() const = delete;
  178. void first_chunk() const = delete;
  179. void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
  180. {
  181. Optimizer::append_character_class(*this, move(pairs));
  182. }
  183. void insert_bytecode_check_boundary(BoundaryCheckType type)
  184. {
  185. ByteCode bytecode;
  186. bytecode.empend((ByteCodeValueType)OpCodeId::CheckBoundary);
  187. bytecode.empend((ByteCodeValueType)type);
  188. extend(move(bytecode));
  189. }
  190. void insert_bytecode_clear_capture_group(size_t index)
  191. {
  192. empend(static_cast<ByteCodeValueType>(OpCodeId::ClearCaptureGroup));
  193. empend(index);
  194. }
  195. void insert_bytecode_compare_string(StringView view)
  196. {
  197. empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
  198. empend(static_cast<u64>(1)); // number of arguments
  199. empend(2 + view.length()); // size of arguments
  200. empend(static_cast<ByteCodeValueType>(CharacterCompareType::String));
  201. insert_string(view);
  202. }
  203. void insert_bytecode_group_capture_left(size_t capture_groups_count)
  204. {
  205. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup));
  206. empend(capture_groups_count);
  207. }
  208. void insert_bytecode_group_capture_right(size_t capture_groups_count)
  209. {
  210. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup));
  211. empend(capture_groups_count);
  212. }
  213. void insert_bytecode_group_capture_right(size_t capture_groups_count, StringView name)
  214. {
  215. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup));
  216. empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
  217. empend(name.length());
  218. empend(capture_groups_count);
  219. }
  220. enum class LookAroundType {
  221. LookAhead,
  222. LookBehind,
  223. NegatedLookAhead,
  224. NegatedLookBehind,
  225. };
  226. void insert_bytecode_lookaround(ByteCode&& lookaround_body, LookAroundType type, size_t match_length = 0)
  227. {
  228. // FIXME: The save stack will grow infinitely with repeated failures
  229. // as we do not discard that on failure (we don't necessarily know how many to pop with the current architecture).
  230. switch (type) {
  231. case LookAroundType::LookAhead: {
  232. // SAVE
  233. // REGEXP BODY
  234. // RESTORE
  235. empend((ByteCodeValueType)OpCodeId::Save);
  236. extend(move(lookaround_body));
  237. empend((ByteCodeValueType)OpCodeId::Restore);
  238. return;
  239. }
  240. case LookAroundType::NegatedLookAhead: {
  241. // JUMP _A
  242. // LABEL _L
  243. // REGEXP BODY
  244. // FAIL
  245. // LABEL _A
  246. // SAVE
  247. // FORKJUMP _L
  248. // RESTORE
  249. auto body_length = lookaround_body.size();
  250. empend((ByteCodeValueType)OpCodeId::Jump);
  251. empend((ByteCodeValueType)body_length + 1); // JUMP to label _A
  252. extend(move(lookaround_body));
  253. empend((ByteCodeValueType)OpCodeId::FailForks);
  254. empend((ByteCodeValueType)OpCodeId::Save);
  255. empend((ByteCodeValueType)OpCodeId::ForkJump);
  256. empend((ByteCodeValueType) - (body_length + 4)); // JUMP to label _L
  257. empend((ByteCodeValueType)OpCodeId::Restore);
  258. return;
  259. }
  260. case LookAroundType::LookBehind:
  261. // SAVE
  262. // GOBACK match_length(BODY)
  263. // REGEXP BODY
  264. // RESTORE
  265. empend((ByteCodeValueType)OpCodeId::Save);
  266. empend((ByteCodeValueType)OpCodeId::GoBack);
  267. empend((ByteCodeValueType)match_length);
  268. extend(move(lookaround_body));
  269. empend((ByteCodeValueType)OpCodeId::Restore);
  270. return;
  271. case LookAroundType::NegatedLookBehind: {
  272. // JUMP _A
  273. // LABEL _L
  274. // GOBACK match_length(BODY)
  275. // REGEXP BODY
  276. // FAIL
  277. // LABEL _A
  278. // SAVE
  279. // FORKJUMP _L
  280. // RESTORE
  281. auto body_length = lookaround_body.size();
  282. empend((ByteCodeValueType)OpCodeId::Jump);
  283. empend((ByteCodeValueType)body_length + 3); // JUMP to label _A
  284. empend((ByteCodeValueType)OpCodeId::GoBack);
  285. empend((ByteCodeValueType)match_length);
  286. extend(move(lookaround_body));
  287. empend((ByteCodeValueType)OpCodeId::FailForks);
  288. empend((ByteCodeValueType)OpCodeId::Save);
  289. empend((ByteCodeValueType)OpCodeId::ForkJump);
  290. empend((ByteCodeValueType) - (body_length + 6)); // JUMP to label _L
  291. empend((ByteCodeValueType)OpCodeId::Restore);
  292. return;
  293. }
  294. }
  295. VERIFY_NOT_REACHED();
  296. }
  297. void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right)
  298. {
  299. // FORKJUMP _ALT
  300. // REGEXP ALT2
  301. // JUMP _END
  302. // LABEL _ALT
  303. // REGEXP ALT1
  304. // LABEL _END
  305. // Optimisation: Eliminate extra work by unifying common pre-and-postfix exprs.
  306. Optimizer::append_alternation(*this, move(left), move(right));
  307. }
  308. template<Integral T>
  309. static void transform_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, T minimum, Optional<T> maximum, size_t min_repetition_mark_id, size_t max_repetition_mark_id, bool greedy = true)
  310. {
  311. if (!maximum.has_value()) {
  312. if (minimum == 0)
  313. return transform_bytecode_repetition_any(bytecode_to_repeat, greedy);
  314. if (minimum == 1)
  315. return transform_bytecode_repetition_min_one(bytecode_to_repeat, greedy);
  316. }
  317. ByteCode new_bytecode;
  318. new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum, min_repetition_mark_id);
  319. if (maximum.has_value()) {
  320. // (REPEAT REGEXP MIN)
  321. // LABEL _MAX_LOOP |
  322. // FORK END |
  323. // REGEXP |
  324. // REPEAT _MAX_LOOP MAX-MIN | if max > min
  325. // FORK END |
  326. // REGEXP |
  327. // LABEL END |
  328. // RESET _MAX_LOOP |
  329. auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkStay : OpCodeId::ForkJump);
  330. if (maximum.value() > minimum) {
  331. new_bytecode.empend(jump_kind);
  332. new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
  333. auto pre_loop_fork_jump_index = new_bytecode.size();
  334. new_bytecode.extend(bytecode_to_repeat);
  335. auto repetitions = maximum.value() - minimum;
  336. auto fork_jump_address = new_bytecode.size();
  337. if (repetitions > 1) {
  338. new_bytecode.empend((ByteCodeValueType)OpCodeId::Repeat);
  339. new_bytecode.empend(bytecode_to_repeat.size() + 2);
  340. new_bytecode.empend(static_cast<ByteCodeValueType>(repetitions - 1));
  341. new_bytecode.empend(max_repetition_mark_id);
  342. new_bytecode.empend(jump_kind);
  343. new_bytecode.empend((ByteCodeValueType)0); // Placeholder for the jump target.
  344. auto post_loop_fork_jump_index = new_bytecode.size();
  345. new_bytecode.extend(bytecode_to_repeat);
  346. fork_jump_address = new_bytecode.size();
  347. new_bytecode[post_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - post_loop_fork_jump_index);
  348. new_bytecode.empend((ByteCodeValueType)OpCodeId::ResetRepeat);
  349. new_bytecode.empend((ByteCodeValueType)max_repetition_mark_id);
  350. }
  351. new_bytecode[pre_loop_fork_jump_index - 1] = (ByteCodeValueType)(fork_jump_address - pre_loop_fork_jump_index);
  352. }
  353. } else {
  354. // no maximum value set, repeat finding if possible:
  355. // (REPEAT REGEXP MIN)
  356. // LABEL _START
  357. // CHECKPOINT _C
  358. // REGEXP
  359. // JUMP_NONEMPTY _C _START FORK
  360. // Note: This is only safe because REPEAT will leave one iteration outside (see repetition_n)
  361. auto checkpoint = s_next_checkpoint_serial_id++;
  362. new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)OpCodeId::Checkpoint);
  363. new_bytecode.insert(new_bytecode.size() - bytecode_to_repeat.size(), (ByteCodeValueType)checkpoint);
  364. auto jump_kind = static_cast<ByteCodeValueType>(greedy ? OpCodeId::ForkJump : OpCodeId::ForkStay);
  365. new_bytecode.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty);
  366. new_bytecode.empend(-bytecode_to_repeat.size() - 4 - 2); // Jump to the last iteration
  367. new_bytecode.empend(checkpoint); // if _C is not empty.
  368. new_bytecode.empend(jump_kind);
  369. }
  370. bytecode_to_repeat = move(new_bytecode);
  371. }
  372. template<Integral T>
  373. void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, T n, size_t repetition_mark_id)
  374. {
  375. // LABEL _LOOP
  376. // REGEXP
  377. // REPEAT _LOOP N-1
  378. // REGEXP
  379. if (n == 0)
  380. return;
  381. // Note: this bytecode layout allows callers to repeat the last REGEXP instruction without the
  382. // REPEAT instruction forcing another loop.
  383. extend(bytecode_to_repeat);
  384. if (n > 1) {
  385. empend(static_cast<ByteCodeValueType>(OpCodeId::Repeat));
  386. empend(bytecode_to_repeat.size());
  387. empend(static_cast<ByteCodeValueType>(n - 1));
  388. empend(repetition_mark_id);
  389. extend(bytecode_to_repeat);
  390. }
  391. }
  392. static void transform_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy)
  393. {
  394. // LABEL _START = -bytecode_to_repeat.size()
  395. // CHECKPOINT _C
  396. // REGEXP
  397. // JUMP_NONEMPTY _C _START FORKSTAY (FORKJUMP -> Greedy)
  398. auto checkpoint = s_next_checkpoint_serial_id++;
  399. bytecode_to_repeat.prepend((ByteCodeValueType)checkpoint);
  400. bytecode_to_repeat.prepend((ByteCodeValueType)OpCodeId::Checkpoint);
  401. bytecode_to_repeat.empend((ByteCodeValueType)OpCodeId::JumpNonEmpty);
  402. bytecode_to_repeat.empend(-bytecode_to_repeat.size() - 3); // Jump to the _START label...
  403. bytecode_to_repeat.empend(checkpoint); // ...if _C is not empty
  404. if (greedy)
  405. bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  406. else
  407. bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  408. }
  409. static void transform_bytecode_repetition_any(ByteCode& bytecode_to_repeat, bool greedy)
  410. {
  411. // LABEL _START
  412. // FORKJUMP _END (FORKSTAY -> Greedy)
  413. // CHECKPOINT _C
  414. // REGEXP
  415. // JUMP_NONEMPTY _C _START JUMP
  416. // LABEL _END
  417. // LABEL _START = m_bytes.size();
  418. ByteCode bytecode;
  419. if (greedy)
  420. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  421. else
  422. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  423. bytecode.empend(bytecode_to_repeat.size() + 2 + 4); // Jump to the _END label
  424. auto checkpoint = s_next_checkpoint_serial_id++;
  425. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Checkpoint));
  426. bytecode.empend(static_cast<ByteCodeValueType>(checkpoint));
  427. bytecode.extend(bytecode_to_repeat);
  428. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::JumpNonEmpty));
  429. bytecode.empend(-bytecode.size() - 3); // Jump(...) to the _START label...
  430. bytecode.empend(checkpoint); // ...only if _C passes.
  431. bytecode.empend((ByteCodeValueType)OpCodeId::Jump);
  432. // LABEL _END = bytecode.size()
  433. bytecode_to_repeat = move(bytecode);
  434. }
  435. static void transform_bytecode_repetition_zero_or_one(ByteCode& bytecode_to_repeat, bool greedy)
  436. {
  437. // FORKJUMP _END (FORKSTAY -> Greedy)
  438. // REGEXP
  439. // LABEL _END
  440. ByteCode bytecode;
  441. if (greedy)
  442. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  443. else
  444. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  445. bytecode.empend(bytecode_to_repeat.size()); // Jump to the _END label
  446. bytecode.extend(move(bytecode_to_repeat));
  447. // LABEL _END = bytecode.size()
  448. bytecode_to_repeat = move(bytecode);
  449. }
  450. OpCode& get_opcode(MatchState& state) const;
  451. static void reset_checkpoint_serial_id() { s_next_checkpoint_serial_id = 0; }
  452. private:
  453. void insert_string(StringView view)
  454. {
  455. empend((ByteCodeValueType)view.length());
  456. for (size_t i = 0; i < view.length(); ++i)
  457. empend((ByteCodeValueType)view[i]);
  458. }
  459. void ensure_opcodes_initialized();
  460. ALWAYS_INLINE OpCode& get_opcode_by_id(OpCodeId id) const;
  461. static OwnPtr<OpCode> s_opcodes[(size_t)OpCodeId::Last + 1];
  462. static bool s_opcodes_initialized;
  463. static size_t s_next_checkpoint_serial_id;
  464. };
  465. #define ENUMERATE_EXECUTION_RESULTS \
  466. __ENUMERATE_EXECUTION_RESULT(Continue) \
  467. __ENUMERATE_EXECUTION_RESULT(Fork_PrioHigh) \
  468. __ENUMERATE_EXECUTION_RESULT(Fork_PrioLow) \
  469. __ENUMERATE_EXECUTION_RESULT(Failed) \
  470. __ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForks) \
  471. __ENUMERATE_EXECUTION_RESULT(Succeeded)
  472. enum class ExecutionResult : u8 {
  473. #define __ENUMERATE_EXECUTION_RESULT(x) x,
  474. ENUMERATE_EXECUTION_RESULTS
  475. #undef __ENUMERATE_EXECUTION_RESULT
  476. };
  477. StringView execution_result_name(ExecutionResult result);
  478. StringView opcode_id_name(OpCodeId opcode_id);
  479. StringView boundary_check_type_name(BoundaryCheckType);
  480. StringView character_compare_type_name(CharacterCompareType result);
  481. class OpCode {
  482. public:
  483. OpCode() = default;
  484. virtual ~OpCode() = default;
  485. virtual OpCodeId opcode_id() const = 0;
  486. virtual size_t size() const = 0;
  487. virtual ExecutionResult execute(MatchInput const& input, MatchState& state) const = 0;
  488. ALWAYS_INLINE ByteCodeValueType argument(size_t offset) const
  489. {
  490. return m_bytecode->at(state().instruction_position + 1 + offset);
  491. }
  492. ALWAYS_INLINE StringView name() const;
  493. static StringView name(OpCodeId);
  494. ALWAYS_INLINE void set_state(MatchState& state) { m_state = &state; }
  495. ALWAYS_INLINE void set_bytecode(ByteCode& bytecode) { m_bytecode = &bytecode; }
  496. ALWAYS_INLINE MatchState const& state() const
  497. {
  498. VERIFY(m_state);
  499. return *m_state;
  500. }
  501. ByteString to_byte_string() const
  502. {
  503. return ByteString::formatted("[{:#02X}] {}", (int)opcode_id(), name(opcode_id()));
  504. }
  505. virtual ByteString arguments_string() const = 0;
  506. ALWAYS_INLINE ByteCode const& bytecode() const { return *m_bytecode; }
  507. protected:
  508. ByteCode* m_bytecode { nullptr };
  509. MatchState* m_state { nullptr };
  510. };
  511. class OpCode_Exit final : public OpCode {
  512. public:
  513. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  514. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Exit; }
  515. ALWAYS_INLINE size_t size() const override { return 1; }
  516. ByteString arguments_string() const override { return ByteString::empty(); }
  517. };
  518. class OpCode_FailForks final : public OpCode {
  519. public:
  520. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  521. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::FailForks; }
  522. ALWAYS_INLINE size_t size() const override { return 1; }
  523. ByteString arguments_string() const override { return ByteString::empty(); }
  524. };
  525. class OpCode_Save final : public OpCode {
  526. public:
  527. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  528. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Save; }
  529. ALWAYS_INLINE size_t size() const override { return 1; }
  530. ByteString arguments_string() const override { return ByteString::empty(); }
  531. };
  532. class OpCode_Restore final : public OpCode {
  533. public:
  534. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  535. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Restore; }
  536. ALWAYS_INLINE size_t size() const override { return 1; }
  537. ByteString arguments_string() const override { return ByteString::empty(); }
  538. };
  539. class OpCode_GoBack final : public OpCode {
  540. public:
  541. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  542. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::GoBack; }
  543. ALWAYS_INLINE size_t size() const override { return 2; }
  544. ALWAYS_INLINE size_t count() const { return argument(0); }
  545. ByteString arguments_string() const override { return ByteString::formatted("count={}", count()); }
  546. };
  547. class OpCode_Jump final : public OpCode {
  548. public:
  549. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  550. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Jump; }
  551. ALWAYS_INLINE size_t size() const override { return 2; }
  552. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  553. ByteString arguments_string() const override
  554. {
  555. return ByteString::formatted("offset={} [&{}]", offset(), state().instruction_position + size() + offset());
  556. }
  557. };
  558. class OpCode_ForkJump : public OpCode {
  559. public:
  560. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  561. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkJump; }
  562. ALWAYS_INLINE size_t size() const override { return 2; }
  563. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  564. ByteString arguments_string() const override
  565. {
  566. return ByteString::formatted("offset={} [&{}], sp: {}", offset(), state().instruction_position + size() + offset(), state().string_position);
  567. }
  568. };
  569. class OpCode_ForkReplaceJump final : public OpCode_ForkJump {
  570. public:
  571. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  572. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkReplaceJump; }
  573. };
  574. class OpCode_ForkStay : public OpCode {
  575. public:
  576. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  577. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkStay; }
  578. ALWAYS_INLINE size_t size() const override { return 2; }
  579. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  580. ByteString arguments_string() const override
  581. {
  582. return ByteString::formatted("offset={} [&{}], sp: {}", offset(), state().instruction_position + size() + offset(), state().string_position);
  583. }
  584. };
  585. class OpCode_ForkReplaceStay final : public OpCode_ForkStay {
  586. public:
  587. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  588. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkReplaceStay; }
  589. };
  590. class OpCode_CheckBegin final : public OpCode {
  591. public:
  592. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  593. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBegin; }
  594. ALWAYS_INLINE size_t size() const override { return 1; }
  595. ByteString arguments_string() const override { return ByteString::empty(); }
  596. };
  597. class OpCode_CheckEnd final : public OpCode {
  598. public:
  599. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  600. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckEnd; }
  601. ALWAYS_INLINE size_t size() const override { return 1; }
  602. ByteString arguments_string() const override { return ByteString::empty(); }
  603. };
  604. class OpCode_CheckBoundary final : public OpCode {
  605. public:
  606. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  607. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBoundary; }
  608. ALWAYS_INLINE size_t size() const override { return 2; }
  609. ALWAYS_INLINE size_t arguments_count() const { return 1; }
  610. ALWAYS_INLINE BoundaryCheckType type() const { return static_cast<BoundaryCheckType>(argument(0)); }
  611. ByteString arguments_string() const override { return ByteString::formatted("kind={} ({})", (long unsigned int)argument(0), boundary_check_type_name(type())); }
  612. };
  613. class OpCode_ClearCaptureGroup final : public OpCode {
  614. public:
  615. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  616. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ClearCaptureGroup; }
  617. ALWAYS_INLINE size_t size() const override { return 2; }
  618. ALWAYS_INLINE size_t id() const { return argument(0); }
  619. ByteString arguments_string() const override { return ByteString::formatted("id={}", id()); }
  620. };
  621. class OpCode_SaveLeftCaptureGroup final : public OpCode {
  622. public:
  623. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  624. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftCaptureGroup; }
  625. ALWAYS_INLINE size_t size() const override { return 2; }
  626. ALWAYS_INLINE size_t id() const { return argument(0); }
  627. ByteString arguments_string() const override { return ByteString::formatted("id={}", id()); }
  628. };
  629. class OpCode_SaveRightCaptureGroup final : public OpCode {
  630. public:
  631. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  632. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightCaptureGroup; }
  633. ALWAYS_INLINE size_t size() const override { return 2; }
  634. ALWAYS_INLINE size_t id() const { return argument(0); }
  635. ByteString arguments_string() const override { return ByteString::formatted("id={}", id()); }
  636. };
  637. class OpCode_SaveRightNamedCaptureGroup final : public OpCode {
  638. public:
  639. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  640. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; }
  641. ALWAYS_INLINE size_t size() const override { return 4; }
  642. ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
  643. ALWAYS_INLINE size_t length() const { return argument(1); }
  644. ALWAYS_INLINE size_t id() const { return argument(2); }
  645. ByteString arguments_string() const override
  646. {
  647. return ByteString::formatted("name={}, length={}", name(), length());
  648. }
  649. };
  650. class OpCode_Compare final : public OpCode {
  651. public:
  652. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  653. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Compare; }
  654. ALWAYS_INLINE size_t size() const override { return arguments_size() + 3; }
  655. ALWAYS_INLINE size_t arguments_count() const { return argument(0); }
  656. ALWAYS_INLINE size_t arguments_size() const { return argument(1); }
  657. ByteString arguments_string() const override;
  658. Vector<ByteString> variable_arguments_to_byte_string(Optional<MatchInput const&> input = {}) const;
  659. Vector<CompareTypeAndValuePair> flat_compares() const;
  660. static bool matches_character_class(CharClass, u32, bool insensitive);
  661. private:
  662. ALWAYS_INLINE static void compare_char(MatchInput const& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched);
  663. ALWAYS_INLINE static bool compare_string(MatchInput const& input, MatchState& state, RegexStringView str, bool& had_zero_length_match);
  664. ALWAYS_INLINE static void compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
  665. ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
  666. ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
  667. ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
  668. ALWAYS_INLINE static void compare_script(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
  669. ALWAYS_INLINE static void compare_script_extension(MatchInput const& input, MatchState& state, Unicode::Script script, bool inverse, bool& inverse_matched);
  670. };
  671. class OpCode_Repeat : public OpCode {
  672. public:
  673. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  674. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Repeat; }
  675. ALWAYS_INLINE size_t size() const override { return 4; }
  676. ALWAYS_INLINE size_t offset() const { return argument(0); }
  677. ALWAYS_INLINE u64 count() const { return argument(1); }
  678. ALWAYS_INLINE size_t id() const { return argument(2); }
  679. ByteString arguments_string() const override
  680. {
  681. auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0;
  682. return ByteString::formatted("offset={} count={} id={} rep={}, sp: {}", offset(), count() + 1, id(), reps + 1, state().string_position);
  683. }
  684. };
  685. class OpCode_ResetRepeat : public OpCode {
  686. public:
  687. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  688. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ResetRepeat; }
  689. ALWAYS_INLINE size_t size() const override { return 2; }
  690. ALWAYS_INLINE size_t id() const { return argument(0); }
  691. ByteString arguments_string() const override
  692. {
  693. auto reps = id() < state().repetition_marks.size() ? state().repetition_marks.at(id()) : 0;
  694. return ByteString::formatted("id={} rep={}", id(), reps + 1);
  695. }
  696. };
  697. class OpCode_Checkpoint final : public OpCode {
  698. public:
  699. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  700. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Checkpoint; }
  701. ALWAYS_INLINE size_t size() const override { return 2; }
  702. ALWAYS_INLINE size_t id() const { return argument(0); }
  703. ByteString arguments_string() const override { return ByteString::formatted("id={}", id()); }
  704. };
  705. class OpCode_JumpNonEmpty final : public OpCode {
  706. public:
  707. ExecutionResult execute(MatchInput const& input, MatchState& state) const override;
  708. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::JumpNonEmpty; }
  709. ALWAYS_INLINE size_t size() const override { return 4; }
  710. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  711. ALWAYS_INLINE ssize_t checkpoint() const { return argument(1); }
  712. ALWAYS_INLINE OpCodeId form() const { return (OpCodeId)argument(2); }
  713. ByteString arguments_string() const override
  714. {
  715. return ByteString::formatted("{} offset={} [&{}], cp={}",
  716. opcode_id_name(form()),
  717. offset(), state().instruction_position + size() + offset(),
  718. checkpoint());
  719. }
  720. };
  721. ALWAYS_INLINE OpCode& ByteCode::get_opcode(regex::MatchState& state) const
  722. {
  723. OpCodeId opcode_id;
  724. if (auto opcode_ptr = static_cast<DisjointChunks<ByteCodeValueType> const&>(*this).find(state.instruction_position))
  725. opcode_id = (OpCodeId)*opcode_ptr;
  726. else
  727. opcode_id = OpCodeId::Exit;
  728. auto& opcode = get_opcode_by_id(opcode_id);
  729. opcode.set_state(state);
  730. return opcode;
  731. }
  732. ALWAYS_INLINE OpCode& ByteCode::get_opcode_by_id(OpCodeId id) const
  733. {
  734. VERIFY(id >= OpCodeId::First && id <= OpCodeId::Last);
  735. auto& opcode = s_opcodes[(u32)id];
  736. opcode->set_bytecode(*const_cast<ByteCode*>(this));
  737. return *opcode;
  738. }
  739. template<typename T>
  740. bool is(OpCode const&);
  741. template<typename T>
  742. ALWAYS_INLINE bool is(OpCode const&)
  743. {
  744. return false;
  745. }
  746. template<typename T>
  747. ALWAYS_INLINE bool is(OpCode const* opcode)
  748. {
  749. return is<T>(*opcode);
  750. }
  751. template<>
  752. ALWAYS_INLINE bool is<OpCode_ForkStay>(OpCode const& opcode)
  753. {
  754. return opcode.opcode_id() == OpCodeId::ForkStay;
  755. }
  756. template<>
  757. ALWAYS_INLINE bool is<OpCode_Exit>(OpCode const& opcode)
  758. {
  759. return opcode.opcode_id() == OpCodeId::Exit;
  760. }
  761. template<>
  762. ALWAYS_INLINE bool is<OpCode_Compare>(OpCode const& opcode)
  763. {
  764. return opcode.opcode_id() == OpCodeId::Compare;
  765. }
  766. template<typename T>
  767. ALWAYS_INLINE T const& to(OpCode const& opcode)
  768. {
  769. return verify_cast<T>(opcode);
  770. }
  771. template<typename T>
  772. ALWAYS_INLINE T* to(OpCode* opcode)
  773. {
  774. return verify_cast<T>(opcode);
  775. }
  776. template<typename T>
  777. ALWAYS_INLINE T const* to(OpCode const* opcode)
  778. {
  779. return verify_cast<T>(opcode);
  780. }
  781. template<typename T>
  782. ALWAYS_INLINE T& to(OpCode& opcode)
  783. {
  784. return verify_cast<T>(opcode);
  785. }
  786. }