RegexByteCode.h 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #pragma once
  27. #include "RegexMatch.h"
  28. #include "RegexOptions.h"
  29. #include <AK/Format.h>
  30. #include <AK/Forward.h>
  31. #include <AK/HashMap.h>
  32. #include <AK/NonnullOwnPtr.h>
  33. #include <AK/OwnPtr.h>
  34. #include <AK/Traits.h>
  35. #include <AK/Types.h>
  36. #include <AK/Vector.h>
  37. namespace regex {
  38. using ByteCodeValueType = u64;
  39. #define ENUMERATE_OPCODES \
  40. __ENUMERATE_OPCODE(Compare) \
  41. __ENUMERATE_OPCODE(Jump) \
  42. __ENUMERATE_OPCODE(ForkJump) \
  43. __ENUMERATE_OPCODE(ForkStay) \
  44. __ENUMERATE_OPCODE(FailForks) \
  45. __ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
  46. __ENUMERATE_OPCODE(SaveRightCaptureGroup) \
  47. __ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \
  48. __ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
  49. __ENUMERATE_OPCODE(CheckBegin) \
  50. __ENUMERATE_OPCODE(CheckEnd) \
  51. __ENUMERATE_OPCODE(CheckBoundary) \
  52. __ENUMERATE_OPCODE(Save) \
  53. __ENUMERATE_OPCODE(Restore) \
  54. __ENUMERATE_OPCODE(GoBack) \
  55. __ENUMERATE_OPCODE(Exit)
  56. // clang-format off
  57. enum class OpCodeId : ByteCodeValueType {
  58. #define __ENUMERATE_OPCODE(x) x,
  59. ENUMERATE_OPCODES
  60. #undef __ENUMERATE_OPCODE
  61. First = Compare,
  62. Last = Exit,
  63. };
  64. // clang-format on
  65. #define ENUMERATE_CHARACTER_COMPARE_TYPES \
  66. __ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
  67. __ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
  68. __ENUMERATE_CHARACTER_COMPARE_TYPE(TemporaryInverse) \
  69. __ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
  70. __ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
  71. __ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
  72. __ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
  73. __ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
  74. __ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
  75. __ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \
  76. __ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
  77. enum class CharacterCompareType : ByteCodeValueType {
  78. #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
  79. ENUMERATE_CHARACTER_COMPARE_TYPES
  80. #undef __ENUMERATE_CHARACTER_COMPARE_TYPE
  81. };
  82. #define ENUMERATE_CHARACTER_CLASSES \
  83. __ENUMERATE_CHARACTER_CLASS(Alnum) \
  84. __ENUMERATE_CHARACTER_CLASS(Cntrl) \
  85. __ENUMERATE_CHARACTER_CLASS(Lower) \
  86. __ENUMERATE_CHARACTER_CLASS(Space) \
  87. __ENUMERATE_CHARACTER_CLASS(Alpha) \
  88. __ENUMERATE_CHARACTER_CLASS(Digit) \
  89. __ENUMERATE_CHARACTER_CLASS(Print) \
  90. __ENUMERATE_CHARACTER_CLASS(Upper) \
  91. __ENUMERATE_CHARACTER_CLASS(Blank) \
  92. __ENUMERATE_CHARACTER_CLASS(Graph) \
  93. __ENUMERATE_CHARACTER_CLASS(Punct) \
  94. __ENUMERATE_CHARACTER_CLASS(Word) \
  95. __ENUMERATE_CHARACTER_CLASS(Xdigit)
  96. enum class CharClass : ByteCodeValueType {
  97. #define __ENUMERATE_CHARACTER_CLASS(x) x,
  98. ENUMERATE_CHARACTER_CLASSES
  99. #undef __ENUMERATE_CHARACTER_CLASS
  100. };
  101. #define ENUMERATE_BOUNDARY_CHECK_TYPES \
  102. __ENUMERATE_BOUNDARY_CHECK_TYPE(Word) \
  103. __ENUMERATE_BOUNDARY_CHECK_TYPE(NonWord)
  104. enum class BoundaryCheckType : ByteCodeValueType {
  105. #define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) x,
  106. ENUMERATE_BOUNDARY_CHECK_TYPES
  107. #undef __ENUMERATE_BOUNDARY_CHECK_TYPE
  108. };
  109. struct CharRange {
  110. const u32 from;
  111. const u32 to;
  112. CharRange(u64 value)
  113. : from(value >> 32)
  114. , to(value & 0xffffffff)
  115. {
  116. }
  117. CharRange(u32 from, u32 to)
  118. : from(from)
  119. , to(to)
  120. {
  121. }
  122. operator ByteCodeValueType() const { return ((u64)from << 32) | to; }
  123. };
  124. struct CompareTypeAndValuePair {
  125. CharacterCompareType type;
  126. ByteCodeValueType value;
  127. };
  128. class OpCode;
  129. class ByteCode : public Vector<ByteCodeValueType> {
  130. public:
  131. ByteCode() = default;
  132. ByteCode(const ByteCode&) = default;
  133. virtual ~ByteCode() = default;
  134. ByteCode& operator=(ByteCode&&) = default;
  135. void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
  136. {
  137. ByteCode bytecode;
  138. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
  139. bytecode.empend(pairs.size()); // number of arguments
  140. ByteCode arguments;
  141. for (auto& value : pairs) {
  142. VERIFY(value.type != CharacterCompareType::RangeExpressionDummy);
  143. VERIFY(value.type != CharacterCompareType::Undefined);
  144. VERIFY(value.type != CharacterCompareType::String);
  145. VERIFY(value.type != CharacterCompareType::NamedReference);
  146. arguments.append((ByteCodeValueType)value.type);
  147. if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar && value.type != CharacterCompareType::TemporaryInverse)
  148. arguments.append(move(value.value));
  149. }
  150. bytecode.empend(arguments.size()); // size of arguments
  151. bytecode.append(move(arguments));
  152. append(move(bytecode));
  153. }
  154. void insert_bytecode_check_boundary(BoundaryCheckType type)
  155. {
  156. ByteCode bytecode;
  157. bytecode.empend((ByteCodeValueType)OpCodeId::CheckBoundary);
  158. bytecode.empend((ByteCodeValueType)type);
  159. append(move(bytecode));
  160. }
  161. void insert_bytecode_compare_string(StringView view)
  162. {
  163. ByteCode bytecode;
  164. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
  165. bytecode.empend(static_cast<u64>(1)); // number of arguments
  166. ByteCode arguments;
  167. arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::String));
  168. arguments.insert_string(view);
  169. bytecode.empend(arguments.size()); // size of arguments
  170. bytecode.append(move(arguments));
  171. append(move(bytecode));
  172. }
  173. void insert_bytecode_compare_named_reference(StringView name)
  174. {
  175. ByteCode bytecode;
  176. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
  177. bytecode.empend(static_cast<u64>(1)); // number of arguments
  178. ByteCode arguments;
  179. arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::NamedReference));
  180. arguments.empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
  181. arguments.empend(name.length());
  182. bytecode.empend(arguments.size()); // size of arguments
  183. bytecode.append(move(arguments));
  184. append(move(bytecode));
  185. }
  186. void insert_bytecode_group_capture_left(size_t capture_groups_count)
  187. {
  188. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup));
  189. empend(capture_groups_count);
  190. }
  191. void insert_bytecode_group_capture_left(const StringView& name)
  192. {
  193. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftNamedCaptureGroup));
  194. empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
  195. empend(name.length());
  196. }
  197. void insert_bytecode_group_capture_right(size_t capture_groups_count)
  198. {
  199. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup));
  200. empend(capture_groups_count);
  201. }
  202. void insert_bytecode_group_capture_right(const StringView& name)
  203. {
  204. empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup));
  205. empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
  206. empend(name.length());
  207. }
  208. enum class LookAroundType {
  209. LookAhead,
  210. LookBehind,
  211. NegatedLookAhead,
  212. NegatedLookBehind,
  213. };
  214. void insert_bytecode_lookaround(ByteCode&& lookaround_body, LookAroundType type, size_t match_length = 0)
  215. {
  216. // FIXME: The save stack will grow infinitely with repeated failures
  217. // as we do not discard that on failure (we don't necessarily know how many to pop with the current architecture).
  218. switch (type) {
  219. case LookAroundType::LookAhead: {
  220. // SAVE
  221. // REGEXP BODY
  222. // RESTORE
  223. empend((ByteCodeValueType)OpCodeId::Save);
  224. append(move(lookaround_body));
  225. empend((ByteCodeValueType)OpCodeId::Restore);
  226. return;
  227. }
  228. case LookAroundType::NegatedLookAhead: {
  229. // JUMP _A
  230. // LABEL _L
  231. // REGEXP BODY
  232. // FAIL 2
  233. // LABEL _A
  234. // SAVE
  235. // FORKJUMP _L
  236. // RESTORE
  237. auto body_length = lookaround_body.size();
  238. empend((ByteCodeValueType)OpCodeId::Jump);
  239. empend((ByteCodeValueType)body_length + 2); // JUMP to label _A
  240. append(move(lookaround_body));
  241. empend((ByteCodeValueType)OpCodeId::FailForks);
  242. empend((ByteCodeValueType)2); // Fail two forks
  243. empend((ByteCodeValueType)OpCodeId::Save);
  244. empend((ByteCodeValueType)OpCodeId::ForkJump);
  245. empend((ByteCodeValueType) - (body_length + 5)); // JUMP to label _L
  246. empend((ByteCodeValueType)OpCodeId::Restore);
  247. return;
  248. }
  249. case LookAroundType::LookBehind:
  250. // SAVE
  251. // GOBACK match_length(BODY)
  252. // REGEXP BODY
  253. // RESTORE
  254. empend((ByteCodeValueType)OpCodeId::Save);
  255. empend((ByteCodeValueType)OpCodeId::GoBack);
  256. empend((ByteCodeValueType)match_length);
  257. append(move(lookaround_body));
  258. empend((ByteCodeValueType)OpCodeId::Restore);
  259. return;
  260. case LookAroundType::NegatedLookBehind: {
  261. // JUMP _A
  262. // LABEL _L
  263. // GOBACK match_length(BODY)
  264. // REGEXP BODY
  265. // FAIL 2
  266. // LABEL _A
  267. // SAVE
  268. // FORKJUMP _L
  269. // RESTORE
  270. auto body_length = lookaround_body.size();
  271. empend((ByteCodeValueType)OpCodeId::Jump);
  272. empend((ByteCodeValueType)body_length + 4); // JUMP to label _A
  273. empend((ByteCodeValueType)OpCodeId::GoBack);
  274. empend((ByteCodeValueType)match_length);
  275. append(move(lookaround_body));
  276. empend((ByteCodeValueType)OpCodeId::FailForks);
  277. empend((ByteCodeValueType)2); // Fail two forks
  278. empend((ByteCodeValueType)OpCodeId::Save);
  279. empend((ByteCodeValueType)OpCodeId::ForkJump);
  280. empend((ByteCodeValueType) - (body_length + 7)); // JUMP to label _L
  281. empend((ByteCodeValueType)OpCodeId::Restore);
  282. return;
  283. }
  284. }
  285. VERIFY_NOT_REACHED();
  286. }
  287. void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right)
  288. {
  289. // FORKJUMP _ALT
  290. // REGEXP ALT2
  291. // JUMP _END
  292. // LABEL _ALT
  293. // REGEXP ALT1
  294. // LABEL _END
  295. ByteCode byte_code;
  296. empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  297. empend(right.size() + 2); // Jump to the _ALT label
  298. for (auto& op : right)
  299. append(move(op));
  300. empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
  301. empend(left.size()); // Jump to the _END label
  302. // LABEL _ALT = bytecode.size() + 2
  303. for (auto& op : left)
  304. append(move(op));
  305. // LABEL _END = alterantive_bytecode.size
  306. }
  307. void insert_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum)
  308. {
  309. ByteCode new_bytecode;
  310. new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
  311. if (maximum.has_value()) {
  312. if (maximum.value() > minimum) {
  313. auto diff = maximum.value() - minimum;
  314. new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  315. new_bytecode.empend(diff * (bytecode_to_repeat.size() + 2)); // Jump to the _END label
  316. for (size_t i = 0; i < diff; ++i) {
  317. new_bytecode.append(bytecode_to_repeat);
  318. new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  319. new_bytecode.empend((diff - i - 1) * (bytecode_to_repeat.size() + 2)); // Jump to the _END label
  320. }
  321. }
  322. } else {
  323. // no maximum value set, repeat finding if possible
  324. new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  325. new_bytecode.empend(-bytecode_to_repeat.size() - 2); // Jump to the last iteration
  326. }
  327. bytecode_to_repeat = move(new_bytecode);
  328. }
  329. void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, size_t n)
  330. {
  331. for (size_t i = 0; i < n; ++i)
  332. append(bytecode_to_repeat);
  333. }
  334. void insert_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy)
  335. {
  336. // LABEL _START = -bytecode_to_repeat.size()
  337. // REGEXP
  338. // FORKSTAY _START (FORKJUMP -> Greedy)
  339. if (greedy)
  340. bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  341. else
  342. bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  343. bytecode_to_repeat.empend(-(bytecode_to_repeat.size() + 1)); // Jump to the _START label
  344. }
  345. void insert_bytecode_repetition_any(ByteCode& bytecode_to_repeat, bool greedy)
  346. {
  347. // LABEL _START
  348. // FORKJUMP _END (FORKSTAY -> Greedy)
  349. // REGEXP
  350. // JUMP _START
  351. // LABEL _END
  352. // LABEL _START = m_bytes.size();
  353. ByteCode bytecode;
  354. if (greedy)
  355. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  356. else
  357. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  358. bytecode.empend(bytecode_to_repeat.size() + 2); // Jump to the _END label
  359. for (auto& op : bytecode_to_repeat)
  360. bytecode.append(move(op));
  361. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
  362. bytecode.empend(-bytecode.size() - 1); // Jump to the _START label
  363. // LABEL _END = bytecode.size()
  364. bytecode_to_repeat = move(bytecode);
  365. }
  366. void insert_bytecode_repetition_zero_or_one(ByteCode& bytecode_to_repeat, bool greedy)
  367. {
  368. // FORKJUMP _END (FORKSTAY -> Greedy)
  369. // REGEXP
  370. // LABEL _END
  371. ByteCode bytecode;
  372. if (greedy)
  373. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
  374. else
  375. bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  376. bytecode.empend(bytecode_to_repeat.size()); // Jump to the _END label
  377. for (auto& op : bytecode_to_repeat)
  378. bytecode.append(move(op));
  379. // LABEL _END = bytecode.size()
  380. bytecode_to_repeat = move(bytecode);
  381. }
  382. OpCode* get_opcode(MatchState& state) const;
  383. private:
  384. void insert_string(const StringView& view)
  385. {
  386. empend((ByteCodeValueType)view.length());
  387. for (size_t i = 0; i < view.length(); ++i)
  388. empend((ByteCodeValueType)view[i]);
  389. }
  390. ALWAYS_INLINE OpCode* get_opcode_by_id(OpCodeId id) const;
  391. static HashMap<u32, OwnPtr<OpCode>> s_opcodes;
  392. };
  393. #define ENUMERATE_EXECUTION_RESULTS \
  394. __ENUMERATE_EXECUTION_RESULT(Continue) \
  395. __ENUMERATE_EXECUTION_RESULT(Fork_PrioHigh) \
  396. __ENUMERATE_EXECUTION_RESULT(Fork_PrioLow) \
  397. __ENUMERATE_EXECUTION_RESULT(Failed) \
  398. __ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForks) \
  399. __ENUMERATE_EXECUTION_RESULT(Succeeded)
  400. enum class ExecutionResult : u8 {
  401. #define __ENUMERATE_EXECUTION_RESULT(x) x,
  402. ENUMERATE_EXECUTION_RESULTS
  403. #undef __ENUMERATE_EXECUTION_RESULT
  404. };
  405. const char* execution_result_name(ExecutionResult result);
  406. const char* opcode_id_name(OpCodeId opcode_id);
  407. const char* boundary_check_type_name(BoundaryCheckType);
  408. const char* character_compare_type_name(CharacterCompareType result);
  409. const char* execution_result_name(ExecutionResult result);
  410. class OpCode {
  411. public:
  412. OpCode(ByteCode& bytecode)
  413. : m_bytecode(&bytecode)
  414. {
  415. }
  416. virtual ~OpCode() = default;
  417. virtual OpCodeId opcode_id() const = 0;
  418. virtual size_t size() const = 0;
  419. virtual ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const = 0;
  420. ALWAYS_INLINE ByteCodeValueType argument(size_t offset) const
  421. {
  422. VERIFY(state().instruction_position + offset <= m_bytecode->size());
  423. return m_bytecode->at(state().instruction_position + 1 + offset);
  424. }
  425. ALWAYS_INLINE const char* name() const;
  426. static const char* name(const OpCodeId);
  427. ALWAYS_INLINE OpCode* set_state(MatchState& state)
  428. {
  429. m_state = &state;
  430. return this;
  431. }
  432. ALWAYS_INLINE OpCode* set_bytecode(ByteCode& bytecode)
  433. {
  434. m_bytecode = &bytecode;
  435. return this;
  436. }
  437. ALWAYS_INLINE void reset_state() { m_state.clear(); }
  438. ALWAYS_INLINE const MatchState& state() const
  439. {
  440. VERIFY(m_state.has_value());
  441. return *m_state.value();
  442. }
  443. const String to_string() const
  444. {
  445. return String::format("[0x%02X] %s", (int)opcode_id(), name(opcode_id()));
  446. }
  447. virtual const String arguments_string() const = 0;
  448. ALWAYS_INLINE const ByteCode& bytecode() const { return *m_bytecode; }
  449. protected:
  450. ByteCode* m_bytecode;
  451. Optional<MatchState*> m_state;
  452. };
  453. class OpCode_Exit final : public OpCode {
  454. public:
  455. OpCode_Exit(ByteCode& bytecode)
  456. : OpCode(bytecode)
  457. {
  458. }
  459. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  460. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Exit; }
  461. ALWAYS_INLINE size_t size() const override { return 1; }
  462. const String arguments_string() const override { return ""; }
  463. };
  464. class OpCode_FailForks final : public OpCode {
  465. public:
  466. OpCode_FailForks(ByteCode& bytecode)
  467. : OpCode(bytecode)
  468. {
  469. }
  470. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  471. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::FailForks; }
  472. ALWAYS_INLINE size_t size() const override { return 2; }
  473. ALWAYS_INLINE size_t count() const { return argument(0); }
  474. const String arguments_string() const override { return String::formatted("count={}", count()); }
  475. };
  476. class OpCode_Save final : public OpCode {
  477. public:
  478. OpCode_Save(ByteCode& bytecode)
  479. : OpCode(bytecode)
  480. {
  481. }
  482. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  483. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Save; }
  484. ALWAYS_INLINE size_t size() const override { return 1; }
  485. const String arguments_string() const override { return ""; }
  486. };
  487. class OpCode_Restore final : public OpCode {
  488. public:
  489. OpCode_Restore(ByteCode& bytecode)
  490. : OpCode(bytecode)
  491. {
  492. }
  493. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  494. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Restore; }
  495. ALWAYS_INLINE size_t size() const override { return 1; }
  496. const String arguments_string() const override { return ""; }
  497. };
  498. class OpCode_GoBack final : public OpCode {
  499. public:
  500. OpCode_GoBack(ByteCode& bytecode)
  501. : OpCode(bytecode)
  502. {
  503. }
  504. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  505. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::GoBack; }
  506. ALWAYS_INLINE size_t size() const override { return 2; }
  507. ALWAYS_INLINE size_t count() const { return argument(0); }
  508. const String arguments_string() const override { return String::formatted("count={}", count()); }
  509. };
  510. class OpCode_Jump final : public OpCode {
  511. public:
  512. OpCode_Jump(ByteCode& bytecode)
  513. : OpCode(bytecode)
  514. {
  515. }
  516. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  517. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Jump; }
  518. ALWAYS_INLINE size_t size() const override { return 2; }
  519. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  520. const String arguments_string() const override
  521. {
  522. return String::format("offset=%zd [&%zu]", offset(), state().instruction_position + size() + offset());
  523. }
  524. };
  525. class OpCode_ForkJump final : public OpCode {
  526. public:
  527. OpCode_ForkJump(ByteCode& bytecode)
  528. : OpCode(bytecode)
  529. {
  530. }
  531. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  532. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkJump; }
  533. ALWAYS_INLINE size_t size() const override { return 2; }
  534. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  535. const String arguments_string() const override
  536. {
  537. return String::format("offset=%zd [&%zu], sp: %zu", offset(), state().instruction_position + size() + offset(), state().string_position);
  538. }
  539. };
  540. class OpCode_ForkStay final : public OpCode {
  541. public:
  542. OpCode_ForkStay(ByteCode& bytecode)
  543. : OpCode(bytecode)
  544. {
  545. }
  546. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  547. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkStay; }
  548. ALWAYS_INLINE size_t size() const override { return 2; }
  549. ALWAYS_INLINE ssize_t offset() const { return argument(0); }
  550. const String arguments_string() const override
  551. {
  552. return String::format("offset=%zd [&%zu], sp: %zu", offset(), state().instruction_position + size() + offset(), state().string_position);
  553. }
  554. };
  555. class OpCode_CheckBegin final : public OpCode {
  556. public:
  557. OpCode_CheckBegin(ByteCode& bytecode)
  558. : OpCode(bytecode)
  559. {
  560. }
  561. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  562. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBegin; }
  563. ALWAYS_INLINE size_t size() const override { return 1; }
  564. const String arguments_string() const override { return ""; }
  565. };
  566. class OpCode_CheckEnd final : public OpCode {
  567. public:
  568. OpCode_CheckEnd(ByteCode& bytecode)
  569. : OpCode(bytecode)
  570. {
  571. }
  572. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  573. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckEnd; }
  574. ALWAYS_INLINE size_t size() const override { return 1; }
  575. const String arguments_string() const override { return ""; }
  576. };
  577. class OpCode_CheckBoundary final : public OpCode {
  578. public:
  579. OpCode_CheckBoundary(ByteCode& bytecode)
  580. : OpCode(bytecode)
  581. {
  582. }
  583. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  584. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBoundary; }
  585. ALWAYS_INLINE size_t size() const override { return 2; }
  586. ALWAYS_INLINE size_t arguments_count() const { return 1; }
  587. ALWAYS_INLINE BoundaryCheckType type() const { return static_cast<BoundaryCheckType>(argument(0)); }
  588. const String arguments_string() const override { return String::format("kind=%lu (%s)", (long unsigned int)argument(0), boundary_check_type_name(type())); }
  589. };
  590. class OpCode_SaveLeftCaptureGroup final : public OpCode {
  591. public:
  592. OpCode_SaveLeftCaptureGroup(ByteCode& bytecode)
  593. : OpCode(bytecode)
  594. {
  595. }
  596. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  597. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftCaptureGroup; }
  598. ALWAYS_INLINE size_t size() const override { return 2; }
  599. ALWAYS_INLINE size_t id() const { return argument(0); }
  600. const String arguments_string() const override { return String::format("id=%lu", id()); }
  601. };
  602. class OpCode_SaveRightCaptureGroup final : public OpCode {
  603. public:
  604. OpCode_SaveRightCaptureGroup(ByteCode& bytecode)
  605. : OpCode(bytecode)
  606. {
  607. }
  608. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  609. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightCaptureGroup; }
  610. ALWAYS_INLINE size_t size() const override { return 2; }
  611. ALWAYS_INLINE size_t id() const { return argument(0); }
  612. const String arguments_string() const override { return String::format("id=%lu", id()); }
  613. };
  614. class OpCode_SaveLeftNamedCaptureGroup final : public OpCode {
  615. public:
  616. OpCode_SaveLeftNamedCaptureGroup(ByteCode& bytecode)
  617. : OpCode(bytecode)
  618. {
  619. }
  620. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  621. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftNamedCaptureGroup; }
  622. ALWAYS_INLINE size_t size() const override { return 3; }
  623. ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
  624. ALWAYS_INLINE size_t length() const { return argument(1); }
  625. const String arguments_string() const override
  626. {
  627. return String::format("name=%s, length=%lu", name().to_string().characters(), length());
  628. }
  629. };
  630. class OpCode_SaveRightNamedCaptureGroup final : public OpCode {
  631. public:
  632. OpCode_SaveRightNamedCaptureGroup(ByteCode& bytecode)
  633. : OpCode(bytecode)
  634. {
  635. }
  636. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  637. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; }
  638. ALWAYS_INLINE size_t size() const override { return 3; }
  639. ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
  640. ALWAYS_INLINE size_t length() const { return argument(1); }
  641. const String arguments_string() const override
  642. {
  643. return String::format("name=%s, length=%zu", name().to_string().characters(), length());
  644. }
  645. };
  646. class OpCode_Compare final : public OpCode {
  647. public:
  648. OpCode_Compare(ByteCode& bytecode)
  649. : OpCode(bytecode)
  650. {
  651. }
  652. ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
  653. ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Compare; }
  654. ALWAYS_INLINE size_t size() const override { return arguments_size() + 3; }
  655. ALWAYS_INLINE size_t arguments_count() const { return argument(0); }
  656. ALWAYS_INLINE size_t arguments_size() const { return argument(1); }
  657. const String arguments_string() const override;
  658. const Vector<String> variable_arguments_to_string(Optional<MatchInput> input = {}) const;
  659. private:
  660. ALWAYS_INLINE static void compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched);
  661. ALWAYS_INLINE static bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match);
  662. ALWAYS_INLINE static void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
  663. ALWAYS_INLINE static void compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
  664. };
  665. template<typename T>
  666. bool is(const OpCode&);
  667. template<typename T>
  668. ALWAYS_INLINE bool is(const OpCode&)
  669. {
  670. return false;
  671. }
  672. template<typename T>
  673. ALWAYS_INLINE bool is(const OpCode* opcode)
  674. {
  675. return is<T>(*opcode);
  676. }
  677. template<>
  678. ALWAYS_INLINE bool is<OpCode_ForkStay>(const OpCode& opcode)
  679. {
  680. return opcode.opcode_id() == OpCodeId::ForkStay;
  681. }
  682. template<>
  683. ALWAYS_INLINE bool is<OpCode_Exit>(const OpCode& opcode)
  684. {
  685. return opcode.opcode_id() == OpCodeId::Exit;
  686. }
  687. template<>
  688. ALWAYS_INLINE bool is<OpCode_Compare>(const OpCode& opcode)
  689. {
  690. return opcode.opcode_id() == OpCodeId::Compare;
  691. }
  692. template<typename T>
  693. ALWAYS_INLINE const T& to(const OpCode& opcode)
  694. {
  695. VERIFY(is<T>(opcode));
  696. return static_cast<const T&>(opcode);
  697. }
  698. template<typename T>
  699. ALWAYS_INLINE T* to(OpCode* opcode)
  700. {
  701. VERIFY(is<T>(opcode));
  702. return static_cast<T*>(opcode);
  703. }
  704. template<typename T>
  705. ALWAYS_INLINE const T* to(const OpCode* opcode)
  706. {
  707. VERIFY(is<T>(opcode));
  708. return static_cast<const T*>(opcode);
  709. }
  710. template<typename T>
  711. ALWAYS_INLINE T& to(OpCode& opcode)
  712. {
  713. VERIFY(is<T>(opcode));
  714. return static_cast<T&>(opcode);
  715. }
  716. }