// RegexOptimizer.cpp

/*
 * Copyright (c) 2021, Ali Mohammad Pur <mpfard@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/QuickSort.h>
#include <AK/RedBlackTree.h>
#include <AK/Stack.h>
#include <LibRegex/Regex.h>
#include <LibRegex/RegexBytecodeStreamOptimizer.h>

namespace regex {

using Detail::Block;

template<typename Parser>
void Regex<Parser>::run_optimization_passes()
{
    parser_result.bytecode.flatten();

    // Rewrite fork loops as atomic groups
    // e.g. a*b -> (ATOMIC a*)b
    attempt_rewrite_loops_as_atomic_groups(split_basic_blocks(parser_result.bytecode));

    parser_result.bytecode.flatten();
}

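// A basic block here is a maximal straight-line run of bytecode: jump-like
// opcodes (Jump, JumpNonEmpty, ForkJump, ForkStay, FailForks, Repeat) end a
// block, and every jump target starts one.
//
// Illustrative sketch (schematic, not the exact opcode encoding) for /a+b/:
//     bb0 | Compare 'a'
//         | ForkJump bb0    <- the backwards jump closes the block
//     ----------------------
//     bb1 | Compare 'b'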
template<typename Parser>
typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks(ByteCode const& bytecode)
{
    BasicBlockList block_boundaries;
    size_t end_of_last_block = 0;

    MatchState state;
    state.instruction_position = 0;
    auto check_jump = [&]<typename T>(OpCode const& opcode) {
        auto& op = static_cast<T const&>(opcode);
        ssize_t jump_offset = op.size() + op.offset();
        if (jump_offset >= 0) {
            block_boundaries.append({ end_of_last_block, state.instruction_position });
            end_of_last_block = state.instruction_position + opcode.size();
        } else {
            // This op jumps back; see if the target is within this "block".
            if (jump_offset + state.instruction_position > end_of_last_block) {
                // Split the block!
                block_boundaries.append({ end_of_last_block, jump_offset + state.instruction_position });
                block_boundaries.append({ jump_offset + state.instruction_position, state.instruction_position });
                end_of_last_block = state.instruction_position + opcode.size();
            } else {
                // Nope, it's just a jump to another block.
                block_boundaries.append({ end_of_last_block, state.instruction_position });
                end_of_last_block = state.instruction_position + opcode.size();
            }
        }
    };
    for (;;) {
        auto& opcode = bytecode.get_opcode(state);

        switch (opcode.opcode_id()) {
        case OpCodeId::Jump:
            check_jump.template operator()<OpCode_Jump>(opcode);
            break;
        case OpCodeId::JumpNonEmpty:
            check_jump.template operator()<OpCode_JumpNonEmpty>(opcode);
            break;
        case OpCodeId::ForkJump:
            check_jump.template operator()<OpCode_ForkJump>(opcode);
            break;
        case OpCodeId::ForkStay:
            check_jump.template operator()<OpCode_ForkStay>(opcode);
            break;
        case OpCodeId::FailForks:
            block_boundaries.append({ end_of_last_block, state.instruction_position });
            end_of_last_block = state.instruction_position + opcode.size();
            break;
        case OpCodeId::Repeat: {
            // Repeat produces two blocks: one containing its repeated expression, and one after that.
            auto repeat_start = state.instruction_position - static_cast<OpCode_Repeat const&>(opcode).offset();
            if (repeat_start > end_of_last_block)
                block_boundaries.append({ end_of_last_block, repeat_start });
            block_boundaries.append({ repeat_start, state.instruction_position });
            end_of_last_block = state.instruction_position + opcode.size();
            break;
        }
        default:
            break;
        }

        auto next_ip = state.instruction_position + opcode.size();
        if (next_ip < bytecode.size())
            state.instruction_position = next_ip;
        else
            break;
    }

    if (end_of_last_block < bytecode.size())
        block_boundaries.append({ end_of_last_block, bytecode.size() });

    quick_sort(block_boundaries, [](auto& a, auto& b) { return a.start < b.start; });

    return block_boundaries;
}

enum class AtomicRewritePreconditionResult {
    SatisfiedWithProperHeader,
    SatisfiedWithEmptyHeader,
    NotSatisfied,
};
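
// The atomic rewrite is only sound if the block following the loop cannot
// begin with anything the loop body may have just matched; otherwise,
// dropping the fork's saved states would change what the pattern matches.
//
// Intuition by example (assumed patterns, schematic reasoning):
//     /a*b/  -> satisfied: the follow starts with 'b', which 'a*' never matches.
//     /a*ab/ -> not satisfied: the follow can start with 'a', so backtracking
//               into the loop may be required.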
static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode, Block const& repeated_block, Block const& following_block)
{
    Vector<Vector<CompareTypeAndValuePair>> repeated_values;
    HashTable<size_t> active_capture_groups;
    MatchState state;
    for (state.instruction_position = repeated_block.start; state.instruction_position < repeated_block.end;) {
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::Compare: {
            auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
            if (repeated_values.is_empty() && any_of(compares, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
                return AtomicRewritePreconditionResult::NotSatisfied;
            repeated_values.append(move(compares));
            break;
        }
        case OpCodeId::CheckBegin:
        case OpCodeId::CheckEnd:
            if (repeated_values.is_empty())
                return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
            break;
        case OpCodeId::CheckBoundary:
            // FIXME: What should we do with these? For now, let's fail.
            return AtomicRewritePreconditionResult::NotSatisfied;
        case OpCodeId::Restore:
        case OpCodeId::GoBack:
            return AtomicRewritePreconditionResult::NotSatisfied;
        case OpCodeId::SaveRightCaptureGroup:
            active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
            break;
        case OpCodeId::SaveLeftCaptureGroup:
            active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
            break;
        default:
            break;
        }

        state.instruction_position += opcode.size();
    }

    dbgln_if(REGEX_DEBUG, "Found {} entries in reference", repeated_values.size());
    dbgln_if(REGEX_DEBUG, "Found {} active capture groups", active_capture_groups.size());

    bool following_block_has_at_least_one_compare = false;
    // Find the first compare in the following block; it must NOT match any of the values in `repeated_values'.
    for (state.instruction_position = following_block.start; state.instruction_position < following_block.end;) {
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        // Note: These have to exist since we're effectively repeating the following block as well.
        case OpCodeId::SaveRightCaptureGroup:
            active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
            break;
        case OpCodeId::SaveLeftCaptureGroup:
            active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
            break;
        case OpCodeId::Compare: {
            following_block_has_at_least_one_compare = true;
            // We found a compare, let's see what it has.
            auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
            if (compares.is_empty())
                break;

            if (any_of(compares, [&](auto& compare) {
                    return compare.type == CharacterCompareType::AnyChar
                        || (compare.type == CharacterCompareType::Reference && active_capture_groups.contains(compare.value));
                }))
                return AtomicRewritePreconditionResult::NotSatisfied;

            for (auto& repeated_value : repeated_values) {
                // FIXME: This is too naive!
                if (any_of(repeated_value, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
                    return AtomicRewritePreconditionResult::NotSatisfied;

                for (auto& repeated_compare : repeated_value) {
                    // FIXME: This is too naive! It will miss _tons_ of cases since it doesn't check ranges!
                    if (any_of(compares, [&](auto& compare) { return compare.type == repeated_compare.type && compare.value == repeated_compare.value; }))
                        return AtomicRewritePreconditionResult::NotSatisfied;
                }
            }
            return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
        }
        case OpCodeId::CheckBegin:
        case OpCodeId::CheckEnd:
            return AtomicRewritePreconditionResult::SatisfiedWithProperHeader; // Nothing can match the end!
        case OpCodeId::CheckBoundary:
            // FIXME: What should we do with these? For now, consider them a failure.
            return AtomicRewritePreconditionResult::NotSatisfied;
        default:
            break;
        }

        state.instruction_position += opcode.size();
    }

    if (following_block_has_at_least_one_compare)
        return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
    return AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader;
}

template<typename Parser>
void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const& basic_blocks)
{
    auto& bytecode = parser_result.bytecode;
    if constexpr (REGEX_DEBUG) {
        RegexDebug dbg;
        dbg.print_bytecode(*this);
        for (auto& block : basic_blocks)
            dbgln("block from {} to {}", block.start, block.end);
    }

    // A pattern such as:
    //     bb0      |  RE0
    //              |  ForkX bb0
    //     -------------------------
    //     bb1      |  RE1
    // can be rewritten as:
    //     loop.hdr |  ForkStay bb1 (if RE1 matches _something_, empty otherwise)
    //     -------------------------
    //     bb0      |  RE0
    //              |  ForkReplaceX bb0
    //     -------------------------
    //     bb1      |  RE1
    // provided that first(RE1) not-in end(RE0), which is to say
    // that RE1 cannot start with whatever RE0 has matched (ever).
    //
    // Alternatively, a second form of this pattern can also occur:
    //     bb0 | *
    //         | ForkX bb2
    //     ------------------------
    //     bb1 | RE0
    //         | Jump bb0
    //     ------------------------
    //     bb2 | RE1
    // which can be transformed (with the same preconditions) to:
    //     bb0 | *
    //         | ForkReplaceX bb2
    //     ------------------------
    //     bb1 | RE0
    //         | Jump bb0
    //     ------------------------
    //     bb2 | RE1
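    //
    // Illustrative example (assumed pattern, schematic bytecode): /a+b/ takes
    // the first form, with RE0 = `a` and RE1 = `b`. Since `b` can never match
    // what `a` just matched, the fork's saved states can never lead to a
    // match, and replacing the fork lets the engine drop them instead of
    // revisiting the loop on failure.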
    enum class AlternateForm {
        DirectLoopWithoutHeader,               // Loop without a proper header: a block forking to itself (the first form).
        DirectLoopWithoutHeaderAndEmptyFollow, // Loop without a proper header (the first form), but with RE1 being empty.
        DirectLoopWithHeader,                  // Loop with a proper header (the second form).
    };
    struct CandidateBlock {
        Block forking_block;
        Optional<Block> new_target_block;
        AlternateForm form;
    };
    Vector<CandidateBlock> candidate_blocks;
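
    // A jump is "eligible" when it lands exactly on `block_start`, i.e. the
    // instruction loops back to the start of the block under consideration,
    // in whichever shape the requested AlternateForm allows.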
    auto is_an_eligible_jump = [](OpCode const& opcode, size_t ip, size_t block_start, AlternateForm alternate_form) {
        switch (opcode.opcode_id()) {
        case OpCodeId::JumpNonEmpty: {
            auto& op = static_cast<OpCode_JumpNonEmpty const&>(opcode);
            auto form = op.form();
            if (form != OpCodeId::Jump && alternate_form == AlternateForm::DirectLoopWithHeader)
                return false;
            if (form != OpCodeId::ForkJump && form != OpCodeId::ForkStay && alternate_form == AlternateForm::DirectLoopWithoutHeader)
                return false;
            return op.offset() + ip + opcode.size() == block_start;
        }
        case OpCodeId::ForkJump:
            if (alternate_form == AlternateForm::DirectLoopWithHeader)
                return false;
            return static_cast<OpCode_ForkJump const&>(opcode).offset() + ip + opcode.size() == block_start;
        case OpCodeId::ForkStay:
            if (alternate_form == AlternateForm::DirectLoopWithHeader)
                return false;
            return static_cast<OpCode_ForkStay const&>(opcode).offset() + ip + opcode.size() == block_start;
        case OpCodeId::Jump:
            // Infinite loop does *not* produce forks.
            if (alternate_form == AlternateForm::DirectLoopWithoutHeader)
                return false;
            if (alternate_form == AlternateForm::DirectLoopWithHeader)
                return static_cast<OpCode_Jump const&>(opcode).offset() + ip + opcode.size() == block_start;
            VERIFY_NOT_REACHED();
        default:
            return false;
        }
    };
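
    // Scan the blocks for the two shapes described above; note that the
    // `break`s below stop at the first candidate found, so at most one loop
    // is rewritten per call.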
    for (size_t i = 0; i < basic_blocks.size(); ++i) {
        auto forking_block = basic_blocks[i];
        Optional<Block> fork_fallback_block;
        if (i + 1 < basic_blocks.size())
            fork_fallback_block = basic_blocks[i + 1];
        MatchState state;
        // Check if the last instruction in this block is a jump to the block itself:
        {
            state.instruction_position = forking_block.end;
            auto& opcode = bytecode.get_opcode(state);
            if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.start, AlternateForm::DirectLoopWithoutHeader)) {
                // We've found RE0 (and RE1 is just the following block, if any); let's see if the precondition applies.
                // If RE1 is empty, there's no first(RE1), so this is an automatic pass.
                if (!fork_fallback_block.has_value() || fork_fallback_block->end == fork_fallback_block->start) {
                    candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
                    break;
                }

                auto precondition = block_satisfies_atomic_rewrite_precondition(bytecode, forking_block, *fork_fallback_block);
                if (precondition == AtomicRewritePreconditionResult::SatisfiedWithProperHeader) {
                    candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
                    break;
                }
                if (precondition == AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader) {
                    candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeaderAndEmptyFollow });
                    break;
                }
            }
        }
        // Check if the last instruction in the fallback block is a direct jump to this block:
        if (fork_fallback_block.has_value()) {
            state.instruction_position = fork_fallback_block->end;
            auto& opcode = bytecode.get_opcode(state);
            if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.start, AlternateForm::DirectLoopWithHeader)) {
                // We've found bb1 and bb0; let's just make sure that bb0 forks to bb2.
                state.instruction_position = forking_block.end;
                auto& opcode = bytecode.get_opcode(state);
                if (opcode.opcode_id() == OpCodeId::ForkJump || opcode.opcode_id() == OpCodeId::ForkStay) {
                    Optional<Block> block_following_fork_fallback;
                    if (i + 2 < basic_blocks.size())
                        block_following_fork_fallback = basic_blocks[i + 2];
                    if (!block_following_fork_fallback.has_value()
                        || block_satisfies_atomic_rewrite_precondition(bytecode, *fork_fallback_block, *block_following_fork_fallback) != AtomicRewritePreconditionResult::NotSatisfied) {
                        candidate_blocks.append({ forking_block, {}, AlternateForm::DirectLoopWithHeader });
                        break;
                    }
                }
            }
        }
    }

    dbgln_if(REGEX_DEBUG, "Found {} candidate blocks", candidate_blocks.size());
    if (candidate_blocks.is_empty()) {
        dbgln_if(REGEX_DEBUG, "Failed to find anything for {}", pattern_value);
        return;
    }

    RedBlackTree<size_t, size_t> needed_patches;

    // Reverse the blocks, so we can patch the bytecode without messing with the latter patches.
    quick_sort(candidate_blocks, [](auto& a, auto& b) { return b.forking_block.start > a.forking_block.start; });
    for (auto& candidate : candidate_blocks) {
        // Note that both forms share a ForkReplace patch in forking_block.
        // Patch the ForkX in forking_block to be a ForkReplaceX instead.
        auto& opcode_id = bytecode[candidate.forking_block.end];
        if (opcode_id == (ByteCodeValueType)OpCodeId::ForkStay) {
            opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceStay;
        } else if (opcode_id == (ByteCodeValueType)OpCodeId::ForkJump) {
            opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceJump;
        } else if (opcode_id == (ByteCodeValueType)OpCodeId::JumpNonEmpty) {
            // The form of a JumpNonEmpty is its third argument (ip + 3).
            auto& jump_opcode_id = bytecode[candidate.forking_block.end + 3];
            if (jump_opcode_id == (ByteCodeValueType)OpCodeId::ForkStay)
                jump_opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceStay;
            else if (jump_opcode_id == (ByteCodeValueType)OpCodeId::ForkJump)
                jump_opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceJump;
            else
                VERIFY_NOT_REACHED();
        } else {
            VERIFY_NOT_REACHED();
        }

        if (candidate.form == AlternateForm::DirectLoopWithoutHeader) {
            if (candidate.new_target_block.has_value()) {
                // Insert a fork-stay targeted at the second block.
                bytecode.insert(candidate.forking_block.start, (ByteCodeValueType)OpCodeId::ForkStay);
                bytecode.insert(candidate.forking_block.start + 1, candidate.new_target_block->start - candidate.forking_block.start);
                needed_patches.insert(candidate.forking_block.start, 2u);
            }
        }
    }
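
    // Inserting the loop header shifted everything at or after the insertion
    // point by two words (the ForkStay opcode plus its offset), so any
    // relative jump whose source and target straddle an insertion point is
    // now off by that amount; walk the bytecode and fix such offsets up.
    //
    // Illustrative (assumed word positions): a forward jump from ip 10 to
    // ip 30 with two words inserted at ip 20 must have its offset increased
    // by 2, while a jump lying entirely before or after the insertion point
    // needs no fix.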
    if (!needed_patches.is_empty()) {
        MatchState state;
        state.instruction_position = 0;
        struct Patch {
            ssize_t value;
            size_t offset;
            bool should_negate { false };
        };
        for (;;) {
            if (state.instruction_position >= bytecode.size())
                break;

            auto& opcode = bytecode.get_opcode(state);
            Stack<Patch, 2> patch_points;

            switch (opcode.opcode_id()) {
            case OpCodeId::Jump:
                patch_points.push({ static_cast<OpCode_Jump const&>(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::JumpNonEmpty:
                patch_points.push({ static_cast<OpCode_JumpNonEmpty const&>(opcode).offset(), state.instruction_position + 1 });
                patch_points.push({ static_cast<OpCode_JumpNonEmpty const&>(opcode).checkpoint(), state.instruction_position + 2 });
                break;
            case OpCodeId::ForkJump:
                patch_points.push({ static_cast<OpCode_ForkJump const&>(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::ForkStay:
                patch_points.push({ static_cast<OpCode_ForkStay const&>(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::Repeat:
                patch_points.push({ -(ssize_t) static_cast<OpCode_Repeat const&>(opcode).offset(), state.instruction_position + 1, true });
                break;
            default:
                break;
            }

            while (!patch_points.is_empty()) {
                auto& patch_point = patch_points.top();
                auto target_offset = patch_point.value + state.instruction_position + opcode.size();

                constexpr auto do_patch = [](auto& patch_it, auto& patch_point, auto& target_offset, auto& bytecode, auto ip) {
                    if (patch_it.key() == ip)
                        return;

                    if (patch_point.value < 0 && target_offset < patch_it.key() && ip > patch_it.key())
                        bytecode[patch_point.offset] += (patch_point.should_negate ? 1 : -1) * (*patch_it);
                    else if (patch_point.value > 0 && target_offset > patch_it.key() && ip < patch_it.key())
                        bytecode[patch_point.offset] += (patch_point.should_negate ? -1 : 1) * (*patch_it);
                };

                if (auto patch_it = needed_patches.find_largest_not_above_iterator(target_offset); !patch_it.is_end())
                    do_patch(patch_it, patch_point, target_offset, bytecode, state.instruction_position);
                else if (auto patch_it = needed_patches.find_largest_not_above_iterator(state.instruction_position); !patch_it.is_end())
                    do_patch(patch_it, patch_point, target_offset, bytecode, state.instruction_position);

                patch_points.pop();
            }

            state.instruction_position += opcode.size();
        }
    }

    if constexpr (REGEX_DEBUG) {
        warnln("Transformed to:");
        RegexDebug dbg;
        dbg.print_bytecode(*this);
    }
}

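// Lowers an alternation `left|right` into fork-based bytecode. Schematically
// (labels assumed for illustration):
//     ForkJump _ALT
//     (right)
//     Jump _END
// _ALT:
//     (left)
// _END:
// ForkJump prefers the jump target, so `left` keeps alternation priority.
// A shared prefix of whole basic blocks is hoisted out first, so alternatives
// that begin with identical bytecode (e.g. a common literal prefix, depending
// on how it was compiled) only encode that prefix once.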
void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right)
{
    auto left_is_empty = left.is_empty();
    auto right_is_empty = right.is_empty();
    if (left_is_empty || right_is_empty) {
        if (left_is_empty && right_is_empty)
            return;

        // ForkJump right (+ left.size() + 2 + right.size())
        // (left)
        // Jump end (+ right.size())
        // (right)
        // LABEL end
        target.append(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
        target.append(left.size() + 2 + right.size());
        target.extend(move(left));
        target.append(static_cast<ByteCodeValueType>(OpCodeId::Jump));
        target.append(right.size());
        target.extend(move(right));
        return;
    }

    auto left_blocks = Regex<PosixBasicParser>::split_basic_blocks(left);
    auto right_blocks = Regex<PosixBasicParser>::split_basic_blocks(right);

    // Walk the two block lists in lockstep and stop at the first block whose
    // bytecode differs; everything before that is a shared prefix.
    size_t left_skip = 0;
    MatchState state;
    for (size_t block_index = 0; block_index < left_blocks.size() && block_index < right_blocks.size(); block_index++) {
        auto& left_block = left_blocks[block_index];
        auto& right_block = right_blocks[block_index];
        auto left_end = block_index + 1 == left_blocks.size() ? left_block.end : left_blocks[block_index + 1].start;
        auto right_end = block_index + 1 == right_blocks.size() ? right_block.end : right_blocks[block_index + 1].start;

        if (left_end - left_block.start != right_end - right_block.start)
            break;

        if (left.spans().slice(left_block.start, left_end - left_block.start) != right.spans().slice(right_block.start, right_end - right_block.start))
            break;

        left_skip = left_end;
    }

    dbgln_if(REGEX_DEBUG, "Skipping {}/{} bytecode entries from {}/{}", left_skip, left_skip, left.size(), right.size());

    if (left_skip > 0) {
        target.extend(left.release_slice(left_blocks.first().start, left_skip));
        right = right.release_slice(left_skip);
    }

    auto left_size = left.size();

    target.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
    target.empend(right.size() + (left_size > 0 ? 2 : 0)); // Jump to the _ALT label
    target.extend(move(right));

    if (left_size != 0) {
        target.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
        target.empend(left.size()); // Jump to the _END label
    }

    // LABEL _ALT = bytecode.size() + 2
    target.extend(move(left));
    // LABEL _END = alternative_bytecode.size
}

enum class LookupTableInsertionOutcome {
    Successful,
    ReplaceWithAnyChar,
    TemporaryInversionNeeded,
    PermanentInversionNeeded,
    CannotPlaceInTable,
};
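
// Tries to fold a single comparison into the range table; anything that is
// not a plain character or character range reports back how the caller
// should handle it instead.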
static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCodeValueType, CharRange>& table, CompareTypeAndValuePair pair)
{
    switch (pair.type) {
    case CharacterCompareType::Inverse:
        return LookupTableInsertionOutcome::PermanentInversionNeeded;
    case CharacterCompareType::TemporaryInverse:
        return LookupTableInsertionOutcome::TemporaryInversionNeeded;
    case CharacterCompareType::AnyChar:
        return LookupTableInsertionOutcome::ReplaceWithAnyChar;
    case CharacterCompareType::CharClass:
        return LookupTableInsertionOutcome::CannotPlaceInTable;
    case CharacterCompareType::Char:
        table.insert(pair.value, { (u32)pair.value, (u32)pair.value });
        break;
    case CharacterCompareType::CharRange: {
        CharRange range { pair.value };
        table.insert(range.from, range);
        break;
    }
    case CharacterCompareType::Reference:
    case CharacterCompareType::Property:
    case CharacterCompareType::GeneralCategory:
    case CharacterCompareType::Script:
    case CharacterCompareType::ScriptExtension:
        return LookupTableInsertionOutcome::CannotPlaceInTable;
    case CharacterCompareType::Undefined:
    case CharacterCompareType::RangeExpressionDummy:
    case CharacterCompareType::String:
    case CharacterCompareType::LookupTable:
        VERIFY_NOT_REACHED();
    }

    return LookupTableInsertionOutcome::Successful;
}

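// Compiles a character class into a single Compare op. Classes with more than
// one comparison are packed into sorted lookup tables whose overlapping or
// adjacent ranges are coalesced below; e.g. (assumed input) the pairs for
// [a-cb-ex] would be emitted as the two ranges a-e and x-x.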
void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs)
{
    ByteCode arguments;
    size_t argument_count = 0;

    if (pairs.size() <= 1) {
        for (auto& pair : pairs) {
            arguments.append(to_underlying(pair.type));
            if (pair.type != CharacterCompareType::AnyChar && pair.type != CharacterCompareType::TemporaryInverse && pair.type != CharacterCompareType::Inverse)
                arguments.append(pair.value);
            ++argument_count;
        }
    } else {
        RedBlackTree<ByteCodeValueType, CharRange> table;
        RedBlackTree<ByteCodeValueType, CharRange> inverted_table;
        auto* current_table = &table;
        auto* current_inverted_table = &inverted_table;
        bool invert_for_next_iteration = false;
        bool is_currently_inverted = false;

        for (auto& value : pairs) {
            auto should_invert_after_this_iteration = invert_for_next_iteration;
            invert_for_next_iteration = false;

            auto insertion_result = insert_into_lookup_table(*current_table, value);
            switch (insertion_result) {
            case LookupTableInsertionOutcome::Successful:
                break;
            case LookupTableInsertionOutcome::ReplaceWithAnyChar: {
                table.clear();
                inverted_table.clear();
                arguments.append(to_underlying(CharacterCompareType::AnyChar));
                ++argument_count;
                break;
            }
            case LookupTableInsertionOutcome::TemporaryInversionNeeded:
                swap(current_table, current_inverted_table);
                invert_for_next_iteration = true;
                is_currently_inverted = !is_currently_inverted;
                break;
            case LookupTableInsertionOutcome::PermanentInversionNeeded:
                swap(current_table, current_inverted_table);
                is_currently_inverted = !is_currently_inverted;
                break;
            case LookupTableInsertionOutcome::CannotPlaceInTable:
                if (is_currently_inverted) {
                    arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
                    ++argument_count;
                }
                arguments.append(to_underlying(value.type));
                arguments.append(value.value);
                ++argument_count;
                break;
            }

            if (should_invert_after_this_iteration) {
                swap(current_table, current_inverted_table);
                is_currently_inverted = !is_currently_inverted;
            }
        }

        auto append_table = [&](auto& table) {
            ++argument_count;
            arguments.append(to_underlying(CharacterCompareType::LookupTable));
            auto size_index = arguments.size();
            arguments.append(0);
            Optional<CharRange> active_range;
            size_t range_count = 0;
            for (auto& range : table) {
                if (!active_range.has_value()) {
                    active_range = range;
                    continue;
                }

                if (range.from <= active_range->to + 1 && range.to + 1 >= active_range->from) {
                    // The ranges overlap or are adjacent; merge them.
                    active_range = CharRange { min(range.from, active_range->from), max(range.to, active_range->to) };
                } else {
                    ++range_count;
                    arguments.append(active_range.release_value());
                    active_range = range;
                }
            }
            if (active_range.has_value()) {
                ++range_count;
                arguments.append(active_range.release_value());
            }
            arguments[size_index] = range_count;
        };

        if (!table.is_empty())
            append_table(table);

        if (!inverted_table.is_empty()) {
            ++argument_count;
            arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
            append_table(inverted_table);
        }
    }

    target.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
    target.empend(argument_count);   // number of arguments
    target.empend(arguments.size()); // size of arguments
    target.extend(move(arguments));
}

template void Regex<PosixBasicParser>::run_optimization_passes();
template void Regex<PosixExtendedParser>::run_optimization_passes();
template void Regex<ECMA262Parser>::run_optimization_passes();

}