RegexOptimizer.cpp

/*
 * Copyright (c) 2021, Ali Mohammad Pur <mpfard@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/QuickSort.h>
#include <AK/RedBlackTree.h>
#include <AK/Stack.h>
#include <LibRegex/Regex.h>
#include <LibRegex/RegexBytecodeStreamOptimizer.h>

namespace regex {

using Detail::Block;

template<typename Parser>
void Regex<Parser>::run_optimization_passes()
{
    parser_result.bytecode.flatten();

    // Rewrite fork loops as atomic groups
    // e.g. a*b -> (ATOMIC a*)b
    attempt_rewrite_loops_as_atomic_groups(split_basic_blocks());

    parser_result.bytecode.flatten();
}
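
// Worked example (schematic; the construction/match calls are shown for
// illustration only and are not defined in this file): the atomic-group rewrite
// above is what lets /a*b/ fail on "aaaa" without retrying every split of the
// 'a's between the loop and the tail.
//
//     Regex<ECMA262Parser> re("a*b"); // optimization passes run at build time
//     auto result = re.match("aaaa"sv);
//     // No match, and no backtracking into the loop: first(RE1) = {'b'} never
//     // overlaps end(RE0) = {'a'}, so the loop's forks were replaced.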

template<typename Parser>
typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks()
{
    BasicBlockList block_boundaries;
    auto& bytecode = parser_result.bytecode;
    size_t end_of_last_block = 0;

    MatchState state;
    state.instruction_position = 0;
    auto check_jump = [&]<typename T>(OpCode const& opcode) {
        auto& op = static_cast<T const&>(opcode);
        ssize_t jump_offset = op.size() + op.offset();
        if (jump_offset >= 0) {
            block_boundaries.append({ end_of_last_block, state.instruction_position });
            end_of_last_block = state.instruction_position + opcode.size();
        } else {
            // This op jumps back, see if that's within this "block".
            if (jump_offset + state.instruction_position > end_of_last_block) {
                // Split the block!
                block_boundaries.append({ end_of_last_block, jump_offset + state.instruction_position });
                block_boundaries.append({ jump_offset + state.instruction_position, state.instruction_position });
                end_of_last_block = state.instruction_position + opcode.size();
            } else {
                // Nope, it's just a jump to another block
                block_boundaries.append({ end_of_last_block, state.instruction_position });
                end_of_last_block = state.instruction_position + opcode.size();
            }
        }
    };
    for (;;) {
        auto& opcode = bytecode.get_opcode(state);

        switch (opcode.opcode_id()) {
        case OpCodeId::Jump:
            check_jump.template operator()<OpCode_Jump>(opcode);
            break;
        case OpCodeId::JumpNonEmpty:
            check_jump.template operator()<OpCode_JumpNonEmpty>(opcode);
            break;
        case OpCodeId::ForkJump:
            check_jump.template operator()<OpCode_ForkJump>(opcode);
            break;
        case OpCodeId::ForkStay:
            check_jump.template operator()<OpCode_ForkStay>(opcode);
            break;
        case OpCodeId::FailForks:
            block_boundaries.append({ end_of_last_block, state.instruction_position });
            end_of_last_block = state.instruction_position + opcode.size();
            break;
        case OpCodeId::Repeat: {
            // Repeat produces two blocks, one containing its repeated expr, and one after that.
            auto repeat_start = state.instruction_position - static_cast<OpCode_Repeat const&>(opcode).offset();
            if (repeat_start > end_of_last_block)
                block_boundaries.append({ end_of_last_block, repeat_start });
            block_boundaries.append({ repeat_start, state.instruction_position });
            end_of_last_block = state.instruction_position + opcode.size();
            break;
        }
        default:
            break;
        }

        auto next_ip = state.instruction_position + opcode.size();
        if (next_ip < bytecode.size())
            state.instruction_position = next_ip;
        else
            break;
    }

    if (end_of_last_block < bytecode.size())
        block_boundaries.append({ end_of_last_block, bytecode.size() });

    quick_sort(block_boundaries, [](auto& a, auto& b) { return a.start < b.start; });

    return block_boundaries;
}
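
// Sketch of the output (instruction offsets illustrative, not real sizes): for
// /a*b/ the flattened bytecode is roughly a ForkStay past the loop, a Compare
// 'a', a backwards Jump to the loop head, then a Compare 'b'. The walk above
// yields one block covering the loop body and one covering the trailing
// Compare 'b' - i.e. a boundary at every backwards-jump target and after every
// jump-like instruction.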

static bool block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode, Block const& repeated_block, Block const& following_block)
{
    Vector<Vector<CompareTypeAndValuePair>> repeated_values;
    HashTable<size_t> active_capture_groups;
    MatchState state;
    for (state.instruction_position = repeated_block.start; state.instruction_position < repeated_block.end;) {
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        case OpCodeId::Compare: {
            auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
            if (repeated_values.is_empty() && any_of(compares, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
                return false;
            repeated_values.append(move(compares));
            break;
        }
        case OpCodeId::CheckBegin:
        case OpCodeId::CheckEnd:
            if (repeated_values.is_empty())
                return true;
            break;
        case OpCodeId::CheckBoundary:
            // FIXME: What should we do with these? For now, let's fail.
            return false;
        case OpCodeId::Restore:
        case OpCodeId::GoBack:
            return false;
        case OpCodeId::SaveRightCaptureGroup:
            active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
            break;
        case OpCodeId::SaveLeftCaptureGroup:
            active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
            break;
        default:
            break;
        }

        state.instruction_position += opcode.size();
    }

    dbgln_if(REGEX_DEBUG, "Found {} entries in reference", repeated_values.size());
    dbgln_if(REGEX_DEBUG, "Found {} active capture groups", active_capture_groups.size());

    // Find the first compare in the following block, it must NOT match any of the values in `repeated_values'.
    for (state.instruction_position = following_block.start; state.instruction_position < following_block.end;) {
        auto& opcode = bytecode.get_opcode(state);
        switch (opcode.opcode_id()) {
        // Note: These have to exist since we're effectively repeating the following block as well
        case OpCodeId::SaveRightCaptureGroup:
            active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
            break;
        case OpCodeId::SaveLeftCaptureGroup:
            active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
            break;
        case OpCodeId::Compare: {
            // We found a compare, let's see what it has.
            auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
            if (compares.is_empty())
                break;

            if (any_of(compares, [&](auto& compare) {
                    return compare.type == CharacterCompareType::AnyChar
                        || (compare.type == CharacterCompareType::Reference && active_capture_groups.contains(compare.value));
                }))
                return false;

            for (auto& repeated_value : repeated_values) {
                // FIXME: This is too naive!
                if (any_of(repeated_value, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
                    return false;

                for (auto& repeated_compare : repeated_value) {
                    // FIXME: This is too naive! it will miss _tons_ of cases since it doesn't check ranges!
                    if (any_of(compares, [&](auto& compare) { return compare.type == repeated_compare.type && compare.value == repeated_compare.value; }))
                        return false;
                }
            }
            return true;
        }
        case OpCodeId::CheckBegin:
        case OpCodeId::CheckEnd:
            return true; // Nothing can match the end!
        case OpCodeId::CheckBoundary:
            // FIXME: What should we do with these? For now, consider them a failure.
            return false;
        default:
            break;
        }

        state.instruction_position += opcode.size();
    }

    return true;
}
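
// Example of the precondition in action (sketch): for /a*b/ the repeated block
// compares only 'a' and the following block opens by comparing 'b', so this
// returns true and the loop may become atomic. For /a*ab/ the following block
// opens with 'a', which the loop body could also have matched, so this returns
// false and the backtracking forks are kept.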

template<typename Parser>
void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const& basic_blocks)
{
    auto& bytecode = parser_result.bytecode;
    if constexpr (REGEX_DEBUG) {
        RegexDebug dbg;
        dbg.print_bytecode(*this);
        for (auto& block : basic_blocks)
            dbgln("block from {} to {}", block.start, block.end);
    }

    // A pattern such as:
    //     bb0  |  RE0
    //          |  ForkX bb0
    //     -------------------------
    //     bb1  |  RE1
    // can be rewritten as:
    //     loop.hdr  |  ForkStay bb1
    //     -------------------------
    //     bb0  |  RE0
    //          |  ForkReplaceX bb0
    //     -------------------------
    //     bb1  |  RE1
    // provided that first(RE1) not-in end(RE0), which is to say
    // that RE1 cannot start with whatever RE0 has matched (ever).
    //
    // Alternatively, a second form of this pattern can also occur:
    //     bb0  |  *
    //          |  ForkX bb2
    //     ------------------------
    //     bb1  |  RE0
    //          |  Jump bb0
    //     ------------------------
    //     bb2  |  RE1
    // which can be transformed (with the same preconditions) to:
    //     bb0  |  *
    //          |  ForkReplaceX bb2
    //     ------------------------
    //     bb1  |  RE0
    //          |  Jump bb0
    //     ------------------------
    //     bb2  |  RE1
    enum class AlternateForm {
        DirectLoopWithoutHeader, // Loop without a proper header, a block forking to itself, i.e. the first form.
        DirectLoopWithHeader,    // Loop with a proper header, i.e. the second form.
    };
    struct CandidateBlock {
        Block forking_block;
        Optional<Block> new_target_block;
        AlternateForm form;
    };
    Vector<CandidateBlock> candidate_blocks;

    auto is_an_eligible_jump = [](OpCode const& opcode, size_t ip, size_t block_start, AlternateForm alternate_form) {
        switch (opcode.opcode_id()) {
        case OpCodeId::JumpNonEmpty: {
            auto& op = static_cast<OpCode_JumpNonEmpty const&>(opcode);
            auto form = op.form();
            if (form != OpCodeId::Jump && alternate_form == AlternateForm::DirectLoopWithHeader)
                return false;
            if (form != OpCodeId::ForkJump && form != OpCodeId::ForkStay && alternate_form == AlternateForm::DirectLoopWithoutHeader)
                return false;
            return op.offset() + ip + opcode.size() == block_start;
        }
        case OpCodeId::ForkJump:
            if (alternate_form == AlternateForm::DirectLoopWithHeader)
                return false;
            return static_cast<OpCode_ForkJump const&>(opcode).offset() + ip + opcode.size() == block_start;
        case OpCodeId::ForkStay:
            if (alternate_form == AlternateForm::DirectLoopWithHeader)
                return false;
            return static_cast<OpCode_ForkStay const&>(opcode).offset() + ip + opcode.size() == block_start;
        case OpCodeId::Jump:
            // Infinite loop does *not* produce forks.
            if (alternate_form == AlternateForm::DirectLoopWithoutHeader)
                return false;
            if (alternate_form == AlternateForm::DirectLoopWithHeader)
                return static_cast<OpCode_Jump const&>(opcode).offset() + ip + opcode.size() == block_start;
            VERIFY_NOT_REACHED();
        default:
            return false;
        }
    };

    for (size_t i = 0; i < basic_blocks.size(); ++i) {
        auto forking_block = basic_blocks[i];
        Optional<Block> fork_fallback_block;
        if (i + 1 < basic_blocks.size())
            fork_fallback_block = basic_blocks[i + 1];
        MatchState state;
        // Check if the last instruction in this block is a jump to the block itself:
        {
            state.instruction_position = forking_block.end;
            auto& opcode = bytecode.get_opcode(state);
            if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.start, AlternateForm::DirectLoopWithoutHeader)) {
                // We've found RE0 (and RE1 is just the following block, if any), let's see if the precondition applies.
                // If RE1 is empty, there's no first(RE1), so this is an automatic pass.
                if (!fork_fallback_block.has_value() || fork_fallback_block->end == fork_fallback_block->start) {
                    candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
                    break;
                }

                if (block_satisfies_atomic_rewrite_precondition(bytecode, forking_block, *fork_fallback_block)) {
                    candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
                    break;
                }
            }
        }
        // Check if the last instruction in the last block is a direct jump to this block
        if (fork_fallback_block.has_value()) {
            state.instruction_position = fork_fallback_block->end;
            auto& opcode = bytecode.get_opcode(state);
            if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.start, AlternateForm::DirectLoopWithHeader)) {
                // We've found bb1 and bb0, let's just make sure that bb0 forks to bb2.
                state.instruction_position = forking_block.end;
                auto& opcode = bytecode.get_opcode(state);
                if (opcode.opcode_id() == OpCodeId::ForkJump || opcode.opcode_id() == OpCodeId::ForkStay) {
                    Optional<Block> block_following_fork_fallback;
                    if (i + 2 < basic_blocks.size())
                        block_following_fork_fallback = basic_blocks[i + 2];
                    if (!block_following_fork_fallback.has_value() || block_satisfies_atomic_rewrite_precondition(bytecode, *fork_fallback_block, *block_following_fork_fallback)) {
                        candidate_blocks.append({ forking_block, {}, AlternateForm::DirectLoopWithHeader });
                        break;
                    }
                }
            }
        }
    }

    dbgln_if(REGEX_DEBUG, "Found {} candidate blocks", candidate_blocks.size());

    if (candidate_blocks.is_empty()) {
        dbgln_if(REGEX_DEBUG, "Failed to find anything for {}", pattern_value);
        return;
    }

    RedBlackTree<size_t, size_t> needed_patches;

    // Reverse the blocks, so we can patch the bytecode without messing with the latter patches.
    quick_sort(candidate_blocks, [](auto& a, auto& b) { return b.forking_block.start > a.forking_block.start; });
    for (auto& candidate : candidate_blocks) {
        // Note that both forms share a ForkReplace patch in forking_block.
        // Patch the ForkX in forking_block to be a ForkReplaceX instead.
        auto& opcode_id = bytecode[candidate.forking_block.end];
        if (opcode_id == (ByteCodeValueType)OpCodeId::ForkStay) {
            opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceStay;
        } else if (opcode_id == (ByteCodeValueType)OpCodeId::ForkJump) {
            opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceJump;
        } else if (opcode_id == (ByteCodeValueType)OpCodeId::JumpNonEmpty) {
            auto& jump_opcode_id = bytecode[candidate.forking_block.end + 3];
            if (jump_opcode_id == (ByteCodeValueType)OpCodeId::ForkStay)
                jump_opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceStay;
            else if (jump_opcode_id == (ByteCodeValueType)OpCodeId::ForkJump)
                jump_opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceJump;
            else
                VERIFY_NOT_REACHED();
        } else {
            VERIFY_NOT_REACHED();
        }

        if (candidate.form == AlternateForm::DirectLoopWithoutHeader) {
            if (candidate.new_target_block.has_value()) {
                // Insert a fork-stay targeted at the second block.
                bytecode.insert(candidate.forking_block.start, (ByteCodeValueType)OpCodeId::ForkStay);
                bytecode.insert(candidate.forking_block.start + 1, candidate.new_target_block->start - candidate.forking_block.start);
                needed_patches.insert(candidate.forking_block.start, 2u);
            }
        }
    }
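
    // Inserting the loop header above shifted everything at or after
    // forking_block.start by two bytecode values, so any jump whose source and
    // target lie on opposite sides of an insertion point now encodes a stale
    // offset. The walk below finds every jump-like opcode and widens (or, for
    // backwards jumps, narrows) its offset by the recorded insertion sizes.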
    if (!needed_patches.is_empty()) {
        MatchState state;
        state.instruction_position = 0;
        struct Patch {
            ssize_t value;
            size_t offset;
            bool should_negate { false };
        };
        for (;;) {
            if (state.instruction_position >= bytecode.size())
                break;

            auto& opcode = bytecode.get_opcode(state);
            Stack<Patch, 2> patch_points;

            switch (opcode.opcode_id()) {
            case OpCodeId::Jump:
                patch_points.push({ static_cast<OpCode_Jump const&>(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::JumpNonEmpty:
                patch_points.push({ static_cast<OpCode_JumpNonEmpty const&>(opcode).offset(), state.instruction_position + 1 });
                patch_points.push({ static_cast<OpCode_JumpNonEmpty const&>(opcode).checkpoint(), state.instruction_position + 2 });
                break;
            case OpCodeId::ForkJump:
                patch_points.push({ static_cast<OpCode_ForkJump const&>(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::ForkStay:
                patch_points.push({ static_cast<OpCode_ForkStay const&>(opcode).offset(), state.instruction_position + 1 });
                break;
            case OpCodeId::Repeat:
                patch_points.push({ -(ssize_t) static_cast<OpCode_Repeat const&>(opcode).offset(), state.instruction_position + 1, true });
                break;
            default:
                break;
            }

            while (!patch_points.is_empty()) {
                auto& patch_point = patch_points.top();
                auto target_offset = patch_point.value + state.instruction_position + opcode.size();

                constexpr auto do_patch = [](auto& patch_it, auto& patch_point, auto& target_offset, auto& bytecode, auto ip) {
                    if (patch_it.key() == ip)
                        return;

                    if (patch_point.value < 0 && target_offset < patch_it.key() && ip > patch_it.key())
                        bytecode[patch_point.offset] += (patch_point.should_negate ? 1 : -1) * (*patch_it);
                    else if (patch_point.value > 0 && target_offset > patch_it.key() && ip < patch_it.key())
                        bytecode[patch_point.offset] += (patch_point.should_negate ? -1 : 1) * (*patch_it);
                };

                if (auto patch_it = needed_patches.find_largest_not_above_iterator(target_offset); !patch_it.is_end())
                    do_patch(patch_it, patch_point, target_offset, bytecode, state.instruction_position);
                else if (auto patch_it = needed_patches.find_largest_not_above_iterator(state.instruction_position); !patch_it.is_end())
                    do_patch(patch_it, patch_point, target_offset, bytecode, state.instruction_position);

                patch_points.pop();
            }

            state.instruction_position += opcode.size();
        }
    }

    if constexpr (REGEX_DEBUG) {
        warnln("Transformed to:");
        RegexDebug dbg;
        dbg.print_bytecode(*this);
    }
}

void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right)
{
    if (left.is_empty()) {
        target.extend(right);
        return;
    }

    if (right.is_empty()) {
        target.extend(left);
        return;
    }

    size_t left_skip = 0;
    MatchState state;
    for (state.instruction_position = 0; state.instruction_position < left.size() && state.instruction_position < right.size();) {
        auto left_size = left.get_opcode(state).size();
        auto right_size = right.get_opcode(state).size();
        if (left_size != right_size)
            break;

        if (left.spans().slice(state.instruction_position, left_size) == right.spans().slice(state.instruction_position, right_size))
            left_skip = state.instruction_position + left_size;
        else
            break;

        state.instruction_position += left_size;
    }

    dbgln_if(REGEX_DEBUG, "Skipping {}/{} bytecode entries from {}/{}", left_skip, 0, left.size(), right.size());

    if (left_skip) {
        target.extend(left.release_slice(0, left_skip));
        right = right.release_slice(left_skip);
    }

    auto left_size = left.size();

    target.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
    target.empend(right.size() + (left_size ? 2 : 0)); // Jump to the _ALT label

    target.extend(move(right));

    if (left_size != 0) {
        target.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
        target.empend(left.size()); // Jump to the _END label
    }

    // LABEL _ALT = bytecode.size() + 2
    target.extend(move(left));

    // LABEL _END = alternative_bytecode.size
}
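
// Resulting layout for append_alternation(target, "foo", "for"), assuming a
// simple compare-per-character lowering (schematic): the shared prefix "fo" is
// emitted once, then
//     ForkJump _ALT      ; take the jump eagerly (left alternative first),
//     Compare 'r'        ; saving the fall-through (right alternative) as a fork
//     Jump _END
//   _ALT:
//     Compare 'o'
//   _END:
// so the common prefix is matched exactly once regardless of which branch wins.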

enum class LookupTableInsertionOutcome {
    Successful,
    ReplaceWithAnyChar,
    TemporaryInversionNeeded,
    PermanentInversionNeeded,
    CannotPlaceInTable,
};
static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCodeValueType, CharRange>& table, CompareTypeAndValuePair pair)
{
    switch (pair.type) {
    case CharacterCompareType::Inverse:
        return LookupTableInsertionOutcome::PermanentInversionNeeded;
    case CharacterCompareType::TemporaryInverse:
        return LookupTableInsertionOutcome::TemporaryInversionNeeded;
    case CharacterCompareType::AnyChar:
        return LookupTableInsertionOutcome::ReplaceWithAnyChar;
    case CharacterCompareType::CharClass:
        return LookupTableInsertionOutcome::CannotPlaceInTable;
    case CharacterCompareType::Char:
        table.insert(pair.value, { (u32)pair.value, (u32)pair.value });
        break;
    case CharacterCompareType::CharRange: {
        CharRange range { pair.value };
        table.insert(range.from, range);
        break;
    }
    case CharacterCompareType::Reference:
    case CharacterCompareType::Property:
    case CharacterCompareType::GeneralCategory:
    case CharacterCompareType::Script:
    case CharacterCompareType::ScriptExtension:
        return LookupTableInsertionOutcome::CannotPlaceInTable;
    case CharacterCompareType::Undefined:
    case CharacterCompareType::RangeExpressionDummy:
    case CharacterCompareType::String:
    case CharacterCompareType::LookupTable:
        VERIFY_NOT_REACHED();
    }

    return LookupTableInsertionOutcome::Successful;
}
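
// For instance (a sketch; the exact pairs the parser emits are an assumption
// here), a class like [^a-z] would arrive as { Inverse, CharRange 'a'-'z' }:
// the Inverse pair makes this function report PermanentInversionNeeded, the
// caller below swaps to the inverted table, and the range lands there to be
// emitted later behind a TemporaryInverse marker.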

void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs)
{
    ByteCode arguments;
    size_t argument_count = 0;

    if (pairs.size() <= 1) {
        for (auto& pair : pairs) {
            arguments.append(to_underlying(pair.type));
            if (pair.type != CharacterCompareType::AnyChar && pair.type != CharacterCompareType::TemporaryInverse && pair.type != CharacterCompareType::Inverse)
                arguments.append(pair.value);
            ++argument_count;
        }
    } else {
        RedBlackTree<ByteCodeValueType, CharRange> table;
        RedBlackTree<ByteCodeValueType, CharRange> inverted_table;
        auto* current_table = &table;
        auto* current_inverted_table = &inverted_table;
        bool invert_for_next_iteration = false;
        bool is_currently_inverted = false;
        for (auto& value : pairs) {
            auto should_invert_after_this_iteration = invert_for_next_iteration;
            invert_for_next_iteration = false;
            auto insertion_result = insert_into_lookup_table(*current_table, value);
            switch (insertion_result) {
            case LookupTableInsertionOutcome::Successful:
                break;
            case LookupTableInsertionOutcome::ReplaceWithAnyChar: {
                table.clear();
                inverted_table.clear();
                arguments.append(to_underlying(CharacterCompareType::AnyChar));
                ++argument_count;
                break;
            }
            case LookupTableInsertionOutcome::TemporaryInversionNeeded:
                swap(current_table, current_inverted_table);
                invert_for_next_iteration = true;
                is_currently_inverted = !is_currently_inverted;
                break;
            case LookupTableInsertionOutcome::PermanentInversionNeeded:
                swap(current_table, current_inverted_table);
                is_currently_inverted = !is_currently_inverted;
                break;
            case LookupTableInsertionOutcome::CannotPlaceInTable:
                if (is_currently_inverted) {
                    arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
                    ++argument_count;
                }
                arguments.append(to_underlying(value.type));
                arguments.append(value.value);
                ++argument_count;
                break;
            }
            if (should_invert_after_this_iteration) {
                swap(current_table, current_inverted_table);
                is_currently_inverted = !is_currently_inverted;
            }
        }
        auto append_table = [&](auto& table) {
            ++argument_count;
            arguments.append(to_underlying(CharacterCompareType::LookupTable));
            auto size_index = arguments.size();
            arguments.append(0);
            Optional<CharRange> active_range;
            size_t range_count = 0;
            for (auto& range : table) {
                if (!active_range.has_value()) {
                    active_range = range;
                    continue;
                }

                if (range.from <= active_range->to + 1 && range.to + 1 >= active_range->from) {
                    active_range = CharRange { min(range.from, active_range->from), max(range.to, active_range->to) };
                } else {
                    ++range_count;
                    arguments.append(active_range.release_value());
                    active_range = range;
                }
            }
            if (active_range.has_value()) {
                ++range_count;
                arguments.append(active_range.release_value());
            }
            arguments[size_index] = range_count;
        };

        if (!table.is_empty())
            append_table(table);

        if (!inverted_table.is_empty()) {
            ++argument_count;
            arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
            append_table(inverted_table);
        }
    }

    target.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
    target.empend(argument_count);   // number of arguments
    target.empend(arguments.size()); // size of arguments
    target.extend(move(arguments));
}
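
// Coalescing example (schematic pair notation; see CompareTypeAndValuePair for
// the real encoding): given the class [a-dc-fe], i.e. the pairs CharRange
// 'a'-'d', CharRange 'c'-'f', Char 'e', all three land in the table keyed by
// their start points, and append_table() merges the adjacent and overlapping
// entries into the single range 'a'-'f' - one Compare with one LookupTable
// argument instead of three separate compares.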

template void Regex<PosixBasicParser>::run_optimization_passes();
template void Regex<PosixExtendedParser>::run_optimization_passes();
template void Regex<ECMA262Parser>::run_optimization_passes();

}