RegexOptimizer.cpp 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187
  1. /*
  2. * Copyright (c) 2021, Ali Mohammad Pur <mpfard@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Debug.h>
  7. #include <AK/QuickSort.h>
  8. #include <AK/RedBlackTree.h>
  9. #include <AK/Stack.h>
  10. #include <LibRegex/Regex.h>
  11. #include <LibRegex/RegexBytecodeStreamOptimizer.h>
  12. #include <LibUnicode/CharacterTypes.h>
  13. #if REGEX_DEBUG
  14. # include <AK/ScopeGuard.h>
  15. # include <AK/ScopeLogger.h>
  16. #endif
  17. namespace regex {
  18. using Detail::Block;
  19. template<typename Parser>
  20. void Regex<Parser>::run_optimization_passes()
  21. {
  22. parser_result.bytecode.flatten();
  23. auto blocks = split_basic_blocks(parser_result.bytecode);
  24. if (attempt_rewrite_entire_match_as_substring_search(blocks))
  25. return;
  26. // Rewrite fork loops as atomic groups
  27. // e.g. a*b -> (ATOMIC a*)b
  28. attempt_rewrite_loops_as_atomic_groups(blocks);
  29. parser_result.bytecode.flatten();
  30. }
  31. template<typename Parser>
  32. typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks(ByteCode const& bytecode)
  33. {
  34. BasicBlockList block_boundaries;
  35. size_t end_of_last_block = 0;
  36. auto bytecode_size = bytecode.size();
  37. MatchState state;
  38. state.instruction_position = 0;
  39. auto check_jump = [&]<typename T>(OpCode const& opcode) {
  40. auto& op = static_cast<T const&>(opcode);
  41. ssize_t jump_offset = op.size() + op.offset();
  42. if (jump_offset >= 0) {
  43. block_boundaries.append({ end_of_last_block, state.instruction_position });
  44. end_of_last_block = state.instruction_position + opcode.size();
  45. } else {
  46. // This op jumps back, see if that's within this "block".
  47. if (jump_offset + state.instruction_position > end_of_last_block) {
  48. // Split the block!
  49. block_boundaries.append({ end_of_last_block, jump_offset + state.instruction_position });
  50. block_boundaries.append({ jump_offset + state.instruction_position, state.instruction_position });
  51. end_of_last_block = state.instruction_position + opcode.size();
  52. } else {
  53. // Nope, it's just a jump to another block
  54. block_boundaries.append({ end_of_last_block, state.instruction_position });
  55. end_of_last_block = state.instruction_position + opcode.size();
  56. }
  57. }
  58. };
  59. for (;;) {
  60. auto& opcode = bytecode.get_opcode(state);
  61. switch (opcode.opcode_id()) {
  62. case OpCodeId::Jump:
  63. check_jump.template operator()<OpCode_Jump>(opcode);
  64. break;
  65. case OpCodeId::JumpNonEmpty:
  66. check_jump.template operator()<OpCode_JumpNonEmpty>(opcode);
  67. break;
  68. case OpCodeId::ForkJump:
  69. check_jump.template operator()<OpCode_ForkJump>(opcode);
  70. break;
  71. case OpCodeId::ForkStay:
  72. check_jump.template operator()<OpCode_ForkStay>(opcode);
  73. break;
  74. case OpCodeId::FailForks:
  75. block_boundaries.append({ end_of_last_block, state.instruction_position });
  76. end_of_last_block = state.instruction_position + opcode.size();
  77. break;
  78. case OpCodeId::Repeat: {
  79. // Repeat produces two blocks, one containing its repeated expr, and one after that.
  80. auto repeat_start = state.instruction_position - static_cast<OpCode_Repeat const&>(opcode).offset();
  81. if (repeat_start > end_of_last_block)
  82. block_boundaries.append({ end_of_last_block, repeat_start });
  83. block_boundaries.append({ repeat_start, state.instruction_position });
  84. end_of_last_block = state.instruction_position + opcode.size();
  85. break;
  86. }
  87. default:
  88. break;
  89. }
  90. auto next_ip = state.instruction_position + opcode.size();
  91. if (next_ip < bytecode_size)
  92. state.instruction_position = next_ip;
  93. else
  94. break;
  95. }
  96. if (end_of_last_block < bytecode_size)
  97. block_boundaries.append({ end_of_last_block, bytecode_size });
  98. quick_sort(block_boundaries, [](auto& a, auto& b) { return a.start < b.start; });
  99. return block_boundaries;
  100. }
  101. static bool has_overlap(Vector<CompareTypeAndValuePair> const& lhs, Vector<CompareTypeAndValuePair> const& rhs)
  102. {
  103. // We have to fully interpret the two sequences to determine if they overlap (that is, keep track of inversion state and what ranges they cover).
  104. bool inverse { false };
  105. bool temporary_inverse { false };
  106. bool reset_temporary_inverse { false };
  107. auto current_lhs_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };
  108. RedBlackTree<u32, u32> lhs_ranges;
  109. RedBlackTree<u32, u32> lhs_negated_ranges;
  110. HashTable<CharClass> lhs_char_classes;
  111. HashTable<CharClass> lhs_negated_char_classes;
  112. auto has_any_unicode_property = false;
  113. HashTable<Unicode::GeneralCategory> lhs_unicode_general_categories;
  114. HashTable<Unicode::Property> lhs_unicode_properties;
  115. HashTable<Unicode::Script> lhs_unicode_scripts;
  116. HashTable<Unicode::Script> lhs_unicode_script_extensions;
  117. HashTable<Unicode::GeneralCategory> lhs_negated_unicode_general_categories;
  118. HashTable<Unicode::Property> lhs_negated_unicode_properties;
  119. HashTable<Unicode::Script> lhs_negated_unicode_scripts;
  120. HashTable<Unicode::Script> lhs_negated_unicode_script_extensions;
  121. auto any_unicode_property_matches = [&](u32 code_point) {
  122. if (any_of(lhs_negated_unicode_general_categories, [code_point](auto category) { return Unicode::code_point_has_general_category(code_point, category); }))
  123. return false;
  124. if (any_of(lhs_negated_unicode_properties, [code_point](auto property) { return Unicode::code_point_has_property(code_point, property); }))
  125. return false;
  126. if (any_of(lhs_negated_unicode_scripts, [code_point](auto script) { return Unicode::code_point_has_script(code_point, script); }))
  127. return false;
  128. if (any_of(lhs_negated_unicode_script_extensions, [code_point](auto script) { return Unicode::code_point_has_script_extension(code_point, script); }))
  129. return false;
  130. if (any_of(lhs_unicode_general_categories, [code_point](auto category) { return Unicode::code_point_has_general_category(code_point, category); }))
  131. return true;
  132. if (any_of(lhs_unicode_properties, [code_point](auto property) { return Unicode::code_point_has_property(code_point, property); }))
  133. return true;
  134. if (any_of(lhs_unicode_scripts, [code_point](auto script) { return Unicode::code_point_has_script(code_point, script); }))
  135. return true;
  136. if (any_of(lhs_unicode_script_extensions, [code_point](auto script) { return Unicode::code_point_has_script_extension(code_point, script); }))
  137. return true;
  138. return false;
  139. };
  140. auto range_contains = [&]<typename T>(T& value) -> bool {
  141. u32 start;
  142. u32 end;
  143. if constexpr (IsSame<T, CharRange>) {
  144. start = value.from;
  145. end = value.to;
  146. } else {
  147. start = value;
  148. end = value;
  149. }
  150. if (has_any_unicode_property) {
  151. // We have some properties, and a range is present
  152. // Instead of checking every single code point in the range, assume it's a match.
  153. return start != end || any_unicode_property_matches(start);
  154. }
  155. auto* max = lhs_ranges.find_smallest_not_below(start);
  156. return max && *max <= end;
  157. };
  158. auto char_class_contains = [&](CharClass const& value) -> bool {
  159. if (lhs_char_classes.contains(value))
  160. return true;
  161. if (lhs_negated_char_classes.contains(value))
  162. return false;
  163. // This char class might match something in the ranges we have, and checking that is far too expensive, so just bail out.
  164. return true;
  165. };
  166. for (auto const& pair : lhs) {
  167. if (reset_temporary_inverse) {
  168. reset_temporary_inverse = false;
  169. temporary_inverse = false;
  170. } else {
  171. reset_temporary_inverse = true;
  172. }
  173. switch (pair.type) {
  174. case CharacterCompareType::Inverse:
  175. inverse = !inverse;
  176. break;
  177. case CharacterCompareType::TemporaryInverse:
  178. temporary_inverse = true;
  179. reset_temporary_inverse = true;
  180. break;
  181. case CharacterCompareType::AnyChar:
  182. // Special case: if not inverted, AnyChar is always in the range.
  183. if (!current_lhs_inversion_state())
  184. return true;
  185. break;
  186. case CharacterCompareType::Char:
  187. if (!current_lhs_inversion_state())
  188. lhs_ranges.insert(pair.value, pair.value);
  189. else
  190. lhs_negated_ranges.insert(pair.value, pair.value);
  191. break;
  192. case CharacterCompareType::String:
  193. // FIXME: We just need to look at the last character of this string, but we only have the first character here.
  194. // Just bail out to avoid false positives.
  195. return true;
  196. case CharacterCompareType::CharClass:
  197. if (!current_lhs_inversion_state())
  198. lhs_char_classes.set(static_cast<CharClass>(pair.value));
  199. else
  200. lhs_negated_char_classes.set(static_cast<CharClass>(pair.value));
  201. break;
  202. case CharacterCompareType::CharRange: {
  203. auto range = CharRange(pair.value);
  204. if (!current_lhs_inversion_state())
  205. lhs_ranges.insert(range.from, range.to);
  206. else
  207. lhs_negated_ranges.insert(range.from, range.to);
  208. break;
  209. }
  210. case CharacterCompareType::LookupTable:
  211. // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
  212. return true;
  213. case CharacterCompareType::Reference:
  214. // We've handled this before coming here.
  215. break;
  216. case CharacterCompareType::Property:
  217. has_any_unicode_property = true;
  218. if (!current_lhs_inversion_state())
  219. lhs_unicode_properties.set(static_cast<Unicode::Property>(pair.value));
  220. else
  221. lhs_negated_unicode_properties.set(static_cast<Unicode::Property>(pair.value));
  222. break;
  223. case CharacterCompareType::GeneralCategory:
  224. has_any_unicode_property = true;
  225. if (!current_lhs_inversion_state())
  226. lhs_unicode_general_categories.set(static_cast<Unicode::GeneralCategory>(pair.value));
  227. else
  228. lhs_negated_unicode_general_categories.set(static_cast<Unicode::GeneralCategory>(pair.value));
  229. break;
  230. case CharacterCompareType::Script:
  231. has_any_unicode_property = true;
  232. if (!current_lhs_inversion_state())
  233. lhs_unicode_scripts.set(static_cast<Unicode::Script>(pair.value));
  234. else
  235. lhs_negated_unicode_scripts.set(static_cast<Unicode::Script>(pair.value));
  236. break;
  237. case CharacterCompareType::ScriptExtension:
  238. has_any_unicode_property = true;
  239. if (!current_lhs_inversion_state())
  240. lhs_unicode_script_extensions.set(static_cast<Unicode::Script>(pair.value));
  241. else
  242. lhs_negated_unicode_script_extensions.set(static_cast<Unicode::Script>(pair.value));
  243. break;
  244. case CharacterCompareType::And:
  245. case CharacterCompareType::Or:
  246. case CharacterCompareType::EndAndOr:
  247. // FIXME: These are too difficult to handle, so bail out.
  248. return true;
  249. case CharacterCompareType::Undefined:
  250. case CharacterCompareType::RangeExpressionDummy:
  251. // These do not occur in valid bytecode.
  252. VERIFY_NOT_REACHED();
  253. }
  254. }
  255. if constexpr (REGEX_DEBUG) {
  256. dbgln("lhs ranges:");
  257. for (auto it = lhs_ranges.begin(); it != lhs_ranges.end(); ++it)
  258. dbgln(" {}..{}", it.key(), *it);
  259. dbgln("lhs negated ranges:");
  260. for (auto it = lhs_negated_ranges.begin(); it != lhs_negated_ranges.end(); ++it)
  261. dbgln(" {}..{}", it.key(), *it);
  262. }
  263. for (auto const& pair : rhs) {
  264. if (reset_temporary_inverse) {
  265. reset_temporary_inverse = false;
  266. temporary_inverse = false;
  267. } else {
  268. reset_temporary_inverse = true;
  269. }
  270. dbgln_if(REGEX_DEBUG, "check {} ({})...", character_compare_type_name(pair.type), pair.value);
  271. switch (pair.type) {
  272. case CharacterCompareType::Inverse:
  273. inverse = !inverse;
  274. break;
  275. case CharacterCompareType::TemporaryInverse:
  276. temporary_inverse = true;
  277. reset_temporary_inverse = true;
  278. break;
  279. case CharacterCompareType::AnyChar:
  280. // Special case: if not inverted, AnyChar is always in the range.
  281. if (!current_lhs_inversion_state())
  282. return true;
  283. break;
  284. case CharacterCompareType::Char:
  285. if (current_lhs_inversion_state() ^ range_contains(pair.value))
  286. return true;
  287. break;
  288. case CharacterCompareType::String:
  289. // FIXME: We just need to look at the last character of this string, but we only have the first character here.
  290. // Just bail out to avoid false positives.
  291. return true;
  292. case CharacterCompareType::CharClass:
  293. if (current_lhs_inversion_state() ^ char_class_contains(static_cast<CharClass>(pair.value)))
  294. return true;
  295. break;
  296. case CharacterCompareType::CharRange: {
  297. auto range = CharRange(pair.value);
  298. if (current_lhs_inversion_state() ^ range_contains(range))
  299. return true;
  300. break;
  301. }
  302. case CharacterCompareType::LookupTable:
  303. // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it.
  304. return true;
  305. case CharacterCompareType::Reference:
  306. // We've handled this before coming here.
  307. break;
  308. case CharacterCompareType::Property:
  309. // The only reasonable scenario where we can check these properties without spending too much time is if:
  310. // - the ranges are empty
  311. // - the char classes are empty
  312. // - the unicode properties are empty or contain only this property
  313. if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty())
  314. return true;
  315. if (has_any_unicode_property && !lhs_unicode_properties.is_empty() && !lhs_negated_unicode_properties.is_empty()) {
  316. if (current_lhs_inversion_state() ^ lhs_unicode_properties.contains(static_cast<Unicode::Property>(pair.value)))
  317. return true;
  318. if (false == (current_lhs_inversion_state() ^ lhs_negated_unicode_properties.contains(static_cast<Unicode::Property>(pair.value))))
  319. return true;
  320. }
  321. break;
  322. case CharacterCompareType::GeneralCategory:
  323. if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty())
  324. return true;
  325. if (has_any_unicode_property && !lhs_unicode_general_categories.is_empty() && !lhs_negated_unicode_general_categories.is_empty()) {
  326. if (current_lhs_inversion_state() ^ lhs_unicode_general_categories.contains(static_cast<Unicode::GeneralCategory>(pair.value)))
  327. return true;
  328. if (false == (current_lhs_inversion_state() ^ lhs_negated_unicode_general_categories.contains(static_cast<Unicode::GeneralCategory>(pair.value))))
  329. return true;
  330. }
  331. break;
  332. case CharacterCompareType::Script:
  333. if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty())
  334. return true;
  335. if (has_any_unicode_property && !lhs_unicode_scripts.is_empty() && !lhs_negated_unicode_scripts.is_empty()) {
  336. if (current_lhs_inversion_state() ^ lhs_unicode_scripts.contains(static_cast<Unicode::Script>(pair.value)))
  337. return true;
  338. if (false == (current_lhs_inversion_state() ^ lhs_negated_unicode_scripts.contains(static_cast<Unicode::Script>(pair.value))))
  339. return true;
  340. }
  341. break;
  342. case CharacterCompareType::ScriptExtension:
  343. if (!lhs_ranges.is_empty() || !lhs_negated_ranges.is_empty() || !lhs_char_classes.is_empty() || !lhs_negated_char_classes.is_empty())
  344. return true;
  345. if (has_any_unicode_property && !lhs_unicode_script_extensions.is_empty() && !lhs_negated_unicode_script_extensions.is_empty()) {
  346. if (current_lhs_inversion_state() ^ lhs_unicode_script_extensions.contains(static_cast<Unicode::Script>(pair.value)))
  347. return true;
  348. if (false == (current_lhs_inversion_state() ^ lhs_negated_unicode_script_extensions.contains(static_cast<Unicode::Script>(pair.value))))
  349. return true;
  350. }
  351. break;
  352. case CharacterCompareType::And:
  353. case CharacterCompareType::Or:
  354. case CharacterCompareType::EndAndOr:
  355. // FIXME: These are too difficult to handle, so bail out.
  356. return true;
  357. case CharacterCompareType::Undefined:
  358. case CharacterCompareType::RangeExpressionDummy:
  359. // These do not occur in valid bytecode.
  360. VERIFY_NOT_REACHED();
  361. }
  362. }
  363. return false;
  364. }
  365. enum class AtomicRewritePreconditionResult {
  366. SatisfiedWithProperHeader,
  367. SatisfiedWithEmptyHeader,
  368. NotSatisfied,
  369. };
  370. static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode, Block const& repeated_block, Block const& following_block)
  371. {
  372. Vector<Vector<CompareTypeAndValuePair>> repeated_values;
  373. HashTable<size_t> active_capture_groups;
  374. MatchState state;
  375. auto has_seen_actionable_opcode = false;
  376. for (state.instruction_position = repeated_block.start; state.instruction_position < repeated_block.end;) {
  377. auto& opcode = bytecode.get_opcode(state);
  378. switch (opcode.opcode_id()) {
  379. case OpCodeId::Compare: {
  380. has_seen_actionable_opcode = true;
  381. auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
  382. if (repeated_values.is_empty() && any_of(compares, [](auto& compare) { return compare.type == CharacterCompareType::AnyChar; }))
  383. return AtomicRewritePreconditionResult::NotSatisfied;
  384. repeated_values.append(move(compares));
  385. break;
  386. }
  387. case OpCodeId::CheckBegin:
  388. case OpCodeId::CheckEnd:
  389. has_seen_actionable_opcode = true;
  390. if (repeated_values.is_empty())
  391. return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
  392. break;
  393. case OpCodeId::CheckBoundary:
  394. // FIXME: What should we do with these? for now, let's fail.
  395. return AtomicRewritePreconditionResult::NotSatisfied;
  396. case OpCodeId::Restore:
  397. case OpCodeId::GoBack:
  398. return AtomicRewritePreconditionResult::NotSatisfied;
  399. case OpCodeId::SaveRightCaptureGroup:
  400. active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
  401. break;
  402. case OpCodeId::SaveLeftCaptureGroup:
  403. active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
  404. break;
  405. case OpCodeId::ForkJump:
  406. case OpCodeId::ForkReplaceJump:
  407. case OpCodeId::JumpNonEmpty:
  408. // We could attempt to recursively resolve the follow set, but pretending that this just goes nowhere is faster.
  409. if (!has_seen_actionable_opcode)
  410. return AtomicRewritePreconditionResult::NotSatisfied;
  411. break;
  412. default:
  413. break;
  414. }
  415. state.instruction_position += opcode.size();
  416. }
  417. dbgln_if(REGEX_DEBUG, "Found {} entries in reference", repeated_values.size());
  418. dbgln_if(REGEX_DEBUG, "Found {} active capture groups", active_capture_groups.size());
  419. bool following_block_has_at_least_one_compare = false;
  420. // Find the first compare in the following block, it must NOT match any of the values in `repeated_values'.
  421. auto final_instruction = following_block.start;
  422. for (state.instruction_position = following_block.start; state.instruction_position < following_block.end;) {
  423. final_instruction = state.instruction_position;
  424. auto& opcode = bytecode.get_opcode(state);
  425. switch (opcode.opcode_id()) {
  426. // Note: These have to exist since we're effectively repeating the following block as well
  427. case OpCodeId::SaveRightCaptureGroup:
  428. active_capture_groups.set(static_cast<OpCode_SaveRightCaptureGroup const&>(opcode).id());
  429. break;
  430. case OpCodeId::SaveLeftCaptureGroup:
  431. active_capture_groups.set(static_cast<OpCode_SaveLeftCaptureGroup const&>(opcode).id());
  432. break;
  433. case OpCodeId::Compare: {
  434. following_block_has_at_least_one_compare = true;
  435. // We found a compare, let's see what it has.
  436. auto compares = static_cast<OpCode_Compare const&>(opcode).flat_compares();
  437. if (compares.is_empty())
  438. break;
  439. if (any_of(compares, [&](auto& compare) {
  440. return compare.type == CharacterCompareType::AnyChar
  441. || (compare.type == CharacterCompareType::Reference && active_capture_groups.contains(compare.value));
  442. }))
  443. return AtomicRewritePreconditionResult::NotSatisfied;
  444. if (any_of(repeated_values, [&](auto& repeated_value) { return has_overlap(compares, repeated_value); }))
  445. return AtomicRewritePreconditionResult::NotSatisfied;
  446. return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
  447. }
  448. case OpCodeId::CheckBegin:
  449. case OpCodeId::CheckEnd:
  450. return AtomicRewritePreconditionResult::SatisfiedWithProperHeader; // Nothing can match the end!
  451. case OpCodeId::CheckBoundary:
  452. // FIXME: What should we do with these? For now, consider them a failure.
  453. return AtomicRewritePreconditionResult::NotSatisfied;
  454. case OpCodeId::ForkJump:
  455. case OpCodeId::ForkReplaceJump:
  456. case OpCodeId::JumpNonEmpty:
  457. // See note in the previous switch, same cases.
  458. if (!following_block_has_at_least_one_compare)
  459. return AtomicRewritePreconditionResult::NotSatisfied;
  460. break;
  461. default:
  462. break;
  463. }
  464. state.instruction_position += opcode.size();
  465. }
  466. // If the following block falls through, we can't rewrite it.
  467. state.instruction_position = final_instruction;
  468. switch (bytecode.get_opcode(state).opcode_id()) {
  469. case OpCodeId::Jump:
  470. case OpCodeId::JumpNonEmpty:
  471. case OpCodeId::ForkJump:
  472. case OpCodeId::ForkReplaceJump:
  473. break;
  474. default:
  475. return AtomicRewritePreconditionResult::NotSatisfied;
  476. }
  477. if (following_block_has_at_least_one_compare)
  478. return AtomicRewritePreconditionResult::SatisfiedWithProperHeader;
  479. return AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader;
  480. }
  481. template<typename Parser>
  482. bool Regex<Parser>::attempt_rewrite_entire_match_as_substring_search(BasicBlockList const& basic_blocks)
  483. {
  484. // If there's no jumps, we can probably rewrite this as a substring search (Compare { string = str }).
  485. if (basic_blocks.size() > 1)
  486. return false;
  487. if (basic_blocks.is_empty()) {
  488. parser_result.optimization_data.pure_substring_search = ""sv;
  489. return true; // Empty regex, sure.
  490. }
  491. auto& bytecode = parser_result.bytecode;
  492. auto is_unicode = parser_result.options.has_flag_set(AllFlags::Unicode);
  493. // We have a single basic block, let's see if it's a series of character or string compares.
  494. StringBuilder final_string;
  495. MatchState state;
  496. while (state.instruction_position < bytecode.size()) {
  497. auto& opcode = bytecode.get_opcode(state);
  498. switch (opcode.opcode_id()) {
  499. case OpCodeId::Compare: {
  500. auto& compare = static_cast<OpCode_Compare const&>(opcode);
  501. for (auto& flat_compare : compare.flat_compares()) {
  502. if (flat_compare.type != CharacterCompareType::Char)
  503. return false;
  504. if (is_unicode || flat_compare.value <= 0x7f)
  505. final_string.append_code_point(flat_compare.value);
  506. else
  507. final_string.append(bit_cast<char>(static_cast<u8>(flat_compare.value)));
  508. }
  509. break;
  510. }
  511. default:
  512. return false;
  513. }
  514. state.instruction_position += opcode.size();
  515. }
  516. parser_result.optimization_data.pure_substring_search = final_string.to_deprecated_string();
  517. return true;
  518. }
  519. template<typename Parser>
  520. void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const& basic_blocks)
  521. {
  522. auto& bytecode = parser_result.bytecode;
  523. if constexpr (REGEX_DEBUG) {
  524. RegexDebug dbg;
  525. dbg.print_bytecode(*this);
  526. for (auto const& block : basic_blocks)
  527. dbgln("block from {} to {}", block.start, block.end);
  528. }
  529. // A pattern such as:
  530. // bb0 | RE0
  531. // | ForkX bb0
  532. // -------------------------
  533. // bb1 | RE1
  534. // can be rewritten as:
  535. // -------------------------
  536. // bb0 | RE0
  537. // | ForkReplaceX bb0
  538. // -------------------------
  539. // bb1 | RE1
  540. // provided that first(RE1) not-in end(RE0), which is to say
  541. // that RE1 cannot start with whatever RE0 has matched (ever).
  542. //
  543. // Alternatively, a second form of this pattern can also occur:
  544. // bb0 | *
  545. // | ForkX bb2
  546. // ------------------------
  547. // bb1 | RE0
  548. // | Jump bb0
  549. // ------------------------
  550. // bb2 | RE1
  551. // which can be transformed (with the same preconditions) to:
  552. // bb0 | *
  553. // | ForkReplaceX bb2
  554. // ------------------------
  555. // bb1 | RE0
  556. // | Jump bb0
  557. // ------------------------
  558. // bb2 | RE1
  559. enum class AlternateForm {
  560. DirectLoopWithoutHeader, // loop without proper header, a block forking to itself. i.e. the first form.
  561. DirectLoopWithoutHeaderAndEmptyFollow, // loop without proper header, a block forking to itself. i.e. the first form but with RE1 being empty.
  562. DirectLoopWithHeader, // loop with proper header, i.e. the second form.
  563. };
  564. struct CandidateBlock {
  565. Block forking_block;
  566. Optional<Block> new_target_block;
  567. AlternateForm form;
  568. };
  569. Vector<CandidateBlock> candidate_blocks;
  570. auto is_an_eligible_jump = [](OpCode const& opcode, size_t ip, size_t block_start, AlternateForm alternate_form) {
  571. switch (opcode.opcode_id()) {
  572. case OpCodeId::JumpNonEmpty: {
  573. auto const& op = static_cast<OpCode_JumpNonEmpty const&>(opcode);
  574. auto form = op.form();
  575. if (form != OpCodeId::Jump && alternate_form == AlternateForm::DirectLoopWithHeader)
  576. return false;
  577. if (form != OpCodeId::ForkJump && form != OpCodeId::ForkStay && alternate_form == AlternateForm::DirectLoopWithoutHeader)
  578. return false;
  579. return op.offset() + ip + opcode.size() == block_start;
  580. }
  581. case OpCodeId::ForkJump:
  582. if (alternate_form == AlternateForm::DirectLoopWithHeader)
  583. return false;
  584. return static_cast<OpCode_ForkJump const&>(opcode).offset() + ip + opcode.size() == block_start;
  585. case OpCodeId::ForkStay:
  586. if (alternate_form == AlternateForm::DirectLoopWithHeader)
  587. return false;
  588. return static_cast<OpCode_ForkStay const&>(opcode).offset() + ip + opcode.size() == block_start;
  589. case OpCodeId::Jump:
  590. // Infinite loop does *not* produce forks.
  591. if (alternate_form == AlternateForm::DirectLoopWithoutHeader)
  592. return false;
  593. if (alternate_form == AlternateForm::DirectLoopWithHeader)
  594. return static_cast<OpCode_Jump const&>(opcode).offset() + ip + opcode.size() == block_start;
  595. VERIFY_NOT_REACHED();
  596. default:
  597. return false;
  598. }
  599. };
  600. for (size_t i = 0; i < basic_blocks.size(); ++i) {
  601. auto forking_block = basic_blocks[i];
  602. Optional<Block> fork_fallback_block;
  603. if (i + 1 < basic_blocks.size())
  604. fork_fallback_block = basic_blocks[i + 1];
  605. MatchState state;
  606. // Check if the last instruction in this block is a jump to the block itself:
  607. {
  608. state.instruction_position = forking_block.end;
  609. auto& opcode = bytecode.get_opcode(state);
  610. if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.start, AlternateForm::DirectLoopWithoutHeader)) {
  611. // We've found RE0 (and RE1 is just the following block, if any), let's see if the precondition applies.
  612. // if RE1 is empty, there's no first(RE1), so this is an automatic pass.
  613. if (!fork_fallback_block.has_value()
  614. || (fork_fallback_block->end == fork_fallback_block->start && block_satisfies_atomic_rewrite_precondition(bytecode, forking_block, *fork_fallback_block) != AtomicRewritePreconditionResult::NotSatisfied)) {
  615. candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
  616. break;
  617. }
  618. auto precondition = block_satisfies_atomic_rewrite_precondition(bytecode, forking_block, *fork_fallback_block);
  619. if (precondition == AtomicRewritePreconditionResult::SatisfiedWithProperHeader) {
  620. candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeader });
  621. break;
  622. }
  623. if (precondition == AtomicRewritePreconditionResult::SatisfiedWithEmptyHeader) {
  624. candidate_blocks.append({ forking_block, fork_fallback_block, AlternateForm::DirectLoopWithoutHeaderAndEmptyFollow });
  625. break;
  626. }
  627. }
  628. }
  629. // Check if the last instruction in the last block is a direct jump to this block
  630. if (fork_fallback_block.has_value()) {
  631. state.instruction_position = fork_fallback_block->end;
  632. auto& opcode = bytecode.get_opcode(state);
  633. if (is_an_eligible_jump(opcode, state.instruction_position, forking_block.start, AlternateForm::DirectLoopWithHeader)) {
  634. // We've found bb1 and bb0, let's just make sure that bb0 forks to bb2.
  635. state.instruction_position = forking_block.end;
  636. auto& opcode = bytecode.get_opcode(state);
  637. if (opcode.opcode_id() == OpCodeId::ForkJump || opcode.opcode_id() == OpCodeId::ForkStay) {
  638. Optional<Block> block_following_fork_fallback;
  639. if (i + 2 < basic_blocks.size())
  640. block_following_fork_fallback = basic_blocks[i + 2];
  641. if (!block_following_fork_fallback.has_value()
  642. || block_satisfies_atomic_rewrite_precondition(bytecode, *fork_fallback_block, *block_following_fork_fallback) != AtomicRewritePreconditionResult::NotSatisfied) {
  643. candidate_blocks.append({ forking_block, {}, AlternateForm::DirectLoopWithHeader });
  644. break;
  645. }
  646. }
  647. }
  648. }
  649. }
  650. dbgln_if(REGEX_DEBUG, "Found {} candidate blocks", candidate_blocks.size());
  651. if (candidate_blocks.is_empty()) {
  652. dbgln_if(REGEX_DEBUG, "Failed to find anything for {}", pattern_value);
  653. return;
  654. }
  655. RedBlackTree<size_t, size_t> needed_patches;
  656. // Reverse the blocks, so we can patch the bytecode without messing with the latter patches.
  657. quick_sort(candidate_blocks, [](auto& a, auto& b) { return b.forking_block.start > a.forking_block.start; });
  658. for (auto& candidate : candidate_blocks) {
  659. // Note that both forms share a ForkReplace patch in forking_block.
  660. // Patch the ForkX in forking_block to be a ForkReplaceX instead.
  661. auto& opcode_id = bytecode[candidate.forking_block.end];
  662. if (opcode_id == (ByteCodeValueType)OpCodeId::ForkStay) {
  663. opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceStay;
  664. } else if (opcode_id == (ByteCodeValueType)OpCodeId::ForkJump) {
  665. opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceJump;
  666. } else if (opcode_id == (ByteCodeValueType)OpCodeId::JumpNonEmpty) {
  667. auto& jump_opcode_id = bytecode[candidate.forking_block.end + 3];
  668. if (jump_opcode_id == (ByteCodeValueType)OpCodeId::ForkStay)
  669. jump_opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceStay;
  670. else if (jump_opcode_id == (ByteCodeValueType)OpCodeId::ForkJump)
  671. jump_opcode_id = (ByteCodeValueType)OpCodeId::ForkReplaceJump;
  672. else
  673. VERIFY_NOT_REACHED();
  674. } else {
  675. VERIFY_NOT_REACHED();
  676. }
  677. }
  678. if (!needed_patches.is_empty()) {
  679. MatchState state;
  680. auto bytecode_size = bytecode.size();
  681. state.instruction_position = 0;
  682. struct Patch {
  683. ssize_t value;
  684. size_t offset;
  685. bool should_negate { false };
  686. };
  687. for (;;) {
  688. if (state.instruction_position >= bytecode_size)
  689. break;
  690. auto& opcode = bytecode.get_opcode(state);
  691. Stack<Patch, 2> patch_points;
  692. switch (opcode.opcode_id()) {
  693. case OpCodeId::Jump:
  694. patch_points.push({ static_cast<OpCode_Jump const&>(opcode).offset(), state.instruction_position + 1 });
  695. break;
  696. case OpCodeId::JumpNonEmpty:
  697. patch_points.push({ static_cast<OpCode_JumpNonEmpty const&>(opcode).offset(), state.instruction_position + 1 });
  698. patch_points.push({ static_cast<OpCode_JumpNonEmpty const&>(opcode).checkpoint(), state.instruction_position + 2 });
  699. break;
  700. case OpCodeId::ForkJump:
  701. patch_points.push({ static_cast<OpCode_ForkJump const&>(opcode).offset(), state.instruction_position + 1 });
  702. break;
  703. case OpCodeId::ForkStay:
  704. patch_points.push({ static_cast<OpCode_ForkStay const&>(opcode).offset(), state.instruction_position + 1 });
  705. break;
  706. case OpCodeId::Repeat:
  707. patch_points.push({ -(ssize_t) static_cast<OpCode_Repeat const&>(opcode).offset(), state.instruction_position + 1, true });
  708. break;
  709. default:
  710. break;
  711. }
  712. while (!patch_points.is_empty()) {
  713. auto& patch_point = patch_points.top();
  714. auto target_offset = patch_point.value + state.instruction_position + opcode.size();
  715. constexpr auto do_patch = [](auto& patch_it, auto& patch_point, auto& target_offset, auto& bytecode, auto ip) {
  716. if (patch_it.key() == ip)
  717. return;
  718. if (patch_point.value < 0 && target_offset <= patch_it.key() && ip > patch_it.key())
  719. bytecode[patch_point.offset] += (patch_point.should_negate ? 1 : -1) * (*patch_it);
  720. else if (patch_point.value > 0 && target_offset >= patch_it.key() && ip < patch_it.key())
  721. bytecode[patch_point.offset] += (patch_point.should_negate ? -1 : 1) * (*patch_it);
  722. };
  723. if (auto patch_it = needed_patches.find_largest_not_above_iterator(target_offset); !patch_it.is_end())
  724. do_patch(patch_it, patch_point, target_offset, bytecode, state.instruction_position);
  725. else if (auto patch_it = needed_patches.find_largest_not_above_iterator(state.instruction_position); !patch_it.is_end())
  726. do_patch(patch_it, patch_point, target_offset, bytecode, state.instruction_position);
  727. patch_points.pop();
  728. }
  729. state.instruction_position += opcode.size();
  730. }
  731. }
  732. if constexpr (REGEX_DEBUG) {
  733. warnln("Transformed to:");
  734. RegexDebug dbg;
  735. dbg.print_bytecode(*this);
  736. }
  737. }
  738. void Optimizer::append_alternation(ByteCode& target, ByteCode&& left, ByteCode&& right)
  739. {
  740. Array<ByteCode, 2> alternatives;
  741. alternatives[0] = move(left);
  742. alternatives[1] = move(right);
  743. append_alternation(target, alternatives);
  744. }
  745. void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives)
  746. {
  747. if (alternatives.size() == 0)
  748. return;
  749. if (alternatives.size() == 1)
  750. return target.extend(move(alternatives[0]));
  751. if (all_of(alternatives, [](auto& x) { return x.is_empty(); }))
  752. return;
  753. for (auto& entry : alternatives)
  754. entry.flatten();
  755. #if REGEX_DEBUG
  756. ScopeLogger<true> log;
  757. warnln("Alternations:");
  758. RegexDebug dbg;
  759. for (auto& entry : alternatives) {
  760. warnln("----------");
  761. dbg.print_bytecode(entry);
  762. }
  763. ScopeGuard print_at_end {
  764. [&] {
  765. warnln("======================");
  766. RegexDebug dbg;
  767. dbg.print_bytecode(target);
  768. }
  769. };
  770. #endif
  771. Vector<Vector<Detail::Block>> basic_blocks;
  772. basic_blocks.ensure_capacity(alternatives.size());
  773. for (auto& entry : alternatives)
  774. basic_blocks.append(Regex<PosixBasicParser>::split_basic_blocks(entry));
  775. Optional<size_t> left_skip;
  776. size_t shared_block_count = basic_blocks.first().size();
  777. for (auto& entry : basic_blocks)
  778. shared_block_count = min(shared_block_count, entry.size());
  779. MatchState state;
  780. for (size_t block_index = 0; block_index < shared_block_count; block_index++) {
  781. auto& left_block = basic_blocks.first()[block_index];
  782. auto left_end = block_index + 1 == basic_blocks.first().size() ? left_block.end : basic_blocks.first()[block_index + 1].start;
  783. auto can_continue = true;
  784. for (size_t i = 1; i < alternatives.size(); ++i) {
  785. auto& right_blocks = basic_blocks[i];
  786. auto& right_block = right_blocks[block_index];
  787. auto right_end = block_index + 1 == right_blocks.size() ? right_block.end : right_blocks[block_index + 1].start;
  788. if (left_end - left_block.start != right_end - right_block.start) {
  789. can_continue = false;
  790. break;
  791. }
  792. if (alternatives[0].spans().slice(left_block.start, left_end - left_block.start) != alternatives[i].spans().slice(right_block.start, right_end - right_block.start)) {
  793. can_continue = false;
  794. break;
  795. }
  796. }
  797. if (!can_continue)
  798. break;
  799. size_t i = 0;
  800. for (auto& entry : alternatives) {
  801. auto& blocks = basic_blocks[i++];
  802. auto& block = blocks[block_index];
  803. auto end = block_index + 1 == blocks.size() ? block.end : blocks[block_index + 1].start;
  804. state.instruction_position = block.start;
  805. size_t skip = 0;
  806. while (state.instruction_position < end) {
  807. auto& opcode = entry.get_opcode(state);
  808. state.instruction_position += opcode.size();
  809. skip = state.instruction_position;
  810. }
  811. if (left_skip.has_value())
  812. left_skip = min(skip, *left_skip);
  813. else
  814. left_skip = skip;
  815. }
  816. }
  817. // Remove forward jumps as they no longer make sense.
  818. state.instruction_position = 0;
  819. for (size_t i = 0; i < left_skip.value_or(0);) {
  820. auto& opcode = alternatives[0].get_opcode(state);
  821. switch (opcode.opcode_id()) {
  822. case OpCodeId::Jump:
  823. case OpCodeId::ForkJump:
  824. case OpCodeId::JumpNonEmpty:
  825. case OpCodeId::ForkStay:
  826. case OpCodeId::ForkReplaceJump:
  827. case OpCodeId::ForkReplaceStay:
  828. if (opcode.argument(0) + opcode.size() > left_skip.value_or(0)) {
  829. left_skip = i;
  830. goto break_out;
  831. }
  832. break;
  833. default:
  834. break;
  835. }
  836. i += opcode.size();
  837. }
  838. break_out:;
  839. dbgln_if(REGEX_DEBUG, "Skipping {}/{} bytecode entries from {}", left_skip, 0, alternatives[0].size());
  840. if (left_skip.has_value() && *left_skip > 0) {
  841. target.extend(alternatives[0].release_slice(basic_blocks.first().first().start, *left_skip));
  842. auto first = true;
  843. for (auto& entry : alternatives) {
  844. if (first) {
  845. first = false;
  846. continue;
  847. }
  848. entry = entry.release_slice(*left_skip);
  849. }
  850. }
  851. if (all_of(alternatives, [](auto& entry) { return entry.is_empty(); }))
  852. return;
  853. size_t patch_start = target.size();
  854. for (size_t i = 1; i < alternatives.size(); ++i) {
  855. target.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
  856. target.empend(0u); // To be filled later.
  857. }
  858. size_t size_to_jump = 0;
  859. bool seen_one_empty = false;
  860. for (size_t i = alternatives.size(); i > 0; --i) {
  861. auto& entry = alternatives[i - 1];
  862. if (entry.is_empty()) {
  863. if (seen_one_empty)
  864. continue;
  865. seen_one_empty = true;
  866. }
  867. auto is_first = i == 1;
  868. auto instruction_size = entry.size() + (is_first ? 0 : 2); // Jump; -> +2
  869. size_to_jump += instruction_size;
  870. if (!is_first)
  871. target[patch_start + (i - 2) * 2 + 1] = size_to_jump + (alternatives.size() - i) * 2;
  872. dbgln_if(REGEX_DEBUG, "{} size = {}, cum={}", i - 1, instruction_size, size_to_jump);
  873. }
  874. seen_one_empty = false;
  875. for (size_t i = alternatives.size(); i > 0; --i) {
  876. auto& chunk = alternatives[i - 1];
  877. if (chunk.is_empty()) {
  878. if (seen_one_empty)
  879. continue;
  880. seen_one_empty = true;
  881. }
  882. ByteCode* previous_chunk = nullptr;
  883. size_t j = i - 1;
  884. auto seen_one_empty_before = chunk.is_empty();
  885. while (j >= 1) {
  886. --j;
  887. auto& candidate_chunk = alternatives[j];
  888. if (candidate_chunk.is_empty()) {
  889. if (seen_one_empty_before)
  890. continue;
  891. }
  892. previous_chunk = &candidate_chunk;
  893. break;
  894. }
  895. size_to_jump -= chunk.size() + (previous_chunk ? 2 : 0);
  896. target.extend(move(chunk));
  897. target.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
  898. target.empend(size_to_jump); // Jump to the _END label
  899. }
  900. }
  901. enum class LookupTableInsertionOutcome {
  902. Successful,
  903. ReplaceWithAnyChar,
  904. TemporaryInversionNeeded,
  905. PermanentInversionNeeded,
  906. FlushOnInsertion,
  907. FinishFlushOnInsertion,
  908. CannotPlaceInTable,
  909. };
  910. static LookupTableInsertionOutcome insert_into_lookup_table(RedBlackTree<ByteCodeValueType, CharRange>& table, CompareTypeAndValuePair pair)
  911. {
  912. switch (pair.type) {
  913. case CharacterCompareType::Inverse:
  914. return LookupTableInsertionOutcome::PermanentInversionNeeded;
  915. case CharacterCompareType::TemporaryInverse:
  916. return LookupTableInsertionOutcome::TemporaryInversionNeeded;
  917. case CharacterCompareType::AnyChar:
  918. return LookupTableInsertionOutcome::ReplaceWithAnyChar;
  919. case CharacterCompareType::CharClass:
  920. return LookupTableInsertionOutcome::CannotPlaceInTable;
  921. case CharacterCompareType::Char:
  922. table.insert(pair.value, { (u32)pair.value, (u32)pair.value });
  923. break;
  924. case CharacterCompareType::CharRange: {
  925. CharRange range { pair.value };
  926. table.insert(range.from, range);
  927. break;
  928. }
  929. case CharacterCompareType::EndAndOr:
  930. return LookupTableInsertionOutcome::FinishFlushOnInsertion;
  931. case CharacterCompareType::And:
  932. return LookupTableInsertionOutcome::FlushOnInsertion;
  933. case CharacterCompareType::Reference:
  934. case CharacterCompareType::Property:
  935. case CharacterCompareType::GeneralCategory:
  936. case CharacterCompareType::Script:
  937. case CharacterCompareType::ScriptExtension:
  938. case CharacterCompareType::Or:
  939. return LookupTableInsertionOutcome::CannotPlaceInTable;
  940. case CharacterCompareType::Undefined:
  941. case CharacterCompareType::RangeExpressionDummy:
  942. case CharacterCompareType::String:
  943. case CharacterCompareType::LookupTable:
  944. VERIFY_NOT_REACHED();
  945. }
  946. return LookupTableInsertionOutcome::Successful;
  947. }
  948. void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndValuePair>&& pairs)
  949. {
  950. ByteCode arguments;
  951. size_t argument_count = 0;
  952. if (pairs.size() <= 1) {
  953. for (auto& pair : pairs) {
  954. arguments.append(to_underlying(pair.type));
  955. if (pair.type != CharacterCompareType::AnyChar
  956. && pair.type != CharacterCompareType::TemporaryInverse
  957. && pair.type != CharacterCompareType::Inverse
  958. && pair.type != CharacterCompareType::And
  959. && pair.type != CharacterCompareType::Or
  960. && pair.type != CharacterCompareType::EndAndOr)
  961. arguments.append(pair.value);
  962. ++argument_count;
  963. }
  964. } else {
  965. RedBlackTree<ByteCodeValueType, CharRange> table;
  966. RedBlackTree<ByteCodeValueType, CharRange> inverted_table;
  967. auto* current_table = &table;
  968. auto* current_inverted_table = &inverted_table;
  969. bool invert_for_next_iteration = false;
  970. bool is_currently_inverted = false;
  971. auto flush_tables = [&] {
  972. auto append_table = [&](auto& table) {
  973. ++argument_count;
  974. arguments.append(to_underlying(CharacterCompareType::LookupTable));
  975. auto size_index = arguments.size();
  976. arguments.append(0);
  977. Optional<CharRange> active_range;
  978. size_t range_count = 0;
  979. for (auto& range : table) {
  980. if (!active_range.has_value()) {
  981. active_range = range;
  982. continue;
  983. }
  984. if (range.from <= active_range->to + 1 && range.to + 1 >= active_range->from) {
  985. active_range = CharRange { min(range.from, active_range->from), max(range.to, active_range->to) };
  986. } else {
  987. ++range_count;
  988. arguments.append(active_range.release_value());
  989. active_range = range;
  990. }
  991. }
  992. if (active_range.has_value()) {
  993. ++range_count;
  994. arguments.append(active_range.release_value());
  995. }
  996. arguments[size_index] = range_count;
  997. };
  998. auto contains_regular_table = !table.is_empty();
  999. auto contains_inverted_table = !inverted_table.is_empty();
  1000. if (contains_regular_table)
  1001. append_table(table);
  1002. if (contains_inverted_table) {
  1003. ++argument_count;
  1004. arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
  1005. append_table(inverted_table);
  1006. }
  1007. table.clear();
  1008. inverted_table.clear();
  1009. };
  1010. auto flush_on_every_insertion = false;
  1011. for (auto& value : pairs) {
  1012. auto should_invert_after_this_iteration = invert_for_next_iteration;
  1013. invert_for_next_iteration = false;
  1014. auto insertion_result = insert_into_lookup_table(*current_table, value);
  1015. switch (insertion_result) {
  1016. case LookupTableInsertionOutcome::Successful:
  1017. if (flush_on_every_insertion)
  1018. flush_tables();
  1019. break;
  1020. case LookupTableInsertionOutcome::ReplaceWithAnyChar: {
  1021. table.clear();
  1022. inverted_table.clear();
  1023. arguments.append(to_underlying(CharacterCompareType::AnyChar));
  1024. ++argument_count;
  1025. break;
  1026. }
  1027. case LookupTableInsertionOutcome::TemporaryInversionNeeded:
  1028. swap(current_table, current_inverted_table);
  1029. invert_for_next_iteration = true;
  1030. is_currently_inverted = !is_currently_inverted;
  1031. break;
  1032. case LookupTableInsertionOutcome::PermanentInversionNeeded:
  1033. flush_tables();
  1034. arguments.append(to_underlying(CharacterCompareType::Inverse));
  1035. ++argument_count;
  1036. break;
  1037. case LookupTableInsertionOutcome::FlushOnInsertion:
  1038. case LookupTableInsertionOutcome::FinishFlushOnInsertion:
  1039. flush_tables();
  1040. flush_on_every_insertion = insertion_result == LookupTableInsertionOutcome::FlushOnInsertion;
  1041. [[fallthrough]];
  1042. case LookupTableInsertionOutcome::CannotPlaceInTable:
  1043. if (is_currently_inverted) {
  1044. arguments.append(to_underlying(CharacterCompareType::TemporaryInverse));
  1045. ++argument_count;
  1046. }
  1047. arguments.append(to_underlying(value.type));
  1048. if (value.type != CharacterCompareType::AnyChar
  1049. && value.type != CharacterCompareType::TemporaryInverse
  1050. && value.type != CharacterCompareType::Inverse
  1051. && value.type != CharacterCompareType::And
  1052. && value.type != CharacterCompareType::Or
  1053. && value.type != CharacterCompareType::EndAndOr)
  1054. arguments.append(value.value);
  1055. ++argument_count;
  1056. break;
  1057. }
  1058. if (should_invert_after_this_iteration) {
  1059. swap(current_table, current_inverted_table);
  1060. is_currently_inverted = !is_currently_inverted;
  1061. }
  1062. }
  1063. flush_tables();
  1064. }
  1065. target.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
  1066. target.empend(argument_count); // number of arguments
  1067. target.empend(arguments.size()); // size of arguments
  1068. target.extend(move(arguments));
  1069. }
  1070. template void Regex<PosixBasicParser>::run_optimization_passes();
  1071. template void Regex<PosixExtendedParser>::run_optimization_passes();
  1072. template void Regex<ECMA262Parser>::run_optimization_passes();
  1073. }