RegexByteCode.cpp 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "RegexByteCode.h"
  7. #include "AK/StringBuilder.h"
  8. #include "RegexDebug.h"
  9. #include <AK/CharacterTypes.h>
  10. #include <AK/Debug.h>
  11. #include <LibUnicode/CharacterTypes.h>
  12. namespace regex {
  13. char const* OpCode::name(OpCodeId opcode_id)
  14. {
  15. switch (opcode_id) {
  16. #define __ENUMERATE_OPCODE(x) \
  17. case OpCodeId::x: \
  18. return #x;
  19. ENUMERATE_OPCODES
  20. #undef __ENUMERATE_OPCODE
  21. default:
  22. VERIFY_NOT_REACHED();
  23. return "<Unknown>";
  24. }
  25. }
  26. char const* OpCode::name() const
  27. {
  28. return name(opcode_id());
  29. }
  30. char const* execution_result_name(ExecutionResult result)
  31. {
  32. switch (result) {
  33. #define __ENUMERATE_EXECUTION_RESULT(x) \
  34. case ExecutionResult::x: \
  35. return #x;
  36. ENUMERATE_EXECUTION_RESULTS
  37. #undef __ENUMERATE_EXECUTION_RESULT
  38. default:
  39. VERIFY_NOT_REACHED();
  40. return "<Unknown>";
  41. }
  42. }
  43. char const* boundary_check_type_name(BoundaryCheckType ty)
  44. {
  45. switch (ty) {
  46. #define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) \
  47. case BoundaryCheckType::x: \
  48. return #x;
  49. ENUMERATE_BOUNDARY_CHECK_TYPES
  50. #undef __ENUMERATE_BOUNDARY_CHECK_TYPE
  51. default:
  52. VERIFY_NOT_REACHED();
  53. return "<Unknown>";
  54. }
  55. }
  56. char const* character_compare_type_name(CharacterCompareType ch_compare_type)
  57. {
  58. switch (ch_compare_type) {
  59. #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) \
  60. case CharacterCompareType::x: \
  61. return #x;
  62. ENUMERATE_CHARACTER_COMPARE_TYPES
  63. #undef __ENUMERATE_CHARACTER_COMPARE_TYPE
  64. default:
  65. VERIFY_NOT_REACHED();
  66. return "<Unknown>";
  67. }
  68. }
  69. static char const* character_class_name(CharClass ch_class)
  70. {
  71. switch (ch_class) {
  72. #define __ENUMERATE_CHARACTER_CLASS(x) \
  73. case CharClass::x: \
  74. return #x;
  75. ENUMERATE_CHARACTER_CLASSES
  76. #undef __ENUMERATE_CHARACTER_CLASS
  77. default:
  78. VERIFY_NOT_REACHED();
  79. return "<Unknown>";
  80. }
  81. }
  82. OwnPtr<OpCode> ByteCode::s_opcodes[(size_t)OpCodeId::Last + 1];
  83. bool ByteCode::s_opcodes_initialized { false };
  84. void ByteCode::ensure_opcodes_initialized()
  85. {
  86. if (s_opcodes_initialized)
  87. return;
  88. for (u32 i = (u32)OpCodeId::First; i <= (u32)OpCodeId::Last; ++i) {
  89. switch ((OpCodeId)i) {
  90. case OpCodeId::Exit:
  91. s_opcodes[i] = make<OpCode_Exit>();
  92. break;
  93. case OpCodeId::Jump:
  94. s_opcodes[i] = make<OpCode_Jump>();
  95. break;
  96. case OpCodeId::Compare:
  97. s_opcodes[i] = make<OpCode_Compare>();
  98. break;
  99. case OpCodeId::CheckEnd:
  100. s_opcodes[i] = make<OpCode_CheckEnd>();
  101. break;
  102. case OpCodeId::CheckBoundary:
  103. s_opcodes[i] = make<OpCode_CheckBoundary>();
  104. break;
  105. case OpCodeId::ForkJump:
  106. s_opcodes[i] = make<OpCode_ForkJump>();
  107. break;
  108. case OpCodeId::ForkStay:
  109. s_opcodes[i] = make<OpCode_ForkStay>();
  110. break;
  111. case OpCodeId::FailForks:
  112. s_opcodes[i] = make<OpCode_FailForks>();
  113. break;
  114. case OpCodeId::Save:
  115. s_opcodes[i] = make<OpCode_Save>();
  116. break;
  117. case OpCodeId::Restore:
  118. s_opcodes[i] = make<OpCode_Restore>();
  119. break;
  120. case OpCodeId::GoBack:
  121. s_opcodes[i] = make<OpCode_GoBack>();
  122. break;
  123. case OpCodeId::CheckBegin:
  124. s_opcodes[i] = make<OpCode_CheckBegin>();
  125. break;
  126. case OpCodeId::ClearCaptureGroup:
  127. s_opcodes[i] = make<OpCode_ClearCaptureGroup>();
  128. break;
  129. case OpCodeId::ClearNamedCaptureGroup:
  130. s_opcodes[i] = make<OpCode_ClearNamedCaptureGroup>();
  131. break;
  132. case OpCodeId::SaveLeftCaptureGroup:
  133. s_opcodes[i] = make<OpCode_SaveLeftCaptureGroup>();
  134. break;
  135. case OpCodeId::SaveRightCaptureGroup:
  136. s_opcodes[i] = make<OpCode_SaveRightCaptureGroup>();
  137. break;
  138. case OpCodeId::SaveLeftNamedCaptureGroup:
  139. s_opcodes[i] = make<OpCode_SaveLeftNamedCaptureGroup>();
  140. break;
  141. case OpCodeId::SaveRightNamedCaptureGroup:
  142. s_opcodes[i] = make<OpCode_SaveRightNamedCaptureGroup>();
  143. break;
  144. }
  145. }
  146. s_opcodes_initialized = true;
  147. }
  148. ALWAYS_INLINE OpCode& ByteCode::get_opcode_by_id(OpCodeId id) const
  149. {
  150. VERIFY(id >= OpCodeId::First && id <= OpCodeId::Last);
  151. auto& opcode = s_opcodes[(u32)id];
  152. opcode->set_bytecode(*const_cast<ByteCode*>(this));
  153. return *opcode;
  154. }
  155. OpCode& ByteCode::get_opcode(MatchState& state) const
  156. {
  157. OpCodeId opcode_id;
  158. if (state.instruction_position >= size())
  159. opcode_id = OpCodeId::Exit;
  160. else
  161. opcode_id = (OpCodeId)at(state.instruction_position);
  162. auto& opcode = get_opcode_by_id(opcode_id);
  163. opcode.set_state(state);
  164. return opcode;
  165. }
  166. ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  167. {
  168. if (state.string_position > input.view.length() || state.instruction_position >= m_bytecode->size())
  169. return ExecutionResult::Succeeded;
  170. return ExecutionResult::Failed;
  171. }
  172. ALWAYS_INLINE ExecutionResult OpCode_Save::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  173. {
  174. input.saved_positions.append(state.string_position);
  175. return ExecutionResult::Continue;
  176. }
  177. ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  178. {
  179. if (input.saved_positions.is_empty())
  180. return ExecutionResult::Failed;
  181. state.string_position = input.saved_positions.take_last();
  182. return ExecutionResult::Continue;
  183. }
  184. ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(MatchInput const&, MatchState& state, MatchOutput&) const
  185. {
  186. if (count() > state.string_position)
  187. return ExecutionResult::Failed_ExecuteLowPrioForks;
  188. state.string_position -= count();
  189. return ExecutionResult::Continue;
  190. }
  191. ALWAYS_INLINE ExecutionResult OpCode_FailForks::execute(MatchInput const& input, MatchState&, MatchOutput&) const
  192. {
  193. VERIFY(count() > 0);
  194. input.fail_counter += count() - 1;
  195. return ExecutionResult::Failed_ExecuteLowPrioForks;
  196. }
  197. ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(MatchInput const&, MatchState& state, MatchOutput&) const
  198. {
  199. state.instruction_position += offset();
  200. return ExecutionResult::Continue;
  201. }
  202. ALWAYS_INLINE ExecutionResult OpCode_ForkJump::execute(MatchInput const&, MatchState& state, MatchOutput&) const
  203. {
  204. state.fork_at_position = state.instruction_position + size() + offset();
  205. return ExecutionResult::Fork_PrioHigh;
  206. }
  207. ALWAYS_INLINE ExecutionResult OpCode_ForkStay::execute(MatchInput const&, MatchState& state, MatchOutput&) const
  208. {
  209. state.fork_at_position = state.instruction_position + size() + offset();
  210. return ExecutionResult::Fork_PrioLow;
  211. }
  212. ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  213. {
  214. if (0 == state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
  215. return ExecutionResult::Failed_ExecuteLowPrioForks;
  216. if ((0 == state.string_position && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
  217. || (0 != state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
  218. || (0 == state.string_position && (input.regex_options & AllFlags::Global)))
  219. return ExecutionResult::Continue;
  220. return ExecutionResult::Failed_ExecuteLowPrioForks;
  221. }
  222. ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  223. {
  224. auto isword = [](auto ch) { return is_ascii_alphanumeric(ch) || ch == '_'; };
  225. auto is_word_boundary = [&] {
  226. if (state.string_position == input.view.length()) {
  227. if (state.string_position > 0 && isword(input.view[state.string_position - 1]))
  228. return true;
  229. return false;
  230. }
  231. if (state.string_position == 0) {
  232. if (isword(input.view[0]))
  233. return true;
  234. return false;
  235. }
  236. return !!(isword(input.view[state.string_position]) ^ isword(input.view[state.string_position - 1]));
  237. };
  238. switch (type()) {
  239. case BoundaryCheckType::Word: {
  240. if (is_word_boundary())
  241. return ExecutionResult::Continue;
  242. return ExecutionResult::Failed_ExecuteLowPrioForks;
  243. }
  244. case BoundaryCheckType::NonWord: {
  245. if (!is_word_boundary())
  246. return ExecutionResult::Continue;
  247. return ExecutionResult::Failed_ExecuteLowPrioForks;
  248. }
  249. }
  250. VERIFY_NOT_REACHED();
  251. }
  252. ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  253. {
  254. if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine))
  255. return ExecutionResult::Failed_ExecuteLowPrioForks;
  256. if ((state.string_position == input.view.length() && !(input.regex_options & AllFlags::MatchNotEndOfLine))
  257. || (state.string_position != input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
  258. return ExecutionResult::Continue;
  259. return ExecutionResult::Failed_ExecuteLowPrioForks;
  260. }
  261. ALWAYS_INLINE ExecutionResult OpCode_ClearCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  262. {
  263. if (input.match_index < state.capture_group_matches.size()) {
  264. auto& group = state.capture_group_matches[input.match_index];
  265. if (id() < group.size())
  266. group[id()].reset();
  267. }
  268. return ExecutionResult::Continue;
  269. }
  270. ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  271. {
  272. if (input.match_index >= state.capture_group_matches.size()) {
  273. state.capture_group_matches.ensure_capacity(input.match_index);
  274. auto capacity = state.capture_group_matches.capacity();
  275. for (size_t i = state.capture_group_matches.size(); i <= capacity; ++i)
  276. state.capture_group_matches.empend();
  277. }
  278. if (id() >= state.capture_group_matches.at(input.match_index).size()) {
  279. state.capture_group_matches.at(input.match_index).ensure_capacity(id());
  280. auto capacity = state.capture_group_matches.at(input.match_index).capacity();
  281. for (size_t i = state.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i)
  282. state.capture_group_matches.at(input.match_index).empend();
  283. }
  284. state.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position;
  285. return ExecutionResult::Continue;
  286. }
  287. ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  288. {
  289. auto& match = state.capture_group_matches.at(input.match_index).at(id());
  290. auto start_position = match.left_column;
  291. if (state.string_position < start_position)
  292. return ExecutionResult::Failed_ExecuteLowPrioForks;
  293. auto length = state.string_position - start_position;
  294. if (start_position < match.column)
  295. return ExecutionResult::Continue;
  296. VERIFY(start_position + length <= input.view.length());
  297. auto view = input.view.substring_view(start_position, length);
  298. if (input.regex_options & AllFlags::StringCopyMatches) {
  299. match = { view.to_string(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string
  300. } else {
  301. match = { view, input.line, start_position, input.global_offset + start_position }; // take view to original string
  302. }
  303. return ExecutionResult::Continue;
  304. }
  305. ALWAYS_INLINE ExecutionResult OpCode_ClearNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  306. {
  307. if (input.match_index < state.capture_group_matches.size()) {
  308. auto& group = state.named_capture_group_matches[input.match_index];
  309. if (auto it = group.find(name()); it != group.end())
  310. it->value.reset();
  311. }
  312. return ExecutionResult::Continue;
  313. }
  314. ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  315. {
  316. if (input.match_index >= state.named_capture_group_matches.size()) {
  317. state.named_capture_group_matches.ensure_capacity(input.match_index);
  318. auto capacity = state.named_capture_group_matches.capacity();
  319. for (size_t i = state.named_capture_group_matches.size(); i <= capacity; ++i)
  320. state.named_capture_group_matches.empend();
  321. }
  322. state.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position;
  323. return ExecutionResult::Continue;
  324. }
  325. ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  326. {
  327. StringView capture_group_name = name();
  328. if (state.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) {
  329. auto start_position = state.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column;
  330. auto length = state.string_position - start_position;
  331. auto& map = state.named_capture_group_matches.at(input.match_index);
  332. if constexpr (REGEX_DEBUG) {
  333. VERIFY(start_position + length <= input.view.length());
  334. dbgln("Save named capture group with name={} and content='{}'", capture_group_name, input.view.substring_view(start_position, length));
  335. }
  336. VERIFY(start_position + length <= input.view.length());
  337. auto view = input.view.substring_view(start_position, length);
  338. if (input.regex_options & AllFlags::StringCopyMatches) {
  339. map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string
  340. } else {
  341. map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string
  342. }
  343. } else {
  344. warnln("Didn't find corresponding capture group match for name={}, match_index={}", capture_group_name.to_string(), input.match_index);
  345. }
  346. return ExecutionResult::Continue;
  347. }
  348. ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  349. {
  350. bool inverse { false };
  351. bool temporary_inverse { false };
  352. bool reset_temp_inverse { false };
  353. auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };
  354. size_t string_position = state.string_position;
  355. bool inverse_matched { false };
  356. bool had_zero_length_match { false };
  357. state.string_position_before_match = state.string_position;
  358. size_t offset { state.instruction_position + 3 };
  359. for (size_t i = 0; i < arguments_count(); ++i) {
  360. if (state.string_position > string_position)
  361. break;
  362. if (reset_temp_inverse) {
  363. reset_temp_inverse = false;
  364. temporary_inverse = false;
  365. } else {
  366. reset_temp_inverse = true;
  367. }
  368. auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
  369. if (compare_type == CharacterCompareType::Inverse)
  370. inverse = true;
  371. else if (compare_type == CharacterCompareType::TemporaryInverse) {
  372. // If "TemporaryInverse" is given, negate the current inversion state only for the next opcode.
  373. // it follows that this cannot be the last compare element.
  374. VERIFY(i != arguments_count() - 1);
  375. temporary_inverse = true;
  376. reset_temp_inverse = false;
  377. } else if (compare_type == CharacterCompareType::Char) {
  378. u32 ch = m_bytecode->at(offset++);
  379. // We want to compare a string that is longer or equal in length to the available string
  380. if (input.view.length() <= state.string_position)
  381. return ExecutionResult::Failed_ExecuteLowPrioForks;
  382. compare_char(input, state, ch, current_inversion_state(), inverse_matched);
  383. } else if (compare_type == CharacterCompareType::AnyChar) {
  384. // We want to compare a string that is definitely longer than the available string
  385. if (input.view.length() <= state.string_position)
  386. return ExecutionResult::Failed_ExecuteLowPrioForks;
  387. VERIFY(!current_inversion_state());
  388. ++state.string_position;
  389. } else if (compare_type == CharacterCompareType::String) {
  390. VERIFY(!current_inversion_state());
  391. auto const& length = m_bytecode->at(offset++);
  392. // We want to compare a string that is definitely longer than the available string
  393. if (input.view.length() < state.string_position + length)
  394. return ExecutionResult::Failed_ExecuteLowPrioForks;
  395. Optional<String> str;
  396. Vector<u16> utf16;
  397. Vector<u32> data;
  398. data.ensure_capacity(length);
  399. for (size_t i = offset; i < offset + length; ++i)
  400. data.unchecked_append(m_bytecode->at(i));
  401. auto view = input.view.construct_as_same(data, str, utf16);
  402. offset += length;
  403. if (!compare_string(input, state, view, had_zero_length_match))
  404. return ExecutionResult::Failed_ExecuteLowPrioForks;
  405. } else if (compare_type == CharacterCompareType::CharClass) {
  406. if (input.view.length() <= state.string_position)
  407. return ExecutionResult::Failed_ExecuteLowPrioForks;
  408. auto character_class = (CharClass)m_bytecode->at(offset++);
  409. auto ch = input.view[state.string_position];
  410. compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
  411. } else if (compare_type == CharacterCompareType::CharRange) {
  412. if (input.view.length() <= state.string_position)
  413. return ExecutionResult::Failed_ExecuteLowPrioForks;
  414. auto value = (CharRange)m_bytecode->at(offset++);
  415. auto from = value.from;
  416. auto to = value.to;
  417. auto ch = input.view[state.string_position];
  418. compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched);
  419. } else if (compare_type == CharacterCompareType::Reference) {
  420. auto reference_number = (size_t)m_bytecode->at(offset++);
  421. auto& groups = state.capture_group_matches.at(input.match_index);
  422. if (groups.size() <= reference_number)
  423. return ExecutionResult::Failed_ExecuteLowPrioForks;
  424. auto str = groups.at(reference_number).view;
  425. // We want to compare a string that is definitely longer than the available string
  426. if (input.view.length() < state.string_position + str.length())
  427. return ExecutionResult::Failed_ExecuteLowPrioForks;
  428. if (!compare_string(input, state, str, had_zero_length_match))
  429. return ExecutionResult::Failed_ExecuteLowPrioForks;
  430. } else if (compare_type == CharacterCompareType::NamedReference) {
  431. auto ptr = (char const*)m_bytecode->at(offset++);
  432. auto length = (size_t)m_bytecode->at(offset++);
  433. StringView name { ptr, length };
  434. auto group = state.named_capture_group_matches.at(input.match_index).get(name);
  435. if (!group.has_value())
  436. return ExecutionResult::Failed_ExecuteLowPrioForks;
  437. auto str = group.value().view;
  438. // We want to compare a string that is definitely longer than the available string
  439. if (input.view.length() < state.string_position + str.length())
  440. return ExecutionResult::Failed_ExecuteLowPrioForks;
  441. if (!compare_string(input, state, str, had_zero_length_match))
  442. return ExecutionResult::Failed_ExecuteLowPrioForks;
  443. } else if (compare_type == CharacterCompareType::Property) {
  444. auto property = static_cast<Unicode::Property>(m_bytecode->at(offset++));
  445. compare_property(input, state, property, current_inversion_state(), inverse_matched);
  446. } else {
  447. warnln("Undefined comparison: {}", (int)compare_type);
  448. VERIFY_NOT_REACHED();
  449. break;
  450. }
  451. }
  452. if (current_inversion_state() && !inverse_matched)
  453. ++state.string_position;
  454. if ((!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length())
  455. return ExecutionResult::Failed_ExecuteLowPrioForks;
  456. return ExecutionResult::Continue;
  457. }
  458. ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched)
  459. {
  460. if (state.string_position == input.view.length())
  461. return;
  462. auto input_view = input.view.substring_view(state.string_position, 1);
  463. Optional<String> str;
  464. Vector<u16> utf16;
  465. auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str, utf16);
  466. bool equal;
  467. if (input.regex_options & AllFlags::Insensitive)
  468. equal = input_view.equals_ignoring_case(compare_view);
  469. else
  470. equal = input_view.equals(compare_view);
  471. if (equal) {
  472. if (inverse)
  473. inverse_matched = true;
  474. else
  475. ++state.string_position;
  476. }
  477. }
  478. ALWAYS_INLINE bool OpCode_Compare::compare_string(MatchInput const& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match)
  479. {
  480. if (state.string_position + str.length() > input.view.length()) {
  481. if (str.is_empty()) {
  482. had_zero_length_match = true;
  483. return true;
  484. }
  485. return false;
  486. }
  487. if (str.length() == 0) {
  488. had_zero_length_match = true;
  489. return true;
  490. }
  491. auto subject = input.view.substring_view(state.string_position, str.length());
  492. bool equals;
  493. if (input.regex_options & AllFlags::Insensitive)
  494. equals = subject.equals_ignoring_case(str);
  495. else
  496. equals = subject.equals(str);
  497. if (equals)
  498. state.string_position += str.length();
  499. return equals;
  500. }
  501. ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched)
  502. {
  503. switch (character_class) {
  504. case CharClass::Alnum:
  505. if (is_ascii_alphanumeric(ch)) {
  506. if (inverse)
  507. inverse_matched = true;
  508. else
  509. ++state.string_position;
  510. }
  511. break;
  512. case CharClass::Alpha:
  513. if (is_ascii_alpha(ch))
  514. ++state.string_position;
  515. break;
  516. case CharClass::Blank:
  517. if (is_ascii_blank(ch)) {
  518. if (inverse)
  519. inverse_matched = true;
  520. else
  521. ++state.string_position;
  522. }
  523. break;
  524. case CharClass::Cntrl:
  525. if (is_ascii_control(ch)) {
  526. if (inverse)
  527. inverse_matched = true;
  528. else
  529. ++state.string_position;
  530. }
  531. break;
  532. case CharClass::Digit:
  533. if (is_ascii_digit(ch)) {
  534. if (inverse)
  535. inverse_matched = true;
  536. else
  537. ++state.string_position;
  538. }
  539. break;
  540. case CharClass::Graph:
  541. if (is_ascii_graphical(ch)) {
  542. if (inverse)
  543. inverse_matched = true;
  544. else
  545. ++state.string_position;
  546. }
  547. break;
  548. case CharClass::Lower:
  549. if (is_ascii_lower_alpha(ch) || ((input.regex_options & AllFlags::Insensitive) && is_ascii_upper_alpha(ch))) {
  550. if (inverse)
  551. inverse_matched = true;
  552. else
  553. ++state.string_position;
  554. }
  555. break;
  556. case CharClass::Print:
  557. if (is_ascii_printable(ch)) {
  558. if (inverse)
  559. inverse_matched = true;
  560. else
  561. ++state.string_position;
  562. }
  563. break;
  564. case CharClass::Punct:
  565. if (is_ascii_punctuation(ch)) {
  566. if (inverse)
  567. inverse_matched = true;
  568. else
  569. ++state.string_position;
  570. }
  571. break;
  572. case CharClass::Space:
  573. if (is_ascii_space(ch)) {
  574. if (inverse)
  575. inverse_matched = true;
  576. else
  577. ++state.string_position;
  578. }
  579. break;
  580. case CharClass::Upper:
  581. if (is_ascii_upper_alpha(ch) || ((input.regex_options & AllFlags::Insensitive) && is_ascii_lower_alpha(ch))) {
  582. if (inverse)
  583. inverse_matched = true;
  584. else
  585. ++state.string_position;
  586. }
  587. break;
  588. case CharClass::Word:
  589. if (is_ascii_alphanumeric(ch) || ch == '_') {
  590. if (inverse)
  591. inverse_matched = true;
  592. else
  593. ++state.string_position;
  594. }
  595. break;
  596. case CharClass::Xdigit:
  597. if (is_ascii_hex_digit(ch)) {
  598. if (inverse)
  599. inverse_matched = true;
  600. else
  601. ++state.string_position;
  602. }
  603. break;
  604. }
  605. }
  606. ALWAYS_INLINE void OpCode_Compare::compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched)
  607. {
  608. if (input.regex_options & AllFlags::Insensitive) {
  609. from = to_ascii_lowercase(from);
  610. to = to_ascii_lowercase(to);
  611. ch = to_ascii_lowercase(ch);
  612. }
  613. if (ch >= from && ch <= to) {
  614. if (inverse)
  615. inverse_matched = true;
  616. else
  617. ++state.string_position;
  618. }
  619. }
  620. ALWAYS_INLINE void OpCode_Compare::compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched)
  621. {
  622. if (state.string_position == input.view.length())
  623. return;
  624. u32 code_point = input.view[state.string_position];
  625. bool equal = Unicode::code_point_has_property(code_point, property);
  626. if (equal) {
  627. if (inverse)
  628. inverse_matched = true;
  629. else
  630. ++state.string_position;
  631. }
  632. }
  633. String const OpCode_Compare::arguments_string() const
  634. {
  635. return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
  636. }
  637. Vector<String> const OpCode_Compare::variable_arguments_to_string(Optional<MatchInput> input) const
  638. {
  639. Vector<String> result;
  640. size_t offset { state().instruction_position + 3 };
  641. RegexStringView view = ((input.has_value()) ? input.value().view : nullptr);
  642. for (size_t i = 0; i < arguments_count(); ++i) {
  643. auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
  644. result.empend(String::formatted("type={} [{}]", (size_t)compare_type, character_compare_type_name(compare_type)));
  645. auto string_start_offset = state().string_position_before_match;
  646. if (compare_type == CharacterCompareType::Char) {
  647. auto ch = m_bytecode->at(offset++);
  648. auto is_ascii = is_ascii_printable(ch);
  649. if (is_ascii)
  650. result.empend(String::formatted("value='{:c}'", static_cast<char>(ch)));
  651. else
  652. result.empend(String::formatted("value={:x}", ch));
  653. if (!view.is_null() && view.length() > string_start_offset) {
  654. if (is_ascii) {
  655. result.empend(String::formatted(
  656. "compare against: '{}'",
  657. view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_string()));
  658. } else {
  659. auto str = view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_string();
  660. u8 buf[8] { 0 };
  661. __builtin_memcpy(buf, str.characters(), min(str.length(), sizeof(buf)));
  662. result.empend(String::formatted("compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}",
  663. buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]));
  664. }
  665. }
  666. } else if (compare_type == CharacterCompareType::NamedReference) {
  667. auto ptr = (char const*)m_bytecode->at(offset++);
  668. auto length = m_bytecode->at(offset++);
  669. result.empend(String::formatted("name='{}'", StringView { ptr, (size_t)length }));
  670. } else if (compare_type == CharacterCompareType::Reference) {
  671. auto ref = m_bytecode->at(offset++);
  672. result.empend(String::formatted("number={}", ref));
  673. } else if (compare_type == CharacterCompareType::String) {
  674. auto& length = m_bytecode->at(offset++);
  675. StringBuilder str_builder;
  676. for (size_t i = 0; i < length; ++i)
  677. str_builder.append(m_bytecode->at(offset++));
  678. result.empend(String::formatted("value=\"{}\"", str_builder.string_view().substring_view(0, length)));
  679. if (!view.is_null() && view.length() > state().string_position)
  680. result.empend(String::formatted(
  681. "compare against: \"{}\"",
  682. input.value().view.substring_view(string_start_offset, string_start_offset + length > view.length() ? 0 : length).to_string()));
  683. } else if (compare_type == CharacterCompareType::CharClass) {
  684. auto character_class = (CharClass)m_bytecode->at(offset++);
  685. result.empend(String::formatted("ch_class={} [{}]", (size_t)character_class, character_class_name(character_class)));
  686. if (!view.is_null() && view.length() > state().string_position)
  687. result.empend(String::formatted(
  688. "compare against: '{}'",
  689. input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  690. } else if (compare_type == CharacterCompareType::CharRange) {
  691. auto value = (CharRange)m_bytecode->at(offset++);
  692. result.empend(String::formatted("ch_range='{:c}'-'{:c}'", value.from, value.to));
  693. if (!view.is_null() && view.length() > state().string_position)
  694. result.empend(String::formatted(
  695. "compare against: '{}'",
  696. input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  697. }
  698. }
  699. return result;
  700. }
  701. }