RegexByteCode.cpp 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "RegexByteCode.h"
  7. #include "AK/StringBuilder.h"
  8. #include "RegexDebug.h"
  9. #include <AK/CharacterTypes.h>
  10. #include <AK/Debug.h>
  11. namespace regex {
  12. char const* OpCode::name(OpCodeId opcode_id)
  13. {
  14. switch (opcode_id) {
  15. #define __ENUMERATE_OPCODE(x) \
  16. case OpCodeId::x: \
  17. return #x;
  18. ENUMERATE_OPCODES
  19. #undef __ENUMERATE_OPCODE
  20. default:
  21. VERIFY_NOT_REACHED();
  22. return "<Unknown>";
  23. }
  24. }
  25. char const* OpCode::name() const
  26. {
  27. return name(opcode_id());
  28. }
  29. char const* execution_result_name(ExecutionResult result)
  30. {
  31. switch (result) {
  32. #define __ENUMERATE_EXECUTION_RESULT(x) \
  33. case ExecutionResult::x: \
  34. return #x;
  35. ENUMERATE_EXECUTION_RESULTS
  36. #undef __ENUMERATE_EXECUTION_RESULT
  37. default:
  38. VERIFY_NOT_REACHED();
  39. return "<Unknown>";
  40. }
  41. }
  42. char const* boundary_check_type_name(BoundaryCheckType ty)
  43. {
  44. switch (ty) {
  45. #define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) \
  46. case BoundaryCheckType::x: \
  47. return #x;
  48. ENUMERATE_BOUNDARY_CHECK_TYPES
  49. #undef __ENUMERATE_BOUNDARY_CHECK_TYPE
  50. default:
  51. VERIFY_NOT_REACHED();
  52. return "<Unknown>";
  53. }
  54. }
  55. char const* character_compare_type_name(CharacterCompareType ch_compare_type)
  56. {
  57. switch (ch_compare_type) {
  58. #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) \
  59. case CharacterCompareType::x: \
  60. return #x;
  61. ENUMERATE_CHARACTER_COMPARE_TYPES
  62. #undef __ENUMERATE_CHARACTER_COMPARE_TYPE
  63. default:
  64. VERIFY_NOT_REACHED();
  65. return "<Unknown>";
  66. }
  67. }
  68. static char const* character_class_name(CharClass ch_class)
  69. {
  70. switch (ch_class) {
  71. #define __ENUMERATE_CHARACTER_CLASS(x) \
  72. case CharClass::x: \
  73. return #x;
  74. ENUMERATE_CHARACTER_CLASSES
  75. #undef __ENUMERATE_CHARACTER_CLASS
  76. default:
  77. VERIFY_NOT_REACHED();
  78. return "<Unknown>";
  79. }
  80. }
  81. OwnPtr<OpCode> ByteCode::s_opcodes[(size_t)OpCodeId::Last + 1];
  82. bool ByteCode::s_opcodes_initialized { false };
  83. void ByteCode::ensure_opcodes_initialized()
  84. {
  85. if (s_opcodes_initialized)
  86. return;
  87. for (u32 i = (u32)OpCodeId::First; i <= (u32)OpCodeId::Last; ++i) {
  88. switch ((OpCodeId)i) {
  89. case OpCodeId::Exit:
  90. s_opcodes[i] = make<OpCode_Exit>();
  91. break;
  92. case OpCodeId::Jump:
  93. s_opcodes[i] = make<OpCode_Jump>();
  94. break;
  95. case OpCodeId::Compare:
  96. s_opcodes[i] = make<OpCode_Compare>();
  97. break;
  98. case OpCodeId::CheckEnd:
  99. s_opcodes[i] = make<OpCode_CheckEnd>();
  100. break;
  101. case OpCodeId::CheckBoundary:
  102. s_opcodes[i] = make<OpCode_CheckBoundary>();
  103. break;
  104. case OpCodeId::ForkJump:
  105. s_opcodes[i] = make<OpCode_ForkJump>();
  106. break;
  107. case OpCodeId::ForkStay:
  108. s_opcodes[i] = make<OpCode_ForkStay>();
  109. break;
  110. case OpCodeId::FailForks:
  111. s_opcodes[i] = make<OpCode_FailForks>();
  112. break;
  113. case OpCodeId::Save:
  114. s_opcodes[i] = make<OpCode_Save>();
  115. break;
  116. case OpCodeId::Restore:
  117. s_opcodes[i] = make<OpCode_Restore>();
  118. break;
  119. case OpCodeId::GoBack:
  120. s_opcodes[i] = make<OpCode_GoBack>();
  121. break;
  122. case OpCodeId::CheckBegin:
  123. s_opcodes[i] = make<OpCode_CheckBegin>();
  124. break;
  125. case OpCodeId::ClearCaptureGroup:
  126. s_opcodes[i] = make<OpCode_ClearCaptureGroup>();
  127. break;
  128. case OpCodeId::ClearNamedCaptureGroup:
  129. s_opcodes[i] = make<OpCode_ClearNamedCaptureGroup>();
  130. break;
  131. case OpCodeId::SaveLeftCaptureGroup:
  132. s_opcodes[i] = make<OpCode_SaveLeftCaptureGroup>();
  133. break;
  134. case OpCodeId::SaveRightCaptureGroup:
  135. s_opcodes[i] = make<OpCode_SaveRightCaptureGroup>();
  136. break;
  137. case OpCodeId::SaveLeftNamedCaptureGroup:
  138. s_opcodes[i] = make<OpCode_SaveLeftNamedCaptureGroup>();
  139. break;
  140. case OpCodeId::SaveRightNamedCaptureGroup:
  141. s_opcodes[i] = make<OpCode_SaveRightNamedCaptureGroup>();
  142. break;
  143. }
  144. }
  145. s_opcodes_initialized = true;
  146. }
  147. ALWAYS_INLINE OpCode& ByteCode::get_opcode_by_id(OpCodeId id) const
  148. {
  149. VERIFY(id >= OpCodeId::First && id <= OpCodeId::Last);
  150. auto& opcode = s_opcodes[(u32)id];
  151. opcode->set_bytecode(*const_cast<ByteCode*>(this));
  152. return *opcode;
  153. }
  154. OpCode& ByteCode::get_opcode(MatchState& state) const
  155. {
  156. OpCodeId opcode_id;
  157. if (state.instruction_position >= size())
  158. opcode_id = OpCodeId::Exit;
  159. else
  160. opcode_id = (OpCodeId)at(state.instruction_position);
  161. auto& opcode = get_opcode_by_id(opcode_id);
  162. opcode.set_state(state);
  163. return opcode;
  164. }
  165. ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  166. {
  167. if (state.string_position > input.view.length() || state.instruction_position >= m_bytecode->size())
  168. return ExecutionResult::Succeeded;
  169. return ExecutionResult::Failed;
  170. }
  171. ALWAYS_INLINE ExecutionResult OpCode_Save::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  172. {
  173. input.saved_positions.append(state.string_position);
  174. return ExecutionResult::Continue;
  175. }
  176. ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  177. {
  178. if (input.saved_positions.is_empty())
  179. return ExecutionResult::Failed;
  180. state.string_position = input.saved_positions.take_last();
  181. return ExecutionResult::Continue;
  182. }
  183. ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(MatchInput const&, MatchState& state, MatchOutput&) const
  184. {
  185. if (count() > state.string_position)
  186. return ExecutionResult::Failed_ExecuteLowPrioForks;
  187. state.string_position -= count();
  188. return ExecutionResult::Continue;
  189. }
  190. ALWAYS_INLINE ExecutionResult OpCode_FailForks::execute(MatchInput const& input, MatchState&, MatchOutput&) const
  191. {
  192. VERIFY(count() > 0);
  193. input.fail_counter += count() - 1;
  194. return ExecutionResult::Failed_ExecuteLowPrioForks;
  195. }
  196. ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(MatchInput const&, MatchState& state, MatchOutput&) const
  197. {
  198. state.instruction_position += offset();
  199. return ExecutionResult::Continue;
  200. }
  201. ALWAYS_INLINE ExecutionResult OpCode_ForkJump::execute(MatchInput const&, MatchState& state, MatchOutput&) const
  202. {
  203. state.fork_at_position = state.instruction_position + size() + offset();
  204. return ExecutionResult::Fork_PrioHigh;
  205. }
  206. ALWAYS_INLINE ExecutionResult OpCode_ForkStay::execute(MatchInput const&, MatchState& state, MatchOutput&) const
  207. {
  208. state.fork_at_position = state.instruction_position + size() + offset();
  209. return ExecutionResult::Fork_PrioLow;
  210. }
  211. ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  212. {
  213. if (0 == state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
  214. return ExecutionResult::Failed_ExecuteLowPrioForks;
  215. if ((0 == state.string_position && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
  216. || (0 != state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
  217. || (0 == state.string_position && (input.regex_options & AllFlags::Global)))
  218. return ExecutionResult::Continue;
  219. return ExecutionResult::Failed_ExecuteLowPrioForks;
  220. }
  221. ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  222. {
  223. auto isword = [](auto ch) { return is_ascii_alphanumeric(ch) || ch == '_'; };
  224. auto is_word_boundary = [&] {
  225. if (state.string_position == input.view.length()) {
  226. if (state.string_position > 0 && isword(input.view[state.string_position - 1]))
  227. return true;
  228. return false;
  229. }
  230. if (state.string_position == 0) {
  231. if (isword(input.view[0]))
  232. return true;
  233. return false;
  234. }
  235. return !!(isword(input.view[state.string_position]) ^ isword(input.view[state.string_position - 1]));
  236. };
  237. switch (type()) {
  238. case BoundaryCheckType::Word: {
  239. if (is_word_boundary())
  240. return ExecutionResult::Continue;
  241. return ExecutionResult::Failed_ExecuteLowPrioForks;
  242. }
  243. case BoundaryCheckType::NonWord: {
  244. if (!is_word_boundary())
  245. return ExecutionResult::Continue;
  246. return ExecutionResult::Failed_ExecuteLowPrioForks;
  247. }
  248. }
  249. VERIFY_NOT_REACHED();
  250. }
  251. ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  252. {
  253. if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine))
  254. return ExecutionResult::Failed_ExecuteLowPrioForks;
  255. if ((state.string_position == input.view.length() && !(input.regex_options & AllFlags::MatchNotEndOfLine))
  256. || (state.string_position != input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
  257. return ExecutionResult::Continue;
  258. return ExecutionResult::Failed_ExecuteLowPrioForks;
  259. }
  260. ALWAYS_INLINE ExecutionResult OpCode_ClearCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  261. {
  262. if (input.match_index < state.capture_group_matches.size()) {
  263. auto& group = state.capture_group_matches[input.match_index];
  264. if (id() < group.size())
  265. group[id()] = {};
  266. }
  267. return ExecutionResult::Continue;
  268. }
  269. ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  270. {
  271. if (input.match_index >= state.capture_group_matches.size()) {
  272. state.capture_group_matches.ensure_capacity(input.match_index);
  273. auto capacity = state.capture_group_matches.capacity();
  274. for (size_t i = state.capture_group_matches.size(); i <= capacity; ++i)
  275. state.capture_group_matches.empend();
  276. }
  277. if (id() >= state.capture_group_matches.at(input.match_index).size()) {
  278. state.capture_group_matches.at(input.match_index).ensure_capacity(id());
  279. auto capacity = state.capture_group_matches.at(input.match_index).capacity();
  280. for (size_t i = state.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i)
  281. state.capture_group_matches.at(input.match_index).empend();
  282. }
  283. state.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position;
  284. return ExecutionResult::Continue;
  285. }
  286. ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  287. {
  288. auto& match = state.capture_group_matches.at(input.match_index).at(id());
  289. auto start_position = match.left_column;
  290. if (state.string_position < start_position)
  291. return ExecutionResult::Failed_ExecuteLowPrioForks;
  292. auto length = state.string_position - start_position;
  293. if (start_position < match.column)
  294. return ExecutionResult::Continue;
  295. VERIFY(start_position + length <= input.view.length());
  296. auto view = input.view.substring_view(start_position, length);
  297. if (input.regex_options & AllFlags::StringCopyMatches) {
  298. match = { view.to_string(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string
  299. } else {
  300. match = { view, input.line, start_position, input.global_offset + start_position }; // take view to original string
  301. }
  302. return ExecutionResult::Continue;
  303. }
  304. ALWAYS_INLINE ExecutionResult OpCode_ClearNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  305. {
  306. if (input.match_index < state.capture_group_matches.size()) {
  307. auto& group = state.named_capture_group_matches[input.match_index];
  308. group.remove(name());
  309. }
  310. return ExecutionResult::Continue;
  311. }
  312. ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  313. {
  314. if (input.match_index >= state.named_capture_group_matches.size()) {
  315. state.named_capture_group_matches.ensure_capacity(input.match_index);
  316. auto capacity = state.named_capture_group_matches.capacity();
  317. for (size_t i = state.named_capture_group_matches.size(); i <= capacity; ++i)
  318. state.named_capture_group_matches.empend();
  319. }
  320. state.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position;
  321. return ExecutionResult::Continue;
  322. }
  323. ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  324. {
  325. StringView capture_group_name = name();
  326. if (state.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) {
  327. auto start_position = state.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column;
  328. auto length = state.string_position - start_position;
  329. auto& map = state.named_capture_group_matches.at(input.match_index);
  330. if constexpr (REGEX_DEBUG) {
  331. VERIFY(start_position + length <= input.view.length());
  332. dbgln("Save named capture group with name={} and content='{}'", capture_group_name, input.view.substring_view(start_position, length));
  333. }
  334. VERIFY(start_position + length <= input.view.length());
  335. auto view = input.view.substring_view(start_position, length);
  336. if (input.regex_options & AllFlags::StringCopyMatches) {
  337. map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string
  338. } else {
  339. map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string
  340. }
  341. } else {
  342. warnln("Didn't find corresponding capture group match for name={}, match_index={}", capture_group_name.to_string(), input.match_index);
  343. }
  344. return ExecutionResult::Continue;
  345. }
  346. ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, MatchState& state, MatchOutput&) const
  347. {
  348. bool inverse { false };
  349. bool temporary_inverse { false };
  350. bool reset_temp_inverse { false };
  351. auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };
  352. size_t string_position = state.string_position;
  353. bool inverse_matched { false };
  354. bool had_zero_length_match { false };
  355. state.string_position_before_match = state.string_position;
  356. size_t offset { state.instruction_position + 3 };
  357. for (size_t i = 0; i < arguments_count(); ++i) {
  358. if (state.string_position > string_position)
  359. break;
  360. if (reset_temp_inverse) {
  361. reset_temp_inverse = false;
  362. temporary_inverse = false;
  363. } else {
  364. reset_temp_inverse = true;
  365. }
  366. auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
  367. if (compare_type == CharacterCompareType::Inverse)
  368. inverse = true;
  369. else if (compare_type == CharacterCompareType::TemporaryInverse) {
  370. // If "TemporaryInverse" is given, negate the current inversion state only for the next opcode.
  371. // it follows that this cannot be the last compare element.
  372. VERIFY(i != arguments_count() - 1);
  373. temporary_inverse = true;
  374. reset_temp_inverse = false;
  375. } else if (compare_type == CharacterCompareType::Char) {
  376. u32 ch = m_bytecode->at(offset++);
  377. // We want to compare a string that is longer or equal in length to the available string
  378. if (input.view.length() <= state.string_position)
  379. return ExecutionResult::Failed_ExecuteLowPrioForks;
  380. compare_char(input, state, ch, current_inversion_state(), inverse_matched);
  381. } else if (compare_type == CharacterCompareType::AnyChar) {
  382. // We want to compare a string that is definitely longer than the available string
  383. if (input.view.length() <= state.string_position)
  384. return ExecutionResult::Failed_ExecuteLowPrioForks;
  385. VERIFY(!current_inversion_state());
  386. ++state.string_position;
  387. } else if (compare_type == CharacterCompareType::String) {
  388. VERIFY(!current_inversion_state());
  389. auto const& length = m_bytecode->at(offset++);
  390. // We want to compare a string that is definitely longer than the available string
  391. if (input.view.length() < state.string_position + length)
  392. return ExecutionResult::Failed_ExecuteLowPrioForks;
  393. Optional<String> str;
  394. Vector<u16> utf16;
  395. Vector<u32> data;
  396. data.ensure_capacity(length);
  397. for (size_t i = offset; i < offset + length; ++i)
  398. data.unchecked_append(m_bytecode->at(i));
  399. auto view = input.view.construct_as_same(data, str, utf16);
  400. offset += length;
  401. if (!compare_string(input, state, view, had_zero_length_match))
  402. return ExecutionResult::Failed_ExecuteLowPrioForks;
  403. } else if (compare_type == CharacterCompareType::CharClass) {
  404. if (input.view.length() <= state.string_position)
  405. return ExecutionResult::Failed_ExecuteLowPrioForks;
  406. auto character_class = (CharClass)m_bytecode->at(offset++);
  407. auto ch = input.view[state.string_position];
  408. compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
  409. } else if (compare_type == CharacterCompareType::CharRange) {
  410. if (input.view.length() <= state.string_position)
  411. return ExecutionResult::Failed_ExecuteLowPrioForks;
  412. auto value = (CharRange)m_bytecode->at(offset++);
  413. auto from = value.from;
  414. auto to = value.to;
  415. auto ch = input.view[state.string_position];
  416. compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched);
  417. } else if (compare_type == CharacterCompareType::Reference) {
  418. auto reference_number = (size_t)m_bytecode->at(offset++);
  419. auto& groups = state.capture_group_matches.at(input.match_index);
  420. if (groups.size() <= reference_number)
  421. return ExecutionResult::Failed_ExecuteLowPrioForks;
  422. auto str = groups.at(reference_number).view;
  423. // We want to compare a string that is definitely longer than the available string
  424. if (input.view.length() < state.string_position + str.length())
  425. return ExecutionResult::Failed_ExecuteLowPrioForks;
  426. if (!compare_string(input, state, str, had_zero_length_match))
  427. return ExecutionResult::Failed_ExecuteLowPrioForks;
  428. } else if (compare_type == CharacterCompareType::NamedReference) {
  429. auto ptr = (char const*)m_bytecode->at(offset++);
  430. auto length = (size_t)m_bytecode->at(offset++);
  431. StringView name { ptr, length };
  432. auto group = state.named_capture_group_matches.at(input.match_index).get(name);
  433. if (!group.has_value())
  434. return ExecutionResult::Failed_ExecuteLowPrioForks;
  435. auto str = group.value().view;
  436. // We want to compare a string that is definitely longer than the available string
  437. if (input.view.length() < state.string_position + str.length())
  438. return ExecutionResult::Failed_ExecuteLowPrioForks;
  439. if (!compare_string(input, state, str, had_zero_length_match))
  440. return ExecutionResult::Failed_ExecuteLowPrioForks;
  441. } else {
  442. warnln("Undefined comparison: {}", (int)compare_type);
  443. VERIFY_NOT_REACHED();
  444. break;
  445. }
  446. }
  447. if (current_inversion_state() && !inverse_matched)
  448. ++state.string_position;
  449. if ((!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length())
  450. return ExecutionResult::Failed_ExecuteLowPrioForks;
  451. return ExecutionResult::Continue;
  452. }
  453. ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched)
  454. {
  455. if (state.string_position == input.view.length())
  456. return;
  457. auto input_view = input.view.substring_view(state.string_position, 1);
  458. Optional<String> str;
  459. Vector<u16> utf16;
  460. auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str, utf16);
  461. bool equal;
  462. if (input.regex_options & AllFlags::Insensitive)
  463. equal = input_view.equals_ignoring_case(compare_view);
  464. else
  465. equal = input_view.equals(compare_view);
  466. if (equal) {
  467. if (inverse)
  468. inverse_matched = true;
  469. else
  470. ++state.string_position;
  471. }
  472. }
  473. ALWAYS_INLINE bool OpCode_Compare::compare_string(MatchInput const& input, MatchState& state, RegexStringView const& str, bool& had_zero_length_match)
  474. {
  475. if (state.string_position + str.length() > input.view.length()) {
  476. if (str.is_empty()) {
  477. had_zero_length_match = true;
  478. return true;
  479. }
  480. return false;
  481. }
  482. if (str.length() == 0) {
  483. had_zero_length_match = true;
  484. return true;
  485. }
  486. auto subject = input.view.substring_view(state.string_position, str.length());
  487. bool equals;
  488. if (input.regex_options & AllFlags::Insensitive)
  489. equals = subject.equals_ignoring_case(str);
  490. else
  491. equals = subject.equals(str);
  492. if (equals)
  493. state.string_position += str.length();
  494. return equals;
  495. }
  496. ALWAYS_INLINE void OpCode_Compare::compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched)
  497. {
  498. switch (character_class) {
  499. case CharClass::Alnum:
  500. if (is_ascii_alphanumeric(ch)) {
  501. if (inverse)
  502. inverse_matched = true;
  503. else
  504. ++state.string_position;
  505. }
  506. break;
  507. case CharClass::Alpha:
  508. if (is_ascii_alpha(ch))
  509. ++state.string_position;
  510. break;
  511. case CharClass::Blank:
  512. if (is_ascii_blank(ch)) {
  513. if (inverse)
  514. inverse_matched = true;
  515. else
  516. ++state.string_position;
  517. }
  518. break;
  519. case CharClass::Cntrl:
  520. if (is_ascii_control(ch)) {
  521. if (inverse)
  522. inverse_matched = true;
  523. else
  524. ++state.string_position;
  525. }
  526. break;
  527. case CharClass::Digit:
  528. if (is_ascii_digit(ch)) {
  529. if (inverse)
  530. inverse_matched = true;
  531. else
  532. ++state.string_position;
  533. }
  534. break;
  535. case CharClass::Graph:
  536. if (is_ascii_graphical(ch)) {
  537. if (inverse)
  538. inverse_matched = true;
  539. else
  540. ++state.string_position;
  541. }
  542. break;
  543. case CharClass::Lower:
  544. if (is_ascii_lower_alpha(ch) || ((input.regex_options & AllFlags::Insensitive) && is_ascii_upper_alpha(ch))) {
  545. if (inverse)
  546. inverse_matched = true;
  547. else
  548. ++state.string_position;
  549. }
  550. break;
  551. case CharClass::Print:
  552. if (is_ascii_printable(ch)) {
  553. if (inverse)
  554. inverse_matched = true;
  555. else
  556. ++state.string_position;
  557. }
  558. break;
  559. case CharClass::Punct:
  560. if (is_ascii_punctuation(ch)) {
  561. if (inverse)
  562. inverse_matched = true;
  563. else
  564. ++state.string_position;
  565. }
  566. break;
  567. case CharClass::Space:
  568. if (is_ascii_space(ch)) {
  569. if (inverse)
  570. inverse_matched = true;
  571. else
  572. ++state.string_position;
  573. }
  574. break;
  575. case CharClass::Upper:
  576. if (is_ascii_upper_alpha(ch) || ((input.regex_options & AllFlags::Insensitive) && is_ascii_lower_alpha(ch))) {
  577. if (inverse)
  578. inverse_matched = true;
  579. else
  580. ++state.string_position;
  581. }
  582. break;
  583. case CharClass::Word:
  584. if (is_ascii_alphanumeric(ch) || ch == '_') {
  585. if (inverse)
  586. inverse_matched = true;
  587. else
  588. ++state.string_position;
  589. }
  590. break;
  591. case CharClass::Xdigit:
  592. if (is_ascii_hex_digit(ch)) {
  593. if (inverse)
  594. inverse_matched = true;
  595. else
  596. ++state.string_position;
  597. }
  598. break;
  599. }
  600. }
  601. ALWAYS_INLINE void OpCode_Compare::compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched)
  602. {
  603. if (input.regex_options & AllFlags::Insensitive) {
  604. from = to_ascii_lowercase(from);
  605. to = to_ascii_lowercase(to);
  606. ch = to_ascii_lowercase(ch);
  607. }
  608. if (ch >= from && ch <= to) {
  609. if (inverse)
  610. inverse_matched = true;
  611. else
  612. ++state.string_position;
  613. }
  614. }
  615. String const OpCode_Compare::arguments_string() const
  616. {
  617. return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
  618. }
  619. Vector<String> const OpCode_Compare::variable_arguments_to_string(Optional<MatchInput> input) const
  620. {
  621. Vector<String> result;
  622. size_t offset { state().instruction_position + 3 };
  623. RegexStringView view = ((input.has_value()) ? input.value().view : nullptr);
  624. for (size_t i = 0; i < arguments_count(); ++i) {
  625. auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
  626. result.empend(String::formatted("type={} [{}]", (size_t)compare_type, character_compare_type_name(compare_type)));
  627. auto string_start_offset = state().string_position_before_match;
  628. if (compare_type == CharacterCompareType::Char) {
  629. auto ch = m_bytecode->at(offset++);
  630. auto is_ascii = is_ascii_printable(ch);
  631. if (is_ascii)
  632. result.empend(String::formatted("value='{:c}'", static_cast<char>(ch)));
  633. else
  634. result.empend(String::formatted("value={:x}", ch));
  635. if (!view.is_null() && view.length() > string_start_offset) {
  636. if (is_ascii) {
  637. result.empend(String::formatted(
  638. "compare against: '{}'",
  639. view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_string()));
  640. } else {
  641. auto str = view.substring_view(string_start_offset, string_start_offset > view.length() ? 0 : 1).to_string();
  642. u8 buf[8] { 0 };
  643. __builtin_memcpy(buf, str.characters(), min(str.length(), sizeof(buf)));
  644. result.empend(String::formatted("compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}",
  645. buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]));
  646. }
  647. }
  648. } else if (compare_type == CharacterCompareType::NamedReference) {
  649. auto ptr = (char const*)m_bytecode->at(offset++);
  650. auto length = m_bytecode->at(offset++);
  651. result.empend(String::formatted("name='{}'", StringView { ptr, (size_t)length }));
  652. } else if (compare_type == CharacterCompareType::Reference) {
  653. auto ref = m_bytecode->at(offset++);
  654. result.empend(String::formatted("number={}", ref));
  655. } else if (compare_type == CharacterCompareType::String) {
  656. auto& length = m_bytecode->at(offset++);
  657. StringBuilder str_builder;
  658. for (size_t i = 0; i < length; ++i)
  659. str_builder.append(m_bytecode->at(offset++));
  660. result.empend(String::formatted("value=\"{}\"", str_builder.string_view().substring_view(0, length)));
  661. if (!view.is_null() && view.length() > state().string_position)
  662. result.empend(String::formatted(
  663. "compare against: \"{}\"",
  664. input.value().view.substring_view(string_start_offset, string_start_offset + length > view.length() ? 0 : length).to_string()));
  665. } else if (compare_type == CharacterCompareType::CharClass) {
  666. auto character_class = (CharClass)m_bytecode->at(offset++);
  667. result.empend(String::formatted("ch_class={} [{}]", (size_t)character_class, character_class_name(character_class)));
  668. if (!view.is_null() && view.length() > state().string_position)
  669. result.empend(String::formatted(
  670. "compare against: '{}'",
  671. input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  672. } else if (compare_type == CharacterCompareType::CharRange) {
  673. auto value = (CharRange)m_bytecode->at(offset++);
  674. result.empend(String::formatted("ch_range='{:c}'-'{:c}'", value.from, value.to));
  675. if (!view.is_null() && view.length() > state().string_position)
  676. result.empend(String::formatted(
  677. "compare against: '{}'",
  678. input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  679. }
  680. }
  681. return result;
  682. }
  683. }