RegexByteCode.cpp 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "RegexByteCode.h"
  7. #include "AK/StringBuilder.h"
  8. #include "RegexDebug.h"
  9. #include <AK/CharacterTypes.h>
  10. #include <AK/Debug.h>
  11. namespace regex {
  12. const char* OpCode::name(OpCodeId opcode_id)
  13. {
  14. switch (opcode_id) {
  15. #define __ENUMERATE_OPCODE(x) \
  16. case OpCodeId::x: \
  17. return #x;
  18. ENUMERATE_OPCODES
  19. #undef __ENUMERATE_OPCODE
  20. default:
  21. VERIFY_NOT_REACHED();
  22. return "<Unknown>";
  23. }
  24. }
  25. const char* OpCode::name() const
  26. {
  27. return name(opcode_id());
  28. }
  29. const char* execution_result_name(ExecutionResult result)
  30. {
  31. switch (result) {
  32. #define __ENUMERATE_EXECUTION_RESULT(x) \
  33. case ExecutionResult::x: \
  34. return #x;
  35. ENUMERATE_EXECUTION_RESULTS
  36. #undef __ENUMERATE_EXECUTION_RESULT
  37. default:
  38. VERIFY_NOT_REACHED();
  39. return "<Unknown>";
  40. }
  41. }
  42. const char* boundary_check_type_name(BoundaryCheckType ty)
  43. {
  44. switch (ty) {
  45. #define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) \
  46. case BoundaryCheckType::x: \
  47. return #x;
  48. ENUMERATE_BOUNDARY_CHECK_TYPES
  49. #undef __ENUMERATE_BOUNDARY_CHECK_TYPE
  50. default:
  51. VERIFY_NOT_REACHED();
  52. return "<Unknown>";
  53. }
  54. }
  55. const char* character_compare_type_name(CharacterCompareType ch_compare_type)
  56. {
  57. switch (ch_compare_type) {
  58. #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) \
  59. case CharacterCompareType::x: \
  60. return #x;
  61. ENUMERATE_CHARACTER_COMPARE_TYPES
  62. #undef __ENUMERATE_CHARACTER_COMPARE_TYPE
  63. default:
  64. VERIFY_NOT_REACHED();
  65. return "<Unknown>";
  66. }
  67. }
  68. static const char* character_class_name(CharClass ch_class)
  69. {
  70. switch (ch_class) {
  71. #define __ENUMERATE_CHARACTER_CLASS(x) \
  72. case CharClass::x: \
  73. return #x;
  74. ENUMERATE_CHARACTER_CLASSES
  75. #undef __ENUMERATE_CHARACTER_CLASS
  76. default:
  77. VERIFY_NOT_REACHED();
  78. return "<Unknown>";
  79. }
  80. }
  81. OwnPtr<OpCode> ByteCode::s_opcodes[(size_t)OpCodeId::Last + 1];
  82. bool ByteCode::s_opcodes_initialized { false };
  83. ALWAYS_INLINE OpCode* ByteCode::get_opcode_by_id(OpCodeId id) const
  84. {
  85. if (!s_opcodes_initialized) {
  86. for (u32 i = (u32)OpCodeId::First; i <= (u32)OpCodeId::Last; ++i) {
  87. switch ((OpCodeId)i) {
  88. case OpCodeId::Exit:
  89. s_opcodes[i] = make<OpCode_Exit>(*const_cast<ByteCode*>(this));
  90. break;
  91. case OpCodeId::Jump:
  92. s_opcodes[i] = make<OpCode_Jump>(*const_cast<ByteCode*>(this));
  93. break;
  94. case OpCodeId::Compare:
  95. s_opcodes[i] = make<OpCode_Compare>(*const_cast<ByteCode*>(this));
  96. break;
  97. case OpCodeId::CheckEnd:
  98. s_opcodes[i] = make<OpCode_CheckEnd>(*const_cast<ByteCode*>(this));
  99. break;
  100. case OpCodeId::CheckBoundary:
  101. s_opcodes[i] = make<OpCode_CheckBoundary>(*const_cast<ByteCode*>(this));
  102. break;
  103. case OpCodeId::ForkJump:
  104. s_opcodes[i] = make<OpCode_ForkJump>(*const_cast<ByteCode*>(this));
  105. break;
  106. case OpCodeId::ForkStay:
  107. s_opcodes[i] = make<OpCode_ForkStay>(*const_cast<ByteCode*>(this));
  108. break;
  109. case OpCodeId::FailForks:
  110. s_opcodes[i] = make<OpCode_FailForks>(*const_cast<ByteCode*>(this));
  111. break;
  112. case OpCodeId::Save:
  113. s_opcodes[i] = make<OpCode_Save>(*const_cast<ByteCode*>(this));
  114. break;
  115. case OpCodeId::Restore:
  116. s_opcodes[i] = make<OpCode_Restore>(*const_cast<ByteCode*>(this));
  117. break;
  118. case OpCodeId::GoBack:
  119. s_opcodes[i] = make<OpCode_GoBack>(*const_cast<ByteCode*>(this));
  120. break;
  121. case OpCodeId::CheckBegin:
  122. s_opcodes[i] = make<OpCode_CheckBegin>(*const_cast<ByteCode*>(this));
  123. break;
  124. case OpCodeId::SaveLeftCaptureGroup:
  125. s_opcodes[i] = make<OpCode_SaveLeftCaptureGroup>(*const_cast<ByteCode*>(this));
  126. break;
  127. case OpCodeId::SaveRightCaptureGroup:
  128. s_opcodes[i] = make<OpCode_SaveRightCaptureGroup>(*const_cast<ByteCode*>(this));
  129. break;
  130. case OpCodeId::SaveLeftNamedCaptureGroup:
  131. s_opcodes[i] = make<OpCode_SaveLeftNamedCaptureGroup>(*const_cast<ByteCode*>(this));
  132. break;
  133. case OpCodeId::SaveRightNamedCaptureGroup:
  134. s_opcodes[i] = make<OpCode_SaveRightNamedCaptureGroup>(*const_cast<ByteCode*>(this));
  135. break;
  136. }
  137. }
  138. s_opcodes_initialized = true;
  139. }
  140. if (id > OpCodeId::Last)
  141. return nullptr;
  142. auto* opcode = &*s_opcodes[(u32)id];
  143. opcode->set_bytecode(*const_cast<ByteCode*>(this));
  144. return opcode;
  145. }
  146. OpCode* ByteCode::get_opcode(MatchState& state) const
  147. {
  148. OpCode* op_code;
  149. if (state.instruction_position >= size()) {
  150. op_code = get_opcode_by_id(OpCodeId::Exit);
  151. } else
  152. op_code = get_opcode_by_id((OpCodeId)at(state.instruction_position));
  153. if (op_code)
  154. op_code->set_state(state);
  155. return op_code;
  156. }
  157. ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  158. {
  159. if (state.string_position > input.view.length() || state.instruction_position >= m_bytecode->size())
  160. return ExecutionResult::Succeeded;
  161. return ExecutionResult::Failed;
  162. }
  163. ALWAYS_INLINE ExecutionResult OpCode_Save::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  164. {
  165. input.saved_positions.append(state.string_position);
  166. return ExecutionResult::Continue;
  167. }
  168. ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  169. {
  170. if (input.saved_positions.is_empty())
  171. return ExecutionResult::Failed;
  172. state.string_position = input.saved_positions.take_last();
  173. return ExecutionResult::Continue;
  174. }
  175. ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(const MatchInput&, MatchState& state, MatchOutput&) const
  176. {
  177. if (count() > state.string_position)
  178. return ExecutionResult::Failed_ExecuteLowPrioForks;
  179. state.string_position -= count();
  180. return ExecutionResult::Continue;
  181. }
  182. ALWAYS_INLINE ExecutionResult OpCode_FailForks::execute(const MatchInput& input, MatchState&, MatchOutput&) const
  183. {
  184. VERIFY(count() > 0);
  185. input.fail_counter += count() - 1;
  186. return ExecutionResult::Failed_ExecuteLowPrioForks;
  187. }
  188. ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
  189. {
  190. state.instruction_position += offset();
  191. return ExecutionResult::Continue;
  192. }
  193. ALWAYS_INLINE ExecutionResult OpCode_ForkJump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
  194. {
  195. state.fork_at_position = state.instruction_position + size() + offset();
  196. return ExecutionResult::Fork_PrioHigh;
  197. }
  198. ALWAYS_INLINE ExecutionResult OpCode_ForkStay::execute(const MatchInput&, MatchState& state, MatchOutput&) const
  199. {
  200. state.fork_at_position = state.instruction_position + size() + offset();
  201. return ExecutionResult::Fork_PrioLow;
  202. }
  203. ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  204. {
  205. if (0 == state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
  206. return ExecutionResult::Failed_ExecuteLowPrioForks;
  207. if ((0 == state.string_position && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
  208. || (0 != state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
  209. || (0 == state.string_position && (input.regex_options & AllFlags::Global)))
  210. return ExecutionResult::Continue;
  211. return ExecutionResult::Failed_ExecuteLowPrioForks;
  212. }
  213. ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  214. {
  215. auto isword = [](auto ch) { return is_ascii_alphanumeric(ch) || ch == '_'; };
  216. auto is_word_boundary = [&] {
  217. if (state.string_position == input.view.length()) {
  218. if (state.string_position > 0 && isword(input.view[state.string_position - 1]))
  219. return true;
  220. return false;
  221. }
  222. if (state.string_position == 0) {
  223. if (isword(input.view[0]))
  224. return true;
  225. return false;
  226. }
  227. return !!(isword(input.view[state.string_position]) ^ isword(input.view[state.string_position - 1]));
  228. };
  229. switch (type()) {
  230. case BoundaryCheckType::Word: {
  231. if (is_word_boundary())
  232. return ExecutionResult::Continue;
  233. return ExecutionResult::Failed_ExecuteLowPrioForks;
  234. }
  235. case BoundaryCheckType::NonWord: {
  236. if (!is_word_boundary())
  237. return ExecutionResult::Continue;
  238. return ExecutionResult::Failed_ExecuteLowPrioForks;
  239. }
  240. }
  241. VERIFY_NOT_REACHED();
  242. }
  243. ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  244. {
  245. if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine))
  246. return ExecutionResult::Failed_ExecuteLowPrioForks;
  247. if ((state.string_position == input.view.length() && !(input.regex_options & AllFlags::MatchNotEndOfLine))
  248. || (state.string_position != input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
  249. return ExecutionResult::Continue;
  250. return ExecutionResult::Failed_ExecuteLowPrioForks;
  251. }
  252. ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  253. {
  254. if (input.match_index >= output.capture_group_matches.size()) {
  255. output.capture_group_matches.ensure_capacity(input.match_index);
  256. auto capacity = output.capture_group_matches.capacity();
  257. for (size_t i = output.capture_group_matches.size(); i <= capacity; ++i)
  258. output.capture_group_matches.empend();
  259. }
  260. if (id() >= output.capture_group_matches.at(input.match_index).size()) {
  261. output.capture_group_matches.at(input.match_index).ensure_capacity(id());
  262. auto capacity = output.capture_group_matches.at(input.match_index).capacity();
  263. for (size_t i = output.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i)
  264. output.capture_group_matches.at(input.match_index).empend();
  265. }
  266. output.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position;
  267. return ExecutionResult::Continue;
  268. }
  269. ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  270. {
  271. auto& match = output.capture_group_matches.at(input.match_index).at(id());
  272. auto start_position = match.left_column;
  273. auto length = state.string_position - start_position;
  274. if (start_position < match.column)
  275. return ExecutionResult::Continue;
  276. VERIFY(start_position + length <= input.view.length());
  277. auto view = input.view.substring_view(start_position, length);
  278. if (input.regex_options & AllFlags::StringCopyMatches) {
  279. match = { view.to_string(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string
  280. } else {
  281. match = { view, input.line, start_position, input.global_offset + start_position }; // take view to original string
  282. }
  283. return ExecutionResult::Continue;
  284. }
  285. ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  286. {
  287. if (input.match_index >= output.named_capture_group_matches.size()) {
  288. output.named_capture_group_matches.ensure_capacity(input.match_index);
  289. auto capacity = output.named_capture_group_matches.capacity();
  290. for (size_t i = output.named_capture_group_matches.size(); i <= capacity; ++i)
  291. output.named_capture_group_matches.empend();
  292. }
  293. output.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position;
  294. return ExecutionResult::Continue;
  295. }
  296. ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  297. {
  298. StringView capture_group_name = name();
  299. if (output.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) {
  300. auto start_position = output.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column;
  301. auto length = state.string_position - start_position;
  302. auto& map = output.named_capture_group_matches.at(input.match_index);
  303. if constexpr (REGEX_DEBUG) {
  304. VERIFY(start_position + length <= input.view.length());
  305. dbgln("Save named capture group with name={} and content='{}'", capture_group_name, input.view.substring_view(start_position, length));
  306. }
  307. VERIFY(start_position + length <= input.view.length());
  308. auto view = input.view.substring_view(start_position, length);
  309. if (input.regex_options & AllFlags::StringCopyMatches) {
  310. map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string
  311. } else {
  312. map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string
  313. }
  314. } else {
  315. warnln("Didn't find corresponding capture group match for name={}, match_index={}", capture_group_name.to_string(), input.match_index);
  316. }
  317. return ExecutionResult::Continue;
  318. }
  319. ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  320. {
  321. bool inverse { false };
  322. bool temporary_inverse { false };
  323. bool reset_temp_inverse { false };
  324. auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };
  325. size_t string_position = state.string_position;
  326. bool inverse_matched { false };
  327. bool had_zero_length_match { false };
  328. size_t offset { state.instruction_position + 3 };
  329. for (size_t i = 0; i < arguments_count(); ++i) {
  330. if (state.string_position > string_position)
  331. break;
  332. if (reset_temp_inverse) {
  333. reset_temp_inverse = false;
  334. temporary_inverse = false;
  335. } else {
  336. reset_temp_inverse = true;
  337. }
  338. auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
  339. if (compare_type == CharacterCompareType::Inverse)
  340. inverse = true;
  341. else if (compare_type == CharacterCompareType::TemporaryInverse) {
  342. // If "TemporaryInverse" is given, negate the current inversion state only for the next opcode.
  343. // it follows that this cannot be the last compare element.
  344. VERIFY(i != arguments_count() - 1);
  345. temporary_inverse = true;
  346. reset_temp_inverse = false;
  347. } else if (compare_type == CharacterCompareType::Char) {
  348. u32 ch = m_bytecode->at(offset++);
  349. // We want to compare a string that is longer or equal in length to the available string
  350. if (input.view.length() - state.string_position < 1)
  351. return ExecutionResult::Failed_ExecuteLowPrioForks;
  352. compare_char(input, state, ch, current_inversion_state(), inverse_matched);
  353. } else if (compare_type == CharacterCompareType::AnyChar) {
  354. // We want to compare a string that is definitely longer than the available string
  355. if (input.view.length() - state.string_position < 1)
  356. return ExecutionResult::Failed_ExecuteLowPrioForks;
  357. VERIFY(!current_inversion_state());
  358. ++state.string_position;
  359. } else if (compare_type == CharacterCompareType::String) {
  360. VERIFY(!current_inversion_state());
  361. const auto& length = m_bytecode->at(offset++);
  362. StringBuilder str_builder;
  363. for (size_t i = 0; i < length; ++i)
  364. str_builder.append(m_bytecode->at(offset++));
  365. // We want to compare a string that is definitely longer than the available string
  366. if (input.view.length() - state.string_position < length)
  367. return ExecutionResult::Failed_ExecuteLowPrioForks;
  368. if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length, had_zero_length_match))
  369. return ExecutionResult::Failed_ExecuteLowPrioForks;
  370. } else if (compare_type == CharacterCompareType::CharClass) {
  371. if (input.view.length() - state.string_position < 1)
  372. return ExecutionResult::Failed_ExecuteLowPrioForks;
  373. auto character_class = (CharClass)m_bytecode->at(offset++);
  374. auto ch = input.view[state.string_position];
  375. compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
  376. } else if (compare_type == CharacterCompareType::CharRange) {
  377. auto value = (CharRange)m_bytecode->at(offset++);
  378. auto from = value.from;
  379. auto to = value.to;
  380. auto ch = input.view[state.string_position];
  381. compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched);
  382. } else if (compare_type == CharacterCompareType::Reference) {
  383. auto reference_number = (size_t)m_bytecode->at(offset++);
  384. auto& groups = output.capture_group_matches.at(input.match_index);
  385. if (groups.size() <= reference_number)
  386. return ExecutionResult::Failed_ExecuteLowPrioForks;
  387. auto str = groups.at(reference_number).view;
  388. // We want to compare a string that is definitely longer than the available string
  389. if (input.view.length() - state.string_position < str.length())
  390. return ExecutionResult::Failed_ExecuteLowPrioForks;
  391. if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
  392. return ExecutionResult::Failed_ExecuteLowPrioForks;
  393. } else if (compare_type == CharacterCompareType::NamedReference) {
  394. auto ptr = (const char*)m_bytecode->at(offset++);
  395. auto length = (size_t)m_bytecode->at(offset++);
  396. StringView name { ptr, length };
  397. auto group = output.named_capture_group_matches.at(input.match_index).get(name);
  398. if (!group.has_value())
  399. return ExecutionResult::Failed_ExecuteLowPrioForks;
  400. auto str = group.value().view;
  401. // We want to compare a string that is definitely longer than the available string
  402. if (input.view.length() - state.string_position < str.length())
  403. return ExecutionResult::Failed_ExecuteLowPrioForks;
  404. if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
  405. return ExecutionResult::Failed_ExecuteLowPrioForks;
  406. } else {
  407. warnln("Undefined comparison: {}", (int)compare_type);
  408. VERIFY_NOT_REACHED();
  409. break;
  410. }
  411. }
  412. if (current_inversion_state() && !inverse_matched)
  413. ++state.string_position;
  414. if ((!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length())
  415. return ExecutionResult::Failed_ExecuteLowPrioForks;
  416. return ExecutionResult::Continue;
  417. }
  418. ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched)
  419. {
  420. u32 ch2 = input.view[state.string_position];
  421. if (input.regex_options & AllFlags::Insensitive) {
  422. ch1 = to_ascii_lowercase(ch1);
  423. ch2 = to_ascii_uppercase(ch2);
  424. }
  425. if (ch1 == ch2) {
  426. if (inverse)
  427. inverse_matched = true;
  428. else
  429. ++state.string_position;
  430. }
  431. }
  432. ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match)
  433. {
  434. if (input.view.is_u8_view()) {
  435. auto str_view1 = StringView(str, length);
  436. auto str_view2 = StringView(&input.view.u8view()[state.string_position], length);
  437. String str1, str2;
  438. if (input.regex_options & AllFlags::Insensitive) {
  439. str1 = str_view1.to_string().to_lowercase();
  440. str2 = str_view2.to_string().to_lowercase();
  441. str_view1 = str1.view();
  442. str_view2 = str2.view();
  443. }
  444. if (str_view1 == str_view2) {
  445. state.string_position += length;
  446. if (length == 0)
  447. had_zero_length_match = true;
  448. return true;
  449. }
  450. }
  451. return false;
  452. }
  453. ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched)
  454. {
  455. switch (character_class) {
  456. case CharClass::Alnum:
  457. if (is_ascii_alphanumeric(ch)) {
  458. if (inverse)
  459. inverse_matched = true;
  460. else
  461. ++state.string_position;
  462. }
  463. break;
  464. case CharClass::Alpha:
  465. if (is_ascii_alpha(ch))
  466. ++state.string_position;
  467. break;
  468. case CharClass::Blank:
  469. if (is_ascii_blank(ch)) {
  470. if (inverse)
  471. inverse_matched = true;
  472. else
  473. ++state.string_position;
  474. }
  475. break;
  476. case CharClass::Cntrl:
  477. if (is_ascii_control(ch)) {
  478. if (inverse)
  479. inverse_matched = true;
  480. else
  481. ++state.string_position;
  482. }
  483. break;
  484. case CharClass::Digit:
  485. if (is_ascii_digit(ch)) {
  486. if (inverse)
  487. inverse_matched = true;
  488. else
  489. ++state.string_position;
  490. }
  491. break;
  492. case CharClass::Graph:
  493. if (is_ascii_graphical(ch)) {
  494. if (inverse)
  495. inverse_matched = true;
  496. else
  497. ++state.string_position;
  498. }
  499. break;
  500. case CharClass::Lower:
  501. if (is_ascii_lower_alpha(ch) || ((input.regex_options & AllFlags::Insensitive) && is_ascii_upper_alpha(ch))) {
  502. if (inverse)
  503. inverse_matched = true;
  504. else
  505. ++state.string_position;
  506. }
  507. break;
  508. case CharClass::Print:
  509. if (is_ascii_printable(ch)) {
  510. if (inverse)
  511. inverse_matched = true;
  512. else
  513. ++state.string_position;
  514. }
  515. break;
  516. case CharClass::Punct:
  517. if (is_ascii_punctuation(ch)) {
  518. if (inverse)
  519. inverse_matched = true;
  520. else
  521. ++state.string_position;
  522. }
  523. break;
  524. case CharClass::Space:
  525. if (is_ascii_space(ch)) {
  526. if (inverse)
  527. inverse_matched = true;
  528. else
  529. ++state.string_position;
  530. }
  531. break;
  532. case CharClass::Upper:
  533. if (is_ascii_upper_alpha(ch) || ((input.regex_options & AllFlags::Insensitive) && is_ascii_lower_alpha(ch))) {
  534. if (inverse)
  535. inverse_matched = true;
  536. else
  537. ++state.string_position;
  538. }
  539. break;
  540. case CharClass::Word:
  541. if (is_ascii_alphanumeric(ch) || ch == '_') {
  542. if (inverse)
  543. inverse_matched = true;
  544. else
  545. ++state.string_position;
  546. }
  547. break;
  548. case CharClass::Xdigit:
  549. if (is_ascii_hex_digit(ch)) {
  550. if (inverse)
  551. inverse_matched = true;
  552. else
  553. ++state.string_position;
  554. }
  555. break;
  556. }
  557. }
  558. ALWAYS_INLINE void OpCode_Compare::compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched)
  559. {
  560. if (input.regex_options & AllFlags::Insensitive) {
  561. from = to_ascii_lowercase(from);
  562. to = to_ascii_lowercase(to);
  563. ch = to_ascii_lowercase(ch);
  564. }
  565. if (ch >= from && ch <= to) {
  566. if (inverse)
  567. inverse_matched = true;
  568. else
  569. ++state.string_position;
  570. }
  571. }
  572. const String OpCode_Compare::arguments_string() const
  573. {
  574. return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
  575. }
  576. const Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<MatchInput> input) const
  577. {
  578. Vector<String> result;
  579. size_t offset { state().instruction_position + 3 };
  580. RegexStringView view = ((input.has_value()) ? input.value().view : nullptr);
  581. for (size_t i = 0; i < arguments_count(); ++i) {
  582. auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
  583. result.empend(String::formatted("type={} [{}]", (size_t)compare_type, character_compare_type_name(compare_type)));
  584. auto compared_against_string_start_offset = state().string_position > 0 ? state().string_position - 1 : state().string_position;
  585. if (compare_type == CharacterCompareType::Char) {
  586. auto ch = m_bytecode->at(offset++);
  587. auto is_ascii = is_ascii_printable(ch);
  588. if (is_ascii)
  589. result.empend(String::formatted("value='{:c}'", static_cast<char>(ch)));
  590. else
  591. result.empend(String::formatted("value={:x}", ch));
  592. if (!view.is_null() && view.length() > state().string_position) {
  593. if (is_ascii) {
  594. result.empend(String::formatted(
  595. "compare against: '{}'",
  596. view.substring_view(compared_against_string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  597. } else {
  598. auto str = view.substring_view(compared_against_string_start_offset, state().string_position > view.length() ? 0 : 1).to_string();
  599. u8 buf[8] { 0 };
  600. __builtin_memcpy(buf, str.characters(), min(str.length(), sizeof(buf)));
  601. result.empend(String::formatted("compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}",
  602. buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]));
  603. }
  604. }
  605. } else if (compare_type == CharacterCompareType::NamedReference) {
  606. auto ptr = (const char*)m_bytecode->at(offset++);
  607. auto length = m_bytecode->at(offset++);
  608. result.empend(String::formatted("name='{}'", StringView { ptr, (size_t)length }));
  609. } else if (compare_type == CharacterCompareType::Reference) {
  610. auto ref = m_bytecode->at(offset++);
  611. result.empend(String::formatted("number={}", ref));
  612. } else if (compare_type == CharacterCompareType::String) {
  613. auto& length = m_bytecode->at(offset++);
  614. StringBuilder str_builder;
  615. for (size_t i = 0; i < length; ++i)
  616. str_builder.append(m_bytecode->at(offset++));
  617. result.empend(String::formatted("value=\"{}\"", str_builder.string_view().substring_view(0, length)));
  618. if (!view.is_null() && view.length() > state().string_position)
  619. result.empend(String::formatted(
  620. "compare against: \"{}\"",
  621. input.value().view.substring_view(compared_against_string_start_offset, compared_against_string_start_offset + length > view.length() ? 0 : length).to_string()));
  622. } else if (compare_type == CharacterCompareType::CharClass) {
  623. auto character_class = (CharClass)m_bytecode->at(offset++);
  624. result.empend(String::formatted("ch_class={} [{}]", (size_t)character_class, character_class_name(character_class)));
  625. if (!view.is_null() && view.length() > state().string_position)
  626. result.empend(String::formatted(
  627. "compare against: '{}'",
  628. input.value().view.substring_view(compared_against_string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  629. } else if (compare_type == CharacterCompareType::CharRange) {
  630. auto value = (CharRange)m_bytecode->at(offset++);
  631. result.empend(String::formatted("ch_range='{:c}'-'{:c}'", value.from, value.to));
  632. if (!view.is_null() && view.length() > state().string_position)
  633. result.empend(String::formatted(
  634. "compare against: '{}'",
  635. input.value().view.substring_view(compared_against_string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  636. }
  637. }
  638. return result;
  639. }
  640. }