RegexByteCode.cpp 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include "RegexByteCode.h"
  7. #include "AK/StringBuilder.h"
  8. #include "RegexDebug.h"
  9. #include <AK/Debug.h>
  10. #include <ctype.h>
  11. namespace regex {
  12. const char* OpCode::name(OpCodeId opcode_id)
  13. {
  14. switch (opcode_id) {
  15. #define __ENUMERATE_OPCODE(x) \
  16. case OpCodeId::x: \
  17. return #x;
  18. ENUMERATE_OPCODES
  19. #undef __ENUMERATE_OPCODE
  20. default:
  21. VERIFY_NOT_REACHED();
  22. return "<Unknown>";
  23. }
  24. }
  25. const char* OpCode::name() const
  26. {
  27. return name(opcode_id());
  28. }
  29. const char* execution_result_name(ExecutionResult result)
  30. {
  31. switch (result) {
  32. #define __ENUMERATE_EXECUTION_RESULT(x) \
  33. case ExecutionResult::x: \
  34. return #x;
  35. ENUMERATE_EXECUTION_RESULTS
  36. #undef __ENUMERATE_EXECUTION_RESULT
  37. default:
  38. VERIFY_NOT_REACHED();
  39. return "<Unknown>";
  40. }
  41. }
  42. const char* boundary_check_type_name(BoundaryCheckType ty)
  43. {
  44. switch (ty) {
  45. #define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) \
  46. case BoundaryCheckType::x: \
  47. return #x;
  48. ENUMERATE_BOUNDARY_CHECK_TYPES
  49. #undef __ENUMERATE_BOUNDARY_CHECK_TYPE
  50. default:
  51. VERIFY_NOT_REACHED();
  52. return "<Unknown>";
  53. }
  54. }
  55. const char* character_compare_type_name(CharacterCompareType ch_compare_type)
  56. {
  57. switch (ch_compare_type) {
  58. #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) \
  59. case CharacterCompareType::x: \
  60. return #x;
  61. ENUMERATE_CHARACTER_COMPARE_TYPES
  62. #undef __ENUMERATE_CHARACTER_COMPARE_TYPE
  63. default:
  64. VERIFY_NOT_REACHED();
  65. return "<Unknown>";
  66. }
  67. }
  68. static const char* character_class_name(CharClass ch_class)
  69. {
  70. switch (ch_class) {
  71. #define __ENUMERATE_CHARACTER_CLASS(x) \
  72. case CharClass::x: \
  73. return #x;
  74. ENUMERATE_CHARACTER_CLASSES
  75. #undef __ENUMERATE_CHARACTER_CLASS
  76. default:
  77. VERIFY_NOT_REACHED();
  78. return "<Unknown>";
  79. }
  80. }
  81. HashMap<u32, OwnPtr<OpCode>> ByteCode::s_opcodes {};
  82. ALWAYS_INLINE OpCode* ByteCode::get_opcode_by_id(OpCodeId id) const
  83. {
  84. if (!s_opcodes.size()) {
  85. for (u32 i = (u32)OpCodeId::First; i <= (u32)OpCodeId::Last; ++i) {
  86. switch ((OpCodeId)i) {
  87. case OpCodeId::Exit:
  88. s_opcodes.set(i, make<OpCode_Exit>(*const_cast<ByteCode*>(this)));
  89. break;
  90. case OpCodeId::Jump:
  91. s_opcodes.set(i, make<OpCode_Jump>(*const_cast<ByteCode*>(this)));
  92. break;
  93. case OpCodeId::Compare:
  94. s_opcodes.set(i, make<OpCode_Compare>(*const_cast<ByteCode*>(this)));
  95. break;
  96. case OpCodeId::CheckEnd:
  97. s_opcodes.set(i, make<OpCode_CheckEnd>(*const_cast<ByteCode*>(this)));
  98. break;
  99. case OpCodeId::CheckBoundary:
  100. s_opcodes.set(i, make<OpCode_CheckBoundary>(*const_cast<ByteCode*>(this)));
  101. break;
  102. case OpCodeId::ForkJump:
  103. s_opcodes.set(i, make<OpCode_ForkJump>(*const_cast<ByteCode*>(this)));
  104. break;
  105. case OpCodeId::ForkStay:
  106. s_opcodes.set(i, make<OpCode_ForkStay>(*const_cast<ByteCode*>(this)));
  107. break;
  108. case OpCodeId::FailForks:
  109. s_opcodes.set(i, make<OpCode_FailForks>(*const_cast<ByteCode*>(this)));
  110. break;
  111. case OpCodeId::Save:
  112. s_opcodes.set(i, make<OpCode_Save>(*const_cast<ByteCode*>(this)));
  113. break;
  114. case OpCodeId::Restore:
  115. s_opcodes.set(i, make<OpCode_Restore>(*const_cast<ByteCode*>(this)));
  116. break;
  117. case OpCodeId::GoBack:
  118. s_opcodes.set(i, make<OpCode_GoBack>(*const_cast<ByteCode*>(this)));
  119. break;
  120. case OpCodeId::CheckBegin:
  121. s_opcodes.set(i, make<OpCode_CheckBegin>(*const_cast<ByteCode*>(this)));
  122. break;
  123. case OpCodeId::SaveLeftCaptureGroup:
  124. s_opcodes.set(i, make<OpCode_SaveLeftCaptureGroup>(*const_cast<ByteCode*>(this)));
  125. break;
  126. case OpCodeId::SaveRightCaptureGroup:
  127. s_opcodes.set(i, make<OpCode_SaveRightCaptureGroup>(*const_cast<ByteCode*>(this)));
  128. break;
  129. case OpCodeId::SaveLeftNamedCaptureGroup:
  130. s_opcodes.set(i, make<OpCode_SaveLeftNamedCaptureGroup>(*const_cast<ByteCode*>(this)));
  131. break;
  132. case OpCodeId::SaveRightNamedCaptureGroup:
  133. s_opcodes.set(i, make<OpCode_SaveRightNamedCaptureGroup>(*const_cast<ByteCode*>(this)));
  134. break;
  135. }
  136. }
  137. }
  138. if (id > OpCodeId::Last)
  139. return nullptr;
  140. return const_cast<OpCode*>(s_opcodes.get((u32)id).value())->set_bytecode(*const_cast<ByteCode*>(this));
  141. }
  142. OpCode* ByteCode::get_opcode(MatchState& state) const
  143. {
  144. OpCode* op_code;
  145. if (state.instruction_position >= size()) {
  146. op_code = get_opcode_by_id(OpCodeId::Exit);
  147. } else
  148. op_code = get_opcode_by_id((OpCodeId)at(state.instruction_position));
  149. if (op_code)
  150. op_code->set_state(state);
  151. return op_code;
  152. }
  153. ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  154. {
  155. if (state.string_position > input.view.length() || state.instruction_position >= m_bytecode->size())
  156. return ExecutionResult::Succeeded;
  157. return ExecutionResult::Failed;
  158. }
  159. ALWAYS_INLINE ExecutionResult OpCode_Save::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  160. {
  161. input.saved_positions.append(state.string_position);
  162. return ExecutionResult::Continue;
  163. }
  164. ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  165. {
  166. if (input.saved_positions.is_empty())
  167. return ExecutionResult::Failed;
  168. state.string_position = input.saved_positions.take_last();
  169. return ExecutionResult::Continue;
  170. }
  171. ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(const MatchInput&, MatchState& state, MatchOutput&) const
  172. {
  173. if (count() > state.string_position)
  174. return ExecutionResult::Failed_ExecuteLowPrioForks;
  175. state.string_position -= count();
  176. return ExecutionResult::Continue;
  177. }
  178. ALWAYS_INLINE ExecutionResult OpCode_FailForks::execute(const MatchInput& input, MatchState&, MatchOutput&) const
  179. {
  180. VERIFY(count() > 0);
  181. input.fail_counter += count() - 1;
  182. return ExecutionResult::Failed_ExecuteLowPrioForks;
  183. }
  184. ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
  185. {
  186. state.instruction_position += offset();
  187. return ExecutionResult::Continue;
  188. }
  189. ALWAYS_INLINE ExecutionResult OpCode_ForkJump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
  190. {
  191. state.fork_at_position = state.instruction_position + size() + offset();
  192. return ExecutionResult::Fork_PrioHigh;
  193. }
  194. ALWAYS_INLINE ExecutionResult OpCode_ForkStay::execute(const MatchInput&, MatchState& state, MatchOutput&) const
  195. {
  196. state.fork_at_position = state.instruction_position + size() + offset();
  197. return ExecutionResult::Fork_PrioLow;
  198. }
  199. ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  200. {
  201. if (0 == state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
  202. return ExecutionResult::Failed_ExecuteLowPrioForks;
  203. if ((0 == state.string_position && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
  204. || (0 != state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
  205. || (0 == state.string_position && (input.regex_options & AllFlags::Global)))
  206. return ExecutionResult::Continue;
  207. return ExecutionResult::Failed_ExecuteLowPrioForks;
  208. }
  209. ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  210. {
  211. auto isword = [](auto ch) { return isalnum(ch) || ch == '_'; };
  212. auto is_word_boundary = [&] {
  213. if (state.string_position == input.view.length()) {
  214. if (state.string_position > 0 && isword(input.view[state.string_position - 1]))
  215. return true;
  216. return false;
  217. }
  218. if (state.string_position == 0) {
  219. if (isword(input.view[0]))
  220. return true;
  221. return false;
  222. }
  223. return !!(isword(input.view[state.string_position]) ^ isword(input.view[state.string_position - 1]));
  224. };
  225. switch (type()) {
  226. case BoundaryCheckType::Word: {
  227. if (is_word_boundary())
  228. return ExecutionResult::Continue;
  229. return ExecutionResult::Failed_ExecuteLowPrioForks;
  230. }
  231. case BoundaryCheckType::NonWord: {
  232. if (!is_word_boundary())
  233. return ExecutionResult::Continue;
  234. return ExecutionResult::Failed_ExecuteLowPrioForks;
  235. }
  236. }
  237. VERIFY_NOT_REACHED();
  238. }
  239. ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  240. {
  241. if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine))
  242. return ExecutionResult::Failed_ExecuteLowPrioForks;
  243. if ((state.string_position == input.view.length() && !(input.regex_options & AllFlags::MatchNotEndOfLine))
  244. || (state.string_position != input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
  245. return ExecutionResult::Continue;
  246. return ExecutionResult::Failed_ExecuteLowPrioForks;
  247. }
  248. ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  249. {
  250. if (input.match_index >= output.capture_group_matches.size()) {
  251. output.capture_group_matches.ensure_capacity(input.match_index);
  252. auto capacity = output.capture_group_matches.capacity();
  253. for (size_t i = output.capture_group_matches.size(); i <= capacity; ++i)
  254. output.capture_group_matches.empend();
  255. }
  256. if (id() >= output.capture_group_matches.at(input.match_index).size()) {
  257. output.capture_group_matches.at(input.match_index).ensure_capacity(id());
  258. auto capacity = output.capture_group_matches.at(input.match_index).capacity();
  259. for (size_t i = output.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i)
  260. output.capture_group_matches.at(input.match_index).empend();
  261. }
  262. output.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position;
  263. return ExecutionResult::Continue;
  264. }
  265. ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  266. {
  267. auto& match = output.capture_group_matches.at(input.match_index).at(id());
  268. auto start_position = match.left_column;
  269. auto length = state.string_position - start_position;
  270. if (start_position < match.column)
  271. return ExecutionResult::Continue;
  272. VERIFY(start_position + length <= input.view.length());
  273. auto view = input.view.substring_view(start_position, length);
  274. if (input.regex_options & AllFlags::StringCopyMatches) {
  275. match = { view.to_string(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string
  276. } else {
  277. match = { view, input.line, start_position, input.global_offset + start_position }; // take view to original string
  278. }
  279. return ExecutionResult::Continue;
  280. }
  281. ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  282. {
  283. if (input.match_index >= output.named_capture_group_matches.size()) {
  284. output.named_capture_group_matches.ensure_capacity(input.match_index);
  285. auto capacity = output.named_capture_group_matches.capacity();
  286. for (size_t i = output.named_capture_group_matches.size(); i <= capacity; ++i)
  287. output.named_capture_group_matches.empend();
  288. }
  289. output.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position;
  290. return ExecutionResult::Continue;
  291. }
  292. ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  293. {
  294. StringView capture_group_name = name();
  295. if (output.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) {
  296. auto start_position = output.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column;
  297. auto length = state.string_position - start_position;
  298. auto& map = output.named_capture_group_matches.at(input.match_index);
  299. if constexpr (REGEX_DEBUG) {
  300. VERIFY(start_position + length <= input.view.length());
  301. dbgln("Save named capture group with name={} and content='{}'", capture_group_name, input.view.substring_view(start_position, length));
  302. }
  303. VERIFY(start_position + length <= input.view.length());
  304. auto view = input.view.substring_view(start_position, length);
  305. if (input.regex_options & AllFlags::StringCopyMatches) {
  306. map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string
  307. } else {
  308. map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string
  309. }
  310. } else {
  311. warnln("Didn't find corresponding capture group match for name={}, match_index={}", capture_group_name.to_string(), input.match_index);
  312. }
  313. return ExecutionResult::Continue;
  314. }
  315. ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  316. {
  317. bool inverse { false };
  318. bool temporary_inverse { false };
  319. bool reset_temp_inverse { false };
  320. auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };
  321. size_t string_position = state.string_position;
  322. bool inverse_matched { false };
  323. bool had_zero_length_match { false };
  324. size_t offset { state.instruction_position + 3 };
  325. for (size_t i = 0; i < arguments_count(); ++i) {
  326. if (state.string_position > string_position)
  327. break;
  328. if (reset_temp_inverse) {
  329. reset_temp_inverse = false;
  330. temporary_inverse = false;
  331. } else {
  332. reset_temp_inverse = true;
  333. }
  334. auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
  335. if (compare_type == CharacterCompareType::Inverse)
  336. inverse = true;
  337. else if (compare_type == CharacterCompareType::TemporaryInverse) {
  338. // If "TemporaryInverse" is given, negate the current inversion state only for the next opcode.
  339. // it follows that this cannot be the last compare element.
  340. VERIFY(i != arguments_count() - 1);
  341. temporary_inverse = true;
  342. reset_temp_inverse = false;
  343. } else if (compare_type == CharacterCompareType::Char) {
  344. u32 ch = m_bytecode->at(offset++);
  345. // We want to compare a string that is longer or equal in length to the available string
  346. if (input.view.length() - state.string_position < 1)
  347. return ExecutionResult::Failed_ExecuteLowPrioForks;
  348. compare_char(input, state, ch, current_inversion_state(), inverse_matched);
  349. } else if (compare_type == CharacterCompareType::AnyChar) {
  350. // We want to compare a string that is definitely longer than the available string
  351. if (input.view.length() - state.string_position < 1)
  352. return ExecutionResult::Failed_ExecuteLowPrioForks;
  353. VERIFY(!current_inversion_state());
  354. ++state.string_position;
  355. } else if (compare_type == CharacterCompareType::String) {
  356. VERIFY(!current_inversion_state());
  357. const auto& length = m_bytecode->at(offset++);
  358. StringBuilder str_builder;
  359. for (size_t i = 0; i < length; ++i)
  360. str_builder.append(m_bytecode->at(offset++));
  361. // We want to compare a string that is definitely longer than the available string
  362. if (input.view.length() - state.string_position < length)
  363. return ExecutionResult::Failed_ExecuteLowPrioForks;
  364. if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length, had_zero_length_match))
  365. return ExecutionResult::Failed_ExecuteLowPrioForks;
  366. } else if (compare_type == CharacterCompareType::CharClass) {
  367. if (input.view.length() - state.string_position < 1)
  368. return ExecutionResult::Failed_ExecuteLowPrioForks;
  369. auto character_class = (CharClass)m_bytecode->at(offset++);
  370. auto ch = input.view[state.string_position];
  371. compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
  372. } else if (compare_type == CharacterCompareType::CharRange) {
  373. auto value = (CharRange)m_bytecode->at(offset++);
  374. auto from = value.from;
  375. auto to = value.to;
  376. auto ch = input.view[state.string_position];
  377. compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched);
  378. } else if (compare_type == CharacterCompareType::Reference) {
  379. auto reference_number = (size_t)m_bytecode->at(offset++);
  380. auto& groups = output.capture_group_matches.at(input.match_index);
  381. if (groups.size() <= reference_number)
  382. return ExecutionResult::Failed_ExecuteLowPrioForks;
  383. auto str = groups.at(reference_number).view;
  384. // We want to compare a string that is definitely longer than the available string
  385. if (input.view.length() - state.string_position < str.length())
  386. return ExecutionResult::Failed_ExecuteLowPrioForks;
  387. if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
  388. return ExecutionResult::Failed_ExecuteLowPrioForks;
  389. } else if (compare_type == CharacterCompareType::NamedReference) {
  390. auto ptr = (const char*)m_bytecode->at(offset++);
  391. auto length = (size_t)m_bytecode->at(offset++);
  392. StringView name { ptr, length };
  393. auto group = output.named_capture_group_matches.at(input.match_index).get(name);
  394. if (!group.has_value())
  395. return ExecutionResult::Failed_ExecuteLowPrioForks;
  396. auto str = group.value().view;
  397. // We want to compare a string that is definitely longer than the available string
  398. if (input.view.length() - state.string_position < str.length())
  399. return ExecutionResult::Failed_ExecuteLowPrioForks;
  400. if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
  401. return ExecutionResult::Failed_ExecuteLowPrioForks;
  402. } else {
  403. warnln("Undefined comparison: {}", (int)compare_type);
  404. VERIFY_NOT_REACHED();
  405. break;
  406. }
  407. }
  408. if (current_inversion_state() && !inverse_matched)
  409. ++state.string_position;
  410. if ((!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length())
  411. return ExecutionResult::Failed_ExecuteLowPrioForks;
  412. return ExecutionResult::Continue;
  413. }
  414. ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched)
  415. {
  416. u32 ch2 = input.view[state.string_position];
  417. if (input.regex_options & AllFlags::Insensitive) {
  418. ch1 = tolower(ch1);
  419. ch2 = tolower(ch2);
  420. }
  421. if (ch1 == ch2) {
  422. if (inverse)
  423. inverse_matched = true;
  424. else
  425. ++state.string_position;
  426. }
  427. }
  428. ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match)
  429. {
  430. if (input.view.is_u8_view()) {
  431. auto str_view1 = StringView(str, length);
  432. auto str_view2 = StringView(&input.view.u8view()[state.string_position], length);
  433. String str1, str2;
  434. if (input.regex_options & AllFlags::Insensitive) {
  435. str1 = str_view1.to_string().to_lowercase();
  436. str2 = str_view2.to_string().to_lowercase();
  437. str_view1 = str1.view();
  438. str_view2 = str2.view();
  439. }
  440. if (str_view1 == str_view2) {
  441. state.string_position += length;
  442. if (length == 0)
  443. had_zero_length_match = true;
  444. return true;
  445. }
  446. }
  447. return false;
  448. }
  449. ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched)
  450. {
  451. switch (character_class) {
  452. case CharClass::Alnum:
  453. if (isalnum(ch)) {
  454. if (inverse)
  455. inverse_matched = true;
  456. else
  457. ++state.string_position;
  458. }
  459. break;
  460. case CharClass::Alpha:
  461. if (isalpha(ch))
  462. ++state.string_position;
  463. break;
  464. case CharClass::Blank:
  465. if (ch == ' ' || ch == '\t') {
  466. if (inverse)
  467. inverse_matched = true;
  468. else
  469. ++state.string_position;
  470. }
  471. break;
  472. case CharClass::Cntrl:
  473. if (iscntrl(ch)) {
  474. if (inverse)
  475. inverse_matched = true;
  476. else
  477. ++state.string_position;
  478. }
  479. break;
  480. case CharClass::Digit:
  481. if (isdigit(ch)) {
  482. if (inverse)
  483. inverse_matched = true;
  484. else
  485. ++state.string_position;
  486. }
  487. break;
  488. case CharClass::Graph:
  489. if (isgraph(ch)) {
  490. if (inverse)
  491. inverse_matched = true;
  492. else
  493. ++state.string_position;
  494. }
  495. break;
  496. case CharClass::Lower:
  497. if (islower(ch) || ((input.regex_options & AllFlags::Insensitive) && isupper(ch))) {
  498. if (inverse)
  499. inverse_matched = true;
  500. else
  501. ++state.string_position;
  502. }
  503. break;
  504. case CharClass::Print:
  505. if (isprint(ch)) {
  506. if (inverse)
  507. inverse_matched = true;
  508. else
  509. ++state.string_position;
  510. }
  511. break;
  512. case CharClass::Punct:
  513. if (ispunct(ch)) {
  514. if (inverse)
  515. inverse_matched = true;
  516. else
  517. ++state.string_position;
  518. }
  519. break;
  520. case CharClass::Space:
  521. if (isspace(ch)) {
  522. if (inverse)
  523. inverse_matched = true;
  524. else
  525. ++state.string_position;
  526. }
  527. break;
  528. case CharClass::Upper:
  529. if (isupper(ch) || ((input.regex_options & AllFlags::Insensitive) && islower(ch))) {
  530. if (inverse)
  531. inverse_matched = true;
  532. else
  533. ++state.string_position;
  534. }
  535. break;
  536. case CharClass::Word:
  537. if (isalnum(ch) || ch == '_') {
  538. if (inverse)
  539. inverse_matched = true;
  540. else
  541. ++state.string_position;
  542. }
  543. break;
  544. case CharClass::Xdigit:
  545. if (isxdigit(ch)) {
  546. if (inverse)
  547. inverse_matched = true;
  548. else
  549. ++state.string_position;
  550. }
  551. break;
  552. }
  553. }
  554. ALWAYS_INLINE void OpCode_Compare::compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched)
  555. {
  556. if (input.regex_options & AllFlags::Insensitive) {
  557. from = tolower(from);
  558. to = tolower(to);
  559. ch = tolower(ch);
  560. }
  561. if (ch >= from && ch <= to) {
  562. if (inverse)
  563. inverse_matched = true;
  564. else
  565. ++state.string_position;
  566. }
  567. }
  568. const String OpCode_Compare::arguments_string() const
  569. {
  570. return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
  571. }
  572. const Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<MatchInput> input) const
  573. {
  574. Vector<String> result;
  575. size_t offset { state().instruction_position + 3 };
  576. RegexStringView view = ((input.has_value()) ? input.value().view : nullptr);
  577. for (size_t i = 0; i < arguments_count(); ++i) {
  578. auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
  579. result.empend(String::formatted("type={} [{}]", (size_t)compare_type, character_compare_type_name(compare_type)));
  580. auto compared_against_string_start_offset = state().string_position > 0 ? state().string_position - 1 : state().string_position;
  581. if (compare_type == CharacterCompareType::Char) {
  582. auto ch = m_bytecode->at(offset++);
  583. auto is_ascii = isascii(ch) && isprint(ch);
  584. if (is_ascii)
  585. result.empend(String::formatted("value='{:c}'", static_cast<char>(ch)));
  586. else
  587. result.empend(String::formatted("value={:x}", ch));
  588. if (!view.is_null() && view.length() > state().string_position) {
  589. if (is_ascii) {
  590. result.empend(String::formatted(
  591. "compare against: '{}'",
  592. view.substring_view(compared_against_string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  593. } else {
  594. auto str = view.substring_view(compared_against_string_start_offset, state().string_position > view.length() ? 0 : 1).to_string();
  595. u8 buf[8] { 0 };
  596. __builtin_memcpy(buf, str.characters(), min(str.length(), sizeof(buf)));
  597. result.empend(String::formatted("compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}",
  598. buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]));
  599. }
  600. }
  601. } else if (compare_type == CharacterCompareType::NamedReference) {
  602. auto ptr = (const char*)m_bytecode->at(offset++);
  603. auto length = m_bytecode->at(offset++);
  604. result.empend(String::formatted("name='{}'", StringView { ptr, (size_t)length }));
  605. } else if (compare_type == CharacterCompareType::Reference) {
  606. auto ref = m_bytecode->at(offset++);
  607. result.empend(String::formatted("number={}", ref));
  608. } else if (compare_type == CharacterCompareType::String) {
  609. auto& length = m_bytecode->at(offset++);
  610. StringBuilder str_builder;
  611. for (size_t i = 0; i < length; ++i)
  612. str_builder.append(m_bytecode->at(offset++));
  613. result.empend(String::formatted("value=\"{}\"", str_builder.string_view().substring_view(0, length)));
  614. if (!view.is_null() && view.length() > state().string_position)
  615. result.empend(String::formatted(
  616. "compare against: \"{}\"",
  617. input.value().view.substring_view(compared_against_string_start_offset, compared_against_string_start_offset + length > view.length() ? 0 : length).to_string()));
  618. } else if (compare_type == CharacterCompareType::CharClass) {
  619. auto character_class = (CharClass)m_bytecode->at(offset++);
  620. result.empend(String::formatted("ch_class={} [{}]", (size_t)character_class, character_class_name(character_class)));
  621. if (!view.is_null() && view.length() > state().string_position)
  622. result.empend(String::formatted(
  623. "compare against: '{}'",
  624. input.value().view.substring_view(compared_against_string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  625. } else if (compare_type == CharacterCompareType::CharRange) {
  626. auto value = (CharRange)m_bytecode->at(offset++);
  627. result.empend(String::formatted("ch_range='{:c}'-'{:c}'", value.from, value.to));
  628. if (!view.is_null() && view.length() > state().string_position)
  629. result.empend(String::formatted(
  630. "compare against: '{}'",
  631. input.value().view.substring_view(compared_against_string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  632. }
  633. }
  634. return result;
  635. }
  636. }