RegexByteCode.cpp 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767
  1. /*
  2. * Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include "RegexByteCode.h"
  27. #include "AK/StringBuilder.h"
  28. #include "RegexDebug.h"
  29. #include <AK/Debug.h>
  30. #include <ctype.h>
  31. namespace regex {
  32. const char* OpCode::name(OpCodeId opcode_id)
  33. {
  34. switch (opcode_id) {
  35. #define __ENUMERATE_OPCODE(x) \
  36. case OpCodeId::x: \
  37. return #x;
  38. ENUMERATE_OPCODES
  39. #undef __ENUMERATE_OPCODE
  40. default:
  41. VERIFY_NOT_REACHED();
  42. return "<Unknown>";
  43. }
  44. }
  45. const char* OpCode::name() const
  46. {
  47. return name(opcode_id());
  48. }
  49. const char* execution_result_name(ExecutionResult result)
  50. {
  51. switch (result) {
  52. #define __ENUMERATE_EXECUTION_RESULT(x) \
  53. case ExecutionResult::x: \
  54. return #x;
  55. ENUMERATE_EXECUTION_RESULTS
  56. #undef __ENUMERATE_EXECUTION_RESULT
  57. default:
  58. VERIFY_NOT_REACHED();
  59. return "<Unknown>";
  60. }
  61. }
  62. const char* boundary_check_type_name(BoundaryCheckType ty)
  63. {
  64. switch (ty) {
  65. #define __ENUMERATE_BOUNDARY_CHECK_TYPE(x) \
  66. case BoundaryCheckType::x: \
  67. return #x;
  68. ENUMERATE_BOUNDARY_CHECK_TYPES
  69. #undef __ENUMERATE_BOUNDARY_CHECK_TYPE
  70. default:
  71. VERIFY_NOT_REACHED();
  72. return "<Unknown>";
  73. }
  74. }
  75. const char* character_compare_type_name(CharacterCompareType ch_compare_type)
  76. {
  77. switch (ch_compare_type) {
  78. #define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) \
  79. case CharacterCompareType::x: \
  80. return #x;
  81. ENUMERATE_CHARACTER_COMPARE_TYPES
  82. #undef __ENUMERATE_CHARACTER_COMPARE_TYPE
  83. default:
  84. VERIFY_NOT_REACHED();
  85. return "<Unknown>";
  86. }
  87. }
  88. static const char* character_class_name(CharClass ch_class)
  89. {
  90. switch (ch_class) {
  91. #define __ENUMERATE_CHARACTER_CLASS(x) \
  92. case CharClass::x: \
  93. return #x;
  94. ENUMERATE_CHARACTER_CLASSES
  95. #undef __ENUMERATE_CHARACTER_CLASS
  96. default:
  97. VERIFY_NOT_REACHED();
  98. return "<Unknown>";
  99. }
  100. }
  101. HashMap<u32, OwnPtr<OpCode>> ByteCode::s_opcodes {};
  102. ALWAYS_INLINE OpCode* ByteCode::get_opcode_by_id(OpCodeId id) const
  103. {
  104. if (!s_opcodes.size()) {
  105. for (u32 i = (u32)OpCodeId::First; i <= (u32)OpCodeId::Last; ++i) {
  106. switch ((OpCodeId)i) {
  107. case OpCodeId::Exit:
  108. s_opcodes.set(i, make<OpCode_Exit>(*const_cast<ByteCode*>(this)));
  109. break;
  110. case OpCodeId::Jump:
  111. s_opcodes.set(i, make<OpCode_Jump>(*const_cast<ByteCode*>(this)));
  112. break;
  113. case OpCodeId::Compare:
  114. s_opcodes.set(i, make<OpCode_Compare>(*const_cast<ByteCode*>(this)));
  115. break;
  116. case OpCodeId::CheckEnd:
  117. s_opcodes.set(i, make<OpCode_CheckEnd>(*const_cast<ByteCode*>(this)));
  118. break;
  119. case OpCodeId::CheckBoundary:
  120. s_opcodes.set(i, make<OpCode_CheckBoundary>(*const_cast<ByteCode*>(this)));
  121. break;
  122. case OpCodeId::ForkJump:
  123. s_opcodes.set(i, make<OpCode_ForkJump>(*const_cast<ByteCode*>(this)));
  124. break;
  125. case OpCodeId::ForkStay:
  126. s_opcodes.set(i, make<OpCode_ForkStay>(*const_cast<ByteCode*>(this)));
  127. break;
  128. case OpCodeId::FailForks:
  129. s_opcodes.set(i, make<OpCode_FailForks>(*const_cast<ByteCode*>(this)));
  130. break;
  131. case OpCodeId::Save:
  132. s_opcodes.set(i, make<OpCode_Save>(*const_cast<ByteCode*>(this)));
  133. break;
  134. case OpCodeId::Restore:
  135. s_opcodes.set(i, make<OpCode_Restore>(*const_cast<ByteCode*>(this)));
  136. break;
  137. case OpCodeId::GoBack:
  138. s_opcodes.set(i, make<OpCode_GoBack>(*const_cast<ByteCode*>(this)));
  139. break;
  140. case OpCodeId::CheckBegin:
  141. s_opcodes.set(i, make<OpCode_CheckBegin>(*const_cast<ByteCode*>(this)));
  142. break;
  143. case OpCodeId::SaveLeftCaptureGroup:
  144. s_opcodes.set(i, make<OpCode_SaveLeftCaptureGroup>(*const_cast<ByteCode*>(this)));
  145. break;
  146. case OpCodeId::SaveRightCaptureGroup:
  147. s_opcodes.set(i, make<OpCode_SaveRightCaptureGroup>(*const_cast<ByteCode*>(this)));
  148. break;
  149. case OpCodeId::SaveLeftNamedCaptureGroup:
  150. s_opcodes.set(i, make<OpCode_SaveLeftNamedCaptureGroup>(*const_cast<ByteCode*>(this)));
  151. break;
  152. case OpCodeId::SaveRightNamedCaptureGroup:
  153. s_opcodes.set(i, make<OpCode_SaveRightNamedCaptureGroup>(*const_cast<ByteCode*>(this)));
  154. break;
  155. }
  156. }
  157. }
  158. if (id > OpCodeId::Last)
  159. return nullptr;
  160. return const_cast<OpCode*>(s_opcodes.get((u32)id).value())->set_bytecode(*const_cast<ByteCode*>(this));
  161. }
  162. OpCode* ByteCode::get_opcode(MatchState& state) const
  163. {
  164. OpCode* op_code;
  165. if (state.instruction_position >= size()) {
  166. op_code = get_opcode_by_id(OpCodeId::Exit);
  167. } else
  168. op_code = get_opcode_by_id((OpCodeId)at(state.instruction_position));
  169. if (op_code)
  170. op_code->set_state(state);
  171. return op_code;
  172. }
  173. ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  174. {
  175. if (state.string_position > input.view.length() || state.instruction_position >= m_bytecode->size())
  176. return ExecutionResult::Succeeded;
  177. return ExecutionResult::Failed;
  178. }
  179. ALWAYS_INLINE ExecutionResult OpCode_Save::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  180. {
  181. input.saved_positions.append(state.string_position);
  182. return ExecutionResult::Continue;
  183. }
  184. ALWAYS_INLINE ExecutionResult OpCode_Restore::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  185. {
  186. if (input.saved_positions.is_empty())
  187. return ExecutionResult::Failed;
  188. state.string_position = input.saved_positions.take_last();
  189. return ExecutionResult::Continue;
  190. }
  191. ALWAYS_INLINE ExecutionResult OpCode_GoBack::execute(const MatchInput&, MatchState& state, MatchOutput&) const
  192. {
  193. if (count() > state.string_position)
  194. return ExecutionResult::Failed_ExecuteLowPrioForks;
  195. state.string_position -= count();
  196. return ExecutionResult::Continue;
  197. }
  198. ALWAYS_INLINE ExecutionResult OpCode_FailForks::execute(const MatchInput& input, MatchState&, MatchOutput&) const
  199. {
  200. VERIFY(count() > 0);
  201. input.fail_counter += count() - 1;
  202. return ExecutionResult::Failed_ExecuteLowPrioForks;
  203. }
  204. ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
  205. {
  206. state.instruction_position += offset();
  207. return ExecutionResult::Continue;
  208. }
  209. ALWAYS_INLINE ExecutionResult OpCode_ForkJump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
  210. {
  211. state.fork_at_position = state.instruction_position + size() + offset();
  212. return ExecutionResult::Fork_PrioHigh;
  213. }
  214. ALWAYS_INLINE ExecutionResult OpCode_ForkStay::execute(const MatchInput&, MatchState& state, MatchOutput&) const
  215. {
  216. state.fork_at_position = state.instruction_position + size() + offset();
  217. return ExecutionResult::Fork_PrioLow;
  218. }
  219. ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  220. {
  221. if (0 == state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
  222. return ExecutionResult::Failed_ExecuteLowPrioForks;
  223. if ((0 == state.string_position && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
  224. || (0 != state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
  225. || (0 == state.string_position && (input.regex_options & AllFlags::Global)))
  226. return ExecutionResult::Continue;
  227. return ExecutionResult::Failed_ExecuteLowPrioForks;
  228. }
  229. ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  230. {
  231. auto isword = [](auto ch) { return isalnum(ch) || ch == '_'; };
  232. auto is_word_boundary = [&] {
  233. if (state.string_position == input.view.length()) {
  234. if (state.string_position > 0 && isword(input.view[state.string_position - 1]))
  235. return true;
  236. return false;
  237. }
  238. if (state.string_position == 0) {
  239. if (isword(input.view[0]))
  240. return true;
  241. return false;
  242. }
  243. return !!(isword(input.view[state.string_position]) ^ isword(input.view[state.string_position - 1]));
  244. };
  245. switch (type()) {
  246. case BoundaryCheckType::Word: {
  247. if (is_word_boundary())
  248. return ExecutionResult::Continue;
  249. return ExecutionResult::Failed_ExecuteLowPrioForks;
  250. }
  251. case BoundaryCheckType::NonWord: {
  252. if (!is_word_boundary())
  253. return ExecutionResult::Continue;
  254. return ExecutionResult::Failed_ExecuteLowPrioForks;
  255. }
  256. }
  257. VERIFY_NOT_REACHED();
  258. }
  259. ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
  260. {
  261. if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine))
  262. return ExecutionResult::Failed_ExecuteLowPrioForks;
  263. if ((state.string_position == input.view.length() && !(input.regex_options & AllFlags::MatchNotEndOfLine))
  264. || (state.string_position != input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
  265. return ExecutionResult::Continue;
  266. return ExecutionResult::Failed_ExecuteLowPrioForks;
  267. }
  268. ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  269. {
  270. if (input.match_index >= output.capture_group_matches.size()) {
  271. output.capture_group_matches.ensure_capacity(input.match_index);
  272. auto capacity = output.capture_group_matches.capacity();
  273. for (size_t i = output.capture_group_matches.size(); i <= capacity; ++i)
  274. output.capture_group_matches.empend();
  275. }
  276. if (id() >= output.capture_group_matches.at(input.match_index).size()) {
  277. output.capture_group_matches.at(input.match_index).ensure_capacity(id());
  278. auto capacity = output.capture_group_matches.at(input.match_index).capacity();
  279. for (size_t i = output.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i)
  280. output.capture_group_matches.at(input.match_index).empend();
  281. }
  282. output.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position;
  283. return ExecutionResult::Continue;
  284. }
  285. ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  286. {
  287. auto& match = output.capture_group_matches.at(input.match_index).at(id());
  288. auto start_position = match.left_column;
  289. auto length = state.string_position - start_position;
  290. if (start_position < match.column)
  291. return ExecutionResult::Continue;
  292. VERIFY(start_position + length <= input.view.length());
  293. auto view = input.view.substring_view(start_position, length);
  294. if (input.regex_options & AllFlags::StringCopyMatches) {
  295. match = { view.to_string(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string
  296. } else {
  297. match = { view, input.line, start_position, input.global_offset + start_position }; // take view to original string
  298. }
  299. return ExecutionResult::Continue;
  300. }
  301. ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  302. {
  303. if (input.match_index >= output.named_capture_group_matches.size()) {
  304. output.named_capture_group_matches.ensure_capacity(input.match_index);
  305. auto capacity = output.named_capture_group_matches.capacity();
  306. for (size_t i = output.named_capture_group_matches.size(); i <= capacity; ++i)
  307. output.named_capture_group_matches.empend();
  308. }
  309. output.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position;
  310. return ExecutionResult::Continue;
  311. }
  312. ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  313. {
  314. StringView capture_group_name = name();
  315. if (output.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) {
  316. auto start_position = output.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column;
  317. auto length = state.string_position - start_position;
  318. auto& map = output.named_capture_group_matches.at(input.match_index);
  319. if constexpr (REGEX_DEBUG) {
  320. VERIFY(start_position + length <= input.view.length());
  321. dbgln("Save named capture group with name={} and content='{}'", capture_group_name, input.view.substring_view(start_position, length));
  322. }
  323. VERIFY(start_position + length <= input.view.length());
  324. auto view = input.view.substring_view(start_position, length);
  325. if (input.regex_options & AllFlags::StringCopyMatches) {
  326. map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string
  327. } else {
  328. map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string
  329. }
  330. } else {
  331. fprintf(stderr, "Didn't find corresponding capture group match for name=%s, match_index=%lu\n", capture_group_name.to_string().characters(), input.match_index);
  332. }
  333. return ExecutionResult::Continue;
  334. }
  335. ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
  336. {
  337. bool inverse { false };
  338. bool temporary_inverse { false };
  339. bool reset_temp_inverse { false };
  340. auto current_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; };
  341. size_t string_position = state.string_position;
  342. bool inverse_matched { false };
  343. bool had_zero_length_match { false };
  344. size_t offset { state.instruction_position + 3 };
  345. for (size_t i = 0; i < arguments_count(); ++i) {
  346. if (state.string_position > string_position)
  347. break;
  348. if (reset_temp_inverse) {
  349. reset_temp_inverse = false;
  350. temporary_inverse = false;
  351. } else {
  352. reset_temp_inverse = true;
  353. }
  354. auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
  355. if (compare_type == CharacterCompareType::Inverse)
  356. inverse = true;
  357. else if (compare_type == CharacterCompareType::TemporaryInverse) {
  358. // If "TemporaryInverse" is given, negate the current inversion state only for the next opcode.
  359. // it follows that this cannot be the last compare element.
  360. VERIFY(i != arguments_count() - 1);
  361. temporary_inverse = true;
  362. reset_temp_inverse = false;
  363. } else if (compare_type == CharacterCompareType::Char) {
  364. u32 ch = m_bytecode->at(offset++);
  365. // We want to compare a string that is longer or equal in length to the available string
  366. if (input.view.length() - state.string_position < 1)
  367. return ExecutionResult::Failed_ExecuteLowPrioForks;
  368. compare_char(input, state, ch, current_inversion_state(), inverse_matched);
  369. } else if (compare_type == CharacterCompareType::AnyChar) {
  370. // We want to compare a string that is definitely longer than the available string
  371. if (input.view.length() - state.string_position < 1)
  372. return ExecutionResult::Failed_ExecuteLowPrioForks;
  373. VERIFY(!current_inversion_state());
  374. ++state.string_position;
  375. } else if (compare_type == CharacterCompareType::String) {
  376. VERIFY(!current_inversion_state());
  377. const auto& length = m_bytecode->at(offset++);
  378. StringBuilder str_builder;
  379. for (size_t i = 0; i < length; ++i)
  380. str_builder.append(m_bytecode->at(offset++));
  381. // We want to compare a string that is definitely longer than the available string
  382. if (input.view.length() - state.string_position < length)
  383. return ExecutionResult::Failed_ExecuteLowPrioForks;
  384. if (!compare_string(input, state, str_builder.string_view().characters_without_null_termination(), length, had_zero_length_match))
  385. return ExecutionResult::Failed_ExecuteLowPrioForks;
  386. } else if (compare_type == CharacterCompareType::CharClass) {
  387. if (input.view.length() - state.string_position < 1)
  388. return ExecutionResult::Failed_ExecuteLowPrioForks;
  389. auto character_class = (CharClass)m_bytecode->at(offset++);
  390. auto ch = input.view[state.string_position];
  391. compare_character_class(input, state, character_class, ch, current_inversion_state(), inverse_matched);
  392. } else if (compare_type == CharacterCompareType::CharRange) {
  393. auto value = (CharRange)m_bytecode->at(offset++);
  394. auto from = value.from;
  395. auto to = value.to;
  396. auto ch = input.view[state.string_position];
  397. compare_character_range(input, state, from, to, ch, current_inversion_state(), inverse_matched);
  398. } else if (compare_type == CharacterCompareType::Reference) {
  399. auto reference_number = (size_t)m_bytecode->at(offset++);
  400. auto& groups = output.capture_group_matches.at(input.match_index);
  401. if (groups.size() <= reference_number)
  402. return ExecutionResult::Failed_ExecuteLowPrioForks;
  403. auto str = groups.at(reference_number).view;
  404. // We want to compare a string that is definitely longer than the available string
  405. if (input.view.length() - state.string_position < str.length())
  406. return ExecutionResult::Failed_ExecuteLowPrioForks;
  407. if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
  408. return ExecutionResult::Failed_ExecuteLowPrioForks;
  409. } else if (compare_type == CharacterCompareType::NamedReference) {
  410. auto ptr = (const char*)m_bytecode->at(offset++);
  411. auto length = (size_t)m_bytecode->at(offset++);
  412. StringView name { ptr, length };
  413. auto group = output.named_capture_group_matches.at(input.match_index).get(name);
  414. if (!group.has_value())
  415. return ExecutionResult::Failed_ExecuteLowPrioForks;
  416. auto str = group.value().view;
  417. // We want to compare a string that is definitely longer than the available string
  418. if (input.view.length() - state.string_position < str.length())
  419. return ExecutionResult::Failed_ExecuteLowPrioForks;
  420. if (!compare_string(input, state, str.characters_without_null_termination(), str.length(), had_zero_length_match))
  421. return ExecutionResult::Failed_ExecuteLowPrioForks;
  422. } else {
  423. fprintf(stderr, "Undefined comparison: %i\n", (int)compare_type);
  424. VERIFY_NOT_REACHED();
  425. break;
  426. }
  427. }
  428. if (current_inversion_state() && !inverse_matched)
  429. ++state.string_position;
  430. if ((!had_zero_length_match && string_position == state.string_position) || state.string_position > input.view.length())
  431. return ExecutionResult::Failed_ExecuteLowPrioForks;
  432. return ExecutionResult::Continue;
  433. }
  434. ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchState& state, u32 ch1, bool inverse, bool& inverse_matched)
  435. {
  436. u32 ch2 = input.view[state.string_position];
  437. if (input.regex_options & AllFlags::Insensitive) {
  438. ch1 = tolower(ch1);
  439. ch2 = tolower(ch2);
  440. }
  441. if (ch1 == ch2) {
  442. if (inverse)
  443. inverse_matched = true;
  444. else
  445. ++state.string_position;
  446. }
  447. }
  448. ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length, bool& had_zero_length_match)
  449. {
  450. if (input.view.is_u8_view()) {
  451. auto str_view1 = StringView(str, length);
  452. auto str_view2 = StringView(&input.view.u8view()[state.string_position], length);
  453. String str1, str2;
  454. if (input.regex_options & AllFlags::Insensitive) {
  455. str1 = str_view1.to_string().to_lowercase();
  456. str2 = str_view2.to_string().to_lowercase();
  457. str_view1 = str1.view();
  458. str_view2 = str2.view();
  459. }
  460. if (str_view1 == str_view2) {
  461. state.string_position += length;
  462. if (length == 0)
  463. had_zero_length_match = true;
  464. return true;
  465. }
  466. }
  467. return false;
  468. }
  469. ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched)
  470. {
  471. switch (character_class) {
  472. case CharClass::Alnum:
  473. if (isalnum(ch)) {
  474. if (inverse)
  475. inverse_matched = true;
  476. else
  477. ++state.string_position;
  478. }
  479. break;
  480. case CharClass::Alpha:
  481. if (isalpha(ch))
  482. ++state.string_position;
  483. break;
  484. case CharClass::Blank:
  485. if (ch == ' ' || ch == '\t') {
  486. if (inverse)
  487. inverse_matched = true;
  488. else
  489. ++state.string_position;
  490. }
  491. break;
  492. case CharClass::Cntrl:
  493. if (iscntrl(ch)) {
  494. if (inverse)
  495. inverse_matched = true;
  496. else
  497. ++state.string_position;
  498. }
  499. break;
  500. case CharClass::Digit:
  501. if (isdigit(ch)) {
  502. if (inverse)
  503. inverse_matched = true;
  504. else
  505. ++state.string_position;
  506. }
  507. break;
  508. case CharClass::Graph:
  509. if (isgraph(ch)) {
  510. if (inverse)
  511. inverse_matched = true;
  512. else
  513. ++state.string_position;
  514. }
  515. break;
  516. case CharClass::Lower:
  517. if (islower(ch) || ((input.regex_options & AllFlags::Insensitive) && isupper(ch))) {
  518. if (inverse)
  519. inverse_matched = true;
  520. else
  521. ++state.string_position;
  522. }
  523. break;
  524. case CharClass::Print:
  525. if (isprint(ch)) {
  526. if (inverse)
  527. inverse_matched = true;
  528. else
  529. ++state.string_position;
  530. }
  531. break;
  532. case CharClass::Punct:
  533. if (ispunct(ch)) {
  534. if (inverse)
  535. inverse_matched = true;
  536. else
  537. ++state.string_position;
  538. }
  539. break;
  540. case CharClass::Space:
  541. if (isspace(ch)) {
  542. if (inverse)
  543. inverse_matched = true;
  544. else
  545. ++state.string_position;
  546. }
  547. break;
  548. case CharClass::Upper:
  549. if (isupper(ch) || ((input.regex_options & AllFlags::Insensitive) && islower(ch))) {
  550. if (inverse)
  551. inverse_matched = true;
  552. else
  553. ++state.string_position;
  554. }
  555. break;
  556. case CharClass::Word:
  557. if (isalnum(ch) || ch == '_') {
  558. if (inverse)
  559. inverse_matched = true;
  560. else
  561. ++state.string_position;
  562. }
  563. break;
  564. case CharClass::Xdigit:
  565. if (isxdigit(ch)) {
  566. if (inverse)
  567. inverse_matched = true;
  568. else
  569. ++state.string_position;
  570. }
  571. break;
  572. }
  573. }
  574. ALWAYS_INLINE void OpCode_Compare::compare_character_range(const MatchInput& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched)
  575. {
  576. if (input.regex_options & AllFlags::Insensitive) {
  577. from = tolower(from);
  578. to = tolower(to);
  579. ch = tolower(ch);
  580. }
  581. if (ch >= from && ch <= to) {
  582. if (inverse)
  583. inverse_matched = true;
  584. else
  585. ++state.string_position;
  586. }
  587. }
  588. const String OpCode_Compare::arguments_string() const
  589. {
  590. return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
  591. }
  592. const Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<MatchInput> input) const
  593. {
  594. Vector<String> result;
  595. size_t offset { state().instruction_position + 3 };
  596. RegexStringView view = ((input.has_value()) ? input.value().view : nullptr);
  597. for (size_t i = 0; i < arguments_count(); ++i) {
  598. auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
  599. result.empend(String::formatted("type={} [{}]", (size_t)compare_type, character_compare_type_name(compare_type)));
  600. auto compared_against_string_start_offset = state().string_position > 0 ? state().string_position - 1 : state().string_position;
  601. if (compare_type == CharacterCompareType::Char) {
  602. auto ch = m_bytecode->at(offset++);
  603. auto is_ascii = isascii(ch) && isprint(ch);
  604. if (is_ascii)
  605. result.empend(String::formatted("value='{:c}'", static_cast<char>(ch)));
  606. else
  607. result.empend(String::formatted("value={:x}", ch));
  608. if (!view.is_null() && view.length() > state().string_position) {
  609. if (is_ascii) {
  610. result.empend(String::formatted(
  611. "compare against: '{}'",
  612. view.substring_view(compared_against_string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  613. } else {
  614. auto str = view.substring_view(compared_against_string_start_offset, state().string_position > view.length() ? 0 : 1).to_string();
  615. u8 buf[8] { 0 };
  616. __builtin_memcpy(buf, str.characters(), min(str.length(), sizeof(buf)));
  617. result.empend(String::formatted("compare against: {:x},{:x},{:x},{:x},{:x},{:x},{:x},{:x}",
  618. buf[0], buf[1], buf[2], buf[3], buf[4], buf[5], buf[6], buf[7]));
  619. }
  620. }
  621. } else if (compare_type == CharacterCompareType::NamedReference) {
  622. auto ptr = (const char*)m_bytecode->at(offset++);
  623. auto length = m_bytecode->at(offset++);
  624. result.empend(String::formatted("name='{}'", StringView { ptr, (size_t)length }));
  625. } else if (compare_type == CharacterCompareType::Reference) {
  626. auto ref = m_bytecode->at(offset++);
  627. result.empend(String::formatted("number={}", ref));
  628. } else if (compare_type == CharacterCompareType::String) {
  629. auto& length = m_bytecode->at(offset++);
  630. StringBuilder str_builder;
  631. for (size_t i = 0; i < length; ++i)
  632. str_builder.append(m_bytecode->at(offset++));
  633. result.empend(String::formatted("value=\"{}\"", str_builder.string_view().substring_view(0, length)));
  634. if (!view.is_null() && view.length() > state().string_position)
  635. result.empend(String::formatted(
  636. "compare against: \"{}\"",
  637. input.value().view.substring_view(compared_against_string_start_offset, compared_against_string_start_offset + length > view.length() ? 0 : length).to_string()));
  638. } else if (compare_type == CharacterCompareType::CharClass) {
  639. auto character_class = (CharClass)m_bytecode->at(offset++);
  640. result.empend(String::formatted("ch_class={} [{}]", (size_t)character_class, character_class_name(character_class)));
  641. if (!view.is_null() && view.length() > state().string_position)
  642. result.empend(String::formatted(
  643. "compare against: '{}'",
  644. input.value().view.substring_view(compared_against_string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  645. } else if (compare_type == CharacterCompareType::CharRange) {
  646. auto value = (CharRange)m_bytecode->at(offset++);
  647. result.empend(String::formatted("ch_range='{:c}'-'{:c}'", value.from, value.to));
  648. if (!view.is_null() && view.length() > state().string_position)
  649. result.empend(String::formatted(
  650. "compare against: '{}'",
  651. input.value().view.substring_view(compared_against_string_start_offset, state().string_position > view.length() ? 0 : 1).to_string()));
  652. }
  653. }
  654. return result;
  655. }
  656. }