/* * Copyright (c) 2020, Emanuel Sprung * * SPDX-License-Identifier: BSD-2-Clause */ #include #include #include #include #include #include #if REGEX_DEBUG # include #endif namespace regex { /* Debug printer that writes to stderr; only compiled in when REGEX_DEBUG is set. */ #if REGEX_DEBUG static RegexDebug s_regex_dbg(stderr); #endif /* Lexes and parses pattern with the given options and returns the parser result as-is (no cache lookup, no optimization passes). */ template regex::Parser::Result Regex::parse_pattern(StringView pattern, typename ParserTraits::OptionsType regex_options) { regex::Lexer lexer(pattern); Parser parser(lexer, regex_options); return parser.parse(); } /* Key of the parser cache: a pattern string plus the options it was parsed with. Two keys compare equal when both the pattern and the raw option value are equal. */ template struct CacheKey { ByteString pattern; typename ParserTraits::OptionsType options; bool operator==(CacheKey const& other) const { return pattern == other.pattern && options.value() == other.options.value(); } }; /* Insertion-ordered cache of parse results, filled and evicted (from the front, via take_first) by the Regex constructor below. */ template static OrderedHashMap, regex::Parser::Result> s_parser_cache; /* Running total, in bytes, of the bytecode held by the cached parse results. */ template static size_t s_cached_bytecode_size = 0; /* Byte budget that the running total is kept under when inserting into the cache. */ static constexpr auto MaxRegexCachedBytecodeSize = 1 * MiB; /* Builds a Regex from a pattern string. A cached parse result for the same (pattern, options) key is reused; otherwise the pattern is lexed, parsed and optimized, and a result without error is stored in the cache. A matcher is only created when the parse result reports NoError. */ template Regex::Regex(ByteString pattern, typename ParserTraits::OptionsType regex_options) : pattern_value(move(pattern)) { if (auto cache_entry = s_parser_cache.get({ pattern_value, regex_options }); cache_entry.has_value()) { parser_result = cache_entry.value(); } else { regex::Lexer lexer(pattern_value); Parser parser(lexer, regex_options); parser_result = parser.parse(); run_optimization_passes(); if (parser_result.error == regex::Error::NoError) { /* Evict the oldest entries until the new bytecode fits into the budget. NOTE(review): when this bytecode alone is larger than the budget the condition never becomes false, so take_first() would end up being called on an empty cache - TODO confirm how take_first() behaves in that case. */ while (parser_result.bytecode.size() * sizeof(ByteCodeValueType) + s_cached_bytecode_size > MaxRegexCachedBytecodeSize) s_cached_bytecode_size -= s_parser_cache.take_first().bytecode.size() * sizeof(ByteCodeValueType); s_parser_cache.set({ pattern_value, regex_options }, parser_result); s_cached_bytecode_size += parser_result.bytecode.size() * sizeof(ByteCodeValueType); } } if (parser_result.error == regex::Error::NoError) matcher = make>(this, static_cast(parser_result.options.value())); } /* Builds a Regex from an already available parse result instead of parsing pattern; the cache is not consulted. Optimization passes are still run, and the matcher options are regex_options combined with the options stored in the parse result. */ template Regex::Regex(regex::Parser::Result parse_result, ByteString pattern, typename ParserTraits::OptionsType regex_options) : pattern_value(move(pattern)) , 
parser_result(move(parse_result)) { run_optimization_passes(); /* NOTE(review): the options below are read from the parameter parse_result, which was already moved into the member parser_result by the initializer list; presumably parser_result.options was intended - TODO confirm. */ if (parser_result.error == regex::Error::NoError) matcher = make>(this, regex_options | static_cast(parse_result.options.value())); } /* Move constructor: takes over pattern, parse result and matcher and copies start_offset, then re-points the matcher (if any) at this object through reset_pattern. */ template Regex::Regex(Regex&& regex) : pattern_value(move(regex.pattern_value)) , parser_result(move(regex.parser_result)) , matcher(move(regex.matcher)) , start_offset(regex.start_offset) { if (matcher) matcher->reset_pattern({}, this); } /* Move assignment: same steps as the move constructor, performed on an existing object. */ template Regex& Regex::operator=(Regex&& regex) { pattern_value = move(regex.pattern_value); parser_result = move(regex.parser_result); matcher = move(regex.matcher); if (matcher) matcher->reset_pattern({}, this); start_offset = regex.start_offset; return *this; } /* Returns the options of the matcher, or default-constructed options when there is no matcher or the parse result holds an error. */ template typename ParserTraits::OptionsType Regex::options() const { if (!matcher || parser_result.error != Error::NoError) return {}; return matcher->options(); } /* Formats an error report: a fixed header line, the pattern, and a marker line padded with one space per position of the error token, followed by message or, when message is empty, the string for the stored parser error. */ template ByteString Regex::error_string(Optional message) const { StringBuilder eb; eb.append("Error during parsing of regular expression:\n"sv); eb.appendff(" {}\n ", pattern_value); for (size_t i = 0; i < parser_result.error_token.position(); ++i) eb.append(' '); eb.appendff("^---- {}", message.value_or(get_error_string(parser_result.error))); return eb.to_byte_string(); } /* Single-view entry point. When the compile-time branch is enabled and the combined options contain Multiline, the view is split with lines() and forwarded; otherwise the view is wrapped in a one-element vector. Both paths end in the vector overload. */ template RegexResult Matcher::match(RegexStringView view, Optional::OptionsType> regex_options) const { AllOptions options = m_regex_options | regex_options.value_or({}).value(); if constexpr (!IsSame) { if (options.has_flag_set(AllFlags::Multiline)) return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string... } Vector views; views.append(view); return match(views, regex_options); } /* Vector overload: runs the pattern against each view in turn and collects matches, capture group matches and the operation count into a RegexResult. */ template RegexResult Matcher::match(Vector const& views, Optional::OptionsType> regex_options) const { // If the pattern *itself* isn't stateful, reset any changes to start_offset. 
if (!((AllFlags)m_regex_options.value() & AllFlags::Internal_Stateful)) m_pattern->start_offset = 0; size_t match_count { 0 }; MatchInput input; MatchState state; size_t operations = 0; input.regex_options = m_regex_options | regex_options.value_or({}).value(); input.start_offset = m_pattern->start_offset; size_t lines_to_skip = 0; bool unicode = input.regex_options.has_flag_set(AllFlags::Unicode); /* Propagate the Unicode flag to every view; the const_cast is needed because views is taken by const reference. */ for (auto const& view : views) const_cast(view).set_unicode(unicode); /* Stateful matching over several views: turn the saved start offset into a count of leading views to skip plus an offset into the first view that is not skipped. Every skipped view accounts for its length plus one. */ if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) { if (views.size() > 1 && input.start_offset > views.first().length()) { dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip); for (auto const& view : views) { if (input.start_offset < view.length() + 1) break; ++lines_to_skip; input.start_offset -= view.length() + 1; input.global_offset += view.length() + 1; } dbgln_if(REGEX_DEBUG, "Ended with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip); } } /* Optionally pre-create match slots, each with one empty capture group entry per capture group of the pattern. */ if (c_match_preallocation_count) { state.matches.ensure_capacity(c_match_preallocation_count); state.capture_group_matches.ensure_capacity(c_match_preallocation_count); auto& capture_groups_count = m_pattern->parser_result.capture_groups_count; for (size_t j = 0; j < c_match_preallocation_count; ++j) { state.matches.empend(); state.capture_group_matches.empend(); state.capture_group_matches.mutable_at(j).ensure_capacity(capture_groups_count); for (size_t k = 0; k < capture_groups_count; ++k) state.capture_group_matches.mutable_at(j).unchecked_append({}); } } /* Stores the current match (from start_position up to state.string_position) in the slot input.match_index, growing the match list by one when that slot does not exist yet. With StringCopyMatches the matched text is copied, otherwise the match is a view into the input. */ auto append_match = [](auto& input, auto& state, auto& start_position) { if (state.matches.size() == input.match_index) state.matches.empend(); VERIFY(start_position + state.string_position - start_position <= input.view.length()); if (input.regex_options.has_flag_set(AllFlags::StringCopyMatches)) { state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, 
state.string_position - start_position).to_byte_string(), input.line, start_position, input.global_offset + start_position }; } else { // let the view point to the original string ... state.matches.mutable_at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position }; } }; #if REGEX_DEBUG s_regex_dbg.print_header(); #endif /* Keep searching after a match only for Global or Multiline, and never for Sticky. */ bool continue_search = input.regex_options.has_flag_set(AllFlags::Global) || input.regex_options.has_flag_set(AllFlags::Multiline); if (input.regex_options.has_flag_set(AllFlags::Sticky)) continue_search = false; auto single_match_only = input.regex_options.has_flag_set(AllFlags::SingleMatch); auto only_start_of_line = m_pattern->parser_result.optimization_data.only_start_of_line && !input.regex_options.has_flag_set(AllFlags::Multiline); for (auto const& view : views) { if (lines_to_skip != 0) { ++input.line; --lines_to_skip; continue; } input.view = view; dbgln_if(REGEX_DEBUG, "[match] Starting match with view ({}): _{}_", view.length(), view); auto view_length = view.length(); /* NOTE(review): the scan starts at m_pattern->start_offset, not at input.start_offset, which is the value adjusted for skipped views above - TODO confirm this is intended. */ size_t view_index = m_pattern->start_offset; state.string_position = view_index; state.string_position_in_code_units = view_index; bool succeeded = false; if (view_index == view_length && m_pattern->parser_result.match_length_minimum == 0) { // Run the code until it tries to consume something. // This allows non-consuming code to run on empty strings, for instance // e.g. "Exit" size_t temp_operations = operations; input.column = match_count; input.match_index = match_count; state.string_position = view_index; state.string_position_in_code_units = view_index; state.instruction_position = 0; state.repetition_marks.clear(); auto success = execute(input, state, temp_operations); // This success is acceptable only if it doesn't read anything from the input (input length is 0). 
if (success && (state.string_position <= view_index)) { operations = temp_operations; if (!match_count) { // Nothing was *actually* matched, so append an empty match. append_match(input, state, view_index); ++match_count; // This prevents a regex pattern like ".*" from matching the empty string // multiple times, once in this block and once in the following for loop. if (view_index == 0 && view_length == 0) ++view_index; } } } /* Try a match starting at each remaining position of the view, including the position one past the last character unless Multiline is set. */ for (; view_index <= view_length; ++view_index) { if (view_index == view_length && input.regex_options.has_flag_set(AllFlags::Multiline)) break; auto& match_length_minimum = m_pattern->parser_result.match_length_minimum; // FIXME: More performant would be to know the remaining minimum string // length needed to match from the current position onwards within // the vm. Add new OpCode for MinMatchLengthFromSp with the value of // the remaining string length from the current path. The value though // has to be filled in reverse. That implies a second run over bytecode // after generation has finished. 
/* Stop scanning once fewer characters remain than the minimum match length of the pattern. */ if (match_length_minimum && match_length_minimum > view_length - view_index) break; /* Reset the per-attempt state before each run of the bytecode. */ input.column = match_count; input.match_index = match_count; state.string_position = view_index; state.string_position_in_code_units = view_index; state.instruction_position = 0; state.repetition_marks.clear(); auto success = execute(input, state, operations); if (success) { succeeded = true; /* Discard matches that end at the end of the view (MatchNotEndOfLine) or that start at position 0 (MatchNotBeginOfLine). */ if (input.regex_options.has_flag_set(AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) { if (!continue_search) break; continue; } if (input.regex_options.has_flag_set(AllFlags::MatchNotBeginOfLine) && view_index == 0) { if (!continue_search) break; continue; } dbgln_if(REGEX_DEBUG, "state.string_position={}, view_index={}", state.string_position, view_index); dbgln_if(REGEX_DEBUG, "[match] Found a match (length={}): '{}'", state.string_position - view_index, input.view.substring_view(view_index, state.string_position - view_index)); ++match_count; /* Record the match and resume right after it: together with the loop increment, a non-empty match resumes at its end position and an empty match resumes one position further. */ if (continue_search) { append_match(input, state, view_index); bool has_zero_length = state.string_position == view_index; view_index = state.string_position - (has_zero_length ? 
0 : 1); if (single_match_only) break; continue; } if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) { append_match(input, state, view_index); break; } /* Neither continuing nor stateful: a match that stops before the end of the view makes the whole call return a result with no matches. */ if (state.string_position < view_length) { return { false, 0, {}, {}, {}, operations }; } append_match(input, state, view_index); break; } if (!continue_search || only_start_of_line) break; } ++input.line; input.global_offset += view.length() + 1; // +1 includes the line break character /* Stateful patterns remember the position reached in this view. */ if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) m_pattern->start_offset = state.string_position; if (succeeded && !continue_search) break; } /* Assemble the result from the collected state, then normalize the capture group lists. */ RegexResult result { match_count != 0, match_count, move(state.matches).release(), move(state.capture_group_matches).release(), operations, m_pattern->parser_result.capture_groups_count, m_pattern->parser_result.named_capture_groups_count, }; if (match_count) { // Make sure there are as many capture matches as there are actual matches. if (result.capture_group_matches.size() < match_count) result.capture_group_matches.resize(match_count); for (auto& matches : result.capture_group_matches) matches.resize(m_pattern->parser_result.capture_groups_count + 1); if (!input.regex_options.has_flag_set(AllFlags::SkipTrimEmptyMatches)) { for (auto& matches : result.capture_group_matches) matches.remove_all_matching([](auto& match) { return match.view.is_null(); }); } } else { result.capture_group_matches.clear_with_capacity(); } return result; } /* Doubly linked list whose nodes are obtained from m_allocator. execute() uses it as a stack of saved states. take_last() only unlinks a node; this class never hands a node back to the allocator. */ template class BumpAllocatedLinkedList { public: BumpAllocatedLinkedList() = default; ALWAYS_INLINE void append(T value) { auto node_ptr = m_allocator.allocate(move(value)); VERIFY(node_ptr); if (!m_first) { m_first = node_ptr; m_last = node_ptr; return; } node_ptr->previous = m_last; m_last->next = node_ptr; m_last = node_ptr; } ALWAYS_INLINE T take_last() { VERIFY(m_last); T value = move(m_last->value); if (m_last == m_first) { m_last = nullptr; m_first = nullptr; } else { m_last = m_last->previous; m_last->next = nullptr; } 
return value; } /* Unlike take_last(), last() does not verify that the list is non-empty. */ ALWAYS_INLINE T& last() { return m_last->value; } ALWAYS_INLINE bool is_empty() const { return m_first == nullptr; } auto reverse_begin() { return ReverseIterator(m_last); } auto reverse_end() { return ReverseIterator(); } private: struct Node { T value; Node* next { nullptr }; Node* previous { nullptr }; }; /* Iterates from the last node towards the first; an iterator holding a null node is the end marker. NOTE(review): m_node has no default member initializer, so the end marker relies on ReverseIterator() being value-initialized. */ struct ReverseIterator { ReverseIterator() = default; explicit ReverseIterator(Node* node) : m_node(node) { } T* operator->() { return &m_node->value; } T& operator*() { return m_node->value; } bool operator==(ReverseIterator const& it) const { return m_node == it.m_node; } ReverseIterator& operator++() { if (m_node) m_node = m_node->previous; return *this; } private: Node* m_node; }; UniformBumpAllocator m_allocator; Node* m_first { nullptr }; Node* m_last { nullptr }; }; /* Runs the pattern starting from the given state. Returns true when an opcode reports Succeeded, and false when execution fails with no saved state left to resume from. Outside the substring fast path, operations is incremented once per executed opcode. */ template bool Matcher::execute(MatchInput const& input, MatchState& state, size_t& operations) const { if (m_pattern->parser_result.optimization_data.pure_substring_search.has_value() && input.view.is_string_view()) { // Yay, we can do a simple substring search! 
/* Compare the needle against the input at the current string position only. */ auto& needle = m_pattern->parser_result.optimization_data.pure_substring_search.value(); if (needle.length() + state.string_position > input.view.length()) return false; auto haystack = input.view.string_view().substring_view(state.string_position); if (input.regex_options.has_flag_set(AllFlags::Insensitive)) { if (!haystack.substring_view(0, needle.length()).equals_ignoring_ascii_case(needle)) return false; } else { if (!haystack.starts_with(needle)) return false; } state.string_position += needle.length(); state.string_position_in_code_units += needle.length(); return true; } /* Saved states to resume from when the current path fails; the most recently saved one is taken first. */ BumpAllocatedLinkedList states_to_try_next; #if REGEX_DEBUG size_t recursion_level = 0; #endif auto& bytecode = m_pattern->parser_result.bytecode; for (;;) { auto& opcode = bytecode.get_opcode(state); ++operations; #if REGEX_DEBUG s_regex_dbg.print_opcode("VM", opcode, state, recursion_level, false); #endif ExecutionResult result; /* A pending fail_counter forces a failure result instead of executing the opcode. */ if (input.fail_counter > 0) { --input.fail_counter; result = ExecutionResult::Failed_ExecuteLowPrioForks; } else { result = opcode.execute(input, state); } #if REGEX_DEBUG s_regex_dbg.print_result(opcode, bytecode, input, state, result); #endif /* Advance past the opcode first; the fork cases below subtract opcode.size() again to get the position of the forking instruction. */ state.instruction_position += opcode.size(); switch (result) { /* Low priority fork: save a state that resumes at the fork target and keep executing the current path. If a fork to replace was requested, the most recently saved state created by that fork is overwritten instead of saving a new one. */ case ExecutionResult::Fork_PrioLow: { bool found = false; if (input.fork_to_replace.has_value()) { for (auto it = states_to_try_next.reverse_begin(); it != states_to_try_next.reverse_end(); ++it) { if (it->initiating_fork == input.fork_to_replace.value()) { (*it) = state; it->instruction_position = state.fork_at_position; it->initiating_fork = *input.fork_to_replace; found = true; break; } } input.fork_to_replace.clear(); } if (!found) { states_to_try_next.append(state); states_to_try_next.last().initiating_fork = state.instruction_position - opcode.size(); states_to_try_next.last().instruction_position = state.fork_at_position; } continue; } /* High priority fork: save the current path for later and jump to the fork target right away. */ case ExecutionResult::Fork_PrioHigh: { bool found = false; if (input.fork_to_replace.has_value()) { for (auto it = 
states_to_try_next.reverse_begin(); it != states_to_try_next.reverse_end(); ++it) { if (it->initiating_fork == input.fork_to_replace.value()) { (*it) = state; it->initiating_fork = *input.fork_to_replace; found = true; break; } } input.fork_to_replace.clear(); } if (!found) { states_to_try_next.append(state); states_to_try_next.last().initiating_fork = state.instruction_position - opcode.size(); } state.instruction_position = state.fork_at_position; #if REGEX_DEBUG ++recursion_level; #endif continue; } case ExecutionResult::Continue: continue; case ExecutionResult::Succeeded: return true; /* On failure, resume from the most recently saved state, or give up when none is left. */ case ExecutionResult::Failed: if (!states_to_try_next.is_empty()) { state = states_to_try_next.take_last(); continue; } return false; case ExecutionResult::Failed_ExecuteLowPrioForks: { if (states_to_try_next.is_empty()) { return false; } state = states_to_try_next.take_last(); #if REGEX_DEBUG ++recursion_level; #endif continue; } } } VERIFY_NOT_REACHED(); } /* Explicit instantiations of Matcher and Regex. */ template class Matcher; template class Regex; template class Matcher; template class Regex; template class Matcher; template class Regex; } /* Hash support for CacheKey: combines the hash of the pattern with the hash of the raw option value. */ template struct AK::Traits> : public AK::DefaultTraits> { static unsigned hash(regex::CacheKey const& key) { return pair_int_hash(key.pattern.hash(), int_hash(to_underlying(key.options.value()))); } };