LibRegex: Add a regular expression library

This commit is a mix of several commits, squashed into one because the
commits before 'Move regex to own Library and fix all the broken stuff'
were not fixable in any elegant way.
The commits are listed below for "historical" purposes:

- AK: Add options/flags and Errors for regular expressions

Flags can be provided for any possible flavour by adding a new scoped enum.
Handling of flags is done by templated Options class and the overloaded
'|' and '&' operators.

- AK: Add Lexer for regular expressions

The lexer parses the input and extracts tokens needed to parse a regular
expression.

- AK: Add regex Parser and PosixExtendedParser

This patchset adds a abstract parser class that can be derived to implement
different parsers. A parser produces bytecode to be executed within the
regex matcher.

- AK: Add regex matcher

This patchset adds an regex matcher based on the principles of the T-REX VM.
The bytecode pruduced by the respective Parser is put into the matcher and
the VM will recursively execute the bytecode according to the available OpCodes.
Possible improvement: the recursion could be replaced by multi threading capabilities.

To match a Regular expression, e.g. for the Posix standard regular expression matcher
use the following API:

```
Pattern<PosixExtendedParser> pattern("^.*$");
auto result = pattern.match("Well, hello friends!\nHello World!"); // Match whole needle

EXPECT(result.count == 1);
EXPECT(result.matches.at(0).view.starts_with("Well"));
EXPECT(result.matches.at(0).view.end() == "!");

result = pattern.match("Well, hello friends!\nHello World!", PosixFlags::Multiline); // Match line by line

EXPECT(result.count == 2);
EXPECT(result.matches.at(0).view == "Well, hello friends!");
EXPECT(result.matches.at(1).view == "Hello World!");

EXPECT(pattern.has_match("Well,....")); // Just check if match without a result, which saves some resources.
```

- AK: Rework regex to work with opcodes objects

This patchsets reworks the matcher to work on a more structured base.
For that an abstract OpCode class and derived classes for the specific
OpCodes have been added. The respective opcode logic is contained in
each respective execute() method.

- AK: Add benchmark for regex

- AK: Some optimization in regex for runtime and memory

- LibRegex: Move regex to own Library and fix all the broken stuff

Now regex works again and grep utility is also in place for testing.
This commit also fixes the use of regex.h in C by making `regex_t`
an opaque (-ish) type, which makes its behaviour consistent between
C and C++ compilers.
Previously, <regex.h> would've blown C compilers up, and even if it
didn't, would've caused a leak in C code, and not in C++ code (due to
the existence of `OwnPtr` inside the struct).

To make this whole ordeal easier to deal with (for now), this pulls the
definitions of `reg*()` into LibRegex.

pros:
- The circular dependency between LibC and LibRegex is broken
- Eaiser to test (without accidentally pulling in the host's libc!)

cons:
- Using any of the regex.h functions will require the user to link -lregex
- The symbols will be missing from libc, which will be a big surprise
  down the line (especially with shared libs).

Co-Authored-By: Ali Mohammad Pur <ali.mpfard@gmail.com>
This commit is contained in:
Emanuel Sprung 2020-04-26 14:45:10 +02:00 committed by Andreas Kling
parent 4eb23abf06
commit 55450055d8
Notes: sideshowbarker 2024-07-19 01:15:19 +09:00
26 changed files with 7424 additions and 6 deletions

View file

@ -84,6 +84,7 @@ add_subdirectory(Meta/Lagom)
add_subdirectory(DevTools/IPCCompiler)
add_subdirectory(Libraries/LibWeb/CodeGenerators)
add_subdirectory(AK/Tests)
add_subdirectory(Libraries/LibRegex/Tests)
set(write_if_different ${CMAKE_SOURCE_DIR}/Meta/write-only-on-difference.sh)

View file

@ -1,15 +1,17 @@
add_subdirectory(LibAudio)
add_subdirectory(LibC)
add_subdirectory(LibChess)
add_subdirectory(LibCompress)
add_subdirectory(LibCore)
add_subdirectory(LibCpp)
add_subdirectory(LibCrypt)
add_subdirectory(LibCrypto)
add_subdirectory(LibCompress)
add_subdirectory(LibDebug)
add_subdirectory(LibDesktop)
add_subdirectory(LibDiff)
add_subdirectory(LibGUI)
add_subdirectory(LibGemini)
add_subdirectory(LibGfx)
add_subdirectory(LibGUI)
add_subdirectory(LibHTTP)
add_subdirectory(LibIPC)
add_subdirectory(LibImageDecoderClient)
@ -21,12 +23,11 @@ add_subdirectory(LibMarkdown)
add_subdirectory(LibPCIDB)
add_subdirectory(LibProtocol)
add_subdirectory(LibPthread)
add_subdirectory(LibRegex)
add_subdirectory(LibTLS)
add_subdirectory(LibTar)
add_subdirectory(LibTextCodec)
add_subdirectory(LibThread)
add_subdirectory(LibTLS)
add_subdirectory(LibVT)
add_subdirectory(LibWeb)
add_subdirectory(LibX86)
add_subdirectory(LibDiff)
add_subdirectory(LibCpp)

122
Libraries/LibC/regex.h Normal file
View file

@ -0,0 +1,122 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <stddef.h>
#include <sys/types.h>
__BEGIN_DECLS
typedef ssize_t regoff_t;
struct regex_t {
void* __data;
};
enum __Regex_Error {
__Regex_NoError,
__Regex_InvalidPattern, // Invalid regular expression.
__Regex_InvalidCollationElement, // Invalid collating element referenced.
__Regex_InvalidCharacterClass, // Invalid character class type referenced.
__Regex_InvalidTrailingEscape, // Trailing \ in pattern.
__Regex_InvalidNumber, // Number in \digit invalid or in error.
__Regex_MismatchingBracket, // [ ] imbalance.
__Regex_MismatchingParen, // ( ) imbalance.
__Regex_MismatchingBrace, // { } imbalance.
__Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
__Regex_InvalidBracketContent, // Content of [] invalid.
__Regex_InvalidRange, // Invalid endpoint in range expression.
__Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression.
__Regex_ReachedMaxRecursion, // MaximumRecursion has been reached.
__Regex_EmptySubExpression, // Sub expression has empty content.
__Regex_InvalidCaptureGroup, // Content of capture group is invalid.
__Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid.
};
enum ReError {
REG_NOERR = __Regex_NoError,
REG_BADPAT = __Regex_InvalidPattern, // Invalid regular expression.
REG_ECOLLATE = __Regex_InvalidCollationElement, // Invalid collating element referenced.
REG_ECTYPE = __Regex_InvalidCharacterClass, // Invalid character class type referenced.
REG_EESCAPE = __Regex_InvalidTrailingEscape, // Trailing \ in pattern.
REG_ESUBREG = __Regex_InvalidNumber, // Number in \digit invalid or in error.
REG_EBRACK = __Regex_MismatchingBracket, // [ ] imbalance.
REG_EPAREN = __Regex_MismatchingParen, // \( \) or ( ) imbalance.
REG_EBRACE = __Regex_MismatchingBrace, // \{ \} imbalance.
REG_BADBR = __Regex_InvalidBraceContent, // Content of \{ \} invalid: not a number, number too large, more than two numbers, first larger than second.
REG_ERANGE = __Regex_InvalidRange, // Invalid endpoint in range expression.
REG_BADRPT = __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression.
REG_EMPTY_EXPR = __Regex_EmptySubExpression, // Empty expression
REG_ENOSYS, // The implementation does not support the function.
REG_ESPACE, // Out of memory.
REG_NOMATCH, // regexec() failed to match.
};
struct regmatch_t {
regoff_t rm_so; // byte offset from start of string to start of substring
regoff_t rm_eo; // byte offset from start of string of the first character after the end of substring
regoff_t rm_cnt; // number of matches
};
enum __RegexAllFlags {
__Regex_Global = 1, // All matches (don't return after first match)
__Regex_Insensitive = __Regex_Global << 1, // Case insensitive match (ignores case of [a-zA-Z])
__Regex_Ungreedy = __Regex_Global << 2, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
__Regex_Unicode = __Regex_Global << 3, // Enable all unicode features and interpret all unicode escape sequences as such
__Regex_Extended = __Regex_Global << 4, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
__Regex_Extra = __Regex_Global << 5, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
__Regex_MatchNotBeginOfLine = __Regex_Global << 6, // Pattern is not forced to ^ -> search in whole string!
__Regex_MatchNotEndOfLine = __Regex_Global << 7, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
__Regex_SkipSubExprResults = __Regex_Global << 8, // Do not return sub expressions in the result
__Regex_StringCopyMatches = __Regex_Global << 9, // Do explicitly copy results into new allocated string instead of StringView to original string.
__Regex_SingleLine = __Regex_Global << 10, // Dot matches newline characters
__Regex_Sticky = __Regex_Global << 11, // Force the pattern to only match consecutive matches from where the previous match ended.
__Regex_Multiline = __Regex_Global << 12, // Handle newline characters. Match each line, one by one.
__Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results.
__Regex_Last = __Regex_SkipTrimEmptyMatches
};
// Values for the cflags parameter to the regcomp() function:
#define REG_EXTENDED __Regex_Extended // Use Extended Regular Expressions.
#define REG_ICASE __Regex_Insensitive // Ignore case in match.
#define REG_NOSUB __Regex_SkipSubExprResults // Report only success or fail in regexec().
#define REG_GLOBAL __Regex_Global // Don't stop searching for more match
#define REG_NEWLINE (__Regex_Multiline | REG_GLOBAL) // Change the handling of newline.
// Values for the eflags parameter to the regexec() function:
#define REG_NOTBOL __Regex_MatchNotBeginOfLine // The circumflex character (^), when taken as a special character, will not match the beginning of string.
#define REG_NOTEOL __Regex_MatchNotEndOfLine // The dollar sign ($), when taken as a special character, will not match the end of string.
//static_assert (sizeof(FlagsUnderlyingType) * 8 >= regex::POSIXFlags::Last << 1), "flags type too small")
#define REG_SEARCH __Regex_Last << 1
int regcomp(regex_t*, const char*, int);
int regexec(const regex_t*, const char*, size_t, regmatch_t[], int);
size_t regerror(int, const regex_t*, char*, size_t);
void regfree(regex_t*);
__END_DECLS

View file

@ -0,0 +1,256 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <LibRegex/Regex.h>
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#ifdef __serenity__
# include <regex.h>
#else
# include <LibC/regex.h>
#endif
struct internal_regex_t {
u8 cflags;
u8 eflags;
OwnPtr<Regex<PosixExtended>> re;
size_t re_pat_errpos;
ReError re_pat_err;
String re_pat;
size_t re_nsub;
};
static internal_regex_t* impl_from(regex_t* re)
{
if (!re)
return nullptr;
return reinterpret_cast<internal_regex_t*>(re->__data);
}
static const internal_regex_t* impl_from(const regex_t* re)
{
return impl_from(const_cast<regex_t*>(re));
}
extern "C" {
int regcomp(regex_t* reg, const char* pattern, int cflags)
{
if (!reg)
return REG_ESPACE;
// Note that subsequent uses of regcomp() without regfree() _will_ leak memory
// This could've been prevented if libc provided a reginit() or similar, but it does not.
reg->__data = new internal_regex_t { 0, 0, nullptr, 0, ReError::REG_NOERR, {}, 0 };
auto preg = impl_from(reg);
if (!(cflags & REG_EXTENDED))
return REG_ENOSYS;
preg->cflags = cflags;
String pattern_str(pattern);
preg->re = make<Regex<PosixExtended>>(pattern_str, PosixOptions {} | (PosixFlags)cflags);
auto parser_result = preg->re->parser_result;
if (parser_result.error != regex::Error::NoError) {
preg->re_pat_errpos = parser_result.error_token.position();
preg->re_pat_err = (ReError)parser_result.error;
preg->re_pat = pattern;
dbg() << "Have Error: " << (ReError)parser_result.error;
return (ReError)parser_result.error;
}
preg->re_nsub = parser_result.capture_groups_count;
return REG_NOERR;
}
int regexec(const regex_t* reg, const char* string, size_t nmatch, regmatch_t pmatch[], int eflags)
{
auto preg = impl_from(reg);
if (!preg->re || preg->re_pat_err) {
if (preg->re_pat_err)
return preg->re_pat_err;
return REG_BADPAT;
}
RegexResult result;
if (eflags & REG_SEARCH)
result = preg->re->search({ string }, PosixOptions {} | (PosixFlags)eflags);
else
result = preg->re->match({ string }, PosixOptions {} | (PosixFlags)eflags);
if (result.success) {
auto size = result.matches.size();
if (size && nmatch && pmatch) {
pmatch[0].rm_cnt = size;
size_t match_index { 0 };
for (size_t i = 0; i < size; ++i) {
pmatch[match_index].rm_so = result.matches.at(i).global_offset;
pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.matches.at(i).view.length();
if (match_index > 0)
pmatch[match_index].rm_cnt = result.capture_group_matches.size();
++match_index;
if (match_index >= nmatch)
return REG_NOERR;
if (i < result.capture_group_matches.size()) {
auto capture_groups_size = result.capture_group_matches.at(i).size();
for (size_t j = 0; j < preg->re->parser_result.capture_groups_count; ++j) {
if (j >= capture_groups_size || !result.capture_group_matches.at(i).at(j).view.length()) {
pmatch[match_index].rm_so = -1;
pmatch[match_index].rm_eo = -1;
pmatch[match_index].rm_cnt = 0;
} else {
pmatch[match_index].rm_so = result.capture_group_matches.at(i).at(j).global_offset;
pmatch[match_index].rm_eo = pmatch[match_index].rm_so + result.capture_group_matches.at(i).at(j).view.length();
pmatch[match_index].rm_cnt = 1;
}
++match_index;
if (match_index >= nmatch)
return REG_NOERR;
}
}
}
if (match_index < nmatch) {
for (size_t i = match_index; i < nmatch; ++i) {
pmatch[i].rm_so = -1;
pmatch[i].rm_eo = -1;
pmatch[i].rm_cnt = 0;
}
}
}
return REG_NOERR;
} else {
if (nmatch && pmatch) {
pmatch[0].rm_so = -1;
pmatch[0].rm_eo = -1;
pmatch[0].rm_cnt = 0;
}
}
return REG_NOMATCH;
}
inline static String get_error(ReError errcode)
{
String error;
switch ((ReError)errcode) {
case REG_NOERR:
error = "No error";
break;
case REG_NOMATCH:
error = "regexec() failed to match.";
break;
case REG_BADPAT:
error = "Invalid regular expression.";
break;
case REG_ECOLLATE:
error = "Invalid collating element referenced.";
break;
case REG_ECTYPE:
error = "Invalid character class type referenced.";
break;
case REG_EESCAPE:
error = "Trailing \\ in pattern.";
break;
case REG_ESUBREG:
error = "Number in \\digit invalid or in error.";
break;
case REG_EBRACK:
error = "[ ] imbalance.";
break;
case REG_EPAREN:
error = "\\( \\) or ( ) imbalance.";
break;
case REG_EBRACE:
error = "\\{ \\} imbalance.";
break;
case REG_BADBR:
error = "Content of \\{ \\} invalid: not a number, number too large, more than two numbers, first larger than second.";
break;
case REG_ERANGE:
error = "Invalid endpoint in range expression.";
break;
case REG_ESPACE:
error = "Out of memory.";
break;
case REG_BADRPT:
error = "?, * or + not preceded by valid regular expression.";
break;
case REG_ENOSYS:
error = "The implementation does not support the function.";
break;
case REG_EMPTY_EXPR:
error = "Empty expression provided";
break;
}
return error;
}
size_t regerror(int errcode, const regex_t* reg, char* errbuf, size_t errbuf_size)
{
String error;
auto preg = impl_from(reg);
if (!preg)
error = get_error((ReError)errcode);
else
error = preg->re->error_string(get_error(preg->re_pat_err));
if (!errbuf_size)
return error.length();
if (!error.copy_characters_to_buffer(errbuf, errbuf_size))
return 0;
return error.length();
}
void regfree(regex_t* reg)
{
auto preg = impl_from(reg);
if (preg) {
delete preg;
reg->__data = nullptr;
}
}
}

View file

@ -0,0 +1,10 @@
set(SOURCES
C/Regex.cpp
RegexByteCode.cpp
RegexLexer.cpp
RegexMatcher.cpp
RegexParser.cpp
)
serenity_lib(LibRegex regex)
target_link_libraries(LibRegex LibC LibCore)

View file

@ -0,0 +1,54 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <AK/Types.h>
namespace regex {
enum class Error : u8;
class Lexer;
class PosixExtendedParser;
class ByteCode;
class OpCode;
class OpCode_Exit;
class OpCode_Jump;
class OpCode_ForkJump;
class OpCode_ForkStay;
class OpCode_CheckBegin;
class OpCode_CheckEnd;
class OpCode_SaveLeftCaptureGroup;
class OpCode_SaveRightCaptureGroup;
class OpCode_SaveLeftNamedCaptureGroup;
class OpCode_SaveNamedLeftCaptureGroup;
class OpCode_SaveRightNamedCaptureGroup;
class OpCode_Compare;
}
using regex::Error;
using regex::Lexer;
using regex::PosixExtendedParser;

View file

@ -0,0 +1,31 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <LibRegex/Forward.h>
#include <LibRegex/RegexDebug.h>
#include <LibRegex/RegexMatcher.h>

View file

@ -0,0 +1,572 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "RegexByteCode.h"
#include "AK/StringBuilder.h"
#include "RegexDebug.h"
#include <ctype.h>
namespace regex {
const char* OpCode::name(OpCodeId opcode_id)
{
switch (opcode_id) {
#define __ENUMERATE_OPCODE(x) \
case OpCodeId::x: \
return #x;
ENUMERATE_OPCODES
#undef __ENUMERATE_OPCODE
default:
ASSERT_NOT_REACHED();
return "<Unknown>";
}
}
const char* OpCode::name() const
{
return name(opcode_id());
}
const char* execution_result_name(ExecutionResult result)
{
switch (result) {
#define __ENUMERATE_EXECUTION_RESULT(x) \
case ExecutionResult::x: \
return #x;
ENUMERATE_EXECUTION_RESULTS
#undef __ENUMERATE_EXECUTION_RESULT
default:
ASSERT_NOT_REACHED();
return "<Unknown>";
}
}
const char* character_compare_type_name(CharacterCompareType ch_compare_type)
{
switch (ch_compare_type) {
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) \
case CharacterCompareType::x: \
return #x;
ENUMERATE_CHARACTER_COMPARE_TYPES
#undef __ENUMERATE_CHARACTER_COMPARE_TYPE
default:
ASSERT_NOT_REACHED();
return "<Unknown>";
}
}
static const char* character_class_name(CharClass ch_class)
{
switch (ch_class) {
#define __ENUMERATE_CHARACTER_CLASS(x) \
case CharClass::x: \
return #x;
ENUMERATE_CHARACTER_CLASSES
#undef __ENUMERATE_CHARACTER_CLASS
default:
ASSERT_NOT_REACHED();
return "<Unknown>";
}
}
HashMap<u32, OwnPtr<OpCode>> ByteCode::s_opcodes {};
ALWAYS_INLINE OpCode* ByteCode::get_opcode_by_id(OpCodeId id) const
{
if (!s_opcodes.size()) {
for (u32 i = (u32)OpCodeId::First; i <= (u32)OpCodeId::Last; ++i) {
switch ((OpCodeId)i) {
case OpCodeId::Exit:
s_opcodes.set(i, make<OpCode_Exit>(*const_cast<ByteCode*>(this)));
break;
case OpCodeId::Jump:
s_opcodes.set(i, make<OpCode_Jump>(*const_cast<ByteCode*>(this)));
break;
case OpCodeId::Compare:
s_opcodes.set(i, make<OpCode_Compare>(*const_cast<ByteCode*>(this)));
break;
case OpCodeId::CheckEnd:
s_opcodes.set(i, make<OpCode_CheckEnd>(*const_cast<ByteCode*>(this)));
break;
case OpCodeId::ForkJump:
s_opcodes.set(i, make<OpCode_ForkJump>(*const_cast<ByteCode*>(this)));
break;
case OpCodeId::ForkStay:
s_opcodes.set(i, make<OpCode_ForkStay>(*const_cast<ByteCode*>(this)));
break;
case OpCodeId::CheckBegin:
s_opcodes.set(i, make<OpCode_CheckBegin>(*const_cast<ByteCode*>(this)));
break;
case OpCodeId::SaveLeftCaptureGroup:
s_opcodes.set(i, make<OpCode_SaveLeftCaptureGroup>(*const_cast<ByteCode*>(this)));
break;
case OpCodeId::SaveRightCaptureGroup:
s_opcodes.set(i, make<OpCode_SaveRightCaptureGroup>(*const_cast<ByteCode*>(this)));
break;
case OpCodeId::SaveLeftNamedCaptureGroup:
s_opcodes.set(i, make<OpCode_SaveLeftNamedCaptureGroup>(*const_cast<ByteCode*>(this)));
break;
case OpCodeId::SaveRightNamedCaptureGroup:
s_opcodes.set(i, make<OpCode_SaveRightNamedCaptureGroup>(*const_cast<ByteCode*>(this)));
break;
}
}
}
if (id > OpCodeId::Last)
return nullptr;
return const_cast<OpCode*>(s_opcodes.get((u32)id).value())->set_bytecode(*const_cast<ByteCode*>(this));
}
OpCode* ByteCode::get_opcode(MatchState& state) const
{
OpCode* op_code;
if (state.instruction_position >= size()) {
op_code = get_opcode_by_id(OpCodeId::Exit);
} else
op_code = get_opcode_by_id((OpCodeId)at(state.instruction_position));
if (op_code)
op_code->set_state(state);
return op_code;
}
ALWAYS_INLINE ExecutionResult OpCode_Exit::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
if (state.string_position > input.view.length() || state.instruction_position >= m_bytecode->size())
return ExecutionResult::Succeeded;
return ExecutionResult::Failed;
}
ALWAYS_INLINE ExecutionResult OpCode_Jump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
{
state.instruction_position += offset();
return ExecutionResult::Continue;
}
ALWAYS_INLINE ExecutionResult OpCode_ForkJump::execute(const MatchInput&, MatchState& state, MatchOutput&) const
{
state.fork_at_position = state.instruction_position + size() + offset();
return ExecutionResult::Fork_PrioHigh;
}
ALWAYS_INLINE ExecutionResult OpCode_ForkStay::execute(const MatchInput&, MatchState& state, MatchOutput&) const
{
state.fork_at_position = state.instruction_position + size() + offset();
return ExecutionResult::Fork_PrioLow;
}
ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
if (0 == state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
return ExecutionResult::Failed;
if ((0 == state.string_position && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
|| (0 != state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
|| (0 == state.string_position && (input.regex_options & AllFlags::Global)))
return ExecutionResult::Continue;
return ExecutionResult::Failed;
}
ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine))
return ExecutionResult::Failed;
if ((state.string_position == input.view.length() && !(input.regex_options & AllFlags::MatchNotEndOfLine))
|| (state.string_position != input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
return ExecutionResult::Succeeded;
return ExecutionResult::Failed;
}
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
if (input.match_index >= output.capture_group_matches.size()) {
output.capture_group_matches.ensure_capacity(input.match_index);
auto capacity = output.capture_group_matches.capacity();
for (size_t i = output.capture_group_matches.size(); i <= capacity; ++i)
output.capture_group_matches.empend();
}
if (id() >= output.capture_group_matches.at(input.match_index).size()) {
output.capture_group_matches.at(input.match_index).ensure_capacity(id());
auto capacity = output.capture_group_matches.at(input.match_index).capacity();
for (size_t i = output.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i)
output.capture_group_matches.at(input.match_index).empend();
}
output.capture_group_matches.at(input.match_index).at(id()).left_column = state.string_position;
return ExecutionResult::Continue;
}
ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
auto& match = output.capture_group_matches.at(input.match_index).at(id());
auto start_position = match.left_column;
auto length = state.string_position - start_position;
if (start_position < match.column)
return ExecutionResult::Continue;
ASSERT(start_position + length <= input.view.length());
auto view = input.view.substring_view(start_position, length);
if (input.regex_options & AllFlags::StringCopyMatches) {
match = { view.to_string(), input.line, start_position, input.global_offset + start_position }; // create a copy of the original string
} else {
match = { view, input.line, start_position, input.global_offset + start_position }; // take view to original string
}
return ExecutionResult::Continue;
}
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
if (input.match_index >= output.named_capture_group_matches.size()) {
output.named_capture_group_matches.ensure_capacity(input.match_index);
auto capacity = output.named_capture_group_matches.capacity();
for (size_t i = output.named_capture_group_matches.size(); i <= capacity; ++i)
output.named_capture_group_matches.empend();
}
output.named_capture_group_matches.at(input.match_index).ensure(name()).column = state.string_position;
return ExecutionResult::Continue;
}
ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(const MatchInput& input, MatchState& state, MatchOutput& output) const
{
StringView capture_group_name = name();
if (output.named_capture_group_matches.at(input.match_index).contains(capture_group_name)) {
auto start_position = output.named_capture_group_matches.at(input.match_index).ensure(capture_group_name).column;
auto length = state.string_position - start_position;
auto& map = output.named_capture_group_matches.at(input.match_index);
#ifdef REGEX_DEBUG
ASSERT(start_position + length < input.view.length());
dbg() << "Save named capture group with name=" << capture_group_name << " and content: " << input.view.substring_view(start_position, length).to_string();
#endif
ASSERT(start_position + length <= input.view.length());
auto view = input.view.substring_view(start_position, length);
if (input.regex_options & AllFlags::StringCopyMatches) {
map.set(capture_group_name, { view.to_string(), input.line, start_position, input.global_offset + start_position }); // create a copy of the original string
} else {
map.set(capture_group_name, { view, input.line, start_position, input.global_offset + start_position }); // take view to original string
}
} else {
fprintf(stderr, "Didn't find corresponding capture group match for name=%s, match_index=%lu\n", capture_group_name.to_string().characters(), input.match_index);
}
return ExecutionResult::Continue;
}
ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(const MatchInput& input, MatchState& state, MatchOutput&) const
{
bool inverse { false };
size_t string_position = state.string_position;
bool inverse_matched { false };
size_t offset { state.instruction_position + 3 };
for (size_t i = 0; i < arguments_count(); ++i) {
if (state.string_position > string_position)
break;
auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
if (compare_type == CharacterCompareType::Inverse)
inverse = true;
else if (compare_type == CharacterCompareType::Char) {
char ch = m_bytecode->at(offset++);
// We want to compare a string that is longer or equal in length to the available string
if (input.view.length() - state.string_position < 1)
return ExecutionResult::Failed_ExecuteLowPrioForks;
compare_char(input, state, ch, inverse, inverse_matched);
} else if (compare_type == CharacterCompareType::AnyChar) {
// We want to compare a string that is definitely longer than the available string
if (input.view.length() - state.string_position < 1)
return ExecutionResult::Failed_ExecuteLowPrioForks;
ASSERT(!inverse);
++state.string_position;
} else if (compare_type == CharacterCompareType::String) {
ASSERT(!inverse);
char* str = reinterpret_cast<char*>(m_bytecode->at(offset++));
auto& length = m_bytecode->at(offset++);
// We want to compare a string that is definitely longer than the available string
if (input.view.length() - state.string_position < length)
return ExecutionResult::Failed_ExecuteLowPrioForks;
if (!compare_string(input, state, str, length))
return ExecutionResult::Failed_ExecuteLowPrioForks;
} else if (compare_type == CharacterCompareType::CharClass) {
if (input.view.length() - state.string_position < 1)
return ExecutionResult::Failed_ExecuteLowPrioForks;
auto character_class = (CharClass)m_bytecode->at(offset++);
auto& ch = input.view[state.string_position];
compare_character_class(input, state, character_class, ch, inverse, inverse_matched);
} else if (compare_type == CharacterCompareType::CharRange) {
auto value = (CharRange)m_bytecode->at(offset++);
auto from = value.from;
auto to = value.to;
auto ch = input.view[state.string_position];
compare_character_range(input, state, from, to, ch, inverse, inverse_matched);
} else {
fprintf(stderr, "Undefined comparison: %i\n", (int)compare_type);
ASSERT_NOT_REACHED();
break;
}
}
if (inverse && !inverse_matched)
++state.string_position;
if (string_position == state.string_position || state.string_position > input.view.length())
return ExecutionResult::Failed_ExecuteLowPrioForks;
return ExecutionResult::Continue;
}
ALWAYS_INLINE void OpCode_Compare::compare_char(const MatchInput& input, MatchState& state, char& ch, bool inverse, bool& inverse_matched) const
{
auto ch1 = ch;
auto ch2 = input.view[state.string_position];
if (input.regex_options & AllFlags::Insensitive) {
ch1 = tolower(ch1);
ch2 = tolower(ch2);
}
if (ch1 == ch2) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
}
ALWAYS_INLINE bool OpCode_Compare::compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length) const
{
auto str_view1 = StringView(str, length);
auto str_view2 = StringView(&input.view[state.string_position], length);
String str1, str2;
if (input.regex_options & AllFlags::Insensitive) {
str1 = str_view1.to_string().to_lowercase();
str2 = str_view2.to_string().to_lowercase();
str_view1 = str1.view();
str_view2 = str2.view();
}
if (str_view1 == str_view2) {
state.string_position += length;
return true;
} else
return false;
}
ALWAYS_INLINE void OpCode_Compare::compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, char ch, bool inverse, bool& inverse_matched) const
{
switch (character_class) {
case CharClass::Alnum:
if (isalnum(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Alpha:
if (isalpha(ch))
++state.string_position;
break;
case CharClass::Blank:
if (ch == ' ' || ch == '\t') {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Cntrl:
if (iscntrl(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Digit:
if (isdigit(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Graph:
if (isgraph(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Lower:
if (islower(ch) || ((input.regex_options & AllFlags::Insensitive) && isupper(ch))) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Print:
if (isprint(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Punct:
if (ispunct(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Space:
if (isspace(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Upper:
if (isupper(ch) || ((input.regex_options & AllFlags::Insensitive) && islower(ch))) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
case CharClass::Xdigit:
if (isxdigit(ch)) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
break;
}
}
ALWAYS_INLINE void OpCode_Compare::compare_character_range(const MatchInput& input, MatchState& state, char from, char to, char ch, bool inverse, bool& inverse_matched) const
{
if (input.regex_options & AllFlags::Insensitive) {
from = tolower(from);
to = tolower(to);
ch = tolower(ch);
}
if (ch >= from && ch <= to) {
if (inverse)
inverse_matched = true;
else
++state.string_position;
}
}
const String OpCode_Compare::arguments_string() const
{
return String::format("argc=%lu, args=%lu ", arguments_count(), arguments_size());
}
const Vector<String> OpCode_Compare::variable_arguments_to_string(Optional<MatchInput> input) const
{
Vector<String> result;
size_t offset { state().instruction_position + 3 };
StringView view;
if (input.has_value())
view = input.value().view;
for (size_t i = 0; i < arguments_count(); ++i) {
auto compare_type = (CharacterCompareType)m_bytecode->at(offset++);
result.empend(String::format("type=%lu [%s]", (size_t)compare_type, character_compare_type_name(compare_type)));
if (compare_type == CharacterCompareType::Char) {
char ch = m_bytecode->at(offset++);
result.empend(String::format("value='%c'", ch));
if (!view.is_null())
result.empend(String::format("compare against: '%s'", String { view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1) }.characters()));
} else if (compare_type == CharacterCompareType::String) {
char* str = reinterpret_cast<char*>(m_bytecode->at(offset++));
auto& length = m_bytecode->at(offset++);
result.empend(String::format("value=\"%s\"", String { str, length }.characters()));
if (!view.is_null())
result.empend(String::format("compare against: \"%s\"", String { input.value().view.substring_view(state().string_position, state().string_position + length > view.length() ? 0 : length) }.characters()));
} else if (compare_type == CharacterCompareType::CharClass) {
auto character_class = (CharClass)m_bytecode->at(offset++);
result.empend(String::format("ch_class=%lu [%s]", (size_t)character_class, character_class_name(character_class)));
if (!view.is_null())
result.empend(String::format("compare against: '%s'", String { input.value().view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1) }.characters()));
} else if (compare_type == CharacterCompareType::CharRange) {
auto value = (CharRange)m_bytecode->at(offset++);
result.empend(String::format("ch_range='%c'-'%c'", value.from, value.to));
if (!view.is_null())
result.empend(String::format("compare against: '%s'", String { input.value().view.substring_view(state().string_position, state().string_position + 1 > view.length() ? 0 : 1) }.characters()));
}
}
return result;
}
}

View file

@ -0,0 +1,637 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "RegexMatch.h"
#include "RegexOptions.h"
#include <AK/Forward.h>
#include <AK/HashMap.h>
#include <AK/NonnullOwnPtr.h>
#include <AK/OwnPtr.h>
#include <AK/Traits.h>
#include <AK/Types.h>
#include <AK/Vector.h>
namespace regex {
using ByteCodeValueType = size_t;
#define ENUMERATE_OPCODES \
__ENUMERATE_OPCODE(Compare) \
__ENUMERATE_OPCODE(Jump) \
__ENUMERATE_OPCODE(ForkJump) \
__ENUMERATE_OPCODE(ForkStay) \
__ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
__ENUMERATE_OPCODE(SaveRightCaptureGroup) \
__ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \
__ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
__ENUMERATE_OPCODE(CheckBegin) \
__ENUMERATE_OPCODE(CheckEnd) \
__ENUMERATE_OPCODE(Exit)
enum class OpCodeId : ByteCodeValueType {
#define __ENUMERATE_OPCODE(x) x,
ENUMERATE_OPCODES
#undef __ENUMERATE_OPCODE
First
= Compare,
Last
= Exit,
};
#define ENUMERATE_CHARACTER_COMPARE_TYPES \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Undefined) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Inverse) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(AnyChar) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(Char) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(String) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharClass) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(CharRange) \
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
enum class CharacterCompareType : ByteCodeValueType {
#define __ENUMERATE_CHARACTER_COMPARE_TYPE(x) x,
ENUMERATE_CHARACTER_COMPARE_TYPES
#undef __ENUMERATE_CHARACTER_COMPARE_TYPE
};
#define ENUMERATE_CHARACTER_CLASSES \
__ENUMERATE_CHARACTER_CLASS(Alnum) \
__ENUMERATE_CHARACTER_CLASS(Cntrl) \
__ENUMERATE_CHARACTER_CLASS(Lower) \
__ENUMERATE_CHARACTER_CLASS(Space) \
__ENUMERATE_CHARACTER_CLASS(Alpha) \
__ENUMERATE_CHARACTER_CLASS(Digit) \
__ENUMERATE_CHARACTER_CLASS(Print) \
__ENUMERATE_CHARACTER_CLASS(Upper) \
__ENUMERATE_CHARACTER_CLASS(Blank) \
__ENUMERATE_CHARACTER_CLASS(Graph) \
__ENUMERATE_CHARACTER_CLASS(Punct) \
__ENUMERATE_CHARACTER_CLASS(Xdigit)
enum class CharClass : ByteCodeValueType {
#define __ENUMERATE_CHARACTER_CLASS(x) x,
ENUMERATE_CHARACTER_CLASSES
#undef __ENUMERATE_CHARACTER_CLASS
};
struct CharRange {
const char from;
const char to;
CharRange(size_t value)
: from(value >> 8)
, to(value & 0xFF)
{
}
CharRange(char from, char to)
: from(from)
, to(to)
{
}
operator ByteCodeValueType() const { return (from << 8) | to; }
};
struct CompareTypeAndValuePair {
CharacterCompareType type;
ByteCodeValueType value;
};
class OpCode;
class ByteCode : public Vector<ByteCodeValueType> {
public:
ByteCode() = default;
virtual ~ByteCode() = default;
void insert_bytecode_compare_values(Vector<CompareTypeAndValuePair>&& pairs)
{
ByteCode bytecode;
bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
bytecode.empend(pairs.size()); // number of arguments
ByteCode arguments;
for (auto& value : pairs) {
ASSERT(value.type != CharacterCompareType::RangeExpressionDummy);
ASSERT(value.type != CharacterCompareType::Undefined);
ASSERT(value.type != CharacterCompareType::String);
arguments.append((ByteCodeValueType)value.type);
if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnyChar)
arguments.append(move(value.value));
}
bytecode.empend(arguments.size()); // size of arguments
bytecode.append(move(arguments));
append(move(bytecode));
}
void insert_bytecode_compare_string(StringView view, size_t length)
{
ByteCode bytecode;
bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Compare));
bytecode.empend(1); // number of arguments
ByteCode arguments;
arguments.empend(static_cast<ByteCodeValueType>(CharacterCompareType::String));
arguments.empend(reinterpret_cast<ByteCodeValueType>(view.characters_without_null_termination()));
arguments.empend(length);
bytecode.empend(arguments.size()); // size of arguments
bytecode.append(move(arguments));
append(move(bytecode));
}
void insert_bytecode_group_capture_left(size_t capture_groups_count)
{
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftCaptureGroup));
empend(capture_groups_count);
}
void insert_bytecode_group_capture_left(const StringView& name)
{
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveLeftNamedCaptureGroup));
empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
empend(name.length());
}
void insert_bytecode_group_capture_right(size_t capture_groups_count)
{
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightCaptureGroup));
empend(capture_groups_count);
}
void insert_bytecode_group_capture_right(const StringView& name)
{
empend(static_cast<ByteCodeValueType>(OpCodeId::SaveRightNamedCaptureGroup));
empend(reinterpret_cast<ByteCodeValueType>(name.characters_without_null_termination()));
empend(name.length());
}
void insert_bytecode_alternation(ByteCode&& left, ByteCode&& right)
{
// FORKSTAY _ALT
// REGEXP ALT1
// JUMP _END
// LABEL _ALT
// REGEXP ALT2
// LABEL _END
ByteCode byte_code;
empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
empend(left.size() + 2); // Jump to the _ALT label
for (auto& op : left)
append(move(op));
empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
empend(right.size()); // Jump to the _END label
// LABEL _ALT = bytecode.size() + 2
for (auto& op : right)
append(move(op));
// LABEL _END = alterantive_bytecode.size
}
void insert_bytecode_repetition_min_max(ByteCode& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum)
{
ByteCode new_bytecode;
new_bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
if (maximum.has_value()) {
if (maximum.value() > minimum) {
auto diff = maximum.value() - minimum;
new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
new_bytecode.empend(diff * (bytecode_to_repeat.size() + 2)); // Jump to the _END label
for (size_t i = 0; i < diff; ++i) {
new_bytecode.append(bytecode_to_repeat);
new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
new_bytecode.empend((diff - i - 1) * (bytecode_to_repeat.size() + 2)); // Jump to the _END label
}
}
} else {
// no maximum value set, repeat finding if possible
new_bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
new_bytecode.empend(-bytecode_to_repeat.size() - 2); // Jump to the last iteration
}
bytecode_to_repeat = move(new_bytecode);
}
void insert_bytecode_repetition_n(ByteCode& bytecode_to_repeat, size_t n)
{
for (size_t i = 0; i < n; ++i)
append(bytecode_to_repeat);
}
void insert_bytecode_repetition_min_one(ByteCode& bytecode_to_repeat, bool greedy)
{
// LABEL _START = -bytecode_to_repeat.size()
// REGEXP
// FORKJUMP _START (FORKSTAY -> Greedy)
if (greedy)
bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
else
bytecode_to_repeat.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
bytecode_to_repeat.empend(-(bytecode_to_repeat.size() + 1)); // Jump to the _START label
}
void insert_bytecode_repetition_any(ByteCode& bytecode_to_repeat, bool greedy)
{
// LABEL _START
// FORKSTAY _END (FORKJUMP -> Greedy)
// REGEXP
// JUMP _START
// LABEL _END
// LABEL _START = m_bytes.size();
ByteCode bytecode;
if (greedy)
bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
else
bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
bytecode.empend(bytecode_to_repeat.size() + 2); // Jump to the _END label
for (auto& op : bytecode_to_repeat)
bytecode.append(move(op));
bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::Jump));
bytecode.empend(-bytecode.size() - 1); // Jump to the _START label
// LABEL _END = bytecode.size()
bytecode_to_repeat = move(bytecode);
}
void insert_bytecode_repetition_zero_or_one(ByteCode& bytecode_to_repeat, bool greedy)
{
// FORKSTAY _END (FORKJUMP -> Greedy)
// REGEXP
// LABEL _END
ByteCode bytecode;
if (greedy)
bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkJump));
else
bytecode.empend(static_cast<ByteCodeValueType>(OpCodeId::ForkStay));
bytecode.empend(bytecode_to_repeat.size()); // Jump to the _END label
for (auto& op : bytecode_to_repeat)
bytecode.append(move(op));
// LABEL _END = bytecode.size()
bytecode_to_repeat = move(bytecode);
}
OpCode* get_opcode(MatchState& state) const;
private:
ALWAYS_INLINE OpCode* get_opcode_by_id(OpCodeId id) const;
static HashMap<u32, OwnPtr<OpCode>> s_opcodes;
};
#define ENUMERATE_EXECUTION_RESULTS \
__ENUMERATE_EXECUTION_RESULT(Continue) \
__ENUMERATE_EXECUTION_RESULT(Fork_PrioHigh) \
__ENUMERATE_EXECUTION_RESULT(Fork_PrioLow) \
__ENUMERATE_EXECUTION_RESULT(Failed) \
__ENUMERATE_EXECUTION_RESULT(Failed_ExecuteLowPrioForks) \
__ENUMERATE_EXECUTION_RESULT(Succeeded)
enum class ExecutionResult : u8 {
#define __ENUMERATE_EXECUTION_RESULT(x) x,
ENUMERATE_EXECUTION_RESULTS
#undef __ENUMERATE_EXECUTION_RESULT
};
const char* execution_result_name(ExecutionResult result);
const char* opcode_id_name(OpCodeId opcode_id);
const char* character_compare_type_name(CharacterCompareType result);
const char* execution_result_name(ExecutionResult result);
class OpCode {
public:
OpCode(ByteCode& bytecode)
: m_bytecode(&bytecode)
{
}
virtual ~OpCode() = default;
virtual OpCodeId opcode_id() const = 0;
virtual size_t size() const = 0;
virtual ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const = 0;
ALWAYS_INLINE ByteCodeValueType argument(size_t offset) const
{
ASSERT(state().instruction_position + offset <= m_bytecode->size());
return m_bytecode->at(state().instruction_position + 1 + offset);
}
ALWAYS_INLINE const char* name() const;
static const char* name(const OpCodeId);
ALWAYS_INLINE OpCode* set_state(MatchState& state)
{
m_state = &state;
return this;
}
ALWAYS_INLINE OpCode* set_bytecode(ByteCode& bytecode)
{
m_bytecode = &bytecode;
return this;
}
ALWAYS_INLINE void reset_state() { m_state.clear(); }
ALWAYS_INLINE const MatchState& state() const
{
ASSERT(m_state.has_value());
return *m_state.value();
}
const String to_string() const
{
return String::format("[0x%02X] %s", opcode_id(), name(opcode_id()));
}
virtual const String arguments_string() const = 0;
ALWAYS_INLINE const ByteCode& bytecode() const { return *m_bytecode; }
protected:
ByteCode* m_bytecode;
Optional<MatchState*> m_state;
};
class OpCode_Exit final : public OpCode {
public:
OpCode_Exit(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Exit; }
ALWAYS_INLINE size_t size() const override { return 1; }
const String arguments_string() const override { return ""; }
};
class OpCode_Jump final : public OpCode {
public:
OpCode_Jump(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Jump; }
ALWAYS_INLINE size_t size() const override { return 2; }
ALWAYS_INLINE ssize_t offset() const { return argument(0); }
const String arguments_string() const override
{
return String::format("offset=%i [&%lu]", offset(), state().instruction_position + size() + offset());
}
};
class OpCode_ForkJump final : public OpCode {
public:
OpCode_ForkJump(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkJump; }
ALWAYS_INLINE size_t size() const override { return 2; }
ALWAYS_INLINE ssize_t offset() const { return argument(0); }
const String arguments_string() const override
{
return String::format("offset=%i [&%lu], sp: %lu", offset(), state().instruction_position + size() + offset(), state().string_position);
}
};
class OpCode_ForkStay final : public OpCode {
public:
OpCode_ForkStay(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::ForkStay; }
ALWAYS_INLINE size_t size() const override { return 2; }
ALWAYS_INLINE ssize_t offset() const { return argument(0); }
const String arguments_string() const override
{
return String::format("offset=%i [&%lu], sp: %lu", offset(), state().instruction_position + size() + offset(), state().string_position);
}
};
class OpCode_CheckBegin final : public OpCode {
public:
OpCode_CheckBegin(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckBegin; }
ALWAYS_INLINE size_t size() const override { return 1; }
const String arguments_string() const override { return ""; }
};
class OpCode_CheckEnd final : public OpCode {
public:
OpCode_CheckEnd(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::CheckEnd; }
ALWAYS_INLINE size_t size() const override { return 1; }
const String arguments_string() const override { return ""; }
};
class OpCode_SaveLeftCaptureGroup final : public OpCode {
public:
OpCode_SaveLeftCaptureGroup(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftCaptureGroup; }
ALWAYS_INLINE size_t size() const override { return 2; }
ALWAYS_INLINE size_t id() const { return argument(0); }
const String arguments_string() const override { return String::format("id=%lu", id()); }
};
class OpCode_SaveRightCaptureGroup final : public OpCode {
public:
OpCode_SaveRightCaptureGroup(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightCaptureGroup; }
ALWAYS_INLINE size_t size() const override { return 2; }
ALWAYS_INLINE size_t id() const { return argument(0); }
const String arguments_string() const override { return String::format("id=%lu", id()); }
};
class OpCode_SaveLeftNamedCaptureGroup final : public OpCode {
public:
OpCode_SaveLeftNamedCaptureGroup(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveLeftNamedCaptureGroup; }
ALWAYS_INLINE size_t size() const override { return 3; }
ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
ALWAYS_INLINE size_t length() const { return argument(1); }
const String arguments_string() const override
{
return String::format("name=%s, length=%lu", name().to_string().characters(), length());
}
};
class OpCode_SaveRightNamedCaptureGroup final : public OpCode {
public:
OpCode_SaveRightNamedCaptureGroup(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::SaveRightNamedCaptureGroup; }
ALWAYS_INLINE size_t size() const override { return 3; }
ALWAYS_INLINE StringView name() const { return { reinterpret_cast<char*>(argument(0)), length() }; }
ALWAYS_INLINE size_t length() const { return argument(1); }
const String arguments_string() const override
{
return String::format("name=%s, length=%lu", name().to_string().characters(), length());
}
};
class OpCode_Compare final : public OpCode {
public:
OpCode_Compare(ByteCode& bytecode)
: OpCode(bytecode)
{
}
ExecutionResult execute(const MatchInput& input, MatchState& state, MatchOutput& output) const override;
ALWAYS_INLINE OpCodeId opcode_id() const override { return OpCodeId::Compare; }
ALWAYS_INLINE size_t size() const override { return arguments_size() + 3; }
ALWAYS_INLINE size_t arguments_count() const { return argument(0); }
ALWAYS_INLINE size_t arguments_size() const { return argument(1); }
const String arguments_string() const override;
const Vector<String> variable_arguments_to_string(Optional<MatchInput> input = {}) const;
private:
ALWAYS_INLINE void compare_char(const MatchInput& input, MatchState& state, char& ch, bool inverse, bool& inverse_matched) const;
ALWAYS_INLINE bool compare_string(const MatchInput& input, MatchState& state, const char* str, size_t length) const;
ALWAYS_INLINE void compare_character_class(const MatchInput& input, MatchState& state, CharClass character_class, char ch, bool inverse, bool& inverse_matched) const;
ALWAYS_INLINE void compare_character_range(const MatchInput& input, MatchState& state, char from, char to, char ch, bool inverse, bool& inverse_matched) const;
};
template<typename T>
bool is(const OpCode&);
template<typename T>
ALWAYS_INLINE bool is(const OpCode&)
{
return false;
}
template<typename T>
ALWAYS_INLINE bool is(const OpCode* opcode)
{
return is<T>(*opcode);
}
template<>
ALWAYS_INLINE bool is<OpCode_ForkStay>(const OpCode& opcode)
{
return opcode.opcode_id() == OpCodeId::ForkStay;
}
template<>
ALWAYS_INLINE bool is<OpCode_Exit>(const OpCode& opcode)
{
return opcode.opcode_id() == OpCodeId::Exit;
}
template<>
ALWAYS_INLINE bool is<OpCode_Compare>(const OpCode& opcode)
{
return opcode.opcode_id() == OpCodeId::Compare;
}
template<typename T>
ALWAYS_INLINE const T& to(const OpCode& opcode)
{
ASSERT(is<T>(opcode));
return static_cast<const T&>(opcode);
}
template<typename T>
ALWAYS_INLINE T* to(OpCode* opcode)
{
ASSERT(is<T>(opcode));
return static_cast<T*>(opcode);
}
template<typename T>
ALWAYS_INLINE const T* to(const OpCode* opcode)
{
ASSERT(is<T>(opcode));
return static_cast<const T*>(opcode);
}
template<typename T>
ALWAYS_INLINE T& to(OpCode& opcode)
{
ASSERT(is<T>(opcode));
return static_cast<T&>(opcode);
}
}

View file

@ -0,0 +1,153 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "LibRegex/RegexMatcher.h"
#include "AK/StringBuilder.h"
//#define REGEX_DEBUG
#ifdef REGEX_DEBUG
namespace regex {
class RegexDebug {
public:
RegexDebug(FILE* file = stdout)
: m_file(file)
{
}
virtual ~RegexDebug() = default;
template<typename T>
void print_raw_bytecode(Regex<T>& regex) const
{
auto& bytecode = regex.parser_result.bytecode;
size_t index { 0 };
for (auto& value : bytecode) {
fprintf(m_file, "OpCode i=%3lu [0x%02X]\n", index, (u32)value);
++index;
}
}
template<typename T>
void print_bytecode(Regex<T>& regex) const
{
MatchState state;
auto& bytecode = regex.parser_result.bytecode;
for (;;) {
auto* opcode = bytecode.get_opcode(state);
if (!opcode) {
dbg() << "Wrong opcode... failed!";
return;
}
print_opcode("PrintBytecode", *opcode, state);
fprintf(m_file, "%s", m_debug_stripline.characters());
if (is<OpCode_Exit>(*opcode))
break;
state.instruction_position += opcode->size();
}
fflush(m_file);
}
void print_opcode(const String& system, OpCode& opcode, MatchState& state, size_t recursion = 0, bool newline = true) const
{
fprintf(m_file, "%-15s | %-5lu | %-9lu | %-35s | %-30s | %-20s%s",
system.characters(),
state.instruction_position,
recursion,
opcode.to_string().characters(),
opcode.arguments_string().characters(),
String::format("ip: %3lu, sp: %3lu", state.instruction_position, state.string_position).characters(),
newline ? "\n" : "");
if (newline && is<OpCode_Compare>(opcode)) {
for (auto& line : to<OpCode_Compare>(opcode).variable_arguments_to_string()) {
fprintf(m_file, "%-15s | %-5s | %-9s | %-35s | %-30s | %-20s%s", "", "", "", "", line.characters(), "", "\n");
}
}
}
void print_result(const OpCode& opcode, const ByteCode& bytecode, const MatchInput& input, MatchState& state, ExecutionResult result) const
{
StringBuilder builder;
builder.append(execution_result_name(result));
if (result == ExecutionResult::Succeeded) {
builder.appendf(", ip: %lu/%lu, sp: %lu/%lu", state.instruction_position, bytecode.size() - 1, state.string_position, input.view.length() - 1);
} else if (result == ExecutionResult::Fork_PrioHigh) {
builder.appendf(", next ip: %lu", state.fork_at_position + opcode.size());
} else if (result != ExecutionResult::Failed) {
builder.appendf(", next ip: %lu", state.instruction_position + opcode.size());
}
fprintf(m_file, " | %-20s\n", builder.to_string().characters());
if (is<OpCode_Compare>(opcode)) {
for (auto& line : to<OpCode_Compare>(opcode).variable_arguments_to_string(input)) {
fprintf(m_file, "%-15s | %-5s | %-9s | %-35s | %-30s | %-20s%s", "", "", "", "", line.characters(), "", "\n");
}
}
fprintf(m_file, "%s", m_debug_stripline.characters());
}
void print_header()
{
StringBuilder builder;
builder.appendf("%-15s | %-5s | %-9s | %-35s | %-30s | %-20s | %-20s\n", "System", "Index", "Recursion", "OpCode", "Arguments", "State", "Result");
auto length = builder.length();
for (size_t i = 0; i < length; ++i) {
builder.append('=');
}
fprintf(m_file, "%s\n", builder.to_string().characters());
fflush(m_file);
builder.clear();
for (size_t i = 0; i < length; ++i) {
builder.append('-');
}
builder.append('\n');
m_debug_stripline = builder.to_string();
}
private:
String m_debug_stripline;
FILE* m_file;
};
}
using regex::RegexDebug;
#endif

View file

@ -0,0 +1,102 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <AK/String.h>
#include <AK/Types.h>
#ifdef __serenity__
# include <regex.h>
#else
# include <LibC/regex.h>
#endif
namespace regex {
enum class Error : u8 {
NoError = __Regex_NoError,
InvalidPattern = __Regex_InvalidPattern, // Invalid regular expression.
InvalidCollationElement = __Regex_InvalidCollationElement, // Invalid collating element referenced.
InvalidCharacterClass = __Regex_InvalidCharacterClass, // Invalid character class type referenced.
InvalidTrailingEscape = __Regex_InvalidTrailingEscape, // Trailing \ in pattern.
InvalidNumber = __Regex_InvalidNumber, // Number in \digit invalid or in error.
MismatchingBracket = __Regex_MismatchingBracket, // [ ] imbalance.
MismatchingParen = __Regex_MismatchingParen, // ( ) imbalance.
MismatchingBrace = __Regex_MismatchingBrace, // { } imbalance.
InvalidBraceContent = __Regex_InvalidBraceContent, // Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.
InvalidBracketContent = __Regex_InvalidBracketContent, // Content of [] invalid.
InvalidRange = __Regex_InvalidRange, // Invalid endpoint in range expression.
InvalidRepetitionMarker = __Regex_InvalidRepetitionMarker, // ?, * or + not preceded by valid regular expression.
ReachedMaxRecursion = __Regex_ReachedMaxRecursion, // MaximumRecursion has been reached.
EmptySubExpression = __Regex_EmptySubExpression, // Sub expression has empty content.
InvalidCaptureGroup = __Regex_InvalidCaptureGroup, // Content of capture group is invalid.
InvalidNameForCaptureGroup = __Regex_InvalidNameForCaptureGroup, // Name of capture group is invalid.
};
inline String get_error_string(Error error)
{
switch (error) {
case Error::NoError:
return "No error";
case Error::InvalidPattern:
return "Invalid regular expression.";
case Error::InvalidCollationElement:
return "Invalid collating element referenced.";
case Error::InvalidCharacterClass:
return "Invalid character class type referenced.";
case Error::InvalidTrailingEscape:
return "Trailing \\ in pattern.";
case Error::InvalidNumber:
return "Number in \\digit invalid or in error.";
case Error::MismatchingBracket:
return "[ ] imbalance.";
case Error::MismatchingParen:
return "( ) imbalance.";
case Error::MismatchingBrace:
return "{ } imbalance.";
case Error::InvalidBraceContent:
return "Content of {} invalid: not a number, number too large, more than two numbers, first larger than second.";
case Error::InvalidBracketContent:
return "Content of [] invalid.";
case Error::InvalidRange:
return "Invalid endpoint in range expression.";
case Error::InvalidRepetitionMarker:
return "?, * or + not preceded by valid regular expression.";
case Error::ReachedMaxRecursion:
return "Maximum recursion has been reached.";
case Error::EmptySubExpression:
return "Sub expression has empty content.";
case Error::InvalidCaptureGroup:
return "Content of capture group is invalid.";
case Error::InvalidNameForCaptureGroup:
return "Name of capture group is invalid.";
}
return "Undefined error.";
}
}
using regex::Error;
using regex::get_error_string;

View file

@ -0,0 +1,210 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "RegexLexer.h"
#include <AK/Assertions.h>
#include <stdio.h>
namespace regex {
const char* Token::name(const TokenType type)
{
switch (type) {
#define __ENUMERATE_REGEX_TOKEN(x) \
case TokenType::x: \
return #x;
ENUMERATE_REGEX_TOKENS
#undef __ENUMERATE_REGEX_TOKEN
default:
ASSERT_NOT_REACHED();
return "<Unknown>";
}
}
const char* Token::name() const
{
return name(m_type);
}
Lexer::Lexer(const StringView source)
: m_source(source)
{
}
ALWAYS_INLINE char Lexer::peek(size_t offset) const
{
if ((m_position + offset) >= m_source.length())
return EOF;
return m_source[m_position + offset];
}
void Lexer::back(size_t offset)
{
m_position -= offset;
m_previous_position = m_position - 1;
m_current_char = m_source[m_position];
}
ALWAYS_INLINE void Lexer::consume()
{
m_previous_position = m_position;
if (m_position >= m_source.length()) {
m_position = m_source.length() + 1;
m_current_char = EOF;
return;
}
m_current_char = m_source[m_position++];
}
void Lexer::reset()
{
m_position = 0;
m_current_token = { TokenType::Eof, 0, StringView(nullptr) };
m_current_char = 0;
m_previous_position = 0;
}
Token Lexer::next()
{
size_t token_start_position;
auto begin_token = [&] {
token_start_position = m_position;
};
auto commit_token = [&](auto type) -> Token& {
ASSERT(token_start_position + m_previous_position - token_start_position + 1 <= m_source.length());
auto substring = m_source.substring_view(token_start_position, m_previous_position - token_start_position + 1);
m_current_token = Token(type, token_start_position, substring);
return m_current_token;
};
auto emit_token = [&](auto type) -> Token& {
m_current_token = Token(type, m_position, m_source.substring_view(m_position, 1));
consume();
return m_current_token;
};
auto match_escape_sequence = [&]() -> size_t {
switch (peek(1)) {
case '^':
case '.':
case '[':
case ']':
case '$':
case '(':
case ')':
case '|':
case '*':
case '+':
case '?':
case '{':
case '\\':
return 2;
default:
fprintf(stderr, "[LEXER] Found invalid escape sequence: \\%c\n", peek(1));
return 0;
}
};
while (m_position <= m_source.length()) {
auto ch = peek();
if (ch == '(')
return emit_token(TokenType::LeftParen);
if (ch == ')')
return emit_token(TokenType::RightParen);
if (ch == '{')
return emit_token(TokenType::LeftCurly);
if (ch == '}')
return emit_token(TokenType::RightCurly);
if (ch == '[')
return emit_token(TokenType::LeftBracket);
if (ch == ']')
return emit_token(TokenType::RightBracket);
if (ch == '.')
return emit_token(TokenType::Period);
if (ch == '*')
return emit_token(TokenType::Asterisk);
if (ch == '+')
return emit_token(TokenType::Plus);
if (ch == '$')
return emit_token(TokenType::Dollar);
if (ch == '^')
return emit_token(TokenType::Circumflex);
if (ch == '|')
return emit_token(TokenType::Pipe);
if (ch == '?')
return emit_token(TokenType::Questionmark);
if (ch == ',')
return emit_token(TokenType::Comma);
if (ch == '/')
return emit_token(TokenType::Slash);
if (ch == '=')
return emit_token(TokenType::EqualSign);
if (ch == ':')
return emit_token(TokenType::Colon);
if (ch == '-')
return emit_token(TokenType::HyphenMinus);
if (ch == '\\') {
size_t escape = match_escape_sequence();
if (escape > 0) {
begin_token();
for (size_t i = 0; i < escape; ++i)
consume();
return commit_token(TokenType::EscapeSequence);
}
}
if (ch == EOF)
break;
return emit_token(TokenType::Char);
}
return Token(TokenType::Eof, m_position, nullptr);
}
}

View file

@ -0,0 +1,108 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <AK/Forward.h>
#include <AK/StringView.h>
namespace regex {
#define ENUMERATE_REGEX_TOKENS \
__ENUMERATE_REGEX_TOKEN(Eof) \
__ENUMERATE_REGEX_TOKEN(Char) \
__ENUMERATE_REGEX_TOKEN(Circumflex) \
__ENUMERATE_REGEX_TOKEN(Period) \
__ENUMERATE_REGEX_TOKEN(LeftParen) \
__ENUMERATE_REGEX_TOKEN(RightParen) \
__ENUMERATE_REGEX_TOKEN(LeftCurly) \
__ENUMERATE_REGEX_TOKEN(RightCurly) \
__ENUMERATE_REGEX_TOKEN(LeftBracket) \
__ENUMERATE_REGEX_TOKEN(RightBracket) \
__ENUMERATE_REGEX_TOKEN(Asterisk) \
__ENUMERATE_REGEX_TOKEN(EscapeSequence) \
__ENUMERATE_REGEX_TOKEN(Dollar) \
__ENUMERATE_REGEX_TOKEN(Pipe) \
__ENUMERATE_REGEX_TOKEN(Plus) \
__ENUMERATE_REGEX_TOKEN(Comma) \
__ENUMERATE_REGEX_TOKEN(Slash) \
__ENUMERATE_REGEX_TOKEN(EqualSign) \
__ENUMERATE_REGEX_TOKEN(HyphenMinus) \
__ENUMERATE_REGEX_TOKEN(Colon) \
__ENUMERATE_REGEX_TOKEN(Questionmark)
enum class TokenType {
#define __ENUMERATE_REGEX_TOKEN(x) x,
ENUMERATE_REGEX_TOKENS
#undef __ENUMERATE_REGEX_TOKEN
};
class Token {
public:
Token() = default;
Token(const TokenType type, const size_t start_position, const StringView value)
: m_type(type)
, m_position(start_position)
, m_value(value)
{
}
TokenType type() const { return m_type; }
const StringView& value() const { return m_value; }
size_t position() const { return m_position; }
const char* name() const;
static const char* name(const TokenType);
private:
TokenType m_type { TokenType::Eof };
size_t m_position { 0 };
StringView m_value { nullptr };
};
class Lexer {
public:
Lexer() = default;
explicit Lexer(const StringView source);
Token next();
void reset();
void back(size_t offset);
void set_source(const StringView source) { m_source = source; }
private:
ALWAYS_INLINE char peek(size_t offset = 0) const;
ALWAYS_INLINE void consume();
StringView m_source {};
size_t m_position { 0 };
size_t m_previous_position { 0 };
Token m_current_token { TokenType::Eof, 0, StringView(nullptr) };
int m_current_char { 0 };
};
}
using regex::Lexer;

View file

@ -0,0 +1,100 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "RegexOptions.h"
#include "AK/FlyString.h"
#include "AK/HashMap.h"
#include "AK/String.h"
#include "AK/StringView.h"
#include "AK/Vector.h"
namespace regex {
class Match final {
private:
Optional<FlyString> string;
public:
Match() = default;
~Match() = default;
Match(const StringView view_, const size_t line_, const size_t column_, const size_t global_offset_)
: view(view_)
, line(line_)
, column(column_)
, global_offset(global_offset_)
, left_column(column_)
{
}
Match(const String string_, const size_t line_, const size_t column_, const size_t global_offset_)
: string(string_)
, view(string.value().view())
, line(line_)
, column(column_)
, global_offset(global_offset_)
, left_column(column_)
{
}
StringView view { nullptr };
size_t line { 0 };
size_t column { 0 };
size_t global_offset { 0 };
// ugly, as not usable by user, but needed to prevent to create extra vectors that are
// able to store the column when the left paren has been found
size_t left_column { 0 };
};
struct MatchInput {
StringView view { nullptr };
AllOptions regex_options {};
size_t match_index { 0 };
size_t line { 0 };
size_t column { 0 };
size_t global_offset { 0 }; // For multiline matching, knowning the offset from start could be important
};
struct MatchState {
size_t string_position { 0 };
size_t instruction_position { 0 };
size_t fork_at_position { 0 };
};
struct MatchOutput {
size_t operations;
Vector<Match> matches;
Vector<Vector<Match>> capture_group_matches;
Vector<HashMap<String, Match>> named_capture_group_matches;
};
}

View file

@ -0,0 +1,313 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "RegexMatcher.h"
#include "RegexDebug.h"
#include "RegexParser.h"
#include <AK/String.h>
#include <AK/StringBuilder.h>
namespace regex {
#ifdef REGEX_DEBUG
static RegexDebug s_regex_dbg(stderr);
#endif
template<class Parser>
Regex<Parser>::Regex(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options)
{
pattern_value = pattern.to_string();
regex::Lexer lexer(pattern);
Parser parser(lexer, regex_options);
parser_result = parser.parse();
if (parser_result.error == regex::Error::NoError) {
matcher = make<Matcher<Parser>>(*this, regex_options);
} else {
fprintf(stderr, "%s\n", error_string().characters());
}
}
template<class Parser>
String Regex<Parser>::error_string(Optional<String> message) const
{
StringBuilder eb;
eb.appendf("Error during parsing of regular expression:\n");
eb.appendf(" %s\n ", pattern_value.characters());
for (size_t i = 0; i < parser_result.error_token.position(); ++i)
eb.append(" ");
eb.appendf("^---- %s\n", message.value_or(get_error_string(parser_result.error)).characters());
return eb.build();
}
template<typename Parser>
RegexResult Matcher<Parser>::match(const StringView& view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options) const
{
size_t match_count { 0 };
Vector<StringView> views { view };
MatchInput input;
MatchState state;
MatchOutput output;
input.regex_options = m_regex_options | regex_options.value_or({}).value();
output.operations = 0;
if (input.regex_options & AllFlags::Multiline)
views = view.lines(false); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
if (c_match_preallocation_count) {
output.matches.ensure_capacity(c_match_preallocation_count);
output.capture_group_matches.ensure_capacity(c_match_preallocation_count);
output.named_capture_group_matches.ensure_capacity(c_match_preallocation_count);
auto& capture_groups_count = m_pattern.parser_result.capture_groups_count;
auto& named_capture_groups_count = m_pattern.parser_result.named_capture_groups_count;
for (size_t j = 0; j < c_match_preallocation_count; ++j) {
output.matches.empend();
output.capture_group_matches.unchecked_append({});
output.capture_group_matches.at(j).ensure_capacity(capture_groups_count);
for (size_t k = 0; k < capture_groups_count; ++k)
output.capture_group_matches.at(j).unchecked_append({});
output.named_capture_group_matches.unchecked_append({});
output.named_capture_group_matches.at(j).ensure_capacity(named_capture_groups_count);
}
}
auto append_match = [](auto& input, auto& state, auto& output, auto& start_position) {
if (output.matches.size() == input.match_index)
output.matches.empend();
ASSERT(start_position + state.string_position - start_position <= input.view.length());
if (input.regex_options & AllFlags::StringCopyMatches) {
output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position).to_string(), input.line, start_position, input.global_offset + start_position };
} else { // let the view point to the original string ...
output.matches.at(input.match_index) = { input.view.substring_view(start_position, state.string_position - start_position), input.line, start_position, input.global_offset + start_position };
}
};
#ifdef REGEX_DEBUG
s_regex_dbg.print_header();
#endif
bool continue_search = (input.regex_options & AllFlags::Global) || (input.regex_options & AllFlags::Multiline);
for (auto& view : views) {
input.view = view;
#ifdef REGEX_DEBUG
dbg() << "[match] Starting match with view (" << view.length() << "): _" << view << "_";
#endif
auto view_length = view.length();
for (size_t view_index = 0; view_index < view_length; ++view_index) {
auto& match_length_minimum = m_pattern.parser_result.match_length_minimum;
// FIXME: More performant would be to know the remaining minimum string
// length needed to match from the current position onwards within
// the vm. Add new OpCode for MinMatchLengthFromSp with the value of
// the remaining string length from the current path. The value though
// has to be filled in reverse. That implies a second run over bytecode
// after generation has finished.
if (match_length_minimum && match_length_minimum > view_length - view_index)
break;
input.column = match_count;
input.match_index = match_count;
state.string_position = view_index;
state.instruction_position = 0;
auto success = execute(input, state, output, 0);
if (!success.has_value())
return { false, 0, {}, {}, {}, output.operations };
if (success.value()) {
if ((input.regex_options & AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) {
if (!continue_search)
break;
continue;
}
if ((input.regex_options & AllFlags::MatchNotBeginOfLine) && view_index == 0) {
if (!continue_search)
break;
continue;
}
#ifdef REGEX_DEBUG
dbg() << "state.string_position: " << state.string_position << " view_index: " << view_index;
dbg() << "[match] Found a match (length = " << state.string_position - view_index << "): " << input.view.substring_view(view_index, state.string_position - view_index);
#endif
++match_count;
if (continue_search) {
append_match(input, state, output, view_index);
bool has_zero_length = state.string_position == view_index;
view_index = state.string_position - (has_zero_length ? 0 : 1);
continue;
} else if (!continue_search && state.string_position < view_length)
return { false, 0, {}, {}, {}, output.operations };
append_match(input, state, output, view_index);
break;
}
if (!continue_search)
break;
}
++input.line;
input.global_offset += view.length() + 1; // +1 includes the line break character
}
MatchOutput output_copy;
if (match_count) {
auto capture_groups_count = min(output.capture_group_matches.size(), output.matches.size());
for (size_t i = 0; i < capture_groups_count; ++i) {
output_copy.capture_group_matches.append(output.capture_group_matches.at(i));
}
auto named_capture_groups_count = min(output.named_capture_group_matches.size(), output.matches.size());
for (size_t i = 0; i < named_capture_groups_count; ++i) {
if (output.matches.at(i).view.length())
output_copy.named_capture_group_matches.append(output.named_capture_group_matches.at(i));
}
for (size_t i = 0; i < match_count; ++i)
output_copy.matches.append(output.matches.at(i));
} else {
output_copy.capture_group_matches.clear_with_capacity();
output_copy.named_capture_group_matches.clear_with_capacity();
}
return {
match_count ? true : false,
match_count,
move(output_copy.matches),
move(output_copy.capture_group_matches),
move(output_copy.named_capture_group_matches),
output.operations,
};
}
template<class Parser>
Optional<bool> Matcher<Parser>::execute(const MatchInput& input, MatchState& state, MatchOutput& output, size_t recursion_level) const
{
if (recursion_level > c_max_recursion)
return false;
Vector<MatchState> fork_low_prio_states;
MatchState fork_high_prio_state;
Optional<bool> success;
auto& bytecode = m_pattern.parser_result.bytecode;
for (;;) {
++output.operations;
auto* opcode = bytecode.get_opcode(state);
if (!opcode) {
dbg() << "Wrong opcode... failed!";
return {};
}
#ifdef REGEX_DEBUG
s_regex_dbg.print_opcode("VM", *opcode, state, recursion_level, false);
#endif
auto result = opcode->execute(input, state, output);
#ifdef REGEX_DEBUG
s_regex_dbg.print_result(*opcode, bytecode, input, state, result);
#endif
state.instruction_position += opcode->size();
switch (result) {
case ExecutionResult::Fork_PrioLow:
fork_low_prio_states.prepend(state);
continue;
case ExecutionResult::Fork_PrioHigh:
fork_high_prio_state = state;
fork_high_prio_state.instruction_position = fork_high_prio_state.fork_at_position;
success = execute(input, fork_high_prio_state, output, ++recursion_level);
if (!success.has_value())
return {};
if (success.value()) {
state = fork_high_prio_state;
return true;
}
continue;
case ExecutionResult::Continue:
continue;
case ExecutionResult::Succeeded:
return true;
case ExecutionResult::Failed:
return false;
case ExecutionResult::Failed_ExecuteLowPrioForks:
return execute_low_prio_forks(input, state, output, fork_low_prio_states, recursion_level + 1);
}
}
ASSERT_NOT_REACHED();
}
template<class Parser>
ALWAYS_INLINE Optional<bool> Matcher<Parser>::execute_low_prio_forks(const MatchInput& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const
{
for (auto& state : states) {
state.instruction_position = state.fork_at_position;
#ifdef REGEX_DEBUG
fprintf(stderr, "Forkstay... ip = %lu, sp = %lu\n", state.instruction_position, state.string_position);
#endif
auto success = execute(input, state, output, recursion_level);
if (!success.has_value())
return {};
if (success.value()) {
#ifdef REGEX_DEBUG
fprintf(stderr, "Forkstay succeeded... ip = %lu, sp = %lu\n", state.instruction_position, state.string_position);
#endif
original_state = state;
return true;
}
}
original_state.string_position = 0;
return false;
};
template class Matcher<PosixExtendedParser>;
template class Regex<PosixExtendedParser>;
}

View file

@ -0,0 +1,186 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "RegexByteCode.h"
#include "RegexMatch.h"
#include "RegexOptions.h"
#include "RegexParser.h"
#include <AK/Forward.h>
#include <AK/HashMap.h>
#include <AK/NonnullOwnPtrVector.h>
#include <AK/Types.h>
#include <AK/Vector.h>
#include <stdio.h>
namespace regex {
static const constexpr size_t c_max_recursion = 5000;
static const constexpr size_t c_match_preallocation_count = 0;
struct RegexResult final {
bool success { false };
size_t count { 0 };
Vector<Match> matches;
Vector<Vector<Match>> capture_group_matches;
Vector<HashMap<String, Match>> named_capture_group_matches;
size_t operations { 0 };
};
template<class Parser>
class Regex;
template<class Parser>
class Matcher final {
public:
Matcher(const Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
: m_pattern(pattern)
, m_regex_options(regex_options.value_or({}))
{
}
~Matcher() = default;
RegexResult match(const StringView&, Optional<typename ParserTraits<Parser>::OptionsType> = {}) const;
private:
Optional<bool> execute(const MatchInput& input, MatchState& state, MatchOutput& output, size_t recursion_level) const;
ALWAYS_INLINE Optional<bool> execute_low_prio_forks(const MatchInput& input, MatchState& original_state, MatchOutput& output, Vector<MatchState> states, size_t recursion_level) const;
const Regex<Parser>& m_pattern;
const typename ParserTraits<Parser>::OptionsType m_regex_options;
};
template<class Parser>
class Regex final {
public:
String pattern_value;
regex::Parser::Result parser_result;
OwnPtr<Matcher<Parser>> matcher { nullptr };
Regex(StringView pattern, typename ParserTraits<Parser>::OptionsType regex_options = {});
~Regex() = default;
void print_bytecode(FILE* f = stdout) const;
String error_string(Optional<String> message = {}) const;
RegexResult match(StringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
if (!matcher || parser_result.error != Error::NoError)
return {};
return matcher->match(view, regex_options);
}
RegexResult search(StringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
if (!matcher || parser_result.error != Error::NoError)
return {};
AllOptions options = (AllOptions)regex_options.value_or({});
if ((options & AllFlags::MatchNotBeginOfLine) && (options & AllFlags::MatchNotEndOfLine)) {
options.reset_flag(AllFlags::MatchNotEndOfLine);
options.reset_flag(AllFlags::MatchNotBeginOfLine);
}
options |= AllFlags::Global;
return matcher->match(view, options);
}
bool match(StringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
if (!matcher || parser_result.error != Error::NoError)
return {};
m = matcher->match(view, regex_options);
return m.success;
}
bool search(StringView view, RegexResult& m, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
m = search(view, regex_options);
return m.success;
}
bool has_match(const StringView view, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {}) const
{
if (!matcher || parser_result.error != Error::NoError)
return false;
RegexResult result = matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
return result.success;
}
};
template<class Parser>
RegexResult match(const StringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
if (!pattern.matcher || pattern.parser_result.error != Error::NoError)
return {};
return pattern.matcher->match(view, regex_options);
}
template<class Parser>
bool match(const StringView view, Regex<Parser>& pattern, RegexResult& res, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
if (!pattern.matcher || pattern.parser_result.error != Error::NoError)
return {};
res = pattern.matcher->match(view, regex_options);
return res.success;
}
template<class Parser>
RegexResult search(const StringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
if (!pattern.matcher || pattern.parser_result.error != Error::NoError)
return {};
return pattern.matcher->search(view, regex_options);
}
template<class Parser>
bool search(const StringView view, Regex<Parser>& pattern, RegexResult& res, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
if (!pattern.matcher || pattern.parser_result.error != Error::NoError)
return {};
res = pattern.matcher->search(view, regex_options);
return res.success;
}
template<class Parser>
bool has_match(const StringView view, Regex<Parser>& pattern, Optional<typename ParserTraits<Parser>::OptionsType> regex_options = {})
{
if (pattern.matcher == nullptr)
return {};
RegexResult result = pattern.matcher->match(view, AllOptions { regex_options.value_or({}) } | AllFlags::SkipSubExprResults);
return result.success;
}
}
using regex::has_match;
using regex::match;
using regex::Regex;
using regex::RegexResult;

View file

@ -0,0 +1,159 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include <AK/Types.h>
#include <stdio.h>
#ifdef __serenity__
# include <regex.h>
#else
# include <LibC/regex.h>
#endif
namespace regex {
using FlagsUnderlyingType = u16;
enum class AllFlags {
Global = __Regex_Global, // All matches (don't return after first match)
Insensitive = __Regex_Insensitive, // Case insensitive match (ignores case of [a-zA-Z])
Ungreedy = __Regex_Ungreedy, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
Unicode = __Regex_Unicode, // Enable all unicode features and interpret all unicode escape sequences as such
Extended = __Regex_Extended, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored
Extra = __Regex_Extra, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted
MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine, // Pattern is not forced to ^ -> search in whole string!
MatchNotEndOfLine = __Regex_MatchNotEndOfLine, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set
SkipSubExprResults = __Regex_SkipSubExprResults, // Do not return sub expressions in the result
StringCopyMatches = __Regex_StringCopyMatches, // Do explicitly copy results into new allocated string instead of StringView to original string.
SingleLine = __Regex_SingleLine, // Dot matches newline characters
Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended.
Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one.
SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results.
Last = SkipTrimEmptyMatches
};
enum class PosixFlags : FlagsUnderlyingType {
Global = (FlagsUnderlyingType)AllFlags::Global,
Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive,
Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy,
Unicode = (FlagsUnderlyingType)AllFlags::Unicode,
Extended = (FlagsUnderlyingType)AllFlags::Extended,
Extra = (FlagsUnderlyingType)AllFlags::Extra,
MatchNotBeginOfLine = (FlagsUnderlyingType)AllFlags::MatchNotBeginOfLine,
MatchNotEndOfLine = (FlagsUnderlyingType)AllFlags::MatchNotEndOfLine,
SkipSubExprResults = (FlagsUnderlyingType)AllFlags::SkipSubExprResults,
Multiline = (FlagsUnderlyingType)AllFlags::Multiline,
StringCopyMatches = (FlagsUnderlyingType)AllFlags::StringCopyMatches,
};
enum class ECMAScriptFlags : FlagsUnderlyingType {
Global = (FlagsUnderlyingType)AllFlags::Global,
Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive,
Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy,
Unicode = (FlagsUnderlyingType)AllFlags::Unicode,
Extended = (FlagsUnderlyingType)AllFlags::Extended,
Extra = (FlagsUnderlyingType)AllFlags::Extra,
SingleLine = (FlagsUnderlyingType)AllFlags::SingleLine,
Sticky = (FlagsUnderlyingType)AllFlags::Sticky,
Multiline = (FlagsUnderlyingType)AllFlags::Multiline,
StringCopyMatches = (FlagsUnderlyingType)AllFlags::StringCopyMatches,
};
template<class T>
class RegexOptions {
public:
using FlagsType = T;
RegexOptions() = default;
RegexOptions(T flags)
: m_flags(flags)
{
}
template<class U>
RegexOptions(RegexOptions<U> other)
: m_flags((T) static_cast<FlagsUnderlyingType>(other.value()))
{
}
operator bool() const { return !!*this; }
bool operator!() const { return (FlagsUnderlyingType)m_flags == 0; }
RegexOptions<T> operator|(T flag) const { return RegexOptions<T> { (T)((FlagsUnderlyingType)m_flags | (FlagsUnderlyingType)flag) }; }
RegexOptions<T> operator&(T flag) const { return RegexOptions<T> { (T)((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag) }; }
RegexOptions<T>& operator|=(T flag)
{
m_flags = (T)((FlagsUnderlyingType)m_flags | (FlagsUnderlyingType)flag);
return *this;
}
RegexOptions<T>& operator&=(T flag)
{
m_flags = (T)((FlagsUnderlyingType)m_flags & (FlagsUnderlyingType)flag);
return *this;
}
void reset_flags() { m_flags = (T)0; }
void reset_flag(T flag) { m_flags = (T)((FlagsUnderlyingType)m_flags & ~(FlagsUnderlyingType)flag); }
void set_flag(T flag) { *this |= flag; }
bool has_flag_set(T flag) const { return *this & flag; }
T value() const { return m_flags; }
private:
T m_flags { 0 };
};
template<class T>
inline RegexOptions<T> operator|(T lhs, T rhs)
{
return RegexOptions<T> { lhs } |= rhs;
}
template<class T>
inline RegexOptions<T> operator&(T lhs, T rhs)
{
return RegexOptions<T> { lhs } &= rhs;
}
template<class T>
inline T operator~(T flag)
{
return (T) ~((FlagsUnderlyingType)flag);
}
using AllOptions = RegexOptions<AllFlags>;
using ECMAScriptOptions = RegexOptions<ECMAScriptFlags>;
using PosixOptions = RegexOptions<PosixFlags>;
}
using regex::ECMAScriptFlags;
using regex::ECMAScriptOptions;
using regex::PosixFlags;
using regex::PosixOptions;

View file

@ -0,0 +1,598 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "RegexParser.h"
#include "RegexDebug.h"
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <cstdio>
namespace regex {
ALWAYS_INLINE bool Parser::set_error(Error error)
{
if (m_parser_state.error == Error::NoError) {
m_parser_state.error = error;
m_parser_state.error_token = m_parser_state.current_token;
}
return false; // always return false, that eases the API usage (return set_error(...)) :^)
}
ALWAYS_INLINE bool Parser::done() const
{
return match(TokenType::Eof);
}
ALWAYS_INLINE bool Parser::match(TokenType type) const
{
return m_parser_state.current_token.type() == type;
}
ALWAYS_INLINE Token Parser::consume()
{
auto old_token = m_parser_state.current_token;
m_parser_state.current_token = m_parser_state.lexer.next();
return old_token;
}
ALWAYS_INLINE Token Parser::consume(TokenType type, Error error)
{
if (m_parser_state.current_token.type() != type) {
set_error(error);
dbg() << "[PARSER] Error: Unexpected token " << m_parser_state.current_token.name() << ". Expected: " << Token::name(type);
}
return consume();
}
ALWAYS_INLINE bool Parser::consume(const String& str)
{
size_t potentially_go_back { 1 };
for (auto ch : str) {
if (match(TokenType::Char)) {
if (m_parser_state.current_token.value()[0] != ch) {
m_parser_state.lexer.back(potentially_go_back);
m_parser_state.current_token = m_parser_state.lexer.next();
return false;
}
} else {
m_parser_state.lexer.back(potentially_go_back);
m_parser_state.current_token = m_parser_state.lexer.next();
return false;
}
consume(TokenType::Char, Error::NoError);
++potentially_go_back;
}
return true;
}
ALWAYS_INLINE void Parser::reset()
{
m_parser_state.bytecode.clear();
m_parser_state.lexer.reset();
m_parser_state.current_token = m_parser_state.lexer.next();
m_parser_state.error = Error::NoError;
m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) };
m_parser_state.regex_options = {};
}
Parser::Result Parser::parse(Optional<AllOptions> regex_options)
{
reset();
if (regex_options.has_value())
m_parser_state.regex_options = regex_options.value();
if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum))
consume(TokenType::Eof, Error::InvalidPattern);
else
set_error(Error::InvalidPattern);
#ifdef REGEX_DEBUG
fprintf(stderr, "[PARSER] Produced bytecode with %lu entries (opcodes + arguments)\n", m_parser_state.bytecode.size());
#endif
return {
move(m_parser_state.bytecode),
move(m_parser_state.capture_groups_count),
move(m_parser_state.named_capture_groups_count),
move(m_parser_state.match_length_minimum),
move(m_parser_state.error),
move(m_parser_state.error_token)
};
}
// =============================
// PosixExtended Parser
// =============================
bool PosixExtendedParser::parse_internal(ByteCode& stack, size_t& match_length_minimum)
{
return parse_root(stack, match_length_minimum);
}
ALWAYS_INLINE bool PosixExtendedParser::match_repetition_symbol()
{
auto type = m_parser_state.current_token.type();
return (type == TokenType::Asterisk
|| type == TokenType::Plus
|| type == TokenType::Questionmark
|| type == TokenType::LeftCurly);
}
ALWAYS_INLINE bool PosixExtendedParser::match_ordinary_characters()
{
// NOTE: This method must not be called during bracket and repetition parsing!
// FIXME: Add assertion for that?
auto type = m_parser_state.current_token.type();
return (type == TokenType::Char
|| type == TokenType::Comma
|| type == TokenType::Slash
|| type == TokenType::EqualSign
|| type == TokenType::HyphenMinus
|| type == TokenType::Colon);
}
ALWAYS_INLINE bool PosixExtendedParser::parse_repetition_symbol(ByteCode& bytecode_to_repeat, size_t& match_length_minimum)
{
if (match(TokenType::LeftCurly)) {
consume();
StringBuilder number_builder;
while (match(TokenType::Char)) {
number_builder.append(consume().value());
}
auto maybe_minimum = number_builder.build().to_uint();
if (!maybe_minimum.has_value())
return set_error(Error::InvalidBraceContent);
auto minimum = maybe_minimum.value();
match_length_minimum *= minimum;
if (match(TokenType::Comma)) {
consume();
} else {
ByteCode bytecode;
bytecode.insert_bytecode_repetition_n(bytecode_to_repeat, minimum);
bytecode_to_repeat = move(bytecode);
consume(TokenType::RightCurly, Error::MismatchingBrace);
return !has_error();
}
Optional<size_t> maybe_maximum {};
number_builder.clear();
while (match(TokenType::Char)) {
number_builder.append(consume().value());
}
if (!number_builder.is_empty()) {
auto value = number_builder.build().to_uint();
if (!value.has_value() || minimum > value.value())
return set_error(Error::InvalidBraceContent);
maybe_maximum = value.value();
}
bytecode_to_repeat.insert_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maybe_maximum);
consume(TokenType::RightCurly, Error::MismatchingBrace);
return !has_error();
} else if (match(TokenType::Plus)) {
consume();
bool greedy = match(TokenType::Questionmark);
if (greedy)
consume();
// Note: dont touch match_length_minimum, it's already correct
bytecode_to_repeat.insert_bytecode_repetition_min_one(bytecode_to_repeat, greedy);
return !has_error();
} else if (match(TokenType::Asterisk)) {
consume();
match_length_minimum = 0;
bool greedy = match(TokenType::Questionmark);
if (greedy)
consume();
bytecode_to_repeat.insert_bytecode_repetition_any(bytecode_to_repeat, greedy);
return !has_error();
} else if (match(TokenType::Questionmark)) {
consume();
match_length_minimum = 0;
bool greedy = match(TokenType::Questionmark);
if (greedy)
consume();
bytecode_to_repeat.insert_bytecode_repetition_zero_or_one(bytecode_to_repeat, greedy);
return !has_error();
}
return false;
}
ALWAYS_INLINE bool PosixExtendedParser::parse_bracket_expression(ByteCode& stack, size_t& match_length_minimum)
{
Vector<CompareTypeAndValuePair> values;
for (;;) {
if (match(TokenType::HyphenMinus)) {
consume();
if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
// first in the bracket expression
values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
} else if (match(TokenType::RightBracket)) {
// Last in the bracket expression
values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
} else if (values.last().type == CharacterCompareType::Char) {
values.append({ CharacterCompareType::RangeExpressionDummy, 0 });
if (match(TokenType::HyphenMinus)) {
consume();
// Valid range, add ordinary character
values.append({ CharacterCompareType::Char, (ByteCodeValueType)'-' });
}
} else {
return set_error(Error::InvalidRange);
}
} else if (match(TokenType::Char) || match(TokenType::Period) || match(TokenType::Asterisk) || match(TokenType::EscapeSequence) || match(TokenType::Plus)) {
values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() });
} else if (match(TokenType::Circumflex)) {
auto t = consume();
if (values.is_empty())
values.append({ CharacterCompareType::Inverse, 0 });
else
values.append({ CharacterCompareType::Char, (ByteCodeValueType)*t.value().characters_without_null_termination() });
} else if (match(TokenType::LeftBracket)) {
consume();
if (match(TokenType::Period)) {
consume();
// FIXME: Parse collating element, this is needed when we have locale support
// This could have impact on length parameter, I guess.
ASSERT_NOT_REACHED();
consume(TokenType::Period, Error::InvalidCollationElement);
consume(TokenType::RightBracket, Error::MismatchingBracket);
} else if (match(TokenType::EqualSign)) {
consume();
// FIXME: Parse collating element, this is needed when we have locale support
// This could have impact on length parameter, I guess.
ASSERT_NOT_REACHED();
consume(TokenType::EqualSign, Error::InvalidCollationElement);
consume(TokenType::RightBracket, Error::MismatchingBracket);
} else if (match(TokenType::Colon)) {
consume();
CharClass ch_class;
// parse character class
if (match(TokenType::Char)) {
if (consume("alnum"))
ch_class = CharClass::Alnum;
else if (consume("alpha"))
ch_class = CharClass::Alpha;
else if (consume("blank"))
ch_class = CharClass::Blank;
else if (consume("cntrl"))
ch_class = CharClass::Cntrl;
else if (consume("digit"))
ch_class = CharClass::Digit;
else if (consume("graph"))
ch_class = CharClass::Graph;
else if (consume("lower"))
ch_class = CharClass::Lower;
else if (consume("print"))
ch_class = CharClass::Print;
else if (consume("punct"))
ch_class = CharClass::Punct;
else if (consume("space"))
ch_class = CharClass::Space;
else if (consume("upper"))
ch_class = CharClass::Upper;
else if (consume("xdigit"))
ch_class = CharClass::Xdigit;
else
return set_error(Error::InvalidCharacterClass);
values.append({ CharacterCompareType::CharClass, (ByteCodeValueType)ch_class });
} else
return set_error(Error::InvalidCharacterClass);
// FIXME: we do not support locale specific character classes until locales are implemented
consume(TokenType::Colon, Error::InvalidCharacterClass);
consume(TokenType::RightBracket, Error::MismatchingBracket);
} else {
return set_error(Error::MismatchingBracket);
}
} else if (match(TokenType::RightBracket)) {
if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
// handle bracket as ordinary character
values.append({ CharacterCompareType::Char, (ByteCodeValueType)*consume().value().characters_without_null_termination() });
} else {
// closing bracket expression
break;
}
} else
// nothing matched, this is a failure, as at least the closing bracket must match...
return set_error(Error::MismatchingBracket);
// check if range expression has to be completed...
if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) {
if (values.last().type != CharacterCompareType::Char)
return set_error(Error::InvalidRange);
auto value2 = values.take_last();
values.take_last(); // RangeExpressionDummy
auto value1 = values.take_last();
values.append({ CharacterCompareType::CharRange, static_cast<ByteCodeValueType>(CharRange { (char)value1.value, (char)value2.value }) });
}
}
if (values.size())
match_length_minimum = 1;
if (values.first().type == CharacterCompareType::Inverse)
match_length_minimum = 0;
stack.insert_bytecode_compare_values(move(values));
return !has_error();
}
ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, size_t& match_length_minimum)
{
ByteCode bytecode;
size_t length = 0;
bool should_parse_repetition_symbol { false };
for (;;) {
if (match_ordinary_characters()) {
Token start_token = m_parser_state.current_token;
Token last_token = m_parser_state.current_token;
for (;;) {
if (!match_ordinary_characters())
break;
++length;
last_token = consume();
}
if (length > 1) {
// last character is inserted into 'bytecode' for duplication symbol handling
auto new_length = length - ((match_repetition_symbol() && length > 1) ? 1 : 0);
stack.insert_bytecode_compare_string(start_token.value(), new_length);
}
if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol
bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)last_token.value().characters_without_null_termination()[0] } });
should_parse_repetition_symbol = true;
break;
}
if (match_repetition_symbol())
return set_error(Error::InvalidRepetitionMarker);
if (match(TokenType::Period)) {
length = 1;
consume();
bytecode.insert_bytecode_compare_values({ { CharacterCompareType::AnyChar, 0 } });
should_parse_repetition_symbol = true;
break;
}
if (match(TokenType::EscapeSequence)) {
length = 1;
Token t = consume();
#ifdef REGEX_DEBUG
printf("[PARSER] EscapeSequence with substring %s\n", String(t.value()).characters());
#endif
bytecode.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u32)t.value().characters_without_null_termination()[1] } });
should_parse_repetition_symbol = true;
break;
}
if (match(TokenType::LeftBracket)) {
consume();
ByteCode sub_ops;
if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size())
return set_error(Error::InvalidBracketContent);
bytecode.append(move(sub_ops));
consume(TokenType::RightBracket, Error::MismatchingBracket);
should_parse_repetition_symbol = true;
break;
}
if (match(TokenType::RightBracket)) {
return set_error(Error::MismatchingBracket);
}
if (match(TokenType::RightCurly)) {
return set_error(Error::MismatchingBrace);
}
if (match(TokenType::Circumflex)) {
consume();
bytecode.empend((ByteCodeValueType)OpCodeId::CheckBegin);
break;
}
if (match(TokenType::Dollar)) {
consume();
bytecode.empend((ByteCodeValueType)OpCodeId::CheckEnd);
break;
}
if (match(TokenType::RightParen))
return false;
if (match(TokenType::LeftParen)) {
consume();
Optional<StringView> capture_group_name;
bool prevent_capture_group = false;
if (match(TokenType::Questionmark)) {
consume();
if (match(TokenType::Colon)) {
consume();
prevent_capture_group = true;
} else if (consume("<")) { // named capturing group
Token start_token = m_parser_state.current_token;
Token last_token = m_parser_state.current_token;
size_t capture_group_name_length = 0;
for (;;) {
if (!match_ordinary_characters())
return set_error(Error::InvalidNameForCaptureGroup);
if (match(TokenType::Char) && m_parser_state.current_token.value()[0] == '>') {
consume();
break;
}
++capture_group_name_length;
last_token = consume();
}
capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length);
} else if (match(TokenType::EqualSign)) { // positive lookahead
consume();
ASSERT_NOT_REACHED();
} else if (consume("!")) { // negative lookahead
ASSERT_NOT_REACHED();
} else if (consume("<")) {
if (match(TokenType::EqualSign)) { // positive lookbehind
consume();
ASSERT_NOT_REACHED();
}
if (consume("!")) // negative lookbehind
ASSERT_NOT_REACHED();
} else {
return set_error(Error::InvalidRepetitionMarker);
}
}
if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
if (capture_group_name.has_value())
bytecode.insert_bytecode_group_capture_left(capture_group_name.value());
else
bytecode.insert_bytecode_group_capture_left(m_parser_state.capture_groups_count);
}
ByteCode capture_group_bytecode;
if (!parse_root(capture_group_bytecode, length))
return set_error(Error::InvalidPattern);
bytecode.append(move(capture_group_bytecode));
consume(TokenType::RightParen, Error::MismatchingParen);
if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
if (capture_group_name.has_value()) {
bytecode.insert_bytecode_group_capture_right(capture_group_name.value());
++m_parser_state.named_capture_groups_count;
} else {
bytecode.insert_bytecode_group_capture_right(m_parser_state.capture_groups_count);
++m_parser_state.capture_groups_count;
}
}
should_parse_repetition_symbol = true;
break;
}
return false;
}
if (match_repetition_symbol()) {
if (should_parse_repetition_symbol)
parse_repetition_symbol(bytecode, length);
else
return set_error(Error::InvalidRepetitionMarker);
}
stack.append(move(bytecode));
match_length_minimum += length;
return true;
}
bool PosixExtendedParser::parse_root(ByteCode& stack, size_t& match_length_minimum)
{
ByteCode bytecode_left;
size_t match_length_minimum_left { 0 };
if (match_repetition_symbol())
return set_error(Error::InvalidRepetitionMarker);
for (;;) {
if (!parse_sub_expression(bytecode_left, match_length_minimum_left))
break;
if (match(TokenType::Pipe)) {
consume();
ByteCode bytecode_right;
size_t match_length_minimum_right { 0 };
if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty())
return set_error(Error::InvalidPattern);
ByteCode new_bytecode;
new_bytecode.insert_bytecode_alternation(move(bytecode_left), move(bytecode_right));
bytecode_left = move(new_bytecode);
match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left);
}
}
if (bytecode_left.is_empty())
set_error(Error::EmptySubExpression);
stack.append(move(bytecode_left));
match_length_minimum = match_length_minimum_left;
return !has_error();
}
}

View file

@ -0,0 +1,803 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "RegexParser.h"
#include <AK/String.h>
#include <AK/StringBuilder.h>
namespace AK {
namespace regex {
const char* ByteCodeValue::name(OpCode type)
{
switch (type) {
#define __ENUMERATE_OPCODE(x) \
case OpCode::x: \
return #x;
ENUMERATE_OPCODES
#undef __ENUMERATE_OPCODE
default:
ASSERT_NOT_REACHED();
return "<Unknown>";
}
}
const char* ByteCodeValue::name() const
{
return name(op_code);
}
template<class T>
bool Parser<T>::set_error(Error error)
{
if (m_parser_state.error == Error::NoError) {
m_parser_state.error = error;
m_parser_state.error_token = m_parser_state.current_token;
}
return false; // always return false, that eases the API usage (return set_error(...)) :^)
}
template<class T>
bool Parser<T>::done() const
{
return match(TokenType::Eof);
}
template<class T>
bool Parser<T>::match(TokenType type) const
{
return m_parser_state.current_token.type() == type;
}
template<class T>
Token Parser<T>::consume()
{
auto old_token = m_parser_state.current_token;
m_parser_state.current_token = m_parser_state.lexer.next();
return old_token;
}
template<class T>
Token Parser<T>::consume(TokenType type, Error error)
{
if (m_parser_state.current_token.type() != type) {
set_error(error);
#ifdef __serenity__
dbg() << "[PARSER] Error: Unexpected token " << m_parser_state.m_current_token.name() << ". Expected: " << Token::name(type);
#else
fprintf(stderr, "[PARSER] Error: Unexpected token %s. Expected %s\n", m_parser_state.current_token.name(), Token::name(type));
#endif
}
return consume();
}
template<class T>
bool Parser<T>::consume(const String& str)
{
size_t potentially_go_back { 1 };
for (auto ch : str) {
if (match(TokenType::OrdinaryCharacter)) {
if (m_parser_state.current_token.value()[0] != ch) {
m_parser_state.lexer.back(potentially_go_back);
m_parser_state.current_token = m_parser_state.lexer.next();
return false;
}
} else {
m_parser_state.lexer.back(potentially_go_back);
m_parser_state.current_token = m_parser_state.lexer.next();
return false;
}
consume(TokenType::OrdinaryCharacter);
++potentially_go_back;
}
return true;
}
template<class T>
void Parser<T>::reset()
{
m_parser_state.bytecode.clear();
m_parser_state.lexer.reset();
m_parser_state.current_token = m_parser_state.lexer.next();
m_parser_state.error = Error::NoError;
m_parser_state.error_token = { TokenType::Eof, 0, StringView(nullptr) };
m_parser_state.regex_options = {};
}
template<class T>
ParserResult Parser<T>::parse(Optional<OptionsType> regex_options)
{
reset();
if (regex_options.has_value())
m_parser_state.regex_options = regex_options.value();
if (parse_internal(m_parser_state.bytecode, m_parser_state.match_length_minimum))
consume(TokenType::Eof);
else
set_error(Error::InvalidPattern);
#ifdef REGEX_DEBUG
printf("[PARSER] Produced bytecode with %lu entries (opcodes + arguments)\n", m_parser_state.m_bytes.size());
#endif
return {
move(m_parser_state.bytecode),
move(m_parser_state.capture_groups_count),
move(m_parser_state.named_capture_groups_count),
move(m_parser_state.match_length_minimum),
move(m_parser_state.error),
move(m_parser_state.error_token)
};
}
template<class T>
void Parser<T>::insert_bytecode_compare_values(Vector<ByteCodeValue>& stack, Vector<CompareTypeAndValuePair>&& pairs)
{
Vector<ByteCodeValue> bytecode;
bytecode.empend(OpCode::Compare);
bytecode.empend(pairs.size()); // number of arguments
for (auto& value : pairs) {
ASSERT(value.type != CharacterCompareType::RangeExpressionDummy);
ASSERT(value.type != CharacterCompareType::Undefined);
ASSERT(value.type != CharacterCompareType::OrdinaryCharacters);
bytecode.append(move(value.type));
if (value.type != CharacterCompareType::Inverse && value.type != CharacterCompareType::AnySingleCharacter)
bytecode.append(move(value.value));
}
stack.append(move(bytecode));
}
template<class T>
void Parser<T>::insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack)
{
stack.empend(OpCode::SaveLeftCaptureGroup);
stack.empend(m_parser_state.capture_groups_count);
}
template<class T>
void Parser<T>::insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack, const StringView& name)
{
stack.empend(OpCode::SaveLeftNamedCaptureGroup);
stack.empend(name.characters_without_null_termination());
stack.empend(name.length());
}
template<class T>
void Parser<T>::insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack)
{
stack.empend(OpCode::SaveRightCaptureGroup);
stack.empend(m_parser_state.capture_groups_count);
}
template<class T>
void Parser<T>::insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack, const StringView& name)
{
stack.empend(OpCode::SaveRightNamedCaptureGroup);
stack.empend(name.characters_without_null_termination());
stack.empend(name.length());
}
template<class T>
void Parser<T>::insert_bytecode_alternation(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>&& left, Vector<ByteCodeValue>&& right)
{
// FORKSTAY _ALT
// REGEXP ALT1
// JUMP _END
// LABEL _ALT
// REGEXP ALT2
// LABEL _END
stack.empend(OpCode::ForkJump);
stack.empend(left.size() + 2); // Jump to the _ALT label
for (auto& op : left)
stack.append(move(op));
stack.empend(OpCode::Jump);
stack.empend(right.size()); // Jump to the _END label
// LABEL _ALT = bytecode.size() + 2
for (auto& op : right)
stack.append(move(op));
// LABEL _END = alterantive_bytecode.size
}
template<class T>
void Parser<T>::insert_bytecode_repetition_min_max(Vector<ByteCodeValue>& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum)
{
Vector<ByteCodeValue> new_bytecode;
insert_bytecode_repetition_n(new_bytecode, bytecode_to_repeat, minimum);
if (maximum.has_value()) {
if (maximum.value() > minimum) {
auto diff = maximum.value() - minimum;
new_bytecode.empend(OpCode::ForkStay);
new_bytecode.empend(diff * (bytecode_to_repeat.size() + 2)); // Jump to the _END label
for (size_t i = 0; i < diff; ++i) {
new_bytecode.append(bytecode_to_repeat);
new_bytecode.empend(OpCode::ForkStay);
new_bytecode.empend((diff - i - 1) * (bytecode_to_repeat.size() + 2)); // Jump to the _END label
}
}
} else {
// no maximum value set, repeat finding if possible
new_bytecode.empend(OpCode::ForkJump);
new_bytecode.empend(-bytecode_to_repeat.size() - 2); // Jump to the last iteration
}
bytecode_to_repeat = move(new_bytecode);
}
template<class T>
void Parser<T>::insert_bytecode_repetition_n(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>& bytecode_to_repeat, size_t n)
{
for (size_t i = 0; i < n; ++i)
stack.append(bytecode_to_repeat);
}
template<class T>
void Parser<T>::insert_bytecode_repetition_min_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy)
{
// LABEL _START = -bytecode_to_repeat.size()
// REGEXP
// FORKJUMP _START (FORKSTAY -> Greedy)
if (greedy)
bytecode_to_repeat.empend(OpCode::ForkStay);
else
bytecode_to_repeat.empend(OpCode::ForkJump);
bytecode_to_repeat.empend(-bytecode_to_repeat.size() - 1); // Jump to the _START label
}
template<class T>
void Parser<T>::insert_bytecode_repetition_any(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy)
{
// LABEL _START
// FORKSTAY _END (FORKJUMP -> Greedy)
// REGEXP
// JUMP _START
// LABEL _END
// LABEL _START = stack.size();
Vector<ByteCodeValue> bytecode;
if (greedy)
bytecode.empend(OpCode::ForkJump);
else
bytecode.empend(OpCode::ForkStay);
bytecode.empend(bytecode_to_repeat.size() + 2); // Jump to the _END label
for (auto& op : bytecode_to_repeat)
bytecode.append(move(op));
bytecode.empend(OpCode::Jump);
bytecode.empend(-bytecode.size() - 1); // Jump to the _START label
// LABEL _END = bytecode.size()
bytecode_to_repeat = move(bytecode);
}
template<class T>
void Parser<T>::insert_bytecode_repetition_zero_or_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy)
{
// FORKSTAY _END (FORKJUMP -> Greedy)
// REGEXP
// LABEL _END
Vector<ByteCodeValue> bytecode;
if (greedy)
bytecode.empend(OpCode::ForkJump);
else
bytecode.empend(OpCode::ForkStay);
bytecode.empend(bytecode_to_repeat.size()); // Jump to the _END label
for (auto& op : bytecode_to_repeat)
bytecode.append(move(op));
// LABEL _END = bytecode.size()
bytecode_to_repeat = move(bytecode);
}
// =============================
// PosixExtended Parser
// =============================
bool PosixExtendedParser::parse_internal(Vector<ByteCodeValue>& stack, size_t& match_length_minimum)
{
return parse_root(stack, match_length_minimum);
}
bool PosixExtendedParser::match_repetition_symbol()
{
auto type = m_parser_state.current_token.type();
return (type == TokenType::Asterisk
|| type == TokenType::Plus
|| type == TokenType::Questionmark
|| type == TokenType::LeftCurly);
}
bool PosixExtendedParser::match_ordinary_characters()
{
// NOTE: This method must not be called during bracket and repetition parsing!
// FIXME: Add assertion for that?
auto type = m_parser_state.current_token.type();
return (type == TokenType::OrdinaryCharacter
|| type == TokenType::Comma
|| type == TokenType::Slash
|| type == TokenType::EqualSign
|| type == TokenType::HyphenMinus
|| type == TokenType::Colon);
}
bool PosixExtendedParser::parse_repetition_symbol(Vector<ByteCodeValue>& bytecode_to_repeat, size_t& match_length_minimum)
{
if (match(TokenType::LeftCurly)) {
consume();
StringBuilder number_builder;
bool ok;
while (match(TokenType::OrdinaryCharacter)) {
number_builder.append(consume().value());
}
size_t minimum = number_builder.build().to_uint(ok);
if (!ok)
return set_error(Error::InvalidBraceContent);
match_length_minimum *= minimum;
if (match(TokenType::Comma)) {
consume();
} else {
Vector<ByteCodeValue> bytecode;
insert_bytecode_repetition_n(bytecode, bytecode_to_repeat, minimum);
bytecode_to_repeat = move(bytecode);
return !has_error();
}
Optional<size_t> maximum {};
number_builder.clear();
while (match(TokenType::OrdinaryCharacter)) {
number_builder.append(consume().value());
}
if (!number_builder.is_empty()) {
maximum = number_builder.build().to_uint(ok);
if (!ok || minimum > maximum.value())
return set_error(Error::InvalidBraceContent);
}
insert_bytecode_repetition_min_max(bytecode_to_repeat, minimum, maximum);
consume(TokenType::RightCurly, Error::BraceMismatch);
return !has_error();
} else if (match(TokenType::Plus)) {
consume();
bool greedy = match(TokenType::Questionmark);
if (greedy)
consume();
// Note: dont touch match_length_minimum, it's already correct
insert_bytecode_repetition_min_one(bytecode_to_repeat, greedy);
return !has_error();
} else if (match(TokenType::Asterisk)) {
consume();
match_length_minimum = 0;
bool greedy = match(TokenType::Questionmark);
if (greedy)
consume();
insert_bytecode_repetition_any(bytecode_to_repeat, greedy);
return !has_error();
} else if (match(TokenType::Questionmark)) {
consume();
match_length_minimum = 0;
bool greedy = match(TokenType::Questionmark);
if (greedy)
consume();
insert_bytecode_repetition_zero_or_one(bytecode_to_repeat, greedy);
return !has_error();
}
return false;
}
bool PosixExtendedParser::parse_bracket_expression(Vector<ByteCodeValue>& stack, size_t& match_length_minimum)
{
Vector<CompareTypeAndValuePair> values;
for (;;) {
if (match(TokenType::HyphenMinus)) {
consume();
if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
// first in the bracket expression
values.append({ CharacterCompareType::OrdinaryCharacter, { '-' } });
} else if (match(TokenType::RightBracket)) {
// Last in the bracket expression
values.append({ CharacterCompareType::OrdinaryCharacter, { '-' } });
} else if (values.last().type == CharacterCompareType::OrdinaryCharacter) {
values.append({ CharacterCompareType::RangeExpressionDummy, 0 });
if (match(TokenType::HyphenMinus)) {
consume();
// Valid range, add ordinary character
values.append({ CharacterCompareType::OrdinaryCharacter, { '-' } });
}
} else {
return set_error(Error::InvalidRange);
}
} else if (match(TokenType::OrdinaryCharacter) || match(TokenType::Period) || match(TokenType::Asterisk) || match(TokenType::EscapeSequence) || match(TokenType::Plus)) {
values.append({ CharacterCompareType::OrdinaryCharacter, { *consume().value().characters_without_null_termination() } });
} else if (match(TokenType::Circumflex)) {
auto t = consume();
if (values.is_empty())
values.append({ CharacterCompareType::Inverse, 0 });
else
values.append({ CharacterCompareType::OrdinaryCharacter, { *t.value().characters_without_null_termination() } });
} else if (match(TokenType::LeftBracket)) {
consume();
if (match(TokenType::Period)) {
consume();
// FIXME: Parse collating element, this is needed when we have locale support
// This could have impact on length parameter, I guess.
ASSERT_NOT_REACHED();
consume(TokenType::Period, Error::InvalidCollationElement);
consume(TokenType::RightBracket, Error::BracketMismatch);
} else if (match(TokenType::EqualSign)) {
consume();
// FIXME: Parse collating element, this is needed when we have locale support
// This could have impact on length parameter, I guess.
ASSERT_NOT_REACHED();
consume(TokenType::EqualSign, Error::InvalidCollationElement);
consume(TokenType::RightBracket, Error::BracketMismatch);
} else if (match(TokenType::Colon)) {
consume();
CharacterClass ch_class;
// parse character class
if (match(TokenType::OrdinaryCharacter)) {
if (consume("alnum"))
ch_class = CharacterClass::Alnum;
else if (consume("alpha"))
ch_class = CharacterClass::Alpha;
else if (consume("blank"))
ch_class = CharacterClass::Blank;
else if (consume("cntrl"))
ch_class = CharacterClass::Cntrl;
else if (consume("digit"))
ch_class = CharacterClass::Digit;
else if (consume("graph"))
ch_class = CharacterClass::Graph;
else if (consume("lower"))
ch_class = CharacterClass::Lower;
else if (consume("print"))
ch_class = CharacterClass::Print;
else if (consume("punct"))
ch_class = CharacterClass::Punct;
else if (consume("space"))
ch_class = CharacterClass::Space;
else if (consume("upper"))
ch_class = CharacterClass::Upper;
else if (consume("xdigit"))
ch_class = CharacterClass::Xdigit;
else
return set_error(Error::InvalidCharacterClass);
values.append({ CharacterCompareType::CharacterClass, ch_class });
} else
return set_error(Error::InvalidCharacterClass);
// FIXME: we do not support locale specific character classes until locales are implemented
consume(TokenType::Colon, Error::InvalidCharacterClass);
consume(TokenType::RightBracket, Error::BracketMismatch);
}
} else if (match(TokenType::RightBracket)) {
if (values.is_empty() || (values.size() == 1 && values.last().type == CharacterCompareType::Inverse)) {
// handle bracket as ordinary character
values.append({ CharacterCompareType::OrdinaryCharacter, { *consume().value().characters_without_null_termination() } });
} else {
// closing bracket expression
break;
}
} else
// nothing matched, this is a failure, as at least the closing bracket must match...
return set_error(Error::BracketMismatch);
// check if range expression has to be completed...
if (values.size() >= 3 && values.at(values.size() - 2).type == CharacterCompareType::RangeExpressionDummy) {
if (values.last().type != CharacterCompareType::OrdinaryCharacter)
return set_error(Error::InvalidRange);
auto value2 = values.take_last();
values.take_last(); // RangeExpressionDummy
auto value1 = values.take_last();
values.append({ CharacterCompareType::RangeExpression, ByteCodeValue { value1.value.ch, value2.value.ch } });
}
}
if (values.size())
match_length_minimum = 1;
if (values.first().type == CharacterCompareType::Inverse)
match_length_minimum = 0;
insert_bytecode_compare_values(stack, move(values));
return !has_error();
}
bool PosixExtendedParser::parse_sub_expression(Vector<ByteCodeValue>& stack, size_t& match_length_minimum)
{
Vector<ByteCodeValue> bytecode;
size_t length = 0;
bool should_parse_repetition_symbol { false };
for (;;) {
if (match_ordinary_characters()) {
Token start_token = m_parser_state.current_token;
Token last_token = m_parser_state.current_token;
for (;;) {
if (!match_ordinary_characters())
break;
++length;
last_token = consume();
}
if (length > 1) {
stack.empend(OpCode::Compare);
stack.empend(1ul); // number of arguments
stack.empend(CharacterCompareType::OrdinaryCharacters);
stack.empend(start_token.value().characters_without_null_termination());
stack.empend(length - ((match_repetition_symbol() && length > 1) ? 1 : 0)); // last character is inserted into 'bytecode' for duplication symbol handling
}
if ((match_repetition_symbol() && length > 1) || length == 1) // Create own compare opcode for last character before duplication symbol
insert_bytecode_compare_values(bytecode, { { CharacterCompareType::OrdinaryCharacter, { last_token.value().characters_without_null_termination()[0] } } });
should_parse_repetition_symbol = true;
break;
}
if (match_repetition_symbol())
return set_error(Error::InvalidRepetitionMarker);
if (match(TokenType::Period)) {
length = 1;
consume();
insert_bytecode_compare_values(bytecode, { { CharacterCompareType::AnySingleCharacter, { 0 } } });
should_parse_repetition_symbol = true;
break;
}
if (match(TokenType::EscapeSequence)) {
length = 1;
Token t = consume();
#ifdef REGEX_DEBUG
printf("[PARSER] EscapeSequence with substring %s\n", String(t.value()).characters());
#endif
insert_bytecode_compare_values(bytecode, { { CharacterCompareType::OrdinaryCharacter, { (char)t.value().characters_without_null_termination()[1] } } });
should_parse_repetition_symbol = true;
break;
}
if (match(TokenType::LeftBracket)) {
consume();
Vector<ByteCodeValue> sub_ops;
if (!parse_bracket_expression(sub_ops, length) || !sub_ops.size())
return set_error(Error::BracketMismatch);
bytecode.append(move(sub_ops));
consume(TokenType::RightBracket);
should_parse_repetition_symbol = true;
break;
}
if (match(TokenType::RightBracket)) {
return set_error(Error::BracketMismatch);
}
if (match(TokenType::RightCurly)) {
return set_error(Error::BraceMismatch);
}
if (match(TokenType::Circumflex)) {
consume();
bytecode.empend(OpCode::CheckBegin);
break;
}
if (match(TokenType::Dollar)) {
consume();
bytecode.empend(OpCode::CheckEnd);
break;
}
if (match(TokenType::LeftParen)) {
consume();
Optional<StringView> capture_group_name;
bool no_subexpression_match_qualifier = false;
if (match(TokenType::Questionmark)) {
consume();
if (match(TokenType::Colon)) {
consume();
no_subexpression_match_qualifier = true;
} else if (consume("<")) { // named capturing group
Token start_token = m_parser_state.current_token;
Token last_token = m_parser_state.current_token;
size_t capture_group_name_length = 0;
for (;;) {
if (!match_ordinary_characters())
return set_error(Error::InvalidNameForCaptureGroup);
if (match(TokenType::OrdinaryCharacter) && m_parser_state.current_token.value()[0] == '>') {
consume();
break;
}
++capture_group_name_length;
last_token = consume();
}
capture_group_name = StringView(start_token.value().characters_without_null_termination(), capture_group_name_length);
} else if (match(TokenType::EqualSign)) { // positive lookahead
consume();
ASSERT_NOT_REACHED();
} else if (consume("!")) { // negative lookahead
ASSERT_NOT_REACHED();
} else if (consume("<")) {
if (match(TokenType::EqualSign)) { // positive lookbehind
consume();
ASSERT_NOT_REACHED();
}
if (consume("!")) // negative lookbehind
ASSERT_NOT_REACHED();
} else {
return set_error(Error::InvalidRepetitionMarker);
}
}
if (!(m_parser_state.regex_options & (u8)AllFlags::NoSubExpressions || no_subexpression_match_qualifier)) {
if (capture_group_name.has_value())
insert_bytecode_group_capture_left(bytecode, capture_group_name.value());
else
insert_bytecode_group_capture_left(bytecode);
}
Vector<ByteCodeValue> capture_group_bytecode;
bool res = !parse_root(capture_group_bytecode, length);
if (capture_group_bytecode.is_empty() && match(TokenType::RightParen))
return set_error(Error::ParenEmpty);
if (!res)
return false;
bytecode.append(move(capture_group_bytecode));
consume(TokenType::RightParen, Error::ParenMismatch);
if (!(m_parser_state.regex_options & (u8)AllFlags::NoSubExpressions || no_subexpression_match_qualifier)) {
if (capture_group_name.has_value()) {
insert_bytecode_group_capture_right(bytecode, capture_group_name.value());
++m_parser_state.named_capture_groups_count;
} else {
insert_bytecode_group_capture_right(bytecode);
++m_parser_state.capture_groups_count;
}
}
should_parse_repetition_symbol = true;
break;
}
return false;
}
if (match_repetition_symbol()) {
if (should_parse_repetition_symbol)
parse_repetition_symbol(bytecode, length);
else
return set_error(Error::InvalidRepetitionMarker);
}
stack.append(move(bytecode));
match_length_minimum += length;
return true;
}
bool PosixExtendedParser::parse_root(Vector<ByteCodeValue>& stack, size_t& match_length_minimum)
{
Vector<ByteCodeValue> bytecode_left;
size_t match_length_minimum_left { 0 };
if (match_repetition_symbol())
return set_error(Error::InvalidRepetitionMarker);
for (;;) {
if (!parse_sub_expression(bytecode_left, match_length_minimum_left))
break;
if (match(TokenType::Pipe)) {
consume();
Vector<ByteCodeValue> bytecode_right;
size_t match_length_minimum_right { 0 };
if (!parse_root(bytecode_right, match_length_minimum_right) || bytecode_right.is_empty())
return set_error(Error::InvalidPattern);
Vector<ByteCodeValue> new_bytecode;
insert_bytecode_alternation(new_bytecode, move(bytecode_left), move(bytecode_right));
bytecode_left = move(new_bytecode);
match_length_minimum_left = min(match_length_minimum_right, match_length_minimum_left);
}
}
stack.append(move(bytecode_left));
match_length_minimum = match_length_minimum_left;
return !has_error();
}
}
}

View file

@ -0,0 +1,145 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "RegexByteCode.h"
#include "RegexError.h"
#include "RegexLexer.h"
#include "RegexOptions.h"
#include <AK/Forward.h>
#include <AK/StringBuilder.h>
#include <AK/Types.h>
#include <AK/Vector.h>
namespace regex {
class PosixExtendedParser;
template<typename T>
struct GenericParserTraits {
using OptionsType = T;
};
template<typename T>
struct ParserTraits : public GenericParserTraits<T> {
};
template<>
struct ParserTraits<PosixExtendedParser> : public GenericParserTraits<PosixOptions> {
};
class Parser {
public:
struct Result {
ByteCode bytecode;
size_t capture_groups_count;
size_t named_capture_groups_count;
size_t match_length_minimum;
Error error;
Token error_token;
};
explicit Parser(Lexer& lexer)
: m_parser_state(lexer)
{
}
Parser(Lexer& lexer, AllOptions regex_options)
: m_parser_state(lexer, regex_options)
{
}
virtual ~Parser() = default;
Result parse(Optional<AllOptions> regex_options = {});
bool has_error() const { return m_parser_state.error != Error::NoError; }
Error error() const { return m_parser_state.error; }
protected:
virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
ALWAYS_INLINE bool match(TokenType type) const;
ALWAYS_INLINE bool match(char ch) const;
ALWAYS_INLINE Token consume();
ALWAYS_INLINE Token consume(TokenType type, Error error);
ALWAYS_INLINE bool consume(const String&);
ALWAYS_INLINE void reset();
ALWAYS_INLINE bool done() const;
ALWAYS_INLINE bool set_error(Error error);
struct ParserState {
Lexer& lexer;
Token current_token;
Error error = Error::NoError;
Token error_token { TokenType::Eof, 0, StringView(nullptr) };
ByteCode bytecode;
size_t capture_groups_count { 0 };
size_t named_capture_groups_count { 0 };
size_t match_length_minimum { 0 };
AllOptions regex_options;
explicit ParserState(Lexer& lexer)
: lexer(lexer)
, current_token(lexer.next())
{
}
explicit ParserState(Lexer& lexer, AllOptions regex_options)
: lexer(lexer)
, current_token(lexer.next())
, regex_options(regex_options)
{
}
};
ParserState m_parser_state;
};
class PosixExtendedParser final : public Parser {
public:
explicit PosixExtendedParser(Lexer& lexer)
: Parser(lexer) {};
PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
: Parser(lexer, regex_options.value_or({})) {};
~PosixExtendedParser() = default;
private:
ALWAYS_INLINE bool match_repetition_symbol();
ALWAYS_INLINE bool match_ordinary_characters();
bool parse_internal(ByteCode&, size_t&) override;
bool parse_root(ByteCode&, size_t&);
ALWAYS_INLINE bool parse_sub_expression(ByteCode&, size_t&);
ALWAYS_INLINE bool parse_bracket_expression(ByteCode&, size_t&);
ALWAYS_INLINE bool parse_repetition_symbol(ByteCode&, size_t&);
};
using PosixExtended = PosixExtendedParser;
}
using regex::PosixExtended;

View file

@ -0,0 +1,257 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#pragma once
#include "RegexError.h"
#include "RegexLexer.h"
#include "RegexOptions.h"
#include <AK/Forward.h>
#include <AK/Types.h>
#include <AK/Vector.h>
namespace AK {
namespace regex {
#define ENUMERATE_OPCODES \
__ENUMERATE_OPCODE(Compare) \
__ENUMERATE_OPCODE(Jump) \
__ENUMERATE_OPCODE(ForkJump) \
__ENUMERATE_OPCODE(ForkStay) \
__ENUMERATE_OPCODE(SaveLeftCaptureGroup) \
__ENUMERATE_OPCODE(SaveRightCaptureGroup) \
__ENUMERATE_OPCODE(SaveLeftNamedCaptureGroup) \
__ENUMERATE_OPCODE(SaveRightNamedCaptureGroup) \
__ENUMERATE_OPCODE(CheckBegin) \
__ENUMERATE_OPCODE(CheckEnd) \
__ENUMERATE_OPCODE(Exit)
enum class OpCode : u8 {
#define __ENUMERATE_OPCODE(x) x,
ENUMERATE_OPCODES
#undef __ENUMERATE_OPCODE
};
enum class CharacterCompareType : u8 {
Undefined,
Inverse,
AnySingleCharacter,
OrdinaryCharacter,
OrdinaryCharacters,
CharacterClass,
RangeExpression,
RangeExpressionDummy,
};
enum class CharacterClass : u8 {
Alnum,
Cntrl,
Lower,
Space,
Alpha,
Digit,
Print,
Upper,
Blank,
Graph,
Punct,
Xdigit,
};
class ByteCodeValue {
public:
union CompareValue {
CompareValue(const CharacterClass value)
: character_class(value)
{
}
CompareValue(const char value1, const char value2)
: range_values { value1, value2 }
{
}
const CharacterClass character_class;
const struct {
const char from;
const char to;
} range_values;
};
union {
const OpCode op_code;
const char* string;
const char ch;
const int number;
const size_t positive_number;
const CompareValue compare_value;
const CharacterCompareType compare_type;
};
const char* name() const;
static const char* name(OpCode);
ByteCodeValue(const OpCode value)
: op_code(value)
{
}
ByteCodeValue(const char* value)
: string(value)
{
}
ByteCodeValue(const char value)
: ch(value)
{
}
ByteCodeValue(const int value)
: number(value)
{
}
ByteCodeValue(const size_t value)
: positive_number(value)
{
}
ByteCodeValue(const CharacterClass value)
: compare_value(value)
{
}
ByteCodeValue(const char value1, const char value2)
: compare_value(value1, value2)
{
}
ByteCodeValue(const CharacterCompareType value)
: compare_type(value)
{
}
~ByteCodeValue() = default;
};
struct CompareTypeAndValuePair {
CharacterCompareType type;
ByteCodeValue value;
};
struct ParserResult {
Vector<ByteCodeValue> m_bytes;
size_t m_match_groups;
size_t m_min_match_length;
Error m_error;
Token m_error_token;
};
template<class T>
class Parser {
public:
explicit Parser(Lexer& lexer)
: m_parser_state(lexer)
{
}
Parser(Lexer& lexer, T options)
: m_parser_state(lexer, options)
{
}
virtual ~Parser() = default;
virtual ParserResult parse(T options = {}, EngineOptions engine_options = {});
bool has_error() const { return m_parser_state.m_error != Error::NoError; }
Error error() const { return m_parser_state.m_error; }
protected:
virtual bool parse_internal(Vector<ByteCodeValue>&, size_t& min_length) = 0;
bool match(TokenType type) const;
bool match(char ch) const;
Token consume();
Token consume(TokenType type, Error error = Error::InvalidPattern);
bool consume(const String&);
void reset();
bool done() const;
bool set_error(Error error);
void insert_bytecode_compare_values(Vector<ByteCodeValue>&, Vector<CompareTypeAndValuePair>&&);
void insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack);
void insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack);
void insert_bytecode_group_capture_left(Vector<ByteCodeValue>& stack, const StringView& name);
void insert_bytecode_group_capture_right(Vector<ByteCodeValue>& stack, const StringView& name);
void insert_bytecode_alternation(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>&&, Vector<ByteCodeValue>&&);
void insert_bytecode_repetition_min_max(Vector<ByteCodeValue>& bytecode_to_repeat, size_t minimum, Optional<size_t> maximum);
void insert_bytecode_repetition_n(Vector<ByteCodeValue>& stack, Vector<ByteCodeValue>& bytecode_to_repeat, size_t n);
void insert_bytecode_repetition_min_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy);
void insert_bytecode_repetition_any(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy);
void insert_bytecode_repetition_zero_or_one(Vector<ByteCodeValue>& bytecode_to_repeat, bool greedy);
struct ParserState {
Lexer& lexer;
Token current_token;
Error error = Error::NoError;
Token error_token { TokenType::Eof, 0, StringView(nullptr) };
Vector<ByteCodeValue> bytecode;
size_t capture_groups_count { 0 };
size_t named_capture_groups_count { 0 };
size_t match_length_minimum { 0 };
OptionsType regex_options;
explicit ParserState(Lexer& lexer)
: lexer(lexer)
, current_token(lexer.next())
{
}
explicit ParserState(Lexer& lexer, Optional<OptionsType> regex_options)
: lexer(lexer)
, current_token(lexer.next())
, regex_options(regex_options.value_or({}))
{
}
};
ParserState m_parser_state;
};
class PosixExtendedParser final : public Parser<PosixOptions> {
public:
explicit PosixExtendedParser(Lexer& lexer)
: Parser(lexer) {};
PosixExtendedParser(Lexer& lexer, Optional<OptionsType> regex_options)
: Parser(lexer, regex_options) {};
~PosixExtendedParser() = default;
private:
bool match_repetition_symbol();
bool match_ordinary_characters();
bool parse_internal(Vector<ByteCodeValue>&, size_t&) override;
bool parse_root(Vector<ByteCodeValue>&, size_t&);
bool parse_sub_expression(Vector<ByteCodeValue>&, size_t&);
bool parse_bracket_expression(Vector<ByteCodeValue>&, size_t&);
bool parse_repetition_symbol(Vector<ByteCodeValue>&, size_t&);
};
}
}
using AK::regex::ParserResult;
using AK::regex::PosixExtendedParser;

View file

@ -0,0 +1,914 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <AK/TestSuite.h> // import first, to prevent warning of ASSERT* redefinition
#include <LibRegex/Regex.h>
#include <stdio.h>
#ifndef REGEX_DEBUG
# define BENCHMARK_LOOP_ITERATIONS 100000
//# define REGEX_BENCHMARK_OUR
# ifndef __serenity__
//# define REGEX_BENCHMARK_OTHER
# endif
# if defined(REGEX_BENCHMARK_OTHER)
# include <regex>
# endif
# if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(catch_all_benchmark)
{
Regex<PosixExtended> re("^.*$");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT(re.match("Hello World", m));
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(catch_all_benchmark_reference_stdcpp_regex_match)
{
std::regex re("^.*$");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("Hello World", m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_start_benchmark)
{
Regex<PosixExtended> re("^hello friends");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("Hello!", m), false);
EXPECT_EQ(re.match("hello friends", m), true);
EXPECT_EQ(re.match("Well, hello friends", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_start_benchmark_reference_stdcpp_regex_match)
{
std::regex re("^hello friends");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("Hello", m, re), false);
EXPECT_EQ(std::regex_match("hello friends", m, re), true);
EXPECT_EQ(std::regex_match("Well, hello friends", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_end_benchmark)
{
Regex<PosixExtended> re(".*hello\\.\\.\\. there$");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("Hallo", m), false);
EXPECT_EQ(re.match("I said fyhello... there", m), true);
EXPECT_EQ(re.match("ahello... therea", m), false);
EXPECT_EQ(re.match("hello.. there", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_end_benchmark_reference_stdcpp_regex_search)
{
std::regex re(".*hello\\.\\.\\. there$");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_search("Hallo", m, re), false);
EXPECT_EQ(std::regex_search("I said fyhello... there", m, re), true);
EXPECT_EQ(std::regex_search("ahello... therea", m, re), false);
EXPECT_EQ(std::regex_search("hello.. there", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_period_benchmark)
{
Regex<PosixExtended> re("hello.");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("Hello1", m), false);
EXPECT_EQ(re.match("hello1", m), true);
EXPECT_EQ(re.match("hello2", m), true);
EXPECT_EQ(re.match("hello?", m), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_period_benchmark_reference_stdcpp_regex_match)
{
std::regex re("hello.");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("Hello1", m, re), false);
EXPECT_EQ(std::regex_match("hello1", m, re), true);
EXPECT_EQ(std::regex_match("hello2", m, re), true);
EXPECT_EQ(std::regex_match("hello?", m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_period_end_benchmark)
{
Regex<PosixExtended> re("hello.$");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.search("Hello1", m), false);
EXPECT_EQ(re.search("hello1hello1", m), true);
EXPECT_EQ(re.search("hello2hell", m), false);
EXPECT_EQ(re.search("hello?", m), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_period_end_benchmark_reference_stdcpp_regex_search)
{
std::regex re("hello.$");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_search("Hello1", m, re), false);
EXPECT_EQ(std::regex_search("hello1hello1", m, re), true);
EXPECT_EQ(std::regex_search("hello2hell", m, re), false);
EXPECT_EQ(std::regex_search("hello?", m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_escaped_benchmark)
{
Regex<PosixExtended> re("hello\\.");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("hello", m), false);
EXPECT_EQ(re.match("hello.", m), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_escaped_benchmark_reference_stdcpp_regex_match)
{
std::regex re("hello\\.");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("hello", m, re), false);
EXPECT_EQ(std::regex_match("hello.", m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_period2_end_benchmark)
{
Regex<PosixExtended> re(".*hi... there$");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.search("Hello there", m), false);
EXPECT_EQ(re.search("I said fyhi... there", m), true);
EXPECT_EQ(re.search("....hi... ", m), false);
EXPECT_EQ(re.search("I said fyhihii there", m), true);
EXPECT_EQ(re.search("I said fyhihi there", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_period2_end_benchmark_reference_stdcpp_regex_search)
{
std::regex re(".*hi... there$");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_search("Hello there", m, re), false);
EXPECT_EQ(std::regex_search("I said fyhi... there", m, re), true);
EXPECT_EQ(std::regex_search("....hi... ", m, re), false);
EXPECT_EQ(std::regex_search("I said fyhihii there", m, re), true);
EXPECT_EQ(std::regex_search("I said fyhihi there", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_plus_benchmark)
{
Regex<PosixExtended> re("a+");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.search("b", m), false);
EXPECT_EQ(re.search("a", m), true);
EXPECT_EQ(re.search("aaaaaabbbbb", m), true);
EXPECT_EQ(re.search("aaaaaaaaaaa", m), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_plus_benchmark_reference_stdcpp_regex_search)
{
std::regex re("a+");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_search("b", m, re), false);
EXPECT_EQ(std::regex_search("a", m, re), true);
EXPECT_EQ(std::regex_search("aaaaaabbbbb", m, re), true);
EXPECT_EQ(std::regex_search("aaaaaaaaaaa", m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_questionmark_benchmark)
{
Regex<PosixExtended> re("da?d");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.search("a", m), false);
EXPECT_EQ(re.search("daa", m), false);
EXPECT_EQ(re.search("ddddd", m), true);
EXPECT_EQ(re.search("dd", m), true);
EXPECT_EQ(re.search("dad", m), true);
EXPECT_EQ(re.search("dada", m), true);
EXPECT_EQ(re.search("adadaa", m), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_questionmark_benchmark_reference_stdcpp_regex_search)
{
std::regex re("da?d");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_search("a", m, re), false);
EXPECT_EQ(std::regex_search("daa", m, re), false);
EXPECT_EQ(std::regex_search("ddddd", m, re), true);
EXPECT_EQ(std::regex_search("dd", m, re), true);
EXPECT_EQ(std::regex_search("dad", m, re), true);
EXPECT_EQ(std::regex_search("dada", m, re), true);
EXPECT_EQ(std::regex_search("adadaa", m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(character_class_benchmark)
{
Regex<PosixExtended> re("[[:alpha:]]");
RegexResult m;
String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match(haystack.characters(), m), false);
EXPECT_EQ(re.search(haystack.characters(), m), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(character_class_benchmark_reference_stdcpp_regex_search)
{
std::regex re("[[:alpha:]]");
std::cmatch m;
String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match(haystack.characters(), m, re), false);
EXPECT_EQ(std::regex_search(haystack.characters(), m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(escaped_char_questionmark_benchmark)
{
Regex<PosixExtended> re("This\\.?And\\.?That");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("ThisAndThat", m), true);
EXPECT_EQ(re.match("This.And.That", m), true);
EXPECT_EQ(re.match("This And That", m), false);
EXPECT_EQ(re.match("This..And..That", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(escaped_char_questionmark_benchmark_reference_stdcpp_regex_match)
{
std::regex re("This\\.?And\\.?That");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("ThisAndThat", m, re), true);
EXPECT_EQ(std::regex_match("This.And.That", m, re), true);
EXPECT_EQ(std::regex_match("This And That", m, re), false);
EXPECT_EQ(std::regex_match("This..And..That", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(char_qualifier_asterisk_benchmark)
{
Regex<PosixExtended> re("regex*");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.search("#include <regex.h>", m), true);
EXPECT_EQ(re.search("#include <stdio.h>", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(char_qualifier_asterisk_benchmark_reference_stdcpp_regex_match)
{
std::regex re("regex*");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_search("#include <regex.h>", m, re), true);
EXPECT_EQ(std::regex_search("#include <stdio.h>", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(parens_qualifier_questionmark_benchmark)
{
Regex<PosixExtended> re("test(hello)?test");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("testtest", m), true);
EXPECT_EQ(re.match("testhellotest", m), true);
EXPECT_EQ(re.match("testasfdtest", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(parens_qualifier_questionmark_benchmark_reference_stdcpp_regex_match)
{
std::regex re("test(hello)?test");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("testtest", m, re), true);
EXPECT_EQ(std::regex_match("testhellotest", m, re), true);
EXPECT_EQ(std::regex_match("testasfdtest", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(parens_qualifier_asterisk_benchmark)
{
Regex<PosixExtended> re("test(hello)*test");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("testtest", m), true);
EXPECT_EQ(re.match("testhellohellotest", m), true);
EXPECT_EQ(re.search("testhellohellotest, testhellotest", m), true);
EXPECT_EQ(re.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(parens_qualifier_asterisk_benchmark_reference_stdcpp_regex_match)
{
std::regex re("test(hello)*test");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("testtest", m, re), true);
EXPECT_EQ(std::regex_match("testhellohellotest", m, re), true);
EXPECT_EQ(std::regex_search("testhellohellotest, testhellotest", m, re), true);
EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(parens_qualifier_asterisk_2_benchmark)
{
Regex<PosixExtended> re("test(.*)test");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("testasdftest", m), true);
EXPECT_EQ(re.match("testasdfasdftest", m), true);
EXPECT_EQ(re.search("testaaaatest, testbbbtest, testtest", m), true);
EXPECT_EQ(re.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(parens_qualifier_asterisk_2_benchmark_reference_stdcpp_regex_match)
{
std::regex re("test(.*)test");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("testasdftest", m, re), true);
EXPECT_EQ(std::regex_match("testasdfasdftest", m, re), true);
EXPECT_EQ(std::regex_search("testaaaatest, testbbbtest, testtest", m, re), true);
EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(multi_parens_qualifier_questionmark_benchmark)
{
Regex<PosixExtended> re("test(a)?(b)?(c)?test");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("testtest", m), true);
EXPECT_EQ(re.match("testabctest", m), true);
EXPECT_EQ(re.search("testabctest, testactest", m), true);
EXPECT_EQ(re.match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m), false);
EXPECT_EQ(re.match("test", m), false);
EXPECT_EQ(re.match("whaaaaat", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(multi_parens_qualifier_questionmark_benchmark_reference_stdcpp_regex_match)
{
std::regex re("test(a)?(b)?(c)?test");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("testtest", m, re), true);
EXPECT_EQ(std::regex_match("testabctest", m, re), true);
EXPECT_EQ(std::regex_search("testabctest, testactest", m, re), true);
EXPECT_EQ(std::regex_match("aaaaaaaaaaaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbbbbbbb", m, re), false);
EXPECT_EQ(std::regex_match("test", m, re), false);
EXPECT_EQ(std::regex_match("whaaaaat", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_alternative_benchmark)
{
Regex<PosixExtended> re("test|hello|friends");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("test", m), true);
EXPECT_EQ(re.match("hello", m), true);
EXPECT_EQ(re.match("friends", m), true);
EXPECT_EQ(re.match("whaaaaat", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_alternative_benchmark_reference_stdcpp_regex_match)
{
std::regex re("test|hello|friends");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("test", m, re), true);
EXPECT_EQ(std::regex_match("hello", m, re), true);
EXPECT_EQ(std::regex_match("friends", m, re), true);
EXPECT_EQ(std::regex_match("whaaaaat", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(alternative_match_groups_benchmark)
{
Regex<PosixExtended> re("test(a)?(b)?|hello ?(dear|my)? friends");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("test", m), true);
EXPECT_EQ(re.match("testa", m), true);
EXPECT_EQ(re.match("testb", m), true);
EXPECT_EQ(re.match("hello friends", m), true);
EXPECT_EQ(re.match("hello dear friends", m), true);
EXPECT_EQ(re.match("hello my friends", m), true);
EXPECT_EQ(re.match("testabc", m), false);
EXPECT_EQ(re.match("hello test friends", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(alternative_match_groups_benchmark_reference_stdcpp_regex_match)
{
std::regex re("test(a)?(b)?|hello ?(dear|my)? friends");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("test", m, re), true);
EXPECT_EQ(std::regex_match("testa", m, re), true);
EXPECT_EQ(std::regex_match("testb", m, re), true);
EXPECT_EQ(std::regex_match("hello friends", m, re), true);
EXPECT_EQ(std::regex_match("hello dear friends", m, re), true);
EXPECT_EQ(std::regex_match("hello my friends", m, re), true);
EXPECT_EQ(std::regex_match("testabc", m, re), false);
EXPECT_EQ(std::regex_match("hello test friends", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(parens_qualifier_exact_benchmark)
{
Regex<PosixExtended> re("(hello){3}");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("hello", m), false);
EXPECT_EQ(re.match("hellohellohello", m), true);
EXPECT_EQ(re.search("hellohellohellohello", m), true);
EXPECT_EQ(re.search("test hellohellohello", m), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(parens_qualifier_exact_benchmark_reference_stdcpp_regex_match)
{
std::regex re("(hello){3}");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("hello", m, re), false);
EXPECT_EQ(std::regex_match("hellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("hellohellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("test hellohellohello", m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(parens_qualifier_minimum_benchmark)
{
Regex<PosixExtended> re("(hello){3,}");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("hello", m), false);
EXPECT_EQ(re.match("hellohellohello", m), true);
EXPECT_EQ(re.search("hellohellohellohello", m), true);
EXPECT_EQ(re.search("test hellohellohello", m), true);
EXPECT_EQ(re.search("test hellohellohellohello", m), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(parens_qualifier_minimum_benchmark_reference_stdcpp_regex_match)
{
std::regex re("(hello){3,}");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("hello", m, re), false);
EXPECT_EQ(std::regex_match("hellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("hellohellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("test hellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("test hellohellohellohello", m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(parens_qualifier_maximum_benchmark)
{
Regex<PosixExtended> re("(hello){2,3}");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("hello", m), false);
EXPECT_EQ(re.match("hellohellohello", m), true);
EXPECT_EQ(re.search("hellohellohellohello", m), true);
EXPECT_EQ(re.search("test hellohellohello", m), true);
EXPECT_EQ(re.search("test hellohellohellohello", m), true);
EXPECT_EQ(re.match("test hellohellohellohello", m), false);
EXPECT_EQ(re.search("test hellohellohellohello", m), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(parens_qualifier_maximum_benchmark_reference_stdcpp_regex_match)
{
std::regex re("(hello){2,3}");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("hello", m, re), false);
EXPECT_EQ(std::regex_match("hellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("hellohellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("test hellohellohello", m, re), true);
EXPECT_EQ(std::regex_search("test hellohellohellohello", m, re), true);
EXPECT_EQ(std::regex_match("test hellohellohellohello", m, re), false);
EXPECT_EQ(std::regex_search("test hellohellohellohello", m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(char_qualifier_min_max_benchmark)
{
Regex<PosixExtended> re("c{3,30}");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("cc", m), false);
EXPECT_EQ(re.match("ccc", m), true);
EXPECT_EQ(re.match("cccccccccccccccccccccccccccccc", m), true);
EXPECT_EQ(re.match("ccccccccccccccccccccccccccccccc", m), false);
EXPECT_EQ(re.search("ccccccccccccccccccccccccccccccc", m), true);
EXPECT_EQ(re.match("cccccccccccccccccccccccccccccccc", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(char_qualifier_min_max_benchmark_reference_stdcpp_regex_match)
{
std::regex re("c{3,30}");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("cc", m, re), false);
EXPECT_EQ(std::regex_match("ccc", m, re), true);
EXPECT_EQ(std::regex_match("cccccccccccccccccccccccccccccc", m, re), true);
EXPECT_EQ(std::regex_match("ccccccccccccccccccccccccccccccc", m, re), false);
EXPECT_EQ(std::regex_search("ccccccccccccccccccccccccccccccc", m, re), true);
EXPECT_EQ(std::regex_match("cccccccccccccccccccccccccccccccc", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_bracket_chars_benchmark)
{
Regex<PosixExtended> re("[abc]");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("a", m), true);
EXPECT_EQ(re.match("b", m), true);
EXPECT_EQ(re.match("c", m), true);
EXPECT_EQ(re.match("d", m), false);
EXPECT_EQ(re.match("e", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_bracket_chars_benchmark_reference_stdcpp_regex_match)
{
std::regex re("[abc]");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("a", m, re), true);
EXPECT_EQ(std::regex_match("b", m, re), true);
EXPECT_EQ(std::regex_match("c", m, re), true);
EXPECT_EQ(std::regex_match("d", m, re), false);
EXPECT_EQ(std::regex_match("e", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_bracket_chars_inverse_benchmark)
{
Regex<PosixExtended> re("[^abc]");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("a", m), false);
EXPECT_EQ(re.match("b", m), false);
EXPECT_EQ(re.match("c", m), false);
EXPECT_EQ(re.match("d", m), true);
EXPECT_EQ(re.match("e", m), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_bracket_chars_inverse_benchmark_reference_stdcpp_regex_match)
{
std::regex re("[^abc]");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("a", m, re), false);
EXPECT_EQ(std::regex_match("b", m, re), false);
EXPECT_EQ(std::regex_match("c", m, re), false);
EXPECT_EQ(std::regex_match("d", m, re), true);
EXPECT_EQ(std::regex_match("e", m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_bracket_chars_range_benchmark)
{
Regex<PosixExtended> re("[a-d]");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("a", m), true);
EXPECT_EQ(re.match("b", m), true);
EXPECT_EQ(re.match("c", m), true);
EXPECT_EQ(re.match("d", m), true);
EXPECT_EQ(re.match("e", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_bracket_chars_range_benchmark_reference_stdcpp_regex_match)
{
std::regex re("[a-d]");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("a", m, re), true);
EXPECT_EQ(std::regex_match("b", m, re), true);
EXPECT_EQ(std::regex_match("c", m, re), true);
EXPECT_EQ(std::regex_match("d", m, re), true);
EXPECT_EQ(std::regex_match("e", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_bracket_chars_range_inverse_benchmark)
{
Regex<PosixExtended> re("[^a-df-z]");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("a", m), false);
EXPECT_EQ(re.match("b", m), false);
EXPECT_EQ(re.match("c", m), false);
EXPECT_EQ(re.match("d", m), false);
EXPECT_EQ(re.match("e", m), true);
EXPECT_EQ(re.match("k", m), false);
EXPECT_EQ(re.match("z", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_bracket_chars_range_inverse_benchmark_reference_stdcpp_regex_match)
{
std::regex re("[^a-df-z]");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("a", m, re), false);
EXPECT_EQ(std::regex_match("b", m, re), false);
EXPECT_EQ(std::regex_match("c", m, re), false);
EXPECT_EQ(std::regex_match("d", m, re), false);
EXPECT_EQ(std::regex_match("e", m, re), true);
EXPECT_EQ(std::regex_match("k", m, re), false);
EXPECT_EQ(std::regex_match("z", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(bracket_character_class_uuid_benchmark)
{
Regex<PosixExtended> re("^([[:xdigit:]]{8})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{12})$");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("fb9b62a2-1579-4e3a-afba-76239ccb6583", m), true);
EXPECT_EQ(re.match("fb9b62a2", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(bracket_character_class_uuid_benchmark_reference_stdcpp_regex_match)
{
std::regex re("^([[:xdigit:]]{8})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{4})-([[:xdigit:]]{12})$");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("fb9b62a2-1579-4e3a-afba-76239ccb6583", m, re), true);
EXPECT_EQ(std::regex_match("fb9b62a2", m, re), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_bracket_character_class_inverse_benchmark)
{
Regex<PosixExtended> re("[^[:digit:]]");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("1", m), false);
EXPECT_EQ(re.match("2", m), false);
EXPECT_EQ(re.match("3", m), false);
EXPECT_EQ(re.match("d", m), true);
EXPECT_EQ(re.match("e", m), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_bracket_character_class_inverse_benchmark_reference_stdcpp_regex_match)
{
std::regex re("[^[:digit:]]");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("1", m, re), false);
EXPECT_EQ(std::regex_match("2", m, re), false);
EXPECT_EQ(std::regex_match("3", m, re), false);
EXPECT_EQ(std::regex_match("d", m, re), true);
EXPECT_EQ(std::regex_match("e", m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(email_address_benchmark)
{
Regex<PosixExtended> re("^[A-Z0-9a-z._%+-]{1,64}@(?:[A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("hello.world@domain.tld", m), true);
EXPECT_EQ(re.match("this.is.a.very_long_email_address@world.wide.web", m), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(email_address_benchmark_reference_stdcpp_regex_match)
{
std::regex re("^[A-Z0-9a-z._%+-]{1,64}@(?:[A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("hello.world@domain.tld", m, re), true);
EXPECT_EQ(std::regex_match("this.is.a.very_long_email_address@world.wide.web", m, re), true);
}
}
#endif
#if defined(REGEX_BENCHMARK_OUR)
BENCHMARK_CASE(simple_ignorecase_benchmark)
{
Regex<PosixExtended> re("^hello friends", PosixFlags::Insensitive);
RegexResult m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(re.match("Hello Friends", m), true);
EXPECT_EQ(re.match("hello Friends", m), true);
EXPECT_EQ(re.match("hello Friends!", m), false);
EXPECT_EQ(re.search("hello Friends", m), true);
EXPECT_EQ(re.match("hell Friends", m), false);
EXPECT_EQ(re.search("hell Friends", m), false);
}
}
#endif
#if defined(REGEX_BENCHMARK_OTHER)
BENCHMARK_CASE(simple_ignorecase_benchmark_reference_stdcpp_regex_match)
{
std::regex re("^hello friends", std::regex_constants::icase);
std::cmatch m;
for (size_t i = 0; i < BENCHMARK_LOOP_ITERATIONS; ++i) {
EXPECT_EQ(std::regex_match("Hello Friends", m, re), true);
EXPECT_EQ(std::regex_match("hello Friends", m, re), true);
EXPECT_EQ(std::regex_match("hello Friends!", m, re), false);
EXPECT_EQ(std::regex_search("hello Friends", m, re), true);
EXPECT_EQ(std::regex_match("hell Friends", m, re), false);
EXPECT_EQ(std::regex_search("hell Friends", m, re), false);
}
}
#endif
#endif
TEST_MAIN(Regex)

View file

@ -0,0 +1,21 @@
file(GLOB TEST_SOURCES CONFIGURE_DEPENDS "*.cpp")
file(GLOB REGEX_SOURCES CONFIGURE_DEPENDS "../*.cpp")
file(GLOB C_REGEX_SOURCES CONFIGURE_DEPENDS "../C/*.cpp")
foreach(source ${TEST_SOURCES})
get_filename_component(name ${source} NAME_WE)
add_executable(${name} ${source} ${REGEX_SOURCES} ${C_REGEX_SOURCES})
target_link_libraries(${name} LagomCore)
add_test(
NAME ${name}
COMMAND ${name}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
set_tests_properties(
${name}
PROPERTIES
FAIL_REGULAR_EXPRESSION
"FAIL"
)
endforeach()

View file

@ -0,0 +1,451 @@
/*
* Copyright (c) 2020, Emanuel Sprung <emanuel.sprung@gmail.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <AK/TestSuite.h> // import first, to prevent warning of ASSERT* redefinition
#include <LibRegex/Regex.h>
#include <LibRegex/RegexDebug.h>
#include <AK/StringBuilder.h>
#include <stdio.h>
static ECMAScriptOptions match_test_api_options(const ECMAScriptOptions options)
{
return options;
}
static PosixOptions match_test_api_options(const PosixOptions options)
{
return options;
}
TEST_CASE(regex_options_ecmascript)
{
ECMAScriptOptions eo;
eo |= ECMAScriptFlags::Global;
EXPECT(eo & ECMAScriptFlags::Global);
EXPECT(!(eo & ECMAScriptFlags::Insensitive));
eo = match_test_api_options(ECMAScriptFlags::Global | ECMAScriptFlags::Insensitive | ECMAScriptFlags::Sticky);
EXPECT(eo & ECMAScriptFlags::Global);
EXPECT(eo & ECMAScriptFlags::Insensitive);
EXPECT(eo & ECMAScriptFlags::Sticky);
EXPECT(!(eo & ECMAScriptFlags::Unicode));
EXPECT(!(eo & ECMAScriptFlags::Multiline));
EXPECT(!(eo & ECMAScriptFlags::SingleLine));
eo &= ECMAScriptFlags::Insensitive;
EXPECT(!(eo & ECMAScriptFlags::Global));
EXPECT(eo & ECMAScriptFlags::Insensitive);
EXPECT(!(eo & ECMAScriptFlags::Multiline));
eo &= ECMAScriptFlags::Sticky;
EXPECT(!(eo & ECMAScriptFlags::Global));
EXPECT(!(eo & ECMAScriptFlags::Insensitive));
EXPECT(!(eo & ECMAScriptFlags::Multiline));
EXPECT(!(eo & ECMAScriptFlags::Sticky));
eo = ~ECMAScriptFlags::Insensitive;
EXPECT(eo & ECMAScriptFlags::Global);
EXPECT(!(eo & ECMAScriptFlags::Insensitive));
EXPECT(eo & ECMAScriptFlags::Multiline);
EXPECT(eo & ECMAScriptFlags::Sticky);
}
TEST_CASE(regex_options_posix)
{
PosixOptions eo;
eo |= PosixFlags::Global;
EXPECT(eo & PosixFlags::Global);
EXPECT(!(eo & PosixFlags::Insensitive));
eo = match_test_api_options(PosixFlags::Global | PosixFlags::Insensitive | PosixFlags::MatchNotBeginOfLine);
EXPECT(eo & PosixFlags::Global);
EXPECT(eo & PosixFlags::Insensitive);
EXPECT(eo & PosixFlags::MatchNotBeginOfLine);
EXPECT(!(eo & PosixFlags::Unicode));
EXPECT(!(eo & PosixFlags::Multiline));
eo &= PosixFlags::Insensitive;
EXPECT(!(eo & PosixFlags::Global));
EXPECT(eo & PosixFlags::Insensitive);
EXPECT(!(eo & PosixFlags::Multiline));
eo &= PosixFlags::MatchNotBeginOfLine;
EXPECT(!(eo & PosixFlags::Global));
EXPECT(!(eo & PosixFlags::Insensitive));
EXPECT(!(eo & PosixFlags::Multiline));
eo = ~PosixFlags::Insensitive;
EXPECT(eo & PosixFlags::Global);
EXPECT(!(eo & PosixFlags::Insensitive));
EXPECT(eo & PosixFlags::Multiline);
}
TEST_CASE(regex_lexer)
{
Lexer l("/[.*+?^${}()|[\\]\\\\]/g");
EXPECT(l.next().type() == regex::TokenType::Slash);
EXPECT(l.next().type() == regex::TokenType::LeftBracket);
EXPECT(l.next().type() == regex::TokenType::Period);
EXPECT(l.next().type() == regex::TokenType::Asterisk);
EXPECT(l.next().type() == regex::TokenType::Plus);
EXPECT(l.next().type() == regex::TokenType::Questionmark);
EXPECT(l.next().type() == regex::TokenType::Circumflex);
EXPECT(l.next().type() == regex::TokenType::Dollar);
EXPECT(l.next().type() == regex::TokenType::LeftCurly);
EXPECT(l.next().type() == regex::TokenType::RightCurly);
EXPECT(l.next().type() == regex::TokenType::LeftParen);
EXPECT(l.next().type() == regex::TokenType::RightParen);
EXPECT(l.next().type() == regex::TokenType::Pipe);
EXPECT(l.next().type() == regex::TokenType::LeftBracket);
EXPECT(l.next().type() == regex::TokenType::EscapeSequence);
EXPECT(l.next().type() == regex::TokenType::EscapeSequence);
EXPECT(l.next().type() == regex::TokenType::RightBracket);
EXPECT(l.next().type() == regex::TokenType::Slash);
EXPECT(l.next().type() == regex::TokenType::Char);
}
TEST_CASE(parser_error_parens)
{
String pattern = "test()test";
Lexer l(pattern);
PosixExtendedParser p(l);
p.parse();
EXPECT(p.has_error());
EXPECT(p.error() == Error::EmptySubExpression);
}
TEST_CASE(parser_error_special_characters_used_at_wrong_place)
{
String pattern;
Vector<char, 5> chars = { '*', '+', '?', '{' };
StringBuilder b;
Lexer l;
PosixExtended p(l);
for (auto& ch : chars) {
// First in ere
b.clear();
b.append(ch);
pattern = b.build();
l.set_source(pattern);
p.parse();
EXPECT(p.has_error());
EXPECT(p.error() == Error::InvalidRepetitionMarker);
// After vertical line
b.clear();
b.append("a|");
b.append(ch);
pattern = b.build();
l.set_source(pattern);
p.parse();
EXPECT(p.has_error());
EXPECT(p.error() == Error::InvalidRepetitionMarker);
// After circumflex
b.clear();
b.append("^");
b.append(ch);
pattern = b.build();
l.set_source(pattern);
p.parse();
EXPECT(p.has_error());
EXPECT(p.error() == Error::InvalidRepetitionMarker);
// After dollar
b.clear();
b.append("$");
b.append(ch);
pattern = b.build();
l.set_source(pattern);
p.parse();
EXPECT(p.has_error());
EXPECT(p.error() == Error::InvalidRepetitionMarker);
// After left parens
b.clear();
b.append("(");
b.append(ch);
b.append(")");
pattern = b.build();
l.set_source(pattern);
p.parse();
EXPECT(p.has_error());
EXPECT(p.error() == Error::InvalidRepetitionMarker);
}
}
TEST_CASE(parser_error_vertical_line_used_at_wrong_place)
{
Lexer l;
PosixExtended p(l);
// First in ere
l.set_source("|asdf");
p.parse();
EXPECT(p.has_error());
EXPECT(p.error() == Error::EmptySubExpression);
// Last in ere
l.set_source("asdf|");
p.parse();
EXPECT(p.has_error());
EXPECT(p.error() == Error::EmptySubExpression);
// After left parens
l.set_source("(|asdf)");
p.parse();
EXPECT(p.has_error());
EXPECT(p.error() == Error::EmptySubExpression);
// Proceed right parens
l.set_source("(asdf)|");
p.parse();
EXPECT(p.has_error());
EXPECT(p.error() == Error::EmptySubExpression);
}
TEST_CASE(catch_all_first)
{
Regex<PosixExtended> re("^.*$");
RegexResult m;
re.match("Hello World", m);
EXPECT(m.count == 1);
EXPECT(re.match("Hello World", m));
}
TEST_CASE(catch_all)
{
Regex<PosixExtended> re("^.*$", PosixFlags::Global);
EXPECT(re.has_match("Hello World"));
EXPECT(re.match("Hello World").success);
EXPECT(re.match("Hello World").count == 1);
EXPECT(has_match("Hello World", re));
auto res = match("Hello World", re);
EXPECT(res.success);
EXPECT(res.count == 1);
EXPECT(res.matches.size() == 1);
EXPECT(res.matches.first().view == "Hello World");
}
TEST_CASE(catch_all_again)
{
Regex<PosixExtended> re("^.*$", PosixFlags::Extra);
EXPECT_EQ(has_match("Hello World", re), true);
}
TEST_CASE(char_utf8)
{
Regex<PosixExtended> re("😀");
RegexResult result;
EXPECT_EQ((result = match("Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界", re, PosixFlags::Global)).success, true);
EXPECT_EQ(result.count, 2u);
}
TEST_CASE(catch_all_newline)
{
Regex<PosixExtended> re("^.*$", PosixFlags::Multiline | PosixFlags::StringCopyMatches);
RegexResult result;
auto lambda = [&result, &re]() {
String aaa = "Hello World\nTest\n1234\n";
result = match(aaa, re);
EXPECT_EQ(result.success, true);
};
lambda();
EXPECT_EQ(result.count, 3u);
EXPECT_EQ(result.matches.at(0).view, "Hello World");
EXPECT_EQ(result.matches.at(1).view, "Test");
EXPECT_EQ(result.matches.at(2).view, "1234");
}
TEST_CASE(catch_all_newline_view)
{
Regex<PosixExtended> re("^.*$", PosixFlags::Multiline);
RegexResult result;
String aaa = "Hello World\nTest\n1234\n";
result = match(aaa, re);
EXPECT_EQ(result.success, true);
EXPECT_EQ(result.count, 3u);
String str = "Hello World";
EXPECT_EQ(result.matches.at(0).view, str.view());
EXPECT_EQ(result.matches.at(1).view, "Test");
EXPECT_EQ(result.matches.at(2).view, "1234");
}
TEST_CASE(catch_all_newline_2)
{
Regex<PosixExtended> re("^.*$");
RegexResult result;
result = match("Hello World\nTest\n1234\n", re, PosixFlags::Multiline | PosixFlags::StringCopyMatches);
EXPECT_EQ(result.success, true);
EXPECT_EQ(result.count, 3u);
EXPECT_EQ(result.matches.at(0).view, "Hello World");
EXPECT_EQ(result.matches.at(1).view, "Test");
EXPECT_EQ(result.matches.at(2).view, "1234");
result = match("Hello World\nTest\n1234\n", re);
EXPECT_EQ(result.success, true);
EXPECT_EQ(result.count, 1u);
EXPECT_EQ(result.matches.at(0).view, "Hello World\nTest\n1234\n");
}
TEST_CASE(match_all_character_class)
{
Regex<PosixExtended> re("[[:alpha:]]");
String str = "[Window]\nOpacity=255\nAudibleBeep=0\n";
RegexResult result = match(str, re, PosixFlags::Global | PosixFlags::StringCopyMatches);
EXPECT_EQ(result.success, true);
EXPECT_EQ(result.count, 24u);
EXPECT_EQ(result.matches.at(0).view, "W");
EXPECT_EQ(result.matches.at(1).view, "i");
EXPECT_EQ(result.matches.at(2).view, "n");
EXPECT(&result.matches.at(0).view.characters_without_null_termination()[0] != &str.view().characters_without_null_termination()[1]);
}
TEST_CASE(example_for_git_commit)
{
Regex<PosixExtended> re("^.*$");
auto result = re.match("Well, hello friends!\nHello World!");
EXPECT(result.success);
EXPECT(result.count == 1);
EXPECT(result.matches.at(0).view.starts_with("Well"));
EXPECT(result.matches.at(0).view.length() == 33);
EXPECT(re.has_match("Well,...."));
result = re.match("Well, hello friends!\nHello World!", PosixFlags::Multiline);
EXPECT(result.success);
EXPECT(result.count == 2);
EXPECT(result.matches.at(0).view == "Well, hello friends!");
EXPECT(result.matches.at(1).view == "Hello World!");
}
TEST_CASE(email_address)
{
Regex<PosixExtended> re("^[A-Z0-9a-z._%+-]{1,64}@([A-Za-z0-9-]{1,63}\\.){1,125}[A-Za-z]{2,63}$");
EXPECT(re.has_match("hello.world@domain.tld"));
EXPECT(re.has_match("this.is.a.very_long_email_address@world.wide.web"));
}
TEST_CASE(ini_file_entries)
{
Regex<PosixExtended> re("[[:alpha:]]*=([[:digit:]]*)|\\[(.*)\\]");
RegexResult result;
#ifdef REGEX_DEBUG
RegexDebug regex_dbg(stderr);
regex_dbg.print_raw_bytecode(re);
regex_dbg.print_header();
regex_dbg.print_bytecode(re);
#endif
String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true);
EXPECT_EQ(result.count, 3u);
#ifdef REGEX_DEBUG
for (auto& v : result.matches)
fprintf(stderr, "%s\n", v.view.to_string().characters());
#endif
EXPECT_EQ(result.matches.at(0).view, "[Window]");
EXPECT_EQ(result.capture_group_matches.at(0).at(1).view, "Window");
EXPECT_EQ(result.matches.at(1).view, "Opacity=255");
EXPECT_EQ(result.matches.at(1).line, 1u);
EXPECT_EQ(result.matches.at(1).column, 0u);
EXPECT_EQ(result.capture_group_matches.at(1).at(0).view, "255");
EXPECT_EQ(result.capture_group_matches.at(1).at(0).line, 1u);
EXPECT_EQ(result.capture_group_matches.at(1).at(0).column, 8u);
EXPECT_EQ(result.matches.at(2).view, "AudibleBeep=0");
EXPECT_EQ(result.capture_group_matches.at(2).at(0).view, "0");
EXPECT_EQ(result.capture_group_matches.at(2).at(0).line, 2u);
EXPECT_EQ(result.capture_group_matches.at(2).at(0).column, 12u);
}
TEST_CASE(named_capture_group)
{
Regex<PosixExtended> re("[[:alpha:]]*=(?<Test>[[:digit:]]*)");
RegexResult result;
#ifdef REGEX_DEBUG
RegexDebug regex_dbg(stderr);
regex_dbg.print_raw_bytecode(re);
regex_dbg.print_header();
regex_dbg.print_bytecode(re);
#endif
String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
EXPECT_EQ(re.search(haystack, result, PosixFlags::Multiline), true);
EXPECT_EQ(result.count, 2u);
EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
EXPECT_EQ(result.named_capture_group_matches.at(0).ensure("Test").view, "255");
EXPECT_EQ(result.matches.at(1).view, "AudibleBeep=0");
EXPECT_EQ(result.named_capture_group_matches.at(1).ensure("Test").view, "0");
}
TEST_CASE(a_star)
{
Regex<PosixExtended> re("a*");
RegexResult result;
#ifdef REGEX_DEBUG
RegexDebug regex_dbg(stderr);
regex_dbg.print_raw_bytecode(re);
regex_dbg.print_header();
regex_dbg.print_bytecode(re);
#endif
String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
EXPECT_EQ(re.search(haystack.view(), result, PosixFlags::Multiline), true);
EXPECT_EQ(result.count, 32u);
EXPECT_EQ(result.matches.at(0).view.length(), 0u);
EXPECT_EQ(result.matches.at(10).view.length(), 1u);
EXPECT_EQ(result.matches.at(10).view, "a");
EXPECT_EQ(result.matches.at(31).view.length(), 0u);
}
TEST_CASE(simple_period_end_benchmark)
{
Regex<PosixExtended> re("hello.$");
RegexResult m;
EXPECT_EQ(re.search("Hello1", m), false);
EXPECT_EQ(re.search("hello1hello1", m), true);
EXPECT_EQ(re.search("hello2hell", m), false);
EXPECT_EQ(re.search("hello?", m), true);
}
TEST_MAIN(Regex)

File diff suppressed because it is too large Load diff

View file

@ -39,6 +39,8 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
endif()
file(GLOB AK_SOURCES CONFIGURE_DEPENDS "../../AK/*.cpp")
file(GLOB LIBREGEX_LIBC_SOURCES "../../Libraries/LibRegex/C/Regex.cpp")
file(GLOB LIBREGEX_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibRegex/*.cpp")
file(GLOB LIBCORE_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibCore/*.cpp")
file(GLOB LIBELF_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibELF/*.cpp")
file(GLOB LIBGEMINI_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibGemini/*.cpp")
@ -58,8 +60,9 @@ file(GLOB LIBTLS_SOURCES CONFIGURE_DEPENDS "../../Libraries/LibTLS/*.cpp")
file(GLOB SHELL_SOURCES CONFIGURE_DEPENDS "../../Shell/*.cpp")
file(GLOB SHELL_TESTS CONFIGURE_DEPENDS "../../Shell/Tests/*.sh")
set(LAGOM_REGEX_SOURCES ${LIBREGEX_LIBC_SOURCES} ${LIBREGEX_SOURCES})
set(LAGOM_CORE_SOURCES ${AK_SOURCES} ${LIBCORE_SOURCES})
set(LAGOM_MORE_SOURCES ${LIBELF_SOURCES} ${LIBIPC_SOURCES} ${LIBLINE_SOURCES} ${LIBJS_SOURCES} ${LIBJS_SUBDIR_SOURCES} ${LIBX86_SOURCES} ${LIBCRYPTO_SOURCES} ${LIBCOMPRESS_SOURCES} ${LIBCRYPTO_SUBDIR_SOURCES} ${LIBTLS_SOURCES} ${LIBMARKDOWN_SOURCES} ${LIBGEMINI_SOURCES} ${LIBGFX_SOURCES} ${LIBHTTP_SOURCES})
set(LAGOM_MORE_SOURCES ${LIBELF_SOURCES} ${LIBIPC_SOURCES} ${LIBLINE_SOURCES} ${LIBJS_SOURCES} ${LIBJS_SUBDIR_SOURCES} ${LIBX86_SOURCES} ${LIBCRYPTO_SOURCES} ${LIBCOMPRESS_SOURCES} ${LIBCRYPTO_SUBDIR_SOURCES} ${LIBTLS_SOURCES} ${LIBMARKDOWN_SOURCES} ${LIBGEMINI_SOURCES} ${LIBGFX_SOURCES} ${LIBHTTP_SOURCES} ${LAGOM_REGEX_SOURCES})
include_directories (../../)
include_directories (../../Libraries/)