mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2024-11-22 07:30:19 +00:00
LibRegex: Support ECMA-262 Unicode escapes of the form "\u{code_point}"
When the Unicode flag is set, regular expressions may escape a code point by surrounding its hexadecimal value with curly braces, e.g. \u{41} is the character "A". When the Unicode flag is not set, the braces should instead be parsed as a repetition quantifier applied to a literal "u", i.e. \u{41} matches the character "u" repeated 41 times. Parsing the non-Unicode form is left as a TODO for now.
This commit is contained in:
parent
0e6375558d
commit
345ef6abba
Notes:
sideshowbarker
2024-07-18 08:26:24 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/345ef6abba6 Pull-request: https://github.com/SerenityOS/serenity/pull/8931 Reviewed-by: https://github.com/linusg ✅
2 changed files with 27 additions and 0 deletions
|
@ -510,6 +510,11 @@ TEST_CASE(ECMA262_parse)
|
|||
{ "\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
|
||||
{ "\\ud83d", regex::Error::NoError, ECMAScriptFlags::Unicode },
|
||||
{ "\\ud83d\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
|
||||
{ "\\u{0}", regex::Error::NoError, ECMAScriptFlags::Unicode },
|
||||
{ "\\u{10ffff}", regex::Error::NoError, ECMAScriptFlags::Unicode },
|
||||
{ "\\u{10ffff", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
|
||||
{ "\\u{10ffffx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
|
||||
{ "\\u{110000}", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
|
||||
};
|
||||
|
||||
for (auto& test : tests) {
|
||||
|
@ -605,6 +610,7 @@ TEST_CASE(ECMA262_unicode_match)
|
|||
{ "\\ude00", "😀", false, ECMAScriptFlags::Unicode },
|
||||
{ "\\ud83d\\ude00", "😀", true },
|
||||
{ "\\ud83d\\ude00", "😀", true, ECMAScriptFlags::Unicode },
|
||||
{ "\\u{1f600}", "😀", true, ECMAScriptFlags::Unicode },
|
||||
{ "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true },
|
||||
{ "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true, ECMAScriptFlags::Unicode },
|
||||
};
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
|
||||
#include "RegexParser.h"
|
||||
#include "RegexDebug.h"
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/StringUtils.h>
|
||||
|
@ -1440,6 +1441,26 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
|||
}
|
||||
|
||||
if (try_skip("u")) {
|
||||
if (match(TokenType::LeftCurly)) {
|
||||
consume();
|
||||
|
||||
if (!unicode) {
|
||||
// FIXME: In non-Unicode mode, this should be parsed as a repetition symbol (repeating the 'u').
|
||||
TODO();
|
||||
}
|
||||
|
||||
auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 6);
|
||||
if (code_point.has_value() && is_unicode(*code_point) && match(TokenType::RightCurly)) {
|
||||
consume();
|
||||
match_length_minimum += 1;
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
|
||||
return true;
|
||||
}
|
||||
|
||||
set_error(Error::InvalidPattern);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) {
|
||||
// In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be
|
||||
// rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit,
|
||||
|
|
Loading…
Reference in a new issue