LibJS: Combine UTF-16 surrogate pairs when concatenating strings

In the following use case:

    "\ud834" + "\udf06"

We were previously combining these as two individual code points. When
concatenating strings, we must take care to combine the high surrogate
from the left-hand side with the low surrogate from the right-hand side.
This commit is contained in:
Timothy Flynn 2022-01-17 11:24:58 -05:00 committed by Linus Groh
parent ab02e3981e
commit a57e2f9a76
Notes: sideshowbarker 2024-07-17 20:41:48 +09:00
2 changed files with 78 additions and 21 deletions

View file

@ -978,6 +978,51 @@ ThrowCompletionOr<Value> unsigned_right_shift(GlobalObject& global_object, Value
return vm.throw_completion<TypeError>(global_object, ErrorType::BigIntBadOperator, "unsigned right-shift");
}
// https://tc39.es/ecma262/#string-concatenation
static PrimitiveString* concatenate_strings(GlobalObject& global_object, PrimitiveString const& lhs, PrimitiveString const& rhs)
{
auto& vm = global_object.vm();
if (lhs.has_utf16_string() && rhs.has_utf16_string()) {
auto const& lhs_string = lhs.utf16_string();
auto const& rhs_string = rhs.utf16_string();
Vector<u16, 1> combined;
combined.ensure_capacity(lhs_string.length_in_code_units() + rhs_string.length_in_code_units());
combined.extend(lhs_string.string());
combined.extend(rhs_string.string());
return js_string(vm, Utf16String(move(combined)));
}
Utf8View lhs_string { lhs.string() };
Utf8View rhs_string { rhs.string() };
StringBuilder builder(lhs_string.length() + rhs_string.length());
Optional<u16> high_surrogate;
for (auto it = lhs_string.begin(); it != lhs_string.end(); ++it) {
if (!it.peek(1).has_value() && Utf16View::is_high_surrogate(*it) && !rhs_string.is_empty())
high_surrogate = *it;
else
builder.append_code_point(*it);
}
if (high_surrogate.has_value()) {
auto low_surrogate = *rhs_string.begin();
if (Utf16View::is_low_surrogate(low_surrogate)) {
builder.append_code_point(Utf16View::decode_surrogate_pair(*high_surrogate, low_surrogate));
rhs_string = rhs_string.substring_view(3); // A low surrogate encoded as UTF-8 is 3 bytes.
} else {
builder.append_code_point(*high_surrogate);
}
}
builder.append(rhs_string.as_string());
return js_string(vm, builder.to_string());
}
// 13.8.1 The Addition Operator ( + ), https://tc39.es/ecma262/#sec-addition-operator-plus
ThrowCompletionOr<Value> add(GlobalObject& global_object, Value lhs, Value rhs)
{
@ -995,28 +1040,10 @@ ThrowCompletionOr<Value> add(GlobalObject& global_object, Value lhs, Value rhs)
auto lhs_primitive = TRY(lhs.to_primitive(global_object));
auto rhs_primitive = TRY(rhs.to_primitive(global_object));
if (lhs_primitive.is_string() && rhs_primitive.is_string()) {
auto const& lhs_string = lhs_primitive.as_string();
auto const& rhs_string = rhs_primitive.as_string();
if (lhs_string.has_utf16_string() && rhs_string.has_utf16_string()) {
auto const& lhs_utf16_string = lhs_string.utf16_string();
auto const& rhs_utf16_string = rhs_string.utf16_string();
Vector<u16, 1> combined;
combined.ensure_capacity(lhs_utf16_string.length_in_code_units() + rhs_utf16_string.length_in_code_units());
combined.extend(lhs_utf16_string.string());
combined.extend(rhs_utf16_string.string());
return Value(js_string(vm.heap(), Utf16String(move(combined))));
}
}
if (lhs_primitive.is_string() || rhs_primitive.is_string()) {
auto lhs_string = TRY(lhs_primitive.to_string(global_object));
auto rhs_string = TRY(rhs_primitive.to_string(global_object));
StringBuilder builder(lhs_string.length() + rhs_string.length());
builder.append(lhs_string);
builder.append(rhs_string);
return Value(js_string(vm, builder.to_string()));
auto lhs_string = TRY(lhs_primitive.to_primitive_string(global_object));
auto rhs_string = TRY(rhs_primitive.to_primitive_string(global_object));
return concatenate_strings(global_object, *lhs_string, *rhs_string);
}
auto lhs_numeric = TRY(lhs_primitive.to_numeric(global_object));

View file

@ -0,0 +1,30 @@
test("adding strings", () => {
expect("" + "").toBe("");
expect("ab" + "").toBe("ab");
expect("" + "cd").toBe("cd");
expect("ab" + "cd").toBe("abcd");
});
test("adding strings with non-strings", () => {
expect("a" + 1).toBe("a1");
expect(1 + "a").toBe("1a");
expect("a" + {}).toBe("a[object Object]");
expect({} + "a").toBeNaN();
expect("a" + []).toBe("a");
expect([] + "a").toBe("a");
expect("a" + NaN).toBe("aNaN");
expect(NaN + "a").toBe("NaNa");
expect(Array(16).join([[][[]] + []][+[]][++[+[]][+[]]] - 1) + " Batman!").toBe(
"NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN Batman!"
);
});
test("adding strings with dangling surrogates", () => {
expect("\ud834" + "").toBe("\ud834");
expect("" + "\udf06").toBe("\udf06");
expect("\ud834" + "\udf06").toBe("𝌆");
expect("\ud834" + "\ud834").toBe("\ud834\ud834");
expect("\udf06" + "\udf06").toBe("\udf06\udf06");
expect("\ud834a" + "\udf06").toBe("\ud834a\udf06");
expect("\ud834" + "a\udf06").toBe("\ud834a\udf06");
});