LibUnicode: Add Punycode::encode

This commit is contained in:
Simon Wanner 2023-06-14 15:24:30 +02:00 committed by Tim Flynn
parent 299d35aadc
commit cfd0a60863
Notes: sideshowbarker 2024-07-16 22:16:50 +09:00
3 changed files with 132 additions and 0 deletions

View file

@ -47,4 +47,11 @@ TEST_CASE(decode)
EXPECT(decode("Nåväl hej vänner"sv).is_error());
}
// Runs encode() on the first argument of every CASE entry and expects the second argument as the result.
TEST_CASE(encode)
{
// CASE is defined only for the duration of the ENUMERATE_TEST_CASES expansion below.
// NOTE(review): presumably ENUMERATE_TEST_CASES expands to a list of CASE(a, b) pairs where `a` is the
// Unicode text and `b` its Punycode form — confirm against the macro's definition.
#define CASE(a, b) EXPECT_EQ(TRY_OR_FAIL(encode(a)), b);
ENUMERATE_TEST_CASES
#undef CASE
}
}

View file

@ -5,6 +5,7 @@
*/
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>
#include <LibUnicode/Punycode.h>
namespace Unicode::Punycode {
@ -30,6 +31,14 @@ static Optional<u32> digit_value_of_code_point(u32 code_point)
return {};
}
// Maps a Punycode digit value to the code point that represents it:
// 0..25 become 'a'..'z' and 26..35 become '0'..'9'. Any other value is a programming error.
static u32 code_point_value_of_digit(u32 digit)
{
    VERIFY(digit < 36);

    constexpr u32 letter_count = 26;
    if (digit < letter_count)
        return 'a' + digit;

    // The remaining digit values map onto the decimal digits.
    return '0' + digit - letter_count;
}
// https://www.rfc-editor.org/rfc/rfc3492.html#section-6.1
static u32 adapt(u32 delta, u32 num_points, bool first_time)
{
@ -160,4 +169,118 @@ ErrorOr<String> decode(StringView input)
return builder.to_string();
}
// Scans all of code_points and returns the smallest one that is >= threshold,
// or an empty Optional if no code point reaches the threshold.
static Optional<u32> find_smallest_code_point_greater_than_or_equal(Utf32View code_points, u32 threshold)
{
    Optional<u32> smallest;
    for (auto candidate : code_points) {
        // Code points below the threshold are never eligible.
        if (candidate < threshold)
            continue;

        // Keep the current minimum unless the candidate is strictly smaller.
        if (smallest.has_value() && smallest.value() <= candidate)
            continue;

        smallest = candidate;
    }
    return smallest;
}
// Encodes UTF-8 text as Punycode: the input is first expanded into a buffer of code points,
// which is then handed to the Utf32View overload. Fails if the buffer cannot be grown or if
// the Utf32View overload reports an error.
ErrorOr<String> encode(StringView input)
{
    Vector<u32> utf32_buffer;

    Utf8View utf8_input(input);
    for (auto decoded : utf8_input)
        TRY(utf32_buffer.try_append(decoded));

    Utf32View utf32_input(utf32_buffer.data(), utf32_buffer.size());
    return encode(utf32_input);
}
// https://www.rfc-editor.org/rfc/rfc3492.html#section-6.3
//
// Encodes a sequence of code points as Punycode, following the RFC 3492 encoding procedure.
// ASCII code points are copied to the output in order (followed by a delimiter if there are any);
// every remaining code point is then emitted as a generalized variable-length integer.
//
// Returns an error if an allocation fails, or "Numeric overflow" if delta overflows or no longer
// fits into the 32-bit delta that adapt() accepts.
ErrorOr<String> encode(Utf32View input)
{
    // adapt() takes a u32 delta. A delta that only fits into a wider size_t must be reported as an
    // overflow; otherwise it would be silently truncated when computing the next bias.
    auto const fits_in_u32 = [](size_t value) {
        return static_cast<size_t>(static_cast<u32>(value)) == value;
    };

    Vector<u32> output;

    // let n = initial_n
    Checked<size_t> n = INITIAL_N;

    // let delta = 0
    Checked<size_t> delta = 0;

    // let bias = initial_bias
    u32 bias = INITIAL_BIAS;

    // let h = b = the number of basic code points in the input
    // copy them to the output in order, followed by a delimiter if b > 0
    size_t b = 0;
    for (auto code_point : input) {
        if (is_ascii(code_point)) {
            TRY(output.try_append(code_point));
            b++;
        }
    }

    auto h = b;
    if (b > 0)
        TRY(output.try_append(DELIMITER));

    // while h < length(input) do begin
    while (h < input.length()) {
        // let m = the minimum {non-basic} code point >= n in the input
        auto m = find_smallest_code_point_greater_than_or_equal(input, n.value());
        // While h < length(input) there is still an unhandled code point >= n, so m must exist.
        VERIFY(m.has_value());

        // let delta = delta + (m - n) * (h + 1), fail on overflow
        delta = delta + (Checked(static_cast<size_t>(m.value())) - n) * Checked(h + 1);
        if (delta.has_overflow() || !fits_in_u32(delta.value()))
            return Error::from_string_literal("Numeric overflow");

        // let n = m
        n = m.value();

        // for each code point c in the input (in order) do begin
        for (auto c : input) {
            // if c < n {or c is basic} then increment delta, fail on overflow
            if (c < n.value()) {
                delta++;
                if (delta.has_overflow() || !fits_in_u32(delta.value()))
                    return Error::from_string_literal("Numeric overflow");
            }

            // if c == n then begin
            if (c == n.value()) {
                // let q = delta
                auto q = delta.value();

                // for k = base to infinity in steps of base do begin
                for (size_t k = BASE;; k += BASE) {
                    // let t = tmin if k <= bias {+ tmin}, or
                    //     tmax if k >= bias + tmax, or k - bias otherwise
                    u32 t = k <= bias ? TMIN : (k >= bias + TMAX ? TMAX : k - bias);

                    // if q < t then break
                    if (q < t)
                        break;

                    // output the code point for digit t + ((q - t) mod (base - t))
                    auto digit = t + ((q - t) % (BASE - t));
                    TRY(output.try_append(code_point_value_of_digit(digit)));

                    // let q = (q - t) div (base - t)
                    q = (q - t) / (BASE - t);
                }

                // output the code point for digit q
                TRY(output.try_append(code_point_value_of_digit(q)));

                // let bias = adapt(delta, h + 1, test h equals b?)
                bias = adapt(delta.value(), h + 1, h == b);

                // let delta = 0
                delta = 0;

                // increment h
                h++;
            }
        }

        // increment delta and n
        // Overflow here is caught by the checks at the top of the next iteration, if there is one.
        delta++;
        n++;
    }

    StringBuilder builder;
    TRY(builder.try_append(Utf32View(output.data(), output.size())));
    return builder.to_string();
}
}

View file

@ -11,5 +11,7 @@
namespace Unicode::Punycode {
// NOTE(review): presumably decodes Punycode text back into a Unicode string (the inverse of encode());
// the implementation is not fully visible here — confirm in Punycode.cpp.
ErrorOr<String> decode(StringView);
// Encodes UTF-8 text as Punycode by converting it to code points and delegating to the Utf32View overload.
ErrorOr<String> encode(StringView);
// Encodes a sequence of code points as Punycode, following the encoding procedure of RFC 3492 section 6.3.
ErrorOr<String> encode(Utf32View);
}