Pārlūkot izejas kodu

LibUnicode: Add Punycode::encode

Simon Wanner 2 gadi atpakaļ
vecāks
revīzija
cfd0a60863

+ 7 - 0
Tests/LibUnicode/TestPunycode.cpp

@@ -47,4 +47,11 @@ TEST_CASE(decode)
     EXPECT(decode("Nåväl hej vänner"sv).is_error());
 }
 
+// Checks encode() against every CASE(...) entry that ENUMERATE_TEST_CASES expands to:
+// the first argument is fed to encode(), the second is the expected result.
+TEST_CASE(encode)
+{
+#define CASE(input, expected_output) EXPECT_EQ(TRY_OR_FAIL(encode(input)), expected_output);
+    ENUMERATE_TEST_CASES
+#undef CASE
+}
+
 }

+ 123 - 0
Userland/Libraries/LibUnicode/Punycode.cpp

@@ -5,6 +5,7 @@
  */
 
 #include <AK/Utf32View.h>
+#include <AK/Utf8View.h>
 #include <LibUnicode/Punycode.h>
 
 namespace Unicode::Punycode {
@@ -30,6 +31,14 @@ static Optional<u32> digit_value_of_code_point(u32 code_point)
     return {};
 }
 
+// Maps a Punycode digit to the code point that represents it:
+// digits 0..25 become 'a'..'z', digits 26..35 become '0'..'9'.
+// Any digit outside 0..35 trips the VERIFY.
+static u32 code_point_value_of_digit(u32 digit)
+{
+    constexpr u32 letter_digit_count = 26;
+    constexpr u32 total_digit_count = 36;
+    VERIFY(digit < total_digit_count);
+
+    bool const is_letter_digit = digit < letter_digit_count;
+    return is_letter_digit ? 'a' + digit : '0' + (digit - letter_digit_count);
+}
+
 // https://www.rfc-editor.org/rfc/rfc3492.html#section-6.1
 static u32 adapt(u32 delta, u32 num_points, bool first_time)
 {
@@ -160,4 +169,118 @@ ErrorOr<String> decode(StringView input)
     return builder.to_string();
 }
 
+// Scans all of `code_points` and returns the smallest one that is >= `threshold`,
+// or an empty Optional when no code point reaches the threshold.
+static Optional<u32> find_smallest_code_point_greater_than_or_equal(Utf32View code_points, u32 threshold)
+{
+    Optional<u32> smallest;
+    for (auto candidate : code_points) {
+        // Too small to qualify at all.
+        if (candidate < threshold)
+            continue;
+        // We already hold a candidate that is at least as small.
+        if (smallest.has_value() && smallest.value() <= candidate)
+            continue;
+        smallest = candidate;
+    }
+    return smallest;
+}
+
+// Convenience overload: iterates `input` as UTF-8, collects the resulting code points
+// into a temporary buffer and hands them to the Utf32View overload of encode().
+// Propagates any error from growing the buffer or from the encoder itself.
+ErrorOr<String> encode(StringView input)
+{
+    Utf8View utf8_input(input);
+
+    Vector<u32> decoded_code_points;
+    for (auto decoded_code_point : utf8_input)
+        TRY(decoded_code_points.try_append(decoded_code_point));
+
+    Utf32View utf32_input(decoded_code_points.data(), decoded_code_points.size());
+    return encode(utf32_input);
+}
+
+// Encodes a sequence of code points as Punycode, following the encoding procedure of RFC 3492:
+// https://www.rfc-editor.org/rfc/rfc3492.html#section-6.3
+//
+// ASCII code points are copied to the output first (followed by the delimiter if there are any),
+// then every remaining code point is emitted as a generalized variable-length integer.
+// Returns an error if appending to the output fails or on numeric overflow of delta, which
+// includes deltas that do not fit the 32-bit delta parameter of adapt().
+ErrorOr<String> encode(Utf32View input)
+{
+    Vector<u32> output;
+
+    // let n = initial_n
+    Checked<size_t> n = INITIAL_N;
+
+    // let delta = 0
+    Checked<size_t> delta = 0;
+
+    // let bias = initial_bias
+    u32 bias = INITIAL_BIAS;
+
+    // let h = b = the number of basic code points in the input
+    // copy them to the output in order, followed by a delimiter if b > 0
+    size_t b = 0;
+    for (auto code_point : input) {
+        if (is_ascii(code_point)) {
+            TRY(output.try_append(code_point));
+            b++;
+        }
+    }
+    auto h = b;
+    if (b > 0)
+        TRY(output.try_append(DELIMITER));
+
+    // while h < length(input) do begin
+    while (h < input.length()) {
+        // let m = the minimum {non-basic} code point >= n in the input
+        // NOTE: As long as h < length(input), at least one unhandled code point >= n remains.
+        auto m = find_smallest_code_point_greater_than_or_equal(input, n.value());
+        VERIFY(m.has_value());
+
+        // let delta = delta + (m - n) * (h + 1), fail on overflow
+        delta = delta + (Checked(static_cast<size_t>(m.value())) - n) * Checked(h + 1);
+        if (delta.has_overflow())
+            return Error::from_string_literal("Numeric overflow");
+
+        // let n = m
+        n = m.value();
+
+        // for each code point c in the input (in order) do begin
+        for (auto c : input) {
+            // if c < n {or c is basic} then increment delta, fail on overflow
+            if (c < n.value()) {
+                delta++;
+                if (delta.has_overflow())
+                    return Error::from_string_literal("Numeric overflow");
+            }
+
+            // if c == n then begin
+            if (c == n.value()) {
+                // adapt() takes a 32-bit delta, so a delta that does not fit is treated as an
+                // overflow instead of being silently truncated when computing the next bias.
+                auto delta_for_adapt = static_cast<u32>(delta.value());
+                if (delta_for_adapt != delta.value())
+                    return Error::from_string_literal("Numeric overflow");
+
+                // let q = delta
+                auto q = delta.value();
+
+                // for k = base to infinity in steps of base do begin
+                for (size_t k = BASE;; k += BASE) {
+                    // let t = tmin if k <= bias {+ tmin}, or
+                    //         tmax if k >= bias + tmax, or k - bias otherwise
+                    u32 t = k <= bias ? TMIN : (k >= bias + TMAX ? TMAX : k - bias);
+
+                    // if q < t then break
+                    if (q < t)
+                        break;
+
+                    // output the code point for digit t + ((q - t) mod (base - t))
+                    auto digit = t + ((q - t) % (BASE - t));
+                    TRY(output.try_append(code_point_value_of_digit(digit)));
+
+                    // let q = (q - t) div (base - t)
+                    q = (q - t) / (BASE - t);
+                }
+                // output the code point for digit q
+                TRY(output.try_append(code_point_value_of_digit(q)));
+
+                // let bias = adapt(delta, h + 1, test h equals b?)
+                bias = adapt(delta_for_adapt, h + 1, h == b);
+
+                // let delta = 0
+                delta = 0;
+
+                // increment h
+                h++;
+            }
+        }
+
+        // increment delta and n
+        delta++;
+        n++;
+    }
+
+    StringBuilder builder;
+    TRY(builder.try_append(Utf32View(output.data(), output.size())));
+    return builder.to_string();
+}
+
 }

+ 2 - 0
Userland/Libraries/LibUnicode/Punycode.h

@@ -11,5 +11,7 @@
 namespace Unicode::Punycode {
 
 ErrorOr<String> decode(StringView);
+ErrorOr<String> encode(StringView); // Punycode-encodes text read as UTF-8; collects its code points and forwards to the Utf32View overload.
+ErrorOr<String> encode(Utf32View); // Punycode-encodes a code point sequence per RFC 3492 section 6.3; returns an error on numeric overflow.
 
 }