Punycode.cpp 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. /*
  2. * Copyright (c) 2023, Simon Wanner <simon@skyrising.xyz>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Utf32View.h>
  7. #include <AK/Utf8View.h>
  8. #include <LibUnicode/Punycode.h>
  9. namespace Unicode::Punycode {
  10. // https://www.rfc-editor.org/rfc/rfc3492.html#section-5
  11. static constexpr u32 BASE = 36;
  12. static constexpr u32 TMIN = 1;
  13. static constexpr u32 TMAX = 26;
  14. static constexpr u32 SKEW = 38;
  15. static constexpr u32 DAMP = 700;
  16. static constexpr u32 INITIAL_BIAS = 72;
  17. static constexpr u32 INITIAL_N = 0x80;
  18. static constexpr u32 DELIMITER = '-';
  19. static Optional<u32> digit_value_of_code_point(u32 code_point)
  20. {
  21. if (code_point >= 'A' && code_point <= 'Z')
  22. return code_point - 'A';
  23. if (code_point >= 'a' && code_point <= 'z')
  24. return code_point - 'a';
  25. if (code_point >= '0' && code_point <= '9')
  26. return code_point - '0' + 26;
  27. return {};
  28. }
  29. static u32 code_point_value_of_digit(u32 digit)
  30. {
  31. VERIFY(digit < 36);
  32. if (digit <= 25)
  33. return 'a' + digit;
  34. return '0' + digit - 26;
  35. }
  36. // https://www.rfc-editor.org/rfc/rfc3492.html#section-6.1
  37. static u32 adapt(u32 delta, u32 num_points, bool first_time)
  38. {
  39. // if firsttime then let delta = delta div damp
  40. if (first_time)
  41. delta = delta / DAMP;
  42. // else let delta = delta div 2
  43. else
  44. delta = delta / 2;
  45. // let delta = delta + (delta div numpoints)
  46. delta = delta + (delta / num_points);
  47. // let k = 0
  48. u32 k = 0;
  49. // while delta > ((base - tmin) * tmax) div 2 do begin
  50. while (delta > ((BASE - TMIN) * TMAX) / 2) {
  51. // let delta = delta div (base - tmin)
  52. delta = delta / (BASE - TMIN);
  53. // let k = k + base
  54. k = k + BASE;
  55. }
  56. // return k + (((base - tmin + 1) * delta) div (delta + skew))
  57. return k + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
  58. }
  59. // https://www.rfc-editor.org/rfc/rfc3492.html#section-6.2
  60. ErrorOr<String> decode(StringView input)
  61. {
  62. size_t consumed = 0;
  63. // let n = initial_n
  64. Checked<size_t> n = INITIAL_N;
  65. // let i = 0
  66. Checked<u32> i = 0;
  67. // let bias = initial_bias
  68. u32 bias = INITIAL_BIAS;
  69. // let output = an empty string indexed from 0
  70. Vector<u32> output;
  71. // consume all code points before the last delimiter (if there is one)
  72. // and copy them to output, fail on any non-basic code point
  73. Optional<size_t> last_delimiter_index = input.find_last(DELIMITER);
  74. if (last_delimiter_index.has_value()) {
  75. for (; consumed < last_delimiter_index.value(); consumed++) {
  76. if (!is_ascii(input[consumed]))
  77. return Error::from_string_literal("Unexpected non-basic code point");
  78. TRY(output.try_append(input[consumed]));
  79. }
  80. // if more than zero code points were consumed then consume one more
  81. // (which will be the last delimiter)
  82. if (last_delimiter_index.value() > 0) {
  83. auto next = input[consumed++];
  84. VERIFY(next == DELIMITER);
  85. }
  86. }
  87. // while the input is not exhausted do begin
  88. while (consumed < input.length()) {
  89. // let oldi = i
  90. Checked<u32> old_i = i;
  91. // let w = 1
  92. Checked<u32> w = 1;
  93. // for k = base to infinity in steps of base do begin
  94. for (size_t k = BASE;; k += BASE) {
  95. // consume a code point, or fail if there was none to consume
  96. if (consumed >= input.length())
  97. return Error::from_string_literal("No more code points to consume");
  98. auto code_point = input[consumed++];
  99. // let digit = the code point's digit-value, fail if it has none
  100. auto digit = digit_value_of_code_point(code_point);
  101. if (!digit.has_value())
  102. return Error::from_string_literal("Invalid base-36 digit");
  103. // let i = i + digit * w, fail on overflow
  104. i = i + Checked(digit.value()) * w;
  105. if (i.has_overflow())
  106. return Error::from_string_literal("Numeric overflow");
  107. // let t = tmin if k <= bias {+ tmin}, or
  108. // tmax if k >= bias + tmax, or k - bias otherwise
  109. u32 t = k <= bias ? TMIN : (k >= bias + TMAX ? TMAX : k - bias);
  110. // if digit < t then break
  111. if (digit.value() < t)
  112. break;
  113. // let w = w * (base - t), fail on overflow
  114. w = w * Checked(BASE - t);
  115. if (w.has_overflow())
  116. return Error::from_string_literal("Numeric overflow");
  117. }
  118. // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
  119. bias = adapt((i - old_i).value(), output.size() + 1, !old_i);
  120. // let n = n + i div (length(output) + 1), fail on overflow
  121. n = n + Checked(static_cast<size_t>(i.value() / static_cast<u32>(output.size() + 1)));
  122. if (n.has_overflow())
  123. return Error::from_string_literal("Numeric overflow");
  124. // let i = i mod (length(output) + 1)
  125. i = i % Checked(static_cast<u32>(output.size() + 1));
  126. // {if n is a basic code point then fail}
  127. // NOTE: The full statement enclosed in braces (checking whether n is a basic code point) can be omitted if initial_n exceeds all basic code points
  128. // (which is true for Punycode), because n is never less than initial_n.
  129. VERIFY(!is_ascii(n.value()));
  130. // insert n into output at position i
  131. TRY(output.try_insert(i.value(), n.value()));
  132. // increment i
  133. i++;
  134. }
  135. StringBuilder builder;
  136. TRY(builder.try_append(Utf32View(output.data(), output.size())));
  137. return builder.to_string();
  138. }
  139. static Optional<u32> find_smallest_code_point_greater_than_or_equal(Utf32View code_points, u32 threshold)
  140. {
  141. Optional<u32> result;
  142. for (auto code_point : code_points) {
  143. if (code_point >= threshold && (!result.has_value() || code_point < result.value()))
  144. result = code_point;
  145. }
  146. return result;
  147. }
  148. ErrorOr<String> encode(StringView input)
  149. {
  150. Vector<u32> code_points;
  151. for (auto code_point : Utf8View(input))
  152. TRY(code_points.try_append(code_point));
  153. return encode(Utf32View(code_points.data(), code_points.size()));
  154. }
  155. // https://www.rfc-editor.org/rfc/rfc3492.html#section-6.3
  156. ErrorOr<String> encode(Utf32View input)
  157. {
  158. Vector<u32> output;
  159. // let n = initial_n
  160. Checked<size_t> n = INITIAL_N;
  161. // let delta = 0
  162. Checked<size_t> delta = 0;
  163. // let bias = initial_bias
  164. u32 bias = INITIAL_BIAS;
  165. // let h = b = the number of basic code points in the input
  166. // copy them to the output in order, followed by a delimiter if b > 0
  167. size_t b = 0;
  168. for (auto code_point : input) {
  169. if (is_ascii(code_point)) {
  170. TRY(output.try_append(code_point));
  171. b++;
  172. }
  173. }
  174. auto h = b;
  175. if (b > 0)
  176. TRY(output.try_append(DELIMITER));
  177. // while h < length(input) do begin
  178. while (h < input.length()) {
  179. // let m = the minimum {non-basic} code point >= n in the input
  180. auto m = find_smallest_code_point_greater_than_or_equal(input, n.value());
  181. VERIFY(m.has_value());
  182. // let delta = delta + (m - n) * (h + 1), fail on overflow
  183. delta = delta + (Checked(static_cast<size_t>(m.value())) - n) * Checked(h + 1);
  184. if (delta.has_overflow())
  185. return Error::from_string_literal("Numeric overflow");
  186. // let n = m
  187. n = m.value();
  188. // for each code point c in the input (in order) do begin
  189. for (auto c : input) {
  190. // if c < n {or c is basic} then increment delta, fail on overflow
  191. if (c < n.value()) {
  192. delta++;
  193. if (delta.has_overflow())
  194. return Error::from_string_literal("Numeric overflow");
  195. }
  196. // if c == n then begin
  197. if (c == n.value()) {
  198. // let q = delta
  199. auto q = delta.value();
  200. // for k = base to infinity in steps of base do begin
  201. for (size_t k = BASE;; k += BASE) {
  202. // let t = tmin if k <= bias {+ tmin}, or
  203. // tmax if k >= bias + tmax, or k - bias otherwise
  204. u32 t = k <= bias ? TMIN : (k >= bias + TMAX ? TMAX : k - bias);
  205. // if q < t then break
  206. if (q < t)
  207. break;
  208. // output the code point for digit t + ((q - t) mod (base - t))
  209. auto digit = t + ((q - t) % (BASE - t));
  210. TRY(output.try_append(code_point_value_of_digit(digit)));
  211. // let q = (q - t) div (base - t)
  212. q = (q - t) / (BASE - t);
  213. }
  214. // output the code point for digit q
  215. TRY(output.try_append(code_point_value_of_digit(q)));
  216. // let bias = adapt(delta, h + 1, test h equals b?)
  217. bias = adapt(delta.value(), h + 1, h == b);
  218. // let delta = 0
  219. delta = 0;
  220. // increment h
  221. h++;
  222. }
  223. }
  224. // increment delta and n
  225. delta++;
  226. n++;
  227. }
  228. StringBuilder builder;
  229. TRY(builder.try_append(Utf32View(output.data(), output.size())));
  230. return builder.to_string();
  231. }
  232. }