IDNA.cpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. /*
  2. * Copyright (c) 2023, Simon Wanner <simon@skyrising.xyz>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Error.h>
  7. #include <AK/String.h>
  8. #include <LibUnicode/CharacterTypes.h>
  9. #include <LibUnicode/IDNA.h>
  10. #include <LibUnicode/Normalize.h>
  11. #include <LibUnicode/Punycode.h>
  12. #if ENABLE_UNICODE_DATA
  13. # include <LibUnicode/IDNAData.h>
  14. # include <LibUnicode/UnicodeData.h>
  15. #endif
  16. namespace Unicode::IDNA {
  17. Optional<Mapping> __attribute__((weak)) get_idna_mapping(u32) { return {}; }
  18. struct ProcessingResult {
  19. Vector<String> result {};
  20. bool has_error { false };
  21. };
  22. static MappingStatus translate_status(MappingStatus status, UseStd3AsciiRules use_std3_ascii_rules)
  23. {
  24. switch (status) {
  25. case MappingStatus::DisallowedStd3Valid:
  26. return use_std3_ascii_rules == UseStd3AsciiRules::Yes ? MappingStatus::Disallowed : MappingStatus::Valid;
  27. case MappingStatus::DisallowedStd3Mapped:
  28. return use_std3_ascii_rules == UseStd3AsciiRules::Yes ? MappingStatus::Disallowed : MappingStatus::Mapped;
  29. default:
  30. return status;
  31. }
  32. }
  33. // https://www.unicode.org/reports/tr46/#Validity_Criteria
  34. static bool is_valid_label(String const& label, CheckHyphens check_hyphens, CheckBidi check_bidi, CheckJoiners check_joiners, UseStd3AsciiRules use_std3_ascii_rules, TransitionalProcessing transitional_processing)
  35. {
  36. // 1. The label must be in Unicode Normalization Form NFC.
  37. auto normalized = normalize(label, NormalizationForm::NFC);
  38. if (normalized != label)
  39. return false;
  40. size_t position = 0;
  41. for (auto code_point : label.code_points()) {
  42. // 2. If CheckHyphens, the label must not contain a U+002D HYPHEN-MINUS character in both the third and fourth positions.
  43. if (check_hyphens == CheckHyphens::Yes && code_point == '-' && (position == 2 || position == 3))
  44. return false;
  45. // 4. The label must not contain a U+002E ( . ) FULL STOP.
  46. if (code_point == '.')
  47. return false;
  48. // 5. The label must not begin with a combining mark, that is: General_Category=Mark.
  49. static auto general_category_mark = general_category_from_string("Mark"sv);
  50. if (position == 0 && general_category_mark.has_value() && code_point_has_general_category(code_point, general_category_mark.value()))
  51. return false;
  52. // 6. Each code point in the label must only have certain status values according to Section 5, IDNA Mapping Table:
  53. Optional<Mapping> mapping = get_idna_mapping(code_point);
  54. if (!mapping.has_value())
  55. return false;
  56. auto status = translate_status(mapping->status, use_std3_ascii_rules);
  57. if (transitional_processing == TransitionalProcessing::Yes) {
  58. // 1. For Transitional Processing, each value must be valid.
  59. if (status != MappingStatus::Valid)
  60. return false;
  61. } else {
  62. // 2. For Nontransitional Processing, each value must be either valid or deviation.
  63. if (status != MappingStatus::Valid && status != MappingStatus::Deviation)
  64. return false;
  65. }
  66. position++;
  67. }
  68. // 3. If CheckHyphens, the label must neither begin nor end with a U+002D HYPHEN-MINUS character.
  69. if (check_hyphens == CheckHyphens::Yes && (label.starts_with('-') || label.ends_with('-')))
  70. return false;
  71. // FIXME: 7. If CheckJoiners, the label must satisify the ContextJ rules from Appendix A, in The Unicode Code Points and Internationalized Domain Names for Applications (IDNA) [IDNA2008].
  72. (void)check_joiners;
  73. // FIXME: 8. If CheckBidi, and if the domain name is a Bidi domain name, then the label must satisfy all six of the numbered conditions in [IDNA2008] RFC 5893, Section 2.
  74. (void)check_bidi;
  75. return true;
  76. }
  77. // https://www.unicode.org/reports/tr46/#Processing
  78. static ErrorOr<ProcessingResult> apply_main_processing_steps(Utf8View domain_name, ToAsciiOptions const& options)
  79. {
  80. bool has_error = false;
  81. StringBuilder mapped;
  82. // 1. Map. For each code point in the domain_name string, look up the status value in Section 5, IDNA Mapping Table, and take the following actions:
  83. for (u32 code_point : domain_name) {
  84. Optional<Mapping> mapping = get_idna_mapping(code_point);
  85. if (!mapping.has_value()) {
  86. has_error = true;
  87. continue;
  88. }
  89. switch (translate_status(mapping->status, options.use_std3_ascii_rules)) {
  90. // disallowed: Leave the code point unchanged in the string, and record that there was an error.
  91. case MappingStatus::Disallowed:
  92. TRY(mapped.try_append_code_point(code_point));
  93. has_error = true;
  94. break;
  95. // ignored: Remove the code point from the string. This is equivalent to mapping the code point to an empty string.
  96. case MappingStatus::Ignored:
  97. break;
  98. // mapped: Replace the code point in the string by the value for the mapping in Section 5, IDNA Mapping Table.
  99. case MappingStatus::Mapped:
  100. TRY(mapped.try_append(mapping->mapped_to));
  101. break;
  102. // deviation:
  103. case MappingStatus::Deviation:
  104. if (options.transitional_processing == TransitionalProcessing::Yes) {
  105. // If Transitional_Processing, replace the code point in the string by the value for the mapping in Section 5, IDNA Mapping Table .
  106. TRY(mapped.try_append(mapping->mapped_to));
  107. } else {
  108. TRY(mapped.try_append_code_point(code_point));
  109. }
  110. break;
  111. // valid: Leave the code point unchanged in the string.
  112. case MappingStatus::Valid:
  113. TRY(mapped.try_append_code_point(code_point));
  114. break;
  115. default:
  116. VERIFY_NOT_REACHED();
  117. }
  118. }
  119. // 2. Normalize. Normalize the domain_name string to Unicode Normalization Form C.
  120. auto normalized = normalize(mapped.string_view(), NormalizationForm::NFC);
  121. // 3. Break. Break the string into labels at U+002E ( . ) FULL STOP.
  122. auto labels = TRY(normalized.split('.', SplitBehavior::KeepEmpty));
  123. // 4. Convert/Validate. For each label in the domain_name string:
  124. for (auto& label : labels) {
  125. // If the label starts with “xn--”:
  126. if (label.starts_with_bytes("xn--"sv)) {
  127. // 1. Attempt to convert the rest of the label to Unicode according to Punycode [RFC3492]. If that conversion fails, record that there was an error, and continue with the next label.
  128. // Otherwise replace the original label in the string by the results of the conversion.
  129. auto punycode = Punycode::decode(label.bytes_as_string_view().substring_view(4));
  130. if (punycode.is_error()) {
  131. has_error = true;
  132. continue;
  133. }
  134. label = punycode.release_value();
  135. // 2. Verify that the label meets the validity criteria in Section 4.1, Validity Criteria for Nontransitional Processing.
  136. // If any of the validity criteria are not satisfied, record that there was an error.
  137. if (!is_valid_label(label, options.check_hyphens, options.check_bidi, options.check_joiners, options.use_std3_ascii_rules, TransitionalProcessing::No))
  138. has_error = true;
  139. }
  140. // If the label does not start with “xn--”:
  141. else {
  142. // Verify that the label meets the validity criteria in Section 4.1, Validity Criteria for the input Processing choice (Transitional or Nontransitional).
  143. // If any of the validity criteria are not satisfied, record that there was an error.
  144. if (!is_valid_label(label, options.check_hyphens, options.check_bidi, options.check_joiners, options.use_std3_ascii_rules, options.transitional_processing))
  145. has_error = true;
  146. }
  147. }
  148. return ProcessingResult {
  149. .result = move(labels),
  150. .has_error = has_error,
  151. };
  152. }
  153. // https://www.unicode.org/reports/tr46/#ToASCII
  154. ErrorOr<String> to_ascii(Utf8View domain_name, ToAsciiOptions const& options)
  155. {
  156. // 1. To the input domain_name, apply the Processing Steps in Section 4, Processing, using the input boolean flags Transitional_Processing, CheckHyphens, CheckBidi, CheckJoiners, and UseSTD3ASCIIRules. This may record an error.
  157. auto processed = TRY(apply_main_processing_steps(domain_name, options));
  158. bool has_error = processed.has_error;
  159. // 2. Break the result into labels at U+002E FULL STOP.
  160. auto labels = move(processed.result);
  161. // 3. Convert each label with non-ASCII characters into Punycode [RFC3492], and prefix by “xn--”. This may record an error.
  162. for (auto& label : labels) {
  163. auto all_ascii = true;
  164. for (auto code_point : label.code_points()) {
  165. if (!is_ascii(code_point)) {
  166. all_ascii = false;
  167. break;
  168. }
  169. }
  170. if (!all_ascii) {
  171. auto punycode = Punycode::encode(label);
  172. if (punycode.is_error()) {
  173. has_error = true;
  174. continue;
  175. }
  176. auto punycode_result = punycode.release_value();
  177. StringBuilder builder;
  178. TRY(builder.try_append("xn--"sv));
  179. TRY(builder.try_append(punycode_result));
  180. label = TRY(builder.to_string());
  181. }
  182. }
  183. // 4. If the VerifyDnsLength flag is true, then verify DNS length restrictions. This may record an error. For more information, see [STD13] and [STD3].
  184. if (options.verify_dns_length == VerifyDnsLength::Yes) {
  185. // 1. The length of the domain name, excluding the root label and its dot, is from 1 to 253.
  186. size_t total_length = 0;
  187. auto* root_label = !labels.is_empty() && labels.last().is_empty() ? &labels.last() : nullptr;
  188. for (auto& label : labels) {
  189. // 2. The length of each label is from 1 to 63.
  190. auto length = label.bytes().size();
  191. if (label.is_empty() && &label != root_label)
  192. return Error::from_string_literal("Invalid empty label");
  193. if (length > 63)
  194. return Error::from_string_literal("Label too long");
  195. total_length += length;
  196. }
  197. total_length += labels.size() - (root_label ? 2 : 1);
  198. if (total_length == 0 || total_length > 253)
  199. return Error::from_string_literal("Domain too long");
  200. }
  201. // 5. If an error was recorded in steps 1-4, then the operation has failed and a failure value is returned. No DNS lookup should be done.
  202. if (has_error)
  203. return Error::from_string_literal("Invalid domain name");
  204. // 6. Otherwise join the labels using U+002E FULL STOP as a separator, and return the result.
  205. return String::join('.', labels);
  206. }
  207. }