HTMLEncodingDetection.cpp 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
  1. /*
  2. * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/StringView.h>
  7. #include <AK/Utf8View.h>
  8. #include <LibTextCodec/Decoder.h>
  9. #include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
  10. #include <ctype.h>
  11. namespace Web::HTML {
  12. bool prescan_should_abort(const ByteBuffer& input, const size_t& position)
  13. {
  14. return position >= input.size() || position >= 1024;
  15. }
  16. bool prescan_is_whitespace_or_slash(const u8& byte)
  17. {
  18. return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ' || byte == '/';
  19. }
  20. bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position)
  21. {
  22. while (!prescan_should_abort(input, position) && (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '/'))
  23. ++position;
  24. return !prescan_should_abort(input, position);
  25. }
  26. Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position)
  27. {
  28. if (!prescan_skip_whitespace_and_slashes(input, position))
  29. return {};
  30. if (input[position] == '>')
  31. return {};
  32. StringBuilder attribute_name;
  33. while (true) {
  34. if (input[position] == '=' && !attribute_name.is_empty()) {
  35. ++position;
  36. goto value;
  37. } else if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ')
  38. goto spaces;
  39. else if (input[position] == '/' || input[position] == '>')
  40. return Attribute(attribute_name.to_string(), "");
  41. else
  42. attribute_name.append_as_lowercase(input[position]);
  43. ++position;
  44. if (prescan_should_abort(input, position))
  45. return {};
  46. }
  47. spaces:
  48. if (!prescan_skip_whitespace_and_slashes(input, position))
  49. return {};
  50. if (input[position] != '=')
  51. return Attribute(attribute_name.to_string(), "");
  52. ++position;
  53. value:
  54. if (!prescan_skip_whitespace_and_slashes(input, position))
  55. return {};
  56. StringBuilder attribute_value;
  57. if (input[position] == '"' || input[position] == '\'') {
  58. u8 quote_character = input[position];
  59. ++position;
  60. for (; !prescan_should_abort(input, position); ++position) {
  61. if (input[position] == quote_character)
  62. return Attribute(attribute_name.to_string(), attribute_value.to_string());
  63. else
  64. attribute_value.append_as_lowercase(input[position]);
  65. }
  66. return {};
  67. } else if (input[position] == '>')
  68. return Attribute(attribute_name.to_string(), "");
  69. else
  70. attribute_value.append_as_lowercase(input[position]);
  71. ++position;
  72. if (prescan_should_abort(input, position))
  73. return {};
  74. for (; !prescan_should_abort(input, position); ++position) {
  75. if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>')
  76. return Attribute(attribute_name.to_string(), attribute_value.to_string());
  77. else
  78. attribute_value.append_as_lowercase(input[position]);
  79. }
  80. return {};
  81. }
  82. // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
  83. Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input)
  84. {
  85. // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
  86. // Detects '<?x'
  87. if (!prescan_should_abort(input, 6)) {
  88. if (input[0] == 0x3C && input[1] == 0x00 && input[2] == 0x3F && input[3] == 0x00 && input[4] == 0x78 && input[5] == 0x00)
  89. return "utf-16le";
  90. if (input[0] == 0x00 && input[1] == 0x3C && input[2] == 0x00 && input[4] == 0x3F && input[5] == 0x00 && input[6] == 0x78)
  91. return "utf-16be";
  92. }
  93. for (size_t position = 0; !prescan_should_abort(input, position); ++position) {
  94. if (!prescan_should_abort(input, position + 5) && input[position] == '<' && input[position + 1] == '!'
  95. && input[position + 2] == '-' && input[position + 3] == '-') {
  96. position += 2;
  97. for (; !prescan_should_abort(input, position + 3); ++position) {
  98. if (input[position] == '-' && input[position + 1] == '-' && input[position + 2] == '>') {
  99. position += 2;
  100. break;
  101. }
  102. }
  103. } else if (!prescan_should_abort(input, position + 6)
  104. && input[position] == '<'
  105. && (input[position + 1] == 'M' || input[position + 1] == 'm')
  106. && (input[position + 2] == 'E' || input[position + 2] == 'e')
  107. && (input[position + 3] == 'T' || input[position + 3] == 't')
  108. && (input[position + 4] == 'A' || input[position + 4] == 'a')
  109. && prescan_is_whitespace_or_slash(input[position + 5])) {
  110. position += 6;
  111. Vector<String> attribute_list {};
  112. bool got_pragma = false;
  113. Optional<bool> need_pragma {};
  114. Optional<String> charset {};
  115. while (true) {
  116. auto attribute = prescan_get_attribute(input, position);
  117. if (!attribute.has_value())
  118. break;
  119. if (attribute_list.contains_slow(attribute.value().name()))
  120. continue;
  121. auto& attribute_name = attribute.value().name();
  122. attribute_list.append(attribute.value().name());
  123. if (attribute_name == "http-equiv" && attribute.value().value() == "content-type")
  124. got_pragma = true;
  125. else if (attribute_name == "charset") {
  126. auto maybe_charset = TextCodec::get_standardized_encoding(attribute.value().value());
  127. if (maybe_charset.has_value()) {
  128. charset = Optional<String> { maybe_charset };
  129. need_pragma = { false };
  130. }
  131. }
  132. // FIXME: For attribute name "content", do this:
  133. // Apply the "algorithm for extracting a character encoding from a meta
  134. // element", giving the attribute's value as the string to parse. If a
  135. // character encoding is returned, and if charset is still set to null,
  136. // let charset be the encoding returned, and set need pragma to true.
  137. }
  138. if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
  139. continue;
  140. if (charset.value() == "UTF-16BE/LE")
  141. return "UTF-8";
  142. else if (charset.value() == "x-user-defined")
  143. return "windows-1252";
  144. else
  145. return charset.value();
  146. } else if (!prescan_should_abort(input, position + 3) && input[position] == '<'
  147. && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) {
  148. position += 2;
  149. prescan_skip_whitespace_and_slashes(input, position);
  150. while (prescan_get_attribute(input, position).has_value()) { };
  151. } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) {
  152. position += 2;
  153. while (input[position] != '>') {
  154. ++position;
  155. if (prescan_should_abort(input, position))
  156. return {};
  157. }
  158. } else {
  159. // Do nothing.
  160. }
  161. }
  162. return {};
  163. }
  164. // https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
  165. String run_encoding_sniffing_algorithm(const ByteBuffer& input)
  166. {
  167. if (input.size() >= 2) {
  168. if (input[0] == 0xFE && input[1] == 0xFF) {
  169. return "UTF-16BE";
  170. } else if (input[0] == 0xFF && input[1] == 0xFE) {
  171. return "UTF-16LE";
  172. } else if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
  173. return "UTF-8";
  174. }
  175. }
  176. // FIXME: If the user has explicitly instructed the user agent to override the document's character
  177. // encoding with a specific encoding.
  178. // FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or
  179. // at any later step in this algorithm.
  180. // FIXME: If the transport layer specifies a character encoding, and it is supported.
  181. auto optional_encoding = run_prescan_byte_stream_algorithm(input);
  182. if (optional_encoding.has_value()) {
  183. return optional_encoding.value();
  184. }
  185. // FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context
  186. // is non-null and a child browsing context.
  187. // FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page
  188. // when it was last visited.
  189. if (!Utf8View(StringView(input)).validate()) {
  190. // FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
  191. return "windows-1252";
  192. }
  193. // NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification.
  194. // "Otherwise, return an implementation-defined or user-specified default character encoding, [...]."
  195. return "UTF-8";
  196. }
  197. }