HTMLEncodingDetection.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. /*
  2. * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/GenericLexer.h>
  8. #include <AK/StringView.h>
  9. #include <AK/Utf8View.h>
  10. #include <LibTextCodec/Decoder.h>
  11. #include <LibWeb/DOM/Attribute.h>
  12. #include <LibWeb/DOM/Document.h>
  13. #include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
  14. #include <ctype.h>
  15. namespace Web::HTML {
  16. bool prescan_should_abort(ByteBuffer const& input, size_t const& position)
  17. {
  18. return position >= input.size() || position >= 1024;
  19. }
  20. bool prescan_is_whitespace_or_slash(u8 const& byte)
  21. {
  22. return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ' || byte == '/';
  23. }
  24. bool prescan_skip_whitespace_and_slashes(ByteBuffer const& input, size_t& position)
  25. {
  26. while (!prescan_should_abort(input, position) && (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '/'))
  27. ++position;
  28. return !prescan_should_abort(input, position);
  29. }
  30. // https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
  31. Optional<StringView> extract_character_encoding_from_meta_element(String const& string)
  32. {
  33. // Checking for "charset" is case insensitive, as is getting an encoding.
  34. // Therefore, stick to lowercase from the start for simplicity.
  35. auto lowercase_string = string.to_lowercase();
  36. GenericLexer lexer(lowercase_string);
  37. for (;;) {
  38. auto charset_index = lexer.remaining().find("charset"sv);
  39. if (!charset_index.has_value())
  40. return {};
  41. // 7 is the length of "charset".
  42. lexer.ignore(charset_index.value() + 7);
  43. lexer.ignore_while([](char c) {
  44. // FIXME: Not the exact same ASCII whitespace. The spec does not include vertical tab (\v).
  45. return is_ascii_space(c);
  46. });
  47. if (lexer.peek() != '=')
  48. continue;
  49. break;
  50. }
  51. // Ignore the '='.
  52. lexer.ignore();
  53. lexer.ignore_while([](char c) {
  54. // FIXME: Not the exact same ASCII whitespace. The spec does not include vertical tab (\v).
  55. return is_ascii_space(c);
  56. });
  57. if (lexer.is_eof())
  58. return {};
  59. if (lexer.consume_specific('"')) {
  60. auto matching_double_quote = lexer.remaining().find('"');
  61. if (!matching_double_quote.has_value())
  62. return {};
  63. auto encoding = lexer.remaining().substring_view(0, matching_double_quote.value());
  64. return TextCodec::get_standardized_encoding(encoding);
  65. }
  66. if (lexer.consume_specific('\'')) {
  67. auto matching_single_quote = lexer.remaining().find('\'');
  68. if (!matching_single_quote.has_value())
  69. return {};
  70. auto encoding = lexer.remaining().substring_view(0, matching_single_quote.value());
  71. return TextCodec::get_standardized_encoding(encoding);
  72. }
  73. auto encoding = lexer.consume_until([](char c) {
  74. // FIXME: Not the exact same ASCII whitespace. The spec does not include vertical tab (\v).
  75. return is_ascii_space(c) || c == ';';
  76. });
  77. return TextCodec::get_standardized_encoding(encoding);
  78. }
  79. JS::GCPtr<DOM::Attribute> prescan_get_attribute(DOM::Document& document, ByteBuffer const& input, size_t& position)
  80. {
  81. if (!prescan_skip_whitespace_and_slashes(input, position))
  82. return {};
  83. if (input[position] == '>')
  84. return {};
  85. StringBuilder attribute_name;
  86. while (true) {
  87. if (input[position] == '=' && !attribute_name.is_empty()) {
  88. ++position;
  89. goto value;
  90. } else if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ')
  91. goto spaces;
  92. else if (input[position] == '/' || input[position] == '>')
  93. return *DOM::Attribute::create(document, attribute_name.to_string(), "");
  94. else
  95. attribute_name.append_as_lowercase(input[position]);
  96. ++position;
  97. if (prescan_should_abort(input, position))
  98. return {};
  99. }
  100. spaces:
  101. if (!prescan_skip_whitespace_and_slashes(input, position))
  102. return {};
  103. if (input[position] != '=')
  104. return DOM::Attribute::create(document, attribute_name.to_string(), "");
  105. ++position;
  106. value:
  107. if (!prescan_skip_whitespace_and_slashes(input, position))
  108. return {};
  109. StringBuilder attribute_value;
  110. if (input[position] == '"' || input[position] == '\'') {
  111. u8 quote_character = input[position];
  112. ++position;
  113. for (; !prescan_should_abort(input, position); ++position) {
  114. if (input[position] == quote_character)
  115. return DOM::Attribute::create(document, attribute_name.to_string(), attribute_value.to_string());
  116. else
  117. attribute_value.append_as_lowercase(input[position]);
  118. }
  119. return {};
  120. } else if (input[position] == '>')
  121. return DOM::Attribute::create(document, attribute_name.to_string(), "");
  122. else
  123. attribute_value.append_as_lowercase(input[position]);
  124. ++position;
  125. if (prescan_should_abort(input, position))
  126. return {};
  127. for (; !prescan_should_abort(input, position); ++position) {
  128. if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>')
  129. return DOM::Attribute::create(document, attribute_name.to_string(), attribute_value.to_string());
  130. else
  131. attribute_value.append_as_lowercase(input[position]);
  132. }
  133. return {};
  134. }
  135. // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
  136. Optional<String> run_prescan_byte_stream_algorithm(DOM::Document& document, ByteBuffer const& input)
  137. {
  138. // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
  139. // Detects '<?x'
  140. if (!prescan_should_abort(input, 6)) {
  141. if (input[0] == 0x3C && input[1] == 0x00 && input[2] == 0x3F && input[3] == 0x00 && input[4] == 0x78 && input[5] == 0x00)
  142. return "utf-16le";
  143. if (input[0] == 0x00 && input[1] == 0x3C && input[2] == 0x00 && input[4] == 0x3F && input[5] == 0x00 && input[6] == 0x78)
  144. return "utf-16be";
  145. }
  146. for (size_t position = 0; !prescan_should_abort(input, position); ++position) {
  147. if (!prescan_should_abort(input, position + 5) && input[position] == '<' && input[position + 1] == '!'
  148. && input[position + 2] == '-' && input[position + 3] == '-') {
  149. position += 2;
  150. for (; !prescan_should_abort(input, position + 3); ++position) {
  151. if (input[position] == '-' && input[position + 1] == '-' && input[position + 2] == '>') {
  152. position += 2;
  153. break;
  154. }
  155. }
  156. } else if (!prescan_should_abort(input, position + 6)
  157. && input[position] == '<'
  158. && (input[position + 1] == 'M' || input[position + 1] == 'm')
  159. && (input[position + 2] == 'E' || input[position + 2] == 'e')
  160. && (input[position + 3] == 'T' || input[position + 3] == 't')
  161. && (input[position + 4] == 'A' || input[position + 4] == 'a')
  162. && prescan_is_whitespace_or_slash(input[position + 5])) {
  163. position += 6;
  164. Vector<String> attribute_list {};
  165. bool got_pragma = false;
  166. Optional<bool> need_pragma {};
  167. Optional<String> charset {};
  168. while (true) {
  169. auto attribute = prescan_get_attribute(document, input, position);
  170. if (!attribute)
  171. break;
  172. if (attribute_list.contains_slow(attribute->name()))
  173. continue;
  174. auto& attribute_name = attribute->name();
  175. attribute_list.append(attribute->name());
  176. if (attribute_name == "http-equiv") {
  177. got_pragma = attribute->value() == "content-type";
  178. } else if (attribute_name == "content") {
  179. auto encoding = extract_character_encoding_from_meta_element(attribute->value());
  180. if (encoding.has_value() && !charset.has_value()) {
  181. charset = encoding.value();
  182. need_pragma = true;
  183. }
  184. } else if (attribute_name == "charset") {
  185. auto maybe_charset = TextCodec::get_standardized_encoding(attribute->value());
  186. if (maybe_charset.has_value()) {
  187. charset = Optional<String> { maybe_charset };
  188. need_pragma = { false };
  189. }
  190. }
  191. }
  192. if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
  193. continue;
  194. if (charset.value() == "UTF-16BE/LE")
  195. return "UTF-8";
  196. else if (charset.value() == "x-user-defined")
  197. return "windows-1252";
  198. else
  199. return charset.value();
  200. } else if (!prescan_should_abort(input, position + 3) && input[position] == '<'
  201. && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) {
  202. position += 2;
  203. prescan_skip_whitespace_and_slashes(input, position);
  204. while (prescan_get_attribute(document, input, position)) { };
  205. } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) {
  206. position += 2;
  207. while (input[position] != '>') {
  208. ++position;
  209. if (prescan_should_abort(input, position))
  210. return {};
  211. }
  212. } else {
  213. // Do nothing.
  214. }
  215. }
  216. return {};
  217. }
  218. // https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
  219. String run_encoding_sniffing_algorithm(DOM::Document& document, ByteBuffer const& input)
  220. {
  221. if (input.size() >= 2) {
  222. if (input[0] == 0xFE && input[1] == 0xFF) {
  223. return "UTF-16BE";
  224. } else if (input[0] == 0xFF && input[1] == 0xFE) {
  225. return "UTF-16LE";
  226. } else if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
  227. return "UTF-8";
  228. }
  229. }
  230. // FIXME: If the user has explicitly instructed the user agent to override the document's character
  231. // encoding with a specific encoding.
  232. // FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or
  233. // at any later step in this algorithm.
  234. // FIXME: If the transport layer specifies a character encoding, and it is supported.
  235. auto optional_encoding = run_prescan_byte_stream_algorithm(document, input);
  236. if (optional_encoding.has_value()) {
  237. return optional_encoding.value();
  238. }
  239. // FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context
  240. // is non-null and a child browsing context.
  241. // FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page
  242. // when it was last visited.
  243. if (!Utf8View(StringView(input)).validate()) {
  244. // FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
  245. return "windows-1252";
  246. }
  247. // NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification.
  248. // "Otherwise, return an implementation-defined or user-specified default character encoding, [...]."
  249. return "UTF-8";
  250. }
  251. }