HTMLEncodingDetection.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288
  1. /*
  2. * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/GenericLexer.h>
  8. #include <AK/StringView.h>
  9. #include <AK/Utf8View.h>
  10. #include <LibTextCodec/Decoder.h>
  11. #include <LibWeb/DOM/Attr.h>
  12. #include <LibWeb/DOM/Document.h>
  13. #include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
  14. #include <LibWeb/Infra/CharacterTypes.h>
  15. #include <ctype.h>
  16. namespace Web::HTML {
  17. bool prescan_should_abort(ByteBuffer const& input, size_t const& position)
  18. {
  19. return position >= input.size() || position >= 1024;
  20. }
  21. bool prescan_is_whitespace_or_slash(u8 const& byte)
  22. {
  23. return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ' || byte == '/';
  24. }
  25. bool prescan_skip_whitespace_and_slashes(ByteBuffer const& input, size_t& position)
  26. {
  27. while (!prescan_should_abort(input, position) && (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '/'))
  28. ++position;
  29. return !prescan_should_abort(input, position);
  30. }
  31. // https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
  32. Optional<StringView> extract_character_encoding_from_meta_element(DeprecatedString const& string)
  33. {
  34. // Checking for "charset" is case insensitive, as is getting an encoding.
  35. // Therefore, stick to lowercase from the start for simplicity.
  36. auto lowercase_string = string.to_lowercase();
  37. GenericLexer lexer(lowercase_string);
  38. for (;;) {
  39. auto charset_index = lexer.remaining().find("charset"sv);
  40. if (!charset_index.has_value())
  41. return {};
  42. // 7 is the length of "charset".
  43. lexer.ignore(charset_index.value() + 7);
  44. lexer.ignore_while([](char c) {
  45. return Infra::is_ascii_whitespace(c);
  46. });
  47. if (lexer.peek() != '=')
  48. continue;
  49. break;
  50. }
  51. // Ignore the '='.
  52. lexer.ignore();
  53. lexer.ignore_while([](char c) {
  54. return Infra::is_ascii_whitespace(c);
  55. });
  56. if (lexer.is_eof())
  57. return {};
  58. if (lexer.consume_specific('"')) {
  59. auto matching_double_quote = lexer.remaining().find('"');
  60. if (!matching_double_quote.has_value())
  61. return {};
  62. auto encoding = lexer.remaining().substring_view(0, matching_double_quote.value());
  63. return TextCodec::get_standardized_encoding(encoding);
  64. }
  65. if (lexer.consume_specific('\'')) {
  66. auto matching_single_quote = lexer.remaining().find('\'');
  67. if (!matching_single_quote.has_value())
  68. return {};
  69. auto encoding = lexer.remaining().substring_view(0, matching_single_quote.value());
  70. return TextCodec::get_standardized_encoding(encoding);
  71. }
  72. auto encoding = lexer.consume_until([](char c) {
  73. return Infra::is_ascii_whitespace(c) || c == ';';
  74. });
  75. return TextCodec::get_standardized_encoding(encoding);
  76. }
  77. JS::GCPtr<DOM::Attr> prescan_get_attribute(DOM::Document& document, ByteBuffer const& input, size_t& position)
  78. {
  79. if (!prescan_skip_whitespace_and_slashes(input, position))
  80. return {};
  81. if (input[position] == '>')
  82. return {};
  83. StringBuilder attribute_name;
  84. while (true) {
  85. if (input[position] == '=' && !attribute_name.is_empty()) {
  86. ++position;
  87. goto value;
  88. } else if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ')
  89. goto spaces;
  90. else if (input[position] == '/' || input[position] == '>')
  91. return *DOM::Attr::create(document, attribute_name.to_deprecated_string(), "");
  92. else
  93. attribute_name.append_as_lowercase(input[position]);
  94. ++position;
  95. if (prescan_should_abort(input, position))
  96. return {};
  97. }
  98. spaces:
  99. if (!prescan_skip_whitespace_and_slashes(input, position))
  100. return {};
  101. if (input[position] != '=')
  102. return DOM::Attr::create(document, attribute_name.to_deprecated_string(), "");
  103. ++position;
  104. value:
  105. if (!prescan_skip_whitespace_and_slashes(input, position))
  106. return {};
  107. StringBuilder attribute_value;
  108. if (input[position] == '"' || input[position] == '\'') {
  109. u8 quote_character = input[position];
  110. ++position;
  111. for (; !prescan_should_abort(input, position); ++position) {
  112. if (input[position] == quote_character)
  113. return DOM::Attr::create(document, attribute_name.to_deprecated_string(), attribute_value.to_deprecated_string());
  114. else
  115. attribute_value.append_as_lowercase(input[position]);
  116. }
  117. return {};
  118. } else if (input[position] == '>')
  119. return DOM::Attr::create(document, attribute_name.to_deprecated_string(), "");
  120. else
  121. attribute_value.append_as_lowercase(input[position]);
  122. ++position;
  123. if (prescan_should_abort(input, position))
  124. return {};
  125. for (; !prescan_should_abort(input, position); ++position) {
  126. if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>')
  127. return DOM::Attr::create(document, attribute_name.to_deprecated_string(), attribute_value.to_deprecated_string());
  128. else
  129. attribute_value.append_as_lowercase(input[position]);
  130. }
  131. return {};
  132. }
  133. // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
  134. Optional<DeprecatedString> run_prescan_byte_stream_algorithm(DOM::Document& document, ByteBuffer const& input)
  135. {
  136. // https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
  137. // Detects '<?x'
  138. if (!prescan_should_abort(input, 6)) {
  139. if (input[0] == 0x3C && input[1] == 0x00 && input[2] == 0x3F && input[3] == 0x00 && input[4] == 0x78 && input[5] == 0x00)
  140. return "utf-16le";
  141. if (input[0] == 0x00 && input[1] == 0x3C && input[2] == 0x00 && input[4] == 0x3F && input[5] == 0x00 && input[6] == 0x78)
  142. return "utf-16be";
  143. }
  144. for (size_t position = 0; !prescan_should_abort(input, position); ++position) {
  145. if (!prescan_should_abort(input, position + 5) && input[position] == '<' && input[position + 1] == '!'
  146. && input[position + 2] == '-' && input[position + 3] == '-') {
  147. position += 2;
  148. for (; !prescan_should_abort(input, position + 3); ++position) {
  149. if (input[position] == '-' && input[position + 1] == '-' && input[position + 2] == '>') {
  150. position += 2;
  151. break;
  152. }
  153. }
  154. } else if (!prescan_should_abort(input, position + 6)
  155. && input[position] == '<'
  156. && (input[position + 1] == 'M' || input[position + 1] == 'm')
  157. && (input[position + 2] == 'E' || input[position + 2] == 'e')
  158. && (input[position + 3] == 'T' || input[position + 3] == 't')
  159. && (input[position + 4] == 'A' || input[position + 4] == 'a')
  160. && prescan_is_whitespace_or_slash(input[position + 5])) {
  161. position += 6;
  162. Vector<DeprecatedString> attribute_list {};
  163. bool got_pragma = false;
  164. Optional<bool> need_pragma {};
  165. Optional<DeprecatedString> charset {};
  166. while (true) {
  167. auto attribute = prescan_get_attribute(document, input, position);
  168. if (!attribute)
  169. break;
  170. if (attribute_list.contains_slow(attribute->name()))
  171. continue;
  172. auto& attribute_name = attribute->name();
  173. attribute_list.append(attribute->name());
  174. if (attribute_name == "http-equiv") {
  175. got_pragma = attribute->value() == "content-type";
  176. } else if (attribute_name == "content") {
  177. auto encoding = extract_character_encoding_from_meta_element(attribute->value());
  178. if (encoding.has_value() && !charset.has_value()) {
  179. charset = encoding.value();
  180. need_pragma = true;
  181. }
  182. } else if (attribute_name == "charset") {
  183. auto maybe_charset = TextCodec::get_standardized_encoding(attribute->value());
  184. if (maybe_charset.has_value()) {
  185. charset = Optional<DeprecatedString> { maybe_charset };
  186. need_pragma = { false };
  187. }
  188. }
  189. }
  190. if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
  191. continue;
  192. if (charset.value() == "UTF-16BE/LE")
  193. return "UTF-8";
  194. else if (charset.value() == "x-user-defined")
  195. return "windows-1252";
  196. else
  197. return charset.value();
  198. } else if (!prescan_should_abort(input, position + 3) && input[position] == '<'
  199. && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) {
  200. position += 2;
  201. prescan_skip_whitespace_and_slashes(input, position);
  202. while (prescan_get_attribute(document, input, position)) { };
  203. } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) {
  204. position += 2;
  205. while (input[position] != '>') {
  206. ++position;
  207. if (prescan_should_abort(input, position))
  208. return {};
  209. }
  210. } else {
  211. // Do nothing.
  212. }
  213. }
  214. return {};
  215. }
  216. // https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
  217. DeprecatedString run_encoding_sniffing_algorithm(DOM::Document& document, ByteBuffer const& input)
  218. {
  219. if (input.size() >= 2) {
  220. if (input[0] == 0xFE && input[1] == 0xFF) {
  221. return "UTF-16BE";
  222. } else if (input[0] == 0xFF && input[1] == 0xFE) {
  223. return "UTF-16LE";
  224. } else if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
  225. return "UTF-8";
  226. }
  227. }
  228. // FIXME: If the user has explicitly instructed the user agent to override the document's character
  229. // encoding with a specific encoding.
  230. // FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or
  231. // at any later step in this algorithm.
  232. // FIXME: If the transport layer specifies a character encoding, and it is supported.
  233. auto optional_encoding = run_prescan_byte_stream_algorithm(document, input);
  234. if (optional_encoding.has_value()) {
  235. return optional_encoding.value();
  236. }
  237. // FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context
  238. // is non-null and a child browsing context.
  239. // FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page
  240. // when it was last visited.
  241. if (!Utf8View(StringView(input)).validate()) {
  242. // FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
  243. return "windows-1252";
  244. }
  245. // NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification.
  246. // "Otherwise, return an implementation-defined or user-specified default character encoding, [...]."
  247. return "UTF-8";
  248. }
  249. }