MimeType.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. /*
  2. * Copyright (c) 2022, Luke Wilde <lukew@serenityos.org>
  3. * Copyright (c) 2022-2023, Linus Groh <linusg@serenityos.org>
  4. * Copyright (c) 2022, networkException <networkexception@serenityos.org>
  5. * Copyright (c) 2024, Jamie Mansfield <jmansfield@cadixdev.org>
  6. *
  7. * SPDX-License-Identifier: BSD-2-Clause
  8. */
  9. #include <AK/CharacterTypes.h>
  10. #include <AK/GenericLexer.h>
  11. #include <AK/String.h>
  12. #include <AK/StringBuilder.h>
  13. #include <AK/Utf8View.h>
  14. #include <LibWeb/Fetch/Infrastructure/HTTP.h>
  15. #include <LibWeb/Infra/Strings.h>
  16. #include <LibWeb/MimeSniff/MimeType.h>
  17. namespace Web::MimeSniff {
  18. // https://mimesniff.spec.whatwg.org/#javascript-mime-type-essence-match
  19. bool is_javascript_mime_type_essence_match(StringView string)
  20. {
  21. // A string is a JavaScript MIME type essence match if it is an ASCII case-insensitive match for one of the JavaScript MIME type essence strings.
  22. for (auto const& javascript_essence : s_javascript_mime_type_essence_strings) {
  23. if (string.equals_ignoring_ascii_case(javascript_essence))
  24. return true;
  25. }
  26. return false;
  27. }
  28. static bool contains_only_http_quoted_string_token_code_points(StringView string)
  29. {
  30. // https://mimesniff.spec.whatwg.org/#http-quoted-string-token-code-point
  31. // An HTTP quoted-string token code point is U+0009 TAB, a code point in the range U+0020 SPACE to U+007E (~), inclusive,
  32. // or a code point in the range U+0080 through U+00FF (ÿ), inclusive.
  33. for (auto ch : Utf8View(string)) {
  34. if (!(ch == '\t' || (ch >= 0x20 && ch <= 0x7E) || (ch >= 0x80 && ch <= 0xFF)))
  35. return false;
  36. }
  37. return true;
  38. }
  39. static bool contains_only_http_token_code_points(StringView string)
  40. {
  41. // https://mimesniff.spec.whatwg.org/#http-token-code-point
  42. // An HTTP token code point is U+0021 (!), U+0023 (#), U+0024 ($), U+0025 (%), U+0026 (&), U+0027 ('), U+002A (*),
  43. // U+002B (+), U+002D (-), U+002E (.), U+005E (^), U+005F (_), U+0060 (`), U+007C (|), U+007E (~), or an ASCII alphanumeric.
  44. constexpr auto is_certain_non_ascii_alphanumeric = is_any_of("!#$%&'*+-.^_`|~"sv);
  45. for (char ch : string) {
  46. if (!is_certain_non_ascii_alphanumeric(ch) && !is_ascii_alphanumeric(ch))
  47. return false;
  48. }
  49. return true;
  50. }
  51. MimeType::MimeType(String type, String subtype)
  52. : m_type(move(type))
  53. , m_subtype(move(subtype))
  54. {
  55. // NOTE: type and subtype are expected to be non-empty and contain only
  56. // http token code points in the MIME type parsing algorithm. That's
  57. // why we are performing the same checks here.
  58. VERIFY(!m_type.is_empty() && contains_only_http_token_code_points(m_type));
  59. VERIFY(!m_subtype.is_empty() && contains_only_http_token_code_points(m_subtype));
  60. }
  61. MimeType::MimeType(MimeType const& other) = default;
  62. MimeType& MimeType::operator=(MimeType const& other) = default;
  63. MimeType::MimeType(MimeType&& other) = default;
  64. MimeType& MimeType::operator=(MimeType&& other) = default;
  65. MimeType::~MimeType() = default;
  66. MimeType MimeType::create(String type, String subtype)
  67. {
  68. auto mime_type = MimeType { move(type), move(subtype) };
  69. mime_type.m_cached_essence = MUST(String::formatted("{}/{}", mime_type.m_type, mime_type.m_subtype));
  70. return mime_type;
  71. }
  72. // https://mimesniff.spec.whatwg.org/#parse-a-mime-type
  73. Optional<MimeType> MimeType::parse(StringView string)
  74. {
  75. // Verify that the input string is valid UTF-8 first, so we don't have to think about it anymore.
  76. if (!Utf8View(string).validate())
  77. return OptionalNone {};
  78. // 1. Remove any leading and trailing HTTP whitespace from input.
  79. auto trimmed_string = string.trim(Fetch::Infrastructure::HTTP_WHITESPACE, TrimMode::Both);
  80. // 2. Let position be a position variable for input, initially pointing at the start of input.
  81. GenericLexer lexer(trimmed_string);
  82. // 3. Let type be the result of collecting a sequence of code points that are not U+002F (/) from input, given position.
  83. auto type = lexer.consume_until('/');
  84. // 4. If type is the empty string or does not solely contain HTTP token code points, then return failure.
  85. if (type.is_empty() || !contains_only_http_token_code_points(type))
  86. return OptionalNone {};
  87. // 5. If position is past the end of input, then return failure.
  88. if (lexer.is_eof())
  89. return OptionalNone {};
  90. // 6. Advance position by 1. (This skips past U+002F (/).)
  91. lexer.ignore(1);
  92. // 7. Let subtype be the result of collecting a sequence of code points that are not U+003B (;) from input, given position.
  93. auto subtype = lexer.consume_until(';');
  94. // 8. Remove any trailing HTTP whitespace from subtype.
  95. subtype = subtype.trim(Fetch::Infrastructure::HTTP_WHITESPACE, TrimMode::Right);
  96. // 9. If subtype is the empty string or does not solely contain HTTP token code points, then return failure.
  97. if (subtype.is_empty() || !contains_only_http_token_code_points(subtype))
  98. return OptionalNone {};
  99. // 10. Let mimeType be a new MIME type record whose type is type, in ASCII lowercase, and subtype is subtype, in ASCII lowercase.
  100. auto mime_type = MimeType::create(MUST(Infra::to_ascii_lowercase(type)), MUST(Infra::to_ascii_lowercase(subtype)));
  101. // 11. While position is not past the end of input:
  102. while (!lexer.is_eof()) {
  103. // 1. Advance position by 1. (This skips past U+003B (;).)
  104. lexer.ignore(1);
  105. // 2. Collect a sequence of code points that are HTTP whitespace from input given position.
  106. lexer.ignore_while(is_any_of(Fetch::Infrastructure::HTTP_WHITESPACE));
  107. // 3. Let parameterName be the result of collecting a sequence of code points that are not U+003B (;) or U+003D (=) from input, given position.
  108. auto parameter_name_view = lexer.consume_until([](char ch) {
  109. return ch == ';' || ch == '=';
  110. });
  111. // 4. Set parameterName to parameterName, in ASCII lowercase.
  112. auto parameter_name = MUST(Infra::to_ascii_lowercase(parameter_name_view));
  113. // 5. If position is not past the end of input, then:
  114. if (!lexer.is_eof()) {
  115. // 1. If the code point at position within input is U+003B (;), then continue.
  116. if (lexer.peek() == ';')
  117. continue;
  118. // 2. Advance position by 1. (This skips past U+003D (=).)
  119. lexer.ignore(1);
  120. }
  121. // 6. If position is past the end of input, then break.
  122. // NOTE: This is not an `else` because the ignore on step 11.5.2 could put us past the end of the input.
  123. if (lexer.is_eof())
  124. break;
  125. // 7. Let parameterValue be null.
  126. String parameter_value;
  127. // 8. If the code point at position within input is U+0022 ("), then:
  128. if (lexer.peek() == '"') {
  129. // 1. Set parameterValue to the result of collecting an HTTP quoted string from input, given position and the extract-value flag.
  130. parameter_value = Fetch::Infrastructure::collect_an_http_quoted_string(lexer, Fetch::Infrastructure::HttpQuotedStringExtractValue::Yes);
  131. // 2. Collect a sequence of code points that are not U+003B (;) from input, given position.
  132. lexer.ignore_until(';');
  133. }
  134. // 9. Otherwise:
  135. else {
  136. // 1. Set parameterValue to the result of collecting a sequence of code points that are not U+003B (;) from input, given position.
  137. parameter_value = String::from_utf8_without_validation(lexer.consume_until(';').bytes());
  138. // 2. Remove any trailing HTTP whitespace from parameterValue.
  139. parameter_value = MUST(parameter_value.trim(Fetch::Infrastructure::HTTP_WHITESPACE, TrimMode::Right));
  140. // 3. If parameterValue is the empty string, then continue.
  141. if (parameter_value.is_empty())
  142. continue;
  143. }
  144. // 10. If all of the following are true
  145. if (
  146. // - parameterName is not the empty string
  147. !parameter_name.is_empty()
  148. // - parameterName solely contains HTTP token code points
  149. && contains_only_http_token_code_points(parameter_name)
  150. // - parameterValue solely contains HTTP quoted-string token code points
  151. && contains_only_http_quoted_string_token_code_points(parameter_value)
  152. // - mimeType’s parameters[parameterName] does not exist
  153. && !mime_type.m_parameters.contains(parameter_name)) {
  154. // then set mimeType’s parameters[parameterName] to parameterValue.
  155. mime_type.m_parameters.set(move(parameter_name), move(parameter_value));
  156. }
  157. }
  158. // 12. Return mimeType.
  159. return mime_type;
  160. }
  161. // https://mimesniff.spec.whatwg.org/#mime-type-essence
  162. String const& MimeType::essence() const
  163. {
  164. // The essence of a MIME type mimeType is mimeType’s type, followed by U+002F (/), followed by mimeType’s subtype.
  165. return m_cached_essence;
  166. }
  167. // https://mimesniff.spec.whatwg.org/#serialize-a-mime-type
  168. String MimeType::serialized() const
  169. {
  170. // 1. Let serialization be the concatenation of mimeType’s type, U+002F (/), and mimeType’s subtype.
  171. StringBuilder serialization;
  172. serialization.append(m_type);
  173. serialization.append('/');
  174. serialization.append(m_subtype);
  175. // 2. For each name → value of mimeType’s parameters:
  176. for (auto [name, value] : m_parameters) {
  177. // 1. Append U+003B (;) to serialization.
  178. serialization.append(';');
  179. // 2. Append name to serialization.
  180. serialization.append(name);
  181. // 3. Append U+003D (=) to serialization.
  182. serialization.append('=');
  183. // 4. If value does not solely contain HTTP token code points or value is the empty string, then:
  184. if (!contains_only_http_token_code_points(value) || value.is_empty()) {
  185. // 1. Precede each occurrence of U+0022 (") or U+005C (\) in value with U+005C (\).
  186. value = MUST(value.replace("\\"sv, "\\\\"sv, ReplaceMode::All));
  187. value = MUST(value.replace("\""sv, "\\\""sv, ReplaceMode::All));
  188. // 2. Prepend U+0022 (") to value.
  189. // 3. Append U+0022 (") to value.
  190. value = MUST(String::formatted("\"{}\"", value));
  191. }
  192. // 5. Append value to serialization.
  193. serialization.append(value);
  194. }
  195. // 3. Return serialization.
  196. return serialization.to_string_without_validation();
  197. }
  198. void MimeType::set_parameter(String name, String value)
  199. {
  200. // https://mimesniff.spec.whatwg.org/#parameters
  201. // A MIME type’s parameters is an ordered map whose keys are ASCII strings and values are strings limited to HTTP quoted-string token code points.
  202. VERIFY(contains_only_http_quoted_string_token_code_points(name));
  203. VERIFY(contains_only_http_quoted_string_token_code_points(value));
  204. m_parameters.set(move(name), move(value));
  205. }
  206. // https://mimesniff.spec.whatwg.org/#image-mime-type
  207. bool MimeType::is_image() const
  208. {
  209. // An image MIME type is a MIME type whose type is "image".
  210. return type() == "image"sv;
  211. }
  212. // https://mimesniff.spec.whatwg.org/#audio-or-video-mime-type
  213. bool MimeType::is_audio_or_video() const
  214. {
  215. // An audio or video MIME type is any MIME type whose type is "audio" or "video", or whose essence is "application/ogg".
  216. return type().is_one_of("audio"sv, "video"sv) || essence() == "application/ogg"sv;
  217. }
  218. // https://mimesniff.spec.whatwg.org/#font-mime-type
  219. bool MimeType::is_font() const
  220. {
  221. // A font MIME type is any MIME type whose type is "font", or whose essence is one of the following:
  222. // - application/font-cff
  223. // - application/font-off
  224. // - application/font-sfnt
  225. // - application/font-ttf
  226. // - application/font-woff
  227. // - application/vnd.ms-fontobject
  228. // - application/vnd.ms-opentype
  229. if (type() == "font"sv)
  230. return true;
  231. return essence().is_one_of(
  232. "application/font-cff"sv,
  233. "application/font-off"sv,
  234. "application/font-sfnt"sv,
  235. "application/font-ttf"sv,
  236. "application/font-woff"sv,
  237. "application/vnd.ms-fontobject"sv,
  238. "application/vnd.ms-opentype"sv);
  239. }
  240. // https://mimesniff.spec.whatwg.org/#zip-based-mime-type
  241. bool MimeType::is_zip_based() const
  242. {
  243. // A ZIP-based MIME type is any MIME type whose subtype ends in "+zip" or whose essence is one of the following:
  244. // - application/zip
  245. return subtype().ends_with_bytes("+zip"sv) || essence().is_one_of("application/zip"sv);
  246. }
  247. // https://mimesniff.spec.whatwg.org/#archive-mime-type
  248. bool MimeType::is_archive() const
  249. {
  250. // An archive MIME type is any MIME type whose essence is one of the following:
  251. // - application/x-rar-compressed
  252. // - application/zip
  253. // - application/x-gzip
  254. return essence().is_one_of("application/x-rar-compressed"sv, "application/zip"sv, "application/x-gzip"sv);
  255. }
  256. // https://mimesniff.spec.whatwg.org/#xml-mime-type
  257. bool MimeType::is_xml() const
  258. {
  259. // An XML MIME type is any MIME type whose subtype ends in "+xml" or whose essence is "text/xml" or "application/xml". [RFC7303]
  260. return m_subtype.ends_with_bytes("+xml"sv) || essence().is_one_of("text/xml"sv, "application/xml"sv);
  261. }
  262. // https://mimesniff.spec.whatwg.org/#html-mime-type
  263. bool MimeType::is_html() const
  264. {
  265. // An HTML MIME type is any MIME type whose essence is "text/html".
  266. return essence().is_one_of("text/html"sv);
  267. }
  268. // https://mimesniff.spec.whatwg.org/#scriptable-mime-type
  269. bool MimeType::is_scriptable() const
  270. {
  271. // A scriptable MIME type is an XML MIME type, HTML MIME type, or any MIME type whose essence is "application/pdf".
  272. return is_xml() || is_html() || essence() == "application/pdf"sv;
  273. }
  274. // https://mimesniff.spec.whatwg.org/#javascript-mime-type
  275. bool MimeType::is_javascript() const
  276. {
  277. return s_javascript_mime_type_essence_strings.contains_slow(essence());
  278. }
  279. // https://mimesniff.spec.whatwg.org/#json-mime-type
  280. bool MimeType::is_json() const
  281. {
  282. // A JSON MIME type is any MIME type whose subtype ends in "+json" or whose essence is "application/json" or "text/json".
  283. return subtype().ends_with_bytes("+json"sv) || essence().is_one_of("application/json"sv, "text/json"sv);
  284. }
  285. // https://mimesniff.spec.whatwg.org/#minimize-a-supported-mime-type
  286. String minimise_a_supported_mime_type(MimeType const& mime_type)
  287. {
  288. // 1. If mimeType is a JavaScript MIME type, then return "text/javascript".
  289. if (mime_type.is_javascript())
  290. return "text/javascript"_string;
  291. // 2. If mimeType is a JSON MIME type, then return "application/json".
  292. if (mime_type.is_json())
  293. return "application/json"_string;
  294. // 3. If mimeType’s essence is "image/svg+xml", then return "image/svg+xml".
  295. if (mime_type.essence() == "image/svg+xml")
  296. return "image/svg+xml"_string;
  297. // 4. If mimeType is an XML MIME type, then return "application/xml".
  298. if (mime_type.is_xml())
  299. return "application/xml"_string;
  300. // FIXME: 5. If mimeType is supported by the user agent, then return mimeType’s essence.
  301. // 6. Return the empty string.
  302. return {};
  303. }
  304. }