Resource.cpp 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785
  1. /*
  2. * Copyright (c) 2023-2024, Kemal Zebari <kemalzebra@gmail.com>.
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <LibWeb/Fetch/Infrastructure/URL.h>
  7. #include <LibWeb/MimeSniff/Resource.h>
  8. namespace {
  9. using namespace Web::MimeSniff;
  10. struct BytePatternTableRow {
  11. StringView byte_pattern;
  12. StringView pattern_mask;
  13. ReadonlyBytes ignored_leading_bytes;
  14. StringView mime_type;
  15. // NOTE: If the byte pattern has a tag-terminating byte, add a byte where this byte should be. The value itself is ignored in
  16. // the pattern_matching_algorithm() (see the NOTE in this algorithm for more details).
  17. bool is_tag_terminated { false };
  18. };
  // https://mimesniff.spec.whatwg.org/#tag-terminating-byte
  // Returns true if `byte` is a tag-terminating byte (abbreviated 0xTT in the spec).
  bool is_tag_terminating_byte(u8 byte)
  {
      switch (byte) {
      case 0x20: // SP
      case 0x3E: // ">"
          return true;
      default:
          return false;
      }
  }
  // https://mimesniff.spec.whatwg.org/#binary-data-byte
  // Returns true if `byte` is a binary data byte: a byte in the range 0x00 to 0x08 (NUL to BS), the byte
  // 0x0B (VT), a byte in the range 0x0E to 0x1A (SO to SUB), or a byte in the range 0x1C to 0x1F (FS to US).
  bool is_binary_data_byte(u8 byte)
  {
      // Nothing above 0x1F (US) is a binary data byte.
      if (byte > 0x1F)
          return false;

      // Within 0x00..0x1F, every byte except the five below falls into one of the ranges listed above.
      switch (byte) {
      case 0x09: // HT
      case 0x0A: // LF
      case 0x0C: // FF
      case 0x0D: // CR
      case 0x1B: // ESC
          return false;
      default:
          return true;
      }
  }
  32. // https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm
  33. bool pattern_matching_algorithm(ReadonlyBytes input, ReadonlyBytes pattern, ReadonlyBytes mask, ReadonlyBytes ignored, bool is_tag_terminated = false)
  34. {
  35. // 1. Assert: pattern’s length is equal to mask’s length.
  36. VERIFY(pattern.size() == mask.size());
  37. // 2. If input’s length is less than pattern’s length, return false.
  38. if (input.size() < pattern.size())
  39. return false;
  40. // 3. Let s be 0.
  41. size_t s = 0;
  42. // 4. While s < input’s length:
  43. while (s < input.size()) {
  44. // 1. If ignored does not contain input[s], break.
  45. if (!ignored.contains_slow(input[s]))
  46. break;
  47. // 2. Set s to s + 1.
  48. s++;
  49. }
  50. // 5. Let p be 0.
  51. size_t p = 0;
  52. // 6. While p < pattern’s length:
  53. while (p < pattern.size()) {
  54. // 1. Let maskedData be the result of applying the bitwise AND operator to input[s] and mask[p].
  55. u8 masked_data = input[s] & mask[p];
  56. // NOTE: This non-standard branch exists to avoid having to create 2 byte patterns just so that
  57. // they can only differ by their tag-terminating byte (which could be a 0x20 or 0x3E byte).
  58. if (is_tag_terminated && p + 1 == pattern.size())
  59. return is_tag_terminating_byte(masked_data);
  60. // 2. If maskedData is not equal to pattern[p], return false.
  61. if (masked_data != pattern[p])
  62. return false;
  63. // 3. Set s to s + 1.
  64. s++;
  65. // 4. Set p to p + 1.
  66. p++;
  67. }
  68. // 7. Return true.
  69. return true;
  70. }
  71. ReadonlyBytes constexpr no_ignored_bytes;
  72. // https://mimesniff.spec.whatwg.org/#matching-an-image-type-pattern
  73. ErrorOr<Optional<MimeType>> match_an_image_type_pattern(ReadonlyBytes input)
  74. {
  75. // 1. Execute the following steps for each row row in the following table:
  76. static Array<BytePatternTableRow, 8> constexpr pattern_table {
  77. // A Windows Icon signature.
  78. BytePatternTableRow { "\x00\x00\x01\x00"sv, "\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "image/x-icon"sv },
  79. // A Windows Cursor signature.
  80. BytePatternTableRow { "\x00\x00\x02\x00"sv, "\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "image/x-icon"sv },
  81. // The string "BM", a BMP signature.
  82. BytePatternTableRow { "\x42\x4D"sv, "\xFF\xFF"sv, no_ignored_bytes, "image/bmp"sv },
  83. // The string "GIF87a", a GIF signature.
  84. BytePatternTableRow { "\x47\x49\x46\x38\x37\x61"sv, "\xFF\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "image/gif"sv },
  85. // The string "GIF89a", a GIF signature.
  86. BytePatternTableRow { "\x47\x49\x46\x38\x39\x61"sv, "\xFF\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "image/gif"sv },
  87. // The string "RIFF" followed by four bytes followed by the string "WEBPVP".
  88. BytePatternTableRow { "\x52\x49\x46\x46\x00\x00\x00\x00\x57\x45\x42\x50\x56\x50"sv,
  89. "\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "image/webp"sv },
  90. // An error-checking byte followed by the string "PNG" followed by CR LF SUB LF, the PNG signature.
  91. BytePatternTableRow { "\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"sv, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "image/png"sv },
  92. // The JPEG Start of Image marker followed by the indicator byte of another marker.
  93. BytePatternTableRow { "\xFF\xD8\xFF"sv, "\xFF\xFF\xFF"sv, no_ignored_bytes, "image/jpeg"sv },
  94. };
  95. for (auto const& row : pattern_table) {
  96. // 1. Let patternMatched be the result of the pattern matching algorithm given input, the value in
  97. // the first column of row, the value in the second column of row, and the value in the third
  98. // column of row.
  99. auto pattern_matched = pattern_matching_algorithm(input, row.byte_pattern.bytes(), row.pattern_mask.bytes(), row.ignored_leading_bytes);
  100. // 2. If patternMatched is true, return the value in the fourth column of row.
  101. if (pattern_matched)
  102. return MimeType::parse(row.mime_type);
  103. }
  104. // 2. Return undefined.
  105. return OptionalNone {};
  106. }
  107. // https://mimesniff.spec.whatwg.org/#signature-for-mp4
  108. bool matches_mp4_signature(ReadonlyBytes sequence)
  109. {
  110. // 1. Let sequence be the byte sequence to be matched, where sequence[s] is byte s in sequence and sequence[0] is the first byte in sequence.
  111. // 2. Let length be the number of bytes in sequence.
  112. auto length = sequence.size();
  113. // 3. If length is less than 12, return false.
  114. if (length < 12)
  115. return false;
  116. // 4. Let box-size be the four bytes from sequence[0] to sequence[3], interpreted as a 32-bit unsigned big-endian integer.
  117. u32 box_size = 0;
  118. box_size |= static_cast<u32>(sequence[0] << 24);
  119. box_size |= static_cast<u32>(sequence[1] << 16);
  120. box_size |= static_cast<u32>(sequence[2] << 8);
  121. box_size |= sequence[3];
  122. // 5. If length is less than box-size or if box-size modulo 4 is not equal to 0, return false.
  123. if ((length < box_size) || (box_size % 4 != 0))
  124. return false;
  125. // 6. If the four bytes from sequence[4] to sequence[7] are not equal to 0x66 0x74 0x79 0x70 ("ftyp"), return false.
  126. if (sequence.slice(4, 4) != "\x66\x74\x79\x70"sv.bytes())
  127. return false;
  128. // 7. If the three bytes from sequence[8] to sequence[10] are equal to 0x6D 0x70 0x34 ("mp4"), return true.
  129. if (sequence.slice(8, 3) == "\x6D\x70\x34"sv.bytes())
  130. return true;
  131. // 8. Let bytes-read be 16.
  132. u32 bytes_read = 16;
  133. // 9. While bytes-read is less than box-size, continuously loop through these steps:
  134. // 1. If the three bytes from sequence[bytes-read] to sequence[bytes-read + 2] are equal to 0x6D 0x70 0x34 ("mp4"), return true.
  135. // 2. Increment bytes-read by 4.
  136. while (bytes_read < box_size) {
  137. if (sequence.slice(bytes_read, 3) == "\x6D\x70\x34"sv.bytes())
  138. return true;
  139. bytes_read += 4;
  140. }
  141. // 10. Return false.
  142. return false;
  143. }
  144. // https://mimesniff.spec.whatwg.org/#matching-an-audio-or-video-type-pattern
  145. ErrorOr<Optional<MimeType>> match_an_audio_or_video_type_pattern(ReadonlyBytes input)
  146. {
  147. // 1. Execute the following steps for each row row in the following table:
  148. static Array<BytePatternTableRow, 6> constexpr pattern_table {
  149. // The string "FORM" followed by four bytes followed by the string "AIFF", the AIFF signature.
  150. BytePatternTableRow { "\x46\x4F\x52\x4D\x00\x00\x00\x00\x41\x49\x46\x46"sv,
  151. "\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "audio/aiff"sv },
  152. // The string "ID3", the ID3v2-tagged MP3 signature.
  153. BytePatternTableRow { "\x49\x44\x33"sv, "\xFF\xFF\xFF"sv, no_ignored_bytes, "audio/mpeg"sv },
  154. // The string "OggS" followed by NUL, the Ogg container signature.
  155. BytePatternTableRow { "\x4F\x67\x67\x53\x00"sv, "\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/ogg"sv },
  156. // The string "MThd" followed by four bytes representing the number 6 in 32 bits (big-endian), the MIDI signature.
  157. BytePatternTableRow { "\x4D\x54\x68\x64\x00\x00\x00\x06"sv, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "audio/midi"sv },
  158. // The string "RIFF" followed by four bytes followed by the string "AVI ", the AVI signature.
  159. BytePatternTableRow { "\x52\x49\x46\x46\x00\x00\x00\x00\x41\x56\x49\x20"sv,
  160. "\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "video/avi"sv },
  161. // The string "RIFF" followed by four bytes followed by the string "WAVE", the WAVE signature.
  162. BytePatternTableRow { "\x52\x49\x46\x46\x00\x00\x00\x00\x57\x41\x56\x45"sv,
  163. "\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "audio/wave"sv }
  164. };
  165. for (auto const& row : pattern_table) {
  166. // 1. Let patternMatched be the result of the pattern matching algorithm given input, the
  167. // value in the first column of row, the value in the second column of row, and the
  168. // value in the third column of row.
  169. auto pattern_matched = pattern_matching_algorithm(input, row.byte_pattern.bytes(), row.pattern_mask.bytes(), row.ignored_leading_bytes);
  170. // 2. If patternMatched is true, return the value in the fourth column of row.
  171. if (pattern_matched)
  172. return MimeType::parse(row.mime_type);
  173. }
  174. // 2. If input matches the signature for MP4, return "video/mp4".
  175. if (matches_mp4_signature(input))
  176. return MimeType::create("video"_string, "mp4"_string);
  177. // FIXME: 3. If input matches the signature for WebM, return "video/webm".
  178. // FIXME: 4. If input matches the signature for MP3 without ID3, return "audio/mpeg".
  179. // 5. Return undefined.
  180. return OptionalNone {};
  181. }
  182. // https://mimesniff.spec.whatwg.org/#matching-a-font-type-pattern
  183. ErrorOr<Optional<MimeType>> match_a_font_type_pattern(ReadonlyBytes input)
  184. {
  185. // 1. Execute the following steps for each row row in the following table:
  186. static Array<BytePatternTableRow, 6> constexpr pattern_table {
  187. // 34 bytes followed by the string "LP", the Embedded OpenType signature.
  188. BytePatternTableRow {
  189. .byte_pattern = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x4C\x50"sv,
  190. .pattern_mask = "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF"sv,
  191. .ignored_leading_bytes = no_ignored_bytes,
  192. .mime_type = "application/vnd.ms-fontobject"sv,
  193. },
  194. // 4 bytes representing the version number 1.0, a TrueType signature.
  195. BytePatternTableRow { "\x00\x01\x00\x00"sv, "\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "font/ttf"sv },
  196. // The string "OTTO", the OpenType signature.
  197. BytePatternTableRow { "\x4F\x54\x54\x4F"sv, "\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "font/otf"sv },
  198. // The string "ttcf", the TrueType Collection signature.
  199. BytePatternTableRow { "\x74\x74\x63\x66"sv, "\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "font/collection"sv },
  200. // The string "wOFF", the Web Open Font Format 1.0 signature.
  201. BytePatternTableRow { "\x77\x4F\x46\x46"sv, "\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "font/woff"sv },
  202. // The string "wOF2", the Web Open Font Format 2.0 signature.
  203. BytePatternTableRow { "\x77\x4F\x46\x32"sv, "\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "font/woff2"sv },
  204. };
  205. for (auto const& row : pattern_table) {
  206. // 1. Let patternMatched be the result of the pattern matching algorithm given input, the
  207. // value in the first column of row, the value in the second column of row, and the
  208. // value in the third column of row.
  209. auto pattern_matched = pattern_matching_algorithm(input, row.byte_pattern.bytes(), row.pattern_mask.bytes(), row.ignored_leading_bytes);
  210. // 2. If patternMatched is true, return the value in the fourth column of row.
  211. if (pattern_matched)
  212. return MimeType::parse(row.mime_type);
  213. }
  214. // 2. Return undefined.
  215. return OptionalNone {};
  216. }
  217. // https://mimesniff.spec.whatwg.org/#matching-an-archive-type-pattern
  218. ErrorOr<Optional<MimeType>> match_an_archive_type_pattern(ReadonlyBytes input)
  219. {
  220. // 1. Execute the following steps for each row row in the following table:
  221. static Array<BytePatternTableRow, 3> constexpr pattern_table {
  222. // The GZIP archive signature.
  223. BytePatternTableRow { "\x1F\x8B\x08"sv, "\xFF\xFF\xFF"sv, no_ignored_bytes, "application/x-gzip"sv },
  224. // The string "PK" followed by ETX EOT, the ZIP archive signature.
  225. BytePatternTableRow { "\x50\x4B\x03\x04"sv, "\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/zip"sv },
  226. // The string "Rar " followed by SUB BEL NUL, the RAR archive signature.
  227. BytePatternTableRow { "\x52\x61\x72\x20\x1A\x07\x00"sv, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/x-rar-compressed"sv },
  228. };
  229. for (auto const& row : pattern_table) {
  230. // 1. Let patternMatched be the result of the pattern matching algorithm given input, the
  231. // value in the first column of row, the value in the second column of row, and the
  232. // value in the third column of row.
  233. auto pattern_matched = pattern_matching_algorithm(input, row.byte_pattern.bytes(), row.pattern_mask.bytes(), row.ignored_leading_bytes);
  234. // 2. If patternMatched is true, return the value in the fourth column of row.
  235. if (pattern_matched)
  236. return MimeType::parse(row.mime_type);
  237. }
  238. // 2. Return undefined.
  239. return OptionalNone {};
  240. }
  241. // https://mimesniff.spec.whatwg.org/#rules-for-identifying-an-unknown-mime-type
  242. ErrorOr<MimeType> rules_for_identifying_an_unknown_mime_type(Resource const& resource, bool sniff_scriptable = false)
  243. {
  244. // 1. If the sniff-scriptable flag is set, execute the following steps for each row row in the following table:
  245. if (sniff_scriptable) {
  246. static auto constexpr text_html_mime_type = "text/html"sv;
  247. // https://mimesniff.spec.whatwg.org/#whitespace-byte
  248. // A whitespace byte (abbreviated 0xWS) is any one of the following bytes: 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP).
  249. static Array<u8, 5> constexpr ignored_whitespace_bytes { 0x09, 0x0A, 0x0C, 0x0D, 0x20 };
  250. static Array<BytePatternTableRow, 19> constexpr pattern_table {
  251. // The case-insensitive string "<!DOCTYPE HTML" followed by a tag-terminating byte.
  252. BytePatternTableRow { "\x3C\x21\x44\x4F\x43\x54\x59\x50\x45\x20\x48\x54\x4D\x4C\x00"sv,
  253. "\xFF\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xDF\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  254. // The case-insensitive string "<HTML" followed by a tag-terminating byte.
  255. BytePatternTableRow { "\x3C\x48\x54\x4D\x4C\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  256. // The case-insensitive string "<HEAD" followed by a tag-terminating byte.
  257. BytePatternTableRow { "\x3C\x48\x45\x41\x44\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  258. // The case-insensitive string "<SCRIPT" followed by a tag-terminating byte.
  259. BytePatternTableRow { "\x3C\x53\x43\x52\x49\x50\x54\x00"sv,
  260. "\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  261. // The case-insensitive string "<IFRAME" followed by a tag-terminating byte.
  262. BytePatternTableRow { "\x3C\x49\x46\x52\x41\x4D\x45\x00"sv,
  263. "\xFF\xDF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  264. // The case-insensitive string "<H1" followed by a tag-terminating byte.
  265. BytePatternTableRow { "\x3C\x48\x31\x00"sv, "\xFF\xDF\xFF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  266. // The case-insensitive string "<DIV" followed by a tag-terminating byte.
  267. BytePatternTableRow { "\x3C\x44\x49\x56\x00"sv, "\xFF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  268. // The case-insensitive string "<FONT" followed by a tag-terminating byte.
  269. BytePatternTableRow { "\x3C\x46\x4F\x4E\x54\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  270. // The case-insensitive string "<TABLE" followed by a tag-terminating byte.
  271. BytePatternTableRow { "\x3C\x54\x41\x42\x4C\x45\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  272. // The case-insensitive string "<A" followed by a tag-terminating byte.
  273. BytePatternTableRow { "\x3C\x41\x00"sv, "\xFF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  274. // The case-insensitive string "<STYLE" followed by a tag-terminating byte.
  275. BytePatternTableRow { "\x3C\x53\x54\x59\x4C\x45\x00"sv,
  276. "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  277. // The case-insensitive string "<TITLE" followed by a tag-terminating byte.
  278. BytePatternTableRow { "\x3C\x54\x49\x54\x4C\x45\x00"sv,
  279. "\xFF\xDF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  280. // The case-insensitive string "<B" followed by a tag-terminating byte.
  281. BytePatternTableRow { "\x3C\x42\x00"sv, "\xFF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  282. // The case-insensitive string "<BODY" followed by a tag-terminating byte.
  283. BytePatternTableRow { "\x3C\x42\x4F\x44\x59\x00"sv, "\xFF\xDF\xDF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  284. // The case-insensitive string "<BR" followed by a tag-terminating byte.
  285. BytePatternTableRow { "\x3C\x42\x52\x00"sv, "\xFF\xDF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  286. // The case-insensitive string "<P" followed by a tag-terminating byte.
  287. BytePatternTableRow { "\x3C\x50\x00"sv, "\xFF\xDF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  288. // The string "<!--" followed by a tag-terminating byte.
  289. BytePatternTableRow { "\x3C\x21\x2D\x2D\x00"sv, "\xFF\xFF\xFF\xFF\xFF"sv, ignored_whitespace_bytes, text_html_mime_type, true },
  290. // The string "<?xml".
  291. BytePatternTableRow { "\x3C\x3F\x78\x6D\x6C"sv, "\xFF\xFF\xFF\xFF\xFF"sv, ignored_whitespace_bytes, "text/xml"sv },
  292. // The string "%PDF-", the PDF signature.
  293. BytePatternTableRow { "\x25\x50\x44\x46\x2D"sv, "\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/pdf"sv },
  294. };
  295. for (auto const& row : pattern_table) {
  296. // 1. Let patternMatched be the result of the pattern matching algorithm given resource’s resource header,
  297. // the value in the first column of row, the value in the second column of row, and the value in the
  298. // third column of row.
  299. auto pattern_matched = pattern_matching_algorithm(resource.resource_header(), row.byte_pattern.bytes(), row.pattern_mask.bytes(), row.ignored_leading_bytes, row.is_tag_terminated);
  300. // 2. If patternMatched is true, return the value in the fourth column of row.
  301. if (pattern_matched) {
  302. if (auto maybe_type = TRY(MimeType::parse(row.mime_type)); maybe_type.has_value())
  303. return maybe_type.release_value();
  304. }
  305. }
  306. }
  307. // 2. Execute the following steps for each row row in the following table:
  308. static auto constexpr text_plain_mime_type = "text/plain"sv;
  309. static Array<BytePatternTableRow, 4> constexpr pattern_table {
  310. // The string "%!PS-Adobe-", the PostScript signature.
  311. BytePatternTableRow { "\x25\x21\x50\x53\x2D\x41\x64\x6F\x62\x65\x2D"sv,
  312. "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"sv, no_ignored_bytes, "application/postscript"sv },
  313. // UTF-16BE BOM
  314. BytePatternTableRow { "\xFE\xFF\x00\x00"sv, "\xFF\xFF\x00\x00"sv, no_ignored_bytes, text_plain_mime_type },
  315. // UTF-16LE BOM
  316. BytePatternTableRow { "\xFF\xFE\x00\x00"sv, "\xFF\xFF\x00\x00"sv, no_ignored_bytes, text_plain_mime_type },
  317. // UTF-8 BOM
  318. BytePatternTableRow { "\xEF\xBB\xBF\x00"sv, "\xFF\xFF\xFF\x00"sv, no_ignored_bytes, text_plain_mime_type },
  319. };
  320. for (auto const& row : pattern_table) {
  321. // 1. Let patternMatched be the result of the pattern matching algorithm given resource’s resource header,
  322. // the value in the first column of row, the value in the second column of row, and the value in the
  323. // third column of row.
  324. auto pattern_matched = pattern_matching_algorithm(resource.resource_header(), row.byte_pattern.bytes(), row.pattern_mask.bytes(), row.ignored_leading_bytes);
  325. // 2. If patternMatched is true, return the value in the fourth column of row.
  326. if (pattern_matched) {
  327. if (auto maybe_type = TRY(MimeType::parse(row.mime_type)); maybe_type.has_value())
  328. return maybe_type.release_value();
  329. }
  330. }
  331. // 3. Let matchedType be the result of executing the image type pattern matching algorithm given resource’s resource header.
  332. auto matched_type = TRY(match_an_image_type_pattern(resource.resource_header()));
  333. // 4. If matchedType is not undefined, return matchedType.
  334. if (matched_type.has_value())
  335. return matched_type.release_value();
  336. // 5. Set matchedType to the result of executing the audio or video type pattern matching algorithm given resource’s resource header.
  337. matched_type = TRY(match_an_audio_or_video_type_pattern(resource.resource_header()));
  338. // 6. If matchedType is not undefined, return matchedType.
  339. if (matched_type.has_value())
  340. return matched_type.release_value();
  341. // 7. Set matchedType to the result of executing the archive type pattern matching algorithm given resource’s resource header.
  342. matched_type = TRY(match_an_archive_type_pattern(resource.resource_header()));
  343. // 8. If matchedType is not undefined, return matchedType.
  344. if (matched_type.has_value())
  345. return matched_type.release_value();
  346. // 9. If resource’s resource header contains no binary data bytes, return "text/plain".
  347. if (!any_of(resource.resource_header(), is_binary_data_byte))
  348. return MimeType::create("text"_string, "plain"_string);
  349. // 10. Return "application/octet-stream".
  350. return MimeType::create("application"_string, "octet-stream"_string);
  351. }
  352. }
  353. namespace Web::MimeSniff {
  354. ErrorOr<Resource> Resource::create(ReadonlyBytes data, SniffingConfiguration configuration)
  355. {
  356. // NOTE: Non-standard but for cases where pattern matching fails, let's fall back to the safest MIME type.
  357. auto default_computed_mime_type = TRY(MimeType::create("application"_string, "octet-stream"_string));
  358. auto resource = Resource { data, configuration.no_sniff, move(default_computed_mime_type) };
  359. TRY(resource.supplied_mime_type_detection_algorithm(configuration.scheme, move(configuration.supplied_type)));
  360. TRY(resource.context_specific_sniffing_algorithm(configuration.sniffing_context));
  361. return resource;
  362. }
  363. ErrorOr<MimeType> Resource::sniff(ReadonlyBytes data, SniffingConfiguration configuration)
  364. {
  365. auto resource = TRY(create(data, move(configuration)));
  366. return move(resource.m_computed_mime_type);
  367. }
  368. Resource::Resource(ReadonlyBytes data, bool no_sniff, MimeType&& default_computed_mime_type)
  369. : m_no_sniff(no_sniff)
  370. , m_computed_mime_type(move(default_computed_mime_type))
  371. {
  372. read_the_resource_header(data);
  373. }
  374. Resource::~Resource() = default;
  375. // https://mimesniff.spec.whatwg.org/#supplied-mime-type-detection-algorithm
  376. // NOTE: Parameters are non-standard.
  377. ErrorOr<void> Resource::supplied_mime_type_detection_algorithm(StringView scheme, Optional<MimeType> supplied_type)
  378. {
  379. // 1. Let supplied-type be null.
  380. // 2. If the resource is retrieved via HTTP, execute the following steps:
  381. // 1. If one or more Content-Type headers are associated with the resource, execute the following steps:
  382. // 1. Set supplied-type to the value of the last Content-Type header associated with the resource.
  383. // 2. Set the check-for-apache-bug flag if supplied-type is exactly equal to one of the values in the following table:
  384. // NOTE: Non-standard but this algorithm expects the caller to handle step 2.1.1.
  385. if (supplied_type.has_value()) {
  386. if (Fetch::Infrastructure::is_http_or_https_scheme(scheme)) {
  387. // NOTE: The spec expects a space between the semicolon and the start of the charset parameter. However, we will lose this
  388. // space because MimeType::parse() ignores any spaces found there.
  389. static Array<StringView, 4> constexpr apache_bug_mime_types = {
  390. "text/plain"sv,
  391. "text/plain;charset=ISO-8859-1"sv,
  392. "text/plain;charset=iso-8859-1"sv,
  393. "text/plain;charset=UTF-8"sv
  394. };
  395. auto serialized_supplied_type = TRY(supplied_type->serialized());
  396. for (auto apache_bug_mime_type : apache_bug_mime_types) {
  397. if (serialized_supplied_type == apache_bug_mime_type) {
  398. m_check_for_apache_bug_flag = true;
  399. break;
  400. }
  401. }
  402. }
  403. }
  404. // 3. If the resource is retrieved directly from the file system, set supplied-type
  405. // to the MIME type provided by the file system.
  406. // 4. If the resource is retrieved via another protocol (such as FTP), set
  407. // supplied-type to the MIME type as determined by that protocol, if any.
  408. // 5. If supplied-type is not a MIME type, the supplied MIME type is undefined.
  409. // Abort these steps.
  410. // 6. The supplied MIME type is supplied-type.
  411. // NOTE: The expectation is for the caller to handle these spec steps.
  412. m_supplied_mime_type = supplied_type;
  413. return {};
  414. }
  415. // https://mimesniff.spec.whatwg.org/#read-the-resource-header
  416. void Resource::read_the_resource_header(ReadonlyBytes data)
  417. {
  418. // 1. Let buffer be a byte sequence.
  419. ByteBuffer buffer;
  420. // 2. Read bytes of the resource into buffer until one of the following conditions is met:
  421. // - the end of the resource is reached.
  422. // - the number of bytes in buffer is greater than or equal to 1445.
  423. // - a reasonable amount of time has elapsed, as determined by the user agent.
  424. // FIXME: The spec expects us to be reading from a stream. Reimplement this spec step once
  425. // we have greater support for streaming in areas that calls on this API.
  426. static size_t constexpr MAX_SNIFF_SIZE = 1445;
  427. buffer.append(data.slice(0, min(data.size(), MAX_SNIFF_SIZE)));
  428. // 3. The resource header is buffer.
  429. m_resource_header = move(buffer);
  430. }
  431. // https://mimesniff.spec.whatwg.org/#mime-type-sniffing-algorithm
  432. ErrorOr<void> Resource::mime_type_sniffing_algorithm()
  433. {
  434. // 1. If the supplied MIME type is undefined or if the supplied MIME type’s essence
  435. // is "unknown/unknown", "application/unknown", or "*/*", execute the rules for
  436. // identifying an unknown MIME type with the sniff-scriptable flag equal to the
  437. // inverse of the no-sniff flag and abort these steps.
  438. if (!m_supplied_mime_type.has_value() || m_supplied_mime_type->essence().is_one_of("unknown/unknown", "application/unknown", "*/*")) {
  439. m_computed_mime_type = TRY(rules_for_identifying_an_unknown_mime_type(*this, !m_no_sniff));
  440. return {};
  441. }
  442. // 2. If the no-sniff flag is set, the computed MIME type is the supplied MIME type.
  443. // Abort these steps.
  444. if (m_no_sniff) {
  445. m_computed_mime_type = m_supplied_mime_type.value();
  446. return {};
  447. }
  448. // 3. If the check-for-apache-bug flag is set, execute the rules for distinguishing
  449. // if a resource is text or binary and abort these steps.
  450. if (m_check_for_apache_bug_flag) {
  451. TRY(rules_for_distinguishing_if_a_resource_is_text_or_binary());
  452. return {};
  453. }
  454. // 4. If the supplied MIME type is an XML MIME type, the computed MIME type is the supplied MIME type.
  455. // Abort these steps.
  456. if (m_supplied_mime_type->is_xml()) {
  457. m_computed_mime_type = m_supplied_mime_type.value();
  458. return {};
  459. }
  460. // 5. If the supplied MIME type’s essence is "text/html", execute the rules for distinguishing if a
  461. // resource is a feed or HTML and abort these steps.
  462. if (m_supplied_mime_type->essence() == "text/html") {
  463. // FIXME: Execute the rules for distinguishing if a resource is a feed or HTML and abort these steps.
  464. return {};
  465. }
  466. // FIXME: 6. If the supplied MIME type is an image MIME type supported by the user agent, let matched-type be
  467. // the result of executing the image type pattern matching algorithm with the resource header as
  468. // the byte sequence to be matched.
  469. Optional<MimeType> matched_type;
  470. // 7. If matched-type is not undefined, the computed MIME type is matched-type.
  471. // Abort these steps.
  472. if (matched_type.has_value()) {
  473. m_computed_mime_type = matched_type.release_value();
  474. return {};
  475. }
  476. // FIXME: 8. If the supplied MIME type is an audio or video MIME type supported by the user agent, let matched-type be
  477. // the result of executing the audio or video type pattern matching algorithm with the resource header as
  478. // the byte sequence to be matched.
  479. // 9. If matched-type is not undefined, the computed MIME type is matched-type.
  480. // Abort these steps.
  481. if (matched_type.has_value()) {
  482. m_computed_mime_type = matched_type.release_value();
  483. return {};
  484. }
  485. // 10. The computed MIME type is the supplied MIME type.
  486. m_computed_mime_type = m_supplied_mime_type.value();
  487. return {};
  488. }
  489. // https://mimesniff.spec.whatwg.org/#sniffing-a-mislabeled-binary-resource
  490. ErrorOr<void> Resource::rules_for_distinguishing_if_a_resource_is_text_or_binary()
  491. {
  492. // 1. Let length be the number of bytes in the resource header.
  493. auto length = m_resource_header.size();
  494. // 2. If length is greater than or equal to 2 and the first 2 bytes of the
  495. // resource header are equal to 0xFE 0xFF (UTF-16BE BOM) or 0xFF 0xFE (UTF-16LE BOM), the computed MIME type is "text/plain".
  496. // Abort these steps.
  497. auto resource_header_span = m_resource_header.span();
  498. auto utf_16_be_bom = "\xFE\xFF"sv.bytes();
  499. auto utf_16_le_bom = "\xFF\xFE"sv.bytes();
  500. if (length >= 2
  501. && (resource_header_span.starts_with(utf_16_be_bom)
  502. || resource_header_span.starts_with(utf_16_le_bom))) {
  503. m_computed_mime_type = TRY(MimeType::create("text"_string, "plain"_string));
  504. return {};
  505. }
  506. // 3. If length is greater than or equal to 3 and the first 3 bytes of the resource header are equal to 0xEF 0xBB 0xBF (UTF-8 BOM),
  507. // the computed MIME type is "text/plain".
  508. // Abort these steps.
  509. auto utf_8_bom = "\xEF\xBB\xBF"sv.bytes();
  510. if (length >= 3 && resource_header_span.starts_with(utf_8_bom)) {
  511. m_computed_mime_type = TRY(MimeType::create("text"_string, "plain"_string));
  512. return {};
  513. }
  514. // 4. If the resource header contains no binary data bytes, the computed MIME type is "text/plain".
  515. // Abort these steps.
  516. if (!any_of(resource_header(), is_binary_data_byte)) {
  517. m_computed_mime_type = TRY(MimeType::create("text"_string, "plain"_string));
  518. return {};
  519. }
  520. // 5. The computed MIME type is "application/octet-stream".
  521. // NOTE: This is the default MIME type of the computed MIME type.
  522. return {};
  523. }
  524. // https://mimesniff.spec.whatwg.org/#context-specific-sniffing-algorithm
  525. ErrorOr<void> Resource::context_specific_sniffing_algorithm(SniffingContext sniffing_context)
  526. {
  527. // A context-specific sniffing algorithm determines the computed MIME type of a resource only if
  528. // the resource is a MIME type relevant to a particular context.
  529. if (sniffing_context == SniffingContext::None || sniffing_context == SniffingContext::Browsing) {
  530. // https://mimesniff.spec.whatwg.org/#sniffing-in-a-browsing-context
  531. // Use the MIME type sniffing algorithm.
  532. return mime_type_sniffing_algorithm();
  533. }
  534. // NOTE: Non-standard but if the client expects us to not sniff, we shouldn't be doing any
  535. // context-specific sniffing if we don't have to.
  536. if (m_no_sniff && m_supplied_mime_type.has_value()) {
  537. m_computed_mime_type = m_supplied_mime_type.value();
  538. return {};
  539. }
  540. if (sniffing_context == SniffingContext::Image)
  541. return rules_for_sniffing_images_specifically();
  542. if (sniffing_context == SniffingContext::AudioOrVideo)
  543. return rules_for_sniffing_audio_or_video_specifically();
  544. if (sniffing_context == SniffingContext::Font)
  545. return rules_for_sniffing_fonts_specifically();
  546. if (sniffing_context == SniffingContext::TextOrBinary)
  547. return rules_for_distinguishing_if_a_resource_is_text_or_binary();
  548. return {};
  549. }
  550. // https://mimesniff.spec.whatwg.org/#sniffing-in-an-image-context
  551. ErrorOr<void> Resource::rules_for_sniffing_images_specifically()
  552. {
  553. // 1. If the supplied MIME type is an XML MIME type, the computed MIME type is the supplied MIME type.
  554. // Abort these steps.
  555. // NOTE: Non-standard but due to the mime type detection algorithm we need this sanity check.
  556. if (m_supplied_mime_type.has_value() && m_supplied_mime_type->is_xml()) {
  557. m_computed_mime_type = m_supplied_mime_type.value();
  558. return {};
  559. }
  560. // 2. Let image-type-matched be the result of executing the image type pattern matching algorithm with
  561. // the resource header as the byte sequence to be matched.
  562. auto image_type_matched = TRY(match_an_image_type_pattern(resource_header()));
  563. // 3. If image-type-matched is not undefined, the computed MIME type is image-type-matched.
  564. // Abort these steps.
  565. if (image_type_matched.has_value()) {
  566. m_computed_mime_type = image_type_matched.release_value();
  567. return {};
  568. }
  569. // 4. The computed MIME type is the supplied MIME type.
  570. // NOTE: Non-standard but due to the mime type detection algorithm we need this sanity check.
  571. if (m_supplied_mime_type.has_value()) {
  572. m_computed_mime_type = m_supplied_mime_type.value();
  573. }
  574. // NOTE: Non-standard but if the supplied mime type is undefined, we use computed mime type's default value.
  575. return {};
  576. }
  577. // https://mimesniff.spec.whatwg.org/#sniffing-in-an-audio-or-video-context
  578. ErrorOr<void> Resource::rules_for_sniffing_audio_or_video_specifically()
  579. {
  580. // 1. If the supplied MIME type is an XML MIME type, the computed MIME type is the supplied MIME type.
  581. // Abort these steps.
  582. // NOTE: Non-standard but due to the mime type detection algorithm we need this sanity check.
  583. if (m_supplied_mime_type.has_value() && m_supplied_mime_type->is_xml()) {
  584. m_computed_mime_type = m_supplied_mime_type.value();
  585. return {};
  586. }
  587. // 2. Let audio-or-video-type-matched be the result of executing the audio or video type pattern matching
  588. // algorithm with the resource header as the byte sequence to be matched.
  589. auto audio_or_video_type_matched = TRY(match_an_audio_or_video_type_pattern(resource_header()));
  590. // 3. If audio-or-video-type-matched is not undefined, the computed MIME type is audio-or-video-type-matched.
  591. // Abort these steps.
  592. if (audio_or_video_type_matched.has_value()) {
  593. m_computed_mime_type = audio_or_video_type_matched.release_value();
  594. return {};
  595. }
  596. // 4. The computed MIME type is the supplied MIME type.
  597. // NOTE: Non-standard but due to the mime type detection algorithm we need this sanity check.
  598. if (m_supplied_mime_type.has_value()) {
  599. m_computed_mime_type = m_supplied_mime_type.value();
  600. }
  601. // NOTE: Non-standard but if the supplied mime type is undefined, we use computed mime type's default value.
  602. return {};
  603. }
  604. // https://mimesniff.spec.whatwg.org/#sniffing-in-a-font-context
  605. ErrorOr<void> Resource::rules_for_sniffing_fonts_specifically()
  606. {
  607. // 1. If the supplied MIME type is an XML MIME type, the computed MIME type is the supplied MIME type.
  608. // Abort these steps.
  609. // NOTE: Non-standard but due to the mime type detection algorithm we need this sanity check.
  610. if (m_supplied_mime_type.has_value() && m_supplied_mime_type->is_xml()) {
  611. m_computed_mime_type = m_supplied_mime_type.value();
  612. return {};
  613. }
  614. // 2. Let font-type-matched be the result of executing the font type pattern matching algorithm with the
  615. // resource header as the byte sequence to be matched.
  616. auto font_type_matched = TRY(match_a_font_type_pattern(resource_header()));
  617. // 3. If font-type-matched is not undefined, the computed MIME type is font-type-matched.
  618. // Abort these steps.
  619. if (font_type_matched.has_value()) {
  620. m_computed_mime_type = font_type_matched.release_value();
  621. return {};
  622. }
  623. // 4. The computed MIME type is the supplied MIME type.
  624. // NOTE: Non-standard but due to the mime type detection algorithm we need this sanity check.
  625. if (m_supplied_mime_type.has_value()) {
  626. m_computed_mime_type = m_supplied_mime_type.value();
  627. }
  628. // NOTE: Non-standard but if the supplied mime type is undefined, we use computed mime type's default value.
  629. return {};
  630. }
  631. }