URL.cpp 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <andreas@ladybird.org>
  3. * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
  4. * Copyright (c) 2024, Sam Atkins <sam@ladybird.org>
  5. *
  6. * SPDX-License-Identifier: BSD-2-Clause
  7. */
  8. #include <AK/Base64.h>
  9. #include <AK/CharacterTypes.h>
  10. #include <AK/Debug.h>
  11. #include <AK/LexicalPath.h>
  12. #include <AK/StringBuilder.h>
  13. #include <AK/Utf8View.h>
  14. #include <LibURL/Parser.h>
  15. #include <LibURL/URL.h>
  16. #if defined(ENABLE_PUBLIC_SUFFIX)
  17. # include <LibURL/PublicSuffixData.h>
  18. #endif
  19. namespace URL {
  20. // FIXME: It could make sense to force users of URL to use URL::Parser::basic_parse() explicitly instead of using a constructor.
  21. URL::URL(StringView string)
  22. : URL(Parser::basic_parse(string))
  23. {
  24. if constexpr (URL_PARSER_DEBUG) {
  25. if (m_data->valid)
  26. dbgln("URL constructor: Parsed URL to be '{}'.", serialize());
  27. else
  28. dbgln("URL constructor: Parsed URL to be invalid.");
  29. }
  30. }
  31. URL URL::complete_url(StringView relative_url) const
  32. {
  33. if (!is_valid())
  34. return {};
  35. return Parser::basic_parse(relative_url, *this);
  36. }
  37. ByteString URL::path_segment_at_index(size_t index) const
  38. {
  39. VERIFY(index < path_segment_count());
  40. return percent_decode(m_data->paths[index]);
  41. }
  42. ByteString URL::basename() const
  43. {
  44. if (!m_data->valid)
  45. return {};
  46. if (m_data->paths.is_empty())
  47. return {};
  48. auto& last_segment = m_data->paths.last();
  49. return percent_decode(last_segment);
  50. }
  51. void URL::set_scheme(String scheme)
  52. {
  53. m_data->scheme = move(scheme);
  54. m_data->valid = compute_validity();
  55. }
  56. // https://url.spec.whatwg.org/#set-the-username
  57. void URL::set_username(StringView username)
  58. {
  59. // To set the username given a url and username, set url’s username to the result of running UTF-8 percent-encode on username using the userinfo percent-encode set.
  60. m_data->username = percent_encode(username, PercentEncodeSet::Userinfo);
  61. m_data->valid = compute_validity();
  62. }
  63. // https://url.spec.whatwg.org/#set-the-password
  64. void URL::set_password(StringView password)
  65. {
  66. // To set the password given a url and password, set url’s password to the result of running UTF-8 percent-encode on password using the userinfo percent-encode set.
  67. m_data->password = percent_encode(password, PercentEncodeSet::Userinfo);
  68. m_data->valid = compute_validity();
  69. }
  70. void URL::set_host(Host host)
  71. {
  72. m_data->host = move(host);
  73. m_data->valid = compute_validity();
  74. }
  75. // https://url.spec.whatwg.org/#concept-host-serializer
  76. ErrorOr<String> URL::serialized_host() const
  77. {
  78. return Parser::serialize_host(m_data->host);
  79. }
  80. void URL::set_port(Optional<u16> port)
  81. {
  82. if (port == default_port_for_scheme(m_data->scheme)) {
  83. m_data->port = {};
  84. return;
  85. }
  86. m_data->port = move(port);
  87. m_data->valid = compute_validity();
  88. }
  89. void URL::set_paths(Vector<ByteString> const& paths)
  90. {
  91. m_data->paths.clear_with_capacity();
  92. m_data->paths.ensure_capacity(paths.size());
  93. for (auto const& segment : paths)
  94. m_data->paths.unchecked_append(percent_encode(segment, PercentEncodeSet::Path));
  95. m_data->valid = compute_validity();
  96. }
  97. void URL::append_path(StringView path)
  98. {
  99. m_data->paths.append(percent_encode(path, PercentEncodeSet::Path));
  100. }
  101. // https://url.spec.whatwg.org/#cannot-have-a-username-password-port
  102. bool URL::cannot_have_a_username_or_password_or_port() const
  103. {
  104. // A URL cannot have a username/password/port if its host is null or the empty string, or its scheme is "file".
  105. return m_data->host.has<Empty>() || m_data->host == String {} || m_data->scheme == "file"sv;
  106. }
  107. // FIXME: This is by no means complete.
  108. // NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong.
  109. bool URL::compute_validity() const
  110. {
  111. if (m_data->scheme.is_empty())
  112. return false;
  113. if (m_data->cannot_be_a_base_url) {
  114. if (m_data->paths.size() != 1)
  115. return false;
  116. if (m_data->paths[0].is_empty())
  117. return false;
  118. } else {
  119. if (m_data->scheme.is_one_of("about", "mailto"))
  120. return false;
  121. // NOTE: Maybe it is allowed to have a zero-segment path.
  122. if (m_data->paths.size() == 0)
  123. return false;
  124. }
  125. // NOTE: A file URL's host should be the empty string for localhost, not null.
  126. if (m_data->scheme == "file" && m_data->host.has<Empty>())
  127. return false;
  128. return true;
  129. }
  130. // https://url.spec.whatwg.org/#default-port
  131. Optional<u16> default_port_for_scheme(StringView scheme)
  132. {
  133. // Spec defined mappings with port:
  134. if (scheme == "ftp")
  135. return 21;
  136. if (scheme == "http")
  137. return 80;
  138. if (scheme == "https")
  139. return 443;
  140. if (scheme == "ws")
  141. return 80;
  142. if (scheme == "wss")
  143. return 443;
  144. // NOTE: not in spec, but we support these too
  145. if (scheme == "irc")
  146. return 6667;
  147. if (scheme == "ircs")
  148. return 6697;
  149. return {};
  150. }
  151. URL create_with_file_scheme(ByteString const& path, ByteString const& fragment, ByteString const& hostname)
  152. {
  153. LexicalPath lexical_path(path);
  154. if (!lexical_path.is_absolute())
  155. return {};
  156. URL url;
  157. url.set_scheme("file"_string);
  158. url.set_host(hostname == "localhost" ? String {} : String::from_byte_string(hostname).release_value_but_fixme_should_propagate_errors());
  159. url.set_paths(lexical_path.parts());
  160. if (path.ends_with('/'))
  161. url.append_slash();
  162. if (!fragment.is_empty())
  163. url.set_fragment(String::from_byte_string(fragment).release_value_but_fixme_should_propagate_errors());
  164. return url;
  165. }
  166. URL create_with_url_or_path(ByteString const& url_or_path)
  167. {
  168. URL url = url_or_path;
  169. if (url.is_valid())
  170. return url;
  171. ByteString path = LexicalPath::canonicalized_path(url_or_path);
  172. return create_with_file_scheme(path);
  173. }
  174. URL create_with_data(StringView mime_type, StringView payload, bool is_base64)
  175. {
  176. URL url;
  177. url.set_cannot_be_a_base_url(true);
  178. url.set_scheme("data"_string);
  179. StringBuilder builder;
  180. builder.append(mime_type);
  181. if (is_base64)
  182. builder.append(";base64"sv);
  183. builder.append(',');
  184. builder.append(payload);
  185. url.set_paths({ builder.to_byte_string() });
  186. return url;
  187. }
  188. // https://url.spec.whatwg.org/#special-scheme
  189. bool is_special_scheme(StringView scheme)
  190. {
  191. return scheme.is_one_of("ftp", "file", "http", "https", "ws", "wss");
  192. }
  193. // https://url.spec.whatwg.org/#url-path-serializer
  194. String URL::serialize_path() const
  195. {
  196. // 1. If url has an opaque path, then return url’s path.
  197. // FIXME: Reimplement this step once we modernize the URL implementation to meet the spec.
  198. if (cannot_be_a_base_url())
  199. return m_data->paths[0];
  200. // 2. Let output be the empty string.
  201. StringBuilder output;
  202. // 3. For each segment of url’s path: append U+002F (/) followed by segment to output.
  203. for (auto const& segment : m_data->paths) {
  204. output.append('/');
  205. output.append(segment);
  206. }
  207. // 4. Return output.
  208. return output.to_string_without_validation();
  209. }
  210. // https://url.spec.whatwg.org/#concept-url-serializer
  211. ByteString URL::serialize(ExcludeFragment exclude_fragment) const
  212. {
  213. // 1. Let output be url’s scheme and U+003A (:) concatenated.
  214. StringBuilder output;
  215. output.append(m_data->scheme);
  216. output.append(':');
  217. // 2. If url’s host is non-null:
  218. if (!m_data->host.has<Empty>()) {
  219. // 1. Append "//" to output.
  220. output.append("//"sv);
  221. // 2. If url includes credentials, then:
  222. if (includes_credentials()) {
  223. // 1. Append url’s username to output.
  224. output.append(m_data->username);
  225. // 2. If url’s password is not the empty string, then append U+003A (:), followed by url’s password, to output.
  226. if (!m_data->password.is_empty()) {
  227. output.append(':');
  228. output.append(m_data->password);
  229. }
  230. // 3. Append U+0040 (@) to output.
  231. output.append('@');
  232. }
  233. // 3. Append url’s host, serialized, to output.
  234. output.append(serialized_host().release_value_but_fixme_should_propagate_errors());
  235. // 4. If url’s port is non-null, append U+003A (:) followed by url’s port, serialized, to output.
  236. if (m_data->port.has_value())
  237. output.appendff(":{}", *m_data->port);
  238. }
  239. // 3. If url’s host is null, url does not have an opaque path, url’s path’s size is greater than 1, and url’s path[0] is the empty string, then append U+002F (/) followed by U+002E (.) to output.
  240. // 4. Append the result of URL path serializing url to output.
  241. // FIXME: Implement this closer to spec steps.
  242. if (cannot_be_a_base_url()) {
  243. output.append(m_data->paths[0]);
  244. } else {
  245. if (m_data->host.has<Empty>() && m_data->paths.size() > 1 && m_data->paths[0].is_empty())
  246. output.append("/."sv);
  247. for (auto& segment : m_data->paths) {
  248. output.append('/');
  249. output.append(segment);
  250. }
  251. }
  252. // 5. If url’s query is non-null, append U+003F (?), followed by url’s query, to output.
  253. if (m_data->query.has_value()) {
  254. output.append('?');
  255. output.append(*m_data->query);
  256. }
  257. // 6. If exclude fragment is false and url’s fragment is non-null, then append U+0023 (#), followed by url’s fragment, to output.
  258. if (exclude_fragment == ExcludeFragment::No && m_data->fragment.has_value()) {
  259. output.append('#');
  260. output.append(*m_data->fragment);
  261. }
  262. // 7. Return output.
  263. return output.to_byte_string();
  264. }
  265. // https://url.spec.whatwg.org/#url-rendering
  266. // NOTE: This does e.g. not display credentials.
  267. // FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points
  268. // resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible.
  269. ByteString URL::serialize_for_display() const
  270. {
  271. VERIFY(m_data->valid);
  272. StringBuilder builder;
  273. builder.append(m_data->scheme);
  274. builder.append(':');
  275. if (!m_data->host.has<Empty>()) {
  276. builder.append("//"sv);
  277. builder.append(serialized_host().release_value_but_fixme_should_propagate_errors());
  278. if (m_data->port.has_value())
  279. builder.appendff(":{}", *m_data->port);
  280. }
  281. if (cannot_be_a_base_url()) {
  282. builder.append(m_data->paths[0]);
  283. } else {
  284. if (m_data->host.has<Empty>() && m_data->paths.size() > 1 && m_data->paths[0].is_empty())
  285. builder.append("/."sv);
  286. for (auto& segment : m_data->paths) {
  287. builder.append('/');
  288. builder.append(segment);
  289. }
  290. }
  291. if (m_data->query.has_value()) {
  292. builder.append('?');
  293. builder.append(*m_data->query);
  294. }
  295. if (m_data->fragment.has_value()) {
  296. builder.append('#');
  297. builder.append(*m_data->fragment);
  298. }
  299. return builder.to_byte_string();
  300. }
  301. ErrorOr<String> URL::to_string() const
  302. {
  303. return String::from_byte_string(serialize());
  304. }
  305. // https://url.spec.whatwg.org/#concept-url-origin
  306. Origin URL::origin() const
  307. {
  308. // The origin of a URL url is the origin returned by running these steps, switching on url’s scheme:
  309. // -> "blob"
  310. if (scheme() == "blob"sv) {
  311. auto url_string = to_string().release_value_but_fixme_should_propagate_errors();
  312. // 1. If url’s blob URL entry is non-null, then return url’s blob URL entry’s environment’s origin.
  313. if (blob_url_entry().has_value())
  314. return blob_url_entry()->environment_origin;
  315. // 2. Let pathURL be the result of parsing the result of URL path serializing url.
  316. auto path_url = Parser::basic_parse(serialize_path());
  317. // 3. If pathURL is failure, then return a new opaque origin.
  318. if (!path_url.is_valid())
  319. return Origin {};
  320. // 4. If pathURL’s scheme is "http", "https", or "file", then return pathURL’s origin.
  321. if (path_url.scheme().is_one_of("http"sv, "https"sv, "file"sv))
  322. return path_url.origin();
  323. // 5. Return a new opaque origin.
  324. return Origin {};
  325. }
  326. // -> "ftp"
  327. // -> "http"
  328. // -> "https"
  329. // -> "ws"
  330. // -> "wss"
  331. if (scheme().is_one_of("ftp"sv, "http"sv, "https"sv, "ws"sv, "wss"sv)) {
  332. // Return the tuple origin (url’s scheme, url’s host, url’s port, null).
  333. return Origin(scheme().to_byte_string(), host(), port());
  334. }
  335. // -> "file"
  336. // AD-HOC: Our resource:// is basically an alias to file://
  337. if (scheme() == "file"sv || scheme() == "resource"sv) {
  338. // Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin.
  339. // Note: We must return an origin with the `file://' protocol for `file://' iframes to work from `file://' pages.
  340. return Origin(scheme().to_byte_string(), String {}, {});
  341. }
  342. // -> Otherwise
  343. // Return a new opaque origin.
  344. return Origin {};
  345. }
  346. bool URL::equals(URL const& other, ExcludeFragment exclude_fragments) const
  347. {
  348. if (this == &other)
  349. return true;
  350. if (!m_data->valid || !other.m_data->valid)
  351. return false;
  352. return serialize(exclude_fragments) == other.serialize(exclude_fragments);
  353. }
  354. void append_percent_encoded(StringBuilder& builder, u32 code_point)
  355. {
  356. if (code_point <= 0x7f)
  357. builder.appendff("%{:02X}", code_point);
  358. else if (code_point <= 0x07ff)
  359. builder.appendff("%{:02X}%{:02X}", ((code_point >> 6) & 0x1f) | 0xc0, (code_point & 0x3f) | 0x80);
  360. else if (code_point <= 0xffff)
  361. builder.appendff("%{:02X}%{:02X}%{:02X}", ((code_point >> 12) & 0x0f) | 0xe0, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
  362. else if (code_point <= 0x10ffff)
  363. builder.appendff("%{:02X}%{:02X}%{:02X}%{:02X}", ((code_point >> 18) & 0x07) | 0xf0, ((code_point >> 12) & 0x3f) | 0x80, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
  364. else
  365. VERIFY_NOT_REACHED();
  366. }
  367. // https://url.spec.whatwg.org/#c0-control-percent-encode-set
  368. bool code_point_is_in_percent_encode_set(u32 code_point, PercentEncodeSet set)
  369. {
  370. // NOTE: Once we've checked for presence in the C0Control set, we know that the code point is
  371. // a valid ASCII character in the range 0x20..0x7E, so we can safely cast it to char.
  372. switch (set) {
  373. case PercentEncodeSet::C0Control:
  374. return code_point < 0x20 || code_point > 0x7E;
  375. case PercentEncodeSet::Fragment:
  376. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::C0Control) || " \"<>`"sv.contains(static_cast<char>(code_point));
  377. case PercentEncodeSet::Query:
  378. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::C0Control) || " \"#<>"sv.contains(static_cast<char>(code_point));
  379. case PercentEncodeSet::SpecialQuery:
  380. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Query) || code_point == '\'';
  381. case PercentEncodeSet::Path:
  382. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Query) || "?`{}"sv.contains(static_cast<char>(code_point));
  383. case PercentEncodeSet::Userinfo:
  384. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Path) || "/:;=@[\\]^|"sv.contains(static_cast<char>(code_point));
  385. case PercentEncodeSet::Component:
  386. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Userinfo) || "$%&+,"sv.contains(static_cast<char>(code_point));
  387. case PercentEncodeSet::ApplicationXWWWFormUrlencoded:
  388. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Component) || "!'()~"sv.contains(static_cast<char>(code_point));
  389. case PercentEncodeSet::EncodeURI:
  390. // NOTE: This is the same percent encode set that JS encodeURI() uses.
  391. // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
  392. return code_point > 0x7E || (!is_ascii_alphanumeric(code_point) && !";,/?:@&=+$-_.!~*'()#"sv.contains(static_cast<char>(code_point)));
  393. default:
  394. VERIFY_NOT_REACHED();
  395. }
  396. }
  397. void append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, PercentEncodeSet set)
  398. {
  399. if (code_point_is_in_percent_encode_set(code_point, set))
  400. append_percent_encoded(builder, code_point);
  401. else
  402. builder.append_code_point(code_point);
  403. }
  404. String percent_encode(StringView input, PercentEncodeSet set, SpaceAsPlus space_as_plus)
  405. {
  406. StringBuilder builder;
  407. for (auto code_point : Utf8View(input)) {
  408. if (space_as_plus == SpaceAsPlus::Yes && code_point == ' ')
  409. builder.append('+');
  410. else
  411. append_percent_encoded_if_necessary(builder, code_point, set);
  412. }
  413. return MUST(builder.to_string());
  414. }
  415. ByteString percent_decode(StringView input)
  416. {
  417. if (!input.contains('%'))
  418. return input;
  419. StringBuilder builder;
  420. Utf8View utf8_view(input);
  421. for (auto it = utf8_view.begin(); !it.done(); ++it) {
  422. if (*it != '%') {
  423. builder.append_code_point(*it);
  424. } else if (!is_ascii_hex_digit(it.peek(1).value_or(0)) || !is_ascii_hex_digit(it.peek(2).value_or(0))) {
  425. builder.append_code_point(*it);
  426. } else {
  427. ++it;
  428. u8 byte = parse_ascii_hex_digit(*it) << 4;
  429. ++it;
  430. byte += parse_ascii_hex_digit(*it);
  431. builder.append(byte);
  432. }
  433. }
  434. return builder.to_byte_string();
  435. }
  436. bool is_public_suffix([[maybe_unused]] StringView host)
  437. {
  438. #if defined(ENABLE_PUBLIC_SUFFIX)
  439. return PublicSuffixData::the()->is_public_suffix(host);
  440. #else
  441. return false;
  442. #endif
  443. }
  444. Optional<String> get_public_suffix([[maybe_unused]] StringView host)
  445. {
  446. #if defined(ENABLE_PUBLIC_SUFFIX)
  447. return MUST(PublicSuffixData::the()->get_public_suffix(host));
  448. #else
  449. return {};
  450. #endif
  451. }
  452. }