URL.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/Base64.h>
  8. #include <AK/CharacterTypes.h>
  9. #include <AK/Debug.h>
  10. #include <AK/LexicalPath.h>
  11. #include <AK/StringBuilder.h>
  12. #include <AK/Utf8View.h>
  13. #include <LibURL/Parser.h>
  14. #include <LibURL/URL.h>
  15. namespace URL {
  16. // FIXME: It could make sense to force users of URL to use URL::Parser::basic_parse() explicitly instead of using a constructor.
  17. URL::URL(StringView string)
  18. : URL(Parser::basic_parse(string))
  19. {
  20. if constexpr (URL_PARSER_DEBUG) {
  21. if (m_valid)
  22. dbgln("URL constructor: Parsed URL to be '{}'.", serialize());
  23. else
  24. dbgln("URL constructor: Parsed URL to be invalid.");
  25. }
  26. }
  27. URL URL::complete_url(StringView relative_url) const
  28. {
  29. if (!is_valid())
  30. return {};
  31. return Parser::basic_parse(relative_url, *this);
  32. }
  33. ErrorOr<String> URL::username() const
  34. {
  35. return String::from_byte_string(percent_decode(m_username));
  36. }
  37. ErrorOr<String> URL::password() const
  38. {
  39. return String::from_byte_string(percent_decode(m_password));
  40. }
  41. ByteString URL::path_segment_at_index(size_t index) const
  42. {
  43. VERIFY(index < path_segment_count());
  44. return percent_decode(m_paths[index]);
  45. }
  46. ByteString URL::basename() const
  47. {
  48. if (!m_valid)
  49. return {};
  50. if (m_paths.is_empty())
  51. return {};
  52. auto& last_segment = m_paths.last();
  53. return percent_decode(last_segment);
  54. }
  55. void URL::set_scheme(String scheme)
  56. {
  57. m_scheme = move(scheme);
  58. m_valid = compute_validity();
  59. }
  60. // https://url.spec.whatwg.org/#set-the-username
  61. ErrorOr<void> URL::set_username(StringView username)
  62. {
  63. // To set the username given a url and username, set url’s username to the result of running UTF-8 percent-encode on username using the userinfo percent-encode set.
  64. m_username = TRY(String::from_byte_string(percent_encode(username, PercentEncodeSet::Userinfo)));
  65. m_valid = compute_validity();
  66. return {};
  67. }
  68. // https://url.spec.whatwg.org/#set-the-password
  69. ErrorOr<void> URL::set_password(StringView password)
  70. {
  71. // To set the password given a url and password, set url’s password to the result of running UTF-8 percent-encode on password using the userinfo percent-encode set.
  72. m_password = TRY(String::from_byte_string(percent_encode(password, PercentEncodeSet::Userinfo)));
  73. m_valid = compute_validity();
  74. return {};
  75. }
  76. void URL::set_host(Host host)
  77. {
  78. m_host = move(host);
  79. m_valid = compute_validity();
  80. }
  81. // https://url.spec.whatwg.org/#concept-host-serializer
  82. ErrorOr<String> URL::serialized_host() const
  83. {
  84. return Parser::serialize_host(m_host);
  85. }
  86. void URL::set_port(Optional<u16> port)
  87. {
  88. if (port == default_port_for_scheme(m_scheme)) {
  89. m_port = {};
  90. return;
  91. }
  92. m_port = move(port);
  93. m_valid = compute_validity();
  94. }
  95. void URL::set_paths(Vector<ByteString> const& paths)
  96. {
  97. m_paths.clear_with_capacity();
  98. m_paths.ensure_capacity(paths.size());
  99. for (auto const& segment : paths)
  100. m_paths.unchecked_append(String::from_byte_string(percent_encode(segment, PercentEncodeSet::Path)).release_value_but_fixme_should_propagate_errors());
  101. m_valid = compute_validity();
  102. }
  103. void URL::append_path(StringView path)
  104. {
  105. m_paths.append(String::from_byte_string(percent_encode(path, PercentEncodeSet::Path)).release_value_but_fixme_should_propagate_errors());
  106. }
  107. // https://url.spec.whatwg.org/#cannot-have-a-username-password-port
  108. bool URL::cannot_have_a_username_or_password_or_port() const
  109. {
  110. // A URL cannot have a username/password/port if its host is null or the empty string, or its scheme is "file".
  111. return m_host.has<Empty>() || m_host == String {} || m_scheme == "file"sv;
  112. }
  113. // FIXME: This is by no means complete.
  114. // NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong.
  115. bool URL::compute_validity() const
  116. {
  117. if (m_scheme.is_empty())
  118. return false;
  119. if (m_cannot_be_a_base_url) {
  120. if (m_paths.size() != 1)
  121. return false;
  122. if (m_paths[0].is_empty())
  123. return false;
  124. } else {
  125. if (m_scheme.is_one_of("about", "mailto"))
  126. return false;
  127. // NOTE: Maybe it is allowed to have a zero-segment path.
  128. if (m_paths.size() == 0)
  129. return false;
  130. }
  131. // NOTE: A file URL's host should be the empty string for localhost, not null.
  132. if (m_scheme == "file" && m_host.has<Empty>())
  133. return false;
  134. return true;
  135. }
  136. // https://url.spec.whatwg.org/#default-port
  137. Optional<u16> default_port_for_scheme(StringView scheme)
  138. {
  139. // Spec defined mappings with port:
  140. if (scheme == "ftp")
  141. return 21;
  142. if (scheme == "http")
  143. return 80;
  144. if (scheme == "https")
  145. return 443;
  146. if (scheme == "ws")
  147. return 80;
  148. if (scheme == "wss")
  149. return 443;
  150. // NOTE: not in spec, but we support these too
  151. if (scheme == "gemini")
  152. return 1965;
  153. if (scheme == "irc")
  154. return 6667;
  155. if (scheme == "ircs")
  156. return 6697;
  157. return {};
  158. }
  159. URL create_with_file_scheme(ByteString const& path, ByteString const& fragment, ByteString const& hostname)
  160. {
  161. LexicalPath lexical_path(path);
  162. if (!lexical_path.is_absolute())
  163. return {};
  164. URL url;
  165. url.set_scheme("file"_string);
  166. url.set_host(hostname == "localhost" ? String {} : String::from_byte_string(hostname).release_value_but_fixme_should_propagate_errors());
  167. url.set_paths(lexical_path.parts());
  168. if (path.ends_with('/'))
  169. url.append_slash();
  170. if (!fragment.is_empty())
  171. url.set_fragment(String::from_byte_string(fragment).release_value_but_fixme_should_propagate_errors());
  172. return url;
  173. }
  174. URL create_with_help_scheme(ByteString const& path, ByteString const& fragment, ByteString const& hostname)
  175. {
  176. LexicalPath lexical_path(path);
  177. URL url;
  178. url.set_scheme("help"_string);
  179. url.set_host(hostname == "localhost" ? String {} : String::from_byte_string(hostname).release_value_but_fixme_should_propagate_errors());
  180. url.set_paths(lexical_path.parts());
  181. if (path.ends_with('/'))
  182. url.append_slash();
  183. if (!fragment.is_empty())
  184. url.set_fragment(String::from_byte_string(fragment).release_value_but_fixme_should_propagate_errors());
  185. return url;
  186. }
  187. URL create_with_url_or_path(ByteString const& url_or_path)
  188. {
  189. URL url = url_or_path;
  190. if (url.is_valid())
  191. return url;
  192. ByteString path = LexicalPath::canonicalized_path(url_or_path);
  193. return create_with_file_scheme(path);
  194. }
  195. URL create_with_data(StringView mime_type, StringView payload, bool is_base64)
  196. {
  197. URL url;
  198. url.set_cannot_be_a_base_url(true);
  199. url.set_scheme("data"_string);
  200. StringBuilder builder;
  201. builder.append(mime_type);
  202. if (is_base64)
  203. builder.append(";base64"sv);
  204. builder.append(',');
  205. builder.append(payload);
  206. url.set_paths({ builder.to_byte_string() });
  207. return url;
  208. }
  209. // https://url.spec.whatwg.org/#special-scheme
  210. bool is_special_scheme(StringView scheme)
  211. {
  212. return scheme.is_one_of("ftp", "file", "http", "https", "ws", "wss");
  213. }
  214. // https://url.spec.whatwg.org/#url-path-serializer
  215. ByteString URL::serialize_path(ApplyPercentDecoding apply_percent_decoding) const
  216. {
  217. // 1. If url has an opaque path, then return url’s path.
  218. // FIXME: Reimplement this step once we modernize the URL implementation to meet the spec.
  219. if (cannot_be_a_base_url())
  220. return m_paths[0].to_byte_string();
  221. // 2. Let output be the empty string.
  222. StringBuilder output;
  223. // 3. For each segment of url’s path: append U+002F (/) followed by segment to output.
  224. for (auto const& segment : m_paths) {
  225. output.append('/');
  226. output.append(apply_percent_decoding == ApplyPercentDecoding::Yes ? percent_decode(segment) : segment.to_byte_string());
  227. }
  228. // 4. Return output.
  229. return output.to_byte_string();
  230. }
  231. // https://url.spec.whatwg.org/#concept-url-serializer
  232. ByteString URL::serialize(ExcludeFragment exclude_fragment) const
  233. {
  234. // 1. Let output be url’s scheme and U+003A (:) concatenated.
  235. StringBuilder output;
  236. output.append(m_scheme);
  237. output.append(':');
  238. // 2. If url’s host is non-null:
  239. if (!m_host.has<Empty>()) {
  240. // 1. Append "//" to output.
  241. output.append("//"sv);
  242. // 2. If url includes credentials, then:
  243. if (includes_credentials()) {
  244. // 1. Append url’s username to output.
  245. output.append(m_username);
  246. // 2. If url’s password is not the empty string, then append U+003A (:), followed by url’s password, to output.
  247. if (!m_password.is_empty()) {
  248. output.append(':');
  249. output.append(m_password);
  250. }
  251. // 3. Append U+0040 (@) to output.
  252. output.append('@');
  253. }
  254. // 3. Append url’s host, serialized, to output.
  255. output.append(serialized_host().release_value_but_fixme_should_propagate_errors());
  256. // 4. If url’s port is non-null, append U+003A (:) followed by url’s port, serialized, to output.
  257. if (m_port.has_value())
  258. output.appendff(":{}", *m_port);
  259. }
  260. // 3. If url’s host is null, url does not have an opaque path, url’s path’s size is greater than 1, and url’s path[0] is the empty string, then append U+002F (/) followed by U+002E (.) to output.
  261. // 4. Append the result of URL path serializing url to output.
  262. // FIXME: Implement this closer to spec steps.
  263. if (cannot_be_a_base_url()) {
  264. output.append(m_paths[0]);
  265. } else {
  266. if (m_host.has<Empty>() && m_paths.size() > 1 && m_paths[0].is_empty())
  267. output.append("/."sv);
  268. for (auto& segment : m_paths) {
  269. output.append('/');
  270. output.append(segment);
  271. }
  272. }
  273. // 5. If url’s query is non-null, append U+003F (?), followed by url’s query, to output.
  274. if (m_query.has_value()) {
  275. output.append('?');
  276. output.append(*m_query);
  277. }
  278. // 6. If exclude fragment is false and url’s fragment is non-null, then append U+0023 (#), followed by url’s fragment, to output.
  279. if (exclude_fragment == ExcludeFragment::No && m_fragment.has_value()) {
  280. output.append('#');
  281. output.append(*m_fragment);
  282. }
  283. // 7. Return output.
  284. return output.to_byte_string();
  285. }
  286. // https://url.spec.whatwg.org/#url-rendering
  287. // NOTE: This does e.g. not display credentials.
  288. // FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points
  289. // resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible.
  290. ByteString URL::serialize_for_display() const
  291. {
  292. VERIFY(m_valid);
  293. StringBuilder builder;
  294. builder.append(m_scheme);
  295. builder.append(':');
  296. if (!m_host.has<Empty>()) {
  297. builder.append("//"sv);
  298. builder.append(serialized_host().release_value_but_fixme_should_propagate_errors());
  299. if (m_port.has_value())
  300. builder.appendff(":{}", *m_port);
  301. }
  302. if (cannot_be_a_base_url()) {
  303. builder.append(m_paths[0]);
  304. } else {
  305. if (m_host.has<Empty>() && m_paths.size() > 1 && m_paths[0].is_empty())
  306. builder.append("/."sv);
  307. for (auto& segment : m_paths) {
  308. builder.append('/');
  309. builder.append(segment);
  310. }
  311. }
  312. if (m_query.has_value()) {
  313. builder.append('?');
  314. builder.append(*m_query);
  315. }
  316. if (m_fragment.has_value()) {
  317. builder.append('#');
  318. builder.append(*m_fragment);
  319. }
  320. return builder.to_byte_string();
  321. }
  322. ErrorOr<String> URL::to_string() const
  323. {
  324. return String::from_byte_string(serialize());
  325. }
  326. // https://html.spec.whatwg.org/multipage/origin.html#ascii-serialisation-of-an-origin
  327. // https://url.spec.whatwg.org/#concept-url-origin
  328. ByteString URL::serialize_origin() const
  329. {
  330. VERIFY(m_valid);
  331. if (m_scheme == "blob"sv) {
  332. // TODO: 1. If URL’s blob URL entry is non-null, then return URL’s blob URL entry’s environment’s origin.
  333. // 2. Let url be the result of parsing URL’s path[0].
  334. VERIFY(!m_paths.is_empty());
  335. URL url = m_paths[0];
  336. // 3. Return a new opaque origin, if url is failure, and url’s origin otherwise.
  337. if (!url.is_valid())
  338. return "null";
  339. return url.serialize_origin();
  340. } else if (!m_scheme.is_one_of("ftp"sv, "http"sv, "https"sv, "ws"sv, "wss"sv)) { // file: "Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin."
  341. return "null";
  342. }
  343. StringBuilder builder;
  344. builder.append(m_scheme);
  345. builder.append("://"sv);
  346. builder.append(serialized_host().release_value_but_fixme_should_propagate_errors());
  347. if (m_port.has_value())
  348. builder.appendff(":{}", *m_port);
  349. return builder.to_byte_string();
  350. }
  351. bool URL::equals(URL const& other, ExcludeFragment exclude_fragments) const
  352. {
  353. if (this == &other)
  354. return true;
  355. if (!m_valid || !other.m_valid)
  356. return false;
  357. return serialize(exclude_fragments) == other.serialize(exclude_fragments);
  358. }
  359. void append_percent_encoded(StringBuilder& builder, u32 code_point)
  360. {
  361. if (code_point <= 0x7f)
  362. builder.appendff("%{:02X}", code_point);
  363. else if (code_point <= 0x07ff)
  364. builder.appendff("%{:02X}%{:02X}", ((code_point >> 6) & 0x1f) | 0xc0, (code_point & 0x3f) | 0x80);
  365. else if (code_point <= 0xffff)
  366. builder.appendff("%{:02X}%{:02X}%{:02X}", ((code_point >> 12) & 0x0f) | 0xe0, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
  367. else if (code_point <= 0x10ffff)
  368. builder.appendff("%{:02X}%{:02X}%{:02X}%{:02X}", ((code_point >> 18) & 0x07) | 0xf0, ((code_point >> 12) & 0x3f) | 0x80, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
  369. else
  370. VERIFY_NOT_REACHED();
  371. }
  372. // https://url.spec.whatwg.org/#c0-control-percent-encode-set
  373. bool code_point_is_in_percent_encode_set(u32 code_point, PercentEncodeSet set)
  374. {
  375. // NOTE: Once we've checked for presence in the C0Control set, we know that the code point is
  376. // a valid ASCII character in the range 0x20..0x7E, so we can safely cast it to char.
  377. switch (set) {
  378. case PercentEncodeSet::C0Control:
  379. return code_point < 0x20 || code_point > 0x7E;
  380. case PercentEncodeSet::Fragment:
  381. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::C0Control) || " \"<>`"sv.contains(static_cast<char>(code_point));
  382. case PercentEncodeSet::Query:
  383. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::C0Control) || " \"#<>"sv.contains(static_cast<char>(code_point));
  384. case PercentEncodeSet::SpecialQuery:
  385. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Query) || code_point == '\'';
  386. case PercentEncodeSet::Path:
  387. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Query) || "?`{}"sv.contains(static_cast<char>(code_point));
  388. case PercentEncodeSet::Userinfo:
  389. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Path) || "/:;=@[\\]^|"sv.contains(static_cast<char>(code_point));
  390. case PercentEncodeSet::Component:
  391. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Userinfo) || "$%&+,"sv.contains(static_cast<char>(code_point));
  392. case PercentEncodeSet::ApplicationXWWWFormUrlencoded:
  393. return code_point_is_in_percent_encode_set(code_point, PercentEncodeSet::Component) || "!'()~"sv.contains(static_cast<char>(code_point));
  394. case PercentEncodeSet::EncodeURI:
  395. // NOTE: This is the same percent encode set that JS encodeURI() uses.
  396. // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
  397. return code_point > 0x7E || (!is_ascii_alphanumeric(code_point) && !";,/?:@&=+$-_.!~*'()#"sv.contains(static_cast<char>(code_point)));
  398. default:
  399. VERIFY_NOT_REACHED();
  400. }
  401. }
  402. void append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, PercentEncodeSet set)
  403. {
  404. if (code_point_is_in_percent_encode_set(code_point, set))
  405. append_percent_encoded(builder, code_point);
  406. else
  407. builder.append_code_point(code_point);
  408. }
  409. ByteString percent_encode(StringView input, PercentEncodeSet set, SpaceAsPlus space_as_plus)
  410. {
  411. StringBuilder builder;
  412. for (auto code_point : Utf8View(input)) {
  413. if (space_as_plus == SpaceAsPlus::Yes && code_point == ' ')
  414. builder.append('+');
  415. else
  416. append_percent_encoded_if_necessary(builder, code_point, set);
  417. }
  418. return builder.to_byte_string();
  419. }
  420. ByteString percent_decode(StringView input)
  421. {
  422. if (!input.contains('%'))
  423. return input;
  424. StringBuilder builder;
  425. Utf8View utf8_view(input);
  426. for (auto it = utf8_view.begin(); !it.done(); ++it) {
  427. if (*it != '%') {
  428. builder.append_code_point(*it);
  429. } else if (!is_ascii_hex_digit(it.peek(1).value_or(0)) || !is_ascii_hex_digit(it.peek(2).value_or(0))) {
  430. builder.append_code_point(*it);
  431. } else {
  432. ++it;
  433. u8 byte = parse_ascii_hex_digit(*it) << 4;
  434. ++it;
  435. byte += parse_ascii_hex_digit(*it);
  436. builder.append(byte);
  437. }
  438. }
  439. return builder.to_byte_string();
  440. }
  441. }