URL.cpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/Debug.h>
  9. #include <AK/LexicalPath.h>
  10. #include <AK/StringBuilder.h>
  11. #include <AK/URL.h>
  12. #include <AK/URLParser.h>
  13. #include <AK/Utf8View.h>
  14. namespace AK {
  15. // FIXME: It could make sense to force users of URL to use URLParser::parse() explicitly instead of using a constructor.
  16. URL::URL(StringView string)
  17. : URL(URLParser::parse(string))
  18. {
  19. if constexpr (URL_PARSER_DEBUG) {
  20. if (m_valid)
  21. dbgln("URL constructor: Parsed URL to be '{}'.", serialize());
  22. else
  23. dbgln("URL constructor: Parsed URL to be invalid.");
  24. }
  25. }
  26. String URL::path() const
  27. {
  28. if (cannot_be_a_base_url())
  29. return paths()[0];
  30. StringBuilder builder;
  31. for (auto& path : m_paths) {
  32. builder.append('/');
  33. builder.append(path);
  34. }
  35. return builder.to_string();
  36. }
  37. URL URL::complete_url(String const& string) const
  38. {
  39. if (!is_valid())
  40. return {};
  41. return URLParser::parse(string, this);
  42. }
  43. void URL::set_scheme(String scheme)
  44. {
  45. m_scheme = move(scheme);
  46. m_valid = compute_validity();
  47. }
  48. void URL::set_username(String username)
  49. {
  50. m_username = move(username);
  51. m_valid = compute_validity();
  52. }
  53. void URL::set_password(String password)
  54. {
  55. m_password = move(password);
  56. m_valid = compute_validity();
  57. }
  58. void URL::set_host(String host)
  59. {
  60. m_host = move(host);
  61. m_valid = compute_validity();
  62. }
  63. void URL::set_port(Optional<u16> port)
  64. {
  65. if (port == default_port_for_scheme(m_scheme)) {
  66. m_port = {};
  67. return;
  68. }
  69. m_port = move(port);
  70. m_valid = compute_validity();
  71. }
  72. void URL::set_paths(Vector<String> paths)
  73. {
  74. m_paths = move(paths);
  75. m_valid = compute_validity();
  76. }
  77. void URL::set_query(String query)
  78. {
  79. m_query = move(query);
  80. }
  81. void URL::set_fragment(String fragment)
  82. {
  83. m_fragment = move(fragment);
  84. }
  85. // FIXME: This is by no means complete.
  86. // NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong.
  87. bool URL::compute_validity() const
  88. {
  89. if (m_scheme.is_empty())
  90. return false;
  91. if (m_scheme == "data") {
  92. if (m_data_mime_type.is_empty())
  93. return false;
  94. if (m_data_payload_is_base64) {
  95. if (m_data_payload.length() % 4 != 0)
  96. return false;
  97. for (auto character : m_data_payload) {
  98. if (!is_ascii_alphanumeric(character) || character == '+' || character == '/' || character == '=')
  99. return false;
  100. }
  101. }
  102. } else if (m_cannot_be_a_base_url) {
  103. if (m_paths.size() != 1)
  104. return false;
  105. if (m_paths[0].is_empty())
  106. return false;
  107. } else {
  108. if (m_scheme.is_one_of("about", "mailto"))
  109. return false;
  110. // NOTE: Maybe it is allowed to have a zero-segment path.
  111. if (m_paths.size() == 0)
  112. return false;
  113. }
  114. // NOTE: A file URL's host should be the empty string for localhost, not null.
  115. if (m_scheme == "file" && m_host.is_null())
  116. return false;
  117. return true;
  118. }
  119. bool URL::scheme_requires_port(StringView scheme)
  120. {
  121. return (default_port_for_scheme(scheme) != 0);
  122. }
  123. u16 URL::default_port_for_scheme(StringView scheme)
  124. {
  125. if (scheme == "http")
  126. return 80;
  127. if (scheme == "https")
  128. return 443;
  129. if (scheme == "gemini")
  130. return 1965;
  131. if (scheme == "irc")
  132. return 6667;
  133. if (scheme == "ircs")
  134. return 6697;
  135. if (scheme == "ws")
  136. return 80;
  137. if (scheme == "wss")
  138. return 443;
  139. return 0;
  140. }
  141. URL URL::create_with_file_scheme(String const& path, String const& fragment, String const& hostname)
  142. {
  143. LexicalPath lexical_path(path);
  144. if (!lexical_path.is_absolute())
  145. return {};
  146. URL url;
  147. url.set_scheme("file");
  148. // NOTE: If the hostname is localhost (or null, which implies localhost), it should be set to the empty string.
  149. // This is because a file URL always needs a non-null hostname.
  150. url.set_host(hostname.is_null() || hostname == "localhost" ? String::empty() : hostname);
  151. url.set_paths(lexical_path.parts());
  152. // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment.
  153. if (path.ends_with('/'))
  154. url.append_path("");
  155. url.set_fragment(fragment);
  156. return url;
  157. }
  158. URL URL::create_with_help_scheme(String const& path, String const& fragment, String const& hostname)
  159. {
  160. LexicalPath lexical_path(path);
  161. URL url;
  162. url.set_scheme("help");
  163. // NOTE: If the hostname is localhost (or null, which implies localhost), it should be set to the empty string.
  164. // This is because a file URL always needs a non-null hostname.
  165. url.set_host(hostname.is_null() || hostname == "localhost" ? String::empty() : hostname);
  166. url.set_paths(lexical_path.parts());
  167. // NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment.
  168. if (path.ends_with('/'))
  169. url.append_path("");
  170. url.set_fragment(fragment);
  171. return url;
  172. }
  173. URL URL::create_with_url_or_path(String const& url_or_path)
  174. {
  175. URL url = url_or_path;
  176. if (url.is_valid())
  177. return url;
  178. String path = LexicalPath::canonicalized_path(url_or_path);
  179. return URL::create_with_file_scheme(path);
  180. }
  181. // https://url.spec.whatwg.org/#special-scheme
  182. bool URL::is_special_scheme(StringView scheme)
  183. {
  184. return scheme.is_one_of("ftp", "file", "http", "https", "ws", "wss");
  185. }
  186. String URL::serialize_data_url() const
  187. {
  188. VERIFY(m_scheme == "data");
  189. VERIFY(!m_data_mime_type.is_null());
  190. VERIFY(!m_data_payload.is_null());
  191. StringBuilder builder;
  192. builder.append(m_scheme);
  193. builder.append(':');
  194. builder.append(m_data_mime_type);
  195. if (m_data_payload_is_base64)
  196. builder.append(";base64");
  197. builder.append(',');
  198. // NOTE: The specification does not say anything about encoding this, but we should encode at least control and non-ASCII
  199. // characters (since this is also a valid representation of the same data URL).
  200. builder.append(URL::percent_encode(m_data_payload, PercentEncodeSet::C0Control));
  201. return builder.to_string();
  202. }
  203. // https://url.spec.whatwg.org/#concept-url-serializer
  204. String URL::serialize(ExcludeFragment exclude_fragment) const
  205. {
  206. if (m_scheme == "data")
  207. return serialize_data_url();
  208. StringBuilder builder;
  209. builder.append(m_scheme);
  210. builder.append(':');
  211. if (!m_host.is_null()) {
  212. builder.append("//");
  213. if (includes_credentials()) {
  214. builder.append(percent_encode(m_username, PercentEncodeSet::Userinfo));
  215. if (!m_password.is_empty()) {
  216. builder.append(':');
  217. builder.append(percent_encode(m_password, PercentEncodeSet::Userinfo));
  218. }
  219. builder.append('@');
  220. }
  221. builder.append(m_host);
  222. if (m_port.has_value())
  223. builder.appendff(":{}", *m_port);
  224. }
  225. if (cannot_be_a_base_url()) {
  226. builder.append(percent_encode(m_paths[0], PercentEncodeSet::Path));
  227. } else {
  228. if (m_host.is_null() && m_paths.size() > 1 && m_paths[0].is_empty())
  229. builder.append("/.");
  230. for (auto& segment : m_paths) {
  231. builder.append('/');
  232. builder.append(percent_encode(segment, PercentEncodeSet::Path));
  233. }
  234. }
  235. if (!m_query.is_null()) {
  236. builder.append('?');
  237. builder.append(percent_encode(m_query, is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query));
  238. }
  239. if (exclude_fragment == ExcludeFragment::No && !m_fragment.is_null()) {
  240. builder.append('#');
  241. builder.append(percent_encode(m_fragment, PercentEncodeSet::Fragment));
  242. }
  243. return builder.to_string();
  244. }
  245. // https://url.spec.whatwg.org/#url-rendering
  246. // NOTE: This does e.g. not display credentials.
  247. // FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points
  248. // resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible.
  249. String URL::serialize_for_display() const
  250. {
  251. VERIFY(m_valid);
  252. if (m_scheme == "data")
  253. return serialize_data_url();
  254. StringBuilder builder;
  255. builder.append(m_scheme);
  256. builder.append(':');
  257. if (!m_host.is_null()) {
  258. builder.append("//");
  259. builder.append(m_host);
  260. if (m_port.has_value())
  261. builder.appendff(":{}", *m_port);
  262. }
  263. if (cannot_be_a_base_url()) {
  264. builder.append(percent_encode(m_paths[0], PercentEncodeSet::Path));
  265. } else {
  266. if (m_host.is_null() && m_paths.size() > 1 && m_paths[0].is_empty())
  267. builder.append("/.");
  268. for (auto& segment : m_paths) {
  269. builder.append('/');
  270. builder.append(percent_encode(segment, PercentEncodeSet::Path));
  271. }
  272. }
  273. if (!m_query.is_null()) {
  274. builder.append('?');
  275. builder.append(percent_encode(m_query, is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query));
  276. }
  277. if (!m_fragment.is_null()) {
  278. builder.append('#');
  279. builder.append(percent_encode(m_fragment, PercentEncodeSet::Fragment));
  280. }
  281. return builder.to_string();
  282. }
  283. // https://html.spec.whatwg.org/multipage/origin.html#ascii-serialisation-of-an-origin
  284. // https://url.spec.whatwg.org/#concept-url-origin
  285. String URL::serialize_origin() const
  286. {
  287. VERIFY(m_valid);
  288. if (m_scheme == "blob"sv) {
  289. // TODO: 1. If URL’s blob URL entry is non-null, then return URL’s blob URL entry’s environment’s origin.
  290. // 2. Let url be the result of parsing URL’s path[0].
  291. VERIFY(!m_paths.is_empty());
  292. URL url = m_paths[0];
  293. // 3. Return a new opaque origin, if url is failure, and url’s origin otherwise.
  294. if (!url.is_valid())
  295. return "null";
  296. return url.serialize_origin();
  297. } else if (!m_scheme.is_one_of("ftp"sv, "http"sv, "https"sv, "ws"sv, "wss"sv)) { // file: "Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin."
  298. return "null";
  299. }
  300. StringBuilder builder;
  301. builder.append(m_scheme);
  302. builder.append("://"sv);
  303. builder.append(m_host);
  304. if (m_port.has_value())
  305. builder.appendff(":{}", *m_port);
  306. return builder.build();
  307. }
  308. bool URL::equals(URL const& other, ExcludeFragment exclude_fragments) const
  309. {
  310. if (this == &other)
  311. return true;
  312. if (!m_valid || !other.m_valid)
  313. return false;
  314. return serialize(exclude_fragments) == other.serialize(exclude_fragments);
  315. }
  316. String URL::basename() const
  317. {
  318. if (!m_valid)
  319. return {};
  320. if (m_paths.is_empty())
  321. return {};
  322. return m_paths.last();
  323. }
  324. void URL::append_percent_encoded(StringBuilder& builder, u32 code_point)
  325. {
  326. if (code_point <= 0x7f)
  327. builder.appendff("%{:02X}", code_point);
  328. else if (code_point <= 0x07ff)
  329. builder.appendff("%{:02X}%{:02X}", ((code_point >> 6) & 0x1f) | 0xc0, (code_point & 0x3f) | 0x80);
  330. else if (code_point <= 0xffff)
  331. builder.appendff("%{:02X}%{:02X}%{:02X}", ((code_point >> 12) & 0x0f) | 0xe0, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
  332. else if (code_point <= 0x10ffff)
  333. builder.appendff("%{:02X}%{:02X}%{:02X}%{:02X}", ((code_point >> 18) & 0x07) | 0xf0, ((code_point >> 12) & 0x3f) | 0x80, ((code_point >> 6) & 0x3f) | 0x80, (code_point & 0x3f) | 0x80);
  334. else
  335. VERIFY_NOT_REACHED();
  336. }
  337. // https://url.spec.whatwg.org/#c0-control-percent-encode-set
  338. bool URL::code_point_is_in_percent_encode_set(u32 code_point, URL::PercentEncodeSet set)
  339. {
  340. switch (set) {
  341. case URL::PercentEncodeSet::C0Control:
  342. return code_point < 0x20 || code_point > 0x7E;
  343. case URL::PercentEncodeSet::Fragment:
  344. return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::C0Control) || " \"<>`"sv.contains(code_point);
  345. case URL::PercentEncodeSet::Query:
  346. return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::C0Control) || " \"#<>"sv.contains(code_point);
  347. case URL::PercentEncodeSet::SpecialQuery:
  348. return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Query) || code_point == '\'';
  349. case URL::PercentEncodeSet::Path:
  350. return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Query) || "?`{}"sv.contains(code_point);
  351. case URL::PercentEncodeSet::Userinfo:
  352. return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Path) || "/:;=@[\\]^|"sv.contains(code_point);
  353. case URL::PercentEncodeSet::Component:
  354. return code_point_is_in_percent_encode_set(code_point, URL::PercentEncodeSet::Userinfo) || "$%&+,"sv.contains(code_point);
  355. case URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded:
  356. return code_point >= 0x7E || !(is_ascii_alphanumeric(code_point) || "!'()~"sv.contains(code_point));
  357. case URL::PercentEncodeSet::EncodeURI:
  358. // NOTE: This is the same percent encode set that JS encodeURI() uses.
  359. // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
  360. return code_point >= 0x7E || (!is_ascii_alphanumeric(code_point) && !";,/?:@&=+$-_.!~*'()#"sv.contains(code_point));
  361. default:
  362. VERIFY_NOT_REACHED();
  363. }
  364. }
  365. void URL::append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, URL::PercentEncodeSet set)
  366. {
  367. if (code_point_is_in_percent_encode_set(code_point, set))
  368. append_percent_encoded(builder, code_point);
  369. else
  370. builder.append_code_point(code_point);
  371. }
  372. String URL::percent_encode(StringView input, URL::PercentEncodeSet set, SpaceAsPlus space_as_plus)
  373. {
  374. StringBuilder builder;
  375. for (auto code_point : Utf8View(input)) {
  376. if (space_as_plus == SpaceAsPlus::Yes && code_point == ' ')
  377. builder.append('+');
  378. else
  379. append_percent_encoded_if_necessary(builder, code_point, set);
  380. }
  381. return builder.to_string();
  382. }
  383. String URL::percent_decode(StringView input)
  384. {
  385. if (!input.contains('%'))
  386. return input;
  387. StringBuilder builder;
  388. Utf8View utf8_view(input);
  389. for (auto it = utf8_view.begin(); !it.done(); ++it) {
  390. if (*it != '%') {
  391. builder.append_code_point(*it);
  392. } else if (!is_ascii_hex_digit(it.peek(1).value_or(0)) || !is_ascii_hex_digit(it.peek(2).value_or(0))) {
  393. builder.append_code_point(*it);
  394. } else {
  395. ++it;
  396. u8 byte = parse_ascii_hex_digit(*it) << 4;
  397. ++it;
  398. byte += parse_ascii_hex_digit(*it);
  399. builder.append(byte);
  400. }
  401. }
  402. return builder.to_string();
  403. }
  404. }