URLParser.cpp 30 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682
  1. /*
  2. * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Debug.h>
  8. #include <AK/Optional.h>
  9. #include <AK/SourceLocation.h>
  10. #include <AK/String.h>
  11. #include <AK/StringBuilder.h>
  12. #include <AK/StringUtils.h>
  13. #include <AK/URLParser.h>
  14. #include <AK/Utf8View.h>
  15. namespace AK {
  16. // NOTE: This is similar to the LibC macro EOF = -1.
  17. constexpr u32 end_of_file = 0xFFFFFFFF;
  18. constexpr bool is_url_code_point(u32 code_point)
  19. {
  20. // FIXME: [...] and code points in the range U+00A0 to U+10FFFD, inclusive, excluding surrogates and noncharacters.
  21. return is_ascii_alphanumeric(code_point) || code_point >= 0xA0 || "!$&'()*+,-./:;=?@_~"sv.contains(code_point);
  22. }
  23. static void report_validation_error(SourceLocation const& location = SourceLocation::current())
  24. {
  25. dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Validation error! {}", location);
  26. }
  27. static Optional<String> parse_opaque_host(StringView input)
  28. {
  29. auto forbidden_host_code_points_excluding_percent = "\0\t\n\r #/:<>?@[\\]^|"sv;
  30. for (auto code_point : forbidden_host_code_points_excluding_percent) {
  31. if (input.contains(code_point)) {
  32. report_validation_error();
  33. return {};
  34. }
  35. }
  36. // FIXME: If input contains a code point that is not a URL code point and not U+0025 (%), validation error.
  37. // FIXME: If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, validation error.
  38. return URL::percent_encode(input, URL::PercentEncodeSet::C0Control);
  39. }
  40. static Optional<String> parse_ipv4_address(StringView input)
  41. {
  42. // FIXME: Implement the correct IPv4 parser as specified by https://url.spec.whatwg.org/#concept-ipv4-parser.
  43. return input;
  44. }
  45. // https://url.spec.whatwg.org/#concept-host-parser
  46. // NOTE: This is a very bare-bones implementation.
  47. static Optional<String> parse_host(StringView input, bool is_not_special = false)
  48. {
  49. if (input.starts_with('[')) {
  50. if (!input.ends_with(']')) {
  51. report_validation_error();
  52. return {};
  53. }
  54. // FIXME: Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed.
  55. TODO();
  56. }
  57. if (is_not_special)
  58. return parse_opaque_host(input);
  59. VERIFY(!input.is_empty());
  60. // FIXME: Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
  61. auto domain = URL::percent_decode(input);
  62. // FIXME: Let asciiDomain be the result of running domain to ASCII on domain.
  63. auto& ascii_domain = domain;
  64. auto forbidden_host_code_points = "\0\t\n\r #%/:<>?@[\\]^|"sv;
  65. for (auto code_point : forbidden_host_code_points) {
  66. if (ascii_domain.view().contains(code_point)) {
  67. report_validation_error();
  68. return {};
  69. }
  70. }
  71. auto ipv4_host = parse_ipv4_address(ascii_domain);
  72. return ipv4_host;
  73. }
  74. constexpr bool starts_with_windows_drive_letter(StringView input)
  75. {
  76. if (input.length() < 2)
  77. return false;
  78. if (!is_ascii_alpha(input[0]) && !(input[1] == ':' || input[1] == '|'))
  79. return false;
  80. if (input.length() == 2)
  81. return true;
  82. return "/\\?#"sv.contains(input[2]);
  83. }
  84. constexpr bool is_windows_drive_letter(StringView input)
  85. {
  86. return input.length() == 2 && is_ascii_alpha(input[0]) && (input[1] == ':' || input[1] == '|');
  87. }
  88. constexpr bool is_normalized_windows_drive_letter(StringView input)
  89. {
  90. return input.length() == 2 && is_ascii_alpha(input[0]) && input[1] == ':';
  91. }
  92. constexpr bool is_single_dot_path_segment(StringView input)
  93. {
  94. return input == "."sv || input.equals_ignoring_case("%2e"sv);
  95. }
  96. constexpr bool is_double_dot_path_segment(StringView input)
  97. {
  98. return input == ".."sv || input.equals_ignoring_case(".%2e"sv) || input.equals_ignoring_case("%2e."sv) || input.equals_ignoring_case("%2e%2e"sv);
  99. }
  100. // https://fetch.spec.whatwg.org/#data-urls
  101. // FIXME: This only loosely follows the spec, as we use the same class for "regular" and data URLs, unlike the spec.
  102. Optional<URL> URLParser::parse_data_url(StringView raw_input)
  103. {
  104. dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsing '{}'.", raw_input);
  105. VERIFY(raw_input.starts_with("data:"));
  106. auto input = raw_input.substring_view(5);
  107. auto comma_offset = input.find(',');
  108. if (!comma_offset.has_value())
  109. return {};
  110. auto mime_type = StringUtils::trim(input.substring_view(0, comma_offset.value()), "\t\n\f\r ", TrimMode::Both);
  111. auto encoded_body = input.substring_view(comma_offset.value() + 1);
  112. auto body = URL::percent_decode(encoded_body);
  113. bool is_base64_encoded = false;
  114. if (mime_type.ends_with("base64", CaseSensitivity::CaseInsensitive)) {
  115. auto substring_view = mime_type.substring_view(0, mime_type.length() - 6);
  116. auto trimmed_substring_view = StringUtils::trim(substring_view, " ", TrimMode::Right);
  117. if (trimmed_substring_view.ends_with(';')) {
  118. is_base64_encoded = true;
  119. mime_type = trimmed_substring_view.substring_view(0, trimmed_substring_view.length() - 1);
  120. }
  121. }
  122. StringBuilder builder;
  123. if (mime_type.starts_with(";") || mime_type.is_empty()) {
  124. builder.append("text/plain");
  125. builder.append(mime_type);
  126. mime_type = builder.string_view();
  127. }
  128. // FIXME: Parse the MIME type's components according to https://mimesniff.spec.whatwg.org/#parse-a-mime-type
  129. URL url { StringUtils::trim(mime_type, "\n\r\t ", TrimMode::Both), move(body), is_base64_encoded };
  130. dbgln_if(URL_PARSER_DEBUG, "URLParser::parse_data_url: Parsed data URL to be '{}'.", url.serialize());
  131. return url;
  132. }
  133. // https://url.spec.whatwg.org/#concept-basic-url-parser
  134. // NOTE: This parser assumes a UTF-8 encoding.
  135. // NOTE: Refrain from using the URL classes setters inside this algorithm. Rather, set the values directly. This bypasses the setters' built-in
  136. // validation, which is strictly unnecessary since we set m_valid=true at the end anyways. Furthermore, this algorithm may be used in the
  137. // future for validation of URLs, which would then lead to infinite recursion.
  138. // The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member
  139. // variables' values here, not what the URL class presents to its users.
  140. // NOTE: Since the URL class's member variables contain percent decoded data, we have to deviate from the URL parser specification when setting
  141. // some of those values. Because the specification leaves all values percent encoded in their URL data structure, we have to percent decode
  142. // everything before setting the member variables.
  143. URL URLParser::parse(StringView raw_input, URL const* base_url, Optional<URL> url, Optional<State> state_override)
  144. {
  145. dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsing '{}'", raw_input);
  146. if (raw_input.is_empty())
  147. return {};
  148. if (raw_input.starts_with("data:")) {
  149. auto maybe_url = parse_data_url(raw_input);
  150. if (!maybe_url.has_value())
  151. return {};
  152. return maybe_url.release_value();
  153. }
  154. size_t start_index = 0;
  155. size_t end_index = raw_input.length();
  156. if (!url.has_value()) {
  157. url = URL();
  158. // NOTE: This removes all leading and trailing C0 control or space characters.
  159. bool has_validation_error = false;
  160. for (size_t i = 0; i < raw_input.length(); ++i) {
  161. i8 ch = raw_input[i];
  162. if (0 <= ch && ch <= 0x20) {
  163. ++start_index;
  164. has_validation_error = true;
  165. } else {
  166. break;
  167. }
  168. }
  169. for (ssize_t i = raw_input.length() - 1; i >= 0; --i) {
  170. i8 ch = raw_input[i];
  171. if (0 <= ch && ch <= 0x20) {
  172. --end_index;
  173. has_validation_error = true;
  174. } else {
  175. break;
  176. }
  177. }
  178. if (has_validation_error)
  179. report_validation_error();
  180. }
  181. if (start_index >= end_index)
  182. return {};
  183. String processed_input = raw_input.substring_view(start_index, end_index - start_index);
  184. // NOTE: This replaces all tab and newline characters with nothing.
  185. if (processed_input.contains("\t") || processed_input.contains("\n")) {
  186. report_validation_error();
  187. processed_input = processed_input.replace("\t", "", true).replace("\n", "", true);
  188. }
  189. State state = state_override.value_or(State::SchemeStart);
  190. StringBuilder buffer;
  191. bool at_sign_seen = false;
  192. bool inside_brackets = false;
  193. bool password_token_seen = false;
  194. Utf8View input(processed_input);
  195. Utf8CodePointIterator iterator = input.begin();
  196. auto get_remaining = [&input, &iterator] {
  197. return input.substring_view(iterator - input.begin() + iterator.underlying_code_point_length_in_bytes()).as_string();
  198. };
  199. // NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
  200. // ++iterator : "increase pointer by 1"
  201. // continue : "decrease pointer by 1"
  202. for (;;) {
  203. u32 code_point = end_of_file;
  204. if (!iterator.done())
  205. code_point = *iterator;
  206. if constexpr (URL_PARSER_DEBUG) {
  207. if (code_point == end_of_file)
  208. dbgln("URLParser::parse: {} state with EOF.", state_name(state));
  209. else if (is_ascii_printable(code_point))
  210. dbgln("URLParser::parse: {} state with code point U+{:04X} ({:c}).", state_name(state), code_point, code_point);
  211. else
  212. dbgln("URLParser::parse: {} state with code point U+{:04X}.", state_name(state), code_point);
  213. }
  214. switch (state) {
  215. case State::SchemeStart:
  216. if (is_ascii_alpha(code_point)) {
  217. buffer.append_as_lowercase(code_point);
  218. state = State::Scheme;
  219. } else {
  220. state = State::NoScheme;
  221. continue;
  222. }
  223. break;
  224. case State::Scheme:
  225. if (is_ascii_alphanumeric(code_point) || code_point == '+' || code_point == '-' || code_point == '.') {
  226. buffer.append_as_lowercase(code_point);
  227. } else if (code_point == ':') {
  228. url->m_scheme = buffer.to_string();
  229. buffer.clear();
  230. if (url->scheme() == "file") {
  231. if (!get_remaining().starts_with("//")) {
  232. report_validation_error();
  233. }
  234. state = State::File;
  235. } else if (url->is_special()) {
  236. if (base_url && base_url->m_scheme == url->m_scheme)
  237. state = State::SpecialRelativeOrAuthority;
  238. else
  239. state = State::SpecialAuthoritySlashes;
  240. } else if (get_remaining().starts_with("/")) {
  241. state = State::PathOrAuthority;
  242. ++iterator;
  243. } else {
  244. url->m_cannot_be_a_base_url = true;
  245. url->append_path("");
  246. state = State::CannotBeABaseUrlPath;
  247. }
  248. } else {
  249. buffer.clear();
  250. state = State::NoScheme;
  251. iterator = input.begin();
  252. continue;
  253. }
  254. break;
  255. case State::NoScheme:
  256. if (!base_url || (base_url->m_cannot_be_a_base_url && code_point != '#')) {
  257. report_validation_error();
  258. return {};
  259. } else if (base_url->m_cannot_be_a_base_url && code_point == '#') {
  260. url->m_scheme = base_url->m_scheme;
  261. url->m_paths = base_url->m_paths;
  262. url->m_query = base_url->m_query;
  263. url->m_fragment = "";
  264. url->m_cannot_be_a_base_url = true;
  265. state = State::Fragment;
  266. } else if (base_url->m_scheme != "file") {
  267. state = State::Relative;
  268. continue;
  269. } else {
  270. state = State::File;
  271. continue;
  272. }
  273. break;
  274. case State::SpecialRelativeOrAuthority:
  275. if (code_point == '/' && get_remaining().starts_with("/")) {
  276. state = State::SpecialAuthorityIgnoreSlashes;
  277. ++iterator;
  278. } else {
  279. report_validation_error();
  280. state = State::Relative;
  281. continue;
  282. }
  283. break;
  284. case State::PathOrAuthority:
  285. if (code_point == '/') {
  286. state = State::Authority;
  287. } else {
  288. state = State::Path;
  289. continue;
  290. }
  291. break;
  292. case State::Relative:
  293. url->m_scheme = base_url->m_scheme;
  294. if (code_point == '/') {
  295. state = State::RelativeSlash;
  296. } else if (url->is_special() && code_point == '\\') {
  297. report_validation_error();
  298. state = State::RelativeSlash;
  299. } else {
  300. url->m_username = base_url->m_username;
  301. url->m_password = base_url->m_password;
  302. url->m_host = base_url->m_host;
  303. url->m_port = base_url->m_port;
  304. url->m_paths = base_url->m_paths;
  305. url->m_query = base_url->m_query;
  306. if (code_point == '?') {
  307. url->m_query = "";
  308. state = State::Query;
  309. } else if (code_point == '#') {
  310. url->m_fragment = "";
  311. state = State::Fragment;
  312. } else if (code_point != end_of_file) {
  313. url->m_query = {};
  314. if (url->m_paths.size())
  315. url->m_paths.remove(url->m_paths.size() - 1);
  316. state = State::Path;
  317. continue;
  318. }
  319. }
  320. break;
  321. case State::RelativeSlash:
  322. if (url->is_special() && (code_point == '/' || code_point == '\\')) {
  323. if (code_point == '\\')
  324. report_validation_error();
  325. state = State::SpecialAuthorityIgnoreSlashes;
  326. } else if (code_point == '/') {
  327. state = State::Authority;
  328. } else {
  329. url->m_username = base_url->m_username;
  330. url->m_password = base_url->m_password;
  331. url->m_host = base_url->m_host;
  332. url->m_port = base_url->m_port;
  333. state = State::Path;
  334. continue;
  335. }
  336. break;
  337. case State::SpecialAuthoritySlashes:
  338. if (code_point == '/' && get_remaining().starts_with("/")) {
  339. state = State::SpecialAuthorityIgnoreSlashes;
  340. ++iterator;
  341. } else {
  342. report_validation_error();
  343. state = State::SpecialAuthorityIgnoreSlashes;
  344. continue;
  345. }
  346. break;
  347. case State::SpecialAuthorityIgnoreSlashes:
  348. if (code_point != '/' && code_point != '\\') {
  349. state = State::Authority;
  350. continue;
  351. } else {
  352. report_validation_error();
  353. }
  354. break;
  355. case State::Authority:
  356. if (code_point == '@') {
  357. report_validation_error();
  358. if (at_sign_seen) {
  359. auto content = buffer.to_string();
  360. buffer.clear();
  361. buffer.append("%40");
  362. buffer.append(content);
  363. }
  364. at_sign_seen = true;
  365. StringBuilder builder;
  366. for (auto c : Utf8View(builder.string_view())) {
  367. if (c == ':' && !password_token_seen) {
  368. password_token_seen = true;
  369. continue;
  370. }
  371. builder.clear();
  372. if (password_token_seen) {
  373. builder.append(url->password());
  374. URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
  375. // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
  376. url->m_password = URL::percent_decode(builder.string_view());
  377. } else {
  378. builder.append(url->username());
  379. URL::append_percent_encoded_if_necessary(builder, c, URL::PercentEncodeSet::Userinfo);
  380. // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
  381. url->m_username = URL::percent_decode(builder.string_view());
  382. }
  383. }
  384. buffer.clear();
  385. } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
  386. if (at_sign_seen && buffer.is_empty()) {
  387. report_validation_error();
  388. return {};
  389. }
  390. // NOTE: This decreases the iterator by the number of code points in buffer plus one.
  391. iterator = input.iterator_at_byte_offset(iterator - input.begin() - buffer.length() - 1);
  392. buffer.clear();
  393. state = State::Host;
  394. } else {
  395. buffer.append_code_point(code_point);
  396. }
  397. break;
  398. case State::Host:
  399. case State::Hostname:
  400. if (code_point == ':' && !inside_brackets) {
  401. if (buffer.is_empty()) {
  402. report_validation_error();
  403. return {};
  404. }
  405. auto host = parse_host(buffer.string_view(), !url->is_special());
  406. if (!host.has_value())
  407. return {};
  408. url->m_host = host.release_value();
  409. buffer.clear();
  410. state = State::Port;
  411. } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
  412. if (url->is_special() && buffer.is_empty()) {
  413. report_validation_error();
  414. return {};
  415. }
  416. auto host = parse_host(buffer.string_view(), !url->is_special());
  417. if (!host.has_value())
  418. return {};
  419. url->m_host = host.value();
  420. buffer.clear();
  421. state = State::Port;
  422. continue;
  423. } else if (code_point == '[') {
  424. inside_brackets = true;
  425. } else if (code_point == ']') {
  426. inside_brackets = false;
  427. } else {
  428. buffer.append_code_point(code_point);
  429. }
  430. break;
  431. case State::Port:
  432. if (is_ascii_digit(code_point)) {
  433. buffer.append_code_point(code_point);
  434. } else if (code_point == end_of_file || code_point == '/' || code_point == '?' || code_point == '#' || (url->is_special() && code_point == '\\')) {
  435. if (!buffer.is_empty()) {
  436. auto port = buffer.string_view().to_uint();
  437. if (!port.has_value() || port.value() > 65535) {
  438. report_validation_error();
  439. return {};
  440. }
  441. if (port.value() == URL::default_port_for_scheme(url->scheme()))
  442. url->m_port = {};
  443. else
  444. url->m_port = port.value();
  445. buffer.clear();
  446. }
  447. state = State::PathStart;
  448. continue;
  449. } else {
  450. report_validation_error();
  451. return {};
  452. }
  453. break;
  454. case State::File:
  455. url->m_scheme = "file";
  456. url->m_host = "";
  457. if (code_point == '/' || code_point == '\\') {
  458. if (code_point == '\\')
  459. report_validation_error();
  460. state = State::FileSlash;
  461. } else if (base_url && base_url->m_scheme == "file") {
  462. url->m_host = base_url->m_host;
  463. url->m_paths = base_url->m_paths;
  464. url->m_query = base_url->m_query;
  465. if (code_point == '?') {
  466. url->m_query = "";
  467. state = State::Query;
  468. } else if (code_point == '#') {
  469. url->m_fragment = "";
  470. state = State::Fragment;
  471. } else if (code_point != end_of_file) {
  472. url->m_query = {};
  473. auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
  474. if (!starts_with_windows_drive_letter(substring_from_pointer)) {
  475. if (!url->paths().is_empty() && !(url->scheme() == "file" && url->paths().size() == 1 && is_normalized_windows_drive_letter(url->paths()[0])))
  476. url->m_paths.remove(url->m_paths.size() - 1);
  477. } else {
  478. report_validation_error();
  479. url->m_paths.clear();
  480. }
  481. state = State::Path;
  482. continue;
  483. }
  484. }
  485. break;
  486. case State::FileSlash:
  487. if (code_point == '/' || code_point == '\\') {
  488. if (code_point == '\\')
  489. report_validation_error();
  490. state = State::FileHost;
  491. } else if (base_url && base_url->m_scheme == "file") {
  492. url->m_host = base_url->m_host;
  493. auto substring_from_pointer = input.substring_view(iterator - input.begin()).as_string();
  494. if (!starts_with_windows_drive_letter(substring_from_pointer) && is_normalized_windows_drive_letter(base_url->m_paths[0]))
  495. url->append_path(base_url->m_paths[0]);
  496. state = State::Path;
  497. continue;
  498. }
  499. break;
  500. case State::FileHost:
  501. if (code_point == end_of_file || code_point == '/' || code_point == '\\' || code_point == '?' || code_point == '#') {
  502. if (is_windows_drive_letter(buffer.string_view())) {
  503. report_validation_error();
  504. state = State::Path;
  505. } else if (buffer.is_empty()) {
  506. url->m_host = "";
  507. state = State::PathStart;
  508. } else {
  509. auto host = parse_host(buffer.string_view(), true);
  510. if (!host.has_value())
  511. return {};
  512. if (host.value() == "localhost")
  513. host = "";
  514. url->m_host = host.release_value();
  515. buffer.clear();
  516. state = State::PathStart;
  517. }
  518. continue;
  519. } else {
  520. buffer.append_code_point(code_point);
  521. }
  522. break;
  523. case State::PathStart:
  524. if (url->is_special()) {
  525. if (code_point == '\\')
  526. report_validation_error();
  527. state = State::Path;
  528. if (code_point != '/' && code_point != '\\')
  529. continue;
  530. } else if (code_point == '?') {
  531. url->m_query = "";
  532. state = State::Query;
  533. } else if (code_point == '#') {
  534. url->m_fragment = "";
  535. state = State::Fragment;
  536. } else if (code_point != end_of_file) {
  537. state = State::Path;
  538. if (code_point != '/')
  539. continue;
  540. }
  541. break;
  542. case State::Path:
  543. if (code_point == end_of_file || code_point == '/' || (url->is_special() && code_point == '\\') || code_point == '?' || code_point == '#') {
  544. if (url->is_special() && code_point == '\\')
  545. report_validation_error();
  546. if (is_double_dot_path_segment(buffer.string_view())) {
  547. if (!url->m_paths.is_empty() && !(url->m_scheme == "file" && url->m_paths.size() == 1 && is_normalized_windows_drive_letter(url->m_paths[0])))
  548. url->m_paths.remove(url->m_paths.size() - 1);
  549. if (code_point != '/' && !(url->is_special() && code_point == '\\'))
  550. url->append_path("");
  551. } else if (is_single_dot_path_segment(buffer.string_view()) && code_point != '/' && !(url->is_special() && code_point == '\\')) {
  552. url->append_path("");
  553. } else if (!is_single_dot_path_segment(buffer.string_view())) {
  554. if (url->m_scheme == "file" && url->m_paths.is_empty() && is_windows_drive_letter(buffer.string_view())) {
  555. auto drive_letter = buffer.string_view()[0];
  556. buffer.clear();
  557. buffer.append(drive_letter);
  558. buffer.append(':');
  559. }
  560. // NOTE: This needs to be percent decoded since the member variables contain decoded data.
  561. url->append_path(URL::percent_decode(buffer.string_view()));
  562. }
  563. buffer.clear();
  564. if (code_point == '?') {
  565. url->m_query = "";
  566. state = State::Query;
  567. } else if (code_point == '#') {
  568. url->m_fragment = "";
  569. state = State::Fragment;
  570. }
  571. } else {
  572. if (!is_url_code_point(code_point) && code_point != '%')
  573. report_validation_error();
  574. // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
  575. URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::Path);
  576. }
  577. break;
  578. case State::CannotBeABaseUrlPath:
  579. // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path on EOF.
  580. // NOTE: Verify that the assumptions required for this simplification are correct.
  581. VERIFY(url->m_paths.size() == 1 && url->m_paths[0].is_empty());
  582. if (code_point == '?') {
  583. // NOTE: This needs to be percent decoded since the member variables contain decoded data.
  584. url->m_paths[0] = URL::percent_decode(buffer.string_view());
  585. url->m_query = "";
  586. state = State::Query;
  587. } else if (code_point == '#') {
  588. // NOTE: This needs to be percent decoded since the member variables contain decoded data.
  589. url->m_paths[0] = URL::percent_decode(buffer.string_view());
  590. url->m_fragment = "";
  591. state = State::Fragment;
  592. } else {
  593. if (code_point != end_of_file && !is_url_code_point(code_point) && code_point != '%')
  594. report_validation_error();
  595. // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
  596. if (code_point != end_of_file) {
  597. URL::append_percent_encoded_if_necessary(buffer, code_point, URL::PercentEncodeSet::C0Control);
  598. } else {
  599. // NOTE: This needs to be percent decoded since the member variables contain decoded data.
  600. url->m_paths[0] = URL::percent_decode(buffer.string_view());
  601. }
  602. }
  603. break;
  604. case State::Query:
  605. if (code_point == end_of_file || code_point == '#') {
  606. VERIFY(url->m_query == "");
  607. auto query_percent_encode_set = url->is_special() ? URL::PercentEncodeSet::SpecialQuery : URL::PercentEncodeSet::Query;
  608. // NOTE: This is has to be encoded and then decoded because the original sequence could contain already percent-encoded sequences.
  609. url->m_query = URL::percent_decode(URL::percent_encode(buffer.string_view(), query_percent_encode_set));
  610. buffer.clear();
  611. if (code_point == '#') {
  612. url->m_fragment = "";
  613. state = State::Fragment;
  614. }
  615. } else if (code_point != end_of_file) {
  616. if (!is_url_code_point(code_point) && code_point != '%')
  617. report_validation_error();
  618. // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
  619. buffer.append_code_point(code_point);
  620. }
  621. break;
  622. case State::Fragment:
  623. // NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
  624. if (code_point != end_of_file) {
  625. if (!is_url_code_point(code_point) && code_point != '%')
  626. report_validation_error();
  627. // FIXME: If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
  628. buffer.append_code_point(code_point);
  629. } else {
  630. // NOTE: This needs to be percent decoded since the member variables contain decoded data.
  631. url->m_fragment = URL::percent_decode(buffer.string_view());
  632. buffer.clear();
  633. }
  634. break;
  635. default:
  636. VERIFY_NOT_REACHED();
  637. }
  638. if (iterator.done())
  639. break;
  640. ++iterator;
  641. }
  642. url->m_valid = true;
  643. dbgln_if(URL_PARSER_DEBUG, "URLParser::parse: Parsed URL to be '{}'.", url->serialize());
  644. return url.release_value();
  645. }
  646. }