Tokenizer.cpp

/*
 * Copyright (c) 2020-2022, the SerenityOS developers.
 * Copyright (c) 2021-2023, Sam Atkins <atkinssj@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/CharacterTypes.h>
#include <AK/Debug.h>
#include <AK/FloatingPointStringConversions.h>
#include <AK/SourceLocation.h>
#include <AK/Vector.h>
#include <LibTextCodec/Decoder.h>
#include <LibWeb/CSS/Parser/Tokenizer.h>
#include <LibWeb/Infra/Strings.h>

namespace Web::CSS::Parser {

// U+FFFD REPLACEMENT CHARACTER (�)
#define REPLACEMENT_CHARACTER 0xFFFD
static constexpr u32 TOKENIZER_EOF = 0xFFFFFFFF;

static inline void log_parse_error(SourceLocation const& location = SourceLocation::current())
{
    dbgln_if(CSS_TOKENIZER_DEBUG, "Parse error (css tokenization) {} ", location);
}

static inline bool is_eof(u32 code_point)
{
    return code_point == TOKENIZER_EOF;
}

static inline bool is_quotation_mark(u32 code_point)
{
    return code_point == 0x22;
}

static inline bool is_greater_than_maximum_allowed_code_point(u32 code_point)
{
    return code_point > 0x10FFFF;
}

static inline bool is_low_line(u32 code_point)
{
    return code_point == 0x5F;
}

// https://www.w3.org/TR/css-syntax-3/#ident-start-code-point
static inline bool is_ident_start_code_point(u32 code_point)
{
    // FIXME: We use !is_ascii() for "non-ASCII code point" in the spec, but it's not quite right -
    //        it treats EOF as a valid code point! The spec also lacks a definition of code point.
    //        For now, the !is_eof() check is a hack, but it should work.
    return !is_eof(code_point) && (is_ascii_alpha(code_point) || !is_ascii(code_point) || is_low_line(code_point));
}

static inline bool is_hyphen_minus(u32 code_point)
{
    return code_point == 0x2D;
}

// https://www.w3.org/TR/css-syntax-3/#ident-code-point
static inline bool is_ident_code_point(u32 code_point)
{
    return is_ident_start_code_point(code_point) || is_ascii_digit(code_point) || is_hyphen_minus(code_point);
}

static inline bool is_non_printable(u32 code_point)
{
    return code_point <= 0x8 || code_point == 0xB || (code_point >= 0xE && code_point <= 0x1F) || code_point == 0x7F;
}

static inline bool is_number_sign(u32 code_point)
{
    return code_point == 0x23;
}

static inline bool is_reverse_solidus(u32 code_point)
{
    return code_point == 0x5C;
}

static inline bool is_apostrophe(u32 code_point)
{
    return code_point == 0x27;
}

static inline bool is_left_paren(u32 code_point)
{
    return code_point == 0x28;
}

static inline bool is_right_paren(u32 code_point)
{
    return code_point == 0x29;
}

static inline bool is_plus_sign(u32 code_point)
{
    return code_point == 0x2B;
}

static inline bool is_comma(u32 code_point)
{
    return code_point == 0x2C;
}

static inline bool is_full_stop(u32 code_point)
{
    return code_point == 0x2E;
}

static inline bool is_newline(u32 code_point)
{
    return code_point == 0xA;
}

static inline bool is_asterisk(u32 code_point)
{
    return code_point == 0x2A;
}

static inline bool is_solidus(u32 code_point)
{
    return code_point == 0x2F;
}

static inline bool is_colon(u32 code_point)
{
    return code_point == 0x3A;
}

static inline bool is_semicolon(u32 code_point)
{
    return code_point == 0x3B;
}

static inline bool is_less_than_sign(u32 code_point)
{
    return code_point == 0x3C;
}

static inline bool is_greater_than_sign(u32 code_point)
{
    return code_point == 0x3E;
}

static inline bool is_at(u32 code_point)
{
    return code_point == 0x40;
}

static inline bool is_open_square_bracket(u32 code_point)
{
    return code_point == 0x5B;
}

static inline bool is_closed_square_bracket(u32 code_point)
{
    return code_point == 0x5D;
}

static inline bool is_open_curly_bracket(u32 code_point)
{
    return code_point == 0x7B;
}

static inline bool is_closed_curly_bracket(u32 code_point)
{
    return code_point == 0x7D;
}

static inline bool is_whitespace(u32 code_point)
{
    return code_point == 0x9 || code_point == 0xA || code_point == 0x20;
}

static inline bool is_percent(u32 code_point)
{
    return code_point == 0x25;
}

static inline bool is_exclamation_mark(u32 code_point)
{
    return code_point == 0x21;
}

static inline bool is_e(u32 code_point)
{
    return code_point == 0x65;
}

static inline bool is_E(u32 code_point)
{
    return code_point == 0x45;
}
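
// NOTE: This is the main entry point: it decodes the raw byte stream to UTF-8 and applies the
//       spec's "filter code points" preprocessing (normalizing CR, CRLF and FF to LF, and
//       replacing NULL and surrogate code points with U+FFFD) before tokenizing the result.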

Vector<Token> Tokenizer::tokenize(StringView input, StringView encoding)
{
    // https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
    auto filter_code_points = [](StringView input, auto encoding) -> String {
        auto decoder = TextCodec::decoder_for(encoding);
        VERIFY(decoder.has_value());

        auto decoded_input = MUST(decoder->to_utf8(input));

        // OPTIMIZATION: If the input doesn't contain any CR or FF, we can skip the filtering
        bool const contains_cr_or_ff = [&] {
            for (auto byte : decoded_input.bytes()) {
                if (byte == '\r' || byte == '\f')
                    return true;
            }
            return false;
        }();
        if (!contains_cr_or_ff) {
            return decoded_input;
        }

        StringBuilder builder { input.length() };
        bool last_was_carriage_return = false;

        // To filter code points from a stream of (unfiltered) code points input:
        for (auto code_point : decoded_input.code_points()) {
            // Replace any U+000D CARRIAGE RETURN (CR) code points,
            // U+000C FORM FEED (FF) code points,
            // or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE FEED (LF)
            // in input by a single U+000A LINE FEED (LF) code point.
            if (code_point == '\r') {
                if (last_was_carriage_return) {
                    builder.append('\n');
                } else {
                    last_was_carriage_return = true;
                }
            } else {
                if (last_was_carriage_return)
                    builder.append('\n');

                if (code_point == '\n') {
                    if (!last_was_carriage_return)
                        builder.append('\n');
                } else if (code_point == '\f') {
                    builder.append('\n');
                    // Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT CHARACTER (�).
                } else if (code_point == 0x00 || (code_point >= 0xD800 && code_point <= 0xDFFF)) {
                    builder.append_code_point(REPLACEMENT_CHARACTER);
                } else {
                    builder.append_code_point(code_point);
                }

                last_was_carriage_return = false;
            }
        }
        return builder.to_string_without_validation();
    };

    Tokenizer tokenizer { filter_code_points(input, encoding) };
    return tokenizer.tokenize();
}

Tokenizer::Tokenizer(String decoded_input)
    : m_decoded_input(move(decoded_input))
    , m_utf8_view(m_decoded_input)
    , m_utf8_iterator(m_utf8_view.begin())
{
}

Vector<Token> Tokenizer::tokenize()
{
    Vector<Token> tokens;
    for (;;) {
        auto token_start = m_position;
        auto token = consume_a_token();
        token.m_start_position = token_start;
        token.m_end_position = m_position;
        tokens.append(token);

        if (token.is(Token::Type::EndOfFile)) {
            return tokens;
        }
    }
}
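
// NOTE: next_code_point() consumes one code point from the input stream. It remembers the
//       previous iterator (so the code point can be reconsumed later) and keeps the current
//       line/column up to date for token positions. Once the input is exhausted it returns
//       TOKENIZER_EOF.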

u32 Tokenizer::next_code_point()
{
    if (m_utf8_iterator == m_utf8_view.end())
        return TOKENIZER_EOF;
    m_prev_utf8_iterator = m_utf8_iterator;
    ++m_utf8_iterator;
    auto code_point = *m_prev_utf8_iterator;

    m_prev_position = m_position;
    if (is_newline(code_point)) {
        m_position.line++;
        m_position.column = 0;
    } else {
        m_position.column++;
    }

    dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Next code_point: {:d}", code_point);
    return code_point;
}
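
// NOTE: The peek_* helpers look ahead without consuming anything; offset 0 is the next input
//       code point. Positions past the end of the input are reported as TOKENIZER_EOF.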

u32 Tokenizer::peek_code_point(size_t offset) const
{
    auto it = m_utf8_iterator;
    for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
        ++it;
    if (it == m_utf8_view.end())
        return TOKENIZER_EOF;
    dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek code_point: {:d}", *it);
    return *it;
}

U32Twin Tokenizer::peek_twin() const
{
    U32Twin values { TOKENIZER_EOF, TOKENIZER_EOF };
    auto it = m_utf8_iterator;
    for (size_t i = 0; i < 2 && it != m_utf8_view.end(); ++i) {
        values.set(i, *it);
        ++it;
    }
    dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek twin: {:d},{:d}", values.first, values.second);
    return values;
}

U32Triplet Tokenizer::peek_triplet() const
{
    U32Triplet values { TOKENIZER_EOF, TOKENIZER_EOF, TOKENIZER_EOF };
    auto it = m_utf8_iterator;
    for (size_t i = 0; i < 3 && it != m_utf8_view.end(); ++i) {
        values.set(i, *it);
        ++it;
    }
    dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek triplet: {:d},{:d},{:d}", values.first, values.second, values.third);
    return values;
}
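
// NOTE: The spec's algorithms often operate on "the current input code point and the next one
//       (or two)". These helpers rebuild that view by reconsuming the current code point and
//       then reading it back together with the lookahead.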

U32Twin Tokenizer::start_of_input_stream_twin()
{
    U32Twin twin;
    // FIXME: Reconsuming just to read the current code point again is weird.
    reconsume_current_input_code_point();
    twin.first = next_code_point();
    twin.second = peek_code_point();
    return twin;
}

U32Triplet Tokenizer::start_of_input_stream_triplet()
{
    U32Triplet triplet;
    // FIXME: Reconsuming just to read the current code point again is weird.
    reconsume_current_input_code_point();
    triplet.first = next_code_point();
    auto next_two = peek_twin();
    triplet.second = next_two.first;
    triplet.third = next_two.second;
    return triplet;
}
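
// NOTE: create_new_token() only sets the token's type; the value, representation and source
//       positions are filled in by the callers (or by the create_value_token() overloads below).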

Token Tokenizer::create_new_token(Token::Type type)
{
    Token token = {};
    token.m_type = type;
    return token;
}

Token Tokenizer::create_eof_token()
{
    return create_new_token(Token::Type::EndOfFile);
}

Token Tokenizer::create_value_token(Token::Type type, FlyString&& value, String&& representation)
{
    auto token = create_new_token(type);
    token.m_value = move(value);
    token.m_representation = move(representation);
    return token;
}

Token Tokenizer::create_value_token(Token::Type type, u32 value, String&& representation)
{
    auto token = create_new_token(type);
    token.m_value = String::from_code_point(value);
    token.m_representation = move(representation);
    return token;
}

// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
u32 Tokenizer::consume_escaped_code_point()
{
    // This section describes how to consume an escaped code point.
    // It assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed and that the next
    // input code point has already been verified to be part of a valid escape.
    // It will return a code point.

    // Consume the next input code point.
    auto input = next_code_point();

    // hex digit
    if (is_ascii_hex_digit(input)) {
        // Consume as many hex digits as possible, but no more than 5.
        // Note that this means 1-6 hex digits have been consumed in total.
        StringBuilder builder;
        builder.append_code_point(input);

        size_t counter = 0;
        while (is_ascii_hex_digit(peek_code_point()) && counter++ < 5) {
            builder.append_code_point(next_code_point());
        }

        // If the next input code point is whitespace, consume it as well.
        if (is_whitespace(peek_code_point())) {
            (void)next_code_point();
        }

        // Interpret the hex digits as a hexadecimal number.
        auto unhexed = AK::StringUtils::convert_to_uint_from_hex<u32>(builder.string_view()).value_or(0);

        // If this number is zero, or is for a surrogate, or is greater than the maximum allowed
        // code point, return U+FFFD REPLACEMENT CHARACTER (�).
        if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) {
            return REPLACEMENT_CHARACTER;
        }

        // Otherwise, return the code point with that value.
        return unhexed;
    }

    // EOF
    if (is_eof(input)) {
        // This is a parse error. Return U+FFFD REPLACEMENT CHARACTER (�).
        log_parse_error();
        return REPLACEMENT_CHARACTER;
    }

    // anything else
    // Return the current input code point.
    return input;
}

// https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token
Token Tokenizer::consume_an_ident_like_token()
{
    // This section describes how to consume an ident-like token from a stream of code points.
    // It returns an <ident-token>, <function-token>, <url-token>, or <bad-url-token>.

    // Consume an ident sequence, and let string be the result.
    auto start_byte_offset = current_byte_offset();
    auto string = consume_an_ident_sequence();

    // If string’s value is an ASCII case-insensitive match for "url", and the next input code
    // point is U+0028 LEFT PARENTHESIS ((), consume it.
    if (Infra::is_ascii_case_insensitive_match(string, "url"sv) && is_left_paren(peek_code_point())) {
        (void)next_code_point();

        // While the next two input code points are whitespace, consume the next input code point.
        for (;;) {
            auto maybe_whitespace = peek_twin();
            if (!(is_whitespace(maybe_whitespace.first) && is_whitespace(maybe_whitespace.second))) {
                break;
            }

            (void)next_code_point();
        }

        // If the next one or two input code points are U+0022 QUOTATION MARK ("), U+0027 APOSTROPHE ('),
        // or whitespace followed by U+0022 QUOTATION MARK (") or U+0027 APOSTROPHE ('), then create a
        // <function-token> with its value set to string and return it.
        auto next_two = peek_twin();
        if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) {
            return create_value_token(Token::Type::Function, move(string), input_since(start_byte_offset));
        }

        // Otherwise, consume a url token, and return it.
        return consume_a_url_token();
    }

    // Otherwise, if the next input code point is U+0028 LEFT PARENTHESIS ((), consume it.
    if (is_left_paren(peek_code_point())) {
        (void)next_code_point();

        // Create a <function-token> with its value set to string and return it.
        return create_value_token(Token::Type::Function, move(string), input_since(start_byte_offset));
    }

    // Otherwise, create an <ident-token> with its value set to string and return it.
    return create_value_token(Token::Type::Ident, move(string), input_since(start_byte_offset));
}

// https://www.w3.org/TR/css-syntax-3/#consume-number
Number Tokenizer::consume_a_number()
{
    // This section describes how to consume a number from a stream of code points.
    // It returns a numeric value, and a type which is either "integer" or "number".
    //
    // Note: This algorithm does not do the verification of the first few code points
    // that are necessary to ensure a number can be obtained from the stream. Ensure
    // that the stream starts with a number before calling this algorithm.

    // Execute the following steps in order:

    // 1. Initially set type to "integer". Let repr be the empty string.
    StringBuilder repr;
    Number::Type type = Number::Type::Integer;

    // 2. If the next input code point is U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS (-),
    //    consume it and append it to repr.
    bool has_explicit_sign = false;
    auto next_input = peek_code_point();
    if (is_plus_sign(next_input) || is_hyphen_minus(next_input)) {
        has_explicit_sign = true;
        repr.append_code_point(next_code_point());
    }

    // 3. While the next input code point is a digit, consume it and append it to repr.
    for (;;) {
        auto digits = peek_code_point();
        if (!is_ascii_digit(digits))
            break;

        repr.append_code_point(next_code_point());
    }

    // 4. If the next 2 input code points are U+002E FULL STOP (.) followed by a digit, then:
    auto maybe_number = peek_twin();
    if (is_full_stop(maybe_number.first) && is_ascii_digit(maybe_number.second)) {
        // 1. Consume them.
        // 2. Append them to repr.
        repr.append_code_point(next_code_point());
        repr.append_code_point(next_code_point());

        // 3. Set type to "number".
        type = Number::Type::Number;

        // 4. While the next input code point is a digit, consume it and append it to repr.
        for (;;) {
            auto digit = peek_code_point();
            if (!is_ascii_digit(digit))
                break;

            repr.append_code_point(next_code_point());
        }
    }

    // 5. If the next 2 or 3 input code points are U+0045 LATIN CAPITAL LETTER E (E) or
    //    U+0065 LATIN SMALL LETTER E (e), optionally followed by U+002D HYPHEN-MINUS (-)
    //    or U+002B PLUS SIGN (+), followed by a digit, then:
    auto maybe_exp = peek_triplet();
    if ((is_E(maybe_exp.first) || is_e(maybe_exp.first))
        && (((is_plus_sign(maybe_exp.second) || is_hyphen_minus(maybe_exp.second)) && is_ascii_digit(maybe_exp.third))
            || (is_ascii_digit(maybe_exp.second)))) {
        // 1. Consume them.
        // 2. Append them to repr.
        if (is_plus_sign(maybe_exp.second) || is_hyphen_minus(maybe_exp.second)) {
            if (is_ascii_digit(maybe_exp.third)) {
                repr.append_code_point(next_code_point());
                repr.append_code_point(next_code_point());
                repr.append_code_point(next_code_point());
            }
        } else if (is_ascii_digit(maybe_exp.second)) {
            repr.append_code_point(next_code_point());
            repr.append_code_point(next_code_point());
        }

        // 3. Set type to "number".
        type = Number::Type::Number;

        // 4. While the next input code point is a digit, consume it and append it to repr.
        for (;;) {
            auto digits = peek_code_point();
            if (!is_ascii_digit(digits))
                break;

            repr.append_code_point(next_code_point());
        }
    }

    // 6. Convert repr to a number, and set the value to the returned value.
    auto value = convert_a_string_to_a_number(repr.string_view());

    // 7. Return value and type.
    if (type == Number::Type::Integer && has_explicit_sign)
        return Number { Number::Type::IntegerWithExplicitSign, value };
    return Number { type, value };
}
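
// For example, consuming a number from "+12.5e3" produces repr = "+12.5e3", type "number" and
// value 12500, while a plain integer with a leading sign, such as "+42", is returned with the
// IntegerWithExplicitSign type flag.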

// https://www.w3.org/TR/css-syntax-3/#convert-string-to-number
double Tokenizer::convert_a_string_to_a_number(StringView string)
{
    // FIXME: We already found the whole part, fraction part and exponent during
    //        validation, we could probably skip re-parsing the string here.
    return string.to_number<double>(AK::TrimWhitespace::No).release_value();
}

// https://www.w3.org/TR/css-syntax-3/#consume-name
FlyString Tokenizer::consume_an_ident_sequence()
{
    // This section describes how to consume an ident sequence from a stream of code points.
    // It returns a string containing the largest name that can be formed from adjacent
    // code points in the stream, starting from the first.
    //
    // Note: This algorithm does not do the verification of the first few code points that
    // are necessary to ensure the returned code points would constitute an <ident-token>.
    // If that is the intended use, ensure that the stream starts with an ident sequence before
    // calling this algorithm.

    // Let result initially be an empty string.
    StringBuilder result;

    // Repeatedly consume the next input code point from the stream:
    for (;;) {
        auto input = next_code_point();
        if (is_eof(input))
            break;

        // name code point
        if (is_ident_code_point(input)) {
            // Append the code point to result.
            result.append_code_point(input);
            continue;
        }

        // the stream starts with a valid escape
        if (is_valid_escape_sequence(start_of_input_stream_twin())) {
            // Consume an escaped code point. Append the returned code point to result.
            result.append_code_point(consume_escaped_code_point());
            continue;
        }

        // anything else
        // Reconsume the current input code point. Return result.
        reconsume_current_input_code_point();
        break;
    }

    return result.to_fly_string_without_validation();
}

// https://www.w3.org/TR/css-syntax-3/#consume-url-token
Token Tokenizer::consume_a_url_token()
{
    // This section describes how to consume a url token from a stream of code points.
    // It returns either a <url-token> or a <bad-url-token>.
    //
    // Note: This algorithm assumes that the initial "url(" has already been consumed.
    // This algorithm also assumes that it’s being called to consume an "unquoted" value,
    // like url(foo). A quoted value, like url("foo"), is parsed as a <function-token>.
    // Consume an ident-like token automatically handles this distinction; this algorithm
    // shouldn’t be called directly otherwise.

    // 1. Initially create a <url-token> with its value set to the empty string.
    auto start_byte_offset = current_byte_offset();
    auto token = create_new_token(Token::Type::Url);
    StringBuilder builder;

    // 2. Consume as much whitespace as possible.
    consume_as_much_whitespace_as_possible();

    auto make_token = [&]() -> Token {
        token.m_value = builder.to_fly_string_without_validation();
        token.m_representation = input_since(start_byte_offset);
        return token;
    };

    // 3. Repeatedly consume the next input code point from the stream:
    for (;;) {
        auto input = next_code_point();

        // U+0029 RIGHT PARENTHESIS ())
        if (is_right_paren(input)) {
            // Return the <url-token>.
            return make_token();
        }

        // EOF
        if (is_eof(input)) {
            // This is a parse error. Return the <url-token>.
            log_parse_error();
            return make_token();
        }

        // whitespace
        if (is_whitespace(input)) {
            // Consume as much whitespace as possible.
            consume_as_much_whitespace_as_possible();

            // If the next input code point is U+0029 RIGHT PARENTHESIS ()) or EOF, consume it
            // and return the <url-token> (if EOF was encountered, this is a parse error);
            input = peek_code_point();

            if (is_right_paren(input)) {
                (void)next_code_point();
                return make_token();
            }

            if (is_eof(input)) {
                (void)next_code_point();
                log_parse_error();
                return make_token();
            }

            // otherwise, consume the remnants of a bad url, create a <bad-url-token>, and return it.
            consume_the_remnants_of_a_bad_url();
            auto bad_url_token = create_new_token(Token::Type::BadUrl);
            bad_url_token.m_representation = input_since(start_byte_offset);
            return bad_url_token;
        }

        // U+0022 QUOTATION MARK (")
        // U+0027 APOSTROPHE (')
        // U+0028 LEFT PARENTHESIS (()
        // non-printable code point
        if (is_quotation_mark(input) || is_apostrophe(input) || is_left_paren(input) || is_non_printable(input)) {
            // This is a parse error. Consume the remnants of a bad url, create a <bad-url-token>, and return it.
            log_parse_error();
            consume_the_remnants_of_a_bad_url();
            auto bad_url_token = create_new_token(Token::Type::BadUrl);
            bad_url_token.m_representation = input_since(start_byte_offset);
            return bad_url_token;
        }

        // U+005C REVERSE SOLIDUS (\)
        if (is_reverse_solidus(input)) {
            // If the stream starts with a valid escape,
            if (is_valid_escape_sequence(start_of_input_stream_twin())) {
                // consume an escaped code point and append the returned code point to the <url-token>’s value.
                builder.append_code_point(consume_escaped_code_point());
                continue;
            } else {
                // Otherwise, this is a parse error.
                log_parse_error();

                // Consume the remnants of a bad url, create a <bad-url-token>, and return it.
                consume_the_remnants_of_a_bad_url();
                auto bad_url_token = create_new_token(Token::Type::BadUrl);
                bad_url_token.m_representation = input_since(start_byte_offset);
                return bad_url_token;
            }
        }

        // anything else
        // Append the current input code point to the <url-token>’s value.
        builder.append_code_point(input);
    }
}

// https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url
void Tokenizer::consume_the_remnants_of_a_bad_url()
{
    // This section describes how to consume the remnants of a bad url from a stream of code points,
    // "cleaning up" after the tokenizer realizes that it’s in the middle of a <bad-url-token> rather
    // than a <url-token>. It returns nothing; its sole use is to consume enough of the input stream
    // to reach a recovery point where normal tokenizing can resume.

    // Repeatedly consume the next input code point from the stream:
    for (;;) {
        auto input = next_code_point();

        // U+0029 RIGHT PARENTHESIS ())
        // EOF
        if (is_eof(input) || is_right_paren(input)) {
            // Return.
            return;
        }

        // the input stream starts with a valid escape
        if (is_valid_escape_sequence(start_of_input_stream_twin())) {
            // Consume an escaped code point.
            // This allows an escaped right parenthesis ("\)") to be encountered without ending
            // the <bad-url-token>. This is otherwise identical to the "anything else" clause.
            (void)consume_escaped_code_point();
        }

        // anything else
        // Do nothing.
    }
}
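
// NOTE: Two small stream helpers used throughout the tokenizer: one skips a run of whitespace,
//       the other pushes the most recently consumed code point back so it can be read again.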

void Tokenizer::consume_as_much_whitespace_as_possible()
{
    while (is_whitespace(peek_code_point())) {
        (void)next_code_point();
    }
}

void Tokenizer::reconsume_current_input_code_point()
{
    m_utf8_iterator = m_prev_utf8_iterator;
    m_position = m_prev_position;
}

// https://www.w3.org/TR/css-syntax-3/#consume-numeric-token
Token Tokenizer::consume_a_numeric_token()
{
    // This section describes how to consume a numeric token from a stream of code points.
    // It returns either a <number-token>, <percentage-token>, or <dimension-token>.
    auto start_byte_offset = current_byte_offset();

    // Consume a number and let number be the result.
    auto number = consume_a_number();

    // If the next 3 input code points would start an ident sequence, then:
    if (would_start_an_ident_sequence(peek_triplet())) {
        // 1. Create a <dimension-token> with the same value and type flag as number,
        //    and a unit set initially to the empty string.
        auto token = create_new_token(Token::Type::Dimension);
        token.m_number_value = number;

        // 2. Consume an ident sequence. Set the <dimension-token>’s unit to the returned value.
        auto unit = consume_an_ident_sequence();
        VERIFY(!unit.is_empty());
        // NOTE: We intentionally store this in the `value`, to save space.
        token.m_value = move(unit);

        // 3. Return the <dimension-token>.
        token.m_representation = input_since(start_byte_offset);
        return token;
    }

    // Otherwise, if the next input code point is U+0025 PERCENTAGE SIGN (%), consume it.
    if (is_percent(peek_code_point())) {
        (void)next_code_point();

        // Create a <percentage-token> with the same value as number, and return it.
        auto token = create_new_token(Token::Type::Percentage);
        token.m_number_value = number;
        token.m_representation = input_since(start_byte_offset);
        return token;
    }

    // Otherwise, create a <number-token> with the same value and type flag as number, and return it.
    auto token = create_new_token(Token::Type::Number);
    token.m_number_value = number;
    token.m_representation = input_since(start_byte_offset);
    return token;
}

// https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
bool Tokenizer::would_start_a_number(U32Triplet values)
{
    // This section describes how to check if three code points would start a number.
    // The algorithm described here can be called explicitly with three code points,
    // or can be called with the input stream itself. In the latter case, the three
    // code points in question are the current input code point and the next two input
    // code points, in that order.
    //
    // Note: This algorithm will not consume any additional code points.

    // Look at the first code point:

    // U+002B PLUS SIGN (+)
    // U+002D HYPHEN-MINUS (-)
    if (is_plus_sign(values.first) || is_hyphen_minus(values.first)) {
        // If the second code point is a digit, return true.
        if (is_ascii_digit(values.second))
            return true;

        // Otherwise, if the second code point is a U+002E FULL STOP (.) and the third
        // code point is a digit, return true.
        if (is_full_stop(values.second) && is_ascii_digit(values.third))
            return true;

        // Otherwise, return false.
        return false;
    }

    // U+002E FULL STOP (.)
    if (is_full_stop(values.first))
        // If the second code point is a digit, return true. Otherwise, return false.
        return is_ascii_digit(values.second);

    // digit
    if (is_ascii_digit(values.first))
        // Return true.
        return true;

    // anything else
    // Return false.
    return false;
}
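
// For example, "+1", "-.5" and ".5" would all start a number, while "+a" or a lone "." would not.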

// https://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape
bool Tokenizer::is_valid_escape_sequence(U32Twin values)
{
    // This section describes how to check if two code points are a valid escape.
    // The algorithm described here can be called explicitly with two code points,
    // or can be called with the input stream itself. In the latter case, the two
    // code points in question are the current input code point and the next input
    // code point, in that order.
    //
    // Note: This algorithm will not consume any additional code point.

    // If the first code point is not U+005C REVERSE SOLIDUS (\), return false.
    if (!is_reverse_solidus(values.first))
        return false;

    // Otherwise, if the second code point is a newline, return false.
    if (is_newline(values.second))
        return false;

    // Otherwise, return true.
    return true;
}

// https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
bool Tokenizer::would_start_an_ident_sequence(U32Triplet values)
{
    // This section describes how to check if three code points would start an ident sequence.
    // The algorithm described here can be called explicitly with three code points, or
    // can be called with the input stream itself. In the latter case, the three code
    // points in question are the current input code point and the next two input code
    // points, in that order.
    //
    // Note: This algorithm will not consume any additional code points.

    // Look at the first code point:

    // U+002D HYPHEN-MINUS
    if (is_hyphen_minus(values.first)) {
        // If the second code point is a name-start code point or a U+002D HYPHEN-MINUS,
        // or the second and third code points are a valid escape, return true.
        if (is_ident_start_code_point(values.second) || is_hyphen_minus(values.second) || is_valid_escape_sequence(values.to_twin_23()))
            return true;

        // Otherwise, return false.
        return false;
    }

    // name-start code point
    if (is_ident_start_code_point(values.first)) {
        // Return true.
        return true;
    }

    // U+005C REVERSE SOLIDUS (\)
    if (is_reverse_solidus(values.first)) {
        // If the first and second code points are a valid escape, return true.
        if (is_valid_escape_sequence(values.to_twin_12()))
            return true;

        // Otherwise, return false.
        return false;
    }

    // anything else
    // Return false.
    return false;
}
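
// For example, "foo", "-foo", "--bar" and "\66 oo" would all start an ident sequence,
// while "-1" would not (it starts a number instead).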

// https://www.w3.org/TR/css-syntax-3/#consume-string-token
Token Tokenizer::consume_string_token(u32 ending_code_point)
{
    // This section describes how to consume a string token from a stream of code points.
    // It returns either a <string-token> or <bad-string-token>.
    //
    // This algorithm may be called with an ending code point, which denotes the code point
    // that ends the string. If an ending code point is not specified, the current input
    // code point is used.

    // Initially create a <string-token> with its value set to the empty string.
    auto start_byte_offset = current_byte_offset();
    auto token = create_new_token(Token::Type::String);
    StringBuilder builder;

    auto make_token = [&]() -> Token {
        token.m_value = builder.to_fly_string_without_validation();
        token.m_representation = input_since(start_byte_offset);
        return token;
    };

    // Repeatedly consume the next input code point from the stream:
    for (;;) {
        auto input = next_code_point();

        // ending code point
        if (input == ending_code_point)
            return make_token();

        // EOF
        if (is_eof(input)) {
            // This is a parse error. Return the <string-token>.
            log_parse_error();
            return make_token();
        }

        // newline
        if (is_newline(input)) {
            // This is a parse error. Reconsume the current input code point, create a
            // <bad-string-token>, and return it.
            reconsume_current_input_code_point();
            auto bad_string_token = create_new_token(Token::Type::BadString);
            bad_string_token.m_representation = input_since(start_byte_offset);
            return bad_string_token;
        }

        // U+005C REVERSE SOLIDUS (\)
        if (is_reverse_solidus(input)) {
            // If the next input code point is EOF, do nothing.
            auto next_input = peek_code_point();
            if (is_eof(next_input))
                continue;

            // Otherwise, if the next input code point is a newline, consume it.
            if (is_newline(next_input)) {
                (void)next_code_point();
                continue;
            }

            // Otherwise, (the stream starts with a valid escape) consume an escaped code
            // point and append the returned code point to the <string-token>’s value.
            auto escaped = consume_escaped_code_point();
            builder.append_code_point(escaped);
            continue;
        }

        // anything else
        // Append the current input code point to the <string-token>’s value.
        builder.append_code_point(input);
    }
}

// https://www.w3.org/TR/css-syntax-3/#consume-comment
void Tokenizer::consume_comments()
{
    // This section describes how to consume comments from a stream of code points.
    // It returns nothing.

start:
    // If the next two input code points are U+002F SOLIDUS (/) followed by a U+002A ASTERISK (*),
    // consume them and all following code points up to and including the first U+002A ASTERISK (*)
    // followed by a U+002F SOLIDUS (/), or up to an EOF code point. Return to the start of this step.
    //
    // If the preceding paragraph ended by consuming an EOF code point, this is a parse error.
    //
    // Return nothing.
    auto twin = peek_twin();
    if (!(is_solidus(twin.first) && is_asterisk(twin.second)))
        return;

    (void)next_code_point();
    (void)next_code_point();

    for (;;) {
        auto twin_inner = peek_twin();
        if (is_eof(twin_inner.first) || is_eof(twin_inner.second)) {
            log_parse_error();
            return;
        }

        if (is_asterisk(twin_inner.first) && is_solidus(twin_inner.second)) {
            (void)next_code_point();
            (void)next_code_point();
            goto start;
        }

        (void)next_code_point();
    }
}
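
// NOTE: consume_a_token() is the tokenizer's main dispatch: after discarding comments it looks
//       at the next input code point and hands off to the string, numeric, ident-like, url and
//       simple-token paths below, falling back to a <delim-token> for anything unrecognized.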

// https://www.w3.org/TR/css-syntax-3/#consume-token
Token Tokenizer::consume_a_token()
{
    // This section describes how to consume a token from a stream of code points.
    // It will return a single token of any type.

    // Consume comments.
    consume_comments();

    // Consume the next input code point.
    auto start_byte_offset = current_byte_offset();
    auto input = next_code_point();

    // whitespace
    if (is_whitespace(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is whitespace");

        // Consume as much whitespace as possible. Return a <whitespace-token>.
        consume_as_much_whitespace_as_possible();
        auto token = create_new_token(Token::Type::Whitespace);
        token.m_representation = input_since(start_byte_offset);
        return token;
    }

    // U+0022 QUOTATION MARK (")
    if (is_quotation_mark(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is quotation mark");

        // Consume a string token and return it.
        return consume_string_token(input);
    }

    // U+0023 NUMBER SIGN (#)
    if (is_number_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is number sign");

        // If the next input code point is an ident code point or the next two input code points
        // are a valid escape, then:
        auto next_input = peek_code_point();
        auto maybe_escape = peek_twin();

        if (is_ident_code_point(next_input) || is_valid_escape_sequence(maybe_escape)) {
            // 1. Create a <hash-token>.
            auto token = create_new_token(Token::Type::Hash);

            // 2. If the next 3 input code points would start an ident sequence, set the <hash-token>’s
            //    type flag to "id".
            if (would_start_an_ident_sequence(peek_triplet()))
                token.m_hash_type = Token::HashType::Id;

            // 3. Consume an ident sequence, and set the <hash-token>’s value to the returned string.
            auto name = consume_an_ident_sequence();
            token.m_value = move(name);

            // 4. Return the <hash-token>.
            token.m_representation = input_since(start_byte_offset);
            return token;
        }

        // Otherwise, return a <delim-token> with its value set to the current input code point.
        return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset));
    }

    // U+0027 APOSTROPHE (')
    if (is_apostrophe(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is apostrophe");

        // Consume a string token and return it.
        return consume_string_token(input);
    }

    // U+0028 LEFT PARENTHESIS (()
    if (is_left_paren(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is left paren");

        // Return a <(-token>.
        Token token = create_new_token(Token::Type::OpenParen);
        token.m_representation = input_since(start_byte_offset);
        return token;
    }

    // U+0029 RIGHT PARENTHESIS ())
    if (is_right_paren(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is right paren");

        // Return a <)-token>.
        Token token = create_new_token(Token::Type::CloseParen);
        token.m_representation = input_since(start_byte_offset);
        return token;
    }

    // U+002B PLUS SIGN (+)
    if (is_plus_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is plus sign");

        // If the input stream starts with a number, reconsume the current input code point,
        // consume a numeric token and return it.
        if (would_start_a_number(start_of_input_stream_triplet())) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }

        // Otherwise, return a <delim-token> with its value set to the current input code point.
        return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset));
    }

    // U+002C COMMA (,)
    if (is_comma(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is comma");

        // Return a <comma-token>.
        Token token = create_new_token(Token::Type::Comma);
        token.m_representation = input_since(start_byte_offset);
        return token;
    }

    // U+002D HYPHEN-MINUS (-)
    if (is_hyphen_minus(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is hyphen minus");

        // If the input stream starts with a number, reconsume the current input code point,
        // consume a numeric token, and return it.
        if (would_start_a_number(start_of_input_stream_triplet())) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }

        // Otherwise, if the next 2 input code points are U+002D HYPHEN-MINUS U+003E
        // GREATER-THAN SIGN (->), consume them and return a <CDC-token>.
        auto next_twin = peek_twin();
        if (is_hyphen_minus(next_twin.first) && is_greater_than_sign(next_twin.second)) {
            (void)next_code_point();
            (void)next_code_point();

            Token token = create_new_token(Token::Type::CDC);
            token.m_representation = input_since(start_byte_offset);
            return token;
        }

        // Otherwise, if the input stream starts with an identifier, reconsume the current
        // input code point, consume an ident-like token, and return it.
        if (would_start_an_ident_sequence(start_of_input_stream_triplet())) {
            reconsume_current_input_code_point();
            return consume_an_ident_like_token();
        }

        // Otherwise, return a <delim-token> with its value set to the current input code point.
        return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset));
    }

    // U+002E FULL STOP (.)
    if (is_full_stop(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is full stop");

        // If the input stream starts with a number, reconsume the current input code point,
        // consume a numeric token, and return it.
        if (would_start_a_number(start_of_input_stream_triplet())) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }

        // Otherwise, return a <delim-token> with its value set to the current input code point.
        return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset));
    }

    // U+003A COLON (:)
    if (is_colon(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is colon");

        // Return a <colon-token>.
        Token token = create_new_token(Token::Type::Colon);
        token.m_representation = input_since(start_byte_offset);
        return token;
    }

    // U+003B SEMICOLON (;)
    if (is_semicolon(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is semicolon");

        // Return a <semicolon-token>.
        Token token = create_new_token(Token::Type::Semicolon);
        token.m_representation = input_since(start_byte_offset);
        return token;
    }

    // U+003C LESS-THAN SIGN (<)
    if (is_less_than_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is less than");

        // If the next 3 input code points are U+0021 EXCLAMATION MARK U+002D HYPHEN-MINUS
        // U+002D HYPHEN-MINUS (!--), consume them and return a <CDO-token>.
        auto maybe_cdo = peek_triplet();
        if (is_exclamation_mark(maybe_cdo.first) && is_hyphen_minus(maybe_cdo.second) && is_hyphen_minus(maybe_cdo.third)) {
            (void)next_code_point();
            (void)next_code_point();
            (void)next_code_point();

            Token token = create_new_token(Token::Type::CDO);
            token.m_representation = input_since(start_byte_offset);
            return token;
        }

        // Otherwise, return a <delim-token> with its value set to the current input code point.
        return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset));
    }

    // U+0040 COMMERCIAL AT (@)
    if (is_at(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is at");

        // If the next 3 input code points would start an ident sequence, consume an ident sequence, create
        // an <at-keyword-token> with its value set to the returned value, and return it.
        if (would_start_an_ident_sequence(peek_triplet())) {
            auto name = consume_an_ident_sequence();
            return create_value_token(Token::Type::AtKeyword, move(name), input_since(start_byte_offset));
        }

        // Otherwise, return a <delim-token> with its value set to the current input code point.
        return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset));
    }

    // U+005B LEFT SQUARE BRACKET ([)
    if (is_open_square_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is open square");

        // Return a <[-token>.
        Token token = create_new_token(Token::Type::OpenSquare);
        token.m_representation = input_since(start_byte_offset);
        return token;
    }

    // U+005C REVERSE SOLIDUS (\)
    if (is_reverse_solidus(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is reverse solidus");

        // If the input stream starts with a valid escape, reconsume the current input code point,
        // consume an ident-like token, and return it.
        if (is_valid_escape_sequence(start_of_input_stream_twin())) {
            reconsume_current_input_code_point();
            return consume_an_ident_like_token();
        }

        // Otherwise, this is a parse error. Return a <delim-token> with its value set to the
        // current input code point.
        log_parse_error();
        return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset));
    }

    // U+005D RIGHT SQUARE BRACKET (])
    if (is_closed_square_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is closed square");

        // Return a <]-token>.
        Token token = create_new_token(Token::Type::CloseSquare);
        token.m_representation = input_since(start_byte_offset);
        return token;
    }

    // U+007B LEFT CURLY BRACKET ({)
    if (is_open_curly_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is open curly");

        // Return a <{-token>.
        Token token = create_new_token(Token::Type::OpenCurly);
        token.m_representation = input_since(start_byte_offset);
        return token;
    }

    // U+007D RIGHT CURLY BRACKET (})
    if (is_closed_curly_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is closed curly");

        // Return a <}-token>.
        Token token = create_new_token(Token::Type::CloseCurly);
        token.m_representation = input_since(start_byte_offset);
        return token;
    }

    // digit
    if (is_ascii_digit(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is digit");

        // Reconsume the current input code point, consume a numeric token, and return it.
        reconsume_current_input_code_point();
        return consume_a_numeric_token();
    }

    // name-start code point
    if (is_ident_start_code_point(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is name start");

        // Reconsume the current input code point, consume an ident-like token, and return it.
        reconsume_current_input_code_point();
        return consume_an_ident_like_token();
    }

    // EOF
    if (is_eof(input)) {
        // Return an <EOF-token>.
        return create_eof_token();
    }

    // anything else
    dbgln_if(CSS_TOKENIZER_DEBUG, "is delimiter");

    // Return a <delim-token> with its value set to the current input code point.
    return create_value_token(Token::Type::Delim, input, input_since(start_byte_offset));
}
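
// NOTE: Token representations are produced as views into the decoded input, so input_since()
//       returns a substring that shares the original string's buffer rather than copying it.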

size_t Tokenizer::current_byte_offset() const
{
    return m_utf8_iterator.ptr() - m_utf8_view.bytes();
}

String Tokenizer::input_since(size_t offset) const
{
    return MUST(m_decoded_input.substring_from_byte_offset_with_shared_superstring(offset, current_byte_offset() - offset));
}

}