Tokenizer.cpp 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952
  1. /*
  2. * Copyright (c) 2020-2021, the SerenityOS developers.
  3. * Copyright (c) 2021, Sam Atkins <atkinssj@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/Debug.h>
  9. #include <AK/SourceLocation.h>
  10. #include <AK/Vector.h>
  11. #include <LibTextCodec/Decoder.h>
  12. #include <LibWeb/CSS/Parser/Tokenizer.h>
// U+FFFD REPLACEMENT CHARACTER (�)
#define REPLACEMENT_CHARACTER 0xFFFD
// Sentinel returned when the input stream is exhausted. Not a valid
// Unicode code point, so it cannot collide with decoded input.
static const u32 TOKENIZER_EOF = 0xFFFFFFFF;
// Logs a CSS-tokenization parse error, tagged with the call site, when
// CSS_TOKENIZER_DEBUG is enabled. The default argument captures the
// caller's SourceLocation automatically.
static inline void log_parse_error(const SourceLocation& location = SourceLocation::current())
{
dbgln_if(CSS_TOKENIZER_DEBUG, "Parse error (css tokenization) {} ", location);
}
  20. static inline bool is_eof(u32 code_point)
  21. {
  22. return code_point == TOKENIZER_EOF;
  23. }
  24. static inline bool is_quotation_mark(u32 code_point)
  25. {
  26. return code_point == 0x22;
  27. }
  28. static inline bool is_greater_than_maximum_allowed_code_point(u32 code_point)
  29. {
  30. return code_point > 0x10FFFF;
  31. }
  32. static inline bool is_low_line(u32 code_point)
  33. {
  34. return code_point == 0x5F;
  35. }
  36. static inline bool is_name_start_code_point(u32 code_point)
  37. {
  38. // FIXME: We use !is_ascii() for "non-ASCII code point" in the spec, but it's not quite right -
  39. // it treats EOF as a valid! The spec also lacks a definition of code point. For now, the
  40. // !is_eof() check is a hack, but it should work.
  41. return !is_eof(code_point) && (is_ascii_alpha(code_point) || !is_ascii(code_point) || is_low_line(code_point));
  42. }
  43. static inline bool is_hyphen_minus(u32 code_point)
  44. {
  45. return code_point == 0x2D;
  46. }
  47. static inline bool is_name_code_point(u32 code_point)
  48. {
  49. return is_name_start_code_point(code_point) || is_ascii_digit(code_point) || is_hyphen_minus(code_point);
  50. }
  51. static inline bool is_non_printable(u32 code_point)
  52. {
  53. return code_point <= 0x8 || code_point == 0xB || (code_point >= 0xE && code_point <= 0x1F) || code_point == 0x7F;
  54. }
  55. static inline bool is_number_sign(u32 code_point)
  56. {
  57. return code_point == 0x23;
  58. }
  59. static inline bool is_reverse_solidus(u32 code_point)
  60. {
  61. return code_point == 0x5C;
  62. }
  63. static inline bool is_apostrophe(u32 code_point)
  64. {
  65. return code_point == 0x27;
  66. }
  67. static inline bool is_left_paren(u32 code_point)
  68. {
  69. return code_point == 0x28;
  70. }
  71. static inline bool is_right_paren(u32 code_point)
  72. {
  73. return code_point == 0x29;
  74. }
  75. static inline bool is_plus_sign(u32 code_point)
  76. {
  77. return code_point == 0x2B;
  78. }
  79. static inline bool is_comma(u32 code_point)
  80. {
  81. return code_point == 0x2C;
  82. }
  83. static inline bool is_full_stop(u32 code_point)
  84. {
  85. return code_point == 0x2E;
  86. }
  87. static inline bool is_newline(u32 code_point)
  88. {
  89. return code_point == 0xA;
  90. }
  91. static inline bool is_asterisk(u32 code_point)
  92. {
  93. return code_point == 0x2A;
  94. }
  95. static inline bool is_solidus(u32 code_point)
  96. {
  97. return code_point == 0x2F;
  98. }
  99. static inline bool is_colon(u32 code_point)
  100. {
  101. return code_point == 0x3A;
  102. }
  103. static inline bool is_semicolon(u32 code_point)
  104. {
  105. return code_point == 0x3B;
  106. }
  107. static inline bool is_less_than_sign(u32 code_point)
  108. {
  109. return code_point == 0x3C;
  110. }
  111. static inline bool is_greater_than_sign(u32 code_point)
  112. {
  113. return code_point == 0x3E;
  114. }
  115. static inline bool is_at(u32 code_point)
  116. {
  117. return code_point == 0x40;
  118. }
  119. static inline bool is_open_square_bracket(u32 code_point)
  120. {
  121. return code_point == 0x5B;
  122. }
  123. static inline bool is_closed_square_bracket(u32 code_point)
  124. {
  125. return code_point == 0x5D;
  126. }
  127. static inline bool is_open_curly_bracket(u32 code_point)
  128. {
  129. return code_point == 0x7B;
  130. }
  131. static inline bool is_closed_curly_bracket(u32 code_point)
  132. {
  133. return code_point == 0x7D;
  134. }
  135. static inline bool is_whitespace(u32 code_point)
  136. {
  137. return code_point == 0x9 || code_point == 0xA || code_point == 0x20;
  138. }
  139. static inline bool is_percent(u32 code_point)
  140. {
  141. return code_point == 0x25;
  142. }
  143. static inline bool is_exclamation_mark(u32 code_point)
  144. {
  145. return code_point == 0x21;
  146. }
  147. static inline bool is_e(u32 code_point)
  148. {
  149. return code_point == 0x65;
  150. }
  151. static inline bool is_E(u32 code_point)
  152. {
  153. return code_point == 0x45;
  154. }
  155. namespace Web::CSS {
  156. Tokenizer::Tokenizer(const StringView& input, const String& encoding)
  157. {
  158. auto* decoder = TextCodec::decoder_for(encoding);
  159. VERIFY(decoder);
  160. StringBuilder builder(input.length());
  161. // Preprocess the stream, by doing the following:
  162. // - Replace \r, \f and \r\n with \n
  163. // - replace \0 and anything between U+D800 to U+DFFF with the replacement
  164. // character.
  165. // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
  166. bool last_was_carriage_return = false;
  167. decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) {
  168. if (code_point == '\r') {
  169. if (last_was_carriage_return) {
  170. builder.append('\n');
  171. } else {
  172. last_was_carriage_return = true;
  173. }
  174. } else {
  175. if (last_was_carriage_return) {
  176. builder.append('\n');
  177. }
  178. if (code_point == '\n') {
  179. if (!last_was_carriage_return) {
  180. builder.append('\n');
  181. }
  182. } else if (code_point == '\f') {
  183. builder.append('\n');
  184. } else if (code_point >= 0xD800 && code_point <= 0xDFFF) {
  185. builder.append_code_point(REPLACEMENT_CHARACTER);
  186. } else {
  187. builder.append_code_point(code_point);
  188. }
  189. last_was_carriage_return = false;
  190. }
  191. });
  192. m_decoded_input = builder.to_string();
  193. m_utf8_view = Utf8View(m_decoded_input);
  194. m_utf8_iterator = m_utf8_view.begin();
  195. }
  196. Vector<Token> Tokenizer::parse()
  197. {
  198. Vector<Token> tokens;
  199. for (;;) {
  200. auto token_start = m_position;
  201. auto token = consume_a_token();
  202. token.m_start_position = token_start;
  203. token.m_end_position = m_position;
  204. tokens.append(token);
  205. if (token.is(Token::Type::EndOfFile)) {
  206. return tokens;
  207. }
  208. }
  209. }
// Consumes and returns the next code point, advancing the iterator and
// the line/column position. Returns TOKENIZER_EOF when input is exhausted.
u32 Tokenizer::next_code_point()
{
    if (m_utf8_iterator == m_utf8_view.end())
        return TOKENIZER_EOF;
    // Save the iterator and position *before* advancing, so that
    // reconsume_current_input_code_point() can step back exactly one
    // code point.
    m_prev_utf8_iterator = m_utf8_iterator;
    ++m_utf8_iterator;
    auto code_point = *m_prev_utf8_iterator;
    m_prev_position = m_position;
    // Newlines reset the column; everything else advances it.
    if (is_newline(code_point)) {
        m_position.line++;
        m_position.column = 0;
    } else {
        m_position.column++;
    }
    dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Next code_point: {:d}", code_point);
    return code_point;
}
  227. u32 Tokenizer::peek_code_point(size_t offset) const
  228. {
  229. auto it = m_utf8_iterator;
  230. for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
  231. ++it;
  232. if (it == m_utf8_view.end())
  233. return TOKENIZER_EOF;
  234. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek code_point: {:d}", *m_prev_utf8_iterator);
  235. return *it;
  236. }
  237. U32Twin Tokenizer::peek_twin() const
  238. {
  239. U32Twin values { TOKENIZER_EOF, TOKENIZER_EOF };
  240. auto it = m_utf8_iterator;
  241. for (size_t i = 0; i < 2 && it != m_utf8_view.end(); ++i) {
  242. values.set(i, *it);
  243. ++it;
  244. }
  245. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek twin: {:d},{:d}", values.first, values.second);
  246. return values;
  247. }
  248. U32Triplet Tokenizer::peek_triplet() const
  249. {
  250. U32Triplet values { TOKENIZER_EOF, TOKENIZER_EOF, TOKENIZER_EOF };
  251. auto it = m_utf8_iterator;
  252. for (size_t i = 0; i < 3 && it != m_utf8_view.end(); ++i) {
  253. values.set(i, *it);
  254. ++it;
  255. }
  256. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek triplet: {:d},{:d},{:d}", values.first, values.second, values.third);
  257. return values;
  258. }
  259. Token Tokenizer::create_new_token(Token::Type type)
  260. {
  261. Token token = {};
  262. token.m_type = type;
  263. return token;
  264. }
  265. Token Tokenizer::create_eof_token()
  266. {
  267. return create_new_token(Token::Type::EndOfFile);
  268. }
  269. Token Tokenizer::create_value_token(Token::Type type, String value)
  270. {
  271. Token token;
  272. token.m_type = type;
  273. token.m_value.append(move(value));
  274. return token;
  275. }
  276. Token Tokenizer::create_value_token(Token::Type type, u32 value)
  277. {
  278. Token token = {};
  279. token.m_type = type;
  280. token.m_value.append_code_point(value);
  281. return token;
  282. }
// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
// Assumes the "\" has already been consumed and begins a valid escape.
// Returns the decoded code point, or U+FFFD on EOF or invalid values.
u32 Tokenizer::consume_escaped_code_point()
{
    auto input = next_code_point();
    if (is_eof(input)) {
        // EOF after "\" is a parse error; the spec says return U+FFFD.
        log_parse_error();
        return REPLACEMENT_CHARACTER;
    }
    if (is_ascii_hex_digit(input)) {
        StringBuilder builder;
        builder.append_code_point(input);
        // Consume up to 5 more hex digits (6 total), per spec.
        size_t counter = 0;
        while (is_ascii_hex_digit(peek_code_point()) && counter++ < 5) {
            builder.append_code_point(next_code_point());
        }
        // A single whitespace after the hex digits belongs to the escape.
        if (is_whitespace(peek_code_point())) {
            (void)next_code_point();
        }
        auto unhexed = strtoul(builder.to_string().characters(), nullptr, 16);
        // Zero, surrogates and values above U+10FFFF map to U+FFFD.
        if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) {
            return REPLACEMENT_CHARACTER;
        }
        return unhexed;
    }
    if (!input) {
        // NUL should not reach here (replaced during preprocessing);
        // defensively treated as a parse error.
        log_parse_error();
        return REPLACEMENT_CHARACTER;
    }
    // Anything else: the escaped code point is the code point itself.
    return input;
}
// https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token
// Consumes an ident-like token: an Ident, a Function (name followed by
// "("), a Url, or — for quoted "url(...)" — a Function token whose url
// string will be tokenized separately.
Token Tokenizer::consume_an_ident_like_token()
{
    auto string = consume_a_name();
    if (string.equals_ignoring_case("url") && is_left_paren(peek_code_point())) {
        (void)next_code_point();
        // Per spec: while the next TWO input code points are whitespace,
        // consume one — this intentionally leaves at most one whitespace
        // before the quote check below.
        for (;;) {
            auto maybe_whitespace = peek_twin();
            if (!(is_whitespace(maybe_whitespace.first) && is_whitespace(maybe_whitespace.second))) {
                break;
            }
            (void)next_code_point();
        }
        auto next_two = peek_twin();
        // if one of these ", ', ' "', " '"
        if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) {
            // Quoted url: emit a Function token; the string inside will be
            // consumed as a normal string token later.
            return create_value_token(Token::Type::Function, string);
        }
        // Unquoted url: consume the rest as a url token.
        return consume_a_url_token();
    }
    if (is_left_paren(peek_code_point())) {
        (void)next_code_point();
        return create_value_token(Token::Type::Function, string);
    }
    return create_value_token(Token::Type::Ident, string);
}
// https://www.w3.org/TR/css-syntax-3/#consume-number
// Consumes a number, returning its textual representation and whether it
// is an integer or a (fractional/exponential) number. Assumes the stream
// starts with a number.
CSSNumber Tokenizer::consume_a_number()
{
    StringBuilder repr;
    Token::NumberType type = Token::NumberType::Integer;
    // Optional leading sign.
    auto next_input = peek_code_point();
    if (is_plus_sign(next_input) || is_hyphen_minus(next_input)) {
        repr.append_code_point(next_code_point());
    }
    // Integer part.
    for (;;) {
        auto digits = peek_code_point();
        if (!is_ascii_digit(digits))
            break;
        repr.append_code_point(next_code_point());
    }
    // Fractional part: "." followed by at least one digit.
    auto maybe_number = peek_twin();
    if (is_full_stop(maybe_number.first) && is_ascii_digit(maybe_number.second)) {
        repr.append_code_point(next_code_point());
        repr.append_code_point(next_code_point());
        type = Token::NumberType::Number;
        for (;;) {
            auto digit = peek_code_point();
            if (!is_ascii_digit(digit))
                break;
            repr.append_code_point(next_code_point());
        }
    }
    // Exponent: "e"/"E", optional sign, at least one digit.
    auto maybe_exp = peek_triplet();
    if (is_E(maybe_exp.first) || is_e(maybe_exp.first)) {
        if (is_plus_sign(maybe_exp.second) || is_hyphen_minus(maybe_exp.second)) {
            if (is_ascii_digit(maybe_exp.third)) {
                repr.append_code_point(next_code_point());
                repr.append_code_point(next_code_point());
                repr.append_code_point(next_code_point());
            }
        } else if (is_ascii_digit(maybe_exp.second)) {
            repr.append_code_point(next_code_point());
            repr.append_code_point(next_code_point());
        }
        // NOTE(review): type is set to Number even when no exponent digits
        // were consumed above (e.g. "12em" hits this branch via 'e') —
        // the spec only switches type when an exponent is actually
        // consumed; verify downstream effect.
        type = Token::NumberType::Number;
        for (;;) {
            auto digits = peek_code_point();
            if (!is_ascii_digit(digits))
                break;
            repr.append_code_point(next_code_point());
        }
    }
    return { repr.to_string(), type };
}
// https://www.w3.org/TR/css-syntax-3/#consume-name
// Consumes a name: the longest run of name code points and valid escapes.
// The code point that ends the name is reconsumed for the next token.
String Tokenizer::consume_a_name()
{
    StringBuilder result;
    for (;;) {
        auto input = next_code_point();
        if (is_eof(input))
            break;
        if (is_name_code_point(input)) {
            result.append_code_point(input);
            continue;
        }
        // `input` may be a backslash starting a valid escape; decode it.
        auto next = peek_code_point();
        if (!is_eof(next) && is_valid_escape_sequence({ input, next })) {
            result.append_code_point(consume_escaped_code_point());
            continue;
        }
        // Anything else ends the name; put it back for the caller.
        reconsume_current_input_code_point();
        break;
    }
    return result.to_string();
}
  410. Token Tokenizer::consume_a_url_token()
  411. {
  412. auto token = create_new_token(Token::Type::Url);
  413. consume_as_much_whitespace_as_possible();
  414. for (;;) {
  415. auto input = peek_code_point();
  416. if (is_eof(input)) {
  417. log_parse_error();
  418. return token;
  419. }
  420. if (is_right_paren(input)) {
  421. (void)next_code_point();
  422. return token;
  423. }
  424. if (is_whitespace(input)) {
  425. consume_as_much_whitespace_as_possible();
  426. input = peek_code_point();
  427. if (is_eof(input)) {
  428. log_parse_error();
  429. return token;
  430. }
  431. if (is_right_paren(input)) {
  432. return token;
  433. }
  434. consume_the_remnants_of_a_bad_url();
  435. return create_new_token(Token::Type::BadUrl);
  436. }
  437. if (is_quotation_mark(input) || is_apostrophe(input) || is_left_paren(input) || is_non_printable(input)) {
  438. log_parse_error();
  439. (void)next_code_point();
  440. consume_the_remnants_of_a_bad_url();
  441. return create_new_token(Token::Type::BadUrl);
  442. }
  443. if (is_reverse_solidus(input)) {
  444. if (is_valid_escape_sequence(peek_twin())) {
  445. token.m_value.append_code_point(consume_escaped_code_point());
  446. } else {
  447. log_parse_error();
  448. (void)next_code_point();
  449. consume_the_remnants_of_a_bad_url();
  450. return create_new_token(Token::Type::BadUrl);
  451. }
  452. }
  453. token.m_value.append_code_point(next_code_point());
  454. }
  455. }
// https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url
// Discards input until the next ")" or EOF, so the tokenizer can recover
// after a BadUrl. Escaped ")" must not terminate the scan.
void Tokenizer::consume_the_remnants_of_a_bad_url()
{
    for (;;) {
        auto next = peek_code_point();
        if (is_eof(next)) {
            return;
        }
        auto input = next;
        if (is_right_paren(input)) {
            (void)next_code_point();
            return;
        }
        // NOTE(review): the spec consumes the backslash before calling
        // consume-escaped-code-point; here the twin (backslash, following)
        // is checked and consume_escaped_code_point() ends up consuming
        // the backslash itself. Since all values are discarded, the net
        // effect (two code points skipped, so an escaped ")" does not end
        // the scan) still matches — but verify against the spec.
        if (is_valid_escape_sequence(peek_twin())) {
            [[maybe_unused]] auto cp = consume_escaped_code_point();
        }
        (void)next_code_point();
    }
}
  475. void Tokenizer::consume_as_much_whitespace_as_possible()
  476. {
  477. while (is_whitespace(peek_code_point())) {
  478. (void)next_code_point();
  479. }
  480. }
// Pushes the most recently consumed code point back onto the stream.
// Only a single step back is supported: m_prev_utf8_iterator /
// m_prev_position hold exactly one previous state (saved by
// next_code_point()), so calling this twice in a row is not meaningful.
void Tokenizer::reconsume_current_input_code_point()
{
    m_utf8_iterator = m_prev_utf8_iterator;
    m_position = m_prev_position;
}
// https://www.w3.org/TR/css-syntax-3/#consume-numeric-token
// Consumes a numeric token: a Dimension (number + unit ident), a
// Percentage (number + "%"), or a plain Number.
Token Tokenizer::consume_a_numeric_token()
{
    auto number = consume_a_number();
    if (would_start_an_identifier()) {
        auto token = create_new_token(Token::Type::Dimension);
        token.m_value.append(number.value);
        token.m_number_type = number.type;
        auto unit = consume_a_name();
        // would_start_an_identifier() guaranteed a real unit name follows.
        VERIFY(!unit.is_empty() && !unit.is_whitespace());
        token.m_unit.append(unit);
        return token;
    }
    if (is_percent(peek_code_point())) {
        (void)next_code_point();
        auto token = create_new_token(Token::Type::Percentage);
        token.m_value.append(number.value);
        return token;
    }
    auto token = create_new_token(Token::Type::Number);
    token.m_value.append(number.value);
    token.m_number_type = number.type;
    return token;
}
  510. bool Tokenizer::would_start_a_number() const
  511. {
  512. return would_start_a_number(peek_triplet());
  513. }
  514. // https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
  515. bool Tokenizer::would_start_a_number(U32Triplet values)
  516. {
  517. if (is_plus_sign(values.first) || is_hyphen_minus(values.first)) {
  518. if (is_ascii_digit(values.second))
  519. return true;
  520. if (is_full_stop(values.second) && is_ascii_digit(values.third))
  521. return true;
  522. return false;
  523. }
  524. if (is_full_stop(values.first))
  525. return is_ascii_digit(values.second);
  526. if (is_ascii_digit(values.first))
  527. return true;
  528. return false;
  529. }
  530. // https://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape
  531. bool Tokenizer::is_valid_escape_sequence(U32Twin values)
  532. {
  533. if (!is_reverse_solidus(values.first)) {
  534. return false;
  535. }
  536. if (is_newline(values.second)) {
  537. return false;
  538. }
  539. return true;
  540. }
  541. bool Tokenizer::would_start_an_identifier()
  542. {
  543. return would_start_an_identifier(peek_triplet());
  544. }
  545. // https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
  546. bool Tokenizer::would_start_an_identifier(U32Triplet values)
  547. {
  548. if (is_hyphen_minus(values.first)) {
  549. if (is_name_start_code_point(values.second) || is_hyphen_minus(values.second) || is_valid_escape_sequence(values.to_twin_23()))
  550. return true;
  551. return false;
  552. }
  553. if (is_name_start_code_point(values.first)) {
  554. return true;
  555. }
  556. if (is_reverse_solidus(values.first)) {
  557. if (is_valid_escape_sequence(values.to_twin_12()))
  558. return true;
  559. return false;
  560. }
  561. return false;
  562. }
  563. // https://www.w3.org/TR/css-syntax-3/#consume-string-token
  564. Token Tokenizer::consume_string_token(u32 ending_code_point)
  565. {
  566. auto token = create_new_token(Token::Type::String);
  567. for (;;) {
  568. auto input = next_code_point();
  569. if (is_eof(input)) {
  570. log_parse_error();
  571. return token;
  572. }
  573. if (input == ending_code_point)
  574. return token;
  575. if (is_newline(input)) {
  576. reconsume_current_input_code_point();
  577. return create_new_token(Token::Type::BadString);
  578. }
  579. if (is_reverse_solidus(input)) {
  580. auto next_input = peek_code_point();
  581. if (is_eof(next_input))
  582. continue;
  583. if (is_newline(next_input)) {
  584. (void)next_code_point();
  585. continue;
  586. }
  587. auto escaped = consume_escaped_code_point();
  588. token.m_value.append_code_point(escaped);
  589. }
  590. token.m_value.append_code_point(input);
  591. }
  592. }
  593. // https://www.w3.org/TR/css-syntax-3/#consume-comment
  594. void Tokenizer::consume_comments()
  595. {
  596. start:
  597. auto twin = peek_twin();
  598. if (!(is_solidus(twin.first) && is_asterisk(twin.second)))
  599. return;
  600. (void)next_code_point();
  601. (void)next_code_point();
  602. for (;;) {
  603. auto twin_inner = peek_twin();
  604. if (is_eof(twin_inner.first) || is_eof(twin_inner.second)) {
  605. log_parse_error();
  606. return;
  607. }
  608. if (is_asterisk(twin_inner.first) && is_solidus(twin_inner.second)) {
  609. (void)next_code_point();
  610. (void)next_code_point();
  611. goto start;
  612. }
  613. (void)next_code_point();
  614. }
  615. }
// https://www.w3.org/TR/css-syntax-3/#consume-token
// Consumes and returns a single token, dispatching on the first code
// point after skipping comments. EOF yields an EndOfFile token.
Token Tokenizer::consume_a_token()
{
    consume_comments();
    auto input = next_code_point();
    if (is_eof(input)) {
        return create_new_token(Token::Type::EndOfFile);
    }
    if (is_whitespace(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is whitespace");
        // Collapse the whole whitespace run into one token.
        consume_as_much_whitespace_as_possible();
        return create_new_token(Token::Type::Whitespace);
    }
    if (is_quotation_mark(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is quotation mark");
        return consume_string_token(input);
    }
    if (is_number_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is number sign");
        // "#" followed by a name or valid escape is a Hash token; the hash
        // is type "id" if the rest would start an identifier.
        auto next_input = peek_code_point();
        auto maybe_escape = peek_twin();
        if (is_name_code_point(next_input) || is_valid_escape_sequence(maybe_escape)) {
            auto token = create_new_token(Token::Type::Hash);
            if (would_start_an_identifier())
                token.m_hash_type = Token::HashType::Id;
            auto name = consume_a_name();
            token.m_value.append(name);
            return token;
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_apostrophe(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is apostrophe");
        return consume_string_token(input);
    }
    if (is_left_paren(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is left paren");
        return create_new_token(Token::Type::OpenParen);
    }
    if (is_right_paren(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is right paren");
        return create_new_token(Token::Type::CloseParen);
    }
    if (is_plus_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is plus sign");
        // "+" starts a number only when digits follow; otherwise a delim.
        if (would_start_a_number()) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_comma(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is comma");
        return create_new_token(Token::Type::Comma);
    }
    if (is_hyphen_minus(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is hyphen minus");
        // "-" may start a number, the CDC "-->" marker, or an identifier.
        if (would_start_a_number()) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }
        auto next_twin = peek_twin();
        if (is_hyphen_minus(next_twin.first) && is_greater_than_sign(next_twin.second)) {
            (void)next_code_point();
            (void)next_code_point();
            return create_new_token(Token::Type::CDC);
        }
        if (would_start_an_identifier()) {
            reconsume_current_input_code_point();
            return consume_an_ident_like_token();
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_full_stop(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is full stop");
        if (would_start_a_number()) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_colon(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is colon");
        return create_new_token(Token::Type::Colon);
    }
    if (is_semicolon(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is semicolon");
        return create_new_token(Token::Type::Semicolon);
    }
    if (is_less_than_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is less than");
        // "<!--" is the CDO marker; otherwise "<" is a delim.
        auto maybe_cdo = peek_triplet();
        if (is_exclamation_mark(maybe_cdo.first) && is_hyphen_minus(maybe_cdo.second) && is_hyphen_minus(maybe_cdo.third)) {
            (void)next_code_point();
            (void)next_code_point();
            (void)next_code_point();
            return create_new_token(Token::Type::CDO);
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_at(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is at");
        // "@" followed by an identifier start is an AtKeyword token.
        if (would_start_an_identifier()) {
            auto name = consume_a_name();
            return create_value_token(Token::Type::AtKeyword, name);
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_open_square_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is open square");
        return create_new_token(Token::Type::OpenSquare);
    }
    if (is_reverse_solidus(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is reverse solidus");
        // A valid escape begins an ident-like token; otherwise it's a
        // parse error and the backslash becomes a delim.
        if (is_valid_escape_sequence({ input, peek_code_point() })) {
            reconsume_current_input_code_point();
            return consume_an_ident_like_token();
        }
        log_parse_error();
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_closed_square_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is closed square");
        return create_new_token(Token::Type::CloseSquare);
    }
    if (is_open_curly_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is open curly");
        return create_new_token(Token::Type::OpenCurly);
    }
    if (is_closed_curly_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is closed curly");
        return create_new_token(Token::Type::CloseCurly);
    }
    if (is_ascii_digit(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is digit");
        reconsume_current_input_code_point();
        return consume_a_numeric_token();
    }
    if (is_name_start_code_point(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is name start");
        reconsume_current_input_code_point();
        return consume_an_ident_like_token();
    }
    // Anything else is a delim token carrying the code point itself.
    dbgln_if(CSS_TOKENIZER_DEBUG, "is delimiter");
    return create_value_token(Token::Type::Delim, input);
}
  762. }