Tokenizer.cpp 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897
  1. /*
  2. * Copyright (c) 2020-2021, the SerenityOS developers.
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Debug.h>
  8. #include <AK/SourceLocation.h>
  9. #include <AK/Vector.h>
  10. #include <LibTextCodec/Decoder.h>
  11. #include <LibWeb/CSS/Parser/Tokenizer.h>
  12. //U+FFFD REPLACEMENT CHARACTER (�)
  13. #define REPLACEMENT_CHARACTER 0xFFFD
  14. static const u32 TOKENIZER_EOF = 0xFFFFFFFF;
// Logs a CSS tokenization parse error (only when CSS_TOKENIZER_DEBUG is
// enabled), tagged with the call site that detected it via the defaulted
// SourceLocation argument.
static inline void log_parse_error(const SourceLocation& location = SourceLocation::current())
{
    dbgln_if(CSS_TOKENIZER_DEBUG, "Parse error (css tokenization) {} ", location);
}
  19. static inline bool is_eof(u32 code_point)
  20. {
  21. return code_point == TOKENIZER_EOF;
  22. }
  23. static inline bool is_quotation_mark(u32 code_point)
  24. {
  25. return code_point == 0x22;
  26. }
  27. static inline bool is_greater_than_maximum_allowed_code_point(u32 code_point)
  28. {
  29. return code_point > 0x10FFFF;
  30. }
  31. static inline bool is_low_line(u32 code_point)
  32. {
  33. return code_point == 0x5F;
  34. }
  35. static inline bool is_name_start_code_point(u32 code_point)
  36. {
  37. return is_ascii_alpha(code_point) || !is_ascii(code_point) || is_low_line(code_point);
  38. }
  39. static inline bool is_hyphen_minus(u32 code_point)
  40. {
  41. return code_point == 0x2D;
  42. }
  43. static inline bool is_name_code_point(u32 code_point)
  44. {
  45. return is_name_start_code_point(code_point) || is_ascii_digit(code_point) || is_hyphen_minus(code_point);
  46. }
  47. static inline bool is_non_printable(u32 code_point)
  48. {
  49. return code_point <= 0x8 || code_point == 0xB || (code_point >= 0xE && code_point <= 0x1F) || code_point == 0x7F;
  50. }
  51. static inline bool is_number_sign(u32 code_point)
  52. {
  53. return code_point == 0x23;
  54. }
  55. static inline bool is_reverse_solidus(u32 code_point)
  56. {
  57. return code_point == 0x5C;
  58. }
  59. static inline bool is_apostrophe(u32 code_point)
  60. {
  61. return code_point == 0x27;
  62. }
  63. static inline bool is_left_paren(u32 code_point)
  64. {
  65. return code_point == 0x28;
  66. }
  67. static inline bool is_right_paren(u32 code_point)
  68. {
  69. return code_point == 0x29;
  70. }
  71. static inline bool is_plus_sign(u32 code_point)
  72. {
  73. return code_point == 0x2B;
  74. }
  75. static inline bool is_comma(u32 code_point)
  76. {
  77. return code_point == 0x2C;
  78. }
  79. static inline bool is_full_stop(u32 code_point)
  80. {
  81. return code_point == 0x2E;
  82. }
  83. static inline bool is_newline(u32 code_point)
  84. {
  85. return code_point == 0xA;
  86. }
  87. static inline bool is_asterisk(u32 code_point)
  88. {
  89. return code_point == 0x2A;
  90. }
  91. static inline bool is_solidus(u32 code_point)
  92. {
  93. return code_point == 0x2F;
  94. }
  95. static inline bool is_colon(u32 code_point)
  96. {
  97. return code_point == 0x3A;
  98. }
  99. static inline bool is_semicolon(u32 code_point)
  100. {
  101. return code_point == 0x3B;
  102. }
  103. static inline bool is_less_than_sign(u32 code_point)
  104. {
  105. return code_point == 0x3C;
  106. }
  107. static inline bool is_greater_than_sign(u32 code_point)
  108. {
  109. return code_point == 0x3E;
  110. }
  111. static inline bool is_at(u32 code_point)
  112. {
  113. return code_point == 0x40;
  114. }
  115. static inline bool is_open_square_bracket(u32 code_point)
  116. {
  117. return code_point == 0x5B;
  118. }
  119. static inline bool is_closed_square_bracket(u32 code_point)
  120. {
  121. return code_point == 0x5D;
  122. }
  123. static inline bool is_open_curly_bracket(u32 code_point)
  124. {
  125. return code_point == 0x7B;
  126. }
  127. static inline bool is_closed_curly_bracket(u32 code_point)
  128. {
  129. return code_point == 0x7D;
  130. }
  131. static inline bool is_whitespace(u32 code_point)
  132. {
  133. return code_point == 0x9 || code_point == 0xA || code_point == 0x20;
  134. }
  135. static inline bool is_percent(u32 code_point)
  136. {
  137. return code_point == 0x25;
  138. }
  139. static inline bool is_exclamation_mark(u32 code_point)
  140. {
  141. return code_point == 0x21;
  142. }
  143. static inline bool is_e(u32 code_point)
  144. {
  145. return code_point == 0x65;
  146. }
  147. static inline bool is_E(u32 code_point)
  148. {
  149. return code_point == 0x45;
  150. }
namespace Web::CSS {

// Decodes `input` from `encoding` into UTF-8 and positions the code-point
// iterator at the start of the decoded stream. Aborts (VERIFY) if no decoder
// exists for the given encoding.
Tokenizer::Tokenizer(const StringView& input, const String& encoding)
{
    auto* decoder = TextCodec::decoder_for(encoding);
    VERIFY(decoder);
    // FIXME: preprocess the stream
    // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
    m_decoded_input = decoder->to_utf8(input);
    m_utf8_view = Utf8View(m_decoded_input);
    m_utf8_iterator = m_utf8_view.begin();
}
  162. Vector<Token> Tokenizer::parse()
  163. {
  164. Vector<Token> tokens;
  165. for (;;) {
  166. auto token = consume_a_token();
  167. tokens.append(token);
  168. if (token.is(Token::Type::EndOfFile)) {
  169. return tokens;
  170. }
  171. }
  172. }
  173. u32 Tokenizer::next_code_point()
  174. {
  175. if (m_utf8_iterator == m_utf8_view.end())
  176. return TOKENIZER_EOF;
  177. m_prev_utf8_iterator = m_utf8_iterator;
  178. ++m_utf8_iterator;
  179. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Next code_point: {:d}", *m_prev_utf8_iterator);
  180. return *m_prev_utf8_iterator;
  181. }
  182. u32 Tokenizer::peek_code_point(size_t offset) const
  183. {
  184. auto it = m_utf8_iterator;
  185. for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
  186. ++it;
  187. if (it == m_utf8_view.end())
  188. return TOKENIZER_EOF;
  189. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek code_point: {:d}", *m_prev_utf8_iterator);
  190. return *it;
  191. }
  192. U32Twin Tokenizer::peek_twin() const
  193. {
  194. U32Twin values { TOKENIZER_EOF, TOKENIZER_EOF };
  195. auto it = m_utf8_iterator;
  196. for (size_t i = 0; i < 2 && it != m_utf8_view.end(); ++i) {
  197. values.set(i, *it);
  198. ++it;
  199. }
  200. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek twin: {:d},{:d}", values.first, values.second);
  201. return values;
  202. }
  203. U32Triplet Tokenizer::peek_triplet() const
  204. {
  205. U32Triplet values { TOKENIZER_EOF, TOKENIZER_EOF, TOKENIZER_EOF };
  206. auto it = m_utf8_iterator;
  207. for (size_t i = 0; i < 3 && it != m_utf8_view.end(); ++i) {
  208. values.set(i, *it);
  209. ++it;
  210. }
  211. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek triplet: {:d},{:d},{:d}", values.first, values.second, values.third);
  212. return values;
  213. }
  214. Token Tokenizer::create_new_token(Token::Type type)
  215. {
  216. Token token = {};
  217. token.m_type = type;
  218. return token;
  219. }
  220. Token Tokenizer::create_eof_token()
  221. {
  222. return create_new_token(Token::Type::EndOfFile);
  223. }
  224. Token Tokenizer::create_value_token(Token::Type type, String value)
  225. {
  226. Token token;
  227. token.m_type = type;
  228. token.m_value.append(move(value));
  229. return token;
  230. }
  231. Token Tokenizer::create_value_token(Token::Type type, u32 value)
  232. {
  233. Token token = {};
  234. token.m_type = type;
  235. token.m_value.append_code_point(value);
  236. return token;
  237. }
  238. u32 Tokenizer::consume_escaped_code_point()
  239. {
  240. auto input = next_code_point();
  241. if (is_eof(input)) {
  242. log_parse_error();
  243. return REPLACEMENT_CHARACTER;
  244. }
  245. if (is_ascii_hex_digit(input)) {
  246. StringBuilder builder;
  247. builder.append_code_point(input);
  248. size_t counter = 0;
  249. while (is_ascii_hex_digit(peek_code_point()) && counter++ < 5) {
  250. builder.append_code_point(next_code_point());
  251. }
  252. if (is_whitespace(peek_code_point())) {
  253. (void)next_code_point();
  254. }
  255. auto unhexed = strtoul(builder.to_string().characters(), nullptr, 16);
  256. if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) {
  257. return REPLACEMENT_CHARACTER;
  258. }
  259. return unhexed;
  260. }
  261. if (!input) {
  262. log_parse_error();
  263. return REPLACEMENT_CHARACTER;
  264. }
  265. return input;
  266. }
  267. Token Tokenizer::consume_an_ident_like_token()
  268. {
  269. auto string = consume_a_name();
  270. if (string.equals_ignoring_case("url") && is_left_paren(peek_code_point())) {
  271. (void)next_code_point();
  272. for (;;) {
  273. auto maybe_whitespace = peek_twin();
  274. if (!(is_whitespace(maybe_whitespace.first) && is_whitespace(maybe_whitespace.second))) {
  275. break;
  276. }
  277. (void)next_code_point();
  278. }
  279. auto next_two = peek_twin();
  280. // if one of these ", ', ' "', " '"
  281. if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) {
  282. return create_value_token(Token::Type::Function, string);
  283. }
  284. return consume_a_url_token();
  285. }
  286. if (is_left_paren(peek_code_point())) {
  287. (void)next_code_point();
  288. return create_value_token(Token::Type::Function, string);
  289. }
  290. return create_value_token(Token::Type::Ident, string);
  291. }
  292. CSSNumber Tokenizer::consume_a_number()
  293. {
  294. StringBuilder repr;
  295. Token::NumberType type = Token::NumberType::Integer;
  296. auto next_input = peek_code_point();
  297. if (is_plus_sign(next_input) || is_hyphen_minus(next_input)) {
  298. repr.append_code_point(next_code_point());
  299. }
  300. for (;;) {
  301. auto digits = peek_code_point();
  302. if (!is_ascii_digit(digits))
  303. break;
  304. repr.append_code_point(next_code_point());
  305. }
  306. auto maybe_number = peek_twin();
  307. if (is_full_stop(maybe_number.first) && is_ascii_digit(maybe_number.second)) {
  308. repr.append_code_point(next_code_point());
  309. repr.append_code_point(next_code_point());
  310. type = Token::NumberType::Number;
  311. for (;;) {
  312. auto digit = peek_code_point();
  313. if (!is_ascii_digit(digit))
  314. break;
  315. repr.append_code_point(next_code_point());
  316. }
  317. }
  318. auto maybe_exp = peek_triplet();
  319. if (is_E(maybe_exp.first) || is_e(maybe_exp.first)) {
  320. if (is_plus_sign(maybe_exp.second) || is_hyphen_minus(maybe_exp.second)) {
  321. if (is_ascii_digit(maybe_exp.third)) {
  322. repr.append_code_point(next_code_point());
  323. repr.append_code_point(next_code_point());
  324. repr.append_code_point(next_code_point());
  325. }
  326. } else if (is_ascii_digit(maybe_exp.second)) {
  327. repr.append_code_point(next_code_point());
  328. repr.append_code_point(next_code_point());
  329. }
  330. type = Token::NumberType::Number;
  331. for (;;) {
  332. auto digits = peek_code_point();
  333. if (!is_ascii_digit(digits))
  334. break;
  335. repr.append_code_point(next_code_point());
  336. }
  337. }
  338. return { repr.to_string(), type };
  339. }
  340. String Tokenizer::consume_a_name()
  341. {
  342. StringBuilder result;
  343. for (;;) {
  344. auto input = next_code_point();
  345. if (is_eof(input))
  346. break;
  347. if (is_name_code_point(input)) {
  348. result.append_code_point(input);
  349. continue;
  350. }
  351. auto next = peek_code_point();
  352. if (!is_eof(next) && is_valid_escape_sequence({ input, next })) {
  353. result.append_code_point(consume_escaped_code_point());
  354. continue;
  355. }
  356. reconsume_current_input_code_point();
  357. break;
  358. }
  359. return result.to_string();
  360. }
  361. Token Tokenizer::consume_a_url_token()
  362. {
  363. auto token = create_new_token(Token::Type::Url);
  364. for (;;) {
  365. if (!is_whitespace(peek_code_point())) {
  366. break;
  367. }
  368. (void)next_code_point();
  369. }
  370. for (;;) {
  371. auto input = peek_code_point();
  372. if (is_eof(input)) {
  373. log_parse_error();
  374. return token;
  375. }
  376. if (is_right_paren(input)) {
  377. (void)next_code_point();
  378. return token;
  379. }
  380. if (is_whitespace(input)) {
  381. for (;;) {
  382. if (!is_whitespace(peek_code_point())) {
  383. break;
  384. }
  385. input = next_code_point();
  386. }
  387. if (is_eof(input)) {
  388. log_parse_error();
  389. return token;
  390. }
  391. if (is_right_paren(input)) {
  392. return token;
  393. }
  394. consume_the_remnants_of_a_bad_url();
  395. return create_new_token(Token::Type::BadUrl);
  396. }
  397. if (is_quotation_mark(input) || is_apostrophe(input) || is_left_paren(input) || is_non_printable(input)) {
  398. log_parse_error();
  399. (void)next_code_point();
  400. consume_the_remnants_of_a_bad_url();
  401. return create_new_token(Token::Type::BadUrl);
  402. }
  403. if (is_reverse_solidus(input)) {
  404. if (is_valid_escape_sequence(peek_twin())) {
  405. token.m_value.append_code_point(consume_escaped_code_point());
  406. } else {
  407. log_parse_error();
  408. (void)next_code_point();
  409. consume_the_remnants_of_a_bad_url();
  410. return create_new_token(Token::Type::BadUrl);
  411. }
  412. }
  413. token.m_value.append_code_point(next_code_point());
  414. }
  415. }
  416. void Tokenizer::consume_the_remnants_of_a_bad_url()
  417. {
  418. for (;;) {
  419. auto next = peek_code_point();
  420. if (is_eof(next)) {
  421. return;
  422. }
  423. auto input = next;
  424. if (is_right_paren(input)) {
  425. (void)next_code_point();
  426. return;
  427. }
  428. if (is_valid_escape_sequence(peek_twin())) {
  429. [[maybe_unused]] auto cp = consume_escaped_code_point();
  430. }
  431. (void)next_code_point();
  432. }
  433. }
// Rewinds the iterator by one code point, so the code point most recently
// returned by next_code_point() will be produced again.
void Tokenizer::reconsume_current_input_code_point()
{
    m_utf8_iterator = m_prev_utf8_iterator;
}
  438. Token Tokenizer::consume_a_numeric_token()
  439. {
  440. auto number = consume_a_number();
  441. if (would_start_an_identifier()) {
  442. auto token = create_new_token(Token::Type::Dimension);
  443. token.m_value.append(number.value);
  444. token.m_number_type = number.type;
  445. auto unit = consume_a_name();
  446. token.m_unit.append(unit);
  447. return token;
  448. }
  449. if (is_percent(peek_code_point())) {
  450. (void)next_code_point();
  451. auto token = create_new_token(Token::Type::Percentage);
  452. token.m_value.append(number.value);
  453. return token;
  454. }
  455. auto token = create_new_token(Token::Type::Number);
  456. token.m_value.append(number.value);
  457. token.m_number_type = number.type;
  458. return token;
  459. }
  460. bool Tokenizer::starts_with_a_number() const
  461. {
  462. return starts_with_a_number(peek_triplet());
  463. }
  464. bool Tokenizer::starts_with_a_number(U32Triplet values)
  465. {
  466. if (is_plus_sign(values.first) || is_hyphen_minus(values.first)) {
  467. if (is_ascii_digit(values.second))
  468. return true;
  469. if (is_full_stop(values.second) && is_ascii_digit(values.third))
  470. return true;
  471. return false;
  472. }
  473. if (is_full_stop(values.first))
  474. return is_ascii_digit(values.second);
  475. if (is_ascii_digit(values.first))
  476. return true;
  477. return false;
  478. }
  479. bool Tokenizer::is_valid_escape_sequence(U32Twin values)
  480. {
  481. if (!is_reverse_solidus(values.first)) {
  482. return false;
  483. }
  484. if (is_newline(values.second)) {
  485. return false;
  486. }
  487. return true;
  488. }
  489. bool Tokenizer::would_start_an_identifier()
  490. {
  491. return would_start_an_identifier(peek_triplet());
  492. }
  493. bool Tokenizer::would_start_an_identifier(U32Triplet values)
  494. {
  495. if (is_hyphen_minus(values.first)) {
  496. if (is_name_start_code_point(values.second) || is_hyphen_minus(values.second) || is_valid_escape_sequence(values.to_twin_23()))
  497. return true;
  498. return false;
  499. }
  500. if (is_name_start_code_point(values.first)) {
  501. return true;
  502. }
  503. if (is_reverse_solidus(values.first)) {
  504. if (is_valid_escape_sequence(values.to_twin_12()))
  505. return true;
  506. return false;
  507. }
  508. return false;
  509. }
  510. Token Tokenizer::consume_string_token(u32 ending_code_point)
  511. {
  512. auto token = create_new_token(Token::Type::String);
  513. for (;;) {
  514. auto input = next_code_point();
  515. if (is_eof(input)) {
  516. log_parse_error();
  517. return token;
  518. }
  519. if (input == ending_code_point)
  520. return token;
  521. if (is_newline(input)) {
  522. reconsume_current_input_code_point();
  523. return create_new_token(Token::Type::BadString);
  524. }
  525. if (is_reverse_solidus(input)) {
  526. auto next_input = peek_code_point();
  527. if (is_eof(next_input))
  528. continue;
  529. if (is_newline(next_input)) {
  530. (void)next_code_point();
  531. continue;
  532. }
  533. auto escaped = consume_escaped_code_point();
  534. token.m_value.append_code_point(escaped);
  535. }
  536. token.m_value.append_code_point(input);
  537. }
  538. }
  539. void Tokenizer::consume_comments()
  540. {
  541. start:
  542. auto twin = peek_twin();
  543. if (!(is_solidus(twin.first) && is_asterisk(twin.second)))
  544. return;
  545. (void)next_code_point();
  546. (void)next_code_point();
  547. for (;;) {
  548. auto twin_inner = peek_twin();
  549. if (is_eof(twin_inner.first) || is_eof(twin_inner.second)) {
  550. log_parse_error();
  551. return;
  552. }
  553. if (is_asterisk(twin_inner.first) && is_solidus(twin_inner.second)) {
  554. (void)next_code_point();
  555. (void)next_code_point();
  556. goto start;
  557. }
  558. (void)next_code_point();
  559. }
  560. }
  561. Token Tokenizer::consume_a_token()
  562. {
  563. consume_comments();
  564. auto input = next_code_point();
  565. if (is_eof(input)) {
  566. return create_new_token(Token::Type::EndOfFile);
  567. }
  568. if (is_whitespace(input)) {
  569. dbgln_if(CSS_TOKENIZER_DEBUG, "is whitespace");
  570. auto next = peek_code_point();
  571. while (is_whitespace(next)) {
  572. (void)next_code_point();
  573. next = peek_code_point();
  574. }
  575. return create_new_token(Token::Type::Whitespace);
  576. }
  577. if (is_quotation_mark(input)) {
  578. dbgln_if(CSS_TOKENIZER_DEBUG, "is quotation mark");
  579. return consume_string_token(input);
  580. }
  581. if (is_number_sign(input)) {
  582. dbgln_if(CSS_TOKENIZER_DEBUG, "is number sign");
  583. auto next_input = peek_code_point();
  584. auto maybe_escape = peek_twin();
  585. if (is_name_code_point(next_input) || is_valid_escape_sequence(maybe_escape)) {
  586. auto token = create_new_token(Token::Type::Hash);
  587. if (would_start_an_identifier())
  588. token.m_hash_type = Token::HashType::Id;
  589. auto name = consume_a_name();
  590. token.m_value.append(name);
  591. return token;
  592. }
  593. return create_value_token(Token::Type::Delim, input);
  594. }
  595. if (is_apostrophe(input)) {
  596. dbgln_if(CSS_TOKENIZER_DEBUG, "is apostrophe");
  597. return consume_string_token(input);
  598. }
  599. if (is_left_paren(input)) {
  600. dbgln_if(CSS_TOKENIZER_DEBUG, "is left paren");
  601. return create_new_token(Token::Type::OpenParen);
  602. }
  603. if (is_right_paren(input)) {
  604. dbgln_if(CSS_TOKENIZER_DEBUG, "is right paren");
  605. return create_new_token(Token::Type::CloseParen);
  606. }
  607. if (is_plus_sign(input)) {
  608. dbgln_if(CSS_TOKENIZER_DEBUG, "is plus sign");
  609. if (starts_with_a_number()) {
  610. reconsume_current_input_code_point();
  611. return consume_a_numeric_token();
  612. }
  613. return create_value_token(Token::Type::Delim, input);
  614. }
  615. if (is_comma(input)) {
  616. dbgln_if(CSS_TOKENIZER_DEBUG, "is comma");
  617. return create_new_token(Token::Type::Comma);
  618. }
  619. if (is_hyphen_minus(input)) {
  620. dbgln_if(CSS_TOKENIZER_DEBUG, "is hyphen minus");
  621. if (starts_with_a_number()) {
  622. reconsume_current_input_code_point();
  623. return consume_a_numeric_token();
  624. }
  625. auto next_twin = peek_twin();
  626. if (is_hyphen_minus(next_twin.first) && is_greater_than_sign(next_twin.second)) {
  627. (void)next_code_point();
  628. (void)next_code_point();
  629. return create_new_token(Token::Type::CDC);
  630. }
  631. if (would_start_an_identifier()) {
  632. reconsume_current_input_code_point();
  633. return consume_an_ident_like_token();
  634. }
  635. return create_value_token(Token::Type::Delim, input);
  636. }
  637. if (is_full_stop(input)) {
  638. dbgln_if(CSS_TOKENIZER_DEBUG, "is full stop");
  639. if (starts_with_a_number()) {
  640. reconsume_current_input_code_point();
  641. return consume_a_numeric_token();
  642. }
  643. return create_value_token(Token::Type::Delim, input);
  644. }
  645. if (is_colon(input)) {
  646. dbgln_if(CSS_TOKENIZER_DEBUG, "is colon");
  647. return create_new_token(Token::Type::Colon);
  648. }
  649. if (is_semicolon(input)) {
  650. dbgln_if(CSS_TOKENIZER_DEBUG, "is semicolon");
  651. return create_new_token(Token::Type::Semicolon);
  652. }
  653. if (is_less_than_sign(input)) {
  654. dbgln_if(CSS_TOKENIZER_DEBUG, "is less than");
  655. auto maybe_cdo = peek_triplet();
  656. if (is_exclamation_mark(maybe_cdo.first) && is_hyphen_minus(maybe_cdo.second) && is_hyphen_minus(maybe_cdo.third)) {
  657. (void)next_code_point();
  658. (void)next_code_point();
  659. (void)next_code_point();
  660. return create_new_token(Token::Type::CDO);
  661. }
  662. return create_value_token(Token::Type::Delim, input);
  663. }
  664. if (is_at(input)) {
  665. dbgln_if(CSS_TOKENIZER_DEBUG, "is at");
  666. if (would_start_an_identifier()) {
  667. auto name = consume_a_name();
  668. return create_value_token(Token::Type::AtKeyword, name);
  669. }
  670. return create_value_token(Token::Type::Delim, input);
  671. }
  672. if (is_open_square_bracket(input)) {
  673. dbgln_if(CSS_TOKENIZER_DEBUG, "is open square");
  674. return create_new_token(Token::Type::OpenSquare);
  675. }
  676. if (is_reverse_solidus(input)) {
  677. dbgln_if(CSS_TOKENIZER_DEBUG, "is reverse solidus");
  678. if (is_valid_escape_sequence({ input, peek_code_point() })) {
  679. reconsume_current_input_code_point();
  680. return consume_an_ident_like_token();
  681. }
  682. log_parse_error();
  683. return create_value_token(Token::Type::Delim, input);
  684. }
  685. if (is_closed_square_bracket(input)) {
  686. dbgln_if(CSS_TOKENIZER_DEBUG, "is closed square");
  687. return create_new_token(Token::Type::CloseSquare);
  688. }
  689. if (is_open_curly_bracket(input)) {
  690. dbgln_if(CSS_TOKENIZER_DEBUG, "is open curly");
  691. return create_new_token(Token::Type::OpenCurly);
  692. }
  693. if (is_closed_curly_bracket(input)) {
  694. dbgln_if(CSS_TOKENIZER_DEBUG, "is closed curly");
  695. return create_new_token(Token::Type::CloseCurly);
  696. }
  697. if (is_ascii_digit(input)) {
  698. dbgln_if(CSS_TOKENIZER_DEBUG, "is digit");
  699. reconsume_current_input_code_point();
  700. return consume_a_numeric_token();
  701. }
  702. if (is_name_start_code_point(input)) {
  703. dbgln_if(CSS_TOKENIZER_DEBUG, "is name start");
  704. reconsume_current_input_code_point();
  705. return consume_an_ident_like_token();
  706. }
  707. dbgln_if(CSS_TOKENIZER_DEBUG, "is delimiter");
  708. return create_value_token(Token::Type::Delim, input);
  709. }
  710. }