Tokenizer.cpp 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931
  1. /*
  2. * Copyright (c) 2020-2021, the SerenityOS developers.
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Debug.h>
  8. #include <AK/SourceLocation.h>
  9. #include <AK/Vector.h>
  10. #include <LibTextCodec/Decoder.h>
  11. #include <LibWeb/CSS/Parser/Tokenizer.h>
  12. //U+FFFD REPLACEMENT CHARACTER (�)
  13. #define REPLACEMENT_CHARACTER 0xFFFD
  14. static const u32 TOKENIZER_EOF = 0xFFFFFFFF;
// Logs a CSS tokenization parse error (debug builds only) with the call site
// captured via SourceLocation::current().
static inline void log_parse_error(const SourceLocation& location = SourceLocation::current())
{
    dbgln_if(CSS_TOKENIZER_DEBUG, "Parse error (css tokenization) {} ", location);
}
// True if this is the TOKENIZER_EOF sentinel (end of input), not a real code point.
static inline bool is_eof(u32 code_point)
{
    return code_point == TOKENIZER_EOF;
}
  23. static inline bool is_quotation_mark(u32 code_point)
  24. {
  25. return code_point == 0x22;
  26. }
// True if the value is beyond U+10FFFF, the maximum valid Unicode code point.
static inline bool is_greater_than_maximum_allowed_code_point(u32 code_point)
{
    return code_point > 0x10FFFF;
}
  31. static inline bool is_low_line(u32 code_point)
  32. {
  33. return code_point == 0x5F;
  34. }
  35. static inline bool is_name_start_code_point(u32 code_point)
  36. {
  37. return is_ascii_alpha(code_point) || !is_ascii(code_point) || is_low_line(code_point);
  38. }
  39. static inline bool is_hyphen_minus(u32 code_point)
  40. {
  41. return code_point == 0x2D;
  42. }
  43. static inline bool is_name_code_point(u32 code_point)
  44. {
  45. return is_name_start_code_point(code_point) || is_ascii_digit(code_point) || is_hyphen_minus(code_point);
  46. }
  47. static inline bool is_non_printable(u32 code_point)
  48. {
  49. return code_point <= 0x8 || code_point == 0xB || (code_point >= 0xE && code_point <= 0x1F) || code_point == 0x7F;
  50. }
  51. static inline bool is_number_sign(u32 code_point)
  52. {
  53. return code_point == 0x23;
  54. }
  55. static inline bool is_reverse_solidus(u32 code_point)
  56. {
  57. return code_point == 0x5C;
  58. }
  59. static inline bool is_apostrophe(u32 code_point)
  60. {
  61. return code_point == 0x27;
  62. }
  63. static inline bool is_left_paren(u32 code_point)
  64. {
  65. return code_point == 0x28;
  66. }
  67. static inline bool is_right_paren(u32 code_point)
  68. {
  69. return code_point == 0x29;
  70. }
  71. static inline bool is_plus_sign(u32 code_point)
  72. {
  73. return code_point == 0x2B;
  74. }
  75. static inline bool is_comma(u32 code_point)
  76. {
  77. return code_point == 0x2C;
  78. }
  79. static inline bool is_full_stop(u32 code_point)
  80. {
  81. return code_point == 0x2E;
  82. }
  83. static inline bool is_newline(u32 code_point)
  84. {
  85. return code_point == 0xA;
  86. }
  87. static inline bool is_asterisk(u32 code_point)
  88. {
  89. return code_point == 0x2A;
  90. }
  91. static inline bool is_solidus(u32 code_point)
  92. {
  93. return code_point == 0x2F;
  94. }
  95. static inline bool is_colon(u32 code_point)
  96. {
  97. return code_point == 0x3A;
  98. }
  99. static inline bool is_semicolon(u32 code_point)
  100. {
  101. return code_point == 0x3B;
  102. }
  103. static inline bool is_less_than_sign(u32 code_point)
  104. {
  105. return code_point == 0x3C;
  106. }
  107. static inline bool is_greater_than_sign(u32 code_point)
  108. {
  109. return code_point == 0x3E;
  110. }
  111. static inline bool is_at(u32 code_point)
  112. {
  113. return code_point == 0x40;
  114. }
  115. static inline bool is_open_square_bracket(u32 code_point)
  116. {
  117. return code_point == 0x5B;
  118. }
  119. static inline bool is_closed_square_bracket(u32 code_point)
  120. {
  121. return code_point == 0x5D;
  122. }
  123. static inline bool is_open_curly_bracket(u32 code_point)
  124. {
  125. return code_point == 0x7B;
  126. }
  127. static inline bool is_closed_curly_bracket(u32 code_point)
  128. {
  129. return code_point == 0x7D;
  130. }
  131. static inline bool is_whitespace(u32 code_point)
  132. {
  133. return code_point == 0x9 || code_point == 0xA || code_point == 0x20;
  134. }
  135. static inline bool is_percent(u32 code_point)
  136. {
  137. return code_point == 0x25;
  138. }
  139. static inline bool is_exclamation_mark(u32 code_point)
  140. {
  141. return code_point == 0x21;
  142. }
  143. static inline bool is_e(u32 code_point)
  144. {
  145. return code_point == 0x65;
  146. }
  147. static inline bool is_E(u32 code_point)
  148. {
  149. return code_point == 0x45;
  150. }
  151. namespace Web::CSS {
  152. Tokenizer::Tokenizer(const StringView& input, const String& encoding)
  153. {
  154. auto* decoder = TextCodec::decoder_for(encoding);
  155. VERIFY(decoder);
  156. StringBuilder builder(input.length());
  157. // Preprocess the stream, by doing the following:
  158. // - Replace \r, \f and \r\n with \n
  159. // - replace \0 and anything between U+D800 to U+DFFF with the replacement
  160. // character.
  161. // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
  162. bool last_was_carriage_return = false;
  163. decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) {
  164. if (code_point == '\r') {
  165. if (last_was_carriage_return) {
  166. builder.append('\n');
  167. } else {
  168. last_was_carriage_return = true;
  169. }
  170. } else {
  171. if (last_was_carriage_return) {
  172. builder.append('\n');
  173. }
  174. if (code_point == '\n') {
  175. if (!last_was_carriage_return) {
  176. builder.append('\n');
  177. }
  178. } else if (code_point == '\f') {
  179. builder.append('\n');
  180. } else if (code_point >= 0xD800 && code_point <= 0xDFFF) {
  181. builder.append_code_point(REPLACEMENT_CHARACTER);
  182. } else {
  183. builder.append_code_point(code_point);
  184. }
  185. last_was_carriage_return = false;
  186. }
  187. });
  188. m_decoded_input = builder.to_string();
  189. m_utf8_view = Utf8View(m_decoded_input);
  190. m_utf8_iterator = m_utf8_view.begin();
  191. }
  192. Vector<Token> Tokenizer::parse()
  193. {
  194. Vector<Token> tokens;
  195. for (;;) {
  196. auto token = consume_a_token();
  197. tokens.append(token);
  198. if (token.is(Token::Type::EndOfFile)) {
  199. return tokens;
  200. }
  201. }
  202. }
  203. u32 Tokenizer::next_code_point()
  204. {
  205. if (m_utf8_iterator == m_utf8_view.end())
  206. return TOKENIZER_EOF;
  207. m_prev_utf8_iterator = m_utf8_iterator;
  208. ++m_utf8_iterator;
  209. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Next code_point: {:d}", *m_prev_utf8_iterator);
  210. return *m_prev_utf8_iterator;
  211. }
  212. u32 Tokenizer::peek_code_point(size_t offset) const
  213. {
  214. auto it = m_utf8_iterator;
  215. for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
  216. ++it;
  217. if (it == m_utf8_view.end())
  218. return TOKENIZER_EOF;
  219. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek code_point: {:d}", *m_prev_utf8_iterator);
  220. return *it;
  221. }
  222. U32Twin Tokenizer::peek_twin() const
  223. {
  224. U32Twin values { TOKENIZER_EOF, TOKENIZER_EOF };
  225. auto it = m_utf8_iterator;
  226. for (size_t i = 0; i < 2 && it != m_utf8_view.end(); ++i) {
  227. values.set(i, *it);
  228. ++it;
  229. }
  230. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek twin: {:d},{:d}", values.first, values.second);
  231. return values;
  232. }
  233. U32Triplet Tokenizer::peek_triplet() const
  234. {
  235. U32Triplet values { TOKENIZER_EOF, TOKENIZER_EOF, TOKENIZER_EOF };
  236. auto it = m_utf8_iterator;
  237. for (size_t i = 0; i < 3 && it != m_utf8_view.end(); ++i) {
  238. values.set(i, *it);
  239. ++it;
  240. }
  241. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek triplet: {:d},{:d},{:d}", values.first, values.second, values.third);
  242. return values;
  243. }
// Creates an empty token of the given type.
Token Tokenizer::create_new_token(Token::Type type)
{
    Token token = {};
    token.m_type = type;
    return token;
}
// Convenience wrapper: creates an EndOfFile token.
Token Tokenizer::create_eof_token()
{
    return create_new_token(Token::Type::EndOfFile);
}
// Creates a token of the given type carrying `value` as its string value.
Token Tokenizer::create_value_token(Token::Type type, String value)
{
    Token token;
    token.m_type = type;
    token.m_value.append(move(value));
    return token;
}
// Creates a token of the given type whose value is a single code point
// (used for Delim tokens).
Token Tokenizer::create_value_token(Token::Type type, u32 value)
{
    Token token = {};
    token.m_type = type;
    token.m_value.append_code_point(value);
    return token;
}
// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
// Assumes the leading U+005C REVERSE SOLIDUS has already been consumed.
u32 Tokenizer::consume_escaped_code_point()
{
    auto input = next_code_point();
    // EOF right after the backslash: parse error, yield U+FFFD.
    if (is_eof(input)) {
        log_parse_error();
        return REPLACEMENT_CHARACTER;
    }
    if (is_ascii_hex_digit(input)) {
        // Consume up to 6 hex digits total, then one optional whitespace.
        StringBuilder builder;
        builder.append_code_point(input);
        size_t counter = 0;
        while (is_ascii_hex_digit(peek_code_point()) && counter++ < 5) {
            builder.append_code_point(next_code_point());
        }
        if (is_whitespace(peek_code_point())) {
            (void)next_code_point();
        }
        auto unhexed = strtoul(builder.to_string().characters(), nullptr, 16);
        // Zero, surrogates, and values above U+10FFFF all map to U+FFFD.
        if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) {
            return REPLACEMENT_CHARACTER;
        }
        return unhexed;
    }
    // NOTE(review): a raw NUL should not survive preprocessing; presumably a
    // defensive check — confirm whether it is reachable.
    if (!input) {
        log_parse_error();
        return REPLACEMENT_CHARACTER;
    }
    // Any other code point is returned unchanged.
    return input;
}
// https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token
// Produces an Ident, Function, Url, or BadUrl token depending on what follows
// the name.
Token Tokenizer::consume_an_ident_like_token()
{
    auto string = consume_a_name();
    if (string.equals_ignoring_case("url") && is_left_paren(peek_code_point())) {
        (void)next_code_point();
        // While the next two input code points are both whitespace, consume one.
        for (;;) {
            auto maybe_whitespace = peek_twin();
            if (!(is_whitespace(maybe_whitespace.first) && is_whitespace(maybe_whitespace.second))) {
                break;
            }
            (void)next_code_point();
        }
        auto next_two = peek_twin();
        // A quote — one of ", ', ' "', " '" — means this is a function token
        // `url(`; the quoted string will be parsed separately.
        if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) {
            return create_value_token(Token::Type::Function, string);
        }
        // Unquoted: consume the rest as a <url-token>.
        return consume_a_url_token();
    }
    // name( → Function token; otherwise a plain Ident.
    if (is_left_paren(peek_code_point())) {
        (void)next_code_point();
        return create_value_token(Token::Type::Function, string);
    }
    return create_value_token(Token::Type::Ident, string);
}
  322. CSSNumber Tokenizer::consume_a_number()
  323. {
  324. StringBuilder repr;
  325. Token::NumberType type = Token::NumberType::Integer;
  326. auto next_input = peek_code_point();
  327. if (is_plus_sign(next_input) || is_hyphen_minus(next_input)) {
  328. repr.append_code_point(next_code_point());
  329. }
  330. for (;;) {
  331. auto digits = peek_code_point();
  332. if (!is_ascii_digit(digits))
  333. break;
  334. repr.append_code_point(next_code_point());
  335. }
  336. auto maybe_number = peek_twin();
  337. if (is_full_stop(maybe_number.first) && is_ascii_digit(maybe_number.second)) {
  338. repr.append_code_point(next_code_point());
  339. repr.append_code_point(next_code_point());
  340. type = Token::NumberType::Number;
  341. for (;;) {
  342. auto digit = peek_code_point();
  343. if (!is_ascii_digit(digit))
  344. break;
  345. repr.append_code_point(next_code_point());
  346. }
  347. }
  348. auto maybe_exp = peek_triplet();
  349. if (is_E(maybe_exp.first) || is_e(maybe_exp.first)) {
  350. if (is_plus_sign(maybe_exp.second) || is_hyphen_minus(maybe_exp.second)) {
  351. if (is_ascii_digit(maybe_exp.third)) {
  352. repr.append_code_point(next_code_point());
  353. repr.append_code_point(next_code_point());
  354. repr.append_code_point(next_code_point());
  355. }
  356. } else if (is_ascii_digit(maybe_exp.second)) {
  357. repr.append_code_point(next_code_point());
  358. repr.append_code_point(next_code_point());
  359. }
  360. type = Token::NumberType::Number;
  361. for (;;) {
  362. auto digits = peek_code_point();
  363. if (!is_ascii_digit(digits))
  364. break;
  365. repr.append_code_point(next_code_point());
  366. }
  367. }
  368. return { repr.to_string(), type };
  369. }
// https://www.w3.org/TR/css-syntax-3/#consume-name
// Consumes a run of name code points (and valid escapes), leaving the first
// non-name code point unconsumed.
String Tokenizer::consume_a_name()
{
    StringBuilder result;
    for (;;) {
        auto input = next_code_point();
        if (is_eof(input))
            break;
        if (is_name_code_point(input)) {
            result.append_code_point(input);
            continue;
        }
        // A backslash starting a valid escape contributes the escaped code point.
        auto next = peek_code_point();
        if (!is_eof(next) && is_valid_escape_sequence({ input, next })) {
            result.append_code_point(consume_escaped_code_point());
            continue;
        }
        // Anything else ends the name; put it back for the caller.
        reconsume_current_input_code_point();
        break;
    }
    return result.to_string();
}
  391. Token Tokenizer::consume_a_url_token()
  392. {
  393. auto token = create_new_token(Token::Type::Url);
  394. for (;;) {
  395. if (!is_whitespace(peek_code_point())) {
  396. break;
  397. }
  398. (void)next_code_point();
  399. }
  400. for (;;) {
  401. auto input = peek_code_point();
  402. if (is_eof(input)) {
  403. log_parse_error();
  404. return token;
  405. }
  406. if (is_right_paren(input)) {
  407. (void)next_code_point();
  408. return token;
  409. }
  410. if (is_whitespace(input)) {
  411. for (;;) {
  412. if (!is_whitespace(peek_code_point())) {
  413. break;
  414. }
  415. input = next_code_point();
  416. }
  417. if (is_eof(input)) {
  418. log_parse_error();
  419. return token;
  420. }
  421. if (is_right_paren(input)) {
  422. return token;
  423. }
  424. consume_the_remnants_of_a_bad_url();
  425. return create_new_token(Token::Type::BadUrl);
  426. }
  427. if (is_quotation_mark(input) || is_apostrophe(input) || is_left_paren(input) || is_non_printable(input)) {
  428. log_parse_error();
  429. (void)next_code_point();
  430. consume_the_remnants_of_a_bad_url();
  431. return create_new_token(Token::Type::BadUrl);
  432. }
  433. if (is_reverse_solidus(input)) {
  434. if (is_valid_escape_sequence(peek_twin())) {
  435. token.m_value.append_code_point(consume_escaped_code_point());
  436. } else {
  437. log_parse_error();
  438. (void)next_code_point();
  439. consume_the_remnants_of_a_bad_url();
  440. return create_new_token(Token::Type::BadUrl);
  441. }
  442. }
  443. token.m_value.append_code_point(next_code_point());
  444. }
  445. }
  446. void Tokenizer::consume_the_remnants_of_a_bad_url()
  447. {
  448. for (;;) {
  449. auto next = peek_code_point();
  450. if (is_eof(next)) {
  451. return;
  452. }
  453. auto input = next;
  454. if (is_right_paren(input)) {
  455. (void)next_code_point();
  456. return;
  457. }
  458. if (is_valid_escape_sequence(peek_twin())) {
  459. [[maybe_unused]] auto cp = consume_escaped_code_point();
  460. }
  461. (void)next_code_point();
  462. }
  463. }
// Steps back so the last consumed code point is produced again by the next
// call to next_code_point(). Only valid immediately after a consume.
void Tokenizer::reconsume_current_input_code_point()
{
    m_utf8_iterator = m_prev_utf8_iterator;
}
// https://www.w3.org/TR/css-syntax-3/#consume-numeric-token
// Produces a Dimension ("12px"), Percentage ("50%"), or Number token.
Token Tokenizer::consume_a_numeric_token()
{
    auto number = consume_a_number();
    // Number directly followed by an identifier start → <dimension-token>.
    if (would_start_an_identifier()) {
        auto token = create_new_token(Token::Type::Dimension);
        token.m_value.append(number.value);
        token.m_number_type = number.type;
        auto unit = consume_a_name();
        token.m_unit.append(unit);
        return token;
    }
    // Number followed by '%' → <percentage-token>.
    if (is_percent(peek_code_point())) {
        (void)next_code_point();
        auto token = create_new_token(Token::Type::Percentage);
        token.m_value.append(number.value);
        return token;
    }
    // Otherwise a plain <number-token>.
    auto token = create_new_token(Token::Type::Number);
    token.m_value.append(number.value);
    token.m_number_type = number.type;
    return token;
}
// True if the next three input code points would start a number (spec §4.3.10),
// without consuming anything.
bool Tokenizer::starts_with_a_number() const
{
    return starts_with_a_number(peek_triplet());
}
  494. bool Tokenizer::starts_with_a_number(U32Triplet values)
  495. {
  496. if (is_plus_sign(values.first) || is_hyphen_minus(values.first)) {
  497. if (is_ascii_digit(values.second))
  498. return true;
  499. if (is_full_stop(values.second) && is_ascii_digit(values.third))
  500. return true;
  501. return false;
  502. }
  503. if (is_full_stop(values.first))
  504. return is_ascii_digit(values.second);
  505. if (is_ascii_digit(values.first))
  506. return true;
  507. return false;
  508. }
  509. bool Tokenizer::is_valid_escape_sequence(U32Twin values)
  510. {
  511. if (!is_reverse_solidus(values.first)) {
  512. return false;
  513. }
  514. if (is_newline(values.second)) {
  515. return false;
  516. }
  517. return true;
  518. }
// True if the next three input code points would start an identifier
// (spec §4.3.9), without consuming anything.
bool Tokenizer::would_start_an_identifier()
{
    return would_start_an_identifier(peek_triplet());
}
  523. bool Tokenizer::would_start_an_identifier(U32Triplet values)
  524. {
  525. if (is_hyphen_minus(values.first)) {
  526. if (is_name_start_code_point(values.second) || is_hyphen_minus(values.second) || is_valid_escape_sequence(values.to_twin_23()))
  527. return true;
  528. return false;
  529. }
  530. if (is_name_start_code_point(values.first)) {
  531. return true;
  532. }
  533. if (is_reverse_solidus(values.first)) {
  534. if (is_valid_escape_sequence(values.to_twin_12()))
  535. return true;
  536. return false;
  537. }
  538. return false;
  539. }
  540. Token Tokenizer::consume_string_token(u32 ending_code_point)
  541. {
  542. auto token = create_new_token(Token::Type::String);
  543. for (;;) {
  544. auto input = next_code_point();
  545. if (is_eof(input)) {
  546. log_parse_error();
  547. return token;
  548. }
  549. if (input == ending_code_point)
  550. return token;
  551. if (is_newline(input)) {
  552. reconsume_current_input_code_point();
  553. return create_new_token(Token::Type::BadString);
  554. }
  555. if (is_reverse_solidus(input)) {
  556. auto next_input = peek_code_point();
  557. if (is_eof(next_input))
  558. continue;
  559. if (is_newline(next_input)) {
  560. (void)next_code_point();
  561. continue;
  562. }
  563. auto escaped = consume_escaped_code_point();
  564. token.m_value.append_code_point(escaped);
  565. }
  566. token.m_value.append_code_point(input);
  567. }
  568. }
  569. void Tokenizer::consume_comments()
  570. {
  571. start:
  572. auto twin = peek_twin();
  573. if (!(is_solidus(twin.first) && is_asterisk(twin.second)))
  574. return;
  575. (void)next_code_point();
  576. (void)next_code_point();
  577. for (;;) {
  578. auto twin_inner = peek_twin();
  579. if (is_eof(twin_inner.first) || is_eof(twin_inner.second)) {
  580. log_parse_error();
  581. return;
  582. }
  583. if (is_asterisk(twin_inner.first) && is_solidus(twin_inner.second)) {
  584. (void)next_code_point();
  585. (void)next_code_point();
  586. goto start;
  587. }
  588. (void)next_code_point();
  589. }
  590. }
// https://www.w3.org/TR/css-syntax-3/#consume-token
// The tokenizer's main dispatch: consumes comments, then one code point, and
// produces exactly one token based on it.
Token Tokenizer::consume_a_token()
{
    consume_comments();
    auto input = next_code_point();
    if (is_eof(input)) {
        return create_new_token(Token::Type::EndOfFile);
    }
    // Runs of whitespace collapse into a single Whitespace token.
    if (is_whitespace(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is whitespace");
        auto next = peek_code_point();
        while (is_whitespace(next)) {
            (void)next_code_point();
            next = peek_code_point();
        }
        return create_new_token(Token::Type::Whitespace);
    }
    if (is_quotation_mark(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is quotation mark");
        return consume_string_token(input);
    }
    // '#': a Hash token if followed by a name or valid escape, else a delim.
    if (is_number_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is number sign");
        auto next_input = peek_code_point();
        auto maybe_escape = peek_twin();
        if (is_name_code_point(next_input) || is_valid_escape_sequence(maybe_escape)) {
            auto token = create_new_token(Token::Type::Hash);
            // Id-type hash (e.g. #foo) vs unrestricted (e.g. #123).
            if (would_start_an_identifier())
                token.m_hash_type = Token::HashType::Id;
            auto name = consume_a_name();
            token.m_value.append(name);
            return token;
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_apostrophe(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is apostrophe");
        return consume_string_token(input);
    }
    if (is_left_paren(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is left paren");
        return create_new_token(Token::Type::OpenParen);
    }
    if (is_right_paren(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is right paren");
        return create_new_token(Token::Type::CloseParen);
    }
    // '+': start of a signed number, otherwise a delim.
    if (is_plus_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is plus sign");
        if (starts_with_a_number()) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_comma(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is comma");
        return create_new_token(Token::Type::Comma);
    }
    // '-': signed number, "-->" (CDC), identifier start, or delim.
    if (is_hyphen_minus(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is hyphen minus");
        if (starts_with_a_number()) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }
        auto next_twin = peek_twin();
        if (is_hyphen_minus(next_twin.first) && is_greater_than_sign(next_twin.second)) {
            (void)next_code_point();
            (void)next_code_point();
            return create_new_token(Token::Type::CDC);
        }
        if (would_start_an_identifier()) {
            reconsume_current_input_code_point();
            return consume_an_ident_like_token();
        }
        return create_value_token(Token::Type::Delim, input);
    }
    // '.': start of a fractional number, otherwise a delim.
    if (is_full_stop(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is full stop");
        if (starts_with_a_number()) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_colon(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is colon");
        return create_new_token(Token::Type::Colon);
    }
    if (is_semicolon(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is semicolon");
        return create_new_token(Token::Type::Semicolon);
    }
    // '<': "<!--" (CDO) or a delim.
    if (is_less_than_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is less than");
        auto maybe_cdo = peek_triplet();
        if (is_exclamation_mark(maybe_cdo.first) && is_hyphen_minus(maybe_cdo.second) && is_hyphen_minus(maybe_cdo.third)) {
            (void)next_code_point();
            (void)next_code_point();
            (void)next_code_point();
            return create_new_token(Token::Type::CDO);
        }
        return create_value_token(Token::Type::Delim, input);
    }
    // '@': at-keyword (e.g. @media) if a name follows, else a delim.
    if (is_at(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is at");
        if (would_start_an_identifier()) {
            auto name = consume_a_name();
            return create_value_token(Token::Type::AtKeyword, name);
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_open_square_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is open square");
        return create_new_token(Token::Type::OpenSquare);
    }
    // '\': starts an escaped identifier if valid, else a parse error + delim.
    if (is_reverse_solidus(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is reverse solidus");
        if (is_valid_escape_sequence({ input, peek_code_point() })) {
            reconsume_current_input_code_point();
            return consume_an_ident_like_token();
        }
        log_parse_error();
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_closed_square_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is closed square");
        return create_new_token(Token::Type::CloseSquare);
    }
    if (is_open_curly_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is open curly");
        return create_new_token(Token::Type::OpenCurly);
    }
    if (is_closed_curly_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is closed curly");
        return create_new_token(Token::Type::CloseCurly);
    }
    if (is_ascii_digit(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is digit");
        reconsume_current_input_code_point();
        return consume_a_numeric_token();
    }
    if (is_name_start_code_point(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is name start");
        reconsume_current_input_code_point();
        return consume_an_ident_like_token();
    }
    // Anything else is a Delim token carrying the code point itself.
    dbgln_if(CSS_TOKENIZER_DEBUG, "is delimiter");
    return create_value_token(Token::Type::Delim, input);
}
  740. }