Tokenizer.cpp 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935
  1. /*
  2. * Copyright (c) 2020-2021, the SerenityOS developers.
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/CharacterTypes.h>
  7. #include <AK/Debug.h>
  8. #include <AK/SourceLocation.h>
  9. #include <AK/Vector.h>
  10. #include <LibTextCodec/Decoder.h>
  11. #include <LibWeb/CSS/Parser/Tokenizer.h>
  12. // U+FFFD REPLACEMENT CHARACTER (�)
  13. #define REPLACEMENT_CHARACTER 0xFFFD
  14. static const u32 TOKENIZER_EOF = 0xFFFFFFFF;
// Logs a CSS tokenization parse error together with the calling source
// location. Only emits output when CSS_TOKENIZER_DEBUG is enabled.
static inline void log_parse_error(const SourceLocation& location = SourceLocation::current())
{
    dbgln_if(CSS_TOKENIZER_DEBUG, "Parse error (css tokenization) {} ", location);
}
  19. static inline bool is_eof(u32 code_point)
  20. {
  21. return code_point == TOKENIZER_EOF;
  22. }
  23. static inline bool is_quotation_mark(u32 code_point)
  24. {
  25. return code_point == 0x22;
  26. }
  27. static inline bool is_greater_than_maximum_allowed_code_point(u32 code_point)
  28. {
  29. return code_point > 0x10FFFF;
  30. }
  31. static inline bool is_low_line(u32 code_point)
  32. {
  33. return code_point == 0x5F;
  34. }
  35. static inline bool is_name_start_code_point(u32 code_point)
  36. {
  37. // FIXME: We use !is_ascii() for "non-ASCII code point" in the spec, but it's not quite right -
  38. // it treats EOF as a valid! The spec also lacks a definition of code point. For now, the
  39. // !is_eof() check is a hack, but it should work.
  40. return !is_eof(code_point) && (is_ascii_alpha(code_point) || !is_ascii(code_point) || is_low_line(code_point));
  41. }
  42. static inline bool is_hyphen_minus(u32 code_point)
  43. {
  44. return code_point == 0x2D;
  45. }
  46. static inline bool is_name_code_point(u32 code_point)
  47. {
  48. return is_name_start_code_point(code_point) || is_ascii_digit(code_point) || is_hyphen_minus(code_point);
  49. }
  50. static inline bool is_non_printable(u32 code_point)
  51. {
  52. return code_point <= 0x8 || code_point == 0xB || (code_point >= 0xE && code_point <= 0x1F) || code_point == 0x7F;
  53. }
  54. static inline bool is_number_sign(u32 code_point)
  55. {
  56. return code_point == 0x23;
  57. }
  58. static inline bool is_reverse_solidus(u32 code_point)
  59. {
  60. return code_point == 0x5C;
  61. }
  62. static inline bool is_apostrophe(u32 code_point)
  63. {
  64. return code_point == 0x27;
  65. }
  66. static inline bool is_left_paren(u32 code_point)
  67. {
  68. return code_point == 0x28;
  69. }
  70. static inline bool is_right_paren(u32 code_point)
  71. {
  72. return code_point == 0x29;
  73. }
  74. static inline bool is_plus_sign(u32 code_point)
  75. {
  76. return code_point == 0x2B;
  77. }
  78. static inline bool is_comma(u32 code_point)
  79. {
  80. return code_point == 0x2C;
  81. }
  82. static inline bool is_full_stop(u32 code_point)
  83. {
  84. return code_point == 0x2E;
  85. }
  86. static inline bool is_newline(u32 code_point)
  87. {
  88. return code_point == 0xA;
  89. }
  90. static inline bool is_asterisk(u32 code_point)
  91. {
  92. return code_point == 0x2A;
  93. }
  94. static inline bool is_solidus(u32 code_point)
  95. {
  96. return code_point == 0x2F;
  97. }
  98. static inline bool is_colon(u32 code_point)
  99. {
  100. return code_point == 0x3A;
  101. }
  102. static inline bool is_semicolon(u32 code_point)
  103. {
  104. return code_point == 0x3B;
  105. }
  106. static inline bool is_less_than_sign(u32 code_point)
  107. {
  108. return code_point == 0x3C;
  109. }
  110. static inline bool is_greater_than_sign(u32 code_point)
  111. {
  112. return code_point == 0x3E;
  113. }
  114. static inline bool is_at(u32 code_point)
  115. {
  116. return code_point == 0x40;
  117. }
  118. static inline bool is_open_square_bracket(u32 code_point)
  119. {
  120. return code_point == 0x5B;
  121. }
  122. static inline bool is_closed_square_bracket(u32 code_point)
  123. {
  124. return code_point == 0x5D;
  125. }
  126. static inline bool is_open_curly_bracket(u32 code_point)
  127. {
  128. return code_point == 0x7B;
  129. }
  130. static inline bool is_closed_curly_bracket(u32 code_point)
  131. {
  132. return code_point == 0x7D;
  133. }
  134. static inline bool is_whitespace(u32 code_point)
  135. {
  136. return code_point == 0x9 || code_point == 0xA || code_point == 0x20;
  137. }
  138. static inline bool is_percent(u32 code_point)
  139. {
  140. return code_point == 0x25;
  141. }
  142. static inline bool is_exclamation_mark(u32 code_point)
  143. {
  144. return code_point == 0x21;
  145. }
  146. static inline bool is_e(u32 code_point)
  147. {
  148. return code_point == 0x65;
  149. }
  150. static inline bool is_E(u32 code_point)
  151. {
  152. return code_point == 0x45;
  153. }
  154. namespace Web::CSS {
  155. Tokenizer::Tokenizer(const StringView& input, const String& encoding)
  156. {
  157. auto* decoder = TextCodec::decoder_for(encoding);
  158. VERIFY(decoder);
  159. StringBuilder builder(input.length());
  160. // Preprocess the stream, by doing the following:
  161. // - Replace \r, \f and \r\n with \n
  162. // - replace \0 and anything between U+D800 to U+DFFF with the replacement
  163. // character.
  164. // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
  165. bool last_was_carriage_return = false;
  166. decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) {
  167. if (code_point == '\r') {
  168. if (last_was_carriage_return) {
  169. builder.append('\n');
  170. } else {
  171. last_was_carriage_return = true;
  172. }
  173. } else {
  174. if (last_was_carriage_return) {
  175. builder.append('\n');
  176. }
  177. if (code_point == '\n') {
  178. if (!last_was_carriage_return) {
  179. builder.append('\n');
  180. }
  181. } else if (code_point == '\f') {
  182. builder.append('\n');
  183. } else if (code_point >= 0xD800 && code_point <= 0xDFFF) {
  184. builder.append_code_point(REPLACEMENT_CHARACTER);
  185. } else {
  186. builder.append_code_point(code_point);
  187. }
  188. last_was_carriage_return = false;
  189. }
  190. });
  191. m_decoded_input = builder.to_string();
  192. m_utf8_view = Utf8View(m_decoded_input);
  193. m_utf8_iterator = m_utf8_view.begin();
  194. }
  195. Vector<Token> Tokenizer::parse()
  196. {
  197. Vector<Token> tokens;
  198. for (;;) {
  199. auto token = consume_a_token();
  200. tokens.append(token);
  201. if (token.is(Token::Type::EndOfFile)) {
  202. return tokens;
  203. }
  204. }
  205. }
  206. u32 Tokenizer::next_code_point()
  207. {
  208. if (m_utf8_iterator == m_utf8_view.end())
  209. return TOKENIZER_EOF;
  210. m_prev_utf8_iterator = m_utf8_iterator;
  211. ++m_utf8_iterator;
  212. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Next code_point: {:d}", *m_prev_utf8_iterator);
  213. return *m_prev_utf8_iterator;
  214. }
  215. u32 Tokenizer::peek_code_point(size_t offset) const
  216. {
  217. auto it = m_utf8_iterator;
  218. for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
  219. ++it;
  220. if (it == m_utf8_view.end())
  221. return TOKENIZER_EOF;
  222. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek code_point: {:d}", *m_prev_utf8_iterator);
  223. return *it;
  224. }
  225. U32Twin Tokenizer::peek_twin() const
  226. {
  227. U32Twin values { TOKENIZER_EOF, TOKENIZER_EOF };
  228. auto it = m_utf8_iterator;
  229. for (size_t i = 0; i < 2 && it != m_utf8_view.end(); ++i) {
  230. values.set(i, *it);
  231. ++it;
  232. }
  233. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek twin: {:d},{:d}", values.first, values.second);
  234. return values;
  235. }
  236. U32Triplet Tokenizer::peek_triplet() const
  237. {
  238. U32Triplet values { TOKENIZER_EOF, TOKENIZER_EOF, TOKENIZER_EOF };
  239. auto it = m_utf8_iterator;
  240. for (size_t i = 0; i < 3 && it != m_utf8_view.end(); ++i) {
  241. values.set(i, *it);
  242. ++it;
  243. }
  244. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek triplet: {:d},{:d},{:d}", values.first, values.second, values.third);
  245. return values;
  246. }
  247. Token Tokenizer::create_new_token(Token::Type type)
  248. {
  249. Token token = {};
  250. token.m_type = type;
  251. return token;
  252. }
// Convenience wrapper producing the EndOfFile token.
Token Tokenizer::create_eof_token()
{
    return create_new_token(Token::Type::EndOfFile);
}
  257. Token Tokenizer::create_value_token(Token::Type type, String value)
  258. {
  259. Token token;
  260. token.m_type = type;
  261. token.m_value.append(move(value));
  262. return token;
  263. }
  264. Token Tokenizer::create_value_token(Token::Type type, u32 value)
  265. {
  266. Token token = {};
  267. token.m_type = type;
  268. token.m_value.append_code_point(value);
  269. return token;
  270. }
// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
// Returns the code point an escape sequence stands for. Expects the
// introducing U+005C REVERSE SOLIDUS to have been consumed already by the
// caller.
u32 Tokenizer::consume_escaped_code_point()
{
    auto input = next_code_point();
    // EOF immediately after the backslash: parse error, yield U+FFFD.
    if (is_eof(input)) {
        log_parse_error();
        return REPLACEMENT_CHARACTER;
    }
    if (is_ascii_hex_digit(input)) {
        StringBuilder builder;
        builder.append_code_point(input);
        // Consume up to 5 further hex digits (6 total).
        size_t counter = 0;
        while (is_ascii_hex_digit(peek_code_point()) && counter++ < 5) {
            builder.append_code_point(next_code_point());
        }
        // A single whitespace code point after the digits belongs to the escape.
        if (is_whitespace(peek_code_point())) {
            (void)next_code_point();
        }
        auto unhexed = strtoul(builder.to_string().characters(), nullptr, 16);
        // Zero, surrogates and values past U+10FFFF all map to U+FFFD.
        if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) {
            return REPLACEMENT_CHARACTER;
        }
        return unhexed;
    }
    // NOTE(review): a 0 code point here should be unreachable if preprocessing
    // replaced NULs — kept as a defensive check; confirm against the ctor.
    if (!input) {
        log_parse_error();
        return REPLACEMENT_CHARACTER;
    }
    // Anything else: the escaped code point is the code point itself.
    return input;
}
// https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token
// Consumes a name and classifies it as an Ident, a Function (name followed by
// '('), or hands off to the url-token logic for unquoted url(...).
Token Tokenizer::consume_an_ident_like_token()
{
    auto string = consume_a_name();
    if (string.equals_ignoring_case("url") && is_left_paren(peek_code_point())) {
        (void)next_code_point();
        // Skip whitespace, but deliberately stop while at least one whitespace
        // code point remains, so the two-code-point check below can still match
        // the <whitespace><quote> patterns.
        for (;;) {
            auto maybe_whitespace = peek_twin();
            if (!(is_whitespace(maybe_whitespace.first) && is_whitespace(maybe_whitespace.second))) {
                break;
            }
            (void)next_code_point();
        }
        auto next_two = peek_twin();
        // if one of these ", ', ' "', " '"
        if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) {
            // Quoted url("...") is treated as a function token.
            return create_value_token(Token::Type::Function, string);
        }
        // Unquoted url(...) gets the dedicated url-token path.
        return consume_a_url_token();
    }
    if (is_left_paren(peek_code_point())) {
        (void)next_code_point();
        return create_value_token(Token::Type::Function, string);
    }
    return create_value_token(Token::Type::Ident, string);
}
  325. CSSNumber Tokenizer::consume_a_number()
  326. {
  327. StringBuilder repr;
  328. Token::NumberType type = Token::NumberType::Integer;
  329. auto next_input = peek_code_point();
  330. if (is_plus_sign(next_input) || is_hyphen_minus(next_input)) {
  331. repr.append_code_point(next_code_point());
  332. }
  333. for (;;) {
  334. auto digits = peek_code_point();
  335. if (!is_ascii_digit(digits))
  336. break;
  337. repr.append_code_point(next_code_point());
  338. }
  339. auto maybe_number = peek_twin();
  340. if (is_full_stop(maybe_number.first) && is_ascii_digit(maybe_number.second)) {
  341. repr.append_code_point(next_code_point());
  342. repr.append_code_point(next_code_point());
  343. type = Token::NumberType::Number;
  344. for (;;) {
  345. auto digit = peek_code_point();
  346. if (!is_ascii_digit(digit))
  347. break;
  348. repr.append_code_point(next_code_point());
  349. }
  350. }
  351. auto maybe_exp = peek_triplet();
  352. if (is_E(maybe_exp.first) || is_e(maybe_exp.first)) {
  353. if (is_plus_sign(maybe_exp.second) || is_hyphen_minus(maybe_exp.second)) {
  354. if (is_ascii_digit(maybe_exp.third)) {
  355. repr.append_code_point(next_code_point());
  356. repr.append_code_point(next_code_point());
  357. repr.append_code_point(next_code_point());
  358. }
  359. } else if (is_ascii_digit(maybe_exp.second)) {
  360. repr.append_code_point(next_code_point());
  361. repr.append_code_point(next_code_point());
  362. }
  363. type = Token::NumberType::Number;
  364. for (;;) {
  365. auto digits = peek_code_point();
  366. if (!is_ascii_digit(digits))
  367. break;
  368. repr.append_code_point(next_code_point());
  369. }
  370. }
  371. return { repr.to_string(), type };
  372. }
// https://www.w3.org/TR/css-syntax-3/#consume-name
// Consumes name code points (and escape sequences) until a non-name code
// point is reached; that terminator is reconsumed so the caller sees it.
String Tokenizer::consume_a_name()
{
    StringBuilder result;
    for (;;) {
        auto input = next_code_point();
        if (is_eof(input))
            break;
        if (is_name_code_point(input)) {
            result.append_code_point(input);
            continue;
        }
        // `input` is already consumed, so (input, peek) is the candidate
        // escape pair — consume_escaped_code_point() then reads from peek on.
        auto next = peek_code_point();
        if (!is_eof(next) && is_valid_escape_sequence({ input, next })) {
            result.append_code_point(consume_escaped_code_point());
            continue;
        }
        // Not part of the name: put it back and stop.
        reconsume_current_input_code_point();
        break;
    }
    return result.to_string();
}
  394. Token Tokenizer::consume_a_url_token()
  395. {
  396. auto token = create_new_token(Token::Type::Url);
  397. for (;;) {
  398. if (!is_whitespace(peek_code_point())) {
  399. break;
  400. }
  401. (void)next_code_point();
  402. }
  403. for (;;) {
  404. auto input = peek_code_point();
  405. if (is_eof(input)) {
  406. log_parse_error();
  407. return token;
  408. }
  409. if (is_right_paren(input)) {
  410. (void)next_code_point();
  411. return token;
  412. }
  413. if (is_whitespace(input)) {
  414. for (;;) {
  415. if (!is_whitespace(peek_code_point())) {
  416. break;
  417. }
  418. input = next_code_point();
  419. }
  420. if (is_eof(input)) {
  421. log_parse_error();
  422. return token;
  423. }
  424. if (is_right_paren(input)) {
  425. return token;
  426. }
  427. consume_the_remnants_of_a_bad_url();
  428. return create_new_token(Token::Type::BadUrl);
  429. }
  430. if (is_quotation_mark(input) || is_apostrophe(input) || is_left_paren(input) || is_non_printable(input)) {
  431. log_parse_error();
  432. (void)next_code_point();
  433. consume_the_remnants_of_a_bad_url();
  434. return create_new_token(Token::Type::BadUrl);
  435. }
  436. if (is_reverse_solidus(input)) {
  437. if (is_valid_escape_sequence(peek_twin())) {
  438. token.m_value.append_code_point(consume_escaped_code_point());
  439. } else {
  440. log_parse_error();
  441. (void)next_code_point();
  442. consume_the_remnants_of_a_bad_url();
  443. return create_new_token(Token::Type::BadUrl);
  444. }
  445. }
  446. token.m_value.append_code_point(next_code_point());
  447. }
  448. }
  449. void Tokenizer::consume_the_remnants_of_a_bad_url()
  450. {
  451. for (;;) {
  452. auto next = peek_code_point();
  453. if (is_eof(next)) {
  454. return;
  455. }
  456. auto input = next;
  457. if (is_right_paren(input)) {
  458. (void)next_code_point();
  459. return;
  460. }
  461. if (is_valid_escape_sequence(peek_twin())) {
  462. [[maybe_unused]] auto cp = consume_escaped_code_point();
  463. }
  464. (void)next_code_point();
  465. }
  466. }
// Steps the stream back one code point, so the value most recently returned
// by next_code_point() will be produced again.
void Tokenizer::reconsume_current_input_code_point()
{
    m_utf8_iterator = m_prev_utf8_iterator;
}
// https://www.w3.org/TR/css-syntax-3/#consume-numeric-token
// Consumes a number and classifies the result as a Dimension (number followed
// by a unit name), a Percentage (number followed by '%'), or a plain Number.
Token Tokenizer::consume_a_numeric_token()
{
    auto number = consume_a_number();
    if (would_start_an_identifier()) {
        auto token = create_new_token(Token::Type::Dimension);
        token.m_value.append(number.value);
        token.m_number_type = number.type;
        auto unit = consume_a_name();
        // would_start_an_identifier() guaranteed a real name follows.
        VERIFY(!unit.is_empty() && !unit.is_whitespace());
        token.m_unit.append(unit);
        return token;
    }
    if (is_percent(peek_code_point())) {
        (void)next_code_point();
        auto token = create_new_token(Token::Type::Percentage);
        token.m_value.append(number.value);
        return token;
    }
    auto token = create_new_token(Token::Type::Number);
    token.m_value.append(number.value);
    token.m_number_type = number.type;
    return token;
}
// https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
// Checks whether the next three unconsumed code points would start a number.
bool Tokenizer::starts_with_a_number() const
{
    return starts_with_a_number(peek_triplet());
}
  498. bool Tokenizer::starts_with_a_number(U32Triplet values)
  499. {
  500. if (is_plus_sign(values.first) || is_hyphen_minus(values.first)) {
  501. if (is_ascii_digit(values.second))
  502. return true;
  503. if (is_full_stop(values.second) && is_ascii_digit(values.third))
  504. return true;
  505. return false;
  506. }
  507. if (is_full_stop(values.first))
  508. return is_ascii_digit(values.second);
  509. if (is_ascii_digit(values.first))
  510. return true;
  511. return false;
  512. }
  513. bool Tokenizer::is_valid_escape_sequence(U32Twin values)
  514. {
  515. if (!is_reverse_solidus(values.first)) {
  516. return false;
  517. }
  518. if (is_newline(values.second)) {
  519. return false;
  520. }
  521. return true;
  522. }
// https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
// Checks whether the next three unconsumed code points would start an identifier.
bool Tokenizer::would_start_an_identifier()
{
    return would_start_an_identifier(peek_triplet());
}
  527. bool Tokenizer::would_start_an_identifier(U32Triplet values)
  528. {
  529. if (is_hyphen_minus(values.first)) {
  530. if (is_name_start_code_point(values.second) || is_hyphen_minus(values.second) || is_valid_escape_sequence(values.to_twin_23()))
  531. return true;
  532. return false;
  533. }
  534. if (is_name_start_code_point(values.first)) {
  535. return true;
  536. }
  537. if (is_reverse_solidus(values.first)) {
  538. if (is_valid_escape_sequence(values.to_twin_12()))
  539. return true;
  540. return false;
  541. }
  542. return false;
  543. }
  544. Token Tokenizer::consume_string_token(u32 ending_code_point)
  545. {
  546. auto token = create_new_token(Token::Type::String);
  547. for (;;) {
  548. auto input = next_code_point();
  549. if (is_eof(input)) {
  550. log_parse_error();
  551. return token;
  552. }
  553. if (input == ending_code_point)
  554. return token;
  555. if (is_newline(input)) {
  556. reconsume_current_input_code_point();
  557. return create_new_token(Token::Type::BadString);
  558. }
  559. if (is_reverse_solidus(input)) {
  560. auto next_input = peek_code_point();
  561. if (is_eof(next_input))
  562. continue;
  563. if (is_newline(next_input)) {
  564. (void)next_code_point();
  565. continue;
  566. }
  567. auto escaped = consume_escaped_code_point();
  568. token.m_value.append_code_point(escaped);
  569. }
  570. token.m_value.append_code_point(input);
  571. }
  572. }
  573. void Tokenizer::consume_comments()
  574. {
  575. start:
  576. auto twin = peek_twin();
  577. if (!(is_solidus(twin.first) && is_asterisk(twin.second)))
  578. return;
  579. (void)next_code_point();
  580. (void)next_code_point();
  581. for (;;) {
  582. auto twin_inner = peek_twin();
  583. if (is_eof(twin_inner.first) || is_eof(twin_inner.second)) {
  584. log_parse_error();
  585. return;
  586. }
  587. if (is_asterisk(twin_inner.first) && is_solidus(twin_inner.second)) {
  588. (void)next_code_point();
  589. (void)next_code_point();
  590. goto start;
  591. }
  592. (void)next_code_point();
  593. }
  594. }
// https://www.w3.org/TR/css-syntax-3/#consume-token
// Consumes and returns the next token from the input stream. Comments are
// skipped first; each branch below dispatches on the first code point.
Token Tokenizer::consume_a_token()
{
    consume_comments();
    auto input = next_code_point();
    if (is_eof(input)) {
        return create_new_token(Token::Type::EndOfFile);
    }
    // A run of whitespace collapses into one Whitespace token.
    if (is_whitespace(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is whitespace");
        auto next = peek_code_point();
        while (is_whitespace(next)) {
            (void)next_code_point();
            next = peek_code_point();
        }
        return create_new_token(Token::Type::Whitespace);
    }
    if (is_quotation_mark(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is quotation mark");
        return consume_string_token(input);
    }
    // '#': Hash token when a name or escape follows, otherwise a '#' delim.
    if (is_number_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is number sign");
        auto next_input = peek_code_point();
        auto maybe_escape = peek_twin();
        if (is_name_code_point(next_input) || is_valid_escape_sequence(maybe_escape)) {
            auto token = create_new_token(Token::Type::Hash);
            // Hash is flagged as an "id" when the rest would start an identifier.
            if (would_start_an_identifier())
                token.m_hash_type = Token::HashType::Id;
            auto name = consume_a_name();
            token.m_value.append(name);
            return token;
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_apostrophe(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is apostrophe");
        return consume_string_token(input);
    }
    if (is_left_paren(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is left paren");
        return create_new_token(Token::Type::OpenParen);
    }
    if (is_right_paren(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is right paren");
        return create_new_token(Token::Type::CloseParen);
    }
    // '+': numeric token when it starts a number, otherwise a delim.
    if (is_plus_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is plus sign");
        if (starts_with_a_number()) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_comma(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is comma");
        return create_new_token(Token::Type::Comma);
    }
    // '-': a number, the CDC marker "-->", an identifier, or a delim.
    if (is_hyphen_minus(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is hyphen minus");
        if (starts_with_a_number()) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }
        auto next_twin = peek_twin();
        if (is_hyphen_minus(next_twin.first) && is_greater_than_sign(next_twin.second)) {
            (void)next_code_point();
            (void)next_code_point();
            return create_new_token(Token::Type::CDC);
        }
        if (would_start_an_identifier()) {
            reconsume_current_input_code_point();
            return consume_an_ident_like_token();
        }
        return create_value_token(Token::Type::Delim, input);
    }
    // '.': numeric token when it starts a number (".5"), otherwise a delim.
    if (is_full_stop(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is full stop");
        if (starts_with_a_number()) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_colon(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is colon");
        return create_new_token(Token::Type::Colon);
    }
    if (is_semicolon(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is semicolon");
        return create_new_token(Token::Type::Semicolon);
    }
    // '<': the CDO marker "<!--", otherwise a delim.
    if (is_less_than_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is less than");
        auto maybe_cdo = peek_triplet();
        if (is_exclamation_mark(maybe_cdo.first) && is_hyphen_minus(maybe_cdo.second) && is_hyphen_minus(maybe_cdo.third)) {
            (void)next_code_point();
            (void)next_code_point();
            (void)next_code_point();
            return create_new_token(Token::Type::CDO);
        }
        return create_value_token(Token::Type::Delim, input);
    }
    // '@': AtKeyword when an identifier follows, otherwise a delim.
    if (is_at(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is at");
        if (would_start_an_identifier()) {
            auto name = consume_a_name();
            return create_value_token(Token::Type::AtKeyword, name);
        }
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_open_square_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is open square");
        return create_new_token(Token::Type::OpenSquare);
    }
    // '\': start of an escaped identifier when valid, else a parse error.
    if (is_reverse_solidus(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is reverse solidus");
        if (is_valid_escape_sequence({ input, peek_code_point() })) {
            reconsume_current_input_code_point();
            return consume_an_ident_like_token();
        }
        log_parse_error();
        return create_value_token(Token::Type::Delim, input);
    }
    if (is_closed_square_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is closed square");
        return create_new_token(Token::Type::CloseSquare);
    }
    if (is_open_curly_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is open curly");
        return create_new_token(Token::Type::OpenCurly);
    }
    if (is_closed_curly_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is closed curly");
        return create_new_token(Token::Type::CloseCurly);
    }
    if (is_ascii_digit(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is digit");
        reconsume_current_input_code_point();
        return consume_a_numeric_token();
    }
    if (is_name_start_code_point(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is name start");
        reconsume_current_input_code_point();
        return consume_an_ident_like_token();
    }
    // Anything else is a delim token carrying the code point itself.
    dbgln_if(CSS_TOKENIZER_DEBUG, "is delimiter");
    return create_value_token(Token::Type::Delim, input);
}
  744. }