Tokenizer.cpp 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948
  1. /*
  2. * Copyright (c) 2020-2021, the SerenityOS developers.
  3. * Copyright (c) 2021, Sam Atkins <atkinssj@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/Debug.h>
  9. #include <AK/SourceLocation.h>
  10. #include <AK/Vector.h>
  11. #include <LibTextCodec/Decoder.h>
  12. #include <LibWeb/CSS/Parser/Tokenizer.h>
  13. // U+FFFD REPLACEMENT CHARACTER (�)
  14. #define REPLACEMENT_CHARACTER 0xFFFD
  15. static const u32 TOKENIZER_EOF = 0xFFFFFFFF;
  16. static inline void log_parse_error(const SourceLocation& location = SourceLocation::current())
  17. {
  18. dbgln_if(CSS_TOKENIZER_DEBUG, "Parse error (css tokenization) {} ", location);
  19. }
  20. static inline bool is_eof(u32 code_point)
  21. {
  22. return code_point == TOKENIZER_EOF;
  23. }
  24. static inline bool is_quotation_mark(u32 code_point)
  25. {
  26. return code_point == 0x22;
  27. }
  28. static inline bool is_greater_than_maximum_allowed_code_point(u32 code_point)
  29. {
  30. return code_point > 0x10FFFF;
  31. }
  32. static inline bool is_low_line(u32 code_point)
  33. {
  34. return code_point == 0x5F;
  35. }
  36. static inline bool is_name_start_code_point(u32 code_point)
  37. {
  38. // FIXME: We use !is_ascii() for "non-ASCII code point" in the spec, but it's not quite right -
  39. // it treats EOF as a valid! The spec also lacks a definition of code point. For now, the
  40. // !is_eof() check is a hack, but it should work.
  41. return !is_eof(code_point) && (is_ascii_alpha(code_point) || !is_ascii(code_point) || is_low_line(code_point));
  42. }
  43. static inline bool is_hyphen_minus(u32 code_point)
  44. {
  45. return code_point == 0x2D;
  46. }
  47. static inline bool is_name_code_point(u32 code_point)
  48. {
  49. return is_name_start_code_point(code_point) || is_ascii_digit(code_point) || is_hyphen_minus(code_point);
  50. }
  51. static inline bool is_non_printable(u32 code_point)
  52. {
  53. return code_point <= 0x8 || code_point == 0xB || (code_point >= 0xE && code_point <= 0x1F) || code_point == 0x7F;
  54. }
  55. static inline bool is_number_sign(u32 code_point)
  56. {
  57. return code_point == 0x23;
  58. }
  59. static inline bool is_reverse_solidus(u32 code_point)
  60. {
  61. return code_point == 0x5C;
  62. }
  63. static inline bool is_apostrophe(u32 code_point)
  64. {
  65. return code_point == 0x27;
  66. }
  67. static inline bool is_left_paren(u32 code_point)
  68. {
  69. return code_point == 0x28;
  70. }
  71. static inline bool is_right_paren(u32 code_point)
  72. {
  73. return code_point == 0x29;
  74. }
  75. static inline bool is_plus_sign(u32 code_point)
  76. {
  77. return code_point == 0x2B;
  78. }
  79. static inline bool is_comma(u32 code_point)
  80. {
  81. return code_point == 0x2C;
  82. }
  83. static inline bool is_full_stop(u32 code_point)
  84. {
  85. return code_point == 0x2E;
  86. }
  87. static inline bool is_newline(u32 code_point)
  88. {
  89. return code_point == 0xA;
  90. }
  91. static inline bool is_asterisk(u32 code_point)
  92. {
  93. return code_point == 0x2A;
  94. }
  95. static inline bool is_solidus(u32 code_point)
  96. {
  97. return code_point == 0x2F;
  98. }
  99. static inline bool is_colon(u32 code_point)
  100. {
  101. return code_point == 0x3A;
  102. }
  103. static inline bool is_semicolon(u32 code_point)
  104. {
  105. return code_point == 0x3B;
  106. }
  107. static inline bool is_less_than_sign(u32 code_point)
  108. {
  109. return code_point == 0x3C;
  110. }
  111. static inline bool is_greater_than_sign(u32 code_point)
  112. {
  113. return code_point == 0x3E;
  114. }
  115. static inline bool is_at(u32 code_point)
  116. {
  117. return code_point == 0x40;
  118. }
  119. static inline bool is_open_square_bracket(u32 code_point)
  120. {
  121. return code_point == 0x5B;
  122. }
  123. static inline bool is_closed_square_bracket(u32 code_point)
  124. {
  125. return code_point == 0x5D;
  126. }
  127. static inline bool is_open_curly_bracket(u32 code_point)
  128. {
  129. return code_point == 0x7B;
  130. }
  131. static inline bool is_closed_curly_bracket(u32 code_point)
  132. {
  133. return code_point == 0x7D;
  134. }
  135. static inline bool is_whitespace(u32 code_point)
  136. {
  137. return code_point == 0x9 || code_point == 0xA || code_point == 0x20;
  138. }
  139. static inline bool is_percent(u32 code_point)
  140. {
  141. return code_point == 0x25;
  142. }
  143. static inline bool is_exclamation_mark(u32 code_point)
  144. {
  145. return code_point == 0x21;
  146. }
  147. static inline bool is_e(u32 code_point)
  148. {
  149. return code_point == 0x65;
  150. }
  151. static inline bool is_E(u32 code_point)
  152. {
  153. return code_point == 0x45;
  154. }
  155. namespace Web::CSS {
  156. Tokenizer::Tokenizer(const StringView& input, const String& encoding)
  157. {
  158. auto* decoder = TextCodec::decoder_for(encoding);
  159. VERIFY(decoder);
  160. StringBuilder builder(input.length());
  161. // Preprocess the stream, by doing the following:
  162. // - Replace \r, \f and \r\n with \n
  163. // - replace \0 and anything between U+D800 to U+DFFF with the replacement
  164. // character.
  165. // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
  166. bool last_was_carriage_return = false;
  167. decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) {
  168. if (code_point == '\r') {
  169. if (last_was_carriage_return) {
  170. builder.append('\n');
  171. } else {
  172. last_was_carriage_return = true;
  173. }
  174. } else {
  175. if (last_was_carriage_return) {
  176. builder.append('\n');
  177. }
  178. if (code_point == '\n') {
  179. if (!last_was_carriage_return) {
  180. builder.append('\n');
  181. }
  182. } else if (code_point == '\f') {
  183. builder.append('\n');
  184. } else if (code_point >= 0xD800 && code_point <= 0xDFFF) {
  185. builder.append_code_point(REPLACEMENT_CHARACTER);
  186. } else {
  187. builder.append_code_point(code_point);
  188. }
  189. last_was_carriage_return = false;
  190. }
  191. });
  192. m_decoded_input = builder.to_string();
  193. m_utf8_view = Utf8View(m_decoded_input);
  194. m_utf8_iterator = m_utf8_view.begin();
  195. }
  196. Vector<Token> Tokenizer::parse()
  197. {
  198. Vector<Token> tokens;
  199. for (;;) {
  200. auto token = consume_a_token();
  201. tokens.append(token);
  202. if (token.is(Token::Type::EndOfFile)) {
  203. return tokens;
  204. }
  205. }
  206. }
  207. u32 Tokenizer::next_code_point()
  208. {
  209. if (m_utf8_iterator == m_utf8_view.end())
  210. return TOKENIZER_EOF;
  211. m_prev_utf8_iterator = m_utf8_iterator;
  212. ++m_utf8_iterator;
  213. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Next code_point: {:d}", *m_prev_utf8_iterator);
  214. return *m_prev_utf8_iterator;
  215. }
  216. u32 Tokenizer::peek_code_point(size_t offset) const
  217. {
  218. auto it = m_utf8_iterator;
  219. for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
  220. ++it;
  221. if (it == m_utf8_view.end())
  222. return TOKENIZER_EOF;
  223. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek code_point: {:d}", *m_prev_utf8_iterator);
  224. return *it;
  225. }
  226. U32Twin Tokenizer::peek_twin() const
  227. {
  228. U32Twin values { TOKENIZER_EOF, TOKENIZER_EOF };
  229. auto it = m_utf8_iterator;
  230. for (size_t i = 0; i < 2 && it != m_utf8_view.end(); ++i) {
  231. values.set(i, *it);
  232. ++it;
  233. }
  234. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek twin: {:d},{:d}", values.first, values.second);
  235. return values;
  236. }
  237. U32Triplet Tokenizer::peek_triplet() const
  238. {
  239. U32Triplet values { TOKENIZER_EOF, TOKENIZER_EOF, TOKENIZER_EOF };
  240. auto it = m_utf8_iterator;
  241. for (size_t i = 0; i < 3 && it != m_utf8_view.end(); ++i) {
  242. values.set(i, *it);
  243. ++it;
  244. }
  245. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek triplet: {:d},{:d},{:d}", values.first, values.second, values.third);
  246. return values;
  247. }
  248. Token Tokenizer::create_new_token(Token::Type type)
  249. {
  250. Token token = {};
  251. token.m_type = type;
  252. return token;
  253. }
  254. Token Tokenizer::create_eof_token()
  255. {
  256. return create_new_token(Token::Type::EndOfFile);
  257. }
  258. Token Tokenizer::create_value_token(Token::Type type, String value)
  259. {
  260. Token token;
  261. token.m_type = type;
  262. token.m_value.append(move(value));
  263. return token;
  264. }
  265. Token Tokenizer::create_value_token(Token::Type type, u32 value)
  266. {
  267. Token token = {};
  268. token.m_type = type;
  269. token.m_value.append_code_point(value);
  270. return token;
  271. }
  272. // https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
  273. u32 Tokenizer::consume_escaped_code_point()
  274. {
  275. auto input = next_code_point();
  276. if (is_eof(input)) {
  277. log_parse_error();
  278. return REPLACEMENT_CHARACTER;
  279. }
  280. if (is_ascii_hex_digit(input)) {
  281. StringBuilder builder;
  282. builder.append_code_point(input);
  283. size_t counter = 0;
  284. while (is_ascii_hex_digit(peek_code_point()) && counter++ < 5) {
  285. builder.append_code_point(next_code_point());
  286. }
  287. if (is_whitespace(peek_code_point())) {
  288. (void)next_code_point();
  289. }
  290. auto unhexed = strtoul(builder.to_string().characters(), nullptr, 16);
  291. if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) {
  292. return REPLACEMENT_CHARACTER;
  293. }
  294. return unhexed;
  295. }
  296. if (!input) {
  297. log_parse_error();
  298. return REPLACEMENT_CHARACTER;
  299. }
  300. return input;
  301. }
  302. // https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token
  303. Token Tokenizer::consume_an_ident_like_token()
  304. {
  305. auto string = consume_a_name();
  306. if (string.equals_ignoring_case("url") && is_left_paren(peek_code_point())) {
  307. (void)next_code_point();
  308. for (;;) {
  309. auto maybe_whitespace = peek_twin();
  310. if (!(is_whitespace(maybe_whitespace.first) && is_whitespace(maybe_whitespace.second))) {
  311. break;
  312. }
  313. (void)next_code_point();
  314. }
  315. auto next_two = peek_twin();
  316. // if one of these ", ', ' "', " '"
  317. if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) {
  318. return create_value_token(Token::Type::Function, string);
  319. }
  320. return consume_a_url_token();
  321. }
  322. if (is_left_paren(peek_code_point())) {
  323. (void)next_code_point();
  324. return create_value_token(Token::Type::Function, string);
  325. }
  326. return create_value_token(Token::Type::Ident, string);
  327. }
  328. // https://www.w3.org/TR/css-syntax-3/#consume-number
  329. CSSNumber Tokenizer::consume_a_number()
  330. {
  331. StringBuilder repr;
  332. Token::NumberType type = Token::NumberType::Integer;
  333. auto next_input = peek_code_point();
  334. if (is_plus_sign(next_input) || is_hyphen_minus(next_input)) {
  335. repr.append_code_point(next_code_point());
  336. }
  337. for (;;) {
  338. auto digits = peek_code_point();
  339. if (!is_ascii_digit(digits))
  340. break;
  341. repr.append_code_point(next_code_point());
  342. }
  343. auto maybe_number = peek_twin();
  344. if (is_full_stop(maybe_number.first) && is_ascii_digit(maybe_number.second)) {
  345. repr.append_code_point(next_code_point());
  346. repr.append_code_point(next_code_point());
  347. type = Token::NumberType::Number;
  348. for (;;) {
  349. auto digit = peek_code_point();
  350. if (!is_ascii_digit(digit))
  351. break;
  352. repr.append_code_point(next_code_point());
  353. }
  354. }
  355. auto maybe_exp = peek_triplet();
  356. if (is_E(maybe_exp.first) || is_e(maybe_exp.first)) {
  357. if (is_plus_sign(maybe_exp.second) || is_hyphen_minus(maybe_exp.second)) {
  358. if (is_ascii_digit(maybe_exp.third)) {
  359. repr.append_code_point(next_code_point());
  360. repr.append_code_point(next_code_point());
  361. repr.append_code_point(next_code_point());
  362. }
  363. } else if (is_ascii_digit(maybe_exp.second)) {
  364. repr.append_code_point(next_code_point());
  365. repr.append_code_point(next_code_point());
  366. }
  367. type = Token::NumberType::Number;
  368. for (;;) {
  369. auto digits = peek_code_point();
  370. if (!is_ascii_digit(digits))
  371. break;
  372. repr.append_code_point(next_code_point());
  373. }
  374. }
  375. return { repr.to_string(), type };
  376. }
  377. // https://www.w3.org/TR/css-syntax-3/#consume-name
  378. String Tokenizer::consume_a_name()
  379. {
  380. StringBuilder result;
  381. for (;;) {
  382. auto input = next_code_point();
  383. if (is_eof(input))
  384. break;
  385. if (is_name_code_point(input)) {
  386. result.append_code_point(input);
  387. continue;
  388. }
  389. auto next = peek_code_point();
  390. if (!is_eof(next) && is_valid_escape_sequence({ input, next })) {
  391. result.append_code_point(consume_escaped_code_point());
  392. continue;
  393. }
  394. reconsume_current_input_code_point();
  395. break;
  396. }
  397. return result.to_string();
  398. }
  399. Token Tokenizer::consume_a_url_token()
  400. {
  401. auto token = create_new_token(Token::Type::Url);
  402. for (;;) {
  403. if (!is_whitespace(peek_code_point())) {
  404. break;
  405. }
  406. (void)next_code_point();
  407. }
  408. for (;;) {
  409. auto input = peek_code_point();
  410. if (is_eof(input)) {
  411. log_parse_error();
  412. return token;
  413. }
  414. if (is_right_paren(input)) {
  415. (void)next_code_point();
  416. return token;
  417. }
  418. if (is_whitespace(input)) {
  419. for (;;) {
  420. if (!is_whitespace(peek_code_point())) {
  421. break;
  422. }
  423. input = next_code_point();
  424. }
  425. if (is_eof(input)) {
  426. log_parse_error();
  427. return token;
  428. }
  429. if (is_right_paren(input)) {
  430. return token;
  431. }
  432. consume_the_remnants_of_a_bad_url();
  433. return create_new_token(Token::Type::BadUrl);
  434. }
  435. if (is_quotation_mark(input) || is_apostrophe(input) || is_left_paren(input) || is_non_printable(input)) {
  436. log_parse_error();
  437. (void)next_code_point();
  438. consume_the_remnants_of_a_bad_url();
  439. return create_new_token(Token::Type::BadUrl);
  440. }
  441. if (is_reverse_solidus(input)) {
  442. if (is_valid_escape_sequence(peek_twin())) {
  443. token.m_value.append_code_point(consume_escaped_code_point());
  444. } else {
  445. log_parse_error();
  446. (void)next_code_point();
  447. consume_the_remnants_of_a_bad_url();
  448. return create_new_token(Token::Type::BadUrl);
  449. }
  450. }
  451. token.m_value.append_code_point(next_code_point());
  452. }
  453. }
  454. // https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url
  455. void Tokenizer::consume_the_remnants_of_a_bad_url()
  456. {
  457. for (;;) {
  458. auto next = peek_code_point();
  459. if (is_eof(next)) {
  460. return;
  461. }
  462. auto input = next;
  463. if (is_right_paren(input)) {
  464. (void)next_code_point();
  465. return;
  466. }
  467. if (is_valid_escape_sequence(peek_twin())) {
  468. [[maybe_unused]] auto cp = consume_escaped_code_point();
  469. }
  470. (void)next_code_point();
  471. }
  472. }
  473. void Tokenizer::reconsume_current_input_code_point()
  474. {
  475. m_utf8_iterator = m_prev_utf8_iterator;
  476. }
  477. // https://www.w3.org/TR/css-syntax-3/#consume-numeric-token
  478. Token Tokenizer::consume_a_numeric_token()
  479. {
  480. auto number = consume_a_number();
  481. if (would_start_an_identifier()) {
  482. auto token = create_new_token(Token::Type::Dimension);
  483. token.m_value.append(number.value);
  484. token.m_number_type = number.type;
  485. auto unit = consume_a_name();
  486. VERIFY(!unit.is_empty() && !unit.is_whitespace());
  487. token.m_unit.append(unit);
  488. return token;
  489. }
  490. if (is_percent(peek_code_point())) {
  491. (void)next_code_point();
  492. auto token = create_new_token(Token::Type::Percentage);
  493. token.m_value.append(number.value);
  494. return token;
  495. }
  496. auto token = create_new_token(Token::Type::Number);
  497. token.m_value.append(number.value);
  498. token.m_number_type = number.type;
  499. return token;
  500. }
  501. bool Tokenizer::would_start_a_number() const
  502. {
  503. return would_start_a_number(peek_triplet());
  504. }
  505. // https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
  506. bool Tokenizer::would_start_a_number(U32Triplet values)
  507. {
  508. if (is_plus_sign(values.first) || is_hyphen_minus(values.first)) {
  509. if (is_ascii_digit(values.second))
  510. return true;
  511. if (is_full_stop(values.second) && is_ascii_digit(values.third))
  512. return true;
  513. return false;
  514. }
  515. if (is_full_stop(values.first))
  516. return is_ascii_digit(values.second);
  517. if (is_ascii_digit(values.first))
  518. return true;
  519. return false;
  520. }
  521. // https://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape
  522. bool Tokenizer::is_valid_escape_sequence(U32Twin values)
  523. {
  524. if (!is_reverse_solidus(values.first)) {
  525. return false;
  526. }
  527. if (is_newline(values.second)) {
  528. return false;
  529. }
  530. return true;
  531. }
  532. bool Tokenizer::would_start_an_identifier()
  533. {
  534. return would_start_an_identifier(peek_triplet());
  535. }
  536. // https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
  537. bool Tokenizer::would_start_an_identifier(U32Triplet values)
  538. {
  539. if (is_hyphen_minus(values.first)) {
  540. if (is_name_start_code_point(values.second) || is_hyphen_minus(values.second) || is_valid_escape_sequence(values.to_twin_23()))
  541. return true;
  542. return false;
  543. }
  544. if (is_name_start_code_point(values.first)) {
  545. return true;
  546. }
  547. if (is_reverse_solidus(values.first)) {
  548. if (is_valid_escape_sequence(values.to_twin_12()))
  549. return true;
  550. return false;
  551. }
  552. return false;
  553. }
  554. // https://www.w3.org/TR/css-syntax-3/#consume-string-token
  555. Token Tokenizer::consume_string_token(u32 ending_code_point)
  556. {
  557. auto token = create_new_token(Token::Type::String);
  558. for (;;) {
  559. auto input = next_code_point();
  560. if (is_eof(input)) {
  561. log_parse_error();
  562. return token;
  563. }
  564. if (input == ending_code_point)
  565. return token;
  566. if (is_newline(input)) {
  567. reconsume_current_input_code_point();
  568. return create_new_token(Token::Type::BadString);
  569. }
  570. if (is_reverse_solidus(input)) {
  571. auto next_input = peek_code_point();
  572. if (is_eof(next_input))
  573. continue;
  574. if (is_newline(next_input)) {
  575. (void)next_code_point();
  576. continue;
  577. }
  578. auto escaped = consume_escaped_code_point();
  579. token.m_value.append_code_point(escaped);
  580. }
  581. token.m_value.append_code_point(input);
  582. }
  583. }
  584. // https://www.w3.org/TR/css-syntax-3/#consume-comment
  585. void Tokenizer::consume_comments()
  586. {
  587. start:
  588. auto twin = peek_twin();
  589. if (!(is_solidus(twin.first) && is_asterisk(twin.second)))
  590. return;
  591. (void)next_code_point();
  592. (void)next_code_point();
  593. for (;;) {
  594. auto twin_inner = peek_twin();
  595. if (is_eof(twin_inner.first) || is_eof(twin_inner.second)) {
  596. log_parse_error();
  597. return;
  598. }
  599. if (is_asterisk(twin_inner.first) && is_solidus(twin_inner.second)) {
  600. (void)next_code_point();
  601. (void)next_code_point();
  602. goto start;
  603. }
  604. (void)next_code_point();
  605. }
  606. }
  607. // https://www.w3.org/TR/css-syntax-3/#consume-token
  608. Token Tokenizer::consume_a_token()
  609. {
  610. consume_comments();
  611. auto input = next_code_point();
  612. if (is_eof(input)) {
  613. return create_new_token(Token::Type::EndOfFile);
  614. }
  615. if (is_whitespace(input)) {
  616. dbgln_if(CSS_TOKENIZER_DEBUG, "is whitespace");
  617. auto next = peek_code_point();
  618. while (is_whitespace(next)) {
  619. (void)next_code_point();
  620. next = peek_code_point();
  621. }
  622. return create_new_token(Token::Type::Whitespace);
  623. }
  624. if (is_quotation_mark(input)) {
  625. dbgln_if(CSS_TOKENIZER_DEBUG, "is quotation mark");
  626. return consume_string_token(input);
  627. }
  628. if (is_number_sign(input)) {
  629. dbgln_if(CSS_TOKENIZER_DEBUG, "is number sign");
  630. auto next_input = peek_code_point();
  631. auto maybe_escape = peek_twin();
  632. if (is_name_code_point(next_input) || is_valid_escape_sequence(maybe_escape)) {
  633. auto token = create_new_token(Token::Type::Hash);
  634. if (would_start_an_identifier())
  635. token.m_hash_type = Token::HashType::Id;
  636. auto name = consume_a_name();
  637. token.m_value.append(name);
  638. return token;
  639. }
  640. return create_value_token(Token::Type::Delim, input);
  641. }
  642. if (is_apostrophe(input)) {
  643. dbgln_if(CSS_TOKENIZER_DEBUG, "is apostrophe");
  644. return consume_string_token(input);
  645. }
  646. if (is_left_paren(input)) {
  647. dbgln_if(CSS_TOKENIZER_DEBUG, "is left paren");
  648. return create_new_token(Token::Type::OpenParen);
  649. }
  650. if (is_right_paren(input)) {
  651. dbgln_if(CSS_TOKENIZER_DEBUG, "is right paren");
  652. return create_new_token(Token::Type::CloseParen);
  653. }
  654. if (is_plus_sign(input)) {
  655. dbgln_if(CSS_TOKENIZER_DEBUG, "is plus sign");
  656. if (would_start_a_number()) {
  657. reconsume_current_input_code_point();
  658. return consume_a_numeric_token();
  659. }
  660. return create_value_token(Token::Type::Delim, input);
  661. }
  662. if (is_comma(input)) {
  663. dbgln_if(CSS_TOKENIZER_DEBUG, "is comma");
  664. return create_new_token(Token::Type::Comma);
  665. }
  666. if (is_hyphen_minus(input)) {
  667. dbgln_if(CSS_TOKENIZER_DEBUG, "is hyphen minus");
  668. if (would_start_a_number()) {
  669. reconsume_current_input_code_point();
  670. return consume_a_numeric_token();
  671. }
  672. auto next_twin = peek_twin();
  673. if (is_hyphen_minus(next_twin.first) && is_greater_than_sign(next_twin.second)) {
  674. (void)next_code_point();
  675. (void)next_code_point();
  676. return create_new_token(Token::Type::CDC);
  677. }
  678. if (would_start_an_identifier()) {
  679. reconsume_current_input_code_point();
  680. return consume_an_ident_like_token();
  681. }
  682. return create_value_token(Token::Type::Delim, input);
  683. }
  684. if (is_full_stop(input)) {
  685. dbgln_if(CSS_TOKENIZER_DEBUG, "is full stop");
  686. if (would_start_a_number()) {
  687. reconsume_current_input_code_point();
  688. return consume_a_numeric_token();
  689. }
  690. return create_value_token(Token::Type::Delim, input);
  691. }
  692. if (is_colon(input)) {
  693. dbgln_if(CSS_TOKENIZER_DEBUG, "is colon");
  694. return create_new_token(Token::Type::Colon);
  695. }
  696. if (is_semicolon(input)) {
  697. dbgln_if(CSS_TOKENIZER_DEBUG, "is semicolon");
  698. return create_new_token(Token::Type::Semicolon);
  699. }
  700. if (is_less_than_sign(input)) {
  701. dbgln_if(CSS_TOKENIZER_DEBUG, "is less than");
  702. auto maybe_cdo = peek_triplet();
  703. if (is_exclamation_mark(maybe_cdo.first) && is_hyphen_minus(maybe_cdo.second) && is_hyphen_minus(maybe_cdo.third)) {
  704. (void)next_code_point();
  705. (void)next_code_point();
  706. (void)next_code_point();
  707. return create_new_token(Token::Type::CDO);
  708. }
  709. return create_value_token(Token::Type::Delim, input);
  710. }
  711. if (is_at(input)) {
  712. dbgln_if(CSS_TOKENIZER_DEBUG, "is at");
  713. if (would_start_an_identifier()) {
  714. auto name = consume_a_name();
  715. return create_value_token(Token::Type::AtKeyword, name);
  716. }
  717. return create_value_token(Token::Type::Delim, input);
  718. }
  719. if (is_open_square_bracket(input)) {
  720. dbgln_if(CSS_TOKENIZER_DEBUG, "is open square");
  721. return create_new_token(Token::Type::OpenSquare);
  722. }
  723. if (is_reverse_solidus(input)) {
  724. dbgln_if(CSS_TOKENIZER_DEBUG, "is reverse solidus");
  725. if (is_valid_escape_sequence({ input, peek_code_point() })) {
  726. reconsume_current_input_code_point();
  727. return consume_an_ident_like_token();
  728. }
  729. log_parse_error();
  730. return create_value_token(Token::Type::Delim, input);
  731. }
  732. if (is_closed_square_bracket(input)) {
  733. dbgln_if(CSS_TOKENIZER_DEBUG, "is closed square");
  734. return create_new_token(Token::Type::CloseSquare);
  735. }
  736. if (is_open_curly_bracket(input)) {
  737. dbgln_if(CSS_TOKENIZER_DEBUG, "is open curly");
  738. return create_new_token(Token::Type::OpenCurly);
  739. }
  740. if (is_closed_curly_bracket(input)) {
  741. dbgln_if(CSS_TOKENIZER_DEBUG, "is closed curly");
  742. return create_new_token(Token::Type::CloseCurly);
  743. }
  744. if (is_ascii_digit(input)) {
  745. dbgln_if(CSS_TOKENIZER_DEBUG, "is digit");
  746. reconsume_current_input_code_point();
  747. return consume_a_numeric_token();
  748. }
  749. if (is_name_start_code_point(input)) {
  750. dbgln_if(CSS_TOKENIZER_DEBUG, "is name start");
  751. reconsume_current_input_code_point();
  752. return consume_an_ident_like_token();
  753. }
  754. dbgln_if(CSS_TOKENIZER_DEBUG, "is delimiter");
  755. return create_value_token(Token::Type::Delim, input);
  756. }
  757. }