Tokenizer.cpp 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940
  1. /*
  2. * Copyright (c) 2020-2021, SerenityOS developers
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <AK/Vector.h>
  27. #include <LibTextCodec/Decoder.h>
  28. #include <LibWeb/CSS/Parser/Tokenizer.h>
  29. #include <ctype.h>
  30. #define CSS_TOKENIZER_TRACE 0
  31. #define PARSE_ERROR() \
  32. do { \
  33. dbgln_if(CSS_TOKENIZER_TRACE, "Parse error (css tokenization) {} @ {}", __PRETTY_FUNCTION__, __LINE__); \
  34. } while (0)
  35. //U+FFFD REPLACEMENT CHARACTER (�)
  36. #define REPLACEMENT_CHARACTER 0xFFFD
  37. static inline bool is_surrogate(u32 codepoint)
  38. {
  39. return (codepoint & 0xfffff800) == 0xd800;
  40. }
  41. static inline bool is_quotation_mark(u32 codepoint)
  42. {
  43. return codepoint == 0x22;
  44. }
  45. static inline bool is_greater_than_maximum_allowed_codepoint(u32 codepoint)
  46. {
  47. return codepoint > 0x10FFFF;
  48. }
  49. static inline bool is_hex_digit(u32 codepoint)
  50. {
  51. return isxdigit(codepoint);
  52. }
  53. static inline bool is_low_line(u32 codepoint)
  54. {
  55. return codepoint == 0x5F;
  56. }
  57. static inline bool is_non_ascii(u32 codepoint)
  58. {
  59. return codepoint >= 0x80;
  60. }
  61. static inline bool is_name_start_codepoint(u32 codepoint)
  62. {
  63. return isalpha(codepoint) || is_non_ascii(codepoint) || is_low_line(codepoint);
  64. }
  65. static inline bool is_hyphen_minus(u32 codepoint)
  66. {
  67. return codepoint == 0x2D;
  68. }
  69. static inline bool is_name_codepoint(u32 codepoint)
  70. {
  71. return is_name_start_codepoint(codepoint) || isdigit(codepoint) || is_hyphen_minus(codepoint);
  72. }
  73. static inline bool is_non_printable(u32 codepoint)
  74. {
  75. return codepoint <= 0x8 || codepoint == 0xB || (codepoint >= 0xE && codepoint <= 0x1F) || codepoint == 0x7F;
  76. }
  77. static inline bool is_number_sign(u32 codepoint)
  78. {
  79. return codepoint == 0x23;
  80. }
  81. static inline bool is_reverse_solidus(u32 codepoint)
  82. {
  83. return codepoint == 0x5C;
  84. }
  85. static inline bool is_apostrophe(u32 codepoint)
  86. {
  87. return codepoint == 0x27;
  88. }
  89. static inline bool is_left_paren(u32 codepoint)
  90. {
  91. return codepoint == 0x28;
  92. }
  93. static inline bool is_right_paren(u32 codepoint)
  94. {
  95. return codepoint == 0x29;
  96. }
  97. static inline bool is_plus_sign(u32 codepoint)
  98. {
  99. return codepoint == 0x2B;
  100. }
  101. static inline bool is_comma(u32 codepoint)
  102. {
  103. return codepoint == 0x2C;
  104. }
  105. static inline bool is_full_stop(u32 codepoint)
  106. {
  107. return codepoint == 0x2E;
  108. }
  109. static inline bool is_newline(u32 codepoint)
  110. {
  111. return codepoint == 0xA;
  112. }
  113. static inline bool is_asterisk(u32 codepoint)
  114. {
  115. return codepoint == 0x2A;
  116. }
  117. static inline bool is_solidus(u32 codepoint)
  118. {
  119. return codepoint == 0x2F;
  120. }
  121. static inline bool is_colon(u32 codepoint)
  122. {
  123. return codepoint == 0x3A;
  124. }
  125. static inline bool is_semicolon(u32 codepoint)
  126. {
  127. return codepoint == 0x3B;
  128. }
  129. static inline bool is_less_than_sign(u32 codepoint)
  130. {
  131. return codepoint == 0x3C;
  132. }
  133. static inline bool is_greater_than_sign(u32 codepoint)
  134. {
  135. return codepoint == 0x3E;
  136. }
  137. static inline bool is_at(u32 codepoint)
  138. {
  139. return codepoint == 0x40;
  140. }
  141. static inline bool is_open_square_bracket(u32 codepoint)
  142. {
  143. return codepoint == 0x5B;
  144. }
  145. static inline bool is_closed_square_bracket(u32 codepoint)
  146. {
  147. return codepoint == 0x5D;
  148. }
  149. static inline bool is_open_curly_bracket(u32 codepoint)
  150. {
  151. return codepoint == 0x7B;
  152. }
  153. static inline bool is_closed_curly_bracket(u32 codepoint)
  154. {
  155. return codepoint == 0x7D;
  156. }
  157. static inline bool is_whitespace(u32 codepoint)
  158. {
  159. return codepoint == 0x9 || codepoint == 0xA || codepoint == 0x20;
  160. }
  161. static inline bool is_percent(u32 codepoint)
  162. {
  163. return codepoint == 0x25;
  164. }
  165. static inline bool is_exclamation_mark(u32 codepoint)
  166. {
  167. return codepoint == 0x21;
  168. }
  169. static inline bool is_e(u32 codepoint)
  170. {
  171. return codepoint == 0x65;
  172. }
  173. static inline bool is_E(u32 codepoint)
  174. {
  175. return codepoint == 0x45;
  176. }
  177. namespace Web::CSS {
  178. Tokenizer::Tokenizer(const StringView& input, const String& encoding)
  179. {
  180. auto* decoder = TextCodec::decoder_for(encoding);
  181. VERIFY(decoder);
  182. // FIXME: preprocess the stream
  183. // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
  184. m_decoded_input = decoder->to_utf8(input);
  185. m_utf8_view = Utf8View(m_decoded_input);
  186. m_utf8_iterator = m_utf8_view.begin();
  187. }
  188. Vector<Token> Tokenizer::parse()
  189. {
  190. Vector<Token> tokens;
  191. for (;;) {
  192. auto token = consume_a_token();
  193. tokens.append(token);
  194. if (token.is_eof()) {
  195. return tokens;
  196. }
  197. }
  198. }
  199. Optional<u32> Tokenizer::next_codepoint()
  200. {
  201. if (m_utf8_iterator == m_utf8_view.end())
  202. return {};
  203. m_prev_utf8_iterator = m_utf8_iterator;
  204. ++m_utf8_iterator;
  205. dbgln_if(CSS_TOKENIZER_TRACE, "(Tokenizer) Next codepoint: {:c}", (char)*m_prev_utf8_iterator);
  206. return *m_prev_utf8_iterator;
  207. }
  208. Optional<u32> Tokenizer::peek_codepoint(size_t offset) const
  209. {
  210. auto it = m_utf8_iterator;
  211. for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
  212. ++it;
  213. if (it == m_utf8_view.end())
  214. return {};
  215. return *it;
  216. }
  217. Optional<U32Twin> Tokenizer::peek_twin() const
  218. {
  219. U32Twin values;
  220. auto it = m_utf8_iterator;
  221. for (size_t i = 0; i < 2 && it != m_utf8_view.end(); ++i) {
  222. if (it == m_utf8_view.end())
  223. return {};
  224. values.set(i, *it);
  225. ++it;
  226. }
  227. return values;
  228. }
  229. Optional<U32Triplet> Tokenizer::peek_triplet() const
  230. {
  231. U32Triplet values;
  232. auto it = m_utf8_iterator;
  233. for (size_t i = 0; i < 3 && it != m_utf8_view.end(); ++i) {
  234. if (it == m_utf8_view.end())
  235. return {};
  236. values.set(i, *it);
  237. ++it;
  238. }
  239. return values;
  240. }
  241. Token Tokenizer::create_new_token(Token::TokenType type)
  242. {
  243. Token token = {};
  244. token.m_type = type;
  245. return token;
  246. }
  247. Token Tokenizer::create_value_token(Token::TokenType type, String value)
  248. {
  249. Token token;
  250. token.m_type = type;
  251. token.m_value.append(move(value));
  252. return token;
  253. }
  254. Token Tokenizer::create_value_token(Token::TokenType type, u32 value)
  255. {
  256. Token token = {};
  257. token.m_type = type;
  258. token.m_value.append_code_point(value);
  259. return token;
  260. }
  261. u32 Tokenizer::consume_escaped_codepoint()
  262. {
  263. auto codepoint = next_codepoint();
  264. if (!codepoint.has_value()) {
  265. PARSE_ERROR();
  266. return REPLACEMENT_CHARACTER;
  267. }
  268. auto input = codepoint.value();
  269. if (is_hex_digit(input)) {
  270. StringBuilder builder;
  271. builder.append_code_point(input);
  272. size_t counter = 0;
  273. while (is_hex_digit(peek_codepoint().value()) && counter++ < 5) {
  274. builder.append_code_point(next_codepoint().value());
  275. }
  276. if (is_whitespace(peek_codepoint().value())) {
  277. (void)next_codepoint();
  278. }
  279. auto unhexed = strtoul(builder.to_string().characters(), nullptr, 16);
  280. if (unhexed == 0 || is_surrogate(unhexed) || is_greater_than_maximum_allowed_codepoint(unhexed)) {
  281. return REPLACEMENT_CHARACTER;
  282. }
  283. return unhexed;
  284. }
  285. if (!input) {
  286. PARSE_ERROR();
  287. return REPLACEMENT_CHARACTER;
  288. }
  289. return input;
  290. }
  291. Token Tokenizer::consume_an_ident_like_token()
  292. {
  293. auto string = consume_a_name();
  294. if (string.equals_ignoring_case("url") && is_left_paren(peek_codepoint().value())) {
  295. (void)next_codepoint();
  296. for (;;) {
  297. auto maybe_whitespace = peek_twin().value();
  298. if (!(is_whitespace(maybe_whitespace.first) && is_whitespace(maybe_whitespace.second))) {
  299. break;
  300. }
  301. (void)next_codepoint();
  302. }
  303. auto next_two = peek_twin().value();
  304. // if one of these ", ', ' "', " '"
  305. if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) {
  306. return create_value_token(Token::TokenType::Function, string);
  307. }
  308. return consume_a_url_token();
  309. }
  310. if (is_left_paren(peek_codepoint().value())) {
  311. (void)next_codepoint();
  312. return create_value_token(Token::TokenType::Function, string);
  313. }
  314. return create_value_token(Token::TokenType::Ident, string);
  315. }
  316. CSSNumber Tokenizer::consume_a_number()
  317. {
  318. StringBuilder repr;
  319. Token::NumberType type = Token::NumberType::Integer;
  320. auto next_input = peek_codepoint().value();
  321. if (is_plus_sign(next_input) || is_hyphen_minus(next_input)) {
  322. repr.append_code_point(next_codepoint().value());
  323. }
  324. for (;;) {
  325. auto digits = peek_codepoint().value();
  326. if (!isdigit(digits))
  327. break;
  328. repr.append_code_point(next_codepoint().value());
  329. }
  330. auto maybe_number = peek_twin().value();
  331. if (is_full_stop(maybe_number.first) && isdigit(maybe_number.second)) {
  332. repr.append_code_point(next_codepoint().value());
  333. repr.append_code_point(next_codepoint().value());
  334. type = Token::NumberType::Number;
  335. for (;;) {
  336. auto digits = peek_codepoint();
  337. if (digits.has_value() && !isdigit(digits.value()))
  338. break;
  339. repr.append_code_point(next_codepoint().value());
  340. }
  341. }
  342. auto maybe_exp = peek_triplet().value();
  343. if (is_E(maybe_exp.first) || is_e(maybe_exp.first)) {
  344. if (is_plus_sign(maybe_exp.second) || is_hyphen_minus(maybe_exp.second)) {
  345. if (isdigit(maybe_exp.third)) {
  346. repr.append_code_point(next_codepoint().value());
  347. repr.append_code_point(next_codepoint().value());
  348. repr.append_code_point(next_codepoint().value());
  349. }
  350. } else if (isdigit(maybe_exp.second)) {
  351. repr.append_code_point(next_codepoint().value());
  352. repr.append_code_point(next_codepoint().value());
  353. }
  354. type = Token::NumberType::Number;
  355. for (;;) {
  356. auto digits = peek_codepoint().value();
  357. if (!isdigit(digits))
  358. break;
  359. repr.append_code_point(next_codepoint().value());
  360. }
  361. }
  362. return { repr.to_string(), type };
  363. }
  364. String Tokenizer::consume_a_name()
  365. {
  366. StringBuilder result;
  367. for (;;) {
  368. auto input = next_codepoint().value();
  369. if (is_name_codepoint(input)) {
  370. result.append_code_point(input);
  371. continue;
  372. }
  373. auto next = peek_codepoint();
  374. if (next.has_value() && is_valid_escape_sequence({ input, next.value() })) {
  375. result.append_code_point(consume_escaped_codepoint());
  376. continue;
  377. }
  378. break;
  379. }
  380. reconsume_current_input_codepoint();
  381. return result.to_string();
  382. }
  383. Token Tokenizer::consume_a_url_token()
  384. {
  385. auto token = create_new_token(Token::TokenType::Url);
  386. for (;;) {
  387. if (!is_whitespace(peek_codepoint().value())) {
  388. break;
  389. }
  390. (void)next_codepoint();
  391. }
  392. for (;;) {
  393. auto codepoint = peek_codepoint();
  394. if (!codepoint.has_value()) {
  395. PARSE_ERROR();
  396. return token;
  397. }
  398. auto input = codepoint.value();
  399. if (is_right_paren(input)) {
  400. (void)next_codepoint();
  401. return token;
  402. }
  403. if (is_whitespace(input)) {
  404. for (;;) {
  405. if (!is_whitespace(peek_codepoint().value())) {
  406. break;
  407. }
  408. codepoint = next_codepoint();
  409. }
  410. if (!codepoint.has_value()) {
  411. PARSE_ERROR();
  412. return token;
  413. }
  414. input = codepoint.value();
  415. if (is_right_paren(input)) {
  416. return token;
  417. }
  418. consume_the_remnants_of_a_bad_url();
  419. return create_new_token(Token::TokenType::BadUrl);
  420. }
  421. if (is_quotation_mark(input) || is_apostrophe(input) || is_left_paren(input) || is_non_printable(input)) {
  422. PARSE_ERROR();
  423. (void)next_codepoint();
  424. consume_the_remnants_of_a_bad_url();
  425. return create_new_token(Token::TokenType::BadUrl);
  426. }
  427. if (is_reverse_solidus(input)) {
  428. if (is_valid_escape_sequence()) {
  429. token.m_value.append_code_point(consume_escaped_codepoint());
  430. } else {
  431. PARSE_ERROR();
  432. (void)next_codepoint();
  433. consume_the_remnants_of_a_bad_url();
  434. return create_new_token(Token::TokenType::BadUrl);
  435. }
  436. }
  437. token.m_value.append_code_point(next_codepoint().value());
  438. }
  439. }
  440. void Tokenizer::consume_the_remnants_of_a_bad_url()
  441. {
  442. for (;;) {
  443. auto next = peek_codepoint();
  444. if (!next.has_value()) {
  445. return;
  446. }
  447. auto input = next.value();
  448. if (is_right_paren(input)) {
  449. (void)next_codepoint();
  450. return;
  451. }
  452. if (is_valid_escape_sequence()) {
  453. [[maybe_unused]] auto cp = consume_escaped_codepoint();
  454. }
  455. (void)next_codepoint();
  456. }
  457. }
  458. void Tokenizer::reconsume_current_input_codepoint()
  459. {
  460. m_utf8_iterator = m_prev_utf8_iterator;
  461. }
  462. Token Tokenizer::consume_a_numeric_token()
  463. {
  464. auto number = consume_a_number();
  465. if (would_start_an_identifier()) {
  466. auto token = create_new_token(Token::TokenType::Dimension);
  467. token.m_value.append(number.value);
  468. token.m_number_type = number.type;
  469. auto unit = consume_a_name();
  470. token.m_unit.append(unit);
  471. return token;
  472. }
  473. if (is_percent(peek_codepoint().value())) {
  474. (void)next_codepoint();
  475. auto token = create_new_token(Token::TokenType::Percentage);
  476. token.m_value.append(number.value);
  477. return token;
  478. }
  479. auto token = create_new_token(Token::TokenType::Number);
  480. token.m_value.append(number.value);
  481. token.m_number_type = number.type;
  482. return token;
  483. }
  484. bool Tokenizer::starts_with_a_number() const
  485. {
  486. return starts_with_a_number(peek_triplet().value());
  487. }
  488. bool Tokenizer::starts_with_a_number(U32Triplet values)
  489. {
  490. if (is_plus_sign(values.first) || is_hyphen_minus(values.first)) {
  491. if (isdigit(values.second))
  492. return true;
  493. if (is_full_stop(values.second) && isdigit(values.third))
  494. return true;
  495. return false;
  496. }
  497. if (is_full_stop(values.first))
  498. return isdigit(values.second);
  499. if (isdigit(values.first))
  500. return true;
  501. return false;
  502. }
  503. bool Tokenizer::is_valid_escape_sequence()
  504. {
  505. return is_valid_escape_sequence(peek_twin().value());
  506. }
  507. bool Tokenizer::is_valid_escape_sequence(U32Twin values)
  508. {
  509. if (!is_reverse_solidus(values.first)) {
  510. return false;
  511. }
  512. if (is_newline(values.second)) {
  513. return false;
  514. }
  515. return true;
  516. }
  517. bool Tokenizer::would_start_an_identifier()
  518. {
  519. return would_start_an_identifier(peek_triplet().value());
  520. }
  521. bool Tokenizer::would_start_an_identifier(U32Triplet values)
  522. {
  523. if (is_hyphen_minus(values.first)) {
  524. if (is_name_start_codepoint(values.second) || is_hyphen_minus(values.second) || is_valid_escape_sequence(values.to_twin_23()))
  525. return true;
  526. return false;
  527. }
  528. if (is_name_start_codepoint(values.first)) {
  529. return true;
  530. }
  531. if (is_reverse_solidus(values.first)) {
  532. if (is_valid_escape_sequence(values.to_twin_12()))
  533. return true;
  534. return false;
  535. }
  536. return false;
  537. }
  538. Token Tokenizer::consume_string_token(u32 ending_codepoint)
  539. {
  540. auto token = create_new_token(Token::TokenType::String);
  541. for (;;) {
  542. auto codepoint = next_codepoint();
  543. if (!codepoint.has_value()) {
  544. PARSE_ERROR();
  545. return token;
  546. }
  547. auto input = codepoint.value();
  548. if (input == ending_codepoint)
  549. return token;
  550. if (is_newline(input)) {
  551. reconsume_current_input_codepoint();
  552. return create_new_token(Token::TokenType::BadString);
  553. }
  554. if (is_reverse_solidus(input)) {
  555. auto next_input = peek_codepoint();
  556. if (!next_input.has_value())
  557. continue;
  558. if (is_newline(next_input.value())) {
  559. (void)next_codepoint();
  560. continue;
  561. }
  562. auto escaped = consume_escaped_codepoint();
  563. token.m_value.append_code_point(escaped);
  564. }
  565. token.m_value.append_code_point(input);
  566. }
  567. }
  568. void Tokenizer::consume_comments()
  569. {
  570. start:
  571. auto peek = peek_twin();
  572. if (!peek.has_value()) {
  573. PARSE_ERROR();
  574. return;
  575. }
  576. auto twin = peek.value();
  577. if (!(is_solidus(twin.first) && is_asterisk(twin.second)))
  578. return;
  579. (void)next_codepoint();
  580. (void)next_codepoint();
  581. for (;;) {
  582. auto peek_inner = peek_twin();
  583. if (!peek_inner.has_value()) {
  584. PARSE_ERROR();
  585. return;
  586. }
  587. auto twin_inner = peek_inner.value();
  588. if (is_asterisk(twin_inner.first) && is_solidus(twin_inner.second)) {
  589. (void)next_codepoint();
  590. (void)next_codepoint();
  591. goto start;
  592. }
  593. (void)next_codepoint();
  594. }
  595. }
  596. Token Tokenizer::consume_a_token()
  597. {
  598. consume_comments();
  599. auto codepoint = next_codepoint();
  600. if (!codepoint.has_value()) {
  601. return create_new_token(Token::TokenType::EndOfFile);
  602. }
  603. auto input = codepoint.value();
  604. if (is_whitespace(input)) {
  605. dbgln_if(CSS_TOKENIZER_TRACE, "is whitespace");
  606. while (is_whitespace(peek_codepoint().value()))
  607. (void)next_codepoint();
  608. return create_new_token(Token::TokenType::Whitespace);
  609. }
  610. if (is_quotation_mark(input)) {
  611. dbgln_if(CSS_TOKENIZER_TRACE, "is quotation mark");
  612. return consume_string_token(input);
  613. }
  614. if (is_number_sign(input)) {
  615. dbgln_if(CSS_TOKENIZER_TRACE, "is number sign");
  616. auto next_input = peek_codepoint().value();
  617. auto maybe_escape = peek_twin().value();
  618. if (is_name_codepoint(next_input) || is_valid_escape_sequence(maybe_escape)) {
  619. auto token = create_new_token(Token::TokenType::Hash);
  620. if (would_start_an_identifier())
  621. token.m_hash_type = Token::HashType::Id;
  622. auto name = consume_a_name();
  623. token.m_value.append(name);
  624. return token;
  625. }
  626. return create_value_token(Token::TokenType::Delim, input);
  627. }
  628. if (is_apostrophe(input)) {
  629. dbgln_if(CSS_TOKENIZER_TRACE, "is apostrophe");
  630. return consume_string_token(input);
  631. }
  632. if (is_left_paren(input)) {
  633. dbgln_if(CSS_TOKENIZER_TRACE, "is left paren");
  634. return create_new_token(Token::TokenType::OpenParen);
  635. }
  636. if (is_right_paren(input)) {
  637. dbgln_if(CSS_TOKENIZER_TRACE, "is right paren");
  638. return create_new_token(Token::TokenType::CloseParen);
  639. }
  640. if (is_plus_sign(input)) {
  641. dbgln_if(CSS_TOKENIZER_TRACE, "is plus sign");
  642. if (starts_with_a_number()) {
  643. reconsume_current_input_codepoint();
  644. return consume_a_numeric_token();
  645. }
  646. return create_value_token(Token::TokenType::Delim, input);
  647. }
  648. if (is_comma(input)) {
  649. dbgln_if(CSS_TOKENIZER_TRACE, "is comma");
  650. return create_new_token(Token::TokenType::Comma);
  651. }
  652. if (is_hyphen_minus(input)) {
  653. dbgln_if(CSS_TOKENIZER_TRACE, "is hyphen minus");
  654. if (starts_with_a_number()) {
  655. reconsume_current_input_codepoint();
  656. return consume_a_numeric_token();
  657. }
  658. auto next_twin = peek_twin().value();
  659. if (is_hyphen_minus(next_twin.first) && is_greater_than_sign(next_twin.second)) {
  660. (void)next_codepoint();
  661. (void)next_codepoint();
  662. return create_new_token(Token::TokenType::CDC);
  663. }
  664. if (would_start_an_identifier()) {
  665. reconsume_current_input_codepoint();
  666. return consume_an_ident_like_token();
  667. }
  668. return create_value_token(Token::TokenType::Delim, input);
  669. }
  670. if (is_full_stop(input)) {
  671. dbgln_if(CSS_TOKENIZER_TRACE, "is full stop");
  672. if (starts_with_a_number()) {
  673. reconsume_current_input_codepoint();
  674. return consume_a_numeric_token();
  675. }
  676. return create_value_token(Token::TokenType::Delim, input);
  677. }
  678. if (is_colon(input)) {
  679. dbgln_if(CSS_TOKENIZER_TRACE, "is colon");
  680. return create_new_token(Token::TokenType::Colon);
  681. }
  682. if (is_semicolon(input)) {
  683. dbgln_if(CSS_TOKENIZER_TRACE, "is semicolon");
  684. return create_new_token(Token::TokenType::Semicolon);
  685. }
  686. if (is_less_than_sign(input)) {
  687. dbgln_if(CSS_TOKENIZER_TRACE, "is less than");
  688. auto maybe_cdo = peek_triplet().value();
  689. if (is_exclamation_mark(maybe_cdo.first) && is_hyphen_minus(maybe_cdo.second) && is_hyphen_minus(maybe_cdo.third)) {
  690. (void)next_codepoint();
  691. (void)next_codepoint();
  692. (void)next_codepoint();
  693. return create_new_token(Token::TokenType::CDO);
  694. }
  695. return create_value_token(Token::TokenType::Delim, input);
  696. }
  697. if (is_at(input)) {
  698. dbgln_if(CSS_TOKENIZER_TRACE, "is at");
  699. if (would_start_an_identifier()) {
  700. auto name = consume_a_name();
  701. return create_value_token(Token::TokenType::AtKeyword, input);
  702. }
  703. return create_value_token(Token::TokenType::Delim, input);
  704. }
  705. if (is_open_square_bracket(input)) {
  706. dbgln_if(CSS_TOKENIZER_TRACE, "is open square");
  707. return create_new_token(Token::TokenType::OpenSquare);
  708. }
  709. if (is_reverse_solidus(input)) {
  710. dbgln_if(CSS_TOKENIZER_TRACE, "is reverse solidus");
  711. if (is_valid_escape_sequence()) {
  712. reconsume_current_input_codepoint();
  713. return consume_an_ident_like_token();
  714. }
  715. PARSE_ERROR();
  716. return create_value_token(Token::TokenType::Delim, input);
  717. }
  718. if (is_closed_square_bracket(input)) {
  719. dbgln_if(CSS_TOKENIZER_TRACE, "is closed square");
  720. return create_new_token(Token::TokenType::CloseSquare);
  721. }
  722. if (is_open_curly_bracket(input)) {
  723. dbgln_if(CSS_TOKENIZER_TRACE, "is open curly");
  724. return create_new_token(Token::TokenType::OpenCurly);
  725. }
  726. if (is_closed_curly_bracket(input)) {
  727. dbgln_if(CSS_TOKENIZER_TRACE, "is closed curly");
  728. return create_new_token(Token::TokenType::CloseCurly);
  729. }
  730. if (isdigit(input)) {
  731. dbgln_if(CSS_TOKENIZER_TRACE, "is digit");
  732. reconsume_current_input_codepoint();
  733. return consume_a_numeric_token();
  734. }
  735. if (is_name_start_codepoint(input)) {
  736. dbgln_if(CSS_TOKENIZER_TRACE, "is name start");
  737. reconsume_current_input_codepoint();
  738. return consume_an_ident_like_token();
  739. }
  740. dbgln_if(CSS_TOKENIZER_TRACE, "is delimiter");
  741. return create_value_token(Token::TokenType::Delim, input);
  742. }
  743. }