// Tokenizer.cpp
  1. /*
  2. * Copyright (c) 2020-2021, the SerenityOS developers.
  3. * Copyright (c) 2021, Sam Atkins <atkinssj@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/Debug.h>
  9. #include <AK/SourceLocation.h>
  10. #include <AK/Vector.h>
  11. #include <LibTextCodec/Decoder.h>
  12. #include <LibWeb/CSS/Parser/Tokenizer.h>
  13. #include <math.h>
  14. // U+FFFD REPLACEMENT CHARACTER (�)
  15. #define REPLACEMENT_CHARACTER 0xFFFD
  16. static const u32 TOKENIZER_EOF = 0xFFFFFFFF;
// Logs a CSS tokenization parse error, tagged with the call site that detected
// it. Only produces output when CSS_TOKENIZER_DEBUG is enabled.
static inline void log_parse_error(const SourceLocation& location = SourceLocation::current())
{
    dbgln_if(CSS_TOKENIZER_DEBUG, "Parse error (css tokenization) {} ", location);
}
  21. static inline bool is_eof(u32 code_point)
  22. {
  23. return code_point == TOKENIZER_EOF;
  24. }
  25. static inline bool is_quotation_mark(u32 code_point)
  26. {
  27. return code_point == 0x22;
  28. }
  29. static inline bool is_greater_than_maximum_allowed_code_point(u32 code_point)
  30. {
  31. return code_point > 0x10FFFF;
  32. }
  33. static inline bool is_low_line(u32 code_point)
  34. {
  35. return code_point == 0x5F;
  36. }
  37. static inline bool is_name_start_code_point(u32 code_point)
  38. {
  39. // FIXME: We use !is_ascii() for "non-ASCII code point" in the spec, but it's not quite right -
  40. // it treats EOF as a valid! The spec also lacks a definition of code point. For now, the
  41. // !is_eof() check is a hack, but it should work.
  42. return !is_eof(code_point) && (is_ascii_alpha(code_point) || !is_ascii(code_point) || is_low_line(code_point));
  43. }
  44. static inline bool is_hyphen_minus(u32 code_point)
  45. {
  46. return code_point == 0x2D;
  47. }
  48. static inline bool is_name_code_point(u32 code_point)
  49. {
  50. return is_name_start_code_point(code_point) || is_ascii_digit(code_point) || is_hyphen_minus(code_point);
  51. }
  52. static inline bool is_non_printable(u32 code_point)
  53. {
  54. return code_point <= 0x8 || code_point == 0xB || (code_point >= 0xE && code_point <= 0x1F) || code_point == 0x7F;
  55. }
  56. static inline bool is_number_sign(u32 code_point)
  57. {
  58. return code_point == 0x23;
  59. }
  60. static inline bool is_reverse_solidus(u32 code_point)
  61. {
  62. return code_point == 0x5C;
  63. }
  64. static inline bool is_apostrophe(u32 code_point)
  65. {
  66. return code_point == 0x27;
  67. }
  68. static inline bool is_left_paren(u32 code_point)
  69. {
  70. return code_point == 0x28;
  71. }
  72. static inline bool is_right_paren(u32 code_point)
  73. {
  74. return code_point == 0x29;
  75. }
  76. static inline bool is_plus_sign(u32 code_point)
  77. {
  78. return code_point == 0x2B;
  79. }
  80. static inline bool is_comma(u32 code_point)
  81. {
  82. return code_point == 0x2C;
  83. }
  84. static inline bool is_full_stop(u32 code_point)
  85. {
  86. return code_point == 0x2E;
  87. }
  88. static inline bool is_newline(u32 code_point)
  89. {
  90. return code_point == 0xA;
  91. }
  92. static inline bool is_asterisk(u32 code_point)
  93. {
  94. return code_point == 0x2A;
  95. }
  96. static inline bool is_solidus(u32 code_point)
  97. {
  98. return code_point == 0x2F;
  99. }
  100. static inline bool is_colon(u32 code_point)
  101. {
  102. return code_point == 0x3A;
  103. }
  104. static inline bool is_semicolon(u32 code_point)
  105. {
  106. return code_point == 0x3B;
  107. }
  108. static inline bool is_less_than_sign(u32 code_point)
  109. {
  110. return code_point == 0x3C;
  111. }
  112. static inline bool is_greater_than_sign(u32 code_point)
  113. {
  114. return code_point == 0x3E;
  115. }
  116. static inline bool is_at(u32 code_point)
  117. {
  118. return code_point == 0x40;
  119. }
  120. static inline bool is_open_square_bracket(u32 code_point)
  121. {
  122. return code_point == 0x5B;
  123. }
  124. static inline bool is_closed_square_bracket(u32 code_point)
  125. {
  126. return code_point == 0x5D;
  127. }
  128. static inline bool is_open_curly_bracket(u32 code_point)
  129. {
  130. return code_point == 0x7B;
  131. }
  132. static inline bool is_closed_curly_bracket(u32 code_point)
  133. {
  134. return code_point == 0x7D;
  135. }
  136. static inline bool is_whitespace(u32 code_point)
  137. {
  138. return code_point == 0x9 || code_point == 0xA || code_point == 0x20;
  139. }
  140. static inline bool is_percent(u32 code_point)
  141. {
  142. return code_point == 0x25;
  143. }
  144. static inline bool is_exclamation_mark(u32 code_point)
  145. {
  146. return code_point == 0x21;
  147. }
  148. static inline bool is_e(u32 code_point)
  149. {
  150. return code_point == 0x65;
  151. }
  152. static inline bool is_E(u32 code_point)
  153. {
  154. return code_point == 0x45;
  155. }
  156. namespace Web::CSS {
  157. Tokenizer::Tokenizer(StringView input, const String& encoding)
  158. {
  159. auto* decoder = TextCodec::decoder_for(encoding);
  160. VERIFY(decoder);
  161. StringBuilder builder(input.length());
  162. // Preprocess the stream, by doing the following:
  163. // - Replace \r, \f and \r\n with \n
  164. // - replace \0 and anything between U+D800 to U+DFFF with the replacement
  165. // character.
  166. // https://www.w3.org/TR/css-syntax-3/#input-preprocessing
  167. bool last_was_carriage_return = false;
  168. decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) {
  169. if (code_point == '\r') {
  170. if (last_was_carriage_return) {
  171. builder.append('\n');
  172. } else {
  173. last_was_carriage_return = true;
  174. }
  175. } else {
  176. if (last_was_carriage_return) {
  177. builder.append('\n');
  178. }
  179. if (code_point == '\n') {
  180. if (!last_was_carriage_return) {
  181. builder.append('\n');
  182. }
  183. } else if (code_point == '\f') {
  184. builder.append('\n');
  185. } else if (code_point >= 0xD800 && code_point <= 0xDFFF) {
  186. builder.append_code_point(REPLACEMENT_CHARACTER);
  187. } else {
  188. builder.append_code_point(code_point);
  189. }
  190. last_was_carriage_return = false;
  191. }
  192. });
  193. m_decoded_input = builder.to_string();
  194. m_utf8_view = Utf8View(m_decoded_input);
  195. m_utf8_iterator = m_utf8_view.begin();
  196. }
  197. Vector<Token> Tokenizer::parse()
  198. {
  199. Vector<Token> tokens;
  200. for (;;) {
  201. auto token_start = m_position;
  202. auto token = consume_a_token();
  203. token.m_start_position = token_start;
  204. token.m_end_position = m_position;
  205. tokens.append(token);
  206. if (token.is(Token::Type::EndOfFile)) {
  207. return tokens;
  208. }
  209. }
  210. }
  211. u32 Tokenizer::next_code_point()
  212. {
  213. if (m_utf8_iterator == m_utf8_view.end())
  214. return TOKENIZER_EOF;
  215. m_prev_utf8_iterator = m_utf8_iterator;
  216. ++m_utf8_iterator;
  217. auto code_point = *m_prev_utf8_iterator;
  218. m_prev_position = m_position;
  219. if (is_newline(code_point)) {
  220. m_position.line++;
  221. m_position.column = 0;
  222. } else {
  223. m_position.column++;
  224. }
  225. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Next code_point: {:d}", code_point);
  226. return code_point;
  227. }
  228. u32 Tokenizer::peek_code_point(size_t offset) const
  229. {
  230. auto it = m_utf8_iterator;
  231. for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
  232. ++it;
  233. if (it == m_utf8_view.end())
  234. return TOKENIZER_EOF;
  235. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek code_point: {:d}", *m_prev_utf8_iterator);
  236. return *it;
  237. }
  238. U32Twin Tokenizer::peek_twin() const
  239. {
  240. U32Twin values { TOKENIZER_EOF, TOKENIZER_EOF };
  241. auto it = m_utf8_iterator;
  242. for (size_t i = 0; i < 2 && it != m_utf8_view.end(); ++i) {
  243. values.set(i, *it);
  244. ++it;
  245. }
  246. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek twin: {:d},{:d}", values.first, values.second);
  247. return values;
  248. }
  249. U32Triplet Tokenizer::peek_triplet() const
  250. {
  251. U32Triplet values { TOKENIZER_EOF, TOKENIZER_EOF, TOKENIZER_EOF };
  252. auto it = m_utf8_iterator;
  253. for (size_t i = 0; i < 3 && it != m_utf8_view.end(); ++i) {
  254. values.set(i, *it);
  255. ++it;
  256. }
  257. dbgln_if(CSS_TOKENIZER_DEBUG, "(Tokenizer) Peek triplet: {:d},{:d},{:d}", values.first, values.second, values.third);
  258. return values;
  259. }
  260. Token Tokenizer::create_new_token(Token::Type type)
  261. {
  262. Token token = {};
  263. token.m_type = type;
  264. return token;
  265. }
  266. Token Tokenizer::create_eof_token()
  267. {
  268. return create_new_token(Token::Type::EndOfFile);
  269. }
  270. Token Tokenizer::create_value_token(Token::Type type, String value)
  271. {
  272. Token token;
  273. token.m_type = type;
  274. token.m_value.append(move(value));
  275. return token;
  276. }
  277. Token Tokenizer::create_value_token(Token::Type type, u32 value)
  278. {
  279. Token token = {};
  280. token.m_type = type;
  281. token.m_value.append_code_point(value);
  282. return token;
  283. }
// https://www.w3.org/TR/css-syntax-3/#consume-escaped-code-point
// Consumes an escaped code point, assuming the U+005C REVERSE SOLIDUS has
// already been consumed and the escape was verified to be valid.
// Returns the decoded code point, or U+FFFD for EOF, NUL, surrogates,
// and values above the maximum allowed code point.
u32 Tokenizer::consume_escaped_code_point()
{
    auto input = next_code_point();

    // EOF right after the backslash is a parse error.
    if (is_eof(input)) {
        log_parse_error();
        return REPLACEMENT_CHARACTER;
    }

    if (is_ascii_hex_digit(input)) {
        // Hex escape: up to 6 hex digits total (1 already consumed) ...
        StringBuilder builder;
        builder.append_code_point(input);
        size_t counter = 0;
        while (is_ascii_hex_digit(peek_code_point()) && counter++ < 5) {
            builder.append_code_point(next_code_point());
        }
        // ... followed by one optional whitespace code point, which is eaten.
        if (is_whitespace(peek_code_point())) {
            (void)next_code_point();
        }
        auto unhexed = strtoul(builder.to_string().characters(), nullptr, 16);
        // NUL, surrogates and out-of-range values all decode to U+FFFD.
        if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) {
            return REPLACEMENT_CHARACTER;
        }
        return unhexed;
    }

    // NOTE(review): a raw NUL should already have been replaced during input
    // preprocessing; this guard keeps an escaped NUL from leaking through.
    if (!input) {
        log_parse_error();
        return REPLACEMENT_CHARACTER;
    }

    // Anything else: the escape represents the code point itself (e.g. "\.").
    return input;
}
// https://www.w3.org/TR/css-syntax-3/#consume-ident-like-token
// Consumes an ident-like token: an <ident-token>, <function-token>,
// <url-token> or <bad-url-token>.
Token Tokenizer::consume_an_ident_like_token()
{
    auto string = consume_a_name();

    // "url(" needs special handling: unquoted contents become a <url-token>,
    // quoted contents are left for the parser via a <function-token>.
    if (string.equals_ignoring_case("url") && is_left_paren(peek_code_point())) {
        (void)next_code_point();

        // Skip leading whitespace, but deliberately keep the last pending
        // whitespace code point so the check below can see "whitespace
        // followed by a quote".
        for (;;) {
            auto maybe_whitespace = peek_twin();
            if (!(is_whitespace(maybe_whitespace.first) && is_whitespace(maybe_whitespace.second))) {
                break;
            }
            (void)next_code_point();
        }

        auto next_two = peek_twin();
        // if one of these ", ', ' "', " '" follows, it's url("...") form:
        // tokenize as a <function-token>.
        if (is_quotation_mark(next_two.first) || is_apostrophe(next_two.first) || (is_whitespace(next_two.first) && (is_quotation_mark(next_two.second) || is_apostrophe(next_two.second)))) {
            return create_value_token(Token::Type::Function, string);
        }

        // Otherwise it's the unquoted url(...) form.
        return consume_a_url_token();
    }

    // Any other name directly followed by "(" is a <function-token>.
    if (is_left_paren(peek_code_point())) {
        (void)next_code_point();
        return create_value_token(Token::Type::Function, string);
    }

    return create_value_token(Token::Type::Ident, string);
}
  340. // https://www.w3.org/TR/css-syntax-3/#consume-number
  341. CSSNumber Tokenizer::consume_a_number()
  342. {
  343. StringBuilder repr;
  344. Token::NumberType type = Token::NumberType::Integer;
  345. auto next_input = peek_code_point();
  346. if (is_plus_sign(next_input) || is_hyphen_minus(next_input)) {
  347. repr.append_code_point(next_code_point());
  348. }
  349. for (;;) {
  350. auto digits = peek_code_point();
  351. if (!is_ascii_digit(digits))
  352. break;
  353. repr.append_code_point(next_code_point());
  354. }
  355. auto maybe_number = peek_twin();
  356. if (is_full_stop(maybe_number.first) && is_ascii_digit(maybe_number.second)) {
  357. repr.append_code_point(next_code_point());
  358. repr.append_code_point(next_code_point());
  359. type = Token::NumberType::Number;
  360. for (;;) {
  361. auto digit = peek_code_point();
  362. if (!is_ascii_digit(digit))
  363. break;
  364. repr.append_code_point(next_code_point());
  365. }
  366. }
  367. auto maybe_exp = peek_triplet();
  368. if (is_E(maybe_exp.first) || is_e(maybe_exp.first)) {
  369. if (is_plus_sign(maybe_exp.second) || is_hyphen_minus(maybe_exp.second)) {
  370. if (is_ascii_digit(maybe_exp.third)) {
  371. repr.append_code_point(next_code_point());
  372. repr.append_code_point(next_code_point());
  373. repr.append_code_point(next_code_point());
  374. }
  375. } else if (is_ascii_digit(maybe_exp.second)) {
  376. repr.append_code_point(next_code_point());
  377. repr.append_code_point(next_code_point());
  378. }
  379. type = Token::NumberType::Number;
  380. for (;;) {
  381. auto digits = peek_code_point();
  382. if (!is_ascii_digit(digits))
  383. break;
  384. repr.append_code_point(next_code_point());
  385. }
  386. }
  387. return { repr.to_string(), convert_a_string_to_a_number(repr.string_view()), type };
  388. }
// https://www.w3.org/TR/css-syntax-3/#convert-string-to-number
// Converts the textual representation of a CSS number into a double,
// following the spec's seven-component decomposition (sign, integer part,
// decimal point, fraction, exponent indicator, exponent sign, exponent).
double Tokenizer::convert_a_string_to_a_number(StringView string)
{
    // Bounds-safe indexed access: positions past the end read as EOF.
    auto code_point_at = [&](size_t index) -> u32 {
        if (index < string.length())
            return string[index];
        return TOKENIZER_EOF;
    };

    // This algorithm does not do any verification to ensure that the string contains only a number.
    // Ensure that the string contains only a valid CSS number before calling this algorithm.

    // Divide the string into seven components, in order from left to right:
    size_t position = 0;

    // 1. A sign: a single U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS (-), or the empty string.
    //    Let s [sign] be the number -1 if the sign is U+002D HYPHEN-MINUS (-); otherwise, let s be the number 1.
    int sign = 1;
    if (is_plus_sign(code_point_at(position)) || is_hyphen_minus(code_point_at(position))) {
        sign = is_hyphen_minus(code_point_at(position)) ? -1 : 1;
        position++;
    }

    // 2. An integer part: zero or more digits.
    //    If there is at least one digit, let i [integer_part] be the number formed by interpreting the digits
    //    as a base-10 integer; otherwise, let i be the number 0.
    double integer_part = 0;
    while (is_ascii_digit(code_point_at(position))) {
        integer_part = (integer_part * 10) + (code_point_at(position) - '0');
        position++;
    }

    // 3. A decimal point: a single U+002E FULL STOP (.), or the empty string.
    if (is_full_stop(code_point_at(position)))
        position++;

    // 4. A fractional part: zero or more digits.
    //    If there is at least one digit, let f [fractional_part] be the number formed by interpreting the digits
    //    as a base-10 integer and d [fractional_digits] be the number of digits; otherwise, let f and d be the number 0.
    double fractional_part = 0;
    int fractional_digits = 0;
    while (is_ascii_digit(code_point_at(position))) {
        fractional_part = (fractional_part * 10) + (code_point_at(position) - '0');
        position++;
        fractional_digits++;
    }

    // 5. An exponent indicator: a single U+0045 LATIN CAPITAL LETTER E (E) or U+0065 LATIN SMALL LETTER E (e),
    //    or the empty string.
    if (is_e(code_point_at(position)) || is_E(code_point_at(position)))
        position++;

    // 6. An exponent sign: a single U+002B PLUS SIGN (+) or U+002D HYPHEN-MINUS (-), or the empty string.
    //    Let t [exponent_sign] be the number -1 if the sign is U+002D HYPHEN-MINUS (-); otherwise, let t be the number 1.
    int exponent_sign = 1;
    if (is_plus_sign(code_point_at(position)) || is_hyphen_minus(code_point_at(position))) {
        exponent_sign = is_hyphen_minus(code_point_at(position)) ? -1 : 1;
        position++;
    }

    // 7. An exponent: zero or more digits.
    //    If there is at least one digit, let e [exponent] be the number formed by interpreting the digits as a
    //    base-10 integer; otherwise, let e be the number 0.
    double exponent = 0;
    while (is_ascii_digit(code_point_at(position))) {
        exponent = (exponent * 10) + (code_point_at(position) - '0');
        position++;
    }

    // NOTE: We checked before calling this function that the string is a valid number,
    // so if there is anything at the end, something has gone wrong!
    VERIFY(position == string.length());

    // Return the number s·(i + f·10^-d)·10^te.
    return sign * (integer_part + fractional_part * pow(10, -fractional_digits)) * pow(10, exponent_sign * exponent);
}
// https://www.w3.org/TR/css-syntax-3/#consume-name
// Consumes a name (identifier characters plus escapes) from the stream and
// returns it. Stops before the first code point that cannot be part of it.
String Tokenizer::consume_a_name()
{
    StringBuilder result;

    for (;;) {
        auto input = next_code_point();

        if (is_eof(input))
            break;

        // Plain name code points are appended directly.
        if (is_name_code_point(input)) {
            result.append_code_point(input);
            continue;
        }

        // A backslash that begins a valid escape contributes the decoded
        // code point. (`input` is the backslash, `next` the code point after.)
        auto next = peek_code_point();
        if (!is_eof(next) && is_valid_escape_sequence({ input, next })) {
            result.append_code_point(consume_escaped_code_point());
            continue;
        }

        // Anything else ends the name; push it back for the next token.
        reconsume_current_input_code_point();
        break;
    }

    return result.to_string();
}
  476. Token Tokenizer::consume_a_url_token()
  477. {
  478. auto token = create_new_token(Token::Type::Url);
  479. consume_as_much_whitespace_as_possible();
  480. for (;;) {
  481. auto input = peek_code_point();
  482. if (is_eof(input)) {
  483. log_parse_error();
  484. return token;
  485. }
  486. if (is_right_paren(input)) {
  487. (void)next_code_point();
  488. return token;
  489. }
  490. if (is_whitespace(input)) {
  491. consume_as_much_whitespace_as_possible();
  492. input = peek_code_point();
  493. if (is_eof(input)) {
  494. log_parse_error();
  495. return token;
  496. }
  497. if (is_right_paren(input)) {
  498. return token;
  499. }
  500. consume_the_remnants_of_a_bad_url();
  501. return create_new_token(Token::Type::BadUrl);
  502. }
  503. if (is_quotation_mark(input) || is_apostrophe(input) || is_left_paren(input) || is_non_printable(input)) {
  504. log_parse_error();
  505. (void)next_code_point();
  506. consume_the_remnants_of_a_bad_url();
  507. return create_new_token(Token::Type::BadUrl);
  508. }
  509. if (is_reverse_solidus(input)) {
  510. if (is_valid_escape_sequence(peek_twin())) {
  511. token.m_value.append_code_point(consume_escaped_code_point());
  512. } else {
  513. log_parse_error();
  514. (void)next_code_point();
  515. consume_the_remnants_of_a_bad_url();
  516. return create_new_token(Token::Type::BadUrl);
  517. }
  518. }
  519. token.m_value.append_code_point(next_code_point());
  520. }
  521. }
// https://www.w3.org/TR/css-syntax-3/#consume-remnants-of-bad-url
// Consumes and discards input up to and including the next unescaped
// U+0029 ) or EOF, so the tokenizer can recover after a <bad-url-token>.
void Tokenizer::consume_the_remnants_of_a_bad_url()
{
    for (;;) {
        auto next = peek_code_point();

        if (is_eof(next)) {
            return;
        }

        auto input = next;

        if (is_right_paren(input)) {
            (void)next_code_point();
            return;
        }

        // NOTE(review): per spec this should consume the backslash and then
        // the escaped code point. consume_escaped_code_point() here actually
        // consumes the *backslash* (returning it via the "anything else"
        // path), and the unconditional next_code_point() below then eats the
        // code point after it. The net effect — skipping an escaped ")"
        // without terminating the loop — appears equivalent for discarded
        // input; verify before relying on escape side effects here.
        if (is_valid_escape_sequence(peek_twin())) {
            [[maybe_unused]] auto cp = consume_escaped_code_point();
        }

        (void)next_code_point();
    }
}
  541. void Tokenizer::consume_as_much_whitespace_as_possible()
  542. {
  543. while (is_whitespace(peek_code_point())) {
  544. (void)next_code_point();
  545. }
  546. }
  547. void Tokenizer::reconsume_current_input_code_point()
  548. {
  549. m_utf8_iterator = m_prev_utf8_iterator;
  550. m_position = m_prev_position;
  551. }
  552. // https://www.w3.org/TR/css-syntax-3/#consume-numeric-token
  553. Token Tokenizer::consume_a_numeric_token()
  554. {
  555. auto number = consume_a_number();
  556. if (would_start_an_identifier()) {
  557. auto token = create_new_token(Token::Type::Dimension);
  558. token.m_value.append(number.string);
  559. token.m_number_type = number.type;
  560. token.m_number_value = number.value;
  561. auto unit = consume_a_name();
  562. VERIFY(!unit.is_empty() && !unit.is_whitespace());
  563. token.m_unit = move(unit);
  564. return token;
  565. }
  566. if (is_percent(peek_code_point())) {
  567. (void)next_code_point();
  568. auto token = create_new_token(Token::Type::Percentage);
  569. token.m_value.append(number.string);
  570. token.m_number_type = number.type;
  571. token.m_number_value = number.value;
  572. return token;
  573. }
  574. auto token = create_new_token(Token::Type::Number);
  575. token.m_value.append(number.string);
  576. token.m_number_type = number.type;
  577. token.m_number_value = number.value;
  578. return token;
  579. }
  580. bool Tokenizer::would_start_a_number() const
  581. {
  582. return would_start_a_number(peek_triplet());
  583. }
  584. // https://www.w3.org/TR/css-syntax-3/#starts-with-a-number
  585. bool Tokenizer::would_start_a_number(U32Triplet values)
  586. {
  587. if (is_plus_sign(values.first) || is_hyphen_minus(values.first)) {
  588. if (is_ascii_digit(values.second))
  589. return true;
  590. if (is_full_stop(values.second) && is_ascii_digit(values.third))
  591. return true;
  592. return false;
  593. }
  594. if (is_full_stop(values.first))
  595. return is_ascii_digit(values.second);
  596. if (is_ascii_digit(values.first))
  597. return true;
  598. return false;
  599. }
  600. // https://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape
  601. bool Tokenizer::is_valid_escape_sequence(U32Twin values)
  602. {
  603. if (!is_reverse_solidus(values.first)) {
  604. return false;
  605. }
  606. if (is_newline(values.second)) {
  607. return false;
  608. }
  609. return true;
  610. }
  611. bool Tokenizer::would_start_an_identifier()
  612. {
  613. return would_start_an_identifier(peek_triplet());
  614. }
  615. // https://www.w3.org/TR/css-syntax-3/#would-start-an-identifier
  616. bool Tokenizer::would_start_an_identifier(U32Triplet values)
  617. {
  618. if (is_hyphen_minus(values.first)) {
  619. if (is_name_start_code_point(values.second) || is_hyphen_minus(values.second) || is_valid_escape_sequence(values.to_twin_23()))
  620. return true;
  621. return false;
  622. }
  623. if (is_name_start_code_point(values.first)) {
  624. return true;
  625. }
  626. if (is_reverse_solidus(values.first)) {
  627. if (is_valid_escape_sequence(values.to_twin_12()))
  628. return true;
  629. return false;
  630. }
  631. return false;
  632. }
  633. // https://www.w3.org/TR/css-syntax-3/#consume-string-token
  634. Token Tokenizer::consume_string_token(u32 ending_code_point)
  635. {
  636. auto token = create_new_token(Token::Type::String);
  637. for (;;) {
  638. auto input = next_code_point();
  639. if (is_eof(input)) {
  640. log_parse_error();
  641. return token;
  642. }
  643. if (input == ending_code_point)
  644. return token;
  645. if (is_newline(input)) {
  646. reconsume_current_input_code_point();
  647. return create_new_token(Token::Type::BadString);
  648. }
  649. if (is_reverse_solidus(input)) {
  650. auto next_input = peek_code_point();
  651. if (is_eof(next_input))
  652. continue;
  653. if (is_newline(next_input)) {
  654. (void)next_code_point();
  655. continue;
  656. }
  657. auto escaped = consume_escaped_code_point();
  658. token.m_value.append_code_point(escaped);
  659. }
  660. token.m_value.append_code_point(input);
  661. }
  662. }
  663. // https://www.w3.org/TR/css-syntax-3/#consume-comment
  664. void Tokenizer::consume_comments()
  665. {
  666. start:
  667. auto twin = peek_twin();
  668. if (!(is_solidus(twin.first) && is_asterisk(twin.second)))
  669. return;
  670. (void)next_code_point();
  671. (void)next_code_point();
  672. for (;;) {
  673. auto twin_inner = peek_twin();
  674. if (is_eof(twin_inner.first) || is_eof(twin_inner.second)) {
  675. log_parse_error();
  676. return;
  677. }
  678. if (is_asterisk(twin_inner.first) && is_solidus(twin_inner.second)) {
  679. (void)next_code_point();
  680. (void)next_code_point();
  681. goto start;
  682. }
  683. (void)next_code_point();
  684. }
  685. }
// https://www.w3.org/TR/css-syntax-3/#consume-token
// Consumes and returns a single token from the input stream. This is the
// tokenizer's central dispatch: it switches on the first code point and
// delegates to the specialized consume_* helpers.
Token Tokenizer::consume_a_token()
{
    // Comments are not tokens; strip them before anything else.
    consume_comments();

    auto input = next_code_point();

    if (is_eof(input)) {
        return create_new_token(Token::Type::EndOfFile);
    }

    // A run of whitespace collapses into a single <whitespace-token>.
    if (is_whitespace(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is whitespace");
        consume_as_much_whitespace_as_possible();
        return create_new_token(Token::Type::Whitespace);
    }

    if (is_quotation_mark(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is quotation mark");
        return consume_string_token(input);
    }

    // "#": a <hash-token> when followed by a name or a valid escape,
    // otherwise a delimiter.
    if (is_number_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is number sign");

        auto next_input = peek_code_point();
        auto maybe_escape = peek_twin();

        if (is_name_code_point(next_input) || is_valid_escape_sequence(maybe_escape)) {
            auto token = create_new_token(Token::Type::Hash);

            // Mark as an "id" hash when the following name is a valid identifier.
            if (would_start_an_identifier())
                token.m_hash_type = Token::HashType::Id;

            auto name = consume_a_name();
            token.m_value.append(name);

            return token;
        }

        return create_value_token(Token::Type::Delim, input);
    }

    if (is_apostrophe(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is apostrophe");
        return consume_string_token(input);
    }

    if (is_left_paren(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is left paren");
        return create_new_token(Token::Type::OpenParen);
    }

    if (is_right_paren(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is right paren");
        return create_new_token(Token::Type::CloseParen);
    }

    // "+": either the start of a number (e.g. "+2") or a delimiter.
    if (is_plus_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is plus sign");
        if (would_start_a_number()) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }

        return create_value_token(Token::Type::Delim, input);
    }

    if (is_comma(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is comma");
        return create_new_token(Token::Type::Comma);
    }

    // "-": a number ("-2"), the CDC "-->", an identifier ("-foo"), or a delimiter.
    if (is_hyphen_minus(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is hyphen minus");
        if (would_start_a_number()) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }

        auto next_twin = peek_twin();
        if (is_hyphen_minus(next_twin.first) && is_greater_than_sign(next_twin.second)) {
            (void)next_code_point();
            (void)next_code_point();
            return create_new_token(Token::Type::CDC);
        }

        if (would_start_an_identifier()) {
            reconsume_current_input_code_point();
            return consume_an_ident_like_token();
        }

        return create_value_token(Token::Type::Delim, input);
    }

    // ".": the start of a number (".5") or a delimiter.
    if (is_full_stop(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is full stop");
        if (would_start_a_number()) {
            reconsume_current_input_code_point();
            return consume_a_numeric_token();
        }

        return create_value_token(Token::Type::Delim, input);
    }

    if (is_colon(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is colon");
        return create_new_token(Token::Type::Colon);
    }

    if (is_semicolon(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is semicolon");
        return create_new_token(Token::Type::Semicolon);
    }

    // "<": the CDO "<!--" or a delimiter.
    if (is_less_than_sign(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is less than");
        auto maybe_cdo = peek_triplet();

        if (is_exclamation_mark(maybe_cdo.first) && is_hyphen_minus(maybe_cdo.second) && is_hyphen_minus(maybe_cdo.third)) {
            (void)next_code_point();
            (void)next_code_point();
            (void)next_code_point();
            return create_new_token(Token::Type::CDO);
        }

        return create_value_token(Token::Type::Delim, input);
    }

    // "@": an <at-keyword-token> (e.g. "@media") or a delimiter.
    if (is_at(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is at");
        if (would_start_an_identifier()) {
            auto name = consume_a_name();
            return create_value_token(Token::Type::AtKeyword, name);
        }

        return create_value_token(Token::Type::Delim, input);
    }

    if (is_open_square_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is open square");
        return create_new_token(Token::Type::OpenSquare);
    }

    // "\": an ident-like token if it begins a valid escape; otherwise a
    // parse error and a delimiter.
    if (is_reverse_solidus(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is reverse solidus");
        if (is_valid_escape_sequence({ input, peek_code_point() })) {
            reconsume_current_input_code_point();
            return consume_an_ident_like_token();
        }

        log_parse_error();
        return create_value_token(Token::Type::Delim, input);
    }

    if (is_closed_square_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is closed square");
        return create_new_token(Token::Type::CloseSquare);
    }

    if (is_open_curly_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is open curly");
        return create_new_token(Token::Type::OpenCurly);
    }

    if (is_closed_curly_bracket(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is closed curly");
        return create_new_token(Token::Type::CloseCurly);
    }

    if (is_ascii_digit(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is digit");
        reconsume_current_input_code_point();
        return consume_a_numeric_token();
    }

    if (is_name_start_code_point(input)) {
        dbgln_if(CSS_TOKENIZER_DEBUG, "is name start");
        reconsume_current_input_code_point();
        return consume_an_ident_like_token();
    }

    // Anything else becomes a <delim-token> holding the code point itself.
    dbgln_if(CSS_TOKENIZER_DEBUG, "is delimiter");
    return create_value_token(Token::Type::Delim, input);
}
  832. }