Tokenizer.h 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. /*
  2. * Copyright (c) 2020-2021, SerenityOS developers
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #pragma once
  27. #include <AK/Optional.h>
  28. #include <AK/String.h>
  29. #include <AK/StringView.h>
  30. #include <AK/Types.h>
  31. #include <AK/Utf8View.h>
  32. #include <LibWeb/CSS/Parser/Token.h>
  33. #include <LibWeb/Forward.h>
  34. namespace Web::CSS {
  35. class U32Twin {
  36. public:
  37. void set(size_t index, u32 value)
  38. {
  39. if (index == 0)
  40. first = value;
  41. if (index == 1)
  42. second = value;
  43. }
  44. u32 first {};
  45. u32 second {};
  46. };
  47. class U32Triplet {
  48. public:
  49. void set(size_t index, u32 value)
  50. {
  51. if (index == 0)
  52. first = value;
  53. if (index == 1)
  54. second = value;
  55. if (index == 2)
  56. third = value;
  57. }
  58. U32Twin to_twin_12()
  59. {
  60. return { first, second };
  61. }
  62. U32Twin to_twin_23()
  63. {
  64. return { second, third };
  65. }
  66. u32 first {};
  67. u32 second {};
  68. u32 third {};
  69. };
  70. class CSSNumber {
  71. public:
  72. String value;
  73. Token::NumberType type {};
  74. };
  75. class Tokenizer {
  76. public:
  77. explicit Tokenizer(const StringView& input, const String& encoding);
  78. [[nodiscard]] Vector<Token> parse();
  79. private:
  80. [[nodiscard]] Optional<u32> next_codepoint();
  81. [[nodiscard]] Optional<u32> peek_codepoint(size_t offset = 0) const;
  82. [[nodiscard]] Optional<U32Twin> peek_twin() const;
  83. [[nodiscard]] Optional<U32Triplet> peek_triplet() const;
  84. [[nodiscard]] static Token create_new_token(Token::TokenType);
  85. [[nodiscard]] static Token create_value_token(Token::TokenType, String value);
  86. [[nodiscard]] static Token create_value_token(Token::TokenType, u32 value);
  87. [[nodiscard]] Token consume_a_token();
  88. [[nodiscard]] Token consume_string_token(u32 ending_codepoint);
  89. [[nodiscard]] Token consume_a_numeric_token();
  90. [[nodiscard]] Token consume_an_ident_like_token();
  91. [[nodiscard]] CSSNumber consume_a_number();
  92. [[nodiscard]] String consume_a_name();
  93. [[nodiscard]] u32 consume_escaped_codepoint();
  94. [[nodiscard]] Token consume_a_url_token();
  95. void consume_the_remnants_of_a_bad_url();
  96. void consume_comments();
  97. void reconsume_current_input_codepoint();
  98. [[nodiscard]] bool is_valid_escape_sequence();
  99. [[nodiscard]] static bool is_valid_escape_sequence(U32Twin);
  100. [[nodiscard]] bool would_start_an_identifier();
  101. [[nodiscard]] bool would_start_an_identifier(U32Triplet);
  102. [[nodiscard]] bool starts_with_a_number() const;
  103. [[nodiscard]] static bool starts_with_a_number(U32Triplet);
  104. String m_decoded_input;
  105. Utf8View m_utf8_view;
  106. AK::Utf8CodepointIterator m_utf8_iterator;
  107. AK::Utf8CodepointIterator m_prev_utf8_iterator;
  108. };
  109. }