From 7b356c33cb33665f8dd3f1f1b827eccda4c2c6d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Lormeau?= <10747470+benit8@users.noreply.github.com> Date: Sun, 9 Aug 2020 11:34:26 +0200 Subject: [PATCH] AK: Add a GenericLexer and extend the JsonParser with it (#2696) --- AK/GenericLexer.cpp | 335 ++++++++++++++++++++++++++++++++++++++++++ AK/GenericLexer.h | 108 ++++++++++++++ AK/JsonParser.cpp | 110 ++++---------- AK/JsonParser.h | 18 +-- Kernel/CMakeLists.txt | 1 + 5 files changed, 476 insertions(+), 96 deletions(-) create mode 100644 AK/GenericLexer.cpp create mode 100644 AK/GenericLexer.h diff --git a/AK/GenericLexer.cpp b/AK/GenericLexer.cpp new file mode 100644 index 00000000000..1cb892a2922 --- /dev/null +++ b/AK/GenericLexer.cpp @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2020, Benoit Lormeau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +#include + +namespace AK { + +GenericLexer::GenericLexer(const StringView& input) + : m_input(input) +{ +} + +GenericLexer::~GenericLexer() +{ +} + +// Tells whether the parser's index has reached input's end +bool GenericLexer::is_eof() const +{ + return m_index >= m_input.length(); +} + +// Returns the current character at the parser index, plus `offset` if specified +char GenericLexer::peek(size_t offset) const +{ + return (m_index + offset < m_input.length()) ? 
m_input[m_index + offset] : '\0'; +} + +// Tests the next character in the input +bool GenericLexer::next_is(char expected) const +{ + return peek() == expected; +} + +// Tests if the `expected` string comes next in the input +bool GenericLexer::next_is(const char* expected) const +{ + for (size_t i = 0; expected[i] != '\0'; ++i) + if (peek(i) != expected[i]) + return false; + return true; +} + +// Tests the next character against a Condition +bool GenericLexer::next_is(Condition condition) const +{ + return condition(peek()); +} + +// Consume a character and advance the parser index +char GenericLexer::consume() +{ + ASSERT(!is_eof()); + return m_input[m_index++]; +} + +// Consume the given character if it is next in the input +bool GenericLexer::consume_specific(char specific) +{ + if (peek() != specific) + return false; + + ignore(); + return true; +} + +// Consume the given string if it is next in the input +bool GenericLexer::consume_specific(const char* str) +{ + if (!next_is(str)) + return false; + + ignore(__builtin_strlen(str)); + return true; +} + +// Consume a number of characters +StringView GenericLexer::consume(size_t count) +{ + if (count == 0) + return {}; + + size_t start = m_index; + size_t length = min(count, m_input.length() - m_index); + m_index += length; + + return m_input.substring_view(start, length); +} + +// Consume the rest of the input +StringView GenericLexer::consume_all() +{ + if (is_eof()) + return {}; + + auto rest = m_input.substring_view(m_index, m_input.length() - m_index); + m_index = m_input.length(); + return rest; +} + +// Consume until a new line is found +StringView GenericLexer::consume_line() +{ + size_t start = m_index; + while (!is_eof() && peek() != '\r' && peek() != '\n') + m_index++; + size_t length = m_index - start; + + consume_specific('\r'); + consume_specific('\n'); + + if (length == 0) + return {}; + return m_input.substring_view(start, length); +} + +// Consume and return characters while `condition` returns 
true +StringView GenericLexer::consume_while(Condition condition) +{ + size_t start = m_index; + while (!is_eof() && condition(peek())) + m_index++; + size_t length = m_index - start; + + if (length == 0) + return {}; + return m_input.substring_view(start, length); +} + +// Consume and return characters until `stop` is peek'd +// The `stop` character is ignored, as it is user-defined +StringView GenericLexer::consume_until(char stop) +{ + size_t start = m_index; + while (!is_eof() && peek() != stop) + m_index++; + size_t length = m_index - start; + + ignore(); + + if (length == 0) + return {}; + return m_input.substring_view(start, length); +} + +// Consume and return characters until the string `stop` is found +// The `stop` string is ignored, as it is user-defined +StringView GenericLexer::consume_until(const char* stop) +{ + size_t start = m_index; + while (!is_eof() && !next_is(stop)) + m_index++; + size_t length = m_index - start; + + ignore(__builtin_strlen(stop)); + + if (length == 0) + return {}; + return m_input.substring_view(start, length); +} + +// Consume and return characters until `condition` returns true +StringView GenericLexer::consume_until(Condition condition) +{ + size_t start = m_index; + while (!is_eof() && !condition(peek())) + m_index++; + size_t length = m_index - start; + + if (length == 0) + return {}; + return m_input.substring_view(start, length); +} + +// Consume a string surrounded by single or double quotes +// The returned StringView does not include the quotes +StringView GenericLexer::consume_quoted_string() +{ + if (!is_quote(peek())) + return {}; + + char quote_char = consume(); + size_t start = m_index; + while (!is_eof() && peek() != quote_char) + m_index++; + size_t length = m_index - start; + + if (peek() != quote_char) { + m_index = start - 1; // Restore the index in case the string is unterminated + return {}; + } + + ignore(); + + return m_input.substring_view(start, length); +} + +// Ignore a number of characters (1 by
default) +void GenericLexer::ignore(size_t count) +{ + count = min(count, m_input.length() - m_index); + m_index += count; +} + +// Ignore characters while `condition` returns true +void GenericLexer::ignore_while(Condition condition) +{ + while (!is_eof() && condition(peek())) + m_index++; +} + +// Ignore characters until `stop` is peek'd +// The `stop` character is ignored, as it is user-defined +void GenericLexer::ignore_until(char stop) +{ + while (!is_eof() && peek() != stop) + m_index++; + + ignore(); +} + +// Ignore characters until the string `stop` is found +// The `stop` string is ignored, as it is user-defined +void GenericLexer::ignore_until(const char* stop) +{ + while (!is_eof() && !next_is(stop)) + m_index++; + + ignore(__builtin_strlen(stop)); +} + +// Ignore characters until `condition` returns true +// We don't skip the stop character as it may not be a single value +void GenericLexer::ignore_until(Condition condition) +{ + while (!is_eof() && !condition(peek())) + m_index++; +} + + +bool is_control(char c) +{ + return (c >= 0 && c <= 31) || c == 127; +} + +bool is_whitespace(char c) +{ + return (c >= '\t' && c <= '\r') || c == ' '; +} + +bool is_lowercase(char c) +{ + return c >= 'a' && c <= 'z'; +} + +bool is_uppercase(char c) +{ + return c >= 'A' && c <= 'Z'; +} + +bool is_digit(char c) +{ + return c >= '0' && c <= '9'; +} + +bool is_punctuation(char c) +{ + return (c >= '!'
&& c <= '/') + || (c >= ':' && c <= '@') + || (c >= '[' && c <= '`') + || (c >= '{' && c <= '~'); +} + +bool is_printable(char c) +{ + return c >= ' ' && c <= '~'; +} + +bool is_graphic(char c) +{ + return c > ' ' && c <= '~'; +} + +bool is_alpha(char c) +{ + return is_lowercase(c) || is_uppercase(c); +} + +bool is_alphanum(char c) +{ + return is_alpha(c) || is_digit(c); +} + +bool is_hex_digit(char c) +{ + return is_digit(c) + || (c >= 'A' && c <= 'F') + || (c >= 'a' && c <= 'f'); +} + +bool is_quote(char c) +{ + return c == '\'' || c == '"'; +} + +bool is_path_separator(char c) +{ + return c == '/' || c == '\\'; +} + +} diff --git a/AK/GenericLexer.h b/AK/GenericLexer.h new file mode 100644 index 00000000000..127b64518bd --- /dev/null +++ b/AK/GenericLexer.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2020, Benoit Lormeau + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#pragma once + +#include +#include + +namespace AK { + +class GenericLexer { +public: + explicit GenericLexer(const StringView& input); + virtual ~GenericLexer(); + + using Condition = Function; + + size_t tell() const { return m_index; } + size_t tell_remaining() const { return m_input.length() - m_index; } + + bool is_eof() const; + + char peek(size_t offset = 0) const; + + bool next_is(char expected) const; + bool next_is(const char* expected) const; + bool next_is(Condition) const; + + char consume(); + bool consume_specific(char specific); + bool consume_specific(const char* str); + StringView consume(size_t count); + StringView consume_all(); + StringView consume_line(); + StringView consume_while(Condition); + StringView consume_until(char stop); + StringView consume_until(const char* stop); + StringView consume_until(Condition); + // FIXME: provide an escape character + StringView consume_quoted_string(); + + void ignore(size_t count = 1); + void ignore_while(Condition); + void ignore_until(char stop); + void ignore_until(const char* stop); + void ignore_until(Condition); + +protected: + StringView m_input; + size_t m_index { 0 }; +}; + +// ctype adaptors +// FIXME: maybe put them in another file?
+bool is_alpha(char); +bool is_alphanum(char); +bool is_control(char); +bool is_digit(char); +bool is_graphic(char); +bool is_hex_digit(char); +bool is_lowercase(char); +bool is_path_separator(char); +bool is_printable(char); +bool is_punctuation(char); +bool is_quote(char); +bool is_uppercase(char); +bool is_whitespace(char); + +} + +using AK::GenericLexer; + +using AK::is_alpha; +using AK::is_alphanum; +using AK::is_control; +using AK::is_digit; +using AK::is_graphic; +using AK::is_hex_digit; +using AK::is_lowercase; +using AK::is_path_separator; +using AK::is_printable; +using AK::is_punctuation; +using AK::is_quote; +using AK::is_uppercase; +using AK::is_whitespace; diff --git a/AK/JsonParser.cpp b/AK/JsonParser.cpp index 349c44a0b60..8248b8b85d7 100644 --- a/AK/JsonParser.cpp +++ b/AK/JsonParser.cpp @@ -31,44 +31,7 @@ namespace AK { -static inline bool is_whitespace(char ch) -{ - return ch == ' ' || ch == '\n' || ch == '\t' || ch == '\v' || ch == '\r'; -} - -char JsonParser::peek() const -{ - if (m_index < m_input.length()) - return m_input[m_index]; - return '\0'; -} - -char JsonParser::consume() -{ - if (m_index < m_input.length()) - return m_input[m_index++]; - return '\0'; -} - -template -void JsonParser::consume_while(C condition) -{ - while (condition(peek())) - consume(); -} - -void JsonParser::consume_whitespace() -{ - consume_while([](char ch) { return is_whitespace(ch); }); -} - -bool JsonParser::consume_specific(char expected_ch) -{ - char consumed_ch = consume(); - return consumed_ch == expected_ch; -} - -String JsonParser::consume_quoted_string() +String JsonParser::consume_and_unescape_string() { if (!consume_specific('"')) return {}; @@ -86,11 +49,9 @@ String JsonParser::consume_quoted_string() ++peek_index; } - if (peek_index != m_index) { - while (peek_index != m_index) { - final_sb.append(m_input.characters_without_null_termination()[m_index]); - m_index++; - } + while (peek_index != m_index) { + final_sb.append(m_input[m_index]); + 
m_index++; } if (m_index == m_input.length()) @@ -101,7 +62,7 @@ String JsonParser::consume_quoted_string() final_sb.append(consume()); continue; } - consume(); + ignore(); char escaped_ch = consume(); switch (escaped_ch) { case 'n': @@ -120,18 +81,11 @@ String JsonParser::consume_quoted_string() final_sb.append('\f'); break; case 'u': { - StringBuilder sb; - sb.append(consume()); - sb.append(consume()); - sb.append(consume()); - sb.append(consume()); - - auto code_point = AK::StringUtils::convert_to_uint_from_hex(sb.to_string()); - if (code_point.has_value()) { + auto code_point = AK::StringUtils::convert_to_uint_from_hex(consume(4)); + if (code_point.has_value()) final_sb.append_code_point(code_point.value()); - } else { + else final_sb.append('?'); - } } break; default: final_sb.append(escaped_ch); @@ -150,27 +104,27 @@ Optional JsonParser::parse_object() if (!consume_specific('{')) return {}; for (;;) { - consume_whitespace(); + ignore_while(is_whitespace); if (peek() == '}') break; - consume_whitespace(); - auto name = consume_quoted_string(); + ignore_while(is_whitespace); + auto name = consume_and_unescape_string(); if (name.is_null()) return {}; - consume_whitespace(); + ignore_while(is_whitespace); if (!consume_specific(':')) return {}; - consume_whitespace(); + ignore_while(is_whitespace); auto value = parse_helper(); if (!value.has_value()) return {}; object.set(name, move(value.value())); - consume_whitespace(); + ignore_while(is_whitespace); if (peek() == '}') break; if (!consume_specific(',')) return {}; - consume_whitespace(); + ignore_while(is_whitespace); if (peek() == '}') return {}; } @@ -185,23 +139,23 @@ Optional JsonParser::parse_array() if (!consume_specific('[')) return {}; for (;;) { - consume_whitespace(); + ignore_while(is_whitespace); if (peek() == ']') break; auto element = parse_helper(); if (!element.has_value()) return {}; array.append(element.value()); - consume_whitespace(); + ignore_while(is_whitespace); if (peek() == ']') break; 
if (!consume_specific(',')) return {}; - consume_whitespace(); + ignore_while(is_whitespace); if (peek() == ']') return {}; } - consume_whitespace(); + ignore_while(is_whitespace); if (!consume_specific(']')) return {}; return array; @@ -209,7 +163,7 @@ Optional JsonParser::parse_array() Optional JsonParser::parse_string() { - auto result = consume_quoted_string(); + auto result = consume_and_unescape_string(); if (result.is_null()) return {}; return JsonValue(result); @@ -283,39 +237,30 @@ Optional JsonParser::parse_number() return value; } -bool JsonParser::consume_string(const char* str) -{ - for (size_t i = 0, length = strlen(str); i < length; ++i) { - if (!consume_specific(str[i])) - return false; - } - return true; -} - Optional JsonParser::parse_true() { - if (!consume_string("true")) + if (!consume_specific("true")) return {}; return JsonValue(true); } Optional JsonParser::parse_false() { - if (!consume_string("false")) + if (!consume_specific("false")) return {}; return JsonValue(false); } Optional JsonParser::parse_null() { - if (!consume_string("null")) + if (!consume_specific("null")) return {}; return JsonValue(JsonValue::Type::Null); } Optional JsonParser::parse_helper() { - consume_whitespace(); + ignore_while(is_whitespace); auto type_hint = peek(); switch (type_hint) { case '{': @@ -347,12 +292,13 @@ Optional JsonParser::parse_helper() return {}; } -Optional JsonParser::parse() { +Optional JsonParser::parse() +{ auto result = parse_helper(); if (!result.has_value()) return {}; - consume_whitespace(); - if (m_index != m_input.length()) + ignore_while(is_whitespace); + if (!is_eof()) return {}; return result; } diff --git a/AK/JsonParser.h b/AK/JsonParser.h index 607bab8a6fc..aee5da42300 100644 --- a/AK/JsonParser.h +++ b/AK/JsonParser.h @@ -27,13 +27,14 @@ #pragma once #include +#include namespace AK { -class JsonParser { +class JsonParser : private GenericLexer { public: explicit JsonParser(const StringView& input) - : m_input(input) + : 
GenericLexer(input) { } ~JsonParser() @@ -45,12 +46,7 @@ public: private: Optional parse_helper(); - char peek() const; - char consume(); - void consume_whitespace(); - bool consume_specific(char expected_ch); - bool consume_string(const char*); - String consume_quoted_string(); + String consume_and_unescape_string(); Optional parse_array(); Optional parse_object(); Optional parse_number(); @@ -59,12 +55,6 @@ private: Optional parse_true(); Optional parse_null(); - template - void consume_while(C); - - StringView m_input; - size_t m_index { 0 }; - String m_last_string_starting_with_character[256]; }; diff --git a/Kernel/CMakeLists.txt b/Kernel/CMakeLists.txt index 5cbc14ef0cd..45e71e622db 100644 --- a/Kernel/CMakeLists.txt +++ b/Kernel/CMakeLists.txt @@ -193,6 +193,7 @@ set(KERNEL_SOURCES set(AK_SOURCES ../AK/FlyString.cpp + ../AK/GenericLexer.cpp ../AK/JsonParser.cpp ../AK/JsonValue.cpp ../AK/LexicalPath.cpp