Browse Source

LibWeb: Add start of HTML Tokenizer in Swift

Currently it's just a Token class.
Andrew Kaster 10 tháng trước cách đây
mục cha
commit
fb074f9d0c

+ 5 - 0
Tests/LibWeb/CMakeLists.txt

@@ -20,4 +20,9 @@ if (ENABLE_SWIFT)
     target_link_libraries(TestLibWebSwiftBindings PRIVATE AK LibWeb)
     target_compile_options(TestLibWebSwiftBindings PRIVATE -parse-as-library)
     add_test(NAME TestLibWebSwiftBindings COMMAND TestLibWebSwiftBindings)
+
+    add_executable(TestHTMLTokenizerSwift TestHTMLTokenizerSwift.swift)
+    target_link_libraries(TestHTMLTokenizerSwift PRIVATE AK LibWeb)
+    target_compile_options(TestHTMLTokenizerSwift PRIVATE -parse-as-library)
+    add_test(NAME TestHTMLTokenizerSwift COMMAND TestHTMLTokenizerSwift)
 endif()

+ 58 - 0
Tests/LibWeb/TestHTMLTokenizerSwift.swift

@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+import AK
+import LibWeb
+import SwiftLibWeb
+import Foundation
+
+/// Text output stream that forwards everything written to it to the
+/// process's standard error handle, so it can be passed to `print(_:to:)`.
+class StandardError: TextOutputStream {
+    /// Encodes `string` as UTF-8 and writes the bytes to standard error.
+    ///
+    /// A failed write is not recovered from: `try!` traps the process.
+    // NOTE(review): `Swift.String` is spelled out, presumably to avoid a clash
+    // with a `String` type from one of the imported modules -- confirm.
+    func write(_ string: Swift.String) {
+        let stderrHandle = FileHandle.standardError
+        let utf8Bytes = Data(string.utf8)
+        try! stderrHandle.write(contentsOf: utf8Bytes)
+    }
+}
+
+/// Entry point of the HTMLToken smoke tests. Each check uses `precondition`,
+/// so a failing check traps the process instead of returning a status.
+@main
+struct TestHTMLTokenizerSwift {
+
+    /// Prints `message` (plus a trailing newline) to standard error.
+    private static func log(_ message: Swift.String) {
+        var stream = StandardError()
+        print(message, to: &stream)
+    }
+
+    /// Checks that a default-constructed token can be turned into a character
+    /// token by assigning its `type`, and that `isCharacter()` then reports it.
+    static func testTokenTypes() {
+        log("Testing HTMLToken types...")
+
+        let token = HTMLToken()
+        token.type = .Character(codePoint: "a")
+        precondition(token.isCharacter())
+
+        log("HTMLToken types pass")
+    }
+
+    /// Checks `isParserWhitespace()` against the five characters it must accept
+    /// and against a sample of letters it must reject.
+    static func testParserWhitespace() {
+        log("Testing HTMLToken parser whitespace...")
+
+        let whitespace: [Character] = ["\t", "\n", "\r", "\u{000C}", " "]
+        whitespace.forEach { codePoint in
+            let token = HTMLToken(type: .Character(codePoint: codePoint))
+            precondition(token.isParserWhitespace())
+        }
+
+        let letters: [Character] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"]
+        letters.forEach { codePoint in
+            let token = HTMLToken(type: .Character(codePoint: codePoint))
+            precondition(!token.isParserWhitespace())
+        }
+
+        log("HTMLToken parser whitespace pass")
+    }
+
+    /// Runs every test in order; reaching the final message means all passed.
+    static func main() {
+        log("Starting test suite...")
+
+        testTokenTypes()
+        testParserWhitespace()
+
+        log("All tests pass")
+    }
+}

+ 1 - 1
Tests/LibWeb/TestLibWebSwiftBindings.swift

@@ -44,4 +44,4 @@ struct TestLibWebSwiftBindings {
 
         print("All tests pass", to: &standardError)
     }
-}
+}

+ 27 - 0
Userland/Libraries/LibWeb/CMakeLists.txt

@@ -794,4 +794,31 @@ if (ENABLE_SWIFT)
     list(APPEND LIBWEB_ALL_GENERATED_HEADERS ${generated_headers})
 
     generate_clang_module_map(LibWeb GENERATED_FILES ${LIBWEB_ALL_GENERATED_HEADERS})
+
+    target_compile_features(LibWeb PUBLIC cxx_std_23)
+
+    target_sources(LibWeb PRIVATE
+        HTML/Parser/HTMLToken.swift
+    )
+    target_compile_definitions(LibWeb PRIVATE LIBWEB_USE_SWIFT)
+    set_target_properties(LibWeb PROPERTIES Swift_MODULE_NAME "SwiftLibWeb")
+
+    # FIXME: These should be pulled automatically from interface compile options for the target
+    set(VFS_OVERLAY_OPTIONS
+        -Xcc -ivfsoverlay${CMAKE_CURRENT_BINARY_DIR}/vfs_overlay.yaml
+        -Xcc -ivfsoverlay${CMAKE_CURRENT_BINARY_DIR}/../LibGfx/vfs_overlay.yaml
+        -Xcc -ivfsoverlay${Lagom_BINARY_DIR}/AK/vfs_overlay.yaml
+    )
+    get_target_property(LIBWEB_NATIVE_DIRS LibWeb INCLUDE_DIRECTORIES)
+    _swift_generate_cxx_header(LibWeb "LibWeb-Swift.h"
+        SEARCH_PATHS ${LIBWEB_NATIVE_DIRS}
+        COMPILE_OPTIONS ${VFS_OVERLAY_OPTIONS}
+    )
+
+    # FIXME: https://gitlab.kitware.com/cmake/cmake/-/issues/26175
+    if (APPLE)
+        add_custom_command(TARGET LibWeb POST_BUILD
+            COMMAND install_name_tool -id @rpath/liblagom-web.0.dylib "$<TARGET_FILE:LibWeb>"
+        )
+    endif()
 endif()

+ 78 - 0
Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift

@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+/// A single HTML token: a tagged payload (`type`) together with the source
+/// positions at which the token starts and ends.
+public class HTMLToken {
+    /// A location in the input, expressed as line, column and byte offset.
+    public struct Position {
+        public var line: UInt
+        public var column: UInt
+        public var byteOffset: UInt
+
+        /// Creates a position; every component defaults to zero.
+        ///
+        /// Declared explicitly because the implicit memberwise initializer of
+        /// a public struct is only internal, which would leave the type
+        /// unconstructible from other modules.
+        public init(line: UInt = 0, column: UInt = 0, byteOffset: UInt = 0) {
+            self.line = line
+            self.column = column
+            self.byteOffset = byteOffset
+        }
+    }
+
+    /// One attribute carried by a `StartTag` or `EndTag` token, with the
+    /// positions of its name and value.
+    public struct Attribute {
+        public var prefix: String?
+        public var localName: String
+        // Trailing underscore: `namespace` is not used as the member name here.
+        public var namespace_: String?
+        public var value: String
+        public var nameStartPosition: Position
+        public var nameEndPosition: Position
+        public var valueStartPosition: Position
+        public var valueEndPosition: Position
+
+        /// Creates an attribute.
+        ///
+        /// Parameter order and labels match the implicit memberwise
+        /// initializer this replaces; the four positions additionally default
+        /// to `Position()` so callers that do not track locations can omit
+        /// them.
+        public init(
+            prefix: String? = nil,
+            localName: String,
+            namespace_: String? = nil,
+            value: String,
+            nameStartPosition: Position = Position(),
+            nameEndPosition: Position = Position(),
+            valueStartPosition: Position = Position(),
+            valueEndPosition: Position = Position()
+        ) {
+            self.prefix = prefix
+            self.localName = localName
+            self.namespace_ = namespace_
+            self.value = value
+            self.nameStartPosition = nameStartPosition
+            self.nameEndPosition = nameEndPosition
+            self.valueStartPosition = valueStartPosition
+            self.valueEndPosition = valueEndPosition
+        }
+    }
+
+    /// The kind of a token together with the data specific to that kind.
+    /// `Invalid` is the value a token has until a type is assigned.
+    public enum TokenType {
+        case Invalid
+        case DOCTYPE(
+            name: String?,
+            publicIdentifier: String?,
+            systemIdentifier: String?,
+            forceQuirksMode: Bool)
+        case StartTag(
+            tagName: String,
+            selfClosing: Bool,
+            selfClosingAcknowledged: Bool,
+            attributes: [Attribute])
+        case EndTag(
+            tagName: String,
+            selfClosing: Bool,
+            selfClosingAcknowledged: Bool,
+            attributes: [Attribute])
+        case Comment(data: String)
+        case Character(codePoint: Character)
+        case EndOfFile
+    }
+
+    public var type = TokenType.Invalid
+    public var startPosition = Position()
+    public var endPosition = Position()
+
+    /// Creates a token of type `Invalid` with zeroed positions.
+    public init() {}
+
+    /// Creates a token of the given type with zeroed positions.
+    public init(type: TokenType) {
+        self.type = type
+    }
+
+    /// Returns `true` when this token's type is `.Character`.
+    public func isCharacter() -> Bool {
+        if case .Character = type {
+            return true
+        }
+        return false
+    }
+
+    /// Returns `true` when this character token is tab, line feed, form feed,
+    /// carriage return or space.
+    ///
+    /// - Precondition: the token is a character token (`isCharacter()`).
+    public func isParserWhitespace() -> Bool {
+        precondition(isCharacter(), "isParserWhitespace() called on non-character token")
+
+        // Only reachable with a non-character token when precondition checks
+        // are compiled out; answer `false` as the original `default:` did.
+        guard case .Character(let codePoint) = type else {
+            return false
+        }
+
+        // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
+        switch codePoint {
+        case "\t", "\n", "\u{000C}" /* \f */, "\r", " ":
+            return true
+        default:
+            return false
+        }
+    }
+}