LibWeb: Implement the Data state for the Swift tokenizer
And add tests! This implementation closely follows the current C++ implementation, replacing macros and gotos with a slightly more complex state machine. It's very possible that an async version that yields tokens on "emit" would be even simpler, but let's get this one working first :).
This commit is contained in:
parent
01c4625a42
commit
77718c0a66
Notes:
github-actions[bot]
2024-08-29 04:32:14 +00:00
Author: https://github.com/ADKaster Commit: https://github.com/LadybirdBrowser/ladybird/commit/77718c0a664 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/1220
3 changed files with 296 additions and 21 deletions
Tests/LibWeb
Userland/Libraries/LibWeb/HTML/Parser
|
@ -5,8 +5,8 @@
|
|||
*/
|
||||
|
||||
import AK
|
||||
import Web
|
||||
import Testing
|
||||
import Web
|
||||
|
||||
@Suite
|
||||
struct TestHTMLTokenizerSwift {
|
||||
|
@ -30,4 +30,86 @@ struct TestHTMLTokenizerSwift {
|
|||
#expect(!token.isParserWhitespace())
|
||||
}
|
||||
}
|
||||
|
||||
// An input-less tokenizer yields exactly one EndOfFile token, then nil,
// and never leaves the Data state.
@Test func dataStateNoInput() {
    let tokenizer = HTMLTokenizer()
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    // First pull: the end-of-file token.
    let eofToken = tokenizer.nextToken()
    #expect(eofToken?.type == .EndOfFile)

    // Second pull: nothing left to hand out, state unchanged.
    let afterEOF = tokenizer.nextToken()
    #expect(afterEOF == nil)
    #expect(tokenizer.state == HTMLTokenizer.State.Data)
}
|
||||
|
||||
// A single ordinary character produces Character("X"), then EndOfFile,
// then nil, with the tokenizer ending in the Data state.
@Test func dataStateSingleChar() {
    let maybeTokenizer = HTMLTokenizer(input: "X")
    guard let tokenizer = maybeTokenizer else {
        Issue.record("Failed to create tokenizer for 'X'")
        return
    }
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    let characterToken = tokenizer.nextToken()
    #expect(characterToken?.type == .Character(codePoint: "X"))

    let eofToken = tokenizer.nextToken()
    #expect(eofToken?.type == .EndOfFile)

    let afterEOF = tokenizer.nextToken()
    #expect(afterEOF == nil)
    #expect(tokenizer.state == HTMLTokenizer.State.Data)
}
|
||||
|
||||
// "&" moves the tokenizer from Data to CharacterReference. Because that
// state is not implemented yet, the first pull reports EndOfFile and the
// second reports nil.
@Test func dataStateAmpersand() {
    let maybeTokenizer = HTMLTokenizer(input: "&")
    guard let tokenizer = maybeTokenizer else {
        Issue.record("Failed to create tokenizer for '&'")
        return
    }
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    let eofToken = tokenizer.nextToken()
    #expect(eofToken?.type == .EndOfFile)
    #expect(tokenizer.state == HTMLTokenizer.State.CharacterReference)

    let afterEOF = tokenizer.nextToken()
    #expect(afterEOF == nil)
}
|
||||
|
||||
// "<" moves the tokenizer from Data to TagOpen. Because that state is not
// implemented yet, the first pull reports EndOfFile and the second reports nil.
@Test func dataStateTagOpen() {
    let maybeTokenizer = HTMLTokenizer(input: "<")
    guard let tokenizer = maybeTokenizer else {
        Issue.record("Failed to create tokenizer for '<'")
        return
    }
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    let eofToken = tokenizer.nextToken()
    #expect(eofToken?.type == .EndOfFile)
    #expect(tokenizer.state == HTMLTokenizer.State.TagOpen)

    let afterEOF = tokenizer.nextToken()
    #expect(afterEOF == nil)
}
|
||||
|
||||
// A NUL between two ordinary characters is reported as U+FFFD; the
// surrounding characters pass through unchanged and EndOfFile follows.
@Test func dataStateNulChar() {
    let maybeTokenizer = HTMLTokenizer(input: "H\0I")
    guard let tokenizer = maybeTokenizer else {
        Issue.record("Failed to create tokenizer for 'H\\0I'")
        return
    }
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    let leading = tokenizer.nextToken()
    #expect(leading?.type == .Character(codePoint: "H"))

    let replacement = tokenizer.nextToken()
    #expect(replacement?.type == .Character(codePoint: "\u{FFFD}"))

    let trailing = tokenizer.nextToken()
    #expect(trailing?.type == .Character(codePoint: "I"))

    let eofToken = tokenizer.nextToken()
    #expect(eofToken?.type == .EndOfFile)

    #expect(tokenizer.state == HTMLTokenizer.State.Data)
}
|
||||
}
|
||||
|
|
|
@ -7,13 +7,13 @@
|
|||
@_exported import WebCxx
|
||||
|
||||
public class HTMLToken {
|
||||
public struct Position {
|
||||
public struct Position: Equatable {
|
||||
var line = UInt()
|
||||
var column = UInt()
|
||||
var byteOffset = UInt()
|
||||
}
|
||||
|
||||
public struct Attribute {
|
||||
public struct Attribute: Equatable {
|
||||
var prefix: Swift.String?
|
||||
var localName: Swift.String
|
||||
var namespace_: Swift.String?
|
||||
|
@ -24,7 +24,7 @@ public class HTMLToken {
|
|||
var valueEndPosition: Position
|
||||
}
|
||||
|
||||
public enum TokenType {
|
||||
public enum TokenType: Equatable {
|
||||
case Invalid
|
||||
case DOCTYPE(
|
||||
name: Swift.String?,
|
||||
|
@ -79,7 +79,7 @@ public class HTMLToken {
|
|||
}
|
||||
}
|
||||
|
||||
extension HTMLToken.Position: Equatable, CustomStringConvertible {
|
||||
extension HTMLToken.Position: CustomStringConvertible {
|
||||
public var description: Swift.String {
|
||||
return "\(self.line):\(self.column)"
|
||||
}
|
||||
|
@ -109,13 +109,11 @@ extension HTMLToken.TokenType: CustomStringConvertible {
|
|||
|
||||
extension HTMLToken: CustomStringConvertible {
|
||||
/// Textual form of the token: always the type, followed by `@start` when a
/// start position differs from a default-constructed `Position`, and
/// `@start-end` when the end position does as well.
public var description: Swift.String {
    // A default-constructed Position is used as the "nothing recorded" marker.
    let unset = Position()
    guard self.startPosition != unset else {
        return "HTMLToken(type: \(self.type))"
    }
    guard self.endPosition != unset else {
        return "HTMLToken(type: \(self.type))@\(self.startPosition)"
    }
    return "HTMLToken(type: \(self.type))@\(self.startPosition)-\(self.endPosition)"
}
|
||||
|
|
|
@ -20,9 +20,9 @@ extension Swift.String {
|
|||
}
|
||||
}
|
||||
|
||||
class HTMLTokenizer {
|
||||
public class HTMLTokenizer {
|
||||
|
||||
enum State {
|
||||
public enum State {
|
||||
case Data
|
||||
case RCDATA
|
||||
case RAWTEXT
|
||||
|
@ -105,29 +105,224 @@ class HTMLTokenizer {
|
|||
case NumericCharacterReferenceEnd
|
||||
}
|
||||
|
||||
var input = Swift.String()
|
||||
var state = State.Data
|
||||
var returnState = State.Data
|
||||
private var input = Swift.String()
|
||||
private var cursor: Swift.String.Index
|
||||
private var previousCursor: Swift.String.Index
|
||||
|
||||
var currentToken = HTMLToken()
|
||||
var queuedTokens = Deque<HTMLToken>()
|
||||
public private(set) var state = State.Data
|
||||
private var returnState = State.Data
|
||||
|
||||
public init() {}
|
||||
private var currentToken = HTMLToken()
|
||||
private var queuedTokens = Deque<HTMLToken>()
|
||||
|
||||
private var aborted = false
|
||||
private var hasEmittedEOF = false
|
||||
|
||||
/// Creates a tokenizer over the default (empty) input, with both cursors
/// at the start of that input.
public init() {
    let start = self.input.startIndex
    self.cursor = start
    self.previousCursor = start
}
|
||||
/// Creates a tokenizer over `input` decoded with `encoding`.
///
/// Fails (returns `nil`) when `Swift.String(decoding:as:)` cannot produce a
/// string for the given bytes and encoding name.
public init?(input: AK.StringView, encoding: AK.StringView) {
    guard let decoded = Swift.String(decoding: input, as: encoding) else {
        return nil
    }
    self.input = decoded

    // Both cursors start at the first character of the decoded text.
    let start = self.input.startIndex
    self.cursor = start
    self.previousCursor = start
}
|
||||
|
||||
/// Creates a tokenizer over `input`, decoding it as "UTF-8".
public convenience init?(input: AK.StringView) {
    let defaultEncoding: AK.StringView = "UTF-8"
    self.init(input: input, encoding: defaultEncoding)
}
|
||||
|
||||
/// Marks the tokenizer as aborted. `nextToken` consults this flag once its
/// queue of pending tokens is empty.
public func abort() {
    aborted = true
}
|
||||
|
||||
/// Advances the cursor by `count` characters, clamping at the end of the
/// input, and records the position just before the new cursor so that the
/// last consumed character can be un-consumed.
///
/// - Parameter count: Number of characters to move forward.
func skip(_ count: Int) {
    let end = input.endIndex
    cursor = input.index(cursor, offsetBy: count, limitedBy: end) ?? end

    // `index(before:)` traps at `startIndex` (e.g. `skip(0)` at the very
    // beginning, or any call on empty input), so pin the previous cursor to
    // the start in that case instead of crashing.
    let start = input.startIndex
    previousCursor = cursor > start ? input.index(before: cursor) : start
}
|
||||
|
||||
/// Returns the character `offset` positions away from the cursor without
/// consuming anything, or `nil` when that position lies outside the input.
///
/// - Parameter offset: Distance from the cursor; `0` is the character the
///   cursor currently points at.
/// - Returns: The character at that position, or `nil` if there is none.
func peekCodePoint(_ offset: Int = 0) -> Character? {
    // Limit in the direction of travel. The previous limit,
    // `index(before: endIndex)`, trapped on empty input and was ignored when
    // the cursor already sat at `endIndex`, which led to subscripting
    // `endIndex` itself.
    let limit = offset < 0 ? input.startIndex : input.endIndex
    guard let index = input.index(cursor, offsetBy: offset, limitedBy: limit),
          index < input.endIndex
    else {
        return nil
    }
    return input[index]
}
|
||||
|
||||
/// Consumes and returns the next character of the input with newlines
/// normalized, or `nil` once the cursor has reached the end of the input.
///
/// - Returns: `"\n"` for a CR LF pair or a lone CR, otherwise the consumed
///   character unchanged.
func nextCodePoint() -> Character? {
    guard self.cursor < self.input.endIndex else {
        return nil
    }

    // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
    // https://infra.spec.whatwg.org/#normalize-newlines
    let consumed = self.input[self.cursor]
    skip(1)

    switch consumed {
    case "\r\n":
        // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
        //
        // A Swift `Character` is an extended grapheme cluster, and CR LF is
        // never split across two clusters, so the pair always arrives here as
        // the single character "\r\n". Peeking for "\r" followed by "\n" as
        // separate characters can therefore never match.
        return "\n"
    case "\r":
        // replace every remaining U+000D CR code point with a U+000A LF code point.
        return "\n"
    default:
        return consumed
    }
}
|
||||
|
||||
/// Moves the cursor back to the position recorded by the last `skip`,
/// un-consuming the most recently consumed character.
func restoreCursorToPrevious() {
    cursor = previousCursor
}
|
||||
|
||||
/// Makes `token` the token currently being built.
func createNewToken(_ token: HTMLToken) {
    // FIXME: Assign Position
    currentToken = token
}
|
||||
|
||||
/// Outcome of one `nextTokenImpl` step, telling the `nextToken` driver loop
/// what to do next.
enum NextTokenState {
    /// Return `token` to the caller (`nil` means there is nothing to return).
    case Emit(token: HTMLToken?)

    /// The state changed; run another step with a freshly consumed character.
    case SwitchTo

    /// The state changed; run another step on `inputCharacter` instead of
    /// consuming a new one.
    case Reconsume(inputCharacter: Character?)

    /// Check the queue of pending tokens before running another step.
    case ReprocessQueue
}
|
||||
|
||||
/// Produces the next token, or `nil` when there is nothing (more) to emit.
///
/// Pending queued tokens are handed out first. Otherwise the state machine
/// is stepped until one step asks for a token to be emitted.
///
/// - Parameter stopAtInsertionPoint: Currently unused (see the FIXME below).
/// - Returns: The next token, or `nil`.
public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
    // Double optional on purpose:
    //   .some(token) -> return this token now
    //   .some(nil)   -> return nil now (queue drained and tokenizer aborted)
    //   .none        -> nothing decided, keep stepping the state machine
    let processQueue = { () -> HTMLToken?? in
        if let token = self.queuedTokens.popFirst() {
            return token
        }
        return self.aborted ? .some(nil) : .none
    }

    if let maybeToken = processQueue() {
        return maybeToken
    }

    // Non-nil only when the previous step asked for a character to be reconsumed.
    var nextInputCharacter: Character? = nil
    while true {
        // FIXME: Handle insertion point
        switch nextTokenImpl(nextInputCharacter) {
        case .Emit(let token):
            return token
        case .SwitchTo:
            nextInputCharacter = nil
        case .Reconsume(let character):
            nextInputCharacter = character
        case .ReprocessQueue:
            if let maybeToken = processQueue() {
                return maybeToken
            }
            nextInputCharacter = nil
        }
    }
}
|
||||
|
||||
/// Enters `state` and asks the driver loop to continue with a freshly
/// consumed character.
func switchTo(_ state: State) -> NextTokenState {
    // The parameter shadows the property, so `self.` is required here.
    self.state = state
    return NextTokenState.SwitchTo
}
|
||||
|
||||
/// Enters `state` and asks the driver loop to process `character` again
/// there instead of consuming a new one.
func reconsume(_ character: Character, `in` state: State) -> NextTokenState {
    // The parameter shadows the property, so `self.` is required here.
    self.state = state
    return NextTokenState.Reconsume(inputCharacter: character)
}
|
||||
|
||||
/// Returns to the remembered return state and asks the driver loop to check
/// the pending-token queue before continuing.
func switchToReturnState() -> NextTokenState {
    state = returnState
    return NextTokenState.ReprocessQueue
}
|
||||
|
||||
/// Returns to the remembered return state and un-consumes `character` so it
/// is read again there; the driver loop then checks the pending-token queue.
///
/// - Parameter character: The character to reconsume. When `nil`, the
///   cursor is left where it is.
func reconsumeInReturnState(_ character: Character?) -> NextTokenState {
    state = returnState
    guard character != nil else {
        return .ReprocessQueue
    }
    restoreCursorToPrevious()
    return .ReprocessQueue
}
|
||||
|
||||
/// Enters `state`, queues the token under construction, starts a fresh one,
/// and emits whatever is at the front of the queue.
func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState {
    self.state = state

    // Go through the queue so tokens queued earlier are emitted first.
    queuedTokens.append(currentToken)
    currentToken = HTMLToken()

    // The queue cannot be empty here: a token was appended just above.
    return .Emit(token: queuedTokens.popFirst())
}
|
||||
|
||||
/// Enters `state` and emits `character` as a character token.
func switchToAndEmitCharacter(_ state: State, character: Character) -> NextTokenState {
    self.state = state
    let emitted = emitCharacter(character)
    return emitted
}
|
||||
|
||||
/// Queues a character token for `character`, enters the given state, and
/// asks the driver loop to reconsume `currentInputCharacter` there.
///
/// The character token is only queued, not returned; it is handed out the
/// next time a token is popped from the front of the queue.
func emitCharacterAndReconsume(_ character: Character, `in` nextState: State, currentInputCharacter: Character?) -> NextTokenState {
    let characterToken = HTMLToken(type: .Character(codePoint: character))
    queuedTokens.append(characterToken)
    state = nextState
    return .Reconsume(inputCharacter: currentInputCharacter)
}
|
||||
|
||||
/// Emits the end-of-file token the first time it is called; every later
/// call emits `nil`.
func emitEOF() -> NextTokenState {
    guard !hasEmittedEOF else {
        return .Emit(token: nil)
    }
    hasEmittedEOF = true

    createNewToken(HTMLToken(type: .EndOfFile))

    // Go through the queue so tokens queued earlier are emitted first.
    queuedTokens.append(currentToken)
    currentToken = HTMLToken()

    // The queue cannot be empty here: a token was appended just above.
    return .Emit(token: queuedTokens.popFirst())
}
|
||||
|
||||
/// Queues the token under construction, starts a fresh one, and then emits
/// via `emitEOF`, which queues the end-of-file token behind it.
///
/// Must not be called after end-of-file has already been emitted.
func emitCurrentTokenFollowedByEOF() -> NextTokenState {
    precondition(!hasEmittedEOF)

    let finished = currentToken
    queuedTokens.append(finished)
    currentToken = HTMLToken()

    return emitEOF()
}
|
||||
|
||||
/// Builds a character token for `character`, queues it, and emits whatever
/// is at the front of the queue.
func emitCharacter(_ character: Character) -> NextTokenState {
    createNewToken(HTMLToken(type: .Character(codePoint: character)))

    // Go through the queue so tokens queued earlier are emitted first.
    queuedTokens.append(currentToken)
    currentToken = HTMLToken()

    // The queue cannot be empty here: a token was appended just above.
    return .Emit(token: queuedTokens.popFirst())
}
|
||||
|
||||
/// Runs a single step of the tokenizer state machine.
///
/// - Parameter nextInputCharacter: A character to reconsume; when `nil`, a
///   new character is consumed from the input instead.
/// - Returns: What the `nextToken` driver loop should do next.
func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
    // Not used by any state implemented so far; the discard below keeps the
    // compiler from warning about it.
    let dontConsumeNextInputCharacter = {
        self.restoreCursorToPrevious()
    }
    let _ = dontConsumeNextInputCharacter

    // FIXME: flushCodepointsConsumedAsACharacterReference needs currentBuilder

    // Handle reconsume by passing the character around in the state enum
    let currentInputCharacter = nextInputCharacter ?? nextCodePoint()

    switch self.state {
    // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
    case .Data:
        switch currentInputCharacter {
        case "&":
            self.returnState = .Data
            return switchTo(.CharacterReference)
        case "<":
            return switchTo(.TagOpen)
        case "\0":
            // FIXME: log_parse_error()
            // NOTE(review): the linked Data-state section appears to emit the
            // NUL itself rather than U+FFFD; confirm this substitution is intended.
            return emitCharacter("\u{FFFD}")
        case .none:
            return emitEOF()
        case .some(let character):
            return emitCharacter(character)
        }
    default:
        print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
        return emitEOF()
    }
}
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue