瀏覽代碼

LibWeb: Add more HTML tokenization states to Swift implementation

This patch adds support for start and end tags, as well as script tag
rules.
Andrew Kaster 9 月之前
父節點
當前提交
d96c7edfb6

+ 144 - 4
Tests/LibWeb/TestHTMLTokenizerSwift.swift

@@ -76,7 +76,7 @@ struct TestHTMLTokenizerSwift {
         #expect(token2 == nil)
     }
 
-    @Test func dataStateTagOpen() {
+    @Test func tagOpenOnly() {
         guard let tokenizer = HTMLTokenizer(input: "<") else {
             Issue.record("Failed to create tokenizer for '<'")
             return
@@ -84,11 +84,14 @@ struct TestHTMLTokenizerSwift {
         #expect(tokenizer.state == HTMLTokenizer.State.Data)  // initial state
 
         let token = tokenizer.nextToken()
-        #expect(token?.type == .EndOfFile)
-        #expect(tokenizer.state == HTMLTokenizer.State.TagOpen)
+        #expect(token?.type == .Character(codePoint: "<"))
 
         let token2 = tokenizer.nextToken()
-        #expect(token2 == nil)
+        #expect(token2?.type == .EndOfFile)
+        #expect(tokenizer.state == HTMLTokenizer.State.TagOpen)
+
+        let token3 = tokenizer.nextToken()
+        #expect(token3 == nil)
     }
 
     @Test func dataStateNulChar() {
@@ -112,4 +115,141 @@ struct TestHTMLTokenizerSwift {
 
         #expect(tokenizer.state == HTMLTokenizer.State.Data)
     }
+
+    /// A lone `<script>` start tag with one double-quoted attribute is expected to produce a
+    /// StartTag token followed by EndOfFile, with the tokenizer in the Data state afterwards.
+    @Test func scriptTagWithAttributes() {
+        guard let tokenizer = HTMLTokenizer(input: "<script type=\"text/javascript\">") else {
+            Issue.record("Failed to create tokenizer for '<script type=\"text/javascript\">'")
+            return
+        }
+        // A freshly created tokenizer is expected to start in the Data state.
+        #expect(tokenizer.state == HTMLTokenizer.State.Data)
+
+        let expectedAttributes = [HTMLToken.Attribute(localName: "type", value: "text/javascript")]
+        let startTag = tokenizer.nextToken()
+        #expect(startTag?.type == .StartTag(tagName: "script", attributes: expectedAttributes))
+
+        let endOfFile = tokenizer.nextToken()
+        #expect(endOfFile?.type == .EndOfFile)
+
+        #expect(tokenizer.state == HTMLTokenizer.State.Data)
+    }
+
+    /// `<script>var x = 1;</script>` is expected to tokenize to a start tag, one Character token
+    /// per character of the script text, the matching end tag, and finally EndOfFile.
+    @Test func scriptWithContent() {
+        guard let tokenizer = HTMLTokenizer(input: "<script>var x = 1;</script>") else {
+            Issue.record("Failed to create tokenizer for '<script>var x = 1;</script>'")
+            return
+        }
+
+        let openingTag = tokenizer.nextToken()
+        #expect(openingTag?.type == .StartTag(tagName: "script", attributes: []))
+
+        let scriptText = "var x = 1;"
+        for expectedCharacter in scriptText {
+            let characterToken = tokenizer.nextToken()
+            #expect(characterToken?.type == .Character(codePoint: expectedCharacter))
+        }
+
+        let closingTag = tokenizer.nextToken()
+        #expect(closingTag?.type == .EndTag(tagName: "script"))
+
+        let endOfFile = tokenizer.nextToken()
+        #expect(endOfFile?.type == .EndOfFile)
+    }
+
+    /// `<div>hi</div>` is expected to tokenize to a start tag, the characters "h" and "i",
+    /// the end tag, and EndOfFile.
+    @Test func simpleDivWithContent() {
+        guard let tokenizer = HTMLTokenizer(input: "<div>hi</div>") else {
+            Issue.record("Failed to create tokenizer for '<div>hi</div>'")
+            return
+        }
+        // A freshly created tokenizer is expected to start in the Data state.
+        #expect(tokenizer.state == HTMLTokenizer.State.Data)
+
+        let openingTag = tokenizer.nextToken()
+        #expect(openingTag?.type == .StartTag(tagName: "div", attributes: []))
+
+        // One Character token per character of the element's text content.
+        for expectedCharacter in "hi" {
+            let characterToken = tokenizer.nextToken()
+            #expect(characterToken?.type == .Character(codePoint: expectedCharacter))
+        }
+
+        let closingTag = tokenizer.nextToken()
+        #expect(closingTag?.type == .EndTag(tagName: "div"))
+
+        let endOfFile = tokenizer.nextToken()
+        #expect(endOfFile?.type == .EndOfFile)
+    }
+
+    /// `<div class="foo">hi</div>` is expected to tokenize to a start tag carrying the `class`
+    /// attribute, the characters "h" and "i", the end tag, and EndOfFile.
+    @Test func simpleDivWithContentAndAttributes() {
+        guard let tokenizer = HTMLTokenizer(input: "<div class=\"foo\">hi</div>") else {
+            Issue.record("Failed to create tokenizer for '<div class=\"foo\">hi</div>'")
+            return
+        }
+        // A freshly created tokenizer is expected to start in the Data state.
+        #expect(tokenizer.state == HTMLTokenizer.State.Data)
+
+        let expectedAttributes = [HTMLToken.Attribute(localName: "class", value: "foo")]
+        let openingTag = tokenizer.nextToken()
+        #expect(openingTag?.type == .StartTag(tagName: "div", attributes: expectedAttributes))
+
+        // One Character token per character of the element's text content.
+        for expectedCharacter in "hi" {
+            let characterToken = tokenizer.nextToken()
+            #expect(characterToken?.type == .Character(codePoint: expectedCharacter))
+        }
+
+        let closingTag = tokenizer.nextToken()
+        #expect(closingTag?.type == .EndTag(tagName: "div"))
+
+        let endOfFile = tokenizer.nextToken()
+        #expect(endOfFile?.type == .EndOfFile)
+    }
+
+    /// Two consecutive `<div>` elements, one with an unquoted attribute value and one with a
+    /// single-quoted value, are each expected to tokenize to start tag, text characters, and
+    /// end tag, followed by a single EndOfFile.
+    @Test func severalDivsWithAttributesAndContent() {
+        // Explicitly use unquoted and single quotes for attribute values
+        guard let tokenizer = HTMLTokenizer(input: "<div class=foo>hi</div><div class='bar'>bye</div>") else {
+            // The failure message quotes the exact input above so a failure report is not misleading.
+            Issue.record("Failed to create tokenizer for '<div class=foo>hi</div><div class='bar'>bye</div>'")
+            return
+        }
+
+        let token = tokenizer.nextToken()
+        #expect(token?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "foo")]))
+
+        for codePoint in "hi" {
+            let token = tokenizer.nextToken()
+            #expect(token?.type == .Character(codePoint: codePoint))
+        }
+
+        let token2 = tokenizer.nextToken()
+        #expect(token2?.type == .EndTag(tagName: "div"))
+
+        let token3 = tokenizer.nextToken()
+        #expect(token3?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "bar")]))
+
+        for codePoint in "bye" {
+            let token = tokenizer.nextToken()
+            #expect(token?.type == .Character(codePoint: codePoint))
+        }
+
+        let token4 = tokenizer.nextToken()
+        #expect(token4?.type == .EndTag(tagName: "div"))
+
+        let token5 = tokenizer.nextToken()
+        #expect(token5?.type == .EndOfFile)
+    }
+
+    /// A start tag with two attributes is expected to report both, in source order, and an end
+    /// tag written with an attribute is expected to carry that attribute on the EndTag token.
+    @Test func startTagWithMultipleAttributes() {
+        guard let tokenizer = HTMLTokenizer(input: "<div class=\"foo\" id=\"bar\">hi</div attr=endTagAttributeWhee>") else {
+            // The failure message quotes the exact input above, including the end tag attribute.
+            Issue.record("Failed to create tokenizer for '<div class=\"foo\" id=\"bar\">hi</div attr=endTagAttributeWhee>'")
+            return
+        }
+
+        let token = tokenizer.nextToken()
+        #expect(token?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "foo"), HTMLToken.Attribute(localName: "id", value: "bar")]))
+
+        for codePoint in "hi" {
+            let token = tokenizer.nextToken()
+            #expect(token?.type == .Character(codePoint: codePoint))
+        }
+
+        let token2 = tokenizer.nextToken()
+        #expect(token2?.type == .EndTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "attr", value: "endTagAttributeWhee")]))
+
+        let token3 = tokenizer.nextToken()
+        #expect(token3?.type == .EndOfFile)
+    }
 }

+ 83 - 14
Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift

@@ -14,14 +14,19 @@ public class HTMLToken {
     }
 
     public struct Attribute: Equatable {
-        var prefix: Swift.String?
-        var localName: Swift.String
-        var namespace_: Swift.String?
-        var value: Swift.String
-        var nameStartPosition: Position
-        var nameEndPosition: Position
-        var valueStartPosition: Position
-        var valueEndPosition: Position
+        public var prefix: Swift.String? = nil
+        public var localName: Swift.String
+        public var namespace_: Swift.String? = nil
+        public var value: Swift.String
+        public var nameStartPosition = Position()
+        public var nameEndPosition = Position()
+        public var valueStartPosition = Position()
+        public var valueEndPosition = Position()
+
+        public init(localName: Swift.String, value: Swift.String) {  // Only the name and value are supplied; every other field keeps its declared default.
+            self.localName = localName
+            self.value = value
+        }
     }
 
     public enum TokenType: Equatable {
@@ -33,14 +38,14 @@ public class HTMLToken {
             forceQuirksMode: Bool)
         case StartTag(
             tagName: Swift.String,
-            selfClosing: Bool,
-            selfClosingAcknowledged: Bool,
-            attributes: [Attribute])
+            selfClosing: Bool = false,
+            selfClosingAcknowledged: Bool = false,
+            attributes: [Attribute] = [])
         case EndTag(
             tagName: Swift.String,
-            selfClosing: Bool,
-            selfClosingAcknowledged: Bool,
-            attributes: [Attribute])
+            selfClosing: Bool = false,
+            selfClosingAcknowledged: Bool = false,
+            attributes: [Attribute] = [])
         case Comment(data: Swift.String)
         case Character(codePoint: Character)
         case EndOfFile
@@ -53,6 +58,24 @@ public class HTMLToken {
         return false
     }
 
+    /// Returns `true` when this token's type is the `EndTag` case.
+    public func isEndTag() -> Bool {
+        guard case .EndTag = self.type else {
+            return false
+        }
+        return true
+    }
+
+    /// Returns `true` when this token's type is the `StartTag` case.
+    public func isStartTag() -> Bool {
+        guard case .StartTag = self.type else {
+            return false
+        }
+        return true
+    }
+
+    /// Returns `true` when this token is either a start tag or an end tag.
+    public func isTag() -> Bool {
+        if isStartTag() {
+            return true
+        }
+        return isEndTag()
+    }
+
     public func isParserWhitespace() -> Bool {
         precondition(isCharacter(), "isParserWhitespace() called on non-character token")
 
@@ -73,6 +96,52 @@ public class HTMLToken {
     public var startPosition = Position()
     public var endPosition = Position()
 
+    /// The attribute list carried by a start tag or end tag token.
+    ///
+    /// In-place mutation of an enum's associated value does not appear to be supported (see
+    /// https://forums.swift.org/t/in-place-mutation-of-an-enum-associated-value/11747), so the
+    /// setter rebuilds `type` with the new list and leaves the other payload fields unchanged.
+    /// Both accessors trap when the token is not a tag.
+    public var attributes: [Attribute] {
+        get {
+            switch self.type {
+            case .StartTag(_, _, _, let current), .EndTag(_, _, _, let current):
+                return current
+            default:
+                preconditionFailure("attributes called on non-tag token")
+            }
+        }
+        set {
+            switch self.type {
+            case .StartTag(let name, let isSelfClosing, let isAcknowledged, _):
+                self.type = .StartTag(
+                    tagName: name,
+                    selfClosing: isSelfClosing,
+                    selfClosingAcknowledged: isAcknowledged,
+                    attributes: newValue)
+            case .EndTag(let name, let isSelfClosing, let isAcknowledged, _):
+                self.type = .EndTag(
+                    tagName: name,
+                    selfClosing: isSelfClosing,
+                    selfClosingAcknowledged: isAcknowledged,
+                    attributes: newValue)
+            default:
+                preconditionFailure("attributes= called on non-tag token")
+            }
+        }
+    }
+    /// The tag name carried by a start tag or end tag token.
+    ///
+    /// The setter rebuilds `type` with the new name and leaves the other payload fields
+    /// unchanged. Both accessors trap when the token is not a tag.
+    public var tagName: Swift.String {
+        get {
+            switch self.type {
+            case .StartTag(let current, _, _, _), .EndTag(let current, _, _, _):
+                return current
+            default:
+                preconditionFailure("tagName called on non-tag token")
+            }
+        }
+        set {
+            switch self.type {
+            case .StartTag(_, let isSelfClosing, let isAcknowledged, let existingAttributes):
+                self.type = .StartTag(
+                    tagName: newValue,
+                    selfClosing: isSelfClosing,
+                    selfClosingAcknowledged: isAcknowledged,
+                    attributes: existingAttributes)
+            case .EndTag(_, let isSelfClosing, let isAcknowledged, let existingAttributes):
+                self.type = .EndTag(
+                    tagName: newValue,
+                    selfClosing: isSelfClosing,
+                    selfClosingAcknowledged: isAcknowledged,
+                    attributes: existingAttributes)
+            default:
+                preconditionFailure("tagName= called on non-tag token")
+            }
+        }
+    }
+
     public init() {}
     public init(type: TokenType) {
         self.type = type

+ 715 - 4
Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift

@@ -18,6 +18,12 @@ extension Swift.String {
             return nil
         }
     }
+
+    /// Empties this string and returns the contents it held beforehand.
+    public mutating func takeString() -> Swift.String {
+        var taken = Swift.String()
+        // Exchange the empty string with our contents: `self` becomes "" and `taken` holds the old text.
+        swap(&taken, &self)
+        return taken
+    }
 }
 
 public class HTMLTokenizer {
@@ -115,9 +121,24 @@ public class HTMLTokenizer {
     private var currentToken = HTMLToken()
     private var queuedTokens = Deque<HTMLToken>()
 
+    private var currentBuilder = Swift.String()  // Text accumulated for the current tag name or attribute name/value; drained with takeString().
+    private var temporaryBuffer = Swift.String()  // Scratch buffer; the end-tag-name states record consumed letters here so they can be re-emitted as characters.
+    private var lastStartTagName: Swift.String? = nil  // Compared against end tag names by isAppropriateEndTagToken(_:).
+    private var currentTokensAttributes: [HTMLToken.Attribute]? = nil  // Attributes collected for the tag token under construction; reset to nil when that token is emitted.
+    private var currentAttribute: HTMLToken.Attribute? = nil  // Attribute currently being built; finalizeCurrentAttribute(_:) appends it to currentTokensAttributes.
+
     private var aborted = false
     private var hasEmittedEOF = false
 
+    // These sets are never mutated, so they are constants rather than shared mutable statics.
+
+    // https://infra.spec.whatwg.org/#ascii-upper-alpha
+    static private let asciiUpperAlpha = CharacterSet(charactersIn: "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
+
+    // https://infra.spec.whatwg.org/#ascii-lower-alpha
+    static private let asciiLowerAlpha = CharacterSet(charactersIn: "abcdefghijklmnopqrstuvwxyz")
+
+    // https://infra.spec.whatwg.org/#ascii-alpha
+    static private let asciiAlpha = asciiUpperAlpha.union(asciiLowerAlpha)
+
     public init() {
         self.cursor = self.input.startIndex
         self.previousCursor = self.input.startIndex
@@ -181,14 +202,37 @@ public class HTMLTokenizer {
 
     func createNewToken(_ token: HTMLToken) {
         self.currentToken = token
+        if self.currentToken.isTag() {  // Tag tokens get a fresh, empty attribute list to collect into.
+            self.currentTokensAttributes = []
+        }
         // FIXME: Assign Position
     }
 
+    enum AttributeStringBehavior {  // What finalizeCurrentAttribute(_:) does with the text accumulated in currentBuilder.
+        case SetName  // Store it as the attribute's localName.
+        case SetValue  // Store it as the attribute's value.
+        case IgnoreString  // Discard it.
+    }
+    /// Completes the attribute under construction and appends it to the current tag's
+    /// attribute list.
+    ///
+    /// The text accumulated in `currentBuilder` is always drained; `behavior` decides whether it
+    /// becomes the attribute's name, becomes its value, or is discarded. Traps when there is no
+    /// attribute in progress or no attribute list to append to.
+    func finalizeCurrentAttribute(_ behavior: AttributeStringBehavior) {
+        precondition(self.currentAttribute != nil && self.currentTokensAttributes != nil)
+        // Safe to unwrap: the precondition above guarantees an attribute is in progress.
+        var finishedAttribute = self.currentAttribute!
+        let pendingText = self.currentBuilder.takeString()
+        switch behavior {
+        case .SetName:
+            finishedAttribute.localName = pendingText
+        case .SetValue:
+            finishedAttribute.value = pendingText
+        case .IgnoreString:
+            break
+        }
+        self.currentTokensAttributes!.append(finishedAttribute)
+        self.currentAttribute = nil
+    }
+
     enum NextTokenState {
         case Emit(token: HTMLToken?)
         case SwitchTo
         case Reconsume(inputCharacter: Character?)
         case ReprocessQueue
+        case Continue  // Returned by continueInCurrentState(); nextToken() handles it the same way as SwitchTo.
     }
 
     public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
@@ -210,7 +254,7 @@ public class HTMLTokenizer {
             switch nextTokenImpl(nextInputCharacter) {
             case .Emit(let token):
                 return token
-            case .SwitchTo:
+            case .SwitchTo, .Continue:
                 nextInputCharacter = nil
                 break
             case .Reconsume(let character):
@@ -226,12 +270,16 @@ public class HTMLTokenizer {
         }
     }
 
+    /// Reports `.Continue` to the token loop without changing `state`.
+    func continueInCurrentState() -> NextTokenState {
+        .Continue
+    }
+
     /// Moves the tokenizer to `state` and reports `.SwitchTo` to the token loop.
     func switchTo(_ state: State) -> NextTokenState {
         self.state = state
         return .SwitchTo
     }
 
-    func reconsume(_ character: Character, `in` state: State) -> NextTokenState {
+    func reconsume(_ character: Character?, `in` state: State) -> NextTokenState {  // character is optional so callers can pass their optional current input character straight through.
         self.state = state
         return .Reconsume(inputCharacter: character)
     }
@@ -251,6 +299,10 @@ public class HTMLTokenizer {
 
     func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState {
         self.state = state
+        if self.currentToken.isTag() {
+            self.currentToken.attributes = self.currentTokensAttributes ?? []
+            self.currentTokensAttributes = nil
+        }
         self.queuedTokens.append(self.currentToken)
         self.currentToken = HTMLToken()
         return .Emit(token: self.queuedTokens.popFirst()!)
@@ -280,6 +332,10 @@ public class HTMLTokenizer {
 
     func emitCurrentTokenFollowedByEOF() -> NextTokenState {
         precondition(!self.hasEmittedEOF)
+        if self.currentToken.isTag() {
+            self.currentToken.attributes = self.currentTokensAttributes ?? []
+            self.currentTokensAttributes = nil
+        }
         self.queuedTokens.append(self.currentToken)
         self.currentToken = HTMLToken()
         return emitEOF()
@@ -292,20 +348,44 @@ public class HTMLTokenizer {
         return .Emit(token: self.queuedTokens.popFirst()!)
     }
 
+    /// Drains `temporaryBuffer` into the place the character reference was encountered.
+    ///
+    /// When the return state is one of the attribute value states the text is appended to
+    /// `currentBuilder`; otherwise each character is queued as its own Character token.
+    func flushCodepointsConsumedAsACharacterReference() {
+        let insideAttribute = consumedAsPartOfAnAttribute()
+        let consumedText = self.temporaryBuffer.takeString()
+        guard !insideAttribute else {
+            self.currentBuilder += consumedText
+            return
+        }
+        for character in consumedText {
+            self.queuedTokens.append(HTMLToken(type: .Character(codePoint: character)))
+        }
+    }
+
+    /// Returns `true` when `returnState` is one of the three attribute value states
+    /// (double-quoted, single-quoted, or unquoted).
+    func consumedAsPartOfAnAttribute() -> Bool {
+        let resumeState = self.returnState
+        if resumeState == .AttributeValueDoubleQuoted {
+            return true
+        }
+        if resumeState == .AttributeValueSingleQuoted {
+            return true
+        }
+        return resumeState == .AttributeValueUnquoted
+    }
+
+    /// Reports whether `token`, which must be an end tag, has the same tag name as
+    /// `lastStartTagName`. Returns `false` when no start tag name has been recorded, and traps
+    /// when `token` is not an end tag.
+    ///
+    /// NOTE(review): the comparison uses the name stored in the token itself, not any text still
+    /// pending in `currentBuilder` — confirm callers store the name on the token first.
+    func isAppropriateEndTagToken(_ token: HTMLToken) -> Bool {
+        guard case .EndTag(let candidateName, _, _, _) = token.type else {
+            preconditionFailure("isAppropriateEndTagToken called with non-end-tag token")
+        }
+        guard let recordedName = self.lastStartTagName else {
+            return false
+        }
+        return recordedName == candidateName
+    }
+
     func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
         let dontConsumeNextInputCharacter = {
             self.restoreCursorToPrevious()
         }
         let _ = dontConsumeNextInputCharacter
 
-        // FIXME: flushCodepointsConsumedAsACharacterReference needs currentBuilder
-
         // Handle reconsume by passing the character around in the state enum
         let currentInputCharacter = nextInputCharacter ?? nextCodePoint()
 
         switch self.state {
         // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
         case .Data:
+            precondition(currentTokensAttributes == nil)
             switch currentInputCharacter {
             case "&":
                 self.returnState = .Data
@@ -320,6 +400,637 @@ public class HTMLTokenizer {
             default:
                 return emitCharacter(currentInputCharacter!)
             }
+
+        // 13.2.5.2 RCDATA state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
+        case .RCDATA:
+            switch currentInputCharacter {
+            case "&":
+                self.returnState = .RCDATA
+                return switchTo(.CharacterReference)
+            case "<":
+                return switchTo(.RCDATALessThanSign)
+            case "\0":
+                // FIXME: log_parse_error()
+                return emitCharacter("\u{FFFD}")
+            case nil:
+                return emitEOF()
+            default:
+                return emitCharacter(currentInputCharacter!)
+            }
+
+        // 13.2.5.3 RAWTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
+        case .RAWTEXT:
+            switch currentInputCharacter {
+            case "<":
+                return switchTo(.RAWTEXTLessThanSign)
+            case "\0":
+                // FIXME: log_parse_error()
+                return emitCharacter("\u{FFFD}")
+            case nil:
+                return emitEOF()
+            default:
+                return emitCharacter(currentInputCharacter!)
+            }
+        // 13.2.5.4 Script data state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
+        case .ScriptData:
+            switch currentInputCharacter {
+            case "<":
+                return switchTo(.ScriptDataLessThanSign)
+            case "\0":
+                // FIXME: log_parse_error()
+                return emitCharacter("\u{FFFD}")
+            case nil:
+                return emitEOF()
+            default:
+                return emitCharacter(currentInputCharacter!)
+            }
+        // 13.2.5.5 PLAINTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
+        case .PLAINTEXT:
+            switch currentInputCharacter {
+            case "\0":
+                // FIXME: log_parse_error()
+                return emitCharacter("\u{FFFD}")
+            case nil:
+                return emitEOF()
+            default:
+                return emitCharacter(currentInputCharacter!)
+            }
+        // 13.2.5.6 Tag open state, https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
+        case .TagOpen:
+            switch currentInputCharacter {
+            case "!":
+                return switchTo(.MarkupDeclarationOpen)
+            case "/":
+                return switchTo(.EndTagOpen)
+            case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
+                createNewToken(HTMLToken(type: .StartTag(tagName: "")))
+                return reconsume(currentInputCharacter!, in: .TagName)
+            case "?":
+                // FIXME: log_parse_error()
+                createNewToken(HTMLToken(type: .Comment(data: "")))
+                return reconsume(currentInputCharacter!, in: .BogusComment)
+            case nil:
+                // FIXME: log_parse_error()
+                queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
+                return emitEOF()
+            default:
+                // FIXME: log_parse_error()
+                queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
+                return reconsume(currentInputCharacter!, in: .Data)
+            }
+        // 13.2.5.7 End tag open state, https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
+        case .EndTagOpen:
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
+                createNewToken(HTMLToken(type: .EndTag(tagName: "")))
+                return reconsume(currentInputCharacter!, in: .TagName)
+            default:
+                return emitEOF()
+            }
+        // 13.2.5.8 Tag name state, https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
+        case .TagName:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                self.currentToken.tagName = self.currentBuilder.takeString()
+                return switchTo(.BeforeAttributeName)
+            case "/":
+                self.currentToken.tagName = self.currentBuilder.takeString()
+                return switchTo(.SelfClosingStartTag)
+            case ">":
+                self.currentToken.tagName = self.currentBuilder.takeString()
+                return switchToAndEmitCurrentToken(.Data)
+            case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
+                currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
+                return continueInCurrentState()
+            case "\0":
+                // FIXME: log_parse_error()
+                currentBuilder += "\u{FFFD}"
+                return continueInCurrentState()
+            case nil:
+                // FIXME: log_parse_error()
+                return emitEOF()
+            default:
+                currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            }
+        // 13.2.5.9 RCDATA less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
+        case .RCDATALessThanSign:
+            switch currentInputCharacter {
+            case "/":
+                self.temporaryBuffer = ""
+                return switchTo(.RCDATAEndTagOpen)
+            default:
+                return emitCharacterAndReconsume("<", in: .RCDATA, currentInputCharacter: currentInputCharacter)
+            }
+        // 13.2.5.10 RCDATA end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
+        case .RCDATAEndTagOpen:
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
+                createNewToken(HTMLToken(type: .EndTag(tagName: "")))
+                return reconsume(currentInputCharacter!, in: .RCDATAEndTagName)
+            default:
+                queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
+                queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
+                return reconsume(currentInputCharacter, in: .RCDATA)
+            }
+        // 13.2.5.11 RCDATA end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
+        case .RCDATAEndTagName:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                if self.isAppropriateEndTagToken(currentToken) {
+                    return switchTo(.BeforeAttributeName)
+                }
+                break
+            case "/":
+                if self.isAppropriateEndTagToken(currentToken) {
+                    return switchTo(.SelfClosingStartTag)
+                }
+                break
+            case ">":
+                if self.isAppropriateEndTagToken(currentToken) {
+                    return switchToAndEmitCurrentToken(.Data)
+                }
+                break
+            case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
+                self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
+                self.temporaryBuffer.append(c)
+                return continueInCurrentState()
+            case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
+                self.currentBuilder.append(c)
+                self.temporaryBuffer.append(c)
+                return continueInCurrentState()
+            default:
+                break
+            }
+
+            // First three steps fall through to the "anything else" block
+            self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
+            self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
+            // NOTE: The spec doesn't mention this, but it seems that the current token (an end tag) is just dropped in this case.
+            self.currentBuilder = ""
+            for codePoint in self.temporaryBuffer {
+                self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
+            }
+            return reconsume(currentInputCharacter, in: .RCDATA)
+        // 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
+        case .ScriptDataLessThanSign:
+            switch currentInputCharacter {
+            case "/":
+                self.temporaryBuffer = ""
+                return switchTo(.ScriptDataEndTagOpen)
+            case "!":
+                self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
+                self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "!")))
+                return switchTo(.ScriptDataEscapeStart)
+            default:
+                return emitCharacterAndReconsume("<", in: .ScriptData, currentInputCharacter: currentInputCharacter)
+            }
+        // 13.2.5.16 Script data end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
+        case .ScriptDataEndTagOpen:
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
+                createNewToken(HTMLToken(type: .EndTag(tagName: "")))
+                return reconsume(currentInputCharacter!, in: .ScriptDataEndTagName)
+            default:
+                queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
+                queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
+                return reconsume(currentInputCharacter, in: .ScriptData)
+            }
+        // 13.2.5.17 Script data end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
+        case .ScriptDataEndTagName:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                if self.isAppropriateEndTagToken(currentToken) {
+                    return switchTo(.BeforeAttributeName)
+                }
+                break
+            case "/":
+                if self.isAppropriateEndTagToken(currentToken) {
+                    return switchTo(.SelfClosingStartTag)
+                }
+                break
+            case ">":
+                if self.isAppropriateEndTagToken(currentToken) {
+                    return switchToAndEmitCurrentToken(.Data)
+                }
+                break
+            case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
+                self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
+                self.temporaryBuffer.append(c)
+                return continueInCurrentState()
+            case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
+                self.currentBuilder.append(c)
+                self.temporaryBuffer.append(c)
+                return continueInCurrentState()
+            default:
+                break
+            }
+
+            // First three steps fall through to the "anything else" block
+            self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
+            self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
+            // NOTE: The spec doesn't mention this, but it seems that the current token (an end tag) is just dropped in this case.
+            self.currentBuilder = ""
+            for codePoint in self.temporaryBuffer {
+                self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
+            }
+            return reconsume(currentInputCharacter, in: .ScriptData)
+        // 13.2.5.18 Script data escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
+        case .ScriptDataEscapeStart:
+            switch currentInputCharacter {
+            case "-":
+                return switchToAndEmitCharacter(.ScriptDataEscapeStartDash, character: "-")
+            default:
+                return reconsume(currentInputCharacter, in: .ScriptData)
+            }
+        // 13.2.5.19 Script data escape start dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
+        case .ScriptDataEscapeStartDash:
+            switch currentInputCharacter {
+            case "-":
+                return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-")
+            default:
+                return reconsume(currentInputCharacter, in: .ScriptData)
+            }
+        // 13.2.5.20 Script data escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
+        case .ScriptDataEscaped:
+            switch currentInputCharacter {
+            case "-":
+                return switchToAndEmitCharacter(.ScriptDataEscapedDash, character: "-")
+            case "<":
+                return switchTo(.ScriptDataEscapedLessThanSign)
+            case "\0":
+                // FIXME: log_parse_error()
+                return emitCharacter("\u{FFFD}")
+            case nil:
+                // FIXME: log_parse_error()
+                return emitEOF()
+            default:
+                return emitCharacter(currentInputCharacter!)
+            }
+        // 13.2.5.21 Script data escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
+        case .ScriptDataEscapedDash:
+            switch currentInputCharacter {
+            case "-":
+                return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-")
+            case "<":
+                return switchTo(.ScriptDataEscapedLessThanSign)
+            case "\0":
+                // FIXME: log_parse_error()
+                return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}")
+            case nil:
+                // FIXME: log_parse_error()
+                return emitEOF()
+            default:
+                return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
+            }
+        // 13.2.5.22 Script data escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
+        case .ScriptDataEscapedDashDash:
+            switch currentInputCharacter {
+            case "-":
+                return emitCharacter("-")
+            case "<":
+                return switchTo(.ScriptDataEscapedLessThanSign)
+            case ">":
+                return switchToAndEmitCharacter(.ScriptData, character: ">")
+            case "\0":
+                // FIXME: log_parse_error()
+                return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}")
+            case nil:
+                // FIXME: log_parse_error()
+                return emitEOF()
+            default:
+                return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
+            }
+        // 13.2.5.23 Script data escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
+        case .ScriptDataEscapedLessThanSign:
+            switch currentInputCharacter {
+            case "/":
+                self.temporaryBuffer = ""
+                return switchTo(.ScriptDataEscapedEndTagOpen)
+            case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
+                self.temporaryBuffer = ""
+                self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
+                return reconsume(currentInputCharacter!, in: .ScriptDataDoubleEscapeStart)
+            default:
+                return emitCharacterAndReconsume("<", in: .ScriptDataEscaped, currentInputCharacter: currentInputCharacter)
+            }
+        // 13.2.5.24 Script data escaped end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
+        case .ScriptDataEscapedEndTagOpen:
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
+                createNewToken(HTMLToken(type: .EndTag(tagName: "")))
+                return reconsume(currentInputCharacter!, in: .ScriptDataEscapedEndTagName)
+            default:
+                queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
+                queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
+                return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
+            }
+        // 13.2.5.25 Script data escaped end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
+        case .ScriptDataEscapedEndTagName:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                if self.isAppropriateEndTagToken(currentToken) {
+                    return switchTo(.BeforeAttributeName)
+                }
+                break
+            case "/":
+                if self.isAppropriateEndTagToken(currentToken) {
+                    return switchTo(.SelfClosingStartTag)
+                }
+                break
+            case ">":
+                if self.isAppropriateEndTagToken(currentToken) {
+                    return switchToAndEmitCurrentToken(.Data)
+                }
+                break
+            case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
+                self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
+                self.temporaryBuffer.append(c)
+                return continueInCurrentState()
+            case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
+                self.currentBuilder.append(c)
+                self.temporaryBuffer.append(c)
+                return continueInCurrentState()
+            default:
+                break
+            }
+
+            // First three steps fall through to the "anything else" block
+            self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
+            self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
+            // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
+            self.currentBuilder = ""
+            for codePoint in self.temporaryBuffer {
+                self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
+            }
+            return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
+        // 13.2.5.26 Script data double escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
+        case .ScriptDataDoubleEscapeStart:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ", "/", ">":
+                if self.temporaryBuffer == "script" {
+                    return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
+                } else {
+                    return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
+                }
+            case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
+                self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
+                return emitCharacter(currentInputCharacter!)
+            case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
+                self.temporaryBuffer.append(c)
+                return emitCharacter(currentInputCharacter!)
+            default:
+                return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
+            }
+        // 13.2.5.27 Script data double escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
+        // Every consumed character is emitted as a character token; only "-" and "<" additionally change state.
+        case .ScriptDataDoubleEscaped:
+            switch currentInputCharacter {
+            case "-":
+                return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDash, character: "-")
+            case "<":
+                // The spec emits the "<" as a character token on this transition. The less-than sign
+                // state never emits it itself (it only emits "/" or reconsumes), so a bare switchTo
+                // would drop the character from the script text.
+                return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
+            case "\0":
+                // FIXME: log_parse_error()
+                // NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
+                return emitCharacter("\u{FFFD}")
+            case nil:
+                // FIXME: log_parse_error()
+                return emitEOF()
+            default:
+                return emitCharacter(currentInputCharacter!)
+            }
+        // 13.2.5.28 Script data double escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
+        // Entered after a single "-"; every consumed character is still emitted as a character token.
+        case .ScriptDataDoubleEscapedDash:
+            switch currentInputCharacter {
+            case "-":
+                return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDashDash, character: "-")
+            case "<":
+                // The spec emits the "<" as a character token on this transition. The less-than sign
+                // state never emits it itself (it only emits "/" or reconsumes), so a bare switchTo
+                // would drop the character from the script text.
+                return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
+            case "\0":
+                // FIXME: log_parse_error()
+                // NUL is replaced by U+FFFD REPLACEMENT CHARACTER and the dash sequence is abandoned.
+                return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}")
+            case nil:
+                // FIXME: log_parse_error()
+                return emitEOF()
+            default:
+                // Any other character ends the dash sequence; go back to the plain double escaped state.
+                return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
+            }
+        // 13.2.5.29 Script data double escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
+        case .ScriptDataDoubleEscapedDashDash:
+            switch currentInputCharacter {
+            case "-":
+                return emitCharacter("-")
+            case "<":
+                return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
+            case ">":
+                return switchToAndEmitCharacter(.ScriptData, character: ">")
+            case "\0":
+                // FIXME: log_parse_error()
+                return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}")
+            case nil:
+                // FIXME: log_parse_error()
+                return emitEOF()
+            default:
+                return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
+            }
+        // 13.2.5.30 Script data double escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
+        case .ScriptDataDoubleEscapedLessThanSign:
+            switch currentInputCharacter {
+            case "/":
+                self.temporaryBuffer = ""
+                return switchToAndEmitCharacter(.ScriptDataDoubleEscapeEnd, character: "/")
+            default:
+                return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped)
+            }
+        // 13.2.5.31 Script data double escape end state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
+        case .ScriptDataDoubleEscapeEnd:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ", "/", ">":
+                if self.temporaryBuffer == "script" {
+                    return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
+                } else {
+                    return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
+                }
+            case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
+                self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
+                return emitCharacter(currentInputCharacter!)
+            case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
+                self.temporaryBuffer.append(c)
+                return emitCharacter(currentInputCharacter!)
+            default:
+                return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped)
+            }
+        // 13.2.5.32 Before attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
+        case .BeforeAttributeName:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return continueInCurrentState()
+            case "/", ">", nil:
+                return reconsume(currentInputCharacter, in: .AfterAttributeName)
+            case "=":
+                // FIXME: log_parse_error()
+                self.currentBuilder = Swift.String(currentInputCharacter!)
+                self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
+                return switchTo(.AttributeName)
+            default:
+                self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
+                return reconsume(currentInputCharacter!, in: .AttributeName)
+            }
+        // 13.2.5.33 Attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
+        case .AttributeName:
+            // FIXME: When the user agent leaves the attribute name state (and before emitting the tag token, if appropriate),
+            //        the complete attribute's name must be compared to the other attributes on the same token;
+            //        if there is already an attribute on the token with the exact same name, then this is a duplicate-attribute
+            //        parse error and the new attribute must be removed from the token.
+            // NOTE:  If an attribute is so removed from a token, it, and the value that gets associated with it, if any,
+            //        are never subsequently used by the parser, and are therefore effectively discarded. Removing the attribute
+            //        in this way does not change its status as the "current attribute" for the purposes of the tokenizer, however.
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ", "/", ">", nil:
+                // FIXME: set name position
+                self.currentAttribute!.localName = self.currentBuilder.takeString()
+                return reconsume(currentInputCharacter, in: .AfterAttributeName)
+            case "=":
+                // FIXME: set name position
+                self.currentAttribute!.localName = self.currentBuilder.takeString()
+                return switchTo(.BeforeAttributeValue)
+            case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
+                self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
+                return continueInCurrentState()
+            case "\0":
+                // FIXME: log_parse_error()
+                self.currentBuilder.append("\u{FFFD}")
+                return continueInCurrentState()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            }
+        // 13.2.5.34 After attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
+        case .AfterAttributeName:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return continueInCurrentState()
+            case "/":
+                self.finalizeCurrentAttribute(.SetName)
+                return switchTo(.SelfClosingStartTag)
+            case "=":
+                self.finalizeCurrentAttribute(.SetName)
+                return switchTo(.BeforeAttributeValue)
+            case ">":
+                self.finalizeCurrentAttribute(.SetName)
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.finalizeCurrentAttribute(.IgnoreString)
+                return emitEOF()
+            default:
+                self.finalizeCurrentAttribute(.SetName)
+                self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
+                return reconsume(currentInputCharacter!, in: .AttributeName)
+            }
+        // 13.2.5.35 Before attribute value state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
+        case .BeforeAttributeValue:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return continueInCurrentState()
+            case "\"":
+                return switchTo(.AttributeValueDoubleQuoted)
+            case "'":
+                return switchTo(.AttributeValueSingleQuoted)
+            case ">":
+                // FIXME: log_parse_error()
+                self.finalizeCurrentAttribute(.IgnoreString)
+                return switchToAndEmitCurrentToken(.Data)
+            default:
+                return reconsume(currentInputCharacter, in: .AttributeValueUnquoted)
+            }
+        // 13.2.5.36 Attribute value (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
+        case .AttributeValueDoubleQuoted:
+            switch currentInputCharacter {
+            case "\"":
+                return switchTo(.AfterAttributeValueQuoted)
+            case "&":
+                self.returnState = .AttributeValueDoubleQuoted
+                return switchTo(.CharacterReference)
+            case "\0":
+                // FIXME: log_parse_error()
+                self.currentBuilder.append("\u{FFFD}")
+                return continueInCurrentState()
+            case nil:
+                // FIXME: log_parse_error()
+                self.finalizeCurrentAttribute(.IgnoreString)
+                return emitEOF()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            }
+        // 13.2.5.37 Attribute value (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state
+        // Accumulates the attribute value in currentBuilder until the closing "'" is seen.
+        case .AttributeValueSingleQuoted:
+            switch currentInputCharacter {
+            case "'":
+                return switchTo(.AfterAttributeValueQuoted)
+            case "&":
+                // Remember where to come back to once the character reference has been handled.
+                self.returnState = .AttributeValueSingleQuoted
+                return switchTo(.CharacterReference)
+            case "\0":
+                // FIXME: log_parse_error()
+                // NUL is replaced by U+FFFD REPLACEMENT CHARACTER in the value.
+                self.currentBuilder.append("\u{FFFD}")
+                return continueInCurrentState()
+            case nil:
+                // FIXME: log_parse_error()
+                // Finalize the in-progress attribute with .IgnoreString before emitting EOF, exactly as
+                // the EOF branches of the double-quoted, unquoted and after-value-quoted states do.
+                // NOTE(review): presumably this discards the partially built value; confirm against finalizeCurrentAttribute.
+                self.finalizeCurrentAttribute(.IgnoreString)
+                return emitEOF()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            }
+        // 13.2.5.38 Attribute value (unquoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state
+        case .AttributeValueUnquoted:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                self.finalizeCurrentAttribute(.SetValue)
+                return switchTo(.BeforeAttributeName)
+            case "&":
+                self.returnState = .AttributeValueUnquoted
+                return switchTo(.CharacterReference)
+            case ">":
+                self.finalizeCurrentAttribute(.SetValue)
+                return switchToAndEmitCurrentToken(.Data)
+            case "\0":
+                // FIXME: log_parse_error()
+                self.currentBuilder.append("\u{FFFD}")
+                return continueInCurrentState()
+            case "\"", "'", "<", "=", "`":
+                // FIXME: log_parse_error()
+                self.currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            case nil:
+                // FIXME: log_parse_error()
+                self.finalizeCurrentAttribute(.IgnoreString)
+                return emitEOF()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            }
+        // 13.2.5.39 After attribute value (quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
+        case .AfterAttributeValueQuoted:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                self.finalizeCurrentAttribute(.SetValue)
+                return switchTo(.BeforeAttributeName)
+            case "/":
+                self.finalizeCurrentAttribute(.SetValue)
+                return switchTo(.SelfClosingStartTag)
+            case ">":
+                self.finalizeCurrentAttribute(.SetValue)
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.finalizeCurrentAttribute(.IgnoreString)
+                return emitEOF()
+            default:
+                // FIXME: log_parse_error()
+                self.finalizeCurrentAttribute(.SetValue)
+                return reconsume(currentInputCharacter!, in: .BeforeAttributeName)
+            }
         default:
             print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
             return emitEOF()