소스 검색

LibWeb: Add remaining states to the Swift tokenizer

This includes all the DOCTYPE and Character reference states, as well as
a few RAWTEXT ones that were missing by accident.
Andrew Kaster 8 달 전
부모
커밋
1383d03c02

+ 75 - 0
Libraries/LibWeb/HTML/Parser/HTMLToken.swift

@@ -163,6 +163,81 @@ public class HTMLToken {
             }
         }
     }
+    public var name: Swift.String? {
+        get {
+            switch self.type {
+            case .DOCTYPE(let name, _, _, _):
+                return name
+            default:
+                preconditionFailure("doctypeName called on non-doctype token")
+            }
+        }
+        set {
+            switch self.type {
+            case .DOCTYPE(_, let publicIdentifier, let systemIdentifier, let forceQuirksMode):
+                self.type = .DOCTYPE(name: newValue, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier, forceQuirksMode: forceQuirksMode)
+            default:
+                preconditionFailure("doctypeName= called on non-doctype token")
+            }
+        }
+    }
+
+    public var forceQuirks: Bool {
+        get {
+            switch self.type {
+            case .DOCTYPE(_, _, _, let forceQuirksMode):
+                return forceQuirksMode
+            default:
+                preconditionFailure("forceQuirks called on non-doctype token")
+            }
+        }
+        set {
+            switch self.type {
+            case .DOCTYPE(let name, let publicIdentifier, let systemIdentifier, _):
+                self.type = .DOCTYPE(name: name, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier, forceQuirksMode: newValue)
+            default:
+                preconditionFailure("forceQuirks= called on non-doctype token")
+            }
+        }
+    }
+
+    public var publicIdentifier: Swift.String? {
+        get {
+            switch self.type {
+            case .DOCTYPE(_, let publicIdentifier, _, _):
+                return publicIdentifier
+            default:
+                preconditionFailure("publicIdentifier called on non-doctype token")
+            }
+        }
+        set {
+            switch self.type {
+            case .DOCTYPE(let name, _, let systemIdentifier, let forceQuirksMode):
+                self.type = .DOCTYPE(name: name, publicIdentifier: newValue, systemIdentifier: systemIdentifier, forceQuirksMode: forceQuirksMode)
+            default:
+                preconditionFailure("publicIdentifier= called on non-doctype token")
+            }
+        }
+    }
+
+    public var systemIdentifier: Swift.String? {
+        get {
+            switch self.type {
+            case .DOCTYPE(_, _, let systemIdentifier, _):
+                return systemIdentifier
+            default:
+                preconditionFailure("systemIdentifier called on non-doctype token")
+            }
+        }
+        set {
+            switch self.type {
+            case .DOCTYPE(let name, let publicIdentifier, _, let forceQuirksMode):
+                self.type = .DOCTYPE(name: name, publicIdentifier: publicIdentifier, systemIdentifier: newValue, forceQuirksMode: forceQuirksMode)
+            default:
+                preconditionFailure("systemIdentifier= called on non-doctype token")
+            }
+        }
+    }
 
     public init() {}
     public init(type: TokenType) {

+ 697 - 4
Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift

@@ -126,10 +126,60 @@ public class HTMLTokenizer {
     private var lastStartTagName: Swift.String? = nil
     private var currentTokensAttributes: [HTMLToken.Attribute]? = nil
     private var currentAttribute: HTMLToken.Attribute? = nil
+    private var characterReferenceCode: Int = 0
 
     private var aborted = false
     private var hasEmittedEOF = false
 
+    // https://infra.spec.whatwg.org/#noncharacter
+    // A noncharacter is a code point that is in the range U+FDD0 to U+FDEF, inclusive,
+    // or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF,
+    // U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
+    // U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF,
+    // U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
+    static private var nonCharacter = CharacterSet(charactersIn: Unicode.Scalar(0xFDD0)!...Unicode.Scalar(0xFDEF)!)
+        .union(CharacterSet(charactersIn: "\u{FFFE}"..."\u{FFFF}"))
+        .union(CharacterSet(charactersIn: "\u{1FFFE}"..."\u{1FFFF}"))
+        .union(CharacterSet(charactersIn: "\u{2FFFE}"..."\u{2FFFF}"))
+        .union(CharacterSet(charactersIn: "\u{3FFFE}"..."\u{3FFFF}"))
+        .union(CharacterSet(charactersIn: "\u{4FFFE}"..."\u{4FFFF}"))
+        .union(CharacterSet(charactersIn: "\u{5FFFE}"..."\u{5FFFF}"))
+        .union(CharacterSet(charactersIn: "\u{6FFFE}"..."\u{6FFFF}"))
+        .union(CharacterSet(charactersIn: "\u{7FFFE}"..."\u{7FFFF}"))
+        .union(CharacterSet(charactersIn: "\u{8FFFE}"..."\u{8FFFF}"))
+        .union(CharacterSet(charactersIn: "\u{9FFFE}"..."\u{9FFFF}"))
+        .union(CharacterSet(charactersIn: "\u{AFFFE}"..."\u{AFFFF}"))
+        .union(CharacterSet(charactersIn: "\u{BFFFE}"..."\u{BFFFF}"))
+        .union(CharacterSet(charactersIn: "\u{CFFFE}"..."\u{CFFFF}"))
+        .union(CharacterSet(charactersIn: "\u{DFFFE}"..."\u{DFFFF}"))
+        .union(CharacterSet(charactersIn: "\u{EFFFE}"..."\u{EFFFF}"))
+        .union(CharacterSet(charactersIn: "\u{FFFFE}"..."\u{FFFFF}"))
+        .union(CharacterSet(charactersIn: "\u{10FFFE}"..."\u{10FFFF}"))
+
+    // https://infra.spec.whatwg.org/#ascii-whitespace
+    static private var asciiWhitespace = CharacterSet(charactersIn: "\t\n\u{000C}\u{000D} ")
+
+    // https://infra.spec.whatwg.org/#c0-control
+    static private var c0Control = CharacterSet(charactersIn: "\u{0000}"..."\u{001F}")
+
+    // https://infra.spec.whatwg.org/#control
+    static private var control = c0Control.union(CharacterSet(charactersIn: "\u{007F}"..."\u{009F}"))
+
+    // IMPLEMENTATION DEFINED: Used for the numeric character reference end state
+    static private var controlNotAsciiWhitespace = control.subtracting(asciiWhitespace)
+
+    // https://infra.spec.whatwg.org/#ascii-digit
+    static private var asciiDigit = CharacterSet(charactersIn: "0123456789")
+
+    // https://infra.spec.whatwg.org/#ascii-upper-hex-digit
+    static private var asciiUpperHexDigit = CharacterSet(charactersIn: "ABCDEF")
+
+    // https://infra.spec.whatwg.org/#ascii-lower-hex-digit
+    static private var asciiLowerHexDigit = CharacterSet(charactersIn: "abcdef")
+
+    // https://infra.spec.whatwg.org/#ascii-hex-digit
+    static private var asciiHexDigit = asciiUpperHexDigit.union(asciiLowerHexDigit)
+
     // https://infra.spec.whatwg.org/#ascii-upper-alpha
     static private var asciiUpperAlpha = CharacterSet(charactersIn: "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
 
@@ -139,6 +189,40 @@ public class HTMLTokenizer {
     // https://infra.spec.whatwg.org/#ascii-upper-alpha
     static private var asciiAlpha = asciiUpperAlpha.union(asciiLowerAlpha)
 
+    // https://infra.spec.whatwg.org/#ascii-alphanumeric
+    static private var asciiAlphanumeric = asciiAlpha.union(asciiDigit)
+
+    static private var characterReferenceControlCodeMapping: [Int: Unicode.Scalar] =
+        [
+            0x80: Unicode.Scalar(0x20AC)!,  // €
+            0x82: Unicode.Scalar(0x201A)!,  // ‚
+            0x83: Unicode.Scalar(0x0192)!,  // ƒ
+            0x84: Unicode.Scalar(0x201E)!,  // „
+            0x85: Unicode.Scalar(0x2026)!,  // …
+            0x86: Unicode.Scalar(0x2020)!,  // †
+            0x87: Unicode.Scalar(0x2021)!,  // ‡
+            0x88: Unicode.Scalar(0x02C6)!,  // ˆ
+            0x89: Unicode.Scalar(0x2030)!,  // ‰
+            0x8A: Unicode.Scalar(0x0160)!,  // Š
+            0x8B: Unicode.Scalar(0x2039)!,  // ‹
+            0x8C: Unicode.Scalar(0x0152)!,  // Œ
+            0x8E: Unicode.Scalar(0x017D)!,  // Ž
+            0x91: Unicode.Scalar(0x2018)!,  // ‘
+            0x92: Unicode.Scalar(0x2019)!,  // ’
+            0x93: Unicode.Scalar(0x201C)!,  // “
+            0x94: Unicode.Scalar(0x201D)!,  // ”
+            0x95: Unicode.Scalar(0x2022)!,  // •
+            0x96: Unicode.Scalar(0x2013)!,  // –
+            0x97: Unicode.Scalar(0x2014)!,  // —
+            0x98: Unicode.Scalar(0x02DC)!,  // ˜
+            0x99: Unicode.Scalar(0x2122)!,  // ™
+            0x9A: Unicode.Scalar(0x0161)!,  // š
+            0x9B: Unicode.Scalar(0x203A)!,  // ›
+            0x9C: Unicode.Scalar(0x0153)!,  // œ
+            0x9E: Unicode.Scalar(0x017E)!,  // ž
+            0x9F: Unicode.Scalar(0x0178)!,  // Ÿ
+        ]
+
     public init() {
         self.cursor = self.input.startIndex
         self.previousCursor = self.input.startIndex
@@ -162,8 +246,8 @@ public class HTMLTokenizer {
     }
 
     func skip(_ count: Int) {
+        self.previousCursor = self.cursor
         self.cursor = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.endIndex) ?? input.endIndex
-        self.previousCursor = self.input.index(before: self.cursor)
     }
 
     func peekCodePoint(_ offset: Int = 0) -> Character? {
@@ -579,6 +663,65 @@ public class HTMLTokenizer {
                 self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
             }
             return reconsume(currentInputCharacter, in: .RCDATA)
+        // 13.2.5.12 RAWTEXT less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
+        case .RAWTEXTLessThanSign:
+            switch currentInputCharacter {
+            case "/":
+                self.temporaryBuffer = ""
+                return switchTo(.RAWTEXTEndTagOpen)
+            default:
+                return emitCharacterAndReconsume("<", in: .RAWTEXT, currentInputCharacter: currentInputCharacter)
+            }
+        // 13.2.5.13 RAWTEXT end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
+        case .RAWTEXTEndTagOpen:
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
+                createNewToken(HTMLToken(type: .EndTag(tagName: "")))
+                return reconsume(currentInputCharacter!, in: .RAWTEXTEndTagName)
+            default:
+                queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
+                queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
+                return reconsume(currentInputCharacter, in: .RAWTEXT)
+            }
+        // 13.2.5.14 RAWTEXT end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
+        case .RAWTEXTEndTagName:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                if self.isAppropriateEndTagToken(currentToken) {
+                    return switchTo(.BeforeAttributeName)
+                }
+                break
+            case "/":
+                if self.isAppropriateEndTagToken(currentToken) {
+                    return switchTo(.SelfClosingStartTag)
+                }
+                break
+            case ">":
+                if self.isAppropriateEndTagToken(currentToken) {
+                    return switchToAndEmitCurrentToken(.Data)
+                }
+                break
+            case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
+                self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
+                self.temporaryBuffer.append(c)
+                return continueInCurrentState()
+            case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
+                self.currentBuilder.append(c)
+                self.temporaryBuffer.append(c)
+                return continueInCurrentState()
+            default:
+                break
+            }
+
+            // First three steps fall through to the "anything else" block
+            self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
+            self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
+            // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
+            self.currentBuilder = ""
+            for codePoint in self.temporaryBuffer {
+                self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
+            }
+            return reconsume(currentInputCharacter, in: .RAWTEXT)
         // 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
         case .ScriptDataLessThanSign:
             switch currentInputCharacter {
@@ -1225,9 +1368,559 @@ public class HTMLTokenizer {
                 currentBuilder.append("--!")
                 return reconsume(currentInputCharacter, in: .Comment)
             }
-        default:
-            print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
-            return emitEOF()
+        // 13.2.5.53 DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
+        case .DOCTYPE:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return switchTo(.BeforeDOCTYPEName)
+            case ">":
+                return reconsume(currentInputCharacter, in: .BeforeDOCTYPEName)
+            case nil:
+                // FIXME: log_parse_error()
+                currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: true))
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                // FIXME: log_parse_error()
+                return reconsume(currentInputCharacter!, in: .BeforeDOCTYPEName)
+            }
+        // 13.2.5.54 Before DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
+        case .BeforeDOCTYPEName:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return continueInCurrentState()
+            case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
+                precondition(self.currentBuilder.isEmpty)
+                self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: false))
+                self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
+                return switchTo(.DOCTYPEName)
+            case "\0":
+                // FIXME: log_parse_error()
+                precondition(self.currentBuilder.isEmpty)
+                self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: false))
+                self.currentBuilder.append("\u{FFFD}")
+                return switchTo(.DOCTYPEName)
+            case ">":
+                // FIXME: log_parse_error()
+                self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: true))
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: true))
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                // FIXME: log_parse_error()
+                precondition(self.currentBuilder.isEmpty)
+                self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: false))
+                self.currentBuilder.append(currentInputCharacter!)
+                return switchTo(.DOCTYPEName)
+            }
+        // 13.2.5.55 DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
+        case .DOCTYPEName:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                self.currentToken.name = self.currentBuilder.takeString()
+                return switchTo(.AfterDOCTYPEName)
+            case ">":
+                self.currentToken.name = self.currentBuilder.takeString()
+                return switchToAndEmitCurrentToken(.Data)
+            case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
+                self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
+                return continueInCurrentState()
+            case "\0":
+                // FIXME: log_parse_error()
+                self.currentBuilder.append("\u{FFFD}")
+                return continueInCurrentState()
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                self.currentToken.name = self.currentBuilder.takeString()
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            }
+        // 13.2.5.56 After DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
+        case .AfterDOCTYPEName:
+            precondition(self.currentBuilder.isEmpty)
+            precondition(self.currentToken.name != nil)
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return continueInCurrentState()
+            case ">":
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                if "pP".contains(currentInputCharacter!), peekNext(count: 5)?.uppercased() == "UBLIC" {
+                    skip(5)
+                    return switchTo(.AfterDOCTYPEPublicKeyword)
+                }
+                if "sS".contains(currentInputCharacter!), peekNext(count: 5)?.uppercased() == "YSTEM" {
+                    skip(5)
+                    return switchTo(.AfterDOCTYPESystemKeyword)
+                }
+
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
+            }
+        // 13.2.5.57 After DOCTYPE public keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
+        case .AfterDOCTYPEPublicKeyword:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return switchTo(.BeforeDOCTYPEPublicIdentifier)
+            case "\"":
+                // FIXME: log_parse_error()
+                self.currentToken.publicIdentifier = ""
+                return switchTo(.DOCTYPEPublicIdentifierDoubleQuoted)
+            case "'":
+                self.currentToken.publicIdentifier = ""
+                return switchTo(.DOCTYPEPublicIdentifierSingleQuoted)
+            case ">":
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
+            }
+        // 13.2.5.58 Before DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
+        case .BeforeDOCTYPEPublicIdentifier:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return continueInCurrentState()
+            case "\"":
+                self.currentToken.publicIdentifier = ""
+                return switchTo(.DOCTYPEPublicIdentifierDoubleQuoted)
+            case "'":
+                self.currentToken.publicIdentifier = ""
+                return switchTo(.DOCTYPEPublicIdentifierSingleQuoted)
+            case ">":
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
+            }
+        // 13.2.5.59 DOCTYPE public identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-double-quoted-state
+        case .DOCTYPEPublicIdentifierDoubleQuoted:
+            switch currentInputCharacter {
+            case "\"":
+                self.currentToken.publicIdentifier = self.currentBuilder.takeString()
+                return switchTo(.AfterDOCTYPEPublicIdentifier)
+            case "\0":
+                // FIXME: log_parse_error()
+                self.currentBuilder.append("\u{FFFD}")
+                return continueInCurrentState()
+            case ">":
+                // FIXME: log_parse_error()
+                self.currentToken.publicIdentifier = self.currentBuilder.takeString()
+                self.currentToken.forceQuirks = true
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            }
+        // 13.2.5.60 DOCTYPE public identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-single-quoted-state
+        case .DOCTYPEPublicIdentifierSingleQuoted:
+            switch currentInputCharacter {
+            case "'":
+                self.currentToken.publicIdentifier = self.currentBuilder.takeString()
+                return switchTo(.AfterDOCTYPEPublicIdentifier)
+            case "\0":
+                // FIXME: log_parse_error()
+                self.currentBuilder.append("\u{FFFD}")
+                return continueInCurrentState()
+            case ">":
+                // FIXME: log_parse_error()
+                self.currentToken.publicIdentifier = self.currentBuilder.takeString()
+                self.currentToken.forceQuirks = true
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            }
+        // 13.2.5.61 After DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
+        case .AfterDOCTYPEPublicIdentifier:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return switchTo(.BetweenDOCTYPEPublicAndSystemIdentifiers)
+            case ">":
+                return switchToAndEmitCurrentToken(.Data)
+            case "\"":
+                // FIXME: log_parse_error()
+                self.currentToken.systemIdentifier = ""
+                return switchTo(.DOCTYPESystemIdentifierDoubleQuoted)
+            case "'":
+                // FIXME: log_parse_error()
+                self.currentToken.systemIdentifier = ""
+                return switchTo(.DOCTYPESystemIdentifierSingleQuoted)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
+            }
+        // 13.2.5.62 Between DOCTYPE public and system identifiers state, https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
+        case .BetweenDOCTYPEPublicAndSystemIdentifiers:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return continueInCurrentState()
+            case ">":
+                return switchToAndEmitCurrentToken(.Data)
+            case "\"":
+                self.currentToken.systemIdentifier = ""
+                return switchTo(.DOCTYPESystemIdentifierDoubleQuoted)
+            case "'":
+                self.currentToken.systemIdentifier = ""
+                return switchTo(.DOCTYPESystemIdentifierSingleQuoted)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
+            }
+        // 13.2.5.63 After DOCTYPE system keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
+        case .AfterDOCTYPESystemKeyword:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return switchTo(.BeforeDOCTYPESystemIdentifier)
+            case "\"":
+                // FIXME: log_parse_error()
+                self.currentToken.systemIdentifier = ""
+                return switchTo(.DOCTYPESystemIdentifierDoubleQuoted)
+            case "'":
+                // FIXME: log_parse_error()
+                self.currentToken.systemIdentifier = ""
+                return switchTo(.DOCTYPESystemIdentifierSingleQuoted)
+            case ">":
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
+            }
+        // 13.2.5.64 Before DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
+        case .BeforeDOCTYPESystemIdentifier:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return continueInCurrentState()
+            case "\"":
+                self.currentToken.systemIdentifier = ""
+                return switchTo(.DOCTYPESystemIdentifierDoubleQuoted)
+            case "'":
+                self.currentToken.systemIdentifier = ""
+                return switchTo(.DOCTYPESystemIdentifierSingleQuoted)
+            case ">":
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
+            }
+        // 13.2.5.65 DOCTYPE system identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-double-quoted-state
+        case .DOCTYPESystemIdentifierDoubleQuoted:
+            switch currentInputCharacter {
+            case "\"":
+                self.currentToken.systemIdentifier = self.currentBuilder.takeString()
+                return switchTo(.AfterDOCTYPESystemIdentifier)
+            case "\0":
+                // FIXME: log_parse_error()
+                self.currentBuilder.append("\u{FFFD}")
+                return continueInCurrentState()
+            case ">":
+                // FIXME: log_parse_error()
+                self.currentToken.systemIdentifier = self.currentBuilder.takeString()
+                self.currentToken.forceQuirks = true
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            }
+        // 13.2.5.66 DOCTYPE system identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-single-quoted-state
+        case .DOCTYPESystemIdentifierSingleQuoted:
+            switch currentInputCharacter {
+            case "'":
+                return switchTo(.AfterDOCTYPESystemIdentifier)
+            case "\0":
+                // FIXME: log_parse_error()
+                self.currentBuilder.append("\u{FFFD}")
+                return continueInCurrentState()
+            case ">":
+                // FIXME: log_parse_error()
+                self.currentToken.systemIdentifier = self.currentBuilder.takeString()
+                self.currentToken.forceQuirks = true
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            }
+        // 13.2.5.67 After DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
+        case .AfterDOCTYPESystemIdentifier:
+            switch currentInputCharacter {
+            case "\t", "\n", "\u{000C}", " ":
+                return continueInCurrentState()
+            case ">":
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                // FIXME: log_parse_error()
+                // NOTE: This does not set the current DOCTYPE token's force-quirks flag to on.
+                return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
+            }
+        // 13.2.5.68 Bogus DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
+        case .BogusDOCTYPE:
+            switch currentInputCharacter {
+            case ">":
+                return switchToAndEmitCurrentToken(.Data)
+            case "\0":
+                // FIXME: log_parse_error()
+                return continueInCurrentState()
+            case nil:
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                return continueInCurrentState()
+            }
+        // 13.2.5.69 CDATA section state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
+        case .CDATASection:
+            switch currentInputCharacter {
+            case "]":
+                return switchTo(.CDATASectionBracket)
+            case nil:
+                // FIXME: log_parse_error()
+                return emitEOF()
+            default:
+                // NOTE: U+0000 NULL characters are handled in the tree construction stage,
+                //       as part of the in foreign content insertion mode, which is the only place where CDATA sections can appear.
+                return emitCharacter(currentInputCharacter!)
+            }
+        // 13.2.5.70 CDATA section bracket state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
+        case .CDATASectionBracket:
+            switch currentInputCharacter {
+            case "]":
+                return switchTo(.CDATASectionEnd)
+            default:
+                return emitCharacterAndReconsume("]", in: .CDATASection, currentInputCharacter: currentInputCharacter)
+            }
+
+        // 13.2.5.71 CDATA section end state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
+        case .CDATASectionEnd:
+            switch currentInputCharacter {
+            case "]":
+                return emitCharacter("]")
+            case ">":
+                return switchTo(.Data)
+            default:
+                queuedTokens.append(HTMLToken(type: .Character(codePoint: "]")))
+                return emitCharacterAndReconsume("]", in: .CDATASection, currentInputCharacter: currentInputCharacter)
+            }
+        // 13.2.5.72 Character reference state, https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
+        case .CharacterReference:
+            self.temporaryBuffer = "&"
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiAlphanumeric.contains(c.unicodeScalars.first!):
+                return reconsume(currentInputCharacter!, in: .NamedCharacterReference)
+            case "#":
+                self.temporaryBuffer.append(currentInputCharacter!)
+                return switchTo(.NumericCharacterReference)
+            default:
+                self.flushCodepointsConsumedAsACharacterReference()
+                return reconsume(currentInputCharacter, in: self.returnState)
+            }
+        // 13.2.5.73 Named character reference state, https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
+        case .NamedCharacterReference:
+            var subString = self.input[self.previousCursor...]
+            let entityMatch = subString.withUTF8 { utf8 in
+                return Web.HTML.match_entity_for_named_character_reference(AK.StringView(utf8.baseAddress!, utf8.count))
+            }
+            if entityMatch.hasValue {
+                let entity = entityMatch.value!.entity
+                skip(entity.length())
+                // FIXME: Iterate over the entity's code points and add them instead of creating a string
+                self.temporaryBuffer.append(Swift.String(akStringView: entity)!)
+
+                if self.consumedAsPartOfAnAttribute(), !entity.endsWith(";") {
+                    if let peeked = peekCodePoint(), peeked == "=" || HTMLTokenizer.asciiAlphanumeric.contains(peeked.unicodeScalars.first!) {
+                        self.flushCodepointsConsumedAsACharacterReference()
+                        return switchTo(self.returnState)
+                    }
+                }
+
+                if !entity.endsWith(";") {
+                    // FIXME: log_parse_error()
+                }
+
+                self.temporaryBuffer = ""
+
+                // FIXME: This AK::Vector<u32, 2> should be CxxConvertibleToContainer, but https://github.com/swiftlang/swift/issues/77607
+                let codePoints = entityMatch.value!.code_points
+                for i in 0...codePoints.size() {
+                    self.temporaryBuffer.append(Character(Unicode.Scalar(codePoints[i])!))
+                }
+                self.flushCodepointsConsumedAsACharacterReference()
+                return switchTo(self.returnState)
+            }
+
+            self.flushCodepointsConsumedAsACharacterReference()
+            return reconsume(currentInputCharacter, in: .AmbiguousAmpersand)
+        // 13.2.5.74 Ambiguous ampersand state, https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
+        case .AmbiguousAmpersand:
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiAlphanumeric.contains(c.unicodeScalars.first!):
+                if self.consumedAsPartOfAnAttribute() {
+                    self.currentBuilder.append(currentInputCharacter!)
+                    return continueInCurrentState()
+                }
+                return emitCharacter(currentInputCharacter!)
+            case ";":
+                // FIXME: log_parse_error()
+                return reconsume(currentInputCharacter!, in: self.returnState)
+            default:
+                return reconsume(currentInputCharacter, in: self.returnState)
+            }
+        // 13.2.5.75 Numeric character reference state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
+        case .NumericCharacterReference:
+            self.characterReferenceCode = 0
+            switch currentInputCharacter {
+            case "x", "X":
+                self.temporaryBuffer.append(currentInputCharacter!)
+                return switchTo(.HexadecimalCharacterReferenceStart)
+            default:
+                return reconsume(currentInputCharacter, in: .DecimalCharacterReferenceStart)
+            }
+        // 13.2.5.76 Hexadecimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
+        case .HexadecimalCharacterReferenceStart:
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiHexDigit.contains(c.unicodeScalars.first!):
+                return reconsume(currentInputCharacter!, in: .HexadecimalCharacterReference)
+            default:
+                // FIXME: log_parse_error()
+                self.flushCodepointsConsumedAsACharacterReference()
+                return reconsume(currentInputCharacter, in: self.returnState)
+            }
+        // 13.2.5.77 Decimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
+        case .DecimalCharacterReferenceStart:
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiDigit.contains(c.unicodeScalars.first!):
+                return reconsume(currentInputCharacter!, in: .DecimalCharacterReference)
+            default:
+                // FIXME: log_parse_error()
+                self.flushCodepointsConsumedAsACharacterReference()
+                return reconsume(currentInputCharacter, in: self.returnState)
+            }
+        // 13.2.5.78 Hexadecimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
+        case .HexadecimalCharacterReference:
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiDigit.contains(c.unicodeScalars.first!):
+                self.characterReferenceCode = self.characterReferenceCode * 16 + Int(c.asciiValue! - 0x30)
+                return continueInCurrentState()
+            case let c? where HTMLTokenizer.asciiUpperHexDigit.contains(c.unicodeScalars.first!):
+                self.characterReferenceCode = self.characterReferenceCode * 16 + Int(c.asciiValue! - 0x37)
+                return continueInCurrentState()
+            case let c? where HTMLTokenizer.asciiLowerHexDigit.contains(c.unicodeScalars.first!):
+                self.characterReferenceCode = self.characterReferenceCode * 16 + Int(c.asciiValue! - 0x57)
+                return continueInCurrentState()
+            case ";":
+                return switchTo(.NumericCharacterReferenceEnd)
+            default:
+                // FIXME: log_parse_error()
+                return reconsume(currentInputCharacter, in: .NumericCharacterReferenceEnd)
+            }
+
+        // 13.2.5.79 Decimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
+        case .DecimalCharacterReference:
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiDigit.contains(c.unicodeScalars.first!):
+                self.characterReferenceCode = self.characterReferenceCode * 10 + Int(c.asciiValue! - 0x30)
+                return continueInCurrentState()
+            case ";":
+                return switchTo(.NumericCharacterReferenceEnd)
+            default:
+                // FIXME: log_parse_error()
+                return reconsume(currentInputCharacter, in: .NumericCharacterReferenceEnd)
+            }
+
+        // 13.2.5.80 Numeric character reference end state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
+        case .NumericCharacterReferenceEnd:
+            dontConsumeNextInputCharacter()
+            let codePoint: UnicodeScalar =
+                switch self.characterReferenceCode {
+                case 0x00:
+                    // FIXME: log_parse_error()
+                    UnicodeScalar(0xFFFD)!
+                case let c where c > 0x10FFFF:
+                    // FIXME: log_parse_error()
+                    UnicodeScalar(0xFFFD)!
+                case let c where UTF16.CodeUnit(exactly: c).map({ UTF16.isSurrogate($0) }) != nil:
+                    // FIXME: log_parse_error()
+                    UnicodeScalar(0xFFFD)!
+                case let c where UnicodeScalar(c)! == "\u{000D}" || HTMLTokenizer.controlNotAsciiWhitespace.contains(UnicodeScalar(c)!):
+                    // FIXME: log_parse_error()
+                    if let codePoint = HTMLTokenizer.characterReferenceControlCodeMapping[c] {
+                        codePoint
+                    } else {
+                        UnicodeScalar(c)!
+                    }
+                case let c where HTMLTokenizer.nonCharacter.contains(UnicodeScalar(c)!):
+                    // FIXME: log_parse_error()
+                    UnicodeScalar(c)!
+                default:
+                    UnicodeScalar(self.characterReferenceCode)!
+                }
+
+            self.temporaryBuffer = Swift.String(Character(codePoint))
+            self.flushCodepointsConsumedAsACharacterReference()
+            return switchTo(self.returnState)
         }
     }
 }

+ 8 - 0
Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.cpp

@@ -20,4 +20,12 @@ OptionalString decode_to_utf8(StringView text, StringView encoding)
     return decoded_or_error.release_value();
 }
 
+OptionalEntityMatch match_entity_for_named_character_reference(StringView entity)
+{
+    auto entity_match = code_points_from_entity(entity);
+    if (entity_match.has_value())
+        return entity_match.release_value();
+    return std::nullopt;
+}
+
 }

+ 5 - 0
Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.h

@@ -8,6 +8,7 @@
 
 #include <AK/String.h>
 #include <AK/StringView.h>
+#include <LibWeb/HTML/Parser/Entities.h>
 #include <optional>
 
 namespace Web::HTML {
@@ -16,4 +17,8 @@ namespace Web::HTML {
 using OptionalString = std::optional<String>;
 OptionalString decode_to_utf8(StringView text, StringView encoding);
 
+// Swift-friendly wrapper for HTML::code_points_from_entity
+using OptionalEntityMatch = std::optional<EntityMatch>;
+OptionalEntityMatch match_entity_for_named_character_reference(StringView entity);
+
 }

+ 5 - 3
Tests/LibWeb/TestHTMLTokenizerSwift.swift

@@ -69,11 +69,13 @@ struct TestHTMLTokenizerSwift {
         #expect(tokenizer.state == HTMLTokenizer.State.Data)  // initial state
 
         let token = tokenizer.nextToken()
-        #expect(token?.type == .EndOfFile)
-        #expect(tokenizer.state == HTMLTokenizer.State.CharacterReference)
+        #expect(token?.type == .Character(codePoint: "&"))
 
         let token2 = tokenizer.nextToken()
-        #expect(token2 == nil)
+        #expect(token2?.type == .EndOfFile)
+
+        let token3 = tokenizer.nextToken()
+        #expect(token3 == nil)
     }
 
     @Test func tagOpenOnly() {