123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328 |
- /*
- * Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>>
- *
- * SPDX-License-Identifier: BSD-2-Clause
- */
- import AK
- import Collections
- import Foundation
- @_exported import WebCxx
extension Swift.String {
    /// Creates a string by decoding `decoding` with the encoding named by `as`.
    ///
    /// The conversion is delegated to `Web.HTML.decode_to_utf8`. The initializer
    /// fails (yields `nil`) when that call returns no value.
    public init?(decoding: AK.StringView, as: AK.StringView) {
        let decodedResult = Web.HTML.decode_to_utf8(decoding, `as`)
        guard decodedResult.hasValue else {
            return nil
        }
        // `hasValue` was checked above, so the unwrap cannot fail here.
        self.init(akString: decodedResult.value!)
    }
}
/// Incremental HTML tokenizer.
///
/// The tokenizer walks over a decoded input string and produces `HTMLToken`s
/// one at a time through `nextToken(stopAtInsertionPoint:)`. Its states mirror
/// the tokenization states of the HTML specification
/// (https://html.spec.whatwg.org/multipage/parsing.html#tokenization).
/// Only the Data state is implemented so far; every other state logs a TODO
/// message and ends the token stream.
public class HTMLTokenizer {

    /// The tokenizer states, named after the states in the HTML specification.
    public enum State {
        case Data
        case RCDATA
        case RAWTEXT
        case ScriptData
        case PLAINTEXT
        case TagOpen
        case EndTagOpen
        case TagName
        case RCDATALessThanSign
        case RCDATAEndTagOpen
        case RCDATAEndTagName
        case RAWTEXTLessThanSign
        case RAWTEXTEndTagOpen
        case RAWTEXTEndTagName
        case ScriptDataLessThanSign
        case ScriptDataEndTagOpen
        case ScriptDataEndTagName
        case ScriptDataEscapeStart
        case ScriptDataEscapeStartDash
        case ScriptDataEscaped
        case ScriptDataEscapedDash
        case ScriptDataEscapedDashDash
        case ScriptDataEscapedLessThanSign
        case ScriptDataEscapedEndTagOpen
        case ScriptDataEscapedEndTagName
        case ScriptDataDoubleEscapeStart
        case ScriptDataDoubleEscaped
        case ScriptDataDoubleEscapedDash
        case ScriptDataDoubleEscapedDashDash
        case ScriptDataDoubleEscapedLessThanSign
        case ScriptDataDoubleEscapeEnd
        case BeforeAttributeName
        case AttributeName
        case AfterAttributeName
        case BeforeAttributeValue
        case AttributeValueDoubleQuoted
        case AttributeValueSingleQuoted
        case AttributeValueUnquoted
        case AfterAttributeValueQuoted
        case SelfClosingStartTag
        case BogusComment
        case MarkupDeclarationOpen
        case CommentStart
        case CommentStartDash
        case Comment
        case CommentLessThanSign
        case CommentLessThanSignBang
        case CommentLessThanSignBangDash
        case CommentLessThanSignBangDashDash
        case CommentEndDash
        case CommentEnd
        case CommentEndBang
        case DOCTYPE
        case BeforeDOCTYPEName
        case DOCTYPEName
        case AfterDOCTYPEName
        case AfterDOCTYPEPublicKeyword
        case BeforeDOCTYPEPublicIdentifier
        case DOCTYPEPublicIdentifierDoubleQuoted
        case DOCTYPEPublicIdentifierSingleQuoted
        case AfterDOCTYPEPublicIdentifier
        case BetweenDOCTYPEPublicAndSystemIdentifiers
        case AfterDOCTYPESystemKeyword
        case BeforeDOCTYPESystemIdentifier
        case DOCTYPESystemIdentifierDoubleQuoted
        case DOCTYPESystemIdentifierSingleQuoted
        case AfterDOCTYPESystemIdentifier
        case BogusDOCTYPE
        case CDATASection
        case CDATASectionBracket
        case CDATASectionEnd
        case CharacterReference
        case NamedCharacterReference
        case AmbiguousAmpersand
        case NumericCharacterReference
        case HexadecimalCharacterReferenceStart
        case DecimalCharacterReferenceStart
        case HexadecimalCharacterReference
        case DecimalCharacterReference
        case NumericCharacterReferenceEnd
    }

    /// The decoded text being tokenized.
    private var input = Swift.String()
    /// Position of the next character to consume.
    private var cursor: Swift.String.Index
    /// Position of the most recently consumed character; used to "un-consume" it.
    private var previousCursor: Swift.String.Index

    /// The state the tokenizer is currently in.
    public private(set) var state = State.Data
    /// The state to go back to after a sub-state (e.g. a character reference) finishes.
    private var returnState = State.Data

    /// The token currently being built.
    private var currentToken = HTMLToken()
    /// Tokens that are ready to be handed out, oldest first.
    private var queuedTokens = Deque<HTMLToken>()

    private var aborted = false
    private var hasEmittedEOF = false

    /// Creates a tokenizer over an empty input.
    public init() {
        self.cursor = self.input.startIndex
        self.previousCursor = self.input.startIndex
    }

    /// Creates a tokenizer over `input`, decoded with the encoding named `encoding`.
    ///
    /// Fails (yields `nil`) when the input cannot be decoded.
    public init?(input: AK.StringView, encoding: AK.StringView) {
        guard let decoded = Swift.String(decoding: input, as: encoding) else {
            return nil
        }
        self.input = decoded
        self.cursor = self.input.startIndex
        self.previousCursor = self.input.startIndex
    }

    /// Creates a tokenizer over `input`, decoding it as UTF-8.
    public convenience init?(input: AK.StringView) {
        self.init(input: input, encoding: "UTF-8")
    }

    /// Marks the tokenizer as aborted. Once the queue of pending tokens is
    /// drained, `nextToken` returns `nil`.
    public func abort() {
        self.aborted = true
    }

    /// Moves the cursor by `count` characters, clamped to the bounds of the input,
    /// and records the position just before the new cursor as the previous cursor.
    func skip(_ count: Int) {
        // A limit only takes effect in the direction of travel, so choose it to match.
        let limit = count >= 0 ? self.input.endIndex : self.input.startIndex
        self.cursor = self.input.index(self.cursor, offsetBy: count, limitedBy: limit) ?? limit

        // `index(before:)` traps at `startIndex` (empty input, or nothing consumed yet),
        // so clamp instead of stepping back in that case.
        if self.cursor > self.input.startIndex {
            self.previousCursor = self.input.index(before: self.cursor)
        } else {
            self.previousCursor = self.input.startIndex
        }
    }

    /// Returns the character `offset` positions away from the cursor without
    /// consuming it, or `nil` when that position lies outside the input.
    func peekCodePoint(_ offset: Int = 0) -> Character? {
        // Limit by the end (or the start for negative offsets) and then make sure the
        // resulting index is subscriptable; this also covers an empty input and a
        // cursor that already sits at the end.
        let limit = offset >= 0 ? self.input.endIndex : self.input.startIndex
        guard let index = self.input.index(self.cursor, offsetBy: offset, limitedBy: limit),
              index < self.input.endIndex
        else {
            return nil
        }
        return self.input[index]
    }

    /// Consumes and returns the next character with newlines normalized,
    /// or `nil` when the input is exhausted.
    func nextCodePoint() -> Character? {
        guard self.cursor < self.input.endIndex else {
            return nil
        }

        let consumed = self.input[self.cursor]
        skip(1)

        // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
        // https://infra.spec.whatwg.org/#normalize-newlines
        //
        // Swift's `Character` is an extended grapheme cluster and CR LF always forms a
        // single cluster, so a CR LF pair shows up here as the one character "\r\n",
        // never as "\r" followed by "\n".
        if consumed == "\r\n" || consumed == "\r" {
            // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
            // and every remaining U+000D CR code point with a U+000A LF code point.
            return "\n"
        }
        return consumed
    }

    /// Moves the cursor back onto the most recently consumed character.
    func restoreCursorToPrevious() {
        self.cursor = self.previousCursor
    }

    /// Makes `token` the token currently being built.
    func createNewToken(_ token: HTMLToken) {
        self.currentToken = token
        // FIXME: Assign Position
    }

    /// What the driver loop in `nextToken` should do after one step of the state machine.
    enum NextTokenState {
        /// Hand `token` to the caller (`nil` signals the end of the stream).
        case Emit(token: HTMLToken?)
        /// The state changed; run the state machine again with a fresh character.
        case SwitchTo
        /// Run the state machine again with the same input character.
        case Reconsume(inputCharacter: Character?)
        /// Hand out a queued token if there is one, otherwise continue with a fresh character.
        case ReprocessQueue
    }

    /// Returns the next token, or `nil` when there is nothing more to emit
    /// (the tokenizer was aborted, or the end-of-file token was already handed out).
    public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
        // Outer `nil`: nothing decided, keep tokenizing.
        // `.some(nil)`: stop and return `nil` to the caller (aborted).
        // `.some(token)`: return that token.
        let processQueue = { () -> HTMLToken?? in
            if let token = self.queuedTokens.popFirst() {
                return token
            }
            return self.aborted ? Optional(nil) : nil
        }

        if let maybeToken = processQueue() {
            return maybeToken
        }

        var nextInputCharacter: Character? = nil
        while true {
            // FIXME: Handle insertion point
            switch nextTokenImpl(nextInputCharacter) {
            case .Emit(let token):
                return token
            case .SwitchTo:
                nextInputCharacter = nil
            case .Reconsume(let character):
                nextInputCharacter = character
            case .ReprocessQueue:
                if let maybeToken = processQueue() {
                    return maybeToken
                }
                nextInputCharacter = nil
            }
        }
    }

    /// Switches to `state`.
    func switchTo(_ state: State) -> NextTokenState {
        self.state = state
        return .SwitchTo
    }

    /// Switches to `state` and processes `character` again there.
    func reconsume(_ character: Character, `in` state: State) -> NextTokenState {
        self.state = state
        return .Reconsume(inputCharacter: character)
    }

    /// Switches back to the stored return state.
    func switchToReturnState() -> NextTokenState {
        self.state = self.returnState
        return .ReprocessQueue
    }

    /// Switches back to the stored return state and, unless `character` is `nil`,
    /// rewinds the cursor so the character is consumed again.
    func reconsumeInReturnState(_ character: Character?) -> NextTokenState {
        self.state = self.returnState
        if character != nil {
            restoreCursorToPrevious()
        }
        return .ReprocessQueue
    }

    /// Switches to `state`, queues the current token and emits the oldest queued token.
    func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState {
        self.state = state
        self.queuedTokens.append(self.currentToken)
        self.currentToken = HTMLToken()
        // The queue cannot be empty: a token was appended just above.
        return .Emit(token: self.queuedTokens.popFirst()!)
    }

    /// Switches to `state` and emits a character token for `character`.
    func switchToAndEmitCharacter(_ state: State, character: Character) -> NextTokenState {
        self.state = state
        return emitCharacter(character)
    }

    /// Queues a character token for `character`, switches to the given state and
    /// processes `currentInputCharacter` again there.
    func emitCharacterAndReconsume(_ character: Character, `in`: State, currentInputCharacter: Character?) -> NextTokenState {
        self.queuedTokens.append(HTMLToken(type: .Character(codePoint: character)))
        self.state = `in`
        return .Reconsume(inputCharacter: currentInputCharacter)
    }

    /// Emits the end-of-file token the first time it is called; afterwards emits `nil`.
    func emitEOF() -> NextTokenState {
        if self.hasEmittedEOF {
            return .Emit(token: nil)
        }
        self.hasEmittedEOF = true
        createNewToken(HTMLToken(type: .EndOfFile))
        self.queuedTokens.append(self.currentToken)
        self.currentToken = HTMLToken()
        // The queue cannot be empty: a token was appended just above.
        return .Emit(token: self.queuedTokens.popFirst()!)
    }

    /// Queues the current token and then the end-of-file token, emitting the oldest queued token.
    func emitCurrentTokenFollowedByEOF() -> NextTokenState {
        precondition(!self.hasEmittedEOF)
        self.queuedTokens.append(self.currentToken)
        self.currentToken = HTMLToken()
        return emitEOF()
    }

    /// Queues a character token for `character` and emits the oldest queued token.
    func emitCharacter(_ character: Character) -> NextTokenState {
        createNewToken(HTMLToken(type: .Character(codePoint: character)))
        self.queuedTokens.append(self.currentToken)
        self.currentToken = HTMLToken()
        // The queue cannot be empty: a token was appended just above.
        return .Emit(token: self.queuedTokens.popFirst()!)
    }

    /// Runs one step of the state machine.
    ///
    /// - Parameter nextInputCharacter: A character to reconsume; when `nil`,
    ///   the next character is read from the input.
    /// - Returns: The action the driver loop in `nextToken` should take next.
    func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
        // Not used by the states implemented so far; the binding to `_` keeps the
        // compiler from warning about it.
        let dontConsumeNextInputCharacter = {
            self.restoreCursorToPrevious()
        }
        let _ = dontConsumeNextInputCharacter

        // FIXME: flushCodepointsConsumedAsACharacterReference needs currentBuilder

        // Handle reconsume by passing the character around in the state enum
        let currentInputCharacter = nextInputCharacter ?? nextCodePoint()

        switch self.state {
        // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
        case .Data:
            switch currentInputCharacter {
            case "&":
                self.returnState = .Data
                return switchTo(.CharacterReference)
            case "<":
                return switchTo(.TagOpen)
            case "\0":
                // FIXME: log_parse_error()
                return emitCharacter("\u{FFFD}")
            case nil:
                return emitEOF()
            default:
                return emitCharacter(currentInputCharacter!)
            }
        default:
            // Unimplemented state: report it and end the token stream.
            print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
            return emitEOF()
        }
    }
}
|