HTMLTokenizer.swift 91 KB


  1. /*
  2. * Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. import AK
  7. import Collections
  8. import Foundation
  9. @_exported import WebCxx
  extension Swift.String {
    /// Builds a string by decoding `bytes` from the named text encoding.
    ///
    /// Decoding is delegated to `Web.HTML.decode_to_utf8`; initialization
    /// fails (produces `nil`) when that call reports no value.
    ///
    /// - Parameters:
    ///   - bytes: The input handed to the decoder.
    ///   - encoding: The encoding name handed to the decoder.
    public init?(decoding bytes: AK.StringView, as encoding: AK.StringView) {
      let decoded = Web.HTML.decode_to_utf8(bytes, encoding)
      guard decoded.hasValue else {
        return nil
      }
      self.init(akString: decoded.value!)
    }

    /// Empties the string in place and hands back what it held.
    ///
    /// - Returns: The contents of the string prior to the call.
    public mutating func takeString() -> Swift.String {
      // The deferred reset runs after the return value has been read.
      defer { self = "" }
      return self
    }
  }
  25. public class HTMLTokenizer {
  /// Tokenizer states, named after the states of the HTML tokenization
  /// algorithm (https://html.spec.whatwg.org/multipage/parsing.html#tokenization).
  public enum State {
    // Content states
    case Data, RCDATA, RAWTEXT, ScriptData, PLAINTEXT

    // Tags
    case TagOpen, EndTagOpen, TagName

    // RCDATA end-tag detection
    case RCDATALessThanSign, RCDATAEndTagOpen, RCDATAEndTagName

    // RAWTEXT end-tag detection
    case RAWTEXTLessThanSign, RAWTEXTEndTagOpen, RAWTEXTEndTagName

    // Script data end-tag detection
    case ScriptDataLessThanSign, ScriptDataEndTagOpen, ScriptDataEndTagName

    // Script data, escaped
    case ScriptDataEscapeStart, ScriptDataEscapeStartDash
    case ScriptDataEscaped, ScriptDataEscapedDash, ScriptDataEscapedDashDash
    case ScriptDataEscapedLessThanSign, ScriptDataEscapedEndTagOpen, ScriptDataEscapedEndTagName

    // Script data, double escaped
    case ScriptDataDoubleEscapeStart
    case ScriptDataDoubleEscaped, ScriptDataDoubleEscapedDash, ScriptDataDoubleEscapedDashDash
    case ScriptDataDoubleEscapedLessThanSign, ScriptDataDoubleEscapeEnd

    // Attributes
    case BeforeAttributeName, AttributeName, AfterAttributeName
    case BeforeAttributeValue
    case AttributeValueDoubleQuoted, AttributeValueSingleQuoted, AttributeValueUnquoted
    case AfterAttributeValueQuoted
    case SelfClosingStartTag

    // Comments and markup declarations
    case BogusComment, MarkupDeclarationOpen
    case CommentStart, CommentStartDash, Comment
    case CommentLessThanSign, CommentLessThanSignBang
    case CommentLessThanSignBangDash, CommentLessThanSignBangDashDash
    case CommentEndDash, CommentEnd, CommentEndBang

    // DOCTYPE
    case DOCTYPE, BeforeDOCTYPEName, DOCTYPEName, AfterDOCTYPEName
    case AfterDOCTYPEPublicKeyword, BeforeDOCTYPEPublicIdentifier
    case DOCTYPEPublicIdentifierDoubleQuoted, DOCTYPEPublicIdentifierSingleQuoted
    case AfterDOCTYPEPublicIdentifier, BetweenDOCTYPEPublicAndSystemIdentifiers
    case AfterDOCTYPESystemKeyword, BeforeDOCTYPESystemIdentifier
    case DOCTYPESystemIdentifierDoubleQuoted, DOCTYPESystemIdentifierSingleQuoted
    case AfterDOCTYPESystemIdentifier, BogusDOCTYPE

    // CDATA sections
    case CDATASection, CDATASectionBracket, CDATASectionEnd

    // Character references
    case CharacterReference, NamedCharacterReference, AmbiguousAmpersand
    case NumericCharacterReference
    case HexadecimalCharacterReferenceStart, DecimalCharacterReferenceStart
    case HexadecimalCharacterReference, DecimalCharacterReference
    case NumericCharacterReferenceEnd
  }
  // Text being tokenized and the position of the next character to read.
  private var input: Swift.String = ""
  private var cursor: Swift.String.Index
  // Value `cursor` held before the most recent `skip(_:)`; used to step back.
  private var previousCursor: Swift.String.Index

  // Current tokenizer state, plus the state to resume via
  // `switchToReturnState()` / `reconsumeInReturnState(_:)`.
  public private(set) var state: State = .Data
  private var returnState: State = .Data

  // Token under construction and the tokens waiting to be handed out by `nextToken`.
  private var currentToken: HTMLToken = .init()
  private var queuedTokens: Deque<HTMLToken> = []

  // Scratch text: `currentBuilder` is taken as tag/attribute text,
  // `temporaryBuffer` is flushed by `flushCodepointsConsumedAsACharacterReference()`.
  private var currentBuilder: Swift.String = ""
  private var temporaryBuffer: Swift.String = ""

  // Name compared against end tag names in `isAppropriateEndTagToken(_:)`.
  private var lastStartTagName: Swift.String?

  // Attributes collected for the current tag token, and the one being built.
  private var currentTokensAttributes: [HTMLToken.Attribute]?
  private var currentAttribute: HTMLToken.Attribute?

  // NOTE(review): presumably accumulates the value of a numeric character
  // reference; it is not used in the portion of the file visible here.
  private var characterReferenceCode = 0

  // Set by `abort()`; consulted by `nextToken` once the queue is empty.
  private var aborted: Bool = false
  // Set the first time `emitEOF()` produces an end-of-file token.
  private var hasEmittedEOF: Bool = false
  // The sets below are constants, so they are declared with `let`: nothing
  // should be able to mutate shared static state after initialization.

  // https://infra.spec.whatwg.org/#noncharacter
  // A noncharacter is a code point that is in the range U+FDD0 to U+FDEF, inclusive,
  // or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF,
  // U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF,
  // U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF,
  // U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF.
  static private let nonCharacter: CharacterSet = {
    var set = CharacterSet(charactersIn: "\u{FDD0}"..."\u{FDEF}")
    // The last two code points of each of the 17 planes (U+xxFFFE and U+xxFFFF).
    for plane in UInt32(0)...0x10 {
      let planeBase = plane << 16
      // Noncharacters are still valid Unicode scalar values, so these unwraps cannot fail.
      let lower = Unicode.Scalar(planeBase | 0xFFFE)!
      let upper = Unicode.Scalar(planeBase | 0xFFFF)!
      set.insert(charactersIn: lower...upper)
    }
    return set
  }()

  // https://infra.spec.whatwg.org/#ascii-whitespace
  static private let asciiWhitespace = CharacterSet(charactersIn: "\t\n\u{000C}\u{000D} ")

  // https://infra.spec.whatwg.org/#c0-control
  static private let c0Control = CharacterSet(charactersIn: "\u{0000}"..."\u{001F}")

  // https://infra.spec.whatwg.org/#control
  static private let control = c0Control.union(CharacterSet(charactersIn: "\u{007F}"..."\u{009F}"))

  // IMPLEMENTATION DEFINED: Used for the numeric character reference end state
  static private let controlNotAsciiWhitespace = control.subtracting(asciiWhitespace)

  // https://infra.spec.whatwg.org/#ascii-digit
  static private let asciiDigit = CharacterSet(charactersIn: "0"..."9")

  // https://infra.spec.whatwg.org/#ascii-upper-hex-digit
  static private let asciiUpperHexDigit = CharacterSet(charactersIn: "A"..."F")

  // https://infra.spec.whatwg.org/#ascii-lower-hex-digit
  static private let asciiLowerHexDigit = CharacterSet(charactersIn: "a"..."f")

  // https://infra.spec.whatwg.org/#ascii-hex-digit
  static private let asciiHexDigit = asciiUpperHexDigit.union(asciiLowerHexDigit)

  // https://infra.spec.whatwg.org/#ascii-upper-alpha
  static private let asciiUpperAlpha = CharacterSet(charactersIn: "A"..."Z")

  // https://infra.spec.whatwg.org/#ascii-lower-alpha
  static private let asciiLowerAlpha = CharacterSet(charactersIn: "a"..."z")

  // https://infra.spec.whatwg.org/#ascii-alpha
  static private let asciiAlpha = asciiUpperAlpha.union(asciiLowerAlpha)

  // https://infra.spec.whatwg.org/#ascii-alphanumeric
  static private let asciiAlphanumeric = asciiAlpha.union(asciiDigit)
  // Replacement code points for numeric character references that land in
  // the 0x80...0x9F range. Declared as an immutable constant, and written
  // with scalar literals so no force-unwrapping is needed.
  static private let characterReferenceControlCodeMapping: [Int: Unicode.Scalar] = [
    0x80: "\u{20AC}", // €
    0x82: "\u{201A}", // ‚
    0x83: "\u{0192}", // ƒ
    0x84: "\u{201E}", // „
    0x85: "\u{2026}", // …
    0x86: "\u{2020}", // †
    0x87: "\u{2021}", // ‡
    0x88: "\u{02C6}", // ˆ
    0x89: "\u{2030}", // ‰
    0x8A: "\u{0160}", // Š
    0x8B: "\u{2039}", // ‹
    0x8C: "\u{0152}", // Œ
    0x8E: "\u{017D}", // Ž
    0x91: "\u{2018}", // ‘
    0x92: "\u{2019}", // ’
    0x93: "\u{201C}", // “
    0x94: "\u{201D}", // ”
    0x95: "\u{2022}", // •
    0x96: "\u{2013}", // –
    0x97: "\u{2014}", // —
    0x98: "\u{02DC}", // ˜
    0x99: "\u{2122}", // ™
    0x9A: "\u{0161}", // š
    0x9B: "\u{203A}", // ›
    0x9C: "\u{0153}", // œ
    0x9E: "\u{017E}", // ž
    0x9F: "\u{0178}", // Ÿ
  ]
  /// Creates a tokenizer over the default (empty) input, with both cursors
  /// at the start of that input.
  public init() {
    let start = self.input.startIndex
    self.cursor = start
    self.previousCursor = start
  }
  /// Creates a tokenizer over `input` decoded with `encoding`.
  ///
  /// Fails (produces `nil`) when `Swift.String(decoding:as:)` cannot decode
  /// the input.
  ///
  /// - Parameters:
  ///   - input: The text to tokenize, prior to decoding.
  ///   - encoding: The encoding name used to decode `input`.
  public init?(input: AK.StringView, encoding: AK.StringView) {
    guard let decoded = Swift.String(decoding: input, as: encoding) else {
      return nil
    }
    self.input = decoded

    // Both cursors start at the beginning of the decoded text.
    let start = self.input.startIndex
    self.cursor = start
    self.previousCursor = start
  }
  /// Creates a tokenizer over `input`, decoding it as "UTF-8".
  ///
  /// - Parameter input: The text to tokenize, prior to decoding.
  public convenience init?(input: AK.StringView) {
    let defaultEncoding: AK.StringView = "UTF-8"
    self.init(input: input, encoding: defaultEncoding)
  }
  /// Marks the tokenizer as aborted. `nextToken` checks this flag once its
  /// queue of pending tokens is empty.
  public func abort() {
    aborted = true
  }
  /// Advances the cursor by `count` characters, clamping at the end of the
  /// input, and remembers where the cursor was beforehand.
  ///
  /// - Parameter count: Number of characters to move forward.
  func skip(_ count: Int) {
    let end = input.endIndex
    previousCursor = cursor
    cursor = input.index(cursor, offsetBy: count, limitedBy: end) ?? end
  }
  /// Returns the character `offset` positions past the cursor without
  /// moving the cursor.
  ///
  /// - Parameter offset: Distance from the cursor; `0` is the character at
  ///   the cursor itself.
  /// - Returns: The character at that position, or `nil` when the position
  ///   is at or beyond the end of the input (including empty input).
  func peekCodePoint(_ offset: Int = 0) -> Character? {
    // Limit by `endIndex` (always a valid index, even for empty input) and
    // then reject `endIndex` itself. Limiting by `index(before: endIndex)`
    // traps on empty input, and is ignored once the cursor has moved past it.
    guard
      let index = input.index(cursor, offsetBy: offset, limitedBy: input.endIndex),
      index < input.endIndex
    else {
      return nil
    }
    return input[index]
  }
  /// Returns the next `count` characters starting at the cursor without
  /// moving the cursor.
  ///
  /// - Parameter count: Number of characters to look at.
  /// - Returns: The requested slice of the input, or `nil` when fewer than
  ///   `count` characters remain (or `count` is negative).
  func peekNext(count: Int) -> Swift.Substring? {
    guard count >= 0 else {
      return nil
    }
    // The upper bound of a half-open range may legitimately be `endIndex`,
    // so that is the correct limit. Limiting by the last valid index made
    // the final `count` characters of the input impossible to peek and
    // trapped on empty input.
    guard let upperBound = input.index(cursor, offsetBy: count, limitedBy: input.endIndex) else {
      return nil
    }
    return input[cursor..<upperBound]
  }
  /// Consumes and returns the next character of the input, with newlines
  /// normalized.
  ///
  /// - Returns: The consumed character, or `nil` when the cursor is already
  ///   at the end of the input (in which case nothing is consumed).
  func nextCodePoint() -> Character? {
    guard cursor < input.endIndex else {
      return nil
    }

    let current = input[cursor]
    skip(1)

    // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
    // https://infra.spec.whatwg.org/#normalize-newlines
    switch current {
    case "\r\n":
      // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
      // NOTE: Swift treats CR LF as one extended grapheme cluster, so the pair
      //       arrives here as a single Character and is never seen as a "\r"
      //       Character followed by a "\n" Character.
      return "\n"
    case "\r":
      // replace every remaining U+000D CR code point with a U+000A LF code point.
      return "\n"
    default:
      return current
    }
  }
  /// Moves the cursor back to where it was before the most recent `skip(_:)`.
  func restoreCursorToPrevious() {
    cursor = previousCursor
  }
  /// Makes `token` the token under construction.
  ///
  /// When the new token is a tag, the list that collects its attributes is
  /// reset to empty.
  ///
  /// - Parameter token: The freshly created token.
  func createNewToken(_ token: HTMLToken) {
    currentToken = token
    // FIXME: Assign Position
    guard currentToken.isTag() else {
      return
    }
    currentTokensAttributes = []
  }
  /// What `finalizeCurrentAttribute(_:)` does with the text accumulated in
  /// the current builder.
  enum AttributeStringBehavior {
    /// Store the text as the attribute's local name.
    case SetName
    /// Store the text as the attribute's value.
    case SetValue
    /// Discard the text.
    case IgnoreString
  }
  /// Completes the attribute under construction and appends it to the
  /// current token's attribute list.
  ///
  /// The text accumulated in `currentBuilder` is always taken (leaving the
  /// builder empty); `behavior` decides where it goes. Requires both a
  /// current attribute and an attribute list to exist.
  ///
  /// - Parameter behavior: Whether the taken text becomes the attribute's
  ///   name, its value, or is dropped.
  func finalizeCurrentAttribute(_ behavior: AttributeStringBehavior) {
    guard var attribute = currentAttribute, currentTokensAttributes != nil else {
      preconditionFailure()
    }

    let text = currentBuilder.takeString()
    switch behavior {
    case .SetName:
      attribute.localName = text
    case .SetValue:
      attribute.value = text
    case .IgnoreString:
      break
    }

    currentTokensAttributes?.append(attribute)
    currentAttribute = nil
  }
  /// Outcome of one `nextTokenImpl` step, interpreted by the loop in `nextToken`.
  enum NextTokenState {
    /// Return `token` to the caller of `nextToken` (`nil` is passed through as-is).
    case Emit(token: HTMLToken?)
    /// The state changed; read a fresh character on the next step.
    case SwitchTo
    /// Run the next step with `inputCharacter` instead of reading a new one.
    case Reconsume(inputCharacter: Character?)
    /// Hand out queued tokens first (if any), then read a fresh character.
    case ReprocessQueue
    /// Stay in the same state; read a fresh character on the next step.
    case Continue
  }
  /// Produces the next token.
  ///
  /// Queued tokens are handed out first. With an empty queue and the
  /// tokenizer aborted, `nil` is returned. Otherwise the state machine is
  /// stepped until it emits something.
  ///
  /// - Parameter stopAtInsertionPoint: Currently unused (see FIXME below).
  /// - Returns: The next token, or `nil` when there is nothing to return.
  public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
    // Outer `nil`: nothing decided, keep tokenizing.
    // Inner value: what to return to the caller right away.
    func dequeueOrStop() -> HTMLToken?? {
      if let queued = queuedTokens.popFirst() {
        return queued
      }
      if aborted {
        return .some(nil)
      }
      return nil
    }

    if let decided = dequeueOrStop() {
      return decided
    }

    var reconsumedCharacter: Character? = nil
    while true {
      // FIXME: Handle insertion point
      switch nextTokenImpl(reconsumedCharacter) {
      case .Emit(let token):
        return token
      case .Reconsume(let character):
        reconsumedCharacter = character
      case .ReprocessQueue:
        if let decided = dequeueOrStop() {
          return decided
        }
        reconsumedCharacter = nil
      case .SwitchTo, .Continue:
        reconsumedCharacter = nil
      }
    }
  }
  /// Signals that the tokenizer stays in its current state.
  func continueInCurrentState() -> NextTokenState {
    let outcome: NextTokenState = .Continue
    return outcome
  }
  /// Moves the tokenizer to `newState`.
  ///
  /// - Parameter newState: The state to enter.
  /// - Returns: `.SwitchTo`.
  func switchTo(_ newState: State) -> NextTokenState {
    state = newState
    return .SwitchTo
  }
  /// Moves the tokenizer to `newState` and asks for `character` to be
  /// processed again there.
  ///
  /// - Parameters:
  ///   - character: The character to reconsume (`nil` is passed along unchanged).
  ///   - newState: The state to enter.
  /// - Returns: `.Reconsume` carrying `character`.
  func reconsume(_ character: Character?, `in` newState: State) -> NextTokenState {
    state = newState
    return .Reconsume(inputCharacter: character)
  }
  /// Resumes the saved return state.
  ///
  /// - Returns: `.ReprocessQueue`, so queued tokens are handed out first.
  func switchToReturnState() -> NextTokenState {
    state = returnState
    return .ReprocessQueue
  }
  /// Resumes the saved return state and arranges for `character` to be read
  /// again.
  ///
  /// The character is not passed along directly: the cursor is stepped back
  /// to its previous position instead, unless `character` is `nil`.
  ///
  /// - Parameter character: The character to reconsume, if any.
  /// - Returns: `.ReprocessQueue`, so queued tokens are handed out first.
  func reconsumeInReturnState(_ character: Character?) -> NextTokenState {
    state = returnState
    let hasCharacterToReread = character != nil
    if hasCharacterToReread {
      restoreCursorToPrevious()
    }
    return .ReprocessQueue
  }
  /// Moves the tokenizer to `newState` and emits the token under construction.
  ///
  /// For tag tokens the collected attributes are moved onto the token first.
  /// The token is appended to the queue and the head of the queue is what
  /// gets emitted, so tokens queued earlier go out before it.
  ///
  /// - Parameter newState: The state to enter.
  /// - Returns: `.Emit` carrying the head of the token queue.
  func switchToAndEmitCurrentToken(_ newState: State) -> NextTokenState {
    state = newState

    if currentToken.isTag() {
      currentToken.attributes = currentTokensAttributes ?? []
      currentTokensAttributes = nil
    }

    let finished = currentToken
    currentToken = HTMLToken()
    queuedTokens.append(finished)

    // The queue holds at least the token appended just above.
    return .Emit(token: queuedTokens.removeFirst())
  }
  /// Moves the tokenizer to `newState`, then emits `character` via
  /// `emitCharacter(_:)`.
  ///
  /// - Parameters:
  ///   - newState: The state to enter.
  ///   - character: The character to emit.
  /// - Returns: Whatever `emitCharacter(_:)` returns.
  func switchToAndEmitCharacter(_ newState: State, character: Character) -> NextTokenState {
    state = newState
    let outcome = emitCharacter(character)
    return outcome
  }
  /// Appends a character token for `character` to the queue, moves the
  /// tokenizer to `newState`, and asks for `currentInputCharacter` to be
  /// processed again there.
  ///
  /// - Parameters:
  ///   - character: The character to queue as a token.
  ///   - newState: The state to enter.
  ///   - currentInputCharacter: The character to reconsume.
  /// - Returns: `.Reconsume` carrying `currentInputCharacter`.
  func emitCharacterAndReconsume(_ character: Character, `in` newState: State, currentInputCharacter: Character?) -> NextTokenState {
    let characterToken = HTMLToken(type: .Character(codePoint: character))
    queuedTokens.append(characterToken)
    state = newState
    return .Reconsume(inputCharacter: currentInputCharacter)
  }
  /// Emits an end-of-file token, once.
  ///
  /// The first call appends an end-of-file token to the queue and emits the
  /// head of the queue. Every later call emits `nil`.
  ///
  /// - Returns: `.Emit` carrying the head of the queue, or `.Emit(token: nil)`
  ///   when end-of-file was already emitted.
  func emitEOF() -> NextTokenState {
    guard !hasEmittedEOF else {
      return .Emit(token: nil)
    }
    hasEmittedEOF = true

    createNewToken(HTMLToken(type: .EndOfFile))
    let endOfFile = currentToken
    currentToken = HTMLToken()
    queuedTokens.append(endOfFile)

    // The queue holds at least the token appended just above.
    return .Emit(token: queuedTokens.removeFirst())
  }
  /// Queues the token under construction and then emits end-of-file behind it.
  ///
  /// For tag tokens the collected attributes are moved onto the token first.
  /// Must not be called after end-of-file has already been emitted.
  ///
  /// - Returns: Whatever `emitEOF()` returns.
  func emitCurrentTokenFollowedByEOF() -> NextTokenState {
    precondition(!hasEmittedEOF)

    if currentToken.isTag() {
      currentToken.attributes = currentTokensAttributes ?? []
      currentTokensAttributes = nil
    }

    let finished = currentToken
    currentToken = HTMLToken()
    queuedTokens.append(finished)

    return emitEOF()
  }
  /// Emits a character token for `character`.
  ///
  /// The token is appended to the queue and the head of the queue is what
  /// gets emitted, so tokens queued earlier go out before it.
  ///
  /// - Parameter character: The character to wrap in a token.
  /// - Returns: `.Emit` carrying the head of the token queue.
  func emitCharacter(_ character: Character) -> NextTokenState {
    createNewToken(HTMLToken(type: .Character(codePoint: character)))
    let characterToken = currentToken
    currentToken = HTMLToken()
    queuedTokens.append(characterToken)

    // The queue holds at least the token appended just above.
    return .Emit(token: queuedTokens.removeFirst())
  }
  /// Empties the temporary buffer into its destination.
  ///
  /// Inside an attribute value the buffered text is appended to the current
  /// builder; otherwise each buffered character is queued as its own
  /// character token.
  func flushCodepointsConsumedAsACharacterReference() {
    let buffered = temporaryBuffer.takeString()

    guard !consumedAsPartOfAnAttribute() else {
      currentBuilder.append(buffered)
      return
    }

    queuedTokens.append(contentsOf: buffered.map { HTMLToken(type: .Character(codePoint: $0)) })
  }
  /// Reports whether the saved return state is one of the three
  /// attribute-value states.
  func consumedAsPartOfAnAttribute() -> Bool {
    switch returnState {
    case .AttributeValueDoubleQuoted, .AttributeValueSingleQuoted, .AttributeValueUnquoted:
      return true
    default:
      return false
    }
  }
  /// Reports whether `token`'s tag name equals the remembered last start
  /// tag name.
  ///
  /// - Parameter token: An end tag token; any other token type is a
  ///   programming error and stops the program.
  /// - Returns: `true` when a last start tag name exists and matches,
  ///   otherwise `false`.
  func isAppropriateEndTagToken(_ token: HTMLToken) -> Bool {
    guard case .EndTag(let endTagName, _, _, _) = token.type else {
      preconditionFailure("isAppropriateEndTagToken called with non-end-tag token")
    }
    guard let expectedName = lastStartTagName else {
      return false
    }
    return expectedName == endTagName
  }
  412. func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
  413. let dontConsumeNextInputCharacter = {
  414. self.restoreCursorToPrevious()
  415. }
  416. let _ = dontConsumeNextInputCharacter
  417. // Handle reconsume by passing the character around in the state enum
  418. let currentInputCharacter = nextInputCharacter ?? nextCodePoint()
  419. switch self.state {
  420. // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
  421. case .Data:
  422. precondition(currentTokensAttributes == nil)
  423. switch currentInputCharacter {
  424. case "&":
  425. self.returnState = .Data
  426. return switchTo(.CharacterReference)
  427. case "<":
  428. return switchTo(.TagOpen)
  429. case "\0":
  430. // FIXME: log_parse_error()
  431. return emitCharacter("\u{FFFD}")
  432. case nil:
  433. return emitEOF()
  434. default:
  435. return emitCharacter(currentInputCharacter!)
  436. }
  437. // 13.2.5.2 RCDATA state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
  438. case .RCDATA:
  439. switch currentInputCharacter {
  440. case "&":
  441. self.returnState = .RCDATA
  442. return switchTo(.CharacterReference)
  443. case "<":
  444. return switchTo(.RCDATALessThanSign)
  445. case "\0":
  446. // FIXME: log_parse_error()
  447. return emitCharacter("\u{FFFD}")
  448. case nil:
  449. return emitEOF()
  450. default:
  451. return emitCharacter(currentInputCharacter!)
  452. }
  453. // 13.2.5.3. RAWTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
  454. case .RAWTEXT:
  455. switch currentInputCharacter {
  456. case "<":
  457. return switchTo(.RAWTEXTLessThanSign)
  458. case "\0":
  459. // FIXME: log_parse_error()
  460. return emitCharacter("\u{FFFD}")
  461. case nil:
  462. return emitEOF()
  463. default:
  464. return emitCharacter(currentInputCharacter!)
  465. }
  466. // 13.2.5.4 Script data state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
  467. case .ScriptData:
  468. switch currentInputCharacter {
  469. case "<":
  470. return switchTo(.ScriptDataLessThanSign)
  471. case "\0":
  472. // FIXME: log_parse_error()
  473. return emitCharacter("\u{FFFD}")
  474. case nil:
  475. return emitEOF()
  476. default:
  477. return emitCharacter(currentInputCharacter!)
  478. }
  479. // 13.2.5.5 PLAINTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
  480. case .PLAINTEXT:
  481. switch currentInputCharacter {
  482. case "\0":
  483. // FIXME: log_parse_error()
  484. return emitCharacter("\u{FFFD}")
  485. case nil:
  486. return emitEOF()
  487. default:
  488. return emitCharacter(currentInputCharacter!)
  489. }
  490. // 13.2.5.6 Tag open state https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
  491. case .TagOpen:
  492. switch currentInputCharacter {
  493. case "!":
  494. return switchTo(.MarkupDeclarationOpen)
  495. case "/":
  496. return switchTo(.EndTagOpen)
  497. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  498. createNewToken(HTMLToken(type: .StartTag(tagName: "")))
  499. return reconsume(currentInputCharacter!, in: .TagName)
  500. case "?":
  501. // FIXME: log_parse_error()
  502. createNewToken(HTMLToken(type: .Comment(data: "")))
  503. return reconsume(currentInputCharacter!, in: .BogusComment)
  504. case nil:
  505. // FIXME: log_parse_error()
  506. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  507. return emitEOF()
  508. default:
  509. // FIXME: log_parse_error()
  510. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  511. return reconsume(currentInputCharacter!, in: .Data)
  512. }
  513. // 13.2.5.7 End tag open state, https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
  514. case .EndTagOpen:
  515. switch currentInputCharacter {
  516. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  517. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  518. return reconsume(currentInputCharacter!, in: .TagName)
  519. default:
  520. return emitEOF()
  521. }
  522. // 13.2.5.8 Tag name state, https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
  523. case .TagName:
  524. switch currentInputCharacter {
  525. case "\t", "\n", "\u{000C}", " ":
  526. self.currentToken.tagName = self.currentBuilder.takeString()
  527. return switchTo(.BeforeAttributeName)
  528. case "/":
  529. self.currentToken.tagName = self.currentBuilder.takeString()
  530. return switchTo(.SelfClosingStartTag)
  531. case ">":
  532. self.currentToken.tagName = self.currentBuilder.takeString()
  533. return switchToAndEmitCurrentToken(.Data)
  534. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  535. currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  536. return continueInCurrentState()
  537. case "\0":
  538. // FIXME: log_parse_error()
  539. currentBuilder += "\u{FFFD}"
  540. return continueInCurrentState()
  541. case nil:
  542. // FIXME: log_parse_error()
  543. return emitEOF()
  544. default:
  545. currentBuilder.append(currentInputCharacter!)
  546. return continueInCurrentState()
  547. }
  548. // 13.2.5.9 RCDATA less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
  549. case .RCDATALessThanSign:
  550. switch currentInputCharacter {
  551. case "/":
  552. self.temporaryBuffer = ""
  553. return switchTo(.RCDATAEndTagOpen)
  554. default:
  555. return emitCharacterAndReconsume("<", in: .RCDATA, currentInputCharacter: currentInputCharacter)
  556. }
  557. // 13.2.5.10 RCDATA end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
  558. case .RCDATAEndTagOpen:
  559. switch currentInputCharacter {
  560. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  561. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  562. return reconsume(currentInputCharacter!, in: .RCDATAEndTagName)
  563. default:
  564. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  565. queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  566. return reconsume(currentInputCharacter, in: .RCDATA)
  567. }
  568. // 13.2.5.11 RCDATA end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
  569. case .RCDATAEndTagName:
  570. switch currentInputCharacter {
  571. case "\t", "\n", "\u{000C}", " ":
  572. if self.isAppropriateEndTagToken(currentToken) {
  573. return switchTo(.BeforeAttributeName)
  574. }
  575. break
  576. case "/":
  577. if self.isAppropriateEndTagToken(currentToken) {
  578. return switchTo(.SelfClosingStartTag)
  579. }
  580. break
  581. case ">":
  582. if self.isAppropriateEndTagToken(currentToken) {
  583. return switchToAndEmitCurrentToken(.Data)
  584. }
  585. break
  586. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  587. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  588. self.temporaryBuffer.append(c)
  589. return continueInCurrentState()
  590. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  591. self.currentBuilder.append(c)
  592. self.temporaryBuffer.append(c)
  593. return continueInCurrentState()
  594. default:
  595. break
  596. }
  597. // First three steps fall through to the "anything else" block
  598. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  599. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  600. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  601. self.currentBuilder = ""
  602. for codePoint in self.temporaryBuffer {
  603. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
  604. }
  605. return reconsume(currentInputCharacter, in: .RCDATA)
  606. // 13.2.5.12 RAWTEXT less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
  607. case .RAWTEXTLessThanSign:
  608. switch currentInputCharacter {
  609. case "/":
  610. self.temporaryBuffer = ""
  611. return switchTo(.RAWTEXTEndTagOpen)
  612. default:
  613. return emitCharacterAndReconsume("<", in: .RAWTEXT, currentInputCharacter: currentInputCharacter)
  614. }
  615. // 13.2.5.13 RAWTEXT end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
  616. case .RAWTEXTEndTagOpen:
  617. switch currentInputCharacter {
  618. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  619. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  620. return reconsume(currentInputCharacter!, in: .RAWTEXTEndTagName)
  621. default:
  622. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  623. queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  624. return reconsume(currentInputCharacter, in: .RAWTEXT)
  625. }
  626. // 13.2.5.14 RAWTEXT end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
  627. case .RAWTEXTEndTagName:
  628. switch currentInputCharacter {
  629. case "\t", "\n", "\u{000C}", " ":
  630. if self.isAppropriateEndTagToken(currentToken) {
  631. return switchTo(.BeforeAttributeName)
  632. }
  633. break
  634. case "/":
  635. if self.isAppropriateEndTagToken(currentToken) {
  636. return switchTo(.SelfClosingStartTag)
  637. }
  638. break
  639. case ">":
  640. if self.isAppropriateEndTagToken(currentToken) {
  641. return switchToAndEmitCurrentToken(.Data)
  642. }
  643. break
  644. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  645. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  646. self.temporaryBuffer.append(c)
  647. return continueInCurrentState()
  648. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  649. self.currentBuilder.append(c)
  650. self.temporaryBuffer.append(c)
  651. return continueInCurrentState()
  652. default:
  653. break
  654. }
  655. // First three steps fall through to the "anything else" block
  656. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  657. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  658. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  659. self.currentBuilder = ""
  660. for codePoint in self.temporaryBuffer {
  661. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
  662. }
  663. return reconsume(currentInputCharacter, in: .RAWTEXT)
  664. // 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
  665. case .ScriptDataLessThanSign:
  666. switch currentInputCharacter {
  667. case "/":
  668. self.temporaryBuffer = ""
  669. return switchTo(.ScriptDataEndTagOpen)
  670. case "!":
  671. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  672. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "!")))
  673. return switchTo(.ScriptDataEscapeStart)
  674. default:
  675. return emitCharacterAndReconsume("<", in: .ScriptData, currentInputCharacter: currentInputCharacter)
  676. }
  677. // 13.2.5.16 Script data end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
  678. case .ScriptDataEndTagOpen:
  679. switch currentInputCharacter {
  680. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  681. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  682. return reconsume(currentInputCharacter!, in: .ScriptDataEndTagName)
  683. default:
  684. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  685. queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  686. return reconsume(currentInputCharacter, in: .ScriptData)
  687. }
  688. // 13.2.5.17 Script data end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
  689. case .ScriptDataEndTagName:
  690. switch currentInputCharacter {
  691. case "\t", "\n", "\u{000C}", " ":
  692. if self.isAppropriateEndTagToken(currentToken) {
  693. return switchTo(.BeforeAttributeName)
  694. }
  695. break
  696. case "/":
  697. if self.isAppropriateEndTagToken(currentToken) {
  698. return switchTo(.SelfClosingStartTag)
  699. }
  700. break
  701. case ">":
  702. if self.isAppropriateEndTagToken(currentToken) {
  703. return switchToAndEmitCurrentToken(.Data)
  704. }
  705. break
  706. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  707. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  708. self.temporaryBuffer.append(c)
  709. return continueInCurrentState()
  710. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  711. self.currentBuilder.append(c)
  712. self.temporaryBuffer.append(c)
  713. return continueInCurrentState()
  714. default:
  715. break
  716. }
  717. // First three steps fall through to the "anything else" block
  718. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  719. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  720. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  721. self.currentBuilder = ""
  722. for codePoint in self.temporaryBuffer {
  723. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
  724. }
  725. return reconsume(currentInputCharacter, in: .ScriptData)
  726. // 13.2.5.18 Script data escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
  727. case .ScriptDataEscapeStart:
  728. switch currentInputCharacter {
  729. case "-":
  730. return switchToAndEmitCharacter(.ScriptDataEscapeStartDash, character: "-")
  731. default:
  732. return reconsume(currentInputCharacter, in: .ScriptData)
  733. }
  734. // 13.2.5.19 Script data escape start dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
  735. case .ScriptDataEscapeStartDash:
  736. switch currentInputCharacter {
  737. case "-":
  738. return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-")
  739. default:
  740. return reconsume(currentInputCharacter, in: .ScriptData)
  741. }
  742. // 13.2.5.20 Script data escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
  743. case .ScriptDataEscaped:
  744. switch currentInputCharacter {
  745. case "-":
  746. return switchToAndEmitCharacter(.ScriptDataEscapedDash, character: "-")
  747. case "<":
  748. return switchTo(.ScriptDataEscapedLessThanSign)
  749. case "\0":
  750. // FIXME: log_parse_error()
  751. return emitCharacter("\u{FFFD}")
  752. case nil:
  753. // FIXME: log_parse_error()
  754. return emitEOF()
  755. default:
  756. return emitCharacter(currentInputCharacter!)
  757. }
  758. // 13.2.5.21 Script data escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
  759. case .ScriptDataEscapedDash:
  760. switch currentInputCharacter {
  761. case "-":
  762. return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-")
  763. case "<":
  764. return switchTo(.ScriptDataEscapedLessThanSign)
  765. case "\0":
  766. // FIXME: log_parse_error()
  767. return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}")
  768. case nil:
  769. // FIXME: log_parse_error()
  770. return emitEOF()
  771. default:
  772. return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
  773. }
  774. // 13.2.5.22 Script data escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
  775. case .ScriptDataEscapedDashDash:
  776. switch currentInputCharacter {
  777. case "-":
  778. return emitCharacter("-")
  779. case "<":
  780. return switchTo(.ScriptDataEscapedLessThanSign)
  781. case ">":
  782. return switchToAndEmitCharacter(.ScriptData, character: ">")
  783. case "\0":
  784. // FIXME: log_parse_error()
  785. return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}")
  786. case nil:
  787. // FIXME: log_parse_error()
  788. return emitEOF()
  789. default:
  790. return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
  791. }
  792. // 13.2.5.23 Script data escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
  793. case .ScriptDataEscapedLessThanSign:
  794. switch currentInputCharacter {
  795. case "/":
  796. self.temporaryBuffer = ""
  797. return switchTo(.ScriptDataEscapedEndTagOpen)
  798. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  799. self.temporaryBuffer = ""
  800. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  801. return reconsume(currentInputCharacter!, in: .ScriptDataDoubleEscapeStart)
  802. default:
  803. return emitCharacterAndReconsume("<", in: .ScriptDataEscaped, currentInputCharacter: currentInputCharacter)
  804. }
  805. // 13.2.5.24 Script data escaped end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
  806. case .ScriptDataEscapedEndTagOpen:
  807. switch currentInputCharacter {
  808. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  809. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  810. return reconsume(currentInputCharacter!, in: .ScriptDataEscapedEndTagName)
  811. default:
  812. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  813. queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  814. return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
  815. }
  816. // 13.2.5.25 Script data escaped end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
  817. case .ScriptDataEscapedEndTagName:
  818. switch currentInputCharacter {
  819. case "\t", "\n", "\u{000C}", " ":
  820. if self.isAppropriateEndTagToken(currentToken) {
  821. return switchTo(.BeforeAttributeName)
  822. }
  823. break
  824. case "/":
  825. if self.isAppropriateEndTagToken(currentToken) {
  826. return switchTo(.SelfClosingStartTag)
  827. }
  828. break
  829. case ">":
  830. if self.isAppropriateEndTagToken(currentToken) {
  831. return switchToAndEmitCurrentToken(.Data)
  832. }
  833. break
  834. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  835. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  836. self.temporaryBuffer.append(c)
  837. return continueInCurrentState()
  838. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  839. self.currentBuilder.append(c)
  840. self.temporaryBuffer.append(c)
  841. return continueInCurrentState()
  842. default:
  843. break
  844. }
  845. // First three steps fall through to the "anything else" block
  846. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  847. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  848. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  849. self.currentBuilder = ""
  850. for codePoint in self.temporaryBuffer {
  851. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
  852. }
  853. return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
  854. // 13.2.5.26 Script data double escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
  855. case .ScriptDataDoubleEscapeStart:
  856. switch currentInputCharacter {
  857. case "\t", "\n", "\u{000C}", " ", "/", ">":
  858. if self.temporaryBuffer == "script" {
  859. return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
  860. } else {
  861. return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
  862. }
  863. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  864. self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  865. return emitCharacter(currentInputCharacter!)
  866. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  867. self.temporaryBuffer.append(c)
  868. return emitCharacter(currentInputCharacter!)
  869. default:
  870. return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
  871. }
  // 13.2.5.27 Script data double escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
  //
  // Every consumed character is emitted as a character token here; only the
  // follow-up state differs between the branches.
  case .ScriptDataDoubleEscaped:
      switch currentInputCharacter {
      case "-":
          return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDash, character: "-")
      case "<":
          // The spec requires the "<" itself to be emitted while switching states
          // (the dash-dash variant of this state already does so). Switching without
          // emitting would drop the "<" from the script text.
          return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
      case "\0":
          // FIXME: log_parse_error()
          // NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
          return emitCharacter("\u{FFFD}")
      case nil:
          // FIXME: log_parse_error()
          // No more input: emit the end-of-file token.
          return emitEOF()
      default:
          return emitCharacter(currentInputCharacter!)
      }
  // 13.2.5.28 Script data double escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
  //
  // Entered after a single "-" inside double-escaped script data. Every consumed
  // character is emitted as a character token; anything other than "-" or "<"
  // returns to the plain double-escaped state.
  case .ScriptDataDoubleEscapedDash:
      switch currentInputCharacter {
      case "-":
          return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDashDash, character: "-")
      case "<":
          // The spec requires the "<" itself to be emitted while switching states
          // (the dash-dash variant of this state already does so). Switching without
          // emitting would drop the "<" from the script text.
          return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
      case "\0":
          // FIXME: log_parse_error()
          // NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
          return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}")
      case nil:
          // FIXME: log_parse_error()
          // No more input: emit the end-of-file token.
          return emitEOF()
      default:
          return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
      }
  904. // 13.2.5.29 Script data double escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
  905. case .ScriptDataDoubleEscapedDashDash:
  906. switch currentInputCharacter {
  907. case "-":
  908. return emitCharacter("-")
  909. case "<":
  910. return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
  911. case ">":
  912. return switchToAndEmitCharacter(.ScriptData, character: ">")
  913. case "\0":
  914. // FIXME: log_parse_error()
  915. return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}")
  916. case nil:
  917. // FIXME: log_parse_error()
  918. return emitEOF()
  919. default:
  920. return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
  921. }
  922. // 13.2.5.30 Script data double escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
  923. case .ScriptDataDoubleEscapedLessThanSign:
  924. switch currentInputCharacter {
  925. case "/":
  926. self.temporaryBuffer = ""
  927. return switchToAndEmitCharacter(.ScriptDataDoubleEscapeEnd, character: "/")
  928. default:
  929. return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped)
  930. }
  931. // 13.2.5.31 Script data double escape end state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
  932. case .ScriptDataDoubleEscapeEnd:
  933. switch currentInputCharacter {
  934. case "\t", "\n", "\u{000C}", " ", "/", ">":
  935. if self.temporaryBuffer == "script" {
  936. return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
  937. } else {
  938. return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
  939. }
  940. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  941. self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  942. return emitCharacter(currentInputCharacter!)
  943. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  944. self.temporaryBuffer.append(c)
  945. return emitCharacter(currentInputCharacter!)
  946. default:
  947. return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped)
  948. }
  949. // 13.2.5.32 Before attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
  950. case .BeforeAttributeName:
  951. switch currentInputCharacter {
  952. case "\t", "\n", "\u{000C}", " ":
  953. return continueInCurrentState()
  954. case "/", ">", nil:
  955. return reconsume(currentInputCharacter, in: .AfterAttributeName)
  956. case "=":
  957. // FIXME: log_parse_error()
  958. self.currentBuilder = Swift.String(currentInputCharacter!)
  959. self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
  960. return switchTo(.AttributeName)
  961. default:
  962. self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
  963. return reconsume(currentInputCharacter!, in: .AttributeName)
  964. }
  965. // 13.2.5.33 Attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
  966. case .AttributeName:
  967. // FIXME: When the user agent leaves the attribute name state (and before emitting the tag token, if appropriate),
  968. // the complete attribute's name must be compared to the other attributes on the same token;
  969. // if there is already an attribute on the token with the exact same name, then this is a duplicate-attribute
  970. // parse error and the new attribute must be removed from the token.
  971. // NOTE: If an attribute is so removed from a token, it, and the value that gets associated with it, if any,
  972. // are never subsequently used by the parser, and are therefore effectively discarded. Removing the attribute
  973. // in this way does not change its status as the "current attribute" for the purposes of the tokenizer, however.
  974. switch currentInputCharacter {
  975. case "\t", "\n", "\u{000C}", " ", "/", ">", nil:
  976. // FIXME: set name position
  977. self.currentAttribute!.localName = self.currentBuilder.takeString()
  978. return reconsume(currentInputCharacter, in: .AfterAttributeName)
  979. case "=":
  980. // FIXME: set name position
  981. self.currentAttribute!.localName = self.currentBuilder.takeString()
  982. return switchTo(.BeforeAttributeValue)
  983. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  984. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  985. return continueInCurrentState()
  986. case "\0":
  987. // FIXME: log_parse_error()
  988. self.currentBuilder.append("\u{FFFD}")
  989. return continueInCurrentState()
  990. default:
  991. self.currentBuilder.append(currentInputCharacter!)
  992. return continueInCurrentState()
  993. }
  994. // 13.2.5.34 After attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
  995. case .AfterAttributeName:
  996. switch currentInputCharacter {
  997. case "\t", "\n", "\u{000C}", " ":
  998. return continueInCurrentState()
  999. case "/":
  1000. self.finalizeCurrentAttribute(.SetName)
  1001. return switchTo(.SelfClosingStartTag)
  1002. case "=":
  1003. self.finalizeCurrentAttribute(.SetName)
  1004. return switchTo(.BeforeAttributeValue)
  1005. case ">":
  1006. self.finalizeCurrentAttribute(.SetName)
  1007. return switchToAndEmitCurrentToken(.Data)
  1008. case nil:
  1009. // FIXME: log_parse_error()
  1010. self.finalizeCurrentAttribute(.IgnoreString)
  1011. return emitEOF()
  1012. default:
  1013. self.finalizeCurrentAttribute(.SetName)
  1014. self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
  1015. return reconsume(currentInputCharacter!, in: .AttributeName)
  1016. }
  1017. // 13.2.5.35 Before attribute value state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
  1018. case .BeforeAttributeValue:
  1019. switch currentInputCharacter {
  1020. case "\t", "\n", "\u{000C}", " ":
  1021. return continueInCurrentState()
  1022. case "\"":
  1023. return switchTo(.AttributeValueDoubleQuoted)
  1024. case "'":
  1025. return switchTo(.AttributeValueSingleQuoted)
  1026. case ">":
  1027. // FIXME: log_parse_error()
  1028. self.finalizeCurrentAttribute(.IgnoreString)
  1029. return switchToAndEmitCurrentToken(.Data)
  1030. default:
  1031. return reconsume(currentInputCharacter, in: .AttributeValueUnquoted)
  1032. }
  1033. // 13.2.5.36 Attribute value (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
  1034. case .AttributeValueDoubleQuoted:
  1035. switch currentInputCharacter {
  1036. case "\"":
  1037. return switchTo(.AfterAttributeValueQuoted)
  1038. case "&":
  1039. self.returnState = .AttributeValueDoubleQuoted
  1040. return switchTo(.CharacterReference)
  1041. case "\0":
  1042. // FIXME: log_parse_error()
  1043. self.currentBuilder.append("\u{FFFD}")
  1044. return continueInCurrentState()
  1045. case nil:
  1046. // FIXME: log_parse_error()
  1047. self.finalizeCurrentAttribute(.IgnoreString)
  1048. return emitEOF()
  1049. default:
  1050. self.currentBuilder.append(currentInputCharacter!)
  1051. return continueInCurrentState()
  1052. }
  // 13.2.5.37 Attribute value (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state
  //
  // Accumulates the attribute value in `currentBuilder` until the closing "'".
  // Mirrors the double-quoted state, differing only in the terminating quote and
  // in the return state recorded for character references.
  case .AttributeValueSingleQuoted:
      switch currentInputCharacter {
      case "'":
          return switchTo(.AfterAttributeValueQuoted)
      case "&":
          // Remember where to come back to once the character reference is resolved.
          self.returnState = .AttributeValueSingleQuoted
          return switchTo(.CharacterReference)
      case "\0":
          // FIXME: log_parse_error()
          // NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
          self.currentBuilder.append("\u{FFFD}")
          return continueInCurrentState()
      case nil:
          // FIXME: log_parse_error()
          // Finalize the pending attribute the same way every other EOF branch of the
          // attribute states does, so the partially built value is not left behind.
          self.finalizeCurrentAttribute(.IgnoreString)
          return emitEOF()
      default:
          self.currentBuilder.append(currentInputCharacter!)
          return continueInCurrentState()
      }
  1072. // 13.2.5.38 Attribute value (unquoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state
  1073. case .AttributeValueUnquoted:
  1074. switch currentInputCharacter {
  1075. case "\t", "\n", "\u{000C}", " ":
  1076. self.finalizeCurrentAttribute(.SetValue)
  1077. return switchTo(.BeforeAttributeName)
  1078. case "&":
  1079. self.returnState = .AttributeValueUnquoted
  1080. return switchTo(.CharacterReference)
  1081. case ">":
  1082. self.finalizeCurrentAttribute(.SetValue)
  1083. return switchToAndEmitCurrentToken(.Data)
  1084. case "\0":
  1085. // FIXME: log_parse_error()
  1086. self.currentBuilder.append("\u{FFFD}")
  1087. return continueInCurrentState()
  1088. case "\"", "'", "<", "=", "`":
  1089. // FIXME: log_parse_error()
  1090. self.currentBuilder.append(currentInputCharacter!)
  1091. return continueInCurrentState()
  1092. case nil:
  1093. // FIXME: log_parse_error()
  1094. self.finalizeCurrentAttribute(.IgnoreString)
  1095. return emitEOF()
  1096. default:
  1097. self.currentBuilder.append(currentInputCharacter!)
  1098. return continueInCurrentState()
  1099. }
  1100. // 13.2.5.39 After attribute value (quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
  1101. case .AfterAttributeValueQuoted:
  1102. switch currentInputCharacter {
  1103. case "\t", "\n", "\u{000C}", " ":
  1104. self.finalizeCurrentAttribute(.SetValue)
  1105. return switchTo(.BeforeAttributeName)
  1106. case "/":
  1107. self.finalizeCurrentAttribute(.SetValue)
  1108. return switchTo(.SelfClosingStartTag)
  1109. case ">":
  1110. self.finalizeCurrentAttribute(.SetValue)
  1111. return switchToAndEmitCurrentToken(.Data)
  1112. case nil:
  1113. // FIXME: log_parse_error()
  1114. self.finalizeCurrentAttribute(.IgnoreString)
  1115. return emitEOF()
  1116. default:
  1117. // FIXME: log_parse_error()
  1118. self.finalizeCurrentAttribute(.SetValue)
  1119. return reconsume(currentInputCharacter!, in: .BeforeAttributeName)
  1120. }
  1121. // 13.2.5.40 Self-closing start tag state, https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
  1122. case .SelfClosingStartTag:
  1123. switch currentInputCharacter {
  1124. case ">":
  1125. self.currentToken.selfClosing = true
  1126. return switchToAndEmitCurrentToken(.Data)
  1127. case nil:
  1128. // FIXME: log_parse_error()
  1129. return emitEOF()
  1130. default:
  1131. // FIXME: log_parse_error()
  1132. return reconsume(currentInputCharacter!, in: .BeforeAttributeName)
  1133. }
  1134. // 13.2.5.41 Bogus comment state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
  1135. case .BogusComment:
  1136. switch currentInputCharacter {
  1137. case ">":
  1138. currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
  1139. return switchToAndEmitCurrentToken(.Data)
  1140. case nil:
  1141. currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
  1142. return emitCurrentTokenFollowedByEOF()
  1143. case "\0":
  1144. // FIXME: log_parse_error()
  1145. currentBuilder.append("\u{FFFD}")
  1146. return continueInCurrentState()
  1147. default:
  1148. self.currentBuilder.append(currentInputCharacter!)
  1149. return continueInCurrentState()
  1150. }
  // 13.2.5.42 Markup declaration open state, https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
  //
  // Reached after "<!". Nothing is consumed by the state machine itself here;
  // instead the upcoming characters are peeked at to decide between a comment,
  // a DOCTYPE, a CDATA section (currently treated as a bogus comment) or a
  // bogus comment.
  case .MarkupDeclarationOpen:
      dontConsumeNextInputCharacter()
      if let nextTwo = peekNext(count: 2), nextTwo == "--" {
          skip(2)
          // The spec creates an empty comment token here. The comment start state
          // emits the current token directly on ">", so without this a stale token
          // would be emitted for "<!-->".
          self.currentToken = HTMLToken(type: .Comment(data: ""))
          return switchTo(.CommentStart)
      } else if let nextSeven = peekNext(count: 7), nextSeven.uppercased() == "DOCTYPE" {
          // "DOCTYPE" is matched case-insensitively.
          skip(7)
          return switchTo(.DOCTYPE)
      } else if let nextSeven = peekNext(count: 7), nextSeven == "[CDATA[" {
          // Unlike "DOCTYPE", "[CDATA[" must match case-sensitively per the spec;
          // any other casing falls through to the generic bogus-comment branch below,
          // which keeps the characters exactly as written.
          skip(7)
          // FIXME: If there is an adjusted current node and it is not an element in the HTML namespace,
          //        then switch to the CDATA section state.
          // FIXME: log_parse_error()
          self.currentBuilder = "[CDATA["
          self.currentToken = HTMLToken(type: .Comment(data: ""))
          return switchTo(.BogusComment)
      } else {
          // FIXME: log_parse_error()
          self.currentToken = HTMLToken(type: .Comment(data: ""))
          return switchTo(.BogusComment)
      }
  1173. // 13.2.5.43 Comment start state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
  1174. case .CommentStart:
  1175. switch currentInputCharacter {
  1176. case "-":
  1177. return switchTo(.CommentStartDash)
  1178. case ">":
  1179. // FIXME: log_parse_error()
  1180. return switchToAndEmitCurrentToken(.Data)
  1181. default:
  1182. return reconsume(currentInputCharacter, in: .Comment)
  1183. }
  1184. // 13.2.5.44 Comment start dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
  1185. case .CommentStartDash:
  1186. switch currentInputCharacter {
  1187. case "-":
  1188. return switchTo(.CommentEnd)
  1189. case ">":
  1190. // FIXME: log_parse_error()
  1191. currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
  1192. return switchToAndEmitCurrentToken(.Data)
  1193. case nil:
  1194. // FIXME: log_parse_error()
  1195. currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
  1196. return emitCurrentTokenFollowedByEOF()
  1197. default:
  1198. currentBuilder.append("-")
  1199. return reconsume(currentInputCharacter, in: .Comment)
  1200. }
  1201. // 13.2.5.45 Comment state, https://html.spec.whatwg.org/multipage/parsing.html#comment-state
  1202. case .Comment:
  1203. switch currentInputCharacter {
  1204. case "<":
  1205. currentBuilder.append("<")
  1206. return switchTo(.CommentLessThanSign)
  1207. case "-":
  1208. return switchTo(.CommentEndDash)
  1209. case "\0":
  1210. // FIXME: log_parse_error()
  1211. currentBuilder.append("\u{FFFD}")
  1212. return continueInCurrentState()
  1213. case nil:
  1214. // FIXME: log_parse_error()
  1215. currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
  1216. return emitCurrentTokenFollowedByEOF()
  1217. default:
  1218. currentBuilder.append(currentInputCharacter!)
  1219. return continueInCurrentState()
  1220. }
  1221. // 13.2.5.46 Comment less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
  1222. case .CommentLessThanSign:
  1223. switch currentInputCharacter {
  1224. case "!":
  1225. currentBuilder.append(currentInputCharacter!)
  1226. return switchTo(.CommentLessThanSignBang)
  1227. case "<":
  1228. currentBuilder.append(currentInputCharacter!)
  1229. return continueInCurrentState()
  1230. default:
  1231. return reconsume(currentInputCharacter, in: .Comment)
  1232. }
  1233. // 13.2.5.47 Comment less-than sign bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
  1234. case .CommentLessThanSignBang:
  1235. switch currentInputCharacter {
  1236. case "-":
  1237. return switchTo(.CommentLessThanSignBangDash)
  1238. default:
  1239. return reconsume(currentInputCharacter, in: .Comment)
  1240. }
  1241. // 13.2.5.48 Comment less-than sign bang dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
  1242. case .CommentLessThanSignBangDash:
  1243. switch currentInputCharacter {
  1244. case "-":
  1245. return switchTo(.CommentLessThanSignBangDashDash)
  1246. default:
  1247. return reconsume(currentInputCharacter, in: .CommentEndDash)
  1248. }
  1249. // 13.2.5.49 Comment less-than sign bang dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
  1250. case .CommentLessThanSignBangDashDash:
  1251. switch currentInputCharacter {
  1252. case ">", nil:
  1253. return reconsume(currentInputCharacter, in: .CommentEnd)
  1254. default:
  1255. // FIXME: log_parse_error()
  1256. return reconsume(currentInputCharacter, in: .CommentEnd)
  1257. }
  1258. // 13.2.5.50 Comment end dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
  1259. case .CommentEndDash:
  1260. switch currentInputCharacter {
  1261. case "-":
  1262. return switchTo(.CommentEnd)
  1263. case nil:
  1264. // FIXME: log_parse_error()
  1265. currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
  1266. return emitCurrentTokenFollowedByEOF()
  1267. default:
  1268. currentBuilder.append("-")
  1269. return reconsume(currentInputCharacter, in: .Comment)
  1270. }
  1271. // 13.2.5.51 Comment end state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
  1272. case .CommentEnd:
  1273. switch currentInputCharacter {
  1274. case ">":
  1275. currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
  1276. return switchToAndEmitCurrentToken(.Data)
  1277. case "!":
  1278. return switchTo(.CommentEndBang)
  1279. case "-":
  1280. currentBuilder.append("-")
  1281. return continueInCurrentState()
  1282. case nil:
  1283. // FIXME: log_parse_error()
  1284. currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
  1285. return emitCurrentTokenFollowedByEOF()
  1286. default:
  1287. currentBuilder.append("--")
  1288. return reconsume(currentInputCharacter, in: .Comment)
  1289. }
  1290. // 13.2.5.52 Comment end bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
  1291. case .CommentEndBang:
  1292. switch currentInputCharacter {
  1293. case "-":
  1294. currentBuilder.append("--!")
  1295. return switchTo(.CommentEndDash)
  1296. case ">":
  1297. // FIXME: log_parse_error()
  1298. currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
  1299. return switchToAndEmitCurrentToken(.Data)
  1300. case nil:
  1301. // FIXME: log_parse_error()
  1302. currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
  1303. return emitCurrentTokenFollowedByEOF()
  1304. default:
  1305. currentBuilder.append("--!")
  1306. return reconsume(currentInputCharacter, in: .Comment)
  1307. }
  1308. // 13.2.5.53 DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
  1309. case .DOCTYPE:
  1310. switch currentInputCharacter {
  1311. case "\t", "\n", "\u{000C}", " ":
  1312. return switchTo(.BeforeDOCTYPEName)
  1313. case ">":
  1314. return reconsume(currentInputCharacter, in: .BeforeDOCTYPEName)
  1315. case nil:
  1316. // FIXME: log_parse_error()
  1317. currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: true))
  1318. return emitCurrentTokenFollowedByEOF()
  1319. default:
  1320. // FIXME: log_parse_error()
  1321. return reconsume(currentInputCharacter!, in: .BeforeDOCTYPEName)
  1322. }
  1323. // 13.2.5.54 Before DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
  1324. case .BeforeDOCTYPEName:
  1325. switch currentInputCharacter {
  1326. case "\t", "\n", "\u{000C}", " ":
  1327. return continueInCurrentState()
  1328. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  1329. precondition(self.currentBuilder.isEmpty)
  1330. self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: false))
  1331. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  1332. return switchTo(.DOCTYPEName)
  1333. case "\0":
  1334. // FIXME: log_parse_error()
  1335. precondition(self.currentBuilder.isEmpty)
  1336. self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: false))
  1337. self.currentBuilder.append("\u{FFFD}")
  1338. return switchTo(.DOCTYPEName)
  1339. case ">":
  1340. // FIXME: log_parse_error()
  1341. self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: true))
  1342. return switchToAndEmitCurrentToken(.Data)
  1343. case nil:
  1344. // FIXME: log_parse_error()
  1345. self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: true))
  1346. return emitCurrentTokenFollowedByEOF()
  1347. default:
  1348. // FIXME: log_parse_error()
  1349. precondition(self.currentBuilder.isEmpty)
  1350. self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: false))
  1351. self.currentBuilder.append(currentInputCharacter!)
  1352. return switchTo(.DOCTYPEName)
  1353. }
  1354. // 13.2.5.55 DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
  1355. case .DOCTYPEName:
  1356. switch currentInputCharacter {
  1357. case "\t", "\n", "\u{000C}", " ":
  1358. self.currentToken.name = self.currentBuilder.takeString()
  1359. return switchTo(.AfterDOCTYPEName)
  1360. case ">":
  1361. self.currentToken.name = self.currentBuilder.takeString()
  1362. return switchToAndEmitCurrentToken(.Data)
  1363. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  1364. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  1365. return continueInCurrentState()
  1366. case "\0":
  1367. // FIXME: log_parse_error()
  1368. self.currentBuilder.append("\u{FFFD}")
  1369. return continueInCurrentState()
  1370. case nil:
  1371. // FIXME: log_parse_error()
  1372. self.currentToken.forceQuirks = true
  1373. self.currentToken.name = self.currentBuilder.takeString()
  1374. return emitCurrentTokenFollowedByEOF()
  1375. default:
  1376. self.currentBuilder.append(currentInputCharacter!)
  1377. return continueInCurrentState()
  1378. }
  1379. // 13.2.5.56 After DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
  1380. case .AfterDOCTYPEName:
  1381. precondition(self.currentBuilder.isEmpty)
  1382. precondition(self.currentToken.name != nil)
  1383. switch currentInputCharacter {
  1384. case "\t", "\n", "\u{000C}", " ":
  1385. return continueInCurrentState()
  1386. case ">":
  1387. return switchToAndEmitCurrentToken(.Data)
  1388. case nil:
  1389. // FIXME: log_parse_error()
  1390. self.currentToken.forceQuirks = true
  1391. return emitCurrentTokenFollowedByEOF()
  1392. default:
  1393. if "pP".contains(currentInputCharacter!), peekNext(count: 5)?.uppercased() == "UBLIC" {
  1394. skip(5)
  1395. return switchTo(.AfterDOCTYPEPublicKeyword)
  1396. }
  1397. if "sS".contains(currentInputCharacter!), peekNext(count: 5)?.uppercased() == "YSTEM" {
  1398. skip(5)
  1399. return switchTo(.AfterDOCTYPESystemKeyword)
  1400. }
  1401. // FIXME: log_parse_error()
  1402. self.currentToken.forceQuirks = true
  1403. return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
  1404. }
  1405. // 13.2.5.57 After DOCTYPE public keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
  1406. case .AfterDOCTYPEPublicKeyword:
  1407. switch currentInputCharacter {
  1408. case "\t", "\n", "\u{000C}", " ":
  1409. return switchTo(.BeforeDOCTYPEPublicIdentifier)
  1410. case "\"":
  1411. // FIXME: log_parse_error()
  1412. self.currentToken.publicIdentifier = ""
  1413. return switchTo(.DOCTYPEPublicIdentifierDoubleQuoted)
  1414. case "'":
  1415. self.currentToken.publicIdentifier = ""
  1416. return switchTo(.DOCTYPEPublicIdentifierSingleQuoted)
  1417. case ">":
  1418. // FIXME: log_parse_error()
  1419. self.currentToken.forceQuirks = true
  1420. return switchToAndEmitCurrentToken(.Data)
  1421. case nil:
  1422. // FIXME: log_parse_error()
  1423. self.currentToken.forceQuirks = true
  1424. return emitCurrentTokenFollowedByEOF()
  1425. default:
  1426. // FIXME: log_parse_error()
  1427. self.currentToken.forceQuirks = true
  1428. return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
  1429. }
  1430. // 13.2.5.58 Before DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
  1431. case .BeforeDOCTYPEPublicIdentifier:
  1432. switch currentInputCharacter {
  1433. case "\t", "\n", "\u{000C}", " ":
  1434. return continueInCurrentState()
  1435. case "\"":
  1436. self.currentToken.publicIdentifier = ""
  1437. return switchTo(.DOCTYPEPublicIdentifierDoubleQuoted)
  1438. case "'":
  1439. self.currentToken.publicIdentifier = ""
  1440. return switchTo(.DOCTYPEPublicIdentifierSingleQuoted)
  1441. case ">":
  1442. // FIXME: log_parse_error()
  1443. self.currentToken.forceQuirks = true
  1444. return switchToAndEmitCurrentToken(.Data)
  1445. case nil:
  1446. // FIXME: log_parse_error()
  1447. self.currentToken.forceQuirks = true
  1448. return emitCurrentTokenFollowedByEOF()
  1449. default:
  1450. // FIXME: log_parse_error()
  1451. self.currentToken.forceQuirks = true
  1452. return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
  1453. }
  1454. // 13.2.5.59 DOCTYPE public identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-double-quoted-state
  1455. case .DOCTYPEPublicIdentifierDoubleQuoted:
  1456. switch currentInputCharacter {
  1457. case "\"":
  1458. self.currentToken.publicIdentifier = self.currentBuilder.takeString()
  1459. return switchTo(.AfterDOCTYPEPublicIdentifier)
  1460. case "\0":
  1461. // FIXME: log_parse_error()
  1462. self.currentBuilder.append("\u{FFFD}")
  1463. return continueInCurrentState()
  1464. case ">":
  1465. // FIXME: log_parse_error()
  1466. self.currentToken.publicIdentifier = self.currentBuilder.takeString()
  1467. self.currentToken.forceQuirks = true
  1468. return switchToAndEmitCurrentToken(.Data)
  1469. case nil:
  1470. // FIXME: log_parse_error()
  1471. self.currentToken.forceQuirks = true
  1472. return emitCurrentTokenFollowedByEOF()
  1473. default:
  1474. self.currentBuilder.append(currentInputCharacter!)
  1475. return continueInCurrentState()
  1476. }
  1477. // 13.2.5.60 DOCTYPE public identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-single-quoted-state
  1478. case .DOCTYPEPublicIdentifierSingleQuoted:
  1479. switch currentInputCharacter {
  1480. case "'":
  1481. self.currentToken.publicIdentifier = self.currentBuilder.takeString()
  1482. return switchTo(.AfterDOCTYPEPublicIdentifier)
  1483. case "\0":
  1484. // FIXME: log_parse_error()
  1485. self.currentBuilder.append("\u{FFFD}")
  1486. return continueInCurrentState()
  1487. case ">":
  1488. // FIXME: log_parse_error()
  1489. self.currentToken.publicIdentifier = self.currentBuilder.takeString()
  1490. self.currentToken.forceQuirks = true
  1491. return switchToAndEmitCurrentToken(.Data)
  1492. case nil:
  1493. // FIXME: log_parse_error()
  1494. self.currentToken.forceQuirks = true
  1495. return emitCurrentTokenFollowedByEOF()
  1496. default:
  1497. self.currentBuilder.append(currentInputCharacter!)
  1498. return continueInCurrentState()
  1499. }
  1500. // 13.2.5.61 After DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
  1501. case .AfterDOCTYPEPublicIdentifier:
  1502. switch currentInputCharacter {
  1503. case "\t", "\n", "\u{000C}", " ":
  1504. return switchTo(.BetweenDOCTYPEPublicAndSystemIdentifiers)
  1505. case ">":
  1506. return switchToAndEmitCurrentToken(.Data)
  1507. case "\"":
  1508. // FIXME: log_parse_error()
  1509. self.currentToken.systemIdentifier = ""
  1510. return switchTo(.DOCTYPESystemIdentifierDoubleQuoted)
  1511. case "'":
  1512. // FIXME: log_parse_error()
  1513. self.currentToken.systemIdentifier = ""
  1514. return switchTo(.DOCTYPESystemIdentifierSingleQuoted)
  1515. case nil:
  1516. // FIXME: log_parse_error()
  1517. self.currentToken.forceQuirks = true
  1518. return emitCurrentTokenFollowedByEOF()
  1519. default:
  1520. // FIXME: log_parse_error()
  1521. self.currentToken.forceQuirks = true
  1522. return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
  1523. }
  1524. // 13.2.5.62 Between DOCTYPE public and system identifiers state, https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
  1525. case .BetweenDOCTYPEPublicAndSystemIdentifiers:
  1526. switch currentInputCharacter {
  1527. case "\t", "\n", "\u{000C}", " ":
  1528. return continueInCurrentState()
  1529. case ">":
  1530. return switchToAndEmitCurrentToken(.Data)
  1531. case "\"":
  1532. self.currentToken.systemIdentifier = ""
  1533. return switchTo(.DOCTYPESystemIdentifierDoubleQuoted)
  1534. case "'":
  1535. self.currentToken.systemIdentifier = ""
  1536. return switchTo(.DOCTYPESystemIdentifierSingleQuoted)
  1537. case nil:
  1538. // FIXME: log_parse_error()
  1539. self.currentToken.forceQuirks = true
  1540. return emitCurrentTokenFollowedByEOF()
  1541. default:
  1542. // FIXME: log_parse_error()
  1543. self.currentToken.forceQuirks = true
  1544. return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
  1545. }
  1546. // 13.2.5.63 After DOCTYPE system keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
  1547. case .AfterDOCTYPESystemKeyword:
  1548. switch currentInputCharacter {
  1549. case "\t", "\n", "\u{000C}", " ":
  1550. return switchTo(.BeforeDOCTYPESystemIdentifier)
  1551. case "\"":
  1552. // FIXME: log_parse_error()
  1553. self.currentToken.systemIdentifier = ""
  1554. return switchTo(.DOCTYPESystemIdentifierDoubleQuoted)
  1555. case "'":
  1556. // FIXME: log_parse_error()
  1557. self.currentToken.systemIdentifier = ""
  1558. return switchTo(.DOCTYPESystemIdentifierSingleQuoted)
  1559. case ">":
  1560. // FIXME: log_parse_error()
  1561. self.currentToken.forceQuirks = true
  1562. return switchToAndEmitCurrentToken(.Data)
  1563. case nil:
  1564. // FIXME: log_parse_error()
  1565. self.currentToken.forceQuirks = true
  1566. return emitCurrentTokenFollowedByEOF()
  1567. default:
  1568. // FIXME: log_parse_error()
  1569. self.currentToken.forceQuirks = true
  1570. return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
  1571. }
  1572. // 13.2.5.64 Before DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
  1573. case .BeforeDOCTYPESystemIdentifier:
  1574. switch currentInputCharacter {
  1575. case "\t", "\n", "\u{000C}", " ":
  1576. return continueInCurrentState()
  1577. case "\"":
  1578. self.currentToken.systemIdentifier = ""
  1579. return switchTo(.DOCTYPESystemIdentifierDoubleQuoted)
  1580. case "'":
  1581. self.currentToken.systemIdentifier = ""
  1582. return switchTo(.DOCTYPESystemIdentifierSingleQuoted)
  1583. case ">":
  1584. // FIXME: log_parse_error()
  1585. self.currentToken.forceQuirks = true
  1586. return switchToAndEmitCurrentToken(.Data)
  1587. case nil:
  1588. // FIXME: log_parse_error()
  1589. self.currentToken.forceQuirks = true
  1590. return emitCurrentTokenFollowedByEOF()
  1591. default:
  1592. // FIXME: log_parse_error()
  1593. self.currentToken.forceQuirks = true
  1594. return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
  1595. }
  1596. // 13.2.5.65 DOCTYPE system identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-double-quoted-state
  1597. case .DOCTYPESystemIdentifierDoubleQuoted:
  1598. switch currentInputCharacter {
  1599. case "\"":
  1600. self.currentToken.systemIdentifier = self.currentBuilder.takeString()
  1601. return switchTo(.AfterDOCTYPESystemIdentifier)
  1602. case "\0":
  1603. // FIXME: log_parse_error()
  1604. self.currentBuilder.append("\u{FFFD}")
  1605. return continueInCurrentState()
  1606. case ">":
  1607. // FIXME: log_parse_error()
  1608. self.currentToken.systemIdentifier = self.currentBuilder.takeString()
  1609. self.currentToken.forceQuirks = true
  1610. return switchToAndEmitCurrentToken(.Data)
  1611. case nil:
  1612. // FIXME: log_parse_error()
  1613. self.currentToken.forceQuirks = true
  1614. return emitCurrentTokenFollowedByEOF()
  1615. default:
  1616. self.currentBuilder.append(currentInputCharacter!)
  1617. return continueInCurrentState()
  1618. }
  // 13.2.5.66 DOCTYPE system identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-single-quoted-state
  // Collects the characters of a single-quoted system identifier in `currentBuilder` and
  // moves them into the current DOCTYPE token once the identifier ends.
  case .DOCTYPESystemIdentifierSingleQuoted:
      switch currentInputCharacter {
      case "'":
          // Closing quote: hand the accumulated text to the token, exactly like the
          // double-quoted state does. Without this the identifier is dropped and the
          // builder keeps its contents.
          self.currentToken.systemIdentifier = self.currentBuilder.takeString()
          return switchTo(.AfterDOCTYPESystemIdentifier)
      case "\0":
          // FIXME: log_parse_error()
          // NULL is replaced by U+FFFD REPLACEMENT CHARACTER.
          self.currentBuilder.append("\u{FFFD}")
          return continueInCurrentState()
      case ">":
          // FIXME: log_parse_error()
          // Abrupt end of the identifier: keep what was collected, but force quirks.
          self.currentToken.systemIdentifier = self.currentBuilder.takeString()
          self.currentToken.forceQuirks = true
          return switchToAndEmitCurrentToken(.Data)
      case nil:
          // FIXME: log_parse_error()
          // End of input inside the DOCTYPE.
          self.currentToken.forceQuirks = true
          return emitCurrentTokenFollowedByEOF()
      default:
          self.currentBuilder.append(currentInputCharacter!)
          return continueInCurrentState()
      }
  1641. // 13.2.5.67 After DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
  1642. case .AfterDOCTYPESystemIdentifier:
  1643. switch currentInputCharacter {
  1644. case "\t", "\n", "\u{000C}", " ":
  1645. return continueInCurrentState()
  1646. case ">":
  1647. return switchToAndEmitCurrentToken(.Data)
  1648. case nil:
  1649. // FIXME: log_parse_error()
  1650. self.currentToken.forceQuirks = true
  1651. return emitCurrentTokenFollowedByEOF()
  1652. default:
  1653. // FIXME: log_parse_error()
  1654. // NOTE: This does not set the current DOCTYPE token's force-quirks flag to on.
  1655. return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
  1656. }
  1657. // 13.2.5.68 Bogus DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
  1658. case .BogusDOCTYPE:
  1659. switch currentInputCharacter {
  1660. case ">":
  1661. return switchToAndEmitCurrentToken(.Data)
  1662. case "\0":
  1663. // FIXME: log_parse_error()
  1664. return continueInCurrentState()
  1665. case nil:
  1666. return emitCurrentTokenFollowedByEOF()
  1667. default:
  1668. return continueInCurrentState()
  1669. }
  1670. // 13.2.5.69 CDATA section state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
  1671. case .CDATASection:
  1672. switch currentInputCharacter {
  1673. case "]":
  1674. return switchTo(.CDATASectionBracket)
  1675. case nil:
  1676. // FIXME: log_parse_error()
  1677. return emitEOF()
  1678. default:
  1679. // NOTE: U+0000 NULL characters are handled in the tree construction stage,
  1680. // as part of the in foreign content insertion mode, which is the only place where CDATA sections can appear.
  1681. return emitCharacter(currentInputCharacter!)
  1682. }
  1683. // 13.2.5.70 CDATA section bracket state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
  1684. case .CDATASectionBracket:
  1685. switch currentInputCharacter {
  1686. case "]":
  1687. return switchTo(.CDATASectionEnd)
  1688. default:
  1689. return emitCharacterAndReconsume("]", in: .CDATASection, currentInputCharacter: currentInputCharacter)
  1690. }
  1691. // 13.2.5.71 CDATA section end state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
  1692. case .CDATASectionEnd:
  1693. switch currentInputCharacter {
  1694. case "]":
  1695. return emitCharacter("]")
  1696. case ">":
  1697. return switchTo(.Data)
  1698. default:
  1699. queuedTokens.append(HTMLToken(type: .Character(codePoint: "]")))
  1700. return emitCharacterAndReconsume("]", in: .CDATASection, currentInputCharacter: currentInputCharacter)
  1701. }
  1702. // 13.2.5.72 Character reference state, https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
  1703. case .CharacterReference:
  1704. self.temporaryBuffer = "&"
  1705. switch currentInputCharacter {
  1706. case let c? where HTMLTokenizer.asciiAlphanumeric.contains(c.unicodeScalars.first!):
  1707. return reconsume(currentInputCharacter!, in: .NamedCharacterReference)
  1708. case "#":
  1709. self.temporaryBuffer.append(currentInputCharacter!)
  1710. return switchTo(.NumericCharacterReference)
  1711. default:
  1712. self.flushCodepointsConsumedAsACharacterReference()
  1713. return reconsume(currentInputCharacter, in: self.returnState)
  1714. }
  // 13.2.5.73 Named character reference state, https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
  // Tries to match a named entity against the input starting at `previousCursor`. On a match
  // the entity's code points replace the temporary buffer and are flushed; otherwise the
  // buffer is flushed unchanged and the current character is reconsumed in the ambiguous
  // ampersand state.
  case .NamedCharacterReference:
      // `withUTF8` is mutating, hence `var`.
      var subString = self.input[self.previousCursor...]
      let entityMatch = subString.withUTF8 { utf8 in
          return Web.HTML.match_entity_for_named_character_reference(AK.StringView(utf8.baseAddress!, utf8.count))
      }
      if entityMatch.hasValue {
          let entity = entityMatch.value!.entity
          skip(entity.length())
          // FIXME: Iterate over the entity's code points and add them instead of creating a string
          self.temporaryBuffer.append(Swift.String(akStringView: entity)!)
          // Inside an attribute, a match without a trailing ';' that is followed by '=' or an
          // ASCII alphanumeric is left as literal text.
          if self.consumedAsPartOfAnAttribute(), !entity.endsWith(";") {
              if let peeked = peekCodePoint(), peeked == "=" || HTMLTokenizer.asciiAlphanumeric.contains(peeked.unicodeScalars.first!) {
                  self.flushCodepointsConsumedAsACharacterReference()
                  return switchTo(self.returnState)
              }
          }
          if !entity.endsWith(";") {
              // FIXME: log_parse_error()
          }
          self.temporaryBuffer = ""
          // FIXME: This AK::Vector<u32, 2> should be CxxConvertibleToContainer, but https://github.com/swiftlang/swift/issues/77607
          let codePoints = entityMatch.value!.code_points
          // Half-open range: valid indices are 0 ..< size(). A closed range would read one
          // element past the end of the vector.
          for i in 0..<codePoints.size() {
              self.temporaryBuffer.append(Character(Unicode.Scalar(codePoints[i])!))
          }
          self.flushCodepointsConsumedAsACharacterReference()
          return switchTo(self.returnState)
      }
      self.flushCodepointsConsumedAsACharacterReference()
      return reconsume(currentInputCharacter, in: .AmbiguousAmpersand)
  1746. // 13.2.5.74 Ambiguous ampersand state, https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
  1747. case .AmbiguousAmpersand:
  1748. switch currentInputCharacter {
  1749. case let c? where HTMLTokenizer.asciiAlphanumeric.contains(c.unicodeScalars.first!):
  1750. if self.consumedAsPartOfAnAttribute() {
  1751. self.currentBuilder.append(currentInputCharacter!)
  1752. return continueInCurrentState()
  1753. }
  1754. return emitCharacter(currentInputCharacter!)
  1755. case ";":
  1756. // FIXME: log_parse_error()
  1757. return reconsume(currentInputCharacter!, in: self.returnState)
  1758. default:
  1759. return reconsume(currentInputCharacter, in: self.returnState)
  1760. }
  1761. // 13.2.5.75 Numeric character reference state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
  1762. case .NumericCharacterReference:
  1763. self.characterReferenceCode = 0
  1764. switch currentInputCharacter {
  1765. case "x", "X":
  1766. self.temporaryBuffer.append(currentInputCharacter!)
  1767. return switchTo(.HexadecimalCharacterReferenceStart)
  1768. default:
  1769. return reconsume(currentInputCharacter, in: .DecimalCharacterReferenceStart)
  1770. }
  1771. // 13.2.5.76 Hexadecimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
  1772. case .HexadecimalCharacterReferenceStart:
  1773. switch currentInputCharacter {
  1774. case let c? where HTMLTokenizer.asciiHexDigit.contains(c.unicodeScalars.first!):
  1775. return reconsume(currentInputCharacter!, in: .HexadecimalCharacterReference)
  1776. default:
  1777. // FIXME: log_parse_error()
  1778. self.flushCodepointsConsumedAsACharacterReference()
  1779. return reconsume(currentInputCharacter, in: self.returnState)
  1780. }
  1781. // 13.2.5.77 Decimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
  1782. case .DecimalCharacterReferenceStart:
  1783. switch currentInputCharacter {
  1784. case let c? where HTMLTokenizer.asciiDigit.contains(c.unicodeScalars.first!):
  1785. return reconsume(currentInputCharacter!, in: .DecimalCharacterReference)
  1786. default:
  1787. // FIXME: log_parse_error()
  1788. self.flushCodepointsConsumedAsACharacterReference()
  1789. return reconsume(currentInputCharacter, in: self.returnState)
  1790. }
  // 13.2.5.78 Hexadecimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
  // Folds each hexadecimal digit into `characterReferenceCode`.
  //
  // The running value is saturated at 0x110000 (one past the largest code point). Swift traps
  // on Int overflow, so an unbounded digit run in the input would otherwise crash here. The
  // numeric character reference end state replaces every value above 0x10FFFF with U+FFFD,
  // so saturating does not change the result.
  case .HexadecimalCharacterReference:
      switch currentInputCharacter {
      case let c? where HTMLTokenizer.asciiDigit.contains(c.unicodeScalars.first!):
          // '0'...'9' -> 0...9
          self.characterReferenceCode = min(self.characterReferenceCode * 16 + Int(c.asciiValue! - 0x30), 0x110000)
          return continueInCurrentState()
      case let c? where HTMLTokenizer.asciiUpperHexDigit.contains(c.unicodeScalars.first!):
          // 'A'...'F' -> 10...15
          self.characterReferenceCode = min(self.characterReferenceCode * 16 + Int(c.asciiValue! - 0x37), 0x110000)
          return continueInCurrentState()
      case let c? where HTMLTokenizer.asciiLowerHexDigit.contains(c.unicodeScalars.first!):
          // 'a'...'f' -> 10...15
          self.characterReferenceCode = min(self.characterReferenceCode * 16 + Int(c.asciiValue! - 0x57), 0x110000)
          return continueInCurrentState()
      case ";":
          return switchTo(.NumericCharacterReferenceEnd)
      default:
          // FIXME: log_parse_error()
          // Missing ';' after the digits: finish the reference with the current character
          // reconsumed.
          return reconsume(currentInputCharacter, in: .NumericCharacterReferenceEnd)
      }
  // 13.2.5.79 Decimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
  // Folds each decimal digit into `characterReferenceCode`.
  //
  // The running value is saturated at 0x110000 (one past the largest code point). Swift traps
  // on Int overflow, so an unbounded digit run in the input would otherwise crash here. The
  // numeric character reference end state replaces every value above 0x10FFFF with U+FFFD,
  // so saturating does not change the result.
  case .DecimalCharacterReference:
      switch currentInputCharacter {
      case let c? where HTMLTokenizer.asciiDigit.contains(c.unicodeScalars.first!):
          // '0'...'9' -> 0...9
          self.characterReferenceCode = min(self.characterReferenceCode * 10 + Int(c.asciiValue! - 0x30), 0x110000)
          return continueInCurrentState()
      case ";":
          return switchTo(.NumericCharacterReferenceEnd)
      default:
          // FIXME: log_parse_error()
          // Missing ';' after the digits: finish the reference with the current character
          // reconsumed.
          return reconsume(currentInputCharacter, in: .NumericCharacterReferenceEnd)
      }
  // 13.2.5.80 Numeric character reference end state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
  // Maps the accumulated `characterReferenceCode` to the scalar that is actually emitted,
  // stores it in the temporary buffer, flushes it and returns to `returnState`.
  case .NumericCharacterReferenceEnd:
      dontConsumeNextInputCharacter()
      let codePoint: UnicodeScalar =
          switch self.characterReferenceCode {
          case 0x00:
              // FIXME: log_parse_error()
              // Null character reference.
              UnicodeScalar(0xFFFD)!
          case let c where c > 0x10FFFF:
              // FIXME: log_parse_error()
              // Outside the Unicode range.
              UnicodeScalar(0xFFFD)!
          case 0xD800...0xDFFF:
              // FIXME: log_parse_error()
              // Surrogate. Only this range may be replaced; testing merely whether the value
              // fits in a UTF-16 code unit would turn every reference up to 0xFFFF into U+FFFD.
              // This case must stay ahead of the ones below, which force-unwrap
              // UnicodeScalar(c) and would trap on a surrogate.
              UnicodeScalar(0xFFFD)!
          case let c where UnicodeScalar(c)! == "\u{000D}" || HTMLTokenizer.controlNotAsciiWhitespace.contains(UnicodeScalar(c)!):
              // FIXME: log_parse_error()
              // Control character: use the replacement from the mapping table when there is
              // one, otherwise keep the code point as is.
              HTMLTokenizer.characterReferenceControlCodeMapping[c] ?? UnicodeScalar(c)!
          case let c where HTMLTokenizer.nonCharacter.contains(UnicodeScalar(c)!):
              // FIXME: log_parse_error()
              // Noncharacters are a parse error but are passed through unchanged.
              UnicodeScalar(c)!
          default:
              UnicodeScalar(self.characterReferenceCode)!
          }
      self.temporaryBuffer = Swift.String(Character(codePoint))
      self.flushCodepointsConsumedAsACharacterReference()
      return switchTo(self.returnState)
  1851. }
  1852. }
  1853. }