HTMLTokenizer.swift 55 KB


  1. /*
  2. * Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. import AK
  7. import Collections
  8. import Foundation
  9. @_exported import WebCxx
  extension Swift.String {
      /// Creates a string by decoding `decoding` with `Web.HTML.decode_to_utf8`,
      /// passing `as` through as the second argument (callers in this file pass an
      /// encoding name such as "UTF-8").
      ///
      /// Initialization fails (produces `nil`) when the decoder reports no value.
      public init?(decoding: AK.StringView, as: AK.StringView) {
          let decodeResult = Web.HTML.decode_to_utf8(decoding, `as`)
          guard decodeResult.hasValue else {
              return nil
          }
          self.init(akString: decodeResult.value!)
      }

      /// Returns the current contents and leaves this string empty.
      public mutating func takeString() -> Swift.String {
          var taken = Swift.String()
          swap(&taken, &self)
          return taken
      }
  }
  25. public class HTMLTokenizer {
  /// States of the tokenizer state machine. The names follow the tokenization
  /// states of the HTML specification (https://html.spec.whatwg.org/multipage/parsing.html#tokenization).
  public enum State {
      // Text content states
      case Data, RCDATA, RAWTEXT, ScriptData, PLAINTEXT

      // Tag states
      case TagOpen, EndTagOpen, TagName

      // RCDATA end tag detection
      case RCDATALessThanSign, RCDATAEndTagOpen, RCDATAEndTagName

      // RAWTEXT end tag detection
      case RAWTEXTLessThanSign, RAWTEXTEndTagOpen, RAWTEXTEndTagName

      // Script data end tag detection
      case ScriptDataLessThanSign, ScriptDataEndTagOpen, ScriptDataEndTagName

      // Script data escaping
      case ScriptDataEscapeStart, ScriptDataEscapeStartDash
      case ScriptDataEscaped, ScriptDataEscapedDash, ScriptDataEscapedDashDash
      case ScriptDataEscapedLessThanSign, ScriptDataEscapedEndTagOpen, ScriptDataEscapedEndTagName

      // Script data double escaping
      case ScriptDataDoubleEscapeStart
      case ScriptDataDoubleEscaped, ScriptDataDoubleEscapedDash, ScriptDataDoubleEscapedDashDash
      case ScriptDataDoubleEscapedLessThanSign, ScriptDataDoubleEscapeEnd

      // Attributes
      case BeforeAttributeName, AttributeName, AfterAttributeName
      case BeforeAttributeValue
      case AttributeValueDoubleQuoted, AttributeValueSingleQuoted, AttributeValueUnquoted
      case AfterAttributeValueQuoted
      case SelfClosingStartTag

      // Comments and markup declarations
      case BogusComment, MarkupDeclarationOpen
      case CommentStart, CommentStartDash, Comment
      case CommentLessThanSign, CommentLessThanSignBang
      case CommentLessThanSignBangDash, CommentLessThanSignBangDashDash
      case CommentEndDash, CommentEnd, CommentEndBang

      // DOCTYPE
      case DOCTYPE, BeforeDOCTYPEName, DOCTYPEName, AfterDOCTYPEName
      case AfterDOCTYPEPublicKeyword, BeforeDOCTYPEPublicIdentifier
      case DOCTYPEPublicIdentifierDoubleQuoted, DOCTYPEPublicIdentifierSingleQuoted
      case AfterDOCTYPEPublicIdentifier, BetweenDOCTYPEPublicAndSystemIdentifiers
      case AfterDOCTYPESystemKeyword, BeforeDOCTYPESystemIdentifier
      case DOCTYPESystemIdentifierDoubleQuoted, DOCTYPESystemIdentifierSingleQuoted
      case AfterDOCTYPESystemIdentifier, BogusDOCTYPE

      // CDATA sections
      case CDATASection, CDATASectionBracket, CDATASectionEnd

      // Character references
      case CharacterReference, NamedCharacterReference, AmbiguousAmpersand
      case NumericCharacterReference
      case HexadecimalCharacterReferenceStart, DecimalCharacterReferenceStart
      case HexadecimalCharacterReference, DecimalCharacterReference
      case NumericCharacterReferenceEnd
  }
  /// Text being tokenized; empty unless an initializer supplies decoded input.
  private var input: Swift.String = ""
  /// Index of the next character to consume from `input`.
  private var cursor: Swift.String.Index
  /// Index that `restoreCursorToPrevious()` rewinds `cursor` to.
  private var previousCursor: Swift.String.Index
  /// Current state of the tokenizer state machine.
  public private(set) var state: State = .Data
  /// State that `switchToReturnState()` / `reconsumeInReturnState(_:)` go back to.
  private var returnState: State = .Data
  /// Token currently under construction.
  private var currentToken: HTMLToken = HTMLToken()
  /// Tokens waiting to be handed out by `nextToken`, in FIFO order.
  private var queuedTokens: Deque<HTMLToken> = Deque()
  /// Scratch text for the name or value currently being accumulated.
  private var currentBuilder: Swift.String = ""
  /// Scratch buffer used by the end-tag-name, script-escape and character-reference handling.
  private var temporaryBuffer: Swift.String = ""
  /// Tag name that `isAppropriateEndTagToken(_:)` compares end tags against.
  private var lastStartTagName: Swift.String?
  /// Attributes collected for the current tag token; `nil` when no tag is in progress.
  private var currentTokensAttributes: [HTMLToken.Attribute]?
  /// Attribute currently being built, if any.
  private var currentAttribute: HTMLToken.Attribute?
  /// Set by `abort()`.
  private var aborted: Bool = false
  /// Set once `emitEOF()` has produced the end-of-file token.
  private var hasEmittedEOF: Bool = false
  // These sets are never mutated, so they are constants rather than mutable
  // shared static state.

  // https://infra.spec.whatwg.org/#ascii-upper-alpha
  static private let asciiUpperAlpha = CharacterSet(charactersIn: "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
  // https://infra.spec.whatwg.org/#ascii-lower-alpha
  static private let asciiLowerAlpha = CharacterSet(charactersIn: "abcdefghijklmnopqrstuvwxyz")
  // https://infra.spec.whatwg.org/#ascii-alpha
  static private let asciiAlpha = asciiUpperAlpha.union(asciiLowerAlpha)
  /// Creates a tokenizer over the default (empty) input, with both cursors at its start.
  public init() {
      let start = self.input.startIndex
      self.cursor = start
      self.previousCursor = start
  }
  /// Creates a tokenizer over `input` decoded with `encoding`.
  ///
  /// Fails (returns `nil`) when `Swift.String(decoding:as:)` cannot decode the input.
  public init?(input: AK.StringView, encoding: AK.StringView) {
      guard let decoded = Swift.String(decoding: input, as: encoding) else {
          return nil
      }
      self.input = decoded
      let start = self.input.startIndex
      self.cursor = start
      self.previousCursor = start
  }
  /// Creates a tokenizer over `input`, decoding it as "UTF-8".
  public convenience init?(input: AK.StringView) {
      let defaultEncoding: AK.StringView = "UTF-8"
      self.init(input: input, encoding: defaultEncoding)
  }
  /// Flags the tokenizer as aborted. Once the token queue is empty, `nextToken`
  /// returns `nil` instead of tokenizing further input.
  public func abort() {
      aborted = true
  }
  /// Advances `cursor` by `count` characters, clamping at the end of the input,
  /// and records the position just before the new cursor in `previousCursor`.
  ///
  /// - Parameter count: Number of characters to move forward.
  func skip(_ count: Int) {
      self.cursor = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.endIndex) ?? self.input.endIndex
      // `index(before:)` traps at `startIndex`, which is where the cursor still is
      // after skipping on empty input or after a zero-length skip from the start.
      if self.cursor > self.input.startIndex {
          self.previousCursor = self.input.index(before: self.cursor)
      } else {
          self.previousCursor = self.input.startIndex
      }
  }
  /// Returns the character `offset` positions away from the cursor without
  /// consuming it, or `nil` when that position lies outside the input.
  ///
  /// - Parameter offset: Distance from the cursor; `0` peeks at the next character
  ///   to be consumed.
  func peekCodePoint(_ offset: Int = 0) -> Character? {
      // Bound the walk in the direction of travel so it can never leave the string.
      // The previous limit, `index(before: endIndex)`, trapped on empty input.
      let limit = offset < 0 ? self.input.startIndex : self.input.endIndex
      guard let index = self.input.index(self.cursor, offsetBy: offset, limitedBy: limit),
            index < self.input.endIndex
      else {
          // `endIndex` itself is not subscriptable, e.g. when the cursor is already at EOF.
          return nil
      }
      return self.input[index]
  }
  /// Returns the next `count` characters starting at the cursor without consuming
  /// them, or `nil` when fewer than `count` characters remain.
  ///
  /// - Parameter count: Number of characters to look ahead; negative values yield `nil`.
  func peekNext(count: Int) -> Swift.Substring? {
      // The exclusive upper bound may legitimately equal `endIndex`. Limiting it to
      // the last valid index made a run ending exactly at end of input unpeekable,
      // and trapped on empty input.
      guard count >= 0,
            let upperBound = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.endIndex)
      else {
          return nil
      }
      return self.input[self.cursor..<upperBound]
  }
  /// Consumes and returns the next character of the input, or `nil` at end of input.
  ///
  /// Newlines are normalized while reading: both CR LF and a lone CR are returned
  /// as a single U+000A LF.
  func nextCodePoint() -> Character? {
      guard self.cursor < self.input.endIndex else {
          return nil
      }
      let consumed = self.input[self.cursor]
      skip(1)
      // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
      // https://infra.spec.whatwg.org/#normalize-newlines
      switch consumed {
      case "\r\n":
          // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
          // NOTE: Swift treats CR LF as one grapheme cluster, so the pair arrives as a
          //       single Character and can never be seen as "\r" followed by "\n".
          return "\n"
      case "\r":
          // replace every remaining U+000D CR code point with a U+000A LF code point.
          return "\n"
      default:
          return consumed
      }
  }
  /// Rewinds `cursor` to the position saved in `previousCursor`.
  func restoreCursorToPrevious() {
      cursor = previousCursor
  }
  /// Makes `token` the current token. When it is a tag token, also starts a fresh,
  /// empty attribute list for it.
  func createNewToken(_ token: HTMLToken) {
      currentToken = token
      // FIXME: Assign Position
      let needsAttributeList = currentToken.isTag()
      if needsAttributeList {
          currentTokensAttributes = []
      }
  }
  /// What `finalizeCurrentAttribute(_:)` does with the text accumulated in
  /// `currentBuilder`: store it as the attribute's name (`SetName`), store it as
  /// the attribute's value (`SetValue`), or discard it (`IgnoreString`).
  enum AttributeStringBehavior {
      case SetName, SetValue, IgnoreString
  }
  /// Completes the attribute under construction and appends it to the current
  /// token's attribute list.
  ///
  /// The text in `currentBuilder` is always drained; `behavior` decides whether it
  /// becomes the attribute's name, its value, or is thrown away.
  ///
  /// - Precondition: an attribute and an attribute list are both in progress.
  func finalizeCurrentAttribute(_ behavior: AttributeStringBehavior) {
      precondition(self.currentAttribute != nil && self.currentTokensAttributes != nil)
      let text = self.currentBuilder.takeString()
      switch behavior {
      case .SetName:
          self.currentAttribute!.localName = text
      case .SetValue:
          self.currentAttribute!.value = text
      case .IgnoreString:
          break
      }
      let finished = self.currentAttribute!
      self.currentTokensAttributes!.append(finished)
      self.currentAttribute = nil
  }
  /// Outcome of one `nextTokenImpl` step, telling `nextToken` what to do next.
  enum NextTokenState {
      /// Return `token` to the caller of `nextToken`.
      case Emit(token: HTMLToken?)
      /// Run the next step, feeding it `inputCharacter` instead of reading a new one.
      case Reconsume(inputCharacter: Character?)
      /// `SwitchTo` / `Continue`: run the next step with no carried-over character.
      /// `ReprocessQueue`: first hand out a queued token if there is one, otherwise
      /// run the next step.
      case SwitchTo, ReprocessQueue, Continue
  }
  /// Produces the next token, or `nil` when there is nothing further to hand out
  /// (after `abort()`, or once the end-of-file token has already been emitted).
  ///
  /// Queued tokens are returned first; otherwise the state machine is stepped
  /// until it emits a token.
  ///
  /// - Parameter stopAtInsertionPoint: Currently unused (see FIXME below).
  public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
      // Outer optional: "is there an answer to return right now?"
      // Inner optional: the answer itself (`nil` once aborted).
      func drainQueue() -> HTMLToken?? {
          if let queued = self.queuedTokens.popFirst() {
              return .some(queued)
          }
          if self.aborted {
              return .some(nil)
          }
          return .none
      }

      if let answer = drainQueue() {
          return answer
      }

      var reconsumed: Character? = nil
      while true {
          // FIXME: Handle insertion point
          let step = nextTokenImpl(reconsumed)
          // Only `.Reconsume` carries a character into the next step.
          reconsumed = nil
          switch step {
          case .Emit(let token):
              return token
          case .Reconsume(let character):
              reconsumed = character
          case .ReprocessQueue:
              if let answer = drainQueue() {
                  return answer
              }
          case .SwitchTo, .Continue:
              break
          }
      }
  }
  /// Signals that the state machine stays in its current state.
  func continueInCurrentState() -> NextTokenState {
      .Continue
  }
  /// Moves the state machine to `newState`.
  func switchTo(_ newState: State) -> NextTokenState {
      state = newState
      return .SwitchTo
  }
  /// Moves the state machine to the given state and asks for `character` to be
  /// processed again there.
  func reconsume(_ character: Character?, `in` newState: State) -> NextTokenState {
      state = newState
      return .Reconsume(inputCharacter: character)
  }
  /// Moves the state machine back to `returnState` and asks `nextToken` to drain
  /// the token queue before continuing.
  func switchToReturnState() -> NextTokenState {
      state = returnState
      return .ReprocessQueue
  }
  /// Moves the state machine back to `returnState`. When a character was actually
  /// read (`character` is non-nil) the cursor is rewound so that the character is
  /// read again.
  func reconsumeInReturnState(_ character: Character?) -> NextTokenState {
      state = returnState
      switch character {
      case .some:
          restoreCursorToPrevious()
      case .none:
          break
      }
      return .ReprocessQueue
  }
  /// Moves the state machine to `newState`, queues the current token (attaching
  /// the collected attributes when it is a tag) and emits the token at the front
  /// of the queue.
  func switchToAndEmitCurrentToken(_ newState: State) -> NextTokenState {
      state = newState
      let isTag = currentToken.isTag()
      if isTag {
          let collected = currentTokensAttributes ?? []
          currentTokensAttributes = nil
          currentToken.attributes = collected
      }
      queuedTokens.append(currentToken)
      currentToken = HTMLToken()
      // The queue holds at least the token appended above.
      let front = queuedTokens.popFirst()!
      return .Emit(token: front)
  }
  /// Moves the state machine to `newState`, then emits `character` via `emitCharacter(_:)`.
  func switchToAndEmitCharacter(_ newState: State, character: Character) -> NextTokenState {
      state = newState
      return emitCharacter(character)
  }
  /// Queues a character token for `character`, moves the state machine to the
  /// given state and asks for `currentInputCharacter` to be processed again there.
  func emitCharacterAndReconsume(_ character: Character, `in` newState: State, currentInputCharacter: Character?) -> NextTokenState {
      let pending = HTMLToken(type: .Character(codePoint: character))
      queuedTokens.append(pending)
      state = newState
      return .Reconsume(inputCharacter: currentInputCharacter)
  }
  /// Queues an end-of-file token and emits the token at the front of the queue.
  /// Once EOF has been emitted, every further call emits `nil`.
  func emitEOF() -> NextTokenState {
      guard !hasEmittedEOF else {
          return .Emit(token: nil)
      }
      hasEmittedEOF = true
      createNewToken(HTMLToken(type: .EndOfFile))
      queuedTokens.append(currentToken)
      currentToken = HTMLToken()
      // The queue holds at least the EOF token appended above.
      let front = queuedTokens.popFirst()!
      return .Emit(token: front)
  }
  /// Queues the current token (attaching the collected attributes when it is a
  /// tag) and then continues with `emitEOF()`.
  ///
  /// - Precondition: EOF has not been emitted yet.
  func emitCurrentTokenFollowedByEOF() -> NextTokenState {
      precondition(!hasEmittedEOF)
      let isTag = currentToken.isTag()
      if isTag {
          let collected = currentTokensAttributes ?? []
          currentTokensAttributes = nil
          currentToken.attributes = collected
      }
      queuedTokens.append(currentToken)
      currentToken = HTMLToken()
      return emitEOF()
  }
  /// Queues a character token for `character` and emits the token at the front
  /// of the queue. The current token is replaced in the process.
  func emitCharacter(_ character: Character) -> NextTokenState {
      let characterToken = HTMLToken(type: .Character(codePoint: character))
      createNewToken(characterToken)
      queuedTokens.append(currentToken)
      currentToken = HTMLToken()
      // The queue holds at least the token appended above.
      let front = queuedTokens.popFirst()!
      return .Emit(token: front)
  }
  /// Drains `temporaryBuffer`. Inside an attribute value the text is appended to
  /// `currentBuilder`; otherwise one character token per character is queued.
  func flushCodepointsConsumedAsACharacterReference() {
      let insideAttribute = consumedAsPartOfAnAttribute()
      let consumed = temporaryBuffer.takeString()
      guard !insideAttribute else {
          currentBuilder += consumed
          return
      }
      queuedTokens.append(contentsOf: consumed.map { HTMLToken(type: .Character(codePoint: $0)) })
  }
  /// Whether `returnState` is one of the three attribute value states.
  func consumedAsPartOfAnAttribute() -> Bool {
      switch returnState {
      case .AttributeValueDoubleQuoted, .AttributeValueSingleQuoted, .AttributeValueUnquoted:
          return true
      default:
          return false
      }
  }
  /// Whether the tag name of end tag `token` equals `lastStartTagName`.
  /// Returns `false` when no start tag name has been recorded.
  ///
  /// - Precondition: `token` is an end tag token.
  func isAppropriateEndTagToken(_ token: HTMLToken) -> Bool {
      guard case let .EndTag(endTagName, _, _, _) = token.type else {
          preconditionFailure("isAppropriateEndTagToken called with non-end-tag token")
      }
      guard let startTagName = lastStartTagName else {
          return false
      }
      return startTagName == endTagName
  }
  339. func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
  340. let dontConsumeNextInputCharacter = {
  341. self.restoreCursorToPrevious()
  342. }
  343. let _ = dontConsumeNextInputCharacter
  344. // Handle reconsume by passing the character around in the state enum
  345. let currentInputCharacter = nextInputCharacter ?? nextCodePoint()
  346. switch self.state {
  347. // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
  348. case .Data:
  349. precondition(currentTokensAttributes == nil)
  350. switch currentInputCharacter {
  351. case "&":
  352. self.returnState = .Data
  353. return switchTo(.CharacterReference)
  354. case "<":
  355. return switchTo(.TagOpen)
  356. case "\0":
  357. // FIXME: log_parse_error()
  358. return emitCharacter("\u{FFFD}")
  359. case nil:
  360. return emitEOF()
  361. default:
  362. return emitCharacter(currentInputCharacter!)
  363. }
  364. // 13.2.5.2 RCDATA state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
  365. case .RCDATA:
  366. switch currentInputCharacter {
  367. case "&":
  368. self.returnState = .RCDATA
  369. return switchTo(.CharacterReference)
  370. case "<":
  371. return switchTo(.RCDATALessThanSign)
  372. case "\0":
  373. // FIXME: log_parse_error()
  374. return emitCharacter("\u{FFFD}")
  375. case nil:
  376. return emitEOF()
  377. default:
  378. return emitCharacter(currentInputCharacter!)
  379. }
  380. // 13.2.5.3 RAWTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
  381. case .RAWTEXT:
  382. switch currentInputCharacter {
  383. case "<":
  384. return switchTo(.RAWTEXTLessThanSign)
  385. case "\0":
  386. // FIXME: log_parse_error()
  387. return emitCharacter("\u{FFFD}")
  388. case nil:
  389. return emitEOF()
  390. default:
  391. return emitCharacter(currentInputCharacter!)
  392. }
  393. // 13.2.5.4 Script data state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
  394. case .ScriptData:
  395. switch currentInputCharacter {
  396. case "<":
  397. return switchTo(.ScriptDataLessThanSign)
  398. case "\0":
  399. // FIXME: log_parse_error()
  400. return emitCharacter("\u{FFFD}")
  401. case nil:
  402. return emitEOF()
  403. default:
  404. return emitCharacter(currentInputCharacter!)
  405. }
  406. // 13.2.5.5 PLAINTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
  407. case .PLAINTEXT:
  408. switch currentInputCharacter {
  409. case "\0":
  410. // FIXME: log_parse_error()
  411. return emitCharacter("\u{FFFD}")
  412. case nil:
  413. return emitEOF()
  414. default:
  415. return emitCharacter(currentInputCharacter!)
  416. }
  417. // 13.2.5.6 Tag open state https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
  418. case .TagOpen:
  419. switch currentInputCharacter {
  420. case "!":
  421. return switchTo(.MarkupDeclarationOpen)
  422. case "/":
  423. return switchTo(.EndTagOpen)
  424. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  425. createNewToken(HTMLToken(type: .StartTag(tagName: "")))
  426. return reconsume(currentInputCharacter!, in: .TagName)
  427. case "?":
  428. // FIXME: log_parse_error()
  429. createNewToken(HTMLToken(type: .Comment(data: "")))
  430. return reconsume(currentInputCharacter!, in: .BogusComment)
  431. case nil:
  432. // FIXME: log_parse_error()
  433. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  434. return emitEOF()
  435. default:
  436. // FIXME: log_parse_error()
  437. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  438. return reconsume(currentInputCharacter!, in: .Data)
  439. }
  440. // 13.2.5.7 End tag open state, https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
  441. case .EndTagOpen:
  442. switch currentInputCharacter {
  443. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  444. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  445. return reconsume(currentInputCharacter!, in: .TagName)
  446. default:
  447. return emitEOF()
  448. }
  449. // 13.2.5.8 Tag name state, https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
  450. case .TagName:
  451. switch currentInputCharacter {
  452. case "\t", "\n", "\u{000C}", " ":
  453. self.currentToken.tagName = self.currentBuilder.takeString()
  454. return switchTo(.BeforeAttributeName)
  455. case "/":
  456. self.currentToken.tagName = self.currentBuilder.takeString()
  457. return switchTo(.SelfClosingStartTag)
  458. case ">":
  459. self.currentToken.tagName = self.currentBuilder.takeString()
  460. return switchToAndEmitCurrentToken(.Data)
  461. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  462. currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  463. return continueInCurrentState()
  464. case "\0":
  465. // FIXME: log_parse_error()
  466. currentBuilder += "\u{FFFD}"
  467. return continueInCurrentState()
  468. case nil:
  469. // FIXME: log_parse_error()
  470. return emitEOF()
  471. default:
  472. currentBuilder.append(currentInputCharacter!)
  473. return continueInCurrentState()
  474. }
  475. // 13.2.5.9 RCDATA less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
  476. case .RCDATALessThanSign:
  477. switch currentInputCharacter {
  478. case "/":
  479. self.temporaryBuffer = ""
  480. return switchTo(.RCDATAEndTagOpen)
  481. default:
  482. return emitCharacterAndReconsume("<", in: .RCDATA, currentInputCharacter: currentInputCharacter)
  483. }
  484. // 13.2.5.10 RCDATA end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
  485. case .RCDATAEndTagOpen:
  486. switch currentInputCharacter {
  487. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  488. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  489. return reconsume(currentInputCharacter!, in: .RCDATAEndTagName)
  490. default:
  491. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  492. queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  493. return reconsume(currentInputCharacter, in: .RCDATA)
  494. }
  495. // 13.2.5.11 RCDATA end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
  496. case .RCDATAEndTagName:
  497. switch currentInputCharacter {
  498. case "\t", "\n", "\u{000C}", " ":
  499. if self.isAppropriateEndTagToken(currentToken) {
  500. return switchTo(.BeforeAttributeName)
  501. }
  502. break
  503. case "/":
  504. if self.isAppropriateEndTagToken(currentToken) {
  505. return switchTo(.SelfClosingStartTag)
  506. }
  507. break
  508. case ">":
  509. if self.isAppropriateEndTagToken(currentToken) {
  510. return switchToAndEmitCurrentToken(.Data)
  511. }
  512. break
  513. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  514. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  515. self.temporaryBuffer.append(c)
  516. return continueInCurrentState()
  517. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  518. self.currentBuilder.append(c)
  519. self.temporaryBuffer.append(c)
  520. return continueInCurrentState()
  521. default:
  522. break
  523. }
  524. // First three steps fall through to the "anything else" block
  525. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  526. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  527. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  528. self.currentBuilder = ""
  529. for codePoint in self.temporaryBuffer {
  530. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
  531. }
  532. return reconsume(currentInputCharacter, in: .RCDATA)
  533. // 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
  534. case .ScriptDataLessThanSign:
  535. switch currentInputCharacter {
  536. case "/":
  537. self.temporaryBuffer = ""
  538. return switchTo(.ScriptDataEndTagOpen)
  539. case "!":
  540. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  541. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "!")))
  542. return switchTo(.ScriptDataEscapeStart)
  543. default:
  544. return emitCharacterAndReconsume("<", in: .ScriptData, currentInputCharacter: currentInputCharacter)
  545. }
  546. // 13.2.5.16 Script data end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
  547. case .ScriptDataEndTagOpen:
  548. switch currentInputCharacter {
  549. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  550. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  551. return reconsume(currentInputCharacter!, in: .ScriptDataEndTagName)
  552. default:
  553. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  554. queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  555. return reconsume(currentInputCharacter, in: .ScriptData)
  556. }
  557. // 13.2.5.17 Script data end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
  558. case .ScriptDataEndTagName:
  559. switch currentInputCharacter {
  560. case "\t", "\n", "\u{000C}", " ":
  561. if self.isAppropriateEndTagToken(currentToken) {
  562. return switchTo(.BeforeAttributeName)
  563. }
  564. break
  565. case "/":
  566. if self.isAppropriateEndTagToken(currentToken) {
  567. return switchTo(.SelfClosingStartTag)
  568. }
  569. break
  570. case ">":
  571. if self.isAppropriateEndTagToken(currentToken) {
  572. return switchToAndEmitCurrentToken(.Data)
  573. }
  574. break
  575. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  576. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  577. self.temporaryBuffer.append(c)
  578. return continueInCurrentState()
  579. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  580. self.currentBuilder.append(c)
  581. self.temporaryBuffer.append(c)
  582. return continueInCurrentState()
  583. default:
  584. break
  585. }
  586. // First three steps fall through to the "anything else" block
  587. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  588. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  589. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  590. self.currentBuilder = ""
  591. for codePoint in self.temporaryBuffer {
  592. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
  593. }
  594. return reconsume(currentInputCharacter, in: .ScriptData)
  595. // 13.2.5.18 Script data escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
  596. case .ScriptDataEscapeStart:
  597. switch currentInputCharacter {
  598. case "-":
  599. return switchToAndEmitCharacter(.ScriptDataEscapeStartDash, character: "-")
  600. default:
  601. return reconsume(currentInputCharacter, in: .ScriptData)
  602. }
  603. // 13.2.5.19 Script data escape start dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
  604. case .ScriptDataEscapeStartDash:
  605. switch currentInputCharacter {
  606. case "-":
  607. return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-")
  608. default:
  609. return reconsume(currentInputCharacter, in: .ScriptData)
  610. }
  611. // 13.2.5.20 Script data escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
  612. case .ScriptDataEscaped:
  613. switch currentInputCharacter {
  614. case "-":
  615. return switchToAndEmitCharacter(.ScriptDataEscapedDash, character: "-")
  616. case "<":
  617. return switchTo(.ScriptDataEscapedLessThanSign)
  618. case "\0":
  619. // FIXME: log_parse_error()
  620. return emitCharacter("\u{FFFD}")
  621. case nil:
  622. // FIXME: log_parse_error()
  623. return emitEOF()
  624. default:
  625. return emitCharacter(currentInputCharacter!)
  626. }
  627. // 13.2.5.21 Script data escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
  628. case .ScriptDataEscapedDash:
  629. switch currentInputCharacter {
  630. case "-":
  631. return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-")
  632. case "<":
  633. return switchTo(.ScriptDataEscapedLessThanSign)
  634. case "\0":
  635. // FIXME: log_parse_error()
  636. return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}")
  637. case nil:
  638. // FIXME: log_parse_error()
  639. return emitEOF()
  640. default:
  641. return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
  642. }
  643. // 13.2.5.22 Script data escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
  644. case .ScriptDataEscapedDashDash:
  645. switch currentInputCharacter {
  646. case "-":
  647. return emitCharacter("-")
  648. case "<":
  649. return switchTo(.ScriptDataEscapedLessThanSign)
  650. case ">":
  651. return switchToAndEmitCharacter(.ScriptData, character: ">")
  652. case "\0":
  653. // FIXME: log_parse_error()
  654. return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}")
  655. case nil:
  656. // FIXME: log_parse_error()
  657. return emitEOF()
  658. default:
  659. return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
  660. }
  661. // 13.2.5.23 Script data escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
  662. case .ScriptDataEscapedLessThanSign:
  663. switch currentInputCharacter {
  664. case "/":
  665. self.temporaryBuffer = ""
  666. return switchTo(.ScriptDataEscapedEndTagOpen)
  667. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  668. self.temporaryBuffer = ""
  669. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  670. return reconsume(currentInputCharacter!, in: .ScriptDataDoubleEscapeStart)
  671. default:
  672. return emitCharacterAndReconsume("<", in: .ScriptDataEscaped, currentInputCharacter: currentInputCharacter)
  673. }
  674. // 13.2.5.24 Script data escaped end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
  675. case .ScriptDataEscapedEndTagOpen:
  676. switch currentInputCharacter {
  677. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  678. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  679. return reconsume(currentInputCharacter!, in: .ScriptDataEscapedEndTagName)
  680. default:
  681. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  682. queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  683. return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
  684. }
  685. // 13.2.5.25 Script data escaped end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
  686. case .ScriptDataEscapedEndTagName:
  687. switch currentInputCharacter {
  688. case "\t", "\n", "\u{000C}", " ":
  689. if self.isAppropriateEndTagToken(currentToken) {
  690. return switchTo(.BeforeAttributeName)
  691. }
  692. break
  693. case "/":
  694. if self.isAppropriateEndTagToken(currentToken) {
  695. return switchTo(.SelfClosingStartTag)
  696. }
  697. break
  698. case ">":
  699. if self.isAppropriateEndTagToken(currentToken) {
  700. return switchToAndEmitCurrentToken(.Data)
  701. }
  702. break
  703. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  704. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  705. self.temporaryBuffer.append(c)
  706. return continueInCurrentState()
  707. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  708. self.currentBuilder.append(c)
  709. self.temporaryBuffer.append(c)
  710. return continueInCurrentState()
  711. default:
  712. break
  713. }
  714. // First three steps fall through to the "anything else" block
  715. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  716. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  717. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  718. self.currentBuilder = ""
  719. for codePoint in self.temporaryBuffer {
  720. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
  721. }
  722. return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
  723. // 13.2.5.26 Script data double escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
  724. case .ScriptDataDoubleEscapeStart:
  725. switch currentInputCharacter {
  726. case "\t", "\n", "\u{000C}", " ", "/", ">":
  727. if self.temporaryBuffer == "script" {
  728. return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
  729. } else {
  730. return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
  731. }
  732. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  733. self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  734. return emitCharacter(currentInputCharacter!)
  735. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  736. self.temporaryBuffer.append(c)
  737. return emitCharacter(currentInputCharacter!)
  738. default:
  739. return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
  740. }
  741. // 13.2.5.27 Script data double escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
  742. case .ScriptDataDoubleEscaped:
  743. switch currentInputCharacter {
  744. case "-":
  745. return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDash, character: "-")
  746. case "<":
  747. return switchTo(.ScriptDataDoubleEscapedLessThanSign)
  748. case "\0":
  749. // FIXME: log_parse_error()
  750. return emitCharacter("\u{FFFD}")
  751. case nil:
  752. // FIXME: log_parse_error()
  753. return emitEOF()
  754. default:
  755. return emitCharacter(currentInputCharacter!)
  756. }
  // 13.2.5.28 Script data double escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
  case .ScriptDataDoubleEscapedDash:
      switch currentInputCharacter {
      case "-":
          return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDashDash, character: "-")
      case "<":
          // The spec both switches state and emits the "<" as a character token.
          // Only switching would silently drop the "<" from the script text.
          return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
      case "\0":
          // FIXME: log_parse_error()
          // NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
          return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}")
      case nil:
          // FIXME: log_parse_error()
          return emitEOF()
      case let c?:
          // Any other character ends the dash run and is emitted unchanged.
          return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: c)
      }
  // 13.2.5.29 Script data double escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
  case .ScriptDataDoubleEscapedDashDash:
      // End of input is handled first so the rest can work on a non-optional character.
      guard let ch = currentInputCharacter else {
          // FIXME: log_parse_error()
          return emitEOF()
      }
      switch ch {
      case "-":
          // Further dashes are emitted without leaving this state.
          return emitCharacter("-")
      case "<":
          return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
      case ">":
          return switchToAndEmitCharacter(.ScriptData, character: ">")
      case "\0":
          // FIXME: log_parse_error()
          return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}")
      default:
          return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: ch)
      }
  // 13.2.5.30 Script data double escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
  case .ScriptDataDoubleEscapedLessThanSign:
      // Anything except "/" (including end of input) is reprocessed in the double escaped state.
      guard currentInputCharacter == "/" else {
          return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped)
      }
      // "/" may start an end tag: reset the temporary buffer before collecting its name.
      self.temporaryBuffer = ""
      return switchToAndEmitCharacter(.ScriptDataDoubleEscapeEnd, character: "/")
  // 13.2.5.31 Script data double escape end state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
  case .ScriptDataDoubleEscapeEnd:
      switch currentInputCharacter {
      case "\t", "\n", "\u{000C}", " ", "/", ">":
          // The collected name decides which state follows: "script" ends the double escape.
          if self.temporaryBuffer == "script" {
              return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
          } else {
              return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
          }
      // A Character may be a multi-scalar grapheme cluster whose first scalar is an ASCII
      // letter (e.g. "A" followed by a combining mark). `asciiValue` is nil for such a
      // cluster, so require `isASCII` before force-unwrapping; otherwise it falls to `default`.
      case let c? where c.isASCII && HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
          // Buffer the lowercase form (ASCII upper + 0x20) but emit the character as written.
          self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
          return emitCharacter(c)
      case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
          self.temporaryBuffer.append(c)
          return emitCharacter(c)
      default:
          return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped)
      }
  // 13.2.5.32 Before attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
  case .BeforeAttributeName:
      // End of input is handed to the after attribute name state, same as "/" and ">".
      guard let ch = currentInputCharacter else {
          return reconsume(currentInputCharacter, in: .AfterAttributeName)
      }
      switch ch {
      case "\t", "\n", "\u{000C}", " ":
          // Whitespace before an attribute name is skipped.
          return continueInCurrentState()
      case "/", ">":
          return reconsume(currentInputCharacter, in: .AfterAttributeName)
      case "=":
          // FIXME: log_parse_error()
          // A leading "=" becomes the first character of the new attribute's name.
          self.currentBuilder = Swift.String(ch)
          self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
          return switchTo(.AttributeName)
      default:
          // Start a fresh attribute and let the attribute name state process this character.
          self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
          return reconsume(ch, in: .AttributeName)
      }
  // 13.2.5.33 Attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
  case .AttributeName:
      // FIXME: When the user agent leaves the attribute name state (and before emitting the tag token, if appropriate),
      //        the complete attribute's name must be compared to the other attributes on the same token;
      //        if there is already an attribute on the token with the exact same name, then this is a duplicate-attribute
      //        parse error and the new attribute must be removed from the token.
      // NOTE: If an attribute is so removed from a token, it, and the value that gets associated with it, if any,
      //       are never subsequently used by the parser, and are therefore effectively discarded. Removing the attribute
      //       in this way does not change its status as the "current attribute" for the purposes of the tokenizer, however.
      switch currentInputCharacter {
      case "\t", "\n", "\u{000C}", " ", "/", ">", nil:
          // FIXME: set name position
          // The name is complete: move the accumulated text into the attribute.
          self.currentAttribute!.localName = self.currentBuilder.takeString()
          return reconsume(currentInputCharacter, in: .AfterAttributeName)
      case "=":
          // FIXME: set name position
          self.currentAttribute!.localName = self.currentBuilder.takeString()
          return switchTo(.BeforeAttributeValue)
      // A Character may be a multi-scalar grapheme cluster whose first scalar is an ASCII
      // letter (e.g. "A" followed by a combining mark). `asciiValue` is nil for such a
      // cluster, so require `isASCII` before force-unwrapping; otherwise the cluster is
      // appended unchanged by `default`.
      case let c? where c.isASCII && HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
          // Attribute names are stored lowercased (ASCII upper + 0x20).
          self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
          return continueInCurrentState()
      case "\0":
          // FIXME: log_parse_error()
          self.currentBuilder.append("\u{FFFD}")
          return continueInCurrentState()
      case let c?:
          self.currentBuilder.append(c)
          return continueInCurrentState()
      }
  // 13.2.5.34 After attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
  case .AfterAttributeName:
      guard let ch = currentInputCharacter else {
          // FIXME: log_parse_error()
          self.finalizeCurrentAttribute(.IgnoreString)
          return emitEOF()
      }
      // Whitespace after the name is skipped without touching the attribute.
      if ch == "\t" || ch == "\n" || ch == "\u{000C}" || ch == " " {
          return continueInCurrentState()
      }
      // Every remaining input finalizes the current attribute with `.SetName` first.
      self.finalizeCurrentAttribute(.SetName)
      switch ch {
      case "/":
          return switchTo(.SelfClosingStartTag)
      case "=":
          return switchTo(.BeforeAttributeValue)
      case ">":
          return switchToAndEmitCurrentToken(.Data)
      default:
          // A new attribute begins; its first character is reprocessed in the name state.
          self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
          return reconsume(ch, in: .AttributeName)
      }
  // 13.2.5.35 Before attribute value state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
  case .BeforeAttributeValue:
      if let ch = currentInputCharacter {
          switch ch {
          case "\t", "\n", "\u{000C}", " ":
              // Whitespace before the value is skipped.
              return continueInCurrentState()
          case "\"":
              return switchTo(.AttributeValueDoubleQuoted)
          case "'":
              return switchTo(.AttributeValueSingleQuoted)
          case ">":
              // FIXME: log_parse_error()
              self.finalizeCurrentAttribute(.IgnoreString)
              return switchToAndEmitCurrentToken(.Data)
          default:
              break
          }
      }
      // Everything else, including end of input, is reprocessed as an unquoted value.
      return reconsume(currentInputCharacter, in: .AttributeValueUnquoted)
  // 13.2.5.36 Attribute value (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
  case .AttributeValueDoubleQuoted:
      guard let ch = currentInputCharacter else {
          // FIXME: log_parse_error()
          self.finalizeCurrentAttribute(.IgnoreString)
          return emitEOF()
      }
      switch ch {
      case "\"":
          // Closing quote ends the value.
          return switchTo(.AfterAttributeValueQuoted)
      case "&":
          // Remember where to come back to once the character reference is handled.
          self.returnState = .AttributeValueDoubleQuoted
          return switchTo(.CharacterReference)
      case "\0":
          // FIXME: log_parse_error()
          self.currentBuilder.append("\u{FFFD}")
          return continueInCurrentState()
      default:
          self.currentBuilder.append(ch)
          return continueInCurrentState()
      }
  // 13.2.5.37 Attribute value (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state
  case .AttributeValueSingleQuoted:
      switch currentInputCharacter {
      case "'":
          // Closing quote ends the value.
          return switchTo(.AfterAttributeValueQuoted)
      case "&":
          // Remember where to come back to once the character reference is handled.
          self.returnState = .AttributeValueSingleQuoted
          return switchTo(.CharacterReference)
      case "\0":
          // FIXME: log_parse_error()
          self.currentBuilder.append("\u{FFFD}")
          return continueInCurrentState()
      case nil:
          // FIXME: log_parse_error()
          // Finalize with `.IgnoreString` on EOF, matching the double-quoted and unquoted
          // value states, so the partially built value is handled the same way in all three.
          self.finalizeCurrentAttribute(.IgnoreString)
          return emitEOF()
      case let c?:
          self.currentBuilder.append(c)
          return continueInCurrentState()
      }
  // 13.2.5.38 Attribute value (unquoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state
  case .AttributeValueUnquoted:
      guard let ch = currentInputCharacter else {
          // FIXME: log_parse_error()
          self.finalizeCurrentAttribute(.IgnoreString)
          return emitEOF()
      }
      switch ch {
      case "\t", "\n", "\u{000C}", " ":
          // Whitespace terminates an unquoted value.
          self.finalizeCurrentAttribute(.SetValue)
          return switchTo(.BeforeAttributeName)
      case "&":
          // Remember where to come back to once the character reference is handled.
          self.returnState = .AttributeValueUnquoted
          return switchTo(.CharacterReference)
      case ">":
          self.finalizeCurrentAttribute(.SetValue)
          return switchToAndEmitCurrentToken(.Data)
      case "\0":
          // FIXME: log_parse_error()
          self.currentBuilder.append("\u{FFFD}")
          return continueInCurrentState()
      case "\"", "'", "<", "=", "`":
          // FIXME: log_parse_error()
          // These are parse errors but are still appended to the value as-is.
          self.currentBuilder.append(ch)
          return continueInCurrentState()
      default:
          self.currentBuilder.append(ch)
          return continueInCurrentState()
      }
  // 13.2.5.39 After attribute value (quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
  case .AfterAttributeValueQuoted:
      guard let ch = currentInputCharacter else {
          // FIXME: log_parse_error()
          self.finalizeCurrentAttribute(.IgnoreString)
          return emitEOF()
      }
      // Every non-EOF input finalizes the attribute with `.SetValue` before moving on.
      self.finalizeCurrentAttribute(.SetValue)
      switch ch {
      case "\t", "\n", "\u{000C}", " ":
          return switchTo(.BeforeAttributeName)
      case "/":
          return switchTo(.SelfClosingStartTag)
      case ">":
          return switchToAndEmitCurrentToken(.Data)
      default:
          // FIXME: log_parse_error()
          // No whitespace between attributes: reprocess the character as the start of the next one.
          return reconsume(ch, in: .BeforeAttributeName)
      }
  // 13.2.5.40 Self-closing start tag state, https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
  case .SelfClosingStartTag:
      guard let ch = currentInputCharacter else {
          // FIXME: log_parse_error()
          return emitEOF()
      }
      if ch == ">" {
          // "/>" seen: mark the tag as self-closing and emit it.
          self.currentToken.selfClosing = true
          return switchToAndEmitCurrentToken(.Data)
      }
      // FIXME: log_parse_error()
      // A stray "/" inside a tag: carry on parsing attributes from this character.
      return reconsume(ch, in: .BeforeAttributeName)
  // 13.2.5.41 Bogus comment state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
  case .BogusComment:
      guard let ch = currentInputCharacter else {
          // End of input: emit whatever comment text was collected, then EOF.
          currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
          return emitCurrentTokenFollowedByEOF()
      }
      if ch == ">" {
          currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
          return switchToAndEmitCurrentToken(.Data)
      }
      if ch == "\0" {
          // FIXME: log_parse_error()
          currentBuilder.append("\u{FFFD}")
      } else {
          self.currentBuilder.append(ch)
      }
      return continueInCurrentState()
  // 13.2.5.42 Markup declaration open state, https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
  case .MarkupDeclarationOpen:
      // This state decides by peeking at upcoming input instead of the current character.
      dontConsumeNextInputCharacter()
      if let nextTwo = peekNext(count: 2), nextTwo == "--" {
          skip(2)
          // The spec creates an empty comment token here. Without it, a later state that
          // emits the current token without rebuilding it would emit a stale token.
          self.currentToken = HTMLToken(type: .Comment(data: ""))
          return switchTo(.CommentStart)
      } else if let nextSeven = peekNext(count: 7), nextSeven.uppercased() == "DOCTYPE" {
          // "DOCTYPE" is matched case-insensitively.
          skip(7)
          return switchTo(.DOCTYPE)
      } else if let nextSeven = peekNext(count: 7), nextSeven == "[CDATA[" {
          // Unlike "DOCTYPE", the spec requires an exact, case-sensitive match for "[CDATA[";
          // "<![cdata[" must fall through to the generic bogus comment branch below.
          skip(7)
          // FIXME: If there is an adjusted current node and it is not an element in the HTML namespace,
          //        then switch to the CDATA section state.
          // FIXME: log_parse_error()
          // Treated as a bogus comment whose data starts with "[CDATA[".
          self.currentBuilder = "[CDATA["
          self.currentToken = HTMLToken(type: .Comment(data: ""))
          return switchTo(.BogusComment)
      } else {
          // FIXME: log_parse_error()
          self.currentToken = HTMLToken(type: .Comment(data: ""))
          return switchTo(.BogusComment)
      }
  // 13.2.5.43 Comment start state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
  case .CommentStart:
      switch currentInputCharacter {
      case "-":
          return switchTo(.CommentStartDash)
      case ">":
          // FIXME: log_parse_error()
          // "<!-->": build the comment token from the builder before emitting, as every
          // other comment-emitting branch does; otherwise a stale current token is emitted.
          currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
          return switchToAndEmitCurrentToken(.Data)
      default:
          // Everything else, including end of input, is handled by the comment state.
          return reconsume(currentInputCharacter, in: .Comment)
      }
  // 13.2.5.44 Comment start dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
  case .CommentStartDash:
      guard let ch = currentInputCharacter else {
          // FIXME: log_parse_error()
          currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
          return emitCurrentTokenFollowedByEOF()
      }
      switch ch {
      case "-":
          return switchTo(.CommentEnd)
      case ">":
          // FIXME: log_parse_error()
          currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
          return switchToAndEmitCurrentToken(.Data)
      default:
          // The dash that led here was ordinary comment text after all.
          currentBuilder.append("-")
          return reconsume(currentInputCharacter, in: .Comment)
      }
  // 13.2.5.45 Comment state, https://html.spec.whatwg.org/multipage/parsing.html#comment-state
  case .Comment:
      guard let ch = currentInputCharacter else {
          // FIXME: log_parse_error()
          currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
          return emitCurrentTokenFollowedByEOF()
      }
      switch ch {
      case "<":
          // "<" is kept as comment text; the next state watches for a nested "<!--".
          currentBuilder.append("<")
          return switchTo(.CommentLessThanSign)
      case "-":
          return switchTo(.CommentEndDash)
      case "\0":
          // FIXME: log_parse_error()
          currentBuilder.append("\u{FFFD}")
          return continueInCurrentState()
      default:
          currentBuilder.append(ch)
          return continueInCurrentState()
      }
  // 13.2.5.46 Comment less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
  case .CommentLessThanSign:
      if currentInputCharacter == "!" {
          currentBuilder.append("!")
          return switchTo(.CommentLessThanSignBang)
      }
      if currentInputCharacter == "<" {
          // Repeated "<" characters are collected without leaving this state.
          currentBuilder.append("<")
          return continueInCurrentState()
      }
      // Anything else, including end of input, goes back to the comment state.
      return reconsume(currentInputCharacter, in: .Comment)
  // 13.2.5.47 Comment less-than sign bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
  case .CommentLessThanSignBang:
      // Only a "-" continues the "<!-" sequence; everything else is ordinary comment input.
      guard currentInputCharacter == "-" else {
          return reconsume(currentInputCharacter, in: .Comment)
      }
      return switchTo(.CommentLessThanSignBangDash)
  // 13.2.5.48 Comment less-than sign bang dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
  case .CommentLessThanSignBangDash:
      // A second "-" completes "<!--"; otherwise the input is reprocessed in the comment end dash state.
      guard currentInputCharacter == "-" else {
          return reconsume(currentInputCharacter, in: .CommentEndDash)
      }
      return switchTo(.CommentLessThanSignBangDashDash)
  // 13.2.5.49 Comment less-than sign bang dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
  case .CommentLessThanSignBangDashDash:
      // ">" and end of input are the expected inputs here; any other character is a parse
      // error (FIXME: log_parse_error()). Both are reprocessed in the comment end state.
      return reconsume(currentInputCharacter, in: .CommentEnd)
  // 13.2.5.50 Comment end dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
  case .CommentEndDash:
      guard let ch = currentInputCharacter else {
          // FIXME: log_parse_error()
          currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
          return emitCurrentTokenFollowedByEOF()
      }
      if ch == "-" {
          return switchTo(.CommentEnd)
      }
      // The single dash was ordinary comment text: record it and reprocess the character.
      currentBuilder.append("-")
      return reconsume(currentInputCharacter, in: .Comment)
  // 13.2.5.51 Comment end state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
  case .CommentEnd:
      guard let ch = currentInputCharacter else {
          // FIXME: log_parse_error()
          currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
          return emitCurrentTokenFollowedByEOF()
      }
      switch ch {
      case ">":
          // "-->" closes the comment.
          currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
          return switchToAndEmitCurrentToken(.Data)
      case "!":
          return switchTo(.CommentEndBang)
      case "-":
          // An extra dash beyond "--" is kept as comment text.
          currentBuilder.append("-")
          return continueInCurrentState()
      default:
          // The "--" did not end the comment, so it is recorded as comment text.
          currentBuilder.append("--")
          return reconsume(currentInputCharacter, in: .Comment)
      }
  // 13.2.5.52 Comment end bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
  case .CommentEndBang:
      guard let ch = currentInputCharacter else {
          // FIXME: log_parse_error()
          currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
          return emitCurrentTokenFollowedByEOF()
      }
      switch ch {
      case "-":
          // "--!" was comment text after all; this dash may begin a new end sequence.
          currentBuilder.append("--!")
          return switchTo(.CommentEndDash)
      case ">":
          // FIXME: log_parse_error()
          currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
          return switchToAndEmitCurrentToken(.Data)
      default:
          currentBuilder.append("--!")
          return reconsume(currentInputCharacter, in: .Comment)
      }
  default:
      // Fallback for states without a case above: print the state and input, then emit EOF.
      let todoMessage = "TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))"
      print(todoMessage)
      return emitEOF()
  1180. }
  1181. }
  1182. }