HTMLTokenizer.swift 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039
  1. /*
  2. * Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. import AK
  7. import Collections
  8. import Foundation
  9. @_exported import WebCxx
  10. extension Swift.String {
  11. public init?(decoding: AK.StringView, as: AK.StringView) {
  12. let maybe_decoded = Web.HTML.decode_to_utf8(decoding, `as`)
  13. if maybe_decoded.hasValue {
  14. self.init(akString: maybe_decoded.value!)
  15. } else {
  16. return nil
  17. }
  18. }
  19. public mutating func takeString() -> Swift.String {
  20. let result = self
  21. self = ""
  22. return result
  23. }
  24. }
  25. public class HTMLTokenizer {
  26. public enum State {
  27. case Data
  28. case RCDATA
  29. case RAWTEXT
  30. case ScriptData
  31. case PLAINTEXT
  32. case TagOpen
  33. case EndTagOpen
  34. case TagName
  35. case RCDATALessThanSign
  36. case RCDATAEndTagOpen
  37. case RCDATAEndTagName
  38. case RAWTEXTLessThanSign
  39. case RAWTEXTEndTagOpen
  40. case RAWTEXTEndTagName
  41. case ScriptDataLessThanSign
  42. case ScriptDataEndTagOpen
  43. case ScriptDataEndTagName
  44. case ScriptDataEscapeStart
  45. case ScriptDataEscapeStartDash
  46. case ScriptDataEscaped
  47. case ScriptDataEscapedDash
  48. case ScriptDataEscapedDashDash
  49. case ScriptDataEscapedLessThanSign
  50. case ScriptDataEscapedEndTagOpen
  51. case ScriptDataEscapedEndTagName
  52. case ScriptDataDoubleEscapeStart
  53. case ScriptDataDoubleEscaped
  54. case ScriptDataDoubleEscapedDash
  55. case ScriptDataDoubleEscapedDashDash
  56. case ScriptDataDoubleEscapedLessThanSign
  57. case ScriptDataDoubleEscapeEnd
  58. case BeforeAttributeName
  59. case AttributeName
  60. case AfterAttributeName
  61. case BeforeAttributeValue
  62. case AttributeValueDoubleQuoted
  63. case AttributeValueSingleQuoted
  64. case AttributeValueUnquoted
  65. case AfterAttributeValueQuoted
  66. case SelfClosingStartTag
  67. case BogusComment
  68. case MarkupDeclarationOpen
  69. case CommentStart
  70. case CommentStartDash
  71. case Comment
  72. case CommentLessThanSign
  73. case CommentLessThanSignBang
  74. case CommentLessThanSignBangDash
  75. case CommentLessThanSignBangDashDash
  76. case CommentEndDash
  77. case CommentEnd
  78. case CommentEndBang
  79. case DOCTYPE
  80. case BeforeDOCTYPEName
  81. case DOCTYPEName
  82. case AfterDOCTYPEName
  83. case AfterDOCTYPEPublicKeyword
  84. case BeforeDOCTYPEPublicIdentifier
  85. case DOCTYPEPublicIdentifierDoubleQuoted
  86. case DOCTYPEPublicIdentifierSingleQuoted
  87. case AfterDOCTYPEPublicIdentifier
  88. case BetweenDOCTYPEPublicAndSystemIdentifiers
  89. case AfterDOCTYPESystemKeyword
  90. case BeforeDOCTYPESystemIdentifier
  91. case DOCTYPESystemIdentifierDoubleQuoted
  92. case DOCTYPESystemIdentifierSingleQuoted
  93. case AfterDOCTYPESystemIdentifier
  94. case BogusDOCTYPE
  95. case CDATASection
  96. case CDATASectionBracket
  97. case CDATASectionEnd
  98. case CharacterReference
  99. case NamedCharacterReference
  100. case AmbiguousAmpersand
  101. case NumericCharacterReference
  102. case HexadecimalCharacterReferenceStart
  103. case DecimalCharacterReferenceStart
  104. case HexadecimalCharacterReference
  105. case DecimalCharacterReference
  106. case NumericCharacterReferenceEnd
  107. }
  108. private var input = Swift.String()
  109. private var cursor: Swift.String.Index
  110. private var previousCursor: Swift.String.Index
  111. public private(set) var state = State.Data
  112. private var returnState = State.Data
  113. private var currentToken = HTMLToken()
  114. private var queuedTokens = Deque<HTMLToken>()
  115. private var currentBuilder = Swift.String()
  116. private var temporaryBuffer = Swift.String()
  117. private var lastStartTagName: Swift.String? = nil
  118. private var currentTokensAttributes: [HTMLToken.Attribute]? = nil
  119. private var currentAttribute: HTMLToken.Attribute? = nil
  120. private var aborted = false
  121. private var hasEmittedEOF = false
  122. // https://infra.spec.whatwg.org/#ascii-upper-alpha
  123. static private var asciiUpperAlpha = CharacterSet(charactersIn: "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
  124. // https://infra.spec.whatwg.org/#ascii-lower-alpha
  125. static private var asciiLowerAlpha = CharacterSet(charactersIn: "abcdefghijklmnopqrstuvwxyz")
  126. // https://infra.spec.whatwg.org/#ascii-upper-alpha
  127. static private var asciiAlpha = asciiUpperAlpha.union(asciiLowerAlpha)
  128. public init() {
  129. self.cursor = self.input.startIndex
  130. self.previousCursor = self.input.startIndex
  131. }
  132. public init?(input: AK.StringView, encoding: AK.StringView) {
  133. if let string = Swift.String(decoding: input, as: encoding) {
  134. self.input = string
  135. } else {
  136. return nil
  137. }
  138. self.cursor = self.input.startIndex
  139. self.previousCursor = self.input.startIndex
  140. }
  141. public convenience init?(input: AK.StringView) {
  142. self.init(input: input, encoding: "UTF-8")
  143. }
  144. public func abort() {
  145. self.aborted = true
  146. }
  147. func skip(_ count: Int) {
  148. self.cursor = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.endIndex) ?? input.endIndex
  149. self.previousCursor = self.input.index(before: self.cursor)
  150. }
  151. func peekCodePoint(_ offset: Int = 0) -> Character? {
  152. guard let index = self.input.index(self.cursor, offsetBy: offset, limitedBy: self.input.index(before: self.input.endIndex)) else {
  153. return nil
  154. }
  155. return self.input[index]
  156. }
  157. func nextCodePoint() -> Character? {
  158. guard self.cursor < self.input.endIndex else {
  159. return nil
  160. }
  161. // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
  162. // https://infra.spec.whatwg.org/#normalize-newlines
  163. var codePoint: Character
  164. if let peeked = peekCodePoint(), let peekedNext = peekCodePoint(1), peeked == "\r", peekedNext == "\n" {
  165. // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
  166. skip(2)
  167. codePoint = "\n"
  168. } else if let peeked = peekCodePoint(), peeked == "\r" {
  169. // replace every remaining U+000D CR code point with a U+000A LF code point.
  170. skip(1)
  171. codePoint = "\n"
  172. } else {
  173. skip(1)
  174. codePoint = self.input[self.previousCursor]
  175. }
  176. return codePoint
  177. }
  178. func restoreCursorToPrevious() {
  179. self.cursor = self.previousCursor
  180. }
  181. func createNewToken(_ token: HTMLToken) {
  182. self.currentToken = token
  183. if self.currentToken.isTag() {
  184. self.currentTokensAttributes = []
  185. }
  186. // FIXME: Assign Position
  187. }
  188. enum AttributeStringBehavior {
  189. case SetName
  190. case SetValue
  191. case IgnoreString
  192. }
  193. func finalizeCurrentAttribute(_ behavior: AttributeStringBehavior) {
  194. precondition(self.currentAttribute != nil && self.currentTokensAttributes != nil)
  195. switch behavior {
  196. case .SetName:
  197. self.currentAttribute!.localName = self.currentBuilder.takeString()
  198. case .SetValue:
  199. self.currentAttribute!.value = self.currentBuilder.takeString()
  200. case .IgnoreString:
  201. _ = self.currentBuilder.takeString()
  202. }
  203. self.currentTokensAttributes!.append(self.currentAttribute!)
  204. self.currentAttribute = nil
  205. }
  206. enum NextTokenState {
  207. case Emit(token: HTMLToken?)
  208. case SwitchTo
  209. case Reconsume(inputCharacter: Character?)
  210. case ReprocessQueue
  211. case Continue
  212. }
  213. public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
  214. let processQueue = { () -> HTMLToken?? in
  215. if let token = self.queuedTokens.popFirst() {
  216. return token
  217. }
  218. return self.aborted ? Optional(nil) : nil
  219. }
  220. if let maybeToken = processQueue() {
  221. return maybeToken
  222. }
  223. var nextInputCharacter: Character? = nil
  224. while true {
  225. // FIXME: Handle insertion point
  226. switch nextTokenImpl(nextInputCharacter) {
  227. case .Emit(let token):
  228. return token
  229. case .SwitchTo, .Continue:
  230. nextInputCharacter = nil
  231. break
  232. case .Reconsume(let character):
  233. nextInputCharacter = character
  234. break
  235. case .ReprocessQueue:
  236. if let maybeToken = processQueue() {
  237. return maybeToken
  238. }
  239. nextInputCharacter = nil
  240. break
  241. }
  242. }
  243. }
  244. func continueInCurrentState() -> NextTokenState {
  245. return .Continue
  246. }
  247. func switchTo(_ state: State) -> NextTokenState {
  248. self.state = state
  249. return .SwitchTo
  250. }
  251. func reconsume(_ character: Character?, `in` state: State) -> NextTokenState {
  252. self.state = state
  253. return .Reconsume(inputCharacter: character)
  254. }
  255. func switchToReturnState() -> NextTokenState {
  256. self.state = self.returnState
  257. return .ReprocessQueue
  258. }
  259. func reconsumeInReturnState(_ character: Character?) -> NextTokenState {
  260. self.state = self.returnState
  261. if character != nil {
  262. restoreCursorToPrevious()
  263. }
  264. return .ReprocessQueue
  265. }
  266. func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState {
  267. self.state = state
  268. if self.currentToken.isTag() {
  269. self.currentToken.attributes = self.currentTokensAttributes ?? []
  270. self.currentTokensAttributes = nil
  271. }
  272. self.queuedTokens.append(self.currentToken)
  273. self.currentToken = HTMLToken()
  274. return .Emit(token: self.queuedTokens.popFirst()!)
  275. }
  276. func switchToAndEmitCharacter(_ state: State, character: Character) -> NextTokenState {
  277. self.state = state
  278. return emitCharacter(character)
  279. }
  280. func emitCharacterAndReconsume(_ character: Character, `in`: State, currentInputCharacter: Character?) -> NextTokenState {
  281. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: character)))
  282. self.state = `in`
  283. return .Reconsume(inputCharacter: currentInputCharacter)
  284. }
  285. func emitEOF() -> NextTokenState {
  286. if self.hasEmittedEOF {
  287. return .Emit(token: nil)
  288. }
  289. self.hasEmittedEOF = true
  290. createNewToken(HTMLToken(type: .EndOfFile))
  291. self.queuedTokens.append(self.currentToken)
  292. self.currentToken = HTMLToken()
  293. return .Emit(token: self.queuedTokens.popFirst()!)
  294. }
  295. func emitCurrentTokenFollowedByEOF() -> NextTokenState {
  296. precondition(!self.hasEmittedEOF)
  297. if self.currentToken.isTag() {
  298. self.currentToken.attributes = self.currentTokensAttributes ?? []
  299. self.currentTokensAttributes = nil
  300. }
  301. self.queuedTokens.append(self.currentToken)
  302. self.currentToken = HTMLToken()
  303. return emitEOF()
  304. }
  305. func emitCharacter(_ character: Character) -> NextTokenState {
  306. createNewToken(HTMLToken(type: .Character(codePoint: character)))
  307. self.queuedTokens.append(self.currentToken)
  308. self.currentToken = HTMLToken()
  309. return .Emit(token: self.queuedTokens.popFirst()!)
  310. }
  311. func flushCodepointsConsumedAsACharacterReference() {
  312. if consumedAsPartOfAnAttribute() {
  313. self.currentBuilder += self.temporaryBuffer.takeString()
  314. } else {
  315. for codePoint in self.temporaryBuffer.takeString() {
  316. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
  317. }
  318. }
  319. }
  320. func consumedAsPartOfAnAttribute() -> Bool {
  321. return self.returnState == .AttributeValueDoubleQuoted || self.returnState == .AttributeValueSingleQuoted || self.returnState == .AttributeValueUnquoted
  322. }
  323. func isAppropriateEndTagToken(_ token: HTMLToken) -> Bool {
  324. guard case let .EndTag(endTagName, _, _, _) = token.type else {
  325. preconditionFailure("isAppropriateEndTagToken called with non-end-tag token")
  326. }
  327. if let startTagName = self.lastStartTagName {
  328. return startTagName == endTagName
  329. } else {
  330. return false
  331. }
  332. }
  333. func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
  334. let dontConsumeNextInputCharacter = {
  335. self.restoreCursorToPrevious()
  336. }
  337. let _ = dontConsumeNextInputCharacter
  338. // Handle reconsume by passing the character around in the state enum
  339. let currentInputCharacter = nextInputCharacter ?? nextCodePoint()
  340. switch self.state {
  341. // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
  342. case .Data:
  343. precondition(currentTokensAttributes == nil)
  344. switch currentInputCharacter {
  345. case "&":
  346. self.returnState = .Data
  347. return switchTo(.CharacterReference)
  348. case "<":
  349. return switchTo(.TagOpen)
  350. case "\0":
  351. // FIXME: log_parse_error()
  352. return emitCharacter("\u{FFFD}")
  353. case nil:
  354. return emitEOF()
  355. default:
  356. return emitCharacter(currentInputCharacter!)
  357. }
  358. // 13.2.5.2 RCDATA state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
  359. case .RCDATA:
  360. switch currentInputCharacter {
  361. case "&":
  362. self.returnState = .RCDATA
  363. return switchTo(.CharacterReference)
  364. case "<":
  365. return switchTo(.RCDATALessThanSign)
  366. case "\0":
  367. // FIXME: log_parse_error()
  368. return emitCharacter("\u{FFFD}")
  369. case nil:
  370. return emitEOF()
  371. default:
  372. return emitCharacter(currentInputCharacter!)
  373. }
  374. // 13.2.5.3. RAWTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
  375. case .RAWTEXT:
  376. switch currentInputCharacter {
  377. case "<":
  378. return switchTo(.RAWTEXTLessThanSign)
  379. case "\0":
  380. // FIXME: log_parse_error()
  381. return emitCharacter("\u{FFFD}")
  382. case nil:
  383. return emitEOF()
  384. default:
  385. return emitCharacter(currentInputCharacter!)
  386. }
  387. // 13.2.5.4 Script data state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
  388. case .ScriptData:
  389. switch currentInputCharacter {
  390. case "<":
  391. return switchTo(.ScriptDataLessThanSign)
  392. case "\0":
  393. // FIXME: log_parse_error()
  394. return emitCharacter("\u{FFFD}")
  395. case nil:
  396. return emitEOF()
  397. default:
  398. return emitCharacter(currentInputCharacter!)
  399. }
  400. // 13.2.5.5 PLAINTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
  401. case .PLAINTEXT:
  402. switch currentInputCharacter {
  403. case "\0":
  404. // FIXME: log_parse_error()
  405. return emitCharacter("\u{FFFD}")
  406. case nil:
  407. return emitEOF()
  408. default:
  409. return emitCharacter(currentInputCharacter!)
  410. }
  411. // 13.2.5.6 Tag open state https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
  412. case .TagOpen:
  413. switch currentInputCharacter {
  414. case "!":
  415. return switchTo(.MarkupDeclarationOpen)
  416. case "/":
  417. return switchTo(.EndTagOpen)
  418. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  419. createNewToken(HTMLToken(type: .StartTag(tagName: "")))
  420. return reconsume(currentInputCharacter!, in: .TagName)
  421. case "?":
  422. // FIXME: log_parse_error()
  423. createNewToken(HTMLToken(type: .Comment(data: "")))
  424. return reconsume(currentInputCharacter!, in: .BogusComment)
  425. case nil:
  426. // FIXME: log_parse_error()
  427. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  428. return emitEOF()
  429. default:
  430. // FIXME: log_parse_error()
  431. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  432. return reconsume(currentInputCharacter!, in: .Data)
  433. }
  434. // 13.2.5.7 End tag open state, https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
  435. case .EndTagOpen:
  436. switch currentInputCharacter {
  437. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  438. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  439. return reconsume(currentInputCharacter!, in: .TagName)
  440. default:
  441. return emitEOF()
  442. }
  443. // 13.2.5.8 Tag name state, https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
  444. case .TagName:
  445. switch currentInputCharacter {
  446. case "\t", "\n", "\u{000C}", " ":
  447. self.currentToken.tagName = self.currentBuilder.takeString()
  448. return switchTo(.BeforeAttributeName)
  449. case "/":
  450. self.currentToken.tagName = self.currentBuilder.takeString()
  451. return switchTo(.SelfClosingStartTag)
  452. case ">":
  453. self.currentToken.tagName = self.currentBuilder.takeString()
  454. return switchToAndEmitCurrentToken(.Data)
  455. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  456. currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  457. return continueInCurrentState()
  458. case "\0":
  459. // FIXME: log_parse_error()
  460. currentBuilder += "\u{FFFD}"
  461. return continueInCurrentState()
  462. case nil:
  463. // FIXME: log_parse_error()
  464. return emitEOF()
  465. default:
  466. currentBuilder.append(currentInputCharacter!)
  467. return continueInCurrentState()
  468. }
  469. // 13.2.5.9 RCDATA less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
  470. case .RCDATALessThanSign:
  471. switch currentInputCharacter {
  472. case "/":
  473. self.temporaryBuffer = ""
  474. return switchTo(.RCDATAEndTagOpen)
  475. default:
  476. return emitCharacterAndReconsume("<", in: .RCDATA, currentInputCharacter: currentInputCharacter)
  477. }
  478. // 13.2.5.10 RCDATA end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
  479. case .RCDATAEndTagOpen:
  480. switch currentInputCharacter {
  481. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  482. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  483. return reconsume(currentInputCharacter!, in: .RCDATAEndTagName)
  484. default:
  485. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  486. queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  487. return reconsume(currentInputCharacter, in: .RCDATA)
  488. }
  489. // 13.2.5.11 RCDATA end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
  490. case .RCDATAEndTagName:
  491. switch currentInputCharacter {
  492. case "\t", "\n", "\u{000C}", " ":
  493. if self.isAppropriateEndTagToken(currentToken) {
  494. return switchTo(.BeforeAttributeName)
  495. }
  496. break
  497. case "/":
  498. if self.isAppropriateEndTagToken(currentToken) {
  499. return switchTo(.SelfClosingStartTag)
  500. }
  501. break
  502. case ">":
  503. if self.isAppropriateEndTagToken(currentToken) {
  504. return switchToAndEmitCurrentToken(.Data)
  505. }
  506. break
  507. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  508. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  509. self.temporaryBuffer.append(c)
  510. return continueInCurrentState()
  511. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  512. self.currentBuilder.append(c)
  513. self.temporaryBuffer.append(c)
  514. return continueInCurrentState()
  515. default:
  516. break
  517. }
  518. // First three steps fall through to the "anything else" block
  519. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  520. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  521. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  522. self.currentBuilder = ""
  523. for codePoint in self.temporaryBuffer {
  524. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
  525. }
  526. return reconsume(currentInputCharacter, in: .RCDATA)
  527. // 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
  528. case .ScriptDataLessThanSign:
  529. switch currentInputCharacter {
  530. case "/":
  531. self.temporaryBuffer = ""
  532. return switchTo(.ScriptDataEndTagOpen)
  533. case "!":
  534. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  535. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "!")))
  536. return switchTo(.ScriptDataEscapeStart)
  537. default:
  538. return emitCharacterAndReconsume("<", in: .ScriptData, currentInputCharacter: currentInputCharacter)
  539. }
  540. // 13.2.5.16 Script data end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
  541. case .ScriptDataEndTagOpen:
  542. switch currentInputCharacter {
  543. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  544. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  545. return reconsume(currentInputCharacter!, in: .ScriptDataEndTagName)
  546. default:
  547. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  548. queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  549. return reconsume(currentInputCharacter, in: .ScriptData)
  550. }
  551. // 13.2.5.17 Script data end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
  552. case .ScriptDataEndTagName:
  553. switch currentInputCharacter {
  554. case "\t", "\n", "\u{000C}", " ":
  555. if self.isAppropriateEndTagToken(currentToken) {
  556. return switchTo(.BeforeAttributeName)
  557. }
  558. break
  559. case "/":
  560. if self.isAppropriateEndTagToken(currentToken) {
  561. return switchTo(.SelfClosingStartTag)
  562. }
  563. break
  564. case ">":
  565. if self.isAppropriateEndTagToken(currentToken) {
  566. return switchToAndEmitCurrentToken(.Data)
  567. }
  568. break
  569. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  570. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  571. self.temporaryBuffer.append(c)
  572. return continueInCurrentState()
  573. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  574. self.currentBuilder.append(c)
  575. self.temporaryBuffer.append(c)
  576. return continueInCurrentState()
  577. default:
  578. break
  579. }
  580. // First three steps fall through to the "anything else" block
  581. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  582. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  583. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  584. self.currentBuilder = ""
  585. for codePoint in self.temporaryBuffer {
  586. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
  587. }
  588. return reconsume(currentInputCharacter, in: .ScriptData)
  589. // 13.2.5.18 Script data escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
  590. case .ScriptDataEscapeStart:
  591. switch currentInputCharacter {
  592. case "-":
  593. return switchToAndEmitCharacter(.ScriptDataEscapeStartDash, character: "-")
  594. default:
  595. return reconsume(currentInputCharacter, in: .ScriptData)
  596. }
  597. // 13.2.5.19 Script data escape start dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
  598. case .ScriptDataEscapeStartDash:
  599. switch currentInputCharacter {
  600. case "-":
  601. return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-")
  602. default:
  603. return reconsume(currentInputCharacter, in: .ScriptData)
  604. }
  605. // 13.2.5.20 Script data escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
  606. case .ScriptDataEscaped:
  607. switch currentInputCharacter {
  608. case "-":
  609. return switchToAndEmitCharacter(.ScriptDataEscapedDash, character: "-")
  610. case "<":
  611. return switchTo(.ScriptDataEscapedLessThanSign)
  612. case "\0":
  613. // FIXME: log_parse_error()
  614. return emitCharacter("\u{FFFD}")
  615. case nil:
  616. // FIXME: log_parse_error()
  617. return emitEOF()
  618. default:
  619. return emitCharacter(currentInputCharacter!)
  620. }
  621. // 13.2.5.21 Script data escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
  622. case .ScriptDataEscapedDash:
  623. switch currentInputCharacter {
  624. case "-":
  625. return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-")
  626. case "<":
  627. return switchTo(.ScriptDataEscapedLessThanSign)
  628. case "\0":
  629. // FIXME: log_parse_error()
  630. return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}")
  631. case nil:
  632. // FIXME: log_parse_error()
  633. return emitEOF()
  634. default:
  635. return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
  636. }
  637. // 13.2.5.22 Script data escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
  638. case .ScriptDataEscapedDashDash:
  639. switch currentInputCharacter {
  640. case "-":
  641. return emitCharacter("-")
  642. case "<":
  643. return switchTo(.ScriptDataEscapedLessThanSign)
  644. case ">":
  645. return switchToAndEmitCharacter(.ScriptData, character: ">")
  646. case "\0":
  647. // FIXME: log_parse_error()
  648. return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}")
  649. case nil:
  650. // FIXME: log_parse_error()
  651. return emitEOF()
  652. default:
  653. return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
  654. }
  655. // 13.2.5.23 Script data escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
  656. case .ScriptDataEscapedLessThanSign:
  657. switch currentInputCharacter {
  658. case "/":
  659. self.temporaryBuffer = ""
  660. return switchTo(.ScriptDataEscapedEndTagOpen)
  661. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  662. self.temporaryBuffer = ""
  663. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  664. return reconsume(currentInputCharacter!, in: .ScriptDataDoubleEscapeStart)
  665. default:
  666. return emitCharacterAndReconsume("<", in: .ScriptDataEscaped, currentInputCharacter: currentInputCharacter)
  667. }
  668. // 13.2.5.24 Script data escaped end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
  669. case .ScriptDataEscapedEndTagOpen:
  670. switch currentInputCharacter {
  671. case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
  672. createNewToken(HTMLToken(type: .EndTag(tagName: "")))
  673. return reconsume(currentInputCharacter!, in: .ScriptDataEscapedEndTagName)
  674. default:
  675. queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  676. queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  677. return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
  678. }
  679. // 13.2.5.25 Script data escaped end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
  680. case .ScriptDataEscapedEndTagName:
  681. switch currentInputCharacter {
  682. case "\t", "\n", "\u{000C}", " ":
  683. if self.isAppropriateEndTagToken(currentToken) {
  684. return switchTo(.BeforeAttributeName)
  685. }
  686. break
  687. case "/":
  688. if self.isAppropriateEndTagToken(currentToken) {
  689. return switchTo(.SelfClosingStartTag)
  690. }
  691. break
  692. case ">":
  693. if self.isAppropriateEndTagToken(currentToken) {
  694. return switchToAndEmitCurrentToken(.Data)
  695. }
  696. break
  697. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  698. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  699. self.temporaryBuffer.append(c)
  700. return continueInCurrentState()
  701. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  702. self.currentBuilder.append(c)
  703. self.temporaryBuffer.append(c)
  704. return continueInCurrentState()
  705. default:
  706. break
  707. }
  708. // First three steps fall through to the "anything else" block
  709. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
  710. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
  711. // NOTE: The spec doesn't mention this, but it seems that currentToken (an end tag) is just dropped in this case.
  712. self.currentBuilder = ""
  713. for codePoint in self.temporaryBuffer {
  714. self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
  715. }
  716. return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
  717. // 13.2.5.26 Script data double escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
  718. case .ScriptDataDoubleEscapeStart:
  719. switch currentInputCharacter {
  720. case "\t", "\n", "\u{000C}", " ", "/", ">":
  721. if self.temporaryBuffer == "script" {
  722. return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
  723. } else {
  724. return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
  725. }
  726. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  727. self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  728. return emitCharacter(currentInputCharacter!)
  729. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  730. self.temporaryBuffer.append(c)
  731. return emitCharacter(currentInputCharacter!)
  732. default:
  733. return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
  734. }
  735. // 13.2.5.27 Script data double escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
  736. case .ScriptDataDoubleEscaped:
  737. switch currentInputCharacter {
  738. case "-":
  739. return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDash, character: "-")
  740. case "<":
  741. return switchTo(.ScriptDataDoubleEscapedLessThanSign)
  742. case "\0":
  743. // FIXME: log_parse_error()
  744. return emitCharacter("\u{FFFD}")
  745. case nil:
  746. // FIXME: log_parse_error()
  747. return emitEOF()
  748. default:
  749. return emitCharacter(currentInputCharacter!)
  750. }
  751. // 13.2.5.28 Script data double escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
  752. case .ScriptDataDoubleEscapedDash:
  753. switch currentInputCharacter {
  754. case "-":
  755. return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDashDash, character: "-")
  756. case "<":
  757. return switchTo(.ScriptDataDoubleEscapedLessThanSign)
  758. case "\0":
  759. // FIXME: log_parse_error()
  760. return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}")
  761. case nil:
  762. // FIXME: log_parse_error()
  763. return emitEOF()
  764. default:
  765. return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
  766. }
  767. // 13.2.5.29 Script data double escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
  768. case .ScriptDataDoubleEscapedDashDash:
  769. switch currentInputCharacter {
  770. case "-":
  771. return emitCharacter("-")
  772. case "<":
  773. return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
  774. case ">":
  775. return switchToAndEmitCharacter(.ScriptData, character: ">")
  776. case "\0":
  777. // FIXME: log_parse_error()
  778. return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}")
  779. case nil:
  780. // FIXME: log_parse_error()
  781. return emitEOF()
  782. default:
  783. return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
  784. }
  785. // 13.2.5.30 Script data double escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
  786. case .ScriptDataDoubleEscapedLessThanSign:
  787. switch currentInputCharacter {
  788. case "/":
  789. self.temporaryBuffer = ""
  790. return switchToAndEmitCharacter(.ScriptDataDoubleEscapeEnd, character: "/")
  791. default:
  792. return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped)
  793. }
  794. // 13.2.5.31 Script data double escape end state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
  795. case .ScriptDataDoubleEscapeEnd:
  796. switch currentInputCharacter {
  797. case "\t", "\n", "\u{000C}", " ", "/", ">":
  798. if self.temporaryBuffer == "script" {
  799. return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
  800. } else {
  801. return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
  802. }
  803. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  804. self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  805. return emitCharacter(currentInputCharacter!)
  806. case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
  807. self.temporaryBuffer.append(c)
  808. return emitCharacter(currentInputCharacter!)
  809. default:
  810. return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped)
  811. }
  812. // 13.2.5.32 Before attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
  813. case .BeforeAttributeName:
  814. switch currentInputCharacter {
  815. case "\t", "\n", "\u{000C}", " ":
  816. return continueInCurrentState()
  817. case "/", ">", nil:
  818. return reconsume(currentInputCharacter, in: .AfterAttributeName)
  819. case "=":
  820. // FIXME: log_parse_error()
  821. self.currentBuilder = Swift.String(currentInputCharacter!)
  822. self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
  823. return switchTo(.AttributeName)
  824. default:
  825. self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
  826. return reconsume(currentInputCharacter!, in: .AttributeName)
  827. }
  828. // 13.2.5.33 Attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
  829. case .AttributeName:
  830. // FIXME: When the user agent leaves the attribute name state (and before emitting the tag token, if appropriate),
  831. // the complete attribute's name must be compared to the other attributes on the same token;
  832. // if there is already an attribute on the token with the exact same name, then this is a duplicate-attribute
  833. // parse error and the new attribute must be removed from the token.
  834. // NOTE: If an attribute is so removed from a token, it, and the value that gets associated with it, if any,
  835. // are never subsequently used by the parser, and are therefore effectively discarded. Removing the attribute
  836. // in this way does not change its status as the "current attribute" for the purposes of the tokenizer, however.
  837. switch currentInputCharacter {
  838. case "\t", "\n", "\u{000C}", " ", "/", ">", nil:
  839. // FIXME: set name position
  840. self.currentAttribute!.localName = self.currentBuilder.takeString()
  841. return reconsume(currentInputCharacter, in: .AfterAttributeName)
  842. case "=":
  843. // FIXME: set name position
  844. self.currentAttribute!.localName = self.currentBuilder.takeString()
  845. return switchTo(.BeforeAttributeValue)
  846. case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
  847. self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
  848. return continueInCurrentState()
  849. case "\0":
  850. // FIXME: log_parse_error()
  851. self.currentBuilder.append("\u{FFFD}")
  852. return continueInCurrentState()
  853. default:
  854. self.currentBuilder.append(currentInputCharacter!)
  855. return continueInCurrentState()
  856. }
  857. // 13.2.5.34 After attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
  858. case .AfterAttributeName:
  859. switch currentInputCharacter {
  860. case "\t", "\n", "\u{000C}", " ":
  861. return continueInCurrentState()
  862. case "/":
  863. self.finalizeCurrentAttribute(.SetName)
  864. return switchTo(.SelfClosingStartTag)
  865. case "=":
  866. self.finalizeCurrentAttribute(.SetName)
  867. return switchTo(.BeforeAttributeValue)
  868. case ">":
  869. self.finalizeCurrentAttribute(.SetName)
  870. return switchToAndEmitCurrentToken(.Data)
  871. case nil:
  872. // FIXME: log_parse_error()
  873. self.finalizeCurrentAttribute(.IgnoreString)
  874. return emitEOF()
  875. default:
  876. self.finalizeCurrentAttribute(.SetName)
  877. self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
  878. return reconsume(currentInputCharacter!, in: .AttributeName)
  879. }
  880. // 13.2.5.35 Before attribute value state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
  881. case .BeforeAttributeValue:
  882. switch currentInputCharacter {
  883. case "\t", "\n", "\u{000C}", " ":
  884. return continueInCurrentState()
  885. case "\"":
  886. return switchTo(.AttributeValueDoubleQuoted)
  887. case "'":
  888. return switchTo(.AttributeValueSingleQuoted)
  889. case ">":
  890. // FIXME: log_parse_error()
  891. self.finalizeCurrentAttribute(.IgnoreString)
  892. return switchToAndEmitCurrentToken(.Data)
  893. default:
  894. return reconsume(currentInputCharacter, in: .AttributeValueUnquoted)
  895. }
  896. // 13.2.5.36 Attribute value (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
  897. case .AttributeValueDoubleQuoted:
  898. switch currentInputCharacter {
  899. case "\"":
  900. return switchTo(.AfterAttributeValueQuoted)
  901. case "&":
  902. self.returnState = .AttributeValueDoubleQuoted
  903. return switchTo(.CharacterReference)
  904. case "\0":
  905. // FIXME: log_parse_error()
  906. self.currentBuilder.append("\u{FFFD}")
  907. return continueInCurrentState()
  908. case nil:
  909. // FIXME: log_parse_error()
  910. self.finalizeCurrentAttribute(.IgnoreString)
  911. return emitEOF()
  912. default:
  913. self.currentBuilder.append(currentInputCharacter!)
  914. return continueInCurrentState()
  915. }
  916. // 13.2.5.37 Attribute value (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state
  917. case .AttributeValueSingleQuoted:
  918. switch currentInputCharacter {
  919. case "'":
  920. return switchTo(.AfterAttributeValueQuoted)
  921. case "&":
  922. self.returnState = .AttributeValueSingleQuoted
  923. return switchTo(.CharacterReference)
  924. case "\0":
  925. // FIXME: log_parse_error()
  926. self.currentBuilder.append("\u{FFFD}")
  927. return continueInCurrentState()
  928. case nil:
  929. // FIXME: log_parse_error()
  930. return emitEOF()
  931. default:
  932. self.currentBuilder.append(currentInputCharacter!)
  933. return continueInCurrentState()
  934. }
  935. // 13.2.5.38 Attribute value (unquoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state
  936. case .AttributeValueUnquoted:
  937. switch currentInputCharacter {
  938. case "\t", "\n", "\u{000C}", " ":
  939. self.finalizeCurrentAttribute(.SetValue)
  940. return switchTo(.BeforeAttributeName)
  941. case "&":
  942. self.returnState = .AttributeValueUnquoted
  943. return switchTo(.CharacterReference)
  944. case ">":
  945. self.finalizeCurrentAttribute(.SetValue)
  946. return switchToAndEmitCurrentToken(.Data)
  947. case "\0":
  948. // FIXME: log_parse_error()
  949. self.currentBuilder.append("\u{FFFD}")
  950. return continueInCurrentState()
  951. case "\"", "'", "<", "=", "`":
  952. // FIXME: log_parse_error()
  953. self.currentBuilder.append(currentInputCharacter!)
  954. return continueInCurrentState()
  955. case nil:
  956. // FIXME: log_parse_error()
  957. self.finalizeCurrentAttribute(.IgnoreString)
  958. return emitEOF()
  959. default:
  960. self.currentBuilder.append(currentInputCharacter!)
  961. return continueInCurrentState()
  962. }
  963. // 13.2.5.39 After attribute value (quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
  964. case .AfterAttributeValueQuoted:
  965. switch currentInputCharacter {
  966. case "\t", "\n", "\u{000C}", " ":
  967. self.finalizeCurrentAttribute(.SetValue)
  968. return switchTo(.BeforeAttributeName)
  969. case "/":
  970. self.finalizeCurrentAttribute(.SetValue)
  971. return switchTo(.SelfClosingStartTag)
  972. case ">":
  973. self.finalizeCurrentAttribute(.SetValue)
  974. return switchToAndEmitCurrentToken(.Data)
  975. case nil:
  976. // FIXME: log_parse_error()
  977. self.finalizeCurrentAttribute(.IgnoreString)
  978. return emitEOF()
  979. default:
  980. // FIXME: log_parse_error()
  981. self.finalizeCurrentAttribute(.SetValue)
  982. return reconsume(currentInputCharacter!, in: .BeforeAttributeName)
  983. }
  984. default:
  985. print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
  986. return emitEOF()
  987. }
  988. }
  989. }