HTMLTokenizer.swift 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328
  1. /*
  2. * Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. import AK
  7. import Collections
  8. import Foundation
  9. @_exported import WebCxx
  extension Swift.String {
      /// Builds a Swift string from an `AK.StringView` by handing it to `Web.HTML.decode_to_utf8`.
      ///
      /// - Parameters:
      ///   - decoding: The view whose contents should be decoded.
      ///   - as: Forwarded unchanged as the second argument of `decode_to_utf8`
      ///         (callers in this file pass an encoding name such as "UTF-8").
      /// - Returns: `nil` when the decoder reports that it produced no value.
      public init?(decoding: AK.StringView, as: AK.StringView) {
          let decodeResult = Web.HTML.decode_to_utf8(decoding, `as`)
          guard decodeResult.hasValue else {
              return nil
          }
          // `hasValue` was checked just above, so the unwrap cannot fail here.
          self.init(akString: decodeResult.value!)
      }
  }
  /// A pull-based HTML tokenizer implemented as a state machine over `State`.
  ///
  /// Callers repeatedly invoke `nextToken(stopAtInsertionPoint:)` until it returns `nil`.
  /// Only the Data state is implemented so far; every other state prints a TODO diagnostic
  /// and terminates the token stream with an end-of-file token.
  public class HTMLTokenizer {

      /// Tokenizer states. The names follow the states of the WHATWG HTML tokenization algorithm
      /// (https://html.spec.whatwg.org/multipage/parsing.html#tokenization).
      public enum State {
          case Data
          case RCDATA
          case RAWTEXT
          case ScriptData
          case PLAINTEXT
          case TagOpen
          case EndTagOpen
          case TagName
          case RCDATALessThanSign
          case RCDATAEndTagOpen
          case RCDATAEndTagName
          case RAWTEXTLessThanSign
          case RAWTEXTEndTagOpen
          case RAWTEXTEndTagName
          case ScriptDataLessThanSign
          case ScriptDataEndTagOpen
          case ScriptDataEndTagName
          case ScriptDataEscapeStart
          case ScriptDataEscapeStartDash
          case ScriptDataEscaped
          case ScriptDataEscapedDash
          case ScriptDataEscapedDashDash
          case ScriptDataEscapedLessThanSign
          case ScriptDataEscapedEndTagOpen
          case ScriptDataEscapedEndTagName
          case ScriptDataDoubleEscapeStart
          case ScriptDataDoubleEscaped
          case ScriptDataDoubleEscapedDash
          case ScriptDataDoubleEscapedDashDash
          case ScriptDataDoubleEscapedLessThanSign
          case ScriptDataDoubleEscapeEnd
          case BeforeAttributeName
          case AttributeName
          case AfterAttributeName
          case BeforeAttributeValue
          case AttributeValueDoubleQuoted
          case AttributeValueSingleQuoted
          case AttributeValueUnquoted
          case AfterAttributeValueQuoted
          case SelfClosingStartTag
          case BogusComment
          case MarkupDeclarationOpen
          case CommentStart
          case CommentStartDash
          case Comment
          case CommentLessThanSign
          case CommentLessThanSignBang
          case CommentLessThanSignBangDash
          case CommentLessThanSignBangDashDash
          case CommentEndDash
          case CommentEnd
          case CommentEndBang
          case DOCTYPE
          case BeforeDOCTYPEName
          case DOCTYPEName
          case AfterDOCTYPEName
          case AfterDOCTYPEPublicKeyword
          case BeforeDOCTYPEPublicIdentifier
          case DOCTYPEPublicIdentifierDoubleQuoted
          case DOCTYPEPublicIdentifierSingleQuoted
          case AfterDOCTYPEPublicIdentifier
          case BetweenDOCTYPEPublicAndSystemIdentifiers
          case AfterDOCTYPESystemKeyword
          case BeforeDOCTYPESystemIdentifier
          case DOCTYPESystemIdentifierDoubleQuoted
          case DOCTYPESystemIdentifierSingleQuoted
          case AfterDOCTYPESystemIdentifier
          case BogusDOCTYPE
          case CDATASection
          case CDATASectionBracket
          case CDATASectionEnd
          case CharacterReference
          case NamedCharacterReference
          case AmbiguousAmpersand
          case NumericCharacterReference
          case HexadecimalCharacterReferenceStart
          case DecimalCharacterReferenceStart
          case HexadecimalCharacterReference
          case DecimalCharacterReference
          case NumericCharacterReferenceEnd
      }

      /// The decoded text being tokenized.
      private var input = Swift.String()
      /// Position of the next character to consume.
      private var cursor: Swift.String.Index
      /// Position of the most recently consumed character; used to "un-consume" it.
      private var previousCursor: Swift.String.Index

      /// The state the machine is currently in.
      public private(set) var state = State.Data
      /// The state to go back to once a nested state (e.g. a character reference) finishes.
      private var returnState = State.Data

      /// The token currently under construction.
      private var currentToken = HTMLToken()
      /// Tokens that are ready but have not yet been handed to the caller (FIFO).
      private var queuedTokens = Deque<HTMLToken>()

      private var aborted = false
      private var hasEmittedEOF = false

      /// Creates a tokenizer over empty input.
      public init() {
          self.cursor = self.input.startIndex
          self.previousCursor = self.input.startIndex
      }

      /// Creates a tokenizer over `input`, decoded via `Swift.String(decoding:as:)` with `encoding`.
      ///
      /// - Returns: `nil` when the input cannot be decoded.
      public init?(input: AK.StringView, encoding: AK.StringView) {
          guard let decoded = Swift.String(decoding: input, as: encoding) else {
              return nil
          }
          self.input = decoded
          self.cursor = self.input.startIndex
          self.previousCursor = self.input.startIndex
      }

      /// Creates a tokenizer over `input`, decoding it as "UTF-8".
      public convenience init?(input: AK.StringView) {
          self.init(input: input, encoding: "UTF-8")
      }

      /// Marks the tokenizer as aborted: once the token queue is empty, `nextToken` returns `nil`.
      public func abort() {
          self.aborted = true
      }

      /// Advances the cursor by `count` characters, clamping at the end of the input, and records
      /// the position just before the new cursor as `previousCursor`.
      func skip(_ count: Int) {
          self.cursor = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.endIndex) ?? self.input.endIndex
          // `index(before:)` traps at `startIndex`, so only step back when the cursor actually moved
          // past the beginning (e.g. not for `skip(0)` at the start or for empty input).
          if self.cursor > self.input.startIndex {
              self.previousCursor = self.input.index(before: self.cursor)
          } else {
              self.previousCursor = self.input.startIndex
          }
      }

      /// Returns the character `offset` positions away from the cursor without consuming anything.
      ///
      /// - Returns: `nil` when that position lies outside the input (including empty input or a
      ///   cursor that already sits at the end) instead of trapping.
      func peekCodePoint(_ offset: Int = 0) -> Character? {
          // Bound the walk on whichever side we are moving towards so it can never leave the string.
          let limit = offset >= 0 ? self.input.endIndex : self.input.startIndex
          guard let index = self.input.index(self.cursor, offsetBy: offset, limitedBy: limit),
                index < self.input.endIndex
          else {
              return nil
          }
          return self.input[index]
      }

      /// Consumes and returns the next input character with newlines normalized to U+000A LF,
      /// or `nil` at the end of the input.
      ///
      /// NOTE(review): iteration is by Swift `Character` (grapheme cluster), not by Unicode scalar,
      /// so a base character plus combining marks is returned as one unit — confirm this is intended.
      func nextCodePoint() -> Character? {
          guard self.cursor < self.input.endIndex else {
              return nil
          }

          // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
          // https://infra.spec.whatwg.org/#normalize-newlines
          let peeked = self.input[self.cursor]

          if peeked == "\r\n" {
              // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point.
              // Swift never splits CR LF: the pair is ONE Character, so it must be matched as such.
              skip(1)
              return "\n"
          }

          if peeked == "\r" {
              // replace every remaining U+000D CR code point with a U+000A LF code point.
              // The two-character form is kept defensively in case CR and LF ever arrive separately.
              skip(peekCodePoint(1) == "\n" ? 2 : 1)
              return "\n"
          }

          skip(1)
          return peeked
      }

      /// Rewinds the cursor to the most recently consumed character.
      func restoreCursorToPrevious() {
          self.cursor = self.previousCursor
      }

      /// Makes `token` the token under construction.
      func createNewToken(_ token: HTMLToken) {
          self.currentToken = token
          // FIXME: Assign Position
      }

      /// What `nextToken` should do after one step of the state machine.
      enum NextTokenState {
          /// Hand `token` to the caller (`nil` ends the token stream).
          case Emit(token: HTMLToken?)
          /// The state changed; run another step with a freshly consumed character.
          case SwitchTo
          /// The state changed; run another step with `inputCharacter` instead of consuming.
          case Reconsume(inputCharacter: Character?)
          /// Drain the token queue before running another step.
          case ReprocessQueue
      }

      /// Returns the next token, or `nil` when the stream has ended or the tokenizer was aborted
      /// and no queued tokens remain.
      ///
      /// - Parameter stopAtInsertionPoint: Currently unused (see FIXME below).
      public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
          // Outer optional: "do we have an answer yet?". Inner optional: the answer itself, where
          // `nil` means "no more tokens". `Optional(nil)` is therefore "answer: stop", while a plain
          // `nil` means "nothing queued, keep tokenizing".
          let processQueue = { () -> HTMLToken?? in
              if let token = self.queuedTokens.popFirst() {
                  return token
              }
              return self.aborted ? Optional(nil) : nil
          }

          if let maybeToken = processQueue() {
              return maybeToken
          }

          var nextInputCharacter: Character? = nil
          while true {
              // FIXME: Handle insertion point
              switch nextTokenImpl(nextInputCharacter) {
              case .Emit(let token):
                  return token
              case .SwitchTo:
                  nextInputCharacter = nil
              case .Reconsume(let character):
                  nextInputCharacter = character
              case .ReprocessQueue:
                  if let maybeToken = processQueue() {
                      return maybeToken
                  }
                  nextInputCharacter = nil
              }
          }
      }

      /// Moves to `state`; the next step consumes a new character.
      func switchTo(_ state: State) -> NextTokenState {
          self.state = state
          return .SwitchTo
      }

      /// Moves to `state` and feeds `character` to it again instead of consuming a new one.
      func reconsume(_ character: Character, `in` state: State) -> NextTokenState {
          self.state = state
          return .Reconsume(inputCharacter: character)
      }

      /// Moves back to the saved return state and asks the caller to drain the token queue.
      func switchToReturnState() -> NextTokenState {
          self.state = self.returnState
          return .ReprocessQueue
      }

      /// Moves back to the saved return state; when `character` is non-nil the cursor is rewound so
      /// that character is consumed again. Asks the caller to drain the token queue.
      func reconsumeInReturnState(_ character: Character?) -> NextTokenState {
          self.state = self.returnState
          if character != nil {
              restoreCursorToPrevious()
          }
          return .ReprocessQueue
      }

      /// Moves to `state`, queues the token under construction, and emits the oldest queued token.
      func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState {
          self.state = state
          self.queuedTokens.append(self.currentToken)
          self.currentToken = HTMLToken()
          // The queue is non-empty: a token was appended just above.
          return .Emit(token: self.queuedTokens.popFirst()!)
      }

      /// Moves to `state` and emits a character token for `character`.
      func switchToAndEmitCharacter(_ state: State, character: Character) -> NextTokenState {
          self.state = state
          return emitCharacter(character)
      }

      /// Queues a character token for `character` (without returning it yet), moves to `in`, and
      /// feeds `currentInputCharacter` to that state.
      func emitCharacterAndReconsume(_ character: Character, `in`: State, currentInputCharacter: Character?) -> NextTokenState {
          self.queuedTokens.append(HTMLToken(type: .Character(codePoint: character)))
          self.state = `in`
          return .Reconsume(inputCharacter: currentInputCharacter)
      }

      /// Emits the end-of-file token once; afterwards emits `nil` to signal the end of the stream.
      func emitEOF() -> NextTokenState {
          if self.hasEmittedEOF {
              return .Emit(token: nil)
          }
          self.hasEmittedEOF = true
          createNewToken(HTMLToken(type: .EndOfFile))
          self.queuedTokens.append(self.currentToken)
          self.currentToken = HTMLToken()
          // Oldest first: anything queued earlier goes out before the EOF token does.
          return .Emit(token: self.queuedTokens.popFirst()!)
      }

      /// Queues the token under construction and then emits EOF behind it.
      /// Must not be called after EOF has already been emitted.
      func emitCurrentTokenFollowedByEOF() -> NextTokenState {
          precondition(!self.hasEmittedEOF)
          self.queuedTokens.append(self.currentToken)
          self.currentToken = HTMLToken()
          return emitEOF()
      }

      /// Queues a character token for `character` and emits the oldest queued token.
      func emitCharacter(_ character: Character) -> NextTokenState {
          createNewToken(HTMLToken(type: .Character(codePoint: character)))
          self.queuedTokens.append(self.currentToken)
          self.currentToken = HTMLToken()
          return .Emit(token: self.queuedTokens.popFirst()!)
      }

      /// Runs one step of the state machine.
      ///
      /// - Parameter nextInputCharacter: A character to reconsume; when `nil`, the next character
      ///   is consumed from the input.
      /// - Returns: The action `nextToken` should take next.
      func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
          // Placeholder for states that are not implemented yet; referenced once to silence the
          // unused-variable warning.
          let dontConsumeNextInputCharacter = {
              self.restoreCursorToPrevious()
          }
          let _ = dontConsumeNextInputCharacter

          // FIXME: flushCodepointsConsumedAsACharacterReference needs currentBuilder

          // Handle reconsume by passing the character around in the state enum
          let currentInputCharacter = nextInputCharacter ?? nextCodePoint()

          switch self.state {
          // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
          case .Data:
              switch currentInputCharacter {
              case "&":
                  self.returnState = .Data
                  return switchTo(.CharacterReference)
              case "<":
                  return switchTo(.TagOpen)
              case "\0":
                  // FIXME: log_parse_error()
                  // unexpected-null-character: in the Data state the spec says to emit the current
                  // input character (U+0000) itself; U+FFFD substitution applies to other states.
                  return emitCharacter("\0")
              case nil:
                  return emitEOF()
              default:
                  return emitCharacter(currentInputCharacter!)
              }
          default:
              print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
              return emitEOF()
          }
      }
  }