HTMLTokenizer.swift 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. /*
  2. * Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. import AK
  7. import Collections
  8. import Foundation
  9. @_exported import WebCxx
  extension Swift.String {
      /// Creates a string by passing `decoding` and `as` to `Web.HTML.decode_to_utf8`.
      ///
      /// Initialization fails (returns `nil`) when `decode_to_utf8` reports that it
      /// produced no value.
      ///
      /// - Parameters:
      ///   - decoding: The input handed to `decode_to_utf8` as its first argument.
      ///   - as: The encoding handed to `decode_to_utf8` as its second argument.
      public init?(decoding: AK.StringView, as: AK.StringView) {
          let maybeDecoded = Web.HTML.decode_to_utf8(decoding, `as`)
          // Bind the payload rather than force-unwrapping it after the `hasValue`
          // check, so a missing value can never trap here.
          guard maybeDecoded.hasValue, let decoded = maybeDecoded.value else {
              return nil
          }
          self.init(decoded)
      }
  }
  /// Holds the input text, the current state, and a queue of pending tokens for
  /// HTML tokenization.
  ///
  /// As written, the class only stores its input and hands out tokens that are
  /// already present in `queuedTokens`; nothing in this class produces tokens.
  class HTMLTokenizer {
      /// The states the tokenizer can be in.
      ///
      /// NOTE(review): the case names appear to mirror the tokenization states of
      /// the WHATWG HTML Standard — confirm before relying on that mapping.
      enum State {
          // Top-level content states.
          case Data
          case RCDATA
          case RAWTEXT
          case ScriptData
          case PLAINTEXT

          // Tag states.
          case TagOpen
          case EndTagOpen
          case TagName

          // RCDATA end-tag states.
          case RCDATALessThanSign
          case RCDATAEndTagOpen
          case RCDATAEndTagName

          // RAWTEXT end-tag states.
          case RAWTEXTLessThanSign
          case RAWTEXTEndTagOpen
          case RAWTEXTEndTagName

          // Script data states.
          case ScriptDataLessThanSign
          case ScriptDataEndTagOpen
          case ScriptDataEndTagName
          case ScriptDataEscapeStart
          case ScriptDataEscapeStartDash
          case ScriptDataEscaped
          case ScriptDataEscapedDash
          case ScriptDataEscapedDashDash
          case ScriptDataEscapedLessThanSign
          case ScriptDataEscapedEndTagOpen
          case ScriptDataEscapedEndTagName
          case ScriptDataDoubleEscapeStart
          case ScriptDataDoubleEscaped
          case ScriptDataDoubleEscapedDash
          case ScriptDataDoubleEscapedDashDash
          case ScriptDataDoubleEscapedLessThanSign
          case ScriptDataDoubleEscapeEnd

          // Attribute states.
          case BeforeAttributeName
          case AttributeName
          case AfterAttributeName
          case BeforeAttributeValue
          case AttributeValueDoubleQuoted
          case AttributeValueSingleQuoted
          case AttributeValueUnquoted
          case AfterAttributeValueQuoted
          case SelfClosingStartTag

          // Comment states.
          case BogusComment
          case MarkupDeclarationOpen
          case CommentStart
          case CommentStartDash
          case Comment
          case CommentLessThanSign
          case CommentLessThanSignBang
          case CommentLessThanSignBangDash
          case CommentLessThanSignBangDashDash
          case CommentEndDash
          case CommentEnd
          case CommentEndBang

          // DOCTYPE states.
          case DOCTYPE
          case BeforeDOCTYPEName
          case DOCTYPEName
          case AfterDOCTYPEName
          case AfterDOCTYPEPublicKeyword
          case BeforeDOCTYPEPublicIdentifier
          case DOCTYPEPublicIdentifierDoubleQuoted
          case DOCTYPEPublicIdentifierSingleQuoted
          case AfterDOCTYPEPublicIdentifier
          case BetweenDOCTYPEPublicAndSystemIdentifiers
          case AfterDOCTYPESystemKeyword
          case BeforeDOCTYPESystemIdentifier
          case DOCTYPESystemIdentifierDoubleQuoted
          case DOCTYPESystemIdentifierSingleQuoted
          case AfterDOCTYPESystemIdentifier
          case BogusDOCTYPE

          // CDATA section states.
          case CDATASection
          case CDATASectionBracket
          case CDATASectionEnd

          // Character reference states.
          case CharacterReference
          case NamedCharacterReference
          case AmbiguousAmpersand
          case NumericCharacterReference
          case HexadecimalCharacterReferenceStart
          case DecimalCharacterReferenceStart
          case HexadecimalCharacterReference
          case DecimalCharacterReference
          case NumericCharacterReferenceEnd
      }

      /// The text to tokenize; empty unless set by `init(input:encoding:)`.
      var input = Swift.String()

      /// The current tokenizer state; starts as `.Data`.
      var state = State.Data

      /// Starts as `.Data`.
      /// NOTE(review): presumably the state to resume after a sub-state finishes;
      /// no method in this class reads or writes it — confirm intended use.
      var returnState = State.Data

      /// A default-constructed token; no method in this class reads or writes it.
      var currentToken = HTMLToken()

      /// Pending tokens; `nextToken(stopAtInsertionPoint:)` removes from the front.
      var queuedTokens = Deque<HTMLToken>()

      /// Creates a tokenizer with empty input.
      public init() {}

      /// Creates a tokenizer whose input is `input` decoded using `encoding`.
      ///
      /// Fails (returns `nil`) when `Swift.String(decoding:as:)` cannot produce a
      /// string from the given arguments.
      ///
      /// - Parameters:
      ///   - input: The data to decode.
      ///   - encoding: The encoding passed as the `as:` argument of the decoder.
      public init?(input: AK.StringView, encoding: AK.StringView) {
          guard let decoded = Swift.String(decoding: input, as: encoding) else {
              return nil
          }
          self.input = decoded
      }

      /// Removes and returns the token at the front of `queuedTokens`.
      ///
      /// - Parameter stopAtInsertionPoint: Currently unused by this implementation.
      /// - Returns: The first queued token, or `nil` when the queue is empty.
      public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
          // `popFirst()` already yields nil on an empty deque, so no emptiness
          // check (or loop) is needed around it.
          return queuedTokens.popFirst()
      }
  }