// TestHTMLTokenizerSwift.swift
  1. /*
  2. * Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
import AK
import Testing
import Web
  9. @Suite
  10. struct TestHTMLTokenizerSwift {
  11. @Test func tokenTypes() {
  12. let default_token = HTMLToken()
  13. default_token.type = .Character(codePoint: "a")
  14. #expect(default_token.isCharacter())
  15. #expect("\(default_token)" == "HTMLToken(type: Character(codePoint: a))")
  16. }
  17. @Test func parserWhitespace() {
  18. for codePoint: Character in ["\t", "\n", "\r", "\u{000C}", " "] {
  19. let token = HTMLToken(type: .Character(codePoint: codePoint))
  20. #expect(token.isParserWhitespace())
  21. }
  22. for codePoint: Character in ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] {
  23. let token = HTMLToken(type: .Character(codePoint: codePoint))
  24. #expect(!token.isParserWhitespace())
  25. }
  26. }
  27. @Test func dataStateNoInput() {
  28. let tokenizer = HTMLTokenizer()
  29. #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
  30. let token = tokenizer.nextToken()
  31. #expect(token?.type == .EndOfFile)
  32. let token2 = tokenizer.nextToken()
  33. #expect(token2 == nil)
  34. #expect(tokenizer.state == HTMLTokenizer.State.Data)
  35. }
  36. @Test func dataStateSingleChar() {
  37. guard let tokenizer = HTMLTokenizer(input: "X") else {
  38. Issue.record("Failed to create tokenizer for 'X'")
  39. return
  40. }
  41. #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
  42. let token = tokenizer.nextToken()
  43. #expect(token?.type == .Character(codePoint: "X"))
  44. let token2 = tokenizer.nextToken()
  45. #expect(token2?.type == .EndOfFile)
  46. let token3 = tokenizer.nextToken()
  47. #expect(token3 == nil)
  48. #expect(tokenizer.state == HTMLTokenizer.State.Data)
  49. }
  50. @Test func dataStateAmpersand() {
  51. guard let tokenizer = HTMLTokenizer(input: "&") else {
  52. Issue.record("Failed to create tokenizer for '&'")
  53. return
  54. }
  55. #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
  56. let token = tokenizer.nextToken()
  57. #expect(token?.type == .Character(codePoint: "&"))
  58. let token2 = tokenizer.nextToken()
  59. #expect(token2?.type == .EndOfFile)
  60. let token3 = tokenizer.nextToken()
  61. #expect(token3 == nil)
  62. }
  63. @Test func tagOpenOnly() {
  64. guard let tokenizer = HTMLTokenizer(input: "<") else {
  65. Issue.record("Failed to create tokenizer for '<'")
  66. return
  67. }
  68. #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
  69. let token = tokenizer.nextToken()
  70. #expect(token?.type == .Character(codePoint: "<"))
  71. let token2 = tokenizer.nextToken()
  72. #expect(token2?.type == .EndOfFile)
  73. #expect(tokenizer.state == HTMLTokenizer.State.TagOpen)
  74. let token3 = tokenizer.nextToken()
  75. #expect(token3 == nil)
  76. }
  77. @Test func dataStateNulChar() {
  78. guard let tokenizer = HTMLTokenizer(input: "H\0I") else {
  79. Issue.record("Failed to create tokenizer for 'H\\0I'")
  80. return
  81. }
  82. #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
  83. let token = tokenizer.nextToken()
  84. #expect(token?.type == .Character(codePoint: "H"))
  85. let token2 = tokenizer.nextToken()
  86. #expect(token2?.type == .Character(codePoint: "\u{FFFD}"))
  87. let token3 = tokenizer.nextToken()
  88. #expect(token3?.type == .Character(codePoint: "I"))
  89. let token4 = tokenizer.nextToken()
  90. #expect(token4?.type == .EndOfFile)
  91. #expect(tokenizer.state == HTMLTokenizer.State.Data)
  92. }
  93. @Test func scriptTagWithAttributes() {
  94. guard let tokenizer = HTMLTokenizer(input: "<script type=\"text/javascript\">") else {
  95. Issue.record("Failed to create tokenizer for '<script type=\"text/javascript\">'")
  96. return
  97. }
  98. #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
  99. let token = tokenizer.nextToken()
  100. #expect(token?.type == .StartTag(tagName: "script", attributes: [HTMLToken.Attribute(localName: "type", value: "text/javascript")]))
  101. let token2 = tokenizer.nextToken()
  102. #expect(token2?.type == .EndOfFile)
  103. #expect(tokenizer.state == HTMLTokenizer.State.Data)
  104. }
  105. @Test func scriptWithContent() {
  106. guard let tokenizer = HTMLTokenizer(input: "<script>var x = 1;</script>") else {
  107. Issue.record("Failed to create tokenizer for '<script>var x = 1;</script>'")
  108. return
  109. }
  110. let token = tokenizer.nextToken()
  111. #expect(token?.type == .StartTag(tagName: "script", attributes: []))
  112. for codePoint in "var x = 1;" {
  113. let token = tokenizer.nextToken()
  114. #expect(token?.type == .Character(codePoint: codePoint))
  115. }
  116. let token2 = tokenizer.nextToken()
  117. #expect(token2?.type == .EndTag(tagName: "script"))
  118. let token3 = tokenizer.nextToken()
  119. #expect(token3?.type == .EndOfFile)
  120. }
  121. @Test func simpleDivWithContent() {
  122. guard let tokenizer = HTMLTokenizer(input: "<div>hi</div>") else {
  123. Issue.record("Failed to create tokenizer for '<div>hi</div>'")
  124. return
  125. }
  126. #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
  127. let token = tokenizer.nextToken()
  128. #expect(token?.type == .StartTag(tagName: "div", attributes: []))
  129. let token2 = tokenizer.nextToken()
  130. #expect(token2?.type == .Character(codePoint: "h"))
  131. let token3 = tokenizer.nextToken()
  132. #expect(token3?.type == .Character(codePoint: "i"))
  133. let token4 = tokenizer.nextToken()
  134. #expect(token4?.type == .EndTag(tagName: "div"))
  135. let token5 = tokenizer.nextToken()
  136. #expect(token5?.type == .EndOfFile)
  137. }
  138. @Test func simpleDivWithContentAndAttributes() {
  139. guard let tokenizer = HTMLTokenizer(input: "<div class=\"foo\">hi</div>") else {
  140. Issue.record("Failed to create tokenizer for '<div class=\"foo\">hi</div>'")
  141. return
  142. }
  143. #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
  144. let token = tokenizer.nextToken()
  145. #expect(token?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "foo")]))
  146. let token2 = tokenizer.nextToken()
  147. #expect(token2?.type == .Character(codePoint: "h"))
  148. let token3 = tokenizer.nextToken()
  149. #expect(token3?.type == .Character(codePoint: "i"))
  150. let token4 = tokenizer.nextToken()
  151. #expect(token4?.type == .EndTag(tagName: "div"))
  152. let token5 = tokenizer.nextToken()
  153. #expect(token5?.type == .EndOfFile)
  154. }
  155. @Test func severalDivsWithAttributesAndContent() {
  156. // Explicitly use unquoted and single quotes for attribute values
  157. guard let tokenizer = HTMLTokenizer(input: "<div class=foo>hi</div><div class='bar'>bye</div>") else {
  158. Issue.record("Failed to create tokenizer for '<div class=\"foo\">hi</div><div class=\"bar\">bye</div>'")
  159. return
  160. }
  161. let token = tokenizer.nextToken()
  162. #expect(token?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "foo")]))
  163. for codePoint in "hi" {
  164. let token = tokenizer.nextToken()
  165. #expect(token?.type == .Character(codePoint: codePoint))
  166. }
  167. let token2 = tokenizer.nextToken()
  168. #expect(token2?.type == .EndTag(tagName: "div"))
  169. let token3 = tokenizer.nextToken()
  170. #expect(token3?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "bar")]))
  171. for codePoint in "bye" {
  172. let token = tokenizer.nextToken()
  173. #expect(token?.type == .Character(codePoint: codePoint))
  174. }
  175. let token4 = tokenizer.nextToken()
  176. #expect(token4?.type == .EndTag(tagName: "div"))
  177. let token5 = tokenizer.nextToken()
  178. #expect(token5?.type == .EndOfFile)
  179. }
  180. @Test func startTagWithMultipleAttributes() {
  181. guard let tokenizer = HTMLTokenizer(input: "<div class=\"foo\" id=\"bar\">hi</div attr=endTagAttributeWhee>") else {
  182. Issue.record("Failed to create tokenizer for '<div class=\"foo\" id=\"bar\">hi</div>'")
  183. return
  184. }
  185. let token = tokenizer.nextToken()
  186. #expect(token?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "foo"), HTMLToken.Attribute(localName: "id", value: "bar")]))
  187. for codePoint in "hi" {
  188. let token = tokenizer.nextToken()
  189. #expect(token?.type == .Character(codePoint: codePoint))
  190. }
  191. let token2 = tokenizer.nextToken()
  192. #expect(token2?.type == .EndTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "attr", value: "endTagAttributeWhee")]))
  193. let token3 = tokenizer.nextToken()
  194. #expect(token3?.type == .EndOfFile)
  195. }
  196. @Test func xmlDeclaration() {
  197. guard let tokenizer = HTMLTokenizer(input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>") else {
  198. Issue.record("Failed to create tokenizer for '<?xml version=\"1.0\" encoding=\"UTF-8\"?>'")
  199. return
  200. }
  201. let token = tokenizer.nextToken()
  202. #expect(token?.type == .Comment(data: "?xml version=\"1.0\" encoding=\"UTF-8\"?"))
  203. let token2 = tokenizer.nextToken()
  204. #expect(token2?.type == .EndOfFile)
  205. }
  206. @Test func simpleComment() {
  207. guard let tokenizer = HTMLTokenizer(input: "<!-- comment -->") else {
  208. Issue.record("Failed to create tokenizer for '<!-- comment -->'")
  209. return
  210. }
  211. let token = tokenizer.nextToken()
  212. #expect(token?.type == .Comment(data: " comment "))
  213. let token2 = tokenizer.nextToken()
  214. #expect(token2?.type == .EndOfFile)
  215. }
  216. @Test func nestedComment() {
  217. guard let tokenizer = HTMLTokenizer(input: "<!-- <!-- nested --> -->") else {
  218. Issue.record("Failed to create tokenizer for '<!-- <!-- nested --> -->'")
  219. return
  220. }
  221. let token = tokenizer.nextToken()
  222. #expect(token?.type == .Comment(data: " <!-- nested "))
  223. for codePoint in " -->" {
  224. let token = tokenizer.nextToken()
  225. #expect(token?.type == .Character(codePoint: codePoint))
  226. }
  227. let token2 = tokenizer.nextToken()
  228. #expect(token2?.type == .EndOfFile)
  229. }
  230. @Test func commentWithScriptTagInside() {
  231. guard let tokenizer = HTMLTokenizer(input: "<!-- <script>var x = 1;</script> -->") else {
  232. Issue.record("Failed to create tokenizer for '<!-- <script>var x = 1;</script> -->'")
  233. return
  234. }
  235. let token = tokenizer.nextToken()
  236. #expect(token?.type == .Comment(data: " <script>var x = 1;</script> "))
  237. let token2 = tokenizer.nextToken()
  238. #expect(token2?.type == .EndOfFile)
  239. }
  240. }