HTMLTokenizer.cpp 66 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <LibTextCodec/Decoder.h>
  27. #include <LibWeb/Parser/Entities.h>
  28. #include <LibWeb/Parser/HTMLToken.h>
  29. #include <LibWeb/Parser/HTMLTokenizer.h>
  30. #include <ctype.h>
  // Silence GCC's unused-label warning for this file. BEGIN_STATE (below) emits a
  // goto label for every state; presumably not every one of them is jumped to.
  31. #pragma GCC diagnostic ignored "-Wunused-label"
  // Uncomment the next line to make PARSE_ERROR() log tokenizer parse errors.
  32. //#define TOKENIZER_TRACE
  33. #ifdef TOKENIZER_TRACE
  // Tracing build: PARSE_ERROR() writes the enclosing function name and the source
  // line of the call site to dbg().
  34. #define PARSE_ERROR() \
  35. do { \
  36. dbg() << "Parse error (tokenization)" << __PRETTY_FUNCTION__ << " @ " << __LINE__; \
  37. } while (0)
  38. #else
  // Default build: PARSE_ERROR() expands to nothing.
  39. #define PARSE_ERROR()
  40. #endif
  // State-transition helpers. They expand inside HTMLTokenizer::next_token() and
  // rely on its local `current_input_character`, its per-state goto labels and its
  // `_StartOfFunction` label. will_switch_to(), will_reconsume_in() and will_emit()
  // are hooks defined outside this view.

  // Read the next input element into current_input_character.
  // Note: the expansion already ends with a semicolon.
  41. #define CONSUME_NEXT_INPUT_CHARACTER \
  42. current_input_character = next_codepoint();
  // Enter `new_state`: record it in m_state, consume the next input character and
  // jump straight to that state's label.
  43. #define SWITCH_TO(new_state) \
  44. do { \
  45. will_switch_to(State::new_state); \
  46. m_state = State::new_state; \
  47. CONSUME_NEXT_INPUT_CHARACTER; \
  48. goto new_state; \
  49. } while (0)
  // Enter `new_state` without consuming anything, so the current input character
  // is processed again by the new state.
  50. #define RECONSUME_IN(new_state) \
  51. do { \
  52. will_reconsume_in(State::new_state); \
  53. m_state = State::new_state; \
  54. goto new_state; \
  55. } while (0)
  // Enter the state stored in m_return_state. The target is a runtime value, so
  // instead of jumping to a state label this restarts next_token() at
  // _StartOfFunction and lets its switch on m_state dispatch.
  56. #define SWITCH_TO_RETURN_STATE \
  57. do { \
  58. will_switch_to(m_return_state); \
  59. m_state = m_return_state; \
  60. goto _StartOfFunction; \
  61. } while (0)
  // Same as SWITCH_TO_RETURN_STATE except that will_reconsume_in() is notified.
  // NOTE(review): restarting at _StartOfFunction reaches the loop's own
  // next_codepoint() call, so the current character does not appear to be
  // reconsumed here - confirm this is intended.
  62. #define RECONSUME_IN_RETURN_STATE \
  63. do { \
  64. will_reconsume_in(m_return_state); \
  65. m_state = m_return_state; \
  66. goto _StartOfFunction; \
  67. } while (0)
  // Record `new_state` in m_state, queue m_current_token and return the token at
  // the front of the queue from next_token(). Nothing is consumed here.
  68. #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
  69. do { \
  70. will_switch_to(State::new_state); \
  71. m_state = State::new_state; \
  72. will_emit(m_current_token); \
  73. m_queued_tokens.enqueue(m_current_token); \
  74. return m_queued_tokens.dequeue(); \
  75. } while (0)
  // Queue a character token for `codepoint` (without returning) and then
  // reconsume the current input character in `new_state`.
  76. #define EMIT_CHARACTER_AND_RECONSUME_IN(codepoint, new_state) \
  77. do { \
  78. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint)); \
  79. will_reconsume_in(State::new_state); \
  80. m_state = State::new_state; \
  81. goto new_state; \
  82. } while (0)
  // Flush every entry of m_temporary_buffer: when
  // consumed_as_part_of_an_attribute() is true the entry is appended to the value
  // of the last attribute of the current tag token; otherwise a fresh Character
  // token holding that entry is created and queued.
  83. #define FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE \
  84. do { \
  85. for (auto codepoint : m_temporary_buffer) { \
  86. if (consumed_as_part_of_an_attribute()) { \
  87. m_current_token.m_tag.attributes.last().value_builder.append(codepoint); \
  88. } else { \
  89. create_new_token(HTMLToken::Type::Character); \
  90. m_current_token.m_comment_or_character.data.append(codepoint); \
  91. m_queued_tokens.enqueue(m_current_token); \
  92. } \
  93. } \
  94. } while (0)
  // Step the cursor back by one so the most recently consumed character is read
  // again by the next next_codepoint() call.
  // NOTE(review): next_codepoint() does not advance the cursor at end of input, so
  // using this right after an end-of-input read rewinds past a character that was
  // consumed earlier - confirm callers never hit that case.
  #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;

  // Input matchers. Each expands to an `if (...)` header that tests the local
  // `current_input_character`; the braces that follow at the use site form the
  // handler for that case. All character-class matchers are strictly ASCII and use
  // explicit range checks: the <ctype.h> classifiers are locale dependent and are
  // undefined for arguments that do not fit in an unsigned char, which full code
  // points routinely do not.

  // Matches exactly `codepoint`.
  #define ON(codepoint) \
      if (current_input_character.has_value() && current_input_character.value() == (codepoint))

  // Matches end of input (no current character).
  #define ON_EOF \
      if (!current_input_character.has_value())

  // Matches A-Z and a-z.
  #define ON_ASCII_ALPHA \
      if (current_input_character.has_value() && ((current_input_character.value() >= 'A' && current_input_character.value() <= 'Z') || (current_input_character.value() >= 'a' && current_input_character.value() <= 'z')))

  // Matches A-Z, a-z and 0-9.
  #define ON_ASCII_ALPHANUMERIC \
      if (current_input_character.has_value() && ((current_input_character.value() >= 'A' && current_input_character.value() <= 'Z') || (current_input_character.value() >= 'a' && current_input_character.value() <= 'z') || (current_input_character.value() >= '0' && current_input_character.value() <= '9')))

  // Matches A-Z.
  #define ON_ASCII_UPPER_ALPHA \
      if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')

  // Matches a-z.
  #define ON_ASCII_LOWER_ALPHA \
      if (current_input_character.has_value() && current_input_character.value() >= 'a' && current_input_character.value() <= 'z')

  // Matches 0-9 only. (This previously tested for hex digits, so a-f/A-F were
  // wrongly accepted as decimal digits.)
  #define ON_ASCII_DIGIT \
      if (current_input_character.has_value() && current_input_character.value() >= '0' && current_input_character.value() <= '9')

  // Matches 0-9, A-F and a-f.
  #define ON_ASCII_HEX_DIGIT \
      if (current_input_character.has_value() && ((current_input_character.value() >= '0' && current_input_character.value() <= '9') || (current_input_character.value() >= 'A' && current_input_character.value() <= 'F') || (current_input_character.value() >= 'a' && current_input_character.value() <= 'f')))

  // Matches tab, line feed, form feed and space.
  #define ON_WHITESPACE \
      if (current_input_character.has_value() && (current_input_character.value() == '\t' || current_input_character.value() == '\n' || current_input_character.value() == '\f' || current_input_character.value() == ' '))

  // Catch-all handler; must come last in a state because it always matches.
  #define ANYTHING_ELSE if (1)
  // Token-emitting helpers. Each one ends by returning from next_token(), so any
  // statement placed after them in the same branch is never executed.

  // Emit the end-of-file token exactly once: if it was already emitted, return an
  // empty result; otherwise set m_has_emitted_eof, build an EndOfFile token, queue
  // it and return the token at the front of the queue.
  115. #define EMIT_EOF \
  116. do { \
  117. if (m_has_emitted_eof) \
  118. return {}; \
  119. m_has_emitted_eof = true; \
  120. create_new_token(HTMLToken::Type::EndOfFile); \
  121. will_emit(m_current_token); \
  122. m_queued_tokens.enqueue(m_current_token); \
  123. return m_queued_tokens.dequeue(); \
  124. } while (0)
  // Queue m_current_token and return the token at the front of the queue.
  125. #define EMIT_CURRENT_TOKEN \
  126. do { \
  127. will_emit(m_current_token); \
  128. m_queued_tokens.enqueue(m_current_token); \
  129. return m_queued_tokens.dequeue(); \
  130. } while (0)
  // Build a Character token holding `codepoint`, queue it and return the token at
  // the front of the queue. Unlike the two macros above, will_emit() is not called.
  131. #define EMIT_CHARACTER(codepoint) \
  132. do { \
  133. create_new_token(HTMLToken::Type::Character); \
  134. m_current_token.m_comment_or_character.data.append(codepoint); \
  135. m_queued_tokens.enqueue(m_current_token); \
  136. return m_queued_tokens.dequeue(); \
  137. } while (0)
  // EMIT_CHARACTER for the current input character. Calls value() unconditionally,
  // so it must only be used where current_input_character is known to be set.
  // Note: the expansion already ends with a semicolon.
  138. #define EMIT_CURRENT_CHARACTER \
  139. EMIT_CHARACTER(current_input_character.value());
  // Open the handler for one tokenizer state. Expands to both a goto label named
  // `state` (the target of SWITCH_TO / RECONSUME_IN) and the matching
  // `case State::state:` of the switch on m_state, followed by three opening
  // braces that END_STATE closes again.
  140. #define BEGIN_STATE(state) \
  141. state: \
  142. case State::state: { \
  143. { \
  144. {
  // Close a handler opened with BEGIN_STATE. Every branch of a state is expected
  // to leave via continue, goto or return, so falling through to this point trips
  // ASSERT_NOT_REACHED(). The three closing braces balance those of BEGIN_STATE.
  145. #define END_STATE \
  146. ASSERT_NOT_REACHED(); \
  147. break; \
  148. } \
  149. } \
  150. }
  // True when `codepoint` lies in the surrogate range U+D800..U+DFFF.
  static inline bool is_surrogate(u32 codepoint)
  {
      if (codepoint < 0xd800)
          return false;
      return codepoint <= 0xdfff;
  }
  // True when `codepoint` is in U+FDD0..U+FDEF, or is at most U+10FFFF and its low
  // 16 bits are FFFE or FFFF.
  static inline bool is_noncharacter(u32 codepoint)
  {
      // Nothing below U+FDD0 or above U+10FFFF qualifies.
      if (codepoint < 0xfdd0 || codepoint > 0x10ffff)
          return false;
      if (codepoint <= 0xfdef)
          return true;
      const u32 low_half = codepoint & 0xffff;
      return low_half == 0xfffe || low_half == 0xffff;
  }
  // True when `codepoint` is in the range U+0000..U+001F.
  static inline bool is_c0_control(u32 codepoint)
  {
      return codepoint < 0x20;
  }
  163. static inline bool is_control(u32 codepoint)
  164. {
  165. return is_c0_control(codepoint) || (codepoint >= 0x7f && codepoint <= 0x9f);
  166. }
  167. namespace Web {
  168. Optional<u32> HTMLTokenizer::next_codepoint()
  169. {
  170. if (m_cursor >= m_input.length())
  171. return {};
  172. return m_input[m_cursor++];
  173. }
  174. Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
  175. {
  176. if ((m_cursor + offset) >= m_input.length())
  177. return {};
  178. return m_input[m_cursor + offset];
  179. }
  180. Optional<HTMLToken> HTMLTokenizer::next_token()
  181. {
  182. _StartOfFunction:
  183. if (!m_queued_tokens.is_empty())
  184. return m_queued_tokens.dequeue();
  185. for (;;) {
  186. auto current_input_character = next_codepoint();
  187. switch (m_state) {
  188. BEGIN_STATE(Data)
  189. {
  190. ON('&')
  191. {
  192. m_return_state = State::Data;
  193. SWITCH_TO(CharacterReference);
  194. }
  195. ON('<')
  196. {
  197. SWITCH_TO(TagOpen);
  198. }
  199. ON(0)
  200. {
  201. PARSE_ERROR();
  202. EMIT_CURRENT_CHARACTER;
  203. }
  204. ON_EOF
  205. {
  206. EMIT_EOF;
  207. }
  208. ANYTHING_ELSE
  209. {
  210. EMIT_CURRENT_CHARACTER;
  211. }
  212. }
  213. END_STATE
  214. BEGIN_STATE(TagOpen)
  215. {
  216. ON('!')
  217. {
  218. SWITCH_TO(MarkupDeclarationOpen);
  219. }
  220. ON('/')
  221. {
  222. SWITCH_TO(EndTagOpen);
  223. }
  224. ON_ASCII_ALPHA
  225. {
  226. create_new_token(HTMLToken::Type::StartTag);
  227. RECONSUME_IN(TagName);
  228. }
  229. ON('?')
  230. {
  231. PARSE_ERROR();
  232. create_new_token(HTMLToken::Type::Comment);
  233. RECONSUME_IN(BogusComment);
  234. }
  235. ON_EOF
  236. {
  237. TODO();
  238. }
  239. ANYTHING_ELSE
  240. {
  241. PARSE_ERROR();
  242. EMIT_CHARACTER('<');
  243. RECONSUME_IN(Data);
  244. }
  245. }
  246. END_STATE
  247. BEGIN_STATE(TagName)
  248. {
  249. ON_WHITESPACE
  250. {
  251. SWITCH_TO(BeforeAttributeName);
  252. }
  253. ON('/')
  254. {
  255. SWITCH_TO(SelfClosingStartTag);
  256. }
  257. ON('>')
  258. {
  259. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  260. }
  261. ON_ASCII_UPPER_ALPHA
  262. {
  263. m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
  264. continue;
  265. }
  266. ON(0)
  267. {
  268. PARSE_ERROR();
  269. m_current_token.m_tag.tag_name.append("\uFFFD");
  270. continue;
  271. }
  272. ON_EOF
  273. {
  274. PARSE_ERROR();
  275. EMIT_EOF;
  276. }
  277. ANYTHING_ELSE
  278. {
  279. m_current_token.m_tag.tag_name.append(current_input_character.value());
  280. continue;
  281. }
  282. }
  283. END_STATE
  284. BEGIN_STATE(EndTagOpen)
  285. {
  286. ON_ASCII_ALPHA
  287. {
  288. create_new_token(HTMLToken::Type::EndTag);
  289. RECONSUME_IN(TagName);
  290. }
  291. ON('>')
  292. {
  293. PARSE_ERROR();
  294. SWITCH_TO(Data);
  295. }
  296. ON_EOF
  297. {
  298. PARSE_ERROR();
  299. // FIXME: Emit a U+003C LESS-THAN SIGN character token, a U+002F SOLIDUS character token and an end-of-file token.
  300. continue;
  301. }
  302. ANYTHING_ELSE
  303. {
  304. PARSE_ERROR();
  305. create_new_token(HTMLToken::Type::Comment);
  306. RECONSUME_IN(BogusComment);
  307. }
  308. }
  309. END_STATE
  310. BEGIN_STATE(MarkupDeclarationOpen)
  311. {
  312. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  313. if (consume_next_if_match("--")) {
  314. create_new_token(HTMLToken::Type::Comment);
  315. SWITCH_TO(CommentStart);
  316. }
  317. if (consume_next_if_match("DOCTYPE", CaseSensitivity::CaseInsensitive)) {
  318. SWITCH_TO(DOCTYPE);
  319. }
  320. }
  321. END_STATE
  322. BEGIN_STATE(BogusComment)
  323. {
  324. ON('>')
  325. {
  326. TODO();
  327. }
  328. ON_EOF
  329. {
  330. TODO();
  331. }
  332. ON(0)
  333. {
  334. TODO();
  335. }
  336. ANYTHING_ELSE
  337. {
  338. TODO();
  339. }
  340. }
  341. END_STATE
  342. BEGIN_STATE(DOCTYPE)
  343. {
  344. ON_WHITESPACE
  345. {
  346. SWITCH_TO(BeforeDOCTYPEName);
  347. }
  348. ON('>')
  349. {
  350. RECONSUME_IN(BeforeDOCTYPEName);
  351. }
  352. ON_EOF
  353. {
  354. TODO();
  355. }
  356. ANYTHING_ELSE
  357. {
  358. TODO();
  359. }
  360. }
  361. END_STATE
  362. BEGIN_STATE(BeforeDOCTYPEName)
  363. {
  364. ON_WHITESPACE
  365. {
  366. continue;
  367. }
  368. ON_ASCII_UPPER_ALPHA
  369. {
  370. create_new_token(HTMLToken::Type::DOCTYPE);
  371. m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
  372. SWITCH_TO(DOCTYPEName);
  373. }
  374. ON(0)
  375. {
  376. TODO();
  377. }
  378. ON('>')
  379. {
  380. TODO();
  381. }
  382. ON_EOF
  383. {
  384. TODO();
  385. }
  386. ANYTHING_ELSE
  387. {
  388. create_new_token(HTMLToken::Type::DOCTYPE);
  389. m_current_token.m_doctype.name.append(current_input_character.value());
  390. SWITCH_TO(DOCTYPEName);
  391. }
  392. }
  393. END_STATE
  394. BEGIN_STATE(DOCTYPEName)
  395. {
  396. ON_WHITESPACE
  397. {
  398. SWITCH_TO(AfterDOCTYPEName);
  399. }
  400. ON('>')
  401. {
  402. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  403. }
  404. ON_ASCII_UPPER_ALPHA
  405. {
  406. m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
  407. }
  408. ON(0)
  409. {
  410. TODO();
  411. }
  412. ON_EOF
  413. {
  414. TODO();
  415. }
  416. ANYTHING_ELSE
  417. {
  418. m_current_token.m_doctype.name.append(current_input_character.value());
  419. continue;
  420. }
  421. }
  422. END_STATE
  423. BEGIN_STATE(AfterDOCTYPEName)
  424. {
  425. ON_WHITESPACE
  426. {
  427. continue;
  428. }
  429. ON('>')
  430. {
  431. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  432. }
  433. ON_EOF
  434. {
  435. TODO();
  436. }
  437. ANYTHING_ELSE
  438. {
  439. if (toupper(current_input_character.value()) == 'P' && consume_next_if_match("UBLIC", CaseSensitivity::CaseInsensitive)) {
  440. SWITCH_TO(AfterDOCTYPEPublicKeyword);
  441. }
  442. if (toupper(current_input_character.value()) == 'S' && consume_next_if_match("YSTEM", CaseSensitivity::CaseInsensitive)) {
  443. SWITCH_TO(AfterDOCTYPESystemKeyword);
  444. }
  445. TODO();
  446. }
  447. }
  448. END_STATE
  449. BEGIN_STATE(AfterDOCTYPEPublicKeyword)
  450. {
  451. ON_WHITESPACE
  452. {
  453. SWITCH_TO(BeforeDOCTYPEPublicIdentifier);
  454. }
  455. ON('"')
  456. {
  457. TODO();
  458. }
  459. ON('\'')
  460. {
  461. TODO();
  462. }
  463. ON('>')
  464. {
  465. TODO();
  466. }
  467. ON_EOF
  468. {
  469. TODO();
  470. }
  471. ANYTHING_ELSE
  472. {
  473. TODO();
  474. }
  475. }
  476. END_STATE
  477. BEGIN_STATE(AfterDOCTYPESystemKeyword)
  478. {
  479. ON_WHITESPACE
  480. {
  481. SWITCH_TO(BeforeDOCTYPESystemIdentifier);
  482. }
  483. ON('"')
  484. {
  485. TODO();
  486. }
  487. ON('\'')
  488. {
  489. TODO();
  490. }
  491. ON('>')
  492. {
  493. TODO();
  494. }
  495. ON_EOF
  496. {
  497. TODO();
  498. }
  499. ANYTHING_ELSE
  500. {
  501. TODO();
  502. }
  503. }
  504. END_STATE
  505. BEGIN_STATE(BeforeDOCTYPEPublicIdentifier)
  506. {
  507. ON_WHITESPACE
  508. {
  509. continue;
  510. }
  511. ON('"')
  512. {
  513. m_current_token.m_doctype.public_identifier.clear();
  514. SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted);
  515. }
  516. ON('\'')
  517. {
  518. m_current_token.m_doctype.public_identifier.clear();
  519. SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted);
  520. }
  521. ON('>')
  522. {
  523. TODO();
  524. }
  525. ON_EOF
  526. {
  527. TODO();
  528. }
  529. ANYTHING_ELSE
  530. {
  531. TODO();
  532. }
  533. }
  534. END_STATE
  535. BEGIN_STATE(BeforeDOCTYPESystemIdentifier)
  536. {
  537. ON_WHITESPACE
  538. {
  539. continue;
  540. }
  541. ON('"')
  542. {
  543. m_current_token.m_doctype.system_identifier.clear();
  544. SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
  545. }
  546. ON('\'')
  547. {
  548. m_current_token.m_doctype.system_identifier.clear();
  549. SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
  550. }
  551. ON('>')
  552. {
  553. TODO();
  554. }
  555. ON_EOF
  556. {
  557. TODO();
  558. }
  559. ANYTHING_ELSE
  560. {
  561. TODO();
  562. }
  563. }
  564. END_STATE
  565. BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuoted)
  566. {
  567. ON('"')
  568. {
  569. SWITCH_TO(AfterDOCTYPEPublicIdentifier);
  570. }
  571. ON(0)
  572. {
  573. TODO();
  574. }
  575. ON('>')
  576. {
  577. TODO();
  578. }
  579. ON_EOF
  580. {
  581. TODO();
  582. }
  583. ANYTHING_ELSE
  584. {
  585. m_current_token.m_doctype.public_identifier.append(current_input_character.value());
  586. continue;
  587. }
  588. }
  589. END_STATE
  590. BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuoted)
  591. {
  592. ON('\'')
  593. {
  594. SWITCH_TO(AfterDOCTYPEPublicIdentifier);
  595. }
  596. ON(0)
  597. {
  598. TODO();
  599. }
  600. ON('>')
  601. {
  602. TODO();
  603. }
  604. ON_EOF
  605. {
  606. TODO();
  607. }
  608. ANYTHING_ELSE
  609. {
  610. m_current_token.m_doctype.public_identifier.append(current_input_character.value());
  611. continue;
  612. }
  613. }
  614. END_STATE
  615. BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuoted)
  616. {
  617. ON('"')
  618. {
  619. SWITCH_TO(AfterDOCTYPESystemIdentifier);
  620. }
  621. ON(0)
  622. {
  623. TODO();
  624. }
  625. ON('>')
  626. {
  627. TODO();
  628. }
  629. ON_EOF
  630. {
  631. TODO();
  632. }
  633. ANYTHING_ELSE
  634. {
  635. m_current_token.m_doctype.system_identifier.append(current_input_character.value());
  636. continue;
  637. }
  638. }
  639. END_STATE
  640. BEGIN_STATE(DOCTYPESystemIdentifierSingleQuoted)
  641. {
  642. ON('\'')
  643. {
  644. SWITCH_TO(AfterDOCTYPESystemIdentifier);
  645. }
  646. ON(0)
  647. {
  648. TODO();
  649. }
  650. ON('>')
  651. {
  652. TODO();
  653. }
  654. ON_EOF
  655. {
  656. TODO();
  657. }
  658. ANYTHING_ELSE
  659. {
  660. m_current_token.m_doctype.system_identifier.append(current_input_character.value());
  661. continue;
  662. }
  663. }
  664. END_STATE
  665. BEGIN_STATE(AfterDOCTYPEPublicIdentifier)
  666. {
  667. ON_WHITESPACE
  668. {
  669. SWITCH_TO(BetweenDOCTYPEPublicAndSystemIdentifiers);
  670. }
  671. ON('>')
  672. {
  673. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  674. }
  675. ON('"')
  676. {
  677. TODO();
  678. }
  679. ON('\'')
  680. {
  681. TODO();
  682. }
  683. ON_EOF
  684. {
  685. TODO();
  686. }
  687. ANYTHING_ELSE
  688. {
  689. TODO();
  690. }
  691. }
  692. END_STATE
  693. BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers)
  694. {
  695. ON_WHITESPACE
  696. {
  697. continue;
  698. }
  699. ON('>')
  700. {
  701. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  702. }
  703. ON('"')
  704. {
  705. m_current_token.m_doctype.system_identifier.clear();
  706. SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
  707. }
  708. ON('\'')
  709. {
  710. m_current_token.m_doctype.system_identifier.clear();
  711. SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
  712. }
  713. ON_EOF
  714. {
  715. TODO();
  716. }
  717. ANYTHING_ELSE
  718. {
  719. TODO();
  720. }
  721. }
  722. END_STATE
  723. BEGIN_STATE(AfterDOCTYPESystemIdentifier)
  724. {
  725. ON_WHITESPACE
  726. {
  727. continue;
  728. }
  729. ON('>')
  730. {
  731. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  732. }
  733. ON_EOF
  734. {
  735. TODO();
  736. }
  737. ANYTHING_ELSE
  738. {
  739. TODO();
  740. }
  741. }
  742. END_STATE
  743. BEGIN_STATE(BeforeAttributeName)
  744. {
  745. ON_WHITESPACE
  746. {
  747. continue;
  748. }
  749. ON('/')
  750. {
  751. RECONSUME_IN(AfterAttributeName);
  752. }
  753. ON('>')
  754. {
  755. RECONSUME_IN(AfterAttributeName);
  756. }
  757. ON_EOF
  758. {
  759. RECONSUME_IN(AfterAttributeName);
  760. }
  761. ON('=')
  762. {
  763. TODO();
  764. }
  765. ANYTHING_ELSE
  766. {
  767. m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
  768. RECONSUME_IN(AttributeName);
  769. }
  770. }
  771. END_STATE
  772. BEGIN_STATE(SelfClosingStartTag)
  773. {
  774. ON('>')
  775. {
  776. m_current_token.m_tag.self_closing = true;
  777. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  778. }
  779. ON_EOF
  780. {
  781. PARSE_ERROR();
  782. EMIT_EOF;
  783. }
  784. ANYTHING_ELSE
  785. {
  786. PARSE_ERROR();
  787. RECONSUME_IN(BeforeAttributeName);
  788. }
  789. }
  790. END_STATE
  791. BEGIN_STATE(AttributeName)
  792. {
  793. ON_WHITESPACE
  794. {
  795. RECONSUME_IN(AfterAttributeName);
  796. }
  797. ON('/')
  798. {
  799. RECONSUME_IN(AfterAttributeName);
  800. }
  801. ON('>')
  802. {
  803. RECONSUME_IN(AfterAttributeName);
  804. }
  805. ON_EOF
  806. {
  807. RECONSUME_IN(AfterAttributeName);
  808. }
  809. ON('=')
  810. {
  811. SWITCH_TO(BeforeAttributeValue);
  812. }
  813. ANYTHING_ELSE
  814. {
  815. m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value());
  816. continue;
  817. }
  818. }
  819. END_STATE
  820. BEGIN_STATE(AfterAttributeName)
  821. {
  822. ON_WHITESPACE
  823. {
  824. continue;
  825. }
  826. ON('/')
  827. {
  828. SWITCH_TO(SelfClosingStartTag);
  829. }
  830. ON('=')
  831. {
  832. SWITCH_TO(BeforeAttributeValue);
  833. }
  834. ON('>')
  835. {
  836. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  837. }
  838. ON_EOF
  839. {
  840. TODO();
  841. }
  842. ANYTHING_ELSE
  843. {
  844. m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
  845. RECONSUME_IN(AttributeName);
  846. }
  847. }
  848. END_STATE
  849. BEGIN_STATE(BeforeAttributeValue)
  850. {
  851. ON_WHITESPACE
  852. {
  853. continue;
  854. }
  855. ON('"')
  856. {
  857. SWITCH_TO(AttributeValueDoubleQuoted);
  858. }
  859. ON('\'')
  860. {
  861. SWITCH_TO(AttributeValueSingleQuoted);
  862. }
  863. ON('>')
  864. {
  865. PARSE_ERROR();
  866. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  867. }
  868. ANYTHING_ELSE
  869. {
  870. RECONSUME_IN(AttributeValueUnquoted);
  871. }
  872. }
  873. END_STATE
  874. BEGIN_STATE(AttributeValueDoubleQuoted)
  875. {
  876. ON('"')
  877. {
  878. SWITCH_TO(AfterAttributeValueQuoted);
  879. }
  880. ON('&')
  881. {
  882. m_return_state = State::AttributeValueDoubleQuoted;
  883. SWITCH_TO(CharacterReference);
  884. }
  885. ON(0)
  886. {
  887. TODO();
  888. }
  889. ON_EOF
  890. {
  891. TODO();
  892. }
  893. ANYTHING_ELSE
  894. {
  895. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  896. continue;
  897. }
  898. }
  899. END_STATE
  900. BEGIN_STATE(AttributeValueSingleQuoted)
  901. {
  902. ON('\'')
  903. {
  904. SWITCH_TO(AfterAttributeValueQuoted);
  905. }
  906. ON('&')
  907. {
  908. m_return_state = State::AttributeValueSingleQuoted;
  909. SWITCH_TO(CharacterReference);
  910. }
  911. ON(0)
  912. {
  913. TODO();
  914. }
  915. ON_EOF
  916. {
  917. TODO();
  918. }
  919. ANYTHING_ELSE
  920. {
  921. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  922. continue;
  923. }
  924. }
  925. END_STATE
  926. BEGIN_STATE(AttributeValueUnquoted)
  927. {
  928. ON_WHITESPACE
  929. {
  930. SWITCH_TO(BeforeAttributeName);
  931. }
  932. ON('&')
  933. {
  934. m_return_state = State::AttributeValueUnquoted;
  935. SWITCH_TO(CharacterReference);
  936. }
  937. ON('>')
  938. {
  939. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  940. }
  941. ON(0)
  942. {
  943. TODO();
  944. }
  945. ON_EOF
  946. {
  947. TODO();
  948. }
  949. ANYTHING_ELSE
  950. {
  951. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  952. continue;
  953. }
  954. }
  955. END_STATE
  956. BEGIN_STATE(AfterAttributeValueQuoted)
  957. {
  958. ON_WHITESPACE
  959. {
  960. SWITCH_TO(BeforeAttributeName);
  961. }
  962. ON('/')
  963. {
  964. SWITCH_TO(SelfClosingStartTag);
  965. }
  966. ON('>')
  967. {
  968. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  969. }
  970. ON_EOF
  971. {
  972. PARSE_ERROR();
  973. EMIT_EOF;
  974. }
  975. ANYTHING_ELSE
  976. {
  977. PARSE_ERROR();
  978. RECONSUME_IN(BeforeAttributeName);
  979. }
  980. }
  981. END_STATE
  982. BEGIN_STATE(CommentStart)
  983. {
  984. ON('-')
  985. {
  986. SWITCH_TO(CommentStartDash);
  987. }
  988. ON('>')
  989. {
  990. TODO();
  991. }
  992. ANYTHING_ELSE
  993. {
  994. RECONSUME_IN(Comment);
  995. }
  996. }
  997. END_STATE
  998. BEGIN_STATE(CommentStartDash)
  999. {
  1000. ON('-')
  1001. {
  1002. SWITCH_TO(CommentEnd);
  1003. }
  1004. ON('>')
  1005. {
  1006. TODO();
  1007. }
  1008. ON_EOF
  1009. {
  1010. TODO();
  1011. }
  1012. ANYTHING_ELSE
  1013. {
  1014. m_current_token.m_comment_or_character.data.append('-');
  1015. RECONSUME_IN(Comment);
  1016. }
  1017. }
  1018. END_STATE
  1019. BEGIN_STATE(Comment)
  1020. {
  1021. ON('<')
  1022. {
  1023. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  1024. SWITCH_TO(CommentLessThanSign);
  1025. }
  1026. ON('-')
  1027. {
  1028. SWITCH_TO(CommentEndDash);
  1029. }
  1030. ON(0)
  1031. {
  1032. TODO();
  1033. }
  1034. ON_EOF
  1035. {
  1036. TODO();
  1037. }
  1038. ANYTHING_ELSE
  1039. {
  1040. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  1041. continue;
  1042. }
  1043. }
  1044. END_STATE
  1045. BEGIN_STATE(CommentEnd)
  1046. {
  1047. ON('>')
  1048. {
  1049. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1050. }
  1051. ON('!')
  1052. {
  1053. SWITCH_TO(CommentEndBang);
  1054. }
  1055. ON('-')
  1056. {
  1057. m_current_token.m_comment_or_character.data.append('-');
  1058. continue;
  1059. }
  1060. ON_EOF
  1061. {
  1062. TODO();
  1063. }
  1064. ANYTHING_ELSE
  1065. {
  1066. m_current_token.m_comment_or_character.data.append('-');
  1067. RECONSUME_IN(Comment);
  1068. }
  1069. }
  1070. END_STATE
  1071. BEGIN_STATE(CommentEndBang)
  1072. {
  1073. ON('-')
  1074. {
  1075. m_current_token.m_comment_or_character.data.append("--!");
  1076. SWITCH_TO(CommentEndDash);
  1077. }
  1078. ON('>')
  1079. {
  1080. TODO();
  1081. }
  1082. ON_EOF
  1083. {
  1084. TODO();
  1085. }
  1086. ANYTHING_ELSE
  1087. {
  1088. m_current_token.m_comment_or_character.data.append("--!");
  1089. RECONSUME_IN(Comment);
  1090. }
  1091. }
  1092. END_STATE
  1093. BEGIN_STATE(CommentEndDash)
  1094. {
  1095. ON('-')
  1096. {
  1097. SWITCH_TO(CommentEnd);
  1098. }
  1099. ON_EOF
  1100. {
  1101. TODO();
  1102. }
  1103. ANYTHING_ELSE
  1104. {
  1105. m_current_token.m_comment_or_character.data.append('-');
  1106. RECONSUME_IN(Comment);
  1107. }
  1108. }
  1109. END_STATE
  1110. BEGIN_STATE(CommentLessThanSign)
  1111. {
  1112. ON('!')
  1113. {
  1114. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  1115. SWITCH_TO(CommentLessThanSignBang);
  1116. }
  1117. ON('<')
  1118. {
  1119. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  1120. continue;
  1121. }
  1122. ANYTHING_ELSE
  1123. {
  1124. RECONSUME_IN(Comment);
  1125. }
  1126. }
  1127. END_STATE
  1128. BEGIN_STATE(CommentLessThanSignBang)
  1129. {
  1130. ON('-')
  1131. {
  1132. SWITCH_TO(CommentLessThanSignBangDash);
  1133. }
  1134. ANYTHING_ELSE
  1135. {
  1136. RECONSUME_IN(Comment);
  1137. }
  1138. }
  1139. END_STATE
  1140. BEGIN_STATE(CommentLessThanSignBangDash)
  1141. {
  1142. ON('-')
  1143. {
  1144. SWITCH_TO(CommentLessThanSignBangDashDash);
  1145. }
  1146. ANYTHING_ELSE
  1147. {
  1148. RECONSUME_IN(Comment);
  1149. }
  1150. }
  1151. END_STATE
  1152. BEGIN_STATE(CommentLessThanSignBangDashDash)
  1153. {
  1154. ON('>')
  1155. {
  1156. SWITCH_TO(CommentEnd);
  1157. }
  1158. ANYTHING_ELSE
  1159. {
  1160. PARSE_ERROR();
  1161. RECONSUME_IN(CommentEnd);
  1162. }
  1163. }
  1164. END_STATE
  1165. BEGIN_STATE(CharacterReference)
  1166. {
  1167. m_temporary_buffer.clear();
  1168. m_temporary_buffer.append('&');
  1169. ON_ASCII_ALPHANUMERIC
  1170. {
  1171. RECONSUME_IN(NamedCharacterReference);
  1172. }
  1173. ON('#')
  1174. {
  1175. m_temporary_buffer.append(current_input_character.value());
  1176. SWITCH_TO(NumericCharacterReference);
  1177. }
  1178. ANYTHING_ELSE
  1179. {
  1180. RECONSUME_IN_RETURN_STATE;
  1181. }
  1182. }
  1183. END_STATE
  1184. BEGIN_STATE(NamedCharacterReference)
  1185. {
  1186. auto match = HTML::codepoints_from_entity(m_input.substring_view(m_cursor - 1, m_input.length() - m_cursor + 1));
  1187. if (match.has_value()) {
  1188. m_cursor += match.value().entity.length();
  1189. for (auto ch : match.value().entity)
  1190. m_temporary_buffer.append(ch);
  1191. if (consumed_as_part_of_an_attribute() && match.value().entity.ends_with(';')) {
  1192. auto next_codepoint = peek_codepoint(0);
  1193. if (next_codepoint.has_value() && next_codepoint.value() == '=') {
  1194. FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
  1195. SWITCH_TO_RETURN_STATE;
  1196. }
  1197. }
  1198. if (!match.value().entity.ends_with(';')) {
  1199. PARSE_ERROR();
  1200. }
  1201. m_temporary_buffer.clear();
  1202. m_temporary_buffer.append(match.value().codepoints);
  1203. FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
  1204. SWITCH_TO_RETURN_STATE;
  1205. } else {
  1206. FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
  1207. SWITCH_TO(AmbiguousAmpersand);
  1208. }
  1209. }
  1210. END_STATE
  1211. BEGIN_STATE(AmbiguousAmpersand)
  1212. {
  1213. ON_ASCII_ALPHANUMERIC
  1214. {
  1215. if (consumed_as_part_of_an_attribute()) {
  1216. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  1217. continue;
  1218. } else {
  1219. EMIT_CURRENT_CHARACTER;
  1220. }
  1221. }
  1222. ON(';')
  1223. {
  1224. TODO();
  1225. }
  1226. ANYTHING_ELSE
  1227. {
  1228. RECONSUME_IN_RETURN_STATE;
  1229. }
  1230. }
  1231. END_STATE
  1232. BEGIN_STATE(NumericCharacterReference)
  1233. {
  1234. m_character_reference_code = 0;
  1235. ON('X')
  1236. {
  1237. m_temporary_buffer.append(current_input_character.value());
  1238. SWITCH_TO(HexadecimalCharacterReferenceStart);
  1239. }
  1240. ON('x')
  1241. {
  1242. m_temporary_buffer.append(current_input_character.value());
  1243. SWITCH_TO(HexadecimalCharacterReferenceStart);
  1244. }
  1245. ANYTHING_ELSE
  1246. {
  1247. RECONSUME_IN(DecimalCharacterReferenceStart);
  1248. }
  1249. }
  1250. END_STATE
  1251. BEGIN_STATE(HexadecimalCharacterReferenceStart)
  1252. {
  1253. ON_ASCII_HEX_DIGIT
  1254. {
  1255. RECONSUME_IN(HexadecimalCharacterReference);
  1256. }
  1257. ANYTHING_ELSE
  1258. {
  1259. TODO();
  1260. }
  1261. }
  1262. END_STATE
  1263. BEGIN_STATE(DecimalCharacterReferenceStart)
  1264. {
  1265. ON_ASCII_DIGIT
  1266. {
  1267. RECONSUME_IN(DecimalCharacterReference);
  1268. }
  1269. ANYTHING_ELSE
  1270. {
  1271. TODO();
  1272. }
  1273. }
  1274. END_STATE
  1275. BEGIN_STATE(HexadecimalCharacterReference)
  1276. {
  1277. ON_ASCII_DIGIT
  1278. {
  1279. m_character_reference_code *= 16;
  1280. m_character_reference_code += current_input_character.value() - 0x30;
  1281. continue;
  1282. }
  1283. ON_ASCII_UPPER_ALPHA
  1284. {
  1285. m_character_reference_code *= 16;
  1286. m_character_reference_code += current_input_character.value() - 0x37;
  1287. continue;
  1288. }
  1289. ON_ASCII_LOWER_ALPHA
  1290. {
  1291. m_character_reference_code *= 16;
  1292. m_character_reference_code += current_input_character.value() - 0x57;
  1293. continue;
  1294. }
  1295. ON(';')
  1296. {
  1297. SWITCH_TO(NumericCharacterReferenceEnd);
  1298. }
  1299. ANYTHING_ELSE
  1300. {
  1301. TODO();
  1302. }
  1303. }
  1304. END_STATE
  1305. BEGIN_STATE(DecimalCharacterReference)
  1306. {
  1307. ON_ASCII_DIGIT
  1308. {
  1309. m_character_reference_code *= 10;
  1310. m_character_reference_code += current_input_character.value() - 0x30;
  1311. continue;
  1312. }
  1313. ON(';')
  1314. {
  1315. SWITCH_TO(NumericCharacterReferenceEnd);
  1316. }
  1317. ANYTHING_ELSE
  1318. {
  1319. TODO();
  1320. }
  1321. }
  1322. END_STATE
  1323. BEGIN_STATE(NumericCharacterReferenceEnd)
  1324. {
  1325. if (m_character_reference_code == 0) {
  1326. TODO();
  1327. }
  1328. if (m_character_reference_code > 0x10ffff) {
  1329. TODO();
  1330. }
  1331. if (is_surrogate(m_character_reference_code)) {
  1332. TODO();
  1333. }
  1334. if (is_noncharacter(m_character_reference_code)) {
  1335. TODO();
  1336. }
  1337. if (m_character_reference_code == 0xd || (is_control(m_character_reference_code) && !isspace(m_character_reference_code))) {
  1338. TODO();
  1339. }
  1340. if (is_control(m_character_reference_code)) {
  1341. constexpr struct {
  1342. u32 number;
  1343. u32 codepoint;
  1344. } conversion_table[] = {
  1345. { 0x80, 0x20AC },
  1346. { 0x82, 0x201A },
  1347. { 0x83, 0x0192 },
  1348. { 0x84, 0x201E },
  1349. { 0x85, 0x2026 },
  1350. { 0x86, 0x2020 },
  1351. { 0x87, 0x2021 },
  1352. { 0x88, 0x02C6 },
  1353. { 0x89, 0x2030 },
  1354. { 0x8A, 0x0160 },
  1355. { 0x8B, 0x2039 },
  1356. { 0x8C, 0x0152 },
  1357. { 0x8E, 0x017D },
  1358. { 0x91, 0x2018 },
  1359. { 0x92, 0x2019 },
  1360. { 0x93, 0x201C },
  1361. { 0x94, 0x201D },
  1362. { 0x95, 0x2022 },
  1363. { 0x96, 0x2013 },
  1364. { 0x97, 0x2014 },
  1365. { 0x98, 0x02DC },
  1366. { 0x99, 0x2122 },
  1367. { 0x9A, 0x0161 },
  1368. { 0x9B, 0x203A },
  1369. { 0x9C, 0x0153 },
  1370. { 0x9E, 0x017E },
  1371. { 0x9F, 0x0178 },
  1372. };
  1373. for (auto& entry : conversion_table) {
  1374. if (m_character_reference_code == entry.number) {
  1375. m_character_reference_code = entry.codepoint;
  1376. break;
  1377. }
  1378. }
  1379. }
  1380. m_temporary_buffer.clear();
  1381. m_temporary_buffer.append(m_character_reference_code);
  1382. FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
  1383. SWITCH_TO_RETURN_STATE;
  1384. }
  1385. END_STATE
  1386. BEGIN_STATE(RCDATA)
  1387. {
  1388. ON('&')
  1389. {
  1390. m_return_state = State::RCDATA;
  1391. SWITCH_TO(CharacterReference);
  1392. }
  1393. ON('<')
  1394. {
  1395. SWITCH_TO(RCDATALessThanSign);
  1396. }
  1397. ON(0)
  1398. {
  1399. PARSE_ERROR();
  1400. EMIT_CHARACTER("\uFFFD");
  1401. }
  1402. ON_EOF
  1403. {
  1404. EMIT_EOF;
  1405. }
  1406. ANYTHING_ELSE
  1407. {
  1408. EMIT_CURRENT_CHARACTER;
  1409. }
  1410. }
  1411. END_STATE
  1412. BEGIN_STATE(RCDATALessThanSign)
  1413. {
  1414. ON('/')
  1415. {
  1416. m_temporary_buffer.clear();
  1417. SWITCH_TO(RCDATAEndTagOpen);
  1418. }
  1419. ANYTHING_ELSE
  1420. {
  1421. EMIT_CHARACTER('<');
  1422. RECONSUME_IN(RCDATA);
  1423. }
  1424. }
  1425. END_STATE
  1426. BEGIN_STATE(RCDATAEndTagOpen)
  1427. {
  1428. ON_ASCII_ALPHA
  1429. {
  1430. create_new_token(HTMLToken::Type::EndTag);
  1431. RECONSUME_IN(RCDATAEndTagName);
  1432. }
  1433. ANYTHING_ELSE
  1434. {
  1435. // FIXME: Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS character token. Reconsume in the RCDATA state.
  1436. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1437. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1438. RECONSUME_IN(RCDATA);
  1439. }
  1440. }
  1441. END_STATE
  1442. BEGIN_STATE(RCDATAEndTagName)
  1443. {
  1444. ON_WHITESPACE
  1445. {
  1446. if (!current_end_tag_token_is_appropriate()) {
  1447. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1448. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1449. for (auto codepoint : m_temporary_buffer)
  1450. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1451. RECONSUME_IN(RCDATA);
  1452. }
  1453. SWITCH_TO(BeforeAttributeName);
  1454. }
  1455. ON('/')
  1456. {
  1457. if (!current_end_tag_token_is_appropriate()) {
  1458. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1459. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1460. for (auto codepoint : m_temporary_buffer)
  1461. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1462. RECONSUME_IN(RCDATA);
  1463. }
  1464. SWITCH_TO(SelfClosingStartTag);
  1465. }
  1466. ON('>')
  1467. {
  1468. if (!current_end_tag_token_is_appropriate()) {
  1469. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1470. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1471. for (auto codepoint : m_temporary_buffer)
  1472. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1473. RECONSUME_IN(RCDATA);
  1474. }
  1475. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1476. }
  1477. ON_ASCII_UPPER_ALPHA
  1478. {
  1479. m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
  1480. m_temporary_buffer.append(current_input_character.value());
  1481. continue;
  1482. }
  1483. ON_ASCII_LOWER_ALPHA
  1484. {
  1485. m_current_token.m_tag.tag_name.append(current_input_character.value());
  1486. m_temporary_buffer.append(current_input_character.value());
  1487. continue;
  1488. }
  1489. ANYTHING_ELSE
  1490. {
  1491. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1492. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1493. for (auto codepoint : m_temporary_buffer)
  1494. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1495. RECONSUME_IN(RCDATA);
  1496. }
  1497. }
  1498. END_STATE
  1499. BEGIN_STATE(RAWTEXT)
  1500. {
  1501. ON('<')
  1502. {
  1503. SWITCH_TO(RAWTEXTLessThanSign);
  1504. }
  1505. ON(0)
  1506. {
  1507. PARSE_ERROR();
  1508. EMIT_CHARACTER("\uFFFD");
  1509. }
  1510. ON_EOF
  1511. {
  1512. EMIT_EOF;
  1513. }
  1514. ANYTHING_ELSE
  1515. {
  1516. EMIT_CURRENT_CHARACTER;
  1517. }
  1518. }
  1519. END_STATE
  1520. BEGIN_STATE(RAWTEXTLessThanSign)
  1521. {
  1522. ON('/')
  1523. {
  1524. m_temporary_buffer.clear();
  1525. SWITCH_TO(RAWTEXTEndTagOpen);
  1526. }
  1527. ANYTHING_ELSE
  1528. {
  1529. EMIT_CHARACTER('<');
  1530. RECONSUME_IN(RAWTEXT);
  1531. }
  1532. }
  1533. END_STATE
  1534. BEGIN_STATE(RAWTEXTEndTagOpen)
  1535. {
  1536. ON_ASCII_ALPHA
  1537. {
  1538. create_new_token(HTMLToken::Type::EndTag);
  1539. RECONSUME_IN(RAWTEXTEndTagName);
  1540. }
  1541. ANYTHING_ELSE
  1542. {
  1543. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1544. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1545. RECONSUME_IN(RAWTEXT);
  1546. }
  1547. }
  1548. END_STATE
  1549. BEGIN_STATE(RAWTEXTEndTagName)
  1550. {
  1551. ON_WHITESPACE
  1552. {
  1553. if (!current_end_tag_token_is_appropriate()) {
  1554. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1555. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1556. for (auto codepoint : m_temporary_buffer)
  1557. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1558. RECONSUME_IN(RAWTEXT);
  1559. }
  1560. SWITCH_TO(BeforeAttributeName);
  1561. }
  1562. ON('/')
  1563. {
  1564. if (!current_end_tag_token_is_appropriate()) {
  1565. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1566. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1567. for (auto codepoint : m_temporary_buffer)
  1568. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1569. RECONSUME_IN(RAWTEXT);
  1570. }
  1571. SWITCH_TO(SelfClosingStartTag);
  1572. }
  1573. ON('>')
  1574. {
  1575. if (!current_end_tag_token_is_appropriate()) {
  1576. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1577. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1578. for (auto codepoint : m_temporary_buffer)
  1579. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1580. RECONSUME_IN(RAWTEXT);
  1581. }
  1582. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1583. }
  1584. ON_ASCII_UPPER_ALPHA
  1585. {
  1586. m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
  1587. m_temporary_buffer.append(current_input_character.value());
  1588. continue;
  1589. }
  1590. ON_ASCII_LOWER_ALPHA
  1591. {
  1592. m_current_token.m_tag.tag_name.append(current_input_character.value());
  1593. m_temporary_buffer.append(current_input_character.value());
  1594. continue;
  1595. }
  1596. ANYTHING_ELSE
  1597. {
  1598. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1599. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1600. for (auto codepoint : m_temporary_buffer)
  1601. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1602. RECONSUME_IN(RAWTEXT);
  1603. }
  1604. }
  1605. END_STATE
  1606. BEGIN_STATE(ScriptData)
  1607. {
  1608. ON('<')
  1609. {
  1610. SWITCH_TO(ScriptDataLessThanSign);
  1611. }
  1612. ON(0)
  1613. {
  1614. PARSE_ERROR();
  1615. EMIT_CHARACTER("\uFFFD");
  1616. }
  1617. ON_EOF
  1618. {
  1619. EMIT_EOF;
  1620. }
  1621. ANYTHING_ELSE
  1622. {
  1623. EMIT_CURRENT_CHARACTER;
  1624. }
  1625. }
  1626. END_STATE
  1627. BEGIN_STATE(PLAINTEXT)
  1628. {
  1629. ON(0)
  1630. {
  1631. PARSE_ERROR();
  1632. EMIT_CHARACTER("\uFFFD");
  1633. }
  1634. ON_EOF
  1635. {
  1636. EMIT_EOF;
  1637. }
  1638. ANYTHING_ELSE
  1639. {
  1640. EMIT_CURRENT_CHARACTER;
  1641. }
  1642. }
  1643. END_STATE
  1644. BEGIN_STATE(ScriptDataLessThanSign)
  1645. {
  1646. ON('/')
  1647. {
  1648. m_temporary_buffer.clear();
  1649. SWITCH_TO(ScriptDataEndTagOpen);
  1650. }
  1651. ON('!')
  1652. {
  1653. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1654. m_queued_tokens.enqueue(HTMLToken::make_character('!'));
  1655. SWITCH_TO(ScriptDataEscapeStart);
  1656. }
  1657. ANYTHING_ELSE
  1658. {
  1659. EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptData);
  1660. }
  1661. }
  1662. END_STATE
  1663. BEGIN_STATE(ScriptDataEscapeStart)
  1664. {
  1665. ON('-')
  1666. {
  1667. m_queued_tokens.enqueue(HTMLToken::make_character('-'));
  1668. SWITCH_TO(ScriptDataEscapeStartDash);
  1669. }
  1670. ANYTHING_ELSE
  1671. {
  1672. RECONSUME_IN(ScriptData);
  1673. }
  1674. }
  1675. END_STATE
  1676. BEGIN_STATE(ScriptDataEscapeStartDash)
  1677. {
  1678. ON('-')
  1679. {
  1680. m_queued_tokens.enqueue(HTMLToken::make_character('-'));
  1681. SWITCH_TO(ScriptDataEscapedDashDash);
  1682. }
  1683. ANYTHING_ELSE
  1684. {
  1685. RECONSUME_IN(ScriptData);
  1686. }
  1687. }
  1688. END_STATE
  1689. BEGIN_STATE(ScriptDataEscapedDashDash)
  1690. {
  1691. ON('-')
  1692. {
  1693. EMIT_CHARACTER('-');
  1694. }
  1695. ON('<')
  1696. {
  1697. SWITCH_TO(ScriptDataEscapedLessThanSign);
  1698. }
  1699. ON('>')
  1700. {
  1701. m_queued_tokens.enqueue(HTMLToken::make_character('>'));
  1702. SWITCH_TO(ScriptData);
  1703. }
  1704. ON(0)
  1705. {
  1706. TODO();
  1707. }
  1708. ON_EOF
  1709. {
  1710. TODO();
  1711. }
  1712. ANYTHING_ELSE
  1713. {
  1714. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(ScriptDataEscaped);
  1715. }
  1716. }
  1717. END_STATE
  1718. BEGIN_STATE(ScriptDataEscapedLessThanSign)
  1719. {
  1720. ON('/')
  1721. {
  1722. m_temporary_buffer.clear();
  1723. SWITCH_TO(ScriptDataEscapedEndTagOpen);
  1724. }
  1725. ON_ASCII_ALPHA
  1726. {
  1727. m_temporary_buffer.clear();
  1728. EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptDataDoubleEscapeStart);
  1729. }
  1730. ANYTHING_ELSE
  1731. {
  1732. EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptDataEscaped);
  1733. }
  1734. }
  1735. END_STATE
  1736. BEGIN_STATE(ScriptDataEscapedEndTagOpen)
  1737. {
  1738. ON_ASCII_ALPHA
  1739. {
  1740. create_new_token(HTMLToken::Type::EndTag);
  1741. RECONSUME_IN(ScriptDataEscapedEndTagName);
  1742. }
  1743. ANYTHING_ELSE
  1744. {
  1745. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1746. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1747. RECONSUME_IN(ScriptDataEscaped);
  1748. }
  1749. }
  1750. END_STATE
  1751. BEGIN_STATE(ScriptDataEscapedEndTagName)
  1752. {
  1753. ON_WHITESPACE
  1754. {
  1755. if (current_end_tag_token_is_appropriate()) {
  1756. SWITCH_TO(BeforeAttributeName);
  1757. } else {
  1758. TODO();
  1759. }
  1760. }
  1761. ON('/')
  1762. {
  1763. if (current_end_tag_token_is_appropriate()) {
  1764. SWITCH_TO(SelfClosingStartTag);
  1765. } else {
  1766. TODO();
  1767. }
  1768. }
  1769. ON('>')
  1770. {
  1771. if (current_end_tag_token_is_appropriate()) {
  1772. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1773. } else {
  1774. TODO();
  1775. }
  1776. }
  1777. ON_ASCII_UPPER_ALPHA
  1778. {
  1779. m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
  1780. m_temporary_buffer.append(current_input_character.value());
  1781. continue;
  1782. }
  1783. ON_ASCII_LOWER_ALPHA
  1784. {
  1785. m_current_token.m_tag.tag_name.append(current_input_character.value());
  1786. m_temporary_buffer.append(current_input_character.value());
  1787. continue;
  1788. }
  1789. ANYTHING_ELSE
  1790. {
  1791. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1792. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1793. for (auto codepoint : m_temporary_buffer) {
  1794. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1795. }
  1796. RECONSUME_IN(ScriptDataEscaped);
  1797. }
  1798. }
  1799. END_STATE
  1800. BEGIN_STATE(ScriptDataDoubleEscapeStart)
  1801. {
  1802. TODO();
  1803. }
  1804. END_STATE
  1805. BEGIN_STATE(ScriptDataEscapedDash)
  1806. {
  1807. ON('-')
  1808. {
  1809. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(ScriptDataEscapedDashDash);
  1810. }
  1811. ON('<')
  1812. {
  1813. SWITCH_TO(ScriptDataEscapedLessThanSign);
  1814. }
  1815. ON(0)
  1816. {
  1817. TODO();
  1818. }
  1819. ON_EOF
  1820. {
  1821. TODO();
  1822. }
  1823. ANYTHING_ELSE
  1824. {
  1825. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(ScriptDataEscaped);
  1826. }
  1827. }
  1828. END_STATE
  1829. BEGIN_STATE(ScriptDataEscaped)
  1830. {
  1831. ON('-')
  1832. {
  1833. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(ScriptDataEscapedDash);
  1834. }
  1835. ON('<')
  1836. {
  1837. SWITCH_TO(ScriptDataEscapedLessThanSign);
  1838. }
  1839. ON(0)
  1840. {
  1841. TODO();
  1842. }
  1843. ON_EOF
  1844. {
  1845. TODO();
  1846. }
  1847. ANYTHING_ELSE
  1848. {
  1849. EMIT_CURRENT_CHARACTER;
  1850. }
  1851. }
  1852. END_STATE
  1853. BEGIN_STATE(ScriptDataEndTagOpen)
  1854. {
  1855. ON_ASCII_ALPHA
  1856. {
  1857. create_new_token(HTMLToken::Type::EndTag);
  1858. RECONSUME_IN(ScriptDataEndTagName);
  1859. }
  1860. ANYTHING_ELSE
  1861. {
  1862. TODO();
  1863. }
  1864. }
  1865. END_STATE
  1866. BEGIN_STATE(ScriptDataEndTagName)
  1867. {
  1868. ON_WHITESPACE
  1869. {
  1870. if (current_end_tag_token_is_appropriate())
  1871. SWITCH_TO(BeforeAttributeName);
  1872. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1873. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1874. for (auto codepoint : m_temporary_buffer)
  1875. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1876. RECONSUME_IN(ScriptData);
  1877. }
  1878. ON('/')
  1879. {
  1880. if (current_end_tag_token_is_appropriate())
  1881. SWITCH_TO(SelfClosingStartTag);
  1882. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1883. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1884. for (auto codepoint : m_temporary_buffer)
  1885. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1886. RECONSUME_IN(ScriptData);
  1887. }
  1888. ON('>')
  1889. {
  1890. if (current_end_tag_token_is_appropriate())
  1891. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1892. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1893. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1894. for (auto codepoint : m_temporary_buffer)
  1895. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1896. RECONSUME_IN(ScriptData);
  1897. }
  1898. ON_ASCII_UPPER_ALPHA
  1899. {
  1900. m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
  1901. m_temporary_buffer.append(current_input_character.value());
  1902. continue;
  1903. }
  1904. ON_ASCII_LOWER_ALPHA
  1905. {
  1906. m_current_token.m_tag.tag_name.append(current_input_character.value());
  1907. m_temporary_buffer.append(current_input_character.value());
  1908. continue;
  1909. }
  1910. ANYTHING_ELSE
  1911. {
  1912. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1913. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1914. for (auto codepoint : m_temporary_buffer)
  1915. m_queued_tokens.enqueue(HTMLToken::make_character(codepoint));
  1916. RECONSUME_IN(ScriptData);
  1917. }
  1918. }
  1919. END_STATE
  1920. default:
  1921. TODO();
  1922. }
  1923. }
  1924. }
  1925. bool HTMLTokenizer::consume_next_if_match(const StringView& string, CaseSensitivity case_sensitivity)
  1926. {
  1927. for (size_t i = 0; i < string.length(); ++i) {
  1928. auto codepoint = peek_codepoint(i);
  1929. if (!codepoint.has_value())
  1930. return false;
  1931. // FIXME: This should be more Unicode-aware.
  1932. if (case_sensitivity == CaseSensitivity::CaseInsensitive) {
  1933. if (codepoint.value() < 0x80) {
  1934. if (tolower(codepoint.value()) != tolower(string[i]))
  1935. return false;
  1936. continue;
  1937. }
  1938. }
  1939. if (codepoint.value() != (u32)string[i])
  1940. return false;
  1941. }
  1942. m_cursor += string.length();
  1943. return true;
  1944. }
  1945. void HTMLTokenizer::create_new_token(HTMLToken::Type type)
  1946. {
  1947. m_current_token = {};
  1948. m_current_token.m_type = type;
  1949. }
  1950. HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding)
  1951. {
  1952. auto* decoder = TextCodec::decoder_for(encoding);
  1953. ASSERT(decoder);
  1954. m_decoded_input = decoder->to_utf8(input);
  1955. m_input = m_decoded_input;
  1956. }
  1957. void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
  1958. {
  1959. #ifdef TOKENIZER_TRACE
  1960. dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
  1961. #endif
  1962. }
  1963. void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
  1964. {
  1965. #ifdef TOKENIZER_TRACE
  1966. dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
  1967. #endif
  1968. }
  1969. void HTMLTokenizer::switch_to(Badge<HTMLDocumentParser>, State new_state)
  1970. {
  1971. #ifdef TOKENIZER_TRACE
  1972. dbg() << "[" << state_name(m_state) << "] Parser switches tokenizer state to " << state_name(new_state);
  1973. #endif
  1974. m_state = new_state;
  1975. }
  1976. void HTMLTokenizer::will_emit(HTMLToken& token)
  1977. {
  1978. if (token.is_start_tag())
  1979. m_last_emitted_start_tag = token;
  1980. }
  1981. bool HTMLTokenizer::current_end_tag_token_is_appropriate() const
  1982. {
  1983. ASSERT(m_current_token.is_end_tag());
  1984. if (!m_last_emitted_start_tag.is_start_tag())
  1985. return false;
  1986. return m_current_token.tag_name() == m_last_emitted_start_tag.tag_name();
  1987. }
  1988. bool HTMLTokenizer::consumed_as_part_of_an_attribute() const
  1989. {
  1990. return m_return_state == State::AttributeValueUnquoted || m_return_state == State::AttributeValueSingleQuoted || m_return_state == State::AttributeValueDoubleQuoted;
  1991. }
  1992. }