HTMLTokenizer.cpp 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <LibWeb/Parser/HTMLToken.h>
  27. #include <LibWeb/Parser/HTMLTokenizer.h>
  28. #include <ctype.h>
  29. #pragma GCC diagnostic ignored "-Wunused-label"
  30. //#define TOKENIZER_TRACE
  // Pulls the next codepoint from the input via next_codepoint() and stores it
  // in `current_input_character`. The trailing semicolon is part of the macro.
  #define CONSUME_NEXT_INPUT_CHARACTER current_input_character = next_codepoint();
  // Marks an unimplemented tokenizer branch: logs the current state together
  // with the offending input, then aborts via ASSERT_NOT_REACHED().
  //
  // The end-of-input case is logged separately because `current_input_character`
  // is empty there (TODO() is used inside ON_EOF branches) and value() must not
  // be called on an empty Optional.
  #define TODO() \
      do { \
          if (current_input_character.has_value()) \
              dbg() << "[TODO: " << state_name(m_state) << "] '" << (char)current_input_character.value() << "'"; \
          else \
              dbg() << "[TODO: " << state_name(m_state) << "] EOF"; \
          ASSERT_NOT_REACHED(); \
      } while (0)
  // Switches the tokenizer to `new_state`: notifies will_switch_to(), records
  // the state in `m_state`, consumes the next input codepoint and jumps straight
  // to the state's label (emitted by BEGIN_STATE).
  #define SWITCH_TO(new_state) \
      do { \
          will_switch_to(State::new_state); \
          m_state = State::new_state; \
          current_input_character = next_codepoint(); \
          goto new_state; \
      } while (0)
  // Re-processes the current input codepoint in `new_state`: notifies
  // will_reconsume_in(), records the state in `m_state` and jumps to the state's
  // label without reading any further input.
  #define RECONSUME_IN(new_state) \
      do { \
          will_reconsume_in(State::new_state); \
          m_state = State::new_state; \
          goto new_state; \
      } while (0)
  // Switches back to the state remembered in `m_return_state`. The target is
  // only known at run time, so instead of jumping to a state label this
  // restarts at `_StartOfFunction`.
  #define SWITCH_TO_RETURN_STATE \
      do { \
          will_switch_to(m_return_state); \
          m_state = m_return_state; \
          goto _StartOfFunction; \
      } while (0)
  // Re-processes the current input codepoint in the state remembered in
  // `m_return_state`.
  //
  // The target is only known at run time, so this restarts at `_StartOfFunction`,
  // and the tokenizer loop there reads a fresh codepoint with next_codepoint().
  // To make that read yield the current codepoint again, the cursor is stepped
  // back first. No rewind happens at end of input, because next_codepoint() does
  // not advance `m_cursor` when it returns an empty value.
  // NOTE(review): assumes will_reconsume_in() does not itself move the cursor.
  #define RECONSUME_IN_RETURN_STATE \
      do { \
          will_reconsume_in(m_return_state); \
          m_state = m_return_state; \
          if (current_input_character.has_value()) \
              --m_cursor; \
          goto _StartOfFunction; \
      } while (0)
  // Records `new_state` as the current state (after notifying will_switch_to())
  // and then emits `m_current_token`: the token is announced through will_emit(),
  // queued, and the front of the queue is returned to the caller. No input is
  // consumed here.
  #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
      do { \
          will_switch_to(State::new_state); \
          m_state = State::new_state; \
          EMIT_CURRENT_TOKEN; \
      } while (0)
  // Queues a Character token holding `codepoint`, then reconsumes the current
  // input codepoint in `new_state`.
  //
  // The token is built the same way EMIT_CHARACTER builds it, so
  // `m_current_token` is replaced by the new character token. Unlike
  // EMIT_CHARACTER this does not return: the queued token is handed out later,
  // once the tokenizer returns from the queue.
  #define EMIT_CHARACTER_AND_RECONSUME_IN(codepoint, new_state) \
      do { \
          create_new_token(HTMLToken::Type::Character); \
          m_current_token.m_comment_or_character.data.append(codepoint); \
          m_queued_tokens.enqueue(m_current_token); \
          will_reconsume_in(State::new_state); \
          m_state = State::new_state; \
          goto new_state; \
      } while (0)
  // Flushes every codepoint collected in `m_temporary_buffer`. For each one, if
  // consumed_as_part_of_an_attribute() reports true the codepoint is appended to
  // the value of the last attribute on the current tag token; otherwise a fresh
  // Character token carrying that codepoint is created and queued.
  #define FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE \
      do { \
          for (auto buffered_codepoint : m_temporary_buffer) { \
              if (!consumed_as_part_of_an_attribute()) { \
                  create_new_token(HTMLToken::Type::Character); \
                  m_current_token.m_comment_or_character.data.append(buffered_codepoint); \
                  m_queued_tokens.enqueue(m_current_token); \
                  continue; \
              } \
              m_current_token.m_tag.attributes.last().value_builder.append(buffered_codepoint); \
          } \
      } while (0)
  // Un-reads the most recently consumed codepoint by stepping `m_cursor` back.
  // Nothing is rewound when `current_input_character` is empty: next_codepoint()
  // does not advance the cursor when it reports end of input, so there is
  // nothing to give back. The trailing semicolon is kept as part of the macro,
  // as in the original definition.
  #define DONT_CONSUME_NEXT_INPUT_CHARACTER \
      do { \
          if (current_input_character.has_value()) \
              --m_cursor; \
      } while (0);
  // Guards the following block: taken when there is a current input codepoint
  // and it equals `codepoint`.
  #define ON(codepoint) \
      if (current_input_character.has_value() && codepoint == current_input_character.value())
  // Guards the following block: taken when the input is exhausted, i.e.
  // `current_input_character` holds no value.
  #define ON_EOF if (!current_input_character.has_value())
  // Guards the following block: taken when the current input codepoint is an
  // ASCII letter (A-Z or a-z).
  //
  // The ranges are compared explicitly instead of calling isalpha(): the
  // codepoint is a 32-bit value, and isalpha() is only defined for values
  // representable as unsigned char (or EOF) and also depends on the locale.
  #define ON_ASCII_ALPHA \
      if (current_input_character.has_value() \
          && ((current_input_character.value() >= 'A' && current_input_character.value() <= 'Z') \
              || (current_input_character.value() >= 'a' && current_input_character.value() <= 'z')))
  // Guards the following block: taken when the current input codepoint is an
  // ASCII letter (A-Z, a-z) or an ASCII decimal digit (0-9).
  //
  // The ranges are compared explicitly instead of calling isalnum(): the
  // codepoint is a 32-bit value, and isalnum() is only defined for values
  // representable as unsigned char (or EOF) and also depends on the locale.
  #define ON_ASCII_ALPHANUMERIC \
      if (current_input_character.has_value() \
          && ((current_input_character.value() >= '0' && current_input_character.value() <= '9') \
              || (current_input_character.value() >= 'A' && current_input_character.value() <= 'Z') \
              || (current_input_character.value() >= 'a' && current_input_character.value() <= 'z')))
  // Guards the following block: taken when the current input codepoint lies in
  // the range 'A'..'Z'.
  #define ON_ASCII_UPPER_ALPHA \
      if (current_input_character.has_value() \
          && 'A' <= current_input_character.value() \
          && current_input_character.value() <= 'Z')
  // Guards the following block: taken when the current input codepoint lies in
  // the range 'a'..'z'.
  #define ON_ASCII_LOWER_ALPHA \
      if (current_input_character.has_value() \
          && 'a' <= current_input_character.value() \
          && current_input_character.value() <= 'z')
  // Guards the following block: taken when the current input codepoint is an
  // ASCII decimal digit ('0'..'9').
  //
  // Hexadecimal letters must not match here: the character reference states
  // subtract 0x30 from whatever this guard accepts, which is only correct for
  // '0'..'9'.
  #define ON_ASCII_DIGIT \
      if (current_input_character.has_value() \
          && current_input_character.value() >= '0' \
          && current_input_character.value() <= '9')
  // Guards the following block: taken when the current input codepoint is an
  // ASCII hexadecimal digit (0-9, A-F or a-f).
  //
  // The ranges are compared explicitly instead of calling isxdigit(): the
  // codepoint is a 32-bit value, and isxdigit() is only defined for values
  // representable as unsigned char (or EOF).
  #define ON_ASCII_HEX_DIGIT \
      if (current_input_character.has_value() \
          && ((current_input_character.value() >= '0' && current_input_character.value() <= '9') \
              || (current_input_character.value() >= 'A' && current_input_character.value() <= 'F') \
              || (current_input_character.value() >= 'a' && current_input_character.value() <= 'f')))
  // Guards the following block: taken when the current input codepoint is a
  // tab, line feed, form feed or space.
  #define ON_WHITESPACE \
      if (current_input_character.has_value() \
          && ('\t' == current_input_character.value() \
              || '\n' == current_input_character.value() \
              || '\f' == current_input_character.value() \
              || ' ' == current_input_character.value()))
  // Unconditional guard used as the last branch of a state: the following block
  // always runs when control reaches it.
  #define ANYTHING_ELSE if (true)
  // Emits the end-of-file token exactly once. The first use sets
  // `m_has_emitted_eof`, creates an EndOfFile token and emits it through
  // EMIT_CURRENT_TOKEN; every later use returns an empty value instead.
  #define EMIT_EOF \
      do { \
          if (!m_has_emitted_eof) { \
              m_has_emitted_eof = true; \
              create_new_token(HTMLToken::Type::EndOfFile); \
              EMIT_CURRENT_TOKEN; \
          } \
          return {}; \
      } while (0)
  // Emits `m_current_token`: announces it through will_emit(), appends it to
  // `m_queued_tokens` and returns the token at the front of that queue from the
  // enclosing function.
  #define EMIT_CURRENT_TOKEN \
      do { \
          will_emit(m_current_token); \
          m_queued_tokens.enqueue(m_current_token); \
          return m_queued_tokens.dequeue(); \
      } while (0)
  // Emits a Character token for `codepoint`: replaces `m_current_token` with a
  // new Character token, appends the codepoint to its data, queues it and
  // returns the token at the front of the queue from the enclosing function.
  #define EMIT_CHARACTER(codepoint) \
      do { \
          create_new_token(HTMLToken::Type::Character); \
          m_current_token.m_comment_or_character.data.append(codepoint); \
          m_queued_tokens.enqueue(m_current_token); \
          return m_queued_tokens.dequeue(); \
      } while (0)
  // Emits the current input codepoint as a Character token via EMIT_CHARACTER.
  // Only valid while `current_input_character` holds a value. The trailing
  // semicolon is part of the macro.
  #define EMIT_CURRENT_CHARACTER EMIT_CHARACTER(current_input_character.value());
  // BEGIN_STATE(state) opens the handler for one tokenizer state. It emits both
  // a goto label named `state` (the jump target for SWITCH_TO / RECONSUME_IN)
  // and the matching `case State::state:` of the dispatching switch, then opens
  // the handler's scope.
  #define BEGIN_STATE(state) \
      state: \
      case State::state: {

  // END_STATE closes the scope opened by BEGIN_STATE. A handler is expected to
  // leave through one of the transition/emit macros or `continue`; falling off
  // its end trips ASSERT_NOT_REACHED().
  #define END_STATE \
      ASSERT_NOT_REACHED(); \
      break; \
      }
  // Returns true when `codepoint` lies in the UTF-16 surrogate range
  // U+D800..U+DFFF.
  static inline bool is_surrogate(u32 codepoint)
  {
      return codepoint >= 0xd800 && codepoint <= 0xdfff;
  }
  // Returns true when `codepoint` is a Unicode noncharacter: either in the block
  // U+FDD0..U+FDEF, or a codepoint up to U+10FFFF whose low 16 bits are 0xFFFE
  // or 0xFFFF.
  static inline bool is_noncharacter(u32 codepoint)
  {
      if (codepoint < 0xfdd0 || codepoint > 0x10ffff)
          return false;
      if (codepoint <= 0xfdef)
          return true;
      return (codepoint & 0xfffe) == 0xfffe;
  }
  // Returns true when `codepoint` is in the range U+0000..U+001F.
  static inline bool is_c0_control(u32 codepoint)
  {
      return codepoint < 0x20;
  }
  158. static inline bool is_control(u32 codepoint)
  159. {
  160. return is_c0_control(codepoint) || (codepoint >= 0x7f && codepoint <= 0x9f);
  161. }
  162. namespace Web {
  163. Optional<u32> HTMLTokenizer::next_codepoint()
  164. {
  165. if (m_cursor >= m_input.length())
  166. return {};
  167. return m_input[m_cursor++];
  168. }
  169. Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
  170. {
  171. if ((m_cursor + offset) >= m_input.length())
  172. return {};
  173. return m_input[m_cursor + offset];
  174. }
  175. Optional<HTMLToken> HTMLTokenizer::next_token()
  176. {
  177. _StartOfFunction:
  178. if (!m_queued_tokens.is_empty())
  179. return m_queued_tokens.dequeue();
  180. for (;;) {
  181. auto current_input_character = next_codepoint();
  182. switch (m_state) {
  183. BEGIN_STATE(Data)
  184. {
  185. ON('&')
  186. {
  187. m_return_state = State::Data;
  188. SWITCH_TO(CharacterReference);
  189. }
  190. ON('<')
  191. {
  192. SWITCH_TO(TagOpen);
  193. }
  194. ON_EOF
  195. {
  196. EMIT_EOF;
  197. }
  198. ANYTHING_ELSE
  199. {
  200. EMIT_CURRENT_CHARACTER;
  201. }
  202. }
  203. END_STATE
  204. BEGIN_STATE(TagOpen)
  205. {
  206. ON('!')
  207. {
  208. SWITCH_TO(MarkupDeclarationOpen);
  209. }
  210. ON('/')
  211. {
  212. SWITCH_TO(EndTagOpen);
  213. }
  214. ON_ASCII_ALPHA
  215. {
  216. create_new_token(HTMLToken::Type::StartTag);
  217. RECONSUME_IN(TagName);
  218. }
  219. ON('?')
  220. {
  221. TODO();
  222. }
  223. ANYTHING_ELSE
  224. {
  225. TODO();
  226. }
  227. }
  228. END_STATE
  229. BEGIN_STATE(TagName)
  230. {
  231. ON_WHITESPACE
  232. {
  233. SWITCH_TO(BeforeAttributeName);
  234. }
  235. ON('/')
  236. {
  237. SWITCH_TO(SelfClosingStartTag);
  238. }
  239. ON('>')
  240. {
  241. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  242. }
  243. ANYTHING_ELSE
  244. {
  245. m_current_token.m_tag.tag_name.append(current_input_character.value());
  246. continue;
  247. }
  248. }
  249. END_STATE
  250. BEGIN_STATE(EndTagOpen)
  251. {
  252. ON_ASCII_ALPHA
  253. {
  254. create_new_token(HTMLToken::Type::EndTag);
  255. RECONSUME_IN(TagName);
  256. }
  257. }
  258. END_STATE
  259. BEGIN_STATE(MarkupDeclarationOpen)
  260. {
  261. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  262. if (consume_next_if_match("--")) {
  263. create_new_token(HTMLToken::Type::Comment);
  264. SWITCH_TO(CommentStart);
  265. }
  266. if (consume_next_if_match("DOCTYPE", CaseSensitivity::CaseInsensitive)) {
  267. SWITCH_TO(DOCTYPE);
  268. }
  269. }
  270. END_STATE
  271. BEGIN_STATE(DOCTYPE)
  272. {
  273. ON_WHITESPACE
  274. {
  275. SWITCH_TO(BeforeDOCTYPEName);
  276. }
  277. ON('>')
  278. {
  279. RECONSUME_IN(BeforeDOCTYPEName);
  280. }
  281. ON_EOF
  282. {
  283. TODO();
  284. }
  285. ANYTHING_ELSE
  286. {
  287. TODO();
  288. }
  289. }
  290. END_STATE
  291. BEGIN_STATE(BeforeDOCTYPEName)
  292. {
  293. ON_WHITESPACE
  294. {
  295. continue;
  296. }
  297. ON_ASCII_UPPER_ALPHA
  298. {
  299. create_new_token(HTMLToken::Type::DOCTYPE);
  300. m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
  301. SWITCH_TO(DOCTYPEName);
  302. }
  303. ON(0)
  304. {
  305. TODO();
  306. }
  307. ON('>')
  308. {
  309. TODO();
  310. }
  311. ON_EOF
  312. {
  313. TODO();
  314. }
  315. ANYTHING_ELSE
  316. {
  317. create_new_token(HTMLToken::Type::DOCTYPE);
  318. m_current_token.m_doctype.name.append(current_input_character.value());
  319. SWITCH_TO(DOCTYPEName);
  320. }
  321. }
  322. END_STATE
  323. BEGIN_STATE(DOCTYPEName)
  324. {
  325. ON_WHITESPACE
  326. {
  327. SWITCH_TO(AfterDOCTYPEName);
  328. }
  329. ON('>')
  330. {
  331. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  332. }
  333. ON_ASCII_UPPER_ALPHA
  334. {
  335. m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
  336. }
  337. ON(0)
  338. {
  339. TODO();
  340. }
  341. ON_EOF
  342. {
  343. TODO();
  344. }
  345. ANYTHING_ELSE
  346. {
  347. m_current_token.m_doctype.name.append(current_input_character.value());
  348. continue;
  349. }
  350. }
  351. END_STATE
  352. BEGIN_STATE(AfterDOCTYPEName)
  353. {
  354. ON_WHITESPACE
  355. {
  356. continue;
  357. }
  358. ON('>')
  359. {
  360. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  361. }
  362. ON_EOF
  363. {
  364. TODO();
  365. }
  366. ANYTHING_ELSE
  367. {
  368. if (toupper(current_input_character.value()) == 'P' && consume_next_if_match("UBLIC", CaseSensitivity::CaseInsensitive)) {
  369. SWITCH_TO(AfterDOCTYPEPublicKeyword);
  370. }
  371. if (toupper(current_input_character.value()) == 'S' && consume_next_if_match("YSTEM", CaseSensitivity::CaseInsensitive)) {
  372. SWITCH_TO(AfterDOCTYPESystemKeyword);
  373. }
  374. TODO();
  375. }
  376. }
  377. END_STATE
  378. BEGIN_STATE(AfterDOCTYPEPublicKeyword)
  379. {
  380. ON_WHITESPACE
  381. {
  382. SWITCH_TO(BeforeDOCTYPEPublicIdentifier);
  383. }
  384. ON('"')
  385. {
  386. TODO();
  387. }
  388. ON('\'')
  389. {
  390. TODO();
  391. }
  392. ON('>')
  393. {
  394. TODO();
  395. }
  396. ON_EOF
  397. {
  398. TODO();
  399. }
  400. ANYTHING_ELSE
  401. {
  402. TODO();
  403. }
  404. }
  405. END_STATE
  406. BEGIN_STATE(AfterDOCTYPESystemKeyword)
  407. {
  408. ON_WHITESPACE
  409. {
  410. SWITCH_TO(BeforeDOCTYPESystemIdentifier);
  411. }
  412. ON('"')
  413. {
  414. TODO();
  415. }
  416. ON('\'')
  417. {
  418. TODO();
  419. }
  420. ON('>')
  421. {
  422. TODO();
  423. }
  424. ON_EOF
  425. {
  426. TODO();
  427. }
  428. ANYTHING_ELSE
  429. {
  430. TODO();
  431. }
  432. }
  433. END_STATE
  434. BEGIN_STATE(BeforeDOCTYPEPublicIdentifier)
  435. {
  436. ON_WHITESPACE
  437. {
  438. continue;
  439. }
  440. ON('"')
  441. {
  442. m_current_token.m_doctype.public_identifier.clear();
  443. SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted);
  444. }
  445. ON('\'')
  446. {
  447. m_current_token.m_doctype.public_identifier.clear();
  448. SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted);
  449. }
  450. ON('>')
  451. {
  452. TODO();
  453. }
  454. ON_EOF
  455. {
  456. TODO();
  457. }
  458. ANYTHING_ELSE
  459. {
  460. TODO();
  461. }
  462. }
  463. END_STATE
  464. BEGIN_STATE(BeforeDOCTYPESystemIdentifier)
  465. {
  466. ON_WHITESPACE
  467. {
  468. continue;
  469. }
  470. ON('"')
  471. {
  472. m_current_token.m_doctype.system_identifier.clear();
  473. SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
  474. }
  475. ON('\'')
  476. {
  477. m_current_token.m_doctype.system_identifier.clear();
  478. SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
  479. }
  480. ON('>')
  481. {
  482. TODO();
  483. }
  484. ON_EOF
  485. {
  486. TODO();
  487. }
  488. ANYTHING_ELSE
  489. {
  490. TODO();
  491. }
  492. }
  493. END_STATE
  494. BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuoted)
  495. {
  496. ON('"')
  497. {
  498. SWITCH_TO(AfterDOCTYPEPublicIdentifier);
  499. }
  500. ON(0)
  501. {
  502. TODO();
  503. }
  504. ON('>')
  505. {
  506. TODO();
  507. }
  508. ON_EOF
  509. {
  510. TODO();
  511. }
  512. ANYTHING_ELSE
  513. {
  514. m_current_token.m_doctype.public_identifier.append(current_input_character.value());
  515. continue;
  516. }
  517. }
  518. END_STATE
  519. BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuoted)
  520. {
  521. ON('\'')
  522. {
  523. SWITCH_TO(AfterDOCTYPEPublicIdentifier);
  524. }
  525. ON(0)
  526. {
  527. TODO();
  528. }
  529. ON('>')
  530. {
  531. TODO();
  532. }
  533. ON_EOF
  534. {
  535. TODO();
  536. }
  537. ANYTHING_ELSE
  538. {
  539. m_current_token.m_doctype.public_identifier.append(current_input_character.value());
  540. continue;
  541. }
  542. }
  543. END_STATE
  544. BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuoted)
  545. {
  546. ON('"')
  547. {
  548. SWITCH_TO(AfterDOCTYPESystemIdentifier);
  549. }
  550. ON(0)
  551. {
  552. TODO();
  553. }
  554. ON('>')
  555. {
  556. TODO();
  557. }
  558. ON_EOF
  559. {
  560. TODO();
  561. }
  562. ANYTHING_ELSE
  563. {
  564. m_current_token.m_doctype.system_identifier.append(current_input_character.value());
  565. continue;
  566. }
  567. }
  568. END_STATE
  569. BEGIN_STATE(DOCTYPESystemIdentifierSingleQuoted)
  570. {
  571. ON('\'')
  572. {
  573. SWITCH_TO(AfterDOCTYPESystemIdentifier);
  574. }
  575. ON(0)
  576. {
  577. TODO();
  578. }
  579. ON('>')
  580. {
  581. TODO();
  582. }
  583. ON_EOF
  584. {
  585. TODO();
  586. }
  587. ANYTHING_ELSE
  588. {
  589. m_current_token.m_doctype.system_identifier.append(current_input_character.value());
  590. continue;
  591. }
  592. }
  593. END_STATE
  594. BEGIN_STATE(AfterDOCTYPEPublicIdentifier)
  595. {
  596. ON_WHITESPACE
  597. {
  598. SWITCH_TO(BetweenDOCTYPEPublicAndSystemIdentifiers);
  599. }
  600. ON('>')
  601. {
  602. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  603. }
  604. ON('"')
  605. {
  606. TODO();
  607. }
  608. ON('\'')
  609. {
  610. TODO();
  611. }
  612. ON_EOF
  613. {
  614. TODO();
  615. }
  616. ANYTHING_ELSE
  617. {
  618. TODO();
  619. }
  620. }
  621. END_STATE
  622. BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers)
  623. {
  624. ON_WHITESPACE
  625. {
  626. continue;
  627. }
  628. ON('>')
  629. {
  630. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  631. }
  632. ON('"')
  633. {
  634. m_current_token.m_doctype.system_identifier.clear();
  635. SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
  636. }
  637. ON('\'')
  638. {
  639. m_current_token.m_doctype.system_identifier.clear();
  640. SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
  641. }
  642. ON_EOF
  643. {
  644. TODO();
  645. }
  646. ANYTHING_ELSE
  647. {
  648. TODO();
  649. }
  650. }
  651. END_STATE
  652. BEGIN_STATE(AfterDOCTYPESystemIdentifier)
  653. {
  654. ON_WHITESPACE
  655. {
  656. continue;
  657. }
  658. ON('>')
  659. {
  660. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  661. }
  662. ON_EOF
  663. {
  664. TODO();
  665. }
  666. ANYTHING_ELSE
  667. {
  668. TODO();
  669. }
  670. }
  671. END_STATE
  672. BEGIN_STATE(BeforeAttributeName)
  673. {
  674. ON_WHITESPACE
  675. {
  676. continue;
  677. }
  678. ON('/')
  679. {
  680. RECONSUME_IN(AfterAttributeName);
  681. }
  682. ON('>')
  683. {
  684. RECONSUME_IN(AfterAttributeName);
  685. }
  686. ON_EOF
  687. {
  688. RECONSUME_IN(AfterAttributeName);
  689. }
  690. ON('=')
  691. {
  692. TODO();
  693. }
  694. ANYTHING_ELSE
  695. {
  696. m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
  697. RECONSUME_IN(AttributeName);
  698. }
  699. }
  700. END_STATE
  701. BEGIN_STATE(SelfClosingStartTag)
  702. {
  703. }
  704. END_STATE
  705. BEGIN_STATE(AttributeName)
  706. {
  707. ON_WHITESPACE
  708. {
  709. RECONSUME_IN(AfterAttributeName);
  710. }
  711. ON('/')
  712. {
  713. RECONSUME_IN(AfterAttributeName);
  714. }
  715. ON('>')
  716. {
  717. RECONSUME_IN(AfterAttributeName);
  718. }
  719. ON_EOF
  720. {
  721. RECONSUME_IN(AfterAttributeName);
  722. }
  723. ON('=')
  724. {
  725. SWITCH_TO(BeforeAttributeValue);
  726. }
  727. ANYTHING_ELSE
  728. {
  729. m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value());
  730. continue;
  731. }
  732. }
  733. END_STATE
  734. BEGIN_STATE(AfterAttributeName)
  735. {
  736. }
  737. END_STATE
  738. BEGIN_STATE(BeforeAttributeValue)
  739. {
  740. ON_WHITESPACE
  741. {
  742. continue;
  743. }
  744. ON('"')
  745. {
  746. SWITCH_TO(AttributeValueDoubleQuoted);
  747. }
  748. ON('\'')
  749. {
  750. SWITCH_TO(AttributeValueSingleQuoted);
  751. }
  752. ON('>')
  753. {
  754. TODO();
  755. }
  756. ANYTHING_ELSE
  757. {
  758. RECONSUME_IN(AttributeValueUnquoted);
  759. }
  760. }
  761. END_STATE
  762. BEGIN_STATE(AttributeValueDoubleQuoted)
  763. {
  764. ON('"')
  765. {
  766. SWITCH_TO(AfterAttributeValueQuoted);
  767. }
  768. ON('&')
  769. {
  770. m_return_state = State::AttributeValueDoubleQuoted;
  771. SWITCH_TO(CharacterReference);
  772. }
  773. ON(0)
  774. {
  775. TODO();
  776. }
  777. ON_EOF
  778. {
  779. TODO();
  780. }
  781. ANYTHING_ELSE
  782. {
  783. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  784. continue;
  785. }
  786. }
  787. END_STATE
  788. BEGIN_STATE(AttributeValueSingleQuoted)
  789. {
  790. ON('\'')
  791. {
  792. SWITCH_TO(AfterAttributeValueQuoted);
  793. }
  794. ON('&')
  795. {
  796. m_return_state = State::AttributeValueSingleQuoted;
  797. SWITCH_TO(CharacterReference);
  798. }
  799. ON(0)
  800. {
  801. TODO();
  802. }
  803. ON_EOF
  804. {
  805. TODO();
  806. }
  807. ANYTHING_ELSE
  808. {
  809. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  810. continue;
  811. }
  812. }
  813. END_STATE
  814. BEGIN_STATE(AttributeValueUnquoted)
  815. {
  816. ON_WHITESPACE
  817. {
  818. SWITCH_TO(BeforeAttributeName);
  819. }
  820. ON('&')
  821. {
  822. m_return_state = State::AttributeValueUnquoted;
  823. SWITCH_TO(CharacterReference);
  824. }
  825. ON('>')
  826. {
  827. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  828. }
  829. ON(0)
  830. {
  831. TODO();
  832. }
  833. ON_EOF
  834. {
  835. TODO();
  836. }
  837. ANYTHING_ELSE
  838. {
  839. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  840. continue;
  841. }
  842. }
  843. END_STATE
  844. BEGIN_STATE(AfterAttributeValueQuoted)
  845. {
  846. ON_WHITESPACE
  847. {
  848. SWITCH_TO(BeforeAttributeName);
  849. }
  850. ON('/')
  851. {
  852. SWITCH_TO(SelfClosingStartTag);
  853. }
  854. ON('>')
  855. {
  856. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  857. }
  858. ON_EOF
  859. {
  860. TODO();
  861. }
  862. ANYTHING_ELSE
  863. {
  864. TODO();
  865. }
  866. }
  867. END_STATE
  868. BEGIN_STATE(CommentStart)
  869. {
  870. ON('-')
  871. {
  872. SWITCH_TO(CommentStartDash);
  873. }
  874. ON('>')
  875. {
  876. TODO();
  877. }
  878. ANYTHING_ELSE
  879. {
  880. RECONSUME_IN(Comment);
  881. }
  882. }
  883. END_STATE
  884. BEGIN_STATE(CommentStartDash)
  885. {
  886. ON('-')
  887. {
  888. SWITCH_TO(CommentEnd);
  889. }
  890. ON('>')
  891. {
  892. TODO();
  893. }
  894. ON_EOF
  895. {
  896. TODO();
  897. }
  898. ANYTHING_ELSE
  899. {
  900. m_current_token.m_comment_or_character.data.append('-');
  901. RECONSUME_IN(Comment);
  902. }
  903. }
  904. END_STATE
  905. BEGIN_STATE(Comment)
  906. {
  907. ON('<')
  908. {
  909. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  910. SWITCH_TO(CommentLessThanSign);
  911. }
  912. ON('-')
  913. {
  914. SWITCH_TO(CommentEndDash);
  915. }
  916. ON(0)
  917. {
  918. TODO();
  919. }
  920. ON_EOF
  921. {
  922. TODO();
  923. }
  924. ANYTHING_ELSE
  925. {
  926. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  927. continue;
  928. }
  929. }
  930. END_STATE
  931. BEGIN_STATE(CommentEnd)
  932. {
  933. ON('>')
  934. {
  935. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  936. }
  937. ON('!')
  938. {
  939. SWITCH_TO(CommentEndBang);
  940. }
  941. ON('-')
  942. {
  943. m_current_token.m_comment_or_character.data.append('-');
  944. continue;
  945. }
  946. ON_EOF
  947. {
  948. TODO();
  949. }
  950. ANYTHING_ELSE
  951. {
  952. m_current_token.m_comment_or_character.data.append('-');
  953. RECONSUME_IN(Comment);
  954. }
  955. }
  956. END_STATE
  957. BEGIN_STATE(CommentEndBang)
  958. {
  959. ON('-')
  960. {
  961. m_current_token.m_comment_or_character.data.append("--!");
  962. SWITCH_TO(CommentEndDash);
  963. }
  964. ON('>')
  965. {
  966. TODO();
  967. }
  968. ON_EOF
  969. {
  970. TODO();
  971. }
  972. ANYTHING_ELSE
  973. {
  974. m_current_token.m_comment_or_character.data.append("--!");
  975. RECONSUME_IN(Comment);
  976. }
  977. }
  978. END_STATE
  979. BEGIN_STATE(CommentEndDash)
  980. {
  981. ON('-')
  982. {
  983. SWITCH_TO(CommentEnd);
  984. }
  985. ON_EOF
  986. {
  987. TODO();
  988. }
  989. ANYTHING_ELSE
  990. {
  991. m_current_token.m_comment_or_character.data.append('-');
  992. RECONSUME_IN(Comment);
  993. }
  994. }
  995. END_STATE
  996. BEGIN_STATE(CommentLessThanSign)
  997. {
  998. ON('!')
  999. {
  1000. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  1001. SWITCH_TO(CommentLessThanSignBang);
  1002. }
  1003. ON('<')
  1004. {
  1005. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  1006. continue;
  1007. }
  1008. ANYTHING_ELSE
  1009. {
  1010. RECONSUME_IN(Comment);
  1011. }
  1012. }
  1013. END_STATE
  1014. BEGIN_STATE(CommentLessThanSignBang)
  1015. {
  1016. ON('-')
  1017. {
  1018. SWITCH_TO(CommentLessThanSignBangDash);
  1019. }
  1020. ANYTHING_ELSE
  1021. {
  1022. RECONSUME_IN(Comment);
  1023. }
  1024. }
  1025. END_STATE
  1026. BEGIN_STATE(CommentLessThanSignBangDash)
  1027. {
  1028. ON('-')
  1029. {
  1030. SWITCH_TO(CommentLessThanSignBangDashDash);
  1031. }
  1032. ANYTHING_ELSE
  1033. {
  1034. RECONSUME_IN(Comment);
  1035. }
  1036. }
  1037. END_STATE
  1038. BEGIN_STATE(CommentLessThanSignBangDashDash)
  1039. {
  1040. ON('>')
  1041. {
  1042. SWITCH_TO(CommentEnd);
  1043. }
  1044. ANYTHING_ELSE
  1045. {
  1046. TODO();
  1047. }
  1048. }
  1049. END_STATE
  1050. BEGIN_STATE(CharacterReference)
  1051. {
  1052. m_temporary_buffer.clear();
  1053. m_temporary_buffer.append('&');
  1054. ON_ASCII_ALPHANUMERIC
  1055. {
  1056. RECONSUME_IN(NamedCharacterReference);
  1057. }
  1058. ON('#')
  1059. {
  1060. m_temporary_buffer.append(current_input_character.value());
  1061. SWITCH_TO(NumericCharacterReference);
  1062. }
  1063. ANYTHING_ELSE
  1064. {
  1065. RECONSUME_IN_RETURN_STATE;
  1066. }
  1067. }
  1068. END_STATE
  1069. BEGIN_STATE(NamedCharacterReference)
  1070. {
  1071. // FIXME:This is not the right way to implement this state!!
  1072. ON(';')
  1073. {
  1074. SWITCH_TO_RETURN_STATE;
  1075. }
  1076. ANYTHING_ELSE
  1077. {
  1078. dbg() << "NamedCharacterReference: '" << (char)current_input_character.value() << "'";
  1079. continue;
  1080. }
  1081. }
  1082. END_STATE
  1083. BEGIN_STATE(NumericCharacterReference)
  1084. {
  1085. m_character_reference_code = 0;
  1086. ON('X')
  1087. {
  1088. m_temporary_buffer.append(current_input_character.value());
  1089. SWITCH_TO(HexadecimalCharacterReferenceStart);
  1090. }
  1091. ON('x')
  1092. {
  1093. m_temporary_buffer.append(current_input_character.value());
  1094. SWITCH_TO(HexadecimalCharacterReferenceStart);
  1095. }
  1096. ANYTHING_ELSE
  1097. {
  1098. RECONSUME_IN(DecimalCharacterReferenceStart);
  1099. }
  1100. }
  1101. END_STATE
  1102. BEGIN_STATE(HexadecimalCharacterReferenceStart)
  1103. {
  1104. ON_ASCII_HEX_DIGIT
  1105. {
  1106. RECONSUME_IN(HexadecimalCharacterReference);
  1107. }
  1108. ANYTHING_ELSE
  1109. {
  1110. TODO();
  1111. }
  1112. }
  1113. END_STATE
  1114. BEGIN_STATE(DecimalCharacterReferenceStart)
  1115. {
  1116. ON_ASCII_DIGIT
  1117. {
  1118. RECONSUME_IN(DecimalCharacterReference);
  1119. }
  1120. ANYTHING_ELSE
  1121. {
  1122. TODO();
  1123. }
  1124. }
  1125. END_STATE
  1126. BEGIN_STATE(HexadecimalCharacterReference)
  1127. {
  1128. ON_ASCII_DIGIT
  1129. {
  1130. m_character_reference_code *= 16;
  1131. m_character_reference_code += current_input_character.value() - 0x30;
  1132. continue;
  1133. }
  1134. ON_ASCII_UPPER_ALPHA
  1135. {
  1136. m_character_reference_code *= 16;
  1137. m_character_reference_code += current_input_character.value() - 0x37;
  1138. continue;
  1139. }
  1140. ON_ASCII_LOWER_ALPHA
  1141. {
  1142. m_character_reference_code *= 16;
  1143. m_character_reference_code += current_input_character.value() - 0x57;
  1144. continue;
  1145. }
  1146. ON(';')
  1147. {
  1148. SWITCH_TO(NumericCharacterReferenceEnd);
  1149. }
  1150. ANYTHING_ELSE
  1151. {
  1152. TODO();
  1153. }
  1154. }
  1155. END_STATE
  1156. BEGIN_STATE(DecimalCharacterReference)
  1157. {
  1158. ON_ASCII_DIGIT
  1159. {
  1160. m_character_reference_code *= 10;
  1161. m_character_reference_code += current_input_character.value() - 0x30;
  1162. continue;
  1163. }
  1164. ON(';')
  1165. {
  1166. SWITCH_TO(NumericCharacterReferenceEnd);
  1167. }
  1168. ANYTHING_ELSE
  1169. {
  1170. TODO();
  1171. }
  1172. }
  1173. END_STATE
  1174. BEGIN_STATE(NumericCharacterReferenceEnd)
  1175. {
  1176. if (m_character_reference_code == 0) {
  1177. TODO();
  1178. }
  1179. if (m_character_reference_code > 0x10ffff) {
  1180. TODO();
  1181. }
  1182. if (is_surrogate(m_character_reference_code)) {
  1183. TODO();
  1184. }
  1185. if (is_noncharacter(m_character_reference_code)) {
  1186. TODO();
  1187. }
  1188. if (m_character_reference_code == 0xd || (is_control(m_character_reference_code) && !isspace(m_character_reference_code))) {
  1189. TODO();
  1190. }
  1191. if (is_control(m_character_reference_code)) {
  1192. constexpr struct {
  1193. u32 number;
  1194. u32 codepoint;
  1195. } conversion_table[] = {
  1196. { 0x80, 0x20AC },
  1197. { 0x82, 0x201A },
  1198. { 0x83, 0x0192 },
  1199. { 0x84, 0x201E },
  1200. { 0x85, 0x2026 },
  1201. { 0x86, 0x2020 },
  1202. { 0x87, 0x2021 },
  1203. { 0x88, 0x02C6 },
  1204. { 0x89, 0x2030 },
  1205. { 0x8A, 0x0160 },
  1206. { 0x8B, 0x2039 },
  1207. { 0x8C, 0x0152 },
  1208. { 0x8E, 0x017D },
  1209. { 0x91, 0x2018 },
  1210. { 0x92, 0x2019 },
  1211. { 0x93, 0x201C },
  1212. { 0x94, 0x201D },
  1213. { 0x95, 0x2022 },
  1214. { 0x96, 0x2013 },
  1215. { 0x97, 0x2014 },
  1216. { 0x98, 0x02DC },
  1217. { 0x99, 0x2122 },
  1218. { 0x9A, 0x0161 },
  1219. { 0x9B, 0x203A },
  1220. { 0x9C, 0x0153 },
  1221. { 0x9E, 0x017E },
  1222. { 0x9F, 0x0178 },
  1223. };
  1224. for (auto& entry : conversion_table) {
  1225. if (m_character_reference_code == entry.number) {
  1226. m_character_reference_code = entry.codepoint;
  1227. break;
  1228. }
  1229. }
  1230. }
  1231. m_temporary_buffer.clear();
  1232. m_temporary_buffer.append(m_character_reference_code);
  1233. FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
  1234. SWITCH_TO_RETURN_STATE;
  1235. }
  1236. END_STATE
  1237. BEGIN_STATE(RCDATA)
  1238. {
  1239. ON('&')
  1240. {
  1241. m_return_state = State::RCDATA;
  1242. SWITCH_TO(CharacterReference);
  1243. }
  1244. ON('<')
  1245. {
  1246. SWITCH_TO(RCDATALessThanSign);
  1247. }
  1248. ON(0)
  1249. {
  1250. TODO();
  1251. }
  1252. ON_EOF
  1253. {
  1254. EMIT_EOF;
  1255. }
  1256. ANYTHING_ELSE
  1257. {
  1258. EMIT_CURRENT_CHARACTER;
  1259. }
  1260. }
  1261. END_STATE
  1262. BEGIN_STATE(RCDATALessThanSign)
  1263. {
  1264. ON('/')
  1265. {
  1266. m_temporary_buffer.clear();
  1267. SWITCH_TO(RCDATAEndTagOpen);
  1268. }
  1269. ANYTHING_ELSE
  1270. {
  1271. EMIT_CHARACTER('<');
  1272. RECONSUME_IN(RCDATA);
  1273. }
  1274. }
  1275. END_STATE
  1276. BEGIN_STATE(RCDATAEndTagOpen)
  1277. {
  1278. ON_ASCII_ALPHA
  1279. {
  1280. create_new_token(HTMLToken::Type::EndTag);
  1281. RECONSUME_IN(RCDATAEndTagName);
  1282. }
  1283. ANYTHING_ELSE
  1284. {
  1285. // FIXME: Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS character token. Reconsume in the RCDATA state.
  1286. TODO();
  1287. }
  1288. }
  1289. END_STATE
  1290. BEGIN_STATE(RCDATAEndTagName)
  1291. {
  1292. ON_WHITESPACE
  1293. {
  1294. TODO();
  1295. }
  1296. ON('/')
  1297. {
  1298. TODO();
  1299. }
  1300. ON('>')
  1301. {
  1302. if (!current_end_tag_token_is_appropriate()) {
  1303. // FIXME: Otherwise, treat it as per the "anything else" entry below.
  1304. TODO();
  1305. }
  1306. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1307. }
  1308. ON_ASCII_UPPER_ALPHA
  1309. {
  1310. m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
  1311. m_temporary_buffer.append(current_input_character.value());
  1312. continue;
  1313. }
  1314. ON_ASCII_LOWER_ALPHA
  1315. {
  1316. m_current_token.m_tag.tag_name.append(current_input_character.value());
  1317. m_temporary_buffer.append(current_input_character.value());
  1318. continue;
  1319. }
  1320. ANYTHING_ELSE
  1321. {
  1322. TODO();
  1323. }
  1324. }
  1325. END_STATE
  1326. BEGIN_STATE(RAWTEXT)
  1327. {
  1328. ON('<')
  1329. {
  1330. SWITCH_TO(RAWTEXTLessThanSign);
  1331. }
  1332. ON(0)
  1333. {
  1334. TODO();
  1335. }
  1336. ON_EOF
  1337. {
  1338. EMIT_EOF;
  1339. }
  1340. ANYTHING_ELSE
  1341. {
  1342. EMIT_CURRENT_CHARACTER;
  1343. }
  1344. }
  1345. END_STATE
  1346. BEGIN_STATE(RAWTEXTLessThanSign)
  1347. {
  1348. ON('/')
  1349. {
  1350. m_temporary_buffer.clear();
  1351. SWITCH_TO(RAWTEXTEndTagOpen);
  1352. }
  1353. ANYTHING_ELSE
  1354. {
  1355. EMIT_CHARACTER('<');
  1356. RECONSUME_IN(RAWTEXT);
  1357. }
  1358. }
  1359. END_STATE
  1360. BEGIN_STATE(RAWTEXTEndTagOpen)
  1361. {
  1362. ON_ASCII_ALPHA
  1363. {
  1364. create_new_token(HTMLToken::Type::EndTag);
  1365. RECONSUME_IN(RAWTEXTEndTagName);
  1366. }
  1367. ANYTHING_ELSE
  1368. {
  1369. // FIXME: Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS character token. Reconsume in the RAWTEXT state.
  1370. TODO();
  1371. }
  1372. }
  1373. END_STATE
  1374. BEGIN_STATE(RAWTEXTEndTagName)
  1375. {
  1376. ON_WHITESPACE
  1377. {
  1378. TODO();
  1379. }
  1380. ON('/')
  1381. {
  1382. TODO();
  1383. }
  1384. ON('>')
  1385. {
  1386. if (!current_end_tag_token_is_appropriate()) {
  1387. // FIXME: Otherwise, treat it as per the "anything else" entry below.
  1388. TODO();
  1389. }
  1390. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1391. }
  1392. ON_ASCII_UPPER_ALPHA
  1393. {
  1394. m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
  1395. m_temporary_buffer.append(current_input_character.value());
  1396. continue;
  1397. }
  1398. ON_ASCII_LOWER_ALPHA
  1399. {
  1400. m_current_token.m_tag.tag_name.append(current_input_character.value());
  1401. m_temporary_buffer.append(current_input_character.value());
  1402. continue;
  1403. }
  1404. ANYTHING_ELSE
  1405. {
  1406. TODO();
  1407. }
  1408. }
  1409. END_STATE
  1410. BEGIN_STATE(ScriptData)
  1411. {
  1412. ON('<')
  1413. {
  1414. SWITCH_TO(ScriptDataLessThanSign);
  1415. }
  1416. ON(0)
  1417. {
  1418. TODO();
  1419. }
  1420. ON_EOF
  1421. {
  1422. EMIT_EOF;
  1423. }
  1424. ANYTHING_ELSE
  1425. {
  1426. EMIT_CURRENT_CHARACTER;
  1427. }
  1428. }
  1429. END_STATE
  1430. BEGIN_STATE(ScriptDataLessThanSign)
  1431. {
  1432. ON('/')
  1433. {
  1434. m_temporary_buffer.clear();
  1435. SWITCH_TO(ScriptDataEndTagOpen);
  1436. }
  1437. ON('!')
  1438. {
  1439. TODO();
  1440. }
  1441. ANYTHING_ELSE
  1442. {
  1443. EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptData);
  1444. }
  1445. }
  1446. END_STATE
  1447. BEGIN_STATE(ScriptDataEndTagOpen)
  1448. {
  1449. ON_ASCII_ALPHA
  1450. {
  1451. create_new_token(HTMLToken::Type::EndTag);
  1452. RECONSUME_IN(ScriptDataEndTagName);
  1453. }
  1454. ANYTHING_ELSE
  1455. {
  1456. TODO();
  1457. }
  1458. }
  1459. END_STATE
  1460. BEGIN_STATE(ScriptDataEndTagName)
  1461. {
  1462. ON_WHITESPACE
  1463. {
  1464. if (current_end_tag_token_is_appropriate())
  1465. SWITCH_TO(BeforeAttributeName);
  1466. // FIXME: Otherwise, treat it as per the "anything else" entry below.
  1467. TODO();
  1468. }
  1469. ON('/')
  1470. {
  1471. if (current_end_tag_token_is_appropriate())
  1472. SWITCH_TO(SelfClosingStartTag);
  1473. // FIXME: Otherwise, treat it as per the "anything else" entry below.
  1474. TODO();
  1475. }
  1476. ON('>')
  1477. {
  1478. if (current_end_tag_token_is_appropriate())
  1479. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1480. // FIXME: Otherwise, treat it as per the "anything else" entry below.
  1481. TODO();
  1482. }
  1483. ON_ASCII_UPPER_ALPHA
  1484. {
  1485. m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
  1486. m_temporary_buffer.append(current_input_character.value());
  1487. continue;
  1488. }
  1489. ON_ASCII_LOWER_ALPHA
  1490. {
  1491. m_current_token.m_tag.tag_name.append(current_input_character.value());
  1492. m_temporary_buffer.append(current_input_character.value());
  1493. continue;
  1494. }
  1495. ANYTHING_ELSE
  1496. {
  1497. TODO();
  1498. }
  1499. }
  1500. END_STATE
  1501. default:
  1502. TODO();
  1503. }
  1504. }
  1505. }
  1506. bool HTMLTokenizer::consume_next_if_match(const StringView& string, CaseSensitivity case_sensitivity)
  1507. {
  1508. for (size_t i = 0; i < string.length(); ++i) {
  1509. auto codepoint = peek_codepoint(i);
  1510. if (!codepoint.has_value())
  1511. return false;
  1512. // FIXME: This should be more Unicode-aware.
  1513. if (case_sensitivity == CaseSensitivity::CaseInsensitive) {
  1514. if (codepoint.value() < 0x80) {
  1515. if (tolower(codepoint.value()) != tolower(string[i]))
  1516. return false;
  1517. continue;
  1518. }
  1519. }
  1520. if (codepoint.value() != (u32)string[i])
  1521. return false;
  1522. }
  1523. m_cursor += string.length();
  1524. return true;
  1525. }
  1526. void HTMLTokenizer::create_new_token(HTMLToken::Type type)
  1527. {
  1528. m_current_token = {};
  1529. m_current_token.m_type = type;
  1530. }
  1531. HTMLTokenizer::HTMLTokenizer(const StringView& input)
  1532. : m_input(input)
  1533. {
  1534. }
  1535. void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
  1536. {
  1537. #ifdef TOKENIZER_TRACE
  1538. dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
  1539. #endif
  1540. }
  1541. void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
  1542. {
  1543. #ifdef TOKENIZER_TRACE
  1544. dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
  1545. #endif
  1546. }
  1547. void HTMLTokenizer::switch_to(Badge<HTMLDocumentParser>, State new_state)
  1548. {
  1549. #ifdef TOKENIZER_TRACE
  1550. dbg() << "[" << state_name(m_state) << "] Parser switches tokenizer state to " << state_name(new_state);
  1551. #endif
  1552. m_state = new_state;
  1553. }
  1554. void HTMLTokenizer::will_emit(HTMLToken& token)
  1555. {
  1556. if (token.is_start_tag())
  1557. m_last_emitted_start_tag = token;
  1558. }
  1559. bool HTMLTokenizer::current_end_tag_token_is_appropriate() const
  1560. {
  1561. ASSERT(m_current_token.is_end_tag());
  1562. if (!m_last_emitted_start_tag.is_start_tag())
  1563. return false;
  1564. return m_current_token.tag_name() == m_last_emitted_start_tag.tag_name();
  1565. }
  1566. bool HTMLTokenizer::consumed_as_part_of_an_attribute() const
  1567. {
  1568. return m_return_state == State::AttributeValueUnquoted || m_return_state == State::AttributeValueSingleQuoted || m_return_state == State::AttributeValueDoubleQuoted;
  1569. }
  1570. }