HTMLTokenizer.cpp 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <LibWeb/Parser/HTMLToken.h>
  27. #include <LibWeb/Parser/HTMLTokenizer.h>
  28. #include <ctype.h>
  29. #pragma GCC diagnostic ignored "-Wunused-label"
  30. //#define TOKENIZER_TRACE
  // Marks a tokenizer branch that is not implemented yet: logs the current state
  // together with the input character that reached it, then aborts through
  // ASSERT_NOT_REACHED(). Must be expanded inside next_token(), where
  // current_input_character and m_state are in scope.
  //
  // The macro is also used in ON_EOF branches, where current_input_character is
  // empty, so the character is only read when there is one; otherwise "EOF" is
  // logged instead.
  #define TODO()                                                                                                                      \
      do {                                                                                                                            \
          if (current_input_character.has_value())                                                                                    \
              dbg() << "[TODO: " << state_name(m_state) << "] '" << static_cast<char>(current_input_character.value()) << "'";        \
          else                                                                                                                        \
              dbg() << "[TODO: " << state_name(m_state) << "] EOF";                                                                   \
          ASSERT_NOT_REACHED();                                                                                                       \
      } while (0)
  // State-transition helpers. All of them must be expanded inside next_token(),
  // where current_input_character, m_state, m_cursor and the state labels exist.

  // Moves to new_state, consumes the next code point and jumps to that state's label.
  #define SWITCH_TO(new_state)                           \
      do {                                               \
          will_switch_to(State::new_state);              \
          m_state = State::new_state;                    \
          current_input_character = next_codepoint();    \
          goto new_state;                                \
      } while (0)

  // Moves to new_state and jumps to its label WITHOUT consuming, so the new state
  // sees the same current_input_character.
  #define RECONSUME_IN(new_state)                        \
      do {                                               \
          will_reconsume_in(State::new_state);           \
          m_state = State::new_state;                    \
          goto new_state;                                \
      } while (0)

  // Records new_state as the state for the next call and returns m_current_token.
  #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state)    \
      do {                                               \
          will_switch_to(State::new_state);              \
          m_state = State::new_state;                    \
          will_emit(m_current_token);                    \
          return m_current_token;                        \
      } while (0)

  // Puts the current input character back so it can be peeked/consumed again.
  // next_codepoint() does not advance m_cursor when it returns an empty Optional,
  // so at end of input there is nothing to give back and the cursor is left alone.
  #define DONT_CONSUME_NEXT_INPUT_CHARACTER              \
      do {                                               \
          if (current_input_character.has_value())       \
              --m_cursor;                                \
      } while (0)
  // Input matchers. Each expands to an `if (...)` header that is followed by a
  // braced block at the use site; they test current_input_character, which must be
  // in scope. The branches are written as a sequence of plain `if`s, so every
  // branch body is expected to leave the state (goto / return / continue).

  // Matches exactly the given code point.
  #define ON(codepoint) \
      if (current_input_character.has_value() && current_input_character.value() == (codepoint))

  // Matches end of input (no current character).
  #define ON_EOF \
      if (!current_input_character.has_value())

  // Matches 'A'..'Z'.
  #define ON_ASCII_UPPER_ALPHA \
      if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')

  // Matches 'a'..'z'.
  #define ON_ASCII_LOWER_ALPHA \
      if (current_input_character.has_value() && current_input_character.value() >= 'a' && current_input_character.value() <= 'z')

  // Matches 'A'..'Z' or 'a'..'z'. Written as explicit range checks rather than
  // isalpha(): the current character is a full code point, and isalpha() is only
  // defined for values representable as unsigned char (and depends on the locale).
  #define ON_ASCII_ALPHA                                                                                  \
      if (current_input_character.has_value()                                                             \
          && ((current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')          \
              || (current_input_character.value() >= 'a' && current_input_character.value() <= 'z')))

  // Matches tab, line feed, form feed or space.
  #define ON_WHITESPACE \
      if (current_input_character.has_value() && (current_input_character.value() == '\t' || current_input_character.value() == '\n' || current_input_character.value() == '\f' || current_input_character.value() == ' '))

  // Unconditional fallback branch.
  #define ANYTHING_ELSE if (1)
  // Token-emitting helpers. Each one returns from next_token(), so nothing placed
  // after them in a branch is executed.

  // Notifies will_emit() and returns m_current_token as it stands.
  #define EMIT_CURRENT_TOKEN               \
      do {                                 \
          will_emit(m_current_token);      \
          return m_current_token;          \
      } while (0)

  // Emits the end-of-file token the first time it is reached; every later use
  // returns an empty Optional instead.
  #define EMIT_EOF                                           \
      do {                                                   \
          if (m_has_emitted_eof)                             \
              return {};                                     \
          m_has_emitted_eof = true;                          \
          create_new_token(HTMLToken::Type::EndOfFile);      \
          EMIT_CURRENT_TOKEN;                                \
      } while (0)

  // Replaces the current token with a Character token holding `codepoint` and emits it.
  #define EMIT_CHARACTER(codepoint)                                          \
      do {                                                                   \
          create_new_token(HTMLToken::Type::Character);                      \
          m_current_token.m_comment_or_character.data.append(codepoint);     \
          EMIT_CURRENT_TOKEN;                                                \
      } while (0)

  // Emits the current input character; only valid when there is one (not at EOF).
  #define EMIT_CURRENT_CHARACTER \
      EMIT_CHARACTER(current_input_character.value())
  // Brackets one tokenizer state inside the switch in next_token().
  //
  // BEGIN_STATE(X) introduces both a goto label `X` (the jump target of SWITCH_TO /
  // RECONSUME_IN) and `case State::X`, and opens a scope. END_STATE closes that
  // scope; every branch of a state is expected to leave it explicitly, so falling
  // off the end of a state trips ASSERT_NOT_REACHED().
  #define BEGIN_STATE(state) \
      state:                 \
      case State::state: {

  #define END_STATE              \
          ASSERT_NOT_REACHED();  \
          break;                 \
      }
  104. namespace Web {
  105. Optional<u32> HTMLTokenizer::next_codepoint()
  106. {
  107. if (m_cursor >= m_input.length())
  108. return {};
  109. return m_input[m_cursor++];
  110. }
  111. Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
  112. {
  113. if ((m_cursor + offset) >= m_input.length())
  114. return {};
  115. return m_input[m_cursor + offset];
  116. }
  117. Optional<HTMLToken> HTMLTokenizer::next_token()
  118. {
  119. for (;;) {
  120. auto current_input_character = next_codepoint();
  121. switch (m_state) {
  122. BEGIN_STATE(Data)
  123. {
  124. ON('&')
  125. {
  126. m_return_state = State::Data;
  127. SWITCH_TO(CharacterReference);
  128. }
  129. ON('<')
  130. {
  131. SWITCH_TO(TagOpen);
  132. }
  133. ON_EOF
  134. {
  135. EMIT_EOF;
  136. }
  137. ANYTHING_ELSE
  138. {
  139. EMIT_CURRENT_CHARACTER;
  140. }
  141. }
  142. END_STATE
  143. BEGIN_STATE(TagOpen)
  144. {
  145. ON('!')
  146. {
  147. SWITCH_TO(MarkupDeclarationOpen);
  148. }
  149. ON('/')
  150. {
  151. SWITCH_TO(EndTagOpen);
  152. }
  153. ON_ASCII_ALPHA
  154. {
  155. create_new_token(HTMLToken::Type::StartTag);
  156. RECONSUME_IN(TagName);
  157. }
  158. ON('?')
  159. {
  160. TODO();
  161. }
  162. ANYTHING_ELSE
  163. {
  164. TODO();
  165. }
  166. }
  167. END_STATE
  168. BEGIN_STATE(TagName)
  169. {
  170. ON_WHITESPACE
  171. {
  172. SWITCH_TO(BeforeAttributeName);
  173. }
  174. ON('/')
  175. {
  176. SWITCH_TO(SelfClosingStartTag);
  177. }
  178. ON('>')
  179. {
  180. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  181. }
  182. ANYTHING_ELSE
  183. {
  184. m_current_token.m_tag.tag_name.append(current_input_character.value());
  185. continue;
  186. }
  187. }
  188. END_STATE
  189. BEGIN_STATE(EndTagOpen)
  190. {
  191. ON_ASCII_ALPHA
  192. {
  193. create_new_token(HTMLToken::Type::EndTag);
  194. RECONSUME_IN(TagName);
  195. }
  196. }
  197. END_STATE
  198. BEGIN_STATE(MarkupDeclarationOpen)
  199. {
  200. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  201. if (next_few_characters_are("--")) {
  202. consume("--");
  203. create_new_token(HTMLToken::Type::Comment);
  204. SWITCH_TO(CommentStart);
  205. }
  206. if (next_few_characters_are("DOCTYPE")) {
  207. consume("DOCTYPE");
  208. SWITCH_TO(DOCTYPE);
  209. }
  210. }
  211. END_STATE
  212. BEGIN_STATE(DOCTYPE)
  213. {
  214. ON_WHITESPACE
  215. {
  216. SWITCH_TO(BeforeDOCTYPEName);
  217. }
  218. ON('>')
  219. {
  220. RECONSUME_IN(BeforeDOCTYPEName);
  221. }
  222. ON_EOF
  223. {
  224. TODO();
  225. }
  226. ANYTHING_ELSE
  227. {
  228. TODO();
  229. }
  230. }
  231. END_STATE
  232. BEGIN_STATE(BeforeDOCTYPEName)
  233. {
  234. ON_WHITESPACE
  235. {
  236. continue;
  237. }
  238. ON_ASCII_UPPER_ALPHA
  239. {
  240. create_new_token(HTMLToken::Type::DOCTYPE);
  241. m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
  242. SWITCH_TO(DOCTYPEName);
  243. }
  244. ON(0)
  245. {
  246. TODO();
  247. }
  248. ON('>')
  249. {
  250. TODO();
  251. }
  252. ON_EOF
  253. {
  254. TODO();
  255. }
  256. ANYTHING_ELSE
  257. {
  258. create_new_token(HTMLToken::Type::DOCTYPE);
  259. m_current_token.m_doctype.name.append(current_input_character.value());
  260. SWITCH_TO(DOCTYPEName);
  261. }
  262. }
  263. END_STATE
  264. BEGIN_STATE(DOCTYPEName)
  265. {
  266. ON_WHITESPACE
  267. {
  268. SWITCH_TO(AfterDOCTYPEName);
  269. }
  270. ON('>')
  271. {
  272. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  273. }
  274. ON_ASCII_UPPER_ALPHA
  275. {
  276. m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
  277. }
  278. ON(0)
  279. {
  280. TODO();
  281. }
  282. ON_EOF
  283. {
  284. TODO();
  285. }
  286. ANYTHING_ELSE
  287. {
  288. m_current_token.m_doctype.name.append(current_input_character.value());
  289. continue;
  290. }
  291. }
  292. END_STATE
  293. BEGIN_STATE(AfterDOCTYPEName)
  294. {
  295. ON_WHITESPACE
  296. {
  297. continue;
  298. }
  299. ON('>')
  300. {
  301. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  302. }
  303. ON_EOF
  304. {
  305. TODO();
  306. }
  307. ANYTHING_ELSE
  308. {
  309. TODO();
  310. }
  311. }
  312. END_STATE
  313. BEGIN_STATE(BeforeAttributeName)
  314. {
  315. ON_WHITESPACE
  316. {
  317. continue;
  318. }
  319. ON('/')
  320. {
  321. RECONSUME_IN(AfterAttributeName);
  322. }
  323. ON('>')
  324. {
  325. RECONSUME_IN(AfterAttributeName);
  326. }
  327. ON_EOF
  328. {
  329. RECONSUME_IN(AfterAttributeName);
  330. }
  331. ON('=')
  332. {
  333. TODO();
  334. }
  335. ANYTHING_ELSE
  336. {
  337. m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
  338. RECONSUME_IN(AttributeName);
  339. }
  340. }
  341. END_STATE
  342. BEGIN_STATE(SelfClosingStartTag)
  343. {
  344. }
  345. END_STATE
  346. BEGIN_STATE(AttributeName)
  347. {
  348. ON_WHITESPACE
  349. {
  350. RECONSUME_IN(AfterAttributeName);
  351. }
  352. ON('/')
  353. {
  354. RECONSUME_IN(AfterAttributeName);
  355. }
  356. ON('>')
  357. {
  358. RECONSUME_IN(AfterAttributeName);
  359. }
  360. ON_EOF
  361. {
  362. RECONSUME_IN(AfterAttributeName);
  363. }
  364. ON('=')
  365. {
  366. SWITCH_TO(BeforeAttributeValue);
  367. }
  368. ANYTHING_ELSE
  369. {
  370. m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value());
  371. continue;
  372. }
  373. }
  374. END_STATE
  375. BEGIN_STATE(AfterAttributeName)
  376. {
  377. }
  378. END_STATE
  379. BEGIN_STATE(BeforeAttributeValue)
  380. {
  381. ON_WHITESPACE
  382. {
  383. continue;
  384. }
  385. ON('"')
  386. {
  387. SWITCH_TO(AttributeValueDoubleQuoted);
  388. }
  389. ON('\'')
  390. {
  391. SWITCH_TO(AttributeValueSingleQuoted);
  392. }
  393. ON('>')
  394. {
  395. TODO();
  396. }
  397. ANYTHING_ELSE
  398. {
  399. RECONSUME_IN(AttributeValueUnquoted);
  400. }
  401. }
  402. END_STATE
  403. BEGIN_STATE(AttributeValueDoubleQuoted)
  404. {
  405. ON('"')
  406. {
  407. SWITCH_TO(AfterAttributeValueQuoted);
  408. }
  409. ON('&')
  410. {
  411. m_return_state = State::AttributeValueDoubleQuoted;
  412. SWITCH_TO(CharacterReference);
  413. }
  414. ON(0)
  415. {
  416. TODO();
  417. }
  418. ON_EOF
  419. {
  420. TODO();
  421. }
  422. ANYTHING_ELSE
  423. {
  424. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  425. continue;
  426. }
  427. }
  428. END_STATE
  429. BEGIN_STATE(AttributeValueSingleQuoted)
  430. {
  431. ON('\'')
  432. {
  433. SWITCH_TO(AfterAttributeValueQuoted);
  434. }
  435. ON('&')
  436. {
  437. m_return_state = State::AttributeValueSingleQuoted;
  438. SWITCH_TO(CharacterReference);
  439. }
  440. ON(0)
  441. {
  442. TODO();
  443. }
  444. ON_EOF
  445. {
  446. TODO();
  447. }
  448. ANYTHING_ELSE
  449. {
  450. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  451. continue;
  452. }
  453. }
  454. END_STATE
  455. BEGIN_STATE(AttributeValueUnquoted)
  456. {
  457. ON_WHITESPACE
  458. {
  459. SWITCH_TO(BeforeAttributeName);
  460. }
  461. ON('&')
  462. {
  463. m_return_state = State::AttributeValueUnquoted;
  464. SWITCH_TO(CharacterReference);
  465. }
  466. ON('>')
  467. {
  468. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  469. }
  470. ON(0)
  471. {
  472. TODO();
  473. }
  474. ON_EOF
  475. {
  476. TODO();
  477. }
  478. ANYTHING_ELSE
  479. {
  480. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  481. continue;
  482. }
  483. }
  484. END_STATE
  485. BEGIN_STATE(AfterAttributeValueQuoted)
  486. {
  487. ON_WHITESPACE
  488. {
  489. SWITCH_TO(BeforeAttributeName);
  490. }
  491. ON('/')
  492. {
  493. SWITCH_TO(SelfClosingStartTag);
  494. }
  495. ON('>')
  496. {
  497. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  498. }
  499. ON_EOF
  500. {
  501. TODO();
  502. }
  503. ANYTHING_ELSE
  504. {
  505. TODO();
  506. }
  507. }
  508. END_STATE
  509. BEGIN_STATE(CommentStart)
  510. {
  511. ON('-')
  512. {
  513. SWITCH_TO(CommentStartDash);
  514. }
  515. ON('>')
  516. {
  517. TODO();
  518. }
  519. ANYTHING_ELSE
  520. {
  521. RECONSUME_IN(Comment);
  522. }
  523. }
  524. END_STATE
  525. BEGIN_STATE(CommentStartDash)
  526. {
  527. ON('-')
  528. {
  529. SWITCH_TO(CommentEnd);
  530. }
  531. ON('>')
  532. {
  533. TODO();
  534. }
  535. ON_EOF
  536. {
  537. TODO();
  538. }
  539. ANYTHING_ELSE
  540. {
  541. m_current_token.m_comment_or_character.data.append('-');
  542. RECONSUME_IN(Comment);
  543. }
  544. }
  545. END_STATE
  546. BEGIN_STATE(Comment)
  547. {
  548. ON('<')
  549. {
  550. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  551. SWITCH_TO(CommentLessThanSign);
  552. }
  553. ON('-')
  554. {
  555. SWITCH_TO(CommentEndDash);
  556. }
  557. ON(0)
  558. {
  559. TODO();
  560. }
  561. ON_EOF
  562. {
  563. TODO();
  564. }
  565. ANYTHING_ELSE
  566. {
  567. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  568. continue;
  569. }
  570. }
  571. END_STATE
  572. BEGIN_STATE(CommentEnd)
  573. {
  574. ON('>')
  575. {
  576. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  577. }
  578. ON('!')
  579. {
  580. SWITCH_TO(CommentEndBang);
  581. }
  582. ON('-')
  583. {
  584. m_current_token.m_comment_or_character.data.append('-');
  585. continue;
  586. }
  587. ON_EOF
  588. {
  589. TODO();
  590. }
  591. ANYTHING_ELSE
  592. {
  593. m_current_token.m_comment_or_character.data.append('-');
  594. RECONSUME_IN(Comment);
  595. }
  596. }
  597. END_STATE
  598. BEGIN_STATE(CommentEndBang)
  599. {
  600. ON('-')
  601. {
  602. m_current_token.m_comment_or_character.data.append("--!");
  603. SWITCH_TO(CommentEndDash);
  604. }
  605. ON('>')
  606. {
  607. TODO();
  608. }
  609. ON_EOF
  610. {
  611. TODO();
  612. }
  613. ANYTHING_ELSE
  614. {
  615. m_current_token.m_comment_or_character.data.append("--!");
  616. RECONSUME_IN(Comment);
  617. }
  618. }
  619. END_STATE
  620. BEGIN_STATE(CommentEndDash)
  621. {
  622. ON('-')
  623. {
  624. SWITCH_TO(CommentEnd);
  625. }
  626. ON_EOF
  627. {
  628. TODO();
  629. }
  630. ANYTHING_ELSE
  631. {
  632. m_current_token.m_comment_or_character.data.append('-');
  633. RECONSUME_IN(Comment);
  634. }
  635. }
  636. END_STATE
  637. BEGIN_STATE(CommentLessThanSign)
  638. {
  639. ON('!')
  640. {
  641. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  642. SWITCH_TO(CommentLessThanSignBang);
  643. }
  644. ON('<')
  645. {
  646. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  647. continue;
  648. }
  649. ANYTHING_ELSE
  650. {
  651. RECONSUME_IN(Comment);
  652. }
  653. }
  654. END_STATE
  655. BEGIN_STATE(CommentLessThanSignBang)
  656. {
  657. ON('-')
  658. {
  659. SWITCH_TO(CommentLessThanSignBangDash);
  660. }
  661. ANYTHING_ELSE
  662. {
  663. RECONSUME_IN(Comment);
  664. }
  665. }
  666. END_STATE
  667. BEGIN_STATE(CommentLessThanSignBangDash)
  668. {
  669. ON('-')
  670. {
  671. SWITCH_TO(CommentLessThanSignBangDashDash);
  672. }
  673. ANYTHING_ELSE
  674. {
  675. RECONSUME_IN(Comment);
  676. }
  677. }
  678. END_STATE
  679. BEGIN_STATE(CommentLessThanSignBangDashDash)
  680. {
  681. ON('>')
  682. {
  683. SWITCH_TO(CommentEnd);
  684. }
  685. ANYTHING_ELSE
  686. {
  687. TODO();
  688. }
  689. }
  690. END_STATE
  691. BEGIN_STATE(CharacterReference)
  692. {
  693. }
  694. END_STATE
  695. BEGIN_STATE(RCDATA)
  696. {
  697. ON('&')
  698. {
  699. m_return_state = State::RCDATA;
  700. SWITCH_TO(CharacterReference);
  701. }
  702. ON('<')
  703. {
  704. SWITCH_TO(RCDATALessThanSign);
  705. }
  706. ON(0)
  707. {
  708. TODO();
  709. }
  710. ON_EOF
  711. {
  712. EMIT_EOF;
  713. }
  714. ANYTHING_ELSE
  715. {
  716. EMIT_CURRENT_CHARACTER;
  717. }
  718. }
  719. END_STATE
  720. BEGIN_STATE(RCDATALessThanSign)
  721. {
  722. ON('/')
  723. {
  724. m_temporary_buffer.clear();
  725. SWITCH_TO(RCDATAEndTagOpen);
  726. }
  727. ANYTHING_ELSE
  728. {
  729. EMIT_CHARACTER('<');
  730. RECONSUME_IN(RCDATA);
  731. }
  732. }
  733. END_STATE
  734. BEGIN_STATE(RCDATAEndTagOpen)
  735. {
  736. ON_ASCII_ALPHA
  737. {
  738. create_new_token(HTMLToken::Type::EndTag);
  739. RECONSUME_IN(RCDATAEndTagName);
  740. }
  741. ANYTHING_ELSE
  742. {
  743. // FIXME: Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS character token. Reconsume in the RCDATA state.
  744. TODO();
  745. }
  746. }
  747. END_STATE
  748. BEGIN_STATE(RCDATAEndTagName)
  749. {
  750. ON_WHITESPACE
  751. {
  752. TODO();
  753. }
  754. ON('/')
  755. {
  756. TODO();
  757. }
  758. ON('>')
  759. {
  760. if (!current_end_tag_token_is_appropriate()) {
  761. // FIXME: Otherwise, treat it as per the "anything else" entry below.
  762. TODO();
  763. }
  764. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  765. }
  766. ON_ASCII_UPPER_ALPHA
  767. {
  768. m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
  769. m_temporary_buffer.append(current_input_character.value());
  770. continue;
  771. }
  772. ON_ASCII_LOWER_ALPHA
  773. {
  774. m_current_token.m_tag.tag_name.append(current_input_character.value());
  775. m_temporary_buffer.append(current_input_character.value());
  776. continue;
  777. }
  778. ANYTHING_ELSE
  779. {
  780. TODO();
  781. }
  782. }
  783. END_STATE
  784. BEGIN_STATE(RAWTEXT)
  785. {
  786. ON('<')
  787. {
  788. SWITCH_TO(RAWTEXTLessThanSign);
  789. }
  790. ON(0)
  791. {
  792. TODO();
  793. }
  794. ON_EOF
  795. {
  796. EMIT_EOF;
  797. }
  798. ANYTHING_ELSE
  799. {
  800. EMIT_CURRENT_CHARACTER;
  801. }
  802. }
  803. END_STATE
  804. BEGIN_STATE(RAWTEXTLessThanSign)
  805. {
  806. ON('/')
  807. {
  808. m_temporary_buffer.clear();
  809. SWITCH_TO(RAWTEXTEndTagOpen);
  810. }
  811. ANYTHING_ELSE
  812. {
  813. EMIT_CHARACTER('<');
  814. RECONSUME_IN(RAWTEXT);
  815. }
  816. }
  817. END_STATE
  818. BEGIN_STATE(RAWTEXTEndTagOpen)
  819. {
  820. ON_ASCII_ALPHA
  821. {
  822. create_new_token(HTMLToken::Type::EndTag);
  823. RECONSUME_IN(RAWTEXTEndTagName);
  824. }
  825. ANYTHING_ELSE
  826. {
  827. // FIXME: Emit a U+003C LESS-THAN SIGN character token and a U+002F SOLIDUS character token. Reconsume in the RAWTEXT state.
  828. TODO();
  829. }
  830. }
  831. END_STATE
  832. BEGIN_STATE(RAWTEXTEndTagName)
  833. {
  834. ON_WHITESPACE
  835. {
  836. TODO();
  837. }
  838. ON('/')
  839. {
  840. TODO();
  841. }
  842. ON('>')
  843. {
  844. if (!current_end_tag_token_is_appropriate()) {
  845. // FIXME: Otherwise, treat it as per the "anything else" entry below.
  846. TODO();
  847. }
  848. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  849. }
  850. ON_ASCII_UPPER_ALPHA
  851. {
  852. m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
  853. m_temporary_buffer.append(current_input_character.value());
  854. continue;
  855. }
  856. ON_ASCII_LOWER_ALPHA
  857. {
  858. m_current_token.m_tag.tag_name.append(current_input_character.value());
  859. m_temporary_buffer.append(current_input_character.value());
  860. continue;
  861. }
  862. ANYTHING_ELSE
  863. {
  864. TODO();
  865. }
  866. }
  867. END_STATE
  868. BEGIN_STATE(ScriptData)
  869. {
  870. ON('<')
  871. {
  872. SWITCH_TO(ScriptDataLessThanSign);
  873. }
  874. ON(0)
  875. {
  876. TODO();
  877. }
  878. ON_EOF
  879. {
  880. EMIT_EOF;
  881. }
  882. ANYTHING_ELSE
  883. {
  884. EMIT_CURRENT_CHARACTER;
  885. }
  886. }
  887. END_STATE
  888. BEGIN_STATE(ScriptDataLessThanSign)
  889. {
  890. ON('/')
  891. {
  892. m_temporary_buffer.clear();
  893. SWITCH_TO(ScriptDataEndTagOpen);
  894. }
  895. ON('!')
  896. {
  897. TODO();
  898. }
  899. ANYTHING_ELSE
  900. {
  901. EMIT_CHARACTER('<');
  902. RECONSUME_IN(ScriptData);
  903. }
  904. }
  905. END_STATE
  906. BEGIN_STATE(ScriptDataEndTagOpen)
  907. {
  908. ON_ASCII_ALPHA
  909. {
  910. create_new_token(HTMLToken::Type::EndTag);
  911. RECONSUME_IN(ScriptDataEndTagName);
  912. }
  913. ANYTHING_ELSE
  914. {
  915. TODO();
  916. }
  917. }
  918. END_STATE
  919. BEGIN_STATE(ScriptDataEndTagName)
  920. {
  921. ON_WHITESPACE
  922. {
  923. if (current_end_tag_token_is_appropriate())
  924. SWITCH_TO(BeforeAttributeName);
  925. // FIXME: Otherwise, treat it as per the "anything else" entry below.
  926. TODO();
  927. }
  928. ON('/')
  929. {
  930. if (current_end_tag_token_is_appropriate())
  931. SWITCH_TO(SelfClosingStartTag);
  932. // FIXME: Otherwise, treat it as per the "anything else" entry below.
  933. TODO();
  934. }
  935. ON('>')
  936. {
  937. if (current_end_tag_token_is_appropriate())
  938. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  939. // FIXME: Otherwise, treat it as per the "anything else" entry below.
  940. TODO();
  941. }
  942. ON_ASCII_UPPER_ALPHA
  943. {
  944. m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
  945. m_temporary_buffer.append(current_input_character.value());
  946. continue;
  947. }
  948. ON_ASCII_LOWER_ALPHA
  949. {
  950. m_current_token.m_tag.tag_name.append(current_input_character.value());
  951. m_temporary_buffer.append(current_input_character.value());
  952. continue;
  953. }
  954. ANYTHING_ELSE
  955. {
  956. TODO();
  957. }
  958. }
  959. END_STATE
  960. default:
  961. TODO();
  962. }
  963. }
  964. }
  965. void HTMLTokenizer::consume(const StringView& string)
  966. {
  967. ASSERT(next_few_characters_are(string));
  968. m_cursor += string.length();
  969. }
  970. bool HTMLTokenizer::next_few_characters_are(const StringView& string) const
  971. {
  972. for (size_t i = 0; i < string.length(); ++i) {
  973. auto codepoint = peek_codepoint(i);
  974. if (!codepoint.has_value())
  975. return false;
  976. // FIXME: This should be more Unicode-aware.
  977. if (codepoint.value() != (u32)string[i])
  978. return false;
  979. }
  980. return true;
  981. }
  982. void HTMLTokenizer::create_new_token(HTMLToken::Type type)
  983. {
  984. m_current_token = {};
  985. m_current_token.m_type = type;
  986. }
  987. HTMLTokenizer::HTMLTokenizer(const StringView& input)
  988. : m_input(input)
  989. {
  990. }
  991. void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
  992. {
  993. #ifdef TOKENIZER_TRACE
  994. dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
  995. #endif
  996. }
  997. void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
  998. {
  999. #ifdef TOKENIZER_TRACE
  1000. dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
  1001. #endif
  1002. }
  1003. void HTMLTokenizer::switch_to(Badge<HTMLDocumentParser>, State new_state)
  1004. {
  1005. #ifdef TOKENIZER_TRACE
  1006. dbg() << "[" << state_name(m_state) << "] Parser switches tokenizer state to " << state_name(new_state);
  1007. #endif
  1008. m_state = new_state;
  1009. }
  1010. void HTMLTokenizer::will_emit(HTMLToken& token)
  1011. {
  1012. if (token.is_start_tag())
  1013. m_last_emitted_start_tag = token;
  1014. }
  1015. bool HTMLTokenizer::current_end_tag_token_is_appropriate() const
  1016. {
  1017. ASSERT(m_current_token.is_end_tag());
  1018. if (!m_last_emitted_start_tag.is_start_tag())
  1019. return false;
  1020. return m_current_token.tag_name() == m_last_emitted_start_tag.tag_name();
  1021. }
  1022. }