// HTMLTokenizer.cpp
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <LibWeb/Parser/HTMLToken.h>
  27. #include <LibWeb/Parser/HTMLTokenizer.h>
  28. #include <ctype.h>
  29. #pragma GCC diagnostic ignored "-Wunused-label"
  30. //#define TOKENIZER_TRACE
  // Debug aid for tokenizer branches that are not implemented yet: logs the
  // current state together with the input character that reached the branch and
  // then stops via ASSERT_NOT_REACHED().
  //
  // The macro is also used inside ON_EOF branches, where there is no current
  // character. In that case "EOF" is logged instead of calling value() on an
  // empty Optional, so the diagnostic is still printed before the assertion.
  #define TODO() \
      do { \
          if (current_input_character.has_value()) \
              dbg() << "[TODO: " << state_name(m_state) << "] '" << (char)current_input_character.value() << "'"; \
          else \
              dbg() << "[TODO: " << state_name(m_state) << "] EOF"; \
          ASSERT_NOT_REACHED(); \
      } while (0)
  // State-transition helpers for the switch in HTMLTokenizer::next_token().
  // Each multi-statement helper is wrapped in do { } while (0) so that it
  // expands to exactly one statement and therefore stays correct when used as
  // the body of an unbraced if/else. Every use must be followed by a semicolon.

  // Notify will_switch_to(), make new_state current, consume the next input
  // character and jump to the label of the new state.
  #define SWITCH_TO(new_state) \
      do { \
          will_switch_to(State::new_state); \
          m_state = State::new_state; \
          current_input_character = next_codepoint(); \
          goto new_state; \
      } while (0)

  // Notify will_reconsume_in(), make new_state current and jump to its label
  // WITHOUT consuming input, so the new state sees the same character again.
  #define RECONSUME_IN(new_state) \
      do { \
          will_reconsume_in(State::new_state); \
          m_state = State::new_state; \
          goto new_state; \
      } while (0)

  // Notify will_switch_to(), make new_state current and return the token under
  // construction from the enclosing function. No input is consumed here; the
  // next call starts in new_state.
  #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
      do { \
          will_switch_to(State::new_state); \
          m_state = State::new_state; \
          return m_current_token; \
      } while (0)

  // Step the cursor back by one so that the most recently consumed character
  // is read again by the next lookahead/consume call.
  #define DONT_CONSUME_NEXT_INPUT_CHARACTER \
      do { \
          --m_cursor; \
      } while (0)
  // Branch headers used inside each tokenizer state. Every macro expands to an
  // `if (...)` and must be followed by a statement or block. Apart from ON_EOF
  // and ANYTHING_ELSE, none of them matches when there is no current character.

  // Matches exactly the given code point.
  #define ON(codepoint) \
      if (current_input_character.has_value() && current_input_character.value() == (codepoint))

  // Matches when the input is exhausted (no current character).
  #define ON_EOF \
      if (!current_input_character.has_value())

  // Matches ASCII letters A-Z and a-z only. The ranges are compared explicitly
  // instead of calling isalpha(): the current character is a full code point,
  // and isalpha() is undefined for arguments that are not representable as
  // unsigned char (or EOF) and additionally depends on the active locale.
  #define ON_ASCII_ALPHA \
      if (current_input_character.has_value() \
          && ((current_input_character.value() >= 'A' && current_input_character.value() <= 'Z') \
              || (current_input_character.value() >= 'a' && current_input_character.value() <= 'z')))

  // Matches ASCII upper-case letters A-Z.
  #define ON_ASCII_UPPER_ALPHA \
      if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')

  // Matches tab, line feed, form feed and space.
  #define ON_WHITESPACE \
      if (current_input_character.has_value() \
          && (current_input_character.value() == '\t' || current_input_character.value() == '\n' \
              || current_input_character.value() == '\f' || current_input_character.value() == ' '))

  // Unconditional fallback; place it last in a state.
  #define ANYTHING_ELSE if (1)
  // Return the end-of-file token from the enclosing function, but only once:
  // the first expansion that runs creates an EndOfFile token and returns it,
  // any later one returns `{}`. Wrapped in do { } while (0) so the leading `if`
  // cannot capture a following `else` and the macro stays a single statement.
  #define EMIT_EOF \
      do { \
          if (m_has_emitted_eof) \
              return {}; \
          m_has_emitted_eof = true; \
          create_new_token(HTMLToken::Type::EndOfFile); \
          return m_current_token; \
      } while (0)

  // Return the token under construction from the enclosing function.
  #define EMIT_CURRENT_TOKEN \
      return m_current_token
  // BEGIN_STATE(name) ... END_STATE bracket the handler of one tokenizer state.
  //
  // BEGIN_STATE emits both a goto label and a `case` label for the state and
  // opens the handler's scope, so the handler can be entered either through
  // `switch (m_state)` or through `goto name`. END_STATE closes that scope and
  // asserts if control runs off the end of the handler.
  #define BEGIN_STATE(state) \
      state: \
      case State::state: {

  #define END_STATE \
          ASSERT_NOT_REACHED(); \
          break; \
      }
  80. namespace Web {
  81. Optional<u32> HTMLTokenizer::next_codepoint()
  82. {
  83. if (m_cursor >= m_input.length())
  84. return {};
  85. return m_input[m_cursor++];
  86. }
  87. Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
  88. {
  89. if ((m_cursor + offset) >= m_input.length())
  90. return {};
  91. return m_input[m_cursor + offset];
  92. }
  93. Optional<HTMLToken> HTMLTokenizer::next_token()
  94. {
  95. for (;;) {
  96. auto current_input_character = next_codepoint();
  97. switch (m_state) {
  98. BEGIN_STATE(Data)
  99. {
  100. ON('&')
  101. {
  102. m_return_state = State::Data;
  103. SWITCH_TO(CharacterReference);
  104. }
  105. ON('<')
  106. {
  107. SWITCH_TO(TagOpen);
  108. }
  109. ON_EOF
  110. {
  111. EMIT_EOF;
  112. }
  113. ANYTHING_ELSE
  114. {
  115. if (m_current_token.type() != HTMLToken::Type::Character)
  116. create_new_token(HTMLToken::Type::Character);
  117. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  118. continue;
  119. }
  120. }
  121. END_STATE
  122. BEGIN_STATE(TagOpen)
  123. {
  124. ON('!')
  125. {
  126. SWITCH_TO(MarkupDeclarationOpen);
  127. }
  128. ON('/')
  129. {
  130. SWITCH_TO(EndTagOpen);
  131. }
  132. ON_ASCII_ALPHA
  133. {
  134. create_new_token(HTMLToken::Type::StartTag);
  135. RECONSUME_IN(TagName);
  136. }
  137. ON('?')
  138. {
  139. TODO();
  140. }
  141. ANYTHING_ELSE
  142. {
  143. TODO();
  144. }
  145. }
  146. END_STATE
  147. BEGIN_STATE(TagName)
  148. {
  149. ON_WHITESPACE
  150. {
  151. SWITCH_TO(BeforeAttributeName);
  152. }
  153. ON('/')
  154. {
  155. SWITCH_TO(SelfClosingStartTag);
  156. }
  157. ON('>')
  158. {
  159. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  160. }
  161. ANYTHING_ELSE
  162. {
  163. m_current_token.m_tag.tag_name.append(current_input_character.value());
  164. continue;
  165. }
  166. }
  167. END_STATE
  168. BEGIN_STATE(EndTagOpen)
  169. {
  170. ON_ASCII_ALPHA
  171. {
  172. create_new_token(HTMLToken::Type::EndTag);
  173. RECONSUME_IN(TagName);
  174. }
  175. }
  176. END_STATE
  177. BEGIN_STATE(MarkupDeclarationOpen)
  178. {
  179. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  180. if (next_few_characters_are("--")) {
  181. consume("--");
  182. create_new_token(HTMLToken::Type::Comment);
  183. SWITCH_TO(CommentStart);
  184. }
  185. if (next_few_characters_are("DOCTYPE")) {
  186. consume("DOCTYPE");
  187. SWITCH_TO(DOCTYPE);
  188. }
  189. }
  190. END_STATE
  191. BEGIN_STATE(DOCTYPE)
  192. {
  193. ON_WHITESPACE
  194. {
  195. SWITCH_TO(BeforeDOCTYPEName);
  196. }
  197. ON('>')
  198. {
  199. RECONSUME_IN(BeforeDOCTYPEName);
  200. }
  201. ON_EOF
  202. {
  203. TODO();
  204. }
  205. ANYTHING_ELSE
  206. {
  207. TODO();
  208. }
  209. }
  210. END_STATE
  211. BEGIN_STATE(BeforeDOCTYPEName)
  212. {
  213. ON_WHITESPACE
  214. {
  215. continue;
  216. }
  217. ON_ASCII_UPPER_ALPHA
  218. {
  219. create_new_token(HTMLToken::Type::DOCTYPE);
  220. m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
  221. SWITCH_TO(DOCTYPEName);
  222. }
  223. ON(0)
  224. {
  225. TODO();
  226. }
  227. ON('>')
  228. {
  229. TODO();
  230. }
  231. ON_EOF
  232. {
  233. TODO();
  234. }
  235. ANYTHING_ELSE
  236. {
  237. create_new_token(HTMLToken::Type::DOCTYPE);
  238. m_current_token.m_doctype.name.append(current_input_character.value());
  239. SWITCH_TO(DOCTYPEName);
  240. }
  241. }
  242. END_STATE
  243. BEGIN_STATE(DOCTYPEName)
  244. {
  245. ON_WHITESPACE
  246. {
  247. SWITCH_TO(AfterDOCTYPEName);
  248. }
  249. ON('>')
  250. {
  251. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  252. }
  253. ON_ASCII_UPPER_ALPHA
  254. {
  255. m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
  256. }
  257. ON(0)
  258. {
  259. TODO();
  260. }
  261. ON_EOF
  262. {
  263. TODO();
  264. }
  265. ANYTHING_ELSE
  266. {
  267. m_current_token.m_doctype.name.append(current_input_character.value());
  268. continue;
  269. }
  270. }
  271. END_STATE
  272. BEGIN_STATE(AfterDOCTYPEName)
  273. {
  274. ON_WHITESPACE
  275. {
  276. continue;
  277. }
  278. ON('>')
  279. {
  280. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  281. }
  282. ON_EOF
  283. {
  284. TODO();
  285. }
  286. ANYTHING_ELSE
  287. {
  288. TODO();
  289. }
  290. }
  291. END_STATE
  292. BEGIN_STATE(BeforeAttributeName)
  293. {
  294. ON_WHITESPACE
  295. {
  296. continue;
  297. }
  298. ON('/')
  299. {
  300. RECONSUME_IN(AfterAttributeName);
  301. }
  302. ON('>')
  303. {
  304. RECONSUME_IN(AfterAttributeName);
  305. }
  306. ON_EOF
  307. {
  308. RECONSUME_IN(AfterAttributeName);
  309. }
  310. ON('=')
  311. {
  312. TODO();
  313. }
  314. ANYTHING_ELSE
  315. {
  316. m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
  317. RECONSUME_IN(AttributeName);
  318. }
  319. }
  320. END_STATE
  321. BEGIN_STATE(SelfClosingStartTag)
  322. {
  323. }
  324. END_STATE
  325. BEGIN_STATE(AttributeName)
  326. {
  327. ON_WHITESPACE
  328. {
  329. RECONSUME_IN(AfterAttributeName);
  330. }
  331. ON('/')
  332. {
  333. RECONSUME_IN(AfterAttributeName);
  334. }
  335. ON('>')
  336. {
  337. RECONSUME_IN(AfterAttributeName);
  338. }
  339. ON_EOF
  340. {
  341. RECONSUME_IN(AfterAttributeName);
  342. }
  343. ON('=')
  344. {
  345. SWITCH_TO(BeforeAttributeValue);
  346. }
  347. ANYTHING_ELSE
  348. {
  349. m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value());
  350. continue;
  351. }
  352. }
  353. END_STATE
  354. BEGIN_STATE(AfterAttributeName)
  355. {
  356. }
  357. END_STATE
  358. BEGIN_STATE(BeforeAttributeValue)
  359. {
  360. ON_WHITESPACE
  361. {
  362. continue;
  363. }
  364. ON('"')
  365. {
  366. SWITCH_TO(AttributeValueDoubleQuoted);
  367. }
  368. ON('\'')
  369. {
  370. SWITCH_TO(AttributeValueSingleQuoted);
  371. }
  372. ON('>')
  373. {
  374. TODO();
  375. }
  376. ANYTHING_ELSE
  377. {
  378. RECONSUME_IN(AttributeValueUnquoted);
  379. }
  380. }
  381. END_STATE
  382. BEGIN_STATE(AttributeValueDoubleQuoted)
  383. {
  384. ON('"')
  385. {
  386. SWITCH_TO(AfterAttributeValueQuoted);
  387. }
  388. ON('&')
  389. {
  390. m_return_state = State::AttributeValueDoubleQuoted;
  391. SWITCH_TO(CharacterReference);
  392. }
  393. ON(0)
  394. {
  395. TODO();
  396. }
  397. ON_EOF
  398. {
  399. TODO();
  400. }
  401. ANYTHING_ELSE
  402. {
  403. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  404. continue;
  405. }
  406. }
  407. END_STATE
  408. BEGIN_STATE(AttributeValueSingleQuoted)
  409. {
  410. ON('\'')
  411. {
  412. SWITCH_TO(AfterAttributeValueQuoted);
  413. }
  414. ON('&')
  415. {
  416. m_return_state = State::AttributeValueSingleQuoted;
  417. SWITCH_TO(CharacterReference);
  418. }
  419. ON(0)
  420. {
  421. TODO();
  422. }
  423. ON_EOF
  424. {
  425. TODO();
  426. }
  427. ANYTHING_ELSE
  428. {
  429. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  430. continue;
  431. }
  432. }
  433. END_STATE
  434. BEGIN_STATE(AttributeValueUnquoted)
  435. {
  436. ON_WHITESPACE
  437. {
  438. SWITCH_TO(BeforeAttributeName);
  439. }
  440. ON('&')
  441. {
  442. m_return_state = State::AttributeValueUnquoted;
  443. SWITCH_TO(CharacterReference);
  444. }
  445. ON('>')
  446. {
  447. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  448. }
  449. ON(0)
  450. {
  451. TODO();
  452. }
  453. ON_EOF
  454. {
  455. TODO();
  456. }
  457. ANYTHING_ELSE
  458. {
  459. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  460. continue;
  461. }
  462. }
  463. END_STATE
  464. BEGIN_STATE(AfterAttributeValueQuoted)
  465. {
  466. ON_WHITESPACE
  467. {
  468. SWITCH_TO(BeforeAttributeName);
  469. }
  470. ON('/')
  471. {
  472. SWITCH_TO(SelfClosingStartTag);
  473. }
  474. ON('>')
  475. {
  476. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  477. }
  478. ON_EOF
  479. {
  480. TODO();
  481. }
  482. ANYTHING_ELSE
  483. {
  484. TODO();
  485. }
  486. }
  487. END_STATE
  488. BEGIN_STATE(CommentStart)
  489. {
  490. ON('-')
  491. {
  492. SWITCH_TO(CommentStartDash);
  493. }
  494. ON('>')
  495. {
  496. TODO();
  497. }
  498. ANYTHING_ELSE
  499. {
  500. RECONSUME_IN(Comment);
  501. }
  502. }
  503. END_STATE
  504. BEGIN_STATE(CommentStartDash)
  505. {
  506. ON('-')
  507. {
  508. SWITCH_TO(CommentEnd);
  509. }
  510. ON('>')
  511. {
  512. TODO();
  513. }
  514. ON_EOF
  515. {
  516. TODO();
  517. }
  518. ANYTHING_ELSE
  519. {
  520. m_current_token.m_comment_or_character.data.append('-');
  521. RECONSUME_IN(Comment);
  522. }
  523. }
  524. END_STATE
  525. BEGIN_STATE(Comment)
  526. {
  527. ON('<')
  528. {
  529. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  530. SWITCH_TO(CommentLessThanSign);
  531. }
  532. ON('-')
  533. {
  534. SWITCH_TO(CommentEndDash);
  535. }
  536. ON(0)
  537. {
  538. TODO();
  539. }
  540. ON_EOF
  541. {
  542. TODO();
  543. }
  544. ANYTHING_ELSE
  545. {
  546. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  547. continue;
  548. }
  549. }
  550. END_STATE
  551. BEGIN_STATE(CommentEnd)
  552. {
  553. ON('>')
  554. {
  555. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  556. }
  557. ON('!')
  558. {
  559. SWITCH_TO(CommentEndBang);
  560. }
  561. ON('-')
  562. {
  563. m_current_token.m_comment_or_character.data.append('-');
  564. continue;
  565. }
  566. ON_EOF
  567. {
  568. TODO();
  569. }
  570. ANYTHING_ELSE
  571. {
  572. m_current_token.m_comment_or_character.data.append('-');
  573. RECONSUME_IN(Comment);
  574. }
  575. }
  576. END_STATE
  577. BEGIN_STATE(CommentEndBang)
  578. {
  579. ON('-')
  580. {
  581. m_current_token.m_comment_or_character.data.append("--!");
  582. SWITCH_TO(CommentEndDash);
  583. }
  584. ON('>')
  585. {
  586. TODO();
  587. }
  588. ON_EOF
  589. {
  590. TODO();
  591. }
  592. ANYTHING_ELSE
  593. {
  594. m_current_token.m_comment_or_character.data.append("--!");
  595. RECONSUME_IN(Comment);
  596. }
  597. }
  598. END_STATE
  599. BEGIN_STATE(CommentEndDash)
  600. {
  601. ON('-')
  602. {
  603. SWITCH_TO(CommentEnd);
  604. }
  605. ON_EOF
  606. {
  607. TODO();
  608. }
  609. ANYTHING_ELSE
  610. {
  611. m_current_token.m_comment_or_character.data.append('-');
  612. RECONSUME_IN(Comment);
  613. }
  614. }
  615. END_STATE
  616. BEGIN_STATE(CommentLessThanSign)
  617. {
  618. ON('!')
  619. {
  620. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  621. SWITCH_TO(CommentLessThanSignBang);
  622. }
  623. ON('<')
  624. {
  625. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  626. continue;
  627. }
  628. ANYTHING_ELSE
  629. {
  630. RECONSUME_IN(Comment);
  631. }
  632. }
  633. END_STATE
  634. BEGIN_STATE(CommentLessThanSignBang)
  635. {
  636. ON('-')
  637. {
  638. SWITCH_TO(CommentLessThanSignBangDash);
  639. }
  640. ANYTHING_ELSE
  641. {
  642. RECONSUME_IN(Comment);
  643. }
  644. }
  645. END_STATE
  646. BEGIN_STATE(CommentLessThanSignBangDash)
  647. {
  648. ON('-')
  649. {
  650. SWITCH_TO(CommentLessThanSignBangDashDash);
  651. }
  652. ANYTHING_ELSE
  653. {
  654. RECONSUME_IN(Comment);
  655. }
  656. }
  657. END_STATE
  658. BEGIN_STATE(CommentLessThanSignBangDashDash)
  659. {
  660. ON('>')
  661. {
  662. SWITCH_TO(CommentEnd);
  663. }
  664. ANYTHING_ELSE
  665. {
  666. TODO();
  667. }
  668. }
  669. END_STATE
  670. BEGIN_STATE(CharacterReference)
  671. {
  672. }
  673. END_STATE
  674. default:
  675. ASSERT_NOT_REACHED();
  676. }
  677. }
  678. }
  679. void HTMLTokenizer::consume(const StringView& string)
  680. {
  681. ASSERT(next_few_characters_are(string));
  682. m_cursor += string.length();
  683. }
  684. bool HTMLTokenizer::next_few_characters_are(const StringView& string) const
  685. {
  686. for (size_t i = 0; i < string.length(); ++i) {
  687. auto codepoint = peek_codepoint(i);
  688. if (!codepoint.has_value())
  689. return false;
  690. // FIXME: This should be more Unicode-aware.
  691. if (codepoint.value() != (u32)string[i])
  692. return false;
  693. }
  694. return true;
  695. }
  696. void HTMLTokenizer::create_new_token(HTMLToken::Type type)
  697. {
  698. flush_current_character_or_comment_if_needed();
  699. m_current_token = {};
  700. m_current_token.m_type = type;
  701. }
  702. HTMLTokenizer::HTMLTokenizer(const StringView& input)
  703. : m_input(input)
  704. {
  705. }
  706. void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
  707. {
  708. flush_current_character_or_comment_if_needed();
  709. #ifdef TOKENIZER_TRACE
  710. dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
  711. #endif
  712. }
  713. void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
  714. {
  715. flush_current_character_or_comment_if_needed();
  716. #ifdef TOKENIZER_TRACE
  717. dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
  718. #endif
  719. }
  720. void HTMLTokenizer::flush_current_character_or_comment_if_needed()
  721. {
  722. //if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment)
  723. // emit_current_token();
  724. }
  725. }