HTMLTokenizer.cpp 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <LibWeb/Parser/HTMLToken.h>
  27. #include <LibWeb/Parser/HTMLTokenizer.h>
  28. #include <ctype.h>
  29. #pragma GCC diagnostic ignored "-Wunused-label"
  //#define TOKENIZER_TRACE

  // Helper macros for the state machine in HTMLTokenizer::next_token().
  // They all assume a local variable `current_input_character` (an optional
  // code point; empty means end of input) is in scope.

  // Logs the current state together with the offending input character and
  // then aborts. Handles EOF explicitly because this macro is also used from
  // ON_EOF branches, where current_input_character holds no value.
  #define TODO() \
      do { \
          if (current_input_character.has_value()) \
              dbg() << "[TODO: " << state_name(m_state) << "] '" << (char)current_input_character.value() << "'"; \
          else \
              dbg() << "[TODO: " << state_name(m_state) << "] EOF"; \
          ASSERT_NOT_REACHED(); \
      } while (0)

  // Consumes the next input character and jumps to `new_state`.
  // Expands to several statements: only use inside a braced block.
  #define SWITCH_TO(new_state) \
      will_switch_to(State::new_state); \
      m_state = State::new_state; \
      current_input_character = next_codepoint(); \
      goto new_state;

  // Jumps to `new_state` without consuming, so the current input character is
  // processed again there. Only use inside a braced block.
  #define RECONSUME_IN(new_state) \
      will_reconsume_in(State::new_state); \
      m_state = State::new_state; \
      goto new_state;

  // Records `new_state` as the state to resume in and returns the current token.
  // Only use inside a braced block.
  #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
      will_switch_to(State::new_state); \
      m_state = State::new_state; \
      return m_current_token;

  // Steps the cursor back by one so the most recently consumed character is
  // seen again by the next read/peek.
  #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;

  // Branch guards. Each expands to a plain `if` (not `else if`), so a branch
  // body must leave the state itself (continue / goto / return) to keep later
  // branches of the same state from running as well.
  #define ON(codepoint) \
      if (current_input_character.has_value() && current_input_character.value() == codepoint)

  #define ON_EOF \
      if (!current_input_character.has_value())

  // Explicit range checks instead of isalpha(): the argument is a 32-bit code
  // point, and isalpha() is only defined for unsigned char values (and is
  // locale-dependent on top of that).
  #define ON_ASCII_ALPHA \
      if (current_input_character.has_value() \
          && ((current_input_character.value() >= 'A' && current_input_character.value() <= 'Z') \
              || (current_input_character.value() >= 'a' && current_input_character.value() <= 'z')))

  #define ON_ASCII_UPPER_ALPHA \
      if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')

  // HTML tokenizer whitespace: TAB, LF, FF and SPACE.
  #define ON_WHITESPACE \
      if (current_input_character.has_value() \
          && (current_input_character.value() == '\t' \
              || current_input_character.value() == '\n' \
              || current_input_character.value() == '\f' \
              || current_input_character.value() == ' '))

  #define ANYTHING_ELSE if (1)

  // Replaces the current token with an EndOfFile token and returns it.
  #define EMIT_EOF \
      create_new_token(HTMLToken::Type::EndOfFile); \
      return m_current_token;

  #define EMIT_CURRENT_TOKEN \
      return m_current_token;

  // Opens a state: defines both the goto label and the switch case for it.
  // Must be closed with END_STATE.
  #define BEGIN_STATE(state) \
      state: \
      case State::state: { \
          { \
              {

  // Closes a state opened with BEGIN_STATE. Reaching the end of a state body
  // without leaving it is treated as a bug.
  #define END_STATE \
      ASSERT_NOT_REACHED(); \
      break; \
      } \
      } \
      }
  77. namespace Web {
  78. Optional<u32> HTMLTokenizer::next_codepoint()
  79. {
  80. if (m_cursor >= m_input.length())
  81. return {};
  82. return m_input[m_cursor++];
  83. }
  84. Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
  85. {
  86. if ((m_cursor + offset) >= m_input.length())
  87. return {};
  88. return m_input[m_cursor + offset];
  89. }
  90. Optional<HTMLToken> HTMLTokenizer::next_token()
  91. {
  92. for (;;) {
  93. auto current_input_character = next_codepoint();
  94. switch (m_state) {
  95. BEGIN_STATE(Data)
  96. {
  97. ON('&')
  98. {
  99. m_return_state = State::Data;
  100. SWITCH_TO(CharacterReference);
  101. }
  102. ON('<')
  103. {
  104. SWITCH_TO(TagOpen);
  105. }
  106. ON_EOF
  107. {
  108. EMIT_EOF;
  109. }
  110. ANYTHING_ELSE
  111. {
  112. if (m_current_token.type() != HTMLToken::Type::Character)
  113. create_new_token(HTMLToken::Type::Character);
  114. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  115. continue;
  116. }
  117. }
  118. END_STATE
  119. BEGIN_STATE(TagOpen)
  120. {
  121. ON('!')
  122. {
  123. SWITCH_TO(MarkupDeclarationOpen);
  124. }
  125. ON('/')
  126. {
  127. SWITCH_TO(EndTagOpen);
  128. }
  129. ON_ASCII_ALPHA
  130. {
  131. create_new_token(HTMLToken::Type::StartTag);
  132. RECONSUME_IN(TagName);
  133. }
  134. ON('?')
  135. {
  136. TODO();
  137. }
  138. ANYTHING_ELSE
  139. {
  140. TODO();
  141. }
  142. }
  143. END_STATE
  144. BEGIN_STATE(TagName)
  145. {
  146. ON_WHITESPACE
  147. {
  148. SWITCH_TO(BeforeAttributeName);
  149. }
  150. ON('/')
  151. {
  152. SWITCH_TO(SelfClosingStartTag);
  153. }
  154. ON('>')
  155. {
  156. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  157. }
  158. ANYTHING_ELSE
  159. {
  160. m_current_token.m_tag.tag_name.append(current_input_character.value());
  161. continue;
  162. }
  163. }
  164. END_STATE
  165. BEGIN_STATE(EndTagOpen)
  166. {
  167. ON_ASCII_ALPHA
  168. {
  169. create_new_token(HTMLToken::Type::EndTag);
  170. RECONSUME_IN(TagName);
  171. }
  172. }
  173. END_STATE
  174. BEGIN_STATE(MarkupDeclarationOpen)
  175. {
  176. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  177. if (next_few_characters_are("--")) {
  178. consume("--");
  179. create_new_token(HTMLToken::Type::Comment);
  180. SWITCH_TO(CommentStart);
  181. }
  182. if (next_few_characters_are("DOCTYPE")) {
  183. consume("DOCTYPE");
  184. SWITCH_TO(DOCTYPE);
  185. }
  186. }
  187. END_STATE
  188. BEGIN_STATE(DOCTYPE)
  189. {
  190. ON_WHITESPACE
  191. {
  192. SWITCH_TO(BeforeDOCTYPEName);
  193. }
  194. ON('>')
  195. {
  196. RECONSUME_IN(BeforeDOCTYPEName);
  197. }
  198. ON_EOF
  199. {
  200. TODO();
  201. }
  202. ANYTHING_ELSE
  203. {
  204. TODO();
  205. }
  206. }
  207. END_STATE
  208. BEGIN_STATE(BeforeDOCTYPEName)
  209. {
  210. ON_WHITESPACE
  211. {
  212. continue;
  213. }
  214. ON_ASCII_UPPER_ALPHA
  215. {
  216. create_new_token(HTMLToken::Type::DOCTYPE);
  217. m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
  218. SWITCH_TO(DOCTYPEName);
  219. }
  220. ON(0)
  221. {
  222. TODO();
  223. }
  224. ON('>')
  225. {
  226. TODO();
  227. }
  228. ON_EOF
  229. {
  230. TODO();
  231. }
  232. ANYTHING_ELSE
  233. {
  234. create_new_token(HTMLToken::Type::DOCTYPE);
  235. m_current_token.m_doctype.name.append(current_input_character.value());
  236. SWITCH_TO(DOCTYPEName);
  237. }
  238. }
  239. END_STATE
  240. BEGIN_STATE(DOCTYPEName)
  241. {
  242. ON_WHITESPACE
  243. {
  244. SWITCH_TO(AfterDOCTYPEName);
  245. }
  246. ON('>')
  247. {
  248. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  249. }
  250. ON_ASCII_UPPER_ALPHA
  251. {
  252. m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
  253. }
  254. ON(0)
  255. {
  256. TODO();
  257. }
  258. ON_EOF
  259. {
  260. TODO();
  261. }
  262. ANYTHING_ELSE
  263. {
  264. m_current_token.m_doctype.name.append(current_input_character.value());
  265. continue;
  266. }
  267. }
  268. END_STATE
  269. BEGIN_STATE(AfterDOCTYPEName)
  270. {
  271. ON_WHITESPACE
  272. {
  273. continue;
  274. }
  275. ON('>')
  276. {
  277. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  278. }
  279. ON_EOF
  280. {
  281. TODO();
  282. }
  283. ANYTHING_ELSE
  284. {
  285. TODO();
  286. }
  287. }
  288. END_STATE
  289. BEGIN_STATE(BeforeAttributeName)
  290. {
  291. ON_WHITESPACE
  292. {
  293. continue;
  294. }
  295. ON('/')
  296. {
  297. RECONSUME_IN(AfterAttributeName);
  298. }
  299. ON('>')
  300. {
  301. RECONSUME_IN(AfterAttributeName);
  302. }
  303. ON_EOF
  304. {
  305. RECONSUME_IN(AfterAttributeName);
  306. }
  307. ON('=')
  308. {
  309. TODO();
  310. }
  311. ANYTHING_ELSE
  312. {
  313. m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
  314. RECONSUME_IN(AttributeName);
  315. }
  316. }
  317. END_STATE
  318. BEGIN_STATE(SelfClosingStartTag)
  319. {
  320. }
  321. END_STATE
  322. BEGIN_STATE(AttributeName)
  323. {
  324. ON_WHITESPACE
  325. {
  326. RECONSUME_IN(AfterAttributeName);
  327. }
  328. ON('/')
  329. {
  330. RECONSUME_IN(AfterAttributeName);
  331. }
  332. ON('>')
  333. {
  334. RECONSUME_IN(AfterAttributeName);
  335. }
  336. ON_EOF
  337. {
  338. RECONSUME_IN(AfterAttributeName);
  339. }
  340. ON('=')
  341. {
  342. SWITCH_TO(BeforeAttributeValue);
  343. }
  344. ANYTHING_ELSE
  345. {
  346. m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value());
  347. continue;
  348. }
  349. }
  350. END_STATE
  351. BEGIN_STATE(AfterAttributeName)
  352. {
  353. }
  354. END_STATE
  355. BEGIN_STATE(BeforeAttributeValue)
  356. {
  357. ON_WHITESPACE
  358. {
  359. continue;
  360. }
  361. ON('"')
  362. {
  363. SWITCH_TO(AttributeValueDoubleQuoted);
  364. }
  365. ON('\'')
  366. {
  367. SWITCH_TO(AttributeValueSingleQuoted);
  368. }
  369. ON('>')
  370. {
  371. TODO();
  372. }
  373. ANYTHING_ELSE
  374. {
  375. RECONSUME_IN(AttributeValueUnquoted);
  376. }
  377. }
  378. END_STATE
  379. BEGIN_STATE(AttributeValueDoubleQuoted)
  380. {
  381. ON('"')
  382. {
  383. SWITCH_TO(AfterAttributeValueQuoted);
  384. }
  385. ON('&')
  386. {
  387. m_return_state = State::AttributeValueDoubleQuoted;
  388. SWITCH_TO(CharacterReference);
  389. }
  390. ON(0)
  391. {
  392. TODO();
  393. }
  394. ON_EOF
  395. {
  396. TODO();
  397. }
  398. ANYTHING_ELSE
  399. {
  400. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  401. continue;
  402. }
  403. }
  404. END_STATE
  405. BEGIN_STATE(AttributeValueSingleQuoted)
  406. {
  407. ON('\'')
  408. {
  409. SWITCH_TO(AfterAttributeValueQuoted);
  410. }
  411. ON('&')
  412. {
  413. m_return_state = State::AttributeValueSingleQuoted;
  414. SWITCH_TO(CharacterReference);
  415. }
  416. ON(0)
  417. {
  418. TODO();
  419. }
  420. ON_EOF
  421. {
  422. TODO();
  423. }
  424. ANYTHING_ELSE
  425. {
  426. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  427. continue;
  428. }
  429. }
  430. END_STATE
  431. BEGIN_STATE(AttributeValueUnquoted)
  432. {
  433. ON_WHITESPACE
  434. {
  435. SWITCH_TO(BeforeAttributeName);
  436. }
  437. ON('&')
  438. {
  439. m_return_state = State::AttributeValueUnquoted;
  440. SWITCH_TO(CharacterReference);
  441. }
  442. ON('>')
  443. {
  444. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  445. }
  446. ON(0)
  447. {
  448. TODO();
  449. }
  450. ON_EOF
  451. {
  452. TODO();
  453. }
  454. ANYTHING_ELSE
  455. {
  456. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  457. continue;
  458. }
  459. }
  460. END_STATE
  461. BEGIN_STATE(AfterAttributeValueQuoted)
  462. {
  463. ON_WHITESPACE
  464. {
  465. SWITCH_TO(BeforeAttributeName);
  466. }
  467. ON('/')
  468. {
  469. SWITCH_TO(SelfClosingStartTag);
  470. }
  471. ON('>')
  472. {
  473. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  474. }
  475. ON_EOF
  476. {
  477. TODO();
  478. }
  479. ANYTHING_ELSE
  480. {
  481. TODO();
  482. }
  483. }
  484. END_STATE
  485. BEGIN_STATE(CommentStart)
  486. {
  487. ON('-')
  488. {
  489. SWITCH_TO(CommentStartDash);
  490. }
  491. ON('>')
  492. {
  493. TODO();
  494. }
  495. ANYTHING_ELSE
  496. {
  497. RECONSUME_IN(Comment);
  498. }
  499. }
  500. END_STATE
  501. BEGIN_STATE(CommentStartDash)
  502. {
  503. ON('-')
  504. {
  505. SWITCH_TO(CommentEnd);
  506. }
  507. ON('>')
  508. {
  509. TODO();
  510. }
  511. ON_EOF
  512. {
  513. TODO();
  514. }
  515. ANYTHING_ELSE
  516. {
  517. m_current_token.m_comment_or_character.data.append('-');
  518. RECONSUME_IN(Comment);
  519. }
  520. }
  521. END_STATE
  522. BEGIN_STATE(Comment)
  523. {
  524. ON('<')
  525. {
  526. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  527. SWITCH_TO(CommentLessThanSign);
  528. }
  529. ON('-')
  530. {
  531. SWITCH_TO(CommentEndDash);
  532. }
  533. ON(0)
  534. {
  535. TODO();
  536. }
  537. ON_EOF
  538. {
  539. TODO();
  540. }
  541. ANYTHING_ELSE
  542. {
  543. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  544. continue;
  545. }
  546. }
  547. END_STATE
  548. BEGIN_STATE(CommentEnd)
  549. {
  550. ON('>')
  551. {
  552. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  553. }
  554. ON('!')
  555. {
  556. SWITCH_TO(CommentEndBang);
  557. }
  558. ON('-')
  559. {
  560. m_current_token.m_comment_or_character.data.append('-');
  561. continue;
  562. }
  563. ON_EOF
  564. {
  565. TODO();
  566. }
  567. ANYTHING_ELSE
  568. {
  569. m_current_token.m_comment_or_character.data.append('-');
  570. RECONSUME_IN(Comment);
  571. }
  572. }
  573. END_STATE
  574. BEGIN_STATE(CommentEndBang)
  575. {
  576. ON('-')
  577. {
  578. m_current_token.m_comment_or_character.data.append("--!");
  579. SWITCH_TO(CommentEndDash);
  580. }
  581. ON('>')
  582. {
  583. TODO();
  584. }
  585. ON_EOF
  586. {
  587. TODO();
  588. }
  589. ANYTHING_ELSE
  590. {
  591. m_current_token.m_comment_or_character.data.append("--!");
  592. RECONSUME_IN(Comment);
  593. }
  594. }
  595. END_STATE
  596. BEGIN_STATE(CommentEndDash)
  597. {
  598. ON('-')
  599. {
  600. SWITCH_TO(CommentEnd);
  601. }
  602. ON_EOF
  603. {
  604. TODO();
  605. }
  606. ANYTHING_ELSE
  607. {
  608. m_current_token.m_comment_or_character.data.append('-');
  609. RECONSUME_IN(Comment);
  610. }
  611. }
  612. END_STATE
  613. BEGIN_STATE(CommentLessThanSign)
  614. {
  615. ON('!')
  616. {
  617. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  618. SWITCH_TO(CommentLessThanSignBang);
  619. }
  620. ON('<')
  621. {
  622. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  623. continue;
  624. }
  625. ANYTHING_ELSE
  626. {
  627. RECONSUME_IN(Comment);
  628. }
  629. }
  630. END_STATE
  631. BEGIN_STATE(CommentLessThanSignBang)
  632. {
  633. ON('-')
  634. {
  635. SWITCH_TO(CommentLessThanSignBangDash);
  636. }
  637. ANYTHING_ELSE
  638. {
  639. RECONSUME_IN(Comment);
  640. }
  641. }
  642. END_STATE
  643. BEGIN_STATE(CommentLessThanSignBangDash)
  644. {
  645. ON('-')
  646. {
  647. SWITCH_TO(CommentLessThanSignBangDashDash);
  648. }
  649. ANYTHING_ELSE
  650. {
  651. RECONSUME_IN(Comment);
  652. }
  653. }
  654. END_STATE
  655. BEGIN_STATE(CommentLessThanSignBangDashDash)
  656. {
  657. ON('>')
  658. {
  659. SWITCH_TO(CommentEnd);
  660. }
  661. ANYTHING_ELSE
  662. {
  663. TODO();
  664. }
  665. }
  666. END_STATE
  667. BEGIN_STATE(CharacterReference)
  668. {
  669. }
  670. END_STATE
  671. default:
  672. ASSERT_NOT_REACHED();
  673. }
  674. }
  675. }
  676. void HTMLTokenizer::consume(const StringView& string)
  677. {
  678. ASSERT(next_few_characters_are(string));
  679. m_cursor += string.length();
  680. }
  681. bool HTMLTokenizer::next_few_characters_are(const StringView& string) const
  682. {
  683. for (size_t i = 0; i < string.length(); ++i) {
  684. auto codepoint = peek_codepoint(i);
  685. if (!codepoint.has_value())
  686. return false;
  687. // FIXME: This should be more Unicode-aware.
  688. if (codepoint.value() != (u32)string[i])
  689. return false;
  690. }
  691. return true;
  692. }
  693. void HTMLTokenizer::create_new_token(HTMLToken::Type type)
  694. {
  695. flush_current_character_or_comment_if_needed();
  696. m_current_token = {};
  697. m_current_token.m_type = type;
  698. }
  699. HTMLTokenizer::HTMLTokenizer(const StringView& input)
  700. : m_input(input)
  701. {
  702. }
  703. void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
  704. {
  705. flush_current_character_or_comment_if_needed();
  706. #ifdef TOKENIZER_TRACE
  707. dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
  708. #endif
  709. }
  710. void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
  711. {
  712. flush_current_character_or_comment_if_needed();
  713. #ifdef TOKENIZER_TRACE
  714. dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
  715. #endif
  716. }
  717. void HTMLTokenizer::flush_current_character_or_comment_if_needed()
  718. {
  719. //if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment)
  720. // emit_current_token();
  721. }
  722. }