HTMLTokenizer.cpp 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <LibWeb/Parser/HTMLToken.h>
  27. #include <LibWeb/Parser/HTMLTokenizer.h>
  28. #include <ctype.h>
  29. #pragma GCC diagnostic ignored "-Wunused-label"
  30. //#define TOKENIZER_TRACE
  31. #define TODO() \
  32. do { \
  33. dbg() << "[TODO: " << state_name(m_state) << "] '" << (char)current_input_character.value() << "'"; \
  34. ASSERT_NOT_REACHED(); \
  35. } while (0)
  36. #define SWITCH_TO(new_state) \
  37. will_switch_to(State::new_state); \
  38. m_state = State::new_state; \
  39. current_input_character = next_codepoint(); \
  40. goto new_state;
  41. #define RECONSUME_IN(new_state) \
  42. will_reconsume_in(State::new_state); \
  43. m_state = State::new_state; \
  44. goto new_state;
  45. #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
  46. will_switch_to(State::new_state); \
  47. m_state = State::new_state; \
  48. return m_current_token;
  49. #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;
  50. #define ON(codepoint) \
  51. if (current_input_character.has_value() && current_input_character.value() == codepoint)
  52. #define ON_EOF \
  53. if (!current_input_character.has_value())
  54. #define ON_ASCII_ALPHA \
  55. if (current_input_character.has_value() && isalpha(current_input_character.value()))
  56. #define ON_ASCII_UPPER_ALPHA \
  57. if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')
  58. #define ON_WHITESPACE \
  59. if (current_input_character.has_value() && (current_input_character.value() == '\t' || current_input_character.value() == '\n' || current_input_character.value() == '\f' || current_input_character.value() == ' '))
  60. #define ANYTHING_ELSE if (1)
  61. #define EMIT_EOF \
  62. if (m_has_emitted_eof) \
  63. return {}; \
  64. m_has_emitted_eof = true; \
  65. create_new_token(HTMLToken::Type::EndOfFile); \
  66. return m_current_token;
  67. #define EMIT_CURRENT_TOKEN \
  68. return m_current_token;
  69. #define BEGIN_STATE(state) \
  70. state: \
  71. case State::state: { \
  72. { \
  73. {
  74. #define END_STATE \
  75. ASSERT_NOT_REACHED(); \
  76. break; \
  77. } \
  78. } \
  79. }
  80. namespace Web {
  81. Optional<u32> HTMLTokenizer::next_codepoint()
  82. {
  83. if (m_cursor >= m_input.length())
  84. return {};
  85. return m_input[m_cursor++];
  86. }
  87. Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
  88. {
  89. if ((m_cursor + offset) >= m_input.length())
  90. return {};
  91. return m_input[m_cursor + offset];
  92. }
  93. Optional<HTMLToken> HTMLTokenizer::next_token()
  94. {
  95. for (;;) {
  96. auto current_input_character = next_codepoint();
  97. switch (m_state) {
  98. BEGIN_STATE(Data)
  99. {
  100. ON('&')
  101. {
  102. m_return_state = State::Data;
  103. SWITCH_TO(CharacterReference);
  104. }
  105. ON('<')
  106. {
  107. SWITCH_TO(TagOpen);
  108. }
  109. ON_EOF
  110. {
  111. EMIT_EOF;
  112. }
  113. ANYTHING_ELSE
  114. {
  115. create_new_token(HTMLToken::Type::Character);
  116. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  117. return m_current_token;
  118. }
  119. }
  120. END_STATE
  121. BEGIN_STATE(TagOpen)
  122. {
  123. ON('!')
  124. {
  125. SWITCH_TO(MarkupDeclarationOpen);
  126. }
  127. ON('/')
  128. {
  129. SWITCH_TO(EndTagOpen);
  130. }
  131. ON_ASCII_ALPHA
  132. {
  133. create_new_token(HTMLToken::Type::StartTag);
  134. RECONSUME_IN(TagName);
  135. }
  136. ON('?')
  137. {
  138. TODO();
  139. }
  140. ANYTHING_ELSE
  141. {
  142. TODO();
  143. }
  144. }
  145. END_STATE
  146. BEGIN_STATE(TagName)
  147. {
  148. ON_WHITESPACE
  149. {
  150. SWITCH_TO(BeforeAttributeName);
  151. }
  152. ON('/')
  153. {
  154. SWITCH_TO(SelfClosingStartTag);
  155. }
  156. ON('>')
  157. {
  158. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  159. }
  160. ANYTHING_ELSE
  161. {
  162. m_current_token.m_tag.tag_name.append(current_input_character.value());
  163. continue;
  164. }
  165. }
  166. END_STATE
  167. BEGIN_STATE(EndTagOpen)
  168. {
  169. ON_ASCII_ALPHA
  170. {
  171. create_new_token(HTMLToken::Type::EndTag);
  172. RECONSUME_IN(TagName);
  173. }
  174. }
  175. END_STATE
  176. BEGIN_STATE(MarkupDeclarationOpen)
  177. {
  178. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  179. if (next_few_characters_are("--")) {
  180. consume("--");
  181. create_new_token(HTMLToken::Type::Comment);
  182. SWITCH_TO(CommentStart);
  183. }
  184. if (next_few_characters_are("DOCTYPE")) {
  185. consume("DOCTYPE");
  186. SWITCH_TO(DOCTYPE);
  187. }
  188. }
  189. END_STATE
  190. BEGIN_STATE(DOCTYPE)
  191. {
  192. ON_WHITESPACE
  193. {
  194. SWITCH_TO(BeforeDOCTYPEName);
  195. }
  196. ON('>')
  197. {
  198. RECONSUME_IN(BeforeDOCTYPEName);
  199. }
  200. ON_EOF
  201. {
  202. TODO();
  203. }
  204. ANYTHING_ELSE
  205. {
  206. TODO();
  207. }
  208. }
  209. END_STATE
  210. BEGIN_STATE(BeforeDOCTYPEName)
  211. {
  212. ON_WHITESPACE
  213. {
  214. continue;
  215. }
  216. ON_ASCII_UPPER_ALPHA
  217. {
  218. create_new_token(HTMLToken::Type::DOCTYPE);
  219. m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
  220. SWITCH_TO(DOCTYPEName);
  221. }
  222. ON(0)
  223. {
  224. TODO();
  225. }
  226. ON('>')
  227. {
  228. TODO();
  229. }
  230. ON_EOF
  231. {
  232. TODO();
  233. }
  234. ANYTHING_ELSE
  235. {
  236. create_new_token(HTMLToken::Type::DOCTYPE);
  237. m_current_token.m_doctype.name.append(current_input_character.value());
  238. SWITCH_TO(DOCTYPEName);
  239. }
  240. }
  241. END_STATE
  242. BEGIN_STATE(DOCTYPEName)
  243. {
  244. ON_WHITESPACE
  245. {
  246. SWITCH_TO(AfterDOCTYPEName);
  247. }
  248. ON('>')
  249. {
  250. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  251. }
  252. ON_ASCII_UPPER_ALPHA
  253. {
  254. m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
  255. }
  256. ON(0)
  257. {
  258. TODO();
  259. }
  260. ON_EOF
  261. {
  262. TODO();
  263. }
  264. ANYTHING_ELSE
  265. {
  266. m_current_token.m_doctype.name.append(current_input_character.value());
  267. continue;
  268. }
  269. }
  270. END_STATE
  271. BEGIN_STATE(AfterDOCTYPEName)
  272. {
  273. ON_WHITESPACE
  274. {
  275. continue;
  276. }
  277. ON('>')
  278. {
  279. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  280. }
  281. ON_EOF
  282. {
  283. TODO();
  284. }
  285. ANYTHING_ELSE
  286. {
  287. TODO();
  288. }
  289. }
  290. END_STATE
  291. BEGIN_STATE(BeforeAttributeName)
  292. {
  293. ON_WHITESPACE
  294. {
  295. continue;
  296. }
  297. ON('/')
  298. {
  299. RECONSUME_IN(AfterAttributeName);
  300. }
  301. ON('>')
  302. {
  303. RECONSUME_IN(AfterAttributeName);
  304. }
  305. ON_EOF
  306. {
  307. RECONSUME_IN(AfterAttributeName);
  308. }
  309. ON('=')
  310. {
  311. TODO();
  312. }
  313. ANYTHING_ELSE
  314. {
  315. m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
  316. RECONSUME_IN(AttributeName);
  317. }
  318. }
  319. END_STATE
  320. BEGIN_STATE(SelfClosingStartTag)
  321. {
  322. }
  323. END_STATE
  324. BEGIN_STATE(AttributeName)
  325. {
  326. ON_WHITESPACE
  327. {
  328. RECONSUME_IN(AfterAttributeName);
  329. }
  330. ON('/')
  331. {
  332. RECONSUME_IN(AfterAttributeName);
  333. }
  334. ON('>')
  335. {
  336. RECONSUME_IN(AfterAttributeName);
  337. }
  338. ON_EOF
  339. {
  340. RECONSUME_IN(AfterAttributeName);
  341. }
  342. ON('=')
  343. {
  344. SWITCH_TO(BeforeAttributeValue);
  345. }
  346. ANYTHING_ELSE
  347. {
  348. m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value());
  349. continue;
  350. }
  351. }
  352. END_STATE
  353. BEGIN_STATE(AfterAttributeName)
  354. {
  355. }
  356. END_STATE
  357. BEGIN_STATE(BeforeAttributeValue)
  358. {
  359. ON_WHITESPACE
  360. {
  361. continue;
  362. }
  363. ON('"')
  364. {
  365. SWITCH_TO(AttributeValueDoubleQuoted);
  366. }
  367. ON('\'')
  368. {
  369. SWITCH_TO(AttributeValueSingleQuoted);
  370. }
  371. ON('>')
  372. {
  373. TODO();
  374. }
  375. ANYTHING_ELSE
  376. {
  377. RECONSUME_IN(AttributeValueUnquoted);
  378. }
  379. }
  380. END_STATE
  381. BEGIN_STATE(AttributeValueDoubleQuoted)
  382. {
  383. ON('"')
  384. {
  385. SWITCH_TO(AfterAttributeValueQuoted);
  386. }
  387. ON('&')
  388. {
  389. m_return_state = State::AttributeValueDoubleQuoted;
  390. SWITCH_TO(CharacterReference);
  391. }
  392. ON(0)
  393. {
  394. TODO();
  395. }
  396. ON_EOF
  397. {
  398. TODO();
  399. }
  400. ANYTHING_ELSE
  401. {
  402. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  403. continue;
  404. }
  405. }
  406. END_STATE
  407. BEGIN_STATE(AttributeValueSingleQuoted)
  408. {
  409. ON('\'')
  410. {
  411. SWITCH_TO(AfterAttributeValueQuoted);
  412. }
  413. ON('&')
  414. {
  415. m_return_state = State::AttributeValueSingleQuoted;
  416. SWITCH_TO(CharacterReference);
  417. }
  418. ON(0)
  419. {
  420. TODO();
  421. }
  422. ON_EOF
  423. {
  424. TODO();
  425. }
  426. ANYTHING_ELSE
  427. {
  428. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  429. continue;
  430. }
  431. }
  432. END_STATE
  433. BEGIN_STATE(AttributeValueUnquoted)
  434. {
  435. ON_WHITESPACE
  436. {
  437. SWITCH_TO(BeforeAttributeName);
  438. }
  439. ON('&')
  440. {
  441. m_return_state = State::AttributeValueUnquoted;
  442. SWITCH_TO(CharacterReference);
  443. }
  444. ON('>')
  445. {
  446. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  447. }
  448. ON(0)
  449. {
  450. TODO();
  451. }
  452. ON_EOF
  453. {
  454. TODO();
  455. }
  456. ANYTHING_ELSE
  457. {
  458. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  459. continue;
  460. }
  461. }
  462. END_STATE
  463. BEGIN_STATE(AfterAttributeValueQuoted)
  464. {
  465. ON_WHITESPACE
  466. {
  467. SWITCH_TO(BeforeAttributeName);
  468. }
  469. ON('/')
  470. {
  471. SWITCH_TO(SelfClosingStartTag);
  472. }
  473. ON('>')
  474. {
  475. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  476. }
  477. ON_EOF
  478. {
  479. TODO();
  480. }
  481. ANYTHING_ELSE
  482. {
  483. TODO();
  484. }
  485. }
  486. END_STATE
  487. BEGIN_STATE(CommentStart)
  488. {
  489. ON('-')
  490. {
  491. SWITCH_TO(CommentStartDash);
  492. }
  493. ON('>')
  494. {
  495. TODO();
  496. }
  497. ANYTHING_ELSE
  498. {
  499. RECONSUME_IN(Comment);
  500. }
  501. }
  502. END_STATE
  503. BEGIN_STATE(CommentStartDash)
  504. {
  505. ON('-')
  506. {
  507. SWITCH_TO(CommentEnd);
  508. }
  509. ON('>')
  510. {
  511. TODO();
  512. }
  513. ON_EOF
  514. {
  515. TODO();
  516. }
  517. ANYTHING_ELSE
  518. {
  519. m_current_token.m_comment_or_character.data.append('-');
  520. RECONSUME_IN(Comment);
  521. }
  522. }
  523. END_STATE
  524. BEGIN_STATE(Comment)
  525. {
  526. ON('<')
  527. {
  528. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  529. SWITCH_TO(CommentLessThanSign);
  530. }
  531. ON('-')
  532. {
  533. SWITCH_TO(CommentEndDash);
  534. }
  535. ON(0)
  536. {
  537. TODO();
  538. }
  539. ON_EOF
  540. {
  541. TODO();
  542. }
  543. ANYTHING_ELSE
  544. {
  545. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  546. continue;
  547. }
  548. }
  549. END_STATE
  550. BEGIN_STATE(CommentEnd)
  551. {
  552. ON('>')
  553. {
  554. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  555. }
  556. ON('!')
  557. {
  558. SWITCH_TO(CommentEndBang);
  559. }
  560. ON('-')
  561. {
  562. m_current_token.m_comment_or_character.data.append('-');
  563. continue;
  564. }
  565. ON_EOF
  566. {
  567. TODO();
  568. }
  569. ANYTHING_ELSE
  570. {
  571. m_current_token.m_comment_or_character.data.append('-');
  572. RECONSUME_IN(Comment);
  573. }
  574. }
  575. END_STATE
  576. BEGIN_STATE(CommentEndBang)
  577. {
  578. ON('-')
  579. {
  580. m_current_token.m_comment_or_character.data.append("--!");
  581. SWITCH_TO(CommentEndDash);
  582. }
  583. ON('>')
  584. {
  585. TODO();
  586. }
  587. ON_EOF
  588. {
  589. TODO();
  590. }
  591. ANYTHING_ELSE
  592. {
  593. m_current_token.m_comment_or_character.data.append("--!");
  594. RECONSUME_IN(Comment);
  595. }
  596. }
  597. END_STATE
  598. BEGIN_STATE(CommentEndDash)
  599. {
  600. ON('-')
  601. {
  602. SWITCH_TO(CommentEnd);
  603. }
  604. ON_EOF
  605. {
  606. TODO();
  607. }
  608. ANYTHING_ELSE
  609. {
  610. m_current_token.m_comment_or_character.data.append('-');
  611. RECONSUME_IN(Comment);
  612. }
  613. }
  614. END_STATE
  615. BEGIN_STATE(CommentLessThanSign)
  616. {
  617. ON('!')
  618. {
  619. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  620. SWITCH_TO(CommentLessThanSignBang);
  621. }
  622. ON('<')
  623. {
  624. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  625. continue;
  626. }
  627. ANYTHING_ELSE
  628. {
  629. RECONSUME_IN(Comment);
  630. }
  631. }
  632. END_STATE
  633. BEGIN_STATE(CommentLessThanSignBang)
  634. {
  635. ON('-')
  636. {
  637. SWITCH_TO(CommentLessThanSignBangDash);
  638. }
  639. ANYTHING_ELSE
  640. {
  641. RECONSUME_IN(Comment);
  642. }
  643. }
  644. END_STATE
  645. BEGIN_STATE(CommentLessThanSignBangDash)
  646. {
  647. ON('-')
  648. {
  649. SWITCH_TO(CommentLessThanSignBangDashDash);
  650. }
  651. ANYTHING_ELSE
  652. {
  653. RECONSUME_IN(Comment);
  654. }
  655. }
  656. END_STATE
  657. BEGIN_STATE(CommentLessThanSignBangDashDash)
  658. {
  659. ON('>')
  660. {
  661. SWITCH_TO(CommentEnd);
  662. }
  663. ANYTHING_ELSE
  664. {
  665. TODO();
  666. }
  667. }
  668. END_STATE
  669. BEGIN_STATE(CharacterReference)
  670. {
  671. }
  672. END_STATE
  673. default:
  674. ASSERT_NOT_REACHED();
  675. }
  676. }
  677. }
  678. void HTMLTokenizer::consume(const StringView& string)
  679. {
  680. ASSERT(next_few_characters_are(string));
  681. m_cursor += string.length();
  682. }
  683. bool HTMLTokenizer::next_few_characters_are(const StringView& string) const
  684. {
  685. for (size_t i = 0; i < string.length(); ++i) {
  686. auto codepoint = peek_codepoint(i);
  687. if (!codepoint.has_value())
  688. return false;
  689. // FIXME: This should be more Unicode-aware.
  690. if (codepoint.value() != (u32)string[i])
  691. return false;
  692. }
  693. return true;
  694. }
  695. void HTMLTokenizer::create_new_token(HTMLToken::Type type)
  696. {
  697. m_current_token = {};
  698. m_current_token.m_type = type;
  699. }
  700. HTMLTokenizer::HTMLTokenizer(const StringView& input)
  701. : m_input(input)
  702. {
  703. }
  704. void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
  705. {
  706. #ifdef TOKENIZER_TRACE
  707. dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
  708. #endif
  709. }
  710. void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
  711. {
  712. #ifdef TOKENIZER_TRACE
  713. dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
  714. #endif
  715. }
  716. }