HTMLTokenizer.cpp 22 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <LibWeb/Parser/HTMLToken.h>
  27. #include <LibWeb/Parser/HTMLTokenizer.h>
  28. #include <ctype.h>
  29. //#define TOKENIZER_TRACE
// Marks a tokenizer branch that is not implemented yet: logs the name of the
// current state and the current input character through dbg(), then hits
// ASSERT_NOT_REACHED(). Requires `current_input_character` and `m_state` to be
// in scope at the expansion site.
// NOTE(review): this is also expanded inside ON_EOF branches, where
// current_input_character holds no value; the .value() call below presumably
// fails before the message is printed -- confirm.
#define TODO() \
    do { \
        dbg() << "[TODO: " << state_name(m_state) << "] '" << (char)current_input_character.value() << "'"; \
        ASSERT_NOT_REACHED(); \
    } while (0)
// Transitions to `new_state` and consumes the next input character: notifies
// will_switch_to(), updates m_state, reads the next codepoint into
// `current_input_character`, then jumps to the goto label of the same name
// (defined by BEGIN_STATE).
// The expansion is several statements that are not wrapped in a block, so it
// must only be used inside a braced body.
#define SWITCH_TO(new_state) \
    will_switch_to(State::new_state); \
    m_state = State::new_state; \
    current_input_character = next_codepoint(); \
    goto new_state;
// Transitions to `new_state` WITHOUT reading a new character: notifies
// will_reconsume_in(), updates m_state and jumps to the state's goto label, so
// the new state handles the same `current_input_character` again.
// The expansion is several statements that are not wrapped in a block, so it
// must only be used inside a braced body.
#define RECONSUME_IN(new_state) \
    will_reconsume_in(State::new_state); \
    m_state = State::new_state; \
    goto new_state;
// Steps the input cursor back by one position, undoing the advance made by the
// most recent successful next_codepoint() call.
// NOTE(review): the decrement is unconditional; if the preceding
// next_codepoint() returned no value (end of input) the cursor was not
// advanced, so this would step back one position too far -- confirm callers.
#define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;
// Branch header: the statement/block that follows runs when a current input
// character exists and equals `codepoint`.
#define ON(codepoint) \
    if (current_input_character.has_value() && current_input_character.value() == codepoint)
// Branch header: the statement/block that follows runs when there is no
// current input character (next_codepoint() returned an empty Optional).
#define ON_EOF \
    if (!current_input_character.has_value())
  49. #define ON_ASCII_ALPHA \
  50. if (current_input_character.has_value() && isalpha(current_input_character.value()))
  51. #define ON_WHITESPACE \
  52. if (current_input_character.has_value() && (current_input_character.value() == '\t' || current_input_character.value() == '\a' || current_input_character.value() == '\f' || current_input_character.value() == ' '))
// Unconditional branch header used as the final fallback in a state handler.
// Earlier ON* branches must leave the handler (goto, continue or return) for
// this to act as an "else".
#define ANYTHING_ELSE if (1)
// Creates an EndOfFile token, emits it and returns from the enclosing
// function. Expands to several unwrapped statements, so it must only be used
// inside a braced body.
#define EMIT_EOF_AND_RETURN \
    create_new_token(HTMLToken::Type::EndOfFile); \
    emit_current_token(); \
    return;
// Opens the handler for `state`: defines a goto label and a `case` label, both
// named after the state, and opens three nested blocks. Must be paired with
// END_STATE, which closes those blocks.
#define BEGIN_STATE(state) \
    state: \
    case State::state: { \
        { \
            {
// Closes a handler opened by BEGIN_STATE. A handler is expected to leave via
// goto/continue/return; falling through to here hits ASSERT_NOT_REACHED().
#define END_STATE \
    ASSERT_NOT_REACHED(); \
    break; \
    } \
    } \
    }
  69. namespace Web {
  70. Optional<u32> HTMLTokenizer::next_codepoint()
  71. {
  72. if (m_cursor >= m_input.length())
  73. return {};
  74. return m_input[m_cursor++];
  75. }
  76. Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
  77. {
  78. if ((m_cursor + offset) >= m_input.length())
  79. return {};
  80. return m_input[m_cursor + offset];
  81. }
  82. void HTMLTokenizer::run()
  83. {
  84. for (;;) {
  85. auto current_input_character = next_codepoint();
  86. switch (m_state) {
  87. BEGIN_STATE(Data)
  88. {
  89. ON('&')
  90. {
  91. m_return_state = State::Data;
  92. SWITCH_TO(CharacterReference);
  93. }
  94. ON('<')
  95. {
  96. SWITCH_TO(TagOpen);
  97. }
  98. ON_EOF
  99. {
  100. EMIT_EOF_AND_RETURN;
  101. }
  102. ANYTHING_ELSE
  103. {
  104. if (m_current_token.type() != HTMLToken::Type::Character)
  105. create_new_token(HTMLToken::Type::Character);
  106. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  107. continue;
  108. }
  109. }
  110. END_STATE
  111. BEGIN_STATE(TagOpen)
  112. {
  113. ON('!')
  114. {
  115. SWITCH_TO(MarkupDeclarationOpen);
  116. }
  117. ON('/')
  118. {
  119. SWITCH_TO(EndTagOpen);
  120. }
  121. ON_ASCII_ALPHA
  122. {
  123. create_new_token(HTMLToken::Type::StartTag);
  124. RECONSUME_IN(TagName);
  125. }
  126. ON('?')
  127. {
  128. TODO();
  129. }
  130. ANYTHING_ELSE
  131. {
  132. TODO();
  133. }
  134. }
  135. END_STATE
  136. BEGIN_STATE(TagName)
  137. {
  138. ON_WHITESPACE
  139. {
  140. SWITCH_TO(BeforeAttributeName);
  141. }
  142. ON('/')
  143. {
  144. SWITCH_TO(SelfClosingStartTag);
  145. }
  146. ON('>')
  147. {
  148. emit_current_token();
  149. SWITCH_TO(Data);
  150. }
  151. ANYTHING_ELSE
  152. {
  153. m_current_token.m_tag.tag_name.append(current_input_character.value());
  154. continue;
  155. }
  156. }
  157. END_STATE
  158. BEGIN_STATE(EndTagOpen)
  159. {
  160. ON_ASCII_ALPHA
  161. {
  162. create_new_token(HTMLToken::Type::EndTag);
  163. RECONSUME_IN(TagName);
  164. }
  165. }
  166. END_STATE
  167. BEGIN_STATE(MarkupDeclarationOpen)
  168. {
  169. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  170. if (next_few_characters_are("--")) {
  171. consume("--");
  172. create_new_token(HTMLToken::Type::Comment);
  173. SWITCH_TO(CommentStart);
  174. }
  175. if (next_few_characters_are("DOCTYPE")) {
  176. consume("DOCTYPE");
  177. SWITCH_TO(DOCTYPE);
  178. }
  179. }
  180. END_STATE
  181. BEGIN_STATE(DOCTYPE)
  182. {
  183. ON_WHITESPACE
  184. {
  185. SWITCH_TO(BeforeDOCTYPEName);
  186. }
  187. }
  188. END_STATE
  189. BEGIN_STATE(BeforeDOCTYPEName)
  190. {
  191. ON_WHITESPACE
  192. {
  193. continue;
  194. }
  195. ANYTHING_ELSE
  196. {
  197. create_new_token(HTMLToken::Type::DOCTYPE);
  198. m_current_token.m_doctype.name.append(current_input_character.value());
  199. SWITCH_TO(DOCTYPEName);
  200. }
  201. }
  202. END_STATE
  203. BEGIN_STATE(DOCTYPEName)
  204. {
  205. ON('>')
  206. {
  207. emit_current_token();
  208. SWITCH_TO(Data);
  209. }
  210. ANYTHING_ELSE
  211. {
  212. m_current_token.m_doctype.name.append(current_input_character.value());
  213. continue;
  214. }
  215. }
  216. END_STATE
  217. BEGIN_STATE(BeforeAttributeName)
  218. {
  219. ON_WHITESPACE
  220. {
  221. continue;
  222. }
  223. ON('/')
  224. {
  225. RECONSUME_IN(AfterAttributeName);
  226. }
  227. ON('>')
  228. {
  229. RECONSUME_IN(AfterAttributeName);
  230. }
  231. ON_EOF
  232. {
  233. RECONSUME_IN(AfterAttributeName);
  234. }
  235. ON('=')
  236. {
  237. TODO();
  238. }
  239. ANYTHING_ELSE
  240. {
  241. m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
  242. RECONSUME_IN(AttributeName);
  243. }
  244. }
  245. END_STATE
  246. BEGIN_STATE(SelfClosingStartTag)
  247. {
  248. }
  249. END_STATE
  250. BEGIN_STATE(AttributeName)
  251. {
  252. ON_WHITESPACE
  253. {
  254. RECONSUME_IN(AfterAttributeName);
  255. }
  256. ON('/')
  257. {
  258. RECONSUME_IN(AfterAttributeName);
  259. }
  260. ON('>')
  261. {
  262. RECONSUME_IN(AfterAttributeName);
  263. }
  264. ON_EOF
  265. {
  266. RECONSUME_IN(AfterAttributeName);
  267. }
  268. ON('=')
  269. {
  270. SWITCH_TO(BeforeAttributeValue);
  271. }
  272. ANYTHING_ELSE
  273. {
  274. m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value());
  275. continue;
  276. }
  277. }
  278. END_STATE
  279. BEGIN_STATE(AfterAttributeName)
  280. {
  281. }
  282. END_STATE
  283. BEGIN_STATE(BeforeAttributeValue)
  284. {
  285. ON_WHITESPACE
  286. {
  287. continue;
  288. }
  289. ON('"')
  290. {
  291. SWITCH_TO(AttributeValueDoubleQuoted);
  292. }
  293. ON('\'')
  294. {
  295. SWITCH_TO(AttributeValueSingleQuoted);
  296. }
  297. ON('>')
  298. {
  299. TODO();
  300. }
  301. ANYTHING_ELSE
  302. {
  303. RECONSUME_IN(AttributeValueUnquoted);
  304. }
  305. }
  306. END_STATE
  307. BEGIN_STATE(AttributeValueDoubleQuoted)
  308. {
  309. ON('"')
  310. {
  311. SWITCH_TO(AfterAttributeValueQuoted);
  312. }
  313. ON('&')
  314. {
  315. m_return_state = State::AttributeValueDoubleQuoted;
  316. SWITCH_TO(CharacterReference);
  317. }
  318. ON(0)
  319. {
  320. TODO();
  321. }
  322. ON_EOF
  323. {
  324. TODO();
  325. }
  326. ANYTHING_ELSE
  327. {
  328. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  329. continue;
  330. }
  331. }
  332. END_STATE
  333. BEGIN_STATE(AttributeValueSingleQuoted)
  334. {
  335. ON('\'')
  336. {
  337. SWITCH_TO(AfterAttributeValueQuoted);
  338. }
  339. ON('&')
  340. {
  341. m_return_state = State::AttributeValueSingleQuoted;
  342. SWITCH_TO(CharacterReference);
  343. }
  344. ON(0)
  345. {
  346. TODO();
  347. }
  348. ON_EOF
  349. {
  350. TODO();
  351. }
  352. ANYTHING_ELSE
  353. {
  354. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  355. continue;
  356. }
  357. }
  358. END_STATE
  359. BEGIN_STATE(AttributeValueUnquoted)
  360. {
  361. ON_WHITESPACE
  362. {
  363. SWITCH_TO(BeforeAttributeName);
  364. }
  365. ON('&')
  366. {
  367. m_return_state = State::AttributeValueUnquoted;
  368. SWITCH_TO(CharacterReference);
  369. }
  370. ON('>')
  371. {
  372. emit_current_token();
  373. SWITCH_TO(Data);
  374. }
  375. ON(0)
  376. {
  377. TODO();
  378. }
  379. ON_EOF
  380. {
  381. TODO();
  382. }
  383. ANYTHING_ELSE
  384. {
  385. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  386. continue;
  387. }
  388. }
  389. END_STATE
  390. BEGIN_STATE(AfterAttributeValueQuoted)
  391. {
  392. ON_WHITESPACE
  393. {
  394. SWITCH_TO(BeforeAttributeName);
  395. }
  396. ON('/')
  397. {
  398. SWITCH_TO(SelfClosingStartTag);
  399. }
  400. ON('>')
  401. {
  402. emit_current_token();
  403. SWITCH_TO(Data);
  404. }
  405. ON_EOF
  406. {
  407. TODO();
  408. }
  409. ANYTHING_ELSE
  410. {
  411. TODO();
  412. }
  413. }
  414. END_STATE
  415. BEGIN_STATE(CommentStart)
  416. {
  417. ON('-')
  418. {
  419. SWITCH_TO(CommentStartDash);
  420. }
  421. ON('>')
  422. {
  423. TODO();
  424. }
  425. ANYTHING_ELSE
  426. {
  427. RECONSUME_IN(Comment);
  428. }
  429. }
  430. END_STATE
  431. BEGIN_STATE(CommentStartDash)
  432. {
  433. ON('-')
  434. {
  435. SWITCH_TO(CommentEnd);
  436. }
  437. ON('>')
  438. {
  439. TODO();
  440. }
  441. ON_EOF
  442. {
  443. TODO();
  444. }
  445. ANYTHING_ELSE
  446. {
  447. m_current_token.m_comment_or_character.data.append('-');
  448. RECONSUME_IN(Comment);
  449. }
  450. }
  451. END_STATE
  452. BEGIN_STATE(Comment)
  453. {
  454. ON('<')
  455. {
  456. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  457. SWITCH_TO(CommentLessThanSign);
  458. }
  459. ON('-')
  460. {
  461. SWITCH_TO(CommentEndDash);
  462. }
  463. ON(0)
  464. {
  465. TODO();
  466. }
  467. ON_EOF
  468. {
  469. TODO();
  470. }
  471. ANYTHING_ELSE
  472. {
  473. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  474. continue;
  475. }
  476. }
  477. END_STATE
  478. BEGIN_STATE(CommentEnd)
  479. {
  480. ON('>')
  481. {
  482. emit_current_token();
  483. SWITCH_TO(Data);
  484. }
  485. ON('!')
  486. {
  487. SWITCH_TO(CommentEndBang);
  488. }
  489. ON('-')
  490. {
  491. m_current_token.m_comment_or_character.data.append('-');
  492. continue;
  493. }
  494. ON_EOF
  495. {
  496. TODO();
  497. }
  498. ANYTHING_ELSE
  499. {
  500. m_current_token.m_comment_or_character.data.append('-');
  501. RECONSUME_IN(Comment);
  502. }
  503. }
  504. END_STATE
  505. BEGIN_STATE(CommentEndBang)
  506. {
  507. ON('-')
  508. {
  509. m_current_token.m_comment_or_character.data.append("--!");
  510. SWITCH_TO(CommentEndDash);
  511. }
  512. ON('>')
  513. {
  514. TODO();
  515. }
  516. ON_EOF
  517. {
  518. TODO();
  519. }
  520. ANYTHING_ELSE
  521. {
  522. m_current_token.m_comment_or_character.data.append("--!");
  523. RECONSUME_IN(Comment);
  524. }
  525. }
  526. END_STATE
  527. BEGIN_STATE(CommentEndDash)
  528. {
  529. ON('-')
  530. {
  531. SWITCH_TO(CommentEnd);
  532. }
  533. ON_EOF
  534. {
  535. TODO();
  536. }
  537. ANYTHING_ELSE
  538. {
  539. m_current_token.m_comment_or_character.data.append('-');
  540. RECONSUME_IN(Comment);
  541. }
  542. }
  543. END_STATE
  544. BEGIN_STATE(CommentLessThanSign)
  545. {
  546. ON('!')
  547. {
  548. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  549. SWITCH_TO(CommentLessThanSignBang);
  550. }
  551. ON('<')
  552. {
  553. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  554. continue;
  555. }
  556. ANYTHING_ELSE
  557. {
  558. RECONSUME_IN(Comment);
  559. }
  560. }
  561. END_STATE
  562. BEGIN_STATE(CommentLessThanSignBang)
  563. {
  564. ON('-')
  565. {
  566. SWITCH_TO(CommentLessThanSignBangDash);
  567. }
  568. ANYTHING_ELSE
  569. {
  570. RECONSUME_IN(Comment);
  571. }
  572. }
  573. END_STATE
  574. BEGIN_STATE(CommentLessThanSignBangDash)
  575. {
  576. ON('-')
  577. {
  578. SWITCH_TO(CommentLessThanSignBangDashDash);
  579. }
  580. ANYTHING_ELSE
  581. {
  582. RECONSUME_IN(Comment);
  583. }
  584. }
  585. END_STATE
  586. BEGIN_STATE(CommentLessThanSignBangDashDash)
  587. {
  588. ON('>')
  589. {
  590. SWITCH_TO(CommentEnd);
  591. }
  592. ANYTHING_ELSE
  593. {
  594. TODO();
  595. }
  596. }
  597. END_STATE
  598. BEGIN_STATE(CharacterReference)
  599. {
  600. }
  601. END_STATE
  602. default:
  603. ASSERT_NOT_REACHED();
  604. }
  605. }
  606. }
  607. void HTMLTokenizer::consume(const StringView& string)
  608. {
  609. ASSERT(next_few_characters_are(string));
  610. m_cursor += string.length();
  611. }
  612. bool HTMLTokenizer::next_few_characters_are(const StringView& string) const
  613. {
  614. for (size_t i = 0; i < string.length(); ++i) {
  615. auto codepoint = peek_codepoint(i);
  616. if (!codepoint.has_value())
  617. return false;
  618. // FIXME: This should be more Unicode-aware.
  619. if (codepoint.value() != (u32)string[i])
  620. return false;
  621. }
  622. return true;
  623. }
  624. void HTMLTokenizer::emit_current_token()
  625. {
  626. StringBuilder builder;
  627. switch (m_current_token.type()) {
  628. case HTMLToken::Type::DOCTYPE:
  629. builder.append("DOCTYPE");
  630. builder.append(" { name: '");
  631. builder.append(m_current_token.m_doctype.name.to_string());
  632. builder.append("' }");
  633. break;
  634. case HTMLToken::Type::StartTag:
  635. builder.append("StartTag");
  636. break;
  637. case HTMLToken::Type::EndTag:
  638. builder.append("EndTag");
  639. break;
  640. case HTMLToken::Type::Comment:
  641. builder.append("Comment");
  642. break;
  643. case HTMLToken::Type::Character:
  644. builder.append("Character");
  645. break;
  646. case HTMLToken::Type::EndOfFile:
  647. builder.append("EndOfFile");
  648. break;
  649. }
  650. if (m_current_token.type() == HTMLToken::Type::StartTag || m_current_token.type() == HTMLToken::Type::EndTag) {
  651. builder.append(" { name: '");
  652. builder.append(m_current_token.m_tag.tag_name.to_string());
  653. builder.append("', { ");
  654. for (auto& attribute : m_current_token.m_tag.attributes) {
  655. builder.append(attribute.name_builder.to_string());
  656. builder.append("=\"");
  657. builder.append(attribute.value_builder.to_string());
  658. builder.append("\" ");
  659. }
  660. builder.append("} }");
  661. }
  662. if (m_current_token.type() == HTMLToken::Type::Comment || m_current_token.type() == HTMLToken::Type::Character) {
  663. builder.append(" { data: '");
  664. builder.append(m_current_token.m_comment_or_character.data.to_string());
  665. builder.append(" }");
  666. }
  667. dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string();
  668. m_current_token = {};
  669. }
  670. void HTMLTokenizer::create_new_token(HTMLToken::Type type)
  671. {
  672. flush_current_character_or_comment_if_needed();
  673. m_current_token = {};
  674. m_current_token.m_type = type;
  675. }
// Constructs a tokenizer over `input`. Only m_input is initialized here; all
// other members keep whatever initial values the class declaration gives them.
HTMLTokenizer::HTMLTokenizer(const StringView& input)
    : m_input(input)
{
}
// Hook invoked by SWITCH_TO before m_state changes. Emits a pending Character
// or Comment token and, when TOKENIZER_TRACE is defined, logs the transition.
// `new_state` is only used for that trace output.
void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
{
    flush_current_character_or_comment_if_needed();
#ifdef TOKENIZER_TRACE
    dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
#endif
}
// Hook invoked by RECONSUME_IN before m_state changes. Emits a pending
// Character or Comment token and, when TOKENIZER_TRACE is defined, logs the
// transition. `new_state` is only used for that trace output.
void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
{
    flush_current_character_or_comment_if_needed();
#ifdef TOKENIZER_TRACE
    dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
#endif
}
  694. void HTMLTokenizer::flush_current_character_or_comment_if_needed()
  695. {
  696. if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment)
  697. emit_current_token();
  698. }
  699. }