// HTMLTokenizer.cpp (20 KB)
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <LibWeb/Parser/HTMLToken.h>
  27. #include <LibWeb/Parser/HTMLTokenizer.h>
  28. #include <ctype.h>
  // Uncomment to make will_switch_to()/will_reconsume_in() log every state transition.
  //#define TOKENIZER_TRACE

  // Marks tokenizer branches that are not implemented yet; reaching one aborts.
  #define TODO ASSERT_NOT_REACHED

  // Transition to `new_state`, consuming the next input character before jumping
  // to that state's label inside HTMLTokenizer::run().
  #define SWITCH_TO(new_state)                    \
      will_switch_to(State::new_state);           \
      m_state = State::new_state;                 \
      current_input_character = next_codepoint(); \
      goto new_state;

  // Transition to `new_state` WITHOUT consuming: the new state sees the same
  // current_input_character again.
  #define RECONSUME_IN(new_state)          \
      will_reconsume_in(State::new_state); \
      m_state = State::new_state;          \
      goto new_state;

  // Steps the cursor back by one so the character just read is visible to
  // peek_codepoint()/next_few_characters_are() again.
  #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;

  // Branch taken when the current input character equals `codepoint`.
  #define ON(codepoint) \
      if (current_input_character.has_value() && current_input_character.value() == codepoint)

  // Branch taken when the input is exhausted.
  #define ON_EOF \
      if (!current_input_character.has_value())

  // Branch taken for ASCII letters only (A-Z, a-z). An explicit range check is
  // used instead of isalpha(): isalpha() is locale-dependent and its behavior is
  // undefined for values that do not fit in an unsigned char, which a u32
  // code point easily can be.
  #define ON_ASCII_ALPHA                                                                            \
      if (current_input_character.has_value()                                                       \
          && ((current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')    \
              || (current_input_character.value() >= 'a' && current_input_character.value() <= 'z')))

  // Branch taken for the whitespace characters the tokenizer recognizes:
  // TAB, LF, FF and SPACE. (LF was previously mistyped as '\a', i.e. BEL.)
  #define ON_WHITESPACE                                                                        \
      if (current_input_character.has_value()                                                  \
          && (current_input_character.value() == '\t' || current_input_character.value() == '\n' \
              || current_input_character.value() == '\f' || current_input_character.value() == ' '))

  // Unconditional fallback branch; must come last in a state because the earlier
  // ON* branches are plain `if`s that always leave the state when taken.
  #define ANYTHING_ELSE if (1)

  // Emits an EndOfFile token and leaves HTMLTokenizer::run().
  #define EMIT_EOF_AND_RETURN                       \
      create_new_token(HTMLToken::Type::EndOfFile); \
      emit_current_token();                         \
      return;

  // Opens a state: defines both a goto label and a switch case named `state`.
  #define BEGIN_STATE(state) \
      state:                 \
      case State::state: {   \
          {                  \
              {

  // Closes a state. Every implemented branch leaves via goto/continue/return, so
  // falling through to here means the state did not handle the input.
  #define END_STATE         \
      ASSERT_NOT_REACHED(); \
      break;                \
      }                     \
      }                     \
      }
  65. namespace Web {
  66. Optional<u32> HTMLTokenizer::next_codepoint()
  67. {
  68. if (m_cursor >= m_input.length())
  69. return {};
  70. return m_input[m_cursor++];
  71. }
  72. Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
  73. {
  74. if ((m_cursor + offset) >= m_input.length())
  75. return {};
  76. return m_input[m_cursor + offset];
  77. }
  78. void HTMLTokenizer::run()
  79. {
  80. for (;;) {
  81. auto current_input_character = next_codepoint();
  82. switch (m_state) {
  83. BEGIN_STATE(Data)
  84. {
  85. ON('&')
  86. {
  87. m_return_state = State::Data;
  88. SWITCH_TO(CharacterReference);
  89. }
  90. ON('<')
  91. {
  92. SWITCH_TO(TagOpen);
  93. }
  94. ON_EOF
  95. {
  96. EMIT_EOF_AND_RETURN;
  97. }
  98. ANYTHING_ELSE
  99. {
  100. create_new_token(HTMLToken::Type::Character);
  101. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  102. emit_current_token();
  103. continue;
  104. }
  105. }
  106. END_STATE
  107. BEGIN_STATE(TagOpen)
  108. {
  109. ON('!')
  110. {
  111. SWITCH_TO(MarkupDeclarationOpen);
  112. }
  113. ON('/')
  114. {
  115. SWITCH_TO(EndTagOpen);
  116. }
  117. ON_ASCII_ALPHA
  118. {
  119. create_new_token(HTMLToken::Type::StartTag);
  120. RECONSUME_IN(TagName);
  121. }
  122. }
  123. END_STATE
  124. BEGIN_STATE(TagName)
  125. {
  126. ON_WHITESPACE
  127. {
  128. SWITCH_TO(BeforeAttributeName);
  129. }
  130. ON('/')
  131. {
  132. SWITCH_TO(SelfClosingStartTag);
  133. }
  134. ON('>')
  135. {
  136. emit_current_token();
  137. SWITCH_TO(Data);
  138. }
  139. ANYTHING_ELSE
  140. {
  141. m_current_token.m_tag.tag_name.append(current_input_character.value());
  142. continue;
  143. }
  144. }
  145. END_STATE
  146. BEGIN_STATE(EndTagOpen)
  147. {
  148. ON_ASCII_ALPHA
  149. {
  150. create_new_token(HTMLToken::Type::EndTag);
  151. RECONSUME_IN(TagName);
  152. }
  153. }
  154. END_STATE
  155. BEGIN_STATE(MarkupDeclarationOpen)
  156. {
  157. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  158. if (next_few_characters_are("--")) {
  159. consume("--");
  160. create_new_token(HTMLToken::Type::Comment);
  161. SWITCH_TO(CommentStart);
  162. }
  163. if (next_few_characters_are("DOCTYPE")) {
  164. consume("DOCTYPE");
  165. SWITCH_TO(DOCTYPE);
  166. }
  167. }
  168. END_STATE
  169. BEGIN_STATE(DOCTYPE)
  170. {
  171. ON_WHITESPACE
  172. {
  173. SWITCH_TO(BeforeDOCTYPEName);
  174. }
  175. }
  176. END_STATE
  177. BEGIN_STATE(BeforeDOCTYPEName)
  178. {
  179. ON_WHITESPACE
  180. {
  181. continue;
  182. }
  183. ANYTHING_ELSE
  184. {
  185. create_new_token(HTMLToken::Type::DOCTYPE);
  186. m_current_token.m_doctype.name.append(current_input_character.value());
  187. SWITCH_TO(DOCTYPEName);
  188. }
  189. }
  190. END_STATE
  191. BEGIN_STATE(DOCTYPEName)
  192. {
  193. ON('>')
  194. {
  195. emit_current_token();
  196. SWITCH_TO(Data);
  197. }
  198. ANYTHING_ELSE
  199. {
  200. m_current_token.m_doctype.name.append(current_input_character.value());
  201. continue;
  202. }
  203. }
  204. END_STATE
  205. BEGIN_STATE(BeforeAttributeName)
  206. {
  207. ON_WHITESPACE
  208. {
  209. continue;
  210. }
  211. ON('/')
  212. {
  213. RECONSUME_IN(AfterAttributeName);
  214. }
  215. ON('>')
  216. {
  217. RECONSUME_IN(AfterAttributeName);
  218. }
  219. ON_EOF
  220. {
  221. RECONSUME_IN(AfterAttributeName);
  222. }
  223. ON('=')
  224. {
  225. TODO();
  226. }
  227. ANYTHING_ELSE
  228. {
  229. m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
  230. RECONSUME_IN(AttributeName);
  231. }
  232. }
  233. END_STATE
  234. BEGIN_STATE(SelfClosingStartTag)
  235. {
  236. }
  237. END_STATE
  238. BEGIN_STATE(AttributeName)
  239. {
  240. ON_WHITESPACE
  241. {
  242. RECONSUME_IN(AfterAttributeName);
  243. }
  244. ON('/')
  245. {
  246. RECONSUME_IN(AfterAttributeName);
  247. }
  248. ON('>')
  249. {
  250. RECONSUME_IN(AfterAttributeName);
  251. }
  252. ON_EOF
  253. {
  254. RECONSUME_IN(AfterAttributeName);
  255. }
  256. ON('=')
  257. {
  258. SWITCH_TO(BeforeAttributeValue);
  259. }
  260. ANYTHING_ELSE
  261. {
  262. m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value());
  263. continue;
  264. }
  265. }
  266. END_STATE
  267. BEGIN_STATE(AfterAttributeName)
  268. {
  269. }
  270. END_STATE
  271. BEGIN_STATE(BeforeAttributeValue)
  272. {
  273. ON_WHITESPACE
  274. {
  275. continue;
  276. }
  277. ON('"')
  278. {
  279. SWITCH_TO(AttributeValueDoubleQuoted);
  280. }
  281. ON('\'')
  282. {
  283. SWITCH_TO(AttributeValueSingleQuoted);
  284. }
  285. ON('>')
  286. {
  287. TODO();
  288. }
  289. ANYTHING_ELSE
  290. {
  291. RECONSUME_IN(AttributeValueUnquoted);
  292. }
  293. }
  294. END_STATE
  295. BEGIN_STATE(AttributeValueDoubleQuoted)
  296. {
  297. ON('"')
  298. {
  299. SWITCH_TO(AfterAttributeValueQuoted);
  300. }
  301. ON('&')
  302. {
  303. m_return_state = State::AttributeValueDoubleQuoted;
  304. SWITCH_TO(CharacterReference);
  305. }
  306. ON(0)
  307. {
  308. TODO();
  309. }
  310. ON_EOF
  311. {
  312. TODO();
  313. }
  314. ANYTHING_ELSE
  315. {
  316. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  317. continue;
  318. }
  319. }
  320. END_STATE
  321. BEGIN_STATE(AttributeValueSingleQuoted)
  322. {
  323. ON('\'')
  324. {
  325. SWITCH_TO(AfterAttributeValueQuoted);
  326. }
  327. ON('&')
  328. {
  329. m_return_state = State::AttributeValueSingleQuoted;
  330. SWITCH_TO(CharacterReference);
  331. }
  332. ON(0)
  333. {
  334. TODO();
  335. }
  336. ON_EOF
  337. {
  338. TODO();
  339. }
  340. ANYTHING_ELSE
  341. {
  342. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  343. continue;
  344. }
  345. }
  346. END_STATE
  347. BEGIN_STATE(AttributeValueUnquoted)
  348. {
  349. ON_WHITESPACE
  350. {
  351. SWITCH_TO(BeforeAttributeName);
  352. }
  353. ON('&')
  354. {
  355. m_return_state = State::AttributeValueUnquoted;
  356. SWITCH_TO(CharacterReference);
  357. }
  358. ON('>')
  359. {
  360. emit_current_token();
  361. SWITCH_TO(Data);
  362. }
  363. ON(0)
  364. {
  365. TODO();
  366. }
  367. ON_EOF
  368. {
  369. TODO();
  370. }
  371. ANYTHING_ELSE
  372. {
  373. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  374. continue;
  375. }
  376. }
  377. END_STATE
  378. BEGIN_STATE(AfterAttributeValueQuoted)
  379. {
  380. ON_WHITESPACE
  381. {
  382. SWITCH_TO(BeforeAttributeName);
  383. }
  384. ON('/')
  385. {
  386. SWITCH_TO(SelfClosingStartTag);
  387. }
  388. ON('>')
  389. {
  390. emit_current_token();
  391. SWITCH_TO(Data);
  392. }
  393. ON_EOF
  394. {
  395. TODO();
  396. }
  397. ANYTHING_ELSE
  398. {
  399. TODO();
  400. }
  401. }
  402. END_STATE
  403. BEGIN_STATE(CommentStart)
  404. {
  405. ON('-')
  406. {
  407. SWITCH_TO(CommentStartDash);
  408. }
  409. ON('>')
  410. {
  411. TODO();
  412. }
  413. ANYTHING_ELSE
  414. {
  415. RECONSUME_IN(Comment);
  416. }
  417. }
  418. END_STATE
  419. BEGIN_STATE(CommentStartDash)
  420. {
  421. ON('-')
  422. {
  423. SWITCH_TO(CommentEnd);
  424. }
  425. ON('>')
  426. {
  427. TODO();
  428. }
  429. ON_EOF
  430. {
  431. TODO();
  432. }
  433. ANYTHING_ELSE
  434. {
  435. m_current_token.m_comment_or_character.data.append('-');
  436. RECONSUME_IN(Comment);
  437. }
  438. }
  439. END_STATE
  440. BEGIN_STATE(Comment)
  441. {
  442. ON('<')
  443. {
  444. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  445. SWITCH_TO(CommentLessThanSign);
  446. }
  447. ON('-')
  448. {
  449. SWITCH_TO(CommentEndDash);
  450. }
  451. ON(0)
  452. {
  453. TODO();
  454. }
  455. ON_EOF
  456. {
  457. TODO();
  458. }
  459. ANYTHING_ELSE
  460. {
  461. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  462. continue;
  463. }
  464. }
  465. END_STATE
  466. BEGIN_STATE(CommentEnd)
  467. {
  468. ON('>')
  469. {
  470. emit_current_token();
  471. SWITCH_TO(Data);
  472. }
  473. ON('!')
  474. {
  475. SWITCH_TO(CommentEndBang);
  476. }
  477. ON('-')
  478. {
  479. m_current_token.m_comment_or_character.data.append('-');
  480. continue;
  481. }
  482. ON_EOF
  483. {
  484. TODO();
  485. }
  486. ANYTHING_ELSE
  487. {
  488. m_current_token.m_comment_or_character.data.append('-');
  489. RECONSUME_IN(Comment);
  490. }
  491. }
  492. END_STATE
  493. BEGIN_STATE(CommentEndBang)
  494. {
  495. ON('-')
  496. {
  497. m_current_token.m_comment_or_character.data.append("--!");
  498. SWITCH_TO(CommentEndDash);
  499. }
  500. ON('>')
  501. {
  502. TODO();
  503. }
  504. ON_EOF
  505. {
  506. TODO();
  507. }
  508. ANYTHING_ELSE
  509. {
  510. m_current_token.m_comment_or_character.data.append("--!");
  511. RECONSUME_IN(Comment);
  512. }
  513. }
  514. END_STATE
  515. BEGIN_STATE(CommentEndDash)
  516. {
  517. ON('-')
  518. {
  519. SWITCH_TO(CommentEnd);
  520. }
  521. ON_EOF
  522. {
  523. TODO();
  524. }
  525. ANYTHING_ELSE
  526. {
  527. m_current_token.m_comment_or_character.data.append('-');
  528. RECONSUME_IN(Comment);
  529. }
  530. }
  531. END_STATE
  532. BEGIN_STATE(CommentLessThanSign)
  533. {
  534. ON('!')
  535. {
  536. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  537. SWITCH_TO(CommentLessThanSignBang);
  538. }
  539. ON('<')
  540. {
  541. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  542. continue;
  543. }
  544. ANYTHING_ELSE
  545. {
  546. RECONSUME_IN(Comment);
  547. }
  548. }
  549. END_STATE
  550. BEGIN_STATE(CommentLessThanSignBang)
  551. {
  552. ON('-')
  553. {
  554. SWITCH_TO(CommentLessThanSignBangDash);
  555. }
  556. ANYTHING_ELSE
  557. {
  558. RECONSUME_IN(Comment);
  559. }
  560. }
  561. END_STATE
  562. BEGIN_STATE(CommentLessThanSignBangDash)
  563. {
  564. ON('-')
  565. {
  566. SWITCH_TO(CommentLessThanSignBangDashDash);
  567. }
  568. ANYTHING_ELSE
  569. {
  570. RECONSUME_IN(Comment);
  571. }
  572. }
  573. END_STATE
  574. BEGIN_STATE(CommentLessThanSignBangDashDash)
  575. {
  576. ON('>')
  577. {
  578. SWITCH_TO(CommentEnd);
  579. }
  580. ANYTHING_ELSE
  581. {
  582. TODO();
  583. }
  584. }
  585. END_STATE
  586. BEGIN_STATE(CharacterReference)
  587. {
  588. }
  589. END_STATE
  590. default:
  591. ASSERT_NOT_REACHED();
  592. }
  593. }
  594. }
  595. void HTMLTokenizer::consume(const StringView& string)
  596. {
  597. ASSERT(next_few_characters_are(string));
  598. m_cursor += string.length();
  599. }
  600. bool HTMLTokenizer::next_few_characters_are(const StringView& string) const
  601. {
  602. for (size_t i = 0; i < string.length(); ++i) {
  603. auto codepoint = peek_codepoint(i);
  604. if (!codepoint.has_value())
  605. return false;
  606. // FIXME: This should be more Unicode-aware.
  607. if (codepoint.value() != (u32)string[i])
  608. return false;
  609. }
  610. return true;
  611. }
  612. void HTMLTokenizer::emit_current_token()
  613. {
  614. StringBuilder builder;
  615. switch (m_current_token.type()) {
  616. case HTMLToken::Type::DOCTYPE:
  617. builder.append("DOCTYPE");
  618. builder.append(" { name: '");
  619. builder.append(m_current_token.m_doctype.name.to_string());
  620. builder.append("' }");
  621. break;
  622. case HTMLToken::Type::StartTag:
  623. builder.append("StartTag");
  624. break;
  625. case HTMLToken::Type::EndTag:
  626. builder.append("EndTag");
  627. break;
  628. case HTMLToken::Type::Comment:
  629. builder.append("Comment");
  630. break;
  631. case HTMLToken::Type::Character:
  632. builder.append("Character");
  633. break;
  634. case HTMLToken::Type::EndOfFile:
  635. builder.append("EndOfFile");
  636. break;
  637. }
  638. if (m_current_token.type() == HTMLToken::Type::StartTag || m_current_token.type() == HTMLToken::Type::EndTag) {
  639. builder.append(" { name: '");
  640. builder.append(m_current_token.m_tag.tag_name.to_string());
  641. builder.append("', { ");
  642. for (auto& attribute : m_current_token.m_tag.attributes) {
  643. builder.append(attribute.name_builder.to_string());
  644. builder.append("=\"");
  645. builder.append(attribute.value_builder.to_string());
  646. builder.append("\" ");
  647. }
  648. builder.append("} }");
  649. }
  650. dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string();
  651. m_current_token = {};
  652. }
  653. void HTMLTokenizer::create_new_token(HTMLToken::Type type)
  654. {
  655. m_current_token = {};
  656. m_current_token.m_type = type;
  657. }
  658. HTMLTokenizer::HTMLTokenizer(const StringView& input)
  659. : m_input(input)
  660. {
  661. }
  662. void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
  663. {
  664. #ifdef TOKENIZER_TRACE
  665. dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
  666. #endif
  667. }
  668. void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
  669. {
  670. #ifdef TOKENIZER_TRACE
  671. dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
  672. #endif
  673. }
  674. }