/*
 * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
  26. #include <LibWeb/Parser/HTMLToken.h>
  27. #include <LibWeb/Parser/HTMLTokenizer.h>
  28. #include <ctype.h>
  29. //#define TOKENIZER_TRACE
  30. #define TODO ASSERT_NOT_REACHED
  31. #define SWITCH_TO(new_state) \
  32. will_switch_to(State::new_state); \
  33. m_state = State::new_state; \
  34. current_input_character = next_codepoint(); \
  35. goto new_state;
  36. #define RECONSUME_IN(new_state) \
  37. will_reconsume_in(State::new_state); \
  38. m_state = State::new_state; \
  39. goto new_state;
  40. #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;
  41. #define ON(codepoint) \
  42. if (current_input_character.has_value() && current_input_character.value() == codepoint)
  43. #define ON_EOF \
  44. if (!current_input_character.has_value())
  45. #define ON_ASCII_ALPHA \
  46. if (current_input_character.has_value() && isalpha(current_input_character.value()))
  47. #define ON_WHITESPACE \
  48. if (current_input_character.has_value() && (current_input_character.value() == '\t' || current_input_character.value() == '\a' || current_input_character.value() == '\f' || current_input_character.value() == ' '))
  49. #define ANYTHING_ELSE if (1)
  50. #define EMIT_EOF_AND_RETURN \
  51. create_new_token(HTMLToken::Type::EndOfFile); \
  52. emit_current_token(); \
  53. return;
  54. #define BEGIN_STATE(state) \
  55. state: \
  56. case State::state:
  57. #define END_STATE \
  58. ASSERT_NOT_REACHED(); \
  59. break;
  60. namespace Web {
  61. Optional<u32> HTMLTokenizer::next_codepoint()
  62. {
  63. if (m_cursor >= m_input.length())
  64. return {};
  65. return m_input[m_cursor++];
  66. }
  67. Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
  68. {
  69. if ((m_cursor + offset) >= m_input.length())
  70. return {};
  71. return m_input[m_cursor + offset];
  72. }
  73. void HTMLTokenizer::run()
  74. {
  75. for (;;) {
  76. auto current_input_character = next_codepoint();
  77. switch (m_state) {
  78. BEGIN_STATE(Data)
  79. {
  80. ON('&')
  81. {
  82. m_return_state = State::Data;
  83. SWITCH_TO(CharacterReference);
  84. }
  85. ON('<')
  86. {
  87. SWITCH_TO(TagOpen);
  88. }
  89. ON_EOF
  90. {
  91. EMIT_EOF_AND_RETURN;
  92. }
  93. ANYTHING_ELSE
  94. {
  95. create_new_token(HTMLToken::Type::Character);
  96. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  97. emit_current_token();
  98. continue;
  99. }
  100. }
  101. END_STATE
  102. BEGIN_STATE(TagOpen)
  103. {
  104. ON('!')
  105. {
  106. SWITCH_TO(MarkupDeclarationOpen);
  107. }
  108. ON('/')
  109. {
  110. SWITCH_TO(EndTagOpen);
  111. }
  112. ON_ASCII_ALPHA
  113. {
  114. create_new_token(HTMLToken::Type::StartTag);
  115. RECONSUME_IN(TagName);
  116. }
  117. }
  118. END_STATE
  119. BEGIN_STATE(TagName)
  120. {
  121. ON_WHITESPACE
  122. {
  123. SWITCH_TO(BeforeAttributeName);
  124. }
  125. ON('/')
  126. {
  127. SWITCH_TO(SelfClosingStartTag);
  128. }
  129. ON('>')
  130. {
  131. emit_current_token();
  132. SWITCH_TO(Data);
  133. }
  134. ANYTHING_ELSE
  135. {
  136. m_current_token.m_tag.tag_name.append(current_input_character.value());
  137. continue;
  138. }
  139. }
  140. END_STATE
  141. BEGIN_STATE(EndTagOpen)
  142. {
  143. ON_ASCII_ALPHA
  144. {
  145. create_new_token(HTMLToken::Type::EndTag);
  146. RECONSUME_IN(TagName);
  147. }
  148. }
  149. END_STATE
  150. BEGIN_STATE(MarkupDeclarationOpen)
  151. {
  152. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  153. if (next_few_characters_are("--")) {
  154. consume("--");
  155. create_new_token(HTMLToken::Type::Comment);
  156. SWITCH_TO(CommentStart);
  157. }
  158. if (next_few_characters_are("DOCTYPE")) {
  159. consume("DOCTYPE");
  160. SWITCH_TO(DOCTYPE);
  161. }
  162. }
  163. END_STATE
  164. BEGIN_STATE(DOCTYPE)
  165. {
  166. ON_WHITESPACE
  167. {
  168. SWITCH_TO(BeforeDOCTYPEName);
  169. }
  170. }
  171. END_STATE
  172. BEGIN_STATE(BeforeDOCTYPEName)
  173. {
  174. ON_WHITESPACE
  175. {
  176. continue;
  177. }
  178. ANYTHING_ELSE
  179. {
  180. create_new_token(HTMLToken::Type::DOCTYPE);
  181. m_current_token.m_doctype.name.append(current_input_character.value());
  182. SWITCH_TO(DOCTYPEName);
  183. }
  184. }
  185. END_STATE
  186. BEGIN_STATE(DOCTYPEName)
  187. {
  188. ON('>')
  189. {
  190. emit_current_token();
  191. SWITCH_TO(Data);
  192. }
  193. ANYTHING_ELSE
  194. {
  195. m_current_token.m_doctype.name.append(current_input_character.value());
  196. continue;
  197. }
  198. }
  199. END_STATE
  200. BEGIN_STATE(BeforeAttributeName)
  201. {
  202. ON_WHITESPACE
  203. {
  204. continue;
  205. }
  206. ON('/')
  207. {
  208. RECONSUME_IN(AfterAttributeName);
  209. }
  210. ON('>')
  211. {
  212. RECONSUME_IN(AfterAttributeName);
  213. }
  214. ON_EOF
  215. {
  216. RECONSUME_IN(AfterAttributeName);
  217. }
  218. ON('=')
  219. {
  220. TODO();
  221. }
  222. ANYTHING_ELSE
  223. {
  224. m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
  225. RECONSUME_IN(AttributeName);
  226. }
  227. }
  228. END_STATE
  229. BEGIN_STATE(SelfClosingStartTag)
  230. {
  231. }
  232. END_STATE
  233. BEGIN_STATE(AttributeName)
  234. {
  235. ON_WHITESPACE
  236. {
  237. RECONSUME_IN(AfterAttributeName);
  238. }
  239. ON('/')
  240. {
  241. RECONSUME_IN(AfterAttributeName);
  242. }
  243. ON('>')
  244. {
  245. RECONSUME_IN(AfterAttributeName);
  246. }
  247. ON_EOF
  248. {
  249. RECONSUME_IN(AfterAttributeName);
  250. }
  251. ON('=')
  252. {
  253. SWITCH_TO(BeforeAttributeValue);
  254. }
  255. ANYTHING_ELSE
  256. {
  257. m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value());
  258. continue;
  259. }
  260. }
  261. END_STATE
  262. BEGIN_STATE(AfterAttributeName)
  263. {
  264. }
  265. END_STATE
  266. BEGIN_STATE(BeforeAttributeValue)
  267. {
  268. ON_WHITESPACE
  269. {
  270. continue;
  271. }
  272. ON('"')
  273. {
  274. SWITCH_TO(AttributeValueDoubleQuoted);
  275. }
  276. ON('\'')
  277. {
  278. SWITCH_TO(AttributeValueSingleQuoted);
  279. }
  280. ON('>')
  281. {
  282. TODO();
  283. }
  284. ANYTHING_ELSE
  285. {
  286. RECONSUME_IN(AttributeValueUnquoted);
  287. }
  288. }
  289. END_STATE
  290. BEGIN_STATE(AttributeValueDoubleQuoted)
  291. {
  292. ON('"')
  293. {
  294. SWITCH_TO(AfterAttributeValueQuoted);
  295. }
  296. ON('&')
  297. {
  298. m_return_state = State::AttributeValueDoubleQuoted;
  299. SWITCH_TO(CharacterReference);
  300. }
  301. ON(0)
  302. {
  303. TODO();
  304. }
  305. ON_EOF
  306. {
  307. TODO();
  308. }
  309. ANYTHING_ELSE
  310. {
  311. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  312. continue;
  313. }
  314. }
  315. END_STATE
  316. BEGIN_STATE(AttributeValueSingleQuoted)
  317. {
  318. ON('\'')
  319. {
  320. SWITCH_TO(AfterAttributeValueQuoted);
  321. }
  322. ON('&')
  323. {
  324. m_return_state = State::AttributeValueSingleQuoted;
  325. SWITCH_TO(CharacterReference);
  326. }
  327. ON(0)
  328. {
  329. TODO();
  330. }
  331. ON_EOF
  332. {
  333. TODO();
  334. }
  335. ANYTHING_ELSE
  336. {
  337. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  338. continue;
  339. }
  340. }
  341. END_STATE
  342. BEGIN_STATE(AttributeValueUnquoted)
  343. {
  344. ON_WHITESPACE
  345. {
  346. SWITCH_TO(BeforeAttributeName);
  347. }
  348. ON('&')
  349. {
  350. m_return_state = State::AttributeValueUnquoted;
  351. SWITCH_TO(CharacterReference);
  352. }
  353. ON('>')
  354. {
  355. emit_current_token();
  356. SWITCH_TO(Data);
  357. }
  358. ON(0)
  359. {
  360. TODO();
  361. }
  362. ON_EOF
  363. {
  364. TODO();
  365. }
  366. ANYTHING_ELSE
  367. {
  368. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  369. continue;
  370. }
  371. }
  372. END_STATE
  373. BEGIN_STATE(AfterAttributeValueQuoted)
  374. {
  375. ON_WHITESPACE
  376. {
  377. SWITCH_TO(BeforeAttributeName);
  378. }
  379. ON('/')
  380. {
  381. SWITCH_TO(SelfClosingStartTag);
  382. }
  383. ON('>')
  384. {
  385. emit_current_token();
  386. SWITCH_TO(Data);
  387. }
  388. ON_EOF
  389. {
  390. TODO();
  391. }
  392. ANYTHING_ELSE
  393. {
  394. TODO();
  395. }
  396. }
  397. END_STATE
  398. BEGIN_STATE(CommentStart)
  399. {
  400. ON('-')
  401. {
  402. SWITCH_TO(CommentStartDash);
  403. }
  404. ON('>')
  405. {
  406. TODO();
  407. }
  408. ANYTHING_ELSE
  409. {
  410. RECONSUME_IN(Comment);
  411. }
  412. }
  413. END_STATE
  414. BEGIN_STATE(CommentStartDash)
  415. {
  416. ON('-')
  417. {
  418. SWITCH_TO(CommentEnd);
  419. }
  420. ON('>')
  421. {
  422. TODO();
  423. }
  424. ON_EOF
  425. {
  426. TODO();
  427. }
  428. ANYTHING_ELSE
  429. {
  430. m_current_token.m_comment_or_character.data.append('-');
  431. RECONSUME_IN(Comment);
  432. }
  433. }
  434. END_STATE
  435. BEGIN_STATE(Comment)
  436. {
  437. ON('<')
  438. {
  439. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  440. SWITCH_TO(CommentLessThanSign);
  441. }
  442. ON('-')
  443. {
  444. SWITCH_TO(CommentEndDash);
  445. }
  446. ON(0)
  447. {
  448. TODO();
  449. }
  450. ON_EOF
  451. {
  452. TODO();
  453. }
  454. ANYTHING_ELSE
  455. {
  456. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  457. continue;
  458. }
  459. }
  460. BEGIN_STATE(CommentEnd)
  461. {
  462. ON('>')
  463. {
  464. emit_current_token();
  465. SWITCH_TO(Data);
  466. }
  467. ON('!')
  468. {
  469. SWITCH_TO(CommentEndBang);
  470. }
  471. ON('-')
  472. {
  473. m_current_token.m_comment_or_character.data.append('-');
  474. continue;
  475. }
  476. ON_EOF
  477. {
  478. TODO();
  479. }
  480. ANYTHING_ELSE
  481. {
  482. m_current_token.m_comment_or_character.data.append('-');
  483. RECONSUME_IN(Comment);
  484. }
  485. }
  486. END_STATE
  487. BEGIN_STATE(CommentEndBang)
  488. {
  489. ON('-')
  490. {
  491. m_current_token.m_comment_or_character.data.append("--!");
  492. SWITCH_TO(CommentEndDash);
  493. }
  494. ON('>')
  495. {
  496. TODO();
  497. }
  498. ON_EOF
  499. {
  500. TODO();
  501. }
  502. ANYTHING_ELSE
  503. {
  504. m_current_token.m_comment_or_character.data.append("--!");
  505. RECONSUME_IN(Comment);
  506. }
  507. }
  508. END_STATE
  509. BEGIN_STATE(CommentEndDash)
  510. {
  511. ON('-')
  512. {
  513. SWITCH_TO(CommentEnd);
  514. }
  515. ON_EOF
  516. {
  517. TODO();
  518. }
  519. ANYTHING_ELSE
  520. {
  521. m_current_token.m_comment_or_character.data.append('-');
  522. RECONSUME_IN(Comment);
  523. }
  524. }
  525. BEGIN_STATE(CommentLessThanSign)
  526. {
  527. ON('!')
  528. {
  529. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  530. SWITCH_TO(CommentLessThanSignBang);
  531. }
  532. ON('<')
  533. {
  534. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  535. continue;
  536. }
  537. ANYTHING_ELSE
  538. {
  539. RECONSUME_IN(Comment);
  540. }
  541. }
  542. END_STATE
  543. BEGIN_STATE(CommentLessThanSignBang)
  544. {
  545. ON('-')
  546. {
  547. SWITCH_TO(CommentLessThanSignBangDash);
  548. }
  549. ANYTHING_ELSE
  550. {
  551. RECONSUME_IN(Comment);
  552. }
  553. }
  554. END_STATE
  555. BEGIN_STATE(CommentLessThanSignBangDash)
  556. {
  557. ON('-')
  558. {
  559. SWITCH_TO(CommentLessThanSignBangDashDash);
  560. }
  561. ANYTHING_ELSE
  562. {
  563. RECONSUME_IN(Comment);
  564. }
  565. }
  566. END_STATE
  567. BEGIN_STATE(CommentLessThanSignBangDashDash)
  568. {
  569. ON('>')
  570. {
  571. SWITCH_TO(CommentEnd);
  572. }
  573. ANYTHING_ELSE
  574. {
  575. TODO();
  576. }
  577. }
  578. END_STATE
  579. BEGIN_STATE(CharacterReference)
  580. {
  581. }
  582. END_STATE
  583. default:
  584. ASSERT_NOT_REACHED();
  585. }
  586. }
  587. }
  588. void HTMLTokenizer::consume(const StringView& string)
  589. {
  590. ASSERT(next_few_characters_are(string));
  591. m_cursor += string.length();
  592. }
  593. bool HTMLTokenizer::next_few_characters_are(const StringView& string) const
  594. {
  595. for (size_t i = 0; i < string.length(); ++i) {
  596. auto codepoint = peek_codepoint(i);
  597. if (!codepoint.has_value())
  598. return false;
  599. // FIXME: This should be more Unicode-aware.
  600. if (codepoint.value() != (u32)string[i])
  601. return false;
  602. }
  603. return true;
  604. }
  605. void HTMLTokenizer::emit_current_token()
  606. {
  607. StringBuilder builder;
  608. switch (m_current_token.type()) {
  609. case HTMLToken::Type::DOCTYPE:
  610. builder.append("DOCTYPE");
  611. builder.append(" { name: '");
  612. builder.append(m_current_token.m_doctype.name.to_string());
  613. builder.append("' }");
  614. break;
  615. case HTMLToken::Type::StartTag:
  616. builder.append("StartTag");
  617. break;
  618. case HTMLToken::Type::EndTag:
  619. builder.append("EndTag");
  620. break;
  621. case HTMLToken::Type::Comment:
  622. builder.append("Comment");
  623. break;
  624. case HTMLToken::Type::Character:
  625. builder.append("Character");
  626. break;
  627. case HTMLToken::Type::EndOfFile:
  628. builder.append("EndOfFile");
  629. break;
  630. }
  631. if (m_current_token.type() == HTMLToken::Type::StartTag || m_current_token.type() == HTMLToken::Type::EndTag) {
  632. builder.append(" { name: '");
  633. builder.append(m_current_token.m_tag.tag_name.to_string());
  634. builder.append("', { ");
  635. for (auto& attribute : m_current_token.m_tag.attributes) {
  636. builder.append(attribute.name_builder.to_string());
  637. builder.append("=\"");
  638. builder.append(attribute.value_builder.to_string());
  639. builder.append("\" ");
  640. }
  641. builder.append("} }");
  642. }
  643. dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string();
  644. m_current_token = {};
  645. }
  646. void HTMLTokenizer::create_new_token(HTMLToken::Type type)
  647. {
  648. m_current_token = {};
  649. m_current_token.m_type = type;
  650. }
  651. HTMLTokenizer::HTMLTokenizer(const StringView& input)
  652. : m_input(input)
  653. {
  654. }
  655. void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
  656. {
  657. #ifdef TOKENIZER_TRACE
  658. dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
  659. #endif
  660. }
  661. void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
  662. {
  663. #ifdef TOKENIZER_TRACE
  664. dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
  665. #endif
  666. }
  667. }