HTMLTokenizer.cpp 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
  3. * All rights reserved.
  4. *
  5. * Redistribution and use in source and binary forms, with or without
  6. * modification, are permitted provided that the following conditions are met:
  7. *
  8. * 1. Redistributions of source code must retain the above copyright notice, this
  9. * list of conditions and the following disclaimer.
  10. *
  11. * 2. Redistributions in binary form must reproduce the above copyright notice,
  12. * this list of conditions and the following disclaimer in the documentation
  13. * and/or other materials provided with the distribution.
  14. *
  15. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  16. * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  17. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  18. * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
  19. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  20. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
  21. * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  22. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  23. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
  24. * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  25. */
  26. #include <LibWeb/Parser/HTMLToken.h>
  27. #include <LibWeb/Parser/HTMLTokenizer.h>
  28. #include <ctype.h>
  29. //#define TOKENIZER_TRACE
  30. #define TODO ASSERT_NOT_REACHED
  31. #define SWITCH_TO(new_state) \
  32. will_switch_to(State::new_state); \
  33. m_state = State::new_state; \
  34. current_input_character = next_codepoint(); \
  35. goto new_state;
  36. #define RECONSUME_IN(new_state) \
  37. will_reconsume_in(State::new_state); \
  38. m_state = State::new_state; \
  39. goto new_state;
  40. #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;
  41. #define ON(codepoint) \
  42. if (current_input_character.has_value() && current_input_character.value() == codepoint)
  43. #define ON_EOF \
  44. if (!current_input_character.has_value())
  45. #define ON_ASCII_ALPHA \
  46. if (current_input_character.has_value() && isalpha(current_input_character.value()))
  47. #define ON_WHITESPACE \
  48. if (current_input_character.has_value() && (current_input_character.value() == '\t' || current_input_character.value() == '\a' || current_input_character.value() == '\f' || current_input_character.value() == ' '))
  49. #define ANYTHING_ELSE if (1)
  50. #define EMIT_EOF_AND_RETURN \
  51. create_new_token(HTMLToken::Type::EndOfFile); \
  52. emit_current_token(); \
  53. return;
  54. #define BEGIN_STATE(state) \
  55. state: \
  56. case State::state:
  57. #define END_STATE \
  58. ASSERT_NOT_REACHED(); \
  59. break;
  60. namespace Web {
  61. Optional<u32> HTMLTokenizer::next_codepoint()
  62. {
  63. if (m_cursor >= m_input.length())
  64. return {};
  65. return m_input[m_cursor++];
  66. }
  67. Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
  68. {
  69. if ((m_cursor + offset) >= m_input.length())
  70. return {};
  71. return m_input[m_cursor + offset];
  72. }
  73. void HTMLTokenizer::run()
  74. {
  75. for (;;) {
  76. auto current_input_character = next_codepoint();
  77. switch (m_state) {
  78. BEGIN_STATE(Data)
  79. {
  80. ON('&')
  81. {
  82. m_return_state = State::Data;
  83. SWITCH_TO(CharacterReference);
  84. }
  85. ON('<')
  86. {
  87. SWITCH_TO(TagOpen);
  88. }
  89. ON_EOF
  90. {
  91. EMIT_EOF_AND_RETURN;
  92. }
  93. ANYTHING_ELSE
  94. {
  95. create_new_token(HTMLToken::Type::Character);
  96. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  97. emit_current_token();
  98. continue;
  99. }
  100. }
  101. END_STATE
  102. BEGIN_STATE(TagOpen)
  103. {
  104. ON('!')
  105. {
  106. SWITCH_TO(MarkupDeclarationOpen);
  107. }
  108. ON('/')
  109. {
  110. SWITCH_TO(EndTagOpen);
  111. }
  112. ON_ASCII_ALPHA
  113. {
  114. create_new_token(HTMLToken::Type::StartTag);
  115. RECONSUME_IN(TagName);
  116. }
  117. }
  118. END_STATE
  119. BEGIN_STATE(TagName)
  120. {
  121. ON_WHITESPACE
  122. {
  123. SWITCH_TO(BeforeAttributeName);
  124. }
  125. ON('/')
  126. {
  127. SWITCH_TO(SelfClosingStartTag);
  128. }
  129. ON('>')
  130. {
  131. emit_current_token();
  132. SWITCH_TO(Data);
  133. }
  134. ANYTHING_ELSE
  135. {
  136. m_current_token.m_tag.tag_name.append(current_input_character.value());
  137. continue;
  138. }
  139. }
  140. BEGIN_STATE(EndTagOpen)
  141. {
  142. ON_ASCII_ALPHA
  143. {
  144. create_new_token(HTMLToken::Type::EndTag);
  145. RECONSUME_IN(TagName);
  146. }
  147. }
  148. END_STATE
  149. BEGIN_STATE(MarkupDeclarationOpen)
  150. {
  151. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  152. if (next_few_characters_are("--")) {
  153. consume("--");
  154. create_new_token(HTMLToken::Type::Comment);
  155. SWITCH_TO(CommentStart);
  156. }
  157. if (next_few_characters_are("DOCTYPE")) {
  158. consume("DOCTYPE");
  159. SWITCH_TO(DOCTYPE);
  160. }
  161. }
  162. END_STATE
  163. BEGIN_STATE(DOCTYPE)
  164. {
  165. ON_WHITESPACE
  166. {
  167. SWITCH_TO(BeforeDOCTYPEName);
  168. }
  169. }
  170. END_STATE
  171. BEGIN_STATE(BeforeDOCTYPEName)
  172. {
  173. ON_WHITESPACE
  174. {
  175. continue;
  176. }
  177. ANYTHING_ELSE
  178. {
  179. create_new_token(HTMLToken::Type::DOCTYPE);
  180. m_current_token.m_doctype.name.append(current_input_character.value());
  181. SWITCH_TO(DOCTYPEName);
  182. }
  183. }
  184. END_STATE
  185. BEGIN_STATE(DOCTYPEName)
  186. {
  187. ON('>')
  188. {
  189. emit_current_token();
  190. SWITCH_TO(Data);
  191. }
  192. ANYTHING_ELSE
  193. {
  194. m_current_token.m_doctype.name.append(current_input_character.value());
  195. continue;
  196. }
  197. }
  198. END_STATE
  199. BEGIN_STATE(BeforeAttributeName)
  200. {
  201. ON_WHITESPACE
  202. {
  203. continue;
  204. }
  205. ON('/')
  206. {
  207. RECONSUME_IN(AfterAttributeName);
  208. }
  209. ON('>')
  210. {
  211. RECONSUME_IN(AfterAttributeName);
  212. }
  213. ON_EOF
  214. {
  215. RECONSUME_IN(AfterAttributeName);
  216. }
  217. ON('=')
  218. {
  219. TODO();
  220. }
  221. ANYTHING_ELSE
  222. {
  223. m_current_token.m_tag.attributes.append(HTMLToken::AttributeBuilder());
  224. RECONSUME_IN(AttributeName);
  225. }
  226. }
  227. END_STATE
  228. BEGIN_STATE(SelfClosingStartTag)
  229. {
  230. }
  231. END_STATE
  232. BEGIN_STATE(AttributeName)
  233. {
  234. ON_WHITESPACE
  235. {
  236. RECONSUME_IN(AfterAttributeName);
  237. }
  238. ON('/')
  239. {
  240. RECONSUME_IN(AfterAttributeName);
  241. }
  242. ON('>')
  243. {
  244. RECONSUME_IN(AfterAttributeName);
  245. }
  246. ON_EOF
  247. {
  248. RECONSUME_IN(AfterAttributeName);
  249. }
  250. ON('=')
  251. {
  252. SWITCH_TO(BeforeAttributeValue);
  253. }
  254. ANYTHING_ELSE
  255. {
  256. m_current_token.m_tag.attributes.last().name_builder.append(current_input_character.value());
  257. continue;
  258. }
  259. }
  260. END_STATE
  261. BEGIN_STATE(AfterAttributeName)
  262. {
  263. }
  264. END_STATE
  265. BEGIN_STATE(BeforeAttributeValue)
  266. {
  267. ON_WHITESPACE
  268. {
  269. continue;
  270. }
  271. ON('"')
  272. {
  273. SWITCH_TO(AttributeValueDoubleQuoted);
  274. }
  275. ON('\'')
  276. {
  277. SWITCH_TO(AttributeValueSingleQuoted);
  278. }
  279. ON('>')
  280. {
  281. TODO();
  282. }
  283. ANYTHING_ELSE
  284. {
  285. RECONSUME_IN(AttributeValueUnquoted);
  286. }
  287. }
  288. END_STATE
  289. BEGIN_STATE(AttributeValueDoubleQuoted)
  290. {
  291. ON('"')
  292. {
  293. SWITCH_TO(AfterAttributeValueQuoted);
  294. }
  295. ON('&')
  296. {
  297. m_return_state = State::AttributeValueDoubleQuoted;
  298. SWITCH_TO(CharacterReference);
  299. }
  300. ON(0)
  301. {
  302. TODO();
  303. }
  304. ON_EOF
  305. {
  306. TODO();
  307. }
  308. ANYTHING_ELSE
  309. {
  310. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  311. continue;
  312. }
  313. }
  314. END_STATE
  315. BEGIN_STATE(AttributeValueSingleQuoted)
  316. {
  317. ON('\'')
  318. {
  319. SWITCH_TO(AfterAttributeValueQuoted);
  320. }
  321. ON('&')
  322. {
  323. m_return_state = State::AttributeValueSingleQuoted;
  324. SWITCH_TO(CharacterReference);
  325. }
  326. ON(0)
  327. {
  328. TODO();
  329. }
  330. ON_EOF
  331. {
  332. TODO();
  333. }
  334. ANYTHING_ELSE
  335. {
  336. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  337. continue;
  338. }
  339. }
  340. END_STATE
  341. BEGIN_STATE(AttributeValueUnquoted)
  342. {
  343. ON_WHITESPACE
  344. {
  345. SWITCH_TO(BeforeAttributeName);
  346. }
  347. ON('&')
  348. {
  349. m_return_state = State::AttributeValueUnquoted;
  350. SWITCH_TO(CharacterReference);
  351. }
  352. ON('>')
  353. {
  354. emit_current_token();
  355. SWITCH_TO(Data);
  356. }
  357. ON(0)
  358. {
  359. TODO();
  360. }
  361. ON_EOF
  362. {
  363. TODO();
  364. }
  365. ANYTHING_ELSE
  366. {
  367. m_current_token.m_tag.attributes.last().value_builder.append(current_input_character.value());
  368. continue;
  369. }
  370. }
  371. END_STATE
  372. BEGIN_STATE(AfterAttributeValueQuoted)
  373. {
  374. ON_WHITESPACE
  375. {
  376. SWITCH_TO(BeforeAttributeName);
  377. }
  378. ON('/')
  379. {
  380. SWITCH_TO(SelfClosingStartTag);
  381. }
  382. ON('>')
  383. {
  384. emit_current_token();
  385. SWITCH_TO(Data);
  386. }
  387. ON_EOF
  388. {
  389. TODO();
  390. }
  391. ANYTHING_ELSE
  392. {
  393. TODO();
  394. }
  395. }
  396. END_STATE
  397. BEGIN_STATE(CommentStart)
  398. {
  399. ON('-')
  400. {
  401. SWITCH_TO(CommentStartDash);
  402. }
  403. ON('>')
  404. {
  405. TODO();
  406. }
  407. ANYTHING_ELSE
  408. {
  409. RECONSUME_IN(Comment);
  410. }
  411. }
  412. END_STATE
  413. BEGIN_STATE(CommentStartDash)
  414. {
  415. ON('-')
  416. {
  417. SWITCH_TO(CommentEnd);
  418. }
  419. ON('>')
  420. {
  421. TODO();
  422. }
  423. ON_EOF
  424. {
  425. TODO();
  426. }
  427. ANYTHING_ELSE
  428. {
  429. m_current_token.m_comment_or_character.data.append('-');
  430. RECONSUME_IN(Comment);
  431. }
  432. }
  433. END_STATE
  434. BEGIN_STATE(Comment)
  435. {
  436. ON('<')
  437. {
  438. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  439. SWITCH_TO(CommentLessThanSign);
  440. }
  441. ON('-')
  442. {
  443. SWITCH_TO(CommentEndDash);
  444. }
  445. ON(0)
  446. {
  447. TODO();
  448. }
  449. ON_EOF
  450. {
  451. TODO();
  452. }
  453. ANYTHING_ELSE
  454. {
  455. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  456. continue;
  457. }
  458. }
  459. BEGIN_STATE(CommentEnd)
  460. {
  461. ON('>')
  462. {
  463. emit_current_token();
  464. SWITCH_TO(Data);
  465. }
  466. ON('!')
  467. {
  468. SWITCH_TO(CommentEndBang);
  469. }
  470. ON('-')
  471. {
  472. m_current_token.m_comment_or_character.data.append('-');
  473. continue;
  474. }
  475. ON_EOF
  476. {
  477. TODO();
  478. }
  479. ANYTHING_ELSE
  480. {
  481. m_current_token.m_comment_or_character.data.append('-');
  482. RECONSUME_IN(Comment);
  483. }
  484. }
  485. END_STATE
  486. BEGIN_STATE(CommentEndBang)
  487. {
  488. ON('-')
  489. {
  490. m_current_token.m_comment_or_character.data.append("--!");
  491. SWITCH_TO(CommentEndDash);
  492. }
  493. ON('>')
  494. {
  495. TODO();
  496. }
  497. ON_EOF
  498. {
  499. TODO();
  500. }
  501. ANYTHING_ELSE
  502. {
  503. m_current_token.m_comment_or_character.data.append("--!");
  504. RECONSUME_IN(Comment);
  505. }
  506. }
  507. END_STATE
  508. BEGIN_STATE(CommentEndDash)
  509. {
  510. ON('-')
  511. {
  512. SWITCH_TO(CommentEnd);
  513. }
  514. ON_EOF
  515. {
  516. TODO();
  517. }
  518. ANYTHING_ELSE
  519. {
  520. m_current_token.m_comment_or_character.data.append('-');
  521. RECONSUME_IN(Comment);
  522. }
  523. }
  524. BEGIN_STATE(CommentLessThanSign)
  525. {
  526. ON('!')
  527. {
  528. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  529. SWITCH_TO(CommentLessThanSignBang);
  530. }
  531. ON('<')
  532. {
  533. m_current_token.m_comment_or_character.data.append(current_input_character.value());
  534. continue;
  535. }
  536. ANYTHING_ELSE
  537. {
  538. RECONSUME_IN(Comment);
  539. }
  540. }
  541. END_STATE
  542. BEGIN_STATE(CommentLessThanSignBang)
  543. {
  544. ON('-')
  545. {
  546. SWITCH_TO(CommentLessThanSignBangDash);
  547. }
  548. ANYTHING_ELSE
  549. {
  550. RECONSUME_IN(Comment);
  551. }
  552. }
  553. END_STATE
  554. BEGIN_STATE(CommentLessThanSignBangDash)
  555. {
  556. ON('-')
  557. {
  558. SWITCH_TO(CommentLessThanSignBangDashDash);
  559. }
  560. ANYTHING_ELSE
  561. {
  562. RECONSUME_IN(Comment);
  563. }
  564. }
  565. END_STATE
  566. BEGIN_STATE(CommentLessThanSignBangDashDash)
  567. {
  568. ON('>')
  569. {
  570. SWITCH_TO(CommentEnd);
  571. }
  572. ANYTHING_ELSE
  573. {
  574. TODO();
  575. }
  576. }
  577. END_STATE
  578. BEGIN_STATE(CharacterReference)
  579. {
  580. }
  581. END_STATE
  582. default:
  583. ASSERT_NOT_REACHED();
  584. }
  585. }
  586. }
  587. void HTMLTokenizer::consume(const StringView& string)
  588. {
  589. ASSERT(next_few_characters_are(string));
  590. m_cursor += string.length();
  591. }
  592. bool HTMLTokenizer::next_few_characters_are(const StringView& string) const
  593. {
  594. for (size_t i = 0; i < string.length(); ++i) {
  595. auto codepoint = peek_codepoint(i);
  596. if (!codepoint.has_value())
  597. return false;
  598. // FIXME: This should be more Unicode-aware.
  599. if (codepoint.value() != (u32)string[i])
  600. return false;
  601. }
  602. return true;
  603. }
  604. void HTMLTokenizer::emit_current_token()
  605. {
  606. StringBuilder builder;
  607. switch (m_current_token.type()) {
  608. case HTMLToken::Type::DOCTYPE:
  609. builder.append("DOCTYPE");
  610. builder.append(" { name: '");
  611. builder.append(m_current_token.m_doctype.name.to_string());
  612. builder.append("' }");
  613. break;
  614. case HTMLToken::Type::StartTag:
  615. builder.append("StartTag");
  616. break;
  617. case HTMLToken::Type::EndTag:
  618. builder.append("EndTag");
  619. break;
  620. case HTMLToken::Type::Comment:
  621. builder.append("Comment");
  622. break;
  623. case HTMLToken::Type::Character:
  624. builder.append("Character");
  625. break;
  626. case HTMLToken::Type::EndOfFile:
  627. builder.append("EndOfFile");
  628. break;
  629. }
  630. if (m_current_token.type() == HTMLToken::Type::StartTag || m_current_token.type() == HTMLToken::Type::EndTag) {
  631. builder.append(" { name: '");
  632. builder.append(m_current_token.m_tag.tag_name.to_string());
  633. builder.append("', { ");
  634. for (auto& attribute : m_current_token.m_tag.attributes) {
  635. builder.append(attribute.name_builder.to_string());
  636. builder.append("=\"");
  637. builder.append(attribute.value_builder.to_string());
  638. builder.append("\" ");
  639. }
  640. builder.append("} }");
  641. }
  642. dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string();
  643. m_current_token = {};
  644. }
  645. void HTMLTokenizer::create_new_token(HTMLToken::Type type)
  646. {
  647. m_current_token = {};
  648. m_current_token.m_type = type;
  649. }
  650. HTMLTokenizer::HTMLTokenizer(const StringView& input)
  651. : m_input(input)
  652. {
  653. }
  654. void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
  655. {
  656. #ifdef TOKENIZER_TRACE
  657. dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
  658. #endif
  659. }
  660. void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
  661. {
  662. #ifdef TOKENIZER_TRACE
  663. dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
  664. #endif
  665. }
  666. }