HTMLTokenizer.cpp 114 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958
  1. /*
  2. * Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
  3. * Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
  4. *
  5. * SPDX-License-Identifier: BSD-2-Clause
  6. */
  7. #include <AK/CharacterTypes.h>
  8. #include <AK/Debug.h>
  9. #include <AK/GenericShorthands.h>
  10. #include <AK/SourceLocation.h>
  11. #include <LibTextCodec/Decoder.h>
  12. #include <LibWeb/HTML/Parser/Entities.h>
  13. #include <LibWeb/HTML/Parser/HTMLParser.h>
  14. #include <LibWeb/HTML/Parser/HTMLToken.h>
  15. #include <LibWeb/HTML/Parser/HTMLTokenizer.h>
  16. #include <LibWeb/Namespace.h>
  17. #include <string.h>
  18. namespace Web::HTML {
  // BEGIN_STATE declares a goto label for every tokenizer state; presumably not all of
  // them are ever jumped to, hence the unused-label warning is silenced for this file.
  #pragma GCC diagnostic ignored "-Wunused-label"

  // Reads the next code point into the local `current_input_character`, forwarding the
  // enclosing function's `stop_at_insertion_point`.
  // NOTE(review): the expansion already ends in a semicolon.
  #define CONSUME_NEXT_INPUT_CHARACTER \
      current_input_character = next_code_point(stop_at_insertion_point);

  // Enters State::target_state without requiring an empty m_current_builder:
  //  1. reports the transition through will_switch_to() and stores it in m_state,
  //  2. returns an empty result from the enclosing function if tokenization has to stop
  //     at the insertion point and that point has been reached,
  //  3. otherwise consumes the next code point and jumps to the label named after the state.
  #define SWITCH_TO_WITH_UNCLEAN_BUILDER(target_state) \
      do { \
          will_switch_to(State::target_state); \
          m_state = State::target_state; \
          if (stop_at_insertion_point == StopAtInsertionPoint::Yes && is_insertion_point_reached()) \
              return {}; \
          CONSUME_NEXT_INPUT_CHARACTER; \
          goto target_state; \
      } while (false)

  // Like SWITCH_TO_WITH_UNCLEAN_BUILDER, but first asserts that m_current_builder holds
  // no pending text.
  #define SWITCH_TO(target_state) \
      do { \
          VERIFY(m_current_builder.is_empty()); \
          SWITCH_TO_WITH_UNCLEAN_BUILDER(target_state); \
      } while (false)

  // Enters State::target_state and handles the *current* input character again there:
  // nothing is consumed before jumping to the state's label.
  #define RECONSUME_IN(target_state) \
      do { \
          will_reconsume_in(State::target_state); \
          m_state = State::target_state; \
          goto target_state; \
      } while (false)
  // Makes the saved m_return_state the current state. The target is a run-time value, so
  // instead of jumping to a state label control goes back to _StartOfFunction.
  #define SWITCH_TO_RETURN_STATE \
      do { \
          will_switch_to(m_return_state); \
          m_state = m_return_state; \
          goto _StartOfFunction; \
      } while (false)

  // Same as SWITCH_TO_RETURN_STATE, but the current input character is meant to be seen
  // again: when one was actually read (i.e. not end of input) the input is restored to
  // m_prev_utf8_iterator before control returns to _StartOfFunction.
  #define RECONSUME_IN_RETURN_STATE \
      do { \
          will_reconsume_in(m_return_state); \
          m_state = m_return_state; \
          if (current_input_character.has_value()) \
              restore_to(m_prev_utf8_iterator); \
          goto _StartOfFunction; \
      } while (false)
  // Records State::target_state as the current state, queues m_current_token and returns
  // the token at the front of m_queued_tokens from the enclosing function. No input is
  // consumed and no jump is made. m_current_builder must be empty.
  #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(target_state) \
      do { \
          VERIFY(m_current_builder.is_empty()); \
          will_switch_to(State::target_state); \
          m_state = State::target_state; \
          will_emit(m_current_token); \
          m_queued_tokens.enqueue(move(m_current_token)); \
          return m_queued_tokens.dequeue(); \
      } while (false)

  // Queues a character token for `emitted` (without returning it yet) and then handles the
  // current input character again in State::target_state.
  #define EMIT_CHARACTER_AND_RECONSUME_IN(emitted, target_state) \
      do { \
          m_queued_tokens.enqueue(HTMLToken::make_character(emitted)); \
          will_reconsume_in(State::target_state); \
          m_state = State::target_state; \
          goto target_state; \
      } while (false)
  // Hands every code point collected in m_temporary_buffer to its destination: outside of
  // an attribute each one is queued as its own Character token; inside an attribute it is
  // appended to m_current_builder instead. Nothing is returned from the enclosing function.
  #define FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE \
      do { \
          for (auto buffered_code_point : m_temporary_buffer) { \
              if (!consumed_as_part_of_an_attribute()) { \
                  create_new_token(HTMLToken::Type::Character); \
                  m_current_token.set_code_point(buffered_code_point); \
                  m_queued_tokens.enqueue(move(m_current_token)); \
              } else { \
                  m_current_builder.append_code_point(buffered_code_point); \
              } \
          } \
      } while (false)

  // Un-reads the code point fetched most recently by restoring the input to
  // m_prev_utf8_iterator.
  #define DONT_CONSUME_NEXT_INPUT_CHARACTER \
      do { \
          restore_to(m_prev_utf8_iterator); \
      } while (false)
  // Branch selectors used inside a state handler. Each one expands to a bare `if` that
  // tests the local `current_input_character` (an optional code point; empty means end of
  // input). They are independent `if`s, not an else-chain, so every selected branch has to
  // leave the handler itself (return / goto / continue).

  // Matches when a character is present and equal to `code_point`. The argument is
  // parenthesized so that expressions such as `cond ? 'a' : 'b'` compare as a whole instead
  // of being torn apart by the surrounding `==` / `&&`.
  #define ON(code_point) \
      if (current_input_character.has_value() && current_input_character.value() == (code_point))

  // Matches at end of input (no current character).
  #define ON_EOF \
      if (!current_input_character.has_value())

  // Character-class selectors; each requires a character to be present.
  #define ON_ASCII_ALPHA \
      if (current_input_character.has_value() && is_ascii_alpha(current_input_character.value()))
  #define ON_ASCII_ALPHANUMERIC \
      if (current_input_character.has_value() && is_ascii_alphanumeric(current_input_character.value()))
  #define ON_ASCII_UPPER_ALPHA \
      if (current_input_character.has_value() && is_ascii_upper_alpha(current_input_character.value()))
  #define ON_ASCII_LOWER_ALPHA \
      if (current_input_character.has_value() && is_ascii_lower_alpha(current_input_character.value()))
  #define ON_ASCII_DIGIT \
      if (current_input_character.has_value() && is_ascii_digit(current_input_character.value()))
  #define ON_ASCII_HEX_DIGIT \
      if (current_input_character.has_value() && is_ascii_hex_digit(current_input_character.value()))

  // Matches tab, line feed, form feed and space. The is_ascii() guard runs before the
  // narrowing cast to char so that larger code points cannot alias one of these four.
  #define ON_WHITESPACE \
      if (current_input_character.has_value() && is_ascii(*current_input_character) && first_is_one_of(static_cast<char>(*current_input_character), '\t', '\n', '\f', ' '))

  // Unconditional fallback branch; must come last in a handler.
  #define ANYTHING_ELSE if (1)
  // Emits the end-of-file token once. If it has already been emitted the enclosing
  // function returns an empty result; otherwise the flag is set, a fresh EndOfFile token
  // is queued and the token at the front of m_queued_tokens is returned.
  #define EMIT_EOF \
      do { \
          if (m_has_emitted_eof) \
              return {}; \
          m_has_emitted_eof = true; \
          create_new_token(HTMLToken::Type::EndOfFile); \
          will_emit(m_current_token); \
          m_queued_tokens.enqueue(move(m_current_token)); \
          return m_queued_tokens.dequeue(); \
      } while (false)

  // Queues m_current_token, then an EndOfFile token right behind it, and returns the token
  // at the front of m_queued_tokens. Unlike EMIT_EOF this does not test m_has_emitted_eof
  // first; it only sets it. m_current_builder must be empty.
  #define EMIT_CURRENT_TOKEN_FOLLOWED_BY_EOF \
      do { \
          VERIFY(m_current_builder.is_empty()); \
          will_emit(m_current_token); \
          m_queued_tokens.enqueue(move(m_current_token)); \
          \
          m_has_emitted_eof = true; \
          create_new_token(HTMLToken::Type::EndOfFile); \
          will_emit(m_current_token); \
          m_queued_tokens.enqueue(move(m_current_token)); \
          \
          return m_queued_tokens.dequeue(); \
      } while (false)
  // Builds a Character token carrying `emitted`, queues it and returns the token at the
  // front of m_queued_tokens from the enclosing function.
  #define EMIT_CHARACTER(emitted) \
      do { \
          create_new_token(HTMLToken::Type::Character); \
          m_current_token.set_code_point(emitted); \
          m_queued_tokens.enqueue(move(m_current_token)); \
          return m_queued_tokens.dequeue(); \
      } while (false)

  // EMIT_CHARACTER for the character just read; must not be used at end of input.
  // NOTE(review): the expansion already ends in a semicolon.
  #define EMIT_CURRENT_CHARACTER \
      EMIT_CHARACTER(current_input_character.value());

  // Records State::target_state as the current state, then emits `emitted` as with
  // EMIT_CHARACTER. No input is consumed and no jump is made.
  #define SWITCH_TO_AND_EMIT_CHARACTER(emitted, target_state) \
      do { \
          will_switch_to(State::target_state); \
          m_state = State::target_state; \
          EMIT_CHARACTER(emitted); \
      } while (false)

  // SWITCH_TO_AND_EMIT_CHARACTER for the character just read.
  #define SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(target_state) \
      SWITCH_TO_AND_EMIT_CHARACTER(current_input_character.value(), target_state)
  // clang-format-18 handles the `state:` label rather badly.
  // clang-format off
  // Opens the handler for one tokenizer state. It provides both a goto label named after
  // the state (the target of SWITCH_TO / RECONSUME_IN) and the matching `case` of the
  // switch on m_state, and leaves three brace scopes open for END_STATE to close.
  #define BEGIN_STATE(state_name) \
      state_name: \
      case State::state_name: { \
          { \
              {
  // clang-format on
  // Closes the scopes opened by BEGIN_STATE. Every branch of a handler is expected to
  // leave via return / goto / continue; falling off the end trips VERIFY_NOT_REACHED().
  #define END_STATE \
      VERIFY_NOT_REACHED(); \
      break; \
      } \
      } \
      }
  161. static inline void log_parse_error(SourceLocation const& location = SourceLocation::current())
  162. {
  163. dbgln_if(TOKENIZER_TRACE_DEBUG, "Parse error (tokenization) {}", location);
  164. }
  165. Optional<u32> HTMLTokenizer::next_code_point(StopAtInsertionPoint stop_at_insertion_point)
  166. {
  167. if (m_utf8_iterator == m_utf8_view.end())
  168. return {};
  169. u32 code_point;
  170. // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
  171. // https://infra.spec.whatwg.org/#normalize-newlines
  172. if (peek_code_point(0, stop_at_insertion_point).value_or(0) == '\r' && peek_code_point(1, stop_at_insertion_point).value_or(0) == '\n') {
  173. // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
  174. skip(2);
  175. code_point = '\n';
  176. } else if (peek_code_point(0, stop_at_insertion_point).value_or(0) == '\r') {
  177. // replace every remaining U+000D CR code point with a U+000A LF code point.
  178. skip(1);
  179. code_point = '\n';
  180. } else {
  181. skip(1);
  182. code_point = *m_prev_utf8_iterator;
  183. }
  184. dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", code_point);
  185. return code_point;
  186. }
  187. void HTMLTokenizer::skip(size_t count)
  188. {
  189. if (!m_source_positions.is_empty())
  190. m_source_positions.append(m_source_positions.last());
  191. for (size_t i = 0; i < count; ++i) {
  192. m_prev_utf8_iterator = m_utf8_iterator;
  193. auto code_point = *m_utf8_iterator;
  194. if (!m_source_positions.is_empty()) {
  195. if (code_point == '\n') {
  196. m_source_positions.last().column = 0;
  197. m_source_positions.last().line++;
  198. } else {
  199. m_source_positions.last().column++;
  200. }
  201. m_source_positions.last().byte_offset += m_utf8_iterator.underlying_code_point_length_in_bytes();
  202. }
  203. ++m_utf8_iterator;
  204. }
  205. }
  206. Optional<u32> HTMLTokenizer::peek_code_point(size_t offset, StopAtInsertionPoint stop_at_insertion_point) const
  207. {
  208. auto it = m_utf8_iterator;
  209. for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
  210. ++it;
  211. if (stop_at_insertion_point == StopAtInsertionPoint::Yes
  212. && m_insertion_point.defined
  213. && m_utf8_view.byte_offset_of(it) >= m_insertion_point.position) {
  214. return {};
  215. }
  216. if (it == m_utf8_view.end())
  217. return {};
  218. return *it;
  219. }
  220. HTMLToken::Position HTMLTokenizer::nth_last_position(size_t n)
  221. {
  222. if (n + 1 > m_source_positions.size()) {
  223. dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer::nth_last_position) Invalid position requested: {}th-last of {}. Returning (0-0).", n, m_source_positions.size());
  224. return HTMLToken::Position { 0, 0 };
  225. };
  226. return m_source_positions.at(m_source_positions.size() - 1 - n);
  227. }
  228. Optional<HTMLToken> HTMLTokenizer::next_token(StopAtInsertionPoint stop_at_insertion_point)
  229. {
  230. if (!m_source_positions.is_empty()) {
  231. auto last_position = m_source_positions.last();
  232. m_source_positions.clear_with_capacity();
  233. m_source_positions.append(move(last_position));
  234. }
  235. _StartOfFunction:
  236. if (!m_queued_tokens.is_empty())
  237. return m_queued_tokens.dequeue();
  238. if (m_aborted)
  239. return {};
  240. for (;;) {
  241. if (stop_at_insertion_point == StopAtInsertionPoint::Yes && is_insertion_point_reached())
  242. return {};
  243. auto current_input_character = next_code_point(stop_at_insertion_point);
  244. switch (m_state) {
  245. // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
  246. BEGIN_STATE(Data)
  247. {
  248. ON('&')
  249. {
  250. m_return_state = State::Data;
  251. SWITCH_TO(CharacterReference);
  252. }
  253. ON('<')
  254. {
  255. SWITCH_TO(TagOpen);
  256. }
  257. ON(0)
  258. {
  259. log_parse_error();
  260. EMIT_CURRENT_CHARACTER;
  261. }
  262. ON_EOF
  263. {
  264. EMIT_EOF;
  265. }
  266. ANYTHING_ELSE
  267. {
  268. EMIT_CURRENT_CHARACTER;
  269. }
  270. }
  271. END_STATE
  272. // 13.2.5.6 Tag open state, https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
  273. BEGIN_STATE(TagOpen)
  274. {
  275. ON('!')
  276. {
  277. SWITCH_TO(MarkupDeclarationOpen);
  278. }
  279. ON('/')
  280. {
  281. SWITCH_TO(EndTagOpen);
  282. }
  283. ON_ASCII_ALPHA
  284. {
  285. create_new_token(HTMLToken::Type::StartTag);
  286. RECONSUME_IN(TagName);
  287. }
  288. ON('?')
  289. {
  290. log_parse_error();
  291. create_new_token(HTMLToken::Type::Comment);
  292. m_current_token.set_start_position({}, nth_last_position(2));
  293. RECONSUME_IN(BogusComment);
  294. }
  295. ON_EOF
  296. {
  297. log_parse_error();
  298. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  299. EMIT_EOF;
  300. }
  301. ANYTHING_ELSE
  302. {
  303. log_parse_error();
  304. EMIT_CHARACTER_AND_RECONSUME_IN('<', Data);
  305. }
  306. }
  307. END_STATE
  308. // 13.2.5.8 Tag name state, https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
  309. BEGIN_STATE(TagName)
  310. {
  311. ON_WHITESPACE
  312. {
  313. m_current_token.set_tag_name(consume_current_builder());
  314. m_current_token.set_end_position({}, nth_last_position(1));
  315. SWITCH_TO(BeforeAttributeName);
  316. }
  317. ON('/')
  318. {
  319. m_current_token.set_tag_name(consume_current_builder());
  320. m_current_token.set_end_position({}, nth_last_position(0));
  321. SWITCH_TO(SelfClosingStartTag);
  322. }
  323. ON('>')
  324. {
  325. m_current_token.set_tag_name(consume_current_builder());
  326. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  327. }
  328. ON_ASCII_UPPER_ALPHA
  329. {
  330. m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
  331. m_current_token.set_end_position({}, nth_last_position(0));
  332. continue;
  333. }
  334. ON(0)
  335. {
  336. log_parse_error();
  337. m_current_builder.append_code_point(0xFFFD);
  338. m_current_token.set_end_position({}, nth_last_position(0));
  339. continue;
  340. }
  341. ON_EOF
  342. {
  343. log_parse_error();
  344. EMIT_EOF;
  345. }
  346. ANYTHING_ELSE
  347. {
  348. m_current_builder.append_code_point(current_input_character.value());
  349. m_current_token.set_end_position({}, nth_last_position(0));
  350. continue;
  351. }
  352. }
  353. END_STATE
  354. // 13.2.5.7 End tag open state, https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
  355. BEGIN_STATE(EndTagOpen)
  356. {
  357. ON_ASCII_ALPHA
  358. {
  359. create_new_token(HTMLToken::Type::EndTag);
  360. RECONSUME_IN(TagName);
  361. }
  362. ON('>')
  363. {
  364. log_parse_error();
  365. SWITCH_TO(Data);
  366. }
  367. ON_EOF
  368. {
  369. log_parse_error();
  370. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  371. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  372. EMIT_EOF;
  373. }
  374. ANYTHING_ELSE
  375. {
  376. log_parse_error();
  377. create_new_token(HTMLToken::Type::Comment);
  378. RECONSUME_IN(BogusComment);
  379. }
  380. }
  381. END_STATE
  382. // 13.2.5.42 Markup declaration open state, https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
  383. BEGIN_STATE(MarkupDeclarationOpen)
  384. {
  385. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  386. switch (consume_next_if_match("--"sv, stop_at_insertion_point)) {
  387. case ConsumeNextResult::Consumed:
  388. create_new_token(HTMLToken::Type::Comment);
  389. m_current_token.set_start_position({}, nth_last_position(3));
  390. SWITCH_TO(CommentStart);
  391. break;
  392. case ConsumeNextResult::NotConsumed:
  393. break;
  394. case ConsumeNextResult::RanOutOfCharacters:
  395. return {};
  396. }
  397. switch (consume_next_if_match("DOCTYPE"sv, stop_at_insertion_point, CaseSensitivity::CaseInsensitive)) {
  398. case ConsumeNextResult::Consumed:
  399. SWITCH_TO(DOCTYPE);
  400. break;
  401. case ConsumeNextResult::NotConsumed:
  402. break;
  403. case ConsumeNextResult::RanOutOfCharacters:
  404. return {};
  405. }
  406. switch (consume_next_if_match("[CDATA["sv, stop_at_insertion_point)) {
  407. case ConsumeNextResult::Consumed:
  408. // We keep the parser optional so that syntax highlighting can be lexer-only.
  409. // The parser registers itself with the lexer it creates.
  410. if (m_parser != nullptr
  411. && m_parser->adjusted_current_node()
  412. && m_parser->adjusted_current_node()->namespace_uri() != Namespace::HTML) {
  413. SWITCH_TO(CDATASection);
  414. } else {
  415. create_new_token(HTMLToken::Type::Comment);
  416. m_current_builder.append("[CDATA["sv);
  417. SWITCH_TO_WITH_UNCLEAN_BUILDER(BogusComment);
  418. }
  419. break;
  420. case ConsumeNextResult::NotConsumed:
  421. break;
  422. case ConsumeNextResult::RanOutOfCharacters:
  423. return {};
  424. }
  425. ANYTHING_ELSE
  426. {
  427. log_parse_error();
  428. create_new_token(HTMLToken::Type::Comment);
  429. SWITCH_TO(BogusComment);
  430. }
  431. }
  432. END_STATE
  433. // 13.2.5.41 Bogus comment state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
  434. BEGIN_STATE(BogusComment)
  435. {
  436. ON('>')
  437. {
  438. m_current_token.set_comment(consume_current_builder());
  439. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  440. }
  441. ON_EOF
  442. {
  443. m_queued_tokens.enqueue(move(m_current_token));
  444. EMIT_EOF;
  445. }
  446. ON(0)
  447. {
  448. log_parse_error();
  449. m_current_builder.append_code_point(0xFFFD);
  450. continue;
  451. }
  452. ANYTHING_ELSE
  453. {
  454. m_current_builder.append_code_point(current_input_character.value());
  455. continue;
  456. }
  457. }
  458. END_STATE
  459. // 13.2.5.53 DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
  460. BEGIN_STATE(DOCTYPE)
  461. {
  462. ON_WHITESPACE
  463. {
  464. SWITCH_TO(BeforeDOCTYPEName);
  465. }
  466. ON('>')
  467. {
  468. RECONSUME_IN(BeforeDOCTYPEName);
  469. }
  470. ON_EOF
  471. {
  472. log_parse_error();
  473. create_new_token(HTMLToken::Type::DOCTYPE);
  474. m_current_token.ensure_doctype_data().force_quirks = true;
  475. m_queued_tokens.enqueue(move(m_current_token));
  476. EMIT_EOF;
  477. }
  478. ANYTHING_ELSE
  479. {
  480. log_parse_error();
  481. RECONSUME_IN(BeforeDOCTYPEName);
  482. }
  483. }
  484. END_STATE
  485. // 13.2.5.54 Before DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
  486. BEGIN_STATE(BeforeDOCTYPEName)
  487. {
  488. ON_WHITESPACE
  489. {
  490. continue;
  491. }
  492. ON_ASCII_UPPER_ALPHA
  493. {
  494. create_new_token(HTMLToken::Type::DOCTYPE);
  495. m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
  496. m_current_token.ensure_doctype_data().missing_name = false;
  497. SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName);
  498. }
  499. ON(0)
  500. {
  501. log_parse_error();
  502. create_new_token(HTMLToken::Type::DOCTYPE);
  503. m_current_builder.append_code_point(0xFFFD);
  504. m_current_token.ensure_doctype_data().missing_name = false;
  505. SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName);
  506. }
  507. ON('>')
  508. {
  509. log_parse_error();
  510. create_new_token(HTMLToken::Type::DOCTYPE);
  511. m_current_token.ensure_doctype_data().force_quirks = true;
  512. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  513. }
  514. ON_EOF
  515. {
  516. log_parse_error();
  517. create_new_token(HTMLToken::Type::DOCTYPE);
  518. m_current_token.ensure_doctype_data().force_quirks = true;
  519. m_queued_tokens.enqueue(move(m_current_token));
  520. EMIT_EOF;
  521. }
  522. ANYTHING_ELSE
  523. {
  524. create_new_token(HTMLToken::Type::DOCTYPE);
  525. m_current_builder.append_code_point(current_input_character.value());
  526. m_current_token.ensure_doctype_data().missing_name = false;
  527. SWITCH_TO_WITH_UNCLEAN_BUILDER(DOCTYPEName);
  528. }
  529. }
  530. END_STATE
  531. // 13.2.5.55 DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
  532. BEGIN_STATE(DOCTYPEName)
  533. {
  534. ON_WHITESPACE
  535. {
  536. m_current_token.ensure_doctype_data().name = consume_current_builder();
  537. SWITCH_TO(AfterDOCTYPEName);
  538. }
  539. ON('>')
  540. {
  541. m_current_token.ensure_doctype_data().name = consume_current_builder();
  542. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  543. }
  544. ON_ASCII_UPPER_ALPHA
  545. {
  546. m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
  547. continue;
  548. }
  549. ON(0)
  550. {
  551. log_parse_error();
  552. m_current_builder.append_code_point(0xFFFD);
  553. continue;
  554. }
  555. ON_EOF
  556. {
  557. log_parse_error();
  558. m_current_token.ensure_doctype_data().force_quirks = true;
  559. m_queued_tokens.enqueue(move(m_current_token));
  560. EMIT_EOF;
  561. }
  562. ANYTHING_ELSE
  563. {
  564. m_current_builder.append_code_point(current_input_character.value());
  565. continue;
  566. }
  567. }
  568. END_STATE
  569. // 13.2.5.56 After DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
  570. BEGIN_STATE(AfterDOCTYPEName)
  571. {
  572. ON_WHITESPACE
  573. {
  574. continue;
  575. }
  576. ON('>')
  577. {
  578. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  579. }
  580. ON_EOF
  581. {
  582. log_parse_error();
  583. m_current_token.ensure_doctype_data().force_quirks = true;
  584. m_queued_tokens.enqueue(move(m_current_token));
  585. EMIT_EOF;
  586. }
  587. ANYTHING_ELSE
  588. {
  589. if (to_ascii_uppercase(current_input_character.value()) == 'P') {
  590. switch (consume_next_if_match("UBLIC"sv, stop_at_insertion_point, CaseSensitivity::CaseInsensitive)) {
  591. case ConsumeNextResult::Consumed:
  592. SWITCH_TO(AfterDOCTYPEPublicKeyword);
  593. break;
  594. case ConsumeNextResult::NotConsumed:
  595. break;
  596. case ConsumeNextResult::RanOutOfCharacters:
  597. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  598. return {};
  599. }
  600. }
  601. if (to_ascii_uppercase(current_input_character.value()) == 'S') {
  602. switch (consume_next_if_match("YSTEM"sv, stop_at_insertion_point, CaseSensitivity::CaseInsensitive)) {
  603. case ConsumeNextResult::Consumed:
  604. SWITCH_TO(AfterDOCTYPESystemKeyword);
  605. break;
  606. case ConsumeNextResult::NotConsumed:
  607. break;
  608. case ConsumeNextResult::RanOutOfCharacters:
  609. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  610. return {};
  611. }
  612. }
  613. log_parse_error();
  614. m_current_token.ensure_doctype_data().force_quirks = true;
  615. RECONSUME_IN(BogusDOCTYPE);
  616. }
  617. }
  618. END_STATE
  619. // 13.2.5.57 After DOCTYPE public keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
  // Runs directly after the PUBLIC keyword. Whitespace leads to BeforeDOCTYPEPublicIdentifier; a quote
  // immediately after the keyword is logged as a parse error but still starts the identifier; '>',
  // EOF and any other character set force_quirks.
  620. BEGIN_STATE(AfterDOCTYPEPublicKeyword)
  621. {
  622. ON_WHITESPACE
  623. {
  624. SWITCH_TO(BeforeDOCTYPEPublicIdentifier);
  625. }
  626. ON('"')
  627. {
  // Spec: missing-whitespace-after-doctype-public-keyword. The identifier is marked as present.
  628. log_parse_error();
  629. m_current_token.ensure_doctype_data().missing_public_identifier = false;
  630. SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted);
  631. }
  632. ON('\'')
  633. {
  634. log_parse_error();
  635. m_current_token.ensure_doctype_data().missing_public_identifier = false;
  636. SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted);
  637. }
  638. ON('>')
  639. {
  // Spec: missing-doctype-public-identifier.
  640. log_parse_error();
  641. m_current_token.ensure_doctype_data().force_quirks = true;
  642. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  643. }
  644. ON_EOF
  645. {
  646. log_parse_error();
  647. m_current_token.ensure_doctype_data().force_quirks = true;
  648. m_queued_tokens.enqueue(move(m_current_token));
  649. EMIT_EOF;
  650. }
  651. ANYTHING_ELSE
  652. {
  // Spec: missing-quote-before-doctype-public-identifier.
  653. log_parse_error();
  654. m_current_token.ensure_doctype_data().force_quirks = true;
  655. RECONSUME_IN(BogusDOCTYPE);
  656. }
  657. }
  658. END_STATE
  659. // 13.2.5.63 After DOCTYPE system keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
  // Runs directly after the SYSTEM keyword. Whitespace leads to BeforeDOCTYPESystemIdentifier; a quote
  // immediately after the keyword is a parse error but still starts the identifier; '>', EOF and any
  // other character set force_quirks.
  660. BEGIN_STATE(AfterDOCTYPESystemKeyword)
  661. {
  662. ON_WHITESPACE
  663. {
  664. SWITCH_TO(BeforeDOCTYPESystemIdentifier);
  665. }
  666. ON('"')
  667. {
  // Spec: missing-whitespace-after-doctype-system-keyword. The identifier is reset to an empty
  // value and marked as present before its characters are collected.
  668. log_parse_error();
  669. m_current_token.ensure_doctype_data().system_identifier = {};
  670. m_current_token.ensure_doctype_data().missing_system_identifier = false;
  671. SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
  672. }
  673. ON('\'')
  674. {
  675. log_parse_error();
  676. m_current_token.ensure_doctype_data().system_identifier = {};
  677. m_current_token.ensure_doctype_data().missing_system_identifier = false;
  678. SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
  679. }
  680. ON('>')
  681. {
  // Spec: missing-doctype-system-identifier.
  682. log_parse_error();
  683. m_current_token.ensure_doctype_data().force_quirks = true;
  684. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  685. }
  686. ON_EOF
  687. {
  688. log_parse_error();
  689. m_current_token.ensure_doctype_data().force_quirks = true;
  690. m_queued_tokens.enqueue(move(m_current_token));
  691. EMIT_EOF;
  692. }
  693. ANYTHING_ELSE
  694. {
  // Spec: missing-quote-before-doctype-system-identifier.
  695. log_parse_error();
  696. m_current_token.ensure_doctype_data().force_quirks = true;
  697. RECONSUME_IN(BogusDOCTYPE);
  698. }
  699. }
  700. END_STATE
  701. // 13.2.5.58 Before DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
  // Skips whitespace between the PUBLIC keyword and the identifier. An opening quote marks the public
  // identifier as present and starts collecting it; everything else is an error that sets force_quirks.
  702. BEGIN_STATE(BeforeDOCTYPEPublicIdentifier)
  703. {
  704. ON_WHITESPACE
  705. {
  706. continue;
  707. }
  708. ON('"')
  709. {
  710. m_current_token.ensure_doctype_data().missing_public_identifier = false;
  711. SWITCH_TO(DOCTYPEPublicIdentifierDoubleQuoted);
  712. }
  713. ON('\'')
  714. {
  715. m_current_token.ensure_doctype_data().missing_public_identifier = false;
  716. SWITCH_TO(DOCTYPEPublicIdentifierSingleQuoted);
  717. }
  718. ON('>')
  719. {
  // Spec: missing-doctype-public-identifier.
  720. log_parse_error();
  721. m_current_token.ensure_doctype_data().force_quirks = true;
  722. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  723. }
  724. ON_EOF
  725. {
  726. log_parse_error();
  727. m_current_token.ensure_doctype_data().force_quirks = true;
  728. m_queued_tokens.enqueue(move(m_current_token));
  729. EMIT_EOF;
  730. }
  731. ANYTHING_ELSE
  732. {
  // Spec: missing-quote-before-doctype-public-identifier.
  733. log_parse_error();
  734. m_current_token.ensure_doctype_data().force_quirks = true;
  735. RECONSUME_IN(BogusDOCTYPE);
  736. }
  737. }
  738. END_STATE
  739. // 13.2.5.64 Before DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
  // Skips whitespace between the SYSTEM keyword and the identifier. An opening quote marks the system
  // identifier as present and starts collecting it; everything else is an error that sets force_quirks.
  740. BEGIN_STATE(BeforeDOCTYPESystemIdentifier)
  741. {
  742. ON_WHITESPACE
  743. {
  744. continue;
  745. }
  746. ON('"')
  747. {
  748. m_current_token.ensure_doctype_data().missing_system_identifier = false;
  749. SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
  750. }
  751. ON('\'')
  752. {
  753. m_current_token.ensure_doctype_data().missing_system_identifier = false;
  754. SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
  755. }
  756. ON('>')
  757. {
  // Spec: missing-doctype-system-identifier.
  758. log_parse_error();
  759. m_current_token.ensure_doctype_data().force_quirks = true;
  760. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  761. }
  762. ON_EOF
  763. {
  764. log_parse_error();
  765. m_current_token.ensure_doctype_data().force_quirks = true;
  766. m_queued_tokens.enqueue(move(m_current_token));
  767. EMIT_EOF;
  768. }
  769. ANYTHING_ELSE
  770. {
  // Spec: missing-quote-before-doctype-system-identifier.
  771. log_parse_error();
  772. m_current_token.ensure_doctype_data().force_quirks = true;
  773. RECONSUME_IN(BogusDOCTYPE);
  774. }
  775. }
  776. END_STATE
  777. // 13.2.5.59 DOCTYPE public identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(double-quoted)-state
  778. BEGIN_STATE(DOCTYPEPublicIdentifierDoubleQuoted)
  779. {
  780. ON('"')
  781. {
  782. m_current_token.ensure_doctype_data().public_identifier = consume_current_builder();
  783. SWITCH_TO(AfterDOCTYPEPublicIdentifier);
  784. }
  785. ON(0)
  786. {
  787. log_parse_error();
  788. m_current_builder.append_code_point(0xFFFD);
  789. continue;
  790. }
  791. ON('>')
  792. {
  793. log_parse_error();
  794. m_current_token.ensure_doctype_data().public_identifier = consume_current_builder();
  795. m_current_token.ensure_doctype_data().force_quirks = true;
  796. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  797. }
  798. ON_EOF
  799. {
  800. log_parse_error();
  801. m_current_token.ensure_doctype_data().force_quirks = true;
  802. m_queued_tokens.enqueue(move(m_current_token));
  803. EMIT_EOF;
  804. }
  805. ANYTHING_ELSE
  806. {
  807. m_current_builder.append_code_point(current_input_character.value());
  808. continue;
  809. }
  810. }
  811. END_STATE
  812. // 13.2.5.60 DOCTYPE public identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-(single-quoted)-state
  813. BEGIN_STATE(DOCTYPEPublicIdentifierSingleQuoted)
  814. {
  815. ON('\'')
  816. {
  817. m_current_token.ensure_doctype_data().public_identifier = consume_current_builder();
  818. SWITCH_TO(AfterDOCTYPEPublicIdentifier);
  819. }
  820. ON(0)
  821. {
  822. log_parse_error();
  823. m_current_builder.append_code_point(0xFFFD);
  824. continue;
  825. }
  826. ON('>')
  827. {
  828. log_parse_error();
  829. m_current_token.ensure_doctype_data().public_identifier = consume_current_builder();
  830. m_current_token.ensure_doctype_data().force_quirks = true;
  831. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  832. }
  833. ON_EOF
  834. {
  835. log_parse_error();
  836. m_current_token.ensure_doctype_data().force_quirks = true;
  837. m_queued_tokens.enqueue(move(m_current_token));
  838. EMIT_EOF;
  839. }
  840. ANYTHING_ELSE
  841. {
  842. m_current_builder.append_code_point(current_input_character.value());
  843. continue;
  844. }
  845. }
  846. END_STATE
  847. // 13.2.5.65 DOCTYPE system identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(double-quoted)-state
  848. BEGIN_STATE(DOCTYPESystemIdentifierDoubleQuoted)
  849. {
  850. ON('"')
  851. {
  852. m_current_token.ensure_doctype_data().system_identifier = consume_current_builder();
  853. SWITCH_TO(AfterDOCTYPESystemIdentifier);
  854. }
  855. ON(0)
  856. {
  857. log_parse_error();
  858. m_current_builder.append_code_point(0xFFFD);
  859. continue;
  860. }
  861. ON('>')
  862. {
  863. log_parse_error();
  864. m_current_token.ensure_doctype_data().system_identifier = consume_current_builder();
  865. m_current_token.ensure_doctype_data().force_quirks = true;
  866. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  867. }
  868. ON_EOF
  869. {
  870. log_parse_error();
  871. m_current_token.ensure_doctype_data().force_quirks = true;
  872. m_queued_tokens.enqueue(move(m_current_token));
  873. EMIT_EOF;
  874. }
  875. ANYTHING_ELSE
  876. {
  877. m_current_builder.append_code_point(current_input_character.value());
  878. continue;
  879. }
  880. }
  881. END_STATE
  882. // 13.2.5.66 DOCTYPE system identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-(single-quoted)-state
  883. BEGIN_STATE(DOCTYPESystemIdentifierSingleQuoted)
  884. {
  885. ON('\'')
  886. {
  887. m_current_token.ensure_doctype_data().system_identifier = consume_current_builder();
  888. SWITCH_TO(AfterDOCTYPESystemIdentifier);
  889. }
  890. ON(0)
  891. {
  892. log_parse_error();
  893. m_current_builder.append_code_point(0xFFFD);
  894. continue;
  895. }
  896. ON('>')
  897. {
  898. log_parse_error();
  899. m_current_token.ensure_doctype_data().system_identifier = consume_current_builder();
  900. m_current_token.ensure_doctype_data().force_quirks = true;
  901. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  902. }
  903. ON_EOF
  904. {
  905. log_parse_error();
  906. m_current_token.ensure_doctype_data().force_quirks = true;
  907. m_queued_tokens.enqueue(move(m_current_token));
  908. EMIT_EOF;
  909. }
  910. ANYTHING_ELSE
  911. {
  912. m_current_builder.append_code_point(current_input_character.value());
  913. continue;
  914. }
  915. }
  916. END_STATE
  917. // 13.2.5.61 After DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
  // Runs after the closing quote of the public identifier. '>' emits the token; whitespace moves on to
  // the gap before an optional system identifier; a quote with no whitespace before it is a parse
  // error but still starts the system identifier.
  918. BEGIN_STATE(AfterDOCTYPEPublicIdentifier)
  919. {
  920. ON_WHITESPACE
  921. {
  922. SWITCH_TO(BetweenDOCTYPEPublicAndSystemIdentifiers);
  923. }
  924. ON('>')
  925. {
  926. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  927. }
  928. ON('"')
  929. {
  // Spec: missing-whitespace-between-doctype-public-and-system-identifiers.
  930. log_parse_error();
  931. m_current_token.ensure_doctype_data().missing_system_identifier = false;
  932. SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
  933. }
  934. ON('\'')
  935. {
  936. log_parse_error();
  937. m_current_token.ensure_doctype_data().missing_system_identifier = false;
  938. SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
  939. }
  940. ON_EOF
  941. {
  942. log_parse_error();
  943. m_current_token.ensure_doctype_data().force_quirks = true;
  944. m_queued_tokens.enqueue(move(m_current_token));
  945. EMIT_EOF;
  946. }
  947. ANYTHING_ELSE
  948. {
  // Spec: missing-quote-before-doctype-system-identifier.
  949. log_parse_error();
  950. m_current_token.ensure_doctype_data().force_quirks = true;
  951. RECONSUME_IN(BogusDOCTYPE);
  952. }
  953. }
  954. END_STATE
  955. // 13.2.5.62 Between DOCTYPE public and system identifiers state, https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
  // Skips whitespace after the public identifier. '>' emits the token; a quote starts the system
  // identifier without a parse error; everything else sets force_quirks.
  956. BEGIN_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers)
  957. {
  958. ON_WHITESPACE
  959. {
  960. continue;
  961. }
  962. ON('>')
  963. {
  964. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  965. }
  966. ON('"')
  967. {
  968. m_current_token.ensure_doctype_data().missing_system_identifier = false;
  969. SWITCH_TO(DOCTYPESystemIdentifierDoubleQuoted);
  970. }
  971. ON('\'')
  972. {
  973. m_current_token.ensure_doctype_data().missing_system_identifier = false;
  974. SWITCH_TO(DOCTYPESystemIdentifierSingleQuoted);
  975. }
  976. ON_EOF
  977. {
  978. log_parse_error();
  979. m_current_token.ensure_doctype_data().force_quirks = true;
  980. m_queued_tokens.enqueue(move(m_current_token));
  981. EMIT_EOF;
  982. }
  983. ANYTHING_ELSE
  984. {
  // Spec: missing-quote-before-doctype-system-identifier.
  985. log_parse_error();
  986. m_current_token.ensure_doctype_data().force_quirks = true;
  987. RECONSUME_IN(BogusDOCTYPE);
  988. }
  989. }
  990. END_STATE
  991. // 13.2.5.67 After DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
  // Runs after the closing quote of the system identifier: whitespace is skipped and '>' emits the
  // token. Only the EOF branch sets force_quirks here.
  992. BEGIN_STATE(AfterDOCTYPESystemIdentifier)
  993. {
  994. ON_WHITESPACE
  995. {
  996. continue;
  997. }
  998. ON('>')
  999. {
  1000. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1001. }
  1002. ON_EOF
  1003. {
  1004. log_parse_error();
  1005. m_current_token.ensure_doctype_data().force_quirks = true;
  1006. m_queued_tokens.enqueue(move(m_current_token));
  1007. EMIT_EOF;
  1008. }
  1009. ANYTHING_ELSE
  1010. {
  // Spec: unexpected-character-after-doctype-system-identifier. Unlike the other DOCTYPE error
  // paths, force_quirks is not set here; the linked spec section says the same.
  1011. log_parse_error();
  1012. RECONSUME_IN(BogusDOCTYPE);
  1013. }
  1014. }
  1015. END_STATE
  1016. // 13.2.5.68 Bogus DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
  // Discards the rest of a malformed DOCTYPE: every character is ignored until '>' or end of input,
  // at which point the current token is emitted without further changes.
  1017. BEGIN_STATE(BogusDOCTYPE)
  1018. {
  1019. ON('>')
  1020. {
  1021. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1022. }
  1023. ON(0)
  1024. {
  // NUL is ignored like any other character, but is also logged (spec: unexpected-null-character).
  1025. log_parse_error();
  1026. continue;
  1027. }
  1028. ON_EOF
  1029. {
  1030. m_queued_tokens.enqueue(move(m_current_token));
  1031. EMIT_EOF;
  1032. }
  1033. ANYTHING_ELSE
  1034. {
  1035. continue;
  1036. }
  1037. }
  1038. END_STATE
  1039. // 13.2.5.32 Before attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
  // Skips whitespace before an attribute. '/', '>' and EOF are handed to AfterAttributeName; any other
  // character starts a new attribute on the current token, recording where its name begins.
  1040. BEGIN_STATE(BeforeAttributeName)
  1041. {
  1042. ON_WHITESPACE
  1043. {
  1044. continue;
  1045. }
  1046. ON('/')
  1047. {
  1048. RECONSUME_IN(AfterAttributeName);
  1049. }
  1050. ON('>')
  1051. {
  1052. RECONSUME_IN(AfterAttributeName);
  1053. }
  1054. ON_EOF
  1055. {
  1056. RECONSUME_IN(AfterAttributeName);
  1057. }
  1058. ON('=')
  1059. {
  // Spec: unexpected-equals-sign-before-attribute-name. The '=' becomes the first character of
  // the new attribute's name, so the builder is deliberately left non-empty for AttributeName.
  1060. log_parse_error();
  1061. HTMLToken::Attribute new_attribute;
  1062. new_attribute.name_start_position = nth_last_position(1);
  1063. m_current_builder.append_code_point(current_input_character.value());
  1064. m_current_token.add_attribute(move(new_attribute));
  1065. SWITCH_TO_WITH_UNCLEAN_BUILDER(AttributeName);
  1066. }
  1067. ANYTHING_ELSE
  1068. {
  // The character itself is not appended here; it is reconsumed by AttributeName.
  1069. HTMLToken::Attribute new_attribute;
  1070. new_attribute.name_start_position = nth_last_position(1);
  1071. m_current_token.add_attribute(move(new_attribute));
  1072. RECONSUME_IN(AttributeName);
  1073. }
  1074. }
  1075. END_STATE
  1076. // 13.2.5.40 Self-closing start tag state, https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
  // Runs after a '/' inside a tag. Only an immediately following '>' marks the token self-closing;
  // any other character is a parse error and is reconsumed as the start of an attribute.
  1077. BEGIN_STATE(SelfClosingStartTag)
  1078. {
  1079. ON('>')
  1080. {
  1081. m_current_token.set_self_closing(true);
  1082. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1083. }
  1084. ON_EOF
  1085. {
  // Spec: eof-in-tag. Only EOF is emitted; the current token is not queued.
  1086. log_parse_error();
  1087. EMIT_EOF;
  1088. }
  1089. ANYTHING_ELSE
  1090. {
  // Spec: unexpected-solidus-in-tag.
  1091. log_parse_error();
  1092. RECONSUME_IN(BeforeAttributeName);
  1093. }
  1094. }
  1095. END_STATE
  1096. // 13.2.5.33 Attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
  // Collects the attribute name in m_current_builder, lower-casing ASCII upper-case letters. When the
  // name ends (whitespace, '/', '>', EOF or '='), the end position and the collected text are stored
  // on the token's last attribute.
  1097. BEGIN_STATE(AttributeName)
  1098. {
  1099. ON_WHITESPACE
  1100. {
  1101. m_current_token.last_attribute().name_end_position = nth_last_position(1);
  1102. m_current_token.last_attribute().local_name = consume_current_builder();
  1103. RECONSUME_IN(AfterAttributeName);
  1104. }
  1105. ON('/')
  1106. {
  1107. m_current_token.last_attribute().name_end_position = nth_last_position(1);
  1108. m_current_token.last_attribute().local_name = consume_current_builder();
  1109. RECONSUME_IN(AfterAttributeName);
  1110. }
  1111. ON('>')
  1112. {
  1113. m_current_token.last_attribute().name_end_position = nth_last_position(1);
  1114. m_current_token.last_attribute().local_name = consume_current_builder();
  1115. RECONSUME_IN(AfterAttributeName);
  1116. }
  1117. ON_EOF
  1118. {
  1119. m_current_token.last_attribute().name_end_position = nth_last_position(1);
  1120. m_current_token.last_attribute().local_name = consume_current_builder();
  1121. RECONSUME_IN(AfterAttributeName);
  1122. }
  1123. ON('=')
  1124. {
  // '=' ends the name and goes straight to the value; the character is not reconsumed.
  1125. m_current_token.last_attribute().name_end_position = nth_last_position(1);
  1126. m_current_token.last_attribute().local_name = consume_current_builder();
  1127. SWITCH_TO(BeforeAttributeValue);
  1128. }
  1129. ON_ASCII_UPPER_ALPHA
  1130. {
  1131. m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
  1132. continue;
  1133. }
  1134. ON(0)
  1135. {
  // NUL (spec: unexpected-null-character) is replaced by U+FFFD.
  1136. log_parse_error();
  1137. m_current_builder.append_code_point(0xFFFD);
  1138. continue;
  1139. }
  // '"', '\'' and '<' are logged (spec: unexpected-character-in-attribute-name) and then
  // appended to the name like any other character, via the shared label below.
  1140. ON('"')
  1141. {
  1142. log_parse_error();
  1143. goto AnythingElseAttributeName;
  1144. }
  1145. ON('\'')
  1146. {
  1147. log_parse_error();
  1148. goto AnythingElseAttributeName;
  1149. }
  1150. ON('<')
  1151. {
  1152. log_parse_error();
  1153. goto AnythingElseAttributeName;
  1154. }
  1155. ANYTHING_ELSE
  1156. {
  // Shared tail: also the jump target of the three error branches above.
  1157. AnythingElseAttributeName:
  1158. m_current_builder.append_code_point(current_input_character.value());
  1159. continue;
  1160. }
  1161. }
  1162. END_STATE
  1163. // 13.2.5.34 After attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
  // Runs after an attribute name has ended. Whitespace is skipped; '/', '=' and '>' continue the tag;
  // any other character starts another attribute.
  1164. BEGIN_STATE(AfterAttributeName)
  1165. {
  1166. ON_WHITESPACE
  1167. {
  1168. continue;
  1169. }
  1170. ON('/')
  1171. {
  1172. SWITCH_TO(SelfClosingStartTag);
  1173. }
  1174. ON('=')
  1175. {
  // Overwrites the name end position recorded earlier with the position at this '='.
  1176. m_current_token.last_attribute().name_end_position = nth_last_position(1);
  1177. SWITCH_TO(BeforeAttributeValue);
  1178. }
  1179. ON('>')
  1180. {
  1181. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1182. }
  1183. ON_EOF
  1184. {
  // Spec: eof-in-tag. Only EOF is emitted; the current token is not queued.
  1185. log_parse_error();
  1186. EMIT_EOF;
  1187. }
  1188. ANYTHING_ELSE
  1189. {
  // Start a fresh attribute; its start position is recorded only when source positions
  // are being tracked (m_source_positions is non-empty).
  1190. m_current_token.add_attribute({});
  1191. if (!m_source_positions.is_empty())
  1192. m_current_token.last_attribute().name_start_position = nth_last_position(1);
  1193. RECONSUME_IN(AttributeName);
  1194. }
  1195. }
  1196. END_STATE
  1197. // 13.2.5.35 Before attribute value state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
  // Runs after the '=' of an attribute: skips whitespace and picks the quoted or unquoted value state
  // from the first non-whitespace character.
  1198. BEGIN_STATE(BeforeAttributeValue)
  1199. {
  // This assignment precedes the branch tests, so it runs for every character handled in this
  // state, including each skipped whitespace character.
  1200. m_current_token.last_attribute().value_start_position = nth_last_position(1);
  1201. ON_WHITESPACE
  1202. {
  1203. continue;
  1204. }
  1205. ON('"')
  1206. {
  1207. SWITCH_TO(AttributeValueDoubleQuoted);
  1208. }
  1209. ON('\'')
  1210. {
  1211. SWITCH_TO(AttributeValueSingleQuoted);
  1212. }
  1213. ON('>')
  1214. {
  // Spec: missing-attribute-value. The tag is emitted with the value left as it is.
  1215. log_parse_error();
  1216. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1217. }
  1218. ANYTHING_ELSE
  1219. {
  1220. RECONSUME_IN(AttributeValueUnquoted);
  1221. }
  1222. }
  1223. END_STATE
  1224. // 13.2.5.36 Attribute value (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
  // Collects a double-quoted attribute value in m_current_builder until the closing '"'. '&' detours
  // through the character reference states and returns here via m_return_state.
  1225. BEGIN_STATE(AttributeValueDoubleQuoted)
  1226. {
  1227. ON('"')
  1228. {
  1229. m_current_token.last_attribute().value = consume_current_builder();
  1230. SWITCH_TO(AfterAttributeValueQuoted);
  1231. }
  1232. ON('&')
  1233. {
  // The builder still holds the value collected so far, hence the "unclean builder" switch.
  1234. m_return_state = State::AttributeValueDoubleQuoted;
  1235. SWITCH_TO_WITH_UNCLEAN_BUILDER(CharacterReference);
  1236. }
  1237. ON(0)
  1238. {
  // NUL (spec: unexpected-null-character) is replaced by U+FFFD.
  1239. log_parse_error();
  1240. m_current_builder.append_code_point(0xFFFD);
  1241. continue;
  1242. }
  1243. ON_EOF
  1244. {
  // Spec: eof-in-tag. Only EOF is emitted; the current token is not queued.
  1245. log_parse_error();
  1246. EMIT_EOF;
  1247. }
  1248. ANYTHING_ELSE
  1249. {
  1250. m_current_builder.append_code_point(current_input_character.value());
  1251. continue;
  1252. }
  1253. }
  1254. END_STATE
  1255. // 13.2.5.37 Attribute value (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
  // Collects a single-quoted attribute value in m_current_builder until the closing '\''. '&' detours
  // through the character reference states and returns here via m_return_state.
  1256. BEGIN_STATE(AttributeValueSingleQuoted)
  1257. {
  1258. ON('\'')
  1259. {
  1260. m_current_token.last_attribute().value = consume_current_builder();
  1261. SWITCH_TO(AfterAttributeValueQuoted);
  1262. }
  1263. ON('&')
  1264. {
  // The builder still holds the value collected so far, hence the "unclean builder" switch.
  1265. m_return_state = State::AttributeValueSingleQuoted;
  1266. SWITCH_TO_WITH_UNCLEAN_BUILDER(CharacterReference);
  1267. }
  1268. ON(0)
  1269. {
  // NUL (spec: unexpected-null-character) is replaced by U+FFFD.
  1270. log_parse_error();
  1271. m_current_builder.append_code_point(0xFFFD);
  1272. continue;
  1273. }
  1274. ON_EOF
  1275. {
  // Spec: eof-in-tag. Only EOF is emitted; the current token is not queued.
  1276. log_parse_error();
  1277. EMIT_EOF;
  1278. }
  1279. ANYTHING_ELSE
  1280. {
  1281. m_current_builder.append_code_point(current_input_character.value());
  1282. continue;
  1283. }
  1284. }
  1285. END_STATE
  1286. // 13.2.5.38 Attribute value (unquoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
  // Collects an unquoted attribute value in m_current_builder. Whitespace or '>' ends the value and
  // records both the text and its end position on the token's last attribute.
  1287. BEGIN_STATE(AttributeValueUnquoted)
  1288. {
  1289. ON_WHITESPACE
  1290. {
  1291. m_current_token.last_attribute().value = consume_current_builder();
  1292. m_current_token.last_attribute().value_end_position = nth_last_position(1);
  1293. SWITCH_TO(BeforeAttributeName);
  1294. }
  1295. ON('&')
  1296. {
  // The builder still holds the value collected so far, hence the "unclean builder" switch.
  1297. m_return_state = State::AttributeValueUnquoted;
  1298. SWITCH_TO_WITH_UNCLEAN_BUILDER(CharacterReference);
  1299. }
  1300. ON('>')
  1301. {
  1302. m_current_token.last_attribute().value = consume_current_builder();
  1303. m_current_token.last_attribute().value_end_position = nth_last_position(1);
  1304. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1305. }
  1306. ON(0)
  1307. {
  // NUL (spec: unexpected-null-character) is replaced by U+FFFD.
  1308. log_parse_error();
  1309. m_current_builder.append_code_point(0xFFFD);
  1310. continue;
  1311. }
  // '"', '\'', '<', '=' and '`' are logged (spec: unexpected-character-in-unquoted-attribute-value)
  // and then appended to the value like any other character, via the shared label below.
  1312. ON('"')
  1313. {
  1314. log_parse_error();
  1315. goto AnythingElseAttributeValueUnquoted;
  1316. }
  1317. ON('\'')
  1318. {
  1319. log_parse_error();
  1320. goto AnythingElseAttributeValueUnquoted;
  1321. }
  1322. ON('<')
  1323. {
  1324. log_parse_error();
  1325. goto AnythingElseAttributeValueUnquoted;
  1326. }
  1327. ON('=')
  1328. {
  1329. log_parse_error();
  1330. goto AnythingElseAttributeValueUnquoted;
  1331. }
  1332. ON('`')
  1333. {
  1334. log_parse_error();
  1335. goto AnythingElseAttributeValueUnquoted;
  1336. }
  1337. ON_EOF
  1338. {
  // Spec: eof-in-tag. Only EOF is emitted; the current token is not queued.
  1339. log_parse_error();
  1340. EMIT_EOF;
  1341. }
  1342. ANYTHING_ELSE
  1343. {
  // Shared tail: also the jump target of the five error branches above.
  1344. AnythingElseAttributeValueUnquoted:
  1345. m_current_builder.append_code_point(current_input_character.value());
  1346. continue;
  1347. }
  1348. }
  1349. END_STATE
  1350. // 13.2.5.39 After attribute value (quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
  // Runs after the closing quote of an attribute value: records the value's end position, then
  // expects whitespace, '/' or '>'.
  1351. BEGIN_STATE(AfterAttributeValueQuoted)
  1352. {
  // Recorded before the branch tests, so it is set whichever branch is taken.
  1353. m_current_token.last_attribute().value_end_position = nth_last_position(1);
  1354. ON_WHITESPACE
  1355. {
  1356. SWITCH_TO(BeforeAttributeName);
  1357. }
  1358. ON('/')
  1359. {
  1360. SWITCH_TO(SelfClosingStartTag);
  1361. }
  1362. ON('>')
  1363. {
  1364. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1365. }
  1366. ON_EOF
  1367. {
  // Spec: eof-in-tag. Only EOF is emitted; the current token is not queued.
  1368. log_parse_error();
  1369. EMIT_EOF;
  1370. }
  1371. ANYTHING_ELSE
  1372. {
  // Spec: missing-whitespace-between-attributes. The character is reconsumed as the start of
  // the next attribute.
  1373. log_parse_error();
  1374. RECONSUME_IN(BeforeAttributeName);
  1375. }
  1376. }
  1377. END_STATE
  1378. // 13.2.5.43 Comment start state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
  // First state of a comment's content: a '-' may begin an immediate close, a '>' closes the comment
  // at once, and any other input is reconsumed as comment text.
  1379. BEGIN_STATE(CommentStart)
  1380. {
  1381. ON('-')
  1382. {
  1383. SWITCH_TO(CommentStartDash);
  1384. }
  1385. ON('>')
  1386. {
  // Spec: abrupt-closing-of-empty-comment.
  1387. log_parse_error();
  1388. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1389. }
  1390. ANYTHING_ELSE
  1391. {
  1392. RECONSUME_IN(Comment);
  1393. }
  1394. }
  1395. END_STATE
  1396. // 13.2.5.44 Comment start dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
  // Runs after a '-' seen at the very start of a comment. A second '-' moves to CommentEnd; otherwise
  // the pending '-' turns out to be comment text.
  1397. BEGIN_STATE(CommentStartDash)
  1398. {
  1399. ON('-')
  1400. {
  1401. SWITCH_TO(CommentEnd);
  1402. }
  1403. ON('>')
  1404. {
  // Spec: abrupt-closing-of-empty-comment.
  1405. log_parse_error();
  1406. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1407. }
  1408. ON_EOF
  1409. {
  // Spec: eof-in-comment.
  1410. log_parse_error();
  1411. EMIT_CURRENT_TOKEN_FOLLOWED_BY_EOF;
  1412. }
  1413. ANYTHING_ELSE
  1414. {
  // The '-' that led here was not appended yet; add it before reconsuming as comment text.
  1415. m_current_builder.append('-');
  1416. RECONSUME_IN(Comment);
  1417. }
  1418. }
  1419. END_STATE
  1420. // 13.2.5.45 Comment state, https://html.spec.whatwg.org/multipage/parsing.html#comment-state
  // Main comment body state: appends characters to m_current_builder and watches for '<' and '-',
  // which have dedicated follow-up states.
  1421. BEGIN_STATE(Comment)
  1422. {
  1423. ON('<')
  1424. {
  // '<' is part of the comment text and is appended right away.
  1425. m_current_builder.append_code_point(current_input_character.value());
  1426. SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentLessThanSign);
  1427. }
  1428. ON('-')
  1429. {
  // The '-' is not appended here; CommentEndDash adds it if it turns out to be text.
  1430. SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentEndDash);
  1431. }
  1432. ON(0)
  1433. {
  // NUL (spec: unexpected-null-character) is replaced by U+FFFD.
  1434. log_parse_error();
  1435. m_current_builder.append_code_point(0xFFFD);
  1436. continue;
  1437. }
  1438. ON_EOF
  1439. {
  // Spec: eof-in-comment. The text collected so far becomes the comment's data.
  1440. log_parse_error();
  1441. m_current_token.set_comment(consume_current_builder());
  1442. EMIT_CURRENT_TOKEN_FOLLOWED_BY_EOF;
  1443. }
  1444. ANYTHING_ELSE
  1445. {
  1446. m_current_builder.append_code_point(current_input_character.value());
  1447. continue;
  1448. }
  1449. }
  1450. END_STATE
  1451. // 13.2.5.51 Comment end state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
  // Runs after "--" inside a comment. '>' finishes the comment; otherwise the pending dashes turn out
  // to be comment text.
  1452. BEGIN_STATE(CommentEnd)
  1453. {
  1454. ON('>')
  1455. {
  1456. m_current_token.set_comment(consume_current_builder());
  1457. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1458. }
  1459. ON('!')
  1460. {
  1461. SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentEndBang);
  1462. }
  1463. ON('-')
  1464. {
  // A further '-' adds one dash to the text; the state stays the same.
  1465. m_current_builder.append('-');
  1466. continue;
  1467. }
  1468. ON_EOF
  1469. {
  // Spec: eof-in-comment.
  1470. log_parse_error();
  1471. m_current_token.set_comment(consume_current_builder());
  1472. EMIT_CURRENT_TOKEN_FOLLOWED_BY_EOF;
  1473. }
  1474. ANYTHING_ELSE
  1475. {
  // The two pending dashes were ordinary text after all.
  1476. m_current_builder.append("--"sv);
  1477. RECONSUME_IN(Comment);
  1478. }
  1479. }
  1480. END_STATE
  1481. // 13.2.5.52 Comment end bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
  // Runs after "--!" inside a comment. '>' closes the comment with a parse error; '-' and any other
  // character put the pending "--!" back into the comment text.
  1482. BEGIN_STATE(CommentEndBang)
  1483. {
  1484. ON('-')
  1485. {
  1486. m_current_builder.append("--!"sv);
  1487. SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentEndDash);
  1488. }
  1489. ON('>')
  1490. {
  // Spec: incorrectly-closed-comment.
  1491. log_parse_error();
  1492. m_current_token.set_comment(consume_current_builder());
  1493. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1494. }
  1495. ON_EOF
  1496. {
  // Spec: eof-in-comment.
  1497. log_parse_error();
  1498. m_current_token.set_comment(consume_current_builder());
  1499. EMIT_CURRENT_TOKEN_FOLLOWED_BY_EOF;
  1500. }
  1501. ANYTHING_ELSE
  1502. {
  1503. m_current_builder.append("--!"sv);
  1504. RECONSUME_IN(Comment);
  1505. }
  1506. }
  1507. END_STATE
  1508. // 13.2.5.50 Comment end dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
  // Runs after a single '-' inside a comment. A second '-' moves to CommentEnd; otherwise the pending
  // '-' is appended as ordinary comment text.
  1509. BEGIN_STATE(CommentEndDash)
  1510. {
  1511. ON('-')
  1512. {
  1513. SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentEnd);
  1514. }
  1515. ON_EOF
  1516. {
  // Spec: eof-in-comment.
  1517. log_parse_error();
  1518. m_current_token.set_comment(consume_current_builder());
  1519. EMIT_CURRENT_TOKEN_FOLLOWED_BY_EOF;
  1520. }
  1521. ANYTHING_ELSE
  1522. {
  1523. m_current_builder.append('-');
  1524. RECONSUME_IN(Comment);
  1525. }
  1526. }
  1527. END_STATE
  1528. // 13.2.5.46 Comment less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
  // Runs after a '<' inside a comment. '!' and further '<' characters are appended to the comment
  // text; '!' additionally starts the check for a nested comment opener.
  1529. BEGIN_STATE(CommentLessThanSign)
  1530. {
  1531. ON('!')
  1532. {
  1533. m_current_builder.append_code_point(current_input_character.value());
  1534. SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentLessThanSignBang);
  1535. }
  1536. ON('<')
  1537. {
  1538. m_current_builder.append_code_point(current_input_character.value());
  1539. continue;
  1540. }
  1541. ANYTHING_ELSE
  1542. {
  1543. RECONSUME_IN(Comment);
  1544. }
  1545. }
  1546. END_STATE
  1547. // 13.2.5.47 Comment less-than sign bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
  // Runs after "<!" inside a comment. A '-' continues the nested-opener check; anything else goes
  // back to the ordinary Comment state.
  1548. BEGIN_STATE(CommentLessThanSignBang)
  1549. {
  1550. ON('-')
  1551. {
  1552. SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentLessThanSignBangDash);
  1553. }
  1554. ANYTHING_ELSE
  1555. {
  1556. RECONSUME_IN(Comment);
  1557. }
  1558. }
  1559. END_STATE
  1560. // 13.2.5.48 Comment less-than sign bang dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
  // Runs after "<!-" inside a comment. A second '-' completes "<!--"; anything else is reconsumed in
  // CommentEndDash, which treats the '-' just seen as a single pending dash.
  1561. BEGIN_STATE(CommentLessThanSignBangDash)
  1562. {
  1563. ON('-')
  1564. {
  1565. SWITCH_TO_WITH_UNCLEAN_BUILDER(CommentLessThanSignBangDashDash);
  1566. }
  1567. ANYTHING_ELSE
  1568. {
  1569. RECONSUME_IN(CommentEndDash);
  1570. }
  1571. }
  1572. END_STATE
  1573. // 13.2.5.49 Comment less-than sign bang dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
  // Runs after "<!--" inside a comment. Every branch reconsumes in CommentEnd; the only difference is
  // that input other than '>' or EOF is logged first (spec: nested-comment).
  1574. BEGIN_STATE(CommentLessThanSignBangDashDash)
  1575. {
  1576. ON('>')
  1577. {
  1578. RECONSUME_IN(CommentEnd);
  1579. }
  1580. ON_EOF
  1581. {
  1582. RECONSUME_IN(CommentEnd);
  1583. }
  1584. ANYTHING_ELSE
  1585. {
  1586. log_parse_error();
  1587. RECONSUME_IN(CommentEnd);
  1588. }
  1589. }
  1590. END_STATE
  1591. // 13.2.5.72 Character reference state, https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
  // Entry point after an '&': resets the temporary buffer to "&" and dispatches to the named or
  // numeric reference states, or gives up and returns to the state recorded in m_return_state.
  1592. BEGIN_STATE(CharacterReference)
  1593. {
  // These two statements precede the branch tests, so the buffer is reset on every entry.
  1594. m_temporary_buffer.clear();
  1595. m_temporary_buffer.append('&');
  1596. ON_ASCII_ALPHANUMERIC
  1597. {
  1598. RECONSUME_IN(NamedCharacterReference);
  1599. }
  1600. ON('#')
  1601. {
  1602. m_temporary_buffer.append(current_input_character.value());
  1603. SWITCH_TO_WITH_UNCLEAN_BUILDER(NumericCharacterReference);
  1604. }
  1605. ANYTHING_ELSE
  1606. {
  // Not a reference: flush the buffered "&" and let the return state handle this character.
  1607. FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
  1608. RECONSUME_IN_RETURN_STATE;
  1609. }
  1610. }
  1611. END_STATE
  1612. // 13.2.5.73 Named character reference state, https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
  // Looks up a named entity in the remaining input. On a match the entity text is consumed and,
  // unless the attribute exception below applies, replaced by its decoded code points; without a
  // match the buffered text is flushed and the input is handed to AmbiguousAmpersand.
  1613. BEGIN_STATE(NamedCharacterReference)
  1614. {
  // The lookup runs on the rest of m_decoded_input, starting at the byte offset of
  // m_prev_utf8_iterator within m_utf8_view.
  1615. size_t byte_offset = m_utf8_view.byte_offset_of(m_prev_utf8_iterator);
  1616. auto match = HTML::code_points_from_entity(m_decoded_input.substring_view(byte_offset, m_decoded_input.length() - byte_offset));
  1617. if (match.has_value()) {
  // NOTE(review): presumably "- 1" because the entity's first character is the already-consumed
  // current input character - confirm against skip().
  1618. skip(match->entity.length() - 1);
  1619. for (auto ch : match.value().entity)
  1620. m_temporary_buffer.append(ch);
  // Attribute exception: inside an attribute value, a match without a trailing ';' that is
  // followed by '=' or an ASCII alphanumeric is flushed as literal text instead of being decoded.
  1621. if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) {
  1622. auto next_code_point = peek_code_point(0, stop_at_insertion_point);
  1623. if (next_code_point.has_value() && (next_code_point.value() == '=' || is_ascii_alphanumeric(next_code_point.value()))) {
  1624. FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
  1625. SWITCH_TO_RETURN_STATE;
  1626. }
  1627. }
  // Spec: missing-semicolon-after-character-reference.
  1628. if (!match.value().entity.ends_with(';')) {
  1629. log_parse_error();
  1630. }
  // Replace the raw entity text with its decoded code points before flushing.
  1631. m_temporary_buffer = match.value().code_points;
  1632. FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
  1633. SWITCH_TO_RETURN_STATE;
  1634. } else {
  1635. FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
  1636. // FIXME: This should be SWITCH_TO, but we always lose the first character on this path, so just reconsume it.
  1637. // I can't wrap my head around how to do it as the spec says.
  1638. RECONSUME_IN(AmbiguousAmpersand);
  1639. }
  1640. }
  1641. END_STATE
  1642. // 13.2.5.74 Ambiguous ampersand state, https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
  // Consumes the alphanumeric run after an '&' that did not match any named entity. Inside an
  // attribute the characters join the attribute value; elsewhere each one is emitted as a character.
  1643. BEGIN_STATE(AmbiguousAmpersand)
  1644. {
  1645. ON_ASCII_ALPHANUMERIC
  1646. {
  1647. if (consumed_as_part_of_an_attribute()) {
  1648. m_current_builder.append_code_point(current_input_character.value());
  1649. continue;
  1650. } else {
  1651. EMIT_CURRENT_CHARACTER;
  1652. }
  1653. }
  1654. ON(';')
  1655. {
  // Spec: unknown-named-character-reference. The ';' itself is left to the return state.
  1656. log_parse_error();
  1657. RECONSUME_IN_RETURN_STATE;
  1658. }
  1659. ANYTHING_ELSE
  1660. {
  1661. RECONSUME_IN_RETURN_STATE;
  1662. }
  1663. }
  1664. END_STATE
  1665. // 13.2.5.75 Numeric character reference state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
  // Runs after "&#": resets the accumulated code to 0 and selects hexadecimal parsing on 'x'/'X',
  // decimal parsing otherwise.
  1666. BEGIN_STATE(NumericCharacterReference)
  1667. {
  1668. m_character_reference_code = 0;
  1669. ON('X')
  1670. {
  // The 'x'/'X' is kept in the temporary buffer along with the text buffered before it.
  1671. m_temporary_buffer.append(current_input_character.value());
  1672. SWITCH_TO_WITH_UNCLEAN_BUILDER(HexadecimalCharacterReferenceStart);
  1673. }
  1674. ON('x')
  1675. {
  1676. m_temporary_buffer.append(current_input_character.value());
  1677. SWITCH_TO_WITH_UNCLEAN_BUILDER(HexadecimalCharacterReferenceStart);
  1678. }
  1679. ANYTHING_ELSE
  1680. {
  1681. RECONSUME_IN(DecimalCharacterReferenceStart);
  1682. }
  1683. }
  1684. END_STATE
  1685. // 13.2.5.76 Hexadecimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
  1686. BEGIN_STATE(HexadecimalCharacterReferenceStart)
  1687. {
  1688. ON_ASCII_HEX_DIGIT
  1689. {
  1690. RECONSUME_IN(HexadecimalCharacterReference);
  1691. }
  1692. ANYTHING_ELSE
  1693. {
  1694. log_parse_error();
  1695. FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
  1696. RECONSUME_IN_RETURN_STATE;
  1697. }
  1698. }
  1699. END_STATE
  1700. // 13.2.5.77 Decimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
  1701. BEGIN_STATE(DecimalCharacterReferenceStart)
  1702. {
  1703. ON_ASCII_DIGIT
  1704. {
  1705. RECONSUME_IN(DecimalCharacterReference);
  1706. }
  1707. ANYTHING_ELSE
  1708. {
  1709. log_parse_error();
  1710. FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
  1711. RECONSUME_IN_RETURN_STATE;
  1712. }
  1713. }
  1714. END_STATE
  1715. // 13.2.5.78 Hexadecimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
  1716. BEGIN_STATE(HexadecimalCharacterReference)
  1717. {
  1718. ON_ASCII_DIGIT
  1719. {
  1720. m_character_reference_code *= 16;
  1721. m_character_reference_code += current_input_character.value() - 0x30;
  1722. continue;
  1723. }
  1724. ON_ASCII_UPPER_ALPHA
  1725. {
  1726. m_character_reference_code *= 16;
  1727. m_character_reference_code += current_input_character.value() - 0x37;
  1728. continue;
  1729. }
  1730. ON_ASCII_LOWER_ALPHA
  1731. {
  1732. m_character_reference_code *= 16;
  1733. m_character_reference_code += current_input_character.value() - 0x57;
  1734. continue;
  1735. }
  1736. ON(';')
  1737. {
  1738. SWITCH_TO_WITH_UNCLEAN_BUILDER(NumericCharacterReferenceEnd);
  1739. }
  1740. ANYTHING_ELSE
  1741. {
  1742. log_parse_error();
  1743. RECONSUME_IN(NumericCharacterReferenceEnd);
  1744. }
  1745. }
  1746. END_STATE
  1747. // 13.2.5.79 Decimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
  1748. BEGIN_STATE(DecimalCharacterReference)
  1749. {
  1750. ON_ASCII_DIGIT
  1751. {
  1752. m_character_reference_code *= 10;
  1753. m_character_reference_code += current_input_character.value() - 0x30;
  1754. continue;
  1755. }
  1756. ON(';')
  1757. {
  1758. SWITCH_TO_WITH_UNCLEAN_BUILDER(NumericCharacterReferenceEnd);
  1759. }
  1760. ANYTHING_ELSE
  1761. {
  1762. log_parse_error();
  1763. RECONSUME_IN(NumericCharacterReferenceEnd);
  1764. }
  1765. }
  1766. END_STATE
  1767. // 13.2.5.80 Numeric character reference end state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
  1768. BEGIN_STATE(NumericCharacterReferenceEnd)
  1769. {
  1770. DONT_CONSUME_NEXT_INPUT_CHARACTER;
  1771. if (m_character_reference_code == 0) {
  1772. log_parse_error();
  1773. m_character_reference_code = 0xFFFD;
  1774. }
  1775. if (m_character_reference_code > 0x10ffff) {
  1776. log_parse_error();
  1777. m_character_reference_code = 0xFFFD;
  1778. }
  1779. if (is_unicode_surrogate(m_character_reference_code)) {
  1780. log_parse_error();
  1781. m_character_reference_code = 0xFFFD;
  1782. }
  1783. if (is_unicode_noncharacter(m_character_reference_code)) {
  1784. log_parse_error();
  1785. }
  1786. if (m_character_reference_code == 0xd || (is_unicode_control(m_character_reference_code) && !is_ascii_space(m_character_reference_code))) {
  1787. log_parse_error();
  1788. constexpr struct {
  1789. u32 number;
  1790. u32 code_point;
  1791. } conversion_table[] = {
  1792. { 0x80, 0x20AC },
  1793. { 0x82, 0x201A },
  1794. { 0x83, 0x0192 },
  1795. { 0x84, 0x201E },
  1796. { 0x85, 0x2026 },
  1797. { 0x86, 0x2020 },
  1798. { 0x87, 0x2021 },
  1799. { 0x88, 0x02C6 },
  1800. { 0x89, 0x2030 },
  1801. { 0x8A, 0x0160 },
  1802. { 0x8B, 0x2039 },
  1803. { 0x8C, 0x0152 },
  1804. { 0x8E, 0x017D },
  1805. { 0x91, 0x2018 },
  1806. { 0x92, 0x2019 },
  1807. { 0x93, 0x201C },
  1808. { 0x94, 0x201D },
  1809. { 0x95, 0x2022 },
  1810. { 0x96, 0x2013 },
  1811. { 0x97, 0x2014 },
  1812. { 0x98, 0x02DC },
  1813. { 0x99, 0x2122 },
  1814. { 0x9A, 0x0161 },
  1815. { 0x9B, 0x203A },
  1816. { 0x9C, 0x0153 },
  1817. { 0x9E, 0x017E },
  1818. { 0x9F, 0x0178 },
  1819. };
  1820. for (auto& entry : conversion_table) {
  1821. if (m_character_reference_code == entry.number) {
  1822. m_character_reference_code = entry.code_point;
  1823. break;
  1824. }
  1825. }
  1826. }
  1827. m_temporary_buffer.clear();
  1828. m_temporary_buffer.append(m_character_reference_code);
  1829. FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
  1830. SWITCH_TO_RETURN_STATE;
  1831. }
  1832. END_STATE
  1833. // 13.2.5.2 RCDATA state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
  1834. BEGIN_STATE(RCDATA)
  1835. {
  1836. ON('&')
  1837. {
  1838. m_return_state = State::RCDATA;
  1839. SWITCH_TO(CharacterReference);
  1840. }
  1841. ON('<')
  1842. {
  1843. SWITCH_TO(RCDATALessThanSign);
  1844. }
  1845. ON(0)
  1846. {
  1847. log_parse_error();
  1848. EMIT_CHARACTER(0xFFFD);
  1849. }
  1850. ON_EOF
  1851. {
  1852. EMIT_EOF;
  1853. }
  1854. ANYTHING_ELSE
  1855. {
  1856. EMIT_CURRENT_CHARACTER;
  1857. }
  1858. }
  1859. END_STATE
  1860. // 13.2.5.9 RCDATA less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
  1861. BEGIN_STATE(RCDATALessThanSign)
  1862. {
  1863. ON('/')
  1864. {
  1865. m_temporary_buffer.clear();
  1866. SWITCH_TO(RCDATAEndTagOpen);
  1867. }
  1868. ANYTHING_ELSE
  1869. {
  1870. EMIT_CHARACTER_AND_RECONSUME_IN('<', RCDATA);
  1871. }
  1872. }
  1873. END_STATE
  1874. // 13.2.5.10 RCDATA end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
  1875. BEGIN_STATE(RCDATAEndTagOpen)
  1876. {
  1877. ON_ASCII_ALPHA
  1878. {
  1879. create_new_token(HTMLToken::Type::EndTag);
  1880. RECONSUME_IN(RCDATAEndTagName);
  1881. }
  1882. ANYTHING_ELSE
  1883. {
  1884. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1885. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1886. RECONSUME_IN(RCDATA);
  1887. }
  1888. }
  1889. END_STATE
  1890. // 13.2.5.11 RCDATA end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
  1891. BEGIN_STATE(RCDATAEndTagName)
  1892. {
  1893. ON_WHITESPACE
  1894. {
  1895. m_current_token.set_tag_name(consume_current_builder());
  1896. if (!current_end_tag_token_is_appropriate()) {
  1897. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1898. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1899. for (auto code_point : m_temporary_buffer)
  1900. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  1901. RECONSUME_IN(RCDATA);
  1902. }
  1903. SWITCH_TO(BeforeAttributeName);
  1904. }
  1905. ON('/')
  1906. {
  1907. m_current_token.set_tag_name(consume_current_builder());
  1908. if (!current_end_tag_token_is_appropriate()) {
  1909. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1910. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1911. for (auto code_point : m_temporary_buffer)
  1912. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  1913. RECONSUME_IN(RCDATA);
  1914. }
  1915. SWITCH_TO(SelfClosingStartTag);
  1916. }
  1917. ON('>')
  1918. {
  1919. m_current_token.set_tag_name(consume_current_builder());
  1920. if (!current_end_tag_token_is_appropriate()) {
  1921. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1922. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1923. for (auto code_point : m_temporary_buffer)
  1924. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  1925. RECONSUME_IN(RCDATA);
  1926. }
  1927. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  1928. }
  1929. ON_ASCII_UPPER_ALPHA
  1930. {
  1931. m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
  1932. m_temporary_buffer.append(current_input_character.value());
  1933. continue;
  1934. }
  1935. ON_ASCII_LOWER_ALPHA
  1936. {
  1937. m_current_builder.append_code_point(current_input_character.value());
  1938. m_temporary_buffer.append(current_input_character.value());
  1939. continue;
  1940. }
  1941. ANYTHING_ELSE
  1942. {
  1943. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  1944. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  1945. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  1946. m_current_builder.clear();
  1947. for (auto code_point : m_temporary_buffer)
  1948. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  1949. RECONSUME_IN(RCDATA);
  1950. }
  1951. }
  1952. END_STATE
  1953. // 13.2.5.3 RAWTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
  1954. BEGIN_STATE(RAWTEXT)
  1955. {
  1956. ON('<')
  1957. {
  1958. SWITCH_TO(RAWTEXTLessThanSign);
  1959. }
  1960. ON(0)
  1961. {
  1962. log_parse_error();
  1963. EMIT_CHARACTER(0xFFFD);
  1964. }
  1965. ON_EOF
  1966. {
  1967. EMIT_EOF;
  1968. }
  1969. ANYTHING_ELSE
  1970. {
  1971. EMIT_CURRENT_CHARACTER;
  1972. }
  1973. }
  1974. END_STATE
  1975. // 13.2.5.12 RAWTEXT less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
  1976. BEGIN_STATE(RAWTEXTLessThanSign)
  1977. {
  1978. ON('/')
  1979. {
  1980. m_temporary_buffer.clear();
  1981. SWITCH_TO(RAWTEXTEndTagOpen);
  1982. }
  1983. ANYTHING_ELSE
  1984. {
  1985. EMIT_CHARACTER_AND_RECONSUME_IN('<', RAWTEXT);
  1986. }
  1987. }
  1988. END_STATE
  1989. // 13.2.5.13 RAWTEXT end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
  1990. BEGIN_STATE(RAWTEXTEndTagOpen)
  1991. {
  1992. ON_ASCII_ALPHA
  1993. {
  1994. create_new_token(HTMLToken::Type::EndTag);
  1995. RECONSUME_IN(RAWTEXTEndTagName);
  1996. }
  1997. ANYTHING_ELSE
  1998. {
  1999. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2000. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2001. RECONSUME_IN(RAWTEXT);
  2002. }
  2003. }
  2004. END_STATE
  2005. // 13.2.5.14 RAWTEXT end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
  2006. BEGIN_STATE(RAWTEXTEndTagName)
  2007. {
  2008. ON_WHITESPACE
  2009. {
  2010. m_current_token.set_tag_name(consume_current_builder());
  2011. if (!current_end_tag_token_is_appropriate()) {
  2012. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2013. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2014. for (auto code_point : m_temporary_buffer)
  2015. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  2016. RECONSUME_IN(RAWTEXT);
  2017. }
  2018. SWITCH_TO(BeforeAttributeName);
  2019. }
  2020. ON('/')
  2021. {
  2022. m_current_token.set_tag_name(consume_current_builder());
  2023. if (!current_end_tag_token_is_appropriate()) {
  2024. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2025. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2026. for (auto code_point : m_temporary_buffer)
  2027. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  2028. RECONSUME_IN(RAWTEXT);
  2029. }
  2030. SWITCH_TO(SelfClosingStartTag);
  2031. }
  2032. ON('>')
  2033. {
  2034. m_current_token.set_tag_name(consume_current_builder());
  2035. if (!current_end_tag_token_is_appropriate()) {
  2036. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2037. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2038. for (auto code_point : m_temporary_buffer)
  2039. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  2040. RECONSUME_IN(RAWTEXT);
  2041. }
  2042. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  2043. }
  2044. ON_ASCII_UPPER_ALPHA
  2045. {
  2046. m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
  2047. m_temporary_buffer.append(current_input_character.value());
  2048. continue;
  2049. }
  2050. ON_ASCII_LOWER_ALPHA
  2051. {
  2052. m_current_builder.append(current_input_character.value());
  2053. m_temporary_buffer.append(current_input_character.value());
  2054. continue;
  2055. }
  2056. ANYTHING_ELSE
  2057. {
  2058. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2059. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2060. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  2061. m_current_builder.clear();
  2062. for (auto code_point : m_temporary_buffer)
  2063. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  2064. RECONSUME_IN(RAWTEXT);
  2065. }
  2066. }
  2067. END_STATE
  2068. // 13.2.5.4 Script data state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
  2069. BEGIN_STATE(ScriptData)
  2070. {
  2071. ON('<')
  2072. {
  2073. SWITCH_TO(ScriptDataLessThanSign);
  2074. }
  2075. ON(0)
  2076. {
  2077. log_parse_error();
  2078. EMIT_CHARACTER(0xFFFD);
  2079. }
  2080. ON_EOF
  2081. {
  2082. EMIT_EOF;
  2083. }
  2084. ANYTHING_ELSE
  2085. {
  2086. EMIT_CURRENT_CHARACTER;
  2087. }
  2088. }
  2089. END_STATE
  2090. // 13.2.5.5 PLAINTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
  2091. BEGIN_STATE(PLAINTEXT)
  2092. {
  2093. ON(0)
  2094. {
  2095. log_parse_error();
  2096. EMIT_CHARACTER(0xFFFD);
  2097. }
  2098. ON_EOF
  2099. {
  2100. EMIT_EOF;
  2101. }
  2102. ANYTHING_ELSE
  2103. {
  2104. EMIT_CURRENT_CHARACTER;
  2105. }
  2106. }
  2107. END_STATE
  2108. // 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
  2109. BEGIN_STATE(ScriptDataLessThanSign)
  2110. {
  2111. ON('/')
  2112. {
  2113. m_temporary_buffer.clear();
  2114. SWITCH_TO(ScriptDataEndTagOpen);
  2115. }
  2116. ON('!')
  2117. {
  2118. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2119. m_queued_tokens.enqueue(HTMLToken::make_character('!'));
  2120. SWITCH_TO(ScriptDataEscapeStart);
  2121. }
  2122. ANYTHING_ELSE
  2123. {
  2124. EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptData);
  2125. }
  2126. }
  2127. END_STATE
  2128. // 13.2.5.18 Script data escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
  2129. BEGIN_STATE(ScriptDataEscapeStart)
  2130. {
  2131. ON('-')
  2132. {
  2133. SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapeStartDash);
  2134. }
  2135. ANYTHING_ELSE
  2136. {
  2137. RECONSUME_IN(ScriptData);
  2138. }
  2139. }
  2140. END_STATE
  2141. // 13.2.5.19 Script data escape start dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
  2142. BEGIN_STATE(ScriptDataEscapeStartDash)
  2143. {
  2144. ON('-')
  2145. {
  2146. SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapedDashDash);
  2147. }
  2148. ANYTHING_ELSE
  2149. {
  2150. RECONSUME_IN(ScriptData);
  2151. }
  2152. }
  2153. END_STATE
  2154. // 13.2.5.22 Script data escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
  2155. BEGIN_STATE(ScriptDataEscapedDashDash)
  2156. {
  2157. ON('-')
  2158. {
  2159. EMIT_CHARACTER('-');
  2160. }
  2161. ON('<')
  2162. {
  2163. SWITCH_TO(ScriptDataEscapedLessThanSign);
  2164. }
  2165. ON('>')
  2166. {
  2167. SWITCH_TO_AND_EMIT_CHARACTER('>', ScriptData);
  2168. }
  2169. ON(0)
  2170. {
  2171. log_parse_error();
  2172. SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataEscaped);
  2173. }
  2174. ON_EOF
  2175. {
  2176. log_parse_error();
  2177. EMIT_EOF;
  2178. }
  2179. ANYTHING_ELSE
  2180. {
  2181. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
  2182. }
  2183. }
  2184. END_STATE
  2185. // 13.2.5.23 Script data escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
  2186. BEGIN_STATE(ScriptDataEscapedLessThanSign)
  2187. {
  2188. ON('/')
  2189. {
  2190. m_temporary_buffer.clear();
  2191. SWITCH_TO(ScriptDataEscapedEndTagOpen);
  2192. }
  2193. ON_ASCII_ALPHA
  2194. {
  2195. m_temporary_buffer.clear();
  2196. EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptDataDoubleEscapeStart);
  2197. }
  2198. ANYTHING_ELSE
  2199. {
  2200. EMIT_CHARACTER_AND_RECONSUME_IN('<', ScriptDataEscaped);
  2201. }
  2202. }
  2203. END_STATE
  2204. // 13.2.5.24 Script data escaped end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
  2205. BEGIN_STATE(ScriptDataEscapedEndTagOpen)
  2206. {
  2207. ON_ASCII_ALPHA
  2208. {
  2209. create_new_token(HTMLToken::Type::EndTag);
  2210. RECONSUME_IN(ScriptDataEscapedEndTagName);
  2211. }
  2212. ANYTHING_ELSE
  2213. {
  2214. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2215. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2216. RECONSUME_IN(ScriptDataEscaped);
  2217. }
  2218. }
  2219. END_STATE
  2220. // 13.2.5.25 Script data escaped end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
  2221. BEGIN_STATE(ScriptDataEscapedEndTagName)
  2222. {
  2223. ON_WHITESPACE
  2224. {
  2225. m_current_token.set_tag_name(consume_current_builder());
  2226. if (current_end_tag_token_is_appropriate())
  2227. SWITCH_TO(BeforeAttributeName);
  2228. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2229. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2230. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  2231. m_current_builder.clear();
  2232. for (auto code_point : m_temporary_buffer) {
  2233. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  2234. }
  2235. RECONSUME_IN(ScriptDataEscaped);
  2236. }
  2237. ON('/')
  2238. {
  2239. m_current_token.set_tag_name(consume_current_builder());
  2240. if (current_end_tag_token_is_appropriate())
  2241. SWITCH_TO(SelfClosingStartTag);
  2242. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2243. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2244. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  2245. m_current_builder.clear();
  2246. for (auto code_point : m_temporary_buffer) {
  2247. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  2248. }
  2249. RECONSUME_IN(ScriptDataEscaped);
  2250. }
  2251. ON('>')
  2252. {
  2253. m_current_token.set_tag_name(consume_current_builder());
  2254. if (current_end_tag_token_is_appropriate())
  2255. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  2256. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2257. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2258. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  2259. m_current_builder.clear();
  2260. for (auto code_point : m_temporary_buffer) {
  2261. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  2262. }
  2263. RECONSUME_IN(ScriptDataEscaped);
  2264. }
  2265. ON_ASCII_UPPER_ALPHA
  2266. {
  2267. m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
  2268. m_temporary_buffer.append(current_input_character.value());
  2269. continue;
  2270. }
  2271. ON_ASCII_LOWER_ALPHA
  2272. {
  2273. m_current_builder.append(current_input_character.value());
  2274. m_temporary_buffer.append(current_input_character.value());
  2275. continue;
  2276. }
  2277. ANYTHING_ELSE
  2278. {
  2279. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2280. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2281. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  2282. m_current_builder.clear();
  2283. for (auto code_point : m_temporary_buffer) {
  2284. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  2285. }
  2286. RECONSUME_IN(ScriptDataEscaped);
  2287. }
  2288. }
  2289. END_STATE
  2290. // 13.2.5.26 Script data double escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
  2291. BEGIN_STATE(ScriptDataDoubleEscapeStart)
  2292. {
  2293. auto temporary_buffer_equal_to_script = [this]() -> bool {
  2294. if (m_temporary_buffer.size() != 6)
  2295. return false;
  2296. // FIXME: Is there a better way of doing this?
  2297. return m_temporary_buffer[0] == 's' && m_temporary_buffer[1] == 'c' && m_temporary_buffer[2] == 'r' && m_temporary_buffer[3] == 'i' && m_temporary_buffer[4] == 'p' && m_temporary_buffer[5] == 't';
  2298. };
  2299. ON_WHITESPACE
  2300. {
  2301. if (temporary_buffer_equal_to_script())
  2302. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
  2303. else
  2304. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
  2305. }
  2306. ON('/')
  2307. {
  2308. if (temporary_buffer_equal_to_script())
  2309. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
  2310. else
  2311. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
  2312. }
  2313. ON('>')
  2314. {
  2315. if (temporary_buffer_equal_to_script())
  2316. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
  2317. else
  2318. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
  2319. }
  2320. ON_ASCII_UPPER_ALPHA
  2321. {
  2322. m_temporary_buffer.append(to_ascii_lowercase(current_input_character.value()));
  2323. EMIT_CURRENT_CHARACTER;
  2324. }
  2325. ON_ASCII_LOWER_ALPHA
  2326. {
  2327. m_temporary_buffer.append(current_input_character.value());
  2328. EMIT_CURRENT_CHARACTER;
  2329. }
  2330. ANYTHING_ELSE
  2331. {
  2332. RECONSUME_IN(ScriptDataEscaped);
  2333. }
  2334. }
  2335. END_STATE
  2336. // 13.2.5.27 Script data double escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
  2337. BEGIN_STATE(ScriptDataDoubleEscaped)
  2338. {
  2339. ON('-')
  2340. {
  2341. SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataDoubleEscapedDash);
  2342. }
  2343. ON('<')
  2344. {
  2345. SWITCH_TO_AND_EMIT_CHARACTER('<', ScriptDataDoubleEscapedLessThanSign);
  2346. }
  2347. ON(0)
  2348. {
  2349. log_parse_error();
  2350. EMIT_CHARACTER(0xFFFD);
  2351. }
  2352. ON_EOF
  2353. {
  2354. log_parse_error();
  2355. EMIT_EOF;
  2356. }
  2357. ANYTHING_ELSE
  2358. {
  2359. EMIT_CURRENT_CHARACTER;
  2360. }
  2361. }
  2362. END_STATE
  2363. // 13.2.5.28 Script data double escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
  2364. BEGIN_STATE(ScriptDataDoubleEscapedDash)
  2365. {
  2366. ON('-')
  2367. {
  2368. SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataDoubleEscapedDashDash);
  2369. }
  2370. ON('<')
  2371. {
  2372. SWITCH_TO_AND_EMIT_CHARACTER('<', ScriptDataDoubleEscapedLessThanSign);
  2373. }
  2374. ON(0)
  2375. {
  2376. log_parse_error();
  2377. SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataDoubleEscaped);
  2378. }
  2379. ON_EOF
  2380. {
  2381. log_parse_error();
  2382. EMIT_EOF;
  2383. }
  2384. ANYTHING_ELSE
  2385. {
  2386. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
  2387. }
  2388. }
  2389. END_STATE
  2390. // 13.2.5.29 Script data double escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
  2391. BEGIN_STATE(ScriptDataDoubleEscapedDashDash)
  2392. {
  2393. ON('-')
  2394. {
  2395. EMIT_CHARACTER('-');
  2396. }
  2397. ON('<')
  2398. {
  2399. SWITCH_TO_AND_EMIT_CHARACTER('<', ScriptDataDoubleEscapedLessThanSign);
  2400. }
  2401. ON('>')
  2402. {
  2403. SWITCH_TO_AND_EMIT_CHARACTER('>', ScriptData);
  2404. }
  2405. ON(0)
  2406. {
  2407. log_parse_error();
  2408. SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataDoubleEscaped);
  2409. }
  2410. ON_EOF
  2411. {
  2412. log_parse_error();
  2413. EMIT_EOF;
  2414. }
  2415. ANYTHING_ELSE
  2416. {
  2417. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
  2418. }
  2419. }
  2420. END_STATE
  2421. // 13.2.5.30 Script data double escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
  2422. BEGIN_STATE(ScriptDataDoubleEscapedLessThanSign)
  2423. {
  2424. ON('/')
  2425. {
  2426. m_temporary_buffer.clear();
  2427. SWITCH_TO_AND_EMIT_CHARACTER('/', ScriptDataDoubleEscapeEnd);
  2428. }
  2429. ANYTHING_ELSE
  2430. {
  2431. RECONSUME_IN(ScriptDataDoubleEscaped);
  2432. }
  2433. }
  2434. END_STATE
  2435. // 13.2.5.31 Script data double escape end state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
  2436. BEGIN_STATE(ScriptDataDoubleEscapeEnd)
  2437. {
  2438. auto temporary_buffer_equal_to_script = [this]() -> bool {
  2439. if (m_temporary_buffer.size() != 6)
  2440. return false;
  2441. // FIXME: Is there a better way of doing this?
  2442. return m_temporary_buffer[0] == 's' && m_temporary_buffer[1] == 'c' && m_temporary_buffer[2] == 'r' && m_temporary_buffer[3] == 'i' && m_temporary_buffer[4] == 'p' && m_temporary_buffer[5] == 't';
  2443. };
  2444. ON_WHITESPACE
  2445. {
  2446. if (temporary_buffer_equal_to_script())
  2447. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
  2448. else
  2449. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
  2450. }
  2451. ON('/')
  2452. {
  2453. if (temporary_buffer_equal_to_script())
  2454. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
  2455. else
  2456. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
  2457. }
  2458. ON('>')
  2459. {
  2460. if (temporary_buffer_equal_to_script())
  2461. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
  2462. else
  2463. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataDoubleEscaped);
  2464. }
  2465. ON_ASCII_UPPER_ALPHA
  2466. {
  2467. m_temporary_buffer.append(to_ascii_lowercase(current_input_character.value()));
  2468. EMIT_CURRENT_CHARACTER;
  2469. }
  2470. ON_ASCII_LOWER_ALPHA
  2471. {
  2472. m_temporary_buffer.append(current_input_character.value());
  2473. EMIT_CURRENT_CHARACTER;
  2474. }
  2475. ANYTHING_ELSE
  2476. {
  2477. RECONSUME_IN(ScriptDataDoubleEscaped);
  2478. }
  2479. }
  2480. END_STATE
  2481. // 13.2.5.21 Script data escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
  2482. BEGIN_STATE(ScriptDataEscapedDash)
  2483. {
  2484. ON('-')
  2485. {
  2486. SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapedDashDash);
  2487. }
  2488. ON('<')
  2489. {
  2490. SWITCH_TO(ScriptDataEscapedLessThanSign);
  2491. }
  2492. ON(0)
  2493. {
  2494. log_parse_error();
  2495. SWITCH_TO_AND_EMIT_CHARACTER(0xFFFD, ScriptDataEscaped);
  2496. }
  2497. ON_EOF
  2498. {
  2499. log_parse_error();
  2500. EMIT_EOF;
  2501. }
  2502. ANYTHING_ELSE
  2503. {
  2504. SWITCH_TO_AND_EMIT_CURRENT_CHARACTER(ScriptDataEscaped);
  2505. }
  2506. }
  2507. END_STATE
  2508. // 13.2.5.20 Script data escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
  2509. BEGIN_STATE(ScriptDataEscaped)
  2510. {
  2511. ON('-')
  2512. {
  2513. SWITCH_TO_AND_EMIT_CHARACTER('-', ScriptDataEscapedDash);
  2514. }
  2515. ON('<')
  2516. {
  2517. SWITCH_TO(ScriptDataEscapedLessThanSign);
  2518. }
  2519. ON(0)
  2520. {
  2521. log_parse_error();
  2522. EMIT_CHARACTER(0xFFFD);
  2523. }
  2524. ON_EOF
  2525. {
  2526. log_parse_error();
  2527. EMIT_EOF;
  2528. }
  2529. ANYTHING_ELSE
  2530. {
  2531. EMIT_CURRENT_CHARACTER;
  2532. }
  2533. }
  2534. END_STATE
  2535. // 13.2.5.16 Script data end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
  2536. BEGIN_STATE(ScriptDataEndTagOpen)
  2537. {
  2538. ON_ASCII_ALPHA
  2539. {
  2540. create_new_token(HTMLToken::Type::EndTag);
  2541. RECONSUME_IN(ScriptDataEndTagName);
  2542. }
  2543. ANYTHING_ELSE
  2544. {
  2545. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2546. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2547. RECONSUME_IN(ScriptData);
  2548. }
  2549. }
  2550. END_STATE
  2551. // 13.2.5.17 Script data end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
  2552. BEGIN_STATE(ScriptDataEndTagName)
  2553. {
  2554. ON_WHITESPACE
  2555. {
  2556. m_current_token.set_tag_name(consume_current_builder());
  2557. if (current_end_tag_token_is_appropriate())
  2558. SWITCH_TO(BeforeAttributeName);
  2559. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2560. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2561. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  2562. m_current_builder.clear();
  2563. for (auto code_point : m_temporary_buffer)
  2564. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  2565. RECONSUME_IN(ScriptData);
  2566. }
  2567. ON('/')
  2568. {
  2569. m_current_token.set_tag_name(consume_current_builder());
  2570. if (current_end_tag_token_is_appropriate())
  2571. SWITCH_TO(SelfClosingStartTag);
  2572. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2573. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2574. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  2575. m_current_builder.clear();
  2576. for (auto code_point : m_temporary_buffer)
  2577. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  2578. RECONSUME_IN(ScriptData);
  2579. }
  2580. ON('>')
  2581. {
  2582. m_current_token.set_tag_name(consume_current_builder());
  2583. if (current_end_tag_token_is_appropriate())
  2584. SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
  2585. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2586. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2587. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  2588. m_current_builder.clear();
  2589. for (auto code_point : m_temporary_buffer)
  2590. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  2591. RECONSUME_IN(ScriptData);
  2592. }
  2593. ON_ASCII_UPPER_ALPHA
  2594. {
  2595. m_current_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
  2596. m_temporary_buffer.append(current_input_character.value());
  2597. continue;
  2598. }
  2599. ON_ASCII_LOWER_ALPHA
  2600. {
  2601. m_current_builder.append(current_input_character.value());
  2602. m_temporary_buffer.append(current_input_character.value());
  2603. continue;
  2604. }
  2605. ANYTHING_ELSE
  2606. {
  2607. m_queued_tokens.enqueue(HTMLToken::make_character('<'));
  2608. m_queued_tokens.enqueue(HTMLToken::make_character('/'));
  2609. // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
  2610. m_current_builder.clear();
  2611. for (auto code_point : m_temporary_buffer)
  2612. m_queued_tokens.enqueue(HTMLToken::make_character(code_point));
  2613. RECONSUME_IN(ScriptData);
  2614. }
  2615. }
  2616. END_STATE
  2617. // 13.2.5.69 CDATA section state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
  2618. BEGIN_STATE(CDATASection)
  2619. {
  2620. ON(']')
  2621. {
  2622. SWITCH_TO(CDATASectionBracket);
  2623. }
  2624. ON_EOF
  2625. {
  2626. log_parse_error();
  2627. EMIT_EOF;
  2628. }
  2629. ANYTHING_ELSE
  2630. {
  2631. EMIT_CURRENT_CHARACTER;
  2632. }
  2633. }
  2634. END_STATE
  2635. // 13.2.5.70 CDATA section bracket state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
  2636. BEGIN_STATE(CDATASectionBracket)
  2637. {
  2638. ON(']')
  2639. {
  2640. SWITCH_TO(CDATASectionEnd);
  2641. }
  2642. ANYTHING_ELSE
  2643. {
  2644. EMIT_CHARACTER_AND_RECONSUME_IN(']', CDATASection);
  2645. }
  2646. }
  2647. END_STATE
  2648. // 13.2.5.71 CDATA section end state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
  2649. BEGIN_STATE(CDATASectionEnd)
  2650. {
  2651. ON(']')
  2652. {
  2653. EMIT_CHARACTER(']');
  2654. }
  2655. ON('>')
  2656. {
  2657. SWITCH_TO(Data);
  2658. }
  2659. ANYTHING_ELSE
  2660. {
  2661. m_queued_tokens.enqueue(HTMLToken::make_character(']'));
  2662. m_queued_tokens.enqueue(HTMLToken::make_character(']'));
  2663. RECONSUME_IN(CDATASection);
  2664. }
  2665. }
  2666. END_STATE
  2667. default:
  2668. TODO();
  2669. }
  2670. }
  2671. }
  2672. HTMLTokenizer::ConsumeNextResult HTMLTokenizer::consume_next_if_match(StringView string, StopAtInsertionPoint stop_at_insertion_point, CaseSensitivity case_sensitivity)
  2673. {
  2674. for (size_t i = 0; i < string.length(); ++i) {
  2675. auto code_point = peek_code_point(i, stop_at_insertion_point);
  2676. if (!code_point.has_value()) {
  2677. if (StopAtInsertionPoint::Yes == stop_at_insertion_point) {
  2678. return ConsumeNextResult::RanOutOfCharacters;
  2679. }
  2680. return ConsumeNextResult::NotConsumed;
  2681. }
  2682. // FIXME: This should be more Unicode-aware.
  2683. if (case_sensitivity == CaseSensitivity::CaseInsensitive) {
  2684. if (code_point.value() < 0x80) {
  2685. if (to_ascii_lowercase(code_point.value()) != to_ascii_lowercase(string[i]))
  2686. return ConsumeNextResult::NotConsumed;
  2687. continue;
  2688. }
  2689. }
  2690. if (code_point.value() != (u32)string[i])
  2691. return ConsumeNextResult::NotConsumed;
  2692. }
  2693. skip(string.length());
  2694. return ConsumeNextResult::Consumed;
  2695. }
  2696. void HTMLTokenizer::create_new_token(HTMLToken::Type type)
  2697. {
  2698. m_current_token = { type };
  2699. auto is_start_or_end_tag = type == HTMLToken::Type::StartTag || type == HTMLToken::Type::EndTag;
  2700. m_current_token.set_start_position({}, nth_last_position(is_start_or_end_tag ? 1 : 0));
  2701. }
  2702. HTMLTokenizer::HTMLTokenizer()
  2703. {
  2704. m_decoded_input = "";
  2705. m_utf8_view = Utf8View(m_decoded_input);
  2706. m_utf8_iterator = m_utf8_view.begin();
  2707. m_prev_utf8_iterator = m_utf8_view.begin();
  2708. m_source_positions.empend(0u, 0u);
  2709. }
  2710. HTMLTokenizer::HTMLTokenizer(StringView input, ByteString const& encoding)
  2711. {
  2712. auto decoder = TextCodec::decoder_for(encoding);
  2713. VERIFY(decoder.has_value());
  2714. m_decoded_input = decoder->to_utf8(input).release_value_but_fixme_should_propagate_errors().to_byte_string();
  2715. m_utf8_view = Utf8View(m_decoded_input);
  2716. m_utf8_iterator = m_utf8_view.begin();
  2717. m_prev_utf8_iterator = m_utf8_view.begin();
  2718. m_source_positions.empend(0u, 0u);
  2719. }
  2720. void HTMLTokenizer::insert_input_at_insertion_point(StringView input)
  2721. {
  2722. auto utf8_iterator_byte_offset = m_utf8_view.byte_offset_of(m_utf8_iterator);
  2723. auto prev_utf8_iterator_byte_offset = m_utf8_view.byte_offset_of(m_prev_utf8_iterator);
  2724. // FIXME: Implement a InputStream to handle insertion_point and iterators.
  2725. StringBuilder builder {};
  2726. builder.append(m_decoded_input.substring_view(0, m_insertion_point.position));
  2727. builder.append(input);
  2728. builder.append(m_decoded_input.substring_view(m_insertion_point.position));
  2729. m_decoded_input = builder.to_byte_string();
  2730. m_utf8_view = Utf8View(m_decoded_input);
  2731. m_utf8_iterator = m_utf8_view.iterator_at_byte_offset(utf8_iterator_byte_offset);
  2732. m_prev_utf8_iterator = m_utf8_view.iterator_at_byte_offset(prev_utf8_iterator_byte_offset);
  2733. m_insertion_point.position += input.length();
  2734. }
  2735. void HTMLTokenizer::insert_eof()
  2736. {
  2737. m_explicit_eof_inserted = true;
  2738. }
  2739. bool HTMLTokenizer::is_eof_inserted()
  2740. {
  2741. return m_explicit_eof_inserted;
  2742. }
  2743. void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
  2744. {
  2745. dbgln_if(TOKENIZER_TRACE_DEBUG, "[{}] Switch to {}", state_name(m_state), state_name(new_state));
  2746. }
  2747. void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
  2748. {
  2749. dbgln_if(TOKENIZER_TRACE_DEBUG, "[{}] Reconsume in {}", state_name(m_state), state_name(new_state));
  2750. }
  2751. void HTMLTokenizer::switch_to(Badge<HTMLParser>, State new_state)
  2752. {
  2753. dbgln_if(TOKENIZER_TRACE_DEBUG, "[{}] Parser switches tokenizer state to {}", state_name(m_state), state_name(new_state));
  2754. m_state = new_state;
  2755. }
  2756. void HTMLTokenizer::will_emit(HTMLToken& token)
  2757. {
  2758. if (token.is_start_tag())
  2759. m_last_emitted_start_tag_name = token.tag_name().to_deprecated_fly_string();
  2760. auto is_start_or_end_tag = token.type() == HTMLToken::Type::StartTag || token.type() == HTMLToken::Type::EndTag;
  2761. token.set_end_position({}, nth_last_position(is_start_or_end_tag ? 1 : 0));
  2762. if (is_start_or_end_tag)
  2763. token.normalize_attributes();
  2764. }
  2765. bool HTMLTokenizer::current_end_tag_token_is_appropriate() const
  2766. {
  2767. VERIFY(m_current_token.is_end_tag());
  2768. if (!m_last_emitted_start_tag_name.has_value())
  2769. return false;
  2770. return m_current_token.tag_name().to_deprecated_fly_string() == m_last_emitted_start_tag_name.value();
  2771. }
  2772. bool HTMLTokenizer::consumed_as_part_of_an_attribute() const
  2773. {
  2774. return m_return_state == State::AttributeValueUnquoted || m_return_state == State::AttributeValueSingleQuoted || m_return_state == State::AttributeValueDoubleQuoted;
  2775. }
  2776. void HTMLTokenizer::restore_to(Utf8CodePointIterator const& new_iterator)
  2777. {
  2778. auto diff = m_utf8_iterator - new_iterator;
  2779. if (diff > 0) {
  2780. for (ssize_t i = 0; i < diff; ++i) {
  2781. if (!m_source_positions.is_empty())
  2782. m_source_positions.take_last();
  2783. }
  2784. } else {
  2785. // Going forwards...?
  2786. TODO();
  2787. }
  2788. m_utf8_iterator = new_iterator;
  2789. }
  2790. String HTMLTokenizer::consume_current_builder()
  2791. {
  2792. auto string = m_current_builder.to_string_without_validation();
  2793. m_current_builder.clear();
  2794. return string;
  2795. }
  2796. }