فهرست منبع

LibWeb: Emit character/comment tokens lazily to accumulate more data

Instead of emitting data-bearing tokens immediately, do it lazily at
the next state change. This allows us to accumulate full bursts of
text in between tags instead of having one token per character. :^)
Andreas Kling 5 سال پیش
والد
کامیت
7be36366be
2 فایل‌های تغییر یافته به همراه 31 افزوده شده و 3 حذف شده
  1. 30 3
      Libraries/LibWeb/Parser/HTMLTokenizer.cpp
  2. 1 0
      Libraries/LibWeb/Parser/HTMLTokenizer.h

+ 30 - 3
Libraries/LibWeb/Parser/HTMLTokenizer.cpp

@@ -30,7 +30,11 @@
 
 
 //#define TOKENIZER_TRACE
 //#define TOKENIZER_TRACE
 
 
-#define TODO ASSERT_NOT_REACHED
+#define TODO()                                                                                              \
+    do {                                                                                                    \
+        dbg() << "[TODO: " << state_name(m_state) << "] '" << (char)current_input_character.value() << "'"; \
+        ASSERT_NOT_REACHED();                                                                               \
+    } while (0)
 
 
 #define SWITCH_TO(new_state)                    \
 #define SWITCH_TO(new_state)                    \
     will_switch_to(State::new_state);           \
     will_switch_to(State::new_state);           \
@@ -115,9 +119,9 @@ void HTMLTokenizer::run()
                 }
                 }
                 ANYTHING_ELSE
                 ANYTHING_ELSE
                 {
                 {
-                    create_new_token(HTMLToken::Type::Character);
+                    if (m_current_token.type() != HTMLToken::Type::Character)
+                        create_new_token(HTMLToken::Type::Character);
                     m_current_token.m_comment_or_character.data.append(current_input_character.value());
                     m_current_token.m_comment_or_character.data.append(current_input_character.value());
-                    emit_current_token();
                     continue;
                     continue;
                 }
                 }
             }
             }
@@ -138,6 +142,14 @@ void HTMLTokenizer::run()
                     create_new_token(HTMLToken::Type::StartTag);
                     create_new_token(HTMLToken::Type::StartTag);
                     RECONSUME_IN(TagName);
                     RECONSUME_IN(TagName);
                 }
                 }
+                ON('?')
+                {
+                    TODO();
+                }
+                ANYTHING_ELSE
+                {
+                    TODO();
+                }
             }
             }
             END_STATE
             END_STATE
 
 
@@ -699,12 +711,19 @@ void HTMLTokenizer::emit_current_token()
         builder.append("} }");
         builder.append("} }");
     }
     }
 
 
+    if (m_current_token.type() == HTMLToken::Type::Comment || m_current_token.type() == HTMLToken::Type::Character) {
+        builder.append(" { data: '");
+        builder.append(m_current_token.m_comment_or_character.data.to_string());
+        builder.append(" }");
+    }
+
     dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string();
     dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string();
     m_current_token = {};
     m_current_token = {};
 }
 }
 
 
 void HTMLTokenizer::create_new_token(HTMLToken::Type type)
 void HTMLTokenizer::create_new_token(HTMLToken::Type type)
 {
 {
+    flush_current_character_or_comment_if_needed();
     m_current_token = {};
     m_current_token = {};
     m_current_token.m_type = type;
     m_current_token.m_type = type;
 }
 }
@@ -716,6 +735,7 @@ HTMLTokenizer::HTMLTokenizer(const StringView& input)
 
 
 void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
 void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
 {
 {
+    flush_current_character_or_comment_if_needed();
 #ifdef TOKENIZER_TRACE
 #ifdef TOKENIZER_TRACE
     dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
     dbg() << "[" << state_name(m_state) << "] Switch to " << state_name(new_state);
 #endif
 #endif
@@ -723,9 +743,16 @@ void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
 
 
 void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
 void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)
 {
 {
+    flush_current_character_or_comment_if_needed();
 #ifdef TOKENIZER_TRACE
 #ifdef TOKENIZER_TRACE
     dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
     dbg() << "[" << state_name(m_state) << "] Reconsume in " << state_name(new_state);
 #endif
 #endif
 }
 }
 
 
+void HTMLTokenizer::flush_current_character_or_comment_if_needed()
+{
+    if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment)
+        emit_current_token();
+}
+
 }
 }

+ 1 - 0
Libraries/LibWeb/Parser/HTMLTokenizer.h

@@ -148,6 +148,7 @@ private:
 
 
     void will_switch_to(State);
     void will_switch_to(State);
     void will_reconsume_in(State);
     void will_reconsume_in(State);
+    void flush_current_character_or_comment_if_needed();
 
 
     State m_state { State::Data };
     State m_state { State::Data };
     State m_return_state { State::Data };
     State m_return_state { State::Data };