Kaynağa Gözat

LibWeb: Fix character references losing characters in certain situations

This fixes 4 issues:
- RECONSUME_IN_RETURN_STATE was functionally equivalent to
  SWITCH_TO_RETURN_STATE, which caused us to lose characters.
  For example, &test= would lose the =

- & characters by themselves would be lost. For example, 1 & 2
  would become 1 2. This is because we forgot to flush
  characters in the ANYTHING_ELSE path in CharacterReference

- Named character references didn't work at all in attributes.
  This is because there was a path that was checking the entity
  code points instead of the entity itself. Plus, the path that
  was checking the entity itself wasn't quite spec compliant.

- If we fail to match a named character reference, the first
  character is lost. For example &test would become &est.
  However, this relies on a little hack since I can't wrap my
  head around how to change the code to do as the spec says.
  The hack is to reconsume in AmbiguousAmpersand instead of
  just switching to it.

Fixes #3957
Luke 4 yıl önce
ebeveyn
işleme
6ffcd53479
1 değiştirilmiş dosya ile 13 ekleme ve 16 silme
  1. 13 16
      Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp

+ 13 - 16
Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp

@@ -71,11 +71,13 @@ namespace Web::HTML {
         goto _StartOfFunction;          \
     } while (0)
 
-#define RECONSUME_IN_RETURN_STATE          \
-    do {                                   \
-        will_reconsume_in(m_return_state); \
-        m_state = m_return_state;          \
-        goto _StartOfFunction;             \
+#define RECONSUME_IN_RETURN_STATE                   \
+    do {                                            \
+        will_reconsume_in(m_return_state);          \
+        m_state = m_return_state;                   \
+        if (current_input_character.has_value())    \
+            m_utf8_iterator = m_prev_utf8_iterator; \
+        goto _StartOfFunction;                      \
     } while (0)
 
 #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
@@ -1525,6 +1527,7 @@ _StartOfFunction:
                 }
                 ANYTHING_ELSE
                 {
+                    FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
                     RECONSUME_IN_RETURN_STATE;
                 }
             }
@@ -1544,17 +1547,9 @@ _StartOfFunction:
                     for (auto ch : match.value().entity)
                         m_temporary_buffer.append(ch);
 
-                    if (consumed_as_part_of_an_attribute() && match.value().code_points.last() != ';') {
-                        auto next = peek_code_point(0);
-                        if (next.has_value() && (next.value() == '=' || isalnum(next.value()))) {
-                            FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
-                            SWITCH_TO_RETURN_STATE;
-                        }
-                    }
-
-                    if (consumed_as_part_of_an_attribute() && match.value().entity.ends_with(';')) {
+                    if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) {
                         auto next_code_point = peek_code_point(0);
-                        if (next_code_point.has_value() && next_code_point.value() == '=') {
+                        if (next_code_point.has_value() && (next_code_point.value() == '=' || isalnum(next_code_point.value()))) {
                             FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
                             SWITCH_TO_RETURN_STATE;
                         }
@@ -1571,7 +1566,9 @@ _StartOfFunction:
                     SWITCH_TO_RETURN_STATE;
                 } else {
                     FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
-                    SWITCH_TO(AmbiguousAmpersand);
+                    // FIXME: This should be SWITCH_TO, but we always lose the first character on this path, so just reconsume it.
+                    //        I can't wrap my head around how to do it as the spec says.
+                    RECONSUME_IN(AmbiguousAmpersand);
                 }
             }
             END_STATE