Prechádzať zdrojové kódy

LibWeb: Handle iso-8859-1 web content a little bit better

We now look at the HTTP response headers for a Content-Type header and
try to parse it if present to find the text encoding.

If the text encoding is iso-8859-1, we turn all non-ASCII characters
into question marks. This makes Swedish Google load on my machine! :^)
Andreas Kling 5 rokov pred
rodič
commit
f3676ebef5

+ 18 - 1
Libraries/LibWeb/HtmlView.cpp

@@ -343,6 +343,15 @@ static RefPtr<Document> create_image_document(const ByteBuffer& data, const URL&
     return document;
 }
 
+// Extracts the text encoding named by the "charset" parameter of a
+// Content-Type header value, e.g. "text/html; charset=ISO-8859-1".
+//
+// Returns the encoding name in lowercase, with surrounding whitespace and
+// quotes removed. Returns "utf-8" if no charset parameter is present or if
+// its value is empty.
+String encoding_from_content_type(const String& content_type)
+{
+    // Parameter names are case-insensitive, so search a lowercased copy.
+    // The result is returned in lowercase anyway.
+    auto lowercase_content_type = content_type.to_lowercase();
+
+    constexpr size_t charset_key_length = 8; // Length of "charset="
+    auto offset = lowercase_content_type.index_of("charset=");
+    if (!offset.has_value())
+        return "utf-8";
+
+    // The value ends at the next parameter separator, not at the end of the header.
+    size_t start = offset.value() + charset_key_length;
+    size_t end = start;
+    while (end < lowercase_content_type.length() && lowercase_content_type[end] != ';')
+        ++end;
+
+    // Drop whitespace and quotes around the value (charset="utf-8" is valid).
+    auto is_padding = [](char ch) {
+        return ch == ' ' || ch == '\t' || ch == '"' || ch == '\'';
+    };
+    while (start < end && is_padding(lowercase_content_type[start]))
+        ++start;
+    while (end > start && is_padding(lowercase_content_type[end - 1]))
+        --end;
+
+    if (start == end)
+        return "utf-8";
+
+    return lowercase_content_type.substring(start, end - start);
+}
+
 void HtmlView::load(const URL& url)
 {
     dbg() << "HtmlView::load: " << url.to_string();
@@ -370,7 +379,15 @@ void HtmlView::load(const URL& url)
             if (url.path().ends_with(".png") || url.path().ends_with(".gif")) {
                 document = create_image_document(data, url);
             } else {
-                document = parse_html_document(data, url);
+                String encoding = "utf-8";
+
+                auto content_type = response_headers.get("Content-Type");
+                if (content_type.has_value()) {
+                    encoding = encoding_from_content_type(content_type.value());
+                    dbg() << "I think this content has encoding '" << encoding << "'";
+                }
+
+                document = parse_html_document(data, url, encoding);
             }
             ASSERT(document);
             set_document(document);

+ 24 - 3
Libraries/LibWeb/Parser/HTMLParser.cpp

@@ -383,16 +383,37 @@ static bool parse_html_document(const StringView& html, Document& document, Pare
     return true;
 }
 
-RefPtr<DocumentFragment> parse_html_fragment(Document& document, const StringView& html)
+// Converts raw document bytes in the given (lowercase) encoding to UTF-8.
+//
+// - "utf-8": the input is returned unchanged (it is not validated).
+// - "iso-8859-1": every byte is transcoded to the UTF-8 encoding of the
+//   code point with the same value (U+0000..U+00FF).
+// - anything else: a debug message is logged and the input is returned
+//   unchanged. The encoding name comes from a server-supplied header, so an
+//   unrecognized value must not bring the process down.
+String to_utf8(const StringView& input, const String& encoding)
+{
+    if (encoding == "iso-8859-1") {
+        StringBuilder builder(input.length());
+        for (size_t i = 0; i < input.length(); ++i) {
+            u8 ch = input[i];
+            if (ch < 0x80) {
+                // ASCII is identical in ISO-8859-1 and UTF-8.
+                builder.append(static_cast<char>(ch));
+            } else {
+                // Code points U+0080..U+00FF take two bytes in UTF-8:
+                // 110000xx 10xxxxxx.
+                builder.append(static_cast<char>(0xc0 | (ch >> 6)));
+                builder.append(static_cast<char>(0x80 | (ch & 0x3f)));
+            }
+        }
+        return builder.to_string();
+    }
+
+    if (encoding != "utf-8")
+        dbg() << "Unknown encoding " << encoding << ", treating content as utf-8";
+
+    return String(input);
+}
+
+RefPtr<DocumentFragment> parse_html_fragment(Document& document, const StringView& raw_html, const String& encoding)
 {
     auto fragment = adopt(*new DocumentFragment(document));
-    if (!parse_html_document(html, document, *fragment))
+    if (!parse_html_document(to_utf8(raw_html, encoding), document, *fragment))
         return nullptr;
     return fragment;
 }
 
-RefPtr<Document> parse_html_document(const StringView& html, const URL& url)
+RefPtr<Document> parse_html_document(const StringView& raw_html, const URL& url, const String& encoding)
 {
+    String html = to_utf8(raw_html, encoding);
+
     auto document = adopt(*new Document(url));
     document->set_source(html);
 

+ 2 - 2
Libraries/LibWeb/Parser/HTMLParser.h

@@ -33,7 +33,7 @@ namespace Web {
 
 class DocumentFragment;
 
-RefPtr<Document> parse_html_document(const StringView&, const URL& = URL());
-RefPtr<DocumentFragment> parse_html_fragment(Document&, const StringView&);
+RefPtr<Document> parse_html_document(const StringView&, const URL& = URL(), const String& encoding = "utf-8");
+RefPtr<DocumentFragment> parse_html_fragment(Document&, const StringView&, const String& encoding = "utf-8");
 
 }