From f3676ebef501a6a3ea5748cc405841a90f60d979 Mon Sep 17 00:00:00 2001
From: Andreas Kling
Date: Sun, 3 May 2020 22:22:05 +0200
Subject: [PATCH] LibWeb: Handle iso-8859-1 web content a little bit better

We now look at the HTTP response headers for a Content-Type header and
try to parse it if present to find the text encoding.

If the text encoding is iso-8859-1, we turn all non-ASCII characters
into question marks. This makes Swedish Google load on my machine! :^)
---
 Libraries/LibWeb/HtmlView.cpp          | 19 +++++++++++++++++-
 Libraries/LibWeb/Parser/HTMLParser.cpp | 27 +++++++++++++++++++++++---
 Libraries/LibWeb/Parser/HTMLParser.h   |  4 ++--
 3 files changed, 44 insertions(+), 6 deletions(-)

diff --git a/Libraries/LibWeb/HtmlView.cpp b/Libraries/LibWeb/HtmlView.cpp
index b74b5c85530..0a1cb337ada 100644
--- a/Libraries/LibWeb/HtmlView.cpp
+++ b/Libraries/LibWeb/HtmlView.cpp
@@ -343,6 +343,15 @@ static RefPtr<Document> create_image_document(const ByteBuffer& data, const URL&
     return document;
 }
 
+String encoding_from_content_type(const String& content_type)
+{
+    auto offset = content_type.index_of("charset=");
+    if (offset.has_value())
+        return content_type.substring(offset.value() + 8, content_type.length() - offset.value() - 8).to_lowercase();
+
+    return "utf-8";
+}
+
 void HtmlView::load(const URL& url)
 {
     dbg() << "HtmlView::load: " << url.to_string();
@@ -370,7 +379,15 @@ void HtmlView::load(const URL& url)
             if (url.path().ends_with(".png") || url.path().ends_with(".gif")) {
                 document = create_image_document(data, url);
             } else {
-                document = parse_html_document(data, url);
+                String encoding = "utf-8";
+
+                auto content_type = response_headers.get("Content-Type");
+                if (content_type.has_value()) {
+                    encoding = encoding_from_content_type(content_type.value());
+                    dbg() << "I think this content has encoding '" << encoding << "'";
+                }
+
+                document = parse_html_document(data, url, encoding);
             }
             ASSERT(document);
             set_document(document);
diff --git a/Libraries/LibWeb/Parser/HTMLParser.cpp b/Libraries/LibWeb/Parser/HTMLParser.cpp
index 25eaa237fdf..ca46b8de53c 100644
--- a/Libraries/LibWeb/Parser/HTMLParser.cpp
+++ b/Libraries/LibWeb/Parser/HTMLParser.cpp
@@ -383,16 +383,37 @@ static bool parse_html_document(const StringView& html, Document& document, Pare
     return true;
 }
 
-RefPtr<DocumentFragment> parse_html_fragment(Document& document, const StringView& html)
+String to_utf8(const StringView& input, const String& encoding)
+{
+    String output;
+    if (encoding == "utf-8") {
+        output = input;
+    } else if (encoding == "iso-8859-1") {
+        StringBuilder builder(input.length());
+        for (size_t i = 0; i < input.length(); ++i) {
+            u8 ch = input[i];
+            builder.append(ch >= 0x80 ? '?' : ch);
+        }
+        output = builder.to_string();
+    } else {
+        dbg() << "Unknown encoding " << encoding;
+        ASSERT_NOT_REACHED();
+    }
+    return output;
+}
+
+RefPtr<DocumentFragment> parse_html_fragment(Document& document, const StringView& raw_html, const String& encoding)
 {
     auto fragment = adopt(*new DocumentFragment(document));
-    if (!parse_html_document(html, document, *fragment))
+    if (!parse_html_document(to_utf8(raw_html, encoding), document, *fragment))
         return nullptr;
     return fragment;
 }
 
-RefPtr<Document> parse_html_document(const StringView& html, const URL& url)
+RefPtr<Document> parse_html_document(const StringView& raw_html, const URL& url, const String& encoding)
 {
+    String html = to_utf8(raw_html, encoding);
+
     auto document = adopt(*new Document(url));
     document->set_source(html);
 
diff --git a/Libraries/LibWeb/Parser/HTMLParser.h b/Libraries/LibWeb/Parser/HTMLParser.h
index 60d234f87c1..da60ddcca1d 100644
--- a/Libraries/LibWeb/Parser/HTMLParser.h
+++ b/Libraries/LibWeb/Parser/HTMLParser.h
@@ -33,7 +33,7 @@ namespace Web {
 
 class DocumentFragment;
 
-RefPtr<Document> parse_html_document(const StringView&, const URL& = URL());
-RefPtr<DocumentFragment> parse_html_fragment(Document&, const StringView&);
+RefPtr<Document> parse_html_document(const StringView&, const URL& = URL(), const String& encoding = "utf-8");
+RefPtr<DocumentFragment> parse_html_fragment(Document&, const StringView&, const String& encoding = "utf-8");
 
 }