Forráskód Böngészése

LibTextCodec: Start fleshing out a simple text codec library

We're starting with a very basic decoding API that only supports ISO-8859-1
and UTF-8 decoding. (UTF-8 decoding is really a no-op, since String is
expected to be UTF-8.)
Andreas Kling 5 éve
szülő
commit
e09b83c60c

+ 1 - 1
Applications/Browser/Makefile

@@ -7,7 +7,7 @@ OBJS = \
 
 PROGRAM = Browser
 
-LIB_DEPS = Web JS GUI Gfx IPC Protocol Core
+LIB_DEPS = Web JS TextCodec GUI Gfx IPC Protocol Core
 
 main.cpp: ../../Libraries/LibWeb/CSS/PropertyID.h
 ../../Libraries/LibWeb/CSS/PropertyID.h:

+ 1 - 1
Applications/Help/Makefile

@@ -7,6 +7,6 @@ OBJS = \
 
 PROGRAM = Help
 
-LIB_DEPS = GUI Web JS Gfx Markdown IPC Protocol Thread Pthread Core
+LIB_DEPS = GUI Web TextCodec JS Gfx Markdown IPC Protocol Thread Pthread Core
 
 include ../../Makefile.common

+ 1 - 1
Applications/IRCClient/Makefile

@@ -11,6 +11,6 @@ OBJS = \
 
 PROGRAM = IRCClient
 
-LIB_DEPS = Web JS GUI Gfx Protocol IPC Thread Pthread Core
+LIB_DEPS = Web TextCodec JS GUI Gfx Protocol IPC Thread Pthread Core
 
 include ../../Makefile.common

+ 1 - 1
Applications/TextEditor/Makefile

@@ -4,6 +4,6 @@ OBJS = \
 
 PROGRAM = TextEditor
 
-LIB_DEPS = Web Markdown GUI Gfx VT Protocol IPC Thread Pthread Core JS
+LIB_DEPS = Web TextCodec Markdown GUI Gfx VT Protocol IPC Thread Pthread Core JS
 
 include ../../Makefile.common

+ 1 - 1
DevTools/HackStudio/Makefile

@@ -18,6 +18,6 @@ OBJS = \
 
 PROGRAM = HackStudio
 
-LIB_DEPS = GUI Web VT Protocol Markdown Gfx IPC Thread Pthread Core JS Debug
+LIB_DEPS = GUI Web TextCodec VT Protocol Markdown Gfx IPC Thread Pthread Core JS Debug
 
 include ../../Makefile.common

+ 90 - 0
Libraries/LibTextCodec/Decoder.cpp

@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <AK/String.h>
+#include <AK/StringBuilder.h>
+#include <LibTextCodec/Decoder.h>
+
+namespace TextCodec {
+
+// Returns the shared Latin1Decoder instance, allocating it on first call.
+// Nothing in this file deletes it, so the returned reference stays usable.
+Latin1Decoder& latin1_decoder()
+{
+    static Latin1Decoder* decoder;
+    if (!decoder)
+        decoder = new Latin1Decoder;
+    return *decoder;
+}
+
+// Returns the shared UTF8Decoder instance, allocating it on first call.
+UTF8Decoder& utf8_decoder()
+{
+    static UTF8Decoder* decoder;
+    if (!decoder)
+        decoder = new UTF8Decoder;
+    return *decoder;
+}
+
+// Looks up a decoder by encoding name, ignoring case. Only "iso-8859-1" and
+// "utf-8" are known; any other name yields nullptr, which callers must check.
+Decoder* decoder_for(const String& encoding)
+{
+    if (encoding.equals_ignoring_case("iso-8859-1"))
+        return &latin1_decoder();
+    if (encoding.equals_ignoring_case("utf-8"))
+        return &utf8_decoder();
+    return nullptr;
+}
+
+// Passes the input through unchanged; it is not validated as UTF-8 here.
+String UTF8Decoder::to_utf8(const StringView& input)
+{
+    return input;
+}
+
+// Converts ISO-8859-1 bytes to UTF-8. Every Latin-1 byte value is the Unicode
+// code point of the same number (U+0000..U+00FF), so nothing is unmappable.
+String Latin1Decoder::to_utf8(const StringView& input)
+{
+    // NOTE(review): assumes the constructor argument is only an initial
+    // capacity, since bytes >= 0x80 grow to two bytes each -- confirm.
+    StringBuilder builder(input.length());
+    for (size_t i = 0; i < input.length(); ++i) {
+        u8 ch = input[i];
+        if (ch < 0x80) {
+            // ASCII is encoded identically in UTF-8.
+            builder.append(static_cast<char>(ch));
+            continue;
+        }
+        // U+0080..U+00FF: lead byte 0xC2 or 0xC3, then a 10xxxxxx continuation.
+        builder.append(static_cast<char>(0xc0 | (ch >> 6)));
+        builder.append(static_cast<char>(0x80 | (ch & 0x3f)));
+    }
+    return builder.to_string();
+}
+
+}

+ 59 - 0
Libraries/LibTextCodec/Decoder.h

@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <AK/Forward.h>
+
+namespace TextCodec {
+
+// Interface for converters from one text encoding to UTF-8.
+class Decoder {
+public:
+    // Virtual so that deleting a decoder through a Decoder* is well-defined.
+    virtual ~Decoder() = default;
+
+    // Returns the UTF-8 form of the given bytes, read in this decoder's encoding.
+    virtual String to_utf8(const StringView&) = 0;
+};
+
+// Decoder for input that is already UTF-8.
+class UTF8Decoder final : public Decoder {
+public:
+    virtual String to_utf8(const StringView&) override;
+};
+
+// Decoder for ISO-8859-1 (Latin-1) input.
+class Latin1Decoder final : public Decoder {
+public:
+    virtual String to_utf8(const StringView&) override;
+};
+
+// Returns the decoder for the named encoding, or nullptr if it is not known.
+// The result is non-owning: callers must not delete it.
+Decoder* decoder_for(const String& encoding);
+
+}

+ 15 - 0
Libraries/LibTextCodec/Makefile

@@ -0,0 +1,15 @@
+OBJS = \
+    Decoder.o
+# File name of the library; the install rule below copies it to Root/usr/lib.
+LIBRARY = libtextcodec.a
+# Copies this directory's headers and $(LIBRARY) under $(SERENITY_BASE_DIR)/Root.
+install:
+	for dir in .; do \
+	    mkdir -p $(SERENITY_BASE_DIR)/Root/usr/include/LibTextCodec/$$dir; \
+	    cp $$dir/*.h $(SERENITY_BASE_DIR)/Root/usr/include/LibTextCodec/$$dir/; \
+	done
+	cp $(LIBRARY) $(SERENITY_BASE_DIR)/Root/usr/lib/
+
+include ../../Makefile.common
+
+include ../../Makefile.subdir

+ 4 - 15
Libraries/LibWeb/Parser/HTMLParser.cpp

@@ -27,6 +27,7 @@
 #include <AK/Function.h>
 #include <AK/NonnullRefPtrVector.h>
 #include <AK/StringBuilder.h>
+#include <LibTextCodec/Decoder.h>
 #include <LibWeb/DOM/Comment.h>
 #include <LibWeb/DOM/DocumentFragment.h>
 #include <LibWeb/DOM/DocumentType.h>
@@ -385,21 +386,9 @@ static bool parse_html_document(const StringView& html, Document& document, Pare
 
 String to_utf8(const StringView& input, const String& encoding)
 {
-    String output;
-    if (encoding == "utf-8") {
-        output = input;
-    } else if (encoding == "iso-8859-1") {
-        StringBuilder builder(input.length());
-        for (size_t i = 0; i < input.length(); ++i) {
-            u8 ch = input[i];
-            builder.append(ch >= 0x80 ? '?' : ch);
-        }
-        output = builder.to_string();
-    } else {
-        dbg() << "Unknown encoding " << encoding;
-        ASSERT_NOT_REACHED();
-    }
-    return output;
+    auto* decoder = TextCodec::decoder_for(encoding);
+    ASSERT(decoder); // decoder_for() returns nullptr for names other than "iso-8859-1" and "utf-8".
+    return decoder->to_utf8(input);
 }
 
 RefPtr<DocumentFragment> parse_html_fragment(Document& document, const StringView& raw_html, const String& encoding)

+ 1 - 1
Userland/Makefile

@@ -4,7 +4,7 @@ APPS = ${SRCS:.cpp=}
 
 EXTRA_CLEAN = $(APPS)
 
-LIB_DEPS = Crypto TLS Web GUI Gfx Audio Protocol IPC Thread Pthread PCIDB Markdown JS Core Line X86 Debug
+LIB_DEPS = Crypto TLS Web TextCodec GUI Gfx Audio Protocol IPC Thread Pthread PCIDB Markdown JS Core Line X86 Debug
 
 include ../Makefile.common