Browse Source

LibArchive: Add Zip file parser

This is based on PKWARE's zip specification
(https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT) and will
be used in the unzip utility and eventually in the zip utility.
Idan Horowitz 4 years ago
parent
commit
caf4bde3a9

+ 3 - 2
Userland/Libraries/LibArchive/CMakeLists.txt

@@ -1,6 +1,7 @@
 set(SOURCES
-    TarStream.cpp
-)
+        TarStream.cpp
+        Zip.cpp
+        )
 
 serenity_lib(LibArchive archive)
 target_link_libraries(LibArchive LibCore)

+ 120 - 0
Userland/Libraries/LibArchive/Zip.cpp

@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2021, Idan Horowitz <idan.horowitz@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <LibArchive/Zip.h>
+
+namespace Archive {
+
+// Locates the end of central directory record by scanning backwards from the end of `buffer`.
+//
+// On disk the record is the 4-byte signature, followed by the fixed-size fields, followed by a
+// comment whose length is stored in a 16-bit field. The signature can therefore start anywhere from
+// `minimum_record_size` up to `minimum_record_size + UINT16_MAX` bytes before the end of the buffer.
+//
+// On success the offset of the signature is stored in `offset` and true is returned. If no signature
+// is found inside that window, false is returned and `offset` is left untouched.
+bool Zip::find_end_of_central_directory_offset(const ReadonlyBytes& buffer, size_t& offset)
+{
+    // The signature is not a member of EndOfCentralDirectory, so it has to be added on top of the
+    // packed fields (sizeof minus the trailing comment pointer) to get the smallest on-disk record.
+    constexpr size_t minimum_record_size = sizeof(end_of_central_directory_signature) + (sizeof(EndOfCentralDirectory) - sizeof(u8*));
+    if (buffer.size() < minimum_record_size)
+        return false;
+
+    // Offset at which the signature sits when the archive has no trailing comment.
+    const size_t last_possible_offset = buffer.size() - minimum_record_size;
+    // Never step back past the start of the buffer, nor further than the longest possible comment.
+    const size_t longest_comment = static_cast<size_t>(UINT16_MAX);
+    const size_t max_backwards_offset = last_possible_offset < longest_comment ? last_possible_offset : longest_comment;
+
+    for (size_t backwards_offset = 0; backwards_offset <= max_backwards_offset; backwards_offset++) {
+        auto signature_offset = last_possible_offset - backwards_offset;
+        if (memcmp(buffer.data() + signature_offset, end_of_central_directory_signature, sizeof(end_of_central_directory_signature)) == 0) {
+            offset = signature_offset;
+            return true;
+        }
+    }
+    return false;
+}
+
+// Validates `buffer` as a zip archive and, on success, returns a Zip that refers to it.
+//
+// Every central directory record and the local file header it points to are checked up front, so
+// that for_each_member() can later walk the same records without re-validating them.
+//
+// Returns an empty Optional when the archive is malformed or uses a feature that is not supported
+// (multiple volumes, encryption, data descriptors, compression methods other than Store/Deflate).
+Optional<Zip> Zip::try_create(const ReadonlyBytes& buffer)
+{
+    // Bits of the general purpose flags field (PKWARE APPNOTE, "general purpose bit flag").
+    constexpr u16 encrypted_flag = 1 << 0;
+    constexpr u16 data_descriptor_flag = 1 << 3;
+
+    size_t end_of_central_directory_offset;
+    if (!find_end_of_central_directory_offset(buffer, end_of_central_directory_offset))
+        return {};
+
+    EndOfCentralDirectory end_of_central_directory {};
+    if (!end_of_central_directory.read(buffer.slice(end_of_central_directory_offset)))
+        return {};
+
+    if (end_of_central_directory.disk_number != 0 || end_of_central_directory.central_directory_start_disk != 0 || end_of_central_directory.disk_records_count != end_of_central_directory.total_records_count)
+        return {}; // TODO: support multi-volume zip archives
+
+    size_t member_offset = end_of_central_directory.central_directory_offset;
+    for (size_t i = 0; i < end_of_central_directory.total_records_count; i++) {
+        // Offsets are read straight from the (untrusted) archive, so an out-of-range value has to be
+        // rejected here instead of being handed to slice().
+        if (member_offset > buffer.size())
+            return {};
+        CentralDirectoryRecord central_directory_record {};
+        if (!central_directory_record.read(buffer.slice(member_offset)))
+            return {};
+        // The name, extra data and comment trail the fixed fields; make sure the whole record lies
+        // inside the buffer before the name is inspected below.
+        if (buffer.size() - member_offset < central_directory_record.size())
+            return {};
+        if (central_directory_record.general_purpose_flags & encrypted_flag)
+            return {}; // TODO: support encrypted zip members
+        if (central_directory_record.general_purpose_flags & data_descriptor_flag)
+            return {}; // TODO: support zip data descriptors
+        if (central_directory_record.compression_method != ZipCompressionMethod::Store && central_directory_record.compression_method != ZipCompressionMethod::Deflate)
+            return {}; // TODO: support obsolete zip compression methods
+        // Stored members are not transformed, so both sizes have to agree.
+        if (central_directory_record.compression_method == ZipCompressionMethod::Store && central_directory_record.uncompressed_size != central_directory_record.compressed_size)
+            return {};
+        if (central_directory_record.start_disk != 0)
+            return {}; // TODO: support multi-volume zip archives
+        // for_each_member() turns the name into a null-terminated string, so embedded nulls are rejected.
+        if (memchr(central_directory_record.name, 0, central_directory_record.name_length) != nullptr)
+            return {};
+
+        if (central_directory_record.local_file_header_offset > buffer.size())
+            return {};
+        LocalFileHeader local_file_header {};
+        if (!local_file_header.read(buffer.slice(central_directory_record.local_file_header_offset)))
+            return {};
+        // compressed_data is derived from lengths stored in the local header and may point past the
+        // end of the buffer; check the start first so the subtraction below cannot wrap around.
+        const size_t compressed_data_offset = local_file_header.compressed_data - buffer.data();
+        if (compressed_data_offset > buffer.size())
+            return {};
+        if (buffer.size() - compressed_data_offset < central_directory_record.compressed_size)
+            return {};
+        member_offset += central_directory_record.size();
+    }
+
+    Zip zip;
+    zip.m_input_data = buffer;
+    zip.member_count = end_of_central_directory.total_records_count;
+    zip.members_start_offset = end_of_central_directory.central_directory_offset;
+    return zip;
+}
+
+// Invokes `callback` once for every member listed in the central directory, in directory order.
+//
+// The records are re-read from m_input_data on every call; a record that fails to parse trips a
+// VERIFY, since try_create() is expected to have accepted the very same bytes already.
+//
+// Returns false as soon as the callback answers IterationDecision::Break, true once every member
+// has been visited.
+bool Zip::for_each_member(Function<IterationDecision(const ZipMember&)> callback)
+{
+    size_t record_offset = members_start_offset;
+    for (size_t remaining = member_count; remaining > 0; remaining--) {
+        CentralDirectoryRecord record {};
+        VERIFY(record.read(m_input_data.slice(record_offset)));
+        LocalFileHeader header {};
+        VERIFY(header.read(m_input_data.slice(record.local_file_header_offset)));
+
+        // The name inside the archive is not null-terminated, so build a terminated copy first.
+        const size_t name_length = record.name_length;
+        char name_buffer[name_length + 1];
+        memcpy(name_buffer, record.name, name_length);
+        name_buffer[name_length] = 0;
+
+        ZipMember member;
+        member.name = String { name_buffer };
+        // The payload location comes from the local header, its size from the central directory.
+        member.compressed_data = { header.compressed_data, record.compressed_size };
+        member.compression_method = static_cast<ZipCompressionMethod>(record.compression_method);
+        member.uncompressed_size = record.uncompressed_size;
+        member.crc32 = record.crc32;
+        // FIXME: better directory detection
+        const bool has_directory_attribute = (record.external_attributes & zip_directory_external_attribute) != 0;
+        member.is_directory = has_directory_attribute || member.name.ends_with('/');
+
+        if (callback(member) == IterationDecision::Break)
+            return false;
+
+        record_offset += record.size();
+    }
+    return true;
+}
+
+}

+ 234 - 0
Userland/Libraries/LibArchive/Zip.h

@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2021, Idan Horowitz <idan.horowitz@gmail.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#pragma once
+
+#include <AK/Function.h>
+#include <AK/IterationDecision.h>
+#include <AK/Span.h>
+#include <AK/Stream.h>
+#include <AK/String.h>
+#include <string.h>
+
+namespace Archive {
+
+// NOTE: Due to the format of zip files compression is streamed and decompression is random access.
+
+static constexpr u8 end_of_central_directory_signature[] = { 0x50, 0x4b, 0x05, 0x06 }; // 'PK\x05\x06'
+// The zip "end of central directory" record.
+//
+// The struct is packed so that the fixed-size fields (everything up to and including comment_length)
+// are laid out back to back and can be copied from the archive in one go. The signature is NOT a
+// member; it precedes the fields on disk. `comment` is not part of the fixed fields: after read() it
+// points into the buffer that was passed in.
+struct [[gnu::packed]] EndOfCentralDirectory {
+    u16 disk_number;
+    u16 central_directory_start_disk;
+    u16 disk_records_count;
+    u16 total_records_count;
+    u32 central_directory_size;
+    u32 central_directory_offset;
+    u16 comment_length;
+    const u8* comment;
+
+    // Parses the record that starts (with its signature) at the beginning of `buffer`.
+    //
+    // Returns false if the buffer is too short for the signature plus the fixed fields, if the
+    // signature does not match, or if the comment announced by comment_length does not fit into
+    // the buffer. The fixed fields may already have been overwritten when false is returned.
+    bool read(ReadonlyBytes buffer)
+    {
+        auto fields_size = sizeof(EndOfCentralDirectory) - sizeof(u8*);
+        // The signature precedes the fields, so both have to be present before anything is copied.
+        auto header_size = sizeof(end_of_central_directory_signature) + fields_size;
+        if (buffer.size() < header_size)
+            return false;
+        if (memcmp(buffer.data(), end_of_central_directory_signature, sizeof(end_of_central_directory_signature)) != 0)
+            return false;
+        // Raw byte copy into the packed fields.
+        // NOTE(review): this presumably relies on the host byte order matching the archive's - confirm.
+        memcpy(reinterpret_cast<void*>(&disk_number), buffer.data() + sizeof(end_of_central_directory_signature), fields_size);
+        // Do not hand out a comment pointer whose announced length runs past the end of the buffer.
+        if (buffer.size() - header_size < comment_length)
+            return false;
+        comment = buffer.data() + header_size;
+        return true;
+    }
+
+    // Serializes the record: signature, fixed fields in declaration order, then the comment bytes.
+    void write(OutputStream& stream) const
+    {
+        stream.write_or_error({ end_of_central_directory_signature, sizeof(end_of_central_directory_signature) });
+        stream << disk_number;
+        stream << central_directory_start_disk;
+        stream << disk_records_count;
+        stream << total_records_count;
+        stream << central_directory_size;
+        stream << central_directory_offset;
+        stream << comment_length;
+        if (comment_length > 0)
+            stream.write_or_error({ comment, comment_length });
+    }
+};
+
+static constexpr u8 central_directory_record_signature[] = { 0x50, 0x4b, 0x01, 0x02 }; // 'PK\x01\x02'
+// One entry of the zip central directory.
+//
+// The struct is packed so that the fixed-size fields (everything up to and including
+// local_file_header_offset) are laid out back to back and can be copied from the archive in one go.
+// The signature is NOT a member; it precedes the fields on disk. The three trailing pointers are
+// not part of the fixed fields: after read() they point into the buffer that was passed in.
+struct [[gnu::packed]] CentralDirectoryRecord {
+    u16 made_by_version;
+    u16 minimum_version;
+    u16 general_purpose_flags;
+    u16 compression_method;
+    u16 modification_time;
+    u16 modification_date;
+    u32 crc32;
+    u32 compressed_size;
+    u32 uncompressed_size;
+    u16 name_length;
+    u16 extra_data_length;
+    u16 comment_length;
+    u16 start_disk;
+    u16 internal_attributes;
+    u32 external_attributes;
+    u32 local_file_header_offset;
+    const u8* name;
+    const u8* extra_data;
+    const u8* comment;
+
+    // Parses the record that starts (with its signature) at the beginning of `buffer`.
+    //
+    // Returns false if the buffer is too short for the signature plus the fixed fields, if the
+    // signature does not match, or if the name, extra data and comment announced by the length
+    // fields do not fit into the buffer. The fixed fields may already have been overwritten when
+    // false is returned.
+    bool read(ReadonlyBytes buffer)
+    {
+        auto fields_size = sizeof(CentralDirectoryRecord) - (sizeof(u8*) * 3);
+        // The signature precedes the fields, so both have to be present before anything is copied.
+        auto header_size = sizeof(central_directory_record_signature) + fields_size;
+        if (buffer.size() < header_size)
+            return false;
+        if (memcmp(buffer.data(), central_directory_record_signature, sizeof(central_directory_record_signature)) != 0)
+            return false;
+        // Raw byte copy into the packed fields.
+        // NOTE(review): this presumably relies on the host byte order matching the archive's - confirm.
+        memcpy(reinterpret_cast<void*>(&made_by_version), buffer.data() + sizeof(central_directory_record_signature), fields_size);
+        // Do not hand out pointers whose announced lengths run past the end of the buffer.
+        size_t trailing_size = static_cast<size_t>(name_length) + extra_data_length + comment_length;
+        if (buffer.size() - header_size < trailing_size)
+            return false;
+        name = buffer.data() + header_size;
+        extra_data = name + name_length;
+        comment = extra_data + extra_data_length;
+        return true;
+    }
+
+    // Serializes the record: signature, fixed fields in declaration order, then name, extra data
+    // and comment.
+    void write(OutputStream& stream) const
+    {
+        stream.write_or_error({ central_directory_record_signature, sizeof(central_directory_record_signature) });
+        stream << made_by_version;
+        stream << minimum_version;
+        stream << general_purpose_flags;
+        stream << compression_method;
+        stream << modification_time;
+        stream << modification_date;
+        stream << crc32;
+        stream << compressed_size;
+        stream << uncompressed_size;
+        stream << name_length;
+        stream << extra_data_length;
+        stream << comment_length;
+        stream << start_disk;
+        stream << internal_attributes;
+        stream << external_attributes;
+        stream << local_file_header_offset;
+        if (name_length > 0)
+            stream.write_or_error({ name, name_length });
+        if (extra_data_length > 0)
+            stream.write_or_error({ extra_data, extra_data_length });
+        if (comment_length > 0)
+            stream.write_or_error({ comment, comment_length });
+    }
+
+    // Number of bytes the record occupies in the archive: signature, fixed fields and the three
+    // variable-length trailers. Adding this to a record's offset yields the offset of the next one.
+    [[nodiscard]] size_t size() const
+    {
+        return sizeof(central_directory_record_signature) + (sizeof(CentralDirectoryRecord) - (sizeof(u8*) * 3)) + name_length + extra_data_length + comment_length;
+    }
+};
+// Bit tested against CentralDirectoryRecord::external_attributes to detect directory members.
+static constexpr u32 zip_directory_external_attribute = 1 << 4;
+
+static constexpr u8 local_file_header_signature[] = { 0x50, 0x4b, 0x03, 0x04 }; // 'PK\x03\x04'
+// The header that precedes each member's data inside the archive.
+//
+// The struct is packed so that the fixed-size fields (everything up to and including
+// extra_data_length) are laid out back to back and can be copied from the archive in one go. The
+// signature is NOT a member; it precedes the fields on disk. The three trailing pointers are not
+// part of the fixed fields: after read() they point into the buffer that was passed in.
+struct [[gnu::packed]] LocalFileHeader {
+    u16 minimum_version;
+    u16 general_purpose_flags;
+    u16 compression_method;
+    u16 modification_time;
+    u16 modification_date;
+    u32 crc32;
+    u32 compressed_size;
+    u32 uncompressed_size;
+    u16 name_length;
+    u16 extra_data_length;
+    const u8* name;
+    const u8* extra_data;
+    const u8* compressed_data;
+
+    // Parses the header that starts (with its signature) at the beginning of `buffer`.
+    //
+    // Returns false if the buffer is too short for the signature plus the fixed fields, if the
+    // signature does not match, or if the name and extra data announced by the length fields do
+    // not fit into the buffer. The fixed fields may already have been overwritten when false is
+    // returned. The extent of the compressed data itself is NOT validated here.
+    bool read(ReadonlyBytes buffer)
+    {
+        auto fields_size = sizeof(LocalFileHeader) - (sizeof(u8*) * 3);
+        // The signature precedes the fields, so both have to be present before anything is copied.
+        auto header_size = sizeof(local_file_header_signature) + fields_size;
+        if (buffer.size() < header_size)
+            return false;
+        if (memcmp(buffer.data(), local_file_header_signature, sizeof(local_file_header_signature)) != 0)
+            return false;
+        // Raw byte copy into the packed fields.
+        // NOTE(review): this presumably relies on the host byte order matching the archive's - confirm.
+        memcpy(reinterpret_cast<void*>(&minimum_version), buffer.data() + sizeof(local_file_header_signature), fields_size);
+        // Keep compressed_data (and the pointers before it) from starting past the end of the buffer.
+        size_t trailing_size = static_cast<size_t>(name_length) + extra_data_length;
+        if (buffer.size() - header_size < trailing_size)
+            return false;
+        name = buffer.data() + header_size;
+        extra_data = name + name_length;
+        compressed_data = extra_data + extra_data_length;
+        return true;
+    }
+
+    // Serializes the header: signature, fixed fields in declaration order, then name, extra data
+    // and compressed_size bytes of compressed data.
+    void write(OutputStream& stream) const
+    {
+        stream.write_or_error({ local_file_header_signature, sizeof(local_file_header_signature) });
+        stream << minimum_version;
+        stream << general_purpose_flags;
+        stream << compression_method;
+        stream << modification_time;
+        stream << modification_date;
+        stream << crc32;
+        stream << compressed_size;
+        stream << uncompressed_size;
+        stream << name_length;
+        stream << extra_data_length;
+        if (name_length > 0)
+            stream.write_or_error({ name, name_length });
+        if (extra_data_length > 0)
+            stream.write_or_error({ extra_data, extra_data_length });
+        if (compressed_size > 0)
+            stream.write_or_error({ compressed_data, compressed_size });
+    }
+};
+
+enum ZipCompressionMethod : u16 { // Values of the 16-bit compression_method header field; kept as a plain enum because Zip.cpp compares that raw u16 field against these enumerators.
+    Store = 0, // Accepted by Zip::try_create(), which requires compressed_size == uncompressed_size for it.
+    Shrink = 1, // Rejected by Zip::try_create().
+    Reduce1 = 2, // Rejected by Zip::try_create().
+    Reduce2 = 3, // Rejected by Zip::try_create().
+    Reduce3 = 4, // Rejected by Zip::try_create().
+    Reduce4 = 5, // Rejected by Zip::try_create().
+    Implode = 6, // Rejected by Zip::try_create().
+    Reserved = 7, // Rejected by Zip::try_create().
+    Deflate = 8 // Accepted by Zip::try_create().
+};
+
+// Description of a single archive member as handed to the Zip::for_each_member() callback.
+//
+// The scalar members carry default initializers so that a default-constructed ZipMember never
+// exposes indeterminate values.
+struct ZipMember {
+    // Member name as stored in the central directory.
+    String name;
+    // View of the member's still-compressed bytes; it points into the buffer the Zip was created
+    // from rather than owning a copy.
+    ReadonlyBytes compressed_data; // TODO: maybe the decompression/compression should be handled by LibArchive instead of the user?
+    // How compressed_data has to be decoded.
+    ZipCompressionMethod compression_method { ZipCompressionMethod::Store };
+    // Size of the member after decompression, as recorded in the central directory.
+    u32 uncompressed_size { 0 };
+    // CRC-32 recorded in the central directory; it is not verified by Zip itself.
+    u32 crc32 { 0 };
+    // True if the directory attribute bit is set or the name ends with '/'.
+    bool is_directory { false };
+};
+
+class Zip { // Read-only accessor for the members of a zip archive held in a ReadonlyBytes buffer.
+public:
+    static Optional<Zip> try_create(const ReadonlyBytes& buffer); // Returns an empty Optional if the buffer fails validation or uses an unsupported feature.
+    bool for_each_member(Function<IterationDecision(const ZipMember&)>); // Returns false if the callback stopped the iteration with IterationDecision::Break, true otherwise.
+
+private:
+    static bool find_end_of_central_directory_offset(const ReadonlyBytes&, size_t& offset); // On success writes the offset of the record's signature to `offset` and returns true.
+
+    u16 member_count { 0 }; // Number of central directory records, taken from total_records_count of the end of central directory record.
+    size_t members_start_offset { 0 }; // Offset of the first central directory record within m_input_data.
+    ReadonlyBytes m_input_data; // The whole archive. NOTE(review): presumably a non-owning view, so the buffer must outlive this object - confirm.
+};
+
+}