0ct0pu5
/
ladybird


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223
							/*
 * Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <AK/StringView.h>
#include <AK/Utf8View.h>
#include <LibTextCodec/Decoder.h>
#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
#include <ctype.h>

namespace Web::HTML {

// Tells the prescan whether it has to stop at `position`: either the
// position lies past the end of `input`, or it has reached the 1024-byte
// window the prescan is limited to.
bool prescan_should_abort(const ByteBuffer& input, const size_t& position)
{
    constexpr size_t prescan_window_size = 1024;

    if (position >= prescan_window_size)
        return true;
    return position >= input.size();
}

// Returns true when `byte` is one of the separator bytes the prescan skips
// between tokens: tab, line feed, form feed, carriage return, space or '/'.
bool prescan_is_whitespace_or_slash(const u8& byte)
{
    switch (byte) {
    case '\t':
    case '\n':
    case '\f':
    case '\r':
    case ' ':
    case '/':
        return true;
    default:
        return false;
    }
}

// Advances `position` past any run of tab, line feed, form feed, carriage
// return, space and '/' bytes.
// Returns true when `position` ends up on a byte that may still be read,
// false when the prescan has to abort (see prescan_should_abort).
bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position)
{
    for (; !prescan_should_abort(input, position); ++position) {
        switch (input[position]) {
        case '\t':
        case '\n':
        case '\f':
        case '\r':
        case ' ':
        case '/':
            // Still inside the run of skippable bytes; keep going.
            continue;
        default:
            // First byte that is not skippable: stop here.
            return true;
        }
    }
    return false;
}

// https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing
// Reads one attribute from `input`, starting at `position`, and advances
// `position` as it goes.
// Returns the attribute (name and value lowercased byte by byte; the value is
// empty when none was given), or an empty Optional when there is no further
// attribute or the prescan has to abort (see prescan_should_abort).
Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position)
{
    auto is_whitespace = [](u8 byte) {
        return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ';
    };
    // The "spaces" and "value" steps of the spec skip whitespace only; a '/'
    // is significant there (e.g. it is the first byte of an unquoted value).
    auto skip_whitespace = [&]() {
        while (!prescan_should_abort(input, position) && is_whitespace(input[position]))
            ++position;
        return !prescan_should_abort(input, position);
    };

    // Before the attribute name, both whitespace and '/' are skipped.
    if (!prescan_skip_whitespace_and_slashes(input, position))
        return {};
    if (input[position] == '>')
        return {};

    // Attribute name.
    StringBuilder attribute_name;
    bool consumed_equals_sign = false;
    while (true) {
        if (input[position] == '=' && !attribute_name.is_empty()) {
            ++position;
            consumed_equals_sign = true;
            break;
        }
        if (is_whitespace(input[position]))
            break;
        if (input[position] == '/' || input[position] == '>')
            return Attribute(attribute_name.to_string(), "");

        attribute_name.append_as_lowercase(input[position]);
        ++position;
        if (prescan_should_abort(input, position))
            return {};
    }

    // Spaces: the name ended on whitespace, so look for an '=' after it.
    if (!consumed_equals_sign) {
        if (!skip_whitespace())
            return {};
        if (input[position] != '=')
            return Attribute(attribute_name.to_string(), "");
        ++position;
    }

    // Value.
    if (!skip_whitespace())
        return {};

    StringBuilder attribute_value;
    if (input[position] == '"' || input[position] == '\'') {
        u8 quote_character = input[position];
        ++position;
        for (; !prescan_should_abort(input, position); ++position) {
            if (input[position] == quote_character) {
                // Step past the closing quote so that the next call does not
                // treat it as the first byte of the following attribute name.
                ++position;
                return Attribute(attribute_name.to_string(), attribute_value.to_string());
            }
            attribute_value.append_as_lowercase(input[position]);
        }
        // Unterminated quoted value inside the prescan window.
        return {};
    }

    if (input[position] == '>')
        return Attribute(attribute_name.to_string(), "");

    // Unquoted value: runs until whitespace or '>'.
    attribute_value.append_as_lowercase(input[position]);
    ++position;
    for (; !prescan_should_abort(input, position); ++position) {
        if (is_whitespace(input[position]) || input[position] == '>')
            return Attribute(attribute_name.to_string(), attribute_value.to_string());
        attribute_value.append_as_lowercase(input[position]);
    }
    return {};
}

// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
// Scans the beginning of `input` (bounded by prescan_should_abort) for an
// encoding declaration: a UTF-16 encoded "<?x" prefix, or a <meta> element
// carrying a usable charset attribute.
// Returns the encoding that was found, or an empty Optional when the scanned
// bytes do not declare one.
Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input)
{
    // Detects '<?x' encoded as UTF-16. Only indices 0..5 are read, so it is
    // sufficient for index 5 to be readable.
    if (!prescan_should_abort(input, 5)) {
        // 0x3C 0x00 0x3F 0x00 0x78 0x00
        if (input[0] == 0x3C && input[1] == 0x00 && input[2] == 0x3F && input[3] == 0x00 && input[4] == 0x78 && input[5] == 0x00)
            return "utf-16le";
        // 0x00 0x3C 0x00 0x3F 0x00 0x78
        if (input[0] == 0x00 && input[1] == 0x3C && input[2] == 0x00 && input[3] == 0x3F && input[4] == 0x00 && input[5] == 0x78)
            return "utf-16be";
    }

    for (size_t position = 0; !prescan_should_abort(input, position); ++position) {
        if (!prescan_should_abort(input, position + 5) && input[position] == '<' && input[position + 1] == '!'
            && input[position + 2] == '-' && input[position + 3] == '-') {
            // Comment: skip to the closing "-->". Starting at the first '-' of
            // "<!--" lets the opening dashes count towards the terminator, so
            // "<!-->" closes the comment.
            position += 2;
            for (; !prescan_should_abort(input, position + 3); ++position) {
                if (input[position] == '-' && input[position + 1] == '-' && input[position + 2] == '>') {
                    position += 2;
                    break;
                }
            }
        } else if (!prescan_should_abort(input, position + 6)
            && input[position] == '<'
            && (input[position + 1] == 'M' || input[position + 1] == 'm')
            && (input[position + 2] == 'E' || input[position + 2] == 'e')
            && (input[position + 3] == 'T' || input[position + 3] == 't')
            && (input[position + 4] == 'A' || input[position + 4] == 'a')
            && prescan_is_whitespace_or_slash(input[position + 5])) {
            // "<meta" followed by whitespace or '/': inspect its attributes.
            position += 6;
            Vector<String> attribute_list {};
            bool got_pragma = false;
            Optional<bool> need_pragma {};
            Optional<String> charset {};

            while (true) {
                auto attribute = prescan_get_attribute(input, position);
                if (!attribute.has_value())
                    break;
                // Only the first occurrence of an attribute name counts.
                if (attribute_list.contains_slow(attribute.value().name()))
                    continue;
                auto& attribute_name = attribute.value().name();
                attribute_list.append(attribute.value().name());

                if (attribute_name == "http-equiv" && attribute.value().value() == "content-type")
                    got_pragma = true;
                else if (attribute_name == "charset") {
                    auto maybe_charset = TextCodec::get_standardized_encoding(attribute.value().value());
                    if (maybe_charset.has_value()) {
                        charset = Optional<String> { maybe_charset };
                        need_pragma = { false };
                    }
                }

                // FIXME: For attribute name "content", do this:
                //        Apply the "algorithm for extracting a character encoding from a meta
                //        element", giving the attribute's value as the string to parse. If a
                //        character encoding is returned, and if charset is still set to null,
                //        let charset be the encoding returned, and set need pragma to true.
            }

            if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
                continue;
            // "If charset is UTF-16BE/LE, then set charset to UTF-8": the spec's
            // "UTF-16BE/LE" means either of the two encodings, it is not a
            // label of its own.
            // NOTE(review): assumes TextCodec::get_standardized_encoding() spells these
            // "UTF-16BE" / "UTF-16LE", as run_encoding_sniffing_algorithm() does - confirm.
            if (charset.value() == "UTF-16BE" || charset.value() == "UTF-16LE")
                return "UTF-8";
            else if (charset.value() == "x-user-defined")
                return "windows-1252";
            else
                return charset.value();
        } else if (!prescan_should_abort(input, position + 3) && input[position] == '<'
            && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) {
            // Start or end tag of some other element: consume its attributes
            // so that their contents are not mistaken for markup.
            position += 2;
            prescan_skip_whitespace_and_slashes(input, position);
            while (prescan_get_attribute(input, position).has_value()) { };
        } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) {
            // "<!", "</" or "<?": skip to the next '>'. Only position + 1 is
            // known to be readable here, so the bounds have to be checked
            // before every read, including the first one.
            position += 2;
            while (!prescan_should_abort(input, position) && input[position] != '>')
                ++position;
            if (prescan_should_abort(input, position))
                return {};
        } else {
            // Do nothing.
        }
    }
    return {};
}

// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
// Picks the encoding to decode `input` with. In order of precedence: a byte
// order mark, the result of the prescan, and finally a fallback that depends
// on whether the bytes validate as UTF-8.
String run_encoding_sniffing_algorithm(const ByteBuffer& input)
{
    // Byte order marks: FE FF (UTF-16BE), FF FE (UTF-16LE), EF BB BF (UTF-8).
    const auto byte_count = input.size();
    if (byte_count >= 2) {
        if (input[0] == 0xFE && input[1] == 0xFF)
            return "UTF-16BE";
        if (input[0] == 0xFF && input[1] == 0xFE)
            return "UTF-16LE";
        if (byte_count >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF)
            return "UTF-8";
    }

    // FIXME: If the user has explicitly instructed the user agent to override the document's character
    //        encoding with a specific encoding.
    // FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or
    //        at any later step in this algorithm.
    // FIXME: If the transport layer specifies a character encoding, and it is supported.

    if (auto prescanned_encoding = run_prescan_byte_stream_algorithm(input); prescanned_encoding.has_value())
        return prescanned_encoding.value();

    // FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context
    //        is non-null and a child browsing context.
    // FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page
    //        when it was last visited.

    const bool input_is_valid_utf8 = Utf8View(StringView(input)).validate();
    if (input_is_valid_utf8) {
        // NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification.
        //       "Otherwise, return an implementation-defined or user-specified default character encoding, [...]."
        return "UTF-8";
    }

    // FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
    return "windows-1252";
}

}