LibLocale: Replace available locale lookups with ICU

This commit is contained in:
Timothy Flynn 2024-06-15 20:54:38 -04:00 committed by Andreas Kling
parent 1bcc29d0d1
commit 4de8adabac
Notes: sideshowbarker 2024-07-17 06:20:50 +09:00
10 changed files with 50 additions and 508 deletions

View file

@ -9,37 +9,12 @@ set(CLDR_VERSION_FILE "${CLDR_PATH}/version.txt")
# Location of the CLDR JSON release archive and of the packages extracted from it.
set(CLDR_ZIP_URL "https://github.com/unicode-org/cldr-json/releases/download/${CLDR_VERSION}/cldr-${CLDR_VERSION}-json-modern.zip")
set(CLDR_ZIP_PATH "${CLDR_PATH}/cldr.zip")
set(CLDR_CORE_SOURCE cldr-core)
set(CLDR_CORE_PATH "${CLDR_PATH}/${CLDR_CORE_SOURCE}")
set(CLDR_NUMBERS_SOURCE cldr-numbers-modern)
set(CLDR_NUMBERS_PATH "${CLDR_PATH}/${CLDR_NUMBERS_SOURCE}")
# Everything below is skipped unless the Unicode database download option is enabled.
if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
remove_path_if_version_changed("${CLDR_VERSION}" "${CLDR_VERSION_FILE}" "${CLDR_PATH}")
# Fetch and unpack the archive only when network downloads are allowed; otherwise the
# extracted data is expected to already be present under ${CLDR_PATH}.
if (ENABLE_NETWORK_DOWNLOADS)
download_file("${CLDR_ZIP_URL}" "${CLDR_ZIP_PATH}" SHA256 "${CLDR_SHA256}")
extract_path("${CLDR_PATH}" "${CLDR_ZIP_PATH}" "${CLDR_CORE_SOURCE}/**" "${CLDR_CORE_PATH}")
extract_path("${CLDR_PATH}" "${CLDR_ZIP_PATH}" "${CLDR_NUMBERS_SOURCE}/**" "${CLDR_NUMBERS_PATH}")
else()
message(STATUS "Skipping download of ${CLDR_ZIP_URL}, expecting the archive to have been extracted to ${CLDR_PATH}")
endif()
# Run the GenerateLocaleData tool to produce LocaleData.h / LocaleData.cpp from the
# cldr-core (-r) and cldr-numbers (-n) directories.
set(LOCALE_DATA_HEADER LocaleData.h)
set(LOCALE_DATA_IMPLEMENTATION LocaleData.cpp)
invoke_generator(
"LocaleData"
Lagom::GenerateLocaleData
"${CLDR_VERSION_FILE}"
"${LOCALE_DATA_HEADER}"
"${LOCALE_DATA_IMPLEMENTATION}"
arguments -r "${CLDR_CORE_PATH}" -n "${CLDR_NUMBERS_PATH}"
)
# Consumers test `DEFINED LOCALE_DATA_SOURCES` to find out whether generated data exists.
set(LOCALE_DATA_SOURCES
${LOCALE_DATA_HEADER}
${LOCALE_DATA_IMPLEMENTATION}
)
endif()

View file

@ -1,5 +1,4 @@
# Subdirectories added to the build from this directory.
add_subdirectory(IPCCompiler)
add_subdirectory(LibLocale)
add_subdirectory(LibTextCodec)
add_subdirectory(LibTimeZone)
add_subdirectory(LibUnicode)

View file

@ -1 +0,0 @@
# Declares the GenerateLocaleData tool, built from GenerateLocaleData.cpp and linked against LibMain.
lagom_tool(GenerateLocaleData SOURCES GenerateLocaleData.cpp LIBS LibMain)

View file

@ -1,279 +0,0 @@
/*
* Copyright (c) 2021-2022, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include "../LibUnicode/GeneratorUtil.h" // FIXME: Move this somewhere common.
#include <AK/AllOf.h>
#include <AK/ByteString.h>
#include <AK/CharacterTypes.h>
#include <AK/Error.h>
#include <AK/Format.h>
#include <AK/HashMap.h>
#include <AK/JsonObject.h>
#include <AK/JsonParser.h>
#include <AK/JsonValue.h>
#include <AK/LexicalPath.h>
#include <AK/QuickSort.h>
#include <AK/SourceGenerator.h>
#include <AK/StringBuilder.h>
#include <LibCore/ArgsParser.h>
#include <LibCore/Directory.h>
#include <LibFileSystem/FileSystem.h>
// Turns a locale-style string into an identifier for generated code:
//  - every '-' becomes '_';
//  - a string consisting only of ASCII digits is prefixed with the first character of `owner` and '_';
//  - otherwise a leading lowercase ASCII letter is upper-cased;
//  - anything else is returned as-is (after the '-' replacement).
static ByteString format_identifier(StringView owner, ByteString identifier)
{
    identifier = identifier.replace("-"sv, "_"sv, ReplaceMode::All);

    if (all_of(identifier, is_ascii_digit))
        return ByteString::formatted("{}_{}", owner[0], identifier);

    if (!is_ascii_lower_alpha(identifier[0]))
        return identifier;

    auto const remainder = identifier.substring_view(1);
    return ByteString::formatted("{:c}{}", to_ascii_uppercase(identifier[0]), remainder);
}
// Per-locale payload stored in CLDR::locales. It has no members here, so the map
// effectively only records which locale names exist.
struct LocaleData {
};
// Everything the generator collects before emitting code.
struct CLDR {
// Deduplicated strings; CanonicalLanguageID stores indices into this storage.
UniqueStringStorage unique_strings;
// Locales discovered on disk, keyed by locale name.
HashMap<ByteString, LocaleData> locales;
// Additional names that map onto an entry of `locales`.
Vector<Alias> locale_aliases;
};
// Some parsing is expected to fail. For example, the CLDR contains language mappings
// with locales such as "en-GB-oed" that are canonically invalid locale IDs.
//
// TRY_OR_DISCARD evaluates `expression`; on error it silently returns from the enclosing
// function with a bare `return;` (so it is only usable where a bare `return;` is valid),
// otherwise it yields the released value. The static_assert rejects expressions whose
// released value is an lvalue reference.
#define TRY_OR_DISCARD(expression) \
({ \
auto&& _temporary_result = (expression); \
if (_temporary_result.is_error()) \
return; \
static_assert(!::AK::Detail::IsLvalueReference<decltype(_temporary_result.release_value())>, \
"Do not return a reference from a fallible expression"); \
_temporary_result.release_value(); \
})
// Reads and parses the JSON file at `path`, remembering the parsed value in a function-local
// static cache keyed by path so that each file is only parsed once.
//
// A pointer is returned only because ErrorOr cannot hold a reference; it is never null.
// NOTE(review): the pointer refers to an entry stored inside the cache's HashMap. Whether it
// stays valid after later insertions depends on HashMap's rehashing behavior - confirm before
// holding it across further calls.
ErrorOr<JsonValue const*> read_json_file_with_cache(ByteString const& path)
{
    static HashMap<ByteString, JsonValue> parsed_json_cache;

    // Cache miss: parse the file and store it before looking it up below.
    if (!parsed_json_cache.get(path).has_value()) {
        auto freshly_parsed = TRY(read_json_file(path));
        TRY(parsed_json_cache.try_set(path, move(freshly_parsed)));
    }

    return &parsed_json_cache.get(path).value();
}
// Reads "defaultContent.json" from `core_path` and, for every locale listed in its
// "defaultContent" array, finds the longest prefix (dropping trailing "-subtag" pieces one at a
// time) that is already present in `cldr.locales`. When such a prefix exists and is shorter than
// the listed locale, an Alias pairing the known locale with the listed locale is recorded.
// Listed locales with no known prefix are skipped.
static ErrorOr<void> parse_default_content_locales(ByteString core_path, CLDR& cldr)
{
    auto const default_content_path = LexicalPath { move(core_path) }.append("defaultContent.json"sv);

    auto default_content = TRY(read_json_file(default_content_path.string()));
    auto const& default_content_array = default_content.as_object().get_array("defaultContent"sv).value();

    default_content_array.for_each([&](JsonValue const& entry) {
        auto listed_locale = entry.as_string();
        StringView known_locale = listed_locale;

        // Peel off the last subtag until we reach a locale we already know about.
        while (!cldr.locales.contains(known_locale)) {
            auto last_separator = known_locale.find_last('-');
            if (!last_separator.has_value())
                return;

            known_locale = known_locale.substring_view(0, *last_separator);
        }

        // The listed locale is itself a known locale; no alias is needed.
        if (known_locale == listed_locale)
            return;

        cldr.locale_aliases.append({ known_locale, move(listed_locale) });
    });

    return {};
}
// Adds "language-region" aliases for every known locale (or alias) of the form
// "language-script-region", unless the script-less name is already a locale or an alias.
// Returns an error if any locale name fails to parse as a CanonicalLanguageID.
static ErrorOr<void> define_aliases_without_scripts(CLDR& cldr)
{
// From ECMA-402: https://tc39.es/ecma402/#sec-internal-slots
//
// For locales that include a script subtag in addition to language and region, the
// corresponding locale without a script subtag must also be supported.
//
// So we define aliases for locales that contain all three subtags, but we must also take
// care to handle when the locale itself or the locale without a script subtag are an alias
// by way of default-content locales.
// Looks up the alias entry whose `alias` field equals `locale`; yields end() when there is none.
auto find_alias = [&](auto const& locale) {
return cldr.locale_aliases.find_if([&](auto const& alias) { return locale == alias.alias; });
};
auto append_alias_without_script = [&](auto const& locale) -> ErrorOr<void> {
auto parsed_locale = TRY(CanonicalLanguageID::parse(cldr.unique_strings, locale));
// Only locales with language, script and region all present are of interest.
if ((parsed_locale.language == 0) || (parsed_locale.script == 0) || (parsed_locale.region == 0))
return {};
auto locale_without_script = ByteString::formatted("{}-{}",
cldr.unique_strings.get(parsed_locale.language),
cldr.unique_strings.get(parsed_locale.region));
// Nothing to do when the script-less name already exists as a locale or as an alias.
if (cldr.locales.contains(locale_without_script))
return {};
if (find_alias(locale_without_script) != cldr.locale_aliases.end())
return {};
// If `locale` is itself an alias, point the new alias at that alias's `name` rather than at `locale`.
if (auto it = find_alias(locale); it != cldr.locale_aliases.end())
cldr.locale_aliases.append({ it->name, locale_without_script });
else
cldr.locale_aliases.append({ locale, locale_without_script });
return {};
};
for (auto const& locale : cldr.locales)
TRY(append_alias_without_script(locale.key));
// NOTE(review): this loop iterates `cldr.locale_aliases` while the callee may append to the
// same vector - confirm the container's iterators tolerate growth during iteration.
for (auto const& locale : cldr.locale_aliases)
TRY(append_alias_without_script(locale.alias));
return {};
}
// Populates `cldr` with every locale that has a directory under "<numbers_path>/main"
// (with variant subtags dropped from the name), then adds the default-content aliases and the
// script-less aliases. Asserts that "<core_path>/supplemental" is a directory.
static ErrorOr<void> parse_all_locales(ByteString core_path, ByteString numbers_path, CLDR& cldr)
{
    auto const supplemental_directory = LexicalPath { core_path }.append("supplemental"sv);
    VERIFY(FileSystem::is_directory(supplemental_directory.string()));

    // Rebuilds a locale name from the basename of `path`, keeping only the language, script and
    // region subtags that are present.
    auto strip_variants = [&](ByteString path) -> ErrorOr<ByteString> {
        auto language_id = TRY(CanonicalLanguageID::parse(cldr.unique_strings, LexicalPath::basename(path)));

        StringBuilder locale_name;
        locale_name.append(cldr.unique_strings.get(language_id.language));

        auto const script = cldr.unique_strings.get(language_id.script);
        if (!script.is_empty())
            locale_name.appendff("-{}", script);

        auto const region = cldr.unique_strings.get(language_id.region);
        if (!region.is_empty())
            locale_name.appendff("-{}", region);

        return locale_name.to_byte_string();
    };

    auto const main_directory = TRY(String::formatted("{}/main", numbers_path));

    TRY(Core::Directory::for_each_entry(main_directory, Core::DirIterator::SkipParentAndBaseDir, [&](auto& entry, auto& directory) -> ErrorOr<IterationDecision> {
        auto entry_path = LexicalPath::join(directory.path().string(), entry.name).string();
        auto locale_name = TRY(strip_variants(entry_path));

        cldr.locales.ensure(locale_name);
        return IterationDecision::Continue;
    }));

    TRY(parse_default_content_locales(move(core_path), cldr));
    TRY(define_aliases_without_scripts(cldr));

    return {};
}
// Writes the generated header to `file`: a `#pragma once` preamble, then inside
// `namespace Locale` whatever generate_enum emits for the "Locale" enum (default member "None")
// from the known locales and their aliases.
static ErrorOr<void> generate_unicode_locale_header(Core::InputBufferedFile& file, CLDR& cldr)
{
StringBuilder builder;
SourceGenerator generator { builder };
generator.append(R"~~~(
#pragma once
#include <AK/Types.h>
namespace Locale {
)~~~");
auto locales = cldr.locales.keys();
generate_enum(generator, format_identifier, "Locale"sv, "None"sv, locales, cldr.locale_aliases);
// Close `namespace Locale`.
generator.append(R"~~~(
}
)~~~");
TRY(file.write_until_depleted(generator.as_string_view().bytes()));
return {};
}
// Writes the generated implementation file to `file`: the include preamble, then inside
// `namespace Locale` the string-to-enum lookup for the "Locale" enum, covering both the known
// locales and their aliases.
static ErrorOr<void> generate_unicode_locale_implementation(Core::InputBufferedFile& file, CLDR& cldr)
{
    auto string_index_type = cldr.unique_strings.type_that_fits();

    StringBuilder builder;
    SourceGenerator generator { builder };
    generator.set("string_index_type"sv, string_index_type);
    generator.set("locales_size"sv, ByteString::number(cldr.locales.size()));

    generator.append(R"~~~(
#include <AK/Array.h>
#include <AK/BinarySearch.h>
#include <AK/Optional.h>
#include <AK/Span.h>
#include <AK/String.h>
#include <AK/StringView.h>
#include <AK/Vector.h>
#include <LibLocale/DateTimeFormat.h>
#include <LibLocale/Locale.h>
#include <LibLocale/LocaleData.h>
#include <LibUnicode/CurrencyCode.h>
namespace Locale {
)~~~");

    // Emits a `<enum_snake>_from_string` lookup keyed by the hash of each value and alias.
    // The entries are collected into a hash-keyed map, so the order of `values` does not matter;
    // the previously computed sorted copy of the locale list was never read and has been dropped.
    auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& values, Vector<Alias> const& aliases = {}) -> ErrorOr<void> {
        HashValueMap<ByteString> hashes;
        TRY(hashes.try_ensure_capacity(values.size()));

        for (auto const& value : values)
            hashes.set(value.hash(), format_identifier(enum_title, value));
        for (auto const& alias : aliases)
            hashes.set(alias.alias.hash(), format_identifier(enum_title, alias.alias));

        generate_value_from_string(generator, "{}_from_string"sv, enum_title, enum_snake, move(hashes));
        return {};
    };

    TRY(append_from_string("Locale"sv, "locale"sv, cldr.locales.keys(), cldr.locale_aliases));

    // Close `namespace Locale`.
    generator.append(R"~~~(
}
)~~~");

    TRY(file.write_until_depleted(generator.as_string_view().bytes()));
    return {};
}
// Entry point of the generator: parses the command line, opens both output files for writing,
// collects the locale list from the CLDR directories and writes the generated header and
// implementation. Any failure along the way is propagated as the returned error.
ErrorOr<int> serenity_main(Main::Arguments arguments)
{
StringView generated_header_path;
StringView generated_implementation_path;
StringView core_path;
StringView numbers_path;
// -h / -c name the generated output files, -r / -n name the cldr-core and cldr-numbers directories.
Core::ArgsParser args_parser;
args_parser.add_option(generated_header_path, "Path to the Unicode locale header file to generate", "generated-header-path", 'h', "generated-header-path");
args_parser.add_option(generated_implementation_path, "Path to the Unicode locale implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
args_parser.add_option(core_path, "Path to cldr-core directory", "core-path", 'r', "core-path");
args_parser.add_option(numbers_path, "Path to cldr-numbers directory", "numbers-path", 'n', "numbers-path");
args_parser.parse(arguments);
// The output files are opened before any parsing is done.
auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write));
auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write));
CLDR cldr;
TRY(parse_all_locales(core_path, numbers_path, cldr));
TRY(generate_unicode_locale_header(*generated_header_file, cldr));
TRY(generate_unicode_locale_implementation(*generated_implementation_file, cldr));
return 0;
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -10,26 +10,17 @@
#include <AK/Function.h>
#include <AK/HashFunctions.h>
#include <AK/HashMap.h>
#include <AK/JsonValue.h>
#include <AK/LexicalPath.h>
#include <AK/NumericLimits.h>
#include <AK/Optional.h>
#include <AK/QuickSort.h>
#include <AK/SourceGenerator.h>
#include <AK/StringBuilder.h>
#include <AK/StringView.h>
#include <AK/Traits.h>
#include <AK/Vector.h>
#include <LibCore/File.h>
#include <LibLocale/Locale.h>
#include <LibUnicode/CharacterTypes.h>
// Compile-time trait: true exactly when the storage type is a Vector<T>, false for every other
// type. Used below to select between the two `generate` overloads.
template<class T>
inline constexpr bool StorageTypeIsList = false;
template<class T>
inline constexpr bool StorageTypeIsList<Vector<T>> = true;
// Satisfied by any type that models either the Integral or the Enum concept.
template<typename T>
concept IntegralOrEnum = Integral<T> || Enum<T>;
@ -96,92 +87,6 @@ public:
return "u64"sv;
}
// Overload for non-list storage types. Emits a `static constexpr Array<@type@, @size@ + 1>`
// named `name` whose first element is an empty `{}` followed by every stored value in storage
// order. ByteString values are emitted as quoted `"..."sv` literals, everything else through its
// default formatting. A line break is inserted whenever the row counter reaches
// `max_values_per_row`.
void generate(SourceGenerator& generator, StringView type, StringView name, size_t max_values_per_row)
requires(!StorageTypeIsList<StorageType>)
{
generator.set("type"sv, type);
generator.set("name"sv, name);
generator.set("size"sv, ByteString::number(m_storage.size()));
generator.append(R"~~~(
static constexpr Array<@type@, @size@ + 1> @name@ { {
{})~~~");
// Starts at 1, presumably because the leading `{}` already occupies the first slot of the row.
size_t values_in_current_row = 1;
for (auto const& value : m_storage) {
// The counter is 0 only right after a row break, where the separator was already written.
if (values_in_current_row++ > 0)
generator.append(", ");
if constexpr (IsSame<StorageType, ByteString>)
generator.append(ByteString::formatted("\"{}\"sv", value));
else
generator.append(ByteString::formatted("{}", value));
if (values_in_current_row == max_values_per_row) {
values_in_current_row = 0;
generator.append(",\n ");
}
}
generator.append(R"~~~(
} };
)~~~");
}
// Overload for list (Vector) storage types. First emits one `static constexpr Array<@type@, N>`
// per stored list, named `name` followed by the list's index, then emits an
// `Array<ReadonlySpan<@type@>, @size@ + 1>` named `name` whose first element is an empty `{}`
// followed by a `.span()` of each per-list array, ten entries per row.
void generate(SourceGenerator& generator, StringView type, StringView name)
requires(StorageTypeIsList<StorageType>)
{
generator.set("type"sv, type);
generator.set("name"sv, name);
// Pass 1: one array per stored list.
for (size_t i = 0; i < m_storage.size(); ++i) {
auto const& list = m_storage[i];
generator.set("index"sv, ByteString::number(i));
generator.set("size"sv, ByteString::number(list.size()));
generator.append(R"~~~(
static constexpr Array<@type@, @size@> @name@@index@ { {)~~~");
bool first = true;
for (auto const& value : list) {
generator.append(first ? " "sv : ", "sv);
generator.append(ByteString::formatted("{}", value));
first = false;
}
generator.append(" } };");
}
// Pass 2: the table of spans. "size" is reset here because the loop above overwrote it per list.
generator.set("size"sv, ByteString::number(m_storage.size()));
generator.append(R"~~~(
static constexpr Array<ReadonlySpan<@type@>, @size@ + 1> @name@ { {
{})~~~");
constexpr size_t max_values_per_row = 10;
// Starts at 1, presumably because the leading `{}` already occupies the first slot of the row.
size_t values_in_current_row = 1;
for (size_t i = 0; i < m_storage.size(); ++i) {
// The counter is 0 only right after a row break, where the separator was already written.
if (values_in_current_row++ > 0)
generator.append(", ");
generator.set("index"sv, ByteString::number(i));
generator.append("@name@@index@.span()");
if (values_in_current_row == max_values_per_row) {
values_in_current_row = 0;
generator.append(",\n ");
}
}
generator.append(R"~~~(
} };
)~~~");
}
protected:
Vector<StorageType> m_storage;
HashMap<StorageType, size_t> m_storage_indices;
@ -280,50 +185,6 @@ struct Alias {
ByteString alias;
};
// A locale identifier broken into its subtags. Each subtag is stored as the index returned by
// UniqueStringStorage::ensure; a member keeps its default of 0 when that subtag was not parsed.
struct CanonicalLanguageID {
    // Parses `language` as "language[-script][-region][-variant...]" (split on '-').
    // The first segment must be a language subtag; script and region are optional; every
    // remaining segment must be a variant subtag. Each accepted subtag is interned into
    // `unique_strings`. Returns an error for a missing language or an invalid trailing segment.
    static ErrorOr<CanonicalLanguageID> parse(UniqueStringStorage& unique_strings, StringView language)
    {
        auto const segments = language.split_view('-');
        VERIFY(!segments.is_empty());

        CanonicalLanguageID result {};

        if (!Locale::is_unicode_language_subtag(segments[0]))
            return Error::from_string_literal("Expected language subtag");
        result.language = unique_strings.ensure(segments[0]);

        size_t next = 1;
        if (next == segments.size())
            return result;

        if (Locale::is_unicode_script_subtag(segments[next])) {
            result.script = unique_strings.ensure(segments[next]);
            if (++next == segments.size())
                return result;
        }

        if (Locale::is_unicode_region_subtag(segments[next])) {
            result.region = unique_strings.ensure(segments[next]);
            if (++next == segments.size())
                return result;
        }

        // Whatever is left must consist solely of variant subtags.
        for (; next < segments.size(); ++next) {
            if (!Locale::is_unicode_variant_subtag(segments[next]))
                return Error::from_string_literal("Expected variant subtag");
            result.variants.append(unique_strings.ensure(segments[next]));
        }

        return result;
    }

    size_t language { 0 };
    size_t script { 0 };
    size_t region { 0 };
    Vector<size_t> variants {};
};
inline ErrorOr<NonnullOwnPtr<Core::InputBufferedFile>> open_file(StringView path, Core::File::OpenMode mode)
{
if (path.is_empty())
@ -333,14 +194,6 @@ inline ErrorOr<NonnullOwnPtr<Core::InputBufferedFile>> open_file(StringView path
return Core::InputBufferedFile::create(move(file));
}
// Opens the file at `path` for reading, reads it to the end and parses the contents as JSON.
// Errors from opening, reading or parsing are propagated to the caller.
inline ErrorOr<JsonValue> read_json_file(StringView path)
{
    auto input = TRY(open_file(path, Core::File::OpenMode::Read));
    auto contents = TRY(input->read_until_eof());

    return JsonValue::from_string(contents);
}
inline void ensure_from_string_types_are_generated(SourceGenerator& generator)
{
static bool generated_from_string_types = false;
@ -570,36 +423,6 @@ static constexpr Array<ReadonlySpan<@type@>, @size@> @name@ { {
)~~~");
}
// Emits a function `ReadonlySpan<StringView> @name@()` that returns a span over a static array
// of string literals, one per entry of `values`.
//  - `value_filter`, when set, drops every value for which it returns false.
//  - A value that matches the `alias` field of an entry in `aliases` is emitted as that entry's
//    `name` instead of the value itself.
template<typename T>
void generate_available_values(SourceGenerator& generator, StringView name, Vector<T> const& values, Vector<Alias> const& aliases = {}, Function<bool(StringView)> value_filter = {})
{
generator.set("name", name);
generator.append(R"~~~(
ReadonlySpan<StringView> @name@()
{
static constexpr auto values = Array {)~~~");
// `first` tracks emitted entries (not visited ones), so filtered values never leave a stray comma.
bool first = true;
for (auto const& value : values) {
if (value_filter && !value_filter(value))
continue;
generator.append(first ? " "sv : ", "sv);
first = false;
if (auto it = aliases.find_if([&](auto const& alias) { return alias.alias == value; }); it != aliases.end())
generator.append(ByteString::formatted("\"{}\"sv", it->name));
else
generator.append(ByteString::formatted("\"{}\"sv", value));
}
generator.append(R"~~~( };
return values.span();
}
)~~~");
}
inline Vector<u32> parse_code_point_list(StringView list)
{
Vector<u32> code_points;

View file

@ -1,14 +1,3 @@
# Pull in the CLDR download / generation logic; it defines LOCALE_DATA_SOURCES when generated
# locale data is available.
include(${SerenityOS_SOURCE_DIR}/Meta/CMake/locale_data.cmake)
# Build the generated locale data as a separate object library, with debug info disabled (-g0)
# and size optimization (-Os).
if (DEFINED LOCALE_DATA_SOURCES)
set(SOURCES ${LOCALE_DATA_SOURCES})
set(GENERATED_SOURCES ${CURRENT_LIB_GENERATED})
serenity_lib(LibLocaleData localedata TYPE OBJECT)
target_compile_options(LibLocaleData PRIVATE -g0 -Os -Wno-parentheses-equality)
target_link_libraries(LibLocaleData PRIVATE LibTimeZone)
unset(GENERATED_SOURCES)
endif()
set(SOURCES
DateTimeFormat.cpp
DurationFormat.cpp
@ -23,13 +12,7 @@ set(SOURCES
)
# The LibLocale library itself, linked privately against LibUnicode.
serenity_lib(LibLocale locale)
target_link_libraries(LibLocale PRIVATE LibUnicode)
target_compile_definitions(LibLocale PRIVATE ENABLE_UNICODE_DATA=$<BOOL:${ENABLE_UNICODE_DATABASE_DOWNLOAD}>)
# ICU is a hard requirement; its libraries are linked PUBLIC so they propagate to dependents.
find_package(ICU REQUIRED COMPONENTS data i18n uc)
target_include_directories(LibLocale PRIVATE ${ICU_INCLUDE_DIRS})
target_link_libraries(LibLocale PUBLIC ${ICU_LIBRARIES})
# Link the generated locale data only when it was produced.
if (DEFINED LOCALE_DATA_SOURCES)
target_link_libraries(LibLocale PRIVATE LibLocaleData)
endif()

View file

@ -12,7 +12,6 @@ namespace Locale {
enum class CalendarPatternStyle : u8;
enum class HourCycle : u8;
enum class Locale : u16;
enum class PluralCategory : u8;
enum class Style : u8;
enum class Weekday : u8;

View file

@ -8,6 +8,7 @@
#include <AK/AllOf.h>
#include <AK/GenericLexer.h>
#include <AK/HashTable.h>
#include <AK/QuickSort.h>
#include <AK/StringBuilder.h>
#include <LibLocale/ICU.h>
@ -514,9 +515,56 @@ StringView default_locale()
return "en"sv;
}
// Extends `locales` in place: for every entry that parses as a language ID with language, script
// and region all present, the corresponding "language-region" name is added as well.
static void define_locales_without_scripts(HashTable<String>& locales)
{
    // https://tc39.es/ecma402/#sec-internal-slots
    // For locales that include a script subtag in addition to language and region, the corresponding locale without a
    // script subtag must also be supported.
    HashTable<String> scriptless_locales;

    for (auto const& locale : locales) {
        auto language_id = parse_unicode_language_id(locale);
        if (!language_id.has_value())
            continue;

        auto const has_all_subtags = language_id->language.has_value()
            && language_id->script.has_value()
            && language_id->region.has_value();
        if (!has_all_subtags)
            continue;

        scriptless_locales.set(MUST(String::formatted("{}-{}", *language_id->language, *language_id->region)));
    }

    // The additions are gathered in a separate table first, so `locales` is never modified while
    // it is being iterated.
    for (auto const& scriptless_locale : scriptless_locales)
        locales.set(scriptless_locale);
}
// Returns whether `locale` is one of the locales ICU reports as available, or the script-less
// "language-region" form of one of them (see define_locales_without_scripts).
//
// The lookup table is built once, on the first call, from icu::Locale::getAvailableLocales();
// each ICU locale is converted to its language tag, and locales whose conversion fails are
// skipped. The stale early `return locale_from_string(locale).has_value();` that used to precede
// this logic made all of it unreachable and has been removed.
bool is_locale_available(StringView locale)
{
    static auto const available_locales = []() {
        i32 count = 0;
        auto const* locale_list = icu::Locale::getAvailableLocales(count);

        HashTable<String> available_locales;
        available_locales.ensure_capacity(static_cast<size_t>(count));

        for (i32 i = 0; i < count; ++i) {
            UErrorCode status = U_ZERO_ERROR;

            auto locale_name = locale_list[i].toLanguageTag<StringBuilder>(status);
            if (icu_failure(status))
                continue;

            available_locales.set(MUST(locale_name.to_string()));
        }

        define_locales_without_scripts(available_locales);
        return available_locales;
    }();

    return available_locales.contains(locale);
}
Style style_from_string(StringView style)
@ -544,8 +592,6 @@ StringView style_to_string(Style style)
}
}
// Weak fallback definition that always returns an empty Optional; being weak, it can be
// overridden at link time by a non-weak definition of the same function.
Optional<Locale> __attribute__((weak)) locale_from_string(StringView) { return {}; }
static void apply_extensions_to_locale(icu::Locale& locale, icu::Locale const& locale_with_extensions)
{
UErrorCode status = U_ZERO_ERROR;

View file

@ -136,8 +136,6 @@ bool is_locale_available(StringView locale);
Style style_from_string(StringView style);
StringView style_to_string(Style style);
Optional<Locale> locale_from_string(StringView locale);
Optional<String> add_likely_subtags(StringView);
Optional<String> remove_likely_subtags(StringView);

View file

@ -12,7 +12,6 @@
#include <LibLocale/ICU.h>
#include <LibLocale/Locale.h>
#include <LibLocale/NumberFormat.h>
#include <LibUnicode/CharacterTypes.h>
#include <math.h>
#include <unicode/numberformatter.h>