
This commit is preemptive to upcoming commits which add more subtags to the CLDR generator. Rather than generating a giant HashMap containing all data, generate more (smaller) Array-based tables. This mimics the UCD generator. This also allows simpler lookups at runtime since we can generate index-based lookups into the smaller tables rather easily. Without this change, adding the remaining locale subtags would result in the generation and compilation of UnicodeLocale.cpp taking about 30s on my machine. With this change, it takes about half that. Additionally, the size of the generated file reduces by about 1.5MB.
180 lines
4.9 KiB
C++
180 lines
4.9 KiB
C++
/*
|
|
* Copyright (c) 2021, Tim Flynn <trflynn89@pm.me>
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#include <AK/AllOf.h>
|
|
#include <AK/CharacterTypes.h>
|
|
#include <AK/GenericLexer.h>
|
|
#include <AK/QuickSort.h>
|
|
#include <AK/StringBuilder.h>
|
|
#include <LibUnicode/Locale.h>
|
|
|
|
#if ENABLE_UNICODE_DATA
|
|
# include <LibUnicode/UnicodeLocale.h>
|
|
#endif
|
|
|
|
namespace Unicode {
|
|
|
|
bool is_unicode_language_subtag(StringView subtag)
|
|
{
|
|
// unicode_language_subtag = alpha{2,3} | alpha{5,8}
|
|
if ((subtag.length() < 2) || (subtag.length() == 4) || (subtag.length() > 8))
|
|
return false;
|
|
return all_of(subtag, is_ascii_alpha);
|
|
}
|
|
|
|
bool is_unicode_script_subtag(StringView subtag)
|
|
{
|
|
// unicode_script_subtag = alpha{4}
|
|
if (subtag.length() != 4)
|
|
return false;
|
|
return all_of(subtag, is_ascii_alpha);
|
|
}
|
|
|
|
bool is_unicode_region_subtag(StringView subtag)
|
|
{
|
|
// unicode_region_subtag = (alpha{2} | digit{3})
|
|
if (subtag.length() == 2)
|
|
return all_of(subtag, is_ascii_alpha);
|
|
if (subtag.length() == 3)
|
|
return all_of(subtag, is_ascii_digit);
|
|
return false;
|
|
}
|
|
|
|
bool is_unicode_variant_subtag(StringView subtag)
|
|
{
|
|
// unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
|
|
if ((subtag.length() >= 5) && (subtag.length() <= 8))
|
|
return all_of(subtag, is_ascii_alphanumeric);
|
|
if (subtag.length() == 4)
|
|
return is_ascii_digit(subtag[0]) && all_of(subtag.substring_view(1), is_ascii_alphanumeric);
|
|
return false;
|
|
}
|
|
|
|
Optional<LanguageID> parse_unicode_language_id(StringView language)
|
|
{
|
|
// https://unicode.org/reports/tr35/#Unicode_language_identifier
|
|
//
|
|
// unicode_language_id = "root"
|
|
// OR
|
|
// unicode_language_id = ((unicode_language_subtag (sep unicode_script_subtag)?) | unicode_script_subtag)
|
|
// (sep unicode_region_subtag)?
|
|
// (sep unicode_variant_subtag)*
|
|
LanguageID language_id {};
|
|
|
|
if (language == "root"sv) {
|
|
language_id.is_root = true;
|
|
return language_id;
|
|
}
|
|
|
|
auto segments = language.split_view_if(is_any_of("-_"sv), true); // keep_empty=true to ensure valid data follows a separator.
|
|
size_t index = 0;
|
|
|
|
if (segments.size() == index)
|
|
return {};
|
|
|
|
if (is_unicode_language_subtag(segments[index])) {
|
|
language_id.language = segments[index];
|
|
if (segments.size() == ++index)
|
|
return language_id;
|
|
}
|
|
|
|
if (is_unicode_script_subtag(segments[index])) {
|
|
language_id.script = segments[index];
|
|
if (segments.size() == ++index)
|
|
return language_id;
|
|
} else if (!language_id.language.has_value()) {
|
|
return {};
|
|
}
|
|
|
|
if (is_unicode_region_subtag(segments[index])) {
|
|
language_id.region = segments[index];
|
|
if (segments.size() == ++index)
|
|
return language_id;
|
|
}
|
|
|
|
while (index < segments.size()) {
|
|
if (!is_unicode_variant_subtag(segments[index]))
|
|
return {};
|
|
language_id.variants.append(segments[index++]);
|
|
}
|
|
|
|
return language_id;
|
|
}
|
|
|
|
Optional<LocaleID> parse_unicode_locale_id(StringView locale)
|
|
{
|
|
LocaleID locale_id {};
|
|
|
|
// https://unicode.org/reports/tr35/#Unicode_locale_identifier
|
|
//
|
|
// unicode_locale_id = unicode_language_id
|
|
// extensions*
|
|
// pu_extensions?
|
|
auto language_id = parse_unicode_language_id(locale);
|
|
if (!language_id.has_value())
|
|
return {};
|
|
|
|
// FIXME: Handle extensions and pu_extensions.
|
|
return LocaleID { language_id.release_value() };
|
|
}
|
|
|
|
Optional<String> canonicalize_unicode_locale_id(LocaleID& locale_id)
|
|
{
|
|
// https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers
|
|
StringBuilder builder;
|
|
|
|
if (!locale_id.language_id.language.has_value())
|
|
return {};
|
|
|
|
builder.append(locale_id.language_id.language->to_lowercase_string());
|
|
|
|
if (locale_id.language_id.script.has_value()) {
|
|
builder.append('-');
|
|
builder.append(locale_id.language_id.script->to_titlecase_string());
|
|
}
|
|
|
|
if (locale_id.language_id.region.has_value()) {
|
|
builder.append('-');
|
|
builder.append(locale_id.language_id.region->to_uppercase_string());
|
|
}
|
|
|
|
quick_sort(locale_id.language_id.variants);
|
|
|
|
for (auto const& variant : locale_id.language_id.variants) {
|
|
builder.append('-');
|
|
builder.append(variant.to_lowercase_string());
|
|
}
|
|
|
|
// FIXME: Handle extensions and pu_extensions.
|
|
|
|
return builder.build();
|
|
}
|
|
|
|
String const& default_locale()
|
|
{
|
|
static String locale = "en"sv;
|
|
return locale;
|
|
}
|
|
|
|
bool is_locale_available([[maybe_unused]] StringView locale)
|
|
{
|
|
#if ENABLE_UNICODE_DATA
|
|
return Detail::locale_from_string(locale).has_value();
|
|
#else
|
|
return false;
|
|
#endif
|
|
}
|
|
|
|
Optional<StringView> get_locale_territory_mapping([[maybe_unused]] StringView locale, [[maybe_unused]] StringView territory)
|
|
{
|
|
#if ENABLE_UNICODE_DATA
|
|
return Detail::get_locale_territory_mapping(locale, territory);
|
|
#else
|
|
return {};
|
|
#endif
|
|
}
|
|
|
|
}
|