AK: Add CharacterTypes.h

This patch introduces CharacterTypes.h, which aims to be a replacement
for most usages of ctype.h. In contrast to that implementation, this
header makes use of exclusively constexpr functions and support the full
Unicode code point set without any narrowing-conversion issues.
This commit is contained in:
Max Wipfli 2021-06-01 13:43:30 +02:00 committed by Andreas Kling
parent d0dbb014a0
commit f4a4c36fa0
Notes: sideshowbarker 2024-07-18 16:57:37 +09:00

167
AK/CharacterTypes.h Normal file
View file

@ -0,0 +1,167 @@
/*
* Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Types.h>
// NOTE: For a quick reference for most of this, see https://www.cplusplus.com/reference/cctype/ and https://infra.spec.whatwg.org/#code-points.
// NOTE: To avoid ambiguity when including this header, all methods contains names should contain "ascii" or "unicode".
namespace AK {
constexpr bool is_ascii(u32 code_point)
{
return code_point < 0x80;
}
constexpr bool is_ascii_digit(u32 code_point)
{
return code_point >= '0' && code_point <= '9';
}
constexpr bool is_ascii_upper_alpha(u32 code_point)
{
return (code_point >= 'A' && code_point <= 'Z');
}
constexpr bool is_ascii_lower_alpha(u32 code_point)
{
return (code_point >= 'a' && code_point <= 'z');
}
constexpr bool is_ascii_alpha(u32 code_point)
{
return is_ascii_lower_alpha(code_point) || is_ascii_upper_alpha(code_point);
}
constexpr bool is_ascii_alphanumeric(u32 code_point)
{
return is_ascii_alpha(code_point) || is_ascii_digit(code_point);
}
constexpr bool is_ascii_hex_digit(u32 code_point)
{
return is_ascii_digit(code_point) || (code_point >= 'A' && code_point <= 'F') || (code_point >= 'a' && code_point <= 'f');
}
constexpr bool is_ascii_blank(u32 code_point)
{
return code_point == '\t' || code_point == ' ';
}
constexpr bool is_ascii_space(u32 code_point)
{
return code_point == ' ' || code_point == '\t' || code_point == '\n' || code_point == '\v' || code_point == '\f' || code_point == '\r';
}
constexpr bool is_ascii_punctuation(u32 code_point)
{
return (code_point >= 0x21 && code_point <= 0x2F) || (code_point >= 0x3A && code_point <= 0x40) || (code_point >= 0x5B && code_point <= 0x60) || (code_point >= 0x7B && code_point <= 0x7E);
}
constexpr bool is_ascii_graphical(u32 code_point)
{
return code_point >= 0x21 && code_point <= 0x7E;
}
constexpr bool is_ascii_printable(u32 code_point)
{
return code_point >= 0x20 && code_point <= 0x7E;
}
constexpr bool is_ascii_c0_control(u32 code_point)
{
return code_point < 0x20;
}
constexpr bool is_ascii_control(u32 code_point)
{
return is_ascii_c0_control(code_point) || code_point == 0x7F;
}
constexpr bool is_unicode(u32 code_point)
{
return code_point <= 0x10FFFF;
}
constexpr bool is_unicode_control(u32 code_point)
{
return is_ascii_c0_control(code_point) || (code_point >= 0x7E && code_point <= 0x9F);
}
constexpr bool is_unicode_surrogate(u32 code_point)
{
return code_point >= 0xD800 && code_point <= 0xDFFF;
}
constexpr bool is_unicode_scalar_value(u32 code_point)
{
return is_unicode(code_point) && !is_unicode_surrogate(code_point);
}
constexpr bool is_unicode_noncharacter(u32 code_point)
{
return is_unicode(code_point) && ((code_point >= 0xFDD0 && code_point <= 0xFDEF) || ((code_point & 0xFFFE) == 0xFFFE) || ((code_point & 0xFFFF) == 0xFFFF));
}
constexpr u32 to_ascii_lowercase(u32 code_point)
{
if (is_ascii_upper_alpha(code_point))
return code_point + 0x20;
return code_point;
}
constexpr u32 to_ascii_uppercase(u32 code_point)
{
if (is_ascii_lower_alpha(code_point))
return code_point - 0x20;
return code_point;
}
constexpr u32 parse_ascii_digit(u32 code_point)
{
if (is_ascii_digit(code_point))
return code_point - '0';
VERIFY_NOT_REACHED();
}
constexpr u32 parse_ascii_hex_digit(u32 code_point)
{
if (is_ascii_digit(code_point))
return parse_ascii_digit(code_point);
if (code_point >= 'A' && code_point <= 'F')
return code_point - 'A' + 10;
if (code_point >= 'a' && code_point <= 'f')
return code_point - 'a' + 10;
VERIFY_NOT_REACHED();
}
}
using AK::is_ascii;
using AK::is_ascii_alpha;
using AK::is_ascii_alphanumeric;
using AK::is_ascii_blank;
using AK::is_ascii_c0_control;
using AK::is_ascii_control;
using AK::is_ascii_digit;
using AK::is_ascii_graphical;
using AK::is_ascii_hex_digit;
using AK::is_ascii_lower_alpha;
using AK::is_ascii_printable;
using AK::is_ascii_punctuation;
using AK::is_ascii_space;
using AK::is_ascii_upper_alpha;
using AK::is_unicode;
using AK::is_unicode_control;
using AK::is_unicode_noncharacter;
using AK::is_unicode_scalar_value;
using AK::is_unicode_surrogate;
using AK::parse_ascii_digit;
using AK::parse_ascii_hex_digit;
using AK::to_ascii_lowercase;
using AK::to_ascii_uppercase;