wchar.cpp 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Assertions.h>
  7. #include <AK/Format.h>
  8. #include <errno.h>
  9. #include <wchar.h>
  10. static unsigned int mbstate_expected_bytes(mbstate_t* state)
  11. {
  12. if (state->stored_bytes == 0) {
  13. return 0;
  14. }
  15. unsigned char first = state->bytes[0];
  16. // Single-byte sequences have their first bit unset
  17. if ((first & 0b10000000) == 0) {
  18. return 1;
  19. }
  20. // Two-byte sequences start with 0b110xxxxx
  21. if ((first & 0b11100000) == 0b11000000) {
  22. return 2;
  23. }
  24. // Three-byte sequences start with 0b1110xxxx
  25. if ((first & 0b11110000) == 0b11100000) {
  26. return 3;
  27. }
  28. // Four-byte sequences start with 0b11110xxx
  29. if ((first & 0b11111000) == 0b11110000) {
  30. return 4;
  31. }
  32. // Everything else is invalid
  33. return 0;
  34. }
  35. extern "C" {
  // Counts the wide characters in `str` that precede the terminating null wide character.
  size_t wcslen(const wchar_t* str)
  {
      const wchar_t* cursor = str;
      while (*cursor != L'\0')
          ++cursor;
      return static_cast<size_t>(cursor - str);
  }
  // Copies `src`, including its terminating null wide character, into `dest`.
  // Returns `dest`.
  wchar_t* wcscpy(wchar_t* dest, const wchar_t* src)
  {
      for (size_t index = 0;; ++index) {
          dest[index] = src[index];
          // The terminator has just been copied, so we are done.
          if (dest[index] == L'\0')
              break;
      }
      return dest;
  }
  // Copies at most `num` wide characters from `src` into `dest`.
  //
  // If `src` is shorter than `num`, the rest of `dest` (up to `num` elements) is
  // filled with null wide characters. If `src` is `num` characters or longer,
  // `dest` is NOT null-terminated. Nothing is written when `num` is 0.
  // Returns `dest`.
  wchar_t* wcsncpy(wchar_t* dest, const wchar_t* src, size_t num)
  {
      size_t index = 0;

      // Copy characters until we run out of room or reach the end of the source.
      for (; index < num && src[index] != L'\0'; ++index)
          dest[index] = src[index];

      // Pad whatever is left of the requested span with null wide characters.
      for (; index < num; ++index)
          dest[index] = L'\0';

      return dest;
  }
  // Compares two null-terminated wide strings element by element.
  //
  // Returns 0 if they are equal, a negative value if the first differing
  // character of `s1` is smaller than that of `s2`, and a positive value otherwise.
  int wcscmp(const wchar_t* s1, const wchar_t* s2)
  {
      for (;; ++s1, ++s2) {
          wchar_t lhs = *s1;
          wchar_t rhs = *s2;

          // Compare instead of subtracting: the difference of two wchar_t values
          // may overflow (signed wchar_t) or wrap to the wrong sign (unsigned wchar_t).
          if (lhs != rhs)
              return lhs < rhs ? -1 : 1;

          if (lhs == L'\0')
              return 0;
      }
  }
  // Compares at most `n` wide characters of two null-terminated wide strings.
  //
  // Comparison stops early at the first difference or at a terminating null wide
  // character. Returns 0 if the inspected prefixes are equal (always the case for
  // n == 0), a negative value if `s1` sorts first, and a positive value otherwise.
  int wcsncmp(const wchar_t* s1, const wchar_t* s2, size_t n)
  {
      for (size_t index = 0; index < n; ++index) {
          wchar_t lhs = s1[index];
          wchar_t rhs = s2[index];

          // Compare instead of subtracting: the difference of two wchar_t values
          // may overflow (signed wchar_t) or wrap to the wrong sign (unsigned wchar_t).
          if (lhs != rhs)
              return lhs < rhs ? -1 : 1;

          // Both strings ended at the same position.
          if (lhs == L'\0')
              break;
      }
      return 0;
  }
  // Finds the first occurrence of `c` (converted to wchar_t) in `str`.
  // The terminating null wide character counts as part of the string, so
  // searching for 0 yields a pointer to the terminator.
  // Returns nullptr if the character does not occur.
  wchar_t* wcschr(const wchar_t* str, int c)
  {
      wchar_t needle = c;
      const wchar_t* cursor = str;

      while (*cursor != needle) {
          // Reached the end without a match.
          if (*cursor == L'\0')
              return nullptr;
          ++cursor;
      }

      return const_cast<wchar_t*>(cursor);
  }
  // Finds the last occurrence of `wc` in `str`.
  // The terminating null wide character counts as part of the string, so
  // searching for L'\0' yields a pointer to the terminator.
  // Returns nullptr if the character does not occur.
  const wchar_t* wcsrchr(const wchar_t* str, wchar_t wc)
  {
      const wchar_t* last = nullptr;

      for (;; ++str) {
          if (*str == wc)
              last = str;

          // Test for the end only after comparing, so that the terminator itself can match.
          if (*str == L'\0')
              return last;
      }
  }
  // Appends `src` to the end of the wide string in `dest` and null-terminates the result.
  // Returns `dest`.
  wchar_t* wcscat(wchar_t* dest, const wchar_t* src)
  {
      // Start writing where the existing string's terminator sits.
      wchar_t* out = dest + wcslen(dest);

      while (*src != L'\0')
          *out++ = *src++;
      *out = L'\0';

      return dest;
  }
  // Appends at most `n` wide characters of `src` to the wide string in `dest`,
  // then always writes a terminating null wide character.
  // Returns `dest`.
  wchar_t* wcsncat(wchar_t* dest, const wchar_t* src, size_t n)
  {
      // Start writing where the existing string's terminator sits.
      wchar_t* out = dest + wcslen(dest);

      for (size_t copied = 0; copied < n && src[copied] != L'\0'; ++copied)
          *out++ = src[copied];
      *out = L'\0';

      return dest;
  }
  // Tells whether `ch` is one of the wide characters listed in the
  // null-terminated set `delim`.
  static bool wcstok_is_delimiter(wchar_t ch, const wchar_t* delim)
  {
      for (; *delim != L'\0'; ++delim) {
          if (*delim == ch)
              return true;
      }
      return false;
  }

  // Splits a wide string into tokens separated by any character from `delim`.
  //
  // Pass the string as `str` on the first call and nullptr on subsequent calls;
  // the scan position is kept in `*ptr` between calls. The input is modified:
  // the delimiter that ends a token is overwritten with a null wide character.
  //
  // Returns a pointer to the next token, or nullptr when no tokens remain.
  wchar_t* wcstok(wchar_t* str, const wchar_t* delim, wchar_t** ptr)
  {
      wchar_t* cursor = str ? str : *ptr;

      // Nothing to continue from.
      if (!cursor)
          return nullptr;

      // Skip beginning delimiters
      while (*cursor != L'\0' && wcstok_is_delimiter(*cursor, delim))
          ++cursor;

      if (*cursor == L'\0') {
          // Remember the end so that further calls keep returning nullptr.
          *ptr = cursor;
          return nullptr;
      }

      wchar_t* token = cursor;
      while (*cursor != L'\0' && !wcstok_is_delimiter(*cursor, delim))
          ++cursor;

      if (*cursor != L'\0') {
          // Terminate the token and resume right after the delimiter next time.
          *cursor = L'\0';
          *ptr = cursor + 1;
      } else {
          // The token runs to the end of the string; the next call finds nothing.
          *ptr = cursor;
      }

      return token;
  }
  150. long wcstol(const wchar_t*, wchar_t**, int)
  151. {
  152. dbgln("FIXME: Implement wcstol()");
  153. TODO();
  154. }
  155. long long wcstoll(const wchar_t*, wchar_t**, int)
  156. {
  157. dbgln("FIXME: Implement wcstoll()");
  158. TODO();
  159. }
  // Converts a single byte to its wide-character equivalent.
  //
  // Returns WEOF if `c` is EOF or if the byte is not a complete character on its
  // own (i.e. it is not ASCII); otherwise returns the byte's value.
  wint_t btowc(int c)
  {
      if (c == EOF) {
          return WEOF;
      }

      // The argument is classified by its value as an unsigned char, so only the
      // low eight bits matter. This keeps values above 0xFF from being returned as-is.
      unsigned char byte = static_cast<unsigned char>(c);

      // Multi-byte sequences in UTF-8 have their highest bit set
      if (byte & (1 << 7)) {
          return WEOF;
      }

      return byte;
  }
  171. size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* state)
  172. {
  173. static mbstate_t _anonymous_state = {};
  174. if (state == nullptr) {
  175. state = &_anonymous_state;
  176. }
  177. // s being a null pointer is a shorthand for reading a single null byte.
  178. if (s == nullptr) {
  179. pwc = nullptr;
  180. s = "";
  181. n = 1;
  182. }
  183. // Stop early if we can't read anything
  184. if (n == 0) {
  185. return 0;
  186. }
  187. size_t consumed_bytes = 0;
  188. // Fill the first byte if we haven't done that yet
  189. if (state->stored_bytes == 0) {
  190. state->bytes[state->stored_bytes++] = s[0];
  191. consumed_bytes++;
  192. }
  193. size_t expected_bytes = mbstate_expected_bytes(state);
  194. // Check if the first byte is invalid
  195. if (expected_bytes == 0) {
  196. *state = {};
  197. errno = EILSEQ;
  198. return -1;
  199. }
  200. while (state->stored_bytes < expected_bytes) {
  201. if (consumed_bytes == n) {
  202. // No complete multibyte character
  203. return -2;
  204. }
  205. unsigned char c = s[consumed_bytes];
  206. // Continuation bytes have to start with 0b10xxxxxx
  207. if ((c & 0b11000000) != 0b10000000) {
  208. // Invalid multibyte character
  209. *state = {};
  210. errno = EILSEQ;
  211. return -1;
  212. }
  213. state->bytes[state->stored_bytes++] = c;
  214. consumed_bytes++;
  215. }
  216. wchar_t codepoint = state->bytes[0];
  217. // Mask out the "length" bits if necessary
  218. if (expected_bytes > 1) {
  219. codepoint &= (1 << (7 - expected_bytes)) - 1;
  220. }
  221. for (unsigned int i = 1; i < expected_bytes; i++) {
  222. // Each continuation byte contains 6 bits of data
  223. codepoint = codepoint << 6;
  224. codepoint |= state->bytes[i] & 0b111111;
  225. }
  226. if (pwc) {
  227. *pwc = codepoint;
  228. }
  229. // We want to read the next multibyte character, but keep all other properties.
  230. state->stored_bytes = 0;
  231. if (codepoint == 0) {
  232. *state = {};
  233. return 0;
  234. }
  235. return consumed_bytes;
  236. }
  // Determines how many bytes of `s` (inspecting at most `n`) make up the next
  // multibyte character, without storing the decoded character.
  //
  // Behaves like mbrtowc(nullptr, s, n, state) and returns whatever that call
  // returns. When `state` is nullptr, a state object private to mbrlen() is used
  // rather than the one mbrtowc() falls back to.
  size_t mbrlen(const char* s, size_t n, mbstate_t* state)
  {
      static mbstate_t anonymous_state = {};

      if (state == nullptr)
          state = &anonymous_state;

      return mbrtowc(nullptr, s, n, state);
  }
  // Encodes the wide character `wc` as UTF-8 into the buffer `s`.
  //
  // UTF-8 needs no shift state, so the state argument is ignored. If `s` is
  // nullptr, the call acts as if the null wide character were encoded into an
  // internal buffer and therefore returns 1.
  //
  // Returns the number of bytes written (1 to 4), or (size_t)-1 with errno set
  // to EILSEQ if `wc` lies outside the range 0..0x10FFFF.
  // NOTE: surrogate code points are not rejected, mirroring what mbrtowc() accepts.
  size_t wcrtomb(char* s, wchar_t wc, mbstate_t*)
  {
      if (s == nullptr)
          return 1;

      // Negative values (where wchar_t is signed) turn into huge values here and
      // are rejected by the range check at the bottom.
      unsigned long code_point = static_cast<unsigned long>(wc);

      if (code_point <= 0x7f) {
          s[0] = static_cast<char>(code_point);
          return 1;
      }

      if (code_point <= 0x7ff) {
          s[0] = static_cast<char>(0xc0 | (code_point >> 6));
          s[1] = static_cast<char>(0x80 | (code_point & 0x3f));
          return 2;
      }

      if (code_point <= 0xffff) {
          s[0] = static_cast<char>(0xe0 | (code_point >> 12));
          s[1] = static_cast<char>(0x80 | ((code_point >> 6) & 0x3f));
          s[2] = static_cast<char>(0x80 | (code_point & 0x3f));
          return 3;
      }

      if (code_point <= 0x10ffff) {
          s[0] = static_cast<char>(0xf0 | (code_point >> 18));
          s[1] = static_cast<char>(0x80 | ((code_point >> 12) & 0x3f));
          s[2] = static_cast<char>(0x80 | ((code_point >> 6) & 0x3f));
          s[3] = static_cast<char>(0x80 | (code_point & 0x3f));
          return 4;
      }

      errno = EILSEQ;
      return static_cast<size_t>(-1);
  }
  // Compares two wide strings for collation purposes.
  //
  // TODO: Actually implement a sensible sort order for this,
  // because right now we are doing what LC_COLLATE=C would do.
  int wcscoll(const wchar_t* ws1, const wchar_t* ws2)
  {
      int ordering = wcscmp(ws1, ws2);
      return ordering;
  }
  // Converts a wide character to its single-byte representation.
  //
  // This is the inverse of btowc(): only ASCII characters fit into a single
  // byte. Returns EOF for everything else, including WEOF.
  int wctob(wint_t c)
  {
      // Going through an unsigned type also rejects negative values, should wint_t be signed.
      if (static_cast<unsigned long>(c) > 0x7f)
          return EOF;

      return static_cast<int>(c);
  }
  258. int mbsinit(const mbstate_t* state)
  259. {
  260. if (!state) {
  261. return 1;
  262. }
  263. if (state->stored_bytes != 0) {
  264. return 0;
  265. }
  266. return 1;
  267. }
  // Finds the first wide character in `wcs` that also occurs in `accept`.
  // Returns a pointer to that character, or nullptr if there is none.
  wchar_t* wcspbrk(const wchar_t* wcs, const wchar_t* accept)
  {
      // Walk the searched string in the outer loop, so that the earliest match in
      // `wcs` wins regardless of the order of the characters in `accept`.
      for (const wchar_t* candidate = wcs; *candidate != L'\0'; ++candidate) {
          for (const wchar_t* wanted = accept; *wanted != L'\0'; ++wanted) {
              if (*candidate == *wanted)
                  return const_cast<wchar_t*>(candidate);
          }
      }
      return nullptr;
  }
  277. wchar_t* wcsstr(const wchar_t* haystack, const wchar_t* needle)
  278. {
  279. size_t nlen = wcslen(needle);
  280. if (nlen == 0)
  281. return const_cast<wchar_t*>(haystack);
  282. size_t hlen = wcslen(haystack);
  283. while (hlen >= nlen) {
  284. if (wcsncmp(haystack, needle, nlen) == 0)
  285. return const_cast<wchar_t*>(haystack);
  286. haystack++;
  287. hlen--;
  288. }
  289. return nullptr;
  290. }
  // Searches the first `n` wide characters of `s` for `c`.
  // Null wide characters are treated like any other value.
  // Returns a pointer to the first match, or nullptr if there is none.
  wchar_t* wmemchr(const wchar_t* s, wchar_t c, size_t n)
  {
      const wchar_t* cursor = s;
      for (size_t remaining = n; remaining > 0; --remaining, ++cursor) {
          if (*cursor == c)
              return const_cast<wchar_t*>(cursor);
      }
      return nullptr;
  }
  // Copies `n` wide characters from `src` to `dest`, front to back.
  // Returns `dest`.
  wchar_t* wmemcpy(wchar_t* dest, const wchar_t* src, size_t n)
  {
      wchar_t* out = dest;
      for (size_t remaining = n; remaining > 0; --remaining)
          *out++ = *src++;
      return dest;
  }
  // Stores `wc` into each of the first `n` elements of `wcs`.
  // Returns `wcs`.
  wchar_t* wmemset(wchar_t* wcs, wchar_t wc, size_t n)
  {
      wchar_t* out = wcs;
      for (size_t remaining = n; remaining > 0; --remaining)
          *out++ = wc;
      return wcs;
  }
  // Copies `n` wide characters from `src` to `dest`; the two ranges may overlap.
  // Returns `dest`.
  wchar_t* wmemmove(wchar_t* dest, const wchar_t* src, size_t n)
  {
      // Moving a range onto itself requires no work.
      if (dest == src)
          return dest;

      if (dest < src) {
          // The destination starts first: copy front to back.
          for (size_t index = 0; index < n; ++index)
              dest[index] = src[index];
          return dest;
      }

      // The destination starts later: copy back to front, so that elements are
      // read before they get overwritten.
      for (size_t index = n; index > 0; --index)
          dest[index - 1] = src[index - 1];
      return dest;
  }
  325. unsigned long wcstoul(const wchar_t*, wchar_t**, int)
  326. {
  327. dbgln("TODO: Implement wcstoul()");
  328. TODO();
  329. }
  330. unsigned long long wcstoull(const wchar_t*, wchar_t**, int)
  331. {
  332. dbgln("TODO: Implement wcstoull()");
  333. TODO();
  334. }
  335. float wcstof(const wchar_t*, wchar_t**)
  336. {
  337. dbgln("TODO: Implement wcstof()");
  338. TODO();
  339. }
  340. double wcstod(const wchar_t*, wchar_t**)
  341. {
  342. dbgln("TODO: Implement wcstod()");
  343. TODO();
  344. }
  345. long double wcstold(const wchar_t*, wchar_t**)
  346. {
  347. dbgln("TODO: Implement wcstold()");
  348. TODO();
  349. }
  350. int swprintf(wchar_t*, size_t, const wchar_t*, ...)
  351. {
  352. dbgln("TODO: Implement swprintf()");
  353. TODO();
  354. }
  // Reports the number of terminal columns needed to display `wc`.
  //
  // Returns 0 for the null wide character, 1 for printable ASCII, and -1 for
  // every other value up to 0x7f. Anything above 0x7f is currently assumed to be
  // one column wide.
  int wcwidth(wchar_t wc)
  {
      if (wc == L'\0')
          return 0;

      // TODO: Implement wcwidth for non-ASCII characters.
      if (wc > 0x7f)
          return 1;

      // What remains is ASCII (or below): printable characters span 0x20..0x7e.
      bool is_printable = wc >= 0x20 && wc <= 0x7e;
      return is_printable ? 1 : -1;
  }
  368. }