wchar.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Assertions.h>
  7. #include <AK/Format.h>
  8. #include <AK/UnicodeUtils.h>
  9. #include <errno.h>
  10. #include <string.h>
  11. #include <wchar.h>
  12. static unsigned int mbstate_expected_bytes(mbstate_t* state)
  13. {
  14. if (state->stored_bytes == 0) {
  15. return 0;
  16. }
  17. unsigned char first = state->bytes[0];
  18. // Single-byte sequences have their first bit unset
  19. if ((first & 0b10000000) == 0) {
  20. return 1;
  21. }
  22. // Two-byte sequences start with 0b110xxxxx
  23. if ((first & 0b11100000) == 0b11000000) {
  24. return 2;
  25. }
  26. // Three-byte sequences start with 0b1110xxxx
  27. if ((first & 0b11110000) == 0b11100000) {
  28. return 3;
  29. }
  30. // Four-byte sequences start with 0b11110xxx
  31. if ((first & 0b11111000) == 0b11110000) {
  32. return 4;
  33. }
  34. // Everything else is invalid
  35. return 0;
  36. }
  37. extern "C" {
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcslen.html
  // Returns the number of wide characters in `str` that precede the
  // terminating null wide character.
  size_t wcslen(wchar_t const* str)
  {
      wchar_t const* cursor = str;
      while (*cursor != L'\0')
          ++cursor;
      return static_cast<size_t>(cursor - str);
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcscpy.html
  // Copies `src`, including its terminating null, into `dest` and returns `dest`.
  wchar_t* wcscpy(wchar_t* dest, wchar_t const* src)
  {
      for (size_t index = 0;; ++index) {
          wchar_t const current = src[index];
          dest[index] = current;
          // The terminator is copied too; it is also what ends the loop.
          if (current == L'\0')
              break;
      }
      return dest;
  }
  54. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcsdup.html
  55. wchar_t* wcsdup(wchar_t const* str)
  56. {
  57. size_t length = wcslen(str);
  58. wchar_t* new_str = (wchar_t*)malloc(sizeof(wchar_t) * (length + 1));
  59. if (!new_str) {
  60. errno = ENOMEM;
  61. return nullptr;
  62. }
  63. return wcscpy(new_str, str);
  64. }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcsncpy.html
  // Copies at most `num` wide characters from `src` to `dest` and returns `dest`.
  //
  // - If `src` is shorter than `num`, the remainder of `dest` (up to `num`
  //   elements in total) is filled with null wide characters.
  // - If `src` has `num` or more characters, `dest` is NOT null-terminated.
  // - If `num` is zero, nothing is written.
  wchar_t* wcsncpy(wchar_t* dest, wchar_t const* src, size_t num)
  {
      size_t index = 0;

      // Copy characters until either the limit or the end of src is reached.
      for (; index < num && src[index] != L'\0'; ++index)
          dest[index] = src[index];

      // Pad whatever is left of the num-element window with nulls.
      for (; index < num; ++index)
          dest[index] = L'\0';

      return dest;
  }
  // Copies up to n - 1 wide characters from `src` into `dest` and, whenever
  // n is non-zero, null-terminates `dest`. Returns the full length of `src`.
  size_t wcslcpy(wchar_t* dest, wchar_t const* src, size_t n)
  {
      size_t copied = 0;

      if (n != 0) {
          // Leave room for the terminator.
          while (copied + 1 < n && src[copied] != L'\0') {
              dest[copied] = src[copied];
              ++copied;
          }
          dest[copied] = L'\0';
      }

      // Keep walking src (without copying) to find out how long it really is.
      size_t source_length = copied;
      while (src[source_length] != L'\0')
          ++source_length;

      return source_length;
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcscmp.html
  // Compares two wide strings element by element. Returns 0 if they are
  // equal, otherwise the difference between the first pair of differing
  // wide characters.
  int wcscmp(wchar_t const* s1, wchar_t const* s2)
  {
      for (;; ++s1, ++s2) {
          if (*s1 != *s2)
              return *s1 - *s2;
          // Both characters are equal here; a null means both strings ended.
          if (*s1 == 0)
              return 0;
      }
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcsncmp.html
  // Compares at most `n` wide characters of two wide strings. Returns 0 if
  // the compared prefixes are equal, otherwise the difference between the
  // first pair of differing wide characters.
  int wcsncmp(wchar_t const* s1, wchar_t const* s2, size_t n)
  {
      for (size_t index = 0; index < n; ++index) {
          if (s1[index] != s2[index])
              return s1[index] - s2[index];
          // Equal and null: both strings ended before the limit.
          if (s1[index] == 0)
              break;
      }
      return 0;
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcschr.html
  // Returns a pointer to the first occurrence of `c` in `str`, or nullptr if
  // it does not occur. The terminating null counts as part of the string, so
  // searching for 0 yields a pointer to the terminator.
  wchar_t* wcschr(wchar_t const* str, int c)
  {
      wchar_t const needle = c;
      wchar_t const* cursor = str;

      // The match test comes first so that a null needle finds the terminator.
      while (*cursor != needle) {
          if (*cursor == L'\0')
              return nullptr;
          ++cursor;
      }
      return const_cast<wchar_t*>(cursor);
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcsrchr.html
  // Returns a pointer to the last occurrence of `wc` in `str`, or nullptr if
  // it does not occur.
  //
  // The terminating null wide character is considered part of the string, so
  // wcsrchr(str, L'\0') returns a pointer to the terminator.
  wchar_t* wcsrchr(wchar_t const* str, wchar_t wc)
  {
      wchar_t const* last_match = nullptr;

      for (wchar_t const* cursor = str;; ++cursor) {
          if (*cursor == wc)
              last_match = cursor;
          // Test for the end only after comparing, so the terminator itself
          // can be matched.
          if (*cursor == L'\0')
              break;
      }

      return const_cast<wchar_t*>(last_match);
  }
  127. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcscat.html
  128. wchar_t* wcscat(wchar_t* dest, wchar_t const* src)
  129. {
  130. size_t dest_length = wcslen(dest);
  131. size_t i;
  132. for (i = 0; src[i] != '\0'; i++)
  133. dest[dest_length + i] = src[i];
  134. dest[dest_length + i] = '\0';
  135. return dest;
  136. }
  137. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcsncat.html
  138. wchar_t* wcsncat(wchar_t* dest, wchar_t const* src, size_t n)
  139. {
  140. size_t dest_length = wcslen(dest);
  141. size_t i;
  142. for (i = 0; i < n && src[i] != '\0'; i++)
  143. dest[dest_length + i] = src[i];
  144. dest[dest_length + i] = '\0';
  145. return dest;
  146. }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcstok.html
  // Splits a wide string into tokens separated by any of the wide characters
  // in `delim`.
  //
  // str:   The string to tokenize on the first call; nullptr on subsequent
  //        calls to continue tokenizing the same string.
  // delim: Null-terminated set of delimiter characters (may differ per call).
  // ptr:   Caller-provided save pointer. It is written on every call and read
  //        whenever `str` is nullptr, which makes the function re-entrant.
  //
  // Returns a pointer to the next token (null-terminated in place inside the
  // original buffer), or nullptr when no further tokens exist.
  wchar_t* wcstok(wchar_t* str, wchar_t const* delim, wchar_t** ptr)
  {
      auto is_delimiter = [delim](wchar_t ch) {
          for (wchar_t const* candidate = delim; *candidate != L'\0'; ++candidate) {
              if (*candidate == ch)
                  return true;
          }
          return false;
      };

      wchar_t* cursor = str ? str : *ptr;
      // Continuing without any saved position: there is nothing to tokenize.
      if (!cursor)
          return nullptr;

      // Skip delimiters in front of the token.
      while (*cursor != L'\0' && is_delimiter(*cursor))
          ++cursor;

      if (*cursor == L'\0') {
          // Only delimiters (or nothing) left. Park the save pointer on the
          // terminator so that further calls keep returning nullptr.
          *ptr = cursor;
          return nullptr;
      }

      wchar_t* const token = cursor;

      // Advance to the end of the token.
      while (*cursor != L'\0' && !is_delimiter(*cursor))
          ++cursor;

      // Terminate the token in place and resume after the delimiter next time.
      // If the token ran up to the end of the string, resume at the terminator.
      if (*cursor != L'\0')
          *cursor++ = L'\0';

      *ptr = cursor;
      return token;
  }
  184. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcstol.html
  185. long wcstol(wchar_t const*, wchar_t**, int)
  186. {
  187. dbgln("FIXME: Implement wcstol()");
  188. TODO();
  189. }
  190. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcstoll.html
  191. long long wcstoll(wchar_t const*, wchar_t**, int)
  192. {
  193. dbgln("FIXME: Implement wcstoll()");
  194. TODO();
  195. }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/btowc.html
  // Converts a single byte to its wide character. Returns WEOF for EOF and
  // for any value with bit 7 set; every other value is returned unchanged.
  wint_t btowc(int c)
  {
      if (c == EOF)
          return WEOF;

      // Multi-byte sequences in UTF-8 have their highest bit set, so such a
      // byte cannot stand for a character on its own.
      bool const has_high_bit = (c & 0x80) != 0;
      if (has_high_bit)
          return WEOF;

      return c;
  }
  208. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/mbrtowc.html
  209. size_t mbrtowc(wchar_t* pwc, char const* s, size_t n, mbstate_t* state)
  210. {
  211. static mbstate_t _anonymous_state = {};
  212. if (state == nullptr) {
  213. state = &_anonymous_state;
  214. }
  215. // s being a null pointer is a shorthand for reading a single null byte.
  216. if (s == nullptr) {
  217. pwc = nullptr;
  218. s = "";
  219. n = 1;
  220. }
  221. // Stop early if we can't read anything
  222. if (n == 0) {
  223. return 0;
  224. }
  225. size_t consumed_bytes = 0;
  226. // Fill the first byte if we haven't done that yet
  227. if (state->stored_bytes == 0) {
  228. state->bytes[state->stored_bytes++] = s[0];
  229. consumed_bytes++;
  230. }
  231. size_t expected_bytes = mbstate_expected_bytes(state);
  232. // Check if the first byte is invalid
  233. if (expected_bytes == 0) {
  234. *state = {};
  235. errno = EILSEQ;
  236. return -1;
  237. }
  238. while (state->stored_bytes < expected_bytes) {
  239. if (consumed_bytes == n) {
  240. // No complete multibyte character
  241. return -2;
  242. }
  243. unsigned char c = s[consumed_bytes];
  244. // Continuation bytes have to start with 0b10xxxxxx
  245. if ((c & 0b11000000) != 0b10000000) {
  246. // Invalid multibyte character
  247. *state = {};
  248. errno = EILSEQ;
  249. return -1;
  250. }
  251. state->bytes[state->stored_bytes++] = c;
  252. consumed_bytes++;
  253. }
  254. wchar_t codepoint = state->bytes[0];
  255. // Mask out the "length" bits if necessary
  256. if (expected_bytes > 1) {
  257. codepoint &= (1 << (7 - expected_bytes)) - 1;
  258. }
  259. for (unsigned int i = 1; i < expected_bytes; i++) {
  260. // Each continuation byte contains 6 bits of data
  261. codepoint = codepoint << 6;
  262. codepoint |= state->bytes[i] & 0b111111;
  263. }
  264. if (pwc) {
  265. *pwc = codepoint;
  266. }
  267. // We want to read the next multibyte character, but keep all other properties.
  268. state->stored_bytes = 0;
  269. if (codepoint == 0) {
  270. *state = {};
  271. return 0;
  272. }
  273. return consumed_bytes;
  274. }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/mbrlen.html
  // Forwards to mbrtowc() with a null output pointer and returns its result.
  // When `ps` is nullptr, a state private to mbrlen() is used.
  size_t mbrlen(char const* s, size_t n, mbstate_t* ps)
  {
      static mbstate_t anonymous_state = {};
      mbstate_t* const effective_state = ps ? ps : &anonymous_state;
      return mbrtowc(nullptr, s, n, effective_state);
  }
  283. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcrtomb.html
  284. size_t wcrtomb(char* s, wchar_t wc, mbstate_t*)
  285. {
  286. if (s == nullptr)
  287. wc = L'\0';
  288. auto nwritten = AK::UnicodeUtils::code_point_to_utf8(wc, [&s](char byte) {
  289. if (s != nullptr)
  290. *s++ = byte;
  291. });
  292. if (nwritten < 0) {
  293. errno = EILSEQ;
  294. return (size_t)-1;
  295. } else {
  296. return nwritten;
  297. }
  298. }
  299. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcscoll.html
  300. int wcscoll(wchar_t const* ws1, wchar_t const* ws2)
  301. {
  302. // TODO: Actually implement a sensible sort order for this,
  303. // because right now we are doing what LC_COLLATE=C would do.
  304. return wcscmp(ws1, ws2);
  305. }
  306. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcsxfrm.html
  307. size_t wcsxfrm(wchar_t* dest, wchar_t const* src, size_t n)
  308. {
  309. // TODO: This needs to be changed when wcscoll is not just doing wcscmp
  310. return wcslcpy(dest, src, n);
  311. }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wctob.html
  // Converts a wide character to a single byte. Values above 0x7f yield EOF;
  // everything else is returned as an unsigned char value.
  int wctob(wint_t c)
  {
      bool const fits_in_seven_bits = !(c > 0x7f);
      if (fits_in_seven_bits)
          return static_cast<unsigned char>(c);
      return EOF;
  }
  319. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/mbsinit.html
  320. int mbsinit(mbstate_t const* state)
  321. {
  322. if (!state) {
  323. return 1;
  324. }
  325. if (state->stored_bytes != 0) {
  326. return 0;
  327. }
  328. return 1;
  329. }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcspbrk.html
  // Returns a pointer to the first wide character in `wcs` that also occurs
  // in `accept`, or nullptr if the two strings have no character in common.
  //
  // The scan walks `wcs` in order so that the earliest position in `wcs`
  // wins, regardless of the order in which characters are listed in `accept`.
  wchar_t* wcspbrk(wchar_t const* wcs, wchar_t const* accept)
  {
      for (wchar_t const* cur = wcs; *cur != L'\0'; ++cur) {
          for (wchar_t const* candidate = accept; *candidate != L'\0'; ++candidate) {
              if (*cur == *candidate)
                  return const_cast<wchar_t*>(cur);
          }
      }
      return nullptr;
  }
  340. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcsstr.html
  341. wchar_t* wcsstr(wchar_t const* haystack, wchar_t const* needle)
  342. {
  343. size_t nlen = wcslen(needle);
  344. if (nlen == 0)
  345. return const_cast<wchar_t*>(haystack);
  346. size_t hlen = wcslen(haystack);
  347. while (hlen >= nlen) {
  348. if (wcsncmp(haystack, needle, nlen) == 0)
  349. return const_cast<wchar_t*>(haystack);
  350. haystack++;
  351. hlen--;
  352. }
  353. return nullptr;
  354. }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wmemchr.html
  // Returns a pointer to the first of the `n` wide characters at `s` that
  // equals `c`, or nullptr if none does. Null wide characters get no
  // special treatment.
  wchar_t* wmemchr(wchar_t const* s, wchar_t c, size_t n)
  {
      wchar_t const* const end = s + n;
      for (wchar_t const* cursor = s; cursor != end; ++cursor) {
          if (*cursor == c)
              return const_cast<wchar_t*>(cursor);
      }
      return nullptr;
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wmemcpy.html
  // Copies `n` wide characters from `src` to `dest`, front to back, and
  // returns `dest`.
  wchar_t* wmemcpy(wchar_t* dest, wchar_t const* src, size_t n)
  {
      wchar_t* out = dest;
      wchar_t const* in = src;
      for (size_t remaining = n; remaining > 0; --remaining)
          *out++ = *in++;
      return dest;
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wmemset.html
  // Sets the first `n` wide characters at `wcs` to `wc` and returns `wcs`.
  wchar_t* wmemset(wchar_t* wcs, wchar_t wc, size_t n)
  {
      wchar_t* const end = wcs + n;
      for (wchar_t* cursor = wcs; cursor != end; ++cursor)
          *cursor = wc;
      return wcs;
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wmemmove.html
  // Copies `n` wide characters from `src` to `dest`, choosing the copy
  // direction based on the relative position of the two pointers so that
  // overlapping ranges are handled. Returns `dest`.
  wchar_t* wmemmove(wchar_t* dest, wchar_t const* src, size_t n)
  {
      // Same address: nothing to move.
      if (dest == src)
          return dest;

      if (dest < src) {
          // Destination starts below the source: copy front to back.
          for (size_t index = 0; index < n; ++index)
              dest[index] = src[index];
      } else {
          // Destination starts above the source: copy back to front so source
          // elements are read before they get overwritten.
          for (size_t remaining = n; remaining > 0; --remaining)
              dest[remaining - 1] = src[remaining - 1];
      }

      return dest;
  }
  393. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcstoul.html
  394. unsigned long wcstoul(wchar_t const*, wchar_t**, int)
  395. {
  396. dbgln("TODO: Implement wcstoul()");
  397. TODO();
  398. }
  399. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcstoull.html
  400. unsigned long long wcstoull(wchar_t const*, wchar_t**, int)
  401. {
  402. dbgln("TODO: Implement wcstoull()");
  403. TODO();
  404. }
  405. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcstof.html
  406. float wcstof(wchar_t const*, wchar_t**)
  407. {
  408. dbgln("TODO: Implement wcstof()");
  409. TODO();
  410. }
  411. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcstod.html
  412. double wcstod(wchar_t const*, wchar_t**)
  413. {
  414. dbgln("TODO: Implement wcstod()");
  415. TODO();
  416. }
  417. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcstold.html
  418. long double wcstold(wchar_t const*, wchar_t**)
  419. {
  420. dbgln("TODO: Implement wcstold()");
  421. TODO();
  422. }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcwidth.html
  // Returns the number of columns needed to display `wc`: 0 for the null
  // wide character, 1 for printable ASCII, -1 for other values up to 0x7f,
  // and (for now) 1 for everything above 0x7f.
  int wcwidth(wchar_t wc)
  {
      if (wc == L'\0')
          return 0;

      // TODO: Implement wcwidth for non-ASCII characters.
      if (wc > 0x7f)
          return 1;

      // What remains is either printable ASCII (0x20-0x7e) or a
      // non-printable value.
      bool const is_printable_ascii = wc >= 0x20 && wc <= 0x7e;
      return is_printable_ascii ? 1 : -1;
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcsnrtombs.html
  // Converts at most `nwc` wide characters from `*src` into multibyte
  // characters, writing at most `len` bytes to `dest`.
  //
  // dest: Output buffer. When nullptr, nothing is stored, `len` is ignored
  //       and `*src` is left untouched, so the call only measures the
  //       required size.
  // src:  In/out pointer to the source. When `dest` is non-null it is set
  //       to nullptr if the terminating null was converted, otherwise to
  //       the first wide character that was not converted.
  // ps:   Conversion state; a function-local static is used when nullptr.
  //
  // Returns the number of bytes produced, not counting a terminating null
  // byte, or (size_t)-1 with errno = EILSEQ if a wide character cannot be
  // converted.
  size_t wcsnrtombs(char* dest, wchar_t const** src, size_t nwc, size_t len, mbstate_t* ps)
  {
      static mbstate_t _anonymous_state = {};

      if (ps == nullptr)
          ps = &_anonymous_state;

      // The caller's source pointer is only updated when output is stored.
      bool const update_source = dest != nullptr;
      wchar_t const* cursor = *src;
      size_t written = 0;

      for (size_t read = 0; read < nwc; ++read) {
          char buf[MB_LEN_MAX];

          // Convert next wchar to multibyte.
          size_t const ret = wcrtomb(buf, *cursor, ps);

          // wchar can't be represented as multibyte.
          if (ret == (size_t)-1) {
              if (update_source)
                  *src = cursor;
              errno = EILSEQ;
              return (size_t)-1;
          }

          // New bytes don't fit the buffer; stop in front of this character.
          if (dest && len < written + ret)
              break;

          if (dest) {
              memcpy(dest, buf, ret);
              dest += ret;
          }

          // Null character has been reached. Its byte has been stored (if
          // there is a buffer) but is not counted in the result.
          if (*cursor == L'\0') {
              cursor = nullptr;
              break;
          }

          ++cursor;
          written += ret;
      }

      if (update_source)
          *src = cursor;

      return written;
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/mbsnrtowcs.html
  // Converts at most `nms` bytes of the multibyte string at `*src` into wide
  // characters, storing at most `len` of them in `dst`.
  //
  // dst: Output buffer. When nullptr, nothing is stored, `len` is ignored
  //      and `*src` is left untouched, so the call only counts characters.
  // src: In/out pointer to the source. When `dst` is non-null it is set to
  //      nullptr if the terminating null was converted, otherwise to just
  //      past the last byte that was processed.
  // ps:  Conversion state; a function-local static is used when nullptr.
  //
  // Returns the number of wide characters produced, not counting a
  // terminating null, or (size_t)-1 with errno = EILSEQ on an invalid
  // multibyte sequence.
  size_t mbsnrtowcs(wchar_t* dst, char const** src, size_t nms, size_t len, mbstate_t* ps)
  {
      static mbstate_t _anonymous_state = {};

      if (ps == nullptr)
          ps = &_anonymous_state;

      // The caller's source pointer is only updated when output is stored.
      bool const update_source = dst != nullptr;
      char const* cursor = *src;
      size_t written = 0;

      while (written < len || !dst) {
          // End of source buffer, no incomplete character.
          // The cursor continues to point to the next byte.
          if (nms == 0)
              break;

          // Convert next multibyte to wchar.
          size_t const ret = mbrtowc(dst, cursor, nms, ps);

          // Multibyte sequence is incomplete.
          if (ret == static_cast<size_t>(-2)) {
              // Point just past the last processed byte.
              cursor += nms;
              break;
          }

          // Multibyte sequence is invalid.
          if (ret == static_cast<size_t>(-1)) {
              if (update_source)
                  *src = cursor;
              errno = EILSEQ;
              return static_cast<size_t>(-1);
          }

          // The null wide character has been decoded. Rely on mbrtowc()'s
          // result rather than on the current source byte: a zero return
          // consumes no input here, so carrying on would never advance.
          if (ret == 0) {
              cursor = nullptr;
              break;
          }

          cursor += ret;
          nms -= ret;
          written += 1;
          if (dst)
              dst += 1;
      }

      // At this point the terminator was converted, the input ran out, or
      // `len` wide characters were stored without reaching the null byte.
      if (update_source)
          *src = cursor;

      return written;
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wmemcmp.html
  // Compares the first `n` wide characters of `s1` and `s2`. Returns 0 if
  // they are all equal; otherwise -1 or 1 depending on whether the first
  // differing element of `s1` is less or greater than that of `s2`.
  int wmemcmp(wchar_t const* s1, wchar_t const* s2, size_t n)
  {
      for (size_t index = 0; index < n; ++index) {
          if (s1[index] == s2[index])
              continue;
          return s1[index] < s2[index] ? -1 : 1;
      }
      return 0;
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcsrtombs.html
  // Forwards to wcsnrtombs() without a limit on the number of wide
  // characters read from the source. When `ps` is nullptr, a state private
  // to wcsrtombs() is used.
  size_t wcsrtombs(char* dest, wchar_t const** src, size_t len, mbstate_t* ps)
  {
      static mbstate_t anonymous_state = {};
      mbstate_t* const effective_state = ps ? ps : &anonymous_state;

      // SIZE_MAX is as close as we are going to get to "unlimited".
      size_t const unlimited = SIZE_MAX;
      return wcsnrtombs(dest, src, unlimited, len, effective_state);
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/mbsrtowcs.html
  // Forwards to mbsnrtowcs() without a limit on the number of bytes read
  // from the source. When `ps` is nullptr, a state private to mbsrtowcs()
  // is used.
  size_t mbsrtowcs(wchar_t* dst, char const** src, size_t len, mbstate_t* ps)
  {
      static mbstate_t anonymous_state = {};
      mbstate_t* const effective_state = ps ? ps : &anonymous_state;

      // SIZE_MAX is as close as we are going to get to "unlimited".
      size_t const unlimited = SIZE_MAX;
      return mbsnrtowcs(dst, src, unlimited, len, effective_state);
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcscspn.html
  // Returns the length of the longest prefix of `wcs` that contains none of
  // the wide characters in `reject`.
  size_t wcscspn(wchar_t const* wcs, wchar_t const* reject)
  {
      for (size_t prefix_length = 0;; ++prefix_length) {
          wchar_t const current = wcs[prefix_length];

          // The terminator of `reject` takes part in the comparison, so the
          // end of `wcs` always counts as a hit and ends the scan.
          wchar_t const* candidate = reject;
          for (;;) {
              wchar_t const rejected = *candidate++;
              if (rejected == current)
                  return prefix_length;
              if (rejected == 0)
                  break;
          }
      }
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcsspn.html
  // Returns the length of the longest prefix of `wcs` that consists only of
  // wide characters found in `accept`.
  //
  // The scan stops at the first character of `wcs` that is absent from
  // `accept`, or at the terminating null of `wcs`, whichever comes first.
  size_t wcsspn(wchar_t const* wcs, wchar_t const* accept)
  {
      size_t prefix_length = 0;

      for (; wcs[prefix_length] != L'\0'; ++prefix_length) {
          bool is_accepted = false;
          for (wchar_t const* candidate = accept; *candidate != L'\0'; ++candidate) {
              if (*candidate == wcs[prefix_length]) {
                  is_accepted = true;
                  break;
              }
          }

          // The first character outside the accept set ends the prefix.
          if (!is_accepted)
              break;
      }

      return prefix_length;
  }
  567. // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcsftime.html
  568. size_t wcsftime(wchar_t* __restrict wcs, size_t maxsize, wchar_t const* __restrict format, const struct tm* __restrict timeptr)
  569. {
  570. (void)wcs;
  571. (void)maxsize;
  572. (void)format;
  573. (void)timeptr;
  574. dbgln("FIXME: Implement wcsftime()");
  575. TODO();
  576. }
  577. }