wchar.cpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Assertions.h>
  7. #include <AK/Format.h>
  8. #include <AK/UnicodeUtils.h>
  9. #include <errno.h>
  10. #include <string.h>
  11. #include <wchar.h>
  12. static unsigned int mbstate_expected_bytes(mbstate_t* state)
  13. {
  14. if (state->stored_bytes == 0) {
  15. return 0;
  16. }
  17. unsigned char first = state->bytes[0];
  18. // Single-byte sequences have their first bit unset
  19. if ((first & 0b10000000) == 0) {
  20. return 1;
  21. }
  22. // Two-byte sequences start with 0b110xxxxx
  23. if ((first & 0b11100000) == 0b11000000) {
  24. return 2;
  25. }
  26. // Three-byte sequences start with 0b1110xxxx
  27. if ((first & 0b11110000) == 0b11100000) {
  28. return 3;
  29. }
  30. // Four-byte sequences start with 0b11110xxx
  31. if ((first & 0b11111000) == 0b11110000) {
  32. return 4;
  33. }
  34. // Everything else is invalid
  35. return 0;
  36. }
  37. extern "C" {
  // Returns the number of wide characters in `str` before the terminating null.
  size_t wcslen(wchar_t const* str)
  {
      wchar_t const* cursor = str;
      while (*cursor != L'\0')
          ++cursor;
      return static_cast<size_t>(cursor - str);
  }
  // Copies `src`, including its terminating null, into `dest` and returns `dest`.
  wchar_t* wcscpy(wchar_t* dest, wchar_t const* src)
  {
      for (size_t index = 0;; ++index) {
          wchar_t const current = src[index];
          dest[index] = current;
          // The terminator is copied too, then the copy stops.
          if (current == '\0')
              break;
      }
      return dest;
  }
  // https://pubs.opengroup.org/onlinepubs/9699919799/functions/wcsdup.html
  // Returns a malloc()-allocated copy of `str`. On allocation failure, sets
  // errno to ENOMEM and returns nullptr.
  wchar_t* wcsdup(wchar_t const* str)
  {
      // One extra slot for the terminating null.
      size_t const count_with_terminator = wcslen(str) + 1;
      auto* copy = (wchar_t*)malloc(sizeof(wchar_t) * count_with_terminator);
      if (copy)
          return wcscpy(copy, str);

      errno = ENOMEM;
      return nullptr;
  }
  // Copies at most `num` wide characters from `src` into `dest` and returns `dest`.
  //
  // If `src` is shorter than `num`, the rest of `dest` is filled with null wide
  // characters until exactly `num` elements have been written. If `src` holds
  // `num` or more characters, `dest` is NOT null-terminated. Nothing is written
  // when `num` is 0.
  wchar_t* wcsncpy(wchar_t* dest, wchar_t const* src, size_t num)
  {
      size_t index = 0;

      // Copy characters up to (but not including) the terminator, bounded by `num`.
      for (; index < num && src[index] != L'\0'; ++index)
          dest[index] = src[index];

      // Pad the remainder with nulls, as the standard requires.
      for (; index < num; ++index)
          dest[index] = L'\0';

      return dest;
  }
  // Copies up to `n - 1` wide characters from `src` into `dest` and, when `n` is
  // non-zero, always null-terminates `dest`. Returns the length of `src`, so a
  // return value >= `n` indicates truncation.
  size_t wcslcpy(wchar_t* dest, wchar_t const* src, size_t n)
  {
      size_t index = 0;

      if (n != 0) {
          // Leave room for the terminator.
          while (index + 1 < n && src[index] != L'\0') {
              dest[index] = src[index];
              ++index;
          }
          dest[index] = L'\0';
      }

      // Keep walking `src` without copying to find its full length.
      while (src[index] != L'\0')
          ++index;

      return index;
  }
  // Compares two wide strings element by element. Returns 0 when they are equal,
  // otherwise the difference between the first pair of elements that differ.
  int wcscmp(wchar_t const* s1, wchar_t const* s2)
  {
      for (;; ++s1, ++s2) {
          if (*s1 != *s2)
              return *s1 - *s2;
          // Both elements are equal; if they are the terminator, the strings match.
          if (*s1 == 0)
              return 0;
      }
  }
  // Compares at most `n` elements of two wide strings. Returns 0 when the
  // compared prefixes are equal (or `n` is 0), otherwise the difference between
  // the first pair of elements that differ.
  int wcsncmp(wchar_t const* s1, wchar_t const* s2, size_t n)
  {
      for (size_t index = 0; index < n; ++index) {
          if (s1[index] != s2[index])
              return s1[index] - s2[index];
          // Equal elements; stop at a shared terminator.
          if (s1[index] == 0)
              return 0;
      }
      return 0;
  }
  // Returns a pointer to the first element of `str` equal to `c`, or nullptr if
  // there is none. The terminating null counts as part of the string, so
  // searching for 0 yields a pointer to the terminator.
  wchar_t* wcschr(wchar_t const* str, int c)
  {
      wchar_t const needle = c;
      wchar_t const* cursor = str;
      while (*cursor != needle) {
          if (*cursor == 0)
              return nullptr;
          ++cursor;
      }
      return const_cast<wchar_t*>(cursor);
  }
  110. wchar_t* wcsrchr(wchar_t const* str, wchar_t wc)
  111. {
  112. wchar_t* last = nullptr;
  113. wchar_t c;
  114. for (; (c = *str); ++str) {
  115. if (c == wc)
  116. last = const_cast<wchar_t*>(str);
  117. }
  118. return last;
  119. }
  // Appends `src` (including its terminator) to the end of `dest` and returns `dest`.
  wchar_t* wcscat(wchar_t* dest, wchar_t const* src)
  {
      // Start writing at the current terminator of `dest`.
      wchar_t* out = dest + wcslen(dest);
      while (*src != '\0')
          *out++ = *src++;
      *out = '\0';
      return dest;
  }
  // Appends at most `n` wide characters from `src` to the end of `dest`, then
  // writes a terminating null. Returns `dest`.
  wchar_t* wcsncat(wchar_t* dest, wchar_t const* src, size_t n)
  {
      // Start writing at the current terminator of `dest`.
      wchar_t* out = dest + wcslen(dest);
      for (size_t copied = 0; copied < n && src[copied] != '\0'; ++copied)
          *out++ = src[copied];
      *out = '\0';
      return dest;
  }
  // Returns whether `wc` occurs in the null-terminated delimiter set `delim`.
  static bool wcstok_is_delimiter(wchar_t wc, wchar_t const* delim)
  {
      for (; *delim != L'\0'; ++delim) {
          if (*delim == wc)
              return true;
      }
      return false;
  }

  // Splits a wide string into tokens separated by any character in `delim`.
  //
  // Pass the string in `str` on the first call and nullptr on subsequent calls;
  // the position to resume from is kept in `*ptr`. The input string is modified:
  // the delimiter that ends a token is overwritten with a null character.
  //
  // Returns a pointer to the next token, or nullptr when no tokens remain.
  wchar_t* wcstok(wchar_t* str, wchar_t const* delim, wchar_t** ptr)
  {
      wchar_t* cursor = str ? str : *ptr;
      // Nothing to resume from.
      if (!cursor)
          return nullptr;

      // Skip delimiters in front of the token.
      while (*cursor != L'\0' && wcstok_is_delimiter(*cursor, delim))
          ++cursor;

      if (*cursor == L'\0') {
          // Only delimiters were left; remember the end so later calls also return nullptr.
          *ptr = cursor;
          return nullptr;
      }

      wchar_t* token = cursor;
      while (*cursor != L'\0' && !wcstok_is_delimiter(*cursor, delim))
          ++cursor;

      // Terminate the token in place and resume after the delimiter next time.
      // If the token ran to the end of the string, resume at the terminator.
      if (*cursor != L'\0') {
          *cursor = L'\0';
          ++cursor;
      }
      *ptr = cursor;
      return token;
  }
  174. long wcstol(wchar_t const*, wchar_t**, int)
  175. {
  176. dbgln("FIXME: Implement wcstol()");
  177. TODO();
  178. }
  179. long long wcstoll(wchar_t const*, wchar_t**, int)
  180. {
  181. dbgln("FIXME: Implement wcstoll()");
  182. TODO();
  183. }
  // Converts a single byte to its wide-character equivalent. Returns WEOF for
  // EOF and for any value with bit 7 set, because in UTF-8 such bytes belong to
  // multi-byte sequences and are not characters on their own.
  wint_t btowc(int c)
  {
      bool const has_high_bit = (c & 0x80) != 0;
      if (c == EOF || has_high_bit)
          return WEOF;
      return c;
  }
  195. size_t mbrtowc(wchar_t* pwc, char const* s, size_t n, mbstate_t* state)
  196. {
  197. static mbstate_t _anonymous_state = {};
  198. if (state == nullptr) {
  199. state = &_anonymous_state;
  200. }
  201. // s being a null pointer is a shorthand for reading a single null byte.
  202. if (s == nullptr) {
  203. pwc = nullptr;
  204. s = "";
  205. n = 1;
  206. }
  207. // Stop early if we can't read anything
  208. if (n == 0) {
  209. return 0;
  210. }
  211. size_t consumed_bytes = 0;
  212. // Fill the first byte if we haven't done that yet
  213. if (state->stored_bytes == 0) {
  214. state->bytes[state->stored_bytes++] = s[0];
  215. consumed_bytes++;
  216. }
  217. size_t expected_bytes = mbstate_expected_bytes(state);
  218. // Check if the first byte is invalid
  219. if (expected_bytes == 0) {
  220. *state = {};
  221. errno = EILSEQ;
  222. return -1;
  223. }
  224. while (state->stored_bytes < expected_bytes) {
  225. if (consumed_bytes == n) {
  226. // No complete multibyte character
  227. return -2;
  228. }
  229. unsigned char c = s[consumed_bytes];
  230. // Continuation bytes have to start with 0b10xxxxxx
  231. if ((c & 0b11000000) != 0b10000000) {
  232. // Invalid multibyte character
  233. *state = {};
  234. errno = EILSEQ;
  235. return -1;
  236. }
  237. state->bytes[state->stored_bytes++] = c;
  238. consumed_bytes++;
  239. }
  240. wchar_t codepoint = state->bytes[0];
  241. // Mask out the "length" bits if necessary
  242. if (expected_bytes > 1) {
  243. codepoint &= (1 << (7 - expected_bytes)) - 1;
  244. }
  245. for (unsigned int i = 1; i < expected_bytes; i++) {
  246. // Each continuation byte contains 6 bits of data
  247. codepoint = codepoint << 6;
  248. codepoint |= state->bytes[i] & 0b111111;
  249. }
  250. if (pwc) {
  251. *pwc = codepoint;
  252. }
  253. // We want to read the next multibyte character, but keep all other properties.
  254. state->stored_bytes = 0;
  255. if (codepoint == 0) {
  256. *state = {};
  257. return 0;
  258. }
  259. return consumed_bytes;
  260. }
  // Returns what mbrtowc() would return for the same input without storing the
  // decoded character. Uses a state private to mbrlen() when `ps` is nullptr.
  size_t mbrlen(char const* s, size_t n, mbstate_t* ps)
  {
      static mbstate_t anonymous_state = {};
      mbstate_t* effective_state = (ps != nullptr) ? ps : &anonymous_state;
      return mbrtowc(nullptr, s, n, effective_state);
  }
  268. size_t wcrtomb(char* s, wchar_t wc, mbstate_t*)
  269. {
  270. if (s == nullptr)
  271. wc = L'\0';
  272. auto nwritten = AK::UnicodeUtils::code_point_to_utf8(wc, [&s](char byte) {
  273. if (s != nullptr)
  274. *s++ = byte;
  275. });
  276. if (nwritten < 0) {
  277. errno = EILSEQ;
  278. return (size_t)-1;
  279. } else {
  280. return nwritten;
  281. }
  282. }
  // Compares two wide strings for collation purposes.
  int wcscoll(wchar_t const* ws1, wchar_t const* ws2)
  {
      // TODO: Actually implement a sensible sort order for this,
      // because right now we are doing what LC_COLLATE=C would do.
      int const ordering = wcscmp(ws1, ws2);
      return ordering;
  }
  289. size_t wcsxfrm(wchar_t* dest, wchar_t const* src, size_t n)
  290. {
  291. // TODO: This needs to be changed when wcscoll is not just doing wcscmp
  292. return wcslcpy(dest, src, n);
  293. }
  // Converts a wide character to its single-byte representation. Only values
  // up to 0x7f have one; everything else yields EOF.
  int wctob(wint_t c)
  {
      bool const fits_single_byte = c <= 0x7f;
      return fits_single_byte ? static_cast<unsigned char>(c) : EOF;
  }
  300. int mbsinit(mbstate_t const* state)
  301. {
  302. if (!state) {
  303. return 1;
  304. }
  305. if (state->stored_bytes != 0) {
  306. return 0;
  307. }
  308. return 1;
  309. }
  310. wchar_t* wcspbrk(wchar_t const* wcs, wchar_t const* accept)
  311. {
  312. for (wchar_t const* cur = accept; *cur; cur++) {
  313. wchar_t* res = wcschr(wcs, *cur);
  314. if (res)
  315. return res;
  316. }
  317. return nullptr;
  318. }
  319. wchar_t* wcsstr(wchar_t const* haystack, wchar_t const* needle)
  320. {
  321. size_t nlen = wcslen(needle);
  322. if (nlen == 0)
  323. return const_cast<wchar_t*>(haystack);
  324. size_t hlen = wcslen(haystack);
  325. while (hlen >= nlen) {
  326. if (wcsncmp(haystack, needle, nlen) == 0)
  327. return const_cast<wchar_t*>(haystack);
  328. haystack++;
  329. hlen--;
  330. }
  331. return nullptr;
  332. }
  333. wchar_t* wmemchr(wchar_t const* s, wchar_t c, size_t n)
  334. {
  335. for (size_t i = 0; i < n; i++) {
  336. if (s[i] == c)
  337. return const_cast<wchar_t*>(&s[i]);
  338. }
  339. return nullptr;
  340. }
  // Copies `n` wide characters from `src` to `dest` front to back and returns `dest`.
  wchar_t* wmemcpy(wchar_t* dest, wchar_t const* src, size_t n)
  {
      wchar_t* out = dest;
      while (n-- > 0)
          *out++ = *src++;
      return dest;
  }
  // Sets the first `n` elements of `wcs` to `wc` and returns `wcs`.
  wchar_t* wmemset(wchar_t* wcs, wchar_t wc, size_t n)
  {
      wchar_t* out = wcs;
      while (n-- > 0)
          *out++ = wc;
      return wcs;
  }
  // Copies `n` wide characters from `src` to `dest`, handling overlapping
  // regions correctly, and returns `dest`.
  wchar_t* wmemmove(wchar_t* dest, wchar_t const* src, size_t n)
  {
      // Same region: nothing to move.
      if (dest == src)
          return dest;

      if (dest < src) {
          // Destination starts first: copy front to back.
          for (size_t index = 0; index < n; ++index)
              dest[index] = src[index];
      } else {
          // Destination starts later: copy back to front so unread source
          // elements are not overwritten.
          for (size_t remaining = n; remaining > 0; --remaining)
              dest[remaining - 1] = src[remaining - 1];
      }
      return dest;
  }
  367. unsigned long wcstoul(wchar_t const*, wchar_t**, int)
  368. {
  369. dbgln("TODO: Implement wcstoul()");
  370. TODO();
  371. }
  372. unsigned long long wcstoull(wchar_t const*, wchar_t**, int)
  373. {
  374. dbgln("TODO: Implement wcstoull()");
  375. TODO();
  376. }
  377. float wcstof(wchar_t const*, wchar_t**)
  378. {
  379. dbgln("TODO: Implement wcstof()");
  380. TODO();
  381. }
  382. double wcstod(wchar_t const*, wchar_t**)
  383. {
  384. dbgln("TODO: Implement wcstod()");
  385. TODO();
  386. }
  387. long double wcstold(wchar_t const*, wchar_t**)
  388. {
  389. dbgln("TODO: Implement wcstold()");
  390. TODO();
  391. }
  // Returns the number of columns needed to display `wc`: 0 for the null
  // character, 1 for printable ASCII, -1 for other values up to 0x7f, and
  // currently 1 for everything above 0x7f.
  int wcwidth(wchar_t wc)
  {
      if (wc == L'\0')
          return 0;

      // TODO: Implement wcwidth for non-ASCII characters.
      if (wc > 0x7f)
          return 1;

      // Within the ASCII range only 0x20..0x7e is printable.
      bool const is_printable_ascii = wc >= 0x20 && wc <= 0x7e;
      return is_printable_ascii ? 1 : -1;
  }
  // Converts at most `nwc` wide characters from `*src` to a multibyte string.
  // With a non-null `dest`, at most `len` bytes are written. With a null `dest`,
  // nothing is stored, `len` is ignored, and only the byte count is computed.
  //
  // `*src` is advanced past each converted character and set to nullptr once
  // the terminating null has been converted. Returns the number of bytes
  // produced, not counting the terminating null, or (size_t)-1 with errno set
  // to EILSEQ if a wide character cannot be converted.
  size_t wcsnrtombs(char* dest, wchar_t const** src, size_t nwc, size_t len, mbstate_t* ps)
  {
      static mbstate_t _anonymous_state = {};
      mbstate_t* state = (ps != nullptr) ? ps : &_anonymous_state;

      size_t total_bytes = 0;

      for (size_t converted = 0; converted < nwc; ++converted) {
          wchar_t const current = **src;

          // Convert next wchar to multibyte.
          char encoded[MB_LEN_MAX];
          size_t const encoded_length = wcrtomb(encoded, current, state);

          // wchar can't be represented as multibyte.
          if (encoded_length == (size_t)-1) {
              errno = EILSEQ;
              return (size_t)-1;
          }

          if (dest != nullptr) {
              // Stop before a character that would overflow the output buffer.
              if (len < total_bytes + encoded_length)
                  return total_bytes;

              memcpy(dest, encoded, encoded_length);
              dest += encoded_length;
          }

          // The terminator has been converted; it is not counted in the result.
          if (current == L'\0') {
              *src = nullptr;
              return total_bytes;
          }

          *src += 1;
          total_bytes += encoded_length;
      }

      return total_bytes;
  }
  // Converts a multibyte string of at most `nms` bytes at `*src` into wide
  // characters. With a non-null `dst`, at most `len` wide characters are
  // stored. With a null `dst`, nothing is stored and `len` is ignored.
  //
  // `*src` is advanced past the consumed bytes and set to nullptr once the
  // terminating null byte has been reached. Returns the number of wide
  // characters produced, not counting the terminator, or (size_t)-1 with errno
  // set to EILSEQ on an invalid sequence.
  size_t mbsnrtowcs(wchar_t* dst, char const** src, size_t nms, size_t len, mbstate_t* ps)
  {
      static mbstate_t _anonymous_state = {};
      mbstate_t* state = (ps != nullptr) ? ps : &_anonymous_state;

      size_t produced = 0;

      for (;;) {
          // Output buffer is full (only relevant when there is an output buffer).
          if (dst != nullptr && produced >= len)
              break;

          // End of source buffer, no incomplete character.
          // src continues to point to the next byte.
          if (nms == 0)
              break;

          // Convert next multibyte to wchar.
          size_t const consumed = mbrtowc(dst, *src, nms, state);

          // Multibyte sequence is incomplete.
          if (consumed == -2ul) {
              // Point just past the last processed byte.
              *src += nms;
              break;
          }

          // Multibyte sequence is invalid.
          if (consumed == -1ul) {
              errno = EILSEQ;
              return (size_t)-1;
          }

          // Null byte has been reached.
          if (**src == '\0') {
              *src = nullptr;
              break;
          }

          *src += consumed;
          nms -= consumed;
          ++produced;
          if (dst != nullptr)
              ++dst;
      }

      return produced;
  }
  // Compares the first `n` elements of two wide-character arrays. Returns 0 if
  // they are all equal, otherwise -1 or 1 depending on whether the first
  // differing element of `s1` is smaller or larger than that of `s2`.
  int wmemcmp(wchar_t const* s1, wchar_t const* s2, size_t n)
  {
      for (size_t index = 0; index < n; ++index) {
          if (s1[index] == s2[index])
              continue;
          return s1[index] < s2[index] ? -1 : 1;
      }
      return 0;
  }
  // Converts the wide string at `*src` to a multibyte string by delegating to
  // wcsnrtombs() with no limit on the number of wide characters read. Uses a
  // state private to wcsrtombs() when `ps` is nullptr.
  size_t wcsrtombs(char* dest, wchar_t const** src, size_t len, mbstate_t* ps)
  {
      static mbstate_t anonymous_state = {};
      mbstate_t* effective_state = (ps != nullptr) ? ps : &anonymous_state;

      // SIZE_MAX is as close as we are going to get to "unlimited".
      return wcsnrtombs(dest, src, SIZE_MAX, len, effective_state);
  }
  // Converts the multibyte string at `*src` to wide characters by delegating to
  // mbsnrtowcs() with no limit on the number of bytes read. Uses a state
  // private to mbsrtowcs() when `ps` is nullptr.
  size_t mbsrtowcs(wchar_t* dst, char const** src, size_t len, mbstate_t* ps)
  {
      static mbstate_t anonymous_state = {};
      mbstate_t* effective_state = (ps != nullptr) ? ps : &anonymous_state;

      // SIZE_MAX is as close as we are going to get to "unlimited".
      return mbsnrtowcs(dst, src, SIZE_MAX, len, effective_state);
  }
  // Returns the length of the initial part of `wcs` that contains no character
  // from `reject`.
  size_t wcscspn(wchar_t const* wcs, wchar_t const* reject)
  {
      for (size_t span = 0;; ++span) {
          wchar_t const current = wcs[span];

          // Reaching the end of `wcs` always ends the span.
          if (current == 0)
              return span;

          for (auto const* rejected = reject; *rejected != 0; ++rejected) {
              if (*rejected == current)
                  return span;
          }
      }
  }
  // Returns the length of the initial part of `wcs` that consists only of
  // characters from `accept`.
  size_t wcsspn(wchar_t const* wcs, wchar_t const* accept)
  {
      size_t span = 0;
      for (; wcs[span] != L'\0'; ++span) {
          // The current character extends the span only if it occurs somewhere in `accept`.
          bool accepted = false;
          for (auto const* candidate = accept; *candidate != L'\0'; ++candidate) {
              if (*candidate == wcs[span]) {
                  accepted = true;
                  break;
              }
          }
          if (!accepted)
              break;
      }
      return span;
  }
  528. size_t wcsftime(wchar_t* __restrict wcs, size_t maxsize, wchar_t const* __restrict format, const struct tm* __restrict timeptr)
  529. {
  530. (void)wcs;
  531. (void)maxsize;
  532. (void)format;
  533. (void)timeptr;
  534. dbgln("FIXME: Implement wcsftime()");
  535. TODO();
  536. }
  537. }