wchar.cpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Assertions.h>
  7. #include <AK/Format.h>
  8. #include <AK/UnicodeUtils.h>
  9. #include <errno.h>
  10. #include <string.h>
  11. #include <wchar.h>
  12. static unsigned int mbstate_expected_bytes(mbstate_t* state)
  13. {
  14. if (state->stored_bytes == 0) {
  15. return 0;
  16. }
  17. unsigned char first = state->bytes[0];
  18. // Single-byte sequences have their first bit unset
  19. if ((first & 0b10000000) == 0) {
  20. return 1;
  21. }
  22. // Two-byte sequences start with 0b110xxxxx
  23. if ((first & 0b11100000) == 0b11000000) {
  24. return 2;
  25. }
  26. // Three-byte sequences start with 0b1110xxxx
  27. if ((first & 0b11110000) == 0b11100000) {
  28. return 3;
  29. }
  30. // Four-byte sequences start with 0b11110xxx
  31. if ((first & 0b11111000) == 0b11110000) {
  32. return 4;
  33. }
  34. // Everything else is invalid
  35. return 0;
  36. }
  37. extern "C" {
  // Counts the wide characters that precede the terminating null in str.
  size_t wcslen(const wchar_t* str)
  {
      const wchar_t* cursor = str;
      while (*cursor != L'\0')
          ++cursor;
      return (size_t)(cursor - str);
  }
  // Copies src, including its terminating null, into dest and returns dest.
  wchar_t* wcscpy(wchar_t* dest, const wchar_t* src)
  {
      size_t index = 0;
      for (;;) {
          wchar_t const copied = src[index];
          dest[index] = copied;
          // The terminator itself is copied before the loop stops.
          if (copied == '\0')
              break;
          ++index;
      }
      return dest;
  }
  // Copies at most num wide characters from src into dest and returns dest.
  //
  // Exactly num elements of dest are written: when src is shorter than num the
  // remainder is filled with null wide characters; when src is num characters
  // or longer, dest is NOT null-terminated. With num == 0 nothing is written.
  wchar_t* wcsncpy(wchar_t* dest, const wchar_t* src, size_t num)
  {
      size_t index = 0;

      // Copy characters until either the limit or the end of src is reached.
      for (; index < num && src[index] != L'\0'; ++index)
          dest[index] = src[index];

      // Pad the rest of the num-element window with nulls.
      for (; index < num; ++index)
          dest[index] = L'\0';

      return dest;
  }
  // Copies src into dest, writing at most n elements including the terminator.
  // Whenever n is non-zero the result is null-terminated. Returns the length
  // of src, so a return value >= n tells the caller that truncation occurred.
  size_t wcslcpy(wchar_t* dest, const wchar_t* src, size_t n)
  {
      size_t position = 0;

      if (n != 0) {
          // Leave one slot free for the terminator.
          while (position < n - 1 && src[position] != L'\0') {
              dest[position] = src[position];
              ++position;
          }
          dest[position] = L'\0';
      }

      // Walk the rest of src without copying, only to learn its length.
      while (src[position] != L'\0')
          ++position;

      return position;
  }
  // Compares two null-terminated wide strings element by element.
  // Returns a negative value, zero, or a positive value when s1 sorts before,
  // equal to, or after s2.
  //
  // The result is derived from a comparison rather than a subtraction: the
  // difference of two wchar_t values can overflow (or, for an unsigned
  // wchar_t, lose its sign when converted to int) and report the wrong order.
  int wcscmp(const wchar_t* s1, const wchar_t* s2)
  {
      for (; *s1 == *s2; ++s1, ++s2) {
          // Both strings ended at the same position: they are equal.
          if (*s1 == L'\0')
              return 0;
      }
      return *s1 < *s2 ? -1 : 1;
  }
  // Compares at most n wide characters of two null-terminated wide strings.
  // Returns a negative value, zero, or a positive value when s1 sorts before,
  // equal to, or after s2 within that window; n == 0 always compares equal.
  //
  // The result is derived from a comparison rather than a subtraction: the
  // difference of two wchar_t values can overflow (or, for an unsigned
  // wchar_t, lose its sign when converted to int) and report the wrong order.
  int wcsncmp(const wchar_t* s1, const wchar_t* s2, size_t n)
  {
      for (; n > 0; --n, ++s1, ++s2) {
          if (*s1 != *s2)
              return *s1 < *s2 ? -1 : 1;
          // Equal and terminated: nothing after the null is compared.
          if (*s1 == L'\0')
              break;
      }
      return 0;
  }
  // Finds the first occurrence of c (converted to wchar_t) in str.
  // The terminating null counts as part of the string, so searching for 0
  // yields a pointer to the terminator. Returns nullptr when c is absent.
  wchar_t* wcschr(const wchar_t* str, int c)
  {
      wchar_t const wanted = c;
      const wchar_t* cursor = str;

      while (*cursor != wanted) {
          // Hit the end without a match.
          if (*cursor == L'\0')
              return nullptr;
          ++cursor;
      }
      return const_cast<wchar_t*>(cursor);
  }
  99. wchar_t* wcsrchr(const wchar_t* str, wchar_t wc)
  100. {
  101. wchar_t* last = nullptr;
  102. wchar_t c;
  103. for (; (c = *str); ++str) {
  104. if (c == wc)
  105. last = const_cast<wchar_t*>(str);
  106. }
  107. return last;
  108. }
  // Appends src to the end of the wide string in dest, terminates the result
  // and returns dest.
  wchar_t* wcscat(wchar_t* dest, const wchar_t* src)
  {
      // Start writing where the current terminator of dest sits.
      wchar_t* out = dest + wcslen(dest);

      while (*src != '\0')
          *out++ = *src++;

      *out = '\0';
      return dest;
  }
  // Appends at most n wide characters of src to the wide string in dest and
  // always writes a terminating null afterwards. Returns dest.
  wchar_t* wcsncat(wchar_t* dest, const wchar_t* src, size_t n)
  {
      wchar_t* out = dest + wcslen(dest);
      size_t remaining = n;

      // The budget is checked first so src is never read past n elements.
      while (remaining != 0 && *src != '\0') {
          *out++ = *src++;
          --remaining;
      }

      *out = '\0';
      return dest;
  }
  // Reports whether ch occurs in the null-terminated delimiter set delim.
  static bool wcstok_is_delimiter(wchar_t ch, const wchar_t* delim)
  {
      for (; *delim != L'\0'; ++delim) {
          if (*delim == ch)
              return true;
      }
      return false;
  }

  // Splits a wide string into tokens separated by any character from delim.
  //
  // Pass the string in str on the first call and nullptr on later calls; the
  // scan position is carried between calls through *ptr. Each call skips
  // leading delimiters, overwrites the delimiter that ends the token with a
  // null, stores the continuation position in *ptr and returns the token.
  // Returns nullptr once no further token exists.
  wchar_t* wcstok(wchar_t* str, const wchar_t* delim, wchar_t** ptr)
  {
      wchar_t* cursor = str ? str : *ptr;
      if (!cursor)
          return nullptr;

      // Skip delimiters in front of the token.
      while (*cursor != L'\0' && wcstok_is_delimiter(*cursor, delim))
          ++cursor;

      if (*cursor == L'\0') {
          // Nothing left; park the state on the terminator so that further
          // calls keep returning nullptr.
          *ptr = cursor;
          return nullptr;
      }

      wchar_t* token = cursor;

      // Advance to the end of the token.
      while (*cursor != L'\0' && !wcstok_is_delimiter(*cursor, delim))
          ++cursor;

      if (*cursor != L'\0') {
          // Cut the token off and resume right behind the delimiter next time.
          *cursor = L'\0';
          *ptr = cursor + 1;
      } else {
          *ptr = cursor;
      }

      return token;
  }
  // Unimplemented stub for wcstol(): the three (unnamed) arguments are ignored.
  // Logs a FIXME message through dbgln() and then hits TODO().
  // NOTE(review): TODO() presumably aborts execution — confirm against AK/Assertions.h.
  long wcstol(const wchar_t*, wchar_t**, int)
  {
      dbgln("FIXME: Implement wcstol()");
      TODO();
  }
  // Unimplemented stub for wcstoll(): the three (unnamed) arguments are ignored.
  // Logs a FIXME message through dbgln() and then hits TODO().
  // NOTE(review): TODO() presumably aborts execution — confirm against AK/Assertions.h.
  long long wcstoll(const wchar_t*, wchar_t**, int)
  {
      dbgln("FIXME: Implement wcstoll()");
      TODO();
  }
  // Converts a single byte to its wide-character equivalent.
  // Returns WEOF for EOF and for any value with bit 7 set, because in UTF-8
  // such bytes only occur as part of a multi-byte sequence.
  wint_t btowc(int c)
  {
      int const is_eof = (c == EOF);
      int const has_high_bit = (c & 0x80) != 0;

      if (is_eof || has_high_bit)
          return WEOF;

      return c;
  }
  184. size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* state)
  185. {
  186. static mbstate_t _anonymous_state = {};
  187. if (state == nullptr) {
  188. state = &_anonymous_state;
  189. }
  190. // s being a null pointer is a shorthand for reading a single null byte.
  191. if (s == nullptr) {
  192. pwc = nullptr;
  193. s = "";
  194. n = 1;
  195. }
  196. // Stop early if we can't read anything
  197. if (n == 0) {
  198. return 0;
  199. }
  200. size_t consumed_bytes = 0;
  201. // Fill the first byte if we haven't done that yet
  202. if (state->stored_bytes == 0) {
  203. state->bytes[state->stored_bytes++] = s[0];
  204. consumed_bytes++;
  205. }
  206. size_t expected_bytes = mbstate_expected_bytes(state);
  207. // Check if the first byte is invalid
  208. if (expected_bytes == 0) {
  209. *state = {};
  210. errno = EILSEQ;
  211. return -1;
  212. }
  213. while (state->stored_bytes < expected_bytes) {
  214. if (consumed_bytes == n) {
  215. // No complete multibyte character
  216. return -2;
  217. }
  218. unsigned char c = s[consumed_bytes];
  219. // Continuation bytes have to start with 0b10xxxxxx
  220. if ((c & 0b11000000) != 0b10000000) {
  221. // Invalid multibyte character
  222. *state = {};
  223. errno = EILSEQ;
  224. return -1;
  225. }
  226. state->bytes[state->stored_bytes++] = c;
  227. consumed_bytes++;
  228. }
  229. wchar_t codepoint = state->bytes[0];
  230. // Mask out the "length" bits if necessary
  231. if (expected_bytes > 1) {
  232. codepoint &= (1 << (7 - expected_bytes)) - 1;
  233. }
  234. for (unsigned int i = 1; i < expected_bytes; i++) {
  235. // Each continuation byte contains 6 bits of data
  236. codepoint = codepoint << 6;
  237. codepoint |= state->bytes[i] & 0b111111;
  238. }
  239. if (pwc) {
  240. *pwc = codepoint;
  241. }
  242. // We want to read the next multibyte character, but keep all other properties.
  243. state->stored_bytes = 0;
  244. if (codepoint == 0) {
  245. *state = {};
  246. return 0;
  247. }
  248. return consumed_bytes;
  249. }
  // Measures the next multibyte character in s (inspecting at most n bytes) by
  // running mbrtowc() with the decoded character discarded. A null ps selects
  // a state object private to this function.
  size_t mbrlen(const char* s, size_t n, mbstate_t* ps)
  {
      static mbstate_t anonymous_state = {};

      mbstate_t* const effective_state = (ps == nullptr) ? &anonymous_state : ps;
      return mbrtowc(nullptr, s, n, effective_state);
  }
  257. size_t wcrtomb(char* s, wchar_t wc, mbstate_t*)
  258. {
  259. if (s == nullptr)
  260. wc = L'\0';
  261. auto nwritten = AK::UnicodeUtils::code_point_to_utf8(wc, [&s](char byte) {
  262. if (s != nullptr)
  263. *s++ = byte;
  264. });
  265. if (nwritten < 0) {
  266. errno = EILSEQ;
  267. return (size_t)-1;
  268. } else {
  269. return nwritten;
  270. }
  271. }
  // Collation-aware comparison of two wide strings.
  // TODO: Actually implement a sensible sort order for this; for now the
  // ordering is plain wcscmp(), i.e. what LC_COLLATE=C would do.
  int wcscoll(const wchar_t* ws1, const wchar_t* ws2)
  {
      int const ordering = wcscmp(ws1, ws2);
      return ordering;
  }
  278. size_t wcsxfrm(wchar_t* dest, const wchar_t* src, size_t n)
  279. {
  280. // TODO: This needs to be changed when wcscoll is not just doing wcscmp
  281. return wcslcpy(dest, src, n);
  282. }
  // Converts a wide character to its single-byte form.
  // Anything above 0x7f has no single-byte representation and yields EOF.
  int wctob(wint_t c)
  {
      bool const fits_in_one_byte = !(c > 0x7f);
      if (fits_in_one_byte)
          return static_cast<unsigned char>(c);
      return EOF;
  }
  289. int mbsinit(const mbstate_t* state)
  290. {
  291. if (!state) {
  292. return 1;
  293. }
  294. if (state->stored_bytes != 0) {
  295. return 0;
  296. }
  297. return 1;
  298. }
  299. wchar_t* wcspbrk(const wchar_t* wcs, const wchar_t* accept)
  300. {
  301. for (const wchar_t* cur = accept; *cur; cur++) {
  302. wchar_t* res = wcschr(wcs, *cur);
  303. if (res)
  304. return res;
  305. }
  306. return nullptr;
  307. }
  308. wchar_t* wcsstr(const wchar_t* haystack, const wchar_t* needle)
  309. {
  310. size_t nlen = wcslen(needle);
  311. if (nlen == 0)
  312. return const_cast<wchar_t*>(haystack);
  313. size_t hlen = wcslen(haystack);
  314. while (hlen >= nlen) {
  315. if (wcsncmp(haystack, needle, nlen) == 0)
  316. return const_cast<wchar_t*>(haystack);
  317. haystack++;
  318. hlen--;
  319. }
  320. return nullptr;
  321. }
  322. wchar_t* wmemchr(const wchar_t* s, wchar_t c, size_t n)
  323. {
  324. for (size_t i = 0; i < n; i++) {
  325. if (s[i] == c)
  326. return const_cast<wchar_t*>(&s[i]);
  327. }
  328. return nullptr;
  329. }
  // Copies n wide characters from src to dest, front to back, and returns dest.
  wchar_t* wmemcpy(wchar_t* dest, const wchar_t* src, size_t n)
  {
      wchar_t* out = dest;
      const wchar_t* in = src;

      for (size_t remaining = n; remaining > 0; --remaining)
          *out++ = *in++;

      return dest;
  }
  // Stores wc into each of the first n elements of wcs and returns wcs.
  wchar_t* wmemset(wchar_t* wcs, wchar_t wc, size_t n)
  {
      wchar_t* out = wcs;
      for (size_t remaining = n; remaining > 0; --remaining)
          *out++ = wc;
      return wcs;
  }
  // Copies n wide characters from src to dest, choosing the copy direction so
  // that overlapping ranges are handled correctly. Returns dest.
  wchar_t* wmemmove(wchar_t* dest, const wchar_t* src, size_t n)
  {
      // Same address: nothing to move.
      if (dest == src)
          return dest;

      if (dest < src) {
          // Destination starts first: copy front to back.
          for (size_t index = 0; index < n; ++index)
              dest[index] = src[index];
          return dest;
      }

      // Destination starts later: copy back to front so that source elements
      // are read before they get overwritten.
      for (size_t index = n; index > 0; --index)
          dest[index - 1] = src[index - 1];
      return dest;
  }
  // Unimplemented stub for wcstoul(): the three (unnamed) arguments are ignored.
  // Logs a TODO message through dbgln() and then hits TODO().
  // NOTE(review): TODO() presumably aborts execution — confirm against AK/Assertions.h.
  unsigned long wcstoul(const wchar_t*, wchar_t**, int)
  {
      dbgln("TODO: Implement wcstoul()");
      TODO();
  }
  // Unimplemented stub for wcstoull(): the three (unnamed) arguments are ignored.
  // Logs a TODO message through dbgln() and then hits TODO().
  // NOTE(review): TODO() presumably aborts execution — confirm against AK/Assertions.h.
  unsigned long long wcstoull(const wchar_t*, wchar_t**, int)
  {
      dbgln("TODO: Implement wcstoull()");
      TODO();
  }
  // Unimplemented stub for wcstof(): both (unnamed) arguments are ignored.
  // Logs a TODO message through dbgln() and then hits TODO().
  // NOTE(review): TODO() presumably aborts execution — confirm against AK/Assertions.h.
  float wcstof(const wchar_t*, wchar_t**)
  {
      dbgln("TODO: Implement wcstof()");
      TODO();
  }
  // Unimplemented stub for wcstod(): both (unnamed) arguments are ignored.
  // Logs a TODO message through dbgln() and then hits TODO().
  // NOTE(review): TODO() presumably aborts execution — confirm against AK/Assertions.h.
  double wcstod(const wchar_t*, wchar_t**)
  {
      dbgln("TODO: Implement wcstod()");
      TODO();
  }
  // Unimplemented stub for wcstold(): both (unnamed) arguments are ignored.
  // Logs a TODO message through dbgln() and then hits TODO().
  // NOTE(review): TODO() presumably aborts execution — confirm against AK/Assertions.h.
  long double wcstold(const wchar_t*, wchar_t**)
  {
      dbgln("TODO: Implement wcstold()");
      TODO();
  }
  // Unimplemented stub for swprintf(): the buffer, size, format and variadic
  // arguments are all ignored. Logs a TODO message through dbgln() and then
  // hits TODO().
  // NOTE(review): TODO() presumably aborts execution — confirm against AK/Assertions.h.
  int swprintf(wchar_t*, size_t, const wchar_t*, ...)
  {
      dbgln("TODO: Implement swprintf()");
      TODO();
  }
  // Returns the number of terminal columns wc occupies: 0 for the null
  // character, 1 for printable ASCII, -1 for every other value up to 0x7f.
  int wcwidth(wchar_t wc)
  {
      if (wc == L'\0')
          return 0;

      // TODO: Implement wcwidth for non-ASCII characters.
      // Until then everything above the ASCII range counts as one column.
      if (wc > 0x7f)
          return 1;

      // Within ASCII (and below): printable characters take one column,
      // everything else is reported as non-printable.
      return (wc >= 0x20 && wc <= 0x7e) ? 1 : -1;
  }
  // Converts at most nwc wide characters from *src to multibyte form, storing
  // at most len bytes in dest. Returns the number of bytes produced, not
  // counting a terminating null byte, or (size_t)-1 with errno = EILSEQ when a
  // wide character cannot be converted.
  //
  // With a null dest nothing is stored, len is ignored and *src is left
  // untouched, so callers can first measure the required size and then convert
  // with the same source pointer. With a non-null dest, *src is set to nullptr
  // when the terminating null was converted, otherwise to the first wide
  // character that was not converted.
  size_t wcsnrtombs(char* dest, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps)
  {
      static mbstate_t anonymous_state = {};

      if (ps == nullptr)
          ps = &anonymous_state;

      bool const storing = dest != nullptr;
      const wchar_t* cursor = *src;
      size_t written = 0;

      for (size_t read = 0; read < nwc; ++read) {
          char buf[MB_LEN_MAX];

          // Convert next wchar to multibyte.
          size_t const ret = wcrtomb(buf, *cursor, ps);

          // wchar can't be represented as multibyte.
          if (ret == (size_t)-1) {
              if (storing)
                  *src = cursor;
              errno = EILSEQ;
              return (size_t)-1;
          }

          // New bytes don't fit the buffer.
          if (storing && len < written + ret)
              break;

          if (storing)
              memcpy(dest + written, buf, ret);

          // Null character has been reached (and stored, but not counted).
          if (*cursor == L'\0') {
              if (storing)
                  *src = nullptr;
              return written;
          }

          ++cursor;
          written += ret;
      }

      if (storing)
          *src = cursor;
      return written;
  }
  // Converts the multibyte string at *src to wide characters, storing at most
  // len of them in dst. Returns the number of wide characters produced, not
  // counting a terminating null, or (size_t)-1 with errno = EILSEQ when an
  // invalid sequence is met.
  //
  // With a null dst nothing is stored, len is ignored and *src is left
  // untouched, so callers can first measure the required size and then convert
  // with the same source pointer. With a non-null dst, *src is set to nullptr
  // when the terminating null was converted, otherwise to the first byte that
  // was not converted.
  size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps)
  {
      static mbstate_t anonymous_state = {};

      if (ps == nullptr)
          ps = &anonymous_state;

      const char* cursor = *src;
      size_t written = 0;

      while (!dst || written < len) {
          // Convert next multibyte to wchar.
          size_t const ret = mbrtowc(dst ? dst + written : nullptr, cursor, MB_LEN_MAX, ps);

          // Multibyte sequence is invalid.
          if (ret == (size_t)-1) {
              if (dst)
                  *src = cursor;
              errno = EILSEQ;
              return (size_t)-1;
          }

          // Null character has been decoded (and stored, but not counted).
          // Testing the result instead of the input byte guarantees that a
          // zero-length advance can never keep the loop spinning.
          if (ret == 0) {
              if (dst)
                  *src = nullptr;
              return written;
          }

          cursor += ret;
          ++written;
      }

      // Only reachable with a non-null dst: `len` wchars were written, but the
      // null byte has not been reached.
      *src = cursor;
      return written;
  }
  // Compares the first n elements of two wide-character arrays.
  // Returns -1 or 1 according to the first pair that differs, 0 when all n
  // pairs are equal. Nulls do not stop the comparison.
  int wmemcmp(const wchar_t* s1, const wchar_t* s2, size_t n)
  {
      for (size_t index = 0; index < n; ++index) {
          if (s1[index] == s2[index])
              continue;
          return s1[index] < s2[index] ? -1 : 1;
      }
      return 0;
  }
  // Converts the whole wide string at *src to multibyte form by delegating to
  // wcsnrtombs() without a limit on the number of wide characters read.
  // A null ps selects a state object private to this function.
  size_t wcsrtombs(char* dest, const wchar_t** src, size_t len, mbstate_t* ps)
  {
      static mbstate_t anonymous_state = {};

      mbstate_t* const effective_state = (ps == nullptr) ? &anonymous_state : ps;

      // SIZE_MAX is as close as we are going to get to "unlimited".
      size_t const unlimited = SIZE_MAX;
      return wcsnrtombs(dest, src, unlimited, len, effective_state);
  }
  // Unimplemented stub for mbsnrtowcs(): the five (unnamed) arguments are ignored.
  // Logs a FIXME message through dbgln() and then hits TODO().
  // NOTE(review): TODO() presumably aborts execution — confirm against AK/Assertions.h.
  size_t mbsnrtowcs(wchar_t*, const char**, size_t, size_t, mbstate_t*)
  {
      dbgln("FIXME: Implement mbsnrtowcs()");
      TODO();
  }
  483. }