wchar.cpp 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Assertions.h>
  7. #include <AK/Format.h>
  8. #include <errno.h>
  9. #include <wchar.h>
  10. static unsigned int mbstate_expected_bytes(mbstate_t* state)
  11. {
  12. if (state->stored_bytes == 0) {
  13. return 0;
  14. }
  15. unsigned char first = state->bytes[0];
  16. // Single-byte sequences have their first bit unset
  17. if ((first & 0b10000000) == 0) {
  18. return 1;
  19. }
  20. // Two-byte sequences start with 0b110xxxxx
  21. if ((first & 0b11100000) == 0b11000000) {
  22. return 2;
  23. }
  24. // Three-byte sequences start with 0b1110xxxx
  25. if ((first & 0b11110000) == 0b11100000) {
  26. return 3;
  27. }
  28. // Four-byte sequences start with 0b11110xxx
  29. if ((first & 0b11111000) == 0b11110000) {
  30. return 4;
  31. }
  32. // Everything else is invalid
  33. return 0;
  34. }
  35. extern "C" {
  // Counts the wide characters in `str` that precede the terminating null.
  size_t wcslen(const wchar_t* str)
  {
      const wchar_t* cursor = str;
      while (*cursor != L'\0')
          ++cursor;
      return static_cast<size_t>(cursor - str);
  }
  // Copies `src`, including its terminating null, into `dest` and returns `dest`.
  wchar_t* wcscpy(wchar_t* dest, const wchar_t* src)
  {
      wchar_t* out = dest;
      for (;;) {
          wchar_t const ch = *src++;
          *out++ = ch;
          if (ch == L'\0')
              break;
      }
      return dest;
  }
  // Copies at most `num` wide characters from `src` into `dest` and returns `dest`.
  // If `src` is shorter than `num`, the remainder of `dest` (up to `num` characters)
  // is filled with null wide characters. If `src` has `num` or more characters,
  // `dest` is not null-terminated. Nothing is written when `num` is 0.
  wchar_t* wcsncpy(wchar_t* dest, const wchar_t* src, size_t num)
  {
      size_t i = 0;

      // Copy the string contents, never touching more than num slots.
      for (; i < num && src[i] != L'\0'; ++i)
          dest[i] = src[i];

      // Pad whatever is left of the num-sized window with nulls.
      for (; i < num; ++i)
          dest[i] = L'\0';

      return dest;
  }
  // Compares two wide strings character by character.
  // Returns 0 when they are equal; otherwise the difference between the first pair
  // of characters that differ (s1's character minus s2's character).
  int wcscmp(const wchar_t* s1, const wchar_t* s2)
  {
      for (;; ++s1, ++s2) {
          if (*s1 != *s2)
              return *s1 - *s2;
          if (*s1 == L'\0')
              return 0;
      }
  }
  // Compares at most `n` wide characters of two strings.
  // Returns 0 when the compared prefixes are equal (or n is 0); otherwise the
  // difference between the first pair of characters that differ.
  int wcsncmp(const wchar_t* s1, const wchar_t* s2, size_t n)
  {
      for (size_t i = 0; i < n; ++i) {
          if (s1[i] != s2[i])
              return s1[i] - s2[i];
          // Both strings ended at the same position: nothing more to compare.
          if (s1[i] == L'\0')
              return 0;
      }
      return 0;
  }
  // Finds the first occurrence of `c` (converted to wchar_t) in `str`.
  // The terminating null counts as part of the string, so searching for 0 yields
  // a pointer to the terminator. Returns nullptr when there is no match.
  wchar_t* wcschr(const wchar_t* str, int c)
  {
      wchar_t const target = c;
      const wchar_t* cursor = str;
      while (*cursor != target) {
          if (*cursor == L'\0')
              return nullptr;
          ++cursor;
      }
      return const_cast<wchar_t*>(cursor);
  }
  // Finds the last occurrence of `wc` in `str`.
  // The terminating null is considered part of the string, so searching for
  // L'\0' returns a pointer to the terminator. Returns nullptr when `wc` does
  // not occur.
  const wchar_t* wcsrchr(const wchar_t* str, wchar_t wc)
  {
      const wchar_t* last = nullptr;
      for (;; ++str) {
          // Check for a match before checking for the end, so that the
          // terminator itself can be found.
          if (*str == wc)
              last = str;
          if (*str == L'\0')
              return last;
      }
  }
  96. wchar_t* wcscat(wchar_t* dest, const wchar_t* src)
  97. {
  98. size_t dest_length = wcslen(dest);
  99. size_t i;
  100. for (i = 0; src[i] != '\0'; i++)
  101. dest[dest_length + i] = src[i];
  102. dest[dest_length + i] = '\0';
  103. return dest;
  104. }
  105. wchar_t* wcsncat(wchar_t* dest, const wchar_t* src, size_t n)
  106. {
  107. size_t dest_length = wcslen(dest);
  108. size_t i;
  109. for (i = 0; i < n && src[i] != '\0'; i++)
  110. dest[dest_length + i] = src[i];
  111. dest[dest_length + i] = '\0';
  112. return dest;
  113. }
  // Splits a wide string into tokens separated by any character from `delim`.
  //
  // On the first call pass the string in `str`; on subsequent calls pass nullptr
  // and the same `ptr`, which this function uses to remember where to continue.
  // The input string is modified: the delimiter that ends a token is overwritten
  // with a null wide character.
  //
  // Returns a pointer to the next token, or nullptr when no tokens remain.
  wchar_t* wcstok(wchar_t* str, const wchar_t* delim, wchar_t** ptr)
  {
      wchar_t* cursor = str ? str : *ptr;
      if (!cursor)
          return nullptr;

      auto is_delimiter = [delim](wchar_t ch) {
          for (const wchar_t* d = delim; *d != L'\0'; ++d) {
              if (*d == ch)
                  return true;
          }
          return false;
      };

      // Skip leading delimiters
      while (*cursor != L'\0' && is_delimiter(*cursor))
          ++cursor;

      // Only delimiters (or nothing) left: remember the end so later calls stop too
      if (*cursor == L'\0') {
          *ptr = cursor;
          return nullptr;
      }

      wchar_t* token = cursor;
      while (*cursor != L'\0' && !is_delimiter(*cursor))
          ++cursor;

      if (*cursor != L'\0') {
          // Terminate the token and continue after the delimiter next time
          *cursor = L'\0';
          *ptr = cursor + 1;
      } else {
          // The token runs to the end of the string
          *ptr = cursor;
      }

      return token;
  }
  150. long wcstol(const wchar_t*, wchar_t**, int)
  151. {
  152. dbgln("FIXME: Implement wcstol()");
  153. TODO();
  154. }
  155. long long wcstoll(const wchar_t*, wchar_t**, int)
  156. {
  157. dbgln("FIXME: Implement wcstoll()");
  158. TODO();
  159. }
  // Converts a single byte to its wide-character equivalent.
  // Returns WEOF for EOF and for any value with bit 7 set, since in UTF-8 such
  // bytes only occur as part of multi-byte sequences. Otherwise returns `c`.
  wint_t btowc(int c)
  {
      bool const is_eof = (c == EOF);
      bool const has_high_bit = (c & 0x80) != 0;
      if (is_eof || has_high_bit)
          return WEOF;
      return c;
  }
  171. size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* state)
  172. {
  173. static mbstate_t _anonymous_state = {};
  174. if (state == nullptr) {
  175. state = &_anonymous_state;
  176. }
  177. // If s is nullptr, check if the state contains a complete multibyte character
  178. if (s == nullptr) {
  179. if (mbstate_expected_bytes(state) == mbstate->stored_bytes) {
  180. *state = {};
  181. return 0;
  182. } else {
  183. *state = {};
  184. errno = EILSEQ;
  185. return -1;
  186. }
  187. }
  188. // Stop early if we can't read anything
  189. if (n == 0) {
  190. return 0;
  191. }
  192. size_t consumed_bytes = 0;
  193. // Fill the first byte if we haven't done that yet
  194. if (state->stored_bytes == 0) {
  195. state->bytes[state->stored_bytes++] = s[0];
  196. consumed_bytes++;
  197. }
  198. size_t expected_bytes = mbstate_expected_bytes(state);
  199. // Check if the first byte is invalid
  200. if (expected_bytes == 0) {
  201. *state = {};
  202. errno = EILSEQ;
  203. return -1;
  204. }
  205. while (state->stored_bytes < expected_bytes) {
  206. if (consumed_bytes == n) {
  207. // No complete multibyte character
  208. return -2;
  209. }
  210. unsigned char c = s[consumed_bytes];
  211. // Continuation bytes have to start with 0b10xxxxxx
  212. if ((c & 0b11000000) != 0b10000000) {
  213. // Invalid multibyte character
  214. *state = {};
  215. errno = EILSEQ;
  216. return -1;
  217. }
  218. state->bytes[state->stored_bytes++] = c;
  219. consumed_bytes++;
  220. }
  221. wchar_t codepoint = state->bytes[0];
  222. // Mask out the "length" bits if necessary
  223. if (expected_bytes > 1) {
  224. codepoint &= (1 << (7 - expected_bytes)) - 1;
  225. }
  226. for (unsigned int i = 1; i < expected_bytes; i++) {
  227. // Each continuation byte contains 6 bits of data
  228. codepoint = codepoint << 6;
  229. codepoint |= state->bytes[i] & 0b111111;
  230. }
  231. if (pwc) {
  232. *pwc = codepoint;
  233. }
  234. // We want to read the next multibyte character, but keep all other properties.
  235. state->stored_bytes = 0;
  236. if (codepoint == 0) {
  237. return 0;
  238. }
  239. return consumed_bytes;
  240. }
  241. size_t mbrlen(const char*, size_t, mbstate_t*)
  242. {
  243. dbgln("FIXME: Implement mbrlen()");
  244. TODO();
  245. }
  246. size_t wcrtomb(char*, wchar_t, mbstate_t*)
  247. {
  248. dbgln("FIXME: Implement wcrtomb()");
  249. TODO();
  250. }
  251. int wcscoll(const wchar_t* ws1, const wchar_t* ws2)
  252. {
  253. // TODO: Actually implement a sensible sort order for this,
  254. // because right now we are doing what LC_COLLATE=C would do.
  255. return wcscmp(ws1, ws2);
  256. }
  // Converts a wide character to its single-byte representation.
  // This is the inverse of btowc(): only code points up to 0x7f are encoded as a
  // single byte in UTF-8, so everything else (including WEOF) yields EOF.
  int wctob(wint_t c)
  {
      // Widen to an unsigned type first so the range check also rejects
      // negative values should wint_t be signed.
      if (static_cast<unsigned long>(c) > 0x7f)
          return EOF;

      return static_cast<unsigned char>(c);
  }
  262. int mbsinit(const mbstate_t* state)
  263. {
  264. if (!state) {
  265. return 1;
  266. }
  267. if (state->stored_bytes != 0) {
  268. return 0;
  269. }
  270. return 1;
  271. }
  // Finds the first wide character in `wcs` that also appears in `accept`.
  // Returns a pointer to that character inside `wcs`, or nullptr when `wcs`
  // contains none of the characters in `accept`.
  wchar_t* wcspbrk(const wchar_t* wcs, const wchar_t* accept)
  {
      // Walk the searched string in the outer loop so that the earliest
      // position in wcs wins, regardless of the order of accept.
      for (const wchar_t* cur = wcs; *cur != L'\0'; ++cur) {
          for (const wchar_t* candidate = accept; *candidate != L'\0'; ++candidate) {
              if (*cur == *candidate)
                  return const_cast<wchar_t*>(cur);
          }
      }
      return nullptr;
  }
  281. wchar_t* wcsstr(const wchar_t* haystack, const wchar_t* needle)
  282. {
  283. size_t nlen = wcslen(needle);
  284. if (nlen == 0)
  285. return const_cast<wchar_t*>(haystack);
  286. size_t hlen = wcslen(haystack);
  287. while (hlen >= nlen) {
  288. if (wcsncmp(haystack, needle, nlen) == 0)
  289. return const_cast<wchar_t*>(haystack);
  290. haystack++;
  291. hlen--;
  292. }
  293. return nullptr;
  294. }
  // Searches the first `n` wide characters of `s` for `c`.
  // Returns a pointer to the first match, or nullptr when there is none.
  wchar_t* wmemchr(const wchar_t* s, wchar_t c, size_t n)
  {
      const wchar_t* const end = s + n;
      for (const wchar_t* cursor = s; cursor != end; ++cursor) {
          if (*cursor == c)
              return const_cast<wchar_t*>(cursor);
      }
      return nullptr;
  }
  // Copies `n` wide characters from `src` to `dest`, front to back, and returns `dest`.
  wchar_t* wmemcpy(wchar_t* dest, const wchar_t* src, size_t n)
  {
      wchar_t* out = dest;
      const wchar_t* const end = src + n;
      while (src != end)
          *out++ = *src++;
      return dest;
  }
  // Sets the first `n` wide characters of `wcs` to `wc` and returns `wcs`.
  wchar_t* wmemset(wchar_t* wcs, wchar_t wc, size_t n)
  {
      wchar_t* const end = wcs + n;
      for (wchar_t* cursor = wcs; cursor != end; ++cursor)
          *cursor = wc;
      return wcs;
  }
  // Copies `n` wide characters from `src` to `dest`, choosing the copy direction
  // from the relative position of the two pointers so that overlapping ranges
  // are handled. Does nothing when both pointers are equal. Returns `dest`.
  wchar_t* wmemmove(wchar_t* dest, const wchar_t* src, size_t n)
  {
      if (dest < src) {
          // Destination starts first: copy front to back
          wchar_t* out = dest;
          const wchar_t* in = src;
          for (size_t remaining = n; remaining > 0; --remaining)
              *out++ = *in++;
      } else if (dest > src) {
          // Destination starts later: copy back to front
          size_t index = n;
          while (index-- > 0)
              dest[index] = src[index];
      }
      return dest;
  }
  329. unsigned long wcstoul(const wchar_t*, wchar_t**, int)
  330. {
  331. dbgln("TODO: Implement wcstoul()");
  332. TODO();
  333. }
  334. unsigned long long wcstoull(const wchar_t*, wchar_t**, int)
  335. {
  336. dbgln("TODO: Implement wcstoull()");
  337. TODO();
  338. }
  339. float wcstof(const wchar_t*, wchar_t**)
  340. {
  341. dbgln("TODO: Implement wcstof()");
  342. TODO();
  343. }
  344. double wcstod(const wchar_t*, wchar_t**)
  345. {
  346. dbgln("TODO: Implement wcstod()");
  347. TODO();
  348. }
  349. long double wcstold(const wchar_t*, wchar_t**)
  350. {
  351. dbgln("TODO: Implement wcstold()");
  352. TODO();
  353. }
  354. int swprintf(wchar_t*, size_t, const wchar_t*, ...)
  355. {
  356. dbgln("TODO: Implement swprintf()");
  357. TODO();
  358. }
  359. }