wchar.cpp 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450
  1. /*
  2. * Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
  3. *
  4. * SPDX-License-Identifier: BSD-2-Clause
  5. */
  6. #include <AK/Assertions.h>
  7. #include <AK/Format.h>
  8. #include <errno.h>
  9. #include <wchar.h>
  10. static void mbstate_reset(mbstate_t* state)
  11. {
  12. *state = { 0 };
  13. }
  14. static unsigned int mbstate_stored_bytes(mbstate_t* state)
  15. {
  16. for (unsigned int i = 0; i < sizeof(state->bytes); i++) {
  17. if (!state->bytes[i]) {
  18. return i;
  19. }
  20. }
  21. return sizeof(state->bytes);
  22. }
  23. static unsigned int mbstate_expected_bytes(mbstate_t* state)
  24. {
  25. unsigned char first = state->bytes[0];
  26. // Single-byte sequences have their first bit unset
  27. if ((first & 0b10000000) == 0) {
  28. return 1;
  29. }
  30. // Two-byte sequences start with 0b110xxxxx
  31. if ((first & 0b11100000) == 0b11000000) {
  32. return 2;
  33. }
  34. // Three-byte sequences start with 0b1110xxxx
  35. if ((first & 0b11110000) == 0b11100000) {
  36. return 3;
  37. }
  38. // Four-byte sequences start with 0b11110xxx
  39. if ((first & 0b11111000) == 0b11110000) {
  40. return 4;
  41. }
  42. // Everything else is invalid
  43. return 0;
  44. }
  45. extern "C" {
  // Returns the number of wide characters in str, not counting the terminator.
  size_t wcslen(const wchar_t* str)
  {
      const wchar_t* end = str;
      while (*end != L'\0')
          ++end;
      return static_cast<size_t>(end - str);
  }
  // Copies src, including its terminating null wide character, into dest.
  // Returns dest.
  wchar_t* wcscpy(wchar_t* dest, const wchar_t* src)
  {
      for (size_t index = 0;; ++index) {
          wchar_t const ch = src[index];
          dest[index] = ch;
          // The terminator is copied as well before we stop
          if (ch == L'\0')
              break;
      }
      return dest;
  }
  // Copies at most num wide characters from src into dest and returns dest.
  //
  // If src is shorter than num, the remainder of dest is filled with null wide
  // characters so that exactly num characters are written. If src has num or
  // more characters, dest is NOT null-terminated. Nothing is written when num
  // is zero.
  wchar_t* wcsncpy(wchar_t* dest, const wchar_t* src, size_t num)
  {
      size_t index = 0;

      // Copy the string contents, stopping at the terminator or the limit
      for (; index < num && src[index] != L'\0'; ++index)
          dest[index] = src[index];

      // Pad whatever is left of the num characters with null wide characters
      for (; index < num; ++index)
          dest[index] = L'\0';

      return dest;
  }
  // Compares two wide strings character by character.
  // Returns 0 when they are equal, otherwise the difference between the first
  // pair of characters that differ.
  int wcscmp(const wchar_t* s1, const wchar_t* s2)
  {
      for (;; ++s1, ++s2) {
          if (*s1 != *s2)
              return *s1 - *s2;
          // Both strings ended at the same position
          if (*s1 == 0)
              return 0;
      }
  }
  // Compares at most n wide characters of two wide strings.
  // Returns 0 when the compared prefixes are equal, otherwise the difference
  // between the first pair of characters that differ.
  int wcsncmp(const wchar_t* s1, const wchar_t* s2, size_t n)
  {
      for (size_t index = 0; index < n; ++index) {
          if (s1[index] != s2[index])
              return s1[index] - s2[index];
          // Both strings ended before the limit was reached
          if (s1[index] == 0)
              break;
      }
      return 0;
  }
  // Finds the first occurrence of c (converted to wchar_t) in str.
  // The terminator counts as part of the string, so searching for 0 yields a
  // pointer to the end of the string. Returns nullptr when c is not found.
  wchar_t* wcschr(const wchar_t* str, int c)
  {
      wchar_t const wanted = static_cast<wchar_t>(c);
      const wchar_t* cursor = str;
      while (*cursor != wanted) {
          if (*cursor == L'\0')
              return nullptr;
          ++cursor;
      }
      return const_cast<wchar_t*>(cursor);
  }
  // Finds the last occurrence of wc in str.
  // The terminating null wide character is treated as part of the string, so
  // searching for L'\0' returns a pointer to the terminator. Returns nullptr
  // when wc does not occur.
  const wchar_t* wcsrchr(const wchar_t* str, wchar_t wc)
  {
      const wchar_t* last = nullptr;
      for (;; ++str) {
          if (*str == wc)
              last = str;
          // Check for the end only after comparing, so the terminator can match
          if (*str == L'\0')
              return last;
      }
  }
  // Appends src (including its terminator) to the end of dest. Returns dest.
  wchar_t* wcscat(wchar_t* dest, const wchar_t* src)
  {
      // Locate the terminator of the existing destination string
      wchar_t* out = dest;
      while (*out != L'\0')
          ++out;

      while (*src != L'\0')
          *out++ = *src++;
      *out = L'\0';

      return dest;
  }
  // Appends at most n wide characters of src to the end of dest and always
  // writes a terminating null wide character afterwards. Returns dest.
  wchar_t* wcsncat(wchar_t* dest, const wchar_t* src, size_t n)
  {
      // Locate the terminator of the existing destination string
      wchar_t* out = dest;
      while (*out != L'\0')
          ++out;

      for (size_t copied = 0; copied < n && src[copied] != L'\0'; ++copied)
          *out++ = src[copied];
      *out = L'\0';

      return dest;
  }
  // Splits a wide string into tokens separated by any character in delim.
  //
  // On the first call, pass the string to tokenize in str. On subsequent calls,
  // pass nullptr and the scan resumes from the position saved in *ptr.
  // The string is modified in place: the delimiter ending each token is
  // overwritten with a null wide character.
  //
  // Returns a pointer to the next token, or nullptr when no tokens remain.
  // *ptr is updated on every call so that the next call can continue.
  wchar_t* wcstok(wchar_t* str, const wchar_t* delim, wchar_t** ptr)
  {
      auto is_delimiter = [delim](wchar_t ch) {
          for (const wchar_t* candidate = delim; *candidate != L'\0'; ++candidate) {
              if (*candidate == ch)
                  return true;
          }
          return false;
      };

      wchar_t* cursor = str ? str : *ptr;
      // Nothing to continue from
      if (!cursor)
          return nullptr;

      // Skip delimiters in front of the token
      while (*cursor != L'\0' && is_delimiter(*cursor))
          ++cursor;

      // Only delimiters (or nothing) left: remember the end and report no token
      if (*cursor == L'\0') {
          *ptr = cursor;
          return nullptr;
      }

      wchar_t* token_start = cursor;
      while (*cursor != L'\0' && !is_delimiter(*cursor))
          ++cursor;

      // Terminate the token and resume after the delimiter next time. If the
      // token ran to the end of the string, resume at the terminator instead.
      if (*cursor != L'\0') {
          *cursor = L'\0';
          ++cursor;
      }
      *ptr = cursor;

      return token_start;
  }
  160. long wcstol(const wchar_t*, wchar_t**, int)
  161. {
  162. dbgln("FIXME: Implement wcstol()");
  163. TODO();
  164. }
  165. long long wcstoll(const wchar_t*, wchar_t**, int)
  166. {
  167. dbgln("FIXME: Implement wcstoll()");
  168. TODO();
  169. }
  // Converts a single byte to its wide character equivalent.
  // Returns WEOF for EOF and for any value with bit 7 set, since in UTF-8 such
  // bytes only occur as part of a multi-byte sequence.
  wint_t btowc(int c)
  {
      bool const has_high_bit = (c & 0x80) != 0;
      if (c == EOF || has_high_bit)
          return WEOF;
      return static_cast<wint_t>(c);
  }
  181. size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* state)
  182. {
  183. static mbstate_t _anonymous_state = { 0 };
  184. if (state == nullptr) {
  185. state = &_anonymous_state;
  186. }
  187. // If s is nullptr, check if the state contains a complete multibyte character
  188. if (s == nullptr) {
  189. if (mbstate_expected_bytes(state) == mbstate_stored_bytes(state)) {
  190. mbstate_reset(state);
  191. return 0;
  192. } else {
  193. mbstate_reset(state);
  194. errno = EILSEQ;
  195. return -1;
  196. }
  197. }
  198. // Stop early if we can't read anything
  199. if (n == 0) {
  200. return 0;
  201. }
  202. size_t consumed_bytes = 0;
  203. size_t stored_bytes = mbstate_stored_bytes(state);
  204. // Fill the first byte if we haven't done that yet
  205. if (state->bytes[0] == 0) {
  206. state->bytes[0] = s[0];
  207. consumed_bytes++;
  208. }
  209. size_t expected_bytes = mbstate_expected_bytes(state);
  210. // Check if the first byte is invalid
  211. if (expected_bytes == 0) {
  212. mbstate_reset(state);
  213. errno = EILSEQ;
  214. return -1;
  215. }
  216. size_t needed_bytes = expected_bytes - stored_bytes;
  217. while (consumed_bytes < needed_bytes) {
  218. if (consumed_bytes == n) {
  219. // No complete multibyte character
  220. return -2;
  221. }
  222. unsigned char c = s[consumed_bytes];
  223. // Continuation bytes have to start with 0b10xxxxxx
  224. if ((c & 0b11000000) != 0b10000000) {
  225. // Invalid multibyte character
  226. mbstate_reset(state);
  227. errno = EILSEQ;
  228. return -1;
  229. }
  230. state->bytes[mbstate_stored_bytes(state)] = c;
  231. consumed_bytes++;
  232. }
  233. wchar_t codepoint = state->bytes[0];
  234. // Mask out the "length" bits if necessary
  235. if (expected_bytes > 1) {
  236. codepoint &= (1 << (7 - expected_bytes)) - 1;
  237. }
  238. for (unsigned int i = 1; i < expected_bytes; i++) {
  239. // Each continuation byte contains 6 bits of data
  240. codepoint = codepoint << 6;
  241. codepoint |= state->bytes[i] & 0b111111;
  242. }
  243. if (pwc) {
  244. *pwc = codepoint;
  245. }
  246. // We don't have a shift state that we need to keep, so just clear the entire state
  247. mbstate_reset(state);
  248. if (codepoint == 0) {
  249. return 0;
  250. }
  251. return consumed_bytes;
  252. }
  253. size_t mbrlen(const char*, size_t, mbstate_t*)
  254. {
  255. dbgln("FIXME: Implement mbrlen()");
  256. TODO();
  257. }
  258. size_t wcrtomb(char*, wchar_t, mbstate_t*)
  259. {
  260. dbgln("FIXME: Implement wcrtomb()");
  261. TODO();
  262. }
  263. int wcscoll(const wchar_t* ws1, const wchar_t* ws2)
  264. {
  265. // TODO: Actually implement a sensible sort order for this,
  266. // because right now we are doing what LC_COLLATE=C would do.
  267. return wcscmp(ws1, ws2);
  268. }
  // Converts a wide character to its single-byte representation.
  // This is the inverse of btowc(): only characters below 0x80 have a one-byte
  // UTF-8 encoding, so everything else (including WEOF) yields EOF.
  int wctob(wint_t c)
  {
      // Compare as unsigned so WEOF is rejected regardless of wint_t signedness
      if (static_cast<unsigned int>(c) > 0x7f)
          return EOF;

      return static_cast<unsigned char>(c);
  }
  274. int mbsinit(const mbstate_t* state)
  275. {
  276. if (!state) {
  277. return 1;
  278. }
  279. for (unsigned char byte : state->bytes) {
  280. if (byte) {
  281. return 0;
  282. }
  283. }
  284. return 1;
  285. }
  // Finds the first wide character in wcs that matches any character in accept.
  // Returns a pointer to that character inside wcs, or nullptr when wcs contains
  // none of the characters in accept.
  wchar_t* wcspbrk(const wchar_t* wcs, const wchar_t* accept)
  {
      // Walk wcs (not accept) in the outer loop so that the earliest position in
      // wcs wins, regardless of the order of the characters in accept.
      for (const wchar_t* cur = wcs; *cur != L'\0'; ++cur) {
          for (const wchar_t* candidate = accept; *candidate != L'\0'; ++candidate) {
              if (*cur == *candidate)
                  return const_cast<wchar_t*>(cur);
          }
      }
      return nullptr;
  }
  295. wchar_t* wcsstr(const wchar_t* haystack, const wchar_t* needle)
  296. {
  297. size_t nlen = wcslen(needle);
  298. if (nlen == 0)
  299. return const_cast<wchar_t*>(haystack);
  300. size_t hlen = wcslen(haystack);
  301. while (hlen >= nlen) {
  302. if (wcsncmp(haystack, needle, nlen) == 0)
  303. return const_cast<wchar_t*>(haystack);
  304. haystack++;
  305. hlen--;
  306. }
  307. return nullptr;
  308. }
  // Searches the first n wide characters of s for c.
  // Returns a pointer to the first match, or nullptr when there is none.
  wchar_t* wmemchr(const wchar_t* s, wchar_t c, size_t n)
  {
      const wchar_t* const end = s + n;
      for (const wchar_t* cursor = s; cursor != end; ++cursor) {
          if (*cursor == c)
              return const_cast<wchar_t*>(cursor);
      }
      return nullptr;
  }
  // Copies n wide characters from src to dest, front to back. Returns dest.
  wchar_t* wmemcpy(wchar_t* dest, const wchar_t* src, size_t n)
  {
      wchar_t* out = dest;
      while (n-- > 0)
          *out++ = *src++;
      return dest;
  }
  // Sets the first n wide characters of wcs to wc. Returns wcs.
  wchar_t* wmemset(wchar_t* wcs, wchar_t wc, size_t n)
  {
      wchar_t* const end = wcs + n;
      for (wchar_t* out = wcs; out != end; ++out)
          *out = wc;
      return wcs;
  }
  // Copies n wide characters from src to dest; the two ranges may overlap.
  // Returns dest.
  wchar_t* wmemmove(wchar_t* dest, const wchar_t* src, size_t n)
  {
      // Same location: nothing to move
      if (dest == src)
          return dest;

      if (dest < src) {
          // Destination is below the source: copying front to back is safe
          for (size_t index = 0; index < n; ++index)
              dest[index] = src[index];
      } else {
          // Destination is above the source: copy back to front so overlapping
          // source characters are read before they get overwritten
          for (size_t index = n; index > 0; --index)
              dest[index - 1] = src[index - 1];
      }

      return dest;
  }
  343. unsigned long wcstoul(const wchar_t*, wchar_t**, int)
  344. {
  345. dbgln("TODO: Implement wcstoul()");
  346. TODO();
  347. }
  348. unsigned long long wcstoull(const wchar_t*, wchar_t**, int)
  349. {
  350. dbgln("TODO: Implement wcstoull()");
  351. TODO();
  352. }
  353. float wcstof(const wchar_t*, wchar_t**)
  354. {
  355. dbgln("TODO: Implement wcstof()");
  356. TODO();
  357. }
  358. double wcstod(const wchar_t*, wchar_t**)
  359. {
  360. dbgln("TODO: Implement wcstod()");
  361. TODO();
  362. }
  363. long double wcstold(const wchar_t*, wchar_t**)
  364. {
  365. dbgln("TODO: Implement wcstold()");
  366. TODO();
  367. }
  368. }