cloned_binary.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516
  1. /*
  2. * Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
  3. * Copyright (C) 2019 SUSE LLC
  4. *
  5. * Licensed under the Apache License, Version 2.0 (the "License");
  6. * you may not use this file except in compliance with the License.
  7. * You may obtain a copy of the License at
  8. *
  9. * http://www.apache.org/licenses/LICENSE-2.0
  10. *
  11. * Unless required by applicable law or agreed to in writing, software
  12. * distributed under the License is distributed on an "AS IS" BASIS,
  13. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14. * See the License for the specific language governing permissions and
  15. * limitations under the License.
  16. */
  17. #define _GNU_SOURCE
  18. #include <unistd.h>
  19. #include <stdio.h>
  20. #include <stdlib.h>
  21. #include <stdbool.h>
  22. #include <string.h>
  23. #include <limits.h>
  24. #include <fcntl.h>
  25. #include <errno.h>
  26. #include <sys/types.h>
  27. #include <sys/stat.h>
  28. #include <sys/statfs.h>
  29. #include <sys/vfs.h>
  30. #include <sys/mman.h>
  31. #include <sys/mount.h>
  32. #include <sys/sendfile.h>
  33. #include <sys/syscall.h>
  /*
   * Local memfd_create(2) shim. When the libc headers expose only the raw
   * __NR_ syscall number, alias it to the SYS_ spelling used below.
   */
  #ifndef SYS_memfd_create
  # ifdef __NR_memfd_create
  #  define SYS_memfd_create __NR_memfd_create
  # endif
  #endif

  /* memfd_create(2) flag values, mirroring <linux/memfd.h>. */
  #ifndef MFD_CLOEXEC
  # define MFD_CLOEXEC 0x0001U
  # define MFD_ALLOW_SEALING 0x0002U
  #endif

  /*
   * Issue the memfd_create system call directly through syscall(2).
   *
   * name:  passed through to the kernel unchanged.
   * flags: MFD_* flags, passed through unchanged.
   *
   * Returns the result of syscall(2). If no syscall number is known at build
   * time, sets errno to ENOSYS and returns -1 without entering the kernel.
   */
  int memfd_create(const char *name, unsigned int flags)
  {
  #ifndef SYS_memfd_create
      errno = ENOSYS;
      return -1;
  #else
      return syscall(SYS_memfd_create, name, flags);
  #endif
  }
  /* Names used by the self-cloning logic in this file. */
  #define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
  #define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"

  /*
   * Fallback definitions for the file-sealing fcntl(2) interface, taken from
   * <linux/fcntl.h>, for build environments whose headers lack them.
   */
  #ifndef F_LINUX_SPECIFIC_BASE
  # define F_LINUX_SPECIFIC_BASE 1024
  #endif

  #ifndef F_ADD_SEALS
  # define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
  # define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
  #endif

  #ifndef F_SEAL_SEAL
  # define F_SEAL_SEAL 0x0001   /* no further seals may be added */
  # define F_SEAL_SHRINK 0x0002 /* file may not shrink */
  # define F_SEAL_GROW 0x0004   /* file may not grow */
  # define F_SEAL_WRITE 0x0008  /* file may not be written */
  #endif

  /* The complete seal set a cloned memfd must carry. */
  #define RUNC_MEMFD_SEALS \
      (F_SEAL_WRITE | F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL)
  /*
   * realloc(3) wrapper that never hands back NULL: a failed request is simply
   * retried with the same arguments until it succeeds, so callers can use the
   * result without checking it.
   *
   * ptr:  existing allocation to resize, or NULL for a fresh one.
   * size: requested size in bytes.
   */
  static void *must_realloc(void *ptr, size_t size)
  {
      for (;;) {
          void *grown = realloc(ptr, size);

          if (grown)
              return grown;
      }
  }
  78. /*
  79. * Verify whether we are currently in a self-cloned program (namely, is
  80. * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
  81. * for shmem files), and we want to be sure it's actually sealed.
  82. */
  83. static int is_self_cloned(void)
  84. {
  85. int fd, ret, is_cloned = 0;
  86. struct stat statbuf = {};
  87. struct statfs fsbuf = {};
  88. fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
  89. if (fd < 0)
  90. return -ENOTRECOVERABLE;
  91. /*
  92. * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
  93. * this, because you cannot write to a sealed memfd no matter what (so
  94. * sharing it isn't a bad thing -- and an admin could bind-mount a sealed
  95. * memfd to /usr/bin/runc to allow re-use).
  96. */
  97. ret = fcntl(fd, F_GET_SEALS);
  98. if (ret >= 0) {
  99. is_cloned = (ret == RUNC_MEMFD_SEALS);
  100. goto out;
  101. }
  102. /*
  103. * All other forms require CLONED_BINARY_ENV, since they are potentially
  104. * writeable (or we can't tell if they're fully safe) and thus we must
  105. * check the environment as an extra layer of defence.
  106. */
  107. if (!getenv(CLONED_BINARY_ENV)) {
  108. is_cloned = false;
  109. goto out;
  110. }
  111. /*
  112. * Is the binary on a read-only filesystem? We can't detect bind-mounts in
  113. * particular (in-kernel they are identical to regular mounts) but we can
  114. * at least be sure that it's read-only. In addition, to make sure that
  115. * it's *our* bind-mount we check CLONED_BINARY_ENV.
  116. */
  117. if (fstatfs(fd, &fsbuf) >= 0)
  118. is_cloned |= (fsbuf.f_flags & MS_RDONLY);
  119. /*
  120. * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
  121. * which appears to have a borked backport of F_GET_SEALS. Either way,
  122. * having a file which has no hardlinks indicates that we aren't using
  123. * a host-side "runc" binary and this is something that a container
  124. * cannot fake (because unlinking requires being able to resolve the
  125. * path that you want to unlink).
  126. */
  127. if (fstat(fd, &statbuf) >= 0)
  128. is_cloned |= (statbuf.st_nlink == 0);
  129. out:
  130. close(fd);
  131. return is_cloned;
  132. }
  /*
   * Slurp the whole of `path` into a freshly allocated buffer.
   *
   * path:   file to read.
   * length: out-parameter receiving the number of bytes read; must not be
   *         NULL. It is reset to 0 once the file has been opened.
   *
   * Returns the buffer (owned by the caller, not NUL-terminated by this
   * function), or NULL if `length` is NULL, the file cannot be opened, a read
   * fails, or the file yields no data at all.
   */
  static char *read_file(char *path, size_t *length)
  {
      char chunk[4096];
      char *contents = NULL;
      ssize_t got;
      int fd;

      if (length == NULL)
          return NULL;

      fd = open(path, O_RDONLY | O_CLOEXEC);
      if (fd < 0)
          return NULL;

      *length = 0;
      /* Append chunk after chunk until EOF (0) or a read error (< 0). */
      while ((got = read(fd, chunk, sizeof(chunk))) > 0) {
          contents = must_realloc(contents, (*length + got) * sizeof(*contents));
          memcpy(contents + *length, chunk, got);
          *length += got;
      }
      close(fd);

      if (got < 0) {
          /* Discard whatever was gathered before the failure. */
          free(contents);
          return NULL;
      }
      return contents;
  }
  /*
   * A poor-man's version of "xargs -0": split a block of NUL-delimited data
   * into an array of pointers, one per entry.
   *
   * data:        start of the block; entries are not copied, the resulting
   *              pointers refer directly into this buffer.
   * data_length: number of valid bytes in `data`. Every entry, including the
   *              last one, must end with a NUL byte inside this range.
   * output:      must point to a NULL `char **`. On success it receives a
   *              newly allocated, NULL-terminated array which the caller
   *              releases with free().
   *
   * Returns the number of entries (0 for an empty block), or -1 when the
   * arguments are invalid, the final entry is not NUL-terminated, or memory
   * cannot be allocated. On failure `*output` is left untouched.
   */
  static int parse_xargs(char *data, int data_length, char ***output)
  {
      char **entries;
      char *cur, *end;
      int num = 0;
      int idx = 0;

      if (!data || !output || *output != NULL || data_length < 0)
          return -1;

      end = data + data_length;
      /*
       * Without a terminator on the last entry, strlen() below would run off
       * the end of the buffer.
       */
      if (data_length > 0 && end[-1] != '\0')
          return -1;

      /* Each entry ends in exactly one NUL, so counting NULs counts entries. */
      for (cur = data; cur < end; cur++) {
          if (*cur == '\0')
              num++;
      }

      /* One extra slot for the terminating NULL pointer. */
      entries = calloc((size_t)num + 1, sizeof(*entries));
      if (!entries)
          return -1;

      for (cur = data; cur < end; cur += strlen(cur) + 1)
          entries[idx++] = cur;
      entries[num] = NULL;

      *output = entries;
      return num;
  }
  /*
   * Rebuild argv from /proc/self/cmdline, for use in a context where no
   * main() is available to hand us the arguments.
   *
   * argv: receives the parsed argument vector via parse_xargs().
   *
   * Returns 0 on success and -EINVAL if the file cannot be read or yields no
   * arguments. On success the buffer holding the command line is deliberately
   * not freed here.
   */
  static int fetchve(char ***argv)
  {
      size_t raw_size;
      char *raw = read_file("/proc/self/cmdline", &raw_size);

      if (raw && parse_xargs(raw, raw_size, argv) > 0)
          return 0;

      free(raw);
      return -EINVAL;
  }
  /* Tags describing which kind of executable file descriptor is in use. */
  enum {
      EFD_NONE  = 0, /* no usable descriptor */
      EFD_MEMFD = 1, /* descriptor came from memfd_create() */
      EFD_FILE  = 2, /* descriptor refers to a regular (temporary) file */
  };
  206. /*
  207. * This comes from <linux/fcntl.h>. We can't hard-code __O_TMPFILE because it
  208. * changes depending on the architecture. If we don't have O_TMPFILE we always
  209. * have the mkostemp(3) fallback.
  210. */
  211. #ifndef O_TMPFILE
  212. # if defined(__O_TMPFILE) && defined(O_DIRECTORY)
  213. # define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
  214. # endif
  215. #endif
  216. static int make_execfd(int *fdtype)
  217. {
  218. int fd = -1;
  219. char template[PATH_MAX] = {0};
  220. char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR");
  221. if (!prefix || *prefix != '/')
  222. prefix = "/tmp";
  223. if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
  224. return -1;
  225. /*
  226. * Now try memfd, it's much nicer than actually creating a file in STATEDIR
  227. * since it's easily detected thanks to sealing and also doesn't require
  228. * assumptions about STATEDIR.
  229. */
  230. *fdtype = EFD_MEMFD;
  231. fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
  232. if (fd >= 0)
  233. return fd;
  234. if (errno != ENOSYS && errno != EINVAL)
  235. goto error;
  236. #ifdef O_TMPFILE
  237. /*
  238. * Try O_TMPFILE to avoid races where someone might snatch our file. Note
  239. * that O_EXCL isn't actually a security measure here (since you can just
  240. * fd re-open it and clear O_EXCL).
  241. */
  242. *fdtype = EFD_FILE;
  243. fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
  244. if (fd >= 0) {
  245. struct stat statbuf = {};
  246. bool working_otmpfile = false;
  247. /*
  248. * open(2) ignores unknown O_* flags -- yeah, I was surprised when I
  249. * found this out too. As a result we can't check for EINVAL. However,
  250. * if we get nlink != 0 (or EISDIR) then we know that this kernel
  251. * doesn't support O_TMPFILE.
  252. */
  253. if (fstat(fd, &statbuf) >= 0)
  254. working_otmpfile = (statbuf.st_nlink == 0);
  255. if (working_otmpfile)
  256. return fd;
  257. /* Pretend that we got EISDIR since O_TMPFILE failed. */
  258. close(fd);
  259. errno = EISDIR;
  260. }
  261. if (errno != EISDIR)
  262. goto error;
  263. #endif /* defined(O_TMPFILE) */
  264. /*
  265. * Our final option is to create a temporary file the old-school way, and
  266. * then unlink it so that nothing else sees it by accident.
  267. */
  268. *fdtype = EFD_FILE;
  269. fd = mkostemp(template, O_CLOEXEC);
  270. if (fd >= 0) {
  271. if (unlink(template) >= 0)
  272. return fd;
  273. close(fd);
  274. }
  275. error:
  276. *fdtype = EFD_NONE;
  277. return -1;
  278. }
  279. static int seal_execfd(int *fd, int fdtype)
  280. {
  281. switch (fdtype) {
  282. case EFD_MEMFD:
  283. return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
  284. case EFD_FILE: {
  285. /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
  286. int newfd;
  287. char fdpath[PATH_MAX] = {0};
  288. if (fchmod(*fd, 0100) < 0)
  289. return -1;
  290. if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
  291. return -1;
  292. newfd = open(fdpath, O_PATH | O_CLOEXEC);
  293. if (newfd < 0)
  294. return -1;
  295. close(*fd);
  296. *fd = newfd;
  297. return 0;
  298. }
  299. default:
  300. break;
  301. }
  302. return -1;
  303. }
  /*
   * Try to obtain a read-only handle on the running binary without copying
   * it, by bind-mounting /proc/self/exe read-only onto a scratch file and
   * opening that mount point.
   *
   * The scratch file lives in $_LIBCONTAINER_STATEDIR when that is set to an
   * absolute path (read with secure_getenv), otherwise in /tmp.
   *
   * Returns an O_PATH descriptor on success. On failure returns:
   *   -1               the scratch file could not be set up, or the final
   *                    open(2) failed;
   *   -EPERM           one of the mount(2) calls failed;
   *   -ENOTRECOVERABLE the mount could not be detached again.
   */
  static int try_bindfd(void)
  {
      char mountpoint[PATH_MAX] = {0};
      char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR");
      int scratchfd;
      int handle;

      if (!prefix || *prefix != '/')
          prefix = "/tmp";
      if (snprintf(mountpoint, sizeof(mountpoint), "%s/runc.XXXXXX", prefix) < 0)
          return -1;

      /*
       * We need somewhere to mount it, mounting anything over /proc/self is a
       * BAD idea on the host -- even if we do it temporarily.
       */
      scratchfd = mkstemp(mountpoint);
      if (scratchfd < 0)
          return -1;
      close(scratchfd);

      /*
       * For obvious reasons this won't work in rootless mode because we haven't
       * created a userns+mntns -- but getting that to work will be a bit
       * complicated and it's only worth doing if someone actually needs it.
       */
      handle = -EPERM;
      if (mount("/proc/self/exe", mountpoint, "", MS_BIND, "") >= 0) {
          /* Get read-only handle that we're sure can't be made read-write. */
          if (mount("", mountpoint, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") >= 0)
              handle = open(mountpoint, O_PATH | O_CLOEXEC);

          /*
           * Make sure the MNT_DETACH works, otherwise we could get remounted
           * read-write and that would be quite bad (the fd would be made
           * read-write too, invalidating the protection).
           */
          if (umount2(mountpoint, MNT_DETACH) < 0) {
              if (handle >= 0)
                  close(handle);
              handle = -ENOTRECOVERABLE;
          }
      }

      /*
       * We don't care about unlink errors, the worst that happens is that
       * there's an empty file left around in STATEDIR.
       */
      unlink(mountpoint);
      return handle;
  }
  /*
   * Plain user-space copy: read from `infd` until end-of-file and write
   * everything to `outfd`, retrying short writes.
   *
   * Returns the total number of bytes copied, or -1 as soon as any read(2) or
   * write(2) call fails.
   */
  static ssize_t fd_to_fd(int outfd, int infd)
  {
      char chunk[4096];
      ssize_t copied = 0;
      ssize_t got;

      while ((got = read(infd, chunk, sizeof(chunk))) != 0) {
          const char *pos = chunk;
          ssize_t left = got;

          if (got < 0)
              return -1;

          /* write(2) may accept less than asked for; push out the rest. */
          do {
              ssize_t put = write(outfd, pos, left);

              if (put < 0)
                  return -1;
              pos += put;
              left -= put;
          } while (left > 0);

          copied += got;
      }
      return copied;
  }
  373. static int clone_binary(void)
  374. {
  375. int binfd, execfd;
  376. struct stat statbuf = {};
  377. size_t sent = 0;
  378. int fdtype = EFD_NONE;
  379. /*
  380. * Before we resort to copying, let's try creating an ro-binfd in one shot
  381. * by getting a handle for a read-only bind-mount of the execfd.
  382. */
  383. execfd = try_bindfd();
  384. if (execfd >= 0)
  385. return execfd;
  386. /*
  387. * Dammit, that didn't work -- time to copy the binary to a safe place we
  388. * can seal the contents.
  389. */
  390. execfd = make_execfd(&fdtype);
  391. if (execfd < 0 || fdtype == EFD_NONE)
  392. return -ENOTRECOVERABLE;
  393. binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
  394. if (binfd < 0)
  395. goto error;
  396. if (fstat(binfd, &statbuf) < 0)
  397. goto error_binfd;
  398. while (sent < statbuf.st_size) {
  399. int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent);
  400. if (n < 0) {
  401. /* sendfile can fail so we fallback to a dumb user-space copy. */
  402. n = fd_to_fd(execfd, binfd);
  403. if (n < 0)
  404. goto error_binfd;
  405. }
  406. sent += n;
  407. }
  408. close(binfd);
  409. if (sent != statbuf.st_size)
  410. goto error;
  411. if (seal_execfd(&execfd, fdtype) < 0)
  412. goto error;
  413. return execfd;
  414. error_binfd:
  415. close(binfd);
  416. error:
  417. close(execfd);
  418. return -EIO;
  419. }
  420. /* Get cheap access to the environment. */
  421. extern char **environ;
  422. int ensure_cloned_binary(void)
  423. {
  424. int execfd;
  425. char **argv = NULL;
  426. /* Check that we're not self-cloned, and if we are then bail. */
  427. int cloned = is_self_cloned();
  428. if (cloned > 0 || cloned == -ENOTRECOVERABLE)
  429. return cloned;
  430. if (fetchve(&argv) < 0)
  431. return -EINVAL;
  432. execfd = clone_binary();
  433. if (execfd < 0)
  434. return -EIO;
  435. if (putenv(CLONED_BINARY_ENV "=1"))
  436. goto error;
  437. fexecve(execfd, argv, environ);
  438. error:
  439. close(execfd);
  440. return -ENOEXEC;
  441. }