cloned_binary.c 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542
  1. // SPDX-License-Identifier: Apache-2.0 OR LGPL-2.1-or-later
  2. /*
  3. * Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
  4. * Copyright (C) 2019 SUSE LLC
  5. *
  6. * This work is dual licensed under the following licenses. You may use,
  7. * redistribute, and/or modify the work under the conditions of either (or
  8. * both) licenses.
  9. *
  10. * === Apache-2.0 ===
  11. *
  12. * Licensed under the Apache License, Version 2.0 (the "License");
  13. * you may not use this file except in compliance with the License.
  14. * You may obtain a copy of the License at
  15. *
  16. * http://www.apache.org/licenses/LICENSE-2.0
  17. *
  18. * Unless required by applicable law or agreed to in writing, software
  19. * distributed under the License is distributed on an "AS IS" BASIS,
  20. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  21. * See the License for the specific language governing permissions and
  22. * limitations under the License.
  23. *
  24. * === LGPL-2.1-or-later ===
  25. *
  26. * This library is free software; you can redistribute it and/or
  27. * modify it under the terms of the GNU Lesser General Public
  28. * License as published by the Free Software Foundation; either
  29. * version 2.1 of the License, or (at your option) any later version.
  30. *
  31. * This library is distributed in the hope that it will be useful,
  32. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  33. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  34. * Lesser General Public License for more details.
  35. *
  36. * You should have received a copy of the GNU Lesser General Public
  37. * License along with this library. If not, see
  38. * <https://www.gnu.org/licenses/>.
  39. *
  40. */
  41. #define _GNU_SOURCE
  42. #include <unistd.h>
  43. #include <stdio.h>
  44. #include <stdlib.h>
  45. #include <stdbool.h>
  46. #include <string.h>
  47. #include <limits.h>
  48. #include <fcntl.h>
  49. #include <errno.h>
  50. #include <sys/types.h>
  51. #include <sys/stat.h>
  52. #include <sys/statfs.h>
  53. #include <sys/vfs.h>
  54. #include <sys/mman.h>
  55. #include <sys/mount.h>
  56. #include <sys/sendfile.h>
  57. #include <sys/syscall.h>
  /*
   * Local memfd_create(2) wrapper. Map the raw syscall number onto the SYS_*
   * name when only the __NR_* spelling is available.
   */
  #ifndef SYS_memfd_create
  # ifdef __NR_memfd_create
  #  define SYS_memfd_create __NR_memfd_create
  # endif
  #endif

  /* memfd_create(2) flags -- copied from <linux/memfd.h>. */
  #if !defined(MFD_CLOEXEC)
  # define MFD_CLOEXEC       0x0001U
  # define MFD_ALLOW_SEALING 0x0002U
  #endif

  /*
   * Invoke the memfd_create syscall directly.
   *
   * name:  name passed through to the syscall.
   * flags: MFD_* flags passed through to the syscall.
   *
   * Returns whatever the syscall returns. When no syscall number is known at
   * build time, fails with -1 and errno set to ENOSYS.
   */
  int memfd_create(const char *name, unsigned int flags)
  {
  #if defined(SYS_memfd_create)
      return syscall(SYS_memfd_create, name, flags);
  #else
      (void)name;
      (void)flags;
      errno = ENOSYS;
      return -1;
  #endif
  }
  /* This comes directly from <linux/fcntl.h>. */
  #if !defined(F_LINUX_SPECIFIC_BASE)
  # define F_LINUX_SPECIFIC_BASE 1024
  #endif

  /* fcntl(2) commands used to add and query file seals. */
  #if !defined(F_ADD_SEALS)
  # define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
  # define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
  #endif

  /* Individual seal bits. */
  #if !defined(F_SEAL_SEAL)
  # define F_SEAL_SEAL   0x0001 /* prevent further seals from being set */
  # define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
  # define F_SEAL_GROW   0x0004 /* prevent file from growing */
  # define F_SEAL_WRITE  0x0008 /* prevent writes */
  #endif

  /* Environment variable marking a process as running from a cloned binary. */
  #define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"

  /* Name given to the memfd that holds the cloned binary. */
  #define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"

  /* The complete seal set a cloned memfd is expected to carry. */
  #define RUNC_MEMFD_SEALS \
      (F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
  /*
   * realloc(3) wrapper that never returns NULL: the allocation is retried
   * until it succeeds.
   *
   * ptr:  existing allocation to resize, or NULL to allocate a fresh block.
   * size: requested size in bytes. A request for zero bytes is rounded up to
   *       one byte, because realloc(ptr, 0) is allowed to free ptr and return
   *       NULL -- retrying after that would hand an already-freed pointer
   *       back to realloc.
   *
   * Returns the (possibly moved) allocation; the caller releases it with
   * free().
   */
  static void *must_realloc(void *ptr, size_t size)
  {
      void *resized;

      if (size == 0)
          size = 1;

      do {
          /* On failure realloc leaves ptr untouched, so retrying is safe. */
          resized = realloc(ptr, size);
      } while (!resized);

      return resized;
  }
  102. /*
  103. * Verify whether we are currently in a self-cloned program (namely, is
  104. * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
  105. * for shmem files), and we want to be sure it's actually sealed.
  106. */
  107. static int is_self_cloned(void)
  108. {
  109. int fd, ret, is_cloned = 0;
  110. struct stat statbuf = {};
  111. struct statfs fsbuf = {};
  112. fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
  113. if (fd < 0) {
  114. fprintf(stderr, "you have no read access to runc binary file\n");
  115. return -ENOTRECOVERABLE;
  116. }
  117. /*
  118. * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
  119. * this, because you cannot write to a sealed memfd no matter what (so
  120. * sharing it isn't a bad thing -- and an admin could bind-mount a sealed
  121. * memfd to /usr/bin/runc to allow re-use).
  122. */
  123. ret = fcntl(fd, F_GET_SEALS);
  124. if (ret >= 0) {
  125. is_cloned = (ret == RUNC_MEMFD_SEALS);
  126. goto out;
  127. }
  128. /*
  129. * All other forms require CLONED_BINARY_ENV, since they are potentially
  130. * writeable (or we can't tell if they're fully safe) and thus we must
  131. * check the environment as an extra layer of defence.
  132. */
  133. if (!getenv(CLONED_BINARY_ENV)) {
  134. is_cloned = false;
  135. goto out;
  136. }
  137. /*
  138. * Is the binary on a read-only filesystem? We can't detect bind-mounts in
  139. * particular (in-kernel they are identical to regular mounts) but we can
  140. * at least be sure that it's read-only. In addition, to make sure that
  141. * it's *our* bind-mount we check CLONED_BINARY_ENV.
  142. */
  143. if (fstatfs(fd, &fsbuf) >= 0)
  144. is_cloned |= (fsbuf.f_flags & MS_RDONLY);
  145. /*
  146. * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
  147. * which appears to have a borked backport of F_GET_SEALS. Either way,
  148. * having a file which has no hardlinks indicates that we aren't using
  149. * a host-side "runc" binary and this is something that a container
  150. * cannot fake (because unlinking requires being able to resolve the
  151. * path that you want to unlink).
  152. */
  153. if (fstat(fd, &statbuf) >= 0)
  154. is_cloned |= (statbuf.st_nlink == 0);
  155. out:
  156. close(fd);
  157. return is_cloned;
  158. }
  /*
   * Slurp the whole of a file into a freshly allocated buffer.
   *
   * path:   file to read.
   * length: receives the number of bytes read; must not be NULL. It is reset
   *         to 0 once the file has been opened.
   *
   * Returns the buffer (not NUL-terminated by this function; the caller frees
   * it), or NULL when length is NULL, the file cannot be opened, a read fails,
   * or the file is empty.
   */
  static char *read_file(char *path, size_t *length)
  {
      char chunk[4096];
      char *contents = NULL;
      ssize_t got;
      int fd;

      if (length == NULL)
          return NULL;

      fd = open(path, O_RDONLY | O_CLOEXEC);
      if (fd < 0)
          return NULL;

      *length = 0;
      while ((got = read(fd, chunk, sizeof(chunk))) > 0) {
          /* Grow the buffer by exactly the amount just read, then append. */
          contents = must_realloc(contents, (*length + got) * sizeof(*contents));
          memcpy(contents + *length, chunk, got);
          *length += got;
      }

      close(fd);
      if (got < 0) {
          /* Read error: discard whatever was collected so far. */
          free(contents);
          return NULL;
      }
      return contents;
  }
  /*
   * A poor-man's version of "xargs -0". Splits a block of NUL-delimited data
   * into a NULL-terminated array of pointers, one per entry.
   *
   * data:        start of the block; entries are NOT copied, so the returned
   *              pointers alias this buffer and it must outlive the array.
   * data_length: number of valid bytes in data.
   * output:      must point to a NULL pointer on entry; on success it receives
   *              a newly allocated array of (count + 1) pointers whose last
   *              slot is NULL. The caller frees the array.
   *
   * Returns the number of entries (0 for an empty block), or -1 -- leaving
   * *output untouched -- when an argument is invalid, data_length is negative,
   * the final entry is not NUL-terminated within the block, or the array
   * cannot be allocated.
   */
  static int parse_xargs(char *data, int data_length, char ***output)
  {
      int num = 0;
      int idx = 0;
      char *cur, *end, *nul;
      char **entries;

      if (!data || !output || *output != NULL || data_length < 0)
          return -1;
      end = data + data_length;

      /*
       * First pass: count the entries, never looking beyond the block. An
       * unterminated trailing entry is rejected instead of being scanned past
       * the end of the buffer.
       */
      for (cur = data; cur < end; cur = nul + 1) {
          nul = memchr(cur, '\0', (size_t)(end - cur));
          if (!nul)
              return -1;
          num++;
      }

      entries = calloc((size_t)num + 1, sizeof(*entries));
      if (!entries)
          return -1;

      /* Second pass: every entry is now known to be terminated inside data. */
      for (cur = data; cur < end; cur += strlen(cur) + 1)
          entries[idx++] = cur;
      entries[num] = NULL;

      *output = entries;
      return num;
  }
  /*
   * "Parse" out argv from /proc/self/cmdline.
   * This is necessary because we are running in a context where we don't have
   * a main() that we can just get the arguments from.
   *
   * argv: receives the parsed argument array on success.
   *
   * Returns 0 on success. The cmdline buffer is deliberately kept allocated in
   * that case, because parse_xargs() is handed the buffer itself and the argv
   * entries refer to it. Returns -EINVAL when the file cannot be read or
   * yields no arguments.
   */
  static int fetchve(char ***argv)
  {
      size_t cmdline_size;
      char *cmdline = read_file("/proc/self/cmdline", &cmdline_size);

      if (cmdline && parse_xargs(cmdline, cmdline_size, argv) > 0)
          return 0;

      free(cmdline);
      return -EINVAL;
  }
  /* Kind of descriptor handed out by make_execfd() and sealed by seal_execfd(). */
  enum {
      EFD_NONE  = 0, /* no usable descriptor */
      EFD_MEMFD = 1, /* descriptor came from memfd_create() */
      EFD_FILE  = 2, /* descriptor is an unlinked/anonymous regular file */
  };
  232. /*
  233. * This comes from <linux/fcntl.h>. We can't hard-code __O_TMPFILE because it
  234. * changes depending on the architecture. If we don't have O_TMPFILE we always
  235. * have the mkostemp(3) fallback.
  236. */
  237. #ifndef O_TMPFILE
  238. # if defined(__O_TMPFILE) && defined(O_DIRECTORY)
  239. # define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
  240. # endif
  241. #endif
  242. static int make_execfd(int *fdtype)
  243. {
  244. int fd = -1;
  245. char template[PATH_MAX] = {0};
  246. char *prefix = getenv("_LIBCONTAINER_STATEDIR");
  247. if (!prefix || *prefix != '/')
  248. prefix = "/tmp";
  249. if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
  250. return -1;
  251. /*
  252. * Now try memfd, it's much nicer than actually creating a file in STATEDIR
  253. * since it's easily detected thanks to sealing and also doesn't require
  254. * assumptions about STATEDIR.
  255. */
  256. *fdtype = EFD_MEMFD;
  257. fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
  258. if (fd >= 0)
  259. return fd;
  260. if (errno != ENOSYS && errno != EINVAL)
  261. goto error;
  262. #ifdef O_TMPFILE
  263. /*
  264. * Try O_TMPFILE to avoid races where someone might snatch our file. Note
  265. * that O_EXCL isn't actually a security measure here (since you can just
  266. * fd re-open it and clear O_EXCL).
  267. */
  268. *fdtype = EFD_FILE;
  269. fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
  270. if (fd >= 0) {
  271. struct stat statbuf = {};
  272. bool working_otmpfile = false;
  273. /*
  274. * open(2) ignores unknown O_* flags -- yeah, I was surprised when I
  275. * found this out too. As a result we can't check for EINVAL. However,
  276. * if we get nlink != 0 (or EISDIR) then we know that this kernel
  277. * doesn't support O_TMPFILE.
  278. */
  279. if (fstat(fd, &statbuf) >= 0)
  280. working_otmpfile = (statbuf.st_nlink == 0);
  281. if (working_otmpfile)
  282. return fd;
  283. /* Pretend that we got EISDIR since O_TMPFILE failed. */
  284. close(fd);
  285. errno = EISDIR;
  286. }
  287. if (errno != EISDIR)
  288. goto error;
  289. #endif /* defined(O_TMPFILE) */
  290. /*
  291. * Our final option is to create a temporary file the old-school way, and
  292. * then unlink it so that nothing else sees it by accident.
  293. */
  294. *fdtype = EFD_FILE;
  295. fd = mkostemp(template, O_CLOEXEC);
  296. if (fd >= 0) {
  297. if (unlink(template) >= 0)
  298. return fd;
  299. close(fd);
  300. }
  301. error:
  302. *fdtype = EFD_NONE;
  303. return -1;
  304. }
  305. static int seal_execfd(int *fd, int fdtype)
  306. {
  307. switch (fdtype) {
  308. case EFD_MEMFD:
  309. return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
  310. case EFD_FILE: {
  311. /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
  312. int newfd;
  313. char fdpath[PATH_MAX] = {0};
  314. if (fchmod(*fd, 0100) < 0)
  315. return -1;
  316. if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
  317. return -1;
  318. newfd = open(fdpath, O_PATH | O_CLOEXEC);
  319. if (newfd < 0)
  320. return -1;
  321. close(*fd);
  322. *fd = newfd;
  323. return 0;
  324. }
  325. default:
  326. break;
  327. }
  328. return -1;
  329. }
  /*
   * Try to obtain a read-only handle to the running binary without copying
   * it: bind-mount /proc/self/exe onto a scratch file, remount the bind-mount
   * read-only, open it with O_PATH, then detach the mount and remove the
   * scratch file.
   *
   * The scratch file lives in _LIBCONTAINER_STATEDIR when that is an absolute
   * path, and in "/tmp" otherwise.
   *
   * Returns the O_PATH descriptor on success. On failure returns a negative
   * value: -1 when the scratch file cannot be created or the final open
   * fails, -EPERM when a mount step fails, and -ENOTRECOVERABLE when the
   * mount cannot be detached.
   */
  static int try_bindfd(void)
  {
      char mountpoint[PATH_MAX] = {0};
      char *statedir = getenv("_LIBCONTAINER_STATEDIR");
      int scratch_fd;
      int handle;

      if (!statedir || statedir[0] != '/')
          statedir = "/tmp";
      if (snprintf(mountpoint, sizeof(mountpoint), "%s/runc.XXXXXX", statedir) < 0)
          return -1;

      /*
       * We need somewhere to mount it, mounting anything over /proc/self is a
       * BAD idea on the host -- even if we do it temporarily.
       */
      scratch_fd = mkstemp(mountpoint);
      if (scratch_fd < 0)
          return -1;
      close(scratch_fd);

      /*
       * For obvious reasons this won't work in rootless mode because we
       * haven't created a userns+mntns -- but getting that to work will be a
       * bit complicated and it's only worth doing if someone actually needs it.
       */
      handle = -EPERM;
      if (mount("/proc/self/exe", mountpoint, "", MS_BIND, "") >= 0) {
          /* Get read-only handle that we're sure can't be made read-write. */
          if (mount("", mountpoint, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") >= 0)
              handle = open(mountpoint, O_PATH | O_CLOEXEC);

          /*
           * Make sure the MNT_DETACH works, otherwise we could get remounted
           * read-write and that would be quite bad (the fd would be made
           * read-write too, invalidating the protection).
           */
          if (umount2(mountpoint, MNT_DETACH) < 0) {
              if (handle >= 0)
                  close(handle);
              handle = -ENOTRECOVERABLE;
          }
      }

      /*
       * We don't care about unlink errors, the worst that happens is that
       * there's an empty file left around in STATEDIR.
       */
      unlink(mountpoint);
      return handle;
  }
  /*
   * Copy everything readable from infd (starting at its current offset, until
   * end-of-file) into outfd using a user-space buffer.
   *
   * Reads and writes interrupted by a signal (EINTR) are retried. A write
   * that makes no progress is reported as an error (errno = EIO) instead of
   * being retried forever.
   *
   * Returns the total number of bytes copied, or -1 on error with errno set.
   */
  static ssize_t fd_to_fd(int outfd, int infd)
  {
      ssize_t total = 0;
      char buffer[4096];

      for (;;) {
          ssize_t nread, nwritten = 0;

          nread = read(infd, buffer, sizeof(buffer));
          if (nread < 0) {
              if (errno == EINTR)
                  continue;
              return -1;
          }
          if (!nread)
              break;

          /* write(2) may be partial: keep going until the chunk is flushed. */
          while (nwritten < nread) {
              ssize_t n = write(outfd, buffer + nwritten, nread - nwritten);
              if (n < 0) {
                  if (errno == EINTR)
                      continue;
                  return -1;
              }
              if (n == 0) {
                  errno = EIO;
                  return -1;
              }
              nwritten += n;
          }
          total += nwritten;
      }
      return total;
  }
  399. static int clone_binary(void)
  400. {
  401. int binfd, execfd;
  402. struct stat statbuf = {};
  403. size_t sent = 0;
  404. int fdtype = EFD_NONE;
  405. /*
  406. * Before we resort to copying, let's try creating an ro-binfd in one shot
  407. * by getting a handle for a read-only bind-mount of the execfd.
  408. */
  409. execfd = try_bindfd();
  410. if (execfd >= 0)
  411. return execfd;
  412. /*
  413. * Dammit, that didn't work -- time to copy the binary to a safe place we
  414. * can seal the contents.
  415. */
  416. execfd = make_execfd(&fdtype);
  417. if (execfd < 0 || fdtype == EFD_NONE)
  418. return -ENOTRECOVERABLE;
  419. binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
  420. if (binfd < 0)
  421. goto error;
  422. if (fstat(binfd, &statbuf) < 0)
  423. goto error_binfd;
  424. while (sent < statbuf.st_size) {
  425. int n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent);
  426. if (n < 0) {
  427. /* sendfile can fail so we fallback to a dumb user-space copy. */
  428. n = fd_to_fd(execfd, binfd);
  429. if (n < 0)
  430. goto error_binfd;
  431. }
  432. sent += n;
  433. }
  434. close(binfd);
  435. if (sent != statbuf.st_size)
  436. goto error;
  437. if (seal_execfd(&execfd, fdtype) < 0)
  438. goto error;
  439. return execfd;
  440. error_binfd:
  441. close(binfd);
  442. error:
  443. close(execfd);
  444. return -EIO;
  445. }
  446. /* Get cheap access to the environment. */
  447. extern char **environ;
  448. int ensure_cloned_binary(void)
  449. {
  450. int execfd;
  451. char **argv = NULL;
  452. /* Check that we're not self-cloned, and if we are then bail. */
  453. int cloned = is_self_cloned();
  454. if (cloned > 0 || cloned == -ENOTRECOVERABLE)
  455. return cloned;
  456. if (fetchve(&argv) < 0)
  457. return -EINVAL;
  458. execfd = clone_binary();
  459. if (execfd < 0)
  460. return -EIO;
  461. if (putenv(CLONED_BINARY_ENV "=1"))
  462. goto error;
  463. fexecve(execfd, argv, environ);
  464. error:
  465. close(execfd);
  466. return -ENOEXEC;
  467. }