Back before Docker there was LXC, and around that time (circa 2014) I wrote a C program to run and contain another binary. It does the core of what Docker did, but without the filesystem layering. It probably still works, but it probably no longer does everything that should be done on a modern Linux system. Here is the entirety of that program:
#define _GNU_SOURCE
#include <linux/limits.h> // NR_OPEN (keep before anything that pulls in <limits.h>, which would #undef it)
#include <fcntl.h>
#include <grp.h>
#include <pwd.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/syscall.h> // SYS_pivot_root
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/wait.h>
#include <unistd.h>
/* Report a failed call and give up: perror() prints msg followed by the text
 * for the current errno, then the process exits with EXIT_FAILURE.  The
 * do/while(0) wrapper makes the macro usable as a single statement, e.g. as
 * the body of an unbraced if. */
#define errExit(msg)            \
    do {                        \
        perror(msg);            \
        exit(EXIT_FAILURE);     \
    } while (0)
/* Everything the cloned child needs.  The option fields are filled in by
 * main(); the two pipes are created by do_parent() just before clone(). */
struct child_args {
    char  *container;          /* directory the child pivot_root()s into */
    char  *hostname;           /* hostname set inside the child's UTS namespace */
    char  *rootpivot;          /* subdirectory of container that receives the old root */
    uid_t  uid;                /* user the command finally runs as */
    gid_t  gid;                /* group the command finally runs as */
    char **argv;               /* command to execute (argv[0] is the program path) */
    int    daemonize;          /* non-zero: detach the child and do not wait for it */
    char  *pidfile;            /* file the parent writes the child's PID to when daemonizing */
    int    pipe_to_parent[2];  /* child -> parent signal: child closes the write end */
    int    pipe_to_child[2];   /* parent -> child signal: parent closes the write end */
};
/*
 * Entry point of the cloned child; `arg` is the struct child_args prepared
 * by the parent.  Steps, in order:
 *   1. set the hostname,
 *   2. pivot_root() into args->container and detach the old root,
 *   3. mount a fresh /proc,
 *   4. drop supplementary groups, become uid/gid 32000 and unshare a user
 *      namespace,
 *   5. tell the parent (by closing pipe_to_parent) and wait until it has
 *      written the uid/gid maps (it closes pipe_to_child),
 *   6. switch to the requested uid/gid, build a minimal environment,
 *      optionally detach from the terminal, and execve() the command.
 *
 * Any failure terminates the child with EXIT_FAILURE.  On success the
 * function does not return, because execve() replaces the process image.
 *
 * NOTE: pivot_root(2) requires args->container to be a mount point; this
 * function does not bind-mount it, so the caller must arrange that.
 */
static int
childFunc(void *arg)
{
    struct child_args *args = arg;

    /* Keep only the pipe ends this side uses */
    close(args->pipe_to_parent[0]);
    close(args->pipe_to_child[1]);

    /* Change hostname in UTS namespace of child */
    if (sethostname(args->hostname, strlen(args->hostname)) == -1)
        errExit("sethostname");

    /* pivot_root(2) fails with EINVAL while the current root is a shared
     * mount, and a shared tree would let the umount below propagate to the
     * parent namespace.  Make our copy of the mount tree private first. */
    if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL) == -1)
        errExit("mount MS_PRIVATE");

    /* pivot_root: the old root ends up at <container>/<rootpivot> */
    char putold[PATH_MAX];
    int putold_len = snprintf(putold, sizeof putold, "%s/%s",
                              args->container, args->rootpivot);
    if (putold_len < 0 || (size_t)putold_len >= sizeof putold) {
        fprintf(stderr, "Path too long: %s/%s\n",
                args->container, args->rootpivot);
        exit(EXIT_FAILURE);
    }
    if (chdir(args->container) != 0)
        errExit("chdir container");
    /* glibc declares no pivot_root() wrapper, so go through syscall(2) */
    if (syscall(SYS_pivot_root, args->container, putold) != 0)
        errExit("pivot root");
    if (chdir("/") != 0)
        errExit("chdir /");
    if (umount2(args->rootpivot, MNT_DETACH) != 0)
        errExit("umount2");
    /* If changing to less-secure chroot() instead, also use MS_MOVE */

    /* Mount a new procfs at /proc since CLONE_NEWPID was set */
    const char *procdir = "/proc";
    mkdir(procdir, 0555); /* best effort: the directory usually exists already */
    if (mount("proc", procdir, "proc", 0, NULL) == -1)
        errExit("mount procfs");

    /* setgid()/setuid() leave the supplementary group list untouched, so
     * clear it while we still have the privilege to do so. */
    if (setgroups(0, NULL) != 0)
        errExit("setgroups");

    /* Change to 32000, because that will be the new root */
    if (setgid((gid_t)32000) != 0)
        errExit("Unable to become group 32000");
    if (setuid((uid_t)32000) != 0)
        errExit("Unable to become user 32000");

    /* Now change to CLONE_NEWUSER */
    if (unshare(CLONE_NEWUSER) == -1)
        errExit("unshare");

    /* Signal parent to change UID/GID maps */
    close(args->pipe_to_parent[1]);

    /* Wait until the parent has updated UID and GID maps: the parent closes
     * its write end, so a clean EOF (read() == 0) is the "go" signal. */
    char ch;
    if (read(args->pipe_to_child[0], &ch, 1) != 0) {
        fprintf(stderr, "Failure in child: read from pipe returned != 0\n");
        exit(EXIT_FAILURE);
    }
    /* Done with the pipe; do not leak it into the executed program */
    close(args->pipe_to_child[0]);

    /* Be sure we are root */
    uid_t uid = getuid();
    if (uid != 0) {
        fprintf(stderr, "eUID = %ld; eGID = %ld;\n",
                (long) geteuid(), (long) getegid());
        fprintf(stderr, "If we exec now, we lose all capabilities. Failed.\n");
        exit(EXIT_FAILURE);
    }

    /* Change to the requested user */
    if (setgid(args->gid) != 0)
        errExit("Unable to drop group privilege");
    if (setuid(args->uid) != 0)
        errExit("Unable to drop user privilege");

    /* Get information on the requested user */
    struct passwd *pw = getpwuid(args->uid);
    if (pw == NULL)
        errExit("getpwuid");

    /* Build the environment for the command.  execve() copies the strings,
     * so automatic and static storage is sufficient. */
    static char path_env[] = "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/bin:/usr/x86_64-pc-linux-gnu/gcc-bin/4.8.3";
    static char pwd_env[] = "PWD=/";
    static char lang_env[] = "LANG=en_NZ.utf8";
    char term[256];
    char home[PATH_MAX + 8];
    char shell[PATH_MAX + 8];
    char *envp[7];
    int envc = 0;

    /* Carry over the terminal type only when the caller has one set;
     * passing a NULL getenv() result to %s would be undefined. */
    const char *term_value = getenv("TERM");
    if (term_value != NULL) {
        snprintf(term, sizeof term, "TERM=%s", term_value);
        envp[envc++] = term;
    }
    snprintf(home, sizeof home, "HOME=%s", pw->pw_dir);
    envp[envc++] = home;
    snprintf(shell, sizeof shell, "SHELL=%s", pw->pw_shell);
    envp[envc++] = shell;
    envp[envc++] = path_env;
    envp[envc++] = pwd_env;
    envp[envc++] = lang_env;
    envp[envc] = NULL;

    if (args->daemonize) {
        /* New session */
        if (setsid() == -1)
            errExit("setsid");

        /* Point 0, 1 and 2 at /dev/null.  Open it before closing anything
         * so a missing /dev/null can still be reported on stderr. */
        int devnull = open("/dev/null", O_RDWR);
        if (devnull == -1)
            errExit("open /dev/null");
        for (int fd = 0; fd <= 2; fd++) {
            if (dup2(devnull, fd) == -1)
                errExit("dup2");
        }

        /* Close every other descriptor (this includes devnull itself when
         * it was opened above fd 2). */
        for (int fd = 3; fd < NR_OPEN; fd++)
            close(fd);
    }

    /* Clear parent-death signal (HUP), so we don't get a HUP
       when the parent terminates (since it is preserved across
       execve()) */
    if (prctl(PR_SET_PDEATHSIG, 0) == -1)
        errExit("prctl");

    /* Execute the program; execve() only returns on failure */
    execve(args->argv[0], args->argv, envp);
    errExit("execve");
    return EXIT_FAILURE; /* not reached */
}
/* Size in bytes of the stack handed to clone() for the child. */
#define STACK_SIZE (1024 * 1024)
/* The child's stack.  clone() is given the END of this array as the stack
 * top, and that address must be suitably aligned for the architecture.  A
 * plain char array only guarantees 1-byte alignment, so ask for 16 bytes;
 * STACK_SIZE is a multiple of 16, so the top is then aligned as well. */
static char child_stack[STACK_SIZE] __attribute__((aligned(16)));
/* Write `map` into /proc/<pid>/<file>, where file is "uid_map" or "gid_map".
 * Exits via errExit() when the file cannot be opened or written. */
static void
write_id_map(pid_t pid, const char *file, const char *map)
{
    char path[64];
    char open_msg[32];
    char write_msg[32];

    /* Build the messages up front so nothing runs between a failing call
     * and the perror() that reports it. */
    snprintf(path, sizeof path, "/proc/%ld/%s", (long)pid, file);
    snprintf(open_msg, sizeof open_msg, "open %s", file);
    snprintf(write_msg, sizeof write_msg, "write %s", file);

    int fd = open(path, O_WRONLY);
    if (fd == -1)
        errExit(open_msg);
    if (write(fd, map, strlen(map)) == -1)
        errExit(write_msg);
    close(fd);
}

/*
 * Parent side of the launcher.  Creates the two synchronisation pipes,
 * clones the child into new UTS/IPC/mount/PID namespaces, waits for the
 * child to unshare its user namespace, writes the child's uid/gid maps and
 * then releases the child.
 *
 * Without args->daemonize the parent waits for the child and exits with the
 * child's exit status (128 + signal number if the child was killed by a
 * signal).  With args->daemonize it writes the child's PID to args->pidfile
 * and exits 0 immediately.  This function never returns.
 */
void do_parent(struct child_args *args)
{
    /* Setup pipes for synchronization */
    if (pipe(args->pipe_to_parent) == -1)
        errExit("pipe_to_parent");
    if (pipe(args->pipe_to_child) == -1)
        errExit("pipe_to_child");

    int flags =
          CLONE_NEWUTS    // hostname
        | CLONE_NEWIPC    // IPC
        //| CLONE_NEWNET  // Network (need to setup veth)
        | CLONE_NEWNS     // Mount namespace (starts as copy of parent)
        | CLONE_NEWPID;   // Process namespace
    // Not CLONE_NEWUSER yet or pivot_root() will fail (MNT_LOCKED)

    /* The stack grows down, so clone() gets the top of the buffer */
    pid_t pid = clone(childFunc,
                      child_stack + STACK_SIZE,
                      flags | SIGCHLD,
                      args);
    if (pid == -1)
        errExit("clone");

    /* Parent continues here; keep only the pipe ends this side uses */
    close(args->pipe_to_parent[1]);
    close(args->pipe_to_child[0]);

    /* Wait for the child to signal us that it has done the unshare(): it
     * closes its write end, so we expect a clean EOF (read() == 0). */
    char ch;
    if (read(args->pipe_to_parent[0], &ch, 1) != 0) {
        fprintf(stderr, "Failure in parent: read from pipe returned != 0\n");
        exit(EXIT_FAILURE);
    }
    close(args->pipe_to_parent[0]);

    /* Map ids 0..31999 inside the child's user namespace onto
     * 32000..63999 outside it, for both users and groups. */
    const char *map = "0 32000 32000";
    write_id_map(pid, "uid_map", map);
    write_id_map(pid, "gid_map", map);

    /* Close the write end of the pipe to signal the child */
    close(args->pipe_to_child[1]);

    if (!args->daemonize) {
        /* Wait for child to finish and propagate how it ended */
        int child_status;
        if (waitpid(pid, &child_status, 0) == -1)
            errExit("waitpid");
        if (WIFSIGNALED(child_status))
            exit(128 + WTERMSIG(child_status)); /* shell convention */
        if (WIFEXITED(child_status) && WEXITSTATUS(child_status))
            exit(WEXITSTATUS(child_status));
    } else if (args->pidfile == NULL) {
        /* No pid file requested (assumes the caller sets pidfile to NULL
         * when -p is absent); still tell the user the PID. */
        fprintf(stderr, "PID is %d\n", (int)pid);
    } else {
        /* O_CREAT requires an explicit mode argument */
        int fd = open(args->pidfile, O_CREAT | O_WRONLY | O_TRUNC, 0644);
        if (fd == -1) {
            fprintf(stderr, "PID is %d (could not write pid file)\n", (int)pid);
        } else {
            char pidtext[24];
            snprintf(pidtext, sizeof pidtext, "%d", (int)pid);
            if (write(fd, pidtext, strlen(pidtext)) == -1)
                fprintf(stderr, "PID is %d (could not write pid file)\n", (int)pid);
            close(fd);
        }
    }
    exit(0);
}
/*
 * Print the command-line help to stderr and exit with EXIT_FAILURE.
 *
 * progname: name to show in the synopsis (callers pass argv[0]); when it is
 *           NULL or empty the name "contain" is used.
 * Never returns.
 */
void
usage(const char *progname)
{
    if (progname == NULL || progname[0] == '\0')
        progname = "contain";

    fprintf(stderr,
        "USAGE: %s -u <user> -c <dir> [OPTIONS] <program> [<args> ...]\n"
        "  -u <user>      [required] User to run the program as.\n"
        "  -c <dir>       [required] Directory to contain within.\n"
        "  -g <group>     Set group of the process. Strongly recommended.\n"
        "  -h <hostname>  Set hostname of the container.\n"
        "  -d             Daemonize the process.\n"
        "  -p <file>      Write daemon PID to <file>. Only if -d is specified.\n"
        "  -r <dir>       Root pivot subdirectory for pivot_root() call. Default is /mnt/root.\n"
        "\n",
        progname);
    exit(EXIT_FAILURE);
}
int
main(int argc, char *argv[])
{
int opt;
struct child_args args;
args.uid = -1;
args.gid = 100;
args.container = NULL;
args.hostname = "ocl_container";
args.daemonize = 0;
args.rootpivot = "/mnt/root";
struct passwd *pw;
struct group *gr;
while ((opt = getopt(argc, argv, "+u:g:c:h:dp:r:")) != -1) {
switch (opt) {
case 'u':
pw = getpwnam(optarg);
if (pw == NULL) {
fprintf(stderr,"No such user %s\n",optarg);
exit(EXIT_FAILURE);
}
args.uid = pw->pw_uid;
break;
case 'g':
gr = getgrnam(optarg);
if (gr == NULL) {
fprintf(stderr,"No such group %s\n",optarg);
exit(EXIT_FAILURE);
}
args.gid = gr->gr_gid;
break;
case 'c':
args.container = optarg;
break;
case 'h':
args.hostname = optarg;
break;
case 'd':
args.daemonize = 1;
break;
case 'p':
args.pidfile = optarg;
break;
case 'r':
args.rootpivot = optarg;
break;
default: usage(argv[0]);
}
}
args.argv = &argv[optind];
if (args.uid == -1) usage(argv[0]);
if (args.container == NULL) usage(argv[0]);
if (args.argv[0] == NULL) usage(argv[0]);
do_parent(&args);
}