I'm on a mission. To flush the docker down the tubes. They did really well for us, they made containerization legit, they destroyed stupid deployment tools like chef and terraform and proved that shell scripting is the ultimate DSL.

But NOW, they've gone under as an open-source company 🫡🫡🫡 and everything, even Kubernetes, does not and will not use the docker runtime for ANYthing. So neither should you.

#systemd-nspawn -M pv

Reply to this note

Please Login to reply.

Discussion

Tho if you're on a Mac, you prob should just ssh or mosh to a linux Vm instead of docker or brew. Otherwise I dunno how to help you but I'm open to suggestions for my docs.

docker is dead, long live containerd (and nspawn)

preach 🙏

Back before docker there was LXC. And I wrote a C program about that time (circa 2014) to run and contain another binary, basically doing the core of what docker did but without all the filesystem layering. It probably still works but probably doesn't do everything that should be done on a modern linux anymore. Here is the entirety of that program:

#define _GNU_SOURCE

#include <fcntl.h>
#include <grp.h>
#include <pwd.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <sys/wait.h>

#include <linux/limits.h> // NR_OPEN

/*
 * Report a failed call and terminate: prints `msg` followed by the text for
 * the current errno (via perror) and exits with EXIT_FAILURE.
 *
 * The body is wrapped in do { ... } while (0) so the macro behaves as a
 * single statement, e.g. as the unbraced body of an if/else.  Every
 * continuation line must end in a backslash with nothing after it.
 */
#define errExit(msg) \
    do { \
        perror(msg); \
        exit(EXIT_FAILURE); \
    } while (0)

/*
 * Settings shared between the parent and the cloned child.  main() fills
 * in the option fields, do_parent() creates the two pipes, and childFunc()
 * receives a pointer to the whole structure as its clone() argument.
 */
struct child_args {
    /* Container layout */
    char  *container;           /* directory that becomes the new root */
    char  *hostname;            /* hostname set inside the container */
    char  *rootpivot;           /* where the old root is parked during pivot_root */

    /* Identity the contained program finally runs as */
    uid_t  uid;
    gid_t  gid;

    /* Program to execute: argv[0] is the path, the array is NULL-terminated */
    char **argv;

    /* Daemon mode */
    int    daemonize;           /* non-zero: detach instead of waiting for the child */
    char  *pidfile;             /* file the parent writes the child's PID to */

    /* Synchronization pipes; index 0 is the read end, index 1 the write end */
    int    pipe_to_parent[2];   /* child -> parent notification */
    int    pipe_to_child[2];    /* parent -> child notification */
};

/*
 * pivot_root() is called below but no standard header declares it; the
 * original code relied on an implicit declaration, which C99 and later
 * reject.
 * NOTE(review): this assumes the C library exports a pivot_root symbol (the
 * program as written already depends on that); if it does not, replace the
 * call with syscall(SYS_pivot_root, new_root, put_old).
 */
extern int pivot_root(const char *new_root, const char *put_old);

/*
 * Entry point of the cloned child (passed to clone() by do_parent()).
 *
 * `arg` points to the struct child_args prepared by the parent.  In order,
 * the child:
 *   1. sets the hostname,
 *   2. pivots the root filesystem into args->container and detaches the
 *      old root,
 *   3. mounts a fresh procfs on /proc,
 *   4. becomes uid/gid 32000 and enters a new user namespace,
 *   5. waits for the parent to write the uid/gid maps,
 *   6. switches to the requested uid/gid, builds a minimal environment,
 *      optionally detaches as a daemon, and
 *   7. execs args->argv.
 *
 * Never returns on success (execve replaces the process image); any
 * failure terminates the child with a non-zero exit status.
 */
static int
childFunc(void *arg)
{
    struct child_args *args = (struct child_args *) arg;

    /* Keep only the pipe ends this side uses */
    close(args->pipe_to_parent[0]);
    close(args->pipe_to_child[1]);

    /* Change hostname in UTS namespace of child */
    if (sethostname(args->hostname, strlen(args->hostname)) == -1)
        errExit("sethostname");

    /* pivot_root: the old root is parked at <container>/<rootpivot> */
    char putold[256];
    int len = snprintf(putold, sizeof putold, "%s/%s",
                       args->container, args->rootpivot);
    if (len < 0 || (size_t) len >= sizeof putold) {
        /* A truncated path would pivot onto the wrong directory */
        fprintf(stderr, "Pivot path too long: %s/%s\n",
                args->container, args->rootpivot);
        exit(EXIT_FAILURE);
    }

    /*
     * NOTE(review): pivot_root(2) documents EINVAL for shared mount
     * propagation; on hosts where "/" is mounted shared, remounting it
     * private first may be required - confirm on the target system.
     */
    if (chdir(args->container) != 0)
        errExit("chdir container");
    if (pivot_root(args->container, putold) != 0)
        errExit("pivot root");
    if (chdir("/") != 0)
        errExit("chdir /");
    if (umount2(args->rootpivot, MNT_DETACH) != 0)
        errExit("umount2");
    /* If changing to less-secure chroot() instead, also use MS_MOVE */

    /* Mount a new procfs at /proc since CLONE_NEWPID was set */
    const char *procdir = "/proc";
    mkdir(procdir, 0555);   // ignore any EEXIST
    if (mount("proc", procdir, "proc", 0, NULL) == -1)
        errExit("mount procfs");

    /*
     * Drop the supplementary groups inherited from the invoking user before
     * giving up privilege; setgid()/setuid() alone leave them in place.
     */
    if (setgroups(0, NULL) != 0)
        errExit("setgroups");

    // Change to 32000, because that will be the new root
    if (setgid((gid_t) 32000) != 0)
        errExit("Unable to become group 32000");
    if (setuid((uid_t) 32000) != 0)
        errExit("Unable to become user 32000");

    // Now change to CLONE_NEWUSER
    if (unshare(CLONE_NEWUSER) == -1)
        errExit("unshare");

    /* Signal parent to change UID/GID maps: closing our write end gives
       the parent's read() an EOF */
    close(args->pipe_to_parent[1]);

    /* Wait until the parent has updated UID and GID maps (EOF expected) */
    char ch;
    if (read(args->pipe_to_child[0], &ch, 1) != 0) {
        fprintf(stderr, "Failure in child: read from pipe returned != 0\n");
        exit(EXIT_FAILURE);
    }
    /* Done with the pipe; do not leak it into the exec'd program */
    close(args->pipe_to_child[0]);

    /* Be sure we are root */
    uid_t uid = getuid();
    if (uid != 0) {
        fprintf(stderr, "eUID = %ld; eGID = %ld;\n",
                (long) geteuid(), (long) getegid());
        fprintf(stderr, "If we exec now, we lose all capabilities. Failed.\n");
        exit(1);
    }

    /* Change to the requested user */
    if (setgid(args->gid) != 0)
        errExit("Unable to drop group privilege");
    if (setuid(args->uid) != 0)
        errExit("Unable to drop user privilege");

    /* Get information on the requested user */
    struct passwd *pw = getpwuid(args->uid);
    if (pw == NULL)
        errExit("getpwuid");

    /* Build the environment handed to the program */
    char term[256];
    char home[256];
    char shell[256];
    char *envp[7];
    int envc = 0;

    /* Carry over the terminal setting, but only if one is set: passing a
       NULL getenv() result to %s is undefined behavior */
    const char *term_value = getenv("TERM");
    if (term_value != NULL) {
        snprintf(term, sizeof term, "TERM=%s", term_value);
        envp[envc++] = term;
    }

    snprintf(home, sizeof home, "HOME=%s", pw->pw_dir);
    envp[envc++] = home;

    snprintf(shell, sizeof shell, "SHELL=%s", pw->pw_shell);
    envp[envc++] = shell;

    envp[envc++] = "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/opt/bin:/usr/x86_64-pc-linux-gnu/gcc-bin/4.8.3";
    envp[envc++] = "PWD=/";
    envp[envc++] = "LANG=en_NZ.utf8";
    envp[envc] = NULL;

    if (args->daemonize) {
        // New session
        if (setsid() == -1)
            errExit("setsid");

        // Close all open files
        for (int fd = 0; fd < NR_OPEN; fd++)
            close(fd);

        // Redirect 0, 1, and 2 to /dev/null (best effort: the container
        // may not provide /dev/null, so failures are not fatal)
        open("/dev/null", O_RDWR);  // Will be stdin
        dup(0);                     // Will be stdout
        dup(0);                     // Will be stderr
    }

    /* Clear parent-death signal (HUP), so we don't get a HUP
       when the parent terminates (since it is preserved across
       execve() */
    if (prctl(PR_SET_PDEATHSIG, 0) == -1)
        errExit("prctl");

    /* Execute the program */
    execve(args->argv[0], args->argv, envp);
    errExit("execve");

    return EXIT_FAILURE;    /* not reached */
}

/* Size of the stack given to the cloned child: 1 MiB. */
#define STACK_SIZE (1 << 20)

/*
 * Statically allocated stack for the clone()d child.  do_parent() passes
 * child_stack + STACK_SIZE, i.e. the address just past the end of the
 * buffer, as the stack argument.
 */
static char child_stack[STACK_SIZE];

/*
 * Launch the container and supervise it from the parent side.
 *
 * Creates the two synchronization pipes in `args`, clones childFunc() into
 * new UTS, IPC, mount and PID namespaces, waits for the child to announce
 * that it has entered its user namespace, writes the child's uid_map and
 * gid_map, and then releases the child.
 *
 * Without args->daemonize the parent waits for the child and exits with
 * the child's exit status (128 + signal number if the child was killed by
 * a signal).  With args->daemonize it records the child's PID in
 * args->pidfile, or prints it to stderr if that is not possible, and exits.
 *
 * This function never returns; every path ends in exit().
 */
void do_parent(struct child_args *args)
{
    /* Setup pipes for synchronization */
    if (pipe(args->pipe_to_parent) == -1)
        errExit("pipe_to_parent");
    if (pipe(args->pipe_to_child) == -1)
        errExit("pipe_to_child");

    int flags =
          CLONE_NEWUTS      // hostname
        | CLONE_NEWIPC      // IPC
        //| CLONE_NEWNET    // Network (need to setup veth)
        | CLONE_NEWNS       // Mount namespace (starts as copy of parent)
        | CLONE_NEWPID;     // Process namespace
    // Not CLONE_NEWUSER yet or pivot_root() will fail (MNT_LOCKED)

    pid_t pid = clone(childFunc,
                      child_stack + STACK_SIZE,
                      flags | SIGCHLD,
                      args);
    if (pid == -1)
        errExit("clone");

    /* Parent continues here; keep only the pipe ends this side uses */
    close(args->pipe_to_parent[1]);
    close(args->pipe_to_child[0]);

    /* Wait for the child to signal us that it has done the unshare().
       The child signals by closing its write end, so EOF (0) is expected. */
    char ch;
    if (read(args->pipe_to_parent[0], &ch, 1) != 0) {
        fprintf(stderr, "Failure in parent: read from pipe returned != 0\n");
        exit(EXIT_FAILURE);
    }

    /* One line in /proc/<pid>/{uid,gid}_map format:
       first ID inside the namespace, first ID outside, range length */
    const char *map = "0 32000 32000";
    int fd;

    /* Change uid_map of child */
    char uidmap[256];
    snprintf(uidmap, sizeof uidmap, "/proc/%ld/uid_map", (long) pid);
    fd = open(uidmap, O_WRONLY);
    if (fd == -1)
        errExit("open uid_map");
    if (write(fd, map, strlen(map)) == -1)
        errExit("write uid_map");
    close(fd);

    /* Change gid_map of child */
    char gidmap[256];
    snprintf(gidmap, sizeof gidmap, "/proc/%ld/gid_map", (long) pid);
    fd = open(gidmap, O_WRONLY);
    if (fd == -1)
        errExit("open gid_map");
    if (write(fd, map, strlen(map)) == -1)
        errExit("write gid_map");
    close(fd);

    /* Close the write end of the pipe to signal the child */
    close(args->pipe_to_child[1]);

    if (!args->daemonize) {
        /* Wait for child to finish */
        int child_status;
        if (waitpid(pid, &child_status, 0) == -1)
            errExit("waitpid");

        /* WEXITSTATUS is only meaningful for a normal exit; report a
           signal death as 128 + signal instead of as success */
        if (WIFSIGNALED(child_status))
            exit(128 + WTERMSIG(child_status));
        if (WIFEXITED(child_status) && WEXITSTATUS(child_status))
            exit(WEXITSTATUS(child_status));
    }
    else if (args->pidfile == NULL) {
        /* No pid file requested: report the PID on stderr instead */
        fprintf(stderr, "PID is %d\n", (int) pid);
    }
    else {
        /* O_CREAT requires an explicit mode argument */
        int pidfd = open(args->pidfile, O_CREAT | O_WRONLY | O_TRUNC, 0644);
        if (pidfd == -1) {
            fprintf(stderr, "PID is %d (could not write pid file)\n", (int) pid);
        } else {
            char pidtext[24];
            snprintf(pidtext, sizeof pidtext, "%d", (int) pid);
            if (write(pidfd, pidtext, strlen(pidtext)) == -1) {
                fprintf(stderr, "PID is %d (could not write pid file)\n", (int) pid);
            }
            close(pidfd);
        }
    }

    exit(0);
}

/*
 * Print the command-line help to stderr and exit with EXIT_FAILURE.
 *
 * progname: name to show in the synopsis (callers pass argv[0]); falls
 *           back to "contain" when NULL.
 *
 * Declared with an explicit parameter because every caller passes
 * argv[0]; calling a function defined with an empty parameter list with
 * an argument is undefined behavior.
 */
void
usage(const char *progname)
{
    if (progname == NULL)
        progname = "contain";

    fprintf(stderr,
        "USAGE: %s -u <user> -c <dir> [OPTIONS] <program> [<args> ...]\n"
        "  -u <user>      [required] User to run the program as.\n"
        "  -c <dir>       [required] Directory to contain within.\n"
        "  -g <group>     Set group of the process. Strongly recommended.\n"
        "  -h <hostname>  Set hostname of the container.\n"
        "  -d             Daemonize the process.\n"
        "  -p <pidfile>   Write daemon PID to <pidfile>. Only if -d is specified.\n"
        "  -r <subdir>    Root pivot subdirectory for pivot_root() call. Default is /mnt/root.\n"
        "\n",
        progname);

    exit(EXIT_FAILURE);
}

int

main(int argc, char *argv[])

{

int opt;

struct child_args args;

args.uid = -1;

args.gid = 100;

args.container = NULL;

args.hostname = "ocl_container";

args.daemonize = 0;

args.rootpivot = "/mnt/root";

struct passwd *pw;

struct group *gr;

while ((opt = getopt(argc, argv, "+u:g:c:h:dp:r:")) != -1) {

switch (opt) {

case 'u':

pw = getpwnam(optarg);

if (pw == NULL) {

fprintf(stderr,"No such user %s\n",optarg);

exit(EXIT_FAILURE);

}

args.uid = pw->pw_uid;

break;

case 'g':

gr = getgrnam(optarg);

if (gr == NULL) {

fprintf(stderr,"No such group %s\n",optarg);

exit(EXIT_FAILURE);

}

args.gid = gr->gr_gid;

break;

case 'c':

args.container = optarg;

break;

case 'h':

args.hostname = optarg;

break;

case 'd':

args.daemonize = 1;

break;

case 'p':

args.pidfile = optarg;

break;

case 'r':

args.rootpivot = optarg;

break;

default: usage(argv[0]);

}

}

args.argv = &argv[optind];

if (args.uid == -1) usage(argv[0]);

if (args.container == NULL) usage(argv[0]);

if (args.argv[0] == NULL) usage(argv[0]);

do_parent(&args);

}

Need triple backtick code folding/syntax in nostr:npub18m76awca3y37hkvuneavuw6pjj4525fw90necxmadrvjg0sdy6qsngq955 🤔

Ya, lxc used cgroups and that's what systemd-nspawn does too

That's right I'm starting to remember. This program just does dead-simple namespace isolation without cgroups. Cgroups give you more flexibility around managing resources.

Hm, I thought it did use cgroups.. will have to read more about the internals. I know it can do resource limits and permission management so I assumed it was doing that with cgroups. Using cgtop I can see them.

It does setrlimit() but that is a separate older mechanism. I don't think it does cgroups. At least the man page doesn't say anything about cgroups.

I didn't realize it did the networking stuff. It does have a lot of options.

Nspawn is pretty wild. There is very little googling it and to learn it just means reading man pages and fiddling around. I see it has some cgroups code in it tho..

https://github.com/systemd/systemd/blob/main/src/nspawn/nspawn-cgroup.c

systemd nspawn was written right around the same time I wrote that program, because that was when the kernel developers had just finished up the namespacing isolation and everybody wanted to start using it. I don't think nspawn uses cgroups, I think it is a fancier version of what I posted, doing the namespace isolation and the pivot_root and not much else.

So a docker replacement should do:

1. namespace isolation, pivot_root: nspawn and containerd do this part (so does my C program but I wouldn't trust it to be modern enough)

2. cgroup setup - limits on memory, cpu, and disk that a process group can utilize

3. layered filesystem - overlayfs (AUFS and unionfs probably are worse)

4. syncing filesystem layers from a server - like git pull - probably just using git actually.

5. some kind of networking that lets the containered process talk to the internet

I'm probably missing something important.

Still not sure if the filesystem layering was really a necessary part of containers. Having built thousands of docker containers and managing deployment pipelines I noticed how hard it is to actually utilize the layers effectively. Almost any change can easily blow the cache of a docker build. Usually you should be updating the OS frequently anyway for security reasons and caching it can lead to laziness on updates.

Systemd nspawn uses bind mounts and isolated filesystem access (like a chroot) which is nice for performance. It is a slight paradigm shift on what a container means, so far so good though..

One thing I've been really liking so far too is you can Boot your containers for multi process... For my use cases this is way superior to single process. For example, haproxy does hot reloading by managing processes. Additional processes managed by systemd from inside a container are also really handy.

It seems to me that you should update your OS filesystem layer regularly, and many different containers that run on it would then benefit from that one update. That presents a risk that an update might break something, but as long as you don't jump major versions I think it is a reasonable strategy. This is how QubesOS layers things (it uses Xen virtualization) - you do OS updates in TemplateVMs and you build AppVMs to use an underlying TemplateVM.

Yep, that's the pattern I am using now.. you have the base OS filesystem from a debootstrap (Debian), then you can clone it and build the various app images on top. If you wanted to you could build smaller and smaller images, all depends on if you want to treat the container like a single process or have extra tools / shell and init system inside.

I would read the code but amethyst doesn't have a code viewer 😂 nostr:npub1gcxzte5zlkncx26j68ez60fzkvtkm9e0vrwdcvsjakxf9mu9qewqlfnj5z let's goooo

but I did scroll it and it was impressively short

🤯😲

I hated docker's terminology. What they call a container is actually a process. What they call an image is actually a container.