Photo by Fancycrave on Unsplash (Edited)

What is a container?

A story about picking apart how and why containers work the way they do

History

# Get a shell
$ cd $(mktemp -d)
$ mkdir bin
$ cp $(which sh) bin/bash
# Find shared libraries required for shell
$ ldd bin/bash
linux-vdso.so.1 (0x00007ffe69784000)
/lib/x86_64-linux-gnu/libsnoopy.so (0x00007f6cc4c33000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f6cc4a42000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f6cc4a21000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f6cc4a1c000)
/lib64/ld-linux-x86-64.so.2 (0x00007f6cc4c66000)
# Duplicate libraries into root
$ mkdir -p lib64 lib/x86_64-linux-gnu
$ cp /lib/x86_64-linux-gnu/libsnoopy.so \
/lib/x86_64-linux-gnu/libc.so.6 \
/lib/x86_64-linux-gnu/libpthread.so.0 \
/lib/x86_64-linux-gnu/libdl.so.2 \
lib/x86_64-linux-gnu/
$ cp /lib64/ld-linux-x86-64.so.2 lib64/
# Change into that root
$ sudo chroot .
# Test the chroot
# ls
/bin/bash: 1: ls: not found
#

Definition

Implementation

Kernel feature isolation: namespaces

# Scratch space
$ cd $(mktemp -d)
# Fork is required to spawn new processes, and proc is mounted to give accurate process information
$ sudo unshare \
--fork \
--pid \
--mount-proc \
--net
# Here we see that we only have access to the loopback interface
root@sw-20160616-01:/tmp/tmp.XBESuNMJJS# ip addr
1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
# Here we see that we can only see the first process (bash) and our `ps aux` invocation
root@sw-20160616-01:/tmp/tmp.XBESuNMJJS# ps aux
USER PID %CPU %MEM VSZ RSS TTY STAT START TIME COMMAND
root 1 0.3 0.0 8304 5092 pts/7 S 05:48 0:00 -bash
root 5 0.0 0.0 10888 3248 pts/7 R+ 05:49 0:00 ps aux

Resource isolation: control groups

# Create a cgroup called "me"
$ sudo mkdir /sys/fs/cgroup/memory/me
# Allocate the cgroup a max of 100 MB of memory
$ echo '100000000' | sudo tee /sys/fs/cgroup/memory/me/memory.limit_in_bytes
# Move this process into the cgroup
$ echo $$ | sudo tee /sys/fs/cgroup/memory/me/cgroup.procs
5924

Userland isolation: seccomp

$ grep CONFIG_SECCOMP= /boot/config-$(uname -r)
# Our system supports seccomp
CONFIG_SECCOMP=y
$ docker run --rm \
-it \
--security-opt seccomp=/path/to/seccomp/profile.json \
hello-world

Distribution: the union file system

# Scratch space
$ cd $(mktemp -d)
# Create a docker file
$ cat <<EOF > Dockerfile
FROM debian:buster
# Create a test directory
RUN mkdir /test
# Create a bunch of spam files
RUN echo $(date) > /test/a
RUN echo $(date) > /test/b
RUN echo $(date) > /test/c
EOF
# Build the image
$ docker build .
Sending build context to Docker daemon 4.096kB
Step 1/5 : FROM debian:buster
---> ebdc13caae1e
Step 2/5 : RUN mkdir /test
---> Running in a9c0fa1a56c7
Removing intermediate container a9c0fa1a56c7
---> 6837541a46a5
Step 3/5 : RUN echo Sat 30 Mar 18:05:24 CET 2019 > /test/a
---> Running in 8b61ca022296
Removing intermediate container 8b61ca022296
---> 3ea076dcea98
Step 4/5 : RUN echo Sat 30 Mar 18:05:24 CET 2019 > /test/b
---> Running in 940d5bcaa715
Removing intermediate container 940d5bcaa715
---> 07b2f7a4dff8
Step 5/5 : RUN echo Sat 30 Mar 18:05:24 CET 2019 > /test/c
---> Running in 251f5d00b55f
Removing intermediate container 251f5d00b55f
---> 0122a70ad0a3
Successfully built 0122a70ad0a3
$ docker run \
--rm=true \
-it \
0122a70ad0a3 \
/bin/bash
$ cd /test
$ ls
a b c
$ cat *
Sat 30 Mar 18:05:24 CET 2019
Sat 30 Mar 18:05:24 CET 2019
Sat 30 Mar 18:05:24 CET 2019
$ docker run \
--rm=true \
-it \
07b2f7a4dff8 \
/bin/bash
$ ls test
a b
$ docker history 0122a70ad0a3
IMAGE CREATED CREATED BY SIZE COMMENT
0122a70ad0a3 5 minutes ago /bin/sh -c echo Sat 30 Mar 18:05:24 CET 2019… 29B
07b2f7a4dff8 5 minutes ago /bin/sh -c echo Sat 30 Mar 18:05:24 CET 2019… 29B
3ea076dcea98 5 minutes ago /bin/sh -c echo Sat 30 Mar 18:05:24 CET 2019… 29B
6837541a46a5 5 minutes ago /bin/sh -c mkdir /test 0B
ebdc13caae1e 12 months ago /bin/sh -c #(nop) CMD ["bash"] 0B
<missing> 12 months ago /bin/sh -c #(nop) ADD file:2219cecc89ed69975… 106MB
$ docker info | grep Storage
Storage Driver: overlay2
# Scratch space
$ cd $(mktemp -d)
# Create some layers
$ mkdir \
lower \
upper \
workdir \
overlay
# Create some files that represent the layers
$ touch lower/i-am-the-lower
$ touch upper/i-am-the-upper
# Create the layered filesystem at overlay with lower, upper and workdir
$ mount -t overlay \
-o lowerdir=lower,upperdir=upper,workdir=workdir \
./overlay \
overlay
# List the directory
$ ls overlay/
i-am-the-lower i-am-the-upper

Connectivity: networking

# Create a new network namespace
$ sudo unshare --fork --net
# List the ethernet devices with associated ip addresses
$ ip addr
1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
# List all iptables rules
root@sw-20160616-01:/home/andrewhowden# iptables -L
Chain INPUT (policy ACCEPT)
target prot opt source destination
Chain FORWARD (policy ACCEPT)
target prot opt source destination
Chain OUTPUT (policy ACCEPT)
target prot opt source destination
# List all network routes
$ ip route show
$ ping 127.0.0.1
PING 127.0.0.1 (127.0.0.1): 56 data bytes
ping: sending packet: Network is unreachable
$ ip link set lo up
root@sw-20160616-01:/home/andrewhowden# ip addr
1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
inet 127.0.0.1/8 scope host lo
valid_lft forever preferred_lft forever
inet6 ::1/128 scope host
valid_lft forever preferred_lft forever
# Test the loopback adapter
$ ping 127.0.0.1
PING 127.0.0.1 (127.0.0.1): 56 data bytes
64 bytes from 127.0.0.1: icmp_seq=0 ttl=64 time=0.092 ms
64 bytes from 127.0.0.1: icmp_seq=1 ttl=64 time=0.068 ms
$ echo $$
18171
$ sudo ip link add veth0 type veth peer name veth0 netns 18171
# Within the container
$ ip addr
1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN group default qlen 1000
link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
2: veth0@if7: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN group default qlen 1000
link/ether 16:34:52:54:a2:a1 brd ff:ff:ff:ff:ff:ff link-netnsid 0
$ ip route show
# No output
# On the host
$ ip addr add 192.168.24.1 dev veth0
# Within the container
$ ip address add 192.168.24.10 dev veth0
# Both host and container
$ ip link set veth0 up
# Both host and container
$ ip route add 192.168.24.0/24 dev veth0
# Within container
$ ping 192.168.24.1
PING 192.168.24.1 (192.168.24.1): 56 data bytes
64 bytes from 192.168.24.1: icmp_seq=0 ttl=64 time=0.149 ms
64 bytes from 192.168.24.1: icmp_seq=1 ttl=64 time=0.096 ms
64 bytes from 192.168.24.1: icmp_seq=2 ttl=64 time=0.104 ms
64 bytes from 192.168.24.1: icmp_seq=3 ttl=64 time=0.100 ms
# Within container
$ ping google.com
ping: unknown host
# On the host (IP forwarding must be enabled in the namespace that routes the container's traffic)
$ echo 1 > /proc/sys/net/ipv4/ip_forward
# On the host
# Forward packets from the container to the host adapter
$ iptables -A FORWARD -i veth0 -o wlp2s0 -j ACCEPT
# Forward packets that have been established via egress from the host adapter back to the container
$ iptables -A FORWARD -i wlp2s0 -o veth0 -m state --state ESTABLISHED,RELATED -j ACCEPT
# Relabel the IPs for the container so return traffic will be routed correctly
$ iptables -t nat -A POSTROUTING -o wlp2s0 -j MASQUERADE
# Within the container
$ ip route add default via 192.168.24.1 dev veth0
$ ping google.com
PING google.com (172.217.22.14): 56 data bytes
64 bytes from 172.217.22.14: icmp_seq=0 ttl=55 time=16.456 ms
64 bytes from 172.217.22.14: icmp_seq=1 ttl=55 time=15.102 ms
64 bytes from 172.217.22.14: icmp_seq=2 ttl=55 time=34.369 ms
64 bytes from 172.217.22.14: icmp_seq=3 ttl=55 time=15.319 ms

Landscape review

Systems Updates

Init within container

In Conclusion

References

See https://www.andrewhowden.com/