segfault/sfbin/sf
2023-12-13 16:23:45 +00:00

329 lines
13 KiB
Bash
Executable File

#! /usr/bin/env bash
# Change to CWD (in case CWD has been updated).
cd "$(pwd)" || exit
BINDIR="$(cd "$(dirname "${0}")" || exit; pwd)"
source "${BINDIR}/funcs.sh" || exit 254
command -v jq >/dev/null || ERREXIT 255 "Command 'jq' not found. Try ${CDC}apt-get install jq${CN}"
down()
{
local IFS
IFS=$'\n'
docker container prune -f
c=($(docker ps -f name=^lg --all --quiet))
[[ -n $c ]] && docker stop "${c[@]}"
docker-compose "$@"
docker network prune -f
# Sometimes docker gets into a state when it complains about overlappting
# network pool even that 'docker network ls' shows no networks beside
# the 3 default networks and with no containers running:
ip link show | cut -f2 -d" " | grep -E "^(br-)" | while read x; do x="${x%@*}"; x="${x%:*}"; [[ -z $x ]] && continue; ip link delete "${x}" down; done
}
[[ -z $SF_REDIS_AUTH ]] && {
SF_REDIS_AUTH=$(echo -n "Redis AUTH $SF_SEED" | sha512sum | base64 -w0)
SF_REDIS_AUTH="${SF_REDIS_AUTH//[^[:alnum:]]}"
SF_REDIS_AUTH="${SF_REDIS_AUTH:0:32}"
export SF_REDIS_AUTH
}
export SF_BACKING_FS="$(docker info --format '{{json .DriverStatus}}' | jq -r '.[0][1]')"
[[ "$SF_BACKING_FS" != "xfs" ]] && WARN "Backing FS is not XFS (SF_USER_ROOT_FS_SIZE wont work)"
[[ "$1" == down ]] && {
down "$@"
exit
}
[[ "$1" != up ]] && exec docker-compose "$@"
# HERE: "up"
[[ -z $SF_SEED ]] && ERREXIT 255 "SF_SEED= not set"
# Load variables from ENV but only those not already set in
# user's environemtn.
load_env()
{
local n
local v
local arr
local a
envfile="./.env"
[[ -n $SF_BASEDIR ]] && envfile="${SF_BASEDIR}/.env"
if [[ ! -f "${envfile}" ]]; then
WARN "Not found: \${SF_BASEDIR}/.env (${envfile})"
else
mapfile -t arr < <(grep -E -v '(^#|^$)' "${envfile}")
for a in "${arr[@]}"; do
n="${a%%=*}"
v="${a#*=}"
# Prefer user's environemtn over .env settings.
[[ -z "$(eval echo \$$n)" ]] && eval "${n}=\"${v}\""
done
fi
[[ -z $SF_BASEDIR ]] && ERREXIT 255 "SF_BASEDIR= not set in ${envfile}."
}
blockio_init()
{
local is_bfq
local n
# Check if there is BFQ-Scheduler support in the Kernel
for fn in /sys/class/block/*/queue/scheduler; do
[[ ! -f "${fn}" ]] && break
grep bfq "${fn}" >/dev/null || break
is_bfq=1
break
done
[[ -z $is_bfq ]] && {
# HERE: no BFQ support. Try load module.
# Try: apt install linux-modules-extra-aws
modprobe bfq || { WARN "No BFQ-Scheduler. Attacker can DoS block-IO."; return; }
is_bfq=1
}
# Return if BFQ is set
for fn in /sys/class/block/*/queue/scheduler; do
[[ ! -f "${fn}" ]] && break
echo bfq >"${fn}" || { WARN ""${fn%/queue*}": Failed to set BFQ scheduler."; return; }
done
# Odd bug. On some systems we set all correctly and docker still complains that
# it cant use Block IO weights. It appears to be a problem with cgroup v1?
# It can be fixed on v1 systems by using --cgroup-parent=/guest and creating:
# mkdir -p /sys/fs/cgroup/blkio/guest
# echo 1 >/sys/fs/cgroup/blkio/guest/blkio.bfq.weight
# => But then why cant docker fix this crap?
# https://github.com/moby/moby/issues/16173#issuecomment-1298432655
# Test if docker accepts --blkio-weight:
docker run --rm --blkio-weight=100 alpine true 2>&1 | grep "does not support Block" >/dev/null && { WARN "DOCKER: Your kernel does not support Block I/O weight."; return; }
}
sysinc()
{
local key
local val
key=$1
val=$2
[[ $(sysctl -n "$key") -ge $val ]] && return
sysctl -q -w "${key}=${val}" || WARN "Could not set '${key}=${val}'"
}
sysdec()
{
local key
local val
key=$1
val=$2
[[ $(sysctl -n "$key") -le $val ]] && return
sysctl -q -w "${key}=${val}" || WARN "Could not set '${key}=${val}'"
}
warn_file()
{
[[ -f "$1" ]] && return
WARN "Not found: $1"
}
warn_outdated()
{
local fn dst src
dst="${SF_BASEDIR}/${1}"
src="${BINDIR}/../${1}"
[[ ! -f "$dst" ]] && { WARN "Not found: $dst"; return; }
[[ ! -f "$src" ]] && ERREXIT 255 "Not found: $src"
# Installed file $dst is newer or equal than $src
[[ ! "$dst" -ot "$src" ]] && return
[[ $(stat -c%s "$dst") -eq $(stat -c%s "$src") ]] && return
WARN "$dst is outdated? Try ${CDC}touch $dst${CN} to ignore."
}
load_env
[[ -z $SF_DATADIR ]] && SF_DATADIR="${SF_BASEDIR}/data"
[[ -z $SF_SHMDIR ]] && SF_SHMDIR="/dev/shm/sf"
[[ -z $SF_HOST_MTU ]] && SF_HOST_MTU=1500
export SF_GUEST_MTU=$((SF_HOST_MTU - 80))
[[ ! -d "${SF_DATADIR}/user" ]] && mkdir -p "${SF_DATADIR}/user"
[[ ! -d "${SF_DATADIR}/share" ]] && mkdir -p "${SF_DATADIR}/share"
[[ ! -f "${SF_DATADIR}/share/GeoLite2-City.mmdb" ]] && [[ "${MAXMIND_KEY,,}" != "skip" ]] && {
WARN "Not found: data/share/GeoLite2-City.mmdb"
echo -e "Try \`curl 'https://download.maxmind.com/app/geoip_download?edition_id=GeoLite2-City&license_key=${MAXMIND_KEY:-KEY-NOT-SET}&suffix=tar.gz' | tar xfvz - --strip-components=1 --no-anchored -C '${SF_DATADIR}/share/' 'GeoLite2-City.mmdb'\`."
echo -e "Try ${CDC}MAXMIND_KEY=skip${CN} to disable. This will also disable limits by GEOIP and disable user tools like geoip and geoiphn."
}
[[ ! -f "${SF_DATADIR}/share/tor-exit-nodes.txt" ]] && {
WARN "Not found: data/share/tor-exit-nodes.txt"
echo -e "Try \`curl 'https://www.dan.me.uk/torlist/?exit' >'${SF_DATADIR}/share/tor-exit-nodes.txt'\`"
}
[[ ! -f "${SF_DATADIR}/share/relay-exit-nodes-mullvad.txt" ]] && WARN "Not found: data/share/relay-exit-nodes-mullvad.txt - ${CDM}See contrib/cronjob how to create it.${CN}"
[[ ! -f "${SF_DATADIR}/share/proxies.txt" ]] && WARN "Not found: data/share/proxies.txt (Mullvad proxies) - ${CDM}See contrib/cronjob how to create it.${CN}"
[[ ! -f "${SF_BASEDIR}/config/etc/relay-exit-nodes-global.txt" ]] && {
WARN "Not found: ${SF_BASEDIR}/config/etc/relay-exit-nodes-global.txt"
echo -e "\
==> Log in from global relays is not controlled. We use a private list from Blind Mouse.
==> Generate your own list (see THC's Tips & Tricks).
==> Use ${CDC}touch ${SF_BASEDIR}/config/etc/relay-exit-nodes-global.txt${CN} to stop this warning."
}
chmod 644 "${SF_BASEDIR}/config/etc/relay-exit-nodes-global.txt" 2>/dev/null
[[ -z $SF_OVERLAYDIR ]] && [[ -d "${SF_BASEDIR}/docker/overlay2" ]] && export SF_OVERLAYDIR="${SF_BASEDIR}/docker/overlay2"
[[ -z $SF_IP ]] && {
command -v dig >/dev/null || ERREXIT 255 "Command 'dig' not found. Try ${CDC}apt-get install dnsutils${CN}"
export SF_IP=$(dig +short "$SF_FQDN" 2>/dev/null | grep -v '\.$')
[[ -z $SF_IP ]] && ERREXIT 255 "Could not get SF_IP..."
WARN "SF_IP not set in .env. Using '$SF_IP'."
}
# xfs_init_quota "${SF_DATADIR}/everyone-root" "everyone" 100 16384 16G
# Enable BFQ on all block devices to allow cgroup's io.weight
# FIXME: One day but this into udev/startup scripts and only for
# device that we are using...
blockio_init
# BUG-ARP-CACHE:
# User can cause arp-table overflow. The kernel limit is global for all arp tables
# but each container gets its own arp table. All containers just put pressure on the global
# limit.
# Attack: A user can spawn multiple containers and create 'incomplete' arp entries in its own
# table. Those entries reduce the amount of entries avaialble for other containers (it's a global limit
# and not a limit per container).
#
# Oddity: Docker-compose is making the host name of each service available (e.g sf-redis, sf-tor etc).
# This is not done via an /etc/hosts entry but handled by Docker internally. The problem is that
# 'somewhere' docker (internally) needs an arp-entry (which fails during an attack). Then the
# name (e.g. sf-redis or so) can not be resolved and all goes to shits.
#
# Tweaking base_reachable_time_ms and gc_stale_time has no effect. Best we can do:
# 1. Use static IPs where possible for inter-container communication.
# 2. Limit the User's local network (to /22 or /24)
# 3. Increase the global size of the kernel's arp table (gc_thresh3)
# These are global and shared among all containers
# Increase unless already higher
sysinc net.ipv4.neigh.default.gc_thresh3 65536
sysinc net.netfilter.nf_conntrack_buckets 16384 # default is 65536 for >4GB systems
sysinc net.netfilter.nf_conntrack_max 1048576
# find /proc/*/fd -lname anon_inode:inotify | cut -d/ -f3 | xargs -I '{}' -- ps --no-headers -o '%p %U %c' -p '{}' | uniq -c | sort -nr
sysinc fs.inotify.max_user_instances 1024
# Conntrack & Namespaces is a mess. Restricting these inside a container
# only results that the connection is dropped sooner but the state still
# remains on the host's container. Thus we also reduce the host's timers
# to deal with this. The host does not do CONNTRACKING and thus these
# settings should only affect the containers.
# Decrease unless already lower.
sysdec net.netfilter.nf_conntrack_tcp_timeout_syn_sent 10
sysdec net.netfilter.nf_conntrack_tcp_timeout_syn_recv 5 # default is 30, 5 because of reverse tunnels
sysdec net.netfilter.nf_conntrack_tcp_timeout_last_ack 5 # default is 30
sysdec net.netfilter.nf_conntrack_tcp_timeout_fin_wait 10 # default is 120
sysdec net.netfilter.nf_conntrack_tcp_timeout_close 1 # default is 10
sysdec net.netfilter.nf_conntrack_tcp_timeout_close_wait 10 # default is 60
sysdec net.netfilter.nf_conntrack_tcp_timeout_unacknowledged 30 # default is 300
sysdec net.netfilter.nf_conntrack_tcp_timeout_established 10800 # 3h, default is 5 days
sysdec net.netfilter.nf_conntrack_icmp_timeout 10 # default is 30
sysdec net.netfilter.nf_conntrack_udp_timeout 10 # default is 30
# Each Hugepagesize is 2MB (grep HUGE /proc/meminfo)
# 512 => 1g as HUGE
# 8192 => 16g as HUGE
[[ ! $(cat /proc/sys/vm/nr_hugepages) -gt 0 ]] && WARN "Huge Tables not set. Consider ${CDC}echo \"vm.nr_hugepages=8192\" >>/etc/sysctl.conf && sysctl -w vm.nr_hugepages=8192${CN}"
# Warn for outdated files in /sf/config/* (that are older and different size)
mapfile -t arr < <(cd "${BINDIR}/../" || exit; find config -type f)
for fn in "${arr[@]}"; do
warn_outdated "$fn"
done
# Check if there are any fils in /sf/sfbin that are not equal to ./sfbin
for x in "${BINDIR}/"*; do
[[ ! -e "$x" ]] && WARN "Oops. Files missing in ${BINDIR}/*???"
str=$(md5sum "$x")
src=${str%% *}
x=$(basename "$x")
str=$(md5sum "${SF_BASEDIR}/sfbin/${x}" 2>/dev/null)
dst=${str%% *}
[[ $src != $dst ]] && WARN "${SF_BASEDIR}/sfbin/${x} is outdated. Please update with ${CDC}${BINDIR}/${x}${CN}"
done
# Make sure /dev/shm is 'shared'
[[ "$(findmnt -no TARGET,PROPAGATION /dev/shm)" != *"shared"* ]] && {
mount --make-shared /dev/shm/ || ERREXIT 252
}
systemctl start sf.slice || WARN 'Could not start sf.slice'
systemctl start sf-guest.slice || WARN 'Could not start sf-guest.slice'
systemctl status sf.slice | grep Segfault >/dev/null || WARN 'Bad start sf.slice. Does not belong to Segfault.'
systemctl status sf-guest.slice | grep Segfault >/dev/null || WARN 'Bad start sf-guest.slice. Does not belong to Segfault.'
SF_CG_DIR="/sys/fs/cgroup"
[[ -d "/sys/fs/cgroup/unified" ]] && {
SF_CG_DIR="/sys/fs/cgroup/unified"
# for cgroupv1 docker-run expects the absolute hierarchy path (for --cgroup-parent):
export SF_CG_PARENT="sf.slice/sf-guest.slice"
}
str=$(mount | grep ^cgroup2 | grep -F "$SF_CG_DIR" )
[[ $str == *'nsdelegate'* ]] && {
# HERE: cgroup2 is in use.
echo -e >&2 "[$(date '+%F %T' -u)] [${CDY}WARN${CN}] ${SF_CG_DIR} is mounted with nsdelegate. Disabling nsdelegate."
str=${str##*\(}
str=${str%\)*}
# We need to move encfsd to the user's cgroup: From sf.slice (sf-encfsd) to sf.slice/sf-guest.slice.
# We need to turn of "nsdelegate" as otherwise there is no (?) way moving it.
# (write() to cgroup.procs returns ENOENT if nsdelegate is enabled.)
# There is no 'nonsdelegate' and removing nsdelegate requires a hack:
# mount -t cgroup2 none /mnt && umount /mnt
# mount -o remount,rw,nosuid,nodev,noexec,relatime,memory_recursiveprot /sys/fs/cgroup
# Test with:
# docker run --rm -v /sys/fs/cgroup:/sys/fs/cgroup -it ubuntu bash -c 'sleep 31339 & echo $! >/sys/fs/cgroup/sf.slice/sf-guest.slice/docker-ANY-RUNNING-CONTAINER-ID-HERE.scope/cgroup.procs && echo $! OK'
mount -t cgroup2 none /mnt
umount /mnt
str="${str/,nsdelegate/}"
str="${str/nsdelegate,/}"
mount -o "remount,${str}" "${SF_CG_DIR}" || ERREXIT 255
}
# sf.slice's parent is root (/). Any siblings (e.g. /user.slice, /system.slice) also need to do
# IO Accounting or otherwise /sf.slice can starve those.
systemctl status system.slice | head -n8 | grep -F "IO: " &>/dev/null || WARN "IO Accounting not enabled. Check /etc/systemd/system.conf"
grep -F sf.slice /etc/docker/daemon.json &>/dev/null && WARN "Obsolete sf.slice found in /etc/docker/daemon.json. Remove that line."
[[ ! -d /var/lib/lxcfs/proc ]] && WARN "LG will report wrong uptime etc. Try ${CDC}apt-get install lxcfs${CN}?"
# If there was a warning then wait...
WARN_ENTER
# Delete stale run files..
[[ -d "${SF_SHMDIR}/run/encfsd/user" ]] && rm -rf "${SF_SHMDIR}/run/encfsd/user"
[[ ! -d "${SF_SHMDIR}/run/redis/sock" ]] && mkdir -p "${SF_SHMDIR}/run/redis/sock"
chmod 700 "${SF_SHMDIR}/run/redis"
chown 999 "${SF_SHMDIR}/run/redis/sock" # docker/redis user
chmod 711 "${SF_SHMDIR}/run/redis/sock"
# exec docker-compose "$@"
docker-compose "$@"
ret=$?
# If not started as background (-d): run DOWN.
[[ "$*" != *" -d"* ]] && { down "down"; exit; }
echo -e "May need to run \`${CDC}$0 down${CN}\` (code=$ret)"