segfault/sfbin/sf

#! /usr/bin/env bash

# Change to CWD (in case CWD has been updated): re-enter the current working
# directory by its path name.
cd "$(pwd)" || exit

# Absolute directory this script lives in; funcs.sh is loaded from next to it.
BINDIR="$(cd "$(dirname "${0}")" || exit; pwd)"
# NOTE(review): WARN, ERREXIT, WARN_ENTER and the CD*/CN variables used below
# are not defined in this file - presumably funcs.sh provides them.
source "${BINDIR}/funcs.sh" || exit 254

# jq is needed further down to parse the output of 'docker info'.
command -v jq >/dev/null || ERREXIT 255 "Command 'jq' not found. Try ${CDC}apt-get install jq${CN}"

# Stop the deployment and clean up what docker leaves behind.
# Arguments: "$@" is handed verbatim to docker-compose (callers pass "down" as
#            the first argument).
# Steps: prune stopped containers, stop all containers whose name starts with
#        "lg", run docker-compose, prune networks and finally delete left-over
#        "br-*" bridge interfaces.
down()
{
	local c
	local x

	docker container prune -f
	# One container ID per line; read them into an array without relying on
	# word-splitting (and without leaking 'c' into the global scope).
	mapfile -t c < <(docker ps -f name=^lg --all --quiet)
	[[ ${#c[@]} -gt 0 ]] && docker stop "${c[@]}"
	docker-compose "$@"
	docker network prune -f
	# Sometimes docker gets into a state where it complains about an overlapping
	# network pool even though 'docker network ls' shows no networks besides
	# the 3 default networks and no containers are running:
	# => delete every remaining br-* interface by hand.
	ip link show | cut -f2 -d" " | grep -E "^(br-)" | while IFS= read -r x; do
		# Strip a trailing "@parent" and the trailing ":" from the name.
		x="${x%@*}"
		x="${x%:*}"
		[[ -z $x ]] && continue
		ip link delete "${x}" down
	done
}

# Unless the caller supplied SF_REDIS_AUTH, derive it from SF_SEED: hash the
# string, base64 the sha512sum output, keep only alphanumeric characters and
# use the first 32 of them. The result is exported.
if [[ -z $SF_REDIS_AUTH ]]; then
	SF_REDIS_AUTH=$(printf '%s' "Redis AUTH $SF_SEED" | sha512sum | base64 -w0 | tr -d -c '[:alnum:]')
	export SF_REDIS_AUTH="${SF_REDIS_AUTH:0:32}"
fi

# Export the second field of the first entry of docker's DriverStatus list as
# SF_BACKING_FS and warn if it is anything other than "xfs".
export SF_BACKING_FS="$(docker info --format '{{json .DriverStatus}}' | jq -r '.[0][1]')"
[[ "$SF_BACKING_FS" != "xfs" ]] && WARN "Backing FS is not XFS (SF_USER_ROOT_FS_SIZE wont work)"

# Sub-command dispatch:
#   down      -> handled by down() above, then exit.
#   up        -> continue with the host preparation below.
#   any other -> replace this process with docker-compose, passing all arguments.
[[ "$1" == down ]] && {
	down "$@"
	exit
}
[[ "$1" != up ]] && exec docker-compose "$@"

# HERE: "up"

# 'up' requires SF_SEED to be set before the .env file is loaded below.
[[ -z $SF_SEED ]] && ERREXIT 255 "SF_SEED= not set"

# Load variables from ENV but only those not already set in
# user's environemtn.
load_env()
{
	local n
	local v
	local arr
	local a
	envfile="./.env"

	[[ -n $SF_BASEDIR ]] && envfile="${SF_BASEDIR}/.env"
	if [[ ! -f "${envfile}" ]]; then
		WARN "Not found: \${SF_BASEDIR}/.env (${envfile})"
	else
		mapfile -t arr < <(grep -E -v '(^#|^$)' "${envfile}")
		for a in "${arr[@]}"; do

			n="${a%%=*}"
			v="${a#*=}"
			# Prefer user's environemtn over .env settings.
			[[ -z "$(eval echo \$$n)" ]] && eval "${n}=\"${v}\""
		done
	fi

	[[ -z $SF_BASEDIR ]] && ERREXIT 255 "SF_BASEDIR= not set in ${envfile}."
}

# Switch every block device to the BFQ I/O scheduler and check that docker
# accepts --blkio-weight.
# Emits a WARN and returns early when BFQ cannot be loaded, when setting the
# scheduler on a device fails, or when docker reports that Block I/O weight
# is not supported.
blockio_init()
{
	local is_bfq
	local fn

	# Check if there is BFQ-Scheduler support in the Kernel.
	# Only the first block device is inspected.
	for fn in /sys/class/block/*/queue/scheduler; do
		[[ ! -f "${fn}" ]] && break
		grep bfq "${fn}" >/dev/null || break
		is_bfq=1
		break
	done

	[[ -z $is_bfq ]] && {
		# HERE: no BFQ support. Try load module.
		# Try: apt install linux-modules-extra-aws
		modprobe bfq || { WARN "No BFQ-Scheduler. Attacker can DoS block-IO."; return; }
		is_bfq=1
	}

	# Select BFQ on every block device. Return on the first failure.
	for fn in /sys/class/block/*/queue/scheduler; do
		[[ ! -f "${fn}" ]] && break
		echo bfq >"${fn}" || { WARN "${fn%/queue*}: Failed to set BFQ scheduler."; return; }
	done

	# Odd bug. On some systems we set all correctly and docker still complains that
	# it can't use Block IO weights. It appears to be a problem with cgroup v1?
	# It can be fixed on v1 systems by using --cgroup-parent=/guest and creating:
	#   mkdir -p /sys/fs/cgroup/blkio/guest
	#   echo 1 >/sys/fs/cgroup/blkio/guest/blkio.bfq.weight
	# => But then why can't docker fix this crap?
	# https://github.com/moby/moby/issues/16173#issuecomment-1298432655
	# Test if docker accepts --blkio-weight:
	docker run --rm --blkio-weight=100 alpine  true 2>&1 | grep "does not support Block" >/dev/null && { WARN "DOCKER: Your kernel does not support Block I/O weight."; return; }
}

# Raise a sysctl value to at least the given minimum.
# Arguments: $1 - sysctl key
#            $2 - minimum value
# Nothing is written when the current value is already greater or equal.
# A failed write is reported through WARN.
sysinc()
{
	local name=$1
	local floor=$2
	local current

	current=$(sysctl -n "$name")
	if [[ $current -ge $floor ]]; then
		return
	fi

	if ! sysctl -q -w "${name}=${floor}"; then
		WARN "Could not set '${name}=${floor}'"
	fi
}

# Lower a sysctl value to at most the given maximum.
# Arguments: $1 - sysctl key
#            $2 - maximum value
# Nothing is written when the current value is already lower or equal.
# A failed write is reported through WARN.
sysdec()
{
	local key
	local val
	local cur
	key=$1
	val=$2
	cur=$(sysctl -n "$key")
	# Skip the write only when a number was actually read. An unreadable key
	# yields an empty string which would otherwise compare as 0 ("already
	# low enough") and hide the problem; fall through instead so that the
	# failing write is reported, just like sysinc does.
	[[ $cur =~ ^-?[0-9]+$ ]] && [[ $cur -le $val ]] && return
	sysctl -q -w "${key}=${val}" || WARN "Could not set '${key}=${val}'"
}

# Emit a warning when $1 is not an existing regular file.
warn_file()
{
	if [[ ! -f "$1" ]]; then
		WARN "Not found: $1"
	fi
}

# Warn when an installed file looks older than the one shipped next to this
# script.
# Arguments: $1 - path relative to both ${SF_BASEDIR} and ${BINDIR}/..
# Globals:   SF_BASEDIR, BINDIR, CDC, CN (read)
# A missing installed file is only warned about; a missing shipped file is
# reported through ERREXIT 255. The "outdated" warning is emitted only when
# the installed file is older than the shipped one AND their sizes differ.
warn_outdated()
{
    local installed="${SF_BASEDIR}/${1}"
    local shipped="${BINDIR}/../${1}"
    local installed_size shipped_size

    if [[ ! -f "$installed" ]]; then
        WARN "Not found: $installed"
        return
    fi
    if [[ ! -f "$shipped" ]]; then
        ERREXIT 255 "Not found: $shipped"
    fi

    # Installed file is newer than (or as old as) the shipped one: fine.
    if ! [[ "$installed" -ot "$shipped" ]]; then
        return 0
    fi

    # Older but identical in size: treated as unchanged.
    installed_size=$(stat -c%s "$installed")
    shipped_size=$(stat -c%s "$shipped")
    if [[ $installed_size -eq $shipped_size ]]; then
        return 0
    fi

    WARN "$installed is outdated? Try ${CDC}touch $installed${CN} to ignore."
}

# Pull in settings from the .env file, then fill in defaults for everything
# that is still empty.
load_env
[[ -z $SF_DATADIR ]] && SF_DATADIR="${SF_BASEDIR}/data"
[[ -z $SF_SHMDIR ]] && SF_SHMDIR="/dev/shm/sf"
[[ -z $SF_HOST_MTU ]] && SF_HOST_MTU=1500
# The guest MTU is exported as 80 less than the host MTU.
export SF_GUEST_MTU=$((SF_HOST_MTU - 80))

# Make sure the data directories exist.
[[ ! -d "${SF_DATADIR}/user" ]] && mkdir -p "${SF_DATADIR}/user"
[[ ! -d "${SF_DATADIR}/share" ]] && mkdir -p "${SF_DATADIR}/share"

# Warn about optional data files that are missing and print a hint how to
# obtain them. The GeoIP check is skipped when MAXMIND_KEY is "skip"
# (compared case-insensitively).
[[ ! -f "${SF_DATADIR}/share/GeoLite2-City.mmdb" ]] && [[ "${MAXMIND_KEY,,}" != "skip" ]] && {
	WARN "Not found: data/share/GeoLite2-City.mmdb"
	echo -e "Try \`curl 'https://download.maxmind.com/app/geoip_download?edition_id=GeoLite2-City&license_key=${MAXMIND_KEY:-KEY-NOT-SET}&suffix=tar.gz' | tar xfvz  - --strip-components=1  --no-anchored -C '${SF_DATADIR}/share/' 'GeoLite2-City.mmdb'\`."
	echo -e "Try ${CDC}MAXMIND_KEY=skip${CN} to disable. This will also disable limits by GEOIP and disable user tools like geoip and geoiphn."
}

[[ ! -f "${SF_DATADIR}/share/tor-exit-nodes.txt" ]] && {
	WARN "Not found: data/share/tor-exit-nodes.txt"
	echo -e "Try \`curl 'https://www.dan.me.uk/torlist/?exit' >'${SF_DATADIR}/share/tor-exit-nodes.txt'\`"
}

[[ ! -f "${SF_DATADIR}/share/relay-exit-nodes-mullvad.txt" ]] && WARN "Not found: data/share/relay-exit-nodes-mullvad.txt - ${CDM}See contrib/cronjob how to create it.${CN}"

[[ ! -f "${SF_DATADIR}/share/proxies.txt" ]] && WARN "Not found: data/share/proxies.txt (Mullvad proxies) - ${CDM}See contrib/cronjob how to create it.${CN}"

[[ ! -f "${SF_BASEDIR}/config/etc/relay-exit-nodes-global.txt" ]] && {
	WARN "Not found: ${SF_BASEDIR}/config/etc/relay-exit-nodes-global.txt"
	echo -e "\
==> Log in from global relays is not controlled. We use a private list from Blind Mouse.
==> Generate your own list (see THC's Tips & Tricks).
==> Use ${CDC}touch ${SF_BASEDIR}/config/etc/relay-exit-nodes-global.txt${CN} to stop this warning."
}
# Best effort: errors (e.g. the file does not exist) are deliberately ignored.
chmod 644 "${SF_BASEDIR}/config/etc/relay-exit-nodes-global.txt" 2>/dev/null

# Default SF_OVERLAYDIR to ${SF_BASEDIR}/docker/overlay2, but only if that
# directory exists.
[[ -z $SF_OVERLAYDIR ]] && [[ -d "${SF_BASEDIR}/docker/overlay2" ]] && export SF_OVERLAYDIR="${SF_BASEDIR}/docker/overlay2"

# Without an explicit SF_IP, resolve SF_FQDN with dig. Answer lines ending in
# a dot (host names such as CNAME targets) are filtered out.
[[ -z $SF_IP ]] && {
	command -v dig >/dev/null || ERREXIT 255 "Command 'dig' not found. Try ${CDC}apt-get install dnsutils${CN}"
	export SF_IP=$(dig +short "$SF_FQDN" 2>/dev/null | grep -v '\.$')
	[[ -z $SF_IP ]] && ERREXIT 255 "Could not get SF_IP..."
	WARN "SF_IP not set in .env. Using '$SF_IP'."
}

# xfs_init_quota "${SF_DATADIR}/everyone-root" "everyone" 100 16384 16G

# Enable BFQ on all block devices to allow cgroup's io.weight
# FIXME: One day put this into udev/startup scripts and only for
# devices that we are using...
blockio_init

# BUG-ARP-CACHE:
# User can cause arp-table overflow. The kernel limit is global for all arp tables
# but each container gets its own arp table. All containers just put pressure on the global
# limit.
# Attack: A user can spawn multiple containers and create 'incomplete' arp entries in its own
# table. Those entries reduce the amount of entries available for other containers (it's a global limit
# and not a limit per container).
#
# Oddity: Docker-compose is making the host name of each service available (e.g sf-redis, sf-tor etc).
# This is not done via an /etc/hosts entry but handled by Docker internally. The problem is that
# 'somewhere' docker (internally) needs an arp-entry (which fails during an attack). Then the
# name (e.g. sf-redis or so) can not be resolved and all goes to shits.
#
# Tweaking base_reachable_time_ms and gc_stale_time has no effect. Best we can do:
# 1. Use static IPs where possible for inter-container communication.
# 2. Limit the User's local network (to /22 or /24)
# 3. Increase the global size of the kernel's arp table (gc_thresh3)

# These are global and shared among all containers
# Increase unless already higher
sysinc net.ipv4.neigh.default.gc_thresh3 65536
sysinc net.netfilter.nf_conntrack_buckets 16384 # default is 65536 for >4GB systems
sysinc net.netfilter.nf_conntrack_max 1048576
# find /proc/*/fd -lname anon_inode:inotify | cut -d/ -f3 | xargs -I '{}' -- ps --no-headers -o '%p %U %c' -p '{}' | uniq -c | sort -nr
sysinc fs.inotify.max_user_instances 1024

# Conntrack & Namespaces is a mess. Restricting these inside a container
# only results that the connection is dropped sooner but the state still
# remains on the host's container. Thus we also reduce the host's timers
# to deal with this. The host does not do CONNTRACKING and thus these
# settings should only affect the containers.
# Decrease unless already lower.
sysdec net.netfilter.nf_conntrack_tcp_timeout_syn_sent 10
sysdec net.netfilter.nf_conntrack_tcp_timeout_syn_recv 5        # default is 30, 5 because of reverse tunnels
sysdec net.netfilter.nf_conntrack_tcp_timeout_last_ack 5        # default is 30
sysdec net.netfilter.nf_conntrack_tcp_timeout_fin_wait 10       # default is 120
sysdec net.netfilter.nf_conntrack_tcp_timeout_close 1           # default is 10
sysdec net.netfilter.nf_conntrack_tcp_timeout_close_wait 10     # default is 60
sysdec net.netfilter.nf_conntrack_tcp_timeout_unacknowledged 30 # default is 300
sysdec net.netfilter.nf_conntrack_tcp_timeout_established 10800 # 3h, default is 5 days
sysdec net.netfilter.nf_conntrack_icmp_timeout 10 # default is 30
sysdec net.netfilter.nf_conntrack_udp_timeout 10  # default is 30

# Each Hugepagesize is 2MB (grep HUGE /proc/meminfo)
#  512 => 1g as HUGE
# 8192 => 16g as HUGE
[[ ! $(cat /proc/sys/vm/nr_hugepages) -gt 0 ]] && WARN "Huge Tables not set. Consider ${CDC}echo \"vm.nr_hugepages=8192\" >>/etc/sysctl.conf && sysctl -w vm.nr_hugepages=8192${CN}"

# Warn for outdated files in /sf/config/* (that are older and different size)
# The list of files is taken from the config/ directory next to ${BINDIR}.
mapfile -t arr < <(cd "${BINDIR}/../" || exit; find config -type f)
for fn in "${arr[@]}"; do
    warn_outdated "$fn"
done

# Check if there are any files in /sf/sfbin that are not equal to ./sfbin
# The comparison is done by MD5 checksum. A file that cannot be read in
# ${SF_BASEDIR}/sfbin yields an empty checksum and therefore also warns.
for x in "${BINDIR}/"*; do
	[[ ! -e "$x" ]] && WARN "Oops. Files missing in ${BINDIR}/*???"
	str=$(md5sum "$x")
	src=${str%% *}
	x=$(basename "$x")
	str=$(md5sum "${SF_BASEDIR}/sfbin/${x}" 2>/dev/null)
	dst=${str%% *}
	[[ $src != $dst ]] && WARN "${SF_BASEDIR}/sfbin/${x} is outdated. Please update with ${CDC}${BINDIR}/${x}${CN}"
done


# Make sure /dev/shm is 'shared' (mount propagation); abort if that fails.
[[ "$(findmnt -no TARGET,PROPAGATION /dev/shm)" != *"shared"* ]] && {
	mount --make-shared /dev/shm/ || ERREXIT 252
}

# Start both systemd slices, then check that the status output of each
# mentions "Segfault".
# NOTE(review): presumably "Segfault" comes from the slice unit's Description= - confirm.
systemctl start sf.slice || WARN 'Could not start sf.slice'
systemctl start sf-guest.slice || WARN 'Could not start sf-guest.slice'
systemctl status sf.slice | grep Segfault >/dev/null || WARN 'Bad start sf.slice. Does not belong to Segfault.'
systemctl status sf-guest.slice | grep Segfault >/dev/null || WARN 'Bad start sf-guest.slice. Does not belong to Segfault.'

# Pick the cgroup mount point to inspect: /sys/fs/cgroup/unified if that
# directory exists, /sys/fs/cgroup otherwise.
SF_CG_DIR="/sys/fs/cgroup"
[[ -d "/sys/fs/cgroup/unified" ]] && {
	SF_CG_DIR="/sys/fs/cgroup/unified"
	# for cgroupv1 docker-run expects the absolute hierarchy path (for --cgroup-parent):
	export SF_CG_PARENT="sf.slice/sf-guest.slice"
}

# Mount table line of the cgroup2 file system at SF_CG_DIR (empty if none).
str=$(mount | grep ^cgroup2 | grep -F "$SF_CG_DIR" )
[[ $str == *'nsdelegate'* ]] && {
	# HERE: cgroup2 is in use.
	echo -e >&2 "[$(date '+%F %T' -u)] [${CDY}WARN${CN}] ${SF_CG_DIR} is mounted with nsdelegate. Disabling nsdelegate."
	# Reduce the mount line to the option list between the parentheses.
	str=${str##*\(}
	str=${str%\)*}
	# We need to move encfsd to the user's cgroup: From sf.slice (sf-encfsd) to sf.slice/sf-guest.slice.
	# We need to turn off "nsdelegate" as otherwise there is no (?) way moving it.
	# (write() to cgroup.procs returns ENOENT if nsdelegate is enabled.)

	# There is no 'nonsdelegate' and removing nsdelegate requires a hack:
	#   mount -t cgroup2 none /mnt && umount /mnt
	#   mount -o remount,rw,nosuid,nodev,noexec,relatime,memory_recursiveprot /sys/fs/cgroup
	# Test with:
	#   docker run --rm -v /sys/fs/cgroup:/sys/fs/cgroup -it ubuntu bash -c 'sleep 31339 & echo $! >/sys/fs/cgroup/sf.slice/sf-guest.slice/docker-ANY-RUNNING-CONTAINER-ID-HERE.scope/cgroup.procs && echo $! OK'
	mount -t cgroup2 none /mnt
	umount /mnt
	# Drop 'nsdelegate' from the option list (wherever it sits) and remount
	# with the remaining options.
	str="${str/,nsdelegate/}"
	str="${str/nsdelegate,/}"
	mount -o "remount,${str}" "${SF_CG_DIR}" || ERREXIT 255
}

# sf.slice's parent is root (/). Any siblings (e.g. /user.slice, /system.slice) also need to do
# IO Accounting or otherwise /sf.slice can starve those.
# The check looks for an "IO: " line in the first 8 lines of the status output.
systemctl status system.slice | head -n8 | grep -F "IO: " &>/dev/null || WARN "IO Accounting not enabled. Check /etc/systemd/system.conf"
grep -F sf.slice /etc/docker/daemon.json &>/dev/null && WARN "Obsolete sf.slice found in /etc/docker/daemon.json. Remove that line."

[[ ! -d /var/lib/lxcfs/proc ]] && WARN "LG will report wrong uptime etc. Try ${CDC}apt-get install lxcfs${CN}?"
# If there was a warning then wait...
# NOTE(review): WARN_ENTER is not defined in this file; presumably it pauses
# for confirmation if any WARN was emitted - confirm in funcs.sh.
WARN_ENTER

# Delete stale run files and (re)create the directory for the redis socket.
[[ -d "${SF_SHMDIR}/run/encfsd/user" ]] && rm -rf "${SF_SHMDIR}/run/encfsd/user"
[[ ! -d "${SF_SHMDIR}/run/redis/sock" ]] && mkdir -p "${SF_SHMDIR}/run/redis/sock"
chmod 700 "${SF_SHMDIR}/run/redis"
chown 999 "${SF_SHMDIR}/run/redis/sock" # docker/redis user
chmod 711 "${SF_SHMDIR}/run/redis/sock"
# exec docker-compose "$@"
docker-compose "$@"
ret=$?
# If not started as background (-d or --detach): docker-compose only returns
# once the deployment has stopped, so run DOWN to clean up and exit.
# When detached, the deployment keeps running and must not be torn down.
[[ "$*" != *" -d"* ]] && [[ "$*" != *" --detach"* ]] && { down "down"; exit; }
echo -e "May need to run \`${CDC}$0 down${CN}\` (code=$ret)"