#! /usr/bin/env bash
#
# Wrapper around docker-compose.
#
#   up [ARGS...]    Prepare the host (env file, sysctl limits, block-IO
#                   scheduler, cgroups, run directories) and then run
#                   `docker-compose up ARGS...`.
#   down [ARGS...]  Stop all containers whose name starts with "lg", run
#                   `docker-compose down ARGS...` and remove left-over
#                   networks and bridge interfaces.
#   <anything else> Handed straight to docker-compose.
#
# SF_SEED must be set in the environment for "up".
# WARN, ERREXIT, WARN_ENTER and the colour variables (CDC, CDM, CDY, CN) are
# not defined here; presumably they come from funcs.sh, which is sourced below.

# Change to CWD (in case CWD has been updated).
cd "$(pwd)" || exit

BINDIR="$(cd "$(dirname "${0}")" || exit; pwd)"
source "${BINDIR}/funcs.sh" || exit 254

command -v jq >/dev/null || ERREXIT 255 "Command 'jq' not found. Try ${CDC}apt-get install jq${CN}"

# Tear everything down.
# Arguments: "$@" is passed verbatim to docker-compose (normally: down ...).
# Returns:   exit status of the final bridge clean-up pipeline.
down() {
	local -a c
	local x

	docker container prune -f
	# Collect the IDs of all (running or stopped) lg* containers, one per line.
	mapfile -t c < <(docker ps -f name=^lg --all --quiet)
	(( ${#c[@]} > 0 )) && docker stop "${c[@]}"
	docker-compose "$@"
	docker network prune -f
	# Sometimes docker gets into a state when it complains about overlapping
	# network pool even that 'docker network ls' shows no networks beside
	# the 3 default networks and with no containers running:
	ip link show | cut -f2 -d" " | grep -E "^(br-)" | while read -r x; do
		# Strip the "@peer" and trailing ":" decorations from the name.
		x="${x%@*}"
		x="${x%:*}"
		[[ -z $x ]] && continue
		ip link delete "${x}" down
	done
}

# Derive the Redis password from SF_SEED unless the caller supplied one.
[[ -z $SF_REDIS_AUTH ]] && {
	SF_REDIS_AUTH=$(echo -n "Redis AUTH $SF_SEED" | sha512sum | base64 -w0)
	SF_REDIS_AUTH="${SF_REDIS_AUTH//[^[:alnum:]]}"
	SF_REDIS_AUTH="${SF_REDIS_AUTH:0:32}"
	export SF_REDIS_AUTH
}

SF_BACKING_FS="$(docker info --format '{{json .DriverStatus}}' | jq -r '.[0][1]')"
export SF_BACKING_FS
[[ "$SF_BACKING_FS" != "xfs" ]] && WARN "Backing FS is not XFS (SF_USER_ROOT_FS_SIZE wont work)"

[[ "$1" == down ]] && {
	down "$@"
	exit
}

[[ "$1" != up ]] && exec docker-compose "$@"
# HERE: "up"

[[ -z $SF_SEED ]] && ERREXIT 255 "SF_SEED= not set"

# Load variables from the .env file but only those not already set in
# user's environment.
# Globals: SF_BASEDIR (read; must be set afterwards), envfile (written),
#          plus every NAME found in the .env file (written if empty).
load_env() {
	local n
	local v
	local arr
	local a
	envfile="./.env"

	[[ -n $SF_BASEDIR ]] && envfile="${SF_BASEDIR}/.env"

	if [[ ! -f "${envfile}" ]]; then
		WARN "Not found: \${SF_BASEDIR}/.env (${envfile})"
	else
		# Skip comment lines and empty lines.
		mapfile -t arr < <(grep -E -v '(^#|^$)' "${envfile}")
		for a in "${arr[@]}"; do
			n="${a%%=*}"
			v="${a#*=}"
			# Only plain NAME=value lines are considered; anything else must
			# never reach the indirect expansion or the eval below.
			[[ $n =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || continue
			# Prefer user's environment over .env settings.
			# NOTE(review): eval is kept on purpose so that quotes and
			# $VARIABLE references inside the .env value are processed by the
			# shell. The .env file is therefore executed as trusted input.
			[[ -z "${!n}" ]] && eval "${n}=\"${v}\""
		done
	fi

	[[ -z $SF_BASEDIR ]] && ERREXIT 255 "SF_BASEDIR= not set in ${envfile}."
}

# Switch all block devices to the BFQ scheduler and check that docker accepts
# --blkio-weight. Emits a WARN and returns early on the first failure.
blockio_init() {
	local is_bfq
	local fn

	# Check if there is BFQ-Scheduler support in the Kernel.
	# Only the first block device is inspected.
	for fn in /sys/class/block/*/queue/scheduler; do
		[[ ! -f "${fn}" ]] && break
		grep bfq "${fn}" >/dev/null || break
		is_bfq=1
		break
	done

	[[ -z $is_bfq ]] && {
		# HERE: no BFQ support. Try load module.
		# Try: apt install linux-modules-extra-aws
		modprobe bfq || { WARN "No BFQ-Scheduler. Attacker can DoS block-IO."; return; }
		is_bfq=1
	}

	# Return if BFQ is set
	for fn in /sys/class/block/*/queue/scheduler; do
		[[ ! -f "${fn}" ]] && break
		echo bfq >"${fn}" || { WARN "${fn%/queue*}: Failed to set BFQ scheduler."; return; }
	done

	# Odd bug. On some systems we set all correctly and docker still complains that
	# it cant use Block IO weights. It appears to be a problem with cgroup v1?
	# It can be fixed on v1 systems by using --cgroup-parent=/guest and creating:
	#    mkdir -p /sys/fs/cgroup/blkio/guest
	#    echo 1 >/sys/fs/cgroup/blkio/guest/blkio.bfq.weight
	# => But then why cant docker fix this crap?
	# https://github.com/moby/moby/issues/16173#issuecomment-1298432655
	# Test if docker accepts --blkio-weight:
	docker run --rm --blkio-weight=100 alpine true 2>&1 | grep "does not support Block" >/dev/null && { WARN "DOCKER: Your kernel does not support Block I/O weight."; return; }
}

# Raise sysctl $1 to $2 unless the current value is already >= $2.
# An unreadable key is not treated as "already fine": the write is attempted
# and a WARN is emitted if it fails.
sysinc() {
	local key=$1
	local val=$2
	local cur

	cur=$(sysctl -n "$key")
	[[ -n $cur && $cur -ge $val ]] && return
	sysctl -q -w "${key}=${val}" || WARN "Could not set '${key}=${val}'"
}

# Lower sysctl $1 to $2 unless the current value is already <= $2.
# An unreadable key is not treated as "already fine" (an empty value would
# otherwise compare as 0): the write is attempted and a WARN is emitted if it
# fails.
sysdec() {
	local key=$1
	local val=$2
	local cur

	cur=$(sysctl -n "$key")
	[[ -n $cur && $cur -le $val ]] && return
	sysctl -q -w "${key}=${val}" || WARN "Could not set '${key}=${val}'"
}

# WARN if $1 is not a regular file.
warn_file() {
	[[ -f "$1" ]] && return

	WARN "Not found: $1"
}

# WARN if the installed copy of $1 (relative path, below SF_BASEDIR) is older
# than, and of a different size than, the copy shipped next to this script.
warn_outdated() {
	local dst
	local src
	dst="${SF_BASEDIR}/${1}"
	src="${BINDIR}/../${1}"

	[[ ! -f "$dst" ]] && { WARN "Not found: $dst"; return; }
	[[ ! -f "$src" ]] && ERREXIT 255 "Not found: $src"

	# Installed file $dst is newer or equal than $src
	[[ ! "$dst" -ot "$src" ]] && return
	[[ $(stat -c%s "$dst") -eq $(stat -c%s "$src") ]] && return

	WARN "$dst is outdated? Try ${CDC}touch $dst${CN} to ignore."
}

load_env
[[ -z $SF_DATADIR ]] && SF_DATADIR="${SF_BASEDIR}/data"
[[ -z $SF_SHMDIR ]] && SF_SHMDIR="/dev/shm/sf"
[[ -z $SF_HOST_MTU ]] && SF_HOST_MTU=1500
export SF_GUEST_MTU=$((SF_HOST_MTU - 80))

[[ ! -d "${SF_DATADIR}/user" ]] && mkdir -p "${SF_DATADIR}/user"
[[ ! -d "${SF_DATADIR}/share" ]] && mkdir -p "${SF_DATADIR}/share"

[[ ! -f "${SF_DATADIR}/share/GeoLite2-City.mmdb" ]] && [[ "${MAXMIND_KEY,,}" != "skip" ]] && {
	WARN "Not found: data/share/GeoLite2-City.mmdb"
	echo -e "Try \`curl 'https://download.maxmind.com/app/geoip_download?edition_id=GeoLite2-City&license_key=${MAXMIND_KEY:-KEY-NOT-SET}&suffix=tar.gz' | tar xfvz - --strip-components=1 --no-anchored -C '${SF_DATADIR}/share/' 'GeoLite2-City.mmdb'\`."
	echo -e "Try ${CDC}MAXMIND_KEY=skip${CN} to disable. This will also disable limits by GEOIP and disable user tools like geoip and geoiphn."
}

[[ ! -f "${SF_DATADIR}/share/tor-exit-nodes.txt" ]] && {
	WARN "Not found: data/share/tor-exit-nodes.txt"
	echo -e "Try \`curl 'https://www.dan.me.uk/torlist/?exit' >'${SF_DATADIR}/share/tor-exit-nodes.txt'\`"
}

[[ ! -f "${SF_DATADIR}/share/relay-exit-nodes-mullvad.txt" ]] && WARN "Not found: data/share/relay-exit-nodes-mullvad.txt - ${CDM}See contrib/cronjob how to create it.${CN}"
[[ ! -f "${SF_DATADIR}/share/proxies.txt" ]] && WARN "Not found: data/share/proxies.txt (Mullvad proxies) - ${CDM}See contrib/cronjob how to create it.${CN}"

[[ ! -f "${SF_BASEDIR}/config/etc/relay-exit-nodes-global.txt" ]] && {
	WARN "Not found: ${SF_BASEDIR}/config/etc/relay-exit-nodes-global.txt"
	echo -e "\
==> Log in from global relays is not controlled. We use a private list from Blind Mouse.
==> Generate your own list (see THC's Tips & Tricks).
==> Use ${CDC}touch ${SF_BASEDIR}/config/etc/relay-exit-nodes-global.txt${CN} to stop this warning."
}
chmod 644 "${SF_BASEDIR}/config/etc/relay-exit-nodes-global.txt" 2>/dev/null

[[ -z $SF_OVERLAYDIR ]] && [[ -d "${SF_BASEDIR}/docker/overlay2" ]] && export SF_OVERLAYDIR="${SF_BASEDIR}/docker/overlay2"

[[ -z $SF_IP ]] && {
	command -v dig >/dev/null || ERREXIT 255 "Command 'dig' not found. Try ${CDC}apt-get install dnsutils${CN}"
	# Drop CNAME lines (they end with a dot) and keep the address records.
	SF_IP=$(dig +short "$SF_FQDN" 2>/dev/null | grep -v '\.$')
	export SF_IP
	[[ -z $SF_IP ]] && ERREXIT 255 "Could not get SF_IP..."
	WARN "SF_IP not set in .env. Using '$SF_IP'."
}

# xfs_init_quota "${SF_DATADIR}/everyone-root" "everyone" 100 16384 16G

# Enable BFQ on all block devices to allow cgroup's io.weight
# FIXME: One day put this into udev/startup scripts and only for
# device that we are using...
blockio_init

# BUG-ARP-CACHE:
# User can cause arp-table overflow. The kernel limit is global for all arp tables
# but each container gets its own arp table. All containers just put pressure on the global
# limit.
# Attack: A user can spawn multiple containers and create 'incomplete' arp entries in its own
# table. Those entries reduce the amount of entries available for other containers (it's a global limit
# and not a limit per container).
#
# Oddity: Docker-compose is making the host name of each service available (e.g sf-redis, sf-tor etc).
# This is not done via an /etc/hosts entry but handled by Docker internally. The problem is that
# 'somewhere' docker (internally) needs an arp-entry (which fails during an attack). Then the
# name (e.g. sf-redis or so) can not be resolved and all goes to shits.
#
# Tweaking base_reachable_time_ms and gc_stale_time has no effect. Best we can do:
# 1. Use static IPs where possible for inter-container communication.
# 2. Limit the User's local network (to /22 or /24)
# 3. Increase the global size of the kernel's arp table (gc_thresh3)

# These are global and shared among all containers
# Increase unless already higher
sysinc net.ipv4.neigh.default.gc_thresh3 65536
sysinc net.netfilter.nf_conntrack_buckets 16384  # default is 65536 for >4GB systems
sysinc net.netfilter.nf_conntrack_max 1048576
# find /proc/*/fd -lname anon_inode:inotify | cut -d/ -f3 | xargs -I '{}' -- ps --no-headers -o '%p %U %c' -p '{}' | uniq -c | sort -nr
sysinc fs.inotify.max_user_instances 1024

# Conntrack & Namespaces is a mess. Restricting these inside a container
# only results that the connection is dropped sooner but the state still
# remains on the host's container. Thus we also reduce the host's timers
# to deal with this. The host does not do CONNTRACKING and thus these
# settings should only affect the containers.
# Decrease unless already lower.
sysdec net.netfilter.nf_conntrack_tcp_timeout_syn_sent 10
sysdec net.netfilter.nf_conntrack_tcp_timeout_syn_recv 5         # default is 30, 5 because of reverse tunnels
sysdec net.netfilter.nf_conntrack_tcp_timeout_last_ack 5         # default is 30
sysdec net.netfilter.nf_conntrack_tcp_timeout_fin_wait 10        # default is 120
sysdec net.netfilter.nf_conntrack_tcp_timeout_close 1            # default is 10
sysdec net.netfilter.nf_conntrack_tcp_timeout_close_wait 10      # default is 60
sysdec net.netfilter.nf_conntrack_tcp_timeout_unacknowledged 30  # default is 300
sysdec net.netfilter.nf_conntrack_tcp_timeout_established 10800  # 3h, default is 5 days
sysdec net.netfilter.nf_conntrack_icmp_timeout 10                # default is 30
sysdec net.netfilter.nf_conntrack_udp_timeout 10                 # default is 30

# Each Hugepagesize is 2MB (grep HUGE /proc/meminfo)
# 512 => 1g as HUGE
# 8192 => 16g as HUGE
[[ ! $(cat /proc/sys/vm/nr_hugepages) -gt 0 ]] && WARN "Huge Tables not set. Consider ${CDC}echo \"vm.nr_hugepages=8192\" >>/etc/sysctl.conf && sysctl -w vm.nr_hugepages=8192${CN}"

# Warn for outdated files in /sf/config/* (that are older and different size)
mapfile -t arr < <(cd "${BINDIR}/../" || exit; find config -type f)
for fn in "${arr[@]}"; do
	warn_outdated "$fn"
done

# Check if there are any files in /sf/sfbin that are not equal to ./sfbin
for x in "${BINDIR}/"*; do
	if [[ ! -e "$x" ]]; then
		# The glob did not match anything; there is nothing to checksum.
		WARN "Oops. Files missing in ${BINDIR}/*???"
		continue
	fi
	str=$(md5sum "$x")
	src=${str%% *}
	x=$(basename "$x")
	str=$(md5sum "${SF_BASEDIR}/sfbin/${x}" 2>/dev/null)
	dst=${str%% *}
	[[ "$src" != "$dst" ]] && WARN "${SF_BASEDIR}/sfbin/${x} is outdated. Please update with ${CDC}${BINDIR}/${x}${CN}"
done

# Make sure /dev/shm is 'shared'
[[ "$(findmnt -no TARGET,PROPAGATION /dev/shm)" != *"shared"* ]] && {
	mount --make-shared /dev/shm/ || ERREXIT 252
}

systemctl start sf.slice || WARN 'Could not start sf.slice'
systemctl start sf-guest.slice || WARN 'Could not start sf-guest.slice'
systemctl status sf.slice | grep Segfault >/dev/null || WARN 'Bad start sf.slice. Does not belong to Segfault.'
systemctl status sf-guest.slice | grep Segfault >/dev/null || WARN 'Bad start sf-guest.slice. Does not belong to Segfault.'

SF_CG_DIR="/sys/fs/cgroup"
[[ -d "/sys/fs/cgroup/unified" ]] && {
	SF_CG_DIR="/sys/fs/cgroup/unified"
	# for cgroupv1 docker-run expects the absolute hierarchy path (for --cgroup-parent):
	export SF_CG_PARENT="sf.slice/sf-guest.slice"
}

str=$(mount | grep ^cgroup2 | grep -F "$SF_CG_DIR" )
[[ $str == *'nsdelegate'* ]] && {
	# HERE: cgroup2 is in use.
	echo -e >&2 "[$(date '+%F %T' -u)] [${CDY}WARN${CN}] ${SF_CG_DIR} is mounted with nsdelegate. Disabling nsdelegate."
	# Keep only the mount options, i.e. the text between '(' and ')'.
	str=${str##*\(}
	str=${str%\)*}
	# We need to move encfsd to the user's cgroup: From sf.slice (sf-encfsd) to sf.slice/sf-guest.slice.
	# We need to turn of "nsdelegate" as otherwise there is no (?) way moving it.
	# (write() to cgroup.procs returns ENOENT if nsdelegate is enabled.)
	# There is no 'nonsdelegate' and removing nsdelegate requires a hack:
	#    mount -t cgroup2 none /mnt && umount /mnt
	#    mount -o remount,rw,nosuid,nodev,noexec,relatime,memory_recursiveprot /sys/fs/cgroup
	# Test with:
	#    docker run --rm -v /sys/fs/cgroup:/sys/fs/cgroup -it ubuntu bash -c 'sleep 31339 & echo $! >/sys/fs/cgroup/sf.slice/sf-guest.slice/docker-ANY-RUNNING-CONTAINER-ID-HERE.scope/cgroup.procs && echo $! OK'
	mount -t cgroup2 none /mnt
	umount /mnt
	str="${str/,nsdelegate/}"
	str="${str/nsdelegate,/}"
	mount -o "remount,${str}" "${SF_CG_DIR}" || ERREXIT 255
}

# sf.slice's parent is root (/). Any siblings (e.g. /user.slice, /system.slice) also need to do
# IO Accounting or otherwise /sf.slice can starve those.
systemctl status system.slice | head -n8 | grep -F "IO: " &>/dev/null || WARN "IO Accounting not enabled. Check /etc/systemd/system.conf"

grep -F sf.slice /etc/docker/daemon.json &>/dev/null && WARN "Obsolete sf.slice found in /etc/docker/daemon.json. Remove that line."

[[ ! -d /var/lib/lxcfs/proc ]] && WARN "LG will report wrong uptime etc. Try ${CDC}apt-get install lxcfs${CN}?"

# If there was a warning then wait...
WARN_ENTER

# Delete stale run files..
[[ -d "${SF_SHMDIR}/run/encfsd/user" ]] && rm -rf "${SF_SHMDIR}/run/encfsd/user"
[[ ! -d "${SF_SHMDIR}/run/redis/sock" ]] && mkdir -p "${SF_SHMDIR}/run/redis/sock"
chmod 700 "${SF_SHMDIR}/run/redis"
chown 999 "${SF_SHMDIR}/run/redis/sock"  # docker/redis user
chmod 711 "${SF_SHMDIR}/run/redis/sock"

# exec docker-compose "$@"
docker-compose "$@"
ret=$?
# If not started as background (-d): run DOWN.
[[ "$*" != *" -d"* ]] && { down "down"; exit; }

echo -e "May need to run \`${CDC}$0 down${CN}\` (code=$ret)"