#!/usr/bin/env bash
#
# Disable runtime power management for an NVIDIA GPU and reload its driver.
#
# Steps performed, in order:
#   1. Print the processes holding /dev/nvidia* and the current GPU state.
#   2. Stop the render-worker compose services (only when docker is installed
#      and ./docker-compose.yml exists in the current directory).
#   3. Write "on" to the device's sysfs power/control file.
#   4. Write the module options in TARGET_OPTION to MODPROBE_CONF.
#   5. Unload and reload the NVIDIA kernel modules.
#   6. Print the final state and start the compose services again.
#
# Usage:
#   sudo ./<this-script>
#
# Environment:
#   GPU_PCI_ADDR  PCI address of the GPU (default: 0000:01:00.0).

set -euo pipefail

GPU_PCI_ADDR="${GPU_PCI_ADDR:-0000:01:00.0}"
readonly GPU_PCI_ADDR
readonly POWER_CONTROL_PATH="/sys/bus/pci/devices/${GPU_PCI_ADDR}/power/control"
readonly RUNTIME_STATUS_PATH="/sys/bus/pci/devices/${GPU_PCI_ADDR}/power/runtime_status"
readonly MODPROBE_CONF="/etc/modprobe.d/hartomat-nvidia-runtimepm.conf"
readonly TARGET_OPTION='options nvidia NVreg_DynamicPowerManagement=0x00 NVreg_EnableGpuFirmware=0'
readonly -a COMPOSE_SERVICES=(render-worker render-worker-light)

#######################################
# Print a timestamped section header, preceded by a blank line.
# Arguments: message words, joined with spaces.
# Outputs:   header on stdout.
#######################################
log() {
  printf '\n[%s] %s\n' "$(date +%H:%M:%S)" "$*"
}

#######################################
# Exit with status 1 unless the script runs with EUID 0.
#######################################
require_root() {
  if [[ "${EUID}" -ne 0 ]]; then
    echo "Run as root: sudo $0" >&2
    exit 1
  fi
}

#######################################
# Print the sysfs runtime_status of the GPU and the output of nvidia-smi.
# Never fails: unreadable status and a failing nvidia-smi are only reported.
#######################################
show_state() {
  log "GPU runtime state"
  if [[ -r "${RUNTIME_STATUS_PATH}" ]]; then
    cat "${RUNTIME_STATUS_PATH}"
  else
    echo "runtime_status unavailable at ${RUNTIME_STATUS_PATH}"
  fi

  log "nvidia-smi"
  if ! nvidia-smi; then
    echo "nvidia-smi still failing"
  fi
}

#######################################
# List the processes that have /dev/nvidia* open (informational only).
#######################################
show_gpu_clients() {
  log "Processes using /dev/nvidia*"
  # Without this check a missing lsof would be reported as "no handles".
  if ! command -v lsof >/dev/null 2>&1; then
    echo "lsof not installed; cannot list /dev/nvidia* handles"
    return
  fi
  if ! lsof /dev/nvidia* 2>/dev/null; then
    echo "No open /dev/nvidia* handles detected"
  fi
}

#######################################
# Succeed only when docker is installed and ./docker-compose.yml exists.
# The compose file is looked up relative to the current directory.
#######################################
compose_available() {
  command -v docker >/dev/null 2>&1 && [[ -f "docker-compose.yml" ]]
}

#######################################
# Stop the services in COMPOSE_SERVICES; best effort, failures are ignored.
#######################################
stop_compose_workers() {
  if ! compose_available; then
    return
  fi
  log "Stopping HartOMat render workers"
  docker compose stop "${COMPOSE_SERVICES[@]}" || true
}

#######################################
# Start the services in COMPOSE_SERVICES; best effort, failures are ignored.
#######################################
start_compose_workers() {
  if ! compose_available; then
    return
  fi
  log "Starting HartOMat render workers"
  docker compose up -d "${COMPOSE_SERVICES[@]}" || true
}

#######################################
# Unload and reload the NVIDIA kernel modules.
# Stops nvidia-persistenced/nvidia-powerd first and starts them afterwards
# (best effort). Exits with status 1 if any process still holds
# /dev/nvidia*; a failing modprobe aborts the script through `set -e`.
#######################################
reload_nvidia_modules() {
  log "Reloading NVIDIA kernel modules"
  systemctl stop nvidia-persistenced.service nvidia-powerd.service || true

  local pids=""
  if command -v fuser >/dev/null 2>&1; then
    # Per fuser(1), only PIDs are written to stdout. The numeric filter runs
    # inside awk so the check does not depend on an optional tool.
    # `|| true`: fuser exits non-zero when nothing holds the devices, which
    # would otherwise trip pipefail.
    pids="$(
      fuser -v /dev/nvidia* 2>/dev/null \
        | awk '{ for (i = 1; i <= NF; i++) if ($i ~ /^[0-9]+$/) print $i }' \
        | sort -u || true
    )"
  else
    echo "fuser not installed; cannot check for processes holding /dev/nvidia*" >&2
  fi

  if [[ -n "${pids}" ]]; then
    # Join the one-per-line PID list with spaces for a single-line message.
    echo "The following PIDs still hold /dev/nvidia*: ${pids//$'\n'/ }" >&2
    echo "Close those applications and rerun the script." >&2
    exit 1
  fi

  modprobe -r nvidia_uvm nvidia_drm nvidia_modeset nvidia
  modprobe nvidia
  modprobe nvidia_modeset
  modprobe nvidia_uvm
  modprobe nvidia_drm

  systemctl start nvidia-persistenced.service || true
  systemctl start nvidia-powerd.service || true
}

#######################################
# Entry point: run the whole procedure described in the file header.
#######################################
main() {
  require_root

  if [[ ! -e "${POWER_CONTROL_PATH}" ]]; then
    echo "GPU power control path not found: ${POWER_CONTROL_PATH}" >&2
    exit 1
  fi

  show_gpu_clients
  show_state
  stop_compose_workers

  log "Disabling runtime autosuspend for this boot"
  echo on > "${POWER_CONTROL_PATH}"

  log "Persisting NVIDIA runtime power setting"
  printf '%s\n' "${TARGET_OPTION}" > "${MODPROBE_CONF}"

  reload_nvidia_modules

  log "Final state"
  show_state
  start_compose_workers

  log "If nvidia-smi still fails, reboot once so the new modprobe option is applied from a clean boot."
}

main "$@"