Files
HartOMat/scripts/recover_nvidia_runtime_pm.sh

114 lines
2.7 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Configuration for the NVIDIA runtime power-management recovery script.
#
# Environment:
#   GPU_PCI_ADDR  PCI address of the GPU to operate on
#                 (defaults to 0000:01:00.0 when unset or empty).
set -euo pipefail

# Keep a caller-supplied address; otherwise fall back to the default.
: "${GPU_PCI_ADDR:=0000:01:00.0}"

# sysfs runtime power-management nodes of the selected PCI device.
POWER_CONTROL_PATH="/sys/bus/pci/devices/${GPU_PCI_ADDR}/power/control"
RUNTIME_STATUS_PATH="/sys/bus/pci/devices/${GPU_PCI_ADDR}/power/runtime_status"

# modprobe.d file that receives TARGET_OPTION as its entire content.
MODPROBE_CONF="/etc/modprobe.d/hartomat-nvidia-runtimepm.conf"
TARGET_OPTION='options nvidia NVreg_DynamicPowerManagement=0x00 NVreg_EnableGpuFirmware=0'

# docker compose services that are stopped before, and started after,
# the kernel modules are reloaded.
COMPOSE_SERVICES=(
  render-worker
  render-worker-light
)
#######################################
# Print a timestamped section banner.
# Arguments: all arguments, joined into one message by "$*".
# Outputs:   a blank line, then "[HH:MM:SS] <message>" on stdout.
#######################################
log() {
  printf '\n[%s] %s\n' "$(date '+%H:%M:%S')" "$*"
}
#######################################
# Abort unless the script runs with an effective UID of 0.
# Outputs: a usage hint on stderr when not root.
# Returns: 0 when root; otherwise exits the shell with status 1.
#######################################
require_root() {
  if (( EUID != 0 )); then
    printf 'Run as root: sudo %s\n' "$0" >&2
    exit 1
  fi
}
#######################################
# Report the GPU's runtime power status and the nvidia-smi output.
# Globals: RUNTIME_STATUS_PATH (read)
# Outputs: the content of RUNTIME_STATUS_PATH (or a notice when it is not
#          readable), followed by whatever nvidia-smi prints, or a
#          failure notice when nvidia-smi exits non-zero.
#######################################
show_state() {
  log "GPU runtime state"
  if [[ ! -r "${RUNTIME_STATUS_PATH}" ]]; then
    echo "runtime_status unavailable at ${RUNTIME_STATUS_PATH}"
  else
    cat "${RUNTIME_STATUS_PATH}"
  fi

  log "nvidia-smi"
  # A failing nvidia-smi is reported, not treated as fatal.
  nvidia-smi || echo "nvidia-smi still failing"
}
#######################################
# List processes that have /dev/nvidia* device nodes open.
# Outputs: lsof's listing on stdout, or a notice when lsof exits non-zero
#          (its stderr is discarded, so any lsof failure yields the notice).
#######################################
show_gpu_clients() {
  log "Processes using /dev/nvidia*"
  # The glob is intentionally unquoted so it expands to every device node.
  lsof /dev/nvidia* 2>/dev/null \
    || echo "No open /dev/nvidia* handles detected"
}
#######################################
# Stop the compose services listed in COMPOSE_SERVICES.
# Does nothing when no `docker` command is available or when there is no
# docker-compose.yml in the current working directory.
# Globals: COMPOSE_SERVICES (read)
# Returns: 0 always; a failing `docker compose stop` is ignored.
#######################################
stop_compose_workers() {
  command -v docker >/dev/null 2>&1 || return 0
  [[ -f "docker-compose.yml" ]] || return 0

  log "Stopping HartOMat render workers"
  docker compose stop "${COMPOSE_SERVICES[@]}" || true
}
#######################################
# Start (detached) the compose services listed in COMPOSE_SERVICES.
# Does nothing when no `docker` command is available or when there is no
# docker-compose.yml in the current working directory.
# Globals: COMPOSE_SERVICES (read)
# Returns: 0 always; a failing `docker compose up` is ignored.
#######################################
start_compose_workers() {
  if command -v docker >/dev/null 2>&1 && [[ -f "docker-compose.yml" ]]; then
    log "Starting HartOMat render workers"
    docker compose up -d "${COMPOSE_SERVICES[@]}" || true
  fi
}
#######################################
# Unload and reload the NVIDIA kernel modules.
#
# Stops nvidia-persistenced and nvidia-powerd, refuses to continue while
# any process still has /dev/nvidia* open, then removes and re-inserts the
# nvidia, nvidia_modeset, nvidia_uvm and nvidia_drm modules and starts the
# two services again.
#
# Outputs: the PIDs holding the device nodes on stderr when the reload is
#          refused.
# Returns: 0 on success; exits the shell with status 1 when the GPU device
#          nodes are still in use.
#######################################
reload_nvidia_modules() {
  log "Reloading NVIDIA kernel modules"
  # Failures to stop the services are deliberately ignored.
  systemctl stop nvidia-persistenced.service nvidia-powerd.service || true

  # Collect every purely numeric token fuser prints on stdout as a PID.
  # The filter is done in awk rather than an optional tool such as ripgrep:
  # a missing filter binary would yield an empty list and silently disable
  # the in-use guard below. Only fuser's own exit status is ignored, since
  # it is non-zero whenever nothing holds the device nodes.
  local pids
  pids="$(
    { fuser -v /dev/nvidia* 2>/dev/null || true; } \
      | awk '{ for (i = 1; i <= NF; i++) if ($i ~ /^[0-9]+$/) print $i }' \
      | sort -u
  )"

  if [[ -n "${pids}" ]]; then
    # Print the PIDs space-separated so the message stays on one line.
    echo "The following PIDs still hold /dev/nvidia*: ${pids//$'\n'/ }" >&2
    echo "Close those applications and rerun the script." >&2
    exit 1
  fi

  # Remove the dependent modules first and the core nvidia module last,
  # then insert them again starting with the core module.
  modprobe -r nvidia_uvm nvidia_drm nvidia_modeset nvidia
  modprobe nvidia
  modprobe nvidia_modeset
  modprobe nvidia_uvm
  modprobe nvidia_drm

  # Failures to start the services are deliberately ignored.
  systemctl start nvidia-persistenced.service || true
  systemctl start nvidia-powerd.service || true
}
#######################################
# Entry point: verify prerequisites, show the current GPU state, stop the
# render workers, force the device's runtime power control to "on", write
# the module option to MODPROBE_CONF, reload the NVIDIA modules, show the
# resulting state and start the render workers again.
# Globals: POWER_CONTROL_PATH, TARGET_OPTION, MODPROBE_CONF (read)
#######################################
main() {
  require_root

  if [[ ! -e "${POWER_CONTROL_PATH}" ]]; then
    printf 'GPU power control path not found: %s\n' "${POWER_CONTROL_PATH}" >&2
    exit 1
  fi

  show_gpu_clients
  show_state
  stop_compose_workers

  log "Disabling runtime autosuspend for this boot"
  echo on > "${POWER_CONTROL_PATH}"

  log "Persisting NVIDIA runtime power setting"
  printf '%s\n' "${TARGET_OPTION}" > "${MODPROBE_CONF}"

  reload_nvidia_modules

  log "Final state"
  show_state
  start_compose_workers
  log "If nvidia-smi still fails, reboot once so the new modprobe option is applied from a clean boot."
}

main "$@"