#!/usr/bin/env bash
#
# Disable NVIDIA GPU runtime power management (autosuspend) so the GPU stays
# awake for the HartOMat render workers, persist the matching modprobe option,
# and reload the NVIDIA kernel modules so it takes effect without a reboot.
#
# Usage: sudo ./this-script
# Env:   GPU_PCI_ADDR — PCI address of the GPU (default 0000:01:00.0)
set -euo pipefail

# PCI address of the GPU; override via the environment if yours differs.
GPU_PCI_ADDR="${GPU_PCI_ADDR:-0000:01:00.0}"
readonly GPU_PCI_ADDR

# sysfs knobs for runtime PM on that PCI device.
readonly POWER_CONTROL_PATH="/sys/bus/pci/devices/${GPU_PCI_ADDR}/power/control"
readonly RUNTIME_STATUS_PATH="/sys/bus/pci/devices/${GPU_PCI_ADDR}/power/runtime_status"

# Where the persistent driver option is written, and what is written there.
readonly MODPROBE_CONF="/etc/modprobe.d/hartomat-nvidia-runtimepm.conf"
readonly TARGET_OPTION='options nvidia NVreg_DynamicPowerManagement=0x00 NVreg_EnableGpuFirmware=0'

# docker compose services that hold the GPU and must be cycled around the reload.
readonly COMPOSE_SERVICES=(render-worker render-worker-light)
log() {
  # Emit a blank line followed by a "[HH:MM:SS] message" status line.
  local stamp
  stamp="$(date +%H:%M:%S)"
  printf '\n[%s] %s\n' "${stamp}" "$*"
}
require_root() {
  # Abort with a hint unless the script runs with root privileges.
  (( EUID == 0 )) && return 0
  echo "Run as root: sudo $0" >&2
  exit 1
}
show_state() {
  # Print the kernel's runtime-PM status for the GPU, then nvidia-smi output.
  log "GPU runtime state"
  if [[ ! -r "${RUNTIME_STATUS_PATH}" ]]; then
    echo "runtime_status unavailable at ${RUNTIME_STATUS_PATH}"
  else
    cat "${RUNTIME_STATUS_PATH}"
  fi

  log "nvidia-smi"
  # Failure here is informational only; the caller decides what to do next.
  nvidia-smi || echo "nvidia-smi still failing"
}
show_gpu_clients() {
  # List processes with open handles on the NVIDIA device nodes, if any.
  log "Processes using /dev/nvidia*"
  # Glob is intentionally unquoted so each device node is a separate argument.
  lsof /dev/nvidia* 2>/dev/null || echo "No open /dev/nvidia* handles detected"
}
stop_compose_workers() {
  # Stop the GPU-holding compose services; silently a no-op when docker or
  # the compose file is not present (e.g. on a dev box).
  command -v docker >/dev/null 2>&1 || return 0
  [[ -f docker-compose.yml ]] || return 0
  log "Stopping HartOMat render workers"
  docker compose stop "${COMPOSE_SERVICES[@]}" || true
}
start_compose_workers() {
  # Bring the compose services back up; silently a no-op when docker or
  # the compose file is not present.
  command -v docker >/dev/null 2>&1 || return 0
  [[ -f docker-compose.yml ]] || return 0
  log "Starting HartOMat render workers"
  docker compose up -d "${COMPOSE_SERVICES[@]}" || true
}
reload_nvidia_modules() {
  # Unload and reload the NVIDIA kernel module stack so the new modprobe
  # option takes effect without a reboot. Refuses to proceed while any
  # process still holds a /dev/nvidia* device node.
  log "Reloading NVIDIA kernel modules"
  systemctl stop nvidia-persistenced.service nvidia-powerd.service || true

  # fuser prints PIDs on stdout and file names on stderr. The previous
  # version used `fuser -v` (whose verbose table goes entirely to stderr)
  # and filtered with `rg`, which is not universally installed — both made
  # this guard silently find nothing. Plain fuser + grep -E fixes that.
  local pids=""
  pids="$(fuser /dev/nvidia* 2>/dev/null | tr ' \t' '\n\n' | grep -E '^[0-9]+$' | sort -u || true)"
  if [[ -n "${pids}" ]]; then
    echo "The following PIDs still hold /dev/nvidia*: ${pids}" >&2
    echo "Close those applications and rerun the script." >&2
    exit 1
  fi

  # Unload in reverse dependency order, then reload. These must succeed;
  # set -e aborts the script if any of them fail.
  modprobe -r nvidia_uvm nvidia_drm nvidia_modeset nvidia
  modprobe nvidia
  modprobe nvidia_modeset
  modprobe nvidia_uvm
  modprobe nvidia_drm

  systemctl start nvidia-persistenced.service || true
  systemctl start nvidia-powerd.service || true
}
require_root

# Bail out early if the expected sysfs knob is missing (wrong PCI address
# or no PCI power-management support for this device).
if [[ ! -e "${POWER_CONTROL_PATH}" ]]; then
  echo "GPU power control path not found: ${POWER_CONTROL_PATH}" >&2
  exit 1
fi

show_gpu_clients
show_state
stop_compose_workers

log "Disabling runtime autosuspend for this boot"
# "on" keeps the device awake until the next reboot.
echo on > "${POWER_CONTROL_PATH}"

log "Persisting NVIDIA runtime power setting"
printf '%s\n' "${TARGET_OPTION}" > "${MODPROBE_CONF}"

reload_nvidia_modules

log "Final state"
show_state
start_compose_workers

log "If nvidia-smi still fails, reboot once so the new modprobe option is applied from a clean boot."