chore: snapshot workflow migration progress
This commit is contained in:
Executable
+113
@@ -0,0 +1,113 @@
|
||||
#!/usr/bin/env bash
#
# Settings for the NVIDIA runtime power-management fix-up script.
#
# Environment:
#   GPU_PCI_ADDR  PCI address of the GPU to operate on
#                 (default: 0000:01:00.0).
set -euo pipefail

# PCI address of the target GPU; overridable from the environment.
readonly GPU_PCI_ADDR="${GPU_PCI_ADDR:-0000:01:00.0}"

# sysfs files exposing the device's runtime power-management control/status.
readonly POWER_CONTROL_PATH="/sys/bus/pci/devices/${GPU_PCI_ADDR}/power/control"
readonly RUNTIME_STATUS_PATH="/sys/bus/pci/devices/${GPU_PCI_ADDR}/power/runtime_status"

# modprobe drop-in file and the option line that is written into it.
readonly MODPROBE_CONF="/etc/modprobe.d/hartomat-nvidia-runtimepm.conf"
readonly TARGET_OPTION='options nvidia NVreg_DynamicPowerManagement=0x00 NVreg_EnableGpuFirmware=0'

# docker compose services that are stopped before, and started after, the
# driver reload.
readonly -a COMPOSE_SERVICES=(render-worker render-worker-light)
#######################################
# Print a timestamped progress message, preceded by a blank line.
# Arguments: message words; joined via "$*" (first character of IFS).
# Outputs:   "\n[HH:MM:SS] <message>\n" on stdout.
#######################################
log() {
  printf '\n[%s] %s\n' "$(date +%H:%M:%S)" "$*"
}
#######################################
# Abort unless the script runs with effective UID 0.
# Globals:  EUID (read)
# Outputs:  a usage hint on stderr when not root.
# Returns:  0 when root; otherwise exits the shell with status 1.
#######################################
require_root() {
  if [[ "${EUID}" -ne 0 ]]; then
    echo "Run as root: sudo $0" >&2
    exit 1
  fi
}
#######################################
# Report the GPU's runtime power status and run nvidia-smi.
# Globals:  RUNTIME_STATUS_PATH (read)
# Outputs:  the content of RUNTIME_STATUS_PATH (or a note that it is not
#           readable), followed by nvidia-smi's output, on stdout.
# Returns:  0 even when nvidia-smi fails; the failure is only reported.
#######################################
show_state() {
  log "GPU runtime state"
  if [[ -r "${RUNTIME_STATUS_PATH}" ]]; then
    cat "${RUNTIME_STATUS_PATH}"
  else
    echo "runtime_status unavailable at ${RUNTIME_STATUS_PATH}"
  fi

  log "nvidia-smi"
  # Tested inside `if` so a failing nvidia-smi does not trip errexit.
  if ! nvidia-smi; then
    echo "nvidia-smi still failing"
  fi
}
#######################################
# List the processes that currently have a /dev/nvidia* node open.
# Outputs:  lsof's listing on stdout, or a note that nothing is open;
#           a warning on stderr when lsof is not installed.
# Returns:  0 in all cases (purely informational).
#######################################
show_gpu_clients() {
  log "Processes using /dev/nvidia*"

  # Without lsof we cannot tell; do not claim that nothing is open.
  if ! command -v lsof >/dev/null 2>&1; then
    echo "lsof not installed; cannot list /dev/nvidia* handles" >&2
    return 0
  fi

  # lsof exits non-zero as soon as ANY of the named files has no opener, even
  # if it listed others, so decide on its output rather than its exit status.
  # stderr is discarded because an unmatched glob is passed through literally.
  local listing
  listing="$(lsof /dev/nvidia* 2>/dev/null || true)"
  if [[ -n "${listing}" ]]; then
    printf '%s\n' "${listing}"
  else
    echo "No open /dev/nvidia* handles detected"
  fi
}
#######################################
# Stop the render-worker compose services (best effort).
# Globals:  COMPOSE_SERVICES (read)
# Returns:  0 always. Does nothing when docker is not installed or when
#           there is no docker-compose.yml in the current directory.
#######################################
stop_compose_workers() {
  if ! command -v docker >/dev/null 2>&1; then
    return 0
  fi
  # The compose project is looked up relative to the current directory.
  if [[ ! -f "docker-compose.yml" ]]; then
    return 0
  fi

  log "Stopping HartOMat render workers"
  # Deliberately non-fatal: a failed stop must not abort the caller.
  docker compose stop "${COMPOSE_SERVICES[@]}" || true
}
#######################################
# Start the render-worker compose services in the background (best effort).
# Globals:  COMPOSE_SERVICES (read)
# Returns:  0 always. Does nothing when docker is not installed or when
#           there is no docker-compose.yml in the current directory.
#######################################
start_compose_workers() {
  if ! command -v docker >/dev/null 2>&1; then
    return 0
  fi
  # The compose project is looked up relative to the current directory.
  if [[ ! -f "docker-compose.yml" ]]; then
    return 0
  fi

  log "Starting HartOMat render workers"
  # Deliberately non-fatal: a failed start must not abort the caller.
  docker compose up -d "${COMPOSE_SERVICES[@]}" || true
}
#######################################
# Unload and reload the NVIDIA kernel modules.
# Outputs:  on stderr, the PIDs that still hold /dev/nvidia* (if any).
# Returns:  0 after the modules were reloaded. Exits the shell with status 1
#           when a process still has a /dev/nvidia* node open. modprobe
#           failures are not handled here; they propagate to the caller
#           (fatal when errexit is active).
#######################################
reload_nvidia_modules() {
  log "Reloading NVIDIA kernel modules"
  # Best effort: these units may be absent or already stopped.
  systemctl stop nvidia-persistenced.service nvidia-powerd.service || true

  # Collect every purely numeric token of fuser's stdout as a PID. The
  # trailing `|| true` keeps "nothing found" (non-zero from fuser/grep) from
  # aborting the script. Plain grep is used so this safety check does not
  # silently turn into a no-op on hosts without extra search tools.
  local pids=""
  pids="$(
    fuser -v /dev/nvidia* 2>/dev/null \
      | awk '{for (i = 1; i <= NF; i++) print $i}' \
      | grep -E '^[0-9]+$' \
      | sort -u || true
  )"
  if [[ -n "${pids}" ]]; then
    echo "The following PIDs still hold /dev/nvidia*: ${pids}" >&2
    echo "Close those applications and rerun the script." >&2
    exit 1
  fi

  # Remove dependants first, then load the base module before the others.
  modprobe -r nvidia_uvm nvidia_drm nvidia_modeset nvidia
  modprobe nvidia
  modprobe nvidia_modeset
  modprobe nvidia_uvm
  modprobe nvidia_drm

  # Best effort: bring the helper daemons back if they exist.
  systemctl start nvidia-persistenced.service || true
  systemctl start nvidia-powerd.service || true
}
#######################################
# Entry point: force the GPU's runtime power control to "on", persist the
# driver option line, reload the NVIDIA modules and report the result.
# Globals:  POWER_CONTROL_PATH, MODPROBE_CONF, TARGET_OPTION (read)
# Returns:  0 on success; exits 1 when not root or when the sysfs power
#           control file for the configured GPU does not exist.
#######################################
main() {
  require_root

  if [[ ! -e "${POWER_CONTROL_PATH}" ]]; then
    echo "GPU power control path not found: ${POWER_CONTROL_PATH}" >&2
    exit 1
  fi

  show_gpu_clients
  show_state
  stop_compose_workers
  # From here on any abort (busy device, failed modprobe, failed write) would
  # otherwise leave the render workers stopped; restart them on every exit.
  trap start_compose_workers EXIT

  log "Disabling runtime autosuspend for this boot"
  echo on > "${POWER_CONTROL_PATH}"

  log "Persisting NVIDIA runtime power setting"
  printf '%s\n' "${TARGET_OPTION}" > "${MODPROBE_CONF}"

  reload_nvidia_modules

  log "Final state"
  show_state
  # Normal path: start the workers explicitly, so drop the safety net first
  # to avoid starting them twice.
  trap - EXIT
  start_compose_workers

  log "If nvidia-smi still fails, reboot once so the new modprobe option is applied from a clean boot."
}

main "$@"
Reference in New Issue
Block a user