feat(gpu): GPU health check + RENDER_DEVICE_USED token + strict mode

- gpu_probe.py: Blender script that probes OPTIX/CUDA/HIP/ONEAPI and
  exits 1 on no GPU — used at startup + on-demand from Admin UI
- blender_render.py, still_render.py, turntable_render.py: emit
  RENDER_DEVICE_USED: engine=CYCLES device=GPU|CPU compute_type=...
  after GPU activation; exit 2 when CYCLES_DEVICE=gpu and CPU fallback
- render_blender.py: parse RENDER_DEVICE_USED token into render_log
  (device_used, compute_type, gpu_fallback); handle exit code 2 as
  explicit GPU strict-mode failure
- check_version.py: check_gpu() runs gpu_probe.py at container startup;
  CYCLES_DEVICE=gpu aborts startup if no GPU found
- docker-compose.yml: CYCLES_DEVICE=${CYCLES_DEVICE:-auto} env var
- gpu_tasks.py: probe_gpu Celery task on thumbnail_rendering queue;
  saves result to system_settings.gpu_probe_last_result; beat every 30min
- worker.py: POST /probe/gpu (trigger) + GET /probe/gpu/result (last result)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-08 20:57:36 +01:00
parent c6556434d6
commit 34f89cc225
10 changed files with 269 additions and 2 deletions
+57
View File
@@ -64,5 +64,62 @@ def check_version():
print(f"Blender {version_str} OK (>= {MIN_VERSION_STR})")
def check_gpu():
    """Probe for a GPU by running gpu_probe.py inside Blender and log the outcome.

    The CYCLES_DEVICE environment variable (case-insensitive, default
    "auto") selects the policy:

    - "cpu": the probe is skipped entirely.
    - "gpu": a non-zero probe exit code aborts startup with sys.exit(1).
    - any other value: a non-zero probe exit code only logs a warning.

    A missing probe script, a probe timeout, or any other exception raised
    while running the probe is logged as a warning and does not abort
    startup, whatever the mode.
    """
    mode = os.environ.get("CYCLES_DEVICE", "auto").lower()
    if mode == "cpu":
        print("[check_version] GPU check skipped (CYCLES_DEVICE=cpu)", flush=True)
        return

    blender_bin = find_blender()
    script_path = Path("/render-scripts/gpu_probe.py")
    if not script_path.exists():
        print(
            f"[check_version] WARNING: gpu_probe.py not found at {script_path}",
            flush=True,
        )
        return

    # NOTE(review): in "gpu" mode a timeout or probe error only warns and
    # startup continues — confirm that this leniency is intended.
    try:
        probe = subprocess.run(
            [blender_bin, "--background", "--python", str(script_path)],
            capture_output=True,
            text=True,
            timeout=45,
        )
        stdout_lines = probe.stdout.splitlines()

        if probe.returncode == 0:
            # Success: echo the first GPU_PROBE_OK line, if the probe printed one.
            ok_line = next((ln for ln in stdout_lines if "GPU_PROBE_OK" in ln), None)
            if ok_line is not None:
                print(f"[check_version] {ok_line}", flush=True)
            return

        # Failure: prefer the probe's own GPU_PROBE_FAIL line over the generic text.
        msg = next(
            (ln for ln in stdout_lines if "GPU_PROBE_FAIL" in ln),
            "No GPU detected — renders will use CPU",
        )
        if mode != "gpu":
            print(
                f"[check_version] WARNING: {msg} (set CYCLES_DEVICE=gpu to enforce)",
                flush=True,
            )
            return

        print(f"[check_version] ERROR: {msg}", flush=True)
        print(
            "[check_version] CYCLES_DEVICE=gpu requires GPU — aborting startup",
            flush=True,
        )
        # SystemExit is not an Exception subclass, so the handlers below let it through.
        sys.exit(1)
    except subprocess.TimeoutExpired:
        print("[check_version] WARNING: GPU probe timed out after 45s", flush=True)
    except Exception as exc:
        print(f"[check_version] WARNING: GPU probe failed: {exc}", flush=True)
# Script entry point: validate the Blender version first, then run the GPU check.
if __name__ == "__main__":
    check_version()
    check_gpu()
+6
View File
@@ -718,9 +718,15 @@ if engine != "eevee": # covers both explicit Cycles and EEVEE-fallback
# Re-ensure preferences are set (engine activation may have reset them)
_activate_gpu()
print(f"[blender_render] Cycles GPU ({gpu_type_found}), samples={samples}", flush=True)
print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={gpu_type_found}", flush=True)
else:
scene.cycles.device = 'CPU'
print(f"[blender_render] WARNING: GPU not found — falling back to CPU, samples={samples}", flush=True)
print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
import os as _os
if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
sys.exit(2)
scene.cycles.samples = samples
scene.cycles.use_denoising = True
+43
View File
@@ -0,0 +1,43 @@
"""Blender Python script: GPU compute device probe.
Run via:
blender --background --python gpu_probe.py
Exit codes:
0 — GPU found, prints GPU_PROBE_OK line
1 — No GPU found or error, prints GPU_PROBE_FAIL line
"""
import sys
def main():
    """Probe Cycles compute backends and exit with the result.

    Tries OPTIX, CUDA, HIP and ONEAPI in that order. For the first backend
    that lists at least one non-CPU device, prints a GPU_PROBE_OK line and
    exits with code 0. If no backend yields a device, or the probe itself
    raises, prints a GPU_PROBE_FAIL line and exits with code 1.
    """
    try:
        import bpy

        cycles_prefs = bpy.context.preferences.addons['cycles'].preferences
        for backend in ('OPTIX', 'CUDA', 'HIP', 'ONEAPI'):
            try:
                cycles_prefs.compute_device_type = backend
                cycles_prefs.get_devices()
                found = [
                    (dev.name, dev.type)
                    for dev in cycles_prefs.devices
                    if dev.type != 'CPU'
                ]
                if not found:
                    # Backend selectable but no GPU listed: try the next one.
                    continue
                print(
                    f"GPU_PROBE_OK: device_type={backend} devices={found}",
                    flush=True,
                )
                sys.exit(0)
            except Exception as err:
                # A failure on one backend must not stop the remaining ones.
                print(f"GPU_PROBE: {backend} not available: {err}", flush=True)

        print("GPU_PROBE_FAIL: no GPU compute device found", flush=True)
        sys.exit(1)
    except SystemExit:
        # Let the deliberate exit codes above propagate untouched.
        raise
    except Exception as err:
        print(f"GPU_PROBE_FAIL: exception during probe: {err}", flush=True)
        sys.exit(1)
# Run the probe when the file is executed; main() reports its result via sys.exit(0 or 1).
main()
+8 -1
View File
@@ -728,9 +728,16 @@ def main():
continue
except Exception:
pass
if not gpu_found:
if gpu_found:
print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={device_type}", flush=True)
else:
scene.cycles.device = 'CPU'
print("[still_render] WARNING: GPU not found — falling back to CPU")
print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
import os as _os
if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
sys.exit(2)
# ── Render settings ──────────────────────────────────────────────────────
scene.render.resolution_x = width
+8 -1
View File
@@ -682,9 +682,16 @@ def main():
continue
except Exception:
pass
if not gpu_found:
if gpu_found:
print(f"RENDER_DEVICE_USED: engine=CYCLES device=GPU compute_type={device_type}", flush=True)
else:
scene.cycles.device = 'CPU'
print("[turntable_render] WARNING: GPU not found — falling back to CPU")
print("RENDER_DEVICE_USED: engine=CYCLES device=CPU compute_type=NONE (fallback)", flush=True)
import os as _os
if _os.environ.get("CYCLES_DEVICE", "auto").lower() == "gpu":
print("GPU_REQUIRED_BUT_CPU_USED: strict mode active (CYCLES_DEVICE=gpu)", flush=True)
sys.exit(2)
# ── Render settings ──────────────────────────────────────────────────────
scene.render.resolution_x = width