feat(gpu): GPU health check + RENDER_DEVICE_USED token + strict mode
- gpu_probe.py: Blender script that probes OPTIX/CUDA/HIP/ONEAPI and
exits 1 on no GPU — used at startup + on-demand from Admin UI
- blender_render.py, still_render.py, turntable_render.py: emit
RENDER_DEVICE_USED: engine=CYCLES device=GPU|CPU compute_type=...
after GPU activation; exit with code 2 when CYCLES_DEVICE=gpu but rendering falls back to CPU
- render_blender.py: parse RENDER_DEVICE_USED token into render_log
(device_used, compute_type, gpu_fallback); handle exit code 2 as
explicit GPU strict-mode failure
- check_version.py: check_gpu() runs gpu_probe.py at container startup;
CYCLES_DEVICE=gpu aborts startup if no GPU found
- docker-compose.yml: CYCLES_DEVICE=${CYCLES_DEVICE:-auto} env var
- gpu_tasks.py: probe_gpu Celery task on thumbnail_rendering queue;
saves result to system_settings.gpu_probe_last_result; scheduled via Celery beat every 30 min
- worker.py: POST /probe/gpu (trigger) + GET /probe/gpu/result (last result)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -16,6 +16,7 @@ from app.models.order_line import OrderLine
|
||||
from app.models.product import Product
|
||||
from app.models.user import User
|
||||
from app.models.worker_config import WorkerConfig
|
||||
from app.models.system_setting import SystemSetting
|
||||
from app.utils.auth import get_current_user, require_admin_or_pm, require_admin
|
||||
|
||||
# Router for this module's endpoints: paths are prefixed with "/worker" and
# tagged "worker".
router = APIRouter(prefix="/worker", tags=["worker"])
|
||||
@@ -456,6 +457,34 @@ async def scale_workers(
|
||||
return {"service": body.service, "count": body.count, "status": "scaling"}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GPU probe
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@router.post("/probe/gpu", status_code=http_status.HTTP_202_ACCEPTED)
async def trigger_gpu_probe(current_user: User = Depends(require_admin)):
    """Enqueue the ``probe_gpu`` task and report the id it was queued under.

    Access is gated by the ``require_admin`` dependency. The handler only
    submits the task (hence the 202 status) and does not wait for its outcome.

    Returns:
        A dict with ``task_id`` (the queued task's id as a string) and
        ``queued`` set to ``True``.
    """
    # Imported at call time rather than at module import, as before.
    from app.tasks.gpu_tasks import probe_gpu as gpu_probe_task

    queued_task = gpu_probe_task.delay()
    task_id = str(queued_task.id)
    return {"task_id": task_id, "queued": True}
|
||||
|
||||
|
||||
@router.get("/probe/gpu/result")
async def get_gpu_probe_result(
    current_user: User = Depends(require_admin),
    db: AsyncSession = Depends(get_db),
):
    """Return the last GPU probe result stored in system_settings.

    Looks up the ``SystemSetting`` row whose key is ``gpu_probe_last_result``
    and returns its JSON-decoded value. Access is gated by the
    ``require_admin`` dependency.

    Returns:
        The decoded probe result, or a ``{"status": "unknown", ...}``
        placeholder when no row exists or the stored value cannot be decoded
        as JSON.
    """
    import json

    row = await db.execute(
        select(SystemSetting).where(SystemSetting.key == "gpu_probe_last_result")
    )
    setting = row.scalar_one_or_none()
    if not setting:
        return {"status": "unknown", "message": "No probe run yet. Click Run GPU Check."}

    try:
        return json.loads(setting.value)
    except (TypeError, ValueError):
        # json.JSONDecodeError subclasses ValueError (empty or malformed text);
        # TypeError covers a None value. Answer with the same "unknown" status
        # used when no probe has run instead of failing with an HTTP 500.
        return {
            "status": "unknown",
            "message": "Stored GPU probe result is unreadable. Click Run GPU Check.",
        }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Render health check
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
Reference in New Issue
Block a user