feat: render health endpoint + test script + pipeline fixes
- GET /api/worker/health/render: checks render-worker (thumbnail_rendering queue), Blender availability via active_queues inspect, queue depth, last render recency — returns ok/degraded/down status
- scripts/test_render_pipeline.py: integration test for full pipeline (--health, --sample, --full modes)
- PLAN.md: appended Render Pipeline Fixes section with all B-Fixes
- LEARNINGS.md: documented 5 new learnings (queue mismatch, circular import, 307 redirect, worker capability detection)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -352,3 +352,120 @@ async def cancel_task(task_id: str, user: User = Depends(require_admin_or_pm)):
|
||||
from app.tasks.celery_app import celery_app
|
||||
celery_app.control.revoke(task_id, terminate=True, signal="SIGTERM")
|
||||
return {"revoked": task_id}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Render health check
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class RenderHealthStatus(BaseModel):
    # Response payload of the render health check endpoint.

    # Overall verdict: "ok" | "degraded" | "down".
    status: str

    # Worker / tooling availability as seen through Celery inspection.
    render_worker_connected: bool
    blender_available: bool

    # Backlog of the thumbnail_rendering queue and whether it is acceptable.
    thumbnail_queue_depth: int
    thumbnail_queue_ok: bool

    # Most recently completed render; all three are None when none is known.
    last_render_at: str | None  # ISO-8601 timestamp
    last_render_success: bool | None
    last_render_age_minutes: float | None

    # Free-form diagnostics (worker names, error strings from failed probes).
    details: dict
|
||||
|
||||
|
||||
@router.get("/health/render", response_model=RenderHealthStatus)
async def render_health(
    user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
):
    """Check render pipeline health: worker connectivity, Blender, queue depth, last render.

    Probes, in order:

    1. Celery workers (ping + active_queues) to find a worker consuming the
       ``thumbnail_rendering`` queue. Such a worker is treated as having
       Blender available.
    2. The Redis list ``thumbnail_rendering`` for the current queue depth.
    3. The most recently completed ``OrderLine`` render.

    Each probe is best-effort: a failure is recorded in ``details`` (keys
    ``inspect_error``, ``redis_error``, ``db_error``) instead of failing the
    request.

    Status rules:
        * ``down``     – no render worker found, or Blender not available.
        * ``degraded`` – queue depth at/above the limit, or the last render
          failed within the recent-failure window.
        * ``ok``       – otherwise.

    Args:
        user: Authenticated caller; only used to require authentication.
        db: Async database session used for the last-render lookup.

    Returns:
        RenderHealthStatus describing the individual checks and the verdict.
    """
    # Imports are kept local, as in the original endpoint.
    import asyncio
    from datetime import datetime, timezone

    import redis as redis_lib
    from sqlalchemy import desc, select as sa_select

    from app.config import settings as app_settings
    from app.tasks.celery_app import celery_app
    from app.models.order_line import OrderLine

    render_queue = "thumbnail_rendering"
    max_queue_depth = 10  # depth >= this marks the queue as not ok
    recent_failure_window_minutes = 30  # failed render newer than this => degraded

    details: dict = {}

    # 1. Check if render-worker (thumbnail_rendering queue) is connected + has Blender
    render_worker_connected = False
    blender_available = False

    def _inspect_workers() -> dict:
        """Run the blocking Celery inspect calls; never raises."""
        try:
            insp = celery_app.control.inspect(timeout=2.0)
            return {
                "ping": insp.ping() or {},
                "active_queues": insp.active_queues() or {},
            }
        except Exception as exc:
            return {"error": str(exc)}

    # Celery inspection blocks for up to the timeout, so keep it off the event loop.
    inspect_result = await asyncio.to_thread(_inspect_workers)
    if "error" in inspect_result:
        details["inspect_error"] = inspect_result["error"]
    else:
        all_workers = list(inspect_result["ping"])
        active_queues = inspect_result["active_queues"]
        details["workers"] = all_workers

        # Find any worker consuming the thumbnail_rendering queue.
        for worker_name, queues in active_queues.items():
            if any(q.get("name") == render_queue for q in queues or []):
                render_worker_connected = True
                # render-worker always has Blender — it starts Blender successfully
                blender_available = True
                details["render_worker"] = worker_name

        # Fallback: workers answered ping but queue info is unavailable.
        # Only applies when active_queues returned nothing; if queue info IS
        # available and nobody consumes the render queue, the render worker
        # really is missing and must not be reported as connected.
        if not render_worker_connected and not active_queues and all_workers:
            render_worker_connected = True
            details["worker_detection"] = "fallback"

    # 2. Queue depth for thumbnail_rendering
    thumbnail_queue_depth = 0

    def _queue_depth() -> int:
        """Read the queue length with a short-lived client that is always closed."""
        client = redis_lib.from_url(app_settings.redis_url, decode_responses=True)
        try:
            return client.llen(render_queue) or 0
        finally:
            client.close()

    try:
        # The redis client is synchronous; run it in a thread so a slow or
        # unreachable Redis cannot stall the event loop.
        thumbnail_queue_depth = await asyncio.to_thread(_queue_depth)
    except Exception as exc:
        details["redis_error"] = str(exc)

    thumbnail_queue_ok = thumbnail_queue_depth < max_queue_depth

    # 3. Last render time and success
    last_render_at = None
    last_render_success = None
    last_render_age_minutes = None
    try:
        result = await db.execute(
            sa_select(OrderLine.render_completed_at, OrderLine.render_status)
            .where(OrderLine.render_completed_at.isnot(None))
            .order_by(desc(OrderLine.render_completed_at))
            .limit(1)
        )
        row = result.first()
        if row:
            completed_at, render_status = row[0], row[1]
            last_render_at = completed_at.isoformat()
            last_render_success = render_status == "completed"

            # Compare like with like: naive timestamps are taken to be UTC
            # (as the original utcnow() arithmetic did); aware timestamps are
            # compared against an aware "now" to avoid a TypeError.
            now = datetime.now(timezone.utc)
            if completed_at.tzinfo is None:
                now = now.replace(tzinfo=None)
            age = (now - completed_at).total_seconds() / 60
            last_render_age_minutes = round(age, 1)
    except Exception as exc:
        details["db_error"] = str(exc)

    # Determine overall status
    recent_failure = (
        last_render_success is False
        and last_render_age_minutes is not None
        and last_render_age_minutes < recent_failure_window_minutes
    )
    if not render_worker_connected or not blender_available:
        status = "down"
    elif not thumbnail_queue_ok or recent_failure:
        status = "degraded"
    else:
        status = "ok"

    return RenderHealthStatus(
        status=status,
        render_worker_connected=render_worker_connected,
        blender_available=blender_available,
        thumbnail_queue_depth=thumbnail_queue_depth,
        thumbnail_queue_ok=thumbnail_queue_ok,
        last_render_at=last_render_at,
        last_render_success=last_render_success,
        last_render_age_minutes=last_render_age_minutes,
        details=details,
    )
|
||||
|
||||
Reference in New Issue
Block a user