feat(phase8.3): recover_stalled_renders beat task

Detect OrderLines stuck in render_status='processing' for >30min,
revoke their Celery tasks, mark them failed, update render_job_doc,
and emit a single CHANNEL_ALERT batch notification to admins.

Runs every 5min via beat schedule alongside recover_stuck_cad_files.
Section B (tenant_id in JWT) was confirmed already complete in Phase 4.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
commit 69e94e25e3
parent da9014831a
2026-03-08 20:34:36 +01:00
2 changed files with 101 additions and 0 deletions
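One behavioral note on the revoke path in the diff below: control.revoke() is a broadcast, best-effort operation. If no worker is currently executing the task (likely, given the render stalled), it is a no-op and raises nothing, which is why the recovery still marks the line failed unconditionally. A minimal sketch of the call in isolation, with a placeholder task id (the real one is read from render_job_doc):

# Sketch only: best-effort termination of an in-flight Celery task.
# terminate=True additionally signals the worker process executing the
# task (SIGTERM by default) instead of only discarding it from queues.
from app.tasks.celery_app import celery_app

celery_app.control.revoke(
    "placeholder-task-uuid",  # hypothetical id; the real one comes from render_job_doc
    terminate=True,
)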
@@ -82,6 +82,103 @@ def recover_stuck_cad_files() -> None:
logger.error("recover_stuck_cad_files failed: %s", exc) logger.error("recover_stuck_cad_files failed: %s", exc)
@shared_task(name="app.tasks.beat_tasks.recover_stalled_renders", queue="step_processing")
def recover_stalled_renders() -> None:
"""Detect order_lines stuck in render_status='processing' for longer than
RENDER_STALL_TIMEOUT_MINUTES (default 30) and mark them as failed.
Tries to revoke the Celery task if a celery_task_id is stored in render_job_doc.
Emits a single batch system alert to admins via CHANNEL_ALERT.
Runs every 5 minutes via Celery Beat.
"""
RENDER_STALL_TIMEOUT_MINUTES = 30
try:
from sqlalchemy import create_engine, select
from sqlalchemy.orm import Session
from app.config import settings
from app.domains.orders.models import OrderLine
from app.domains.notifications.service import emit_notification_sync, CHANNEL_ALERT
from app.tasks.celery_app import celery_app
cutoff = datetime.utcnow() - timedelta(minutes=RENDER_STALL_TIMEOUT_MINUTES)
sync_url = settings.database_url.replace("+asyncpg", "")
engine = create_engine(sync_url)
stalled_ids: list[str] = []
with Session(engine) as session:
rows = session.execute(
select(OrderLine).where(
OrderLine.render_status == "processing",
OrderLine.render_started_at < cutoff,
)
).scalars().all()
for line in rows:
elapsed_minutes = int(
(datetime.utcnow() - line.render_started_at).total_seconds() / 60
)
# Try to revoke the Celery task if we have its ID
job_doc = line.render_job_doc or {}
celery_task_id = job_doc.get("celery_task_id")
if celery_task_id:
try:
celery_app.control.revoke(celery_task_id, terminate=True)
logger.info(
"[WORKER_RECOVERY] Revoked Celery task %s for order_line %s",
celery_task_id, line.id,
)
except Exception as revoke_exc:
logger.warning(
"[WORKER_RECOVERY] Could not revoke task %s: %s",
celery_task_id, revoke_exc,
)
# Update render_job_doc with failure info
updated_doc = dict(job_doc)
updated_doc["state"] = "failed"
updated_doc["error"] = (
f"Stalled render terminated by health recovery after {RENDER_STALL_TIMEOUT_MINUTES}min"
)
line.render_status = "failed"
line.render_job_doc = updated_doc
stalled_ids.append(str(line.id))
logger.warning(
"[WORKER_RECOVERY] Stalled render for order_line %s terminated after %dmin",
line.id, elapsed_minutes,
)
if stalled_ids:
session.commit()
engine.dispose()
if stalled_ids:
emit_notification_sync(
action="worker.recovery",
entity_type="system",
details={
"message": (
f"Worker recovery: {len(stalled_ids)} stalled render(s) terminated "
f"after {RENDER_STALL_TIMEOUT_MINUTES}min timeout"
),
"stalled_order_line_ids": stalled_ids,
"count": len(stalled_ids),
},
channel=CHANNEL_ALERT,
)
logger.warning(
"recover_stalled_renders: terminated %d stalled render(s): %s",
len(stalled_ids), stalled_ids,
)
else:
logger.debug("recover_stalled_renders: no stalled renders found")
except Exception as exc:
logger.error("recover_stalled_renders failed: %s", exc)
@shared_task(name="app.tasks.beat_tasks.batch_render_notifications", queue="step_processing") @shared_task(name="app.tasks.beat_tasks.batch_render_notifications", queue="step_processing")
def batch_render_notifications() -> None: def batch_render_notifications() -> None:
"""Check for orders where all lines are terminal but no batch notification emitted yet. """Check for orders where all lines are terminal but no batch notification emitted yet.
@@ -42,5 +42,9 @@ celery_app.conf.update(
"task": "app.tasks.beat_tasks.batch_render_notifications", "task": "app.tasks.beat_tasks.batch_render_notifications",
"schedule": 60.0, # every 60 seconds "schedule": 60.0, # every 60 seconds
}, },
"recover-stalled-renders-every-5m": {
"task": "app.tasks.beat_tasks.recover_stalled_renders",
"schedule": 300.0, # every 5 minutes
},
}, },
) )
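The 300.0 above is a rolling interval in seconds, so the first run typically lands about five minutes after beat starts. If alignment to wall-clock 5-minute marks were preferred, Celery's crontab schedule would be the usual alternative (a sketch, not part of this commit):

from celery.schedules import crontab

celery_app.conf.beat_schedule["recover-stalled-renders-every-5m"] = {
    "task": "app.tasks.beat_tasks.recover_stalled_renders",
    "schedule": crontab(minute="*/5"),  # fires at :00, :05, :10, ...
}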