feat: add workflow rollout gate signals

This commit is contained in:
2026-04-08 21:44:02 +02:00
parent 8c9648d5dc
commit fe46dabfc5
4 changed files with 1624 additions and 101 deletions
+218 -14
View File
@@ -17,6 +17,28 @@ import logging
logger = logging.getLogger(__name__)
def _build_rollout_signal(
*,
gate_status: str,
ready: bool,
reasons: list[str],
workflow_def_id=None,
output_type_id=None,
verdict: str | None = None,
) -> dict:
return {
"rollout_gate_status": gate_status,
"rollout_gate_verdict": verdict,
"rollout_gate_reasons": reasons,
"workflow_rollout_ready": ready,
"workflow_rollout_status": "ready_for_rollout" if ready else "hold_legacy_authoritative",
"output_type_rollout_ready": ready,
"output_type_rollout_status": "ready_for_rollout" if ready else "hold_legacy_authoritative",
"rollout_workflow_definition_id": str(workflow_def_id) if workflow_def_id is not None else None,
"rollout_output_type_id": str(output_type_id) if output_type_id is not None else None,
}
def dispatch_render_with_workflow(order_line_id: str) -> dict:
"""Dispatch a render for the given order line.
@@ -33,6 +55,7 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
from app.domains.orders.models import OrderLine
from app.domains.rendering.models import OutputType, WorkflowDefinition
from app.domains.rendering.workflow_config_utils import (
canonicalize_workflow_config,
extract_runtime_workflow,
get_workflow_execution_mode,
)
@@ -67,7 +90,16 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
"order_line %s: no workflow_definition_id, using legacy dispatch",
order_line_id,
)
return _legacy_dispatch(order_line_id)
legacy_result = _legacy_dispatch(order_line_id)
legacy_result.update(
_build_rollout_signal(
gate_status="legacy_only",
ready=False,
reasons=["No workflow definition is linked; legacy dispatch remains authoritative."],
output_type_id=getattr(output_type, "id", None),
)
)
return legacy_result
# Load the linked WorkflowDefinition
wf_def: WorkflowDefinition | None = session.execute(
@@ -84,13 +116,45 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
order_line_id,
output_type.workflow_definition_id,
)
return _legacy_dispatch(order_line_id)
legacy_result = _legacy_dispatch(order_line_id)
legacy_result.update(
_build_rollout_signal(
gate_status="workflow_unavailable",
ready=False,
reasons=["Linked workflow definition is missing or inactive; legacy dispatch remains authoritative."],
workflow_def_id=output_type.workflow_definition_id,
output_type_id=output_type.id,
)
)
return legacy_result
execution_mode = get_workflow_execution_mode(wf_def.config, default="legacy")
try:
canonical_config = canonicalize_workflow_config(wf_def.config)
except Exception as exc:
logger.warning(
"order_line %s: workflow_definition_id %s has invalid config (%s), "
"falling back to legacy dispatch",
order_line_id,
wf_def.id,
exc,
)
legacy_result = _legacy_dispatch(order_line_id)
legacy_result.update(
_build_rollout_signal(
gate_status="workflow_invalid",
ready=False,
reasons=[f"Workflow definition config is invalid: {exc}."],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return legacy_result
execution_mode = get_workflow_execution_mode(canonical_config, default="legacy")
def _prepare_graph_context(target_mode: str):
workflow_context = prepare_workflow_context(
wf_def.config,
canonical_config,
context_id=order_line_id,
execution_mode=target_mode,
)
@@ -122,7 +186,18 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
wf_def.id,
exc,
)
return _legacy_dispatch(order_line_id)
legacy_result = _legacy_dispatch(order_line_id)
legacy_result["fallback_from"] = "workflow_graph"
legacy_result.update(
_build_rollout_signal(
gate_status="graph_preparation_failed",
ready=False,
reasons=[f"Graph runtime preparation failed: {exc}."],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return legacy_result
run = None
try:
@@ -136,7 +211,18 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
wf_def.id,
exc,
)
return _legacy_dispatch(order_line_id)
legacy_result = _legacy_dispatch(order_line_id)
legacy_result["fallback_from"] = "workflow_graph"
legacy_result.update(
_build_rollout_signal(
gate_status="graph_run_creation_failed",
ready=False,
reasons=[f"Graph workflow run creation failed: {exc}."],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return legacy_result
try:
dispatch_result = execute_graph_workflow(session, workflow_context)
@@ -154,15 +240,35 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
fallback_result = _legacy_dispatch(order_line_id)
fallback_result["fallback_from"] = "workflow_graph"
fallback_result["workflow_run_id"] = str(run.id)
fallback_result.update(
_build_rollout_signal(
gate_status="graph_execution_failed",
ready=False,
reasons=[f"Graph workflow execution failed: {exc}."],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return fallback_result
return {
result = {
"backend": "workflow_graph",
"execution_mode": "graph",
"workflow_run_id": str(run.id),
"celery_task_id": dispatch_result.task_ids[0] if dispatch_result.task_ids else None,
"task_ids": dispatch_result.task_ids,
}
result.update(
_build_rollout_signal(
gate_status="graph_authoritative",
ready=True,
verdict="pass",
reasons=["Workflow graph dispatch is authoritative for this output type."],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return result
if execution_mode == "shadow":
legacy_result = _legacy_dispatch(order_line_id)
@@ -180,6 +286,18 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
legacy_result["execution_mode"] = "shadow"
legacy_result["shadow_status"] = "skipped"
legacy_result["shadow_error"] = str(exc)
legacy_result.update(
_build_rollout_signal(
gate_status="shadow_skipped",
ready=False,
reasons=[
"Shadow workflow preparation failed; legacy dispatch remains authoritative.",
f"Preparation error: {exc}.",
],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return legacy_result
run = None
@@ -197,6 +315,18 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
legacy_result["execution_mode"] = "shadow"
legacy_result["shadow_status"] = "failed"
legacy_result["shadow_error"] = str(exc)
legacy_result.update(
_build_rollout_signal(
gate_status="shadow_run_creation_failed",
ready=False,
reasons=[
"Shadow workflow run could not be created; legacy dispatch remains authoritative.",
f"Run creation error: {exc}.",
],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return legacy_result
try:
@@ -216,15 +346,39 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
legacy_result["shadow_status"] = "failed"
legacy_result["shadow_error"] = str(exc)
legacy_result["shadow_workflow_run_id"] = str(run.id)
legacy_result.update(
_build_rollout_signal(
gate_status="shadow_execution_failed",
ready=False,
reasons=[
"Shadow workflow execution failed; legacy dispatch remains authoritative.",
f"Execution error: {exc}.",
],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return legacy_result
legacy_result["execution_mode"] = "shadow"
legacy_result["shadow_status"] = "dispatched"
legacy_result["shadow_workflow_run_id"] = str(run.id)
legacy_result["shadow_task_ids"] = dispatch_result.task_ids
legacy_result.update(
_build_rollout_signal(
gate_status="pending_shadow_verdict",
ready=False,
reasons=[
"Legacy dispatch remains authoritative until the shadow workflow comparison returns pass.",
"A pass verdict is required before workflow-first rollout is ready.",
],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return legacy_result
workflow_type, params = extract_runtime_workflow(wf_def.config)
workflow_type, params = extract_runtime_workflow(canonical_config)
if workflow_type is None or workflow_type == "custom":
logger.warning(
"order_line %s: workflow_definition_id %s has no supported preset runtime, "
@@ -232,7 +386,17 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
order_line_id,
wf_def.id,
)
return _legacy_dispatch(order_line_id)
legacy_result = _legacy_dispatch(order_line_id)
legacy_result.update(
_build_rollout_signal(
gate_status="workflow_runtime_unsupported",
ready=False,
reasons=["Workflow definition has no supported preset runtime; legacy dispatch remains authoritative."],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return legacy_result
logger.info(
"order_line %s: dispatching via WorkflowDefinition %s (type=%s)",
@@ -243,7 +407,7 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
try:
workflow_context = prepare_workflow_context(
wf_def.config,
canonical_config,
context_id=order_line_id,
execution_mode="legacy",
)
@@ -255,7 +419,17 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
wf_def.id,
exc,
)
return _legacy_dispatch(order_line_id)
legacy_result = _legacy_dispatch(order_line_id)
legacy_result.update(
_build_rollout_signal(
gate_status="workflow_preparation_failed",
ready=False,
reasons=[f"Workflow runtime preparation failed: {exc}."],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return legacy_result
# For turntable workflows: resolve step_path + output_dir from the order line at runtime
if workflow_type == "turntable" and ("step_path" not in params or "output_dir" not in params):
@@ -299,7 +473,17 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
wf_def.id,
exc,
)
return _legacy_dispatch(order_line_id)
legacy_result = _legacy_dispatch(order_line_id)
legacy_result.update(
_build_rollout_signal(
gate_status="workflow_run_creation_failed",
ready=False,
reasons=[f"Workflow run creation failed: {exc}."],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return legacy_result
from app.domains.rendering.workflow_builder import dispatch_workflow
@@ -317,15 +501,35 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
order_line_id,
wf_def.id,
)
return _legacy_dispatch(order_line_id)
legacy_result = _legacy_dispatch(order_line_id)
legacy_result.update(
_build_rollout_signal(
gate_status="workflow_dispatch_failed",
ready=False,
reasons=[f"Workflow dispatch failed: {exc}."],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return legacy_result
return {
result = {
"backend": "workflow",
"workflow_type": workflow_type,
"execution_mode": "legacy",
"workflow_run_id": str(run.id),
"celery_task_id": celery_task_id,
}
result.update(
_build_rollout_signal(
gate_status="workflow_legacy_runtime",
ready=False,
reasons=["Workflow definition is active, but execution still uses the legacy runtime path."],
workflow_def_id=wf_def.id,
output_type_id=output_type.id,
)
)
return result
def _legacy_dispatch(order_line_id: str) -> dict:
@@ -10,11 +10,16 @@ from PIL import Image, ImageChops, ImageStat
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.core.render_paths import resolve_result_path, result_path_to_storage_key
from app.domains.media.models import MediaAsset
from app.domains.orders.models import OrderLine
from app.domains.rendering.models import WorkflowRun
from app.domains.rendering.schemas import WorkflowComparisonArtifactOut, WorkflowRunComparisonOut
# Parity thresholds for the shadow-comparison rollout gate (see
# evaluate_rollout_gate): a mean per-pixel delta of exactly 0.0 is required
# for a "pass" verdict, deltas up to 0.02 downgrade to "warn", and anything
# larger fails, keeping legacy dispatch authoritative.
ROLLOUT_PASS_MAX_MEAN_PIXEL_DELTA = 0.0
ROLLOUT_WARN_MAX_MEAN_PIXEL_DELTA = 0.02
@dataclass(slots=True)
class _ArtifactComparison:
@@ -36,18 +41,78 @@ class _ArtifactComparison:
sha256=self.sha256,
mime_type=self.mime_type,
image_width=self.image_width,
image_height=self.image_height,
image_height=self.image_height,
)
def evaluate_rollout_gate(
    *,
    authoritative_output: _ArtifactComparison,
    observer_output: _ArtifactComparison,
    exact_match: bool | None,
    dimensions_match: bool | None,
    mean_pixel_delta: float | None,
) -> dict[str, object]:
    """Decide whether one shadow comparison clears the rollout gate.

    Args:
        authoritative_output: Artifact info for the legacy (authoritative) render.
        observer_output: Artifact info for the shadow workflow render.
        exact_match: Whether the two outputs matched byte-for-byte (None if unknown).
        dimensions_match: Whether image dimensions matched (None if unknown).
        mean_pixel_delta: Mean per-pixel difference, or None if not computable.

    Returns:
        A dict with the verdict ("pass"/"warn"/"fail"), readiness flags,
        human-readable reasons, and the thresholds that were applied.
    """

    def _classify() -> tuple[str, str]:
        # Checks are ordered: artifact presence first, then byte equality,
        # then dimensions, then pixel-delta thresholds.
        if not authoritative_output.exists:
            return "fail", "Authoritative legacy output is missing; keep legacy fallback active."
        if not observer_output.exists:
            return "fail", "Observer workflow output is missing; rollout cannot be approved."
        if exact_match:
            return "pass", "Observer output matches the authoritative legacy output byte-for-byte."
        if dimensions_match is False:
            return "fail", "Observer output dimensions differ from the authoritative legacy output."
        if mean_pixel_delta is None:
            return "fail", "Observer output could not be pixel-compared against the authoritative output."
        if mean_pixel_delta <= ROLLOUT_PASS_MAX_MEAN_PIXEL_DELTA:
            return "pass", "Observer output is visually identical within the pass threshold."
        if mean_pixel_delta <= ROLLOUT_WARN_MAX_MEAN_PIXEL_DELTA:
            return (
                "warn",
                "Observer output differs slightly from the authoritative output but remains within the warn threshold.",
            )
        return "fail", "Observer output exceeds the allowed parity threshold; keep legacy fallback active."

    verdict, primary_reason = _classify()
    reasons: list[str] = [primary_reason]
    # Attach the numeric delta detail whenever a pixel comparison actually ran.
    if mean_pixel_delta is not None and not exact_match:
        reasons.append(
            f"Mean pixel delta {mean_pixel_delta:.6f}; "
            f"pass<={ROLLOUT_PASS_MAX_MEAN_PIXEL_DELTA:.6f}, "
            f"warn<={ROLLOUT_WARN_MAX_MEAN_PIXEL_DELTA:.6f}."
        )
    rollout_ready = verdict == "pass"
    rollout_status = "ready_for_rollout" if rollout_ready else "hold_legacy_authoritative"
    return {
        "verdict": verdict,
        "ready": rollout_ready,
        "status": rollout_status,
        "reasons": reasons,
        "thresholds": {
            "pass_max_mean_pixel_delta": ROLLOUT_PASS_MAX_MEAN_PIXEL_DELTA,
            "warn_max_mean_pixel_delta": ROLLOUT_WARN_MAX_MEAN_PIXEL_DELTA,
        },
        "workflow_rollout_ready": rollout_ready,
        "workflow_rollout_status": rollout_status,
        "output_type_rollout_ready": rollout_ready,
        "output_type_rollout_status": rollout_status,
    }
def _normalize_storage_key(path: str | None) -> str | None:
if not path:
return None
normalized = path.replace("\\", "/")
marker = "/uploads/"
if marker in normalized:
return normalized.split(marker, 1)[1]
return normalized.lstrip("/")
return result_path_to_storage_key(path)
def _build_artifact(path: str | None) -> _ArtifactComparison:
@@ -63,7 +128,8 @@ def _build_artifact(path: str | None) -> _ArtifactComparison:
image_height=None,
)
file_path = Path(path)
resolved_path = resolve_result_path(path)
file_path = resolved_path or Path(path)
exists = file_path.exists()
mime_type, _ = mimetypes.guess_type(str(file_path))
@@ -136,10 +202,8 @@ async def _load_shadow_asset_by_workflow_run(
if asset is None:
return None
storage_key = asset.storage_key.lstrip("/")
if storage_key.startswith("app/uploads/"):
return f"/{storage_key}"
return f"/app/uploads/{storage_key}"
resolved = resolve_result_path(asset.storage_key)
return str(resolved) if resolved is not None else None
def _find_shadow_file(order_line: OrderLine, workflow_run: WorkflowRun) -> str | None:
@@ -147,9 +211,13 @@ def _find_shadow_file(order_line: OrderLine, workflow_run: WorkflowRun) -> str |
candidate_roots: list[Path] = []
if order_line.result_path:
candidate_roots.append(Path(order_line.result_path).parent)
resolved_result = resolve_result_path(order_line.result_path)
if resolved_result is not None:
candidate_roots.append(resolved_result.parent)
candidate_roots.append(Path("/app/uploads/renders") / str(order_line.id))
upload_root = Path(settings.upload_dir)
candidate_roots.append(upload_root / "renders" / str(order_line.id))
candidate_roots.append(upload_root / "step_files" / "renders")
seen_roots: set[Path] = set()
candidates: list[Path] = []
@@ -215,6 +283,9 @@ async def build_workflow_run_comparison(
if exact_match:
status = "matched"
summary = "Observer output matches the authoritative legacy output byte-for-byte."
elif mean_pixel_delta == 0.0 and dimensions_match:
status = "matched"
summary = "Observer output matches the authoritative legacy output visually, but file metadata differs."
else:
status = "different"
if dimensions_match is False:
File diff suppressed because it is too large Load Diff