fix: stabilize shadow workflow smoke comparison

This commit is contained in:
2026-04-08 22:14:33 +02:00
parent 375339eb74
commit d685031c1a
2 changed files with 46 additions and 5 deletions
@@ -120,6 +120,7 @@ Parallel execution ownership and stage gates are defined in [`docs/workflows/WOR
- `shadow` must finish with a successful order line and a comparison verdict of `pass`
- `warn` or `fail` means legacy remains authoritative
- `graph` may only be enabled on real output types after the shadow command passes cleanly
- Progress: the canonical still smoke flow now passes live in `legacy` and `graph`. `shadow` stabilizes after a short observer-output lag but currently reports `warn` because the observer image differs slightly, so legacy remains authoritative for rollout decisions.
## Definition of Done
+45 -5
View File
@@ -42,6 +42,7 @@ SAMPLE_STEP = Path(__file__).parent.parent / "step-sample-file" / "81113-l_cut.s
RENDER_TIMEOUT_SECONDS = 300 # 5 minutes per render
POLL_INTERVAL_SECONDS = 5
CAD_PROCESSING_TIMEOUT = 120 # 2 minutes for STEP processing
COMPARISON_TIMEOUT_SECONDS = 60 # default wait (1 minute) for a shadow comparison to stabilize
# ANSI SGR escape sequences used to colorize terminal output.
GREEN = "\033[92m" # bright green foreground
RED = "\033[91m" # bright red foreground
@@ -508,6 +509,44 @@ def wait_for_workflow_run(
return None
def wait_for_workflow_comparison(
    client: APIClient,
    *,
    workflow_run_id: str,
    timeout_seconds: int = COMPARISON_TIMEOUT_SECONDS,
    poll_interval_seconds: float = 2.0,
) -> dict | None:
    """Poll a workflow run's comparison endpoint until the result stabilizes.

    The comparison is considered stable once both the authoritative and the
    observer output are reported as existing and the status is no longer one
    of the transient states (``missing_observer``, ``pending``, ``running``).

    Args:
        client: API client used to issue the GET requests.
        workflow_run_id: Identifier of the workflow run to inspect.
        timeout_seconds: Maximum time to keep polling.
        poll_interval_seconds: Pause between two consecutive polls.

    Returns:
        The decoded comparison payload once it is stable, or ``None`` if the
        timeout expired first.
    """
    transient_statuses = {"missing_observer", "pending", "running"}
    # Monotonic clock: wall-clock adjustments must not stretch or cut the wait.
    deadline = time.monotonic() + timeout_seconds
    last_status = None

    while time.monotonic() < deadline:
        resp = client.get(f"/workflows/runs/{workflow_run_id}/comparison")

        comparison = None
        if resp.status_code == 200:
            try:
                comparison = resp.json()
            except ValueError:
                # Undecodable body: treat like any other transient failure.
                comparison = None
        if not isinstance(comparison, dict):
            time.sleep(poll_interval_seconds)
            continue

        status = comparison.get("status")
        # ``or {}`` also covers an explicit JSON null, which a plain
        # ``.get(key, {})`` default would pass through as None and crash on.
        authoritative_exists = bool(
            (comparison.get("authoritative_output") or {}).get("exists")
        )
        observer_exists = bool(
            (comparison.get("observer_output") or {}).get("exists")
        )

        # Only log when the status changes to keep the poll output readable.
        if status != last_status:
            info(
                "  Comparison poll: "
                f"status={status} authoritative_exists={authoritative_exists} "
                f"observer_exists={observer_exists}"
            )
            last_status = status

        # Shadow observer artifacts can arrive shortly after the workflow run
        # is visible. Treat missing/processing observer states as transient
        # until the timeout expires.
        if (
            authoritative_exists
            and observer_exists
            and status not in transient_statuses
        ):
            return comparison

        time.sleep(poll_interval_seconds)

    return None
# ---------------------------------------------------------------------------
# Test: Order creation + submit + dispatch + wait
# ---------------------------------------------------------------------------
@@ -748,12 +787,13 @@ def test_workflow_still_smoke(
)
if success and execution_mode == "shadow" and workflow_run is not None:
resp_cmp = client.get(f"/workflows/runs/{workflow_run['id']}/comparison")
if resp_cmp.status_code != 200:
warn(f"Shadow comparison lookup failed: {resp_cmp.status_code} {resp_cmp.text[:300]}")
comparison = wait_for_workflow_comparison(
client,
workflow_run_id=workflow_run["id"],
)
if comparison is None:
warn("Shadow comparison did not stabilize before timeout")
return success
comparison = resp_cmp.json()
rollout_gate = evaluate_rollout_gate_from_comparison(comparison)
verdict = rollout_gate["verdict"]
info(