fix: stabilize shadow workflow smoke comparison
This commit is contained in:
@@ -120,6 +120,7 @@ Parallel execution ownership and stage gates are defined in [`docs/workflows/WOR
|
||||
- `shadow` must finish with a successful order line and a comparison verdict of `pass`
|
||||
- `warn` or `fail` means legacy remains authoritative
|
||||
- `graph` may only be enabled on real output types after the shadow command passes cleanly
|
||||
- Progress: the canonical still smoke flow now passes live in `legacy` and `graph`; `shadow` stabilizes after a short observer-output lag and currently reports `warn` because the observer image differs slightly, so legacy remains authoritative for rollout decisions.
|
||||
|
||||
## Definition of Done
|
||||
|
||||
|
||||
@@ -42,6 +42,7 @@ SAMPLE_STEP = Path(__file__).parent.parent / "step-sample-file" / "81113-l_cut.s
|
||||
RENDER_TIMEOUT_SECONDS = 300 # 5 minutes per render
|
||||
POLL_INTERVAL_SECONDS = 5
|
||||
CAD_PROCESSING_TIMEOUT = 120 # 2 minutes for STEP processing
|
||||
COMPARISON_TIMEOUT_SECONDS = 60  # default wait for the shadow comparison to stabilize
|
||||
|
||||
GREEN = "\033[92m"
|
||||
RED = "\033[91m"
|
||||
@@ -508,6 +509,44 @@ def wait_for_workflow_run(
|
||||
return None
|
||||
|
||||
|
||||
def wait_for_workflow_comparison(
    client: APIClient,
    *,
    workflow_run_id: str,
    timeout_seconds: int = COMPARISON_TIMEOUT_SECONDS,
) -> dict | None:
    """Poll a workflow run's comparison endpoint until the result settles.

    The comparison is considered settled once both the authoritative and the
    observer output are reported as existing and the status is no longer one
    of the transient states (``missing_observer``, ``pending``, ``running``).
    Non-200 responses are treated as transient and retried.

    Args:
        client: API client used to issue the GET requests.
        workflow_run_id: ID of the workflow run whose comparison is polled.
        timeout_seconds: Maximum time to keep polling before giving up.

    Returns:
        The decoded comparison payload once it has settled, or ``None`` if it
        did not settle before the timeout expired.
    """
    transient_statuses = {"missing_observer", "pending", "running"}
    poll_interval_seconds = 2

    # Use the monotonic clock so system clock adjustments cannot shorten or
    # extend the polling window.
    deadline = time.monotonic() + timeout_seconds
    last_state = None

    while time.monotonic() < deadline:
        resp = client.get(f"/workflows/runs/{workflow_run_id}/comparison")
        if resp.status_code != 200:
            time.sleep(poll_interval_seconds)
            continue

        comparison = resp.json()
        status = comparison.get("status")
        # `or {}` also covers an explicit JSON null for an output that has not
        # been produced yet; a plain .get(key, {}) default would not.
        authoritative_output = comparison.get("authoritative_output") or {}
        observer_output = comparison.get("observer_output") or {}
        authoritative_exists = bool(authoritative_output.get("exists"))
        observer_exists = bool(observer_output.get("exists"))

        # Log whenever anything shown in the message changes, not only the
        # status, so a late-arriving artifact is visible in the output.
        state = (status, authoritative_exists, observer_exists)
        if state != last_state:
            info(
                "  Comparison poll: "
                f"status={status} authoritative_exists={authoritative_exists} "
                f"observer_exists={observer_exists}"
            )
            last_state = state

        # Shadow observer artifacts can arrive shortly after the workflow run is visible.
        # Treat missing/processing observer states as transient until the timeout expires.
        if authoritative_exists and observer_exists and status not in transient_statuses:
            return comparison

        time.sleep(poll_interval_seconds)

    return None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test: Order creation + submit + dispatch + wait
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -748,12 +787,13 @@ def test_workflow_still_smoke(
|
||||
)
|
||||
|
||||
if success and execution_mode == "shadow" and workflow_run is not None:
|
||||
resp_cmp = client.get(f"/workflows/runs/{workflow_run['id']}/comparison")
|
||||
if resp_cmp.status_code != 200:
|
||||
warn(f"Shadow comparison lookup failed: {resp_cmp.status_code} {resp_cmp.text[:300]}")
|
||||
comparison = wait_for_workflow_comparison(
|
||||
client,
|
||||
workflow_run_id=workflow_run["id"],
|
||||
)
|
||||
if comparison is None:
|
||||
warn("Shadow comparison did not stabilize before timeout")
|
||||
return success
|
||||
|
||||
comparison = resp_cmp.json()
|
||||
rollout_gate = evaluate_rollout_gate_from_comparison(comparison)
|
||||
verdict = rollout_gate["verdict"]
|
||||
info(
|
||||
|
||||
Reference in New Issue
Block a user