feat: add graph workflow fallback and retry metadata
This commit is contained in:
@@ -32,7 +32,15 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
|
||||
from app.config import settings
|
||||
from app.domains.orders.models import OrderLine
|
||||
from app.domains.rendering.models import OutputType, WorkflowDefinition
|
||||
from app.domains.rendering.workflow_config_utils import (
|
||||
extract_runtime_workflow,
|
||||
get_workflow_execution_mode,
|
||||
)
|
||||
from app.domains.rendering.workflow_executor import prepare_workflow_context
|
||||
from app.domains.rendering.workflow_graph_runtime import (
|
||||
execute_graph_workflow,
|
||||
find_unsupported_graph_nodes,
|
||||
)
|
||||
from app.domains.rendering.workflow_run_service import create_workflow_run, mark_workflow_run_failed
|
||||
|
||||
engine = create_engine(
|
||||
@@ -78,7 +86,90 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
|
||||
)
|
||||
return _legacy_dispatch(order_line_id)
|
||||
|
||||
from app.domains.rendering.workflow_config_utils import extract_runtime_workflow
|
||||
execution_mode = get_workflow_execution_mode(wf_def.config, default="legacy")
|
||||
|
||||
if execution_mode == "graph":
|
||||
try:
|
||||
workflow_context = prepare_workflow_context(
|
||||
wf_def.config,
|
||||
context_id=order_line_id,
|
||||
execution_mode="graph",
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"order_line %s: workflow_definition_id %s failed graph runtime preparation (%s), "
|
||||
"falling back to legacy dispatch",
|
||||
order_line_id,
|
||||
wf_def.id,
|
||||
exc,
|
||||
)
|
||||
return _legacy_dispatch(order_line_id)
|
||||
|
||||
unsupported_nodes = find_unsupported_graph_nodes(workflow_context)
|
||||
if unsupported_nodes:
|
||||
logger.warning(
|
||||
"order_line %s: workflow_definition_id %s contains graph-unsupported nodes %s, "
|
||||
"falling back to legacy dispatch",
|
||||
order_line_id,
|
||||
wf_def.id,
|
||||
unsupported_nodes,
|
||||
)
|
||||
return _legacy_dispatch(order_line_id)
|
||||
|
||||
run = None
|
||||
try:
|
||||
run = create_workflow_run(
|
||||
session,
|
||||
workflow_def_id=wf_def.id,
|
||||
order_line_id=line.id,
|
||||
workflow_context=workflow_context,
|
||||
)
|
||||
session.commit()
|
||||
except Exception as exc:
|
||||
session.rollback()
|
||||
logger.warning(
|
||||
"order_line %s: failed to create graph workflow run for workflow_definition_id %s (%s), "
|
||||
"falling back to legacy dispatch",
|
||||
order_line_id,
|
||||
wf_def.id,
|
||||
exc,
|
||||
)
|
||||
return _legacy_dispatch(order_line_id)
|
||||
|
||||
try:
|
||||
dispatch_result = execute_graph_workflow(session, workflow_context)
|
||||
session.commit()
|
||||
except Exception as exc:
|
||||
if run is not None:
|
||||
mark_workflow_run_failed(run, str(exc))
|
||||
session.commit()
|
||||
logger.exception(
|
||||
"order_line %s: graph workflow execution via definition %s failed, falling back to legacy dispatch",
|
||||
order_line_id,
|
||||
wf_def.id,
|
||||
)
|
||||
fallback_result = _legacy_dispatch(order_line_id)
|
||||
fallback_result["fallback_from"] = "workflow_graph"
|
||||
if run is not None:
|
||||
fallback_result["workflow_run_id"] = str(run.id)
|
||||
return fallback_result
|
||||
|
||||
return {
|
||||
"backend": "workflow_graph",
|
||||
"execution_mode": "graph",
|
||||
"workflow_run_id": str(run.id),
|
||||
"celery_task_id": dispatch_result.task_ids[0] if dispatch_result.task_ids else None,
|
||||
"task_ids": dispatch_result.task_ids,
|
||||
}
|
||||
|
||||
if execution_mode == "shadow":
|
||||
logger.warning(
|
||||
"order_line %s: workflow_definition_id %s requested shadow mode, "
|
||||
"falling back to legacy dispatch until duplicate-safe shadow execution exists",
|
||||
order_line_id,
|
||||
wf_def.id,
|
||||
)
|
||||
return _legacy_dispatch(order_line_id)
|
||||
|
||||
workflow_type, params = extract_runtime_workflow(wf_def.config)
|
||||
if workflow_type is None or workflow_type == "custom":
|
||||
@@ -178,6 +269,7 @@ def dispatch_render_with_workflow(order_line_id: str) -> dict:
|
||||
return {
|
||||
"backend": "workflow",
|
||||
"workflow_type": workflow_type,
|
||||
"execution_mode": "legacy",
|
||||
"workflow_run_id": str(run.id),
|
||||
"celery_task_id": celery_task_id,
|
||||
}
|
||||
|
||||
@@ -17,6 +17,8 @@ _PRESET_TYPES = {
|
||||
"custom",
|
||||
}
|
||||
|
||||
_EXECUTION_MODES = {"legacy", "graph", "shadow"}
|
||||
|
||||
_NODE_TYPE_TO_STEP: dict[str, str] = {
|
||||
"inputNode": StepName.RESOLVE_STEP_PATH.value,
|
||||
"convertNode": StepName.STL_CACHE_GENERATE.value,
|
||||
@@ -243,6 +245,15 @@ def get_workflow_preset_type(config: dict[str, Any]) -> str | None:
|
||||
return None
|
||||
|
||||
|
||||
def get_workflow_execution_mode(config: dict[str, Any], *, default: str = "legacy") -> str:
|
||||
canonical = canonicalize_workflow_config(config)
|
||||
ui = canonical.get("ui") or {}
|
||||
mode = ui.get("execution_mode")
|
||||
if mode in _EXECUTION_MODES:
|
||||
return mode
|
||||
return default
|
||||
|
||||
|
||||
def extract_runtime_workflow(config: dict[str, Any]) -> tuple[str | None, dict[str, Any]]:
|
||||
canonical = canonicalize_workflow_config(config)
|
||||
preset = get_workflow_preset_type(canonical)
|
||||
|
||||
@@ -53,6 +53,17 @@ _ORDER_LINE_RENDER_STEPS = {
|
||||
}
|
||||
|
||||
|
||||
def find_unsupported_graph_nodes(workflow_context: WorkflowContext) -> list[str]:
|
||||
unsupported: list[str] = []
|
||||
for node in workflow_context.ordered_nodes:
|
||||
if node.step in _BRIDGE_EXECUTORS:
|
||||
continue
|
||||
if STEP_TASK_MAP.get(node.step) is not None:
|
||||
continue
|
||||
unsupported.append(node.id)
|
||||
return unsupported
|
||||
|
||||
|
||||
def execute_graph_workflow(
|
||||
session: Session,
|
||||
workflow_context: WorkflowContext,
|
||||
@@ -82,48 +93,102 @@ def execute_graph_workflow(
|
||||
)
|
||||
continue
|
||||
|
||||
retry_policy = _retry_policy(node.params)
|
||||
failure_policy = _failure_policy(node.params)
|
||||
metadata = _base_output(node_result.output, node)
|
||||
metadata["retry_policy"] = retry_policy
|
||||
metadata["failure_policy"] = failure_policy
|
||||
definition = get_node_definition(node.step)
|
||||
bridge_executor = _BRIDGE_EXECUTORS.get(node.step)
|
||||
|
||||
if bridge_executor is not None:
|
||||
started = time.perf_counter()
|
||||
node_result.status = "running"
|
||||
node_result.output = dict(metadata)
|
||||
session.flush()
|
||||
try:
|
||||
payload, status, log_message = bridge_executor(
|
||||
session=session,
|
||||
workflow_context=workflow_context,
|
||||
state=state,
|
||||
node_params=node.params,
|
||||
)
|
||||
except Exception as exc:
|
||||
node_result.status = "failed"
|
||||
node_result.log = str(exc)[:2000]
|
||||
node_result.duration_s = round(time.perf_counter() - started, 4)
|
||||
node_result.output = dict(metadata)
|
||||
max_attempts = retry_policy["max_attempts"]
|
||||
last_error: str | None = None
|
||||
|
||||
for attempt in range(1, max_attempts + 1):
|
||||
started = time.perf_counter()
|
||||
attempt_output = dict(metadata)
|
||||
attempt_output["attempt_count"] = attempt
|
||||
attempt_output["max_attempts"] = max_attempts
|
||||
node_result.status = "running"
|
||||
node_result.output = attempt_output
|
||||
session.flush()
|
||||
raise WorkflowGraphRuntimeError(
|
||||
f"Node '{node.id}' ({node.step.value}) failed: {exc}"
|
||||
) from exc
|
||||
|
||||
if payload:
|
||||
metadata.update(payload)
|
||||
state.node_outputs[node.id] = payload
|
||||
try:
|
||||
payload, status, log_message = bridge_executor(
|
||||
session=session,
|
||||
workflow_context=workflow_context,
|
||||
state=state,
|
||||
node_params=node.params,
|
||||
)
|
||||
except Exception as exc:
|
||||
last_error = str(exc)[:2000]
|
||||
if attempt < max_attempts:
|
||||
retry_output = dict(attempt_output)
|
||||
retry_output["last_error"] = last_error
|
||||
retry_output["retry_state"] = "retrying"
|
||||
node_result.status = "retrying"
|
||||
node_result.log = f"Attempt {attempt}/{max_attempts} failed: {last_error}"
|
||||
node_result.output = retry_output
|
||||
node_result.duration_s = round(time.perf_counter() - started, 4)
|
||||
session.flush()
|
||||
continue
|
||||
|
||||
node_result.status = status
|
||||
node_result.log = log_message
|
||||
node_result.output = dict(metadata)
|
||||
node_result.duration_s = round(time.perf_counter() - started, 4)
|
||||
session.flush()
|
||||
failed_output = dict(attempt_output)
|
||||
failed_output["last_error"] = last_error
|
||||
failed_output["retry_exhausted"] = True
|
||||
node_result.status = "failed"
|
||||
node_result.log = last_error
|
||||
node_result.duration_s = round(time.perf_counter() - started, 4)
|
||||
node_result.output = failed_output
|
||||
session.flush()
|
||||
raise WorkflowGraphRuntimeError(
|
||||
f"Node '{node.id}' ({node.step.value}) failed: {exc}"
|
||||
) from exc
|
||||
|
||||
if status == "failed":
|
||||
raise WorkflowGraphRuntimeError(
|
||||
f"Node '{node.id}' ({node.step.value}) failed: {log_message or 'unknown error'}"
|
||||
)
|
||||
if status == "skipped":
|
||||
skipped_node_ids.append(node.id)
|
||||
if payload:
|
||||
metadata.update(payload)
|
||||
state.node_outputs[node.id] = payload
|
||||
|
||||
final_output = dict(metadata)
|
||||
final_output["attempt_count"] = attempt
|
||||
final_output["max_attempts"] = max_attempts
|
||||
if last_error is not None:
|
||||
final_output["last_error"] = last_error
|
||||
final_output["retry_state"] = "recovered"
|
||||
|
||||
node_result.status = status
|
||||
node_result.log = log_message
|
||||
node_result.output = final_output
|
||||
node_result.duration_s = round(time.perf_counter() - started, 4)
|
||||
session.flush()
|
||||
|
||||
if status == "failed":
|
||||
last_error = (log_message or "unknown error")[:2000]
|
||||
if attempt < max_attempts:
|
||||
retry_output = dict(final_output)
|
||||
retry_output["last_error"] = last_error
|
||||
retry_output["retry_state"] = "retrying"
|
||||
node_result.status = "retrying"
|
||||
node_result.log = f"Attempt {attempt}/{max_attempts} failed: {last_error}"
|
||||
node_result.output = retry_output
|
||||
session.flush()
|
||||
continue
|
||||
|
||||
failed_output = dict(final_output)
|
||||
failed_output["last_error"] = last_error
|
||||
failed_output["retry_exhausted"] = True
|
||||
node_result.status = "failed"
|
||||
node_result.log = last_error
|
||||
node_result.output = failed_output
|
||||
session.flush()
|
||||
raise WorkflowGraphRuntimeError(
|
||||
f"Node '{node.id}' ({node.step.value}) failed: {last_error}"
|
||||
)
|
||||
|
||||
if status == "skipped":
|
||||
skipped_node_ids.append(node.id)
|
||||
break
|
||||
continue
|
||||
|
||||
task_name = STEP_TASK_MAP.get(node.step)
|
||||
@@ -147,6 +212,8 @@ def execute_graph_workflow(
|
||||
metadata["task_id"] = result.id
|
||||
if definition is not None:
|
||||
metadata["execution_kind"] = definition.execution_kind
|
||||
metadata["attempt_count"] = 1
|
||||
metadata["max_attempts"] = retry_policy["max_attempts"]
|
||||
node_result.status = "queued"
|
||||
node_result.output = metadata
|
||||
node_result.log = None
|
||||
@@ -203,6 +270,29 @@ def _base_output(existing: dict[str, Any] | None, node) -> dict[str, Any]:
|
||||
return metadata
|
||||
|
||||
|
||||
def _retry_policy(node_params: dict[str, Any]) -> dict[str, Any]:
|
||||
raw = node_params.get("retry_policy")
|
||||
if not isinstance(raw, dict):
|
||||
raw = {}
|
||||
try:
|
||||
max_attempts = int(raw.get("max_attempts", 1))
|
||||
except (TypeError, ValueError):
|
||||
max_attempts = 1
|
||||
return {
|
||||
"max_attempts": max(1, min(max_attempts, 5)),
|
||||
}
|
||||
|
||||
|
||||
def _failure_policy(node_params: dict[str, Any]) -> dict[str, Any]:
|
||||
raw = node_params.get("failure_policy")
|
||||
if not isinstance(raw, dict):
|
||||
raw = {}
|
||||
return {
|
||||
"halt_workflow": bool(raw.get("halt_workflow", True)),
|
||||
"fallback_to_legacy": bool(raw.get("fallback_to_legacy", False)),
|
||||
}
|
||||
|
||||
|
||||
def _serialize_setup_result(result: OrderLineRenderSetupResult) -> dict[str, Any]:
|
||||
payload: dict[str, Any] = {
|
||||
"setup_status": result.status,
|
||||
|
||||
@@ -16,6 +16,8 @@ Example config::
|
||||
]
|
||||
}
|
||||
"""
|
||||
from typing import Literal
|
||||
|
||||
from pydantic import BaseModel, Field, field_validator, model_validator
|
||||
|
||||
from app.core.process_steps import StepName
|
||||
@@ -49,6 +51,7 @@ class WorkflowEdge(BaseModel):
|
||||
|
||||
class WorkflowUI(BaseModel):
|
||||
preset: str | None = None
|
||||
execution_mode: Literal["legacy", "graph", "shadow"] | None = None
|
||||
|
||||
|
||||
class WorkflowConfig(BaseModel):
|
||||
|
||||
Reference in New Issue
Block a user