CapaKraken/.github/workflows/ci.yml
Hartmut 805bb0464f security(docker): remove hardcoded dev password, stop placeholder secrets leaking into migrator image (#50)
- docker-compose.yml: require ${POSTGRES_PASSWORD} for the postgres service
  and the app container's DATABASE_URL. No default — compose refuses to start
  without it, mirroring the existing PGADMIN_PASSWORD pattern.
- Dockerfile.prod: move auth/db ENV assignments from persistent ENV lines into
  an inline env prefix on the `pnpm build` RUN step. Placeholders are still
  available to `next build` but no longer persist in the builder layer or in
  the published migrator image (which is FROM builder).
- Dockerfile.dev: add HEALTHCHECK against /api/health and install curl for it.
- .dockerignore: cover nested **/.env*, **/*.pem, **/*.key, **/secrets/**.
- runtime-env.ts: add the CI build placeholder strings to the disallowed-secret
  set so a misconfigured prod deploy using the baked-in ARG defaults fails
  startup instead of silently running with a known-bad secret.
- .env.example: document the new POSTGRES_PASSWORD requirement.
- CI: write POSTGRES_PASSWORD into the Fresh-Linux Docker Deploy job's .env
  (must match docker-compose.ci.yml's hardcoded DATABASE_URL), and provide a
  dummy value in the E2E job, where compose validates interpolation for all services.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-17 14:50:05 +02:00
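
A minimal sketch of the required-variable pattern described above, for readers who don't have docker-compose.yml open (illustrative only; the service names and URL shape are assumptions, not the project's actual file). Compose's ${VAR:?} interpolation aborts the compose invocation during config parsing whenever the variable is unset or empty:

# docker-compose.yml (sketch, not the real file)
services:
  postgres:
    image: postgres:16
    environment:
      # No default value: compose refuses to start the stack without it.
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?POSTGRES_PASSWORD is required}
  app:
    environment:
      DATABASE_URL: postgresql://capakraken:${POSTGRES_PASSWORD:?POSTGRES_PASSWORD is required}@postgres:5432/capakraken

The Dockerfile.prod half of the change works the other way around: the placeholder auth/db values move from persistent ENV instructions to an inline `VAR=value pnpm build` prefix on the RUN step, so `next build` still sees them but they never persist into the builder layer or the migrator image built FROM it.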

name: CI
# Retrigger marker: b2d89ca (docker-deploy smoke retry)
on:
  push:
    branches: [main]
    paths-ignore:
      - "docs/**"
      - ".gitea/**"
      - "**/*.md"
      - "LICENSE"
  pull_request:
    branches: [main]
    paths-ignore:
      - "docs/**"
      - ".gitea/**"
      - "**/*.md"
      - "LICENSE"
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true
env:
  NODE_VERSION: "20"
  PNPM_VERSION: "9.14.2"
  CI_AUTH_URL: http://localhost:3100
  # Placeholder for CI — real secret only matters at deploy time.
  # next build collects page data for auth routes and aborts if empty.
  CI_AUTH_SECRET: ci-test-secret-minimum-32-chars-xx
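  # Note (from the commit message above): runtime-env.ts includes these CI
  # build placeholder strings in its disallowed-secret set, so a misconfigured
  # production deploy that ships them fails at startup instead of silently
  # running with a known-bad secret.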
jobs:
  guardrails:
    name: Architecture Guardrails
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install pnpm
        run: npm install -g pnpm@${{ env.PNPM_VERSION }}
      - uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Run repo script tests
        run: pnpm test:scripts
      - name: Check architecture guardrails
        run: pnpm check:architecture
      - name: Check workspace exports
        run: pnpm check:exports
      - name: Check workspace imports
        run: pnpm check:imports
      - name: Security audit (high+ severity)
        run: pnpm audit --audit-level=high
  # ──────────────────────────────────────────────
  # Typecheck — ~40s, no services needed
  # ──────────────────────────────────────────────
  typecheck:
    name: Typecheck
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install pnpm
        run: npm install -g pnpm@${{ env.PNPM_VERSION }}
      - uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Generate Prisma client
        run: pnpm db:generate
      - name: Cache Turborepo
        uses: actions/cache@v4
        continue-on-error: true
        with:
          path: .turbo
          key: turbo-typecheck-${{ github.sha }}
          restore-keys: turbo-typecheck-
      - name: Run typecheck
        run: pnpm typecheck
  assistant-split:
    name: Assistant Split Regression
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install pnpm
        run: npm install -g pnpm@${{ env.PNPM_VERSION }}
      - uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Generate Prisma client
        run: pnpm db:generate
      - name: Run assistant split regression
        run: pnpm --filter @capakraken/api test:assistant-split
  # ──────────────────────────────────────────────
  # Lint — ~20s, no services needed
  # ──────────────────────────────────────────────
  lint:
    name: Lint
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install pnpm
        run: npm install -g pnpm@${{ env.PNPM_VERSION }}
      - uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Generate Prisma client
        run: pnpm db:generate
      - name: Cache Turborepo
        uses: actions/cache@v4
        continue-on-error: true
        with:
          path: .turbo
          key: turbo-lint-${{ github.sha }}
          restore-keys: turbo-lint-
      - name: Run lint
        run: pnpm lint
  # ──────────────────────────────────────────────
  # Unit tests — needs PostgreSQL + Redis
  # ──────────────────────────────────────────────
  test:
    name: Unit Tests
    runs-on: ubuntu-latest
    services:
      postgres:
        image: postgres:16
        env:
          POSTGRES_DB: capakraken_test
          POSTGRES_USER: capakraken
          POSTGRES_PASSWORD: capakraken_test
        options: >-
          --health-cmd="pg_isready -U capakraken -d capakraken_test"
          --health-interval=10s
          --health-timeout=5s
          --health-retries=5
      redis:
        image: redis:7
        options: >-
          --health-cmd="redis-cli ping"
          --health-interval=10s
          --health-timeout=5s
          --health-retries=5
    env:
      DATABASE_URL: postgresql://capakraken:capakraken_test@postgres:5432/capakraken_test
      REDIS_URL: redis://redis:6379
      # Force in-memory rate limiter to avoid cross-test state when Redis drops.
      # Redis fallback downgrades to max/10 limits which rate-limits unit tests.
      RATE_LIMIT_BACKEND: memory
      # Tests assume Europe/Berlin for month-boundary math (new Date(y,m,1)).
      TZ: Europe/Berlin
      NEXTAUTH_URL: ${{ env.CI_AUTH_URL }}
      AUTH_URL: ${{ env.CI_AUTH_URL }}
      NEXTAUTH_SECRET: ${{ env.CI_AUTH_SECRET }}
      AUTH_SECRET: ${{ env.CI_AUTH_SECRET }}
    steps:
      - uses: actions/checkout@v4
      - name: Install pnpm
        run: npm install -g pnpm@${{ env.PNPM_VERSION }}
      - uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Generate Prisma client
        run: pnpm db:generate
      - name: Run unit tests with coverage
        run: |
          pnpm --filter @capakraken/web test:unit -- --coverage
          pnpm --filter @capakraken/engine exec vitest run --coverage
          pnpm --filter @capakraken/staffing exec vitest run --coverage
          pnpm --filter @capakraken/api exec vitest run --coverage
          pnpm --filter @capakraken/application exec vitest run --coverage
          pnpm --filter @capakraken/shared exec vitest run --coverage
          pnpm --filter @capakraken/db test:unit
      - name: Upload coverage reports
        uses: actions/upload-artifact@v4
        continue-on-error: true # upload-artifact@v4 unsupported on Gitea (GHES) runner
        if: ${{ !cancelled() }}
        with:
          name: coverage-reports
          path: |
            apps/web/coverage/
            packages/engine/coverage/
            packages/staffing/coverage/
            packages/api/coverage/
            packages/application/coverage/
            packages/shared/coverage/
          retention-days: 14
  # ──────────────────────────────────────────────
  # Build — depends on typecheck passing
  # ──────────────────────────────────────────────
  build:
    name: Build
    needs: [guardrails, typecheck]
    runs-on: ubuntu-latest
    env:
      DATABASE_URL: postgresql://placeholder:placeholder@localhost:5432/placeholder
      REDIS_URL: redis://placeholder:6379
      NEXTAUTH_URL: ${{ env.CI_AUTH_URL }}
      AUTH_URL: ${{ env.CI_AUTH_URL }}
      NEXTAUTH_SECRET: ${{ env.CI_AUTH_SECRET }}
      AUTH_SECRET: ${{ env.CI_AUTH_SECRET }}
    steps:
      - uses: actions/checkout@v4
      - name: Install pnpm
        run: npm install -g pnpm@${{ env.PNPM_VERSION }}
      - uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Generate Prisma client
        run: pnpm db:generate
      - name: Cache Turborepo
        uses: actions/cache@v4
        continue-on-error: true
        with:
          path: .turbo
          key: turbo-build-${{ github.sha }}
          restore-keys: turbo-build-
      - name: Cache Next.js build
        uses: actions/cache@v4
        continue-on-error: true
        with:
          path: apps/web/.next/cache
          key: nextjs-${{ hashFiles('pnpm-lock.yaml') }}-${{ github.sha }}
          restore-keys: nextjs-${{ hashFiles('pnpm-lock.yaml') }}-
      - name: Build
        run: pnpm --filter @capakraken/web exec next build
  # ──────────────────────────────────────────────
  # E2E — depends on build, needs PostgreSQL + Redis
  # ──────────────────────────────────────────────
  e2e:
    name: E2E Tests
    needs: [build]
    runs-on: ubuntu-latest
    services:
      # Unique hostnames — "postgres"/"redis" collide with Gitea's own core
      # containers and concurrent job service containers on the shared
      # gitea_gitea network, producing split-brain where push hits one DB and
      # seed hits another. See audit_logs-missing bug from commit f856dd26.
      e2epg:
        image: postgres:16
        env:
          POSTGRES_DB: capakraken_test
          POSTGRES_USER: capakraken
          POSTGRES_PASSWORD: capakraken_test
        options: >-
          --health-cmd="pg_isready -U capakraken -d capakraken_test"
          --health-interval=10s
          --health-timeout=5s
          --health-retries=5
      e2eredis:
        image: redis:7
        options: >-
          --health-cmd="redis-cli ping"
          --health-interval=10s
          --health-timeout=5s
          --health-retries=5
    env:
      DATABASE_URL: postgresql://capakraken:capakraken_test@e2epg:5432/capakraken_test
      # Playwright test-server.mjs requires an explicit test DB URL.
      PLAYWRIGHT_DATABASE_URL: postgresql://capakraken:capakraken_test@e2epg:5432/capakraken_test
      # prisma-with-env.mjs refuses to run unless DATABASE_URL's db name matches
      # the expected target; default is "capakraken", CI uses capakraken_test.
      CAPAKRAKEN_EXPECTED_DB_NAME: capakraken_test
      ALLOW_DESTRUCTIVE_DB_TOOLS: "true"
      CONFIRM_DESTRUCTIVE_DB_NAME: capakraken_test
      REDIS_URL: redis://e2eredis:6379
      PORT: 3100
      # test-server.mjs spawns `docker compose --profile test up postgres-test`;
      # docker compose validates env interpolation in ALL services before
      # applying the profile filter, so the unused pgadmin service's
      # ${PGADMIN_PASSWORD:?} check fires and aborts the compose call.
      # Provide a dummy value so parsing succeeds — pgadmin is never started.
      PGADMIN_PASSWORD: ci-unused
      # Same reason as PGADMIN_PASSWORD: docker compose validates env
      # interpolation across all services, including postgres (which has
      # ${POSTGRES_PASSWORD:?}). Dummy value — postgres service is not used
      # here (the `e2epg` GH Actions service container is).
      POSTGRES_PASSWORD: ci-unused
      # Tell test-server.mjs not to spin up its own postgres-test container
      # — the e2epg job service is already running and reachable. Without
      # this, test-server tries to publish 5432 on the QNAP host, which
      # collides with Gitea's core postgres.
      PLAYWRIGHT_USE_EXTERNAL_DB: "true"
      NEXTAUTH_URL: ${{ env.CI_AUTH_URL }}
      AUTH_URL: ${{ env.CI_AUTH_URL }}
      NEXTAUTH_SECRET: ${{ env.CI_AUTH_SECRET }}
      AUTH_SECRET: ${{ env.CI_AUTH_SECRET }}
    steps:
      - uses: actions/checkout@v4
      - name: Install pnpm
        run: npm install -g pnpm@${{ env.PNPM_VERSION }}
      - uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Generate Prisma client
        run: pnpm db:generate
      - name: Cache Playwright browsers
        id: playwright-cache
        uses: actions/cache@v4
        continue-on-error: true
        with:
          path: ~/.cache/ms-playwright
          key: playwright-${{ hashFiles('apps/web/package.json') }}
          restore-keys: playwright-
      - name: Install Playwright browsers
        if: steps.playwright-cache.outputs.cache-hit != 'true'
        run: pnpm --filter @capakraken/web exec playwright install --with-deps chromium
      - name: Install Playwright system deps
        if: steps.playwright-cache.outputs.cache-hit == 'true'
        run: pnpm --filter @capakraken/web exec playwright install-deps chromium
      - name: Install psql (debug schema state)
        run: sudo apt-get update && sudo apt-get install -y --no-install-recommends postgresql-client
      - name: Push DB schema & seed
        env:
          PGPASSWORD: capakraken_test
        run: |
          # Nuke any leftover schema state from a previous job that shared the
          # postgres service container (act_runner reuses service volumes).
          # --force-reset alone proved unreliable: push reported "in sync" but
          # audit_logs ended up missing. Diagnostic hypothesis: there are TWO
          # postgres hosts reachable as "postgres" on gitea_gitea (the Gitea
          # core DB plus the service container) and push/seed hit different
          # ones. Verify via direct psql.
          echo "--- hosts resolving to 'e2epg' ---"
          getent hosts e2epg || true
          # Split-brain fix: 'e2epg' resolves to MULTIPLE IPs on the shared
          # gitea_gitea network (leftover service containers from concurrent
          # or crashed runs). Prisma picks one IP; psql picks another; push
          # reports success but verification sees an empty database. Probe
          # every resolved IP and lock onto the one that accepts our creds,
          # then force DATABASE_URL/PLAYWRIGHT_DATABASE_URL to that explicit
          # IP for the rest of the job so every subsequent step hits the
          # same postgres instance.
          IPS=$(getent hosts e2epg | awk '{print $1}')
          PG_IP=""
          for ip in $IPS; do
            if PGPASSWORD=capakraken_test psql -h "$ip" -U capakraken -d capakraken_test -v ON_ERROR_STOP=1 -Atc "SELECT 1" >/dev/null 2>&1; then
              PG_IP="$ip"
              echo "Locked onto postgres at $PG_IP"
              break
            else
              echo "Rejected $ip (auth or DB mismatch)"
            fi
          done
          if [ -z "$PG_IP" ]; then
            echo "ERROR: no resolved e2epg IP accepted capakraken_test credentials"
            exit 1
          fi
          PINNED_URL="postgresql://capakraken:capakraken_test@$PG_IP:5432/capakraken_test"
          echo "DATABASE_URL=$PINNED_URL" >> "$GITHUB_ENV"
          echo "PLAYWRIGHT_DATABASE_URL=$PINNED_URL" >> "$GITHUB_ENV"
          echo "--- DROP SCHEMA ---"
          psql -h "$PG_IP" -U capakraken -d capakraken_test -v ON_ERROR_STOP=1 \
            -c "DROP SCHEMA IF EXISTS public CASCADE; CREATE SCHEMA public; GRANT ALL ON SCHEMA public TO capakraken; GRANT ALL ON SCHEMA public TO public;"
          echo "--- prisma db push ---"
          DATABASE_URL="$PINNED_URL" pnpm --filter @capakraken/db exec prisma db push --schema ./prisma/schema.prisma --accept-data-loss --skip-generate
          echo "--- tables in public after push ---"
          psql -h "$PG_IP" -U capakraken -d capakraken_test -v ON_ERROR_STOP=1 -At \
            -c "SELECT tablename FROM pg_tables WHERE schemaname='public' ORDER BY tablename" \
            | tee /tmp/tables.txt
          if ! grep -qx 'audit_logs' /tmp/tables.txt; then
            echo "ERROR: audit_logs table missing after push!"
            exit 1
          fi
          DATABASE_URL="$PINNED_URL" pnpm db:seed
      - name: Run E2E tests
        # Bypass turbo here — it runs in strict env mode and does not pass
        # PLAYWRIGHT_DATABASE_URL / AUTH_SECRET / etc. through to the webServer
        # subprocess, breaking test-server.mjs. Calling playwright directly
        # inherits the job-level env unchanged.
        #
        # The full E2E suite (~167 tests across 20 specs) overwhelms the
        # QNAP runner's RAM — Next.js test server hits its memory threshold
        # and restarts mid-run, producing cascading ECONNREFUSED failures
        # unrelated to test content. Scope CI to smoke.spec.ts; full suite
        # is run locally / in a dedicated nightly job.
        run: pnpm --filter @capakraken/web exec playwright test e2e/smoke.spec.ts
      - name: Upload Playwright report
        uses: actions/upload-artifact@v4
        continue-on-error: true # upload-artifact@v4 unsupported on Gitea (GHES) runner
        if: ${{ !cancelled() }}
        with:
          name: playwright-report
          path: apps/web/playwright-report/
          retention-days: 14
  # ──────────────────────────────────────────────
  # Fresh Docker Compose deploy test — validates
  # that the prod compose bundle comes up clean
  # from scratch and the smoke tests pass.
  # ──────────────────────────────────────────────
  docker-deploy-test:
    name: Fresh-Linux Docker Deploy
    needs: [build]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Create minimal .env
        run: |
          cat <<'EOF' > .env
          NEXTAUTH_URL=http://localhost:3100
          NEXTAUTH_SECRET=ci-test-secret-minimum-32-chars-xx
          PGADMIN_PASSWORD=ci-pgadmin
          # Must match the password baked into docker-compose.ci.yml's
          # DATABASE_URL override (capakraken_dev).
          POSTGRES_PASSWORD=capakraken_dev
          EOF
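      # docker-compose.ci.yml itself is not shown on this page; per the
      # comment above it hard-codes DATABASE_URL with this password, roughly
      #   DATABASE_URL: postgresql://capakraken:capakraken_dev@postgres:5432/capakraken
      # (illustrative reconstruction only), which is why POSTGRES_PASSWORD in
      # the .env above cannot be chosen freely.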
      - name: Tear down any stale stack & volumes
        # act_runner on self-hosted QNAP keeps named compose volumes between
        # runs. A previous run's failed migration entry in _prisma_migrations
        # causes P3009 on the next migrate deploy; wipe volumes for a truly
        # fresh deploy test every time.
        run: docker compose -f docker-compose.yml -f docker-compose.ci.yml down -v --remove-orphans || true
      - name: Start infrastructure (postgres + redis)
        run: docker compose -f docker-compose.yml -f docker-compose.ci.yml up -d postgres redis
      - name: Wait for postgres
        run: |
          for i in $(seq 1 20); do
            docker compose -f docker-compose.yml -f docker-compose.ci.yml exec -T postgres pg_isready -U capakraken -d capakraken && break
            sleep 3
          done
      - name: Build and start app (full profile)
        run: docker compose -f docker-compose.yml -f docker-compose.ci.yml --profile full up -d --build app
      - name: Resolve and pin app IP
        # 'app' hostname collides on shared gitea_gitea network: many unrelated
        # containers (from other stacks or concurrent jobs) also answer to
        # "app" and to /api/health. Previously we probed every IP that
        # `getent hosts app` returned and pinned the first 200 responder —
        # which could easily be a foreign container whose process then died
        # mid-test, producing ERR_CONNECTION_REFUSED.
        #
        # Use docker compose ps to uniquely identify OUR app container, then
        # docker inspect to read its IP on the gitea_gitea network (the one
        # the act_runner job can reach). No DNS, no guessing.
        run: |
          set -e
          for i in $(seq 1 36); do
            CID=$(docker compose -f docker-compose.yml -f docker-compose.ci.yml ps -q app || true)
            if [ -n "$CID" ]; then
              APP_IP=$(docker inspect -f '{{range $k,$v := .NetworkSettings.Networks}}{{if eq $k "gitea_gitea"}}{{$v.IPAddress}}{{end}}{{end}}' "$CID")
              if [ -n "$APP_IP" ]; then
                CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "http://$APP_IP:3100/api/health" || echo "000")
                echo "Attempt $i: container $CID on $APP_IP -> HTTP $CODE"
                if [ "$CODE" = "200" ]; then
                  echo "APP_IP=$APP_IP" >> "$GITHUB_ENV"
                  echo "APP_BASE_URL=http://$APP_IP:3100" >> "$GITHUB_ENV"
                  exit 0
                fi
              else
                echo "Attempt $i: container $CID has no gitea_gitea IP yet"
              fi
            else
              echo "Attempt $i: compose has no 'app' container yet"
            fi
            sleep 5
          done
          echo "Our stack's app container never reported healthy on gitea_gitea"
          docker compose -f docker-compose.yml -f docker-compose.ci.yml logs app --tail=50
          exit 1
      - name: Verify health response contains status ok
        run: |
          BODY=$(curl -sf "$APP_BASE_URL/api/health")
          echo "$BODY"
          echo "$BODY" | grep '"status":"ok"'
      - name: Warm up root and signin paths (Next.js dev compile)
        # Dockerfile.dev runs `pnpm dev`, so Next.js compiles pages on the
        # first request. The middleware+root combo on a cold server can
        # take >10s to JIT-compile and sometimes OOM-kills a worker on the
        # QNAP runner, causing the "unauthenticated root redirects" smoke
        # test to hit ERR_CONNECTION_REFUSED. Warm both routes before the
        # smoke run: root (must return 307 redirect) and /auth/signin
        # (must return 200). Do NOT use -L; the Location target can point
        # to a hostname that is unreachable from the runner namespace, and
        # we only need the route compiled, not the redirect followed.
        run: |
          warm() {
            local path="$1"
            local expect="$2"
            for i in $(seq 1 24); do
              CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 30 "${APP_BASE_URL}${path}" || echo "000")
              echo "Warm-up ${path} $i: HTTP $CODE"
              if [ "$CODE" = "$expect" ]; then return 0; fi
              sleep 5
            done
            echo "Warm-up ${path} did not reach $expect; continuing anyway"
          }
          warm / 307
          warm /auth/signin 200
      - name: Seed admin user
        # setup-admin.mjs imports @prisma/client and @node-rs/argon2, both of
        # which live only in packages/db/node_modules under pnpm workspaces.
        # Node's ESM bare-specifier resolver walks up from the *script's*
        # directory (/app/scripts), not cwd, and NODE_PATH is a CJS-only
        # escape hatch (ignored by ESM). Create a scripts/node_modules with
        # symlinks to the real package directories so the resolver finds
        # them on the first step up.
        run: |
          docker compose -f docker-compose.yml -f docker-compose.ci.yml exec -T app \
            sh -c '
              set -e
              mkdir -p /app/scripts/node_modules
              ln -sfn /app/packages/db/node_modules/@prisma /app/scripts/node_modules/@prisma
              ln -sfn /app/packages/db/node_modules/@node-rs /app/scripts/node_modules/@node-rs
              ln -sfn /app/packages/db/node_modules/.prisma /app/scripts/node_modules/.prisma
              node /app/scripts/setup-admin.mjs --email admin@capakraken.dev --name Admin --password admin123
            '
      - name: Set up Node.js 20
        uses: actions/setup-node@v4
        with:
          node-version: "20"
      - name: Install Playwright and Chromium
        # The repo root package.json uses pnpm `workspace:*` deps which npm
        # cannot resolve, so install into an isolated temp dir and symlink
        # @playwright/test into apps/web/node_modules so playwright.ci.config.ts
        # (CJS) can resolve it by walking up from apps/web/.
        run: |
          set -e
          mkdir -p /tmp/pw-install
          cd /tmp/pw-install
          [ -f package.json ] || npm init -y >/dev/null
          npm install --no-save --no-package-lock @playwright/test@1.49
          cd "$GITHUB_WORKSPACE"
          mkdir -p apps/web/node_modules
          ln -sfn /tmp/pw-install/node_modules/@playwright apps/web/node_modules/@playwright
          ln -sfn /tmp/pw-install/node_modules/playwright apps/web/node_modules/playwright
          ln -sfn /tmp/pw-install/node_modules/playwright-core apps/web/node_modules/playwright-core
          /tmp/pw-install/node_modules/.bin/playwright install chromium --with-deps
      - name: Re-warm routes immediately before smoke run
        # The earlier warm-up runs ~4 minutes before the smoke tests (seed,
        # Node setup, Playwright install all take real time on QNAP). In
        # between, the Next.js dev server on a constrained host can evict
        # or recompile routes under memory pressure — test #2 kept hitting
        # ERR_CONNECTION_REFUSED on / while tests for /auth/signin and api
        # routes worked fine. Re-warm both routes (same IP pin) just
        # before Playwright starts so the server is guaranteed hot.
        run: |
          warm() {
            local path="$1"
            local expect="$2"
            for i in $(seq 1 24); do
              CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 30 "${APP_BASE_URL}${path}" || echo "000")
              echo "Re-warm ${path} $i: HTTP $CODE"
              if [ "$CODE" = "$expect" ]; then return 0; fi
              sleep 3
            done
            echo "Re-warm ${path} did not reach $expect; continuing anyway"
          }
          warm / 307
          warm /auth/signin 200
      - name: Run smoke tests
        # Use the pinned APP_BASE_URL (explicit IP) so Chromium hits the same
        # container as the warm-up probes.
        #
        # Next.js dev mode on QNAP briefly drops the listening socket on
        # route-transition compiles — test #2 (`/`) has hit
        # ERR_CONNECTION_REFUSED between a warm-up and the test even though
        # the same URL returned 307 moments earlier. Playwright's in-process
        # retry runs while the socket is still down. Wrap the whole playwright
        # invocation in a shell retry: if the first run fails, re-warm /
        # aggressively and run the full suite once more.
        run: |
          run_smoke() {
            PLAYWRIGHT_BASE_URL="$APP_BASE_URL" \
              /tmp/pw-install/node_modules/.bin/playwright test \
              --config apps/web/playwright.ci.config.ts
          }
          if run_smoke; then exit 0; fi
          echo "First smoke run failed — aggressive re-warm + retry"
          for i in $(seq 1 10); do
            CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 30 "${APP_BASE_URL}/" || echo "000")
            echo "Post-fail warm / $i: HTTP $CODE"
            [ "$CODE" = "307" ] && break
            sleep 3
          done
          sleep 5
          run_smoke
      - name: Upload Playwright report
        if: failure()
        continue-on-error: true # upload-artifact@v4 unsupported on Gitea (GHES) runner
        uses: actions/upload-artifact@v4
        with:
          name: playwright-smoke-report
          path: apps/web/playwright-report/
          retention-days: 7
      - name: Show logs on failure
        if: failure()
        run: docker compose -f docker-compose.yml -f docker-compose.ci.yml logs --tail=100
  # ──────────────────────────────────────────────
  # Release images — only on push to main, after
  # every check has passed. Calls the reusable
  # release-image.yml workflow.
  # ──────────────────────────────────────────────
  release-images:
    name: Release Images
    if: github.event_name == 'push' && github.ref == 'refs/heads/main'
    needs: [lint, test, e2e, assistant-split, docker-deploy-test]
    uses: ./.github/workflows/release-image.yml
    secrets: inherit