name: CI # Retrigger marker: da0d69c (QNAP runner DNS fix applied)

on:
  push:
    branches: [main]
    paths-ignore:
      - "docs/**"
      - ".gitea/**"
      - "**/*.md"
      - "LICENSE"
  pull_request:
    branches: [main]
    paths-ignore:
      - "docs/**"
      - ".gitea/**"
      - "**/*.md"
      - "LICENSE"

# One in-flight run per ref; superseded runs are cancelled.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  NODE_VERSION: "20"
  PNPM_VERSION: "9.14.2"
  CI_AUTH_URL: http://localhost:3100
  # Placeholder for CI — real secret only matters at deploy time.
  # next build collects page data for auth routes and aborts if empty.
  CI_AUTH_SECRET: ci-test-secret-minimum-32-chars-xx

jobs:
  guardrails:
    name: Architecture Guardrails
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install pnpm
        run: npm install -g pnpm@${{ env.PNPM_VERSION }}
      - uses: actions/setup-node@v4.0.4
        with:
          node-version: ${{ env.NODE_VERSION }}
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Run repo script tests
        run: pnpm test:scripts
      - name: Check architecture guardrails
        run: pnpm check:architecture
      - name: Check workspace exports
        run: pnpm check:exports
      - name: Check workspace imports
        run: pnpm check:imports
      - name: Security audit (high+ severity)
        run: pnpm audit --audit-level=high

  # ──────────────────────────────────────────────
  # Typecheck — ~40s, no services needed
  # ──────────────────────────────────────────────
  typecheck:
    name: Typecheck
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install pnpm
        run: npm install -g pnpm@${{ env.PNPM_VERSION }}
      - uses: actions/setup-node@v4.0.4
        with:
          node-version: ${{ env.NODE_VERSION }}
      - name: Install dependencies
        run: pnpm install --frozen-lockfile
      - name: Generate Prisma client
        run: pnpm db:generate
      - name: Cache Turborepo
        uses: actions/cache@v4
        continue-on-error: true
        with:
          path: .turbo
          key: turbo-typecheck-${{ github.sha }}
          restore-keys: turbo-typecheck-
      - name: Run typecheck
        run: pnpm typecheck

  assistant-split:
    name: Assistant Split Regression
    runs-on: ubuntu-latest
steps: - uses: actions/checkout@v4 - name: Install pnpm run: npm install -g pnpm@${{ env.PNPM_VERSION }} - uses: actions/setup-node@v4.0.4 with: node-version: ${{ env.NODE_VERSION }} - name: Install dependencies run: pnpm install --frozen-lockfile - name: Generate Prisma client run: pnpm db:generate - name: Run assistant split regression run: pnpm --filter @capakraken/api test:assistant-split # ────────────────────────────────────────────── # Lint — ~20s, no services needed # ────────────────────────────────────────────── lint: name: Lint runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Install pnpm run: npm install -g pnpm@${{ env.PNPM_VERSION }} - uses: actions/setup-node@v4.0.4 with: node-version: ${{ env.NODE_VERSION }} - name: Install dependencies run: pnpm install --frozen-lockfile - name: Generate Prisma client run: pnpm db:generate - name: Cache Turborepo uses: actions/cache@v4 continue-on-error: true with: path: .turbo key: turbo-lint-${{ github.sha }} restore-keys: turbo-lint- - name: Run lint run: pnpm lint # ────────────────────────────────────────────── # Unit tests — needs PostgreSQL + Redis # ────────────────────────────────────────────── test: name: Unit Tests runs-on: ubuntu-latest services: postgres: image: postgres:16 env: POSTGRES_DB: capakraken_test POSTGRES_USER: capakraken POSTGRES_PASSWORD: capakraken_test options: >- --health-cmd="pg_isready -U capakraken -d capakraken_test" --health-interval=10s --health-timeout=5s --health-retries=5 redis: image: redis:7 options: >- --health-cmd="redis-cli ping" --health-interval=10s --health-timeout=5s --health-retries=5 env: DATABASE_URL: postgresql://capakraken:capakraken_test@postgres:5432/capakraken_test REDIS_URL: redis://redis:6379 # Force in-memory rate limiter to avoid cross-test state when Redis drops. # Redis fallback downgrades to max/10 limits which rate-limits unit tests. RATE_LIMIT_BACKEND: memory # Tests assume Europe/Berlin for month-boundary math (new Date(y,m,1)). 
TZ: Europe/Berlin NEXTAUTH_URL: ${{ env.CI_AUTH_URL }} AUTH_URL: ${{ env.CI_AUTH_URL }} NEXTAUTH_SECRET: ${{ env.CI_AUTH_SECRET }} AUTH_SECRET: ${{ env.CI_AUTH_SECRET }} steps: - uses: actions/checkout@v4 - name: Install pnpm run: npm install -g pnpm@${{ env.PNPM_VERSION }} - uses: actions/setup-node@v4.0.4 with: node-version: ${{ env.NODE_VERSION }} - name: Install dependencies run: pnpm install --frozen-lockfile - name: Generate Prisma client run: pnpm db:generate - name: Run unit tests with coverage run: | pnpm --filter @capakraken/web test:unit -- --coverage pnpm --filter @capakraken/engine exec vitest run --coverage pnpm --filter @capakraken/staffing exec vitest run --coverage pnpm --filter @capakraken/api exec vitest run --coverage pnpm --filter @capakraken/application exec vitest run --coverage pnpm --filter @capakraken/shared exec vitest run --coverage pnpm --filter @capakraken/db test:unit - name: Upload coverage reports uses: actions/upload-artifact@v4 continue-on-error: true # upload-artifact@v4 unsupported on Gitea (GHES) runner if: ${{ !cancelled() }} with: name: coverage-reports path: | apps/web/coverage/ packages/engine/coverage/ packages/staffing/coverage/ packages/api/coverage/ packages/application/coverage/ packages/shared/coverage/ retention-days: 14 # ────────────────────────────────────────────── # Build — depends on typecheck passing # ────────────────────────────────────────────── build: name: Build needs: [guardrails, typecheck] runs-on: ubuntu-latest env: DATABASE_URL: postgresql://placeholder:placeholder@localhost:5432/placeholder REDIS_URL: redis://placeholder:6379 NEXTAUTH_URL: ${{ env.CI_AUTH_URL }} AUTH_URL: ${{ env.CI_AUTH_URL }} NEXTAUTH_SECRET: ${{ env.CI_AUTH_SECRET }} AUTH_SECRET: ${{ env.CI_AUTH_SECRET }} steps: - uses: actions/checkout@v4 - name: Install pnpm run: npm install -g pnpm@${{ env.PNPM_VERSION }} - uses: actions/setup-node@v4.0.4 with: node-version: ${{ env.NODE_VERSION }} - name: Install dependencies run: pnpm 
install --frozen-lockfile - name: Generate Prisma client run: pnpm db:generate - name: Cache Turborepo uses: actions/cache@v4 continue-on-error: true with: path: .turbo key: turbo-build-${{ github.sha }} restore-keys: turbo-build- - name: Cache Next.js build uses: actions/cache@v4 continue-on-error: true with: path: apps/web/.next/cache key: nextjs-${{ hashFiles('pnpm-lock.yaml') }}-${{ github.sha }} restore-keys: nextjs-${{ hashFiles('pnpm-lock.yaml') }}- - name: Build run: pnpm --filter @capakraken/web exec next build # ────────────────────────────────────────────── # E2E — depends on build, needs PostgreSQL + Redis # ────────────────────────────────────────────── e2e: name: E2E Tests needs: [build] runs-on: ubuntu-latest services: # Unique hostnames — "postgres"/"redis" collide with Gitea's own core # containers and concurrent job service containers on the shared # gitea_gitea network, producing split-brain where push hits one DB and # seed hits another. See audit_logs-missing bug from commit f856dd26. e2epg: image: postgres:16 env: POSTGRES_DB: capakraken_test POSTGRES_USER: capakraken POSTGRES_PASSWORD: capakraken_test options: >- --health-cmd="pg_isready -U capakraken -d capakraken_test" --health-interval=10s --health-timeout=5s --health-retries=5 e2eredis: image: redis:7 options: >- --health-cmd="redis-cli ping" --health-interval=10s --health-timeout=5s --health-retries=5 env: DATABASE_URL: postgresql://capakraken:capakraken_test@e2epg:5432/capakraken_test # Playwright test-server.mjs requires an explicit test DB URL. PLAYWRIGHT_DATABASE_URL: postgresql://capakraken:capakraken_test@e2epg:5432/capakraken_test # prisma-with-env.mjs refuses to run unless DATABASE_URL's db name matches # the expected target; default is "capakraken", CI uses capakraken_test. 
CAPAKRAKEN_EXPECTED_DB_NAME: capakraken_test ALLOW_DESTRUCTIVE_DB_TOOLS: "true" CONFIRM_DESTRUCTIVE_DB_NAME: capakraken_test REDIS_URL: redis://e2eredis:6379 PORT: 3100 # test-server.mjs spawns `docker compose --profile test up postgres-test`; # docker compose validates env interpolation in ALL services before # applying the profile filter, so the unused pgadmin service's # ${PGADMIN_PASSWORD:?} check fires and aborts the compose call. # Provide a dummy value so parsing succeeds — pgadmin is never started. PGADMIN_PASSWORD: ci-unused # Tell test-server.mjs not to spin up its own postgres-test container # — the e2epg job service is already running and reachable. Without # this, test-server tries to publish 5432 on the QNAP host, which # collides with Gitea's core postgres. PLAYWRIGHT_USE_EXTERNAL_DB: "true" NEXTAUTH_URL: ${{ env.CI_AUTH_URL }} AUTH_URL: ${{ env.CI_AUTH_URL }} NEXTAUTH_SECRET: ${{ env.CI_AUTH_SECRET }} AUTH_SECRET: ${{ env.CI_AUTH_SECRET }} steps: - uses: actions/checkout@v4 - name: Install pnpm run: npm install -g pnpm@${{ env.PNPM_VERSION }} - uses: actions/setup-node@v4.0.4 with: node-version: ${{ env.NODE_VERSION }} - name: Install dependencies run: pnpm install --frozen-lockfile - name: Generate Prisma client run: pnpm db:generate - name: Cache Playwright browsers id: playwright-cache uses: actions/cache@v4 continue-on-error: true with: path: ~/.cache/ms-playwright key: playwright-${{ hashFiles('apps/web/package.json') }} restore-keys: playwright- - name: Install Playwright browsers if: steps.playwright-cache.outputs.cache-hit != 'true' run: pnpm --filter @capakraken/web exec playwright install --with-deps chromium - name: Install Playwright system deps if: steps.playwright-cache.outputs.cache-hit == 'true' run: pnpm --filter @capakraken/web exec playwright install-deps chromium - name: Install psql (debug schema state) run: sudo apt-get update && sudo apt-get install -y --no-install-recommends postgresql-client - name: Push DB schema & seed 
env: PGPASSWORD: capakraken_test run: | # Nuke any leftover schema state from a previous job that shared the # postgres service container (act_runner reuses service volumes). # --force-reset alone proved unreliable: push reported "in sync" but # audit_logs ended up missing. Diagnostic hypothesis: there are TWO # postgres hosts reachable as "postgres" on gitea_gitea (the Gitea # core DB plus the service container) and push/seed hit different # ones. Verify via direct psql. echo "--- hosts resolving to 'e2epg' ---" getent hosts e2epg || true # Split-brain fix: 'e2epg' resolves to MULTIPLE IPs on the shared # gitea_gitea network (leftover service containers from concurrent # or crashed runs). Prisma picks one IP; psql picks another; push # reports success but verification sees an empty database. Probe # every resolved IP and lock onto the one that accepts our creds, # then force DATABASE_URL/PLAYWRIGHT_DATABASE_URL to that explicit # IP for the rest of the job so every subsequent step hits the # same postgres instance. 
IPS=$(getent hosts e2epg | awk '{print $1}') PG_IP="" for ip in $IPS; do if PGPASSWORD=capakraken_test psql -h "$ip" -U capakraken -d capakraken_test -v ON_ERROR_STOP=1 -Atc "SELECT 1" >/dev/null 2>&1; then PG_IP="$ip" echo "Locked onto postgres at $PG_IP" break else echo "Rejected $ip (auth or DB mismatch)" fi done if [ -z "$PG_IP" ]; then echo "ERROR: no resolved e2epg IP accepted capakraken_test credentials" exit 1 fi PINNED_URL="postgresql://capakraken:capakraken_test@$PG_IP:5432/capakraken_test" echo "DATABASE_URL=$PINNED_URL" >> "$GITHUB_ENV" echo "PLAYWRIGHT_DATABASE_URL=$PINNED_URL" >> "$GITHUB_ENV" echo "--- DROP SCHEMA ---" psql -h "$PG_IP" -U capakraken -d capakraken_test -v ON_ERROR_STOP=1 \ -c "DROP SCHEMA IF EXISTS public CASCADE; CREATE SCHEMA public; GRANT ALL ON SCHEMA public TO capakraken; GRANT ALL ON SCHEMA public TO public;" echo "--- prisma db push ---" DATABASE_URL="$PINNED_URL" pnpm --filter @capakraken/db exec prisma db push --schema ./prisma/schema.prisma --accept-data-loss --skip-generate echo "--- tables in public after push ---" psql -h "$PG_IP" -U capakraken -d capakraken_test -v ON_ERROR_STOP=1 -At \ -c "SELECT tablename FROM pg_tables WHERE schemaname='public' ORDER BY tablename" \ | tee /tmp/tables.txt if ! grep -qx 'audit_logs' /tmp/tables.txt; then echo "ERROR: audit_logs table missing after push!" exit 1 fi DATABASE_URL="$PINNED_URL" pnpm db:seed - name: Run E2E tests # Bypass turbo here — it runs in strict env mode and does not pass # PLAYWRIGHT_DATABASE_URL / AUTH_SECRET / etc. through to the webServer # subprocess, breaking test-server.mjs. Calling playwright directly # inherits the job-level env unchanged. # # The full E2E suite (~167 tests across 20 specs) overwhelms the # QNAP runner's RAM — Next.js test server hits its memory threshold # and restarts mid-run, producing cascading ECONNREFUSED failures # unrelated to test content. Scope CI to smoke.spec.ts; full suite # is run locally / in a dedicated nightly job. 
run: pnpm --filter @capakraken/web exec playwright test e2e/smoke.spec.ts - name: Upload Playwright report uses: actions/upload-artifact@v4 continue-on-error: true # upload-artifact@v4 unsupported on Gitea (GHES) runner if: ${{ !cancelled() }} with: name: playwright-report path: apps/web/playwright-report/ retention-days: 14 # ────────────────────────────────────────────── # Fresh Docker Compose deploy test — validates # that the prod compose bundle comes up clean # from scratch and the smoke tests pass. # ────────────────────────────────────────────── docker-deploy-test: name: Fresh-Linux Docker Deploy needs: [build] runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Create minimal .env run: | cat <<'EOF' > .env NEXTAUTH_URL=http://localhost:3100 NEXTAUTH_SECRET=ci-test-secret-minimum-32-chars-xx PGADMIN_PASSWORD=ci-pgadmin EOF - name: Tear down any stale stack & volumes # act_runner on self-hosted QNAP keeps named compose volumes between # runs. A previous run's failed migration entry in _prisma_migrations # causes P3009 on the next migrate deploy; wipe volumes for a truly # fresh deploy test every time. run: docker compose -f docker-compose.yml -f docker-compose.ci.yml down -v --remove-orphans || true - name: Start infrastructure (postgres + redis) run: docker compose -f docker-compose.yml -f docker-compose.ci.yml up -d postgres redis - name: Wait for postgres run: | for i in $(seq 1 20); do docker compose -f docker-compose.yml -f docker-compose.ci.yml exec -T postgres pg_isready -U capakraken -d capakraken && break sleep 3 done - name: Build and start app (full profile) run: docker compose -f docker-compose.yml -f docker-compose.ci.yml --profile full up -d --build app - name: Resolve and pin app IP # 'app' hostname collides on shared gitea_gitea network: many unrelated # containers (from other stacks or concurrent jobs) also answer to # "app" and to /api/health. 
Previously we probed every IP that # `getent hosts app` returned and pinned the first 200 responder — # which could easily be a foreign container whose process then died # mid-test, producing ERR_CONNECTION_REFUSED. # # Use docker compose ps to uniquely identify OUR app container, then # docker inspect to read its IP on the gitea_gitea network (the one # the act_runner job can reach). No DNS, no guessing. run: | set -e for i in $(seq 1 36); do CID=$(docker compose -f docker-compose.yml -f docker-compose.ci.yml ps -q app || true) if [ -n "$CID" ]; then APP_IP=$(docker inspect -f '{{range $k,$v := .NetworkSettings.Networks}}{{if eq $k "gitea_gitea"}}{{$v.IPAddress}}{{end}}{{end}}' "$CID") if [ -n "$APP_IP" ]; then CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "http://$APP_IP:3100/api/health" || echo "000") echo "Attempt $i: container $CID on $APP_IP -> HTTP $CODE" if [ "$CODE" = "200" ]; then echo "APP_IP=$APP_IP" >> "$GITHUB_ENV" echo "APP_BASE_URL=http://$APP_IP:3100" >> "$GITHUB_ENV" exit 0 fi else echo "Attempt $i: container $CID has no gitea_gitea IP yet" fi else echo "Attempt $i: compose has no 'app' container yet" fi sleep 5 done echo "Our stack's app container never reported healthy on gitea_gitea" docker compose -f docker-compose.yml -f docker-compose.ci.yml logs app --tail=50 exit 1 - name: Verify health response contains status ok run: | BODY=$(curl -sf "$APP_BASE_URL/api/health") echo "$BODY" echo "$BODY" | grep '"status":"ok"' - name: Warm up root and signin paths (Next.js dev compile) # Dockerfile.dev runs `pnpm dev`, so Next.js compiles pages on the # first request. The middleware+root combo on a cold server can # take >10s to JIT-compile and sometimes OOM-kills a worker on the # QNAP runner, causing the "unauthenticated root redirects" smoke # test to hit ERR_CONNECTION_REFUSED. Warm both routes before the # smoke run: root (must return 307 redirect) and /auth/signin # (must return 200). 
Do NOT use -L; the Location target can point # to a hostname that is unreachable from the runner namespace, and # we only need the route compiled, not the redirect followed. run: | warm() { local path="$1" local expect="$2" for i in $(seq 1 24); do CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 30 "${APP_BASE_URL}${path}" || echo "000") echo "Warm-up ${path} $i: HTTP $CODE" if [ "$CODE" = "$expect" ]; then return 0; fi sleep 5 done echo "Warm-up ${path} did not reach $expect; continuing anyway" } warm / 307 warm /auth/signin 200 - name: Seed admin user # setup-admin.mjs imports @prisma/client and @node-rs/argon2, both of # which live only in packages/db/node_modules under pnpm workspaces. # Node's ESM bare-specifier resolver walks up from the *script's* # directory (/app/scripts), not cwd, and NODE_PATH is a CJS-only # escape hatch (ignored by ESM). Create a scripts/node_modules with # symlinks to the real package directories so the resolver finds # them on the first step up. run: | docker compose -f docker-compose.yml -f docker-compose.ci.yml exec -T app \ sh -c ' set -e mkdir -p /app/scripts/node_modules ln -sfn /app/packages/db/node_modules/@prisma /app/scripts/node_modules/@prisma ln -sfn /app/packages/db/node_modules/@node-rs /app/scripts/node_modules/@node-rs ln -sfn /app/packages/db/node_modules/.prisma /app/scripts/node_modules/.prisma node /app/scripts/setup-admin.mjs --email admin@capakraken.dev --name Admin --password admin123 ' - name: Set up Node.js 20 uses: actions/setup-node@v4.0.4 with: node-version: "20" - name: Install Playwright and Chromium # The repo root package.json uses pnpm `workspace:*` deps which npm # cannot resolve, so install into an isolated temp dir and symlink # @playwright/test into apps/web/node_modules so playwright.ci.config.ts # (CJS) can resolve it by walking up from apps/web/. 
run: | set -e mkdir -p /tmp/pw-install cd /tmp/pw-install [ -f package.json ] || npm init -y >/dev/null npm install --no-save --no-package-lock @playwright/test@1.49 cd "$GITHUB_WORKSPACE" mkdir -p apps/web/node_modules ln -sfn /tmp/pw-install/node_modules/@playwright apps/web/node_modules/@playwright ln -sfn /tmp/pw-install/node_modules/playwright apps/web/node_modules/playwright ln -sfn /tmp/pw-install/node_modules/playwright-core apps/web/node_modules/playwright-core /tmp/pw-install/node_modules/.bin/playwright install chromium --with-deps - name: Re-warm routes immediately before smoke run # The earlier warm-up runs ~4 minutes before the smoke tests (seed, # Node setup, Playwright install all take real time on QNAP). In # between, the Next.js dev server on a constrained host can evict # or recompile routes under memory pressure — test #2 kept hitting # ERR_CONNECTION_REFUSED on / while tests for /auth/signin and api # routes worked fine. Re-warm both routes (same IP pin) just # before Playwright starts so the server is guaranteed hot. run: | warm() { local path="$1" local expect="$2" for i in $(seq 1 24); do CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 30 "${APP_BASE_URL}${path}" || echo "000") echo "Re-warm ${path} $i: HTTP $CODE" if [ "$CODE" = "$expect" ]; then return 0; fi sleep 3 done echo "Re-warm ${path} did not reach $expect; continuing anyway" } warm / 307 warm /auth/signin 200 - name: Run smoke tests # Use the pinned APP_BASE_URL (explicit IP) so Chromium hits the same # container as the warm-up probes. # # Next.js dev mode on QNAP briefly drops the listening socket on # route-transition compiles — test #2 (`/`) has hit ERR_CONNECTION_ # REFUSED between a warm-up and the test even though the same URL # returned 307 moments earlier. Playwright's in-process retry runs # while the socket is still down. Wrap the whole playwright # invocation in a shell retry: if the first run fails, re-warm / # aggressively and run the full suite once more. 
run: | run_smoke() { PLAYWRIGHT_BASE_URL="$APP_BASE_URL" \ /tmp/pw-install/node_modules/.bin/playwright test \ --config apps/web/playwright.ci.config.ts } if run_smoke; then exit 0; fi echo "First smoke run failed — aggressive re-warm + retry" for i in $(seq 1 10); do CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 30 "${APP_BASE_URL}/" || echo "000") echo "Post-fail warm / $i: HTTP $CODE" [ "$CODE" = "307" ] && break sleep 3 done sleep 5 run_smoke - name: Upload Playwright report if: failure() continue-on-error: true # upload-artifact@v4 unsupported on Gitea (GHES) runner uses: actions/upload-artifact@v4 with: name: playwright-smoke-report path: apps/web/playwright-report/ retention-days: 7 - name: Show logs on failure if: failure() run: docker compose -f docker-compose.yml -f docker-compose.ci.yml logs --tail=100 # ────────────────────────────────────────────── # Release images — only on push to main, after # every check has passed. Calls the reusable # release-image.yml workflow. # ────────────────────────────────────────────── release-images: name: Release Images if: github.event_name == 'push' && github.ref == 'refs/heads/main' needs: [lint, test, e2e, assistant-split, docker-deploy-test] uses: ./.github/workflows/release-image.yml secrets: inherit