refactor(ops): standardize image-based production delivery

This commit is contained in:
2026-03-30 23:35:29 +02:00
parent ef5e8016a4
commit 7bcc831b5c
17 changed files with 447 additions and 538 deletions
+22 -1
View File
@@ -15,6 +15,27 @@ env:
PNPM_VERSION: "9.14.2"
jobs:
guardrails:
name: Architecture Guardrails
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: pnpm/action-setup@v4
with:
version: ${{ env.PNPM_VERSION }}
- uses: actions/setup-node@v4
with:
node-version: ${{ env.NODE_VERSION }}
cache: pnpm
- name: Install dependencies
run: pnpm install --frozen-lockfile
- name: Check architecture guardrails
run: pnpm check:architecture
# ──────────────────────────────────────────────
# Typecheck — ~40s, no services needed
# ──────────────────────────────────────────────
@@ -147,7 +168,7 @@ jobs:
# ──────────────────────────────────────────────
build:
name: Build
needs: [typecheck]
needs: [guardrails, typecheck]
runs-on: ubuntu-latest
env:
DATABASE_URL: postgresql://placeholder:placeholder@localhost:5432/placeholder
+1 -1
View File
@@ -44,7 +44,7 @@ jobs:
ssh-keyscan -p "${SSH_PORT:-22}" -H "${SSH_HOST}" >> ~/.ssh/known_hosts
- name: Bundle deploy assets
run: tar czf deploy-bundle.tgz docker-compose.cicd.yml tooling/deploy
run: tar czf deploy-bundle.tgz docker-compose.prod.yml tooling/deploy
- name: Copy deploy assets to production
env:
+1 -1
View File
@@ -44,7 +44,7 @@ jobs:
ssh-keyscan -p "${SSH_PORT:-22}" -H "${SSH_HOST}" >> ~/.ssh/known_hosts
- name: Bundle deploy assets
run: tar czf deploy-bundle.tgz docker-compose.cicd.yml tooling/deploy
run: tar czf deploy-bundle.tgz docker-compose.prod.yml tooling/deploy
- name: Copy deploy assets to staging
env:
+11
View File
@@ -1,6 +1,8 @@
name: Release Image
on:
push:
branches: [main]
workflow_dispatch:
inputs:
image_tag:
@@ -61,3 +63,12 @@ jobs:
tags: ${{ steps.vars.outputs.migrator_image }}
cache-from: type=gha,scope=migrator-image
cache-to: type=gha,mode=max,scope=migrator-image
- name: Publish release summary
run: |
{
echo "## Image release"
echo
echo "- App image: \`${{ steps.vars.outputs.app_image }}\`"
echo "- Migrator image: \`${{ steps.vars.outputs.migrator_image }}\`"
} >> "$GITHUB_STEP_SUMMARY"
-77
View File
@@ -1,77 +0,0 @@
name: capakraken-cicd
services:
postgres:
image: postgres:16-alpine
restart: unless-stopped
ports:
- "${POSTGRES_PORT:-5432}:5432"
environment:
POSTGRES_DB: capakraken
POSTGRES_USER: capakraken
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}
volumes:
- capakraken_prod_pgdata:/var/lib/postgresql/data
healthcheck:
test: ["CMD-SHELL", "pg_isready -U capakraken -d capakraken"]
interval: 10s
timeout: 5s
retries: 5
start_period: 10s
redis:
image: redis:7-alpine
restart: unless-stopped
ports:
- "${REDIS_PORT:-6379}:6379"
command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru
volumes:
- capakraken_prod_redis:/data
healthcheck:
test: ["CMD", "redis-cli", "ping"]
interval: 10s
timeout: 5s
retries: 5
start_period: 5s
migrator:
image: ${MIGRATOR_IMAGE:?set MIGRATOR_IMAGE}
restart: "no"
env_file:
- .env.production
environment:
DATABASE_URL: postgresql://capakraken:${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}@postgres:5432/capakraken
REDIS_URL: redis://redis:6379
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
app:
image: ${APP_IMAGE:?set APP_IMAGE}
restart: unless-stopped
ports:
- "${APP_HOST_PORT:-3000}:3000"
env_file:
- .env.production
environment:
DATABASE_URL: postgresql://capakraken:${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}@postgres:5432/capakraken
REDIS_URL: redis://redis:6379
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3000/api/ready"]
interval: 30s
timeout: 5s
retries: 3
start_period: 30s
volumes:
capakraken_prod_pgdata:
name: capakraken_prod_pgdata
capakraken_prod_redis:
name: capakraken_prod_redis
+29 -13
View File
@@ -5,11 +5,11 @@ services:
image: postgres:16-alpine
restart: unless-stopped
ports:
- "5432:5432"
- "${POSTGRES_PORT:-5432}:5432"
environment:
POSTGRES_DB: capakraken
POSTGRES_USER: capakraken
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-changeme}
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}
command: >
postgres
-c log_connections=on
@@ -31,7 +31,7 @@ services:
image: redis:7-alpine
restart: unless-stopped
ports:
- "6379:6379"
- "${REDIS_PORT:-6379}:6379"
command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru
volumes:
- capakraken_prod_redis:/data
@@ -42,29 +42,45 @@ services:
retries: 5
start_period: 5s
app:
build:
context: .
dockerfile: Dockerfile.prod
restart: unless-stopped
ports:
- "3000:3000"
migrator:
image: ${MIGRATOR_IMAGE:?set MIGRATOR_IMAGE}
pull_policy: always
restart: "no"
env_file:
- .env.production
environment:
DATABASE_URL: postgresql://capakraken:${POSTGRES_PASSWORD:-changeme}@postgres:5432/capakraken
DATABASE_URL: postgresql://capakraken:${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}@postgres:5432/capakraken
REDIS_URL: redis://redis:6379
RATE_LIMIT_BACKEND: ${RATE_LIMIT_BACKEND:-redis}
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
app:
image: ${APP_IMAGE:?set APP_IMAGE}
pull_policy: always
restart: unless-stopped
ports:
- "${APP_HOST_PORT:-3000}:3000"
env_file:
- .env.production
environment:
DATABASE_URL: postgresql://capakraken:${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}@postgres:5432/capakraken
REDIS_URL: redis://redis:6379
RATE_LIMIT_BACKEND: ${RATE_LIMIT_BACKEND:-redis}
depends_on:
postgres:
condition: service_healthy
redis:
condition: service_healthy
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:3000/api/health"]
test: ["CMD", "curl", "-f", "http://localhost:3000/api/ready"]
interval: 30s
timeout: 5s
retries: 3
start_period: 20s
start_period: 30s
volumes:
capakraken_prod_pgdata:
+1 -1
View File
@@ -8,7 +8,7 @@
| Topic | File | Use |
|---|---|---|
| AI excellence due diligence | [ai-excellence-due-diligence-roadmap.md](/home/hartmut/Documents/Copilot/capakraken/docs/ai-excellence-due-diligence-roadmap.md) | Frank quality assessment and cleanup roadmap toward a showcase AI-built project |
| Target CI/CD architecture | [cicd-target-architecture.md](/home/hartmut/Documents/Copilot/capakraken/docs/cicd-target-architecture.md) | Proposed image-based build, deploy, and rollback flow |
| Target CI/CD architecture | [cicd-target-architecture.md](/home/hartmut/Documents/Copilot/capakraken/docs/cicd-target-architecture.md) | Canonical image-based build, deploy, and rollback flow |
| Active roadmap and open gaps | [product-roadmap.md](/home/hartmut/Documents/Copilot/capakraken/docs/product-roadmap.md) | Primary backlog and current delivery order |
| Estimating system design | [estimating-extension-design.md](/home/hartmut/Documents/Copilot/capakraken/docs/estimating-extension-design.md) | Workbook analysis, field mapping, and implementation plan |
| Dispo import implementation | [dispo-import-implementation.md](/home/hartmut/Documents/Copilot/capakraken/docs/dispo-import-implementation.md) | Clean-slate Dispo v2 import design, mapping rules, staging flow, and commit policy |
+4 -5
View File
@@ -66,9 +66,9 @@ The previously critical SSE and browser parser coverage issues were addressed du
Evidence: the current performance review identifies repeated in-memory filtering, broad invalidation, and heavyweight timeline/report derivations in [performance-optimization-review-2026-03-18.md](/home/hartmut/Documents/Copilot/capakraken/docs/performance-optimization-review-2026-03-18.md).
Risk: user experience and infrastructure cost will degrade as data volume grows.
3. Production delivery is still in transition.
Evidence: the current repo now has a target CI/CD path, but the old manual production path still coexists with the new image-based deploy model in [cicd-target-architecture.md](/home/hartmut/Documents/Copilot/capakraken/docs/cicd-target-architecture.md).
Risk: the operational source of truth is not yet singular.
3. Rollback and incident drills still need to be exercised, even though the deployment path is now standardized.
Evidence: the canonical production path now runs through [release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml), [deploy-staging.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-staging.yml), [deploy-prod.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-prod.yml), and the single host compose file [docker-compose.prod.yml](/home/hartmut/Documents/Copilot/capakraken/docker-compose.prod.yml).
Risk: a clean architecture path still needs operator rehearsal before it becomes operationally boring under pressure.
## Overall Rating
@@ -92,7 +92,7 @@ The architecture is promising, but file size, router density, and compatibility
### Operational Maturity
`7/10`
`7.5/10`
Good CI and improving deploy discipline are in place, but production standardization still needs one more step.
@@ -191,7 +191,6 @@ Target window: 1 to 2 weeks
Goals:
- complete the move to image-based deploys as the canonical path
- document staging and production bootstrap as code, not tribal knowledge
- ensure staging and production run the Redis-backed rate-limit path intentionally and monitor fallback usage
- define rollback drills and incident response playbooks
+2 -2
View File
@@ -48,6 +48,7 @@
- the country listing and country detail assistant helpers now live in their own domain module, keeping the remaining geo/readmodel lookups out of the monolithic assistant router without changing the assistant contract
- the remaining vacation workflow and entitlement assistant helpers now live in their own domain module, leaving `packages/api/src/router/assistant-tools.ts` as an aggregator/composition layer instead of the last mixed monolithic execution block
- API and auth rate limiting now prefer shared Redis-backed counters when `REDIS_URL` is configured, while retaining an in-memory fallback for local/degraded operation with focused regression coverage
- production delivery is now consolidated on a single image-based compose path with automatic image publication on `main`, deploy-time readiness gating, and architecture guardrails that prevent host-side app builds from creeping back in
## Next Up
@@ -62,8 +63,7 @@ The remaining work is now structural rather than another quick batch:
1. secrets and runtime configuration policy
2. oversized router decomposition
3. canonical image-based production delivery
4. performance hotspot reduction
3. performance hotspot reduction
## Working Rule
+132 -272
View File
@@ -2,333 +2,193 @@
## Overview
CapaKraken uses GitHub Actions for continuous integration and Docker for deployment. This document covers the full pipeline from code push to production.
This is the operational runbook for the canonical CapaKraken delivery path:
---
1. CI validates every PR.
2. Every push to `main` publishes immutable release images.
3. Staging deploys one `sha-<commit>` tag.
4. Production promotes the same tag.
5. The host never builds application code from Git.
## 1. CI Pipeline (Automatic on every PR)
## 1. CI Gate
### What triggers it
The merge gate is [ci.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/ci.yml).
| Event | Trigger |
|-------|---------|
| Pull request to `main` | All CI jobs run |
| Push to `main` | All CI jobs run |
It covers:
### Jobs and their purpose
- architecture guardrails
- typecheck
- lint
- unit tests
- build
- E2E
```
PR opened / pushed
├──→ typecheck (tsc --noEmit, ~40s)
├──→ lint (ESLint via Turborepo, ~20s)
├──→ test (Vitest unit tests, ~60s, needs PostgreSQL + Redis)
└──→ build (next build, ~90s, runs after typecheck)
└──→ e2e (Playwright, ~3-5min, runs after build)
```
Before merging, all required checks must pass.
**typecheck, lint, and test run in parallel** for speed. Build waits for typecheck. E2E waits for build.
### What each job checks
| Job | Command | What it catches |
|-----|---------|----------------|
| **typecheck** | `pnpm --filter @capakraken/web exec tsc --noEmit` | Type errors across the full web app |
| **lint** | `pnpm lint` | Code style violations, unused imports, etc. |
| **test** | `pnpm test:unit` | Unit test failures in engine, staffing, API, shared |
| **build** | `pnpm --filter @capakraken/web exec next build` | SSR errors, dynamic import issues, bundle problems |
| **e2e** | `pnpm test:e2e` | End-to-end user flow regressions |
### Required status checks
Before merging a PR, **all 5 jobs must pass**. Configure this in GitHub Settings > Branches > Branch protection rules > Require status checks.
### Caching
The pipeline caches these artifacts to speed up subsequent runs:
| Cache | Key | Saves |
|-------|-----|-------|
| pnpm store | `pnpm-lock.yaml` hash | ~30s install time |
| Turborepo | `.turbo` directory | ~60s on unchanged packages |
| Playwright browsers | Playwright version | ~45s browser download |
---
## 2. Local Development Quality Gates
Run these before pushing to catch issues early:
Useful local commands:
```bash
# Quick check (< 2 min)
pnpm --filter @capakraken/web exec tsc --noEmit && pnpm lint
# Full check (< 3 min)
pnpm --filter @capakraken/web exec tsc --project tsconfig.typecheck.json --noEmit
pnpm lint
pnpm test:unit
# Full check including build (< 5 min)
pnpm --filter @capakraken/web exec next build
```
### Pre-commit hook (optional)
## 2. Image Release
You can add a Git pre-commit hook to run the quick check automatically:
[release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml) runs automatically on every push to `main`.
```bash
# .husky/pre-commit
pnpm --filter @capakraken/web exec tsc --noEmit
pnpm lint
It publishes:
- `ghcr.io/<owner>/<repo>-app:sha-<commit>`
- `ghcr.io/<owner>/<repo>-migrator:sha-<commit>`
The workflow is also callable manually if a rebuild or tag override is needed.
## 3. Host Bootstrap
Each deploy target should have a dedicated directory such as `/opt/capakraken` containing:
```text
docker-compose.prod.yml
.env.production
deploy.env
tooling/deploy/deploy-compose.sh
```
---
Use these examples from the repo:
## 3. Health Check Endpoints
- [tooling/deploy/.env.production.example](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/.env.production.example)
- [tooling/deploy/deploy.env.example](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy.env.example)
Two endpoints are available for monitoring:
Important host-side rules:
### GET `/api/health` — Liveness Probe
- keep `RATE_LIMIT_BACKEND=redis`
- keep runtime secrets in `.env.production` or the platform secret layer
- do not rotate runtime secrets through admin settings
- ensure the host can pull from `ghcr.io`
Returns 200 if the Node.js process is running. No external dependencies checked.
```json
{ "status": "ok", "timestamp": "2026-03-19T10:00:00.000Z" }
```
**Use for:** Kubernetes/Docker liveness probe, uptime monitoring.
### GET `/api/ready` — Readiness Probe
Checks PostgreSQL and Redis connectivity. Returns 200 if all services are reachable, 503 if not.
```json
// Healthy
{ "status": "ready", "postgres": "ok", "redis": "ok" }
// Unhealthy
{ "status": "not_ready", "postgres": "ok", "redis": "error" }
```
**Use for:** Kubernetes/Docker readiness probe, load balancer health checks, nginx upstream checks.
---
## 4. Production Docker Build
### Building the production image
```bash
# Build the image
docker build -f Dockerfile.prod -t capakraken:latest .
# Test it locally
docker compose -f docker-compose.prod.yml up -d
```
### Image details
| Property | Value |
|----------|-------|
| Base | `node:20-bookworm-slim` |
| Size | ~150-200 MB (vs ~1.5 GB dev image) |
| Output | Next.js standalone mode |
| Healthcheck | `curl -f http://localhost:3000/api/health` |
| Port | 3000 (internal), mapped to 3100 externally |
### Environment variables
The production image requires these environment variables:
```env
# Required
DATABASE_URL=postgresql://user:pass@host:5432/capakraken
REDIS_URL=redis://host:6379
NEXTAUTH_URL=https://capakraken.your-domain.com
NEXTAUTH_SECRET=<random-32-char-string>
# Optional
SENTRY_DSN=https://xxx@sentry.io/xxx
SMTP_HOST=smtp.example.com
SMTP_PORT=587
SMTP_USER=notifications@example.com
SMTP_PASSWORD=<password>
SMTP_FROM=CapaKraken <notifications@example.com>
OPENAI_API_KEY=<optional-if-openai-used>
AZURE_OPENAI_API_KEY=<optional-if-azure-chat-used>
AZURE_DALLE_API_KEY=<optional-if-azure-image-gen-used>
GEMINI_API_KEY=<optional-if-gemini-used>
ANONYMIZATION_SEED=<required-if-deterministic-anonymization-enabled>
```
Generate a secure `NEXTAUTH_SECRET`:
Generate a secure `NEXTAUTH_SECRET` with:
```bash
openssl rand -base64 32
```
Runtime secret policy:
## 4. Staging Deployment
- production secrets are injected through the deployment environment or host secret store
- admin settings must not be used to enter or rotate AI, SMTP, or anonymization secrets
- the admin UI is only for status checks and cleanup of legacy database-stored secret values
Standard path:
---
1. merge to `main`
2. wait for [release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml) to publish `sha-<commit>`
3. run [deploy-staging.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-staging.yml) with that tag
## 5. Deployment
The workflow uploads:
### docker-compose (simplest)
- [docker-compose.prod.yml](/home/hartmut/Documents/Copilot/capakraken/docker-compose.prod.yml)
- [tooling/deploy](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/README.md)
- a short-lived `deploy.env`
On the host, [deploy-compose.sh](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy-compose.sh):
1. validates the rendered compose file
2. pulls `APP_IMAGE` and `MIGRATOR_IMAGE`
3. starts PostgreSQL and Redis
4. runs Prisma migrations with the `migrator` image
5. starts the app
6. waits for `GET /api/ready`
## 5. Production Promotion
After staging is accepted:
1. run [deploy-prod.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-prod.yml)
2. use the exact same `sha-<commit>` tag
3. verify `GET /api/ready`
Production must promote the already-tested image, not rebuild from source.
## 6. Manual Host Dry Run
If you need to verify the host outside GitHub Actions:
```bash
# On your server, after updating the host-side env/secret source
git pull
docker compose -f docker-compose.prod.yml up -d --build
cp tooling/deploy/.env.production.example .env.production
cp tooling/deploy/deploy.env.example deploy.env
# fill in real secrets and image refs first
# Run database migrations
docker compose -f docker-compose.prod.yml exec app \
pnpm --filter @capakraken/db db:migrate:deploy
# Seed initial data (first deployment only)
docker compose -f docker-compose.prod.yml exec app \
pnpm db:seed
set -a
. ./deploy.env
set +a
bash tooling/deploy/deploy-compose.sh staging
```
### Manual deployment (current setup)
## 7. Health Endpoints
Since `capakraken.hartmut-noerenberg.com` runs behind nginx:
### GET `/api/health`
Process liveness only. Use it for coarse uptime checks.
### GET `/api/ready`
Checks PostgreSQL and Redis connectivity. Use it for deploy readiness and traffic admission.
For deploys, `/api/ready` is the source of truth.
## 8. Rollback
Rollback is image-based:
1. choose the previous healthy `sha-<commit>`
2. rerun the staging or production deploy workflow with that tag
3. confirm `GET /api/ready`
Schema changes still need expand-and-contract discipline for rollback safety.
## 9. Troubleshooting
### CI failure
Run the failing command locally:
```bash
# On the server
cd /home/hartmut/Documents/Copilot/capakraken
git pull origin main
pnpm install
pnpm db:generate
pnpm db:validate
pnpm --filter @capakraken/db db:migrate:deploy
pnpm --filter @capakraken/web exec next build
rm -rf apps/web/.next/cache # clear stale cache
# Restart the app (systemd, pm2, or manual)
fuser -k 3100/tcp 2>/dev/null
PORT=3100 pnpm --filter @capakraken/web start &
```
Use the repo-level `pnpm db:*` commands for Prisma/database operations. They load `.env`, `.env.local`, `.env.$NODE_ENV`, and `.env.$NODE_ENV.local` automatically before invoking Prisma.
If you rotate runtime secrets during a manual deploy, update the host-side environment source first, then restart the app so the new process reads the updated values. Do not patch those values through admin settings.
### nginx configuration
The existing nginx reverse proxy should forward to port 3100:
```nginx
server {
server_name capakraken.hartmut-noerenberg.com;
location / {
proxy_pass http://127.0.0.1:3100;
proxy_http_version 1.1;
proxy_set_header Upgrade $http_upgrade;
proxy_set_header Connection "upgrade";
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
# SSE support (keep connection open)
proxy_read_timeout 86400s;
proxy_buffering off;
}
}
```
---
## 6. Monitoring Setup
### Sentry (error tracking)
After creating a Sentry project, add the DSN to `.env.production`:
```env
SENTRY_DSN=https://xxx@sentry.io/xxx
```
Errors are automatically captured by the Sentry integration in Next.js.
### Uptime monitoring
Point an external monitor (UptimeRobot, Better Stack, etc.) at:
```
https://capakraken.hartmut-noerenberg.com/api/health
```
Alert if status code != 200 for more than 2 consecutive checks.
---
## 7. Troubleshooting
### CI job fails: "tsc --noEmit"
TypeScript error in the web app. Run locally:
```bash
pnpm --filter @capakraken/web exec tsc --noEmit
```
### CI job fails: "test:unit"
Unit test failure. Run locally:
```bash
pnpm --filter @capakraken/web exec tsc --project tsconfig.typecheck.json --noEmit
pnpm lint
pnpm test:unit
```
### CI job fails: "next build"
Build error (often `ssr: false` in Server Components, missing exports). Run locally:
```bash
pnpm --filter @capakraken/web exec next build
```
### CI job fails: "e2e"
### Deploy fails before container start
Playwright test failure. Check the HTML report artifact in the GitHub Actions run.
Check the rendered compose configuration on the host:
### Production: 502 Bad Gateway
The Next.js process isn't running. Check:
```bash
ss -tlnp | grep 3100 # Is anything listening?
tail -50 /tmp/capakraken-dev.log # Check app logs
docker compose -f docker-compose.prod.yml config -q
```
Restart:
Then verify `.env.production` and `deploy.env`.
### App never becomes ready
Check:
```bash
fuser -k 3100/tcp 2>/dev/null
pnpm dev & # or pnpm start for production mode
docker compose -f docker-compose.prod.yml ps
docker compose -f docker-compose.prod.yml logs --tail 200 app
curl -s http://127.0.0.1:${APP_HOST_PORT:-3000}/api/ready
```
### Production: 500 Internal Server Error
### Database migration failure
Inspect the migrator logs:
Usually a stale Prisma client after schema changes:
```bash
pnpm db:generate
pnpm db:validate
rm -rf apps/web/.next
pnpm --filter @capakraken/web exec next build
# Restart the server
docker compose -f docker-compose.prod.yml run --rm migrator
```
### Database connection issues
### Registry pull failure
Check the `/api/ready` endpoint:
```bash
curl -s https://capakraken.hartmut-noerenberg.com/api/ready | jq .
```
Verify `GHCR_USERNAME` and `GHCR_TOKEN`, then test:
If `postgres: "error"`, verify:
```bash
docker ps | grep postgres # Is container running?
psql -h localhost -p 5433 -U capakraken -d capakraken # Can you connect?
printf '%s\n' "$GHCR_TOKEN" | docker login ghcr.io -u "$GHCR_USERNAME" --password-stdin
```
+64 -153
View File
@@ -2,83 +2,67 @@
## Goal
This document captures the intended delivery model for CapaKraken without replacing the currently working manual production setup immediately.
This document describes the canonical release path for CapaKraken.
The target state is:
The release model is now:
1. CI validates every PR.
2. GitHub Actions builds immutable Docker images.
3. Staging and production pull those exact images from a registry.
4. Database migrations run as an explicit deploy step.
5. Traffic is considered safe only after the app answers `GET /api/ready`.
1. PRs are validated by CI before merge.
2. Every push to `main` publishes immutable `app` and `migrator` images.
3. Staging and production promote the exact same `sha-<commit>` tag.
4. The host deploys only from images and runtime env files.
5. A deployment is successful only after `GET /api/ready` passes.
## Core Idea
The production host should stop building application code from a Git checkout. Instead, it should only:
- pull a versioned `app` image
- pull a matching `migrator` image
- run Prisma deploy migrations
- start the application container
- wait for readiness
That removes "works on the server but not in CI" drift and makes rollbacks much simpler.
## Delivery Flow
## Canonical Flow
### 1. Pull Request Validation
The existing `CI` workflow continues to validate:
The main [ci.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/ci.yml) workflow remains the merge gate for:
- architecture guardrails for SSE audience scoping
- architecture guardrails
- typecheck
- lint
- unit tests
- build
- E2E
This remains the quality gate before merge.
### 2. Automatic Image Release
The guardrail step currently enforces three invariants:
[release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml) now runs automatically on every push to `main` and can still be started manually for rebuilds or tag overrides.
- no role-based SSE audience fan-out in [event-bus.ts](/home/hartmut/Documents/Copilot/capakraken/packages/api/src/sse/event-bus.ts)
- no role-derived subscription audiences in [subscription-policy.ts](/home/hartmut/Documents/Copilot/capakraken/packages/api/src/sse/subscription-policy.ts)
- no client-provided audience parsing in [route.ts](/home/hartmut/Documents/Copilot/capakraken/apps/web/src/app/api/sse/timeline/route.ts)
It publishes two images from [Dockerfile.prod](/home/hartmut/Documents/Copilot/capakraken/Dockerfile.prod):
### 2. Image Build
- `ghcr.io/<owner>/<repo>-app:sha-<commit>`
- `ghcr.io/<owner>/<repo>-migrator:sha-<commit>`
The new manual workflow [release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml) builds two images from [Dockerfile.prod](/home/hartmut/Documents/Copilot/capakraken/Dockerfile.prod):
### 3. Staging Promotion
- `runner` target as the production app image
- `migrator` target as the Prisma migration image
[deploy-staging.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-staging.yml) copies the canonical deploy bundle to the staging host:
Recommended tag format:
- [docker-compose.prod.yml](/home/hartmut/Documents/Copilot/capakraken/docker-compose.prod.yml)
- [tooling/deploy/deploy-compose.sh](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy-compose.sh)
- the rest of [tooling/deploy](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/README.md)
- `sha-<git-commit>`
GitHub Actions also writes a short-lived `deploy.env` containing `APP_IMAGE`, `MIGRATOR_IMAGE`, and the host port.
Example:
### 4. Host-Side Deployment
```text
ghcr.io/<owner>/capakraken-app:sha-abc123
ghcr.io/<owner>/capakraken-migrator:sha-abc123
```
On the target host, [deploy-compose.sh](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy-compose.sh):
### 3. Staging Deploy
1. loads `.env.production` and `deploy.env`
2. validates the rendered compose file
3. pulls the immutable `app` and `migrator` images
4. starts PostgreSQL and Redis
5. runs Prisma migrations through the dedicated `migrator` image
6. starts the new `app` container
7. waits for `GET /api/ready`
The staging workflow [deploy-staging.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-staging.yml) is intended to:
The host does not build application code from Git anymore.
1. connect to the staging host over SSH
2. copy the deploy assets
3. export `APP_IMAGE` and `MIGRATOR_IMAGE`
4. run [deploy-compose.sh](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy-compose.sh)
### 5. Production Promotion
The compose file used for this target flow is [docker-compose.cicd.yml](/home/hartmut/Documents/Copilot/capakraken/docker-compose.cicd.yml).
[deploy-prod.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-prod.yml) repeats the exact staging flow with the same image tag after staging acceptance.
### 4. Production Promotion
The production workflow [deploy-prod.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-prod.yml) follows the same logic as staging, but the image tag is promoted manually.
That means production uses an image that was already built and can already have been exercised in staging.
That keeps staging and production on the same artifact instead of rebuilding.
## Required Infrastructure
@@ -86,139 +70,66 @@ That means production uses an image that was already built and can already have
- GitHub repository with Actions enabled
- GHCR or another container registry
- 1 Linux host with Docker and Docker Compose
- one Linux host with Docker Engine and Docker Compose v2
- PostgreSQL
- Redis
- reverse proxy such as nginx
- SSH access from GitHub Actions to the host
- reverse proxy or load balancer in front of the app
### Recommended
- separate staging and production hosts
- GitHub Environments for `staging` and `production`
- required reviewer approval for `production`
- backup strategy for PostgreSQL volumes
- uptime monitoring and error tracking
- required approval for the `production` environment
- monitoring on `/api/health` and `/api/ready`
- PostgreSQL backup and restore drills
## Secrets
## Runtime Configuration
### GitHub Environment Secrets
For `staging`:
- `STAGING_SSH_HOST`
- `STAGING_SSH_PORT`
- `STAGING_SSH_USER`
- `STAGING_SSH_KEY`
- `STAGING_DEPLOY_PATH`
- `STAGING_APP_HOST_PORT`
- `STAGING_GHCR_USERNAME`
- `STAGING_GHCR_TOKEN`
For `production`:
- `PROD_SSH_HOST`
- `PROD_SSH_PORT`
- `PROD_SSH_USER`
- `PROD_SSH_KEY`
- `PROD_DEPLOY_PATH`
- `PROD_APP_HOST_PORT`
- `PROD_GHCR_USERNAME`
- `PROD_GHCR_TOKEN`
### Host-side Files
Each target host should already have:
The canonical host-side inputs are:
- [docker-compose.prod.yml](/home/hartmut/Documents/Copilot/capakraken/docker-compose.prod.yml)
- `.env.production`
- Docker installed
- network access to the container registry
- `deploy.env`
The repository now also contains a small host example at [tooling/deploy/.env.production.example](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/.env.production.example) and an operator note at [tooling/deploy/README.md](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/README.md).
`.env.production` holds long-lived runtime configuration and secrets. The example file is [tooling/deploy/.env.production.example](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/.env.production.example).
### Minimum Host Bootstrap
`deploy.env` is short-lived deployment metadata. The example file is [tooling/deploy/deploy.env.example](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy.env.example).
For each target host, create a dedicated deploy directory such as `/opt/capakraken` and place these files there:
Important invariants:
```text
docker-compose.cicd.yml
.env.production
tooling/deploy/deploy-compose.sh
```
`.env.production` should hold the long-lived runtime settings, including:
```env
POSTGRES_PASSWORD=<long-random-password>
NEXTAUTH_URL=https://capakraken.example.com
NEXTAUTH_SECRET=<long-random-secret>
```
GitHub Actions only injects the short-lived image references through `deploy.env`. The deploy script then loads both files before calling Docker Compose, so compose interpolation and container runtime env use the same source of truth.
### Runtime Secret Provisioning Policy
Production and staging secrets should be provisioned at the host or platform-secret layer, not through admin mutations and not through application database writes.
That includes at least:
```env
OPENAI_API_KEY=<optional-if-openai-used>
AZURE_OPENAI_API_KEY=<optional-if-azure-chat-used>
AZURE_DALLE_API_KEY=<optional-if-azure-image-gen-used>
GEMINI_API_KEY=<optional-if-gemini-used>
SMTP_PASSWORD=<required-if-smtp-auth-used>
ANONYMIZATION_SEED=<required-if-deterministic-anonymization-enabled>
```
Operational rule:
- keep these values in `.env.production` only for smaller self-managed hosts, or preferably in the host's secret manager / encrypted environment facility
- do not rotate or patch these values through `SystemSettings`
- use the admin settings page only to verify runtime source/status and to clear leftover legacy database copies
- after migration, legacy database secret fields should be empty in both staging and production
- `RATE_LIMIT_BACKEND=redis` should stay explicit in release environments
- runtime AI, SMTP, and anonymization secrets belong to the host or platform secret layer
- admin settings are for verification and legacy-secret cleanup, not for secret rotation
## Database Policy
Release environments must run migrations through the `migrator` image, which executes:
```bash
pnpm --filter @capakraken/db db:migrate:deploy
```
Do not use `db:push` as the main production deployment mechanism. `db:push` is convenient for local development, but it does not give the release traceability that a migration-based deploy requires.
`db:push` remains a local-development tool, not a production rollout mechanism.
## Rollback Model
Rollback is image-based:
1. choose the previous good `sha-...` tag
2. run the production deploy workflow again with that tag
3. confirm readiness
1. choose the previous healthy `sha-<commit>` tag
2. redeploy staging or production with that tag
3. confirm `GET /api/ready`
This is only safe when schema changes follow backwards-compatible expand and contract rules.
This assumes schema changes follow backwards-compatible expand-and-contract rollout rules.
## Production Update Summary
The standard production update is:
1. merge to `main` after the existing CI workflow is green
2. run [release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml) to build immutable `app` and `migrator` images tagged as `sha-<commit>`
3. run [deploy-staging.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-staging.yml) with that exact image tag
4. GitHub Actions uploads the deploy bundle to the staging host and writes a temporary `deploy.env`
5. [deploy-compose.sh](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy-compose.sh) pulls images, starts PostgreSQL and Redis, runs Prisma deploy migrations, starts the new app container, and waits for `GET /api/ready`
6. after staging is accepted, run [deploy-prod.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-prod.yml) with the same tag
7. production repeats the same image-based flow, so the running artifact matches staging
1. merge to `main` after CI is green
2. let [release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml) publish `sha-<commit>` images
3. deploy that tag to staging through [deploy-staging.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-staging.yml)
4. validate staging
5. promote the same tag through [deploy-prod.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-prod.yml)
That means the production host no longer builds from Git. It only receives a versioned image and starts it after migrations complete.
The same principle applies to secrets: the running container reads them from the deployment environment at start time, so an update only needs a new image tag unless secret material itself is being rotated.
## Current Status
The repository now contains the CI/CD scaffolding, but the existing manual production setup remains untouched:
- current manual compose flow: [docker-compose.prod.yml](/home/hartmut/Documents/Copilot/capakraken/docker-compose.prod.yml)
- current manual runbook: [ci-cd-manual.md](/home/hartmut/Documents/Copilot/capakraken/docs/ci-cd-manual.md)
This allows the team to introduce the new path gradually instead of switching production in one step.
The important property is artifact identity: staging and production run the same image, not two separate builds.
+1
View File
@@ -9,6 +9,7 @@
"test": "turbo run test:unit",
"test:unit": "turbo test:unit",
"test:e2e": "turbo test:e2e",
"check:architecture": "node ./scripts/check-architecture-guardrails.mjs",
"db:doctor": "node ./scripts/db-doctor.mjs capakraken",
"db:prisma": "node ./scripts/prisma-with-env.mjs",
"db:push": "node ./scripts/with-env.mjs pnpm --filter @capakraken/db db:push",
+155
View File
@@ -0,0 +1,155 @@
import { readFile } from "node:fs/promises";
import path from "node:path";
import process from "node:process";
// Repository root. The script is invoked from the repo root via the
// `check:architecture` package script, so cwd() is the checkout root.
const rootDir = process.cwd();
// Guardrail rule table. Each entry guards one repository file:
// - `required`: regexes that MUST match the file's contents (anchor patterns
//   proving an architectural decision is still in place)
// - `forbidden`: regexes that MUST NOT match (patterns whose reappearance
//   needs explicit architecture review)
// Each `message` is the human-readable explanation printed on violation.
// NOTE: patterns are matched against raw file text, so they are sensitive to
// exact whitespace/quoting in the guarded files.
const rules = [
  // SSE event bus must stay free of role-based audience plumbing.
  {
    file: "packages/api/src/sse/event-bus.ts",
    required: [],
    forbidden: [
      { pattern: /\bRoleSseAudience\b/, message: "role-based SSE audience types must not reappear" },
      { pattern: /\broleAudience\s*\(/, message: "role-derived SSE audiences must not be emitted" },
      { pattern: /\bBROADCAST_SENT\b/, message: "broadcast SSE event resurrection needs explicit architecture review" },
    ],
  },
  // Subscription derivation must remain centralized and role-free.
  {
    file: "packages/api/src/sse/subscription-policy.ts",
    required: [
      {
        pattern: /\bderiveUserSseSubscription\b/,
        message: "subscription derivation must stay centralized in deriveUserSseSubscription",
      },
    ],
    forbidden: [
      { pattern: /\broleAudience\s*\(/, message: "subscription policy must not derive role audiences" },
    ],
  },
  // Timeline SSE route must derive audiences server-side, never from the client.
  {
    file: "apps/web/src/app/api/sse/timeline/route.ts",
    required: [
      {
        pattern: /\bderiveUserSseSubscription\s*\(/,
        message: "timeline SSE route must derive audiences server-side from the authenticated user",
      },
    ],
    forbidden: [
      { pattern: /\bsearchParams\b/, message: "timeline SSE route must not accept client-provided audience scoping" },
      { pattern: /\baudience\b/, message: "timeline SSE route must not parse raw audience values from the client" },
    ],
  },
  // Production compose must deploy immutable images and never build on the host.
  {
    file: "docker-compose.prod.yml",
    required: [
      {
        pattern: /image:\s+\$\{APP_IMAGE:\?set APP_IMAGE\}/,
        message: "production compose must deploy the immutable app image",
      },
      {
        pattern: /image:\s+\$\{MIGRATOR_IMAGE:\?set MIGRATOR_IMAGE\}/,
        message: "production compose must deploy the immutable migrator image",
      },
      {
        pattern: /http:\/\/localhost:3000\/api\/ready/,
        message: "production compose must gate app health on the readiness endpoint",
      },
      {
        pattern: /RATE_LIMIT_BACKEND:\s+\$\{RATE_LIMIT_BACKEND:-redis\}/,
        message: "production compose must intentionally pin the Redis-backed rate-limit path",
      },
    ],
    forbidden: [
      { pattern: /\bbuild:/, message: "production compose must not build application images on the host" },
    ],
  },
  // Image releases must run on pushes to main and stay manually dispatchable.
  {
    file: ".github/workflows/release-image.yml",
    required: [
      {
        pattern: /push:\s*\n\s*branches:\s*\[main\]/,
        message: "image releases must build automatically on pushes to main",
      },
      {
        pattern: /workflow_dispatch:/,
        message: "image release must remain manually callable for rebuilds and tag overrides",
      },
      {
        pattern: /target:\s+runner/,
        message: "release workflow must keep publishing the runner image",
      },
      {
        pattern: /target:\s+migrator/,
        message: "release workflow must keep publishing the migrator image",
      },
    ],
    forbidden: [],
  },
  // Staging deploy must ship the canonical production compose bundle.
  {
    file: ".github/workflows/deploy-staging.yml",
    required: [
      {
        pattern: /docker-compose\.prod\.yml tooling\/deploy/,
        message: "staging deploy must ship the canonical production compose bundle",
      },
    ],
    forbidden: [],
  },
  // Production deploy must ship the same canonical compose bundle as staging.
  {
    file: ".github/workflows/deploy-prod.yml",
    required: [
      {
        pattern: /docker-compose\.prod\.yml tooling\/deploy/,
        message: "production deploy must ship the canonical production compose bundle",
      },
    ],
    forbidden: [],
  },
  // Host deploy script must default to the canonical compose file, validate it,
  // and gate on the readiness endpoint.
  {
    file: "tooling/deploy/deploy-compose.sh",
    required: [
      {
        pattern: /COMPOSE_FILE="\$\{COMPOSE_FILE:-docker-compose\.prod\.yml\}"/,
        message: "deploy script must default to the canonical production compose file",
      },
      {
        pattern: /READY_URL="\$\{READY_URL:-http:\/\/127\.0\.0\.1:\$\{APP_HOST_PORT:-3000\}\/api\/ready\}"/,
        message: "deploy script must wait on the readiness endpoint",
      },
      {
        pattern: /docker compose -f "\$\{COMPOSE_FILE\}" config -q/,
        message: "deploy script must validate the rendered compose file before pulling images",
      },
    ],
    forbidden: [],
  },
];
// Collected human-readable violation lines; non-empty means the check fails.
const violations = [];

for (const rule of rules) {
  // Resolve the guarded file relative to the repository root.
  const absolutePath = path.join(rootDir, rule.file);

  let source;
  try {
    source = await readFile(absolutePath, "utf8");
  } catch (error) {
    // A guarded file that cannot be read is itself a guardrail violation:
    // previously a rename/deletion surfaced as an unhandled rejection with
    // no context instead of a clear failure line.
    violations.push(
      `${rule.file}: guarded file is missing or unreadable (${error?.code ?? error}))`.replace("))", ")"),
    );
    continue;
  }

  // Required anchors: every pattern must match somewhere in the file.
  for (const requirement of rule.required) {
    if (!requirement.pattern.test(source)) {
      violations.push(`${rule.file}: missing guardrail anchor: ${requirement.message}`);
    }
  }

  // Forbidden patterns: none may match anywhere in the file.
  for (const forbidden of rule.forbidden) {
    if (forbidden.pattern.test(source)) {
      violations.push(`${rule.file}: forbidden pattern matched: ${forbidden.message}`);
    }
  }
}

if (violations.length > 0) {
  console.error("Architecture guardrail check failed:");
  for (const violation of violations) {
    console.error(`- ${violation}`);
  }
  // Non-zero exit fails the CI job.
  process.exit(1);
}

console.log("Architecture guardrails passed.");
+2 -1
View File
@@ -1,8 +1,9 @@
# Runtime settings consumed by the app and by docker-compose.cicd.yml on the target host.
# Runtime settings consumed by the app and by docker-compose.prod.yml on the target host.
POSTGRES_PASSWORD=replace-with-a-long-random-password
NEXTAUTH_URL=https://capakraken.example.com
NEXTAUTH_SECRET=replace-with-a-long-random-secret
RATE_LIMIT_BACKEND=redis
# Optional but commonly needed application settings.
SENTRY_DSN=
+15 -10
View File
@@ -1,11 +1,12 @@
# Deploy Tooling
This directory contains the additive deployment scaffold for the image-based CI/CD target path.
This directory contains the canonical host-side tooling for the image-based staging and production path.
## Files
- `deploy-compose.sh`: pulls images, runs migrations, starts the app, and waits for readiness
- `deploy-compose.sh`: validates compose input, pulls images, runs migrations, starts the app, and waits for readiness
- `.env.production.example`: example host-side runtime configuration
- `deploy.env.example`: example short-lived deployment manifest written by GitHub Actions
## Host Layout
@@ -13,7 +14,7 @@ On the target host, the deploy directory should contain:
```text
<deploy-path>/
docker-compose.cicd.yml
docker-compose.prod.yml
deploy.env
.env.production
tooling/deploy/deploy-compose.sh
@@ -25,16 +26,20 @@ On the target host, the deploy directory should contain:
1. Copy `tooling/deploy/.env.production.example` to the target host as `.env.production`.
2. Fill in the required secrets and URLs.
3. Provision runtime AI/SMTP/anonymization secrets on the host through `.env.production` or the platform's secret facility.
4. Keep admin settings for status/verification only; do not use them to enter or rotate operational secrets.
5. After migration, use the admin cleanup action to remove any legacy database-stored runtime secrets.
6. Ensure Docker Engine and Docker Compose v2 are installed.
7. Ensure the target host can pull from `ghcr.io`.
8. Run the image release workflow, then the staging or production deploy workflow with the same image tag.
3. Keep `RATE_LIMIT_BACKEND=redis` so production uses the shared counter path intentionally.
4. Copy `tooling/deploy/deploy.env.example` to the host only if you want to dry-run the deploy script manually.
5. Replace the placeholder images in `deploy.env.example` with a real `sha-<commit>` tag and save it as `deploy.env` for a manual dry run.
6. Provision runtime AI/SMTP/anonymization secrets on the host through `.env.production` or the platform's secret facility.
7. Keep admin settings for status/verification only; do not use them to enter or rotate operational secrets.
8. After migration, use the admin cleanup action to remove any legacy database-stored runtime secrets.
9. Ensure Docker Engine and Docker Compose v2 are installed.
10. Ensure the target host can pull from `ghcr.io`.
11. A normal release no longer needs a Git checkout on the host. The host only needs the deploy bundle plus the two env files.
12. Merge to `main`, let `release-image.yml` publish the immutable images, then run the staging or production deploy workflow with the same image tag.
## Manual Host Test
After the files are present on the host, the flow can be tested manually:
After the files are present on the host, the canonical flow can be tested manually:
```bash
set -a
+2 -1
View File
@@ -2,7 +2,7 @@
set -euo pipefail
DEPLOY_ENV="${1:-unknown}"
COMPOSE_FILE="${COMPOSE_FILE:-docker-compose.cicd.yml}"
COMPOSE_FILE="${COMPOSE_FILE:-docker-compose.prod.yml}"
APP_ENV_FILE="${APP_ENV_FILE:-.env.production}"
DEPLOY_ENV_FILE="${DEPLOY_ENV_FILE:-deploy.env}"
READY_URL="${READY_URL:-http://127.0.0.1:${APP_HOST_PORT:-3000}/api/ready}"
@@ -36,6 +36,7 @@ if [ -n "${GHCR_USERNAME:-}" ] && [ -n "${GHCR_TOKEN:-}" ]; then
printf '%s\n' "${GHCR_TOKEN}" | docker login ghcr.io -u "${GHCR_USERNAME}" --password-stdin
fi
docker compose -f "${COMPOSE_FILE}" config -q
docker compose -f "${COMPOSE_FILE}" pull app migrator
docker compose -f "${COMPOSE_FILE}" up -d postgres redis
docker compose -f "${COMPOSE_FILE}" run --rm migrator
+5
View File
@@ -0,0 +1,5 @@
APP_IMAGE=ghcr.io/example/capakraken-app:sha-abc123
MIGRATOR_IMAGE=ghcr.io/example/capakraken-migrator:sha-abc123
APP_HOST_PORT=3000
GHCR_USERNAME=
GHCR_TOKEN=