refactor(ops): standardize image-based production delivery
This commit is contained in:
@@ -15,6 +15,27 @@ env:
|
||||
PNPM_VERSION: "9.14.2"
|
||||
|
||||
jobs:
|
||||
guardrails:
|
||||
name: Architecture Guardrails
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: pnpm/action-setup@v4
|
||||
with:
|
||||
version: ${{ env.PNPM_VERSION }}
|
||||
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: ${{ env.NODE_VERSION }}
|
||||
cache: pnpm
|
||||
|
||||
- name: Install dependencies
|
||||
run: pnpm install --frozen-lockfile
|
||||
|
||||
- name: Check architecture guardrails
|
||||
run: pnpm check:architecture
|
||||
|
||||
# ──────────────────────────────────────────────
|
||||
# Typecheck — ~40s, no services needed
|
||||
# ──────────────────────────────────────────────
|
||||
@@ -147,7 +168,7 @@ jobs:
|
||||
# ──────────────────────────────────────────────
|
||||
build:
|
||||
name: Build
|
||||
needs: [typecheck]
|
||||
needs: [guardrails, typecheck]
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
DATABASE_URL: postgresql://placeholder:placeholder@localhost:5432/placeholder
|
||||
|
||||
@@ -44,7 +44,7 @@ jobs:
|
||||
ssh-keyscan -p "${SSH_PORT:-22}" -H "${SSH_HOST}" >> ~/.ssh/known_hosts
|
||||
|
||||
- name: Bundle deploy assets
|
||||
run: tar czf deploy-bundle.tgz docker-compose.cicd.yml tooling/deploy
|
||||
run: tar czf deploy-bundle.tgz docker-compose.prod.yml tooling/deploy
|
||||
|
||||
- name: Copy deploy assets to production
|
||||
env:
|
||||
|
||||
@@ -44,7 +44,7 @@ jobs:
|
||||
ssh-keyscan -p "${SSH_PORT:-22}" -H "${SSH_HOST}" >> ~/.ssh/known_hosts
|
||||
|
||||
- name: Bundle deploy assets
|
||||
run: tar czf deploy-bundle.tgz docker-compose.cicd.yml tooling/deploy
|
||||
run: tar czf deploy-bundle.tgz docker-compose.prod.yml tooling/deploy
|
||||
|
||||
- name: Copy deploy assets to staging
|
||||
env:
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
name: Release Image
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: [main]
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
image_tag:
|
||||
@@ -61,3 +63,12 @@ jobs:
|
||||
tags: ${{ steps.vars.outputs.migrator_image }}
|
||||
cache-from: type=gha,scope=migrator-image
|
||||
cache-to: type=gha,mode=max,scope=migrator-image
|
||||
|
||||
- name: Publish release summary
|
||||
run: |
|
||||
{
|
||||
echo "## Image release"
|
||||
echo
|
||||
echo "- App image: \`${{ steps.vars.outputs.app_image }}\`"
|
||||
echo "- Migrator image: \`${{ steps.vars.outputs.migrator_image }}\`"
|
||||
} >> "$GITHUB_STEP_SUMMARY"
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
name: capakraken-cicd
|
||||
|
||||
services:
|
||||
postgres:
|
||||
image: postgres:16-alpine
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "${POSTGRES_PORT:-5432}:5432"
|
||||
environment:
|
||||
POSTGRES_DB: capakraken
|
||||
POSTGRES_USER: capakraken
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}
|
||||
volumes:
|
||||
- capakraken_prod_pgdata:/var/lib/postgresql/data
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U capakraken -d capakraken"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
start_period: 10s
|
||||
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "${REDIS_PORT:-6379}:6379"
|
||||
command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru
|
||||
volumes:
|
||||
- capakraken_prod_redis:/data
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
start_period: 5s
|
||||
|
||||
migrator:
|
||||
image: ${MIGRATOR_IMAGE:?set MIGRATOR_IMAGE}
|
||||
restart: "no"
|
||||
env_file:
|
||||
- .env.production
|
||||
environment:
|
||||
DATABASE_URL: postgresql://capakraken:${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}@postgres:5432/capakraken
|
||||
REDIS_URL: redis://redis:6379
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
|
||||
app:
|
||||
image: ${APP_IMAGE:?set APP_IMAGE}
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "${APP_HOST_PORT:-3000}:3000"
|
||||
env_file:
|
||||
- .env.production
|
||||
environment:
|
||||
DATABASE_URL: postgresql://capakraken:${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}@postgres:5432/capakraken
|
||||
REDIS_URL: redis://redis:6379
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:3000/api/ready"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
|
||||
volumes:
|
||||
capakraken_prod_pgdata:
|
||||
name: capakraken_prod_pgdata
|
||||
capakraken_prod_redis:
|
||||
name: capakraken_prod_redis
|
||||
+29
-13
@@ -5,11 +5,11 @@ services:
|
||||
image: postgres:16-alpine
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "5432:5432"
|
||||
- "${POSTGRES_PORT:-5432}:5432"
|
||||
environment:
|
||||
POSTGRES_DB: capakraken
|
||||
POSTGRES_USER: capakraken
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-changeme}
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}
|
||||
command: >
|
||||
postgres
|
||||
-c log_connections=on
|
||||
@@ -31,7 +31,7 @@ services:
|
||||
image: redis:7-alpine
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "6379:6379"
|
||||
- "${REDIS_PORT:-6379}:6379"
|
||||
command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru
|
||||
volumes:
|
||||
- capakraken_prod_redis:/data
|
||||
@@ -42,29 +42,45 @@ services:
|
||||
retries: 5
|
||||
start_period: 5s
|
||||
|
||||
app:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile.prod
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "3000:3000"
|
||||
migrator:
|
||||
image: ${MIGRATOR_IMAGE:?set MIGRATOR_IMAGE}
|
||||
pull_policy: always
|
||||
restart: "no"
|
||||
env_file:
|
||||
- .env.production
|
||||
environment:
|
||||
DATABASE_URL: postgresql://capakraken:${POSTGRES_PASSWORD:-changeme}@postgres:5432/capakraken
|
||||
DATABASE_URL: postgresql://capakraken:${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}@postgres:5432/capakraken
|
||||
REDIS_URL: redis://redis:6379
|
||||
RATE_LIMIT_BACKEND: ${RATE_LIMIT_BACKEND:-redis}
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
|
||||
app:
|
||||
image: ${APP_IMAGE:?set APP_IMAGE}
|
||||
pull_policy: always
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- "${APP_HOST_PORT:-3000}:3000"
|
||||
env_file:
|
||||
- .env.production
|
||||
environment:
|
||||
DATABASE_URL: postgresql://capakraken:${POSTGRES_PASSWORD:?set POSTGRES_PASSWORD}@postgres:5432/capakraken
|
||||
REDIS_URL: redis://redis:6379
|
||||
RATE_LIMIT_BACKEND: ${RATE_LIMIT_BACKEND:-redis}
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:3000/api/health"]
|
||||
test: ["CMD", "curl", "-f", "http://localhost:3000/api/ready"]
|
||||
interval: 30s
|
||||
timeout: 5s
|
||||
retries: 3
|
||||
start_period: 20s
|
||||
start_period: 30s
|
||||
|
||||
volumes:
|
||||
capakraken_prod_pgdata:
|
||||
|
||||
+1
-1
@@ -8,7 +8,7 @@
|
||||
| Topic | File | Use |
|
||||
|---|---|---|
|
||||
| AI excellence due diligence | [ai-excellence-due-diligence-roadmap.md](/home/hartmut/Documents/Copilot/capakraken/docs/ai-excellence-due-diligence-roadmap.md) | Frank quality assessment and cleanup roadmap toward a showcase AI-built project |
|
||||
| Target CI/CD architecture | [cicd-target-architecture.md](/home/hartmut/Documents/Copilot/capakraken/docs/cicd-target-architecture.md) | Proposed image-based build, deploy, and rollback flow |
|
||||
| Target CI/CD architecture | [cicd-target-architecture.md](/home/hartmut/Documents/Copilot/capakraken/docs/cicd-target-architecture.md) | Canonical image-based build, deploy, and rollback flow |
|
||||
| Active roadmap and open gaps | [product-roadmap.md](/home/hartmut/Documents/Copilot/capakraken/docs/product-roadmap.md) | Primary backlog and current delivery order |
|
||||
| Estimating system design | [estimating-extension-design.md](/home/hartmut/Documents/Copilot/capakraken/docs/estimating-extension-design.md) | Workbook analysis, field mapping, and implementation plan |
|
||||
| Dispo import implementation | [dispo-import-implementation.md](/home/hartmut/Documents/Copilot/capakraken/docs/dispo-import-implementation.md) | Clean-slate Dispo v2 import design, mapping rules, staging flow, and commit policy |
|
||||
|
||||
@@ -66,9 +66,9 @@ The previously critical SSE and browser parser coverage issues were addressed du
|
||||
Evidence: the current performance review identifies repeated in-memory filtering, broad invalidation, and heavyweight timeline/report derivations in [performance-optimization-review-2026-03-18.md](/home/hartmut/Documents/Copilot/capakraken/docs/performance-optimization-review-2026-03-18.md).
|
||||
Risk: user experience and infrastructure cost will degrade as data volume grows.
|
||||
|
||||
3. Production delivery is still in transition.
|
||||
Evidence: the current repo now has a target CI/CD path, but the old manual production path still coexists with the new image-based deploy model in [cicd-target-architecture.md](/home/hartmut/Documents/Copilot/capakraken/docs/cicd-target-architecture.md).
|
||||
Risk: the operational source of truth is not yet singular.
|
||||
3. Rollback and incident drills still need to be exercised, even though the deployment path is now standardized.
|
||||
Evidence: the canonical production path now runs through [release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml), [deploy-staging.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-staging.yml), [deploy-prod.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-prod.yml), and the single host compose file [docker-compose.prod.yml](/home/hartmut/Documents/Copilot/capakraken/docker-compose.prod.yml).
|
||||
Risk: even a standardized delivery path needs operator rehearsal before rollback and incident response become routine under pressure.
|
||||
|
||||
## Overall Rating
|
||||
|
||||
@@ -92,7 +92,7 @@ The architecture is promising, but file size, router density, and compatibility
|
||||
|
||||
### Operational Maturity
|
||||
|
||||
`7/10`
|
||||
`7.5/10`
|
||||
|
||||
Good CI and a standardized image-based deploy path are in place, but rollback and incident drills still need to be exercised.
|
||||
|
||||
@@ -191,7 +191,6 @@ Target window: 1 to 2 weeks
|
||||
|
||||
Goals:
|
||||
|
||||
- complete the move to image-based deploys as the canonical path
|
||||
- document staging and production bootstrap as code, not tribal knowledge
|
||||
- ensure staging and production run the Redis-backed rate-limit path intentionally and monitor fallback usage
|
||||
- define rollback drills and incident response playbooks
|
||||
|
||||
@@ -48,6 +48,7 @@
|
||||
- the country listing and country detail assistant helpers now live in their own domain module, keeping the remaining geo/readmodel lookups out of the monolithic assistant router without changing the assistant contract
|
||||
- the remaining vacation workflow and entitlement assistant helpers now live in their own domain module, leaving `packages/api/src/router/assistant-tools.ts` as an aggregator/composition layer instead of the last mixed monolithic execution block
|
||||
- API and auth rate limiting now prefer shared Redis-backed counters when `REDIS_URL` is configured, while retaining an in-memory fallback for local/degraded operation with focused regression coverage
|
||||
- production delivery is now consolidated on a single image-based compose path with automatic image publication on `main`, deploy-time readiness gating, and architecture guardrails that prevent host-side app builds from creeping back in
|
||||
|
||||
## Next Up
|
||||
|
||||
@@ -62,8 +63,7 @@ The remaining work is now structural rather than another quick batch:
|
||||
|
||||
1. secrets and runtime configuration policy
|
||||
2. oversized router decomposition
|
||||
3. canonical image-based production delivery
|
||||
4. performance hotspot reduction
|
||||
3. performance hotspot reduction
|
||||
|
||||
## Working Rule
|
||||
|
||||
|
||||
+132
-272
@@ -2,333 +2,193 @@
|
||||
|
||||
## Overview
|
||||
|
||||
CapaKraken uses GitHub Actions for continuous integration and Docker for deployment. This document covers the full pipeline from code push to production.
|
||||
This is the operational runbook for the canonical CapaKraken delivery path:
|
||||
|
||||
---
|
||||
1. CI validates every PR.
|
||||
2. Every push to `main` publishes immutable release images.
|
||||
3. Staging deploys one `sha-<commit>` tag.
|
||||
4. Production promotes the same tag.
|
||||
5. The host never builds application code from Git.
|
||||
|
||||
## 1. CI Pipeline (Automatic on every PR)
|
||||
## 1. CI Gate
|
||||
|
||||
### What triggers it
|
||||
The merge gate is [ci.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/ci.yml).
|
||||
|
||||
| Event | Trigger |
|
||||
|-------|---------|
|
||||
| Pull request to `main` | All CI jobs run |
|
||||
| Push to `main` | All CI jobs run |
|
||||
It covers:
|
||||
|
||||
### Jobs and their purpose
|
||||
- architecture guardrails
|
||||
- typecheck
|
||||
- lint
|
||||
- unit tests
|
||||
- build
|
||||
- E2E
|
||||
|
||||
```
|
||||
PR opened / pushed
|
||||
│
|
||||
├──→ typecheck (tsc --noEmit, ~40s)
|
||||
├──→ lint (ESLint via Turborepo, ~20s)
|
||||
├──→ test (Vitest unit tests, ~60s, needs PostgreSQL + Redis)
|
||||
│
|
||||
└──→ build (next build, ~90s, runs after typecheck)
|
||||
│
|
||||
└──→ e2e (Playwright, ~3-5min, runs after build)
|
||||
```
|
||||
Before merging, all required checks must pass.
|
||||
|
||||
**typecheck, lint, and test run in parallel** for speed. Build waits for typecheck. E2E waits for build.
|
||||
|
||||
### What each job checks
|
||||
|
||||
| Job | Command | What it catches |
|
||||
|-----|---------|----------------|
|
||||
| **typecheck** | `pnpm --filter @capakraken/web exec tsc --noEmit` | Type errors across the full web app |
|
||||
| **lint** | `pnpm lint` | Code style violations, unused imports, etc. |
|
||||
| **test** | `pnpm test:unit` | Unit test failures in engine, staffing, API, shared |
|
||||
| **build** | `pnpm --filter @capakraken/web exec next build` | SSR errors, dynamic import issues, bundle problems |
|
||||
| **e2e** | `pnpm test:e2e` | End-to-end user flow regressions |
|
||||
|
||||
### Required status checks
|
||||
|
||||
Before merging a PR, **all 5 jobs must pass**. Configure this in GitHub Settings > Branches > Branch protection rules > Require status checks.
|
||||
|
||||
### Caching
|
||||
|
||||
The pipeline caches these artifacts to speed up subsequent runs:
|
||||
|
||||
| Cache | Key | Saves |
|
||||
|-------|-----|-------|
|
||||
| pnpm store | `pnpm-lock.yaml` hash | ~30s install time |
|
||||
| Turborepo | `.turbo` directory | ~60s on unchanged packages |
|
||||
| Playwright browsers | Playwright version | ~45s browser download |
|
||||
|
||||
---
|
||||
|
||||
## 2. Local Development Quality Gates
|
||||
|
||||
Run these before pushing to catch issues early:
|
||||
Useful local commands:
|
||||
|
||||
```bash
|
||||
# Quick check (< 2 min)
|
||||
pnpm --filter @capakraken/web exec tsc --noEmit && pnpm lint
|
||||
|
||||
# Full check (< 3 min)
|
||||
pnpm --filter @capakraken/web exec tsc --project tsconfig.typecheck.json --noEmit
|
||||
pnpm lint
|
||||
pnpm test:unit
|
||||
|
||||
# Full check including build (< 5 min)
|
||||
pnpm --filter @capakraken/web exec next build
|
||||
```
|
||||
|
||||
### Pre-commit hook (optional)
|
||||
## 2. Image Release
|
||||
|
||||
You can add a Git pre-commit hook to run the quick check automatically:
|
||||
[release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml) runs automatically on every push to `main`.
|
||||
|
||||
```bash
|
||||
# .husky/pre-commit
|
||||
pnpm --filter @capakraken/web exec tsc --noEmit
|
||||
pnpm lint
|
||||
It publishes:
|
||||
|
||||
- `ghcr.io/<owner>/<repo>-app:sha-<commit>`
|
||||
- `ghcr.io/<owner>/<repo>-migrator:sha-<commit>`
|
||||
|
||||
The workflow can also be started manually (`workflow_dispatch`) if a rebuild or tag override is needed.
|
||||
|
||||
## 3. Host Bootstrap
|
||||
|
||||
Each deploy target should have a dedicated directory such as `/opt/capakraken` containing:
|
||||
|
||||
```text
|
||||
docker-compose.prod.yml
|
||||
.env.production
|
||||
deploy.env
|
||||
tooling/deploy/deploy-compose.sh
|
||||
```
|
||||
|
||||
---
|
||||
Use these examples from the repo:
|
||||
|
||||
## 3. Health Check Endpoints
|
||||
- [tooling/deploy/.env.production.example](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/.env.production.example)
|
||||
- [tooling/deploy/deploy.env.example](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy.env.example)
|
||||
|
||||
Two endpoints are available for monitoring:
|
||||
Important host-side rules:
|
||||
|
||||
### GET `/api/health` — Liveness Probe
|
||||
- keep `RATE_LIMIT_BACKEND=redis`
|
||||
- keep runtime secrets in `.env.production` or the platform secret layer
|
||||
- do not rotate runtime secrets through admin settings
|
||||
- ensure the host can pull from `ghcr.io`
|
||||
|
||||
Returns 200 if the Node.js process is running. No external dependencies checked.
|
||||
|
||||
```json
|
||||
{ "status": "ok", "timestamp": "2026-03-19T10:00:00.000Z" }
|
||||
```
|
||||
|
||||
**Use for:** Kubernetes/Docker liveness probe, uptime monitoring.
|
||||
|
||||
### GET `/api/ready` — Readiness Probe
|
||||
|
||||
Checks PostgreSQL and Redis connectivity. Returns 200 if all services are reachable, 503 if not.
|
||||
|
||||
```json
|
||||
// Healthy
|
||||
{ "status": "ready", "postgres": "ok", "redis": "ok" }
|
||||
|
||||
// Unhealthy
|
||||
{ "status": "not_ready", "postgres": "ok", "redis": "error" }
|
||||
```
|
||||
|
||||
**Use for:** Kubernetes/Docker readiness probe, load balancer health checks, nginx upstream checks.
|
||||
|
||||
---
|
||||
|
||||
## 4. Production Docker Build
|
||||
|
||||
### Building the production image
|
||||
|
||||
```bash
|
||||
# Build the image
|
||||
docker build -f Dockerfile.prod -t capakraken:latest .
|
||||
|
||||
# Test it locally
|
||||
docker compose -f docker-compose.prod.yml up -d
|
||||
```
|
||||
|
||||
### Image details
|
||||
|
||||
| Property | Value |
|
||||
|----------|-------|
|
||||
| Base | `node:20-bookworm-slim` |
|
||||
| Size | ~150-200 MB (vs ~1.5 GB dev image) |
|
||||
| Output | Next.js standalone mode |
|
||||
| Healthcheck | `curl -f http://localhost:3000/api/health` |
|
||||
| Port | 3000 (internal), mapped to 3100 externally |
|
||||
|
||||
### Environment variables
|
||||
|
||||
The production image requires these environment variables:
|
||||
|
||||
```env
|
||||
# Required
|
||||
DATABASE_URL=postgresql://user:pass@host:5432/capakraken
|
||||
REDIS_URL=redis://host:6379
|
||||
NEXTAUTH_URL=https://capakraken.your-domain.com
|
||||
NEXTAUTH_SECRET=<random-32-char-string>
|
||||
|
||||
# Optional
|
||||
SENTRY_DSN=https://xxx@sentry.io/xxx
|
||||
SMTP_HOST=smtp.example.com
|
||||
SMTP_PORT=587
|
||||
SMTP_USER=notifications@example.com
|
||||
SMTP_PASSWORD=<password>
|
||||
SMTP_FROM=CapaKraken <notifications@example.com>
|
||||
OPENAI_API_KEY=<optional-if-openai-used>
|
||||
AZURE_OPENAI_API_KEY=<optional-if-azure-chat-used>
|
||||
AZURE_DALLE_API_KEY=<optional-if-azure-image-gen-used>
|
||||
GEMINI_API_KEY=<optional-if-gemini-used>
|
||||
ANONYMIZATION_SEED=<required-if-deterministic-anonymization-enabled>
|
||||
```
|
||||
|
||||
Generate a secure `NEXTAUTH_SECRET`:
|
||||
Generate a secure `NEXTAUTH_SECRET` with:
|
||||
|
||||
```bash
|
||||
openssl rand -base64 32
|
||||
```
|
||||
|
||||
Runtime secret policy:
|
||||
## 4. Staging Deployment
|
||||
|
||||
- production secrets are injected through the deployment environment or host secret store
|
||||
- admin settings must not be used to enter or rotate AI, SMTP, or anonymization secrets
|
||||
- the admin UI is only for status checks and cleanup of legacy database-stored secret values
|
||||
Standard path:
|
||||
|
||||
---
|
||||
1. merge to `main`
|
||||
2. wait for [release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml) to publish `sha-<commit>`
|
||||
3. run [deploy-staging.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-staging.yml) with that tag
|
||||
|
||||
## 5. Deployment
|
||||
The workflow uploads:
|
||||
|
||||
### docker-compose (simplest)
|
||||
- [docker-compose.prod.yml](/home/hartmut/Documents/Copilot/capakraken/docker-compose.prod.yml)
|
||||
- [tooling/deploy](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/README.md)
|
||||
- a short-lived `deploy.env`
|
||||
|
||||
On the host, [deploy-compose.sh](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy-compose.sh):
|
||||
|
||||
1. validates the rendered compose file
|
||||
2. pulls `APP_IMAGE` and `MIGRATOR_IMAGE`
|
||||
3. starts PostgreSQL and Redis
|
||||
4. runs Prisma migrations with the `migrator` image
|
||||
5. starts the app
|
||||
6. waits for `GET /api/ready`
|
||||
|
||||
## 5. Production Promotion
|
||||
|
||||
After staging is accepted:
|
||||
|
||||
1. run [deploy-prod.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-prod.yml)
|
||||
2. use the exact same `sha-<commit>` tag
|
||||
3. verify `GET /api/ready`
|
||||
|
||||
Production must promote the already-tested image, not rebuild from source.
|
||||
|
||||
## 6. Manual Host Dry Run
|
||||
|
||||
If you need to verify the host outside GitHub Actions:
|
||||
|
||||
```bash
|
||||
# On your server, after updating the host-side env/secret source
|
||||
git pull
|
||||
docker compose -f docker-compose.prod.yml up -d --build
|
||||
cp tooling/deploy/.env.production.example .env.production
|
||||
cp tooling/deploy/deploy.env.example deploy.env
|
||||
# fill in real secrets and image refs first
|
||||
|
||||
# Run database migrations
|
||||
docker compose -f docker-compose.prod.yml exec app \
|
||||
pnpm --filter @capakraken/db db:migrate:deploy
|
||||
|
||||
# Seed initial data (first deployment only)
|
||||
docker compose -f docker-compose.prod.yml exec app \
|
||||
pnpm db:seed
|
||||
set -a
|
||||
. ./deploy.env
|
||||
set +a
|
||||
bash tooling/deploy/deploy-compose.sh staging
|
||||
```
|
||||
|
||||
### Manual deployment (current setup)
|
||||
## 7. Health Endpoints
|
||||
|
||||
Since `capakraken.hartmut-noerenberg.com` runs behind nginx:
|
||||
### GET `/api/health`
|
||||
|
||||
Process liveness only. Use it for coarse uptime checks.
|
||||
|
||||
### GET `/api/ready`
|
||||
|
||||
Checks PostgreSQL and Redis connectivity. Use it for deploy readiness and traffic admission.
|
||||
|
||||
For deploys, `/api/ready` is the source of truth.
|
||||
|
||||
## 8. Rollback
|
||||
|
||||
Rollback is image-based:
|
||||
|
||||
1. choose the previous healthy `sha-<commit>`
|
||||
2. rerun the staging or production deploy workflow with that tag
|
||||
3. confirm `GET /api/ready`
|
||||
|
||||
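Schema changes still need expand-and-contract discipline for rollback safety: a rolled-back image only works if the already-migrated schema remains compatible with the previous application version.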
|
||||
|
||||
## 9. Troubleshooting
|
||||
|
||||
### CI failure
|
||||
|
||||
Run the failing command locally:
|
||||
|
||||
```bash
|
||||
# On the server
|
||||
cd /home/hartmut/Documents/Copilot/capakraken
|
||||
git pull origin main
|
||||
pnpm install
|
||||
pnpm db:generate
|
||||
pnpm db:validate
|
||||
pnpm --filter @capakraken/db db:migrate:deploy
|
||||
pnpm --filter @capakraken/web exec next build
|
||||
rm -rf apps/web/.next/cache # clear stale cache
|
||||
|
||||
# Restart the app (systemd, pm2, or manual)
|
||||
fuser -k 3100/tcp 2>/dev/null
|
||||
PORT=3100 pnpm --filter @capakraken/web start &
|
||||
```
|
||||
|
||||
Use the repo-level `pnpm db:*` commands for Prisma/database operations. They load `.env`, `.env.local`, `.env.$NODE_ENV`, and `.env.$NODE_ENV.local` automatically before invoking Prisma.
|
||||
|
||||
If you rotate runtime secrets during a manual deploy, update the host-side environment source first, then restart the app so the new process reads the updated values. Do not patch those values through admin settings.
|
||||
|
||||
### nginx configuration
|
||||
|
||||
The existing nginx reverse proxy should forward to port 3100:
|
||||
|
||||
```nginx
|
||||
server {
|
||||
server_name capakraken.hartmut-noerenberg.com;
|
||||
|
||||
location / {
|
||||
proxy_pass http://127.0.0.1:3100;
|
||||
proxy_http_version 1.1;
|
||||
proxy_set_header Upgrade $http_upgrade;
|
||||
proxy_set_header Connection "upgrade";
|
||||
proxy_set_header Host $host;
|
||||
proxy_set_header X-Real-IP $remote_addr;
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
|
||||
# SSE support (keep connection open)
|
||||
proxy_read_timeout 86400s;
|
||||
proxy_buffering off;
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 6. Monitoring Setup
|
||||
|
||||
### Sentry (error tracking)
|
||||
|
||||
After creating a Sentry project, add the DSN to `.env.production`:
|
||||
|
||||
```env
|
||||
SENTRY_DSN=https://xxx@sentry.io/xxx
|
||||
```
|
||||
|
||||
Errors are automatically captured by the Sentry integration in Next.js.
|
||||
|
||||
### Uptime monitoring
|
||||
|
||||
Point an external monitor (UptimeRobot, Better Stack, etc.) at:
|
||||
|
||||
```
|
||||
https://capakraken.hartmut-noerenberg.com/api/health
|
||||
```
|
||||
|
||||
Alert if status code != 200 for more than 2 consecutive checks.
|
||||
|
||||
---
|
||||
|
||||
## 7. Troubleshooting
|
||||
|
||||
### CI job fails: "tsc --noEmit"
|
||||
|
||||
TypeScript error in the web app. Run locally:
|
||||
```bash
|
||||
pnpm --filter @capakraken/web exec tsc --noEmit
|
||||
```
|
||||
|
||||
### CI job fails: "test:unit"
|
||||
|
||||
Unit test failure. Run locally:
|
||||
```bash
|
||||
pnpm --filter @capakraken/web exec tsc --project tsconfig.typecheck.json --noEmit
|
||||
pnpm lint
|
||||
pnpm test:unit
|
||||
```
|
||||
|
||||
### CI job fails: "next build"
|
||||
|
||||
Build error (often `ssr: false` in Server Components, missing exports). Run locally:
|
||||
```bash
|
||||
pnpm --filter @capakraken/web exec next build
|
||||
```
|
||||
|
||||
### CI job fails: "e2e"
|
||||
### Deploy fails before container start
|
||||
|
||||
Playwright test failure. Check the HTML report artifact in the GitHub Actions run.
|
||||
Check the rendered compose configuration on the host:
|
||||
|
||||
### Production: 502 Bad Gateway
|
||||
|
||||
The Next.js process isn't running. Check:
|
||||
```bash
|
||||
ss -tlnp | grep 3100 # Is anything listening?
|
||||
tail -50 /tmp/capakraken-dev.log # Check app logs
|
||||
docker compose -f docker-compose.prod.yml config -q
|
||||
```
|
||||
|
||||
Restart:
|
||||
Then verify `.env.production` and `deploy.env`.
|
||||
|
||||
### App never becomes ready
|
||||
|
||||
Check:
|
||||
|
||||
```bash
|
||||
fuser -k 3100/tcp 2>/dev/null
|
||||
pnpm dev & # or pnpm start for production mode
|
||||
docker compose -f docker-compose.prod.yml ps
|
||||
docker compose -f docker-compose.prod.yml logs --tail 200 app
|
||||
curl -s http://127.0.0.1:${APP_HOST_PORT:-3000}/api/ready
|
||||
```
|
||||
|
||||
### Production: 500 Internal Server Error
|
||||
### Database migration failure
|
||||
|
||||
Inspect the migrator logs:
|
||||
|
||||
Usually a stale Prisma client after schema changes:
|
||||
```bash
|
||||
pnpm db:generate
|
||||
pnpm db:validate
|
||||
rm -rf apps/web/.next
|
||||
pnpm --filter @capakraken/web exec next build
|
||||
# Restart the server
|
||||
docker compose -f docker-compose.prod.yml run --rm migrator
|
||||
```
|
||||
|
||||
### Database connection issues
|
||||
### Registry pull failure
|
||||
|
||||
Check the `/api/ready` endpoint:
|
||||
```bash
|
||||
curl -s https://capakraken.hartmut-noerenberg.com/api/ready | jq .
|
||||
```
|
||||
Verify `GHCR_USERNAME` and `GHCR_TOKEN`, then test:
|
||||
|
||||
If `postgres: "error"`, verify:
|
||||
```bash
|
||||
docker ps | grep postgres # Is container running?
|
||||
psql -h localhost -p 5433 -U capakraken -d capakraken # Can you connect?
|
||||
printf '%s\n' "$GHCR_TOKEN" | docker login ghcr.io -u "$GHCR_USERNAME" --password-stdin
|
||||
```
|
||||
|
||||
@@ -2,83 +2,67 @@
|
||||
|
||||
## Goal
|
||||
|
||||
This document captures the intended delivery model for CapaKraken without replacing the currently working manual production setup immediately.
|
||||
This document describes the canonical release path for CapaKraken.
|
||||
|
||||
The target state is:
|
||||
The release model is now:
|
||||
|
||||
1. CI validates every PR.
|
||||
2. GitHub Actions builds immutable Docker images.
|
||||
3. Staging and production pull those exact images from a registry.
|
||||
4. Database migrations run as an explicit deploy step.
|
||||
5. Traffic is considered safe only after the app answers `GET /api/ready`.
|
||||
1. PRs are validated by CI before merge.
|
||||
2. Every push to `main` publishes immutable `app` and `migrator` images.
|
||||
3. Staging and production promote the exact same `sha-<commit>` tag.
|
||||
4. The host deploys only from images and runtime env files.
|
||||
5. A deployment is successful only after `GET /api/ready` passes.
|
||||
|
||||
## Core Idea
|
||||
|
||||
The production host should stop building application code from a Git checkout. Instead, it should only:
|
||||
|
||||
- pull a versioned `app` image
|
||||
- pull a matching `migrator` image
|
||||
- run Prisma deploy migrations
|
||||
- start the application container
|
||||
- wait for readiness
|
||||
|
||||
That removes "works on the server but not in CI" drift and makes rollbacks much simpler.
|
||||
|
||||
## Delivery Flow
|
||||
## Canonical Flow
|
||||
|
||||
### 1. Pull Request Validation
|
||||
|
||||
The existing `CI` workflow continues to validate:
|
||||
The main [ci.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/ci.yml) workflow remains the merge gate for:
|
||||
|
||||
- architecture guardrails for SSE audience scoping
|
||||
- architecture guardrails
|
||||
- typecheck
|
||||
- lint
|
||||
- unit tests
|
||||
- build
|
||||
- E2E
|
||||
|
||||
This remains the quality gate before merge.
|
||||
### 2. Automatic Image Release
|
||||
|
||||
The guardrail step currently enforces three invariants:
|
||||
[release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml) now runs automatically on every push to `main` and can still be started manually for rebuilds or tag overrides.
|
||||
|
||||
- no role-based SSE audience fan-out in [event-bus.ts](/home/hartmut/Documents/Copilot/capakraken/packages/api/src/sse/event-bus.ts)
|
||||
- no role-derived subscription audiences in [subscription-policy.ts](/home/hartmut/Documents/Copilot/capakraken/packages/api/src/sse/subscription-policy.ts)
|
||||
- no client-provided audience parsing in [route.ts](/home/hartmut/Documents/Copilot/capakraken/apps/web/src/app/api/sse/timeline/route.ts)
|
||||
It publishes two images from [Dockerfile.prod](/home/hartmut/Documents/Copilot/capakraken/Dockerfile.prod):
|
||||
|
||||
### 2. Image Build
|
||||
- `ghcr.io/<owner>/<repo>-app:sha-<commit>`
|
||||
- `ghcr.io/<owner>/<repo>-migrator:sha-<commit>`
|
||||
|
||||
The new manual workflow [release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml) builds two images from [Dockerfile.prod](/home/hartmut/Documents/Copilot/capakraken/Dockerfile.prod):
|
||||
### 3. Staging Promotion
|
||||
|
||||
- `runner` target as the production app image
|
||||
- `migrator` target as the Prisma migration image
|
||||
[deploy-staging.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-staging.yml) copies the canonical deploy bundle to the staging host:
|
||||
|
||||
Recommended tag format:
|
||||
- [docker-compose.prod.yml](/home/hartmut/Documents/Copilot/capakraken/docker-compose.prod.yml)
|
||||
- [tooling/deploy/deploy-compose.sh](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy-compose.sh)
|
||||
- the rest of [tooling/deploy](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/README.md)
|
||||
|
||||
- `sha-<git-commit>`
|
||||
GitHub Actions also writes a short-lived `deploy.env` containing `APP_IMAGE`, `MIGRATOR_IMAGE`, and the host port.
|
||||
|
||||
Example:
|
||||
### 4. Host-Side Deployment
|
||||
|
||||
```text
|
||||
ghcr.io/<owner>/capakraken-app:sha-abc123
|
||||
ghcr.io/<owner>/capakraken-migrator:sha-abc123
|
||||
```
|
||||
On the target host, [deploy-compose.sh](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy-compose.sh):
|
||||
|
||||
### 3. Staging Deploy
|
||||
1. loads `.env.production` and `deploy.env`
|
||||
2. validates the rendered compose file
|
||||
3. pulls the immutable `app` and `migrator` images
|
||||
4. starts PostgreSQL and Redis
|
||||
5. runs Prisma migrations through the dedicated `migrator` image
|
||||
6. starts the new `app` container
|
||||
7. waits for `GET /api/ready`
|
||||
|
||||
The staging workflow [deploy-staging.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-staging.yml) is intended to:
|
||||
The host does not build application code from Git anymore.
|
||||
|
||||
1. connect to the staging host over SSH
|
||||
2. copy the deploy assets
|
||||
3. export `APP_IMAGE` and `MIGRATOR_IMAGE`
|
||||
4. run [deploy-compose.sh](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy-compose.sh)
|
||||
### 5. Production Promotion
|
||||
|
||||
The compose file used for this target flow is [docker-compose.cicd.yml](/home/hartmut/Documents/Copilot/capakraken/docker-compose.cicd.yml).
|
||||
[deploy-prod.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-prod.yml) repeats the exact staging flow with the same image tag after staging acceptance.
|
||||
|
||||
### 4. Production Promotion
|
||||
|
||||
The production workflow [deploy-prod.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-prod.yml) follows the same logic as staging, but the image tag is promoted manually.
|
||||
|
||||
That means production uses an image that was already built and can already have been exercised in staging.
|
||||
That keeps staging and production on the same artifact instead of rebuilding.
|
||||
|
||||
## Required Infrastructure
|
||||
|
||||
@@ -86,139 +70,66 @@ That means production uses an image that was already built and can already have
|
||||
|
||||
- GitHub repository with Actions enabled
|
||||
- GHCR or another container registry
|
||||
- 1 Linux host with Docker and Docker Compose
|
||||
- one Linux host with Docker Engine and Docker Compose v2
|
||||
- PostgreSQL
|
||||
- Redis
|
||||
- reverse proxy such as nginx
|
||||
- SSH access from GitHub Actions to the host
|
||||
- reverse proxy or load balancer in front of the app
|
||||
|
||||
### Recommended
|
||||
|
||||
- separate staging and production hosts
|
||||
- GitHub Environments for `staging` and `production`
|
||||
- required reviewer approval for `production`
|
||||
- backup strategy for PostgreSQL volumes
|
||||
- uptime monitoring and error tracking
|
||||
- required approval for the `production` environment
|
||||
- monitoring on `/api/health` and `/api/ready`
|
||||
- PostgreSQL backup and restore drills
|
||||
|
||||
## Secrets
|
||||
## Runtime Configuration
|
||||
|
||||
### GitHub Environment Secrets
|
||||
|
||||
For `staging`:
|
||||
|
||||
- `STAGING_SSH_HOST`
|
||||
- `STAGING_SSH_PORT`
|
||||
- `STAGING_SSH_USER`
|
||||
- `STAGING_SSH_KEY`
|
||||
- `STAGING_DEPLOY_PATH`
|
||||
- `STAGING_APP_HOST_PORT`
|
||||
- `STAGING_GHCR_USERNAME`
|
||||
- `STAGING_GHCR_TOKEN`
|
||||
|
||||
For `production`:
|
||||
|
||||
- `PROD_SSH_HOST`
|
||||
- `PROD_SSH_PORT`
|
||||
- `PROD_SSH_USER`
|
||||
- `PROD_SSH_KEY`
|
||||
- `PROD_DEPLOY_PATH`
|
||||
- `PROD_APP_HOST_PORT`
|
||||
- `PROD_GHCR_USERNAME`
|
||||
- `PROD_GHCR_TOKEN`
|
||||
|
||||
### Host-side Files
|
||||
|
||||
Each target host should already have:
|
||||
The canonical host-side inputs are:
|
||||
|
||||
- [docker-compose.prod.yml](/home/hartmut/Documents/Copilot/capakraken/docker-compose.prod.yml)
|
||||
- `.env.production`
|
||||
- Docker installed
|
||||
- network access to the container registry
|
||||
- `deploy.env`
|
||||
|
||||
The repository now also contains a small host example at [tooling/deploy/.env.production.example](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/.env.production.example) and an operator note at [tooling/deploy/README.md](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/README.md).
|
||||
`.env.production` holds long-lived runtime configuration and secrets. The example file is [tooling/deploy/.env.production.example](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/.env.production.example).
|
||||
|
||||
### Minimum Host Bootstrap
|
||||
`deploy.env` is short-lived deployment metadata. The example file is [tooling/deploy/deploy.env.example](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy.env.example).
|
||||
|
||||
For each target host, create a dedicated deploy directory such as `/opt/capakraken` and place these files there:
|
||||
Important invariants:
|
||||
|
||||
```text
|
||||
docker-compose.cicd.yml
|
||||
.env.production
|
||||
tooling/deploy/deploy-compose.sh
|
||||
```
|
||||
|
||||
`.env.production` should hold the long-lived runtime settings, including:
|
||||
|
||||
```env
|
||||
POSTGRES_PASSWORD=<long-random-password>
|
||||
NEXTAUTH_URL=https://capakraken.example.com
|
||||
NEXTAUTH_SECRET=<long-random-secret>
|
||||
```
|
||||
|
||||
GitHub Actions only injects the short-lived image references through `deploy.env`. The deploy script then loads both files before calling Docker Compose, so compose interpolation and container runtime env use the same source of truth.
|
||||
|
||||
### Runtime Secret Provisioning Policy
|
||||
|
||||
Production and staging secrets should be provisioned at the host or platform-secret layer, not through admin mutations and not through application database writes.
|
||||
|
||||
That includes at least:
|
||||
|
||||
```env
|
||||
OPENAI_API_KEY=<optional-if-openai-used>
|
||||
AZURE_OPENAI_API_KEY=<optional-if-azure-chat-used>
|
||||
AZURE_DALLE_API_KEY=<optional-if-azure-image-gen-used>
|
||||
GEMINI_API_KEY=<optional-if-gemini-used>
|
||||
SMTP_PASSWORD=<required-if-smtp-auth-used>
|
||||
ANONYMIZATION_SEED=<required-if-deterministic-anonymization-enabled>
|
||||
```
|
||||
|
||||
Operational rule:
|
||||
|
||||
- keep these values in `.env.production` only for smaller self-managed hosts, or preferably in the host's secret manager / encrypted environment facility
|
||||
- do not rotate or patch these values through `SystemSettings`
|
||||
- use the admin settings page only to verify runtime source/status and to clear leftover legacy database copies
|
||||
- after migration, legacy database secret fields should be empty in both staging and production
|
||||
- `RATE_LIMIT_BACKEND=redis` should stay explicit in release environments
|
||||
- runtime AI, SMTP, and anonymization secrets belong to the host or platform secret layer
|
||||
- admin settings are for verification and legacy-secret cleanup, not for secret rotation
|
||||
|
||||
## Database Policy
|
||||
|
||||
For release environments, use:
|
||||
Release environments must run migrations through the `migrator` image, which executes:
|
||||
|
||||
```bash
|
||||
pnpm --filter @capakraken/db db:migrate:deploy
|
||||
```
|
||||
|
||||
Do not use `db:push` as the main production deployment mechanism. `db:push` is convenient for local development, but it does not give the release traceability that a migration-based deploy requires.
|
||||
`db:push` remains a local-development tool, not a production rollout mechanism.
|
||||
|
||||
## Rollback Model
|
||||
|
||||
Rollback should be image-based:
|
||||
Rollback is image-based:
|
||||
|
||||
1. choose the previous good `sha-...` tag
|
||||
2. run the production deploy workflow again with that tag
|
||||
3. confirm readiness
|
||||
1. choose the previous healthy `sha-<commit>` tag
|
||||
2. redeploy staging or production with that tag
|
||||
3. confirm `GET /api/ready`
|
||||
|
||||
This is only safe when schema changes follow backwards-compatible expand and contract rules.
|
||||
This assumes schema changes follow backwards-compatible expand-and-contract rollout rules.
|
||||
|
||||
## How A Production Update Works
|
||||
## Production Update Summary
|
||||
|
||||
The intended production update path is:
|
||||
The standard production update is:
|
||||
|
||||
1. merge to `main` after the existing CI workflow is green
|
||||
2. run [release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml) to build immutable `app` and `migrator` images tagged as `sha-<commit>`
|
||||
3. run [deploy-staging.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-staging.yml) with that exact image tag
|
||||
4. GitHub Actions uploads the deploy bundle to the staging host and writes a temporary `deploy.env`
|
||||
5. [deploy-compose.sh](/home/hartmut/Documents/Copilot/capakraken/tooling/deploy/deploy-compose.sh) pulls images, starts PostgreSQL and Redis, runs Prisma deploy migrations, starts the new app container, and waits for `GET /api/ready`
|
||||
6. after staging is accepted, run [deploy-prod.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-prod.yml) with the same tag
|
||||
7. production repeats the same image-based flow, so the running artifact matches staging
|
||||
1. merge to `main` after CI is green
|
||||
2. let [release-image.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/release-image.yml) publish `sha-<commit>` images
|
||||
3. deploy that tag to staging through [deploy-staging.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-staging.yml)
|
||||
4. validate staging
|
||||
5. promote the same tag through [deploy-prod.yml](/home/hartmut/Documents/Copilot/capakraken/.github/workflows/deploy-prod.yml)
|
||||
|
||||
That means the production host no longer builds from Git. It only receives a versioned image and starts it after migrations complete.
|
||||
|
||||
The same principle applies to secrets: the running container reads them from the deployment environment at start time, so an update only needs a new image tag unless secret material itself is being rotated.
|
||||
|
||||
## Current Status
|
||||
|
||||
The repository now contains the CI/CD scaffolding, but the existing manual production setup remains untouched:
|
||||
|
||||
- current manual compose flow: [docker-compose.prod.yml](/home/hartmut/Documents/Copilot/capakraken/docker-compose.prod.yml)
|
||||
- current manual runbook: [ci-cd-manual.md](/home/hartmut/Documents/Copilot/capakraken/docs/ci-cd-manual.md)
|
||||
|
||||
This allows the team to introduce the new path gradually instead of switching production in one step.
|
||||
The important property is artifact identity: staging and production run the same image, not two separate builds.
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
"test": "turbo run test:unit",
|
||||
"test:unit": "turbo test:unit",
|
||||
"test:e2e": "turbo test:e2e",
|
||||
"check:architecture": "node ./scripts/check-architecture-guardrails.mjs",
|
||||
"db:doctor": "node ./scripts/db-doctor.mjs capakraken",
|
||||
"db:prisma": "node ./scripts/prisma-with-env.mjs",
|
||||
"db:push": "node ./scripts/with-env.mjs pnpm --filter @capakraken/db db:push",
|
||||
|
||||
@@ -0,0 +1,155 @@
|
||||
import { readFile } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import process from "node:process";
|
||||
|
||||
// Architecture guardrail check.
//
// Each rule names one repository file plus regular expressions that must
// ("required") or must not ("forbidden") match that file's text. Every
// mismatch is collected; if any exist they are printed to stderr and the
// process exits with status 1, otherwise a success line is printed.

// Rule paths are resolved against the directory the script is started from.
const rootDir = process.cwd();

/**
 * Guardrail rules.
 *
 * Shape of one rule:
 *   file      - path of the checked file, relative to `rootDir`
 *   required  - `{ pattern, message }` entries whose pattern MUST match
 *   forbidden - `{ pattern, message }` entries whose pattern must NOT match
 *
 * None of the patterns use the `g` or `y` flag, so `RegExp.prototype.test`
 * keeps no state between calls.
 */
const rules = [
  {
    file: "packages/api/src/sse/event-bus.ts",
    required: [],
    forbidden: [
      { pattern: /\bRoleSseAudience\b/, message: "role-based SSE audience types must not reappear" },
      { pattern: /\broleAudience\s*\(/, message: "role-derived SSE audiences must not be emitted" },
      { pattern: /\bBROADCAST_SENT\b/, message: "broadcast SSE event resurrection needs explicit architecture review" },
    ],
  },
  {
    file: "packages/api/src/sse/subscription-policy.ts",
    required: [
      {
        pattern: /\bderiveUserSseSubscription\b/,
        message: "subscription derivation must stay centralized in deriveUserSseSubscription",
      },
    ],
    forbidden: [
      { pattern: /\broleAudience\s*\(/, message: "subscription policy must not derive role audiences" },
    ],
  },
  {
    file: "apps/web/src/app/api/sse/timeline/route.ts",
    required: [
      {
        pattern: /\bderiveUserSseSubscription\s*\(/,
        message: "timeline SSE route must derive audiences server-side from the authenticated user",
      },
    ],
    forbidden: [
      { pattern: /\bsearchParams\b/, message: "timeline SSE route must not accept client-provided audience scoping" },
      { pattern: /\baudience\b/, message: "timeline SSE route must not parse raw audience values from the client" },
    ],
  },
  {
    file: "docker-compose.prod.yml",
    required: [
      {
        pattern: /image:\s+\$\{APP_IMAGE:\?set APP_IMAGE\}/,
        message: "production compose must deploy the immutable app image",
      },
      {
        pattern: /image:\s+\$\{MIGRATOR_IMAGE:\?set MIGRATOR_IMAGE\}/,
        message: "production compose must deploy the immutable migrator image",
      },
      {
        pattern: /http:\/\/localhost:3000\/api\/ready/,
        message: "production compose must gate app health on the readiness endpoint",
      },
      {
        pattern: /RATE_LIMIT_BACKEND:\s+\$\{RATE_LIMIT_BACKEND:-redis\}/,
        message: "production compose must intentionally pin the Redis-backed rate-limit path",
      },
    ],
    forbidden: [
      { pattern: /\bbuild:/, message: "production compose must not build application images on the host" },
    ],
  },
  {
    file: ".github/workflows/release-image.yml",
    required: [
      {
        pattern: /push:\s*\n\s*branches:\s*\[main\]/,
        message: "image releases must build automatically on pushes to main",
      },
      {
        pattern: /workflow_dispatch:/,
        message: "image release must remain manually callable for rebuilds and tag overrides",
      },
      {
        pattern: /target:\s+runner/,
        message: "release workflow must keep publishing the runner image",
      },
      {
        pattern: /target:\s+migrator/,
        message: "release workflow must keep publishing the migrator image",
      },
    ],
    forbidden: [],
  },
  {
    file: ".github/workflows/deploy-staging.yml",
    required: [
      {
        pattern: /docker-compose\.prod\.yml tooling\/deploy/,
        message: "staging deploy must ship the canonical production compose bundle",
      },
    ],
    forbidden: [],
  },
  {
    file: ".github/workflows/deploy-prod.yml",
    required: [
      {
        pattern: /docker-compose\.prod\.yml tooling\/deploy/,
        message: "production deploy must ship the canonical production compose bundle",
      },
    ],
    forbidden: [],
  },
  {
    file: "tooling/deploy/deploy-compose.sh",
    required: [
      {
        pattern: /COMPOSE_FILE="\$\{COMPOSE_FILE:-docker-compose\.prod\.yml\}"/,
        message: "deploy script must default to the canonical production compose file",
      },
      {
        pattern: /READY_URL="\$\{READY_URL:-http:\/\/127\.0\.0\.1:\$\{APP_HOST_PORT:-3000\}\/api\/ready\}"/,
        message: "deploy script must wait on the readiness endpoint",
      },
      {
        pattern: /docker compose -f "\$\{COMPOSE_FILE\}" config -q/,
        message: "deploy script must validate the rendered compose file before pulling images",
      },
    ],
    forbidden: [],
  },
];

/**
 * Evaluate one rule against the text of its file.
 *
 * @param {{ file: string, required: Array<{ pattern: RegExp, message: string }>, forbidden: Array<{ pattern: RegExp, message: string }> }} rule
 *   The guardrail rule to apply.
 * @param {string} source Full text of `rule.file`.
 * @returns {string[]} One message per violated entry, required entries first,
 *   each in declaration order. Empty when the file satisfies the rule.
 */
function collectRuleViolations(rule, source) {
  const found = [];

  for (const requirement of rule.required) {
    if (!requirement.pattern.test(source)) {
      found.push(`${rule.file}: missing guardrail anchor: ${requirement.message}`);
    }
  }

  for (const forbidden of rule.forbidden) {
    if (forbidden.pattern.test(source)) {
      found.push(`${rule.file}: forbidden pattern matched: ${forbidden.message}`);
    }
  }

  return found;
}

const violations = [];

for (const rule of rules) {
  const absolutePath = path.join(rootDir, rule.file);

  let source;
  try {
    source = await readFile(absolutePath, "utf8");
  } catch (error) {
    // A guarded file that is missing or unreadable is itself a violation.
    // Record it and keep going so one broken path does not hide the results
    // for every other rule behind an unhandled-rejection stack trace.
    const reason = error instanceof Error ? error.message : String(error);
    violations.push(`${rule.file}: guarded file could not be read: ${reason}`);
    continue;
  }

  violations.push(...collectRuleViolations(rule, source));
}

if (violations.length > 0) {
  console.error("Architecture guardrail check failed:");
  for (const violation of violations) {
    console.error(`- ${violation}`);
  }
  // Non-zero exit status signals the failure to the calling process.
  process.exit(1);
}

console.log("Architecture guardrails passed.");
|
||||
@@ -1,8 +1,9 @@
|
||||
# Runtime settings consumed by the app and by docker-compose.cicd.yml on the target host.
|
||||
# Runtime settings consumed by the app and by docker-compose.prod.yml on the target host.
|
||||
|
||||
POSTGRES_PASSWORD=replace-with-a-long-random-password
|
||||
NEXTAUTH_URL=https://capakraken.example.com
|
||||
NEXTAUTH_SECRET=replace-with-a-long-random-secret
|
||||
RATE_LIMIT_BACKEND=redis
|
||||
|
||||
# Optional but commonly needed application settings.
|
||||
SENTRY_DSN=
|
||||
|
||||
+15
-10
@@ -1,11 +1,12 @@
|
||||
# Deploy Tooling
|
||||
|
||||
This directory contains the additive deployment scaffold for the image-based CI/CD target path.
|
||||
This directory contains the canonical host-side tooling for the image-based staging and production path.
|
||||
|
||||
## Files
|
||||
|
||||
- `deploy-compose.sh`: pulls images, runs migrations, starts the app, and waits for readiness
|
||||
- `deploy-compose.sh`: validates compose input, pulls images, runs migrations, starts the app, and waits for readiness
|
||||
- `.env.production.example`: example host-side runtime configuration
|
||||
- `deploy.env.example`: example short-lived deployment manifest written by GitHub Actions
|
||||
|
||||
## Host Layout
|
||||
|
||||
@@ -13,7 +14,7 @@ On the target host, the deploy directory should contain:
|
||||
|
||||
```text
|
||||
<deploy-path>/
|
||||
docker-compose.cicd.yml
|
||||
docker-compose.prod.yml
|
||||
deploy.env
|
||||
.env.production
|
||||
tooling/deploy/deploy-compose.sh
|
||||
@@ -25,16 +26,20 @@ On the target host, the deploy directory should contain:
|
||||
|
||||
1. Copy `tooling/deploy/.env.production.example` to the target host as `.env.production`.
|
||||
2. Fill in the required secrets and URLs.
|
||||
3. Provision runtime AI/SMTP/anonymization secrets on the host through `.env.production` or the platform's secret facility.
|
||||
4. Keep admin settings for status/verification only; do not use them to enter or rotate operational secrets.
|
||||
5. After migration, use the admin cleanup action to remove any legacy database-stored runtime secrets.
|
||||
6. Ensure Docker Engine and Docker Compose v2 are installed.
|
||||
7. Ensure the target host can pull from `ghcr.io`.
|
||||
8. Run the image release workflow, then the staging or production deploy workflow with the same image tag.
|
||||
3. Keep `RATE_LIMIT_BACKEND=redis` so production uses the shared counter path intentionally.
|
||||
4. Copy `tooling/deploy/deploy.env.example` to the host only if you want to dry-run the deploy script manually.
|
||||
5. Replace the placeholder images in `deploy.env.example` with a real `sha-<commit>` tag and save it as `deploy.env` for a manual dry run.
|
||||
6. Provision runtime AI/SMTP/anonymization secrets on the host through `.env.production` or the platform's secret facility.
|
||||
7. Keep admin settings for status/verification only; do not use them to enter or rotate operational secrets.
|
||||
8. After migration, use the admin cleanup action to remove any legacy database-stored runtime secrets.
|
||||
9. Ensure Docker Engine and Docker Compose v2 are installed.
|
||||
10. Ensure the target host can pull from `ghcr.io`.
|
||||
11. A normal release no longer needs a Git checkout on the host. The host only needs the deploy bundle plus the two env files.
|
||||
12. Merge to `main`, let `release-image.yml` publish the immutable images, then run the staging or production deploy workflow with the same image tag.
|
||||
|
||||
## Manual Host Test
|
||||
|
||||
After the files are present on the host, the flow can be tested manually:
|
||||
After the files are present on the host, the canonical flow can be tested manually:
|
||||
|
||||
```bash
|
||||
set -a
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
set -euo pipefail
|
||||
|
||||
DEPLOY_ENV="${1:-unknown}"
|
||||
COMPOSE_FILE="${COMPOSE_FILE:-docker-compose.cicd.yml}"
|
||||
COMPOSE_FILE="${COMPOSE_FILE:-docker-compose.prod.yml}"
|
||||
APP_ENV_FILE="${APP_ENV_FILE:-.env.production}"
|
||||
DEPLOY_ENV_FILE="${DEPLOY_ENV_FILE:-deploy.env}"
|
||||
READY_URL="${READY_URL:-http://127.0.0.1:${APP_HOST_PORT:-3000}/api/ready}"
|
||||
@@ -36,6 +36,7 @@ if [ -n "${GHCR_USERNAME:-}" ] && [ -n "${GHCR_TOKEN:-}" ]; then
|
||||
printf '%s\n' "${GHCR_TOKEN}" | docker login ghcr.io -u "${GHCR_USERNAME}" --password-stdin
|
||||
fi
|
||||
|
||||
docker compose -f "${COMPOSE_FILE}" config -q
|
||||
docker compose -f "${COMPOSE_FILE}" pull app migrator
|
||||
docker compose -f "${COMPOSE_FILE}" up -d postgres redis
|
||||
docker compose -f "${COMPOSE_FILE}" run --rm migrator
|
||||
|
||||
@@ -0,0 +1,5 @@
|
||||
APP_IMAGE=ghcr.io/example/capakraken-app:sha-abc123
|
||||
MIGRATOR_IMAGE=ghcr.io/example/capakraken-migrator:sha-abc123
|
||||
APP_HOST_PORT=3000
|
||||
GHCR_USERNAME=
|
||||
GHCR_TOKEN=
|
||||
Reference in New Issue
Block a user