Files
HartOMat/backend/app/services/docker_scaler.py
T
2026-03-05 22:12:38 +01:00

178 lines
6.1 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Scale Flamenco worker containers via the Docker socket.
Uses the Docker Python SDK (docker>=6.1.0) to list, create, start, stop, and remove containers.
Requires /var/run/docker.sock to be mounted into the backend container.
"""
import os
import logging
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Compose project name, read from COMPOSE_PROJECT_NAME with a fallback default.
# Used both in the label filter and in the names of newly created containers.
COMPOSE_PROJECT = os.getenv("COMPOSE_PROJECT_NAME", "schaefflerautomat")
# Compose service name of the worker containers this module manages.
SERVICE_NAME = "flamenco-worker"
def _get_client():
    """Return a Docker SDK client built by ``docker.from_env()``.

    The SDK is imported at call time rather than at module import time
    (presumably so this module can be imported without the SDK installed).
    """
    from docker import from_env

    return from_env()
def get_worker_containers(client=None):
    """List every flamenco-worker container, running or not, ordered by name.

    Containers are selected by their Compose project and service labels.
    When *client* is None a new Docker client is obtained via ``_get_client()``.
    """
    docker_client = _get_client() if client is None else client
    label_filter = [
        f"com.docker.compose.project={COMPOSE_PROJECT}",
        f"com.docker.compose.service={SERVICE_NAME}",
    ]
    # all=True includes containers that are not currently running.
    matches = docker_client.containers.list(
        all=True,
        filters={"label": label_filter},
    )
    return sorted(matches, key=lambda worker: worker.name)
def get_running_worker_count(client=None) -> int:
    """Count the flamenco-worker containers whose status is "running".

    This is a best-effort query: any exception raised while talking to
    Docker is logged as a warning and reported to the caller as ``-1``.
    """
    try:
        docker_client = client if client is not None else _get_client()
        workers = get_worker_containers(docker_client)
        return len([w for w in workers if w.status == "running"])
    except Exception as err:
        log.warning("docker_scaler: could not read worker count: %s", err)
        return -1
def _mounts_from_attrs(attrs):
    """Rebuild ``docker.types.Mount`` objects from a container's inspect data."""
    from docker.types import Mount

    mounts = []
    for m in attrs.get("Mounts") or []:
        mount_type = m.get("Type", "bind")
        # Volume mounts are referenced by "Name"; other mount types by "Source".
        source = m.get("Name", "") if mount_type == "volume" else m.get("Source", "")
        mounts.append(
            Mount(
                target=m["Destination"],
                source=source,
                type=mount_type,
                read_only=not m.get("RW", True),
            )
        )
    return mounts


def _device_requests_from_attrs(attrs):
    """Rebuild device requests (e.g. nvidia GPUs) from inspect data, or None."""
    from docker.types import DeviceRequest

    raw_requests = (attrs.get("HostConfig") or {}).get("DeviceRequests") or []
    if not raw_requests:
        return None
    return [
        DeviceRequest(
            driver=dr.get("Driver", ""),
            count=dr.get("Count", -1),
            device_ids=dr.get("DeviceIDs") or [],
            capabilities=dr.get("Capabilities") or [],
            options=dr.get("Options") or {},
        )
        for dr in raw_requests
    ]


def scale_workers(target: int) -> dict:
    """Scale flamenco-worker containers to *target* running instances.

    Scale down: the highest-named running containers are stopped and removed.
    If that would delete every worker container, the last one is stopped but
    kept so a later scale-up still has a template to clone.

    Scale up: stopped ("exited"/"created") worker containers are started
    first. Any remaining gap is filled by cloning image, environment, mounts,
    device requests, networks and restart policy from a template container
    (the first running one, else the first existing one). Clones receive the
    lowest instance numbers not already used by an existing worker, so a new
    name never collides with a stopped container.

    Args:
        target: desired number of running workers; must be >= 0.

    Returns:
        A dict with keys:
            previous  containers running before
            current   containers running after
            delta     change (negative = stopped, positive = started)
            message   human-readable summary

    Raises:
        ValueError: if *target* is negative.
        RuntimeError: if scaling up and no worker container exists to clone.
        Errors raised by the Docker SDK are not caught here, except failures
        to attach a clone to a network, which are logged as warnings.
    """
    # Reject nonsense input before touching Docker: a negative target used to
    # stop every worker and report a negative "current" count.
    if target < 0:
        raise ValueError(f"target must be >= 0, got {target}")

    client = _get_client()
    all_workers = get_worker_containers(client)
    running = [c for c in all_workers if c.status == "running"]
    previous = len(running)

    if target == previous:
        return {"previous": previous, "current": previous, "delta": 0,
                "message": f"Already at {previous} worker(s) — no change"}

    # ── Scale down ────────────────────────────────────────────────────────────
    if target < previous:
        # Stop highest-numbered containers first to minimise disruption
        to_stop = sorted(running, key=lambda c: c.name, reverse=True)[: previous - target]
        # If every existing worker container is about to go, keep the
        # lowest-named one (stopped) so scale-up can still clone from it.
        keep_as_template = to_stop[-1] if len(to_stop) == len(all_workers) else None
        for c in to_stop:
            log.info("docker_scaler: stopping %s", c.name)
            c.stop(timeout=20)
            if c is keep_as_template:
                log.info("docker_scaler: keeping stopped %s as clone template", c.name)
                continue
            c.remove()
        return {
            "previous": previous,
            "current": target,
            "delta": target - previous,
            "message": f"Stopped {len(to_stop)} worker(s): {[c.name for c in to_stop]}",
        }

    # ── Scale up ──────────────────────────────────────────────────────────────
    template = running[0] if running else (all_workers[0] if all_workers else None)
    if template is None:
        raise RuntimeError(
            "No existing flamenco-worker container found to clone configuration from. "
            "Ensure at least one worker container exists (even if stopped)."
        )

    needed = target - previous
    started = []

    # Reuse stopped workers first; cloning under their names would fail with
    # a name conflict, and leaving them around would accumulate dead containers.
    restartable = [c for c in all_workers if c.status in ("exited", "created")]
    for c in restartable[:needed]:
        log.info("docker_scaler: starting existing %s", c.name)
        c.start()
        started.append(c.name)
        log.info("docker_scaler: started %s", c.name)

    to_create = needed - len(started)
    if to_create > 0:
        attrs = template.attrs
        image = attrs["Config"]["Image"]
        env = attrs["Config"].get("Env") or []
        mounts = _mounts_from_attrs(attrs)
        device_requests = _device_requests_from_attrs(attrs)
        # Network(s) the template is connected to
        network_names = list(
            (attrs.get("NetworkSettings") or {}).get("Networks", {}).keys()
        )
        restart_policy_name = (
            (attrs.get("HostConfig") or {})
            .get("RestartPolicy", {})
            .get("Name", "unless-stopped")
        ) or "unless-stopped"

        # Names and instance numbers already claimed by any existing worker
        # (running or not); new clones must avoid both.
        number_label = "com.docker.compose.container-number"
        taken_names = {c.name for c in all_workers}
        taken_numbers = {
            ((c.attrs.get("Config") or {}).get("Labels") or {}).get(number_label)
            for c in all_workers
        }

        created = 0
        i = 0
        while created < to_create:
            i += 1
            new_name = f"{COMPOSE_PROJECT}-{SERVICE_NAME}-{i}"
            if new_name in taken_names or str(i) in taken_numbers:
                continue
            labels = {
                "com.docker.compose.project": COMPOSE_PROJECT,
                "com.docker.compose.service": SERVICE_NAME,
                "com.docker.compose.container-number": str(i),
            }
            log.info("docker_scaler: creating %s from image %s", new_name, image)
            container = client.containers.create(
                image=image,
                name=new_name,
                environment=env,
                labels=labels,
                mounts=mounts,
                restart_policy={"Name": restart_policy_name},
                device_requests=device_requests,
            )
            # Network attachment is best-effort: a failure is logged and the
            # worker is still started.
            for net_name in network_names:
                try:
                    net = client.networks.get(net_name)
                    net.connect(container)
                    log.info("docker_scaler: connected %s to network %s", new_name, net_name)
                except Exception as exc:
                    log.warning("docker_scaler: could not connect to network %s: %s", net_name, exc)
            container.start()
            started.append(new_name)
            created += 1
            log.info("docker_scaler: started %s", new_name)

    return {
        "previous": previous,
        "current": target,
        "delta": target - previous,
        "message": f"Started {len(started)} new worker(s): {started}",
    }