178 lines
6.1 KiB
Python
178 lines
6.1 KiB
Python
"""Scale Flamenco worker containers via the Docker socket.
|
||
|
||
Uses the Docker Python SDK (docker>=6.1.0) to list, start, and stop containers.
|
||
Requires /var/run/docker.sock to be mounted into the backend container.
|
||
"""
|
||
import os
|
||
import logging
|
||
|
||
# Module-level logger; messages emitted by this module carry a
# "docker_scaler:" prefix in their text.
log = logging.getLogger(__name__)

# Compose project whose worker containers are managed. Read once at import
# time from COMPOSE_PROJECT_NAME, with a fixed fallback when it is unset.
COMPOSE_PROJECT = os.environ.get("COMPOSE_PROJECT_NAME", "schaefflerautomat")

# Compose service name used both as a label filter and in generated names.
SERVICE_NAME = "flamenco-worker"
|
||
|
||
|
||
def _get_client():
    """Return a Docker SDK client built by ``docker.from_env()``.

    The ``docker`` package is imported when this function is called rather
    than when the module is imported.
    """
    import docker as docker_sdk

    client = docker_sdk.from_env()
    return client
|
||
|
||
|
||
def get_worker_containers(client=None):
    """List every flamenco-worker container of the Compose project.

    Args:
        client: Docker SDK client to query. When ``None``, a client is
            obtained from ``_get_client()``.

    Returns:
        The containers carrying both the Compose project label and the
        Compose service label, in any state (``all=True``), ordered by
        container name.
    """
    docker_client = _get_client() if client is None else client

    # Both labels must match: the list is passed as a single "label" filter.
    label_filter = [
        f"com.docker.compose.project={COMPOSE_PROJECT}",
        f"com.docker.compose.service={SERVICE_NAME}",
    ]
    found = docker_client.containers.list(
        all=True,
        filters={"label": label_filter},
    )
    return sorted(found, key=lambda container: container.name)
|
||
|
||
|
||
def get_running_worker_count(client=None) -> int:
    """Count the flamenco-worker containers whose status is ``"running"``.

    Args:
        client: Docker SDK client to query. When ``None``, a client is
            obtained from ``_get_client()``.

    Returns:
        The number of running worker containers, or ``-1`` when the count
        could not be determined. Failures are logged as warnings and never
        raised to the caller.
    """
    try:
        docker_client = client if client is not None else _get_client()
        running = [
            container
            for container in get_worker_containers(docker_client)
            if container.status == "running"
        ]
    except Exception as exc:
        log.warning("docker_scaler: could not read worker count: %s", exc)
        return -1
    return len(running)
|
||
|
||
|
||
def scale_workers(target: int) -> dict:
    """Scale flamenco-worker containers to *target* running instances.

    Scale-down stops and removes the highest-numbered running containers.
    Scale-up copies image, environment, mounts, GPU device requests,
    networks and restart policy from an existing worker (a running one if
    available, otherwise a stopped one) and starts new containers under
    names that no existing worker container uses.

    Args:
        target: Desired number of running workers; must be >= 0.

    Returns:
        A dict with keys:
            previous – containers running before
            current  – containers running after
            delta    – change (negative = stopped, positive = started)
            message  – human-readable summary

    Raises:
        ValueError: If *target* is negative.
        RuntimeError: If scaling up is requested but no worker container
            (running or stopped) exists to copy configuration from.
    """
    import docker
    from docker.types import Mount

    # A negative target would otherwise stop every worker and report a
    # negative "current" count.
    if target < 0:
        raise ValueError(f"target must be >= 0, got {target}")

    client = _get_client()

    all_workers = get_worker_containers(client)
    running = [c for c in all_workers if c.status == "running"]
    previous = len(running)

    if target == previous:
        return {"previous": previous, "current": previous, "delta": 0,
                "message": f"Already at {previous} worker(s) — no change"}

    # ── Scale down ────────────────────────────────────────────────────────────
    if target < previous:
        # Stop highest-numbered containers first to minimise disruption.
        # Order by the numeric suffix: a plain string sort would rank
        # "...-10" below "...-2".
        to_stop = sorted(
            running,
            key=lambda c: (_worker_number(c.name), c.name),
            reverse=True,
        )[: previous - target]
        for c in to_stop:
            log.info("docker_scaler: stopping %s", c.name)
            c.stop(timeout=20)
            c.remove()
        return {
            "previous": previous,
            "current": target,
            "delta": target - previous,
            "message": f"Stopped {len(to_stop)} worker(s): {[c.name for c in to_stop]}",
        }

    # ── Scale up ──────────────────────────────────────────────────────────────
    template = running[0] if running else (all_workers[0] if all_workers else None)
    if template is None:
        raise RuntimeError(
            "No existing flamenco-worker container found to clone configuration from. "
            "Ensure at least one worker container exists (even if stopped)."
        )

    attrs = template.attrs
    host_config = attrs.get("HostConfig") or {}
    image = attrs["Config"]["Image"]
    env = attrs["Config"].get("Env") or []

    # Reconstruct mounts from the template container. Volume mounts are
    # referenced by volume name, every other type by its source path.
    mounts = []
    for m in (attrs.get("Mounts") or []):
        mount_type = m.get("Type", "bind")
        source = m.get("Name", "") if mount_type == "volume" else m.get("Source", "")
        mounts.append(
            Mount(
                target=m["Destination"],
                source=source,
                type=mount_type,
                read_only=not m.get("RW", True),
            )
        )

    # Reconstruct GPU device requests (nvidia); None when the template has none.
    raw_dr = host_config.get("DeviceRequests") or []
    device_requests = [
        docker.types.DeviceRequest(
            driver=dr.get("Driver", ""),
            count=dr.get("Count", -1),
            device_ids=dr.get("DeviceIDs") or [],
            capabilities=dr.get("Capabilities") or [],
            options=dr.get("Options") or {},
        )
        for dr in raw_dr
    ] or None

    # Network(s) the template is connected to. "or {}" also covers the key
    # being present with a null value.
    network_names = list(
        ((attrs.get("NetworkSettings") or {}).get("Networks") or {}).keys()
    )

    restart_policy_name = (
        (host_config.get("RestartPolicy") or {}).get("Name") or "unless-stopped"
    )

    # Names already used by any worker container, running or stopped.
    # Reusing one of them would make the create call fail with a name conflict.
    taken_names = {c.name for c in all_workers}

    started = []
    for number, new_name in _unused_worker_slots(taken_names, target - previous):
        labels = {
            "com.docker.compose.project": COMPOSE_PROJECT,
            "com.docker.compose.service": SERVICE_NAME,
            "com.docker.compose.container-number": str(number),
        }

        log.info("docker_scaler: creating %s from image %s", new_name, image)
        container = client.containers.create(
            image=image,
            name=new_name,
            environment=env,
            labels=labels,
            mounts=mounts,
            restart_policy={"Name": restart_policy_name},
            device_requests=device_requests,
        )

        # Best effort: a failed network attachment is logged, and the
        # container is still started.
        for net_name in network_names:
            try:
                net = client.networks.get(net_name)
                net.connect(container)
                log.info("docker_scaler: connected %s to network %s", new_name, net_name)
            except Exception as exc:
                log.warning("docker_scaler: could not connect to network %s: %s", net_name, exc)

        container.start()
        started.append(new_name)
        log.info("docker_scaler: started %s", new_name)

    return {
        "previous": previous,
        "current": target,
        "delta": target - previous,
        "message": f"Started {len(started)} new worker(s): {started}",
    }


def _worker_number(name: str) -> int:
    """Return the trailing integer of a container name, or -1 if it has none."""
    import re

    match = re.search(r"(\d+)$", name)
    return int(match.group(1)) if match else -1


def _unused_worker_slots(taken_names, count: int) -> list:
    """Return *count* ``(number, name)`` pairs whose names are not in *taken_names*.

    Numbers are tried from 1 upwards, so gaps in the existing numbering are
    filled before higher numbers are used.
    """
    slots = []
    number = 0
    while len(slots) < count:
        number += 1
        name = f"{COMPOSE_PROJECT}-{SERVICE_NAME}-{number}"
        if name not in taken_names:
            slots.append((number, name))
    return slots
|