"""Scale Flamenco worker containers via the Docker socket. Uses the Docker Python SDK (docker>=6.1.0) to list, start, and stop containers. Requires /var/run/docker.sock to be mounted into the backend container. """ import os import logging log = logging.getLogger(__name__) COMPOSE_PROJECT = os.getenv("COMPOSE_PROJECT_NAME", "schaefflerautomat") SERVICE_NAME = "flamenco-worker" def _get_client(): import docker return docker.from_env() def get_worker_containers(client=None): """Return all flamenco-worker containers (running + stopped) sorted by name.""" if client is None: client = _get_client() return sorted( client.containers.list( all=True, filters={ "label": [ f"com.docker.compose.project={COMPOSE_PROJECT}", f"com.docker.compose.service={SERVICE_NAME}", ] }, ), key=lambda c: c.name, ) def get_running_worker_count(client=None) -> int: """Return how many flamenco-worker containers are currently running.""" try: if client is None: client = _get_client() containers = get_worker_containers(client) return sum(1 for c in containers if c.status == "running") except Exception as exc: log.warning("docker_scaler: could not read worker count: %s", exc) return -1 def scale_workers(target: int) -> dict: """Scale flamenco-worker containers to *target* count. Returns a dict with keys: previous – containers running before current – containers running after delta – change (negative = stopped, positive = started) message – human-readable summary """ import docker from docker.types import Mount client = _get_client() all_workers = get_worker_containers(client) running = [c for c in all_workers if c.status == "running"] previous = len(running) if target == previous: return {"previous": previous, "current": previous, "delta": 0, "message": f"Already at {previous} worker(s) — no change"} # ── Scale down ──────────────────────────────────────────────────────────── if target < previous: # Stop highest-numbered containers first to minimise disruption to_stop = sorted(running, key=lambda c: c.name, reverse=True)[: previous - target] for c in to_stop: log.info("docker_scaler: stopping %s", c.name) c.stop(timeout=20) c.remove() return { "previous": previous, "current": target, "delta": target - previous, "message": f"Stopped {len(to_stop)} worker(s): {[c.name for c in to_stop]}", } # ── Scale up ────────────────────────────────────────────────────────────── template = running[0] if running else (all_workers[0] if all_workers else None) if template is None: raise RuntimeError( "No existing flamenco-worker container found to clone configuration from. " "Ensure at least one worker container exists (even if stopped)." ) attrs = template.attrs image = attrs["Config"]["Image"] env = attrs["Config"].get("Env") or [] # Reconstruct mounts from the template container mounts = [] for m in (attrs.get("Mounts") or []): mount_type = m.get("Type", "bind") source = m.get("Name", "") if mount_type == "volume" else m.get("Source", "") mounts.append( Mount( target=m["Destination"], source=source, type=mount_type, read_only=not m.get("RW", True), ) ) # Reconstruct GPU device requests (nvidia) device_requests = None raw_dr = (attrs.get("HostConfig") or {}).get("DeviceRequests") or [] if raw_dr: device_requests = [] for dr in raw_dr: device_requests.append( docker.types.DeviceRequest( driver=dr.get("Driver", ""), count=dr.get("Count", -1), device_ids=dr.get("DeviceIDs") or [], capabilities=dr.get("Capabilities") or [], options=dr.get("Options") or {}, ) ) # Network(s) the template is connected to network_names = list( (attrs.get("NetworkSettings") or {}).get("Networks", {}).keys() ) restart_policy_name = ( (attrs.get("HostConfig") or {}) .get("RestartPolicy", {}) .get("Name", "unless-stopped") ) or "unless-stopped" started = [] for i in range(previous + 1, target + 1): new_name = f"{COMPOSE_PROJECT}-{SERVICE_NAME}-{i}" labels = { "com.docker.compose.project": COMPOSE_PROJECT, "com.docker.compose.service": SERVICE_NAME, "com.docker.compose.container-number": str(i), } log.info("docker_scaler: creating %s from image %s", new_name, image) container = client.containers.create( image=image, name=new_name, environment=env, labels=labels, mounts=mounts, restart_policy={"Name": restart_policy_name}, device_requests=device_requests, ) for net_name in network_names: try: net = client.networks.get(net_name) net.connect(container) log.info("docker_scaler: connected %s to network %s", new_name, net_name) except Exception as exc: log.warning("docker_scaler: could not connect to network %s: %s", net_name, exc) container.start() started.append(new_name) log.info("docker_scaler: started %s", new_name) return { "previous": previous, "current": target, "delta": target - previous, "message": f"Started {len(started)} new worker(s): {started}", }