From baeb0c4e7f52a7c7912f0feaa8c39fec043849c1 Mon Sep 17 00:00:00 2001
From: Benson Wong <mostlygeek@gmail.com>
Date: Thu, 30 Jan 2025 16:59:57 -0800
Subject: [PATCH] Add cmd_stop configuration to better support docker (#35)

Add `cmd_stop` to the model configuration to run a command, instead of sending SIGTERM, to shut down a process before swapping.
---
 README.md            | 18 ++++++++++-
 config.example.yaml  | 15 +++++++++
 llama-swap.go        | 12 ++++++++
 proxy/config.go      |  4 +++
 proxy/config_test.go | 26 ++++++++++++++++
 proxy/process.go     | 72 +++++++++++++++++++++++++++++++-------------
 6 files changed, 125 insertions(+), 22 deletions(-)

diff --git a/README.md b/README.md
index a27451d..48eb17b 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 # Introduction
 llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
 
-Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). 
+Written in golang, it is very easy to install (single binary with no dependencies) and configure (single yaml file).
 
 Download a pre-built [release](https://github.com/mostlygeek/llama-swap/releases) or build it yourself from source with `make clean all`.
 
@@ -30,11 +30,13 @@ Any OpenAI compatible server would work. llama-swap was originally designed for
   - `v1/rerank`
   - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
 - ✅ Multiple GPU support
+- ✅ Docker Support ([#40](https://github.com/mostlygeek/llama-swap/pull/40))
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/log`
 - ✅ Automatic unloading of models from GPUs after timeout
 - ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
 - ✅ Direct access to upstream HTTP server via `/upstream/:model_id` ([demo](https://github.com/mostlygeek/llama-swap/pull/31))
+
 
 ## config.yaml
 
@@ -89,6 +91,20 @@ models:
     cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
     unlisted: true
 
+  # Docker Support (Experimental)
+  # see: https://github.com/mostlygeek/llama-swap/pull/40
+  "dockertest":
+    proxy: "http://127.0.0.1:9790"
+
+    # introduced to reliably stop containers
+    cmd_stop: docker stop -t 2 dockertest
+
+    cmd: >
+      docker run --name dockertest
+      --init --rm -p 9790:8080 -v /mnt/nvme/models:/models
+      ghcr.io/ggerganov/llama.cpp:server
+      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
+
 # profiles make it easy to managing multi model (and gpu) configurations.
 #
 # Tips:
diff --git a/config.example.yaml b/config.example.yaml
index 89621cd..6a2a543 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -53,6 +53,21 @@ models:
       --ctx-size 8192
       --reranking
 
+  # EXPERIMENTAL! Docker Support
+  # see:
+  #  - https://github.com/mostlygeek/llama-swap/pull/40
+  #  - https://github.com/mostlygeek/llama-swap/issues/35
+  "dockertest":
+    proxy: "http://127.0.0.1:9790"
+
+    # use this to reliably stop named containers
+    cmd_stop: docker stop -t 2 dockertest
+
+    cmd: >
+      docker run --name dockertest
+      --init --rm -p 9790:8080 -v /mnt/nvme/models:/models
+      ghcr.io/ggerganov/llama.cpp:server
+      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
 
   "simple":
     # example of setting environment variables
diff --git a/llama-swap.go b/llama-swap.go
index 75c9ec6..f7e6b7b 100644
--- a/llama-swap.go
+++ b/llama-swap.go
@@ -4,6 +4,8 @@ import (
 	"flag"
 	"fmt"
 	"os"
+	"os/signal"
+	"syscall"
 
 	"github.com/gin-gonic/gin"
 	"github.com/mostlygeek/llama-swap/proxy"
@@ -39,6 +41,16 @@ func main() {
 	}
 
 	proxyManager := proxy.New(config)
+
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+	go func() {
+		<-sigChan
+		fmt.Println("Shutting down llama-swap")
+		proxyManager.StopProcesses()
+		os.Exit(0)
+	}()
+
 	fmt.Println("llama-swap listening on " + *listenStr)
 	if err := proxyManager.Run(*listenStr); err != nil {
 		fmt.Printf("Server error: %v\n", err)
diff --git a/proxy/config.go b/proxy/config.go
index 3206ae9..0ed7487 100644
--- a/proxy/config.go
+++ b/proxy/config.go
@@ -11,6 +11,7 @@ import (
 
 type ModelConfig struct {
 	Cmd           string   `yaml:"cmd"`
+	CmdStop       string   `yaml:"cmd_stop"`
 	Proxy         string   `yaml:"proxy"`
 	Aliases       []string `yaml:"aliases"`
 	Env           []string `yaml:"env"`
@@ -22,6 +23,9 @@ type ModelConfig struct {
 func (m *ModelConfig) SanitizedCommand() ([]string, error) {
 	return SanitizeCommand(m.Cmd)
 }
+func (m *ModelConfig) SanitizeCommandStop() ([]string, error) {
+	return SanitizeCommand(m.CmdStop)
+}
 
 type Config struct {
 	HealthCheckTimeout int                    `yaml:"healthCheckTimeout"`
diff --git a/proxy/config_test.go b/proxy/config_test.go
index da2eb39..28495f1 100644
--- a/proxy/config_test.go
+++ b/proxy/config_test.go
@@ -35,6 +35,11 @@ models:
     aliases:
       - "m2"
     checkEndpoint: "/"
+  docker:
+    cmd: docker run -p 9999:8080 --name "my_container"
+    cmd_stop: docker stop my_container
+    proxy: "http://localhost:9999"
+    checkEndpoint: "/health"
 healthCheckTimeout: 15
 profiles:
   test:
@@ -56,6 +61,7 @@ profiles:
 		Models: map[string]ModelConfig{
 			"model1": {
 				Cmd:           "path/to/cmd --arg1 one",
+				CmdStop:       "",
 				Proxy:         "http://localhost:8080",
 				Aliases:       []string{"m1", "model-one"},
 				Env:           []string{"VAR1=value1", "VAR2=value2"},
@@ -63,11 +69,19 @@ profiles:
 			},
 			"model2": {
 				Cmd:           "path/to/cmd --arg1 one",
+				CmdStop:       "",
 				Proxy:         "http://localhost:8081",
 				Aliases:       []string{"m2"},
 				Env:           nil,
 				CheckEndpoint: "/",
 			},
+			"docker": {
+				Cmd:           `docker run -p 9999:8080 --name "my_container"`,
+				CmdStop:       "docker stop my_container",
+				Proxy:         "http://localhost:9999",
+				Env:           nil,
+				CheckEndpoint: "/health",
+			},
 		},
 		HealthCheckTimeout: 15,
 		Profiles: map[string][]string{
@@ -99,6 +113,18 @@ func TestConfig_ModelConfigSanitizedCommand(t *testing.T) {
 	assert.Equal(t, []string{"python", "model1.py", "--arg1", "value1", "--arg2", "value2"}, args)
 }
 
+func TestConfig_ModelConfigSanitizedCommandStop(t *testing.T) {
+	config := &ModelConfig{
+		CmdStop: `docker stop my_container \
+		--arg1 1
+		--arg2 2`,
+	}
+
+	args, err := config.SanitizeCommandStop()
+	assert.NoError(t, err)
+	assert.Equal(t, []string{"docker", "stop", "my_container", "--arg1", "1", "--arg2", "2"}, args)
+}
+
 func TestConfig_FindConfig(t *testing.T) {
 
 	// TODO?
diff --git a/proxy/process.go b/proxy/process.go
index 05c6bac..6ac381c 100644
--- a/proxy/process.go
+++ b/proxy/process.go
@@ -153,12 +153,13 @@ func (p *Process) Stop() {
 	defer p.stateMutex.Unlock()
 
 	if p.state != StateReady {
+		fmt.Fprintf(p.logMonitor, "!!! Stop() called but Process State is not READY\n")
 		return
 	}
 
 	if p.cmd == nil || p.cmd.Process == nil {
 		// this situation should never happen... but if it does just update the state
-		fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.")
+		fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.\n")
 		p.state = StateStopped
 		return
 	}
@@ -166,30 +167,59 @@ func (p *Process) Stop() {
 	// Pretty sure this stopping code needs some work for windows and
 	// will be a source of pain in the future.
 
-	sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
-	defer cancel()
-
-	sigtermNormal := make(chan error, 1)
-	go func() {
-		sigtermNormal <- p.cmd.Wait()
-	}()
-
-	p.cmd.Process.Signal(syscall.SIGTERM)
-
-	select {
-	case <-sigtermTimeout.Done():
-		fmt.Fprintf(p.logMonitor, "XXX Process for %s timed out waiting to stop, sending SIGKILL to PID: %d\n", p.ID, p.cmd.Process.Pid)
-		p.cmd.Process.Kill()
-		p.cmd.Wait()
-	case err := <-sigtermNormal:
+	if p.config.CmdStop != "" {
+		// for issue #35 to do things like `docker stop`
+		args, err := p.config.SanitizeCommandStop()
 		if err != nil {
-			if err.Error() != "wait: no child processes" {
-				// possible that simple-responder for testing is just not
-				// existing right, so suppress those errors.
-				fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err)
+			fmt.Fprintf(p.logMonitor, "!!! Error sanitizing stop command: %v\n", err)
+
+			// leave the state as it is?
+			return
+		}
+
+		fmt.Fprintf(p.logMonitor, "!!! Running stop command: %s\n", strings.Join(args, " "))
+		cmd := exec.Command(args[0], args[1:]...)
+		cmd.Stdout = p.logMonitor
+		cmd.Stderr = p.logMonitor
+		err = cmd.Start()
+		if err != nil {
+			fmt.Fprintf(p.logMonitor, "!!! Error running stop command: %v\n", err)
+
+			// leave the state as it is?
+			return
+		}
+
+		err = cmd.Wait()
+		if err != nil {
+			fmt.Fprintf(p.logMonitor, "!!! WARNING error waiting for stop command to complete: %v\n", err)
+		}
+	} else {
+		sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		defer cancel()
+
+		sigtermNormal := make(chan error, 1)
+		go func() {
+			sigtermNormal <- p.cmd.Wait()
+		}()
+
+		p.cmd.Process.Signal(syscall.SIGTERM)
+
+		select {
+		case <-sigtermTimeout.Done():
+			fmt.Fprintf(p.logMonitor, "XXX Process for %s timed out waiting to stop, sending SIGKILL to PID: %d\n", p.ID, p.cmd.Process.Pid)
+			p.cmd.Process.Kill()
+			p.cmd.Wait()
+		case err := <-sigtermNormal:
+			if err != nil {
+				if err.Error() != "wait: no child processes" {
+					// possible that simple-responder for testing is just not
+					// exiting right, so suppress those errors.
+					fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err)
+				}
 			}
 		}
 	}
+
 	p.state = StateStopped
 }