Add cmd_stop configuration to better support docker (#35)

Add `cmd_stop` to the model configuration to run a command, instead of sending a SIGTERM, to shut down a process before swapping.
2025-01-30 16:59:57 -08:00
parent 2833517eef
commit baeb0c4e7f
6 changed files with 125 additions and 22 deletions
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 # Introduction
 llama-swap is a lightweight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
-Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). 
+Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file).
 Download a pre-built [release](https://github.com/mostlygeek/llama-swap/releases) or build it yourself from source with `make clean all`.
@@ -30,11 +30,13 @@ Any OpenAI compatible server would work. llama-swap was originally designed for
  - `v1/rerank`
  - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
 - ✅ Multiple GPU support
 - ✅ Docker Support ([#40](https://github.com/mostlygeek/llama-swap/pull/40))
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/log`
 - ✅ Automatic unloading of models from GPUs after timeout
 - ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
 - ✅ Direct access to upstream HTTP server via `/upstream/:model_id` ([demo](https://github.com/mostlygeek/llama-swap/pull/31))
 -
 ## config.yaml
@@ -89,6 +91,20 @@ models:
    cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
    unlisted: true
  # Docker Support (Experimental)
  # see: https://github.com/mostlygeek/llama-swap/pull/40
  "dockertest":
    proxy: "http://127.0.0.1:9790"
    # introduced to reliably stop containers
    cmd_stop: docker stop -t 2 dockertest
    cmd: >
      docker run --name dockertest
      --init --rm -p 9790:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggerganov/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
 # profiles make it easy to manage multi-model (and GPU) configurations.
 #
 # Tips:
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -53,6 +53,21 @@ models:
      --ctx-size 8192
      --reranking
  # EXPERIMENTAL! Docker Support
  # see:
  #  - https://github.com/mostlygeek/llama-swap/pull/40
  #  - https://github.com/mostlygeek/llama-swap/issues/35
  "dockertest":
    proxy: "http://127.0.0.1:9790"
    # use this to reliably stop named containers
    cmd_stop: docker stop -t 2 dockertest
    cmd: >
      docker run --name dockertest
      --init --rm -p 9790:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggerganov/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
  "simple":
    # example of setting environment variables
--- a/llama-swap.go
+++ b/llama-swap.go
@@ -4,6 +4,8 @@ import (
 	"flag"
 	"fmt"
 	"os"
 	"os/signal"
 	"syscall"
 	"github.com/gin-gonic/gin"
 	"github.com/mostlygeek/llama-swap/proxy"
@@ -39,6 +41,16 @@ func main() {
 	}
 	proxyManager := proxy.New(config)
 	sigChan := make(chan os.Signal, 1)
 	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
 	go func() {
 		<-sigChan
 		fmt.Println("Shutting down llama-swap")
 		proxyManager.StopProcesses()
 		os.Exit(0)
 	}()
 	fmt.Println("llama-swap listening on " + *listenStr)
 	if err := proxyManager.Run(*listenStr); err != nil {
 		fmt.Printf("Server error: %v\n", err)
--- a/proxy/config.go
+++ b/proxy/config.go
@@ -11,6 +11,7 @@ import (
 type ModelConfig struct {
 	Cmd           string   `yaml:"cmd"`
 	CmdStop       string   `yaml:"cmd_stop"`
 	Proxy         string   `yaml:"proxy"`
 	Aliases       []string `yaml:"aliases"`
 	Env           []string `yaml:"env"`
@@ -22,6 +23,9 @@ type ModelConfig struct {
 // SanitizedCommand passes the model's Cmd string (the `cmd` YAML field)
 // through SanitizeCommand and returns its result unchanged: the argument
 // list, or the error SanitizeCommand reports.
 func (m *ModelConfig) SanitizedCommand() ([]string, error) {
 	return SanitizeCommand(m.Cmd)
 }
 // SanitizeCommandStop passes the model's CmdStop string (the `cmd_stop`
 // YAML field) through SanitizeCommand and returns its result unchanged:
 // the argument list, or the error SanitizeCommand reports.
 //
 // NOTE(review): the name is "Sanitize..." while the sibling method is
 // "SanitizedCommand"; consider aligning the two names in a follow-up.
 // NOTE(review): behavior for an empty CmdStop depends on SanitizeCommand,
 // which is not shown here -- confirm before calling with an unset value.
 func (m *ModelConfig) SanitizeCommandStop() ([]string, error) {
 	return SanitizeCommand(m.CmdStop)
 }
 type Config struct {
 	HealthCheckTimeout int                    `yaml:"healthCheckTimeout"`
--- a/proxy/config_test.go
+++ b/proxy/config_test.go
@@ -35,6 +35,11 @@ models:
    aliases:
      - "m2"
    checkEndpoint: "/"
  docker:
    cmd: docker run -p 9999:8080 --name "my_container"
    cmd_stop: docker stop my_container
    proxy: "http://localhost:9999"
    checkEndpoint: "/health"
 healthCheckTimeout: 15
 profiles:
  test:
@@ -56,6 +61,7 @@ profiles:
 		Models: map[string]ModelConfig{
 			"model1": {
 				Cmd:           "path/to/cmd --arg1 one",
 				CmdStop:       "",
 				Proxy:         "http://localhost:8080",
 				Aliases:       []string{"m1", "model-one"},
 				Env:           []string{"VAR1=value1", "VAR2=value2"},
@@ -63,11 +69,19 @@ profiles:
 			},
 			"model2": {
 				Cmd:           "path/to/cmd --arg1 one",
 				CmdStop:       "",
 				Proxy:         "http://localhost:8081",
 				Aliases:       []string{"m2"},
 				Env:           nil,
 				CheckEndpoint: "/",
 			},
 			"docker": {
 				Cmd:           `docker run -p 9999:8080 --name "my_container"`,
 				CmdStop:       "docker stop my_container",
 				Proxy:         "http://localhost:9999",
 				Env:           nil,
 				CheckEndpoint: "/health",
 			},
 		},
 		HealthCheckTimeout: 15,
 		Profiles: map[string][]string{
@@ -99,6 +113,18 @@ func TestConfig_ModelConfigSanitizedCommand(t *testing.T) {
 	assert.Equal(t, []string{"python", "model1.py", "--arg1", "value1", "--arg2", "value2"}, args)
 }
 // TestConfig_ModelConfigSanitizedCommandStop checks that a multi-line
 // CmdStop value is split into individual arguments by SanitizeCommandStop.
 func TestConfig_ModelConfigSanitizedCommandStop(t *testing.T) {
 	// The stop command spans three lines: the first ends with a backslash
 	// continuation, the others are separated by plain newlines.
 	config := &ModelConfig{
 		CmdStop: `docker stop my_container \
 		--arg1 1
 		--arg2 2`,
 	}

 	args, err := config.SanitizeCommandStop()
 	assert.NoError(t, err)
 	// Every whitespace-separated token is expected as its own argument, with
 	// the backslash and line breaks removed.
 	assert.Equal(t, []string{"docker", "stop", "my_container", "--arg1", "1", "--arg2", "2"}, args)
 }
 func TestConfig_FindConfig(t *testing.T) {
 	// TODO?
--- a/proxy/process.go
+++ b/proxy/process.go
@@ -153,12 +153,13 @@ func (p *Process) Stop() {
 	defer p.stateMutex.Unlock()
 	if p.state != StateReady {
 		fmt.Fprintf(p.logMonitor, "!!! Stop() called but Process State is not READY\n")
 		return
 	}
 	if p.cmd == nil || p.cmd.Process == nil {
 		// this situation should never happen... but if it does just update the state
-		fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.")
+		fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.\n")
 		p.state = StateStopped
 		return
 	}
@@ -166,30 +167,59 @@ func (p *Process) Stop() {
 	// Pretty sure this stopping code needs some work for windows and
 	// will be a source of pain in the future.
-	sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	if p.config.CmdStop != "" {
-	defer cancel()
+		// for issue #35 to do things like `docker stop`
-
+		args, err := p.config.SanitizeCommandStop()
 	sigtermNormal := make(chan error, 1)
 	go func() {
 		sigtermNormal <- p.cmd.Wait()
 	}()
 	p.cmd.Process.Signal(syscall.SIGTERM)
 	select {
 	case <-sigtermTimeout.Done():
 		fmt.Fprintf(p.logMonitor, "XXX Process for %s timed out waiting to stop, sending SIGKILL to PID: %d\n", p.ID, p.cmd.Process.Pid)
 		p.cmd.Process.Kill()
 		p.cmd.Wait()
 	case err := <-sigtermNormal:
 		if err != nil {
-			if err.Error() != "wait: no child processes" {
+			fmt.Fprintf(p.logMonitor, "!!! Error sanitizing stop command: %v\n", err)
-				// possible that simple-responder for testing is just not
+
-				// existing right, so suppress those errors.
+			// leave the state as it is?
-				fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err)
+			return
 		}
 		fmt.Fprintf(p.logMonitor, "!!! Running stop command: %s\n", strings.Join(args, " "))
 		cmd := exec.Command(args[0], args[1:]...)
 		cmd.Stdout = p.logMonitor
 		cmd.Stderr = p.logMonitor
 		err = cmd.Start()
 		if err != nil {
 			fmt.Fprintf(p.logMonitor, "!!! Error running stop command: %v\n", err)
 			// leave the state as it is?
 			return
 		}
 		err = cmd.Wait()
 		if err != nil {
 			fmt.Fprintf(p.logMonitor, "!!! WARNING error waiting for stop command to complete: %v\n", err)
 		}
 	} else {
 		sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 		defer cancel()
 		sigtermNormal := make(chan error, 1)
 		go func() {
 			sigtermNormal <- p.cmd.Wait()
 		}()
 		p.cmd.Process.Signal(syscall.SIGTERM)
 		select {
 		case <-sigtermTimeout.Done():
 			fmt.Fprintf(p.logMonitor, "XXX Process for %s timed out waiting to stop, sending SIGKILL to PID: %d\n", p.ID, p.cmd.Process.Pid)
 			p.cmd.Process.Kill()
 			p.cmd.Wait()
 		case err := <-sigtermNormal:
 			if err != nil {
 				if err.Error() != "wait: no child processes" {
 					// possible that simple-responder for testing is just not
 					// existing right, so suppress those errors.
 					fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err)
 				}
 			}
 		}
 	}
 	p.state = StateStopped
 }