From baeb0c4e7f52a7c7912f0feaa8c39fec043849c1 Mon Sep 17 00:00:00 2001 From: Benson Wong Date: Thu, 30 Jan 2025 16:59:57 -0800 Subject: [PATCH] Add cmd_stop configuration to better support docker (#35) Add `cmd_stop` to model configuration to run a command instead of sending a SIGTERM to shut down a process before swapping. --- README.md | 18 ++++++++++- config.example.yaml | 15 +++++++++ llama-swap.go | 12 ++++++++ proxy/config.go | 4 +++ proxy/config_test.go | 26 ++++++++++++++++ proxy/process.go | 72 +++++++++++++++++++++++++++++++------------- 6 files changed, 125 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index a27451d..48eb17b 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ # Introduction llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server. -Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). +Written in golang, it is very easy to install (single binary with no dependancies) and configure (single yaml file). Download a pre-built [release](https://github.com/mostlygeek/llama-swap/releases) or build it yourself from source with `make clean all`. @@ -30,11 +30,13 @@ Any OpenAI compatible server would work. 
llama-swap was originally designed for - `v1/rerank` - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36)) - ✅ Multiple GPU support +- ✅ Docker Support ([#40](https://github.com/mostlygeek/llama-swap/pull/40)) - ✅ Run multiple models at once with `profiles` - ✅ Remote log monitoring at `/log` - ✅ Automatic unloading of models from GPUs after timeout - ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc) - ✅ Direct access to upstream HTTP server via `/upstream/:model_id` ([demo](https://github.com/mostlygeek/llama-swap/pull/31)) +- ## config.yaml @@ -89,6 +91,20 @@ models: cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0 unlisted: true + # Docker Support (Experimental) + # see: https://github.com/mostlygeek/llama-swap/pull/40 + "dockertest": + proxy: "http://127.0.0.1:9790" + + # introduced to reliably stop containers + cmd_stop: docker stop -t 2 dockertest + + cmd: > + docker run --name dockertest + --init --rm -p 9790:8080 -v /mnt/nvme/models:/models + ghcr.io/ggerganov/llama.cpp:server + --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf' + # profiles make it easy to managing multi model (and gpu) configurations. # # Tips: diff --git a/config.example.yaml b/config.example.yaml index 89621cd..6a2a543 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -53,6 +53,21 @@ models: --ctx-size 8192 --reranking + # EXPERIMENTAL! 
Docker Support + # see: + # - https://github.com/mostlygeek/llama-swap/pull/40 + # - https://github.com/mostlygeek/llama-swap/issues/35 + "dockertest": + proxy: "http://127.0.0.1:9790" + + # use this to reliably stop named containers + cmd_stop: docker stop -t 2 dockertest + + cmd: > + docker run --name dockertest + --init --rm -p 9790:8080 -v /mnt/nvme/models:/models + ghcr.io/ggerganov/llama.cpp:server + --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf' "simple": # example of setting environment variables diff --git a/llama-swap.go b/llama-swap.go index 75c9ec6..f7e6b7b 100644 --- a/llama-swap.go +++ b/llama-swap.go @@ -4,6 +4,8 @@ import ( "flag" "fmt" "os" + "os/signal" + "syscall" "github.com/gin-gonic/gin" "github.com/mostlygeek/llama-swap/proxy" @@ -39,6 +41,16 @@ func main() { } proxyManager := proxy.New(config) + + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + go func() { + <-sigChan + fmt.Println("Shutting down llama-swap") + proxyManager.StopProcesses() + os.Exit(0) + }() + fmt.Println("llama-swap listening on " + *listenStr) if err := proxyManager.Run(*listenStr); err != nil { fmt.Printf("Server error: %v\n", err) diff --git a/proxy/config.go b/proxy/config.go index 3206ae9..0ed7487 100644 --- a/proxy/config.go +++ b/proxy/config.go @@ -11,6 +11,7 @@ import ( type ModelConfig struct { Cmd string `yaml:"cmd"` + CmdStop string `yaml:"cmd_stop"` Proxy string `yaml:"proxy"` Aliases []string `yaml:"aliases"` Env []string `yaml:"env"` @@ -22,6 +23,9 @@ type ModelConfig struct { func (m *ModelConfig) SanitizedCommand() ([]string, error) { return SanitizeCommand(m.Cmd) } +func (m *ModelConfig) SanitizeCommandStop() ([]string, error) { + return SanitizeCommand(m.CmdStop) +} type Config struct { HealthCheckTimeout int `yaml:"healthCheckTimeout"` diff --git a/proxy/config_test.go b/proxy/config_test.go index da2eb39..28495f1 100644 --- a/proxy/config_test.go +++ b/proxy/config_test.go @@ -35,6 +35,11 @@ 
models: aliases: - "m2" checkEndpoint: "/" + docker: + cmd: docker run -p 9999:8080 --name "my_container" + cmd_stop: docker stop my_container + proxy: "http://localhost:9999" + checkEndpoint: "/health" healthCheckTimeout: 15 profiles: test: @@ -56,6 +61,7 @@ profiles: Models: map[string]ModelConfig{ "model1": { Cmd: "path/to/cmd --arg1 one", + CmdStop: "", Proxy: "http://localhost:8080", Aliases: []string{"m1", "model-one"}, Env: []string{"VAR1=value1", "VAR2=value2"}, @@ -63,11 +69,19 @@ profiles: }, "model2": { Cmd: "path/to/cmd --arg1 one", + CmdStop: "", Proxy: "http://localhost:8081", Aliases: []string{"m2"}, Env: nil, CheckEndpoint: "/", }, + "docker": { + Cmd: `docker run -p 9999:8080 --name "my_container"`, + CmdStop: "docker stop my_container", + Proxy: "http://localhost:9999", + Env: nil, + CheckEndpoint: "/health", + }, }, HealthCheckTimeout: 15, Profiles: map[string][]string{ @@ -99,6 +113,18 @@ func TestConfig_ModelConfigSanitizedCommand(t *testing.T) { assert.Equal(t, []string{"python", "model1.py", "--arg1", "value1", "--arg2", "value2"}, args) } +func TestConfig_ModelConfigSanitizedCommandStop(t *testing.T) { + config := &ModelConfig{ + CmdStop: `docker stop my_container \ + --arg1 1 + --arg2 2`, + } + + args, err := config.SanitizeCommandStop() + assert.NoError(t, err) + assert.Equal(t, []string{"docker", "stop", "my_container", "--arg1", "1", "--arg2", "2"}, args) +} + func TestConfig_FindConfig(t *testing.T) { // TODO? diff --git a/proxy/process.go b/proxy/process.go index 05c6bac..6ac381c 100644 --- a/proxy/process.go +++ b/proxy/process.go @@ -153,12 +153,13 @@ func (p *Process) Stop() { defer p.stateMutex.Unlock() if p.state != StateReady { + fmt.Fprintf(p.logMonitor, "!!! Stop() called but Process State is not READY\n") return } if p.cmd == nil || p.cmd.Process == nil { // this situation should never happen... but if it does just update the state - fmt.Fprintf(p.logMonitor, "!!! 
State is Ready but Command is nil.") + fmt.Fprintf(p.logMonitor, "!!! State is Ready but Command is nil.\n") p.state = StateStopped return } @@ -166,30 +167,59 @@ func (p *Process) Stop() { // Pretty sure this stopping code needs some work for windows and // will be a source of pain in the future. - sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second) - defer cancel() - - sigtermNormal := make(chan error, 1) - go func() { - sigtermNormal <- p.cmd.Wait() - }() - - p.cmd.Process.Signal(syscall.SIGTERM) - - select { - case <-sigtermTimeout.Done(): - fmt.Fprintf(p.logMonitor, "XXX Process for %s timed out waiting to stop, sending SIGKILL to PID: %d\n", p.ID, p.cmd.Process.Pid) - p.cmd.Process.Kill() - p.cmd.Wait() - case err := <-sigtermNormal: + if p.config.CmdStop != "" { + // for issue #35 to do things like `docker stop` + args, err := p.config.SanitizeCommandStop() if err != nil { - if err.Error() != "wait: no child processes" { - // possible that simple-responder for testing is just not - // existing right, so suppress those errors. - fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err) + fmt.Fprintf(p.logMonitor, "!!! Error sanitizing stop command: %v\n", err) + + // leave the state as it is? + return + } + + fmt.Fprintf(p.logMonitor, "!!! Running stop command: %s\n", strings.Join(args, " ")) + cmd := exec.Command(args[0], args[1:]...) + cmd.Stdout = p.logMonitor + cmd.Stderr = p.logMonitor + err = cmd.Start() + if err != nil { + fmt.Fprintf(p.logMonitor, "!!! Error running stop command: %v\n", err) + + // leave the state as it is? + return + } + + err = cmd.Wait() + if err != nil { + fmt.Fprintf(p.logMonitor, "!!! 
WARNING error waiting for stop command to complete: %v\n", err) + } + } else { + sigtermTimeout, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + sigtermNormal := make(chan error, 1) + go func() { + sigtermNormal <- p.cmd.Wait() + }() + + p.cmd.Process.Signal(syscall.SIGTERM) + + select { + case <-sigtermTimeout.Done(): + fmt.Fprintf(p.logMonitor, "XXX Process for %s timed out waiting to stop, sending SIGKILL to PID: %d\n", p.ID, p.cmd.Process.Pid) + p.cmd.Process.Kill() + p.cmd.Wait() + case err := <-sigtermNormal: + if err != nil { + if err.Error() != "wait: no child processes" { + // possible that simple-responder for testing is just not + // existing right, so suppress those errors. + fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err) + } } } } + p.state = StateStopped }