diff --git a/README.md b/README.md
index c9a6dec..679caad 100644
--- a/README.md
+++ b/README.md
@@ -45,158 +45,31 @@ llama-swap's configuration is purposefully simple.
 ```yaml
 models:
   "qwen2.5":
-    proxy: "http://127.0.0.1:9999"
     cmd: |
       /app/llama-server
       -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
-      --port 9999
+      --port ${PORT}
   "smollm2":
-    proxy: "http://127.0.0.1:9999"
     cmd: |
       /app/llama-server
       -hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
-      --port 9999
+      --port ${PORT}
 ```
-
-But also very powerful ...
+But also very powerful:
-```yaml
-# Seconds to wait for upstream to load and be ready to serve requests
-# minimum is 15 seconds
-# default is 120 seconds
-healthCheckTimeout: 500
+- ⚡ `groups` to run multiple models at once
+- ⚡ `macros` for reusable snippets
+- ⚡ `ttl` to automatically unload models
+- ⚡ `aliases` to use familiar model names (e.g., "gpt-4o-mini")
+- ⚡ `env` variables to pass custom environment to inference servers
+- ⚡ `useModelName` to override model names sent to upstream servers
+- ⚡ `healthCheckTimeout` to control model startup wait times
+- ⚡ `${PORT}` automatic port variables for dynamic port assignment
+- ⚡ Docker/podman compatible
-# Valid log levels: debug, info (default), warn, error
-logLevel: info
-
-# Automatic Port Values
-# use ${PORT} in model.cmd and model.proxy to use an automatic port number
-# when you use ${PORT} you can omit a custom model.proxy value, as it will
-# default to http://localhost:${PORT}
-
-# override the default port (5800) for automatic port values
-startPort: 10001
-
-# define valid model values and the upstream server start
-models:
-  "llama":
-    # multiline for readability
-    cmd: |
-      llama-server --port 8999
-      --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
-
-    # environment variables to pass to the command
-    env:
-      - "CUDA_VISIBLE_DEVICES=0"
-
-    # where to reach the server started by cmd, make sure the ports match
-    # can be omitted if you use an automatic ${PORT} in cmd
-    proxy: http://127.0.0.1:8999
-
-    # aliases names to use this model for
-    aliases:
-      - "gpt-4o-mini"
-      - "gpt-3.5-turbo"
-
-    # check this path for an HTTP 200 OK before serving requests
-    # default: /health to match llama.cpp
-    # use "none" to skip endpoint checking, but may cause HTTP errors
-    # until the model is ready
-    checkEndpoint: /custom-endpoint
-
-    # automatically unload the model after this many seconds
-    # ttl values must be a value greater than 0
-    # default: 0 = never unload model
-    ttl: 60
-
-    # `useModelName` overrides the model name in the request
-    # and sends a specific name to the upstream server
-    useModelName: "qwen:qwq"
-
-  # unlisted models do not show up in /v1/models or /upstream lists
-  # but they can still be requested as normal
-  "qwen-unlisted":
-    unlisted: true
-    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
-
-  # Docker Support (v26.1.4+ required!)
-  "docker-llama":
-    proxy: "http://127.0.0.1:${PORT}"
-    cmd: |
-      docker run --name dockertest
-      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
-      ghcr.io/ggml-org/llama.cpp:server
-      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
-
-    # use a custom command to stop the model when swapping. By default
-    # this is SIGTERM on POSIX systems, and taskkill on Windows systems
-    # the ${PID} variable can be used in cmdStop, it will be automatically replaced
-    # with the PID of the running model
-    cmdStop: docker stop dockertest
-
-# Groups provide advanced controls over model swapping behaviour. Using groups
-# some models can be kept loaded indefinitely, while others are swapped out.
-#
-# Tips:
-#
-# - models must be defined above in the Models section
-# - a model can only be a member of one group
-# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
-# - see issue #109 for details
-#
-# NOTE: the example below uses model names that are not defined above for demonstration purposes
-groups:
-  # group1 is the default behaviour of llama-swap where only one model is allowed
-  # to run a time across the whole llama-swap instance
-  "group1":
-    # swap controls the model swapping behaviour in within the group
-    # - true : only one model is allowed to run at a time
-    # - false: all models can run together, no swapping
-    swap: true
-
-    # exclusive controls how the group affects other groups
-    # - true: causes all other groups to unload their models when this group runs a model
-    # - false: does not affect other groups
-    exclusive: true
-
-    # members references the models defined above
-    members:
-      - "llama"
-      - "qwen-unlisted"
-
-  # models in this group are never unloaded
-  "group2":
-    swap: false
-    exclusive: false
-    members:
-      - "docker-llama"
-      # (not defined above, here for example)
-      - "modelA"
-      - "modelB"
-
-  "forever":
-    # setting persistent to true causes the group to never be affected by the swapping behaviour of
-    # other groups. It is a shortcut to keeping some models always loaded.
-    persistent: true
-
-    # set swap/exclusive to false to prevent swapping inside the group and effect on other groups
-    swap: false
-    exclusive: false
-    members:
-      - "forever-modelA"
-      - "forever-modelB"
-      - "forever-modelc"
-```
-
-### Use Case Examples
-
-- [config.example.yaml](config.example.yaml) includes example for supporting `v1/embeddings` and `v1/rerank` endpoints
-- [Speculative Decoding](examples/speculative-decoding/README.md) - using a small draft model can increase inference speeds from 20% to 40%. This example includes a configurations Qwen2.5-Coder-32B (2.5x increase) and Llama-3.1-70B (1.4x increase) in the best cases.
-- [Optimizing Code Generation](examples/benchmark-snakegame/README.md) - find the optimal settings for your machine. This example demonstrates defining multiple configurations and testing which one is fastest.
-- [Restart on Config Change](examples/restart-on-config-change/README.md) - automatically restart llama-swap when trying out different configurations.
-
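+
+For example, macros and the automatic `${PORT}` variable keep model commands short and consistent. A minimal sketch (`server-args` is an illustrative macro name):
+
+```yaml
+macros:
+  server-args: "--port ${PORT} -ngl 99"
+
+models:
+  "qwen2.5":
+    cmd: |
+      /app/llama-server ${server-args}
+      -hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
+    ttl: 60 # unload the model after 60 seconds
+```
+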
+Check the [wiki](https://github.com/mostlygeek/llama-swap/wiki/Configuration) for the full documentation.
 
 ## Docker Install ([download images](https://github.com/mostlygeek/llama-swap/pkgs/container/llama-swap))
 
diff --git a/proxy/config.go b/proxy/config.go
index 6ebdd7d..bef94e2 100644
--- a/proxy/config.go
+++ b/proxy/config.go
@@ -4,6 +4,7 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"regexp"
 	"runtime"
 	"sort"
 	"strconv"
@@ -67,6 +68,9 @@ type Config struct {
 	Profiles map[string][]string    `yaml:"profiles"`
 	Groups   map[string]GroupConfig `yaml:"groups"` /* key is group ID */
 
+	// for key/value replacements in a model's cmd, cmdStop, proxy, checkEndpoint
+	Macros map[string]string `yaml:"macros"`
+
 	// map aliases to actual model IDs
 	aliases map[string]string
 
@@ -141,6 +145,30 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 		}
 	}
 
+	/* check macro constraint rules:
+
+	   - names must match the regex ^[a-zA-Z0-9_-]+$
+	   - names must be less than 64 characters (an arbitrary limit)
+	   - names cannot use the reserved name: PORT
+	   - macro values must be less than 1024 characters
+	*/
+	macroNameRegex := regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
+	for macroName, macroValue := range config.Macros {
+		if len(macroName) >= 64 {
+			return Config{}, fmt.Errorf("macro name '%s' exceeds maximum length of 63 characters", macroName)
+		}
+		if !macroNameRegex.MatchString(macroName) {
+			return Config{}, fmt.Errorf("macro name '%s' contains invalid characters, must match pattern ^[a-zA-Z0-9_-]+$", macroName)
+		}
+		if len(macroValue) >= 1024 {
+			return Config{}, fmt.Errorf("macro value for '%s' exceeds maximum length of 1023 characters", macroName)
+		}
+		switch macroName {
+		case "PORT":
+			return Config{}, fmt.Errorf("macro name '%s' is reserved and cannot be used", macroName)
+		}
+	}
+
 	// Get and sort all model IDs first, makes testing more consistent
 	modelIds := make([]string, 0, len(config.Models))
 	for modelId := range config.Models {
@@ -151,19 +179,51 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
 	nextPort := config.StartPort
 	for _, modelId := range modelIds {
 		modelConfig := config.Models[modelId]
-		// iterate over the models and replace any ${PORT} with the next available port
-		if strings.Contains(modelConfig.Cmd, "${PORT}") {
-			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", strconv.Itoa(nextPort))
+
+		// replace macros in the model config fields: cmd, cmdStop, proxy, checkEndpoint
+		for macroName, macroValue := range config.Macros {
+			macroSlug := fmt.Sprintf("${%s}", macroName)
+			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, macroSlug, macroValue)
+			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroValue)
+			modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroValue)
+			modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroValue)
+		}
+
+		// only assign a port to models that use ${PORT}, to keep port numbers from increasing unnecessarily
+		if strings.Contains(modelConfig.Cmd, "${PORT}") || strings.Contains(modelConfig.Proxy, "${PORT}") || strings.Contains(modelConfig.CmdStop, "${PORT}") {
 			if modelConfig.Proxy == "" {
-				modelConfig.Proxy = fmt.Sprintf("http://localhost:%d", nextPort)
-			} else {
-				modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, "${PORT}", strconv.Itoa(nextPort))
+				modelConfig.Proxy = "http://localhost:${PORT}"
 			}
+
+			nextPortStr := strconv.Itoa(nextPort)
+			modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", nextPortStr)
+			modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, "${PORT}", nextPortStr)
+			modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, "${PORT}", nextPortStr)
 			nextPort++
-			config.Models[modelId] = modelConfig
 		} else if modelConfig.Proxy == "" {
 			return Config{}, fmt.Errorf("model %s requires a proxy value when not using automatic ${PORT}", modelId)
 		}
+
+		// make sure there are no unknown macros that have not been replaced
+		macroPattern := regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
+		fieldMap := map[string]string{
+			"cmd":           modelConfig.Cmd,
+			"cmdStop":       modelConfig.CmdStop,
+			"proxy":         modelConfig.Proxy,
+			"checkEndpoint": modelConfig.CheckEndpoint,
+		}
+
+		for fieldName, fieldValue := range fieldMap {
+			matches := macroPattern.FindAllStringSubmatch(fieldValue, -1)
+			for _, match := range matches {
+				macroName := match[1]
+				if _, exists := config.Macros[macroName]; !exists {
+					return Config{}, fmt.Errorf("unknown macro '${%s}' found in %s.%s", macroName, modelId, fieldName)
+				}
+			}
+		}
+
+		config.Models[modelId] = modelConfig
 	}
 
 	config = AddDefaultGroupToConfig(config)
diff --git a/proxy/config_test.go b/proxy/config_test.go
index 6017f37..5ce93e0 100644
--- a/proxy/config_test.go
+++ b/proxy/config_test.go
@@ -19,6 +19,8 @@ func TestConfig_Load(t *testing.T) {
 	tempFile := filepath.Join(tempDir, "config.yaml")
 	content := `
+macros:
+  svr-path: "path/to/server"
 models:
   model1:
     cmd: path/to/cmd --arg1 one
@@ -31,7 +33,7 @@ models:
       - "VAR2=value2"
     checkEndpoint: "/health"
   model2:
-    cmd: path/to/cmd --arg1 one
+    cmd: ${svr-path} --arg1 one
     proxy: "http://localhost:8081"
     aliases:
       - "m2"
@@ -76,6 +78,9 @@ groups:
 	expected := Config{
 		StartPort: 5800,
+		Macros: map[string]string{
+			"svr-path": "path/to/server",
+		},
 		Models: map[string]ModelConfig{
 			"model1": {
 				Cmd:           "path/to/cmd --arg1 one",
@@ -85,7 +90,7 @@
 				CheckEndpoint: "/health",
 			},
 			"model2": {
-				Cmd:     "path/to/cmd --arg1 one",
+				Cmd:     "path/to/server --arg1 one",
 				Proxy:   "http://localhost:8081",
 				Aliases: []string{"m2"},
 				Env:     nil,
@@ -331,3 +336,106 @@ models:
 		assert.Equal(t, "model model1 requires a proxy value when not using automatic ${PORT}", err.Error())
 	})
 }
+
+func TestConfig_MacroReplacement(t *testing.T) {
+	content := `
+startPort: 9990
+macros:
+  svr-path: "path/to/server"
+  argOne: "--arg1"
+  argTwo: "--arg2"
+  autoPort: "--port ${PORT}"
+
+models:
+  model1:
+    cmd: |
+      ${svr-path} ${argTwo}
+      # the automatic ${PORT} is replaced
+      ${autoPort}
+      ${argOne}
+      --arg3 three
+    cmdStop: |
+      /path/to/stop.sh --port ${PORT} ${argTwo}
+`
+
+	config, err := LoadConfigFromReader(strings.NewReader(content))
+	assert.NoError(t, err)
+	sanitizedCmd, err := SanitizeCommand(config.Models["model1"].Cmd)
+	assert.NoError(t, err)
+	assert.Equal(t, "path/to/server --arg2 --port 9990 --arg1 --arg3 three", strings.Join(sanitizedCmd, " "))
+
+	sanitizedCmdStop, err := SanitizeCommand(config.Models["model1"].CmdStop)
+	assert.NoError(t, err)
+	assert.Equal(t, "/path/to/stop.sh --port 9990 --arg2", strings.Join(sanitizedCmdStop, " "))
+}
+
+func TestConfig_MacroErrorOnUnknownMacros(t *testing.T) {
+	tests := []struct {
+		name    string
+		field   string
+		content string
+	}{
+		{
+			name:  "unknown macro in cmd",
+			field: "cmd",
+			content: `
+startPort: 9990
+macros:
+  svr-path: "path/to/server"
+models:
+  model1:
+    cmd: |
+      ${svr-path} --port ${PORT}
+      ${unknownMacro}
+`,
+		},
+		{
+			name:  "unknown macro in cmdStop",
+			field: "cmdStop",
+			content: `
+startPort: 9990
+macros:
+  svr-path: "path/to/server"
+models:
+  model1:
+    cmd: "${svr-path} --port ${PORT}"
+    cmdStop: "kill ${unknownMacro}"
+`,
+		},
+		{
+			name:  "unknown macro in proxy",
+			field: "proxy",
+			content: `
+startPort: 9990
+macros:
+  svr-path: "path/to/server"
+models:
+  model1:
+    cmd: "${svr-path} --port ${PORT}"
+    proxy: "http://localhost:${unknownMacro}"
+`,
+		},
+		{
+			name:  "unknown macro in checkEndpoint",
+			field: "checkEndpoint",
+			content: `
+startPort: 9990
+macros:
+  svr-path: "path/to/server"
+models:
+  model1:
+    cmd: "${svr-path} --port ${PORT}"
+    checkEndpoint: "http://localhost:${unknownMacro}/health"
+`,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			_, err := LoadConfigFromReader(strings.NewReader(tt.content))
+			assert.Error(t, err)
+			assert.Contains(t, err.Error(), "unknown macro '${unknownMacro}' found in model1."+tt.field)
+			//t.Log(err)
+		})
+	}
+}