Add macros to Configuration schema (#149)
* Add macros to Configuration schema * update docs
This commit is contained in:
153
README.md
153
README.md
@@ -45,158 +45,31 @@ llama-swap's configuration is purposefully simple.
|
||||
```yaml
|
||||
models:
|
||||
"qwen2.5":
|
||||
proxy: "http://127.0.0.1:9999"
|
||||
cmd: |
|
||||
/app/llama-server
|
||||
-hf bartowski/Qwen2.5-0.5B-Instruct-GGUF:Q4_K_M
|
||||
--port 9999
|
||||
--port ${PORT}
|
||||
|
||||
"smollm2":
|
||||
proxy: "http://127.0.0.1:9999"
|
||||
cmd: |
|
||||
/app/llama-server
|
||||
-hf bartowski/SmolLM2-135M-Instruct-GGUF:Q4_K_M
|
||||
--port 9999
|
||||
--port ${PORT}
|
||||
```
|
||||
|
||||
<details>
|
||||
<summary>But also very powerful ...</summary>
|
||||
But also very powerful:
|
||||
|
||||
```yaml
|
||||
# Seconds to wait for upstream to load and be ready to serve requests
|
||||
# minimum is 15 seconds
|
||||
# default is 120 seconds
|
||||
healthCheckTimeout: 500
|
||||
- ⚡ `groups` to run multiple models at once
|
||||
- ⚡ `macros` for reusable snippets
|
||||
- ⚡ `ttl` to automatically unload models
|
||||
- ⚡ `aliases` to use familiar model names (e.g., "gpt-4o-mini")
|
||||
- ⚡ `env` variables to pass custom environment to inference servers
|
||||
- ⚡ `useModelName` to override model names sent to upstream servers
|
||||
- ⚡ `healthCheckTimeout` to control model startup wait times
|
||||
- ⚡ `${PORT}` automatic port variables for dynamic port assignment
|
||||
- ⚡ Docker/podman compatible
|
||||
|
||||
# Valid log levels: debug, info (default), warn, error
|
||||
logLevel: info
|
||||
|
||||
# Automatic Port Values
|
||||
# use ${PORT} in model.cmd and model.proxy to use an automatic port number
|
||||
# when you use ${PORT} you can omit a custom model.proxy value, as it will
|
||||
# default to http://localhost:${PORT}
|
||||
|
||||
# override the default port (5800) for automatic port values
|
||||
startPort: 10001
|
||||
|
||||
# define valid model values and the upstream server start
|
||||
models:
|
||||
"llama":
|
||||
# multiline for readability
|
||||
cmd: |
|
||||
llama-server --port 8999
|
||||
--model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
|
||||
|
||||
# environment variables to pass to the command
|
||||
env:
|
||||
- "CUDA_VISIBLE_DEVICES=0"
|
||||
|
||||
# where to reach the server started by cmd, make sure the ports match
|
||||
# can be omitted if you use an automatic ${PORT} in cmd
|
||||
proxy: http://127.0.0.1:8999
|
||||
|
||||
# aliases names to use this model for
|
||||
aliases:
|
||||
- "gpt-4o-mini"
|
||||
- "gpt-3.5-turbo"
|
||||
|
||||
# check this path for an HTTP 200 OK before serving requests
|
||||
# default: /health to match llama.cpp
|
||||
# use "none" to skip endpoint checking, but may cause HTTP errors
|
||||
# until the model is ready
|
||||
checkEndpoint: /custom-endpoint
|
||||
|
||||
# automatically unload the model after this many seconds
|
||||
# ttl values must be a value greater than 0
|
||||
# default: 0 = never unload model
|
||||
ttl: 60
|
||||
|
||||
# `useModelName` overrides the model name in the request
|
||||
# and sends a specific name to the upstream server
|
||||
useModelName: "qwen:qwq"
|
||||
|
||||
# unlisted models do not show up in /v1/models or /upstream lists
|
||||
# but they can still be requested as normal
|
||||
"qwen-unlisted":
|
||||
unlisted: true
|
||||
cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
|
||||
|
||||
# Docker Support (v26.1.4+ required!)
|
||||
"docker-llama":
|
||||
proxy: "http://127.0.0.1:${PORT}"
|
||||
cmd: |
|
||||
docker run --name dockertest
|
||||
--init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
|
||||
ghcr.io/ggml-org/llama.cpp:server
|
||||
--model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
|
||||
|
||||
# use a custom command to stop the model when swapping. By default
|
||||
# this is SIGTERM on POSIX systems, and taskkill on Windows systems
|
||||
# the ${PID} variable can be used in cmdStop, it will be automatically replaced
|
||||
# with the PID of the running model
|
||||
cmdStop: docker stop dockertest
|
||||
|
||||
# Groups provide advanced controls over model swapping behaviour. Using groups
|
||||
# some models can be kept loaded indefinitely, while others are swapped out.
|
||||
#
|
||||
# Tips:
|
||||
#
|
||||
# - models must be defined above in the Models section
|
||||
# - a model can only be a member of one group
|
||||
# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
|
||||
# - see issue #109 for details
|
||||
#
|
||||
# NOTE: the example below uses model names that are not defined above for demonstration purposes
|
||||
groups:
|
||||
# group1 is the default behaviour of llama-swap where only one model is allowed
|
||||
# to run a time across the whole llama-swap instance
|
||||
"group1":
|
||||
# swap controls the model swapping behaviour in within the group
|
||||
# - true : only one model is allowed to run at a time
|
||||
# - false: all models can run together, no swapping
|
||||
swap: true
|
||||
|
||||
# exclusive controls how the group affects other groups
|
||||
# - true: causes all other groups to unload their models when this group runs a model
|
||||
# - false: does not affect other groups
|
||||
exclusive: true
|
||||
|
||||
# members references the models defined above
|
||||
members:
|
||||
- "llama"
|
||||
- "qwen-unlisted"
|
||||
|
||||
# models in this group are never unloaded
|
||||
"group2":
|
||||
swap: false
|
||||
exclusive: false
|
||||
members:
|
||||
- "docker-llama"
|
||||
# (not defined above, here for example)
|
||||
- "modelA"
|
||||
- "modelB"
|
||||
|
||||
"forever":
|
||||
# setting persistent to true causes the group to never be affected by the swapping behaviour of
|
||||
# other groups. It is a shortcut to keeping some models always loaded.
|
||||
persistent: true
|
||||
|
||||
# set swap/exclusive to false to prevent swapping inside the group and effect on other groups
|
||||
swap: false
|
||||
exclusive: false
|
||||
members:
|
||||
- "forever-modelA"
|
||||
- "forever-modelB"
|
||||
- "forever-modelc"
|
||||
```
|
||||
|
||||
### Use Case Examples
|
||||
|
||||
- [config.example.yaml](config.example.yaml) includes example for supporting `v1/embeddings` and `v1/rerank` endpoints
|
||||
- [Speculative Decoding](examples/speculative-decoding/README.md) - using a small draft model can increase inference speeds from 20% to 40%. This example includes a configurations Qwen2.5-Coder-32B (2.5x increase) and Llama-3.1-70B (1.4x increase) in the best cases.
|
||||
- [Optimizing Code Generation](examples/benchmark-snakegame/README.md) - find the optimal settings for your machine. This example demonstrates defining multiple configurations and testing which one is fastest.
|
||||
- [Restart on Config Change](examples/restart-on-config-change/README.md) - automatically restart llama-swap when trying out different configurations.
|
||||
</details>
|
||||
Check the [wiki](https://github.com/mostlygeek/llama-swap/wiki/Configuration) full documentation.
|
||||
|
||||
## Docker Install ([download images](https://github.com/mostlygeek/llama-swap/pkgs/container/llama-swap))
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"sort"
|
||||
"strconv"
|
||||
@@ -67,6 +68,9 @@ type Config struct {
|
||||
Profiles map[string][]string `yaml:"profiles"`
|
||||
Groups map[string]GroupConfig `yaml:"groups"` /* key is group ID */
|
||||
|
||||
// for key/value replacements in model's cmd, cmdStop, proxy, checkEndPoint
|
||||
Macros map[string]string `yaml:"macros"`
|
||||
|
||||
// map aliases to actual model IDs
|
||||
aliases map[string]string
|
||||
|
||||
@@ -141,6 +145,30 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
||||
}
|
||||
}
|
||||
|
||||
/* check macro constraint rules:
|
||||
|
||||
- name must fit the regex ^[a-zA-Z0-9_-]+$
|
||||
- names must be less than 64 characters (no reason, just cause)
|
||||
- name can not be any reserved macros: PORT
|
||||
- macro values must be less than 1024 characters
|
||||
*/
|
||||
macroNameRegex := regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
|
||||
for macroName, macroValue := range config.Macros {
|
||||
if len(macroName) >= 64 {
|
||||
return Config{}, fmt.Errorf("macro name '%s' exceeds maximum length of 63 characters", macroName)
|
||||
}
|
||||
if !macroNameRegex.MatchString(macroName) {
|
||||
return Config{}, fmt.Errorf("macro name '%s' contains invalid characters, must match pattern ^[a-zA-Z0-9_-]+$", macroName)
|
||||
}
|
||||
if len(macroValue) >= 1024 {
|
||||
return Config{}, fmt.Errorf("macro value for '%s' exceeds maximum length of 1024 characters", macroName)
|
||||
}
|
||||
switch macroName {
|
||||
case "PORT":
|
||||
return Config{}, fmt.Errorf("macro name '%s' is reserved and cannot be used", macroName)
|
||||
}
|
||||
}
|
||||
|
||||
// Get and sort all model IDs first, makes testing more consistent
|
||||
modelIds := make([]string, 0, len(config.Models))
|
||||
for modelId := range config.Models {
|
||||
@@ -151,19 +179,51 @@ func LoadConfigFromReader(r io.Reader) (Config, error) {
|
||||
nextPort := config.StartPort
|
||||
for _, modelId := range modelIds {
|
||||
modelConfig := config.Models[modelId]
|
||||
// iterate over the models and replace any ${PORT} with the next available port
|
||||
if strings.Contains(modelConfig.Cmd, "${PORT}") {
|
||||
modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", strconv.Itoa(nextPort))
|
||||
if modelConfig.Proxy == "" {
|
||||
modelConfig.Proxy = fmt.Sprintf("http://localhost:%d", nextPort)
|
||||
} else {
|
||||
modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, "${PORT}", strconv.Itoa(nextPort))
|
||||
|
||||
// go through model config fields: cmd, cmdStop, proxy, checkEndPoint and replace macros with macro values
|
||||
for macroName, macroValue := range config.Macros {
|
||||
macroSlug := fmt.Sprintf("${%s}", macroName)
|
||||
modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, macroSlug, macroValue)
|
||||
modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, macroSlug, macroValue)
|
||||
modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, macroSlug, macroValue)
|
||||
modelConfig.CheckEndpoint = strings.ReplaceAll(modelConfig.CheckEndpoint, macroSlug, macroValue)
|
||||
}
|
||||
|
||||
// only iterate over models that use ${PORT} to keep port numbers from increasing unnecessarily
|
||||
if strings.Contains(modelConfig.Cmd, "${PORT}") || strings.Contains(modelConfig.Proxy, "${PORT}") || strings.Contains(modelConfig.CmdStop, "${PORT}") {
|
||||
if modelConfig.Proxy == "" {
|
||||
modelConfig.Proxy = "http://localhost:${PORT}"
|
||||
}
|
||||
|
||||
nextPortStr := strconv.Itoa(nextPort)
|
||||
modelConfig.Cmd = strings.ReplaceAll(modelConfig.Cmd, "${PORT}", nextPortStr)
|
||||
modelConfig.CmdStop = strings.ReplaceAll(modelConfig.CmdStop, "${PORT}", nextPortStr)
|
||||
modelConfig.Proxy = strings.ReplaceAll(modelConfig.Proxy, "${PORT}", nextPortStr)
|
||||
nextPort++
|
||||
config.Models[modelId] = modelConfig
|
||||
} else if modelConfig.Proxy == "" {
|
||||
return Config{}, fmt.Errorf("model %s requires a proxy value when not using automatic ${PORT}", modelId)
|
||||
}
|
||||
|
||||
// make sure there are no unknown macros that have not been replaced
|
||||
macroPattern := regexp.MustCompile(`\$\{([a-zA-Z0-9_-]+)\}`)
|
||||
fieldMap := map[string]string{
|
||||
"cmd": modelConfig.Cmd,
|
||||
"cmdStop": modelConfig.CmdStop,
|
||||
"proxy": modelConfig.Proxy,
|
||||
"checkEndpoint": modelConfig.CheckEndpoint,
|
||||
}
|
||||
|
||||
for fieldName, fieldValue := range fieldMap {
|
||||
matches := macroPattern.FindAllStringSubmatch(fieldValue, -1)
|
||||
for _, match := range matches {
|
||||
macroName := match[1]
|
||||
if _, exists := config.Macros[macroName]; !exists {
|
||||
return Config{}, fmt.Errorf("unknown macro '${%s}' found in %s.%s", macroName, modelId, fieldName)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
config.Models[modelId] = modelConfig
|
||||
}
|
||||
|
||||
config = AddDefaultGroupToConfig(config)
|
||||
|
||||
@@ -19,6 +19,8 @@ func TestConfig_Load(t *testing.T) {
|
||||
|
||||
tempFile := filepath.Join(tempDir, "config.yaml")
|
||||
content := `
|
||||
macros:
|
||||
svr-path: "path/to/server"
|
||||
models:
|
||||
model1:
|
||||
cmd: path/to/cmd --arg1 one
|
||||
@@ -31,7 +33,7 @@ models:
|
||||
- "VAR2=value2"
|
||||
checkEndpoint: "/health"
|
||||
model2:
|
||||
cmd: path/to/cmd --arg1 one
|
||||
cmd: ${svr-path} --arg1 one
|
||||
proxy: "http://localhost:8081"
|
||||
aliases:
|
||||
- "m2"
|
||||
@@ -76,6 +78,9 @@ groups:
|
||||
|
||||
expected := Config{
|
||||
StartPort: 5800,
|
||||
Macros: map[string]string{
|
||||
"svr-path": "path/to/server",
|
||||
},
|
||||
Models: map[string]ModelConfig{
|
||||
"model1": {
|
||||
Cmd: "path/to/cmd --arg1 one",
|
||||
@@ -85,7 +90,7 @@ groups:
|
||||
CheckEndpoint: "/health",
|
||||
},
|
||||
"model2": {
|
||||
Cmd: "path/to/cmd --arg1 one",
|
||||
Cmd: "path/to/server --arg1 one",
|
||||
Proxy: "http://localhost:8081",
|
||||
Aliases: []string{"m2"},
|
||||
Env: nil,
|
||||
@@ -331,3 +336,106 @@ models:
|
||||
assert.Equal(t, "model model1 requires a proxy value when not using automatic ${PORT}", err.Error())
|
||||
})
|
||||
}
|
||||
|
||||
func TestConfig_MacroReplacement(t *testing.T) {
|
||||
content := `
|
||||
startPort: 9990
|
||||
macros:
|
||||
svr-path: "path/to/server"
|
||||
argOne: "--arg1"
|
||||
argTwo: "--arg2"
|
||||
autoPort: "--port ${PORT}"
|
||||
|
||||
models:
|
||||
model1:
|
||||
cmd: |
|
||||
${svr-path} ${argTwo}
|
||||
# the automatic ${PORT} is replaced
|
||||
${autoPort}
|
||||
${argOne}
|
||||
--arg3 three
|
||||
cmdStop: |
|
||||
/path/to/stop.sh --port ${PORT} ${argTwo}
|
||||
`
|
||||
|
||||
config, err := LoadConfigFromReader(strings.NewReader(content))
|
||||
assert.NoError(t, err)
|
||||
sanitizedCmd, err := SanitizeCommand(config.Models["model1"].Cmd)
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, "path/to/server --arg2 --port 9990 --arg1 --arg3 three", strings.Join(sanitizedCmd, " "))
|
||||
|
||||
sanitizedCmdStop, err := SanitizeCommand(config.Models["model1"].CmdStop)
|
||||
assert.NoError(t, err)
|
||||
assert.Equal(t, "/path/to/stop.sh --port 9990 --arg2", strings.Join(sanitizedCmdStop, " "))
|
||||
}
|
||||
|
||||
func TestConfig_MacroErrorOnUnknownMacros(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
field string
|
||||
content string
|
||||
}{
|
||||
{
|
||||
name: "unknown macro in cmd",
|
||||
field: "cmd",
|
||||
content: `
|
||||
startPort: 9990
|
||||
macros:
|
||||
svr-path: "path/to/server"
|
||||
models:
|
||||
model1:
|
||||
cmd: |
|
||||
${svr-path} --port ${PORT}
|
||||
${unknownMacro}
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "unknown macro in cmdStop",
|
||||
field: "cmdStop",
|
||||
content: `
|
||||
startPort: 9990
|
||||
macros:
|
||||
svr-path: "path/to/server"
|
||||
models:
|
||||
model1:
|
||||
cmd: "${svr-path} --port ${PORT}"
|
||||
cmdStop: "kill ${unknownMacro}"
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "unknown macro in proxy",
|
||||
field: "proxy",
|
||||
content: `
|
||||
startPort: 9990
|
||||
macros:
|
||||
svr-path: "path/to/server"
|
||||
models:
|
||||
model1:
|
||||
cmd: "${svr-path} --port ${PORT}"
|
||||
proxy: "http://localhost:${unknownMacro}"
|
||||
`,
|
||||
},
|
||||
{
|
||||
name: "unknown macro in checkEndpoint",
|
||||
field: "checkEndpoint",
|
||||
content: `
|
||||
startPort: 9990
|
||||
macros:
|
||||
svr-path: "path/to/server"
|
||||
models:
|
||||
model1:
|
||||
cmd: "${svr-path} --port ${PORT}"
|
||||
checkEndpoint: "http://localhost:${unknownMacro}/health"
|
||||
`,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
_, err := LoadConfigFromReader(strings.NewReader(tt.content))
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "unknown macro '${unknownMacro}' found in model1."+tt.field)
|
||||
//t.Log(err)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user