# llama-swap YAML configuration example
# -------------------------------------
#
# 💡 Tip - Use an LLM with this file!
# ====================================
# This example configuration is written to be LLM friendly! Try
# copying this file into an LLM and asking it to explain or generate
# sections for you.
# ====================================
#
# - Below are all the available configuration options for llama-swap.
# - Settings with a default value, or noted as optional can be omitted.
# - Settings that are marked required must be in your configuration file

# healthCheckTimeout: number of seconds to wait for a model to be ready to serve requests
# - optional, default: 120
# - minimum value is 15 seconds, anything less will be set to this value
healthCheckTimeout: 500

# logLevel: sets the logging level
# - optional, default: info
# - Valid log levels: debug, info, warn, error
logLevel: info

# metricsMaxInMemory: maximum number of metrics to keep in memory
# - optional, default: 1000
# - controls how many metrics are stored in memory before older ones are discarded
# - useful for limiting memory usage when processing large volumes of metrics
metricsMaxInMemory: 1000

# startPort: sets the starting port number for the automatic ${PORT} macro.
# - optional, default: 5800
# - the ${PORT} macro can be used in model.cmd and model.proxy settings
# - it is automatically incremented for every model that uses it
startPort: 10001

# macros: sets a dictionary of string:string pairs
# - optional, default: empty dictionary
# - these are reusable snippets
# - used in a model's cmd, cmdStop, proxy and checkEndpoint
# - useful for reducing common configuration settings
macros:
  "latest-llama": >
    /path/to/llama-server/llama-server-ec9e0301
    --port ${PORT}

# models: a dictionary of model configurations
# - required
# - each key is the model's ID, used in API requests
# - model settings have default values that are used if they are not defined here
# - below are examples of the various settings a model can have:
# - available model settings: env, cmd, cmdStop, proxy, aliases, checkEndpoint, ttl, unlisted
models:

  # keys are the model names used in API requests
  "llama":
    # cmd: the command to run to start the inference server.
    # - required
    # - it is just a string, similar to what you would run on the CLI
    # - using `|` allows for comments in the command, these will be parsed out
    # - macros can be used within cmd
    cmd: |
      # ${latest-llama} is a macro that is defined above
      ${latest-llama}
      --model path/to/llama-8B-Q4_K_M.gguf

    # name: a display name for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
    name: "llama 3.1 8B"

    # description: a description for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
    description: "A small but capable model used for quick testing"

    # env: define an array of environment variables to inject into cmd's environment
    # - optional, default: empty array
    # - each value is a single string
    # - in the format: ENV_NAME=value
    env:
      - "CUDA_VISIBLE_DEVICES=0,1,2"

    # proxy: the URL where llama-swap routes API requests
    # - optional, default: http://localhost:${PORT}
    # - if you used ${PORT} in cmd this can be omitted
    # - if you use a custom port in cmd this *must* be set
    proxy: http://127.0.0.1:8999

    # aliases: alternative model names that this model configuration is used for
    # - optional, default: empty array
    # - aliases must be unique globally
    # - useful for impersonating a specific model
    aliases:
      - "gpt-4o-mini"
      - "gpt-3.5-turbo"

    # checkEndpoint: URL path to check if the server is ready
    # - optional, default: /health
    # - use "none" to skip endpoint ready checking
    # - endpoint is expected to return an HTTP 200 response
    # - all requests wait until the endpoint is ready (or fails)
    checkEndpoint: /custom-endpoint

    # ttl: automatically unload the model after this many seconds
    # - optional, default: 0
    # - set a value greater than 0 to enable automatic unloading
    # - a value of 0 disables automatic unloading of the model
    ttl: 60

    # useModelName: overrides the model name that is sent to upstream server
    # - optional, default: ""
    # - useful when the upstream server expects a specific model name or format
    useModelName: "qwen:qwq"

    # filters: a dictionary of filter settings
    # - optional, default: empty dictionary
    filters:
      # strip_params: a comma separated list of parameters to remove from the request
      # - optional, default: ""
      # - useful for preventing overriding of default server params by requests
      # - `model` parameter is never removed
      # - can be any JSON key in the request body
      # - recommended to stick to sampling parameters
      strip_params: "temperature, top_p, top_k"

  # Unlisted model example:
  "qwen-unlisted":
    # unlisted: true or false
    # - optional, default: false
    # - unlisted models do not show up in /v1/models or /upstream lists
    # - can be requested as normal through all apis
    unlisted: true
    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0

  # Docker example:
  # container runtimes like Docker and Podman can also be used with
  # a combination of cmd and cmdStop.
  "docker-llama":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --name dockertest
      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggml-org/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

    # cmdStop: command to run to stop the model gracefully
    # - optional, default: ""
    # - useful for stopping commands managed by another system
    # - on POSIX systems: a SIGTERM is sent for graceful shutdown
    # - on Windows, taskkill is used
    # - processes are given 5 seconds to shut down before they are forcefully killed
    # - the upstream's process id is available in the ${PID} macro
    cmdStop: docker stop dockertest

# groups: a dictionary of group settings
# - optional, default: empty dictionary
# - provide advanced controls over model swapping behaviour.
# - Using groups some models can be kept loaded indefinitely, while others are swapped out.
# - model ids must be defined in the models section
# - a model can only be a member of one group
# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
# - see issue #109 for details
#
# NOTE: the example below uses model names that are not defined above for demonstration purposes
groups:
  # group1 is the same as the default behaviour of llama-swap where only one model is allowed
  # to run at a time across the whole llama-swap instance
  "group1":
    # swap: controls the model swapping behaviour within the group
    # - optional, default: true
    # - true : only one model is allowed to run at a time
    # - false: all models can run together, no swapping
    swap: true

    # exclusive: controls how the group affects other groups
    # - optional, default: true
    # - true: causes all other groups to unload when this group runs a model
    # - false: does not affect other groups
    exclusive: true

    # members references the models defined above
    # required
    members:
      - "llama"
      - "qwen-unlisted"

  # Example:
  # - in this group all the models can run at the same time
  # - when a different group loads, all running models in this group are unloaded
  "group2":
    swap: false
    exclusive: false
    members:
      - "docker-llama"
      - "modelA"
      - "modelB"

  # Example:
  # - a persistent group, prevents other groups from unloading it
  "forever":
    # persistent: prevents other groups from unloading the models in this group
    # - optional, default: false
    # - does not affect individual model behaviour
    persistent: true

    # set swap/exclusive to false to prevent swapping inside the group
    # and the unloading of other groups
    swap: false
    exclusive: false
    members:
      - "forever-modelA"
      - "forever-modelB"
      - "forever-modelc"

# hooks: a dictionary of event triggers and actions
# - optional, default: empty dictionary
# - the only supported hook is on_startup
hooks:
  # on_startup: a dictionary of actions to perform on startup
  # - optional, default: empty dictionary
  # - the only supported action is preload
  on_startup:
    # preload: a list of model ids to load on startup
    # - optional, default: empty list
    # - model names must match keys in the models section
    # - when preloading multiple models at once, define a group
    #   otherwise models will be loaded and swapped out
    preload:
      - "llama"