# llama-swap YAML configuration example
# -------------------------------------
#
# 💡 Tip - Use an LLM with this file!
# ====================================
# This example configuration is written to be LLM friendly. Try
# copying this file into an LLM and asking it to explain or generate
# sections for you.
# ====================================

# Usage notes:
# - Below are all the available configuration options for llama-swap.
# - Settings noted as "required" must be in your configuration file
# - Settings noted as "optional" can be omitted

# healthCheckTimeout: number of seconds to wait for a model to be ready to serve requests
# - optional, default: 120
# - minimum value is 15 seconds, anything less will be set to this value
healthCheckTimeout: 500

# logLevel: sets the logging value
# - optional, default: info
# - Valid log levels: debug, info, warn, error
logLevel: info

# metricsMaxInMemory: maximum number of metrics to keep in memory
# - optional, default: 1000
# - controls how many metrics are stored in memory before older ones are discarded
# - useful for limiting memory usage when processing large volumes of metrics
metricsMaxInMemory: 1000

# startPort: sets the starting port number for the automatic ${PORT} macro.
# - optional, default: 5800
# - the ${PORT} macro can be used in model.cmd and model.proxy settings
# - it is automatically incremented for every model that uses it
startPort: 10001

# macros: a dictionary of string substitutions
# - optional, default: empty dictionary
# - macros are reusable snippets
# - used in a model's cmd, cmdStop, proxy and checkEndpoint
# - useful for reducing common configuration settings
macros:
  "latest-llama": >
    /path/to/llama-server/llama-server-ec9e0301
    --port ${PORT}

# models: a dictionary of model configurations
# - required
# - each key is the model's ID, used in API requests
# - model settings have default values that are used if they are not defined here
# - below are examples of the various settings a model can have:
# - available model settings: env, cmd, cmdStop, proxy, aliases, checkEndpoint, ttl, unlisted
models:
  # keys are the model names used in API requests
  "llama":
    # cmd: the command to run to start the inference server.
    # - required
    # - it is just a string, similar to what you would run on the CLI
    # - using `|` allows for comments in the command, these will be parsed out
    # - macros can be used within cmd
    cmd: |
      # ${latest-llama} is a macro that is defined above
      ${latest-llama}
      --model path/to/llama-8B-Q4_K_M.gguf

    # name: a display name for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
    name: "llama 3.1 8B"

    # description: a description for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
    description: "A small but capable model used for quick testing"

    # env: define an array of environment variables to inject into cmd's environment
    # - optional, default: empty array
    # - each value is a single string
    # - in the format: ENV_NAME=value
    env:
      - "CUDA_VISIBLE_DEVICES=0,1,2"

    # proxy: the URL where llama-swap routes API requests
    # - optional, default: http://localhost:${PORT}
    # - if you used ${PORT} in cmd this can be omitted
    # - if you use a custom port in cmd this *must* be set
    # - the value below uses ${PORT} so that it matches the port passed to the
    #   server by the latest-llama macro in cmd
    proxy: "http://127.0.0.1:${PORT}"

    # aliases: alternative model names that this model configuration is used for
    # - optional, default: empty array
    # - aliases must be unique globally
    # - useful for impersonating a specific model
    aliases:
      - "gpt-4o-mini"
      - "gpt-3.5-turbo"

    # checkEndpoint: URL path to check if the server is ready
    # - optional, default: /health
    # - endpoint is expected to return an HTTP 200 response
    # - all requests wait until the endpoint is ready or fails
    # - use "none" to skip endpoint health checking
    checkEndpoint: /custom-endpoint

    # ttl: automatically unload the model after ttl seconds
    # - optional, default: 0
    # - ttl values must be a value greater than 0
    # - a value of 0 disables automatic unloading of the model
    ttl: 60

    # useModelName: override the model name that is sent to upstream server
    # - optional, default: ""
    # - useful for when the upstream server expects a specific model name that
    #   is different from the model's ID
    useModelName: "qwen:qwq"

    # filters: a dictionary of filter settings
    # - optional, default: empty dictionary
    # - only strip_params is currently supported
    filters:
      # strip_params: a comma separated list of parameters to remove from the request
      # - optional, default: ""
      # - useful for server side enforcement of sampling parameters
      # - the `model` parameter can never be removed
      # - can be any JSON key in the request body
      # - recommended to stick to sampling parameters
      strip_params: "temperature, top_p, top_k"

    # concurrencyLimit: overrides the allowed number of active parallel requests to a model
    # - optional, default: 0
    # - useful for limiting the number of active parallel requests a model can process
    # - must be set per model
    # - any number greater than 0 will override the internal default value of 10
    # - any request that exceeds the limit will receive an HTTP 429 Too Many Requests response
    # - recommended to be omitted and the default used
    concurrencyLimit: 0

  # Unlisted model example:
  "qwen-unlisted":
    # unlisted: boolean, true or false
    # - optional, default: false
    # - unlisted models do not show up in /v1/models api requests
    # - can be requested as normal through all apis
    unlisted: true
    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0

  # Docker example:
  # container runtimes like Docker and Podman can be used reliably with
  # a combination of cmd and cmdStop.
  "docker-llama":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --name dockertest
      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggml-org/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

    # cmdStop: command to run to stop the model gracefully
    # - optional, default: ""
    # - useful for stopping commands managed by another system
    # - the upstream's process id is available in the ${PID} macro
    #
    # When empty, llama-swap has this default behaviour:
    # - on POSIX systems: a SIGTERM signal is sent
    # - on Windows, calls taskkill to stop the process
    # - processes have 5 seconds to shutdown until forceful termination is attempted
    cmdStop: docker stop dockertest

# groups: a dictionary of group settings
# - optional, default: empty dictionary
# - provides advanced controls over model swapping behaviour
# - using groups some models can be kept loaded indefinitely, while others are swapped out
# - model IDs must be defined in the Models section
# - a model can only be a member of one group
# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
# - see issue #109 for details
#
# NOTE: the example below uses model names that are not defined above for demonstration purposes
groups:
  # group1 works the same as the default behaviour of llama-swap where only one model is allowed
  # to run at a time across the whole llama-swap instance
  "group1":
    # swap: controls the model swapping behaviour within the group
    # - optional, default: true
    # - true : only one model is allowed to run at a time
    # - false: all models can run together, no swapping
    swap: true

    # exclusive: controls how the group affects other groups
    # - optional, default: true
    # - true: causes all other groups to unload when this group runs a model
    # - false: does not affect other groups
    exclusive: true

    # members: references the models defined above
    # - required
    members:
      - "llama"
      - "qwen-unlisted"

  # Example:
  # - in group2 all models can run at the same time
  # - when a different group is loaded it causes all running models in this group to unload
  "group2":
    swap: false

    # exclusive: false does not unload other groups when a model in group2 is requested
    # - the models in group2 will be loaded but will not unload any other groups
    exclusive: false
    members:
      - "docker-llama"
      - "modelA"
      - "modelB"

  # Example:
  # - a persistent group, prevents other groups from unloading it
  "forever":
    # persistent: prevents other groups from unloading the models in this group
    # - optional, default: false
    # - does not affect individual model behaviour
    persistent: true

    # set swap/exclusive to false to prevent swapping inside the group
    # and the unloading of other groups
    swap: false
    exclusive: false
    members:
      - "forever-modelA"
      - "forever-modelB"
      - "forever-modelc"

# hooks: a dictionary of event triggers and actions
# - optional, default: empty dictionary
# - the only supported hook is on_startup
hooks:
  # on_startup: a dictionary of actions to perform on startup
  # - optional, default: empty dictionary
  # - the only supported action is preload
  on_startup:
    # preload: a list of model ids to load on startup
    # - optional, default: empty list
    # - model names must match keys in the models section
    # - when preloading multiple models at once, define a group
    #   otherwise models will be loaded and swapped out
    preload:
      - "llama"