# llama-swap YAML configuration example
# -------------------------------------
#
# 💡 Tip - Use an LLM with this file!
# ====================================
# This example configuration is written to be LLM friendly. Try
# copying this file into an LLM and asking it to explain or generate
# sections for you.
# ====================================

# Usage notes:
# - Below are all the available configuration options for llama-swap.
# - Settings noted as "required" must be in your configuration file
# - Settings noted as "optional" can be omitted

# healthCheckTimeout: number of seconds to wait for a model to be ready to serve requests
# - optional, default: 120
# - minimum value is 15 seconds, anything less will be set to this value
healthCheckTimeout: 500

# logLevel: sets the logging value
# - optional, default: info
# - Valid log levels: debug, info, warn, error
logLevel: info

# metricsMaxInMemory: maximum number of metrics to keep in memory
# - optional, default: 1000
# - controls how many metrics are stored in memory before older ones are discarded
# - useful for limiting memory usage when processing large volumes of metrics
metricsMaxInMemory: 1000

# startPort: sets the starting port number for the automatic ${PORT} macro.
# - optional, default: 5800
# - the ${PORT} macro can be used in model.cmd and model.proxy settings
# - it is automatically incremented for every model that uses it
startPort: 10001
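# For illustration: with startPort: 10001, the first model whose cmd or proxy
# uses ${PORT} is assigned port 10001, the next one 10002, and so on.
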
# macros: a dictionary of string substitutions
# - optional, default: empty dictionary
# - macros are reusable snippets
# - used in a model's cmd, cmdStop, proxy and checkEndpoint
# - useful for reducing common configuration settings
macros:
  "latest-llama": >
    /path/to/llama-server/llama-server-ec9e0301
    --port ${PORT}

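  # A second, hypothetical macro for illustration: the automatic ${MODEL_ID}
  # macro (the model's ID, see the models section below) can also be used
  # inside macros, e.g. to give each model its own log file. The flag and
  # path here are assumptions, not part of the original example:
  "model-logging": >
    --log-file /path/to/logs/${MODEL_ID}.log
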
# models: a dictionary of model configurations
# - required
# - each key is the model's ID, used in API requests
# - model settings have default values that are used if they are not defined here
# - the model's ID is available in the ${MODEL_ID} macro, also available in macros defined above
# - below are examples of all the settings a model can have
models:

  # keys are the model names used in API requests
  "llama":
    # cmd: the command to run to start the inference server.
    # - required
    # - it is just a string, similar to what you would run on the CLI
    # - using `|` allows for comments in the command, these will be parsed out
    # - macros can be used within cmd
    cmd: |
      # ${latest-llama} is a macro that is defined above
      ${latest-llama}
      --model path/to/llama-8B-Q4_K_M.gguf

    # name: a display name for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
    name: "llama 3.1 8B"

    # description: a description for the model
    # - optional, default: empty string
    # - if set, it will be used in the v1/models API response
    # - if not set, it will be omitted in the JSON model record
    description: "A small but capable model used for quick testing"

    # env: define an array of environment variables to inject into cmd's environment
    # - optional, default: empty array
    # - each value is a single string
    # - in the format: ENV_NAME=value
    env:
      - "CUDA_VISIBLE_DEVICES=0,1,2"

    # proxy: the URL where llama-swap routes API requests
    # - optional, default: http://localhost:${PORT}
    # - if you used ${PORT} in cmd this can be omitted
    # - if you use a custom port in cmd this *must* be set
    proxy: http://127.0.0.1:8999

    # aliases: alternative model names that this model configuration is used for
    # - optional, default: empty array
    # - aliases must be unique globally
    # - useful for impersonating a specific model
    aliases:
      - "gpt-4o-mini"
      - "gpt-3.5-turbo"

    # checkEndpoint: URL path to check if the server is ready
    # - optional, default: /health
    # - endpoint is expected to return an HTTP 200 response
    # - all requests wait until the endpoint is ready or the check fails
    # - use "none" to skip endpoint health checking
    checkEndpoint: /custom-endpoint

    # ttl: automatically unload the model after ttl seconds
    # - optional, default: 0
    # - must be a value greater than 0
    # - a value of 0 disables automatic unloading of the model
    ttl: 60

    # useModelName: override the model name that is sent to the upstream server
    # - optional, default: ""
    # - useful when the upstream server expects a specific model name that
    #   is different from the model's ID
    useModelName: "qwen:qwq"

    # filters: a dictionary of filter settings
    # - optional, default: empty dictionary
    # - only strip_params is currently supported
    filters:
      # strip_params: a comma-separated list of parameters to remove from the request
      # - optional, default: ""
      # - useful for server-side enforcement of sampling parameters
      # - the `model` parameter can never be removed
      # - can be any JSON key in the request body
      # - recommended to stick to sampling parameters
      strip_params: "temperature, top_p, top_k"
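      # For illustration: with the setting above, a request body such as
      # {"model": "llama", "temperature": 0.6, "top_p": 0.9} would be forwarded
      # upstream with the temperature and top_p keys removed.
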
    # concurrencyLimit: overrides the allowed number of active parallel requests to a model
    # - optional, default: 0
    # - useful for limiting the number of active parallel requests a model can process
    # - must be set per model
    # - any number greater than 0 will override the internal default value of 10
    # - any request that exceeds the limit will receive an HTTP 429 Too Many Requests response
    # - recommended to omit this and use the default
    concurrencyLimit: 0

  # Unlisted model example:
  "qwen-unlisted":
    # unlisted: boolean, true or false
    # - optional, default: false
    # - unlisted models do not show up in the /v1/models API response
    # - can be requested as normal through all APIs
    unlisted: true
    cmd: llama-server --port ${PORT} -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0

  # Docker example:
  # container runtimes like Docker and Podman can be used reliably with
  # a combination of cmd, cmdStop, and ${MODEL_ID}
  # (a Podman variant is sketched after this example)
  "docker-llama":
    proxy: "http://127.0.0.1:${PORT}"
    cmd: |
      docker run --name ${MODEL_ID}
      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggml-org/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

    # cmdStop: command to run to stop the model gracefully
    # - optional, default: ""
    # - useful for stopping commands managed by another system
    # - the upstream's process id is available in the ${PID} macro
    #
    # When empty, llama-swap has this default behaviour:
    # - on POSIX systems: a SIGTERM signal is sent
    # - on Windows: taskkill is called to stop the process
    # - processes have 5 seconds to shut down before forceful termination is attempted
    cmdStop: docker stop ${MODEL_ID}
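    # Podman works with the same pattern; a sketch, assuming the same image
    # and model mount as the Docker example above:
    #
    #   cmd: |
    #     podman run --name ${MODEL_ID}
    #     --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
    #     ghcr.io/ggml-org/llama.cpp:server
    #     --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
    #   cmdStop: podman stop ${MODEL_ID}
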
# groups: a dictionary of group settings
# - optional, default: empty dictionary
# - provides advanced controls over model swapping behaviour
# - using groups, some models can be kept loaded indefinitely while others are swapped out
# - model IDs must be defined in the models section
# - a model can only be a member of one group
# - group behaviour is controlled via the `swap`, `exclusive` and `persistent` fields
# - see issue #109 for details
#
# NOTE: the example below uses model names that are not defined above for demonstration purposes
groups:
  # group1 works the same as the default behaviour of llama-swap where only one model is allowed
  # to run at a time across the whole llama-swap instance
  "group1":
    # swap: controls the model swapping behaviour within the group
    # - optional, default: true
    # - true : only one model is allowed to run at a time
    # - false: all models can run together, no swapping
    swap: true

    # exclusive: controls how the group affects other groups
    # - optional, default: true
    # - true: causes all other groups to unload when this group runs a model
    # - false: does not affect other groups
    exclusive: true

    # members references the models defined above
    # - required
    members:
      - "llama"
      - "qwen-unlisted"

  # Example:
  # - in group2 all models can run at the same time
  # - when a different group is loaded, all running models in this group are unloaded
  "group2":
    swap: false

    # exclusive: false does not unload other groups when a model in group2 is requested
    # - the models in group2 will be loaded but will not unload any other groups
    exclusive: false
    members:
      - "docker-llama"
      - "modelA"
      - "modelB"

  # Example:
  # - a persistent group that prevents other groups from unloading it
  "forever":
    # persistent: prevents other groups from unloading the models in this group
    # - optional, default: false
    # - does not affect individual model behaviour
    persistent: true

    # set swap/exclusive to false to prevent swapping inside the group
    # and the unloading of other groups
    swap: false
    exclusive: false
    members:
      - "forever-modelA"
      - "forever-modelB"
      - "forever-modelC"

# hooks: a dictionary of event triggers and actions
# - optional, default: empty dictionary
# - the only supported hook is on_startup
hooks:
  # on_startup: a dictionary of actions to perform on startup
  # - optional, default: empty dictionary
  # - the only supported action is preload
  on_startup:
    # preload: a list of model ids to load on startup
    # - optional, default: empty list
    # - model names must match keys in the models section
    # - when preloading multiple models at once, define a group for them,
    #   otherwise the models will be loaded and swapped out one after another
    #   (see the sketch below)
    preload:
      - "llama"