Replace the previously hardcoded `/health` path with a configurable endpoint that is checked to determine when the upstream server has become ready to serve traffic. With this change, any server that provides an OpenAI-compatible inference endpoint can be supported.
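As a rough illustration of what this readiness check implies (see `checkEndpoint` and `healthCheckTimeout` in the configuration below), here is a minimal sketch in Go that polls an endpoint until it returns HTTP 200 or a timeout elapses. The function name, polling interval, and per-request timeout are assumptions for illustration, not the project's actual implementation.

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

// waitForReady polls url once per second until it answers with HTTP 200
// or until timeout has passed. A url of "none" skips the check entirely,
// mirroring checkEndpoint: "none" in the configuration below.
// (Sketch only; names and intervals are illustrative assumptions.)
func waitForReady(url string, timeout time.Duration) error {
	if url == "none" {
		return nil // skip the readiness check
	}
	deadline := time.Now().Add(timeout)
	client := &http.Client{Timeout: 2 * time.Second}
	for time.Now().Before(deadline) {
		resp, err := client.Get(url)
		if err == nil {
			resp.Body.Close()
			if resp.StatusCode == http.StatusOK {
				return nil // upstream is ready to serve traffic
			}
		}
		time.Sleep(time.Second)
	}
	return fmt.Errorf("upstream %s not ready after %s", url, timeout)
}

func main() {
	// e.g. proxy "http://127.0.0.1:8999" with checkEndpoint "/health"
	// and healthCheckTimeout: 60
	if err := waitForReady("http://127.0.0.1:8999/health", 60*time.Second); err != nil {
		fmt.Println(err)
	}
}
```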
```yaml
# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 60

models:
  "llama":
    cmd: "models/llama-server-osx --port 8999 -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf"
    proxy: "http://127.0.0.1:8999"

    # list of model name aliases this llama.cpp instance can serve
    aliases:
      - "gpt-4o-mini"

    # check this path for an HTTP 200 response before the server is considered ready
    checkEndpoint: "/health"

  "qwen":
    cmd: "models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
    proxy: "http://127.0.0.1:8999"
    aliases:
      - "gpt-3.5-turbo"

  "simple":
    # example of setting environment variables
    env:
      - "CUDA_VISIBLE_DEVICES=0,1"
      - "env1=hello"
    cmd: "build/simple-responder --port 8999"
    proxy: "http://127.0.0.1:8999"

    # use "none" to skip the check; caution: this may cause some requests to fail
    # until the upstream server is ready for traffic
    checkEndpoint: "none"

  # don't use this; it exists only to test how failures are handled
  "broken":
    cmd: "models/llama-server-osx --port 8999 -m models/doesnotexist.gguf"
    proxy: "http://127.0.0.1:8999"
```
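Once a model is configured, clients can address it by its name or by any of its aliases. The following is a hypothetical Go client that sends an OpenAI-style chat completion request using the `gpt-4o-mini` alias, which the configuration above maps to the `llama` entry. The listen address `localhost:8080` and the `/v1/chat/completions` path are assumptions about how the proxy is started and exposed; adjust them to your setup.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Requesting the alias "gpt-4o-mini" should be routed to the
	// "llama" model defined in the configuration above.
	body, _ := json.Marshal(map[string]any{
		"model": "gpt-4o-mini",
		"messages": []map[string]string{
			{"role": "user", "content": "Say hello in one short sentence."},
		},
	})

	resp, err := http.Post(
		"http://localhost:8080/v1/chat/completions", // assumed listen address and path
		"application/json",
		bytes.NewReader(body),
	)
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status)
	fmt.Println(string(out))
}
```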