llama-swap/config.example.yaml

# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 90

# valid log levels: debug, info (default), warn, error
logLevel: debug

models:
  "llama":
    cmd: >
      models/llama-server-osx
      --port 9001
      -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
    proxy: http://127.0.0.1:9001

    # list of model name aliases this llama.cpp instance can serve
    aliases:
    - gpt-4o-mini

    # check this path for a HTTP 200 response for the server to be ready
    checkEndpoint: /health

    # unload model after 5 seconds
    ttl: 5

  "qwen":
    cmd: models/llama-server-osx --port 9002 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9002
    aliases:
    - gpt-3.5-turbo

  # Embedding example with Nomic
  # https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF
  "nomic":
    proxy: http://127.0.0.1:9005
    cmd: >
      models/llama-server-osx --port 9005
      -m models/nomic-embed-text-v1.5.Q8_0.gguf
      --ctx-size 8192
      --batch-size 8192
      --rope-scaling yarn
      --rope-freq-scale 0.75
      -ngl 99
      --embeddings

  # Reranking example with bge-reranker
  # https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF
  "bge-reranker":
    proxy: http://127.0.0.1:9006
    cmd: >
      models/llama-server-osx --port 9006
      -m models/bge-reranker-v2-m3-Q4_K_M.gguf
      --ctx-size 8192
      --reranking

  # Docker Support (v26.1.4+ required!)
  "dockertest":
    proxy: "http://127.0.0.1:9790"
    cmd: >
      docker run --name dockertest
      --init --rm -p 9790:8080 -v /mnt/nvme/models:/models
      ghcr.io/ggerganov/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

  "simple":
    # example of setting environment variables
    env:
      - CUDA_VISIBLE_DEVICES=0,1
      - env1=hello
    cmd: build/simple-responder --port 8999
    proxy: http://127.0.0.1:8999
    unlisted: true

    # use "none" to skip check. Caution this may cause some requests to fail
    # until the upstream server is ready for traffic
    checkEndpoint: none

  # don't use these, just for testing if things are broken
  "broken":
    cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
    proxy: http://127.0.0.1:8999
    unlisted: true
  "broken_timeout":
    cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9000
    unlisted: true

# creating a coding profile with models for code generation and general questions
profiles:
  coding:
    - "qwen"
    - "llama"