llama-swap/config.example.yaml

# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 15

# Log HTTP requests helpful for troubleshoot, defaults to False
logRequests: true

models:
  "llama":
    cmd: >
      models/llama-server-osx
      --port 9001
      -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
    proxy: http://127.0.0.1:9001

    # list of model name aliases this llama.cpp instance can serve
    aliases:
    - gpt-4o-mini

    # check this path for a HTTP 200 response for the server to be ready
    checkEndpoint: /health

    # unload model after 5 seconds
    ttl: 5

  "qwen":
    cmd: models/llama-server-osx --port 9002 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9002
    aliases:
    - gpt-3.5-turbo

  # Embedding example with Nomic
  # https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF
  "nomic":
    proxy: http://127.0.0.1:9005
    cmd: >
      models/llama-server-osx --port 9005
      -m models/nomic-embed-text-v1.5.Q8_0.gguf
      --ctx-size 8192
      --batch-size 8192
      --rope-scaling yarn
      --rope-freq-scale 0.75
      -ngl 99
      --embeddings

  # Reranking example with bge-reranker
  # https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF
  "bge-reranker":
    proxy: http://127.0.0.1:9006
    cmd: >
      models/llama-server-osx --port 9006
      -m models/bge-reranker-v2-m3-Q4_K_M.gguf
      --ctx-size 8192
      --reranking


  "simple":
    # example of setting environment variables
    env:
      - CUDA_VISIBLE_DEVICES=0,1
      - env1=hello
    cmd: build/simple-responder --port 8999
    proxy: http://127.0.0.1:8999
    unlisted: true

    # use "none" to skip check. Caution this may cause some requests to fail
    # until the upstream server is ready for traffic
    checkEndpoint: none

  # don't use these, just for testing if things are broken
  "broken":
    cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
    proxy: http://127.0.0.1:8999
    unlisted: true
  "broken_timeout":
    cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9000
    unlisted: true

# creating a coding profile with models for code generation and general questions
profiles:
  coding:
    - "qwen"
    - "llama"