# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 15

models:
  "llama":
    cmd: >
      models/llama-server-osx
      --port 9001
      -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
    proxy: http://127.0.0.1:9001

    # list of model name aliases this llama.cpp instance can serve
    aliases:
      - gpt-4o-mini

    # check this path for an HTTP 200 response to determine when the server is ready
    checkEndpoint: /health

    # unload the model after 5 seconds
    ttl: 5

  "qwen":
    cmd: models/llama-server-osx --port 9002 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9002
    aliases:
      - gpt-3.5-turbo

  "simple":
    # example of setting environment variables
    env:
      - CUDA_VISIBLE_DEVICES=0,1
      - env1=hello

    cmd: build/simple-responder --port 8999
    proxy: http://127.0.0.1:8999
    unlisted: true

    # use "none" to skip the check. Caution: this may cause some requests to fail
    # until the upstream server is ready for traffic
    checkEndpoint: none

  # don't use these; they exist only for testing broken configurations
  "broken":
    cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
    proxy: http://127.0.0.1:8999
    unlisted: true

  "broken_timeout":
    cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9000
    unlisted: true

# creating a "coding" profile with models for code generation and general questions
profiles:
  coding:
    - "qwen"
    - "llama"
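
With this configuration loaded, a client selects an upstream instance simply by the `model` field of an OpenAI-style request; an alias such as `gpt-4o-mini` resolves to the `"llama"` entry above. The snippet below is a minimal sketch, not part of the configuration: it assumes the proxy's own listen address is `localhost:8080` (set when you start the proxy, not in this file) and that it forwards `/v1/chat/completions` to the chosen llama.cpp instance; adjust the address and endpoint to match your deployment.

```python
# Sketch: send a chat request through the proxy using an alias from the config.
# Assumes the proxy listens on localhost:8080 (adjust as needed).
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8080/v1/chat/completions",
    data=json.dumps({
        # "gpt-4o-mini" is an alias for the "llama" model entry,
        # so the proxy routes this request to that llama-server instance.
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": "Hello!"}],
    }).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["choices"][0]["message"]["content"])
```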