# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 60

models:
  "llama":
    cmd: "models/llama-server-osx --port 8999 -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf"
    proxy: "http://127.0.0.1:8999"

    # list of model name aliases this llama.cpp instance can serve
    aliases:
      - "gpt-4o-mini"

  "qwen":
    cmd: "models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
    proxy: "http://127.0.0.1:8999"
    aliases:
      - "gpt-3.5-turbo"

  "simple":
    # example of setting environment variables
    env:
      - "CUDA_VISIBLE_DEVICES=0,1"
      - "env1=hello"
    cmd: "build/simple-responder --port 8999"
    proxy: "http://127.0.0.1:8999"

  # don't use this, just for testing if things are broken
  "broken":
    cmd: "models/llama-server-osx --port 8999 -m models/doesnotexist.gguf"
    proxy: "http://127.0.0.1:8999"
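
# Usage sketch (assumption: the proxy is listening on localhost:8080;
# adjust the host/port to match your deployment). A request naming either
# a model key ("llama", "qwen", ...) or one of its aliases is routed to
# the matching llama.cpp instance, which is started on demand:
#
#   curl http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "hi"}]}'
#
# Here "gpt-4o-mini" resolves to the "llama" entry above via its alias,
# so the request is forwarded to http://127.0.0.1:8999.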