# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 15

models:
  "llama":
    cmd: >
      models/llama-server-osx
      --port 9001
      -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
    proxy: http://127.0.0.1:9001

    # list of model name aliases this llama.cpp instance can serve
    aliases:
      - gpt-4o-mini

    # check this path for an HTTP 200 response to determine when the server is ready
    checkEndpoint: /health

    # unload the model after 5 seconds
    ttl: 5

  "qwen":
    cmd: models/llama-server-osx --port 9002 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9002
    aliases:
      - gpt-3.5-turbo

  "simple":
    # example of setting environment variables
    env:
      - CUDA_VISIBLE_DEVICES=0,1
      - env1=hello

    cmd: build/simple-responder --port 8999
    proxy: http://127.0.0.1:8999
    unlisted: true

    # use "none" to skip the check. Caution: this may cause some requests to fail
    # until the upstream server is ready for traffic
    checkEndpoint: none

  # don't use these; they exist only for testing broken configurations
  "broken":
    cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
    proxy: http://127.0.0.1:8999
    unlisted: true

  "broken_timeout":
    cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9000
    unlisted: true

# creating a "coding" profile with models for code generation and general questions
profiles:
  coding:
    - "qwen"
    - "llama"
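
With this configuration loaded, a client selects an upstream instance simply by the `model` field of an OpenAI-style request; an alias such as `gpt-4o-mini` resolves to the `"llama"` entry above. The snippet below is a minimal sketch, not part of the configuration: it assumes the proxy's own listen address is `localhost:8080` (set when you start the proxy, not in this file) and that it forwards `/v1/chat/completions` to the chosen llama.cpp instance; adjust the address and endpoint to match your deployment.

```python
# Sketch: send a chat request through the proxy using an alias from the config.
# Assumes the proxy listens on localhost:8080 (adjust as needed).
import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:8080/v1/chat/completions",
    data=json.dumps({
        # "gpt-4o-mini" is an alias for the "llama" model entry,
        # so the proxy routes this request to that llama-server instance.
        "model": "gpt-4o-mini",
        "messages": [{"role": "user", "content": "Hello!"}],
    }).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["choices"][0]["message"]["content"])
```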