# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 90

# valid log levels: debug, info (default), warn, error
logLevel: debug

models:
  "llama":
    cmd: >
      models/llama-server-osx
      --port 9001
      -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
    proxy: http://127.0.0.1:9001

    # list of model name aliases this llama.cpp instance can serve
    aliases:
      - gpt-4o-mini

    # check this path for an HTTP 200 response before the server is considered ready
    checkEndpoint: /health

    # unload model after 5 seconds
    ttl: 5

  "qwen":
    cmd: models/llama-server-osx --port 9002 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9002
    aliases:
      - gpt-3.5-turbo

  # Embedding example with Nomic
  # https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF
  "nomic":
    proxy: http://127.0.0.1:9005
    cmd: >
      models/llama-server-osx --port 9005
      -m models/nomic-embed-text-v1.5.Q8_0.gguf
      --ctx-size 8192
      --batch-size 8192
      --rope-scaling yarn
      --rope-freq-scale 0.75
      -ngl 99
      --embeddings

  # Reranking example with bge-reranker
  # https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF
  "bge-reranker":
    proxy: http://127.0.0.1:9006
    cmd: >
      models/llama-server-osx --port 9006
      -m models/bge-reranker-v2-m3-Q4_K_M.gguf
      --ctx-size 8192
      --reranking

  # Docker Support (v26.1.4+ required!)
  "dockertest":
    proxy: "http://127.0.0.1:9790"
    cmd: >
      docker run --name dockertest
      --init --rm -p 9790:8080
      -v /mnt/nvme/models:/models
      ghcr.io/ggerganov/llama.cpp:server
      --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'

  "simple":
    # example of setting environment variables
    env:
      - CUDA_VISIBLE_DEVICES=0,1
      - env1=hello
    cmd: build/simple-responder --port 8999
    proxy: http://127.0.0.1:8999
    unlisted: true

    # use "none" to skip the check. Caution: this may cause some requests
    # to fail until the upstream server is ready for traffic
    checkEndpoint: none

  # don't use these; they exist only for testing when things are broken
  "broken":
    cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
    proxy: http://127.0.0.1:8999
    unlisted: true
  "broken_timeout":
    cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9000
    unlisted: true

# creating a coding profile with models for code generation and general questions
profiles:
  coding:
    - "qwen"
    - "llama"
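
With this config in place, clients talk to llama-swap's own listen address rather than the per-model proxy ports, and llama-swap picks the upstream from the `model` field of the request. Below is a minimal sketch using only the Python standard library; the base URL `http://localhost:8080` is an assumption — substitute whatever address your llama-swap instance actually listens on. Requesting `gpt-4o-mini` resolves to the `llama` entry above via its alias:

```python
import json
import urllib.request

# Assumed llama-swap listen address; adjust if yours differs.
BASE_URL = "http://localhost:8080"

# "gpt-4o-mini" is an alias of the "llama" entry, so llama-swap starts
# (or swaps in) that llama-server instance and forwards the request to
# its proxy address once the health check passes.
payload = {
    "model": "gpt-4o-mini",
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
}

req = urllib.request.Request(
    BASE_URL + "/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

print(body["choices"][0]["message"]["content"])
```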
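
The `nomic` entry runs llama-server with `--embeddings`, so the same proxy can serve OpenAI-style embedding requests once that model is swapped in. Another sketch under the same assumed `localhost:8080` base URL:

```python
import json
import urllib.request

BASE_URL = "http://localhost:8080"  # assumed llama-swap listen address

# Requesting "nomic" makes llama-swap swap in the embedding server
# defined above; llama-server accepts /v1/embeddings requests when
# started with the --embeddings flag.
payload = {"model": "nomic", "input": "llama-swap hot-swaps llama.cpp models"}

req = urllib.request.Request(
    BASE_URL + "/v1/embeddings",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.load(resp)

vector = body["data"][0]["embedding"]
print(len(vector))  # dimensionality of the returned embedding
```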