llama-swap/config.example.yaml
Benson Wong 8eb5b7b6c4 Add custom check endpoint
Replace the previously hardcoded `/health` path used to check when the
upstream server becomes ready to serve traffic. With this, llama-swap can
support any server that provides an OpenAI compatible inference endpoint.
2024-10-11 21:59:21 -07:00

40 lines · 1.2 KiB · YAML

# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 60

models:
  "llama":
    cmd: "models/llama-server-osx --port 8999 -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf"
    proxy: "http://127.0.0.1:8999"

    # list of model name aliases this llama.cpp instance can serve
    aliases:
      - "gpt-4o-mini"

    # check this path for an HTTP 200 response before the server is considered ready
    checkEndpoint: "/health"

  "qwen":
    cmd: "models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
    proxy: "http://127.0.0.1:8999"
    aliases:
      - "gpt-3.5-turbo"

  "simple":
    # example of setting environment variables
    env:
      - "CUDA_VISIBLE_DEVICES=0,1"
      - "env1=hello"
    cmd: "build/simple-responder --port 8999"
    proxy: "http://127.0.0.1:8999"

    # use "none" to skip the check. Caution: this may cause some requests to fail
    # until the upstream server is ready for traffic
    checkEndpoint: "none"

  # don't use this; it's just for testing what happens when things are broken
  "broken":
    cmd: "models/llama-server-osx --port 8999 -m models/doesnotexist.gguf"
    proxy: "http://127.0.0.1:8999"