Replace the previously hardcoded `/health` path with a configurable endpoint that is checked to determine when the upstream server has become ready to serve traffic. With this change, any server that provides an OpenAI-compatible inference endpoint can be supported.
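As a rough illustration of what this readiness check implies (see `checkEndpoint` and `healthCheckTimeout` in the configuration below), here is a minimal sketch in Go that polls an endpoint until it returns HTTP 200 or a timeout elapses. The function name, polling interval, and per-request timeout are assumptions for illustration, not the project's actual implementation.

```go
package main

import (
	"fmt"
	"net/http"
	"time"
)

// waitForReady polls url once per second until it answers with HTTP 200
// or until timeout has passed. A url of "none" skips the check entirely,
// mirroring checkEndpoint: "none" in the configuration below.
// (Sketch only; names and intervals are illustrative assumptions.)
func waitForReady(url string, timeout time.Duration) error {
	if url == "none" {
		return nil // skip the readiness check
	}
	deadline := time.Now().Add(timeout)
	client := &http.Client{Timeout: 2 * time.Second}
	for time.Now().Before(deadline) {
		resp, err := client.Get(url)
		if err == nil {
			resp.Body.Close()
			if resp.StatusCode == http.StatusOK {
				return nil // upstream is ready to serve traffic
			}
		}
		time.Sleep(time.Second)
	}
	return fmt.Errorf("upstream %s not ready after %s", url, timeout)
}

func main() {
	// e.g. proxy "http://127.0.0.1:8999" with checkEndpoint "/health"
	// and healthCheckTimeout: 60
	if err := waitForReady("http://127.0.0.1:8999/health", 60*time.Second); err != nil {
		fmt.Println(err)
	}
}
```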
```yaml
# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 60

models:
  "llama":
    cmd: "models/llama-server-osx --port 8999 -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf"
    proxy: "http://127.0.0.1:8999"

    # list of model name aliases this llama.cpp instance can serve
    aliases:
      - "gpt-4o-mini"

    # check this path for an HTTP 200 response before the server is considered ready
    checkEndpoint: "/health"

  "qwen":
    cmd: "models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
    proxy: "http://127.0.0.1:8999"
    aliases:
      - "gpt-3.5-turbo"

  "simple":
    # example of setting environment variables
    env:
      - "CUDA_VISIBLE_DEVICES=0,1"
      - "env1=hello"
    cmd: "build/simple-responder --port 8999"
    proxy: "http://127.0.0.1:8999"

    # use "none" to skip the check; caution: this may cause some requests to fail
    # until the upstream server is ready for traffic
    checkEndpoint: "none"

  # don't use this; it exists only to test how failures are handled
  "broken":
    cmd: "models/llama-server-osx --port 8999 -m models/doesnotexist.gguf"
    proxy: "http://127.0.0.1:8999"
```
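Once a model is configured, clients can address it by its name or by any of its aliases. The following is a hypothetical Go client that sends an OpenAI-style chat completion request using the `gpt-4o-mini` alias, which the configuration above maps to the `llama` entry. The listen address `localhost:8080` and the `/v1/chat/completions` path are assumptions about how the proxy is started and exposed; adjust them to your setup.

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Requesting the alias "gpt-4o-mini" should be routed to the
	// "llama" model defined in the configuration above.
	body, _ := json.Marshal(map[string]any{
		"model": "gpt-4o-mini",
		"messages": []map[string]string{
			{"role": "user", "content": "Say hello in one short sentence."},
		},
	})

	resp, err := http.Post(
		"http://localhost:8080/v1/chat/completions", // assumed listen address and path
		"application/json",
		bytes.NewReader(body),
	)
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(resp.Status)
	fmt.Println(string(out))
}
```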