Previously, llama-swap waited a maximum of 5 seconds for an upstream HTTP server to become available; if the server took longer than that, the request was errored out. Now it waits up to the configured healthCheckTimeout, or until the upstream process unexpectedly exits.
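Since the proxy now honors the configured value, models that are slow to load can simply be given a larger timeout. A minimal sketch (the 120-second value is an arbitrary illustration, not a recommendation):

```yaml
# wait up to two minutes for a slow-loading model before failing requests
# (default and minimum is 15 seconds)
healthCheckTimeout: 120
```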
Example configuration:
```yaml
# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 15

models:
  "llama":
    cmd: >
      models/llama-server-osx
      --port 8999
      -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
    proxy: http://127.0.0.1:8999

    # list of model name aliases this llama.cpp instance can serve
    aliases:
      - gpt-4o-mini

    # check this path for an HTTP 200 response for the server to be ready
    checkEndpoint: /health

  "qwen":
    cmd: models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
    proxy: http://127.0.0.1:8999
    aliases:
      - gpt-3.5-turbo

  "simple":
    # example of setting environment variables
    env:
      - CUDA_VISIBLE_DEVICES=0,1
      - env1=hello
    cmd: build/simple-responder --port 8999
    proxy: http://127.0.0.1:8999

    # use "none" to skip the check. Caution: this may cause some requests to fail
    # until the upstream server is ready for traffic
    checkEndpoint: none

  # don't use these, they are just for testing broken configurations
  "broken":
    cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
    proxy: http://127.0.0.1:8999
  "broken_timeout":
    cmd: models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
    proxy: http://127.0.0.1:9000
```
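For reference, requests are routed to a configured model by its name or one of its aliases. A hedged usage sketch, assuming llama-swap is listening on port 8080 (an assumption; substitute whatever listen address it was started with) and that the client calls the OpenAI-compatible chat completions endpoint:

```shell
# "gpt-4o-mini" is an alias of the "llama" model above, so this request
# starts that llama-server instance (if needed) and proxies to it once
# the health check passes. The listen port 8080 is an assumption.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "hello"}]}'
```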