llama-swap/config.example.yaml
Benson Wong 34f9fd7340 Improve timeout and exit handling of child processes. fix #3 and #5
llama-swap previously waited a maximum of 5 seconds for an upstream
HTTP server to become available; if it took longer than that, the request
errored out. Now it waits up to the configured healthCheckTimeout,
or until the upstream process unexpectedly exits.
2024-11-01 14:32:39 -07:00
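
In other words, the proxy starts the child process, then polls the health endpoint until it answers with HTTP 200, giving up only when healthCheckTimeout elapses or the child exits first. Below is a minimal sketch of that pattern in Go (hypothetical names, not llama-swap's actual code), using the same command and health endpoint as the "llama" entry in the config that follows:

// wait_sketch.go -- illustrative only, not llama-swap's implementation.
package main

import (
	"fmt"
	"net/http"
	"os/exec"
	"time"
)

// waitForReady polls url until it returns HTTP 200, but gives up if the
// timeout elapses or the child process exits first.
func waitForReady(cmd *exec.Cmd, url string, timeout time.Duration) error {
	exited := make(chan error, 1)
	go func() { exited <- cmd.Wait() }()

	deadline := time.After(timeout)
	ticker := time.NewTicker(250 * time.Millisecond)
	defer ticker.Stop()

	for {
		select {
		case err := <-exited:
			return fmt.Errorf("upstream exited before becoming ready: %v", err)
		case <-deadline:
			return fmt.Errorf("upstream not ready after %s", timeout)
		case <-ticker.C:
			if resp, err := http.Get(url); err == nil {
				resp.Body.Close()
				if resp.StatusCode == http.StatusOK {
					return nil // ready to serve traffic
				}
			}
		}
	}
}

func main() {
	// Same command and health endpoint as the "llama" entry below.
	cmd := exec.Command("models/llama-server-osx",
		"--port", "8999", "-m", "models/Llama-3.2-1B-Instruct-Q4_K_M.gguf")
	if err := cmd.Start(); err != nil {
		panic(err)
	}
	if err := waitForReady(cmd, "http://127.0.0.1:8999/health", 15*time.Second); err != nil {
		panic(err)
	}
	fmt.Println("upstream ready; safe to forward requests")
}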


# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 15
models:
"llama":
cmd: >
models/llama-server-osx
--port 8999
-m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf
proxy: http://127.0.0.1:8999
    # list of model name aliases this llama.cpp instance can serve
    # (see the example client request after this file)
    aliases:
      - gpt-4o-mini
    # check this path for an HTTP 200 response before the server is considered ready
    checkEndpoint: /health
"qwen":
cmd: models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
proxy: http://127.0.0.1:8999
aliases:
- gpt-3.5-turbo
"simple":
# example of setting environment variables
env:
- CUDA_VISIBLE_DEVICES=0,1
- env1=hello
cmd: build/simple-responder --port 8999
proxy: http://127.0.0.1:8999
# use "none" to skip check. Caution this may cause some requests to fail
# until the upstream server is ready for traffic
checkEndpoint: none
  # don't use these; they exist only for testing broken configurations
  "broken":
    cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
    proxy: http://127.0.0.1:8999
  "broken_timeout":
    cmd: models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
    proxy: http://127.0.0.1:9000
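
For reference, a client selects one of these models (or an alias such as gpt-4o-mini) through the model field of an ordinary OpenAI-style request, and llama-swap starts the matching llama.cpp instance before proxying the call. A rough client-side sketch in Go, assuming llama-swap itself listens on localhost:8080 and forwards /v1/chat/completions (both are assumptions; neither appears in this file):

// client_sketch.go -- hypothetical example; the listen address and endpoint
// path are assumptions, not taken from config.example.yaml.
package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// "gpt-4o-mini" is an alias for the "llama" model above, so llama-swap
	// would route this request to that llama.cpp instance.
	body := []byte(`{"model": "gpt-4o-mini",
	                 "messages": [{"role": "user", "content": "Hello"}]}`)

	resp, err := http.Post("http://localhost:8080/v1/chat/completions",
		"application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}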