# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 15

models:
  "llama":
    cmd: >
      models/llama-server-osx
      --port 9001
      -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
    proxy: http://127.0.0.1:9001

    # list of model name aliases this llama.cpp instance can serve
    aliases:
      - gpt-4o-mini

    # check this path for an HTTP 200 response to consider the server ready
    checkEndpoint: /health

    # unload model after 5 seconds
    ttl: 5

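  # Illustration only, not part of the config schema: with the alias above, an
  # OpenAI-style request naming "gpt-4o-mini" is routed to the "llama" upstream.
  # The proxy's own listen address used here is an assumption; adjust it to
  # however the proxy is actually started.
  #
  #   curl http://localhost:8080/v1/chat/completions \
  #     -H "Content-Type: application/json" \
  #     -d '{"model": "gpt-4o-mini", "messages": [{"role": "user", "content": "hello"}]}'
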
"qwen":
|
|
cmd: models/llama-server-osx --port 9002 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
|
|
proxy: http://127.0.0.1:9002
|
|
aliases:
|
|
- gpt-3.5-turbo
|
|
|
|
  # Embedding example with Nomic
  # https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF
  "nomic":
    proxy: http://127.0.0.1:9005
    cmd: >
      models/llama-server-osx --port 9005
      -m models/nomic-embed-text-v1.5.Q8_0.gguf
      --ctx-size 8192
      --batch-size 8192
      --rope-scaling yarn
      --rope-freq-scale 0.75
      -ngl 99
      --embeddings

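  # Illustration only: because this instance is started with --embeddings, an
  # OpenAI-style request to the proxy's /v1/embeddings endpoint with
  # "model": "nomic" would be routed to it.
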
  # Reranking example with bge-reranker
  # https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF
  "bge-reranker":
    proxy: http://127.0.0.1:9006
    cmd: >
      models/llama-server-osx --port 9006
      -m models/bge-reranker-v2-m3-Q4_K_M.gguf
      --ctx-size 8192
      --reranking

"simple":
|
|
# example of setting environment variables
|
|
env:
|
|
- CUDA_VISIBLE_DEVICES=0,1
|
|
- env1=hello
|
|
cmd: build/simple-responder --port 8999
|
|
proxy: http://127.0.0.1:8999
|
|
unlisted: true
|
|
|
|
# use "none" to skip check. Caution this may cause some requests to fail
|
|
# until the upstream server is ready for traffic
|
|
checkEndpoint: none
|
|
|
|
  # don't use these, they are just for testing if things are broken
  "broken":
    cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
    proxy: http://127.0.0.1:8999
    unlisted: true
  "broken_timeout":
    cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9000
    unlisted: true

# creating a coding profile with models for code generation and general questions
profiles:
  coding:
    - "qwen"
    - "llama"