update example config

Benson Wong
2025-05-12 10:20:18 -07:00
parent bc652709a5
commit cb876c143b

@@ -5,13 +5,20 @@ healthCheckTimeout: 90
 # valid log levels: debug, info (default), warn, error
 logLevel: debug
 
+# creating a coding profile with models for code generation and general questions
+groups:
+  coding:
+    swap: false
+    members:
+      - "qwen"
+      - "llama"
+
 models:
   "llama":
     cmd: >
       models/llama-server-osx
-      --port 9001
+      --port ${PORT}
       -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
-    proxy: http://127.0.0.1:9001
 
     # list of model name aliases this llama.cpp instance can serve
     aliases:
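
Note: the recurring change in this commit is the switch from hard-coded ports (and the matching proxy: lines) to the ${PORT} macro. A minimal sketch of a model entry in the new style, assuming llama-swap expands ${PORT} to a port it chooses itself and derives the upstream address from it, so an explicit proxy: line is no longer needed:

    models:
      "qwen":
        # ${PORT} is filled in by llama-swap when the command is launched (assumed behavior)
        cmd: models/llama-server-osx --port ${PORT} -m models/qwen2.5-0.5b-instruct-q8_0.gguf
        aliases:
          - gpt-3.5-turbo
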
@@ -24,17 +31,15 @@ models:
     ttl: 5
 
   "qwen":
-    cmd: models/llama-server-osx --port 9002 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
-    proxy: http://127.0.0.1:9002
+    cmd: models/llama-server-osx --port ${PORT} -m models/qwen2.5-0.5b-instruct-q8_0.gguf
     aliases:
       - gpt-3.5-turbo
 
   # Embedding example with Nomic
   # https://huggingface.co/nomic-ai/nomic-embed-text-v1.5-GGUF
   "nomic":
-    proxy: http://127.0.0.1:9005
     cmd: >
-      models/llama-server-osx --port 9005
+      models/llama-server-osx --port ${PORT}
       -m models/nomic-embed-text-v1.5.Q8_0.gguf
       --ctx-size 8192
       --batch-size 8192
@@ -46,19 +51,17 @@ models:
   # Reranking example with bge-reranker
   # https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF
   "bge-reranker":
-    proxy: http://127.0.0.1:9006
     cmd: >
-      models/llama-server-osx --port 9006
+      models/llama-server-osx --port ${PORT}
       -m models/bge-reranker-v2-m3-Q4_K_M.gguf
       --ctx-size 8192
       --reranking
 
   # Docker Support (v26.1.4+ required!)
   "dockertest":
-    proxy: "http://127.0.0.1:9790"
     cmd: >
       docker run --name dockertest
-      --init --rm -p 9790:8080 -v /mnt/nvme/models:/models
+      --init --rm -p ${PORT}:8080 -v /mnt/nvme/models:/models
       ghcr.io/ggerganov/llama.cpp:server
       --model '/models/Qwen2.5-Coder-0.5B-Instruct-Q4_K_M.gguf'
@@ -67,8 +70,7 @@ models:
     env:
       - CUDA_VISIBLE_DEVICES=0,1
       - env1=hello
-    cmd: build/simple-responder --port 8999
-    proxy: http://127.0.0.1:8999
+    cmd: build/simple-responder --port ${PORT}
     unlisted: true
 
   # use "none" to skip check. Caution this may cause some requests to fail
@@ -83,10 +85,4 @@ models:
"broken_timeout": "broken_timeout":
cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
proxy: http://127.0.0.1:9000 proxy: http://127.0.0.1:9000
unlisted: true unlisted: true
# creating a coding profile with models for code generation and general questions
profiles:
coding:
- "qwen"
- "llama"