proxy: add support for user defined metadata in model configs (#333)

Changes: 

- add Metadata key to ModelConfig
- include metadata in /v1/models under meta.llamaswap key (see the response sketch after this list)
- add recursive macro substitution into Metadata
- change macros at global and model level to be any scalar type
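
For illustration, a /v1/models entry with the new field could look roughly like the sketch below. The envelope fields follow the usual OpenAI-compatible list format and the metadata values are hypothetical; only the meta.llamaswap placement (read here as a nested object) comes from this change:

    {
      "id": "llama",
      "object": "model",
      "meta": {
        "llamaswap": {
          "port": 8080,
          "temperature": 0.7,
          "note": "arbitrary values taken from the model's metadata"
        }
      }
    }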

Note: 

This is the first mostly AI-generated change to llama-swap. See #333 for notes about the workflow and approach to AI going forward.
Benson Wong authored 2025-10-04 19:56:41 -07:00, committed by GitHub
parent 1f6179110c
commit 70930e4e91
11 changed files with 807 additions and 25 deletions

@@ -67,7 +67,8 @@ models:
# - macros defined here override macros defined in the global macros section
# - model level macros follow the same rules as global macros
macros:
"default_ctx": "16384"
"default_ctx": 16384
"temp": 0.7
# cmd: the command to run to start the inference server.
# - required
@@ -79,6 +80,7 @@ models:
${latest-llama}
--model path/to/llama-8B-Q4_K_M.gguf
--ctx-size ${default_ctx}
--temperature ${temp}
# name: a display name for the model
# - optional, default: empty string
@@ -144,6 +146,30 @@ models:
# - recommended to stick to sampling parameters
stripParams: "temperature, top_p, top_k"
# metadata: a dictionary of arbitrary values that are included in /v1/models
# - optional, default: empty dictionary
# - while metadata can contain complex types, it is recommended to keep it simple
# - metadata is only passed through in /v1/models responses
metadata:
# port will remain an integer
port: ${PORT}
# the ${temp} macro will remain a float
temperature: ${temp}
note: "The ${MODEL_ID} is running on port ${PORT} temp=${temp}, context=${default_ctx}"
a_list:
- 1
- 1.23
- "macros are OK in list and dictionary types: ${MODEL_ID}"
an_obj:
a: "1"
b: 2
# objects can contain complex types with macro substitution
# becomes: c: [0.7, false, "model: llama"]
c: ["${temp}", false, "model: ${MODEL_ID}"]
# concurrencyLimit: overrides the allowed number of active parallel requests to a model
# - optional, default: 0
# - useful for limiting the number of active parallel requests a model can process