proxy/config: add model level macros (#330)

* proxy/config: add model level macros

Add macros to model configuration. Model macros override macros that are
defined at the global configuration level. They follow the same naming
and value rules as the global macros.
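
A minimal sketch of the override behavior, built from the example config in this commit (the macro names and llama-server flags come from that example):

    macros:
      "default_ctx": "4096"          # global value

    models:
      "llama":
        # model level value wins over the global one for this model only
        macros:
          "default_ctx": "16384"
        cmd: |
          ${latest-llama}
          --model path/to/llama-8B-Q4_K_M.gguf
          --ctx-size ${default_ctx}

Here ${default_ctx} expands to 16384 when "llama" starts; a model without its own definition still gets the global 4096.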

* proxy/config: fix bug with macro reserved name checking

The PORT reserved name was not being checked properly.
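
For example, a macro named after a reserved name should now fail config validation (hypothetical snippet; PORT and MODEL_ID are the reserved names listed in the config docs below):

    macros:
      "PORT": "8080"   # invalid: PORT is a reserved macro name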

* proxy/config: add tests around model.filters.stripParams

- add check that model.filters.stripParams has no invalid macros
- renamed strip_params to stripParams for camelCase consistency
- add backwards compatibility so the legacy model.filters.strip_params key continues to work (see the sketch below)
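
A sketch of the two spellings, assuming the legacy key continues to be parsed for existing configs:

    models:
      "llama":
        filters:
          # preferred camelCase key
          stripParams: "temperature, top_p, top_k"
          # legacy key, kept working for backwards compatibility
          # strip_params: "temperature, top_p, top_k"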

* proxy/config: add duplicate removal to model.filters.stripParams
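
For instance, a value with repeated parameters should reduce to one entry per parameter (assumed behavior based on this commit message):

    filters:
      # duplicates removed; effectively strips temperature and top_p once each
      stripParams: "temperature, temperature, top_p"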

* clean up some doc nits
Benson Wong
2025-09-28 23:32:52 -07:00
committed by GitHub
parent 216c40b951
commit 1f6179110c
5 changed files with 321 additions and 161 deletions


@@ -38,12 +38,19 @@ startPort: 10001
# macros: a dictionary of string substitutions
# - optional, default: empty dictionary
# - macros are reusable snippets
# - used in a model's cmd, cmdStop, proxy and checkEndpoint
# - used in a model's cmd, cmdStop, proxy, checkEndpoint, filters.stripParams
# - useful for reducing common configuration settings
# - macro names are strings and must be less than 64 characters
# - macro names must match the regex ^[a-zA-Z0-9_-]+$
# - macro names must not be a reserved name: PORT or MODEL_ID
# - macro values must be less than 1024 characters
#
# Important: do not nest macros inside other macros; expansion is single-pass
macros:
"latest-llama": >
/path/to/llama-server/llama-server-ec9e0301
--port ${PORT}
"default_ctx": "4096"
# models: a dictionary of model configurations
# - required
@@ -55,6 +62,13 @@ models:
  # keys are the model names used in API requests
  "llama":
    # macros: a dictionary of string substitutions specific to this model
    # - optional, default: empty dictionary
    # - macros defined here override macros defined in the global macros section
    # - model level macros follow the same rules as global macros
    macros:
      "default_ctx": "16384"

    # cmd: the command to run to start the inference server.
    # - required
    # - it is just a string, similar to what you would run on the CLI
@@ -64,6 +78,7 @@ models:
      # ${latest-llama} is a macro that is defined above
      ${latest-llama}
      --model path/to/llama-8B-Q4_K_M.gguf
      --ctx-size ${default_ctx}

    # name: a display name for the model
    # - optional, default: empty string
@@ -119,15 +134,15 @@ models:
    # filters: a dictionary of filter settings
    # - optional, default: empty dictionary
    # - only strip_params is currently supported
    # - only stripParams is currently supported
    filters:
      # strip_params: a comma separated list of parameters to remove from the request
      # stripParams: a comma separated list of parameters to remove from the request
      # - optional, default: ""
      # - useful for server side enforcement of sampling parameters
      # - the `model` parameter can never be removed
      # - can be any JSON key in the request body
      # - recommended to stick to sampling parameters
      strip_params: "temperature, top_p, top_k"
      stripParams: "temperature, top_p, top_k"

    # concurrencyLimit: overrides the allowed number of active parallel requests to a model
    # - optional, default: 0