add support for automatically unloading a model (#10) (#14)

* Make starting upstream process on-demand (#10)
* Add automatic unload of model after TTL is reached
* Add `ttl` configuration parameter to models in seconds, default is 0 (never unload)
2024-11-19 16:32:51 -08:00
parent ba39ed4c18
commit 533162ce6a
8 changed files with 149 additions and 54 deletions
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -17,6 +17,9 @@ models:
    # check this path for a HTTP 200 response for the server to be ready
    checkEndpoint: /health

+    # unload model after 5 seconds
+    ttl: 5
+
  "qwen":
    cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:8999