Implement Multi-Process Handling (#7)

Refactor the code to support starting multiple backend llama.cpp servers. This functionality is exposed as `profiles`, which keeps the configuration format simple.
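Concretely, a profile is just a named list of models that may run at the same time. Assembled from the example config in the diff below:

```yaml
models:
  "llama":
    cmd: models/llama-server-osx --port 9001 -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
    proxy: http://127.0.0.1:9001
  "qwen":
    cmd: models/llama-server-osx --port 9002 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
    proxy: http://127.0.0.1:9002

# a profile groups models that are served together
profiles:
  coding:
    - "qwen"
    - "llama"
```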

Changes: 

* Refactor proxy tests to prepare for multi-process support
* Update proxy/ProxyManager to support multiple processes (#7) (see the sketch after this list)
* Add support for Groups in the configuration
* Improve handling of model alias configs
* Implement multi-model swapping
* Improve code clarity for swapModel
* Improve docs; rename Groups to profiles in the config
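
For a sense of the refactor, here is a minimal sketch of the idea: a manager that supervises several backend processes instead of a single one. All names here are hypothetical; the actual ProxyManager in this commit differs.

```go
package proxy

import (
	"fmt"
	"os/exec"
	"sync"
)

// process wraps one running backend server.
// Hypothetical type, not llama-swap's actual API.
type process struct {
	cmd *exec.Cmd
}

// Manager supervises multiple backend processes instead of one.
type Manager struct {
	mu      sync.Mutex
	running map[string]*process // keyed by model name
}

func NewManager() *Manager {
	return &Manager{running: make(map[string]*process)}
}

// SwapTo makes exactly the requested models run: models that are no
// longer wanted are stopped, missing ones are started. Swapping a
// single model is the one-element case; a profile is the
// multi-element case.
func (m *Manager) SwapTo(want map[string]string /* model name -> cmd string */) error {
	m.mu.Lock()
	defer m.mu.Unlock()

	// stop anything that is not part of the requested set
	for name, p := range m.running {
		if _, keep := want[name]; !keep {
			_ = p.cmd.Process.Kill() // real code would shut down gracefully
			delete(m.running, name)
		}
	}

	// start requested models that are not already running
	for name, cmdline := range want {
		if _, ok := m.running[name]; ok {
			continue // already up; leave it alone
		}
		cmd := exec.Command("sh", "-c", cmdline)
		if err := cmd.Start(); err != nil {
			return fmt.Errorf("starting %s: %w", name, err)
		}
		m.running[name] = &process{cmd: cmd}
	}
	return nil
}
```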
Author: Benson Wong
Date: 2024-11-23 19:45:13 -08:00
Committed by: GitHub
Parent: 533162ce6a
Commit: 73ad85ea69

10 changed files with 361 additions and 124 deletions

```diff
@@ -6,9 +6,9 @@ models:
   "llama":
     cmd: >
       models/llama-server-osx
-      --port 8999
+      --port 9001
       -m models/Llama-3.2-1B-Instruct-Q4_0.gguf
-    proxy: http://127.0.0.1:8999
+    proxy: http://127.0.0.1:9001
 
     # list of model name aliases this llama.cpp instance can serve
     aliases:
@@ -21,8 +21,8 @@ models:
     ttl: 5
 
   "qwen":
-    cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
-    proxy: http://127.0.0.1:8999
+    cmd: models/llama-server-osx --port 9002 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
+    proxy: http://127.0.0.1:9002
     aliases:
     - gpt-3.5-turbo
@@ -44,4 +44,10 @@ models:
     proxy: http://127.0.0.1:8999
   "broken_timeout":
     cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
-    proxy: http://127.0.0.1:9000
+    proxy: http://127.0.0.1:9000
+
+# creating a coding profile with models for code generation and general questions
+profiles:
+  coding:
+    - "qwen"
+    - "llama"
```