improve error handling

This commit is contained in:
Benson Wong
2024-10-04 10:55:02 -07:00
parent 2d387cf373
commit bfdba43bd8
3 changed files with 86 additions and 24 deletions

View File

@@ -1,7 +1,20 @@
# Seconds to wait for llama.cpp to be available to serve requests
# Default (and minimum): 15 seconds
healthCheckTimeout: 60
models:
"llama":
cmd: "models/llama-server-osx --port 8999 -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf"
proxy: "http://127.0.0.1:8999"
# list of model name aliases this llama.cpp instance can serve
aliases:
- "gpt-4o-mini"
"qwen":
cmd: "models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf "
proxy: "http://127.0.0.1:8999"
aliases:
- "gpt-3.5-turbo"
"broken":
cmd: "models/llama-server-osx --port 8999 -m models/doesnotexist.gguf "
proxy: "http://127.0.0.1:8999"

View File

@@ -7,12 +7,32 @@ import (
)
type ModelConfig struct {
Cmd string `yaml:"cmd"`
Proxy string `yaml:"proxy"`
Cmd string `yaml:"cmd"`
Proxy string `yaml:"proxy"`
Aliases []string `yaml:"aliases"`
}
type Config struct {
Models map[string]ModelConfig `yaml:"models"`
Models map[string]ModelConfig `yaml:"models"`
HealthCheckTimeout int `yaml:"healthCheckTimeout"`
}
// FindConfig resolves modelName to its ModelConfig. A direct key in
// Models wins; otherwise each model's alias list is scanned for a
// match. Returns the zero ModelConfig and false when nothing matches.
//
// NOTE(review): if two models declare the same alias, map iteration
// order makes the winner nondeterministic — confirm aliases are unique.
func (c *Config) FindConfig(modelName string) (ModelConfig, bool) {
	// Fast path: the name is a real model key.
	if cfg, ok := c.Models[modelName]; ok {
		return cfg, true
	}

	// Slow path: the name may be an alias advertised by some model.
	for _, cfg := range c.Models {
		for _, alias := range cfg.Aliases {
			if alias != modelName {
				continue
			}
			return cfg, true
		}
	}

	return ModelConfig{}, false
}
func LoadConfig(path string) (*Config, error) {
@@ -27,5 +47,9 @@ func LoadConfig(path string) (*Config, error) {
return nil, err
}
if config.HealthCheckTimeout < 15 {
config.HealthCheckTimeout = 15
}
return &config, nil
}

View File

@@ -18,10 +18,9 @@ import (
type ProxyManager struct {
sync.Mutex
config *Config
currentCmd *exec.Cmd
currentModel string
currentProxy string
config *Config
currentCmd *exec.Cmd
currentConfig ModelConfig
}
func New(config *Config) *ProxyManager {
@@ -32,29 +31,31 @@ func (pm *ProxyManager) HandleFunc(w http.ResponseWriter, r *http.Request) {
if r.URL.Path == "/v1/chat/completions" {
pm.proxyChatRequest(w, r)
} else {
http.Error(w, "Endpoint not supported", http.StatusNotFound)
http.Error(w, "endpoint not supported", http.StatusNotFound)
}
}
func (pm *ProxyManager) swapModel(model string) error {
func (pm *ProxyManager) swapModel(requestedModel string) error {
pm.Lock()
defer pm.Unlock()
if model == pm.currentModel {
// find the model configuration matching requestedModel
modelConfig, found := pm.config.FindConfig(requestedModel)
if !found {
return fmt.Errorf("could not find configuration for %s", requestedModel)
}
// no need to swap llama.cpp instances
if pm.currentConfig.Cmd == modelConfig.Cmd {
return nil
}
modelConfig, ok := pm.config.Models[model]
if !ok {
return fmt.Errorf("unknown model %s", model)
}
// kill the current running one to swap it
if pm.currentCmd != nil {
pm.currentCmd.Process.Signal(syscall.SIGTERM)
}
pm.currentModel = model
pm.currentProxy = modelConfig.Proxy
pm.currentConfig = modelConfig
args := strings.Fields(modelConfig.Cmd)
cmd := exec.Command(args[0], args[1:]...)
@@ -66,20 +67,24 @@ func (pm *ProxyManager) swapModel(model string) error {
}
pm.currentCmd = cmd
if err := pm.checkHealthEndpoint(60 * time.Second); err != nil {
if err := pm.checkHealthEndpoint(); err != nil {
return err
}
return nil
}
func (pm *ProxyManager) checkHealthEndpoint(maxDuration time.Duration) error {
func (pm *ProxyManager) checkHealthEndpoint() error {
if pm.currentProxy == "" {
if pm.currentConfig.Proxy == "" {
return fmt.Errorf("no upstream available to check /health")
}
healthURL := pm.currentProxy + "/health"
proxyTo := pm.currentConfig.Proxy
maxDuration := time.Second * time.Duration(pm.config.HealthCheckTimeout)
healthURL := proxyTo + "/health"
client := &http.Client{}
startTime := time.Now()
@@ -93,6 +98,15 @@ func (pm *ProxyManager) checkHealthEndpoint(maxDuration time.Duration) error {
req = req.WithContext(ctx)
resp, err := client.Do(req)
if err != nil {
if strings.Contains(err.Error(), "connection refused") {
// llama.cpp's /health endpoint comes up fast, so allow it 5 seconds.
// A refused connection usually means llama.cpp already exited; bailing
// out early here keeps the code simple when the TCP dial cannot reach
// the proxy endpoint at all.
if time.Since(startTime) > 5*time.Second {
return fmt.Errorf("/healthy endpoint took more than 5 seconds to respond")
}
}
if time.Since(startTime) >= maxDuration {
return fmt.Errorf("failed to check /healthy from: %s", healthURL)
}
@@ -127,14 +141,25 @@ func (pm *ProxyManager) proxyChatRequest(w http.ResponseWriter, r *http.Request)
return
}
pm.swapModel(model)
if err := pm.swapModel(model); err != nil {
http.Error(w, fmt.Sprintf("unable to swap to model: %s", err.Error()), http.StatusNotFound)
return
}
r.Body = io.NopCloser(bytes.NewBuffer(bodyBytes))
pm.proxyRequest(w, r)
}
func (pm *ProxyManager) proxyRequest(w http.ResponseWriter, r *http.Request) {
if pm.currentConfig.Proxy == "" {
http.Error(w, "No upstream proxy", http.StatusInternalServerError)
return
}
proxyTo := pm.currentConfig.Proxy
client := &http.Client{}
req, err := http.NewRequest(r.Method, pm.currentProxy+r.URL.String(), r.Body)
req, err := http.NewRequest(r.Method, proxyTo+r.URL.String(), r.Body)
if err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return