diff --git a/config.example.yaml b/config.example.yaml index fc738e5..315951c 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -1,6 +1,6 @@ # Seconds to wait for llama.cpp to be available to serve requests # Default (and minimum): 15 seconds -healthCheckTimeout: 60 +healthCheckTimeout: 15 models: "llama": @@ -35,7 +35,10 @@ models: # until the upstream server is ready for traffic checkEndpoint: none - # don't use this, just for testing if things are broken + # don't use these, just for testing if things are broken "broken": cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf - proxy: http://127.0.0.1:8999 \ No newline at end of file + proxy: http://127.0.0.1:8999 + "broken_timeout": + cmd: models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf + proxy: http://127.0.0.1:9000 \ No newline at end of file diff --git a/proxy/manager.go b/proxy/manager.go index 0107c51..db5d0e8 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -149,14 +149,28 @@ func (pm *ProxyManager) swapModel(requestedModel string) error { } pm.currentCmd = cmd - if err := pm.checkHealthEndpoint(); err != nil { + // watch for the command to exit + cmdCtx, cancel := context.WithCancelCause(context.Background()) + + // monitor the command's exit status + go func() { + err := cmd.Wait() + if err != nil { + cancel(fmt.Errorf("command [%s] %s", strings.Join(cmd.Args, " "), err.Error())) + } else { + cancel(nil) + } + }() + + // wait for checkHealthEndpoint + if err := pm.checkHealthEndpoint(cmdCtx); err != nil { return err } return nil } -func (pm *ProxyManager) checkHealthEndpoint() error { +func (pm *ProxyManager) checkHealthEndpoint(cmdCtx context.Context) error { if pm.currentConfig.Proxy == "" { return fmt.Errorf("no upstream available to check /health") @@ -179,6 +193,7 @@ func (pm *ProxyManager) checkHealthEndpoint() error { if err != nil { return fmt.Errorf("failed to create health url with with %s and path %s", proxyTo, checkEndpoint) 
} + client := &http.Client{} startTime := time.Now() @@ -187,33 +202,47 @@ func (pm *ProxyManager) checkHealthEndpoint() error { if err != nil { return err } - ctx, cancel := context.WithTimeout(req.Context(), 250*time.Millisecond) + + ctx, cancel := context.WithTimeout(cmdCtx, 250*time.Millisecond) defer cancel() req = req.WithContext(ctx) resp, err := client.Do(req) - if err != nil { - if strings.Contains(err.Error(), "connection refused") { - // if TCP dial can't connect any HTTP response after 5 seconds - // exit quickly. - if time.Since(startTime) > 5*time.Second { - return fmt.Errorf("health check endpoint took more than 5 seconds to respond") - } + ttl := (maxDuration - time.Since(startTime)).Seconds() + + if err != nil { + // check if the context was cancelled + select { + case <-ctx.Done(): + return context.Cause(ctx) + default: } - if time.Since(startTime) >= maxDuration { + // wait a bit longer for TCP connection issues + if strings.Contains(err.Error(), "connection refused") { + fmt.Fprintf(pm.logMonitor, "Connection refused on %s, ttl %.0fs\n", healthURL, ttl) + + time.Sleep(5 * time.Second) + } else { + time.Sleep(time.Second) + } + + if ttl < 0 { return fmt.Errorf("failed to check health from: %s", healthURL) } - time.Sleep(time.Second) + continue } + defer resp.Body.Close() if resp.StatusCode == http.StatusOK { return nil } - if time.Since(startTime) >= maxDuration { + + if ttl < 0 { return fmt.Errorf("failed to check health from: %s", healthURL) } + time.Sleep(time.Second) } } @@ -236,7 +265,7 @@ func (pm *ProxyManager) proxyChatRequest(w http.ResponseWriter, r *http.Request) } if err := pm.swapModel(model); err != nil { - http.Error(w, fmt.Sprintf("unable to swap to model: %s", err.Error()), http.StatusNotFound) + http.Error(w, fmt.Sprintf("unable to swap to model, %s", err.Error()), http.StatusNotFound) return }