From 8448efa7fcc6e592b61acae0f69793535f1356f1 Mon Sep 17 00:00:00 2001 From: Benson Wong Date: Fri, 1 Nov 2024 09:42:37 -0700 Subject: [PATCH] revise health check logic to not error on 5 second timeout --- config.example.yaml | 7 +++++-- proxy/manager.go | 23 +++++++++++++---------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/config.example.yaml b/config.example.yaml index fc738e5..4e51a76 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -35,7 +35,10 @@ models: # until the upstream server is ready for traffic checkEndpoint: none - # don't use this, just for testing if things are broken + # don't use these, just for testing if things are broken "broken": cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf - proxy: http://127.0.0.1:8999 \ No newline at end of file + proxy: http://127.0.0.1:8999 + "broken_timeout": + cmd: models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf + proxy: http://127.0.0.1:9000 \ No newline at end of file diff --git a/proxy/manager.go b/proxy/manager.go index 0107c51..abafcae 100644 --- a/proxy/manager.go +++ b/proxy/manager.go @@ -187,26 +187,28 @@ func (pm *ProxyManager) checkHealthEndpoint() error { if err != nil { return err } + ctx, cancel := context.WithTimeout(req.Context(), 250*time.Millisecond) defer cancel() req = req.WithContext(ctx) resp, err := client.Do(req) + if err != nil { - if strings.Contains(err.Error(), "connection refused") { - - // if TCP dial can't connect any HTTP response after 5 seconds - // exit quickly. - if time.Since(startTime) > 5*time.Second { - return fmt.Errorf("health check endpoint took more than 5 seconds to respond") - } - } - if time.Since(startTime) >= maxDuration { return fmt.Errorf("failed to check health from: %s", healthURL) } - time.Sleep(time.Second) + + // wait a bit longer for TCP connection issues + if strings.Contains(err.Error(), "connection refused") { + fmt.Fprintf(pm.logMonitor, "Connection refused on %s\n", healthURL) + time.Sleep(5 * time.Second) + } else { + time.Sleep(time.Second) + } + continue } + defer resp.Body.Close() if resp.StatusCode == http.StatusOK { return nil @@ -214,6 +216,7 @@ func (pm *ProxyManager) checkHealthEndpoint() error { if time.Since(startTime) >= maxDuration { return fmt.Errorf("failed to check health from: %s", healthURL) } + time.Sleep(time.Second) } }