revise health check logic to not error on 5 second timeout

This commit is contained in:
Benson Wong
2024-11-01 09:42:37 -07:00
parent 8cf2a389d8
commit 8448efa7fc
2 changed files with 18 additions and 12 deletions

View File

@@ -35,7 +35,10 @@ models:
# until the upstream server is ready for traffic
checkEndpoint: none
# don't use this, just for testing if things are broken
# don't use these, just for testing if things are broken
"broken":
cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
proxy: http://127.0.0.1:8999
"broken_timeout":
cmd: models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
proxy: http://127.0.0.1:9000

View File

@@ -187,26 +187,28 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
if err != nil {
return err
}
ctx, cancel := context.WithTimeout(req.Context(), 250*time.Millisecond)
defer cancel()
req = req.WithContext(ctx)
resp, err := client.Do(req)
if err != nil {
if strings.Contains(err.Error(), "connection refused") {
// if the TCP dial is still refused (no HTTP response at all) after
// 5 seconds, exit quickly.
if time.Since(startTime) > 5*time.Second {
return fmt.Errorf("health check endpoint took more than 5 seconds to respond")
}
}
if time.Since(startTime) >= maxDuration {
return fmt.Errorf("failed to check health from: %s", healthURL)
}
// wait a bit longer for TCP connection issues
if strings.Contains(err.Error(), "connection refused") {
fmt.Fprintf(pm.logMonitor, "Connection refused on %s\n", healthURL)
time.Sleep(5 * time.Second)
} else {
time.Sleep(time.Second)
}
continue
}
defer resp.Body.Close()
if resp.StatusCode == http.StatusOK {
return nil
@@ -214,6 +216,7 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
if time.Since(startTime) >= maxDuration {
return fmt.Errorf("failed to check health from: %s", healthURL)
}
time.Sleep(time.Second)
}
}