Revise health check logic so that "connection refused" no longer returns an error after 5 seconds; wait and retry instead
This commit is contained in:
@@ -35,7 +35,10 @@ models:
|
|||||||
# until the upstream server is ready for traffic
|
# until the upstream server is ready for traffic
|
||||||
checkEndpoint: none
|
checkEndpoint: none
|
||||||
|
|
||||||
# don't use this, just for testing if things are broken
|
# don't use these, just for testing if things are broken
|
||||||
"broken":
|
"broken":
|
||||||
cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
|
cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
|
||||||
proxy: http://127.0.0.1:8999
|
proxy: http://127.0.0.1:8999
|
||||||
|
"broken_timeout":
|
||||||
|
cmd: models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
|
||||||
|
proxy: http://127.0.0.1:9000
|
||||||
@@ -187,26 +187,28 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(req.Context(), 250*time.Millisecond)
|
ctx, cancel := context.WithTimeout(req.Context(), 250*time.Millisecond)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
req = req.WithContext(ctx)
|
req = req.WithContext(ctx)
|
||||||
resp, err := client.Do(req)
|
resp, err := client.Do(req)
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if strings.Contains(err.Error(), "connection refused") {
|
|
||||||
|
|
||||||
// if TCP dial can't connect any HTTP response after 5 seconds
|
|
||||||
// exit quickly.
|
|
||||||
if time.Since(startTime) > 5*time.Second {
|
|
||||||
return fmt.Errorf("health check endpoint took more than 5 seconds to respond")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if time.Since(startTime) >= maxDuration {
|
if time.Since(startTime) >= maxDuration {
|
||||||
return fmt.Errorf("failed to check health from: %s", healthURL)
|
return fmt.Errorf("failed to check health from: %s", healthURL)
|
||||||
}
|
}
|
||||||
time.Sleep(time.Second)
|
|
||||||
|
// wait a bit longer for TCP connection issues
|
||||||
|
if strings.Contains(err.Error(), "connection refused") {
|
||||||
|
fmt.Fprintf(pm.logMonitor, "Connection refused on %s\n", healthURL)
|
||||||
|
time.Sleep(5 * time.Second)
|
||||||
|
} else {
|
||||||
|
time.Sleep(time.Second)
|
||||||
|
}
|
||||||
|
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
if resp.StatusCode == http.StatusOK {
|
if resp.StatusCode == http.StatusOK {
|
||||||
return nil
|
return nil
|
||||||
@@ -214,6 +216,7 @@ func (pm *ProxyManager) checkHealthEndpoint() error {
|
|||||||
if time.Since(startTime) >= maxDuration {
|
if time.Since(startTime) >= maxDuration {
|
||||||
return fmt.Errorf("failed to check health from: %s", healthURL)
|
return fmt.Errorf("failed to check health from: %s", healthURL)
|
||||||
}
|
}
|
||||||
|
|
||||||
time.Sleep(time.Second)
|
time.Sleep(time.Second)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user