Improve process start and stop reliability (#38)

Refactor Process.start()/Stop() logic (#38)
- remove cmd.Wait() call in start(). This seems to conflict with the one
  in .Stop(). Removing it eliminated no child errors
- eliminate goroutines in .start() as it no longer required
This commit is contained in:
Benson Wong
2025-02-03 11:50:38 -08:00
committed by GitHub
parent 314d2f2212
commit 85cd74a51c

View File

@@ -2,7 +2,6 @@ package proxy
import ( import (
"context" "context"
"errors"
"fmt" "fmt"
"io" "io"
"net/http" "net/http"
@@ -86,36 +85,11 @@ func (p *Process) start() error {
// 3. The health check passes // 3. The health check passes
// //
// only in the third case will the process be considered Ready to accept // only in the third case will the process be considered Ready to accept
healthCheckContext, cancelHealthCheck := context.WithCancelCause(context.Background()) <-time.After(250 * time.Millisecond) // give process a bit of time to start
defer cancelHealthCheck(nil) // clean up
cmdWaitChan := make(chan error, 1)
healthCheckChan := make(chan error, 1)
go func() { if err := p.checkHealthEndpoint(); err != nil {
// possible cmd exits early
cmdWaitChan <- p.cmd.Wait()
}()
go func() {
<-time.After(250 * time.Millisecond) // give process a bit of time to start
healthCheckChan <- p.checkHealthEndpoint(healthCheckContext)
}()
select {
case err := <-cmdWaitChan:
p.state = StateFailed p.state = StateFailed
if err != nil {
err = fmt.Errorf("command [%s] %s", strings.Join(p.cmd.Args, " "), err.Error())
} else {
err = fmt.Errorf("command [%s] exited unexpected", strings.Join(p.cmd.Args, " "))
}
cancelHealthCheck(err)
return err return err
case err := <-healthCheckChan:
if err != nil {
p.state = StateFailed
return err
}
} }
if p.config.UnloadAfter > 0 { if p.config.UnloadAfter > 0 {
@@ -176,15 +150,23 @@ func (p *Process) Stop() {
select { select {
case <-sigtermTimeout.Done(): case <-sigtermTimeout.Done():
fmt.Fprintf(p.logMonitor, "XXX Process for %s timed out waiting to stop, sending SIGKILL to PID: %d\n", p.ID, p.cmd.Process.Pid) fmt.Fprintf(p.logMonitor, "!!! process [%s] timed out waiting to stop, sending KILL signal\n", p.ID)
p.cmd.Process.Kill() p.cmd.Process.Kill()
p.cmd.Wait()
case err := <-sigtermNormal: case err := <-sigtermNormal:
if err != nil { if err != nil {
if err.Error() != "wait: no child processes" { if errno, ok := err.(syscall.Errno); ok {
// possible that simple-responder for testing is just not fmt.Fprintf(p.logMonitor, "!!! process [%s] errno >> %v\n", p.ID, errno)
// existing right, so suppress those errors. } else if exitError, ok := err.(*exec.ExitError); ok {
fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err) if strings.Contains(exitError.String(), "signal: terminated") {
fmt.Fprintf(p.logMonitor, "!!! process [%s] stopped OK\n", p.ID)
} else if strings.Contains(exitError.String(), "signal: interrupt") {
fmt.Fprintf(p.logMonitor, "!!! process [%s] interrupted OK\n", p.ID)
} else {
fmt.Fprintf(p.logMonitor, "!!! process [%s] ExitError >> %v, exit code: %d\n", p.ID, exitError, exitError.ExitCode())
}
} else {
fmt.Fprintf(p.logMonitor, "!!! process [%s] exited >> %v\n", p.ID, err)
} }
} }
} }
@@ -198,7 +180,7 @@ func (p *Process) CurrentState() ProcessState {
return p.state return p.state
} }
func (p *Process) checkHealthEndpoint(ctxFromStart context.Context) error { func (p *Process) checkHealthEndpoint() error {
if p.config.Proxy == "" { if p.config.Proxy == "" {
return fmt.Errorf("no upstream available to check /health") return fmt.Errorf("no upstream available to check /health")
} }
@@ -230,24 +212,11 @@ func (p *Process) checkHealthEndpoint(ctxFromStart context.Context) error {
return err return err
} }
ctx, cancel := context.WithTimeout(ctxFromStart, time.Second)
defer cancel()
req = req.WithContext(ctx)
resp, err := client.Do(req) resp, err := client.Do(req)
ttl := (maxDuration - time.Since(startTime)).Seconds() ttl := (maxDuration - time.Since(startTime)).Seconds()
if err != nil { if err != nil {
// check if the context was cancelled
select {
case <-ctx.Done():
err := context.Cause(ctx)
if !errors.Is(err, context.DeadlineExceeded) {
return err
}
default:
}
// wait a bit longer for TCP connection issues // wait a bit longer for TCP connection issues
if strings.Contains(err.Error(), "connection refused") { if strings.Contains(err.Error(), "connection refused") {
fmt.Fprintf(p.logMonitor, "Connection refused on %s, ttl %.0fs\n", healthURL, ttl) fmt.Fprintf(p.logMonitor, "Connection refused on %s, ttl %.0fs\n", healthURL, ttl)