Improve process start and stop reliability (#38)
Refactor Process.start()/Stop() logic (#38) - remove cmd.Wait() call in start(). This seems to conflict with the one in .Stop(). Removing it eliminated no child errors - eliminate goroutines in .start() as it no longer required
This commit is contained in:
@@ -2,7 +2,6 @@ package proxy
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"errors"
|
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"net/http"
|
"net/http"
|
||||||
@@ -86,36 +85,11 @@ func (p *Process) start() error {
|
|||||||
// 3. The health check passes
|
// 3. The health check passes
|
||||||
//
|
//
|
||||||
// only in the third case will the process be considered Ready to accept
|
// only in the third case will the process be considered Ready to accept
|
||||||
healthCheckContext, cancelHealthCheck := context.WithCancelCause(context.Background())
|
<-time.After(250 * time.Millisecond) // give process a bit of time to start
|
||||||
defer cancelHealthCheck(nil) // clean up
|
|
||||||
cmdWaitChan := make(chan error, 1)
|
|
||||||
healthCheckChan := make(chan error, 1)
|
|
||||||
|
|
||||||
go func() {
|
if err := p.checkHealthEndpoint(); err != nil {
|
||||||
// possible cmd exits early
|
|
||||||
cmdWaitChan <- p.cmd.Wait()
|
|
||||||
}()
|
|
||||||
|
|
||||||
go func() {
|
|
||||||
<-time.After(250 * time.Millisecond) // give process a bit of time to start
|
|
||||||
healthCheckChan <- p.checkHealthEndpoint(healthCheckContext)
|
|
||||||
}()
|
|
||||||
|
|
||||||
select {
|
|
||||||
case err := <-cmdWaitChan:
|
|
||||||
p.state = StateFailed
|
p.state = StateFailed
|
||||||
if err != nil {
|
|
||||||
err = fmt.Errorf("command [%s] %s", strings.Join(p.cmd.Args, " "), err.Error())
|
|
||||||
} else {
|
|
||||||
err = fmt.Errorf("command [%s] exited unexpected", strings.Join(p.cmd.Args, " "))
|
|
||||||
}
|
|
||||||
cancelHealthCheck(err)
|
|
||||||
return err
|
return err
|
||||||
case err := <-healthCheckChan:
|
|
||||||
if err != nil {
|
|
||||||
p.state = StateFailed
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if p.config.UnloadAfter > 0 {
|
if p.config.UnloadAfter > 0 {
|
||||||
@@ -176,15 +150,23 @@ func (p *Process) Stop() {
|
|||||||
|
|
||||||
select {
|
select {
|
||||||
case <-sigtermTimeout.Done():
|
case <-sigtermTimeout.Done():
|
||||||
fmt.Fprintf(p.logMonitor, "XXX Process for %s timed out waiting to stop, sending SIGKILL to PID: %d\n", p.ID, p.cmd.Process.Pid)
|
fmt.Fprintf(p.logMonitor, "!!! process [%s] timed out waiting to stop, sending KILL signal\n", p.ID)
|
||||||
p.cmd.Process.Kill()
|
p.cmd.Process.Kill()
|
||||||
p.cmd.Wait()
|
|
||||||
case err := <-sigtermNormal:
|
case err := <-sigtermNormal:
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if err.Error() != "wait: no child processes" {
|
if errno, ok := err.(syscall.Errno); ok {
|
||||||
// possible that simple-responder for testing is just not
|
fmt.Fprintf(p.logMonitor, "!!! process [%s] errno >> %v\n", p.ID, errno)
|
||||||
// existing right, so suppress those errors.
|
} else if exitError, ok := err.(*exec.ExitError); ok {
|
||||||
fmt.Fprintf(p.logMonitor, "!!! process for %s stopped with error > %v\n", p.ID, err)
|
if strings.Contains(exitError.String(), "signal: terminated") {
|
||||||
|
fmt.Fprintf(p.logMonitor, "!!! process [%s] stopped OK\n", p.ID)
|
||||||
|
} else if strings.Contains(exitError.String(), "signal: interrupt") {
|
||||||
|
fmt.Fprintf(p.logMonitor, "!!! process [%s] interrupted OK\n", p.ID)
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(p.logMonitor, "!!! process [%s] ExitError >> %v, exit code: %d\n", p.ID, exitError, exitError.ExitCode())
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
fmt.Fprintf(p.logMonitor, "!!! process [%s] exited >> %v\n", p.ID, err)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -198,7 +180,7 @@ func (p *Process) CurrentState() ProcessState {
|
|||||||
return p.state
|
return p.state
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Process) checkHealthEndpoint(ctxFromStart context.Context) error {
|
func (p *Process) checkHealthEndpoint() error {
|
||||||
if p.config.Proxy == "" {
|
if p.config.Proxy == "" {
|
||||||
return fmt.Errorf("no upstream available to check /health")
|
return fmt.Errorf("no upstream available to check /health")
|
||||||
}
|
}
|
||||||
@@ -230,24 +212,11 @@ func (p *Process) checkHealthEndpoint(ctxFromStart context.Context) error {
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(ctxFromStart, time.Second)
|
|
||||||
defer cancel()
|
|
||||||
req = req.WithContext(ctx)
|
|
||||||
resp, err := client.Do(req)
|
resp, err := client.Do(req)
|
||||||
|
|
||||||
ttl := (maxDuration - time.Since(startTime)).Seconds()
|
ttl := (maxDuration - time.Since(startTime)).Seconds()
|
||||||
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// check if the context was cancelled
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
err := context.Cause(ctx)
|
|
||||||
if !errors.Is(err, context.DeadlineExceeded) {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
}
|
|
||||||
|
|
||||||
// wait a bit longer for TCP connection issues
|
// wait a bit longer for TCP connection issues
|
||||||
if strings.Contains(err.Error(), "connection refused") {
|
if strings.Contains(err.Error(), "connection refused") {
|
||||||
fmt.Fprintf(p.logMonitor, "Connection refused on %s, ttl %.0fs\n", healthURL, ttl)
|
fmt.Fprintf(p.logMonitor, "Connection refused on %s, ttl %.0fs\n", healthURL, ttl)
|
||||||
|
|||||||
Reference in New Issue
Block a user