improve llama-swap upstream process recovery and restarts (#155)
Refactor internal upstream process life cycle management to recover better from unexpected situations. With this change llama-swap should never need to be restarted due to a crashed upstream child process. The `StateFailed` state was removed in favour of always trying to start/restart a process.
This commit is contained in:
91
misc/process-cmd-test/main.go
Normal file
91
misc/process-cmd-test/main.go
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
package main
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"errors"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"os/exec"
|
||||||
|
"os/signal"
|
||||||
|
"syscall"
|
||||||
|
"time"
|
||||||
|
)
|
||||||
|
|
||||||
|
/*
|
||||||
|
**
|
||||||
|
Test how exec.CommandContext behaves under certain conditions:*
|
||||||
|
|
||||||
|
- process is killed externally, what happens with cmd.Wait() *
|
||||||
|
✔︎ it returns. catches crashes.*
|
||||||
|
|
||||||
|
- process ignores SIGTERM*
|
||||||
|
✔︎ `kill()` is called after cmd.WaitDelay*
|
||||||
|
|
||||||
|
- this process exits, what happens with children (kill -9 <this process' pid>)*
|
||||||
|
x they stick around. have to be manually killed.*
|
||||||
|
|
||||||
|
- .WithTimeout()'s cancel is called *
|
||||||
|
✔︎ process is killed after it ignores sigterm, cmd.Wait() catches it.*
|
||||||
|
|
||||||
|
- parent receives SIGINT/SIGTERM, what happens
|
||||||
|
✔︎ waits for child process to exit, then exits gracefully.
|
||||||
|
*/
|
||||||
|
// main starts a child process bound to a cancellable context, forwards
// SIGINT/SIGTERM to it by cancelling that context, and prints what
// cmd.Wait() returns so the behaviours listed in the comment at the top of
// this file can be observed by hand.
func main() {

	// swap between these to use kill -9 <pid> on the cli to sim external crash
	ctx, cancel := context.WithCancel(context.Background())
	//ctx, cancel := context.WithTimeout(context.Background(), 1000*time.Millisecond)
	defer cancel()

	//cmd := exec.CommandContext(ctx, "sleep", "1")
	cmd := exec.CommandContext(ctx,
		"../../build/simple-responder_darwin_arm64",
		//"-ignore-sig-term", /* so it doesn't exit on receiving SIGTERM, test cmd.WaitDelay */
	)
	cmd.Stdin = os.Stdin
	cmd.Stdout = os.Stdout
	cmd.Stderr = os.Stderr

	// set a wait delay before sending SIGKILL
	cmd.WaitDelay = 500 * time.Millisecond
	cmd.Cancel = func() error {
		fmt.Println("✔︎ Cancel() called, sending SIGTERM")
		// Report a failed signal delivery instead of silently dropping it; the
		// process will still be killed once WaitDelay elapses.
		if err := cmd.Process.Signal(syscall.SIGTERM); err != nil {
			fmt.Println("x failed to send SIGTERM:", err)
		}

		//return nil

		// this error is returned by cmd.Wait(), and can be used to
		// signal an error when the process couldn't be normally terminated
		// but since a SIGTERM is sent, it's probably ok to return a nil
		// as WaitDelay timing out will override any error set here.
		//
		// test by enabling/disabling -ignore-sig-term on the process
		// with -ignore-sig-term enabled, cmd.Wait() will have "signal: killed"
		// without it, it will show the "new error from cancel"
		return errors.New("error from cmd.Cancel()") // sets error returned by cmd.Wait()
	}

	if err := cmd.Start(); err != nil {
		fmt.Println("Error starting process:", err)
		return
	}

	// catch signals. Calls cancel() which will cause cmd.Wait() to return and
	// this program to eventually exit gracefully.
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
	go func() {
		// named sig so the os/signal package is not shadowed
		sig := <-sigChan
		fmt.Printf("✔︎ Received signal: %d, Killing process... with cancel before exiting\n", sig)
		cancel()
	}()

	fmt.Printf("✔︎ Parent Pid: %d, Process Pid: %d\n", os.Getpid(), cmd.Process.Pid)
	fmt.Println("✔︎ Process started, cmd.Wait() ... ")
	if err := cmd.Wait(); err != nil {
		fmt.Println("✔︎ cmd.Wait returned, Error:", err)
	} else {
		fmt.Println("✔︎ cmd.Wait returned, Process exited on its own")
	}
	fmt.Println("✔︎ Child process exited, Done.")
}
|
||||||
309
proxy/process.go
309
proxy/process.go
@@ -24,9 +24,6 @@ const (
|
|||||||
StateReady ProcessState = ProcessState("ready")
|
StateReady ProcessState = ProcessState("ready")
|
||||||
StateStopping ProcessState = ProcessState("stopping")
|
StateStopping ProcessState = ProcessState("stopping")
|
||||||
|
|
||||||
// failed a health check on start and will not be recovered
|
|
||||||
StateFailed ProcessState = ProcessState("failed")
|
|
||||||
|
|
||||||
// process is shutdown and will not be restarted
|
// process is shutdown and will not be restarted
|
||||||
StateShutdown ProcessState = ProcessState("shutdown")
|
StateShutdown ProcessState = ProcessState("shutdown")
|
||||||
)
|
)
|
||||||
@@ -43,8 +40,11 @@ type Process struct {
|
|||||||
config ModelConfig
|
config ModelConfig
|
||||||
cmd *exec.Cmd
|
cmd *exec.Cmd
|
||||||
|
|
||||||
// for p.cmd.Wait() select { ... }
|
// PR #155 called to cancel the upstream process
|
||||||
cmdWaitChan chan error
|
cancelUpstream context.CancelFunc
|
||||||
|
|
||||||
|
// closed when command exits
|
||||||
|
cmdWaitChan chan struct{}
|
||||||
|
|
||||||
processLogger *LogMonitor
|
processLogger *LogMonitor
|
||||||
proxyLogger *LogMonitor
|
proxyLogger *LogMonitor
|
||||||
@@ -62,22 +62,17 @@ type Process struct {
|
|||||||
// used to block on multiple start() calls
|
// used to block on multiple start() calls
|
||||||
waitStarting sync.WaitGroup
|
waitStarting sync.WaitGroup
|
||||||
|
|
||||||
// for managing shutdown state
|
|
||||||
shutdownCtx context.Context
|
|
||||||
shutdownCancel context.CancelFunc
|
|
||||||
|
|
||||||
// for managing concurrency limits
|
// for managing concurrency limits
|
||||||
concurrencyLimitSemaphore chan struct{}
|
concurrencyLimitSemaphore chan struct{}
|
||||||
|
|
||||||
// stop timeout waiting for graceful shutdown
|
// used for testing to override the default value
|
||||||
gracefulStopTimeout time.Duration
|
gracefulStopTimeout time.Duration
|
||||||
|
|
||||||
// track that this happened
|
// track the number of failed starts
|
||||||
upstreamWasStoppedWithKill bool
|
failedStartCount int
|
||||||
}
|
}
|
||||||
|
|
||||||
func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLogger *LogMonitor, proxyLogger *LogMonitor) *Process {
|
func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLogger *LogMonitor, proxyLogger *LogMonitor) *Process {
|
||||||
ctx, cancel := context.WithCancel(context.Background())
|
|
||||||
concurrentLimit := 10
|
concurrentLimit := 10
|
||||||
if config.ConcurrencyLimit > 0 {
|
if config.ConcurrencyLimit > 0 {
|
||||||
concurrentLimit = config.ConcurrencyLimit
|
concurrentLimit = config.ConcurrencyLimit
|
||||||
@@ -87,21 +82,20 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
|
|||||||
ID: ID,
|
ID: ID,
|
||||||
config: config,
|
config: config,
|
||||||
cmd: nil,
|
cmd: nil,
|
||||||
cmdWaitChan: make(chan error, 1),
|
cancelUpstream: nil,
|
||||||
processLogger: processLogger,
|
processLogger: processLogger,
|
||||||
proxyLogger: proxyLogger,
|
proxyLogger: proxyLogger,
|
||||||
healthCheckTimeout: healthCheckTimeout,
|
healthCheckTimeout: healthCheckTimeout,
|
||||||
healthCheckLoopInterval: 5 * time.Second, /* default, can not be set by user - used for testing */
|
healthCheckLoopInterval: 5 * time.Second, /* default, can not be set by user - used for testing */
|
||||||
state: StateStopped,
|
state: StateStopped,
|
||||||
shutdownCtx: ctx,
|
|
||||||
shutdownCancel: cancel,
|
|
||||||
|
|
||||||
// concurrency limit
|
// concurrency limit
|
||||||
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
|
concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),
|
||||||
|
|
||||||
|
// To be removed when migration over exec.CommandContext is complete
|
||||||
// stop timeout
|
// stop timeout
|
||||||
gracefulStopTimeout: 10 * time.Second,
|
gracefulStopTimeout: 10 * time.Second,
|
||||||
upstreamWasStoppedWithKill: false,
|
cmdWaitChan: make(chan struct{}),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -143,13 +137,11 @@ func isValidTransition(from, to ProcessState) bool {
|
|||||||
case StateStopped:
|
case StateStopped:
|
||||||
return to == StateStarting
|
return to == StateStarting
|
||||||
case StateStarting:
|
case StateStarting:
|
||||||
return to == StateReady || to == StateFailed || to == StateStopping
|
return to == StateReady || to == StateStopping || to == StateStopped
|
||||||
case StateReady:
|
case StateReady:
|
||||||
return to == StateStopping
|
return to == StateStopping
|
||||||
case StateStopping:
|
case StateStopping:
|
||||||
return to == StateStopped || to == StateShutdown
|
return to == StateStopped || to == StateShutdown
|
||||||
case StateFailed:
|
|
||||||
return to == StateStopping
|
|
||||||
case StateShutdown:
|
case StateShutdown:
|
||||||
return false // No transitions allowed from these states
|
return false // No transitions allowed from these states
|
||||||
}
|
}
|
||||||
@@ -197,17 +189,24 @@ func (p *Process) start() error {
|
|||||||
|
|
||||||
p.waitStarting.Add(1)
|
p.waitStarting.Add(1)
|
||||||
defer p.waitStarting.Done()
|
defer p.waitStarting.Done()
|
||||||
|
cmdContext, ctxCancelUpstream := context.WithCancel(context.Background())
|
||||||
p.cmd = exec.Command(args[0], args[1:]...)
|
p.cmd = exec.CommandContext(cmdContext, args[0], args[1:]...)
|
||||||
p.cmd.Stdout = p.processLogger
|
p.cmd.Stdout = p.processLogger
|
||||||
p.cmd.Stderr = p.processLogger
|
p.cmd.Stderr = p.processLogger
|
||||||
p.cmd.Env = p.config.Env
|
p.cmd.Env = p.config.Env
|
||||||
|
|
||||||
|
p.cmd.Cancel = p.cmdStopUpstreamProcess
|
||||||
|
p.cmd.WaitDelay = p.gracefulStopTimeout
|
||||||
|
p.cancelUpstream = ctxCancelUpstream
|
||||||
|
p.cmdWaitChan = make(chan struct{})
|
||||||
|
|
||||||
|
p.failedStartCount++ // this will be reset to zero when the process has successfully started
|
||||||
err = p.cmd.Start()
|
err = p.cmd.Start()
|
||||||
|
|
||||||
// Set process state to failed
|
// Start failed: put the process back into the stopped state so a later start can be attempted
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if curState, swapErr := p.swapState(StateStarting, StateFailed); swapErr != nil {
|
if curState, swapErr := p.swapState(StateStarting, StateStopped); swapErr != nil {
|
||||||
|
p.state = StateStopped // force it into a stopped state
|
||||||
return fmt.Errorf(
|
return fmt.Errorf(
|
||||||
"failed to start command and state swap failed. command error: %v, current state: %v, state swap error: %v",
|
"failed to start command and state swap failed. command error: %v, current state: %v, state swap error: %v",
|
||||||
err, curState, swapErr,
|
err, curState, swapErr,
|
||||||
@@ -217,20 +216,7 @@ func (p *Process) start() error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Capture the exit error for later signalling
|
// Capture the exit error for later signalling
|
||||||
go func() {
|
go p.waitForCmd()
|
||||||
exitErr := p.cmd.Wait()
|
|
||||||
p.proxyLogger.Debugf("<%s> cmd.Wait() returned error: %v", p.ID, exitErr)
|
|
||||||
|
|
||||||
// there is a race condition when SIGKILL is used, p.cmd.Wait() returns, and then
|
|
||||||
// the code below fires, putting an error into cmdWaitChan. This code is to prevent this
|
|
||||||
if p.upstreamWasStoppedWithKill {
|
|
||||||
p.proxyLogger.Debugf("<%s> process was killed, NOT sending exitErr: %v", p.ID, exitErr)
|
|
||||||
p.upstreamWasStoppedWithKill = false
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
p.cmdWaitChan <- exitErr
|
|
||||||
}()
|
|
||||||
|
|
||||||
// One of three things can happen at this stage:
|
// One of three things can happen at this stage:
|
||||||
// 1. The command exits unexpectedly
|
// 1. The command exits unexpectedly
|
||||||
@@ -257,56 +243,32 @@ func (p *Process) start() error {
|
|||||||
return fmt.Errorf("failed to create health check URL proxy=%s and checkEndpoint=%s", proxyTo, checkEndpoint)
|
return fmt.Errorf("failed to create health check URL proxy=%s and checkEndpoint=%s", proxyTo, checkEndpoint)
|
||||||
}
|
}
|
||||||
|
|
||||||
checkDeadline, cancelHealthCheck := context.WithDeadline(
|
|
||||||
context.Background(),
|
|
||||||
checkStartTime.Add(maxDuration),
|
|
||||||
)
|
|
||||||
defer cancelHealthCheck()
|
|
||||||
|
|
||||||
loop:
|
|
||||||
// Ready Check loop
|
// Ready Check loop
|
||||||
for {
|
for {
|
||||||
select {
|
currentState := p.CurrentState()
|
||||||
case <-checkDeadline.Done():
|
if currentState != StateStarting {
|
||||||
if curState, err := p.swapState(StateStarting, StateFailed); err != nil {
|
if currentState == StateStopped {
|
||||||
return fmt.Errorf("health check timed out after %vs AND state swap failed: %v, current state: %v", maxDuration.Seconds(), err, curState)
|
return fmt.Errorf("upstream command exited prematurely but successfully")
|
||||||
} else {
|
|
||||||
return fmt.Errorf("health check timed out after %vs", maxDuration.Seconds())
|
|
||||||
}
|
}
|
||||||
case <-p.shutdownCtx.Done():
|
|
||||||
return errors.New("health check interrupted due to shutdown")
|
return errors.New("health check interrupted due to shutdown")
|
||||||
case exitErr := <-p.cmdWaitChan:
|
|
||||||
if exitErr != nil {
|
|
||||||
p.proxyLogger.Warnf("<%s> upstream command exited prematurely with error: %v", p.ID, exitErr)
|
|
||||||
if curState, err := p.swapState(StateStarting, StateFailed); err != nil {
|
|
||||||
return fmt.Errorf("upstream command exited unexpectedly: %s AND state swap failed: %v, current state: %v", exitErr.Error(), err, curState)
|
|
||||||
} else {
|
|
||||||
return fmt.Errorf("upstream command exited unexpectedly: %s", exitErr.Error())
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
p.proxyLogger.Warnf("<%s> upstream command exited prematurely but successfully", p.ID)
|
|
||||||
if curState, err := p.swapState(StateStarting, StateFailed); err != nil {
|
|
||||||
return fmt.Errorf("upstream command exited prematurely but successfully AND state swap failed: %v, current state: %v", err, curState)
|
|
||||||
} else {
|
|
||||||
return fmt.Errorf("upstream command exited prematurely but successfully")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
if err := p.checkHealthEndpoint(healthURL); err == nil {
|
|
||||||
p.proxyLogger.Infof("<%s> Health check passed on %s", p.ID, healthURL)
|
|
||||||
cancelHealthCheck()
|
|
||||||
break loop
|
|
||||||
} else {
|
|
||||||
if strings.Contains(err.Error(), "connection refused") {
|
|
||||||
endTime, _ := checkDeadline.Deadline()
|
|
||||||
ttl := time.Until(endTime)
|
|
||||||
p.proxyLogger.Debugf("<%s> Connection refused on %s, giving up in %.0fs (normal during startup)", p.ID, healthURL, ttl.Seconds())
|
|
||||||
} else {
|
|
||||||
p.proxyLogger.Debugf("<%s> Health check error on %s, %v (normal during startup)", p.ID, healthURL, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if time.Since(checkStartTime) > maxDuration {
|
||||||
|
p.stopCommand()
|
||||||
|
return fmt.Errorf("health check timed out after %vs", maxDuration.Seconds())
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := p.checkHealthEndpoint(healthURL); err == nil {
|
||||||
|
p.proxyLogger.Infof("<%s> Health check passed on %s", p.ID, healthURL)
|
||||||
|
break
|
||||||
|
} else {
|
||||||
|
if strings.Contains(err.Error(), "connection refused") {
|
||||||
|
ttl := time.Until(checkStartTime.Add(maxDuration))
|
||||||
|
p.proxyLogger.Debugf("<%s> Connection refused on %s, giving up in %.0fs (normal during startup)", p.ID, healthURL, ttl.Seconds())
|
||||||
|
} else {
|
||||||
|
p.proxyLogger.Debugf("<%s> Health check error on %s, %v (normal during startup)", p.ID, healthURL, err)
|
||||||
|
}
|
||||||
|
}
|
||||||
<-time.After(p.healthCheckLoopInterval)
|
<-time.After(p.healthCheckLoopInterval)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -337,6 +299,7 @@ func (p *Process) start() error {
|
|||||||
if curState, err := p.swapState(StateStarting, StateReady); err != nil {
|
if curState, err := p.swapState(StateStarting, StateReady); err != nil {
|
||||||
return fmt.Errorf("failed to set Process state to ready: current state: %v, error: %v", curState, err)
|
return fmt.Errorf("failed to set Process state to ready: current state: %v, error: %v", curState, err)
|
||||||
} else {
|
} else {
|
||||||
|
p.failedStartCount = 0
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -361,26 +324,12 @@ func (p *Process) StopImmediately() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
p.proxyLogger.Debugf("<%s> Stopping process, current state: %s", p.ID, p.CurrentState())
|
p.proxyLogger.Debugf("<%s> Stopping process, current state: %s", p.ID, p.CurrentState())
|
||||||
currentState := p.CurrentState()
|
if curState, err := p.swapState(StateReady, StateStopping); err != nil {
|
||||||
|
p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
|
||||||
if currentState == StateFailed {
|
return
|
||||||
if curState, err := p.swapState(StateFailed, StateStopping); err != nil {
|
|
||||||
p.proxyLogger.Infof("<%s> Stop() Failed -> StateStopping err: %v, current state: %v", p.ID, err, curState)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if curState, err := p.swapState(StateReady, StateStopping); err != nil {
|
|
||||||
p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// stop the process with a graceful exit timeout
|
p.stopCommand()
|
||||||
p.stopCommand(p.gracefulStopTimeout)
|
|
||||||
|
|
||||||
if curState, err := p.swapState(StateStopping, StateStopped); err != nil {
|
|
||||||
p.proxyLogger.Infof("<%s> Stop() StateStopping -> StateStopped err: %v, current state: %v", p.ID, err, curState)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Shutdown is called when llama-swap is shutting down. It will give a little bit
|
// Shutdown is called when llama-swap is shutting down. It will give a little bit
|
||||||
@@ -392,91 +341,26 @@ func (p *Process) Shutdown() {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
p.shutdownCancel()
|
p.stopCommand()
|
||||||
p.stopCommand(p.gracefulStopTimeout)
|
|
||||||
|
|
||||||
// just force it to this state since there is no recovery from shutdown
|
// just force it to this state since there is no recovery from shutdown
|
||||||
p.state = StateShutdown
|
p.state = StateShutdown
|
||||||
}
|
}
|
||||||
|
|
||||||
// stopCommand will send a SIGTERM to the process and wait for it to exit.
|
// stopCommand will send a SIGTERM to the process and wait for it to exit.
|
||||||
// If it does not exit within 5 seconds, it will send a SIGKILL.
|
// If it does not exit within p.gracefulStopTimeout (set as cmd.WaitDelay), it will be sent a SIGKILL.
|
||||||
func (p *Process) stopCommand(sigtermTTL time.Duration) {
|
func (p *Process) stopCommand() {
|
||||||
stopStartTime := time.Now()
|
stopStartTime := time.Now()
|
||||||
defer func() {
|
defer func() {
|
||||||
p.proxyLogger.Debugf("<%s> stopCommand took %v", p.ID, time.Since(stopStartTime))
|
p.proxyLogger.Debugf("<%s> stopCommand took %v", p.ID, time.Since(stopStartTime))
|
||||||
}()
|
}()
|
||||||
|
|
||||||
sigtermTimeout, cancelTimeout := context.WithTimeout(context.Background(), sigtermTTL)
|
if p.cancelUpstream == nil {
|
||||||
defer cancelTimeout()
|
p.proxyLogger.Errorf("<%s> stopCommand has a nil p.cancelUpstream()", p.ID)
|
||||||
|
|
||||||
if p.cmd == nil || p.cmd.Process == nil {
|
|
||||||
p.proxyLogger.Debugf("<%s> cmd or cmd.Process is nil (normal during config reload)", p.ID)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// if err := p.terminateProcess(); err != nil {
|
p.cancelUpstream()
|
||||||
// p.proxyLogger.Debugf("<%s> Process already terminated: %v (normal during shutdown)", p.ID, err)
|
<-p.cmdWaitChan
|
||||||
// }
|
|
||||||
// the default cmdStop to taskkill /f /t /pid ${PID}
|
|
||||||
if runtime.GOOS == "windows" && strings.TrimSpace(p.config.CmdStop) == "" {
|
|
||||||
p.config.CmdStop = "taskkill /f /t /pid ${PID}"
|
|
||||||
}
|
|
||||||
|
|
||||||
if p.config.CmdStop != "" {
|
|
||||||
// replace ${PID} with the pid of the process
|
|
||||||
stopArgs, err := SanitizeCommand(strings.ReplaceAll(p.config.CmdStop, "${PID}", fmt.Sprintf("%d", p.cmd.Process.Pid)))
|
|
||||||
if err != nil {
|
|
||||||
p.proxyLogger.Errorf("<%s> Failed to sanitize stop command: %v", p.ID, err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
p.proxyLogger.Debugf("<%s> Executing stop command: %s", p.ID, strings.Join(stopArgs, " "))
|
|
||||||
|
|
||||||
stopCmd := exec.Command(stopArgs[0], stopArgs[1:]...)
|
|
||||||
stopCmd.Stdout = p.processLogger
|
|
||||||
stopCmd.Stderr = p.processLogger
|
|
||||||
stopCmd.Env = p.config.Env
|
|
||||||
|
|
||||||
if err := stopCmd.Run(); err != nil {
|
|
||||||
p.proxyLogger.Errorf("<%s> Failed to exec stop command: %v", p.ID, err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if err := p.cmd.Process.Signal(syscall.SIGTERM); err != nil {
|
|
||||||
p.proxyLogger.Errorf("<%s> Failed to send SIGTERM to process: %v", p.ID, err)
|
|
||||||
return
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
select {
|
|
||||||
case <-sigtermTimeout.Done():
|
|
||||||
p.proxyLogger.Debugf("<%s> Process timed out waiting to stop, sending KILL signal (normal during shutdown)", p.ID)
|
|
||||||
p.upstreamWasStoppedWithKill = true
|
|
||||||
if err := p.cmd.Process.Kill(); err != nil {
|
|
||||||
p.proxyLogger.Errorf("<%s> Failed to kill process: %v", p.ID, err)
|
|
||||||
}
|
|
||||||
case err := <-p.cmdWaitChan:
|
|
||||||
// Note: in start(), p.cmdWaitChan also has a select { ... }. That should be OK
|
|
||||||
// because if we make it here then the cmd has been successfully running and made it
|
|
||||||
// through the health check. There is a possibility that the cmd crashed after the health check
|
|
||||||
// succeeded but that's not a case llama-swap is handling for now.
|
|
||||||
if err != nil {
|
|
||||||
if errno, ok := err.(syscall.Errno); ok {
|
|
||||||
p.proxyLogger.Errorf("<%s> errno >> %v", p.ID, errno)
|
|
||||||
} else if exitError, ok := err.(*exec.ExitError); ok {
|
|
||||||
if strings.Contains(exitError.String(), "signal: terminated") {
|
|
||||||
p.proxyLogger.Debugf("<%s> Process stopped OK", p.ID)
|
|
||||||
} else if strings.Contains(exitError.String(), "signal: interrupt") {
|
|
||||||
p.proxyLogger.Debugf("<%s> Process interrupted OK", p.ID)
|
|
||||||
} else {
|
|
||||||
p.proxyLogger.Warnf("<%s> ExitError >> %v, exit code: %d", p.ID, exitError, exitError.ExitCode())
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
p.proxyLogger.Errorf("<%s> Process exited >> %v", p.ID, err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (p *Process) checkHealthEndpoint(healthURL string) error {
|
func (p *Process) checkHealthEndpoint(healthURL string) error {
|
||||||
@@ -509,7 +393,7 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
|||||||
|
|
||||||
// prevent new requests from being made while stopping or irrecoverable
|
// prevent new requests from being made while stopping or irrecoverable
|
||||||
currentState := p.CurrentState()
|
currentState := p.CurrentState()
|
||||||
if currentState == StateFailed || currentState == StateShutdown || currentState == StateStopping {
|
if currentState == StateShutdown || currentState == StateStopping {
|
||||||
http.Error(w, fmt.Sprintf("Process can not ProxyRequest, state is %s", currentState), http.StatusServiceUnavailable)
|
http.Error(w, fmt.Sprintf("Process can not ProxyRequest, state is %s", currentState), http.StatusServiceUnavailable)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -591,3 +475,84 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
|
|||||||
p.proxyLogger.Debugf("<%s> request %s - start: %v, total: %v",
|
p.proxyLogger.Debugf("<%s> request %s - start: %v, total: %v",
|
||||||
p.ID, r.RequestURI, startDuration, totalTime)
|
p.ID, r.RequestURI, startDuration, totalTime)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// waitForCmd waits for the command to exit and handles exit conditions depending on current state
|
||||||
|
func (p *Process) waitForCmd() {
|
||||||
|
exitErr := p.cmd.Wait()
|
||||||
|
p.proxyLogger.Debugf("<%s> cmd.Wait() returned error: %v", p.ID, exitErr)
|
||||||
|
|
||||||
|
if exitErr != nil {
|
||||||
|
if errno, ok := exitErr.(syscall.Errno); ok {
|
||||||
|
p.proxyLogger.Errorf("<%s> errno >> %v", p.ID, errno)
|
||||||
|
} else if exitError, ok := exitErr.(*exec.ExitError); ok {
|
||||||
|
if strings.Contains(exitError.String(), "signal: terminated") {
|
||||||
|
p.proxyLogger.Debugf("<%s> Process stopped OK", p.ID)
|
||||||
|
} else if strings.Contains(exitError.String(), "signal: interrupt") {
|
||||||
|
p.proxyLogger.Debugf("<%s> Process interrupted OK", p.ID)
|
||||||
|
} else {
|
||||||
|
p.proxyLogger.Warnf("<%s> ExitError >> %v, exit code: %d", p.ID, exitError, exitError.ExitCode())
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if exitErr.Error() != "context canceled" /* this is normal */ {
|
||||||
|
p.proxyLogger.Errorf("<%s> Process exited >> %v", p.ID, exitErr)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
currentState := p.CurrentState()
|
||||||
|
switch currentState {
|
||||||
|
case StateStopping:
|
||||||
|
if curState, err := p.swapState(StateStopping, StateStopped); err != nil {
|
||||||
|
p.proxyLogger.Errorf("<%s> Process exited but could not swap to StateStopped. curState=%s, err: %v", p.ID, curState, err)
|
||||||
|
p.state = StateStopped
|
||||||
|
}
|
||||||
|
default:
|
||||||
|
p.proxyLogger.Infof("<%s> process exited but not StateStopping, current state: %s", p.ID, currentState)
|
||||||
|
p.state = StateStopped // force it to be in this state
|
||||||
|
}
|
||||||
|
close(p.cmdWaitChan)
|
||||||
|
}
|
||||||
|
|
||||||
|
// cmdStopUpstreamProcess attemps to stop the upstream process gracefully
|
||||||
|
func (p *Process) cmdStopUpstreamProcess() error {
|
||||||
|
p.processLogger.Debugf("<%s> cmdStopUpstreamProcess() initiating graceful stop of upstream process", p.ID)
|
||||||
|
|
||||||
|
// this should never happen ...
|
||||||
|
if p.cmd == nil || p.cmd.Process == nil {
|
||||||
|
p.proxyLogger.Debugf("<%s> cmd or cmd.Process is nil (normal during config reload)", p.ID)
|
||||||
|
return fmt.Errorf("<%s> process is nil or cmd is nil, skipping graceful stop", p.ID)
|
||||||
|
}
|
||||||
|
|
||||||
|
// the default cmdStop to taskkill /f /t /pid ${PID}
|
||||||
|
if runtime.GOOS == "windows" && strings.TrimSpace(p.config.CmdStop) == "" {
|
||||||
|
p.config.CmdStop = "taskkill /f /t /pid ${PID}"
|
||||||
|
}
|
||||||
|
|
||||||
|
if p.config.CmdStop != "" {
|
||||||
|
// replace ${PID} with the pid of the process
|
||||||
|
stopArgs, err := SanitizeCommand(strings.ReplaceAll(p.config.CmdStop, "${PID}", fmt.Sprintf("%d", p.cmd.Process.Pid)))
|
||||||
|
if err != nil {
|
||||||
|
p.proxyLogger.Errorf("<%s> Failed to sanitize stop command: %v", p.ID, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
p.proxyLogger.Debugf("<%s> Executing stop command: %s", p.ID, strings.Join(stopArgs, " "))
|
||||||
|
|
||||||
|
stopCmd := exec.Command(stopArgs[0], stopArgs[1:]...)
|
||||||
|
stopCmd.Stdout = p.processLogger
|
||||||
|
stopCmd.Stderr = p.processLogger
|
||||||
|
stopCmd.Env = p.config.Env
|
||||||
|
|
||||||
|
if err := stopCmd.Run(); err != nil {
|
||||||
|
p.proxyLogger.Errorf("<%s> Failed to exec stop command: %v", p.ID, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if err := p.cmd.Process.Signal(syscall.SIGTERM); err != nil {
|
||||||
|
p.proxyLogger.Errorf("<%s> Failed to send SIGTERM to process: %v", p.ID, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|||||||
@@ -106,8 +106,8 @@ func TestProcess_BrokenModelConfig(t *testing.T) {
|
|||||||
|
|
||||||
w = httptest.NewRecorder()
|
w = httptest.NewRecorder()
|
||||||
process.ProxyRequest(w, req)
|
process.ProxyRequest(w, req)
|
||||||
assert.Equal(t, http.StatusServiceUnavailable, w.Code)
|
assert.Equal(t, http.StatusBadGateway, w.Code)
|
||||||
assert.Contains(t, w.Body.String(), "Process can not ProxyRequest, state is failed")
|
assert.Contains(t, w.Body.String(), "start() failed: ")
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestProcess_UnloadAfterTTL(t *testing.T) {
|
func TestProcess_UnloadAfterTTL(t *testing.T) {
|
||||||
@@ -248,18 +248,14 @@ func TestProcess_SwapState(t *testing.T) {
|
|||||||
}{
|
}{
|
||||||
{"Stopped to Starting", StateStopped, StateStopped, StateStarting, nil, StateStarting},
|
{"Stopped to Starting", StateStopped, StateStopped, StateStarting, nil, StateStarting},
|
||||||
{"Starting to Ready", StateStarting, StateStarting, StateReady, nil, StateReady},
|
{"Starting to Ready", StateStarting, StateStarting, StateReady, nil, StateReady},
|
||||||
{"Starting to Failed", StateStarting, StateStarting, StateFailed, nil, StateFailed},
|
|
||||||
{"Starting to Stopping", StateStarting, StateStarting, StateStopping, nil, StateStopping},
|
{"Starting to Stopping", StateStarting, StateStarting, StateStopping, nil, StateStopping},
|
||||||
|
{"Starting to Stopped", StateStarting, StateStarting, StateStopped, nil, StateStopped},
|
||||||
{"Ready to Stopping", StateReady, StateReady, StateStopping, nil, StateStopping},
|
{"Ready to Stopping", StateReady, StateReady, StateStopping, nil, StateStopping},
|
||||||
{"Stopping to Stopped", StateStopping, StateStopping, StateStopped, nil, StateStopped},
|
{"Stopping to Stopped", StateStopping, StateStopping, StateStopped, nil, StateStopped},
|
||||||
{"Stopping to Shutdown", StateStopping, StateStopping, StateShutdown, nil, StateShutdown},
|
{"Stopping to Shutdown", StateStopping, StateStopping, StateShutdown, nil, StateShutdown},
|
||||||
{"Stopped to Ready", StateStopped, StateStopped, StateReady, ErrInvalidStateTransition, StateStopped},
|
{"Stopped to Ready", StateStopped, StateStopped, StateReady, ErrInvalidStateTransition, StateStopped},
|
||||||
{"Starting to Stopped", StateStarting, StateStarting, StateStopped, ErrInvalidStateTransition, StateStarting},
|
|
||||||
{"Ready to Starting", StateReady, StateReady, StateStarting, ErrInvalidStateTransition, StateReady},
|
{"Ready to Starting", StateReady, StateReady, StateStarting, ErrInvalidStateTransition, StateReady},
|
||||||
{"Ready to Failed", StateReady, StateReady, StateFailed, ErrInvalidStateTransition, StateReady},
|
|
||||||
{"Stopping to Ready", StateStopping, StateStopping, StateReady, ErrInvalidStateTransition, StateStopping},
|
{"Stopping to Ready", StateStopping, StateStopping, StateReady, ErrInvalidStateTransition, StateStopping},
|
||||||
{"Failed to Stopped", StateFailed, StateFailed, StateStopped, ErrInvalidStateTransition, StateFailed},
|
|
||||||
{"Failed to Starting", StateFailed, StateFailed, StateStarting, ErrInvalidStateTransition, StateFailed},
|
|
||||||
{"Shutdown to Stopped", StateShutdown, StateShutdown, StateStopped, ErrInvalidStateTransition, StateShutdown},
|
{"Shutdown to Stopped", StateShutdown, StateShutdown, StateStopped, ErrInvalidStateTransition, StateShutdown},
|
||||||
{"Shutdown to Starting", StateShutdown, StateShutdown, StateStarting, ErrInvalidStateTransition, StateShutdown},
|
{"Shutdown to Starting", StateShutdown, StateShutdown, StateStarting, ErrInvalidStateTransition, StateShutdown},
|
||||||
{"Expected state mismatch", StateStopped, StateStarting, StateStarting, ErrExpectedStateMismatch, StateStopped},
|
{"Expected state mismatch", StateStopped, StateStarting, StateStarting, ErrExpectedStateMismatch, StateStopped},
|
||||||
@@ -339,7 +335,7 @@ func TestProcess_ExitInterruptsHealthCheck(t *testing.T) {
|
|||||||
process.healthCheckLoopInterval = time.Second // make it faster
|
process.healthCheckLoopInterval = time.Second // make it faster
|
||||||
err := process.start()
|
err := process.start()
|
||||||
assert.Equal(t, "upstream command exited prematurely but successfully", err.Error())
|
assert.Equal(t, "upstream command exited prematurely but successfully", err.Error())
|
||||||
assert.Equal(t, process.CurrentState(), StateFailed)
|
assert.Equal(t, process.CurrentState(), StateStopped)
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestProcess_ConcurrencyLimit(t *testing.T) {
|
func TestProcess_ConcurrencyLimit(t *testing.T) {
|
||||||
|
|||||||
@@ -348,8 +348,6 @@ func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
|
|||||||
stateStr = "Starting"
|
stateStr = "Starting"
|
||||||
case StateStopping:
|
case StateStopping:
|
||||||
stateStr = "Stopping"
|
stateStr = "Stopping"
|
||||||
case StateFailed:
|
|
||||||
stateStr = "Failed"
|
|
||||||
case StateShutdown:
|
case StateShutdown:
|
||||||
stateStr = "Shutdown"
|
stateStr = "Shutdown"
|
||||||
case StateStopped:
|
case StateStopped:
|
||||||
|
|||||||
Reference in New Issue
Block a user