improve llama-swap upstream process recovery and restarts (#155)

Refactor internal upstream process life cycle management to recover better from unexpected situations. With this change llama-swap should never need to be restarted due to a crashed upstream child process. The `StateFailed` state was removed in favour of always trying to start/restart a process.
2025-06-05 16:24:55 -07:00
parent a84098d3b4
commit 2dc0ca0663
4 changed files with 232 additions and 182 deletions
@@ -0,0 +1,91 @@
+package main
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"os/exec"
+	"os/signal"
+	"syscall"
+	"time"
+)
+
+/*
+**
+Test how exec.Cmd.CommandContext behaves under certain conditions:*
+
+  - process is killed externally, what happens with cmd.Wait() *
+    ✔︎ it returns. catches crashes.*
+
+  - process ignores SIGTERM*
+    ✔︎ `kill()` is called after cmd.WaitDelay*
+
+  - this process exits, what happens with children (kill -9 <this process' pid>)*
+    x they stick around. have to be manually killed.*
+
+  - .WithTimeout()'s cancel is called *
+    ✔︎ process is killed after it ignores sigterm, cmd.Wait() catches it.*
+
+  - parent receives SIGINT/SIGTERM, what happens
+    ✔︎ waits for child process to exit, then exits gracefully.
+*/
+func main() {
+
+	// swap between these to use kill -9 <pid> on the cli to sim external crash
+	ctx, cancel := context.WithCancel(context.Background())
+	//ctx, cancel := context.WithTimeout(context.Background(), 1000*time.Millisecond)
+	defer cancel()
+
+	//cmd := exec.CommandContext(ctx, "sleep", "1")
+	cmd := exec.CommandContext(ctx,
+		"../../build/simple-responder_darwin_arm64",
+		//"-ignore-sig-term", /* so it doesn't exit on receiving SIGTERM, test cmd.WaitTimeout */
+	)
+	cmd.Stdin = os.Stdin
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+
+	// set a wait delay before signing sig kill
+	cmd.WaitDelay = 500 * time.Millisecond
+	cmd.Cancel = func() error {
+		fmt.Println("✔︎ Cancel() called, sending SIGTERM")
+		cmd.Process.Signal(syscall.SIGTERM)
+
+		//return nil
+
+		// this error is returned by cmd.Wait(), and can be used to
+		// single an error when the process couldn't be normally terminated
+		// but since a SIGTERM is sent, it's probably ok to return a nil
+		// as WaitDelay timing out will override the any error set here.
+		//
+		// test by enabling/disabling -ignore-sig-term on the process
+		// with -ignore-sig-term enabled, cmd.Wait() will have "signal: killed"
+		// without it, it will show the "new error from cancel"
+		return errors.New("error from cmd.Cancel()") // sets error returned by cmd.Wait()
+	}
+
+	if err := cmd.Start(); err != nil {
+		fmt.Println("Error starting process:", err)
+		return
+	}
+
+	// catch signals. Calls cancel() which will cause cmd.Wait() to return and
+	// this program to eventually exit gracefully.
+	sigChan := make(chan os.Signal, 1)
+	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
+	go func() {
+		signal := <-sigChan
+		fmt.Printf("✔︎ Received signal: %d, Killing process... with cancel before exiting\n", signal)
+		cancel()
+	}()
+
+	fmt.Printf("✔︎ Parent Pid: %d, Process Pid: %d\n", os.Getpid(), cmd.Process.Pid)
+	fmt.Println("✔︎ Process started, cmd.Wait() ... ")
+	if err := cmd.Wait(); err != nil {
+		fmt.Println("✔︎ cmd.Wait returned, Error:", err)
+	} else {
+		fmt.Println("✔︎ cmd.Wait returned, Process exited on its own")
+	}
+	fmt.Println("✔︎ Child process exited, Done.")
+}
@@ -24,9 +24,6 @@ const (
 	StateReady    ProcessState = ProcessState("ready")
 	StateStopping ProcessState = ProcessState("stopping")

-	// failed a health check on start and will not be recovered
-	StateFailed ProcessState = ProcessState("failed")
-
 	// process is shutdown and will not be restarted
 	StateShutdown ProcessState = ProcessState("shutdown")
 )
@@ -43,8 +40,11 @@ type Process struct {
 	config ModelConfig
 	cmd    *exec.Cmd

-	// for p.cmd.Wait() select { ... }
-	cmdWaitChan chan error
+	// PR #155 called to cancel the upstream process
+	cancelUpstream context.CancelFunc
+
+	// closed when command exits
+	cmdWaitChan chan struct{}

 	processLogger *LogMonitor
 	proxyLogger   *LogMonitor
@@ -62,22 +62,17 @@ type Process struct {
 	// used to block on multiple start() calls
 	waitStarting sync.WaitGroup

-	// for managing shutdown state
-	shutdownCtx    context.Context
-	shutdownCancel context.CancelFunc
-
 	// for managing concurrency limits
 	concurrencyLimitSemaphore chan struct{}

-	// stop timeout waiting for graceful shutdown
+	// used for testing to override the default value
 	gracefulStopTimeout time.Duration

-	// track that this happened
-	upstreamWasStoppedWithKill bool
+	// track the number of failed starts
+	failedStartCount int
 }

 func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLogger *LogMonitor, proxyLogger *LogMonitor) *Process {
-	ctx, cancel := context.WithCancel(context.Background())
 	concurrentLimit := 10
 	if config.ConcurrencyLimit > 0 {
 		concurrentLimit = config.ConcurrencyLimit
@@ -87,21 +82,20 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLo
 		ID:                      ID,
 		config:                  config,
 		cmd:                     nil,
-		cmdWaitChan:             make(chan error, 1),
+		cancelUpstream:          nil,
 		processLogger:           processLogger,
 		proxyLogger:             proxyLogger,
 		healthCheckTimeout:      healthCheckTimeout,
 		healthCheckLoopInterval: 5 * time.Second, /* default, can not be set by user - used for testing */
 		state:                   StateStopped,
-		shutdownCtx:             ctx,
-		shutdownCancel:          cancel,

 		// concurrency limit
 		concurrencyLimitSemaphore: make(chan struct{}, concurrentLimit),

+		// To be removed when migration over exec.CommandContext is complete
 		// stop timeout
 		gracefulStopTimeout: 10 * time.Second,
-		upstreamWasStoppedWithKill: false,
+		cmdWaitChan:         make(chan struct{}),
 	}
 }

@@ -143,13 +137,11 @@ func isValidTransition(from, to ProcessState) bool {
 	case StateStopped:
 		return to == StateStarting
 	case StateStarting:
-		return to == StateReady || to == StateFailed || to == StateStopping
+		return to == StateReady || to == StateStopping || to == StateStopped
 	case StateReady:
 		return to == StateStopping
 	case StateStopping:
 		return to == StateStopped || to == StateShutdown
-	case StateFailed:
-		return to == StateStopping
 	case StateShutdown:
 		return false // No transitions allowed from these states
 	}
@@ -197,17 +189,24 @@ func (p *Process) start() error {

 	p.waitStarting.Add(1)
 	defer p.waitStarting.Done()
-
-	p.cmd = exec.Command(args[0], args[1:]...)
+	cmdContext, ctxCancelUpstream := context.WithCancel(context.Background())
+	p.cmd = exec.CommandContext(cmdContext, args[0], args[1:]...)
 	p.cmd.Stdout = p.processLogger
 	p.cmd.Stderr = p.processLogger
 	p.cmd.Env = p.config.Env

+	p.cmd.Cancel = p.cmdStopUpstreamProcess
+	p.cmd.WaitDelay = p.gracefulStopTimeout
+	p.cancelUpstream = ctxCancelUpstream
+	p.cmdWaitChan = make(chan struct{})
+
+	p.failedStartCount++ // this will be reset to zero when the process has successfully started
 	err = p.cmd.Start()

 	// Set process state to failed
 	if err != nil {
-		if curState, swapErr := p.swapState(StateStarting, StateFailed); swapErr != nil {
+		if curState, swapErr := p.swapState(StateStarting, StateStopped); swapErr != nil {
+			p.state = StateStopped // force it into a stopped state
 			return fmt.Errorf(
 				"failed to start command and state swap failed. command error: %v, current state: %v, state swap error: %v",
 				err, curState, swapErr,
@@ -217,20 +216,7 @@ func (p *Process) start() error {
 	}

 	// Capture the exit error for later signalling
-	go func() {
-		exitErr := p.cmd.Wait()
-		p.proxyLogger.Debugf("<%s> cmd.Wait() returned error: %v", p.ID, exitErr)
-
-		// there is a race condition when SIGKILL is used, p.cmd.Wait() returns, and then
-		// the code below fires, putting an error into cmdWaitChan. This code is to prevent this
-		if p.upstreamWasStoppedWithKill {
-			p.proxyLogger.Debugf("<%s> process was killed, NOT sending exitErr: %v", p.ID, exitErr)
-			p.upstreamWasStoppedWithKill = false
-			return
-		}
-
-		p.cmdWaitChan <- exitErr
-	}()
+	go p.waitForCmd()

 	// One of three things can happen at this stage:
 	// 1. The command exits unexpectedly
@@ -257,56 +243,32 @@ func (p *Process) start() error {
 			return fmt.Errorf("failed to create health check URL proxy=%s and checkEndpoint=%s", proxyTo, checkEndpoint)
 		}

-		checkDeadline, cancelHealthCheck := context.WithDeadline(
-			context.Background(),
-			checkStartTime.Add(maxDuration),
-		)
-		defer cancelHealthCheck()
-
-	loop:
 		// Ready Check loop
 		for {
-			select {
-			case <-checkDeadline.Done():
-				if curState, err := p.swapState(StateStarting, StateFailed); err != nil {
-					return fmt.Errorf("health check timed out after %vs AND state swap failed: %v, current state: %v", maxDuration.Seconds(), err, curState)
-				} else {
-					return fmt.Errorf("health check timed out after %vs", maxDuration.Seconds())
-				}
-			case <-p.shutdownCtx.Done():
-				return errors.New("health check interrupted due to shutdown")
-			case exitErr := <-p.cmdWaitChan:
-				if exitErr != nil {
-					p.proxyLogger.Warnf("<%s> upstream command exited prematurely with error: %v", p.ID, exitErr)
-					if curState, err := p.swapState(StateStarting, StateFailed); err != nil {
-						return fmt.Errorf("upstream command exited unexpectedly: %s AND state swap failed: %v, current state: %v", exitErr.Error(), err, curState)
-					} else {
-						return fmt.Errorf("upstream command exited unexpectedly: %s", exitErr.Error())
-					}
-				} else {
-					p.proxyLogger.Warnf("<%s> upstream command exited prematurely but successfully", p.ID)
-					if curState, err := p.swapState(StateStarting, StateFailed); err != nil {
-						return fmt.Errorf("upstream command exited prematurely but successfully AND state swap failed: %v, current state: %v", err, curState)
-					} else {
+			currentState := p.CurrentState()
+			if currentState != StateStarting {
+				if currentState == StateStopped {
 					return fmt.Errorf("upstream command exited prematurely but successfully")
 				}
+				return errors.New("health check interrupted due to shutdown")
 			}
-			default:
+
+			if time.Since(checkStartTime) > maxDuration {
+				p.stopCommand()
+				return fmt.Errorf("health check timed out after %vs", maxDuration.Seconds())
+			}
+
 			if err := p.checkHealthEndpoint(healthURL); err == nil {
 				p.proxyLogger.Infof("<%s> Health check passed on %s", p.ID, healthURL)
-					cancelHealthCheck()
-					break loop
+				break
 			} else {
 				if strings.Contains(err.Error(), "connection refused") {
-						endTime, _ := checkDeadline.Deadline()
-						ttl := time.Until(endTime)
+					ttl := time.Until(checkStartTime.Add(maxDuration))
 					p.proxyLogger.Debugf("<%s> Connection refused on %s, giving up in %.0fs (normal during startup)", p.ID, healthURL, ttl.Seconds())
 				} else {
 					p.proxyLogger.Debugf("<%s> Health check error on %s, %v (normal during startup)", p.ID, healthURL, err)
 				}
 			}
-			}
-
 			<-time.After(p.healthCheckLoopInterval)
 		}
 	}
@@ -337,6 +299,7 @@ func (p *Process) start() error {
 	if curState, err := p.swapState(StateStarting, StateReady); err != nil {
 		return fmt.Errorf("failed to set Process state to ready: current state: %v, error: %v", curState, err)
 	} else {
+		p.failedStartCount = 0
 		return nil
 	}
 }
@@ -361,26 +324,12 @@ func (p *Process) StopImmediately() {
 	}

 	p.proxyLogger.Debugf("<%s> Stopping process, current state: %s", p.ID, p.CurrentState())
-	currentState := p.CurrentState()
-
-	if currentState == StateFailed {
-		if curState, err := p.swapState(StateFailed, StateStopping); err != nil {
-			p.proxyLogger.Infof("<%s> Stop() Failed -> StateStopping err: %v, current state: %v", p.ID, err, curState)
-			return
-		}
-	} else {
 	if curState, err := p.swapState(StateReady, StateStopping); err != nil {
 		p.proxyLogger.Infof("<%s> Stop() Ready -> StateStopping err: %v, current state: %v", p.ID, err, curState)
 		return
 	}
-	}

-	// stop the process with a graceful exit timeout
-	p.stopCommand(p.gracefulStopTimeout)
-
-	if curState, err := p.swapState(StateStopping, StateStopped); err != nil {
-		p.proxyLogger.Infof("<%s> Stop() StateStopping -> StateStopped err: %v, current state: %v", p.ID, err, curState)
-	}
+	p.stopCommand()
 }

 // Shutdown is called when llama-swap is shutting down. It will give a little bit
@@ -392,91 +341,26 @@ func (p *Process) Shutdown() {
 		return
 	}

-	p.shutdownCancel()
-	p.stopCommand(p.gracefulStopTimeout)
-
+	p.stopCommand()
 	// just force it to this state since there is no recovery from shutdown
 	p.state = StateShutdown
 }

 // stopCommand will send a SIGTERM to the process and wait for it to exit.
 // If it does not exit within 5 seconds, it will send a SIGKILL.
-func (p *Process) stopCommand(sigtermTTL time.Duration) {
+func (p *Process) stopCommand() {
 	stopStartTime := time.Now()
 	defer func() {
 		p.proxyLogger.Debugf("<%s> stopCommand took %v", p.ID, time.Since(stopStartTime))
 	}()

-	sigtermTimeout, cancelTimeout := context.WithTimeout(context.Background(), sigtermTTL)
-	defer cancelTimeout()
-
-	if p.cmd == nil || p.cmd.Process == nil {
-		p.proxyLogger.Debugf("<%s> cmd or cmd.Process is nil (normal during config reload)", p.ID)
+	if p.cancelUpstream == nil {
+		p.proxyLogger.Errorf("<%s> stopCommand has a nil p.cancelUpstream()", p.ID)
 		return
 	}

-	// if err := p.terminateProcess(); err != nil {
-	// 	p.proxyLogger.Debugf("<%s> Process already terminated: %v (normal during shutdown)", p.ID, err)
-	// }
-	// the default cmdStop to taskkill /f /t /pid ${PID}
-	if runtime.GOOS == "windows" && strings.TrimSpace(p.config.CmdStop) == "" {
-		p.config.CmdStop = "taskkill /f /t /pid ${PID}"
-	}
-
-	if p.config.CmdStop != "" {
-		// replace ${PID} with the pid of the process
-		stopArgs, err := SanitizeCommand(strings.ReplaceAll(p.config.CmdStop, "${PID}", fmt.Sprintf("%d", p.cmd.Process.Pid)))
-		if err != nil {
-			p.proxyLogger.Errorf("<%s> Failed to sanitize stop command: %v", p.ID, err)
-			return
-		}
-
-		p.proxyLogger.Debugf("<%s> Executing stop command: %s", p.ID, strings.Join(stopArgs, " "))
-
-		stopCmd := exec.Command(stopArgs[0], stopArgs[1:]...)
-		stopCmd.Stdout = p.processLogger
-		stopCmd.Stderr = p.processLogger
-		stopCmd.Env = p.config.Env
-
-		if err := stopCmd.Run(); err != nil {
-			p.proxyLogger.Errorf("<%s> Failed to exec stop command: %v", p.ID, err)
-			return
-		}
-	} else {
-		if err := p.cmd.Process.Signal(syscall.SIGTERM); err != nil {
-			p.proxyLogger.Errorf("<%s> Failed to send SIGTERM to process: %v", p.ID, err)
-			return
-		}
-	}
-
-	select {
-	case <-sigtermTimeout.Done():
-		p.proxyLogger.Debugf("<%s> Process timed out waiting to stop, sending KILL signal (normal during shutdown)", p.ID)
-		p.upstreamWasStoppedWithKill = true
-		if err := p.cmd.Process.Kill(); err != nil {
-			p.proxyLogger.Errorf("<%s> Failed to kill process: %v", p.ID, err)
-		}
-	case err := <-p.cmdWaitChan:
-		// Note: in start(), p.cmdWaitChan also has a select { ... }. That should be OK
-		// because if we make it here then the cmd has been successfully running and made it
-		// through the health check. There is a possibility that the cmd crashed after the health check
-		// succeeded but that's not a case llama-swap is handling for now.
-		if err != nil {
-			if errno, ok := err.(syscall.Errno); ok {
-				p.proxyLogger.Errorf("<%s> errno >> %v", p.ID, errno)
-			} else if exitError, ok := err.(*exec.ExitError); ok {
-				if strings.Contains(exitError.String(), "signal: terminated") {
-					p.proxyLogger.Debugf("<%s> Process stopped OK", p.ID)
-				} else if strings.Contains(exitError.String(), "signal: interrupt") {
-					p.proxyLogger.Debugf("<%s> Process interrupted OK", p.ID)
-				} else {
-					p.proxyLogger.Warnf("<%s> ExitError >> %v, exit code: %d", p.ID, exitError, exitError.ExitCode())
-				}
-			} else {
-				p.proxyLogger.Errorf("<%s> Process exited >> %v", p.ID, err)
-			}
-		}
-	}
+	p.cancelUpstream()
+	<-p.cmdWaitChan
 }

 func (p *Process) checkHealthEndpoint(healthURL string) error {
@@ -509,7 +393,7 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {

 	// prevent new requests from being made while stopping or irrecoverable
 	currentState := p.CurrentState()
-	if currentState == StateFailed || currentState == StateShutdown || currentState == StateStopping {
+	if currentState == StateShutdown || currentState == StateStopping {
 		http.Error(w, fmt.Sprintf("Process can not ProxyRequest, state is %s", currentState), http.StatusServiceUnavailable)
 		return
 	}
@@ -591,3 +475,84 @@ func (p *Process) ProxyRequest(w http.ResponseWriter, r *http.Request) {
 	p.proxyLogger.Debugf("<%s> request %s - start: %v, total: %v",
 		p.ID, r.RequestURI, startDuration, totalTime)
 }
+
+// waitForCmd waits for the command to exit and handles exit conditions depending on current state
+func (p *Process) waitForCmd() {
+	exitErr := p.cmd.Wait()
+	p.proxyLogger.Debugf("<%s> cmd.Wait() returned error: %v", p.ID, exitErr)
+
+	if exitErr != nil {
+		if errno, ok := exitErr.(syscall.Errno); ok {
+			p.proxyLogger.Errorf("<%s> errno >> %v", p.ID, errno)
+		} else if exitError, ok := exitErr.(*exec.ExitError); ok {
+			if strings.Contains(exitError.String(), "signal: terminated") {
+				p.proxyLogger.Debugf("<%s> Process stopped OK", p.ID)
+			} else if strings.Contains(exitError.String(), "signal: interrupt") {
+				p.proxyLogger.Debugf("<%s> Process interrupted OK", p.ID)
+			} else {
+				p.proxyLogger.Warnf("<%s> ExitError >> %v, exit code: %d", p.ID, exitError, exitError.ExitCode())
+			}
+		} else {
+			if exitErr.Error() != "context canceled" /* this is normal */ {
+				p.proxyLogger.Errorf("<%s> Process exited >> %v", p.ID, exitErr)
+			}
+		}
+	}
+
+	currentState := p.CurrentState()
+	switch currentState {
+	case StateStopping:
+		if curState, err := p.swapState(StateStopping, StateStopped); err != nil {
+			p.proxyLogger.Errorf("<%s> Process exited but could not swap to StateStopped. curState=%s, err: %v", p.ID, curState, err)
+			p.state = StateStopped
+		}
+	default:
+		p.proxyLogger.Infof("<%s> process exited but not StateStopping, current state: %s", p.ID, currentState)
+		p.state = StateStopped // force it to be in this state
+	}
+	close(p.cmdWaitChan)
+}
+
+// cmdStopUpstreamProcess attemps to stop the upstream process gracefully
+func (p *Process) cmdStopUpstreamProcess() error {
+	p.processLogger.Debugf("<%s> cmdStopUpstreamProcess() initiating graceful stop of upstream process", p.ID)
+
+	// this should never happen ...
+	if p.cmd == nil || p.cmd.Process == nil {
+		p.proxyLogger.Debugf("<%s> cmd or cmd.Process is nil (normal during config reload)", p.ID)
+		return fmt.Errorf("<%s> process is nil or cmd is nil, skipping graceful stop", p.ID)
+	}
+
+	// the default cmdStop to taskkill /f /t /pid ${PID}
+	if runtime.GOOS == "windows" && strings.TrimSpace(p.config.CmdStop) == "" {
+		p.config.CmdStop = "taskkill /f /t /pid ${PID}"
+	}
+
+	if p.config.CmdStop != "" {
+		// replace ${PID} with the pid of the process
+		stopArgs, err := SanitizeCommand(strings.ReplaceAll(p.config.CmdStop, "${PID}", fmt.Sprintf("%d", p.cmd.Process.Pid)))
+		if err != nil {
+			p.proxyLogger.Errorf("<%s> Failed to sanitize stop command: %v", p.ID, err)
+			return err
+		}
+
+		p.proxyLogger.Debugf("<%s> Executing stop command: %s", p.ID, strings.Join(stopArgs, " "))
+
+		stopCmd := exec.Command(stopArgs[0], stopArgs[1:]...)
+		stopCmd.Stdout = p.processLogger
+		stopCmd.Stderr = p.processLogger
+		stopCmd.Env = p.config.Env
+
+		if err := stopCmd.Run(); err != nil {
+			p.proxyLogger.Errorf("<%s> Failed to exec stop command: %v", p.ID, err)
+			return err
+		}
+	} else {
+		if err := p.cmd.Process.Signal(syscall.SIGTERM); err != nil {
+			p.proxyLogger.Errorf("<%s> Failed to send SIGTERM to process: %v", p.ID, err)
+			return err
+		}
+	}
+
+	return nil
+}
@@ -106,8 +106,8 @@ func TestProcess_BrokenModelConfig(t *testing.T) {

 	w = httptest.NewRecorder()
 	process.ProxyRequest(w, req)
-	assert.Equal(t, http.StatusServiceUnavailable, w.Code)
-	assert.Contains(t, w.Body.String(), "Process can not ProxyRequest, state is failed")
+	assert.Equal(t, http.StatusBadGateway, w.Code)
+	assert.Contains(t, w.Body.String(), "start() failed: ")
 }

 func TestProcess_UnloadAfterTTL(t *testing.T) {
@@ -248,18 +248,14 @@ func TestProcess_SwapState(t *testing.T) {
 	}{
 		{"Stopped to Starting", StateStopped, StateStopped, StateStarting, nil, StateStarting},
 		{"Starting to Ready", StateStarting, StateStarting, StateReady, nil, StateReady},
-		{"Starting to Failed", StateStarting, StateStarting, StateFailed, nil, StateFailed},
 		{"Starting to Stopping", StateStarting, StateStarting, StateStopping, nil, StateStopping},
+		{"Starting to Stopped", StateStarting, StateStarting, StateStopped, nil, StateStopped},
 		{"Ready to Stopping", StateReady, StateReady, StateStopping, nil, StateStopping},
 		{"Stopping to Stopped", StateStopping, StateStopping, StateStopped, nil, StateStopped},
 		{"Stopping to Shutdown", StateStopping, StateStopping, StateShutdown, nil, StateShutdown},
 		{"Stopped to Ready", StateStopped, StateStopped, StateReady, ErrInvalidStateTransition, StateStopped},
-		{"Starting to Stopped", StateStarting, StateStarting, StateStopped, ErrInvalidStateTransition, StateStarting},
 		{"Ready to Starting", StateReady, StateReady, StateStarting, ErrInvalidStateTransition, StateReady},
-		{"Ready to Failed", StateReady, StateReady, StateFailed, ErrInvalidStateTransition, StateReady},
 		{"Stopping to Ready", StateStopping, StateStopping, StateReady, ErrInvalidStateTransition, StateStopping},
-		{"Failed to Stopped", StateFailed, StateFailed, StateStopped, ErrInvalidStateTransition, StateFailed},
-		{"Failed to Starting", StateFailed, StateFailed, StateStarting, ErrInvalidStateTransition, StateFailed},
 		{"Shutdown to Stopped", StateShutdown, StateShutdown, StateStopped, ErrInvalidStateTransition, StateShutdown},
 		{"Shutdown to Starting", StateShutdown, StateShutdown, StateStarting, ErrInvalidStateTransition, StateShutdown},
 		{"Expected state mismatch", StateStopped, StateStarting, StateStarting, ErrExpectedStateMismatch, StateStopped},
@@ -339,7 +335,7 @@ func TestProcess_ExitInterruptsHealthCheck(t *testing.T) {
 	process.healthCheckLoopInterval = time.Second // make it faster
 	err := process.start()
 	assert.Equal(t, "upstream command exited prematurely but successfully", err.Error())
-	assert.Equal(t, process.CurrentState(), StateFailed)
+	assert.Equal(t, process.CurrentState(), StateStopped)
 }

 func TestProcess_ConcurrencyLimit(t *testing.T) {
@@ -348,8 +348,6 @@ func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
 					stateStr = "Starting"
 				case StateStopping:
 					stateStr = "Stopping"
-				case StateFailed:
-					stateStr = "Failed"
 				case StateShutdown:
 					stateStr = "Shutdown"
 				case StateStopped: