Stream loading state when swapping models (#371)

Swapping models can take a long time and leave the client staring at silence while the new model loads. Rather than loading the model silently in the background, this PR lets llama-swap send status updates in the reasoning_content of a streaming chat response.

Fixes: #366
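
For context, the status updates ride inside ordinary chat.completion.chunk deltas, so any OpenAI-compatible streaming client can display them while the model loads. The sketch below shows the general shape of such a chunk; the field layout follows the OpenAI-style streaming format, and the status text and model name are illustrative assumptions, not the PR's literal output.

// Illustrative sketch only: the rough shape of a streaming status chunk whose
// reasoning_content carries a loading-status message. The status text and
// model name are assumptions, not values taken from this PR.
package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	chunk := map[string]any{
		"object": "chat.completion.chunk",
		"model":  "my-model", // hypothetical model name
		"choices": []map[string]any{{
			"index": 0,
			"delta": map[string]any{
				"reasoning_content": "llama-swap: loading model, please wait...",
			},
		}},
	}
	b, _ := json.Marshal(chunk)
	// One SSE event line as a streaming chat client would receive it.
	fmt.Printf("data: %s\n\n", b)
}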
Benson Wong, 2025-10-29 00:09:39 -07:00 (committed by GitHub)
parent f852689104
commit a89b803d4a
8 changed files with 375 additions and 51 deletions


@@ -25,6 +25,8 @@ const (
 	PROFILE_SPLIT_CHAR = ":"
 )
 
+type proxyCtxKey string
+
 type ProxyManager struct {
 	sync.Mutex
@@ -555,6 +557,12 @@ func (pm *ProxyManager) proxyOAIHandler(c *gin.Context) {
 	c.Request.Header.Set("content-length", strconv.Itoa(len(bodyBytes)))
 	c.Request.ContentLength = int64(len(bodyBytes))
 
+	// issue #366 extract values that downstream handlers may need
+	isStreaming := gjson.GetBytes(bodyBytes, "stream").Bool()
+	ctx := context.WithValue(c.Request.Context(), proxyCtxKey("streaming"), isStreaming)
+	ctx = context.WithValue(ctx, proxyCtxKey("model"), realModelName)
+	c.Request = c.Request.WithContext(ctx)
+
 	if pm.metricsMonitor != nil && c.Request.Method == "POST" {
 		if err := pm.metricsMonitor.wrapHandler(realModelName, c.Writer, c.Request, processGroup.ProxyRequest); err != nil {
 			pm.sendErrorResponse(c, http.StatusInternalServerError, fmt.Sprintf("error proxying metrics wrapped request: %s", err.Error()))
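
The "streaming" and "model" values stashed in the request context above are what downstream handlers can consult when deciding whether, and for which model, to emit loading-status chunks. A minimal sketch of reading them back out, assuming the proxyCtxKey type from the first hunk; the helper name and its zero-value fallback are illustrative, not code from this PR:

// Sketch only: reads back the values stored with proxyCtxKey above.
// requestInfo is a hypothetical helper; only proxyCtxKey and the
// "streaming"/"model" keys come from the diff. Assumes the surrounding
// package already imports net/http.
func requestInfo(r *http.Request) (model string, streaming bool) {
	if v, ok := r.Context().Value(proxyCtxKey("model")).(string); ok {
		model = v
	}
	if v, ok := r.Context().Value(proxyCtxKey("streaming")).(bool); ok {
		streaming = v
	}
	return model, streaming
}

Using a dedicated proxyCtxKey string type, rather than a bare string, keeps these context keys from colliding with keys set by other packages.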