From 57803fd3aae930cc2b9a886ca034112331debada Mon Sep 17 00:00:00 2001
From: Benson Wong
Date: Wed, 27 Aug 2025 08:36:05 -0700
Subject: [PATCH] Support llama-server's /infill endpoint (#272)

Add support for llama-server's /infill endpoint and metrics gathering on
the Activities page.
---
 README.md                   |  4 ++-
 proxy/metrics_middleware.go | 50 +++++++++++++++++++++----------------
 proxy/proxymanager.go       | 10 ++++++--
 3 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 7818ea5..012c0ca 100644
--- a/README.md
+++ b/README.md
@@ -18,9 +18,11 @@ Written in golang, it is very easy to install (single binary with no dependencie
   - `v1/completions`
   - `v1/chat/completions`
   - `v1/embeddings`
-  - `v1/rerank`, `v1/reranking`, `rerank`
   - `v1/audio/speech` ([#36](https://github.com/mostlygeek/llama-swap/issues/36))
   - `v1/audio/transcriptions` ([docs](https://github.com/mostlygeek/llama-swap/issues/41#issuecomment-2722637867))
+- ✅ llama-server (llama.cpp) supported endpoints:
+  - `v1/rerank`, `v1/reranking`, `/rerank`
+  - `/infill` - for code infilling
 - ✅ llama-swap custom API endpoints
   - `/ui` - web UI
   - `/log` - remote log monitoring
diff --git a/proxy/metrics_middleware.go b/proxy/metrics_middleware.go
index a4249ac..adffe97 100644
--- a/proxy/metrics_middleware.go
+++ b/proxy/metrics_middleware.go
@@ -5,12 +5,20 @@ import (
 	"fmt"
 	"io"
 	"net/http"
+	"strings"
 	"time"
 
 	"github.com/gin-gonic/gin"
 	"github.com/tidwall/gjson"
 )
 
+type MetricsRecorder struct {
+	metricsMonitor *MetricsMonitor
+	realModelName  string
+	// isStreaming bool
+	startTime time.Time
+}
+
 // MetricsMiddleware sets up the MetricsResponseWriter for capturing upstream requests
 func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
 	return func(c *gin.Context) {
@@ -41,49 +49,47 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
 			metricsRecorder: &MetricsRecorder{
 				metricsMonitor: pm.metricsMonitor,
 				realModelName:  realModelName,
-				isStreaming:    gjson.GetBytes(bodyBytes, "stream").Bool(),
 				startTime:      time.Now(),
 			},
 		}
 		c.Writer = writer
 		c.Next()
 
-		rec := writer.metricsRecorder
-		rec.processBody(writer.body)
-	}
-}
+		// check for streaming response
+		if strings.Contains(c.Writer.Header().Get("Content-Type"), "text/event-stream") {
+			writer.metricsRecorder.processStreamingResponse(writer.body)
+		} else {
+			writer.metricsRecorder.processNonStreamingResponse(writer.body)
+		}
 
-type MetricsRecorder struct {
-	metricsMonitor *MetricsMonitor
-	realModelName  string
-	isStreaming    bool
-	startTime      time.Time
-}
-
-// processBody handles response processing after request completes
-func (rec *MetricsRecorder) processBody(body []byte) {
-	if rec.isStreaming {
-		rec.processStreamingResponse(body)
-	} else {
-		rec.processNonStreamingResponse(body)
 	}
 }
 
 func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
 	usage := jsonData.Get("usage")
-	if !usage.Exists() {
+	timings := jsonData.Get("timings")
+	if !usage.Exists() && !timings.Exists() {
 		return false
 	}
 
 	// default values
-	outputTokens := int(jsonData.Get("usage.completion_tokens").Int())
-	inputTokens := int(jsonData.Get("usage.prompt_tokens").Int())
+	outputTokens := 0
+	inputTokens := 0
+
+	// timings data
 	tokensPerSecond := -1.0
 	promptPerSecond := -1.0
 	durationMs := int(time.Since(rec.startTime).Milliseconds())
 
+	if usage.Exists() {
+		outputTokens = int(jsonData.Get("usage.completion_tokens").Int())
+		inputTokens = int(jsonData.Get("usage.prompt_tokens").Int())
+	}
+
 	// use llama-server's timing data for tok/sec and duration as it is more accurate
-	if timings := jsonData.Get("timings"); timings.Exists() {
+	if timings.Exists() {
+		inputTokens = int(jsonData.Get("timings.prompt_n").Int())
+		outputTokens = int(jsonData.Get("timings.predicted_n").Int())
 		promptPerSecond = jsonData.Get("timings.prompt_per_second").Float()
 		tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
 		durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
diff --git a/proxy/proxymanager.go b/proxy/proxymanager.go
index 644851d..9b1d49b 100644
--- a/proxy/proxymanager.go
+++ b/proxy/proxymanager.go
@@ -191,11 +191,17 @@ func (pm *ProxyManager) setupGinEngine() {
 	// Support legacy /v1/completions api, see issue #12
 	pm.ginEngine.POST("/v1/completions", mm, pm.proxyOAIHandler)
 
-	// Support embeddings
+	// Support embeddings and reranking
 	pm.ginEngine.POST("/v1/embeddings", mm, pm.proxyOAIHandler)
+
+	// llama-server's /reranking endpoint + aliases
+	pm.ginEngine.POST("/reranking", mm, pm.proxyOAIHandler)
+	pm.ginEngine.POST("/rerank", mm, pm.proxyOAIHandler)
 	pm.ginEngine.POST("/v1/rerank", mm, pm.proxyOAIHandler)
 	pm.ginEngine.POST("/v1/reranking", mm, pm.proxyOAIHandler)
-	pm.ginEngine.POST("/rerank", mm, pm.proxyOAIHandler)
+
+	// llama-server's /infill endpoint for code infilling
+	pm.ginEngine.POST("/infill", mm, pm.proxyOAIHandler)
 
 	// Support audio/speech endpoint
 	pm.ginEngine.POST("/v1/audio/speech", pm.proxyOAIHandler)
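
Note for reviewers (not part of the patch): a minimal sketch of a client
exercising the new /infill route through llama-swap. The listen address and
model name are invented for illustration, and the input_prefix/input_suffix
request fields are assumed from llama-server's /infill API; the patch itself
only guarantees that POST /infill is proxied like the other endpoints.

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Hypothetical request body: llama-swap routes on the "model" field and
	// passes the rest through to the upstream llama-server /infill endpoint.
	body := []byte(`{
		"model": "qwen-coder",
		"input_prefix": "def add(a, b):\n    ",
		"input_suffix": "\n\nprint(add(1, 2))"
	}`)

	resp, err := http.Post("http://localhost:8080/infill", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out)) // generated infill plus usage/timings metadata
}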
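Also for reviewers, a self-contained sketch of the timings-first accounting
that parseAndRecordMetrics now applies to llama-server responses. The sample
JSON values are invented, but the field names and gjson calls mirror the ones
the patch reads:

package main

import (
	"fmt"

	"github.com/tidwall/gjson"
)

func main() {
	// Illustrative llama-server response fragment carrying a "timings" block.
	sample := `{
		"timings": {
			"prompt_n": 12,
			"predicted_n": 64,
			"prompt_ms": 35.0,
			"predicted_ms": 820.0,
			"prompt_per_second": 342.8,
			"predicted_per_second": 78.0
		}
	}`

	data := gjson.Parse(sample)
	if data.Get("timings").Exists() {
		// Prefer llama-server's own token counts and timing over wall-clock time.
		inputTokens := int(data.Get("timings.prompt_n").Int())
		outputTokens := int(data.Get("timings.predicted_n").Int())
		durationMs := int(data.Get("timings.prompt_ms").Float() + data.Get("timings.predicted_ms").Float())
		tokensPerSecond := data.Get("timings.predicted_per_second").Float()

		fmt.Println(inputTokens, outputTokens, durationMs, tokensPerSecond) // 12 64 855 78
	}
}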