From 74c69f39ef61937ac82a6a3fdae8bb27d5e0418a Mon Sep 17 00:00:00 2001
From: Benson Wong
Date: Thu, 14 Aug 2025 10:02:16 -0700
Subject: [PATCH] Add prompt processing metrics (#250)

- capture prompt processing metrics
- display prompt processing metrics on UI Activity page
---
 proxy/metrics_middleware.go     | 3 +++
 proxy/metrics_monitor.go        | 1 +
 ui/src/contexts/APIProvider.tsx | 1 +
 ui/src/pages/Activity.tsx       | 2 ++
 4 files changed, 7 insertions(+)

diff --git a/proxy/metrics_middleware.go b/proxy/metrics_middleware.go
index ee17717..a4249ac 100644
--- a/proxy/metrics_middleware.go
+++ b/proxy/metrics_middleware.go
@@ -79,10 +79,12 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
     outputTokens := int(jsonData.Get("usage.completion_tokens").Int())
     inputTokens := int(jsonData.Get("usage.prompt_tokens").Int())
     tokensPerSecond := -1.0
+    promptPerSecond := -1.0
     durationMs := int(time.Since(rec.startTime).Milliseconds())
 
     // use llama-server's timing data for tok/sec and duration as it is more accurate
     if timings := jsonData.Get("timings"); timings.Exists() {
+        promptPerSecond = jsonData.Get("timings.prompt_per_second").Float()
         tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
         durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
     }
@@ -92,6 +94,7 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
         Model:           rec.realModelName,
         InputTokens:     inputTokens,
         OutputTokens:    outputTokens,
+        PromptPerSecond: promptPerSecond,
         TokensPerSecond: tokensPerSecond,
         DurationMs:      durationMs,
     })
diff --git a/proxy/metrics_monitor.go b/proxy/metrics_monitor.go
index 050b95e..0ce4efd 100644
--- a/proxy/metrics_monitor.go
+++ b/proxy/metrics_monitor.go
@@ -15,6 +15,7 @@ type TokenMetrics struct {
     Model           string  `json:"model"`
     InputTokens     int     `json:"input_tokens"`
     OutputTokens    int     `json:"output_tokens"`
+    PromptPerSecond float64 `json:"prompt_per_second"`
     TokensPerSecond float64 `json:"tokens_per_second"`
     DurationMs      int     `json:"duration_ms"`
 }
diff --git a/ui/src/contexts/APIProvider.tsx b/ui/src/contexts/APIProvider.tsx
index 5148e89..d2a8a7f 100644
--- a/ui/src/contexts/APIProvider.tsx
+++ b/ui/src/contexts/APIProvider.tsx
@@ -28,6 +28,7 @@ interface Metrics {
   model: string;
   input_tokens: number;
   output_tokens: number;
+  prompt_per_second: number;
   tokens_per_second: number;
   duration_ms: number;
 }
diff --git a/ui/src/pages/Activity.tsx b/ui/src/pages/Activity.tsx
index 70cae42..f8aa996 100644
--- a/ui/src/pages/Activity.tsx
+++ b/ui/src/pages/Activity.tsx
@@ -51,6 +51,7 @@ const ActivityPage = () => {
             <th>Model</th>
             <th>Input Tokens</th>
             <th>Output Tokens</th>
+            <th>Prompt Processing</th>
             <th>Generation Speed</th>
             <th>Duration</th>
           </tr>
@@ -62,6 +63,7 @@ const ActivityPage = () => {
             <td>{metric.model}</td>
             <td>{metric.input_tokens.toLocaleString()}</td>
             <td>{metric.output_tokens.toLocaleString()}</td>
+            <td>{formatSpeed(metric.prompt_per_second)}</td>
             <td>{formatSpeed(metric.tokens_per_second)}</td>
             <td>{formatDuration(metric.duration_ms)}</td>
           </tr>
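
Reviewer note, not part of the patch: a minimal runnable sketch of how the new
timings.prompt_per_second value is read with gjson, assuming a llama-server
final response shaped like the one below (llama-server's timings block carries
prompt_ms, prompt_per_second, predicted_ms, and predicted_per_second; the
sample values here are made up).

    package main

    import (
        "fmt"

        "github.com/tidwall/gjson"
    )

    func main() {
        // Illustrative final response from llama-server; field names match
        // those read by parseAndRecordMetrics above. Values are invented.
        body := `{
          "usage": {"prompt_tokens": 512, "completion_tokens": 128},
          "timings": {"prompt_ms": 410.2, "prompt_per_second": 1248.5,
                      "predicted_ms": 3122.0, "predicted_per_second": 41.0}
        }`

        // Same sentinel the patch uses: -1.0 marks "no timing data available".
        promptPerSecond := -1.0
        if timings := gjson.Get(body, "timings"); timings.Exists() {
            promptPerSecond = timings.Get("prompt_per_second").Float()
        }
        fmt.Printf("prompt processing: %.1f tok/s\n", promptPerSecond)
    }

The -1.0 default mirrors the middleware change: when an upstream returns no
timings block, the metric stays negative, presumably so the UI's formatSpeed
helper can render it as missing rather than as zero.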
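The json tags on TokenMetrics are the contract with the UI: the Metrics
interface in APIProvider.tsx declares the same snake_case keys. A small sketch
of the payload the Go side emits; the struct is trimmed to the fields visible
in the hunk (the real one may carry more), and the sample values, including
the model name, are hypothetical.

    package main

    import (
        "encoding/json"
        "fmt"
    )

    // Trimmed mirror of proxy/metrics_monitor.go's TokenMetrics after this
    // patch. Tags must stay snake_case to match the UI's Metrics interface.
    type TokenMetrics struct {
        Model           string  `json:"model"`
        InputTokens     int     `json:"input_tokens"`
        OutputTokens    int     `json:"output_tokens"`
        PromptPerSecond float64 `json:"prompt_per_second"`
        TokensPerSecond float64 `json:"tokens_per_second"`
        DurationMs      int     `json:"duration_ms"`
    }

    func main() {
        // Sample record with invented values; the model name is hypothetical.
        m := TokenMetrics{
            Model: "qwen3-30b", InputTokens: 512, OutputTokens: 128,
            PromptPerSecond: 1248.5, TokensPerSecond: 41.0, DurationMs: 3532,
        }
        out, _ := json.Marshal(m)
        fmt.Println(string(out))
        // keys come out exactly as Activity.tsx consumes them,
        // e.g. "prompt_per_second":1248.5
    }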