Fix token metrics parsing (#199)

Fix #198

- use llama-server's `timings` data when it is present in the response body
- report "-1" for tokens/sec when performance cannot be calculated
  accurately
- optimize the search of the streaming body for metrics information
This commit is contained in:
Benson Wong
2025-07-22 23:10:14 -07:00
committed by GitHub
parent accd65294b
commit 01d4838fb3
4 changed files with 79 additions and 45 deletions

View File

@@ -78,6 +78,14 @@ func main() {
"prompt_tokens": 25, "prompt_tokens": 25,
"total_tokens": 35, "total_tokens": 35,
}, },
// add timings to simulate llama.cpp
"timings": gin.H{
"prompt_n": 25,
"prompt_ms": 13,
"predicted_n": 10,
"predicted_ms": 17,
"predicted_per_second": 10,
},
} }
c.SSEvent("message", finalData) c.SSEvent("message", finalData)
c.Writer.Flush() c.Writer.Flush()
@@ -102,6 +110,13 @@ func main() {
"prompt_tokens": 25, "prompt_tokens": 25,
"total_tokens": 35, "total_tokens": 35,
}, },
"timings": gin.H{
"prompt_n": 25,
"prompt_ms": 13,
"predicted_n": 10,
"predicted_ms": 17,
"predicted_per_second": 10,
},
}) })
} }
}) })

View File

@@ -67,51 +67,66 @@ func (rec *MetricsRecorder) processBody(body []byte) {
} }
} }
func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) { func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
if !jsonData.Get("usage").Exists() { usage := jsonData.Get("usage")
return if !usage.Exists() {
return false
} }
// default values
outputTokens := int(jsonData.Get("usage.completion_tokens").Int()) outputTokens := int(jsonData.Get("usage.completion_tokens").Int())
inputTokens := int(jsonData.Get("usage.prompt_tokens").Int()) inputTokens := int(jsonData.Get("usage.prompt_tokens").Int())
tokensPerSecond := -1.0
durationMs := int(time.Since(rec.startTime).Milliseconds())
if outputTokens > 0 { // use llama-server's timing data for tok/sec and duration as it is more accurate
duration := time.Since(rec.startTime) if timings := jsonData.Get("timings"); timings.Exists() {
tokensPerSecond := float64(inputTokens+outputTokens) / duration.Seconds() tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
metrics := TokenMetrics{
Timestamp: time.Now(),
Model: rec.realModelName,
InputTokens: inputTokens,
OutputTokens: outputTokens,
TokensPerSecond: tokensPerSecond,
DurationMs: int(duration.Milliseconds()),
}
rec.metricsMonitor.addMetrics(metrics)
} }
rec.metricsMonitor.addMetrics(TokenMetrics{
Timestamp: time.Now(),
Model: rec.realModelName,
InputTokens: inputTokens,
OutputTokens: outputTokens,
TokensPerSecond: tokensPerSecond,
DurationMs: durationMs,
})
return true
} }
func (rec *MetricsRecorder) processStreamingResponse(body []byte) { func (rec *MetricsRecorder) processStreamingResponse(body []byte) {
// Iterate **backwards** through the lines looking for the data payload with
// usage data
lines := bytes.Split(body, []byte("\n")) lines := bytes.Split(body, []byte("\n"))
for _, line := range lines {
line = bytes.TrimSpace(line) for i := len(lines) - 1; i >= 0; i-- {
line := bytes.TrimSpace(lines[i])
if len(line) == 0 { if len(line) == 0 {
continue continue
} }
// Check for SSE data prefix // SSE payload always follows "data:"
if data, found := bytes.CutPrefix(line, []byte("data:")); found { prefix := []byte("data:")
data = bytes.TrimSpace(data) if !bytes.HasPrefix(line, prefix) {
if len(data) == 0 { continue
continue }
} data := bytes.TrimSpace(line[len(prefix):])
if bytes.Equal(data, []byte("[DONE]")) {
break
}
// Parse JSON to look for usage data if len(data) == 0 {
if gjson.ValidBytes(data) { continue
rec.parseAndRecordMetrics(gjson.ParseBytes(data)) }
if bytes.Equal(data, []byte("[DONE]")) {
// [DONE] line itself contains nothing of interest.
continue
}
if gjson.ValidBytes(data) {
if rec.parseAndRecordMetrics(gjson.ParseBytes(data)) {
return // short circuit if a metric was recorded
} }
} }
} }

View File

@@ -708,7 +708,9 @@ func TestProxyManager_MiddlewareWritesMetrics_NonStreaming(t *testing.T) {
// Check that metrics were recorded // Check that metrics were recorded
metrics := proxy.metricsMonitor.GetMetrics() metrics := proxy.metricsMonitor.GetMetrics()
assert.NotEmpty(t, metrics, "metrics should be recorded for non-streaming request") if !assert.NotEmpty(t, metrics, "metrics should be recorded for non-streaming request") {
return
}
// Verify the last metric has the correct model // Verify the last metric has the correct model
lastMetric := metrics[len(metrics)-1] lastMetric := metrics[len(metrics)-1]
@@ -741,7 +743,9 @@ func TestProxyManager_MiddlewareWritesMetrics_Streaming(t *testing.T) {
// Check that metrics were recorded // Check that metrics were recorded
metrics := proxy.metricsMonitor.GetMetrics() metrics := proxy.metricsMonitor.GetMetrics()
assert.NotEmpty(t, metrics, "metrics should be recorded for streaming request") if !assert.NotEmpty(t, metrics, "metrics should be recorded for streaming request") {
return
}
// Verify the last metric has the correct model // Verify the last metric has the correct model
lastMetric := metrics[len(metrics)-1] lastMetric := metrics[len(metrics)-1]

View File

@@ -1,6 +1,18 @@
import { useState, useEffect } from "react"; import { useState, useEffect } from "react";
import { useAPI } from "../contexts/APIProvider"; import { useAPI } from "../contexts/APIProvider";
/**
 * Formats a timestamp string for display using the runtime's default locale.
 *
 * @param timestamp - A string accepted by the `Date` constructor.
 * @returns The locale-formatted date and time.
 */
const formatTimestamp = (timestamp: string): string => new Date(timestamp).toLocaleString();
/**
 * Formats a tokens-per-second value for display.
 *
 * A negative speed is treated as "could not be measured" and rendered as
 * "unknown". Non-finite values (NaN, Infinity) are rendered the same way so
 * the table never shows "NaN t/s".
 *
 * @param speed - Tokens per second; negative when unavailable.
 * @returns The speed with two decimals and a " t/s" suffix, or "unknown".
 */
const formatSpeed = (speed: number): string => {
  if (!Number.isFinite(speed) || speed < 0) {
    return "unknown";
  }
  return speed.toFixed(2) + " t/s";
};
/**
 * Formats a duration given in milliseconds as seconds for display.
 *
 * @param ms - Duration in milliseconds.
 * @returns The duration in seconds with two decimals and an "s" suffix.
 */
const formatDuration = (ms: number): string => {
  const seconds = ms / 1000;
  return `${seconds.toFixed(2)}s`;
};
const ActivityPage = () => { const ActivityPage = () => {
const { metrics } = useAPI(); const { metrics } = useAPI();
const [error, setError] = useState<string | null>(null); const [error, setError] = useState<string | null>(null);
@@ -11,18 +23,6 @@ const ActivityPage = () => {
} }
}, [metrics]); }, [metrics]);
const formatTimestamp = (timestamp: string) => {
return new Date(timestamp).toLocaleString();
};
const formatSpeed = (speed: number) => {
return speed.toFixed(2) + " t/s";
};
const formatDuration = (ms: number) => {
return (ms / 1000).toFixed(2) + "s";
};
if (error) { if (error) {
return ( return (
<div className="p-6"> <div className="p-6">
@@ -51,7 +51,7 @@ const ActivityPage = () => {
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th> <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th> <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th> <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Processing Speed</th> <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th> <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
</tr> </tr>
</thead> </thead>