Fix token metrics parsing (#199)
Fix #198:

- use llama-server's `timings` info, if available in the response body, for token metrics
- send -1 for tokens/second when performance cannot be accurately calculated
- optimize the streaming-body search for metrics information by scanning backwards
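How the fallback behaves, as a minimal self-contained sketch (the response body literal is an assumed sample shaped after the fields this change reads; `tokensPerSecond` mirrors the recorder's sentinel):

    package main

    import (
    	"fmt"

    	"github.com/tidwall/gjson"
    )

    func main() {
    	// Assumed llama-server-style body; only the fields read by this change.
    	body := `{"usage":{"prompt_tokens":25,"completion_tokens":10},"timings":{"prompt_ms":13,"predicted_ms":17,"predicted_per_second":10}}`

    	tokensPerSecond := -1.0 // sentinel reported when speed can't be measured
    	if timings := gjson.Get(body, "timings"); timings.Exists() {
    		tokensPerSecond = timings.Get("predicted_per_second").Float()
    	}
    	fmt.Println(tokensPerSecond) // prints 10 here, or -1 without a timings block
    }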
@@ -78,6 +78,14 @@ func main() {
 				"prompt_tokens": 25,
 				"total_tokens": 35,
 			},
+			// add timings to simulate llama.cpp
+			"timings": gin.H{
+				"prompt_n":             25,
+				"prompt_ms":            13,
+				"predicted_n":          10,
+				"predicted_ms":         17,
+				"predicted_per_second": 10,
+			},
 		}
 		c.SSEvent("message", finalData)
 		c.Writer.Flush()
@@ -102,6 +110,13 @@ func main() {
 				"prompt_tokens": 25,
 				"total_tokens": 35,
 			},
+			"timings": gin.H{
+				"prompt_n":             25,
+				"prompt_ms":            13,
+				"predicted_n":          10,
+				"predicted_ms":         17,
+				"predicted_per_second": 10,
+			},
 		})
 	}
 })
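For reference, a hedged sketch of what the mock's final SSE event carries on the wire once the gin.H payload is serialized (key order and the elided usage fields are illustrative):

    data: {"timings":{"predicted_ms":17,"predicted_n":10,"predicted_per_second":10,"prompt_ms":13,"prompt_n":25},"usage":{"prompt_tokens":25,"total_tokens":35,...}}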
@@ -67,51 +67,66 @@ func (rec *MetricsRecorder) processBody(body []byte) {
 	}
 }
 
-func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) {
-	if !jsonData.Get("usage").Exists() {
-		return
+func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
+	usage := jsonData.Get("usage")
+	if !usage.Exists() {
+		return false
 	}
 
+	// default values
 	outputTokens := int(jsonData.Get("usage.completion_tokens").Int())
 	inputTokens := int(jsonData.Get("usage.prompt_tokens").Int())
+	tokensPerSecond := -1.0
+	durationMs := int(time.Since(rec.startTime).Milliseconds())
 
-	if outputTokens > 0 {
-		duration := time.Since(rec.startTime)
-		tokensPerSecond := float64(inputTokens+outputTokens) / duration.Seconds()
-		metrics := TokenMetrics{
-			Timestamp:       time.Now(),
-			Model:           rec.realModelName,
-			InputTokens:     inputTokens,
-			OutputTokens:    outputTokens,
-			TokensPerSecond: tokensPerSecond,
-			DurationMs:      int(duration.Milliseconds()),
-		}
-		rec.metricsMonitor.addMetrics(metrics)
+	// use llama-server's timing data for tok/sec and duration as it is more accurate
+	if timings := jsonData.Get("timings"); timings.Exists() {
+		tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
+		durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
 	}
+
+	rec.metricsMonitor.addMetrics(TokenMetrics{
+		Timestamp:       time.Now(),
+		Model:           rec.realModelName,
+		InputTokens:     inputTokens,
+		OutputTokens:    outputTokens,
+		TokensPerSecond: tokensPerSecond,
+		DurationMs:      durationMs,
+	})
+
+	return true
 }
 
 func (rec *MetricsRecorder) processStreamingResponse(body []byte) {
+	// Iterate **backwards** through the lines looking for the data payload with
+	// usage data
 	lines := bytes.Split(body, []byte("\n"))
-	for _, line := range lines {
-		line = bytes.TrimSpace(line)
+	for i := len(lines) - 1; i >= 0; i-- {
+		line := bytes.TrimSpace(lines[i])
 		if len(line) == 0 {
 			continue
 		}
 
-		// Check for SSE data prefix
-		if data, found := bytes.CutPrefix(line, []byte("data:")); found {
-			data = bytes.TrimSpace(data)
-			if len(data) == 0 {
-				continue
-			}
-			if bytes.Equal(data, []byte("[DONE]")) {
-				break
-			}
-
-			// Parse JSON to look for usage data
-			if gjson.ValidBytes(data) {
-				rec.parseAndRecordMetrics(gjson.ParseBytes(data))
-			}
+		// SSE payload always follows "data:"
+		prefix := []byte("data:")
+		if !bytes.HasPrefix(line, prefix) {
+			continue
+		}
+		data := bytes.TrimSpace(line[len(prefix):])
+		if len(data) == 0 {
+			continue
+		}
+
+		if bytes.Equal(data, []byte("[DONE]")) {
+			// [DONE] line itself contains nothing of interest.
+			continue
+		}
+
+		if gjson.ValidBytes(data) {
+			if rec.parseAndRecordMetrics(gjson.ParseBytes(data)) {
+				return // short circuit if a metric was recorded
+			}
 		}
 	}
 }
 
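Why the backwards scan pays off, as a hedged, self-contained sketch (the sample SSE body and main wrapper are assumptions): the usage payload arrives in the final data: chunk of a stream, so starting from the last line and short-circuiting avoids JSON-parsing every earlier chunk.

    package main

    import (
    	"bytes"
    	"fmt"

    	"github.com/tidwall/gjson"
    )

    func main() {
    	// Assumed sample stream: delta chunks first, then the final chunk
    	// carrying "usage", then the [DONE] sentinel.
    	body := []byte("data: {\"choices\":[{\"delta\":{\"content\":\"hi\"}}]}\n\n" +
    		"data: {\"usage\":{\"prompt_tokens\":25,\"completion_tokens\":10}}\n\n" +
    		"data: [DONE]\n")

    	lines := bytes.Split(body, []byte("\n"))
    	for i := len(lines) - 1; i >= 0; i-- { // backwards, as in processStreamingResponse
    		line := bytes.TrimSpace(lines[i])
    		data, found := bytes.CutPrefix(line, []byte("data:"))
    		if !found {
    			continue
    		}
    		data = bytes.TrimSpace(data)
    		if gjson.ValidBytes(data) && gjson.GetBytes(data, "usage").Exists() {
    			fmt.Println("metrics chunk:", string(data))
    			return // short circuit once a usable payload is found
    		}
    	}
    }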
@@ -708,7 +708,9 @@ func TestProxyManager_MiddlewareWritesMetrics_NonStreaming(t *testing.T) {
 
 	// Check that metrics were recorded
 	metrics := proxy.metricsMonitor.GetMetrics()
-	assert.NotEmpty(t, metrics, "metrics should be recorded for non-streaming request")
+	if !assert.NotEmpty(t, metrics, "metrics should be recorded for non-streaming request") {
+		return
+	}
 
 	// Verify the last metric has the correct model
 	lastMetric := metrics[len(metrics)-1]
@@ -741,7 +743,9 @@ func TestProxyManager_MiddlewareWritesMetrics_Streaming(t *testing.T) {
 
 	// Check that metrics were recorded
 	metrics := proxy.metricsMonitor.GetMetrics()
-	assert.NotEmpty(t, metrics, "metrics should be recorded for streaming request")
+	if !assert.NotEmpty(t, metrics, "metrics should be recorded for streaming request") {
+		return
+	}
 
 	// Verify the last metric has the correct model
 	lastMetric := metrics[len(metrics)-1]
@@ -1,6 +1,18 @@
 import { useState, useEffect } from "react";
 import { useAPI } from "../contexts/APIProvider";
 
+const formatTimestamp = (timestamp: string): string => {
+  return new Date(timestamp).toLocaleString();
+};
+
+const formatSpeed = (speed: number): string => {
+  return speed < 0 ? "unknown" : speed.toFixed(2) + " t/s";
+};
+
+const formatDuration = (ms: number): string => {
+  return (ms / 1000).toFixed(2) + "s";
+};
+
 const ActivityPage = () => {
   const { metrics } = useAPI();
   const [error, setError] = useState<string | null>(null);
@@ -11,18 +23,6 @@ const ActivityPage = () => {
     }
   }, [metrics]);
 
-  const formatTimestamp = (timestamp: string) => {
-    return new Date(timestamp).toLocaleString();
-  };
-
-  const formatSpeed = (speed: number) => {
-    return speed.toFixed(2) + " t/s";
-  };
-
-  const formatDuration = (ms: number) => {
-    return (ms / 1000).toFixed(2) + "s";
-  };
-
   if (error) {
     return (
       <div className="p-6">
@@ -51,7 +51,7 @@ const ActivityPage = () => {
               <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
               <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th>
               <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th>
-              <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Processing Speed</th>
+              <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
               <th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
             </tr>
           </thead>