Fix token metrics parsing (#199)

Fix #198

- use llama-server's `timings` info when it is available in the response body
- send "-1" for tokens/sec when performance cannot be calculated
  accurately
- optimize the streaming body search for metrics information
This commit is contained in:
Benson Wong
2025-07-22 23:10:14 -07:00
committed by GitHub
parent accd65294b
commit 01d4838fb3
4 changed files with 79 additions and 45 deletions

View File

@@ -708,7 +708,9 @@ func TestProxyManager_MiddlewareWritesMetrics_NonStreaming(t *testing.T) {
// Check that metrics were recorded
metrics := proxy.metricsMonitor.GetMetrics()
assert.NotEmpty(t, metrics, "metrics should be recorded for non-streaming request")
if !assert.NotEmpty(t, metrics, "metrics should be recorded for non-streaming request") {
return
}
// Verify the last metric has the correct model
lastMetric := metrics[len(metrics)-1]
@@ -741,7 +743,9 @@ func TestProxyManager_MiddlewareWritesMetrics_Streaming(t *testing.T) {
// Check that metrics were recorded
metrics := proxy.metricsMonitor.GetMetrics()
assert.NotEmpty(t, metrics, "metrics should be recorded for streaming request")
if !assert.NotEmpty(t, metrics, "metrics should be recorded for streaming request") {
return
}
// Verify the last metric has the correct model
lastMetric := metrics[len(metrics)-1]