Fix token metrics parsing (#199)

Fix #198

- use llama-server's `timings` info if available in the response body
  (a minimal parsing sketch follows this list)
- send `-1` for tokens/sec when performance cannot be accurately
  calculated
- optimize the streaming-body search for metrics information (see the
  fast-path sketch after the first diff hunk)
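The following is a minimal sketch of the first two points, not the actual llama-swap code: prefer llama-server's own `timings` block when the body carries one, and report `-1` when tokens/sec cannot be measured accurately. The struct and function names here are hypothetical.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// timings mirrors the llama-server fields shown in the diffs below.
type timings struct {
	PredictedN         int     `json:"predicted_n"`
	PredictedMS        float64 `json:"predicted_ms"`
	PredictedPerSecond float64 `json:"predicted_per_second"`
}

// tokensPerSecond (hypothetical name) prefers the server's own
// measurement and returns -1 when no accurate figure is available.
func tokensPerSecond(body []byte) float64 {
	var resp struct {
		Timings *timings `json:"timings"`
	}
	if err := json.Unmarshal(body, &resp); err != nil || resp.Timings == nil {
		return -1 // no timings block: report "unknown" rather than a guess
	}
	return resp.Timings.PredictedPerSecond
}

func main() {
	body := []byte(`{"timings":{"predicted_n":10,"predicted_ms":17,"predicted_per_second":10}}`)
	fmt.Println(tokensPerSecond(body)) // 10
}
```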
Author: Benson Wong
Date: 2025-07-22 23:10:14 -07:00
Committed by: GitHub
Parent: accd65294b
Commit: 01d4838fb3

4 changed files with 79 additions and 45 deletions


@@ -78,6 +78,14 @@ func main() {
 				"prompt_tokens": 25,
 				"total_tokens":  35,
 			},
+			// add timings to simulate llama.cpp
+			"timings": gin.H{
+				"prompt_n":             25,
+				"prompt_ms":            13,
+				"predicted_n":          10,
+				"predicted_ms":         17,
+				"predicted_per_second": 10,
+			},
 		}
 		c.SSEvent("message", finalData)
 		c.Writer.Flush()
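In this simulated server, `timings` appears only on the final SSE event, which is what makes the streaming-body optimization cheap: a substring check can skip JSON decoding for every chunk that does not mention the key. A hedged sketch of that fast path, with hypothetical names (the commit's actual implementation may differ); `chunk` is assumed to be the JSON payload with any `data: ` prefix already stripped:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
)

var timingsKey = []byte(`"timings"`)

// maybeExtractTimings only unmarshals chunks that mention "timings",
// so the hot streaming path avoids decoding every event.
func maybeExtractTimings(chunk []byte) (map[string]any, bool) {
	if !bytes.Contains(chunk, timingsKey) {
		return nil, false // fast path: no metrics in this chunk
	}
	var payload struct {
		Timings map[string]any `json:"timings"`
	}
	if err := json.Unmarshal(chunk, &payload); err != nil || payload.Timings == nil {
		return nil, false
	}
	return payload.Timings, true
}

func main() {
	chunk := []byte(`{"timings":{"predicted_per_second":10}}`)
	if t, ok := maybeExtractTimings(chunk); ok {
		fmt.Println(t["predicted_per_second"]) // 10
	}
}
```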
@@ -102,6 +110,13 @@ func main() {
 				"prompt_tokens": 25,
 				"total_tokens":  35,
 			},
+			"timings": gin.H{
+				"prompt_n":             25,
+				"prompt_ms":            13,
+				"predicted_n":          10,
+				"predicted_ms":         17,
+				"predicted_per_second": 10,
+			},
 		})
 	}
 })
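For reference, `predicted_per_second` is in principle derivable from the other timing fields (tokens divided by elapsed seconds). A small sketch under that assumption; note the fixture above pins the value to 10 even though the arithmetic on its own fields would give roughly 588 tok/s, so these numbers are clearly synthetic test data:

```go
package main

import "fmt"

// derivedTokensPerSecond (hypothetical helper) computes the rate from
// predicted_n and predicted_ms, returning -1 when it cannot be derived.
func derivedTokensPerSecond(predictedN int, predictedMS float64) float64 {
	if predictedMS <= 0 {
		return -1 // avoid division by zero; signal "unknown"
	}
	return float64(predictedN) / (predictedMS / 1000.0)
}

func main() {
	// 10 tokens in 17 ms -> ~588.2 tok/s
	fmt.Printf("%.1f\n", derivedTokensPerSecond(10, 17))
}
```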