Fix token metrics parsing (#199)

Fix #198

- use llama-server's `timings` info if available in the response body
  (a minimal parsing sketch follows this list)
- send `-1` for tokens/sec when performance cannot be accurately
  calculated
- optimize the streaming-body search for metrics information (see the
  fast-path sketch after the first diff hunk)
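The following is a minimal sketch of the first two points, not the actual llama-swap code: prefer llama-server's own `timings` block when the body carries one, and report `-1` when tokens/sec cannot be measured accurately. The struct and function names here are hypothetical.

```go
package main

import (
	"encoding/json"
	"fmt"
)

// timings mirrors the llama-server fields shown in the diffs below.
type timings struct {
	PredictedN         int     `json:"predicted_n"`
	PredictedMS        float64 `json:"predicted_ms"`
	PredictedPerSecond float64 `json:"predicted_per_second"`
}

// tokensPerSecond (hypothetical name) prefers the server's own
// measurement and returns -1 when no accurate figure is available.
func tokensPerSecond(body []byte) float64 {
	var resp struct {
		Timings *timings `json:"timings"`
	}
	if err := json.Unmarshal(body, &resp); err != nil || resp.Timings == nil {
		return -1 // no timings block: report "unknown" rather than a guess
	}
	return resp.Timings.PredictedPerSecond
}

func main() {
	body := []byte(`{"timings":{"predicted_n":10,"predicted_ms":17,"predicted_per_second":10}}`)
	fmt.Println(tokensPerSecond(body)) // 10
}
```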
Author: Benson Wong
Date: 2025-07-22 23:10:14 -07:00
Committed by: GitHub
Parent: accd65294b
Commit: 01d4838fb3

4 changed files with 79 additions and 45 deletions


@@ -78,6 +78,14 @@ func main() {
 				"prompt_tokens": 25,
 				"total_tokens":  35,
 			},
+			// add timings to simulate llama.cpp
+			"timings": gin.H{
+				"prompt_n":             25,
+				"prompt_ms":            13,
+				"predicted_n":          10,
+				"predicted_ms":         17,
+				"predicted_per_second": 10,
+			},
 		}
 		c.SSEvent("message", finalData)
 		c.Writer.Flush()
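In this simulated server, `timings` appears only on the final SSE event, which is what makes the streaming-body optimization cheap: a substring check can skip JSON decoding for every chunk that does not mention the key. A hedged sketch of that fast path, with hypothetical names (the commit's actual implementation may differ); `chunk` is assumed to be the JSON payload with any `data: ` prefix already stripped:

```go
package main

import (
	"bytes"
	"encoding/json"
	"fmt"
)

var timingsKey = []byte(`"timings"`)

// maybeExtractTimings only unmarshals chunks that mention "timings",
// so the hot streaming path avoids decoding every event.
func maybeExtractTimings(chunk []byte) (map[string]any, bool) {
	if !bytes.Contains(chunk, timingsKey) {
		return nil, false // fast path: no metrics in this chunk
	}
	var payload struct {
		Timings map[string]any `json:"timings"`
	}
	if err := json.Unmarshal(chunk, &payload); err != nil || payload.Timings == nil {
		return nil, false
	}
	return payload.Timings, true
}

func main() {
	chunk := []byte(`{"timings":{"predicted_per_second":10}}`)
	if t, ok := maybeExtractTimings(chunk); ok {
		fmt.Println(t["predicted_per_second"]) // 10
	}
}
```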
@@ -102,6 +110,13 @@ func main() {
 				"prompt_tokens": 25,
 				"total_tokens":  35,
 			},
+			"timings": gin.H{
+				"prompt_n":             25,
+				"prompt_ms":            13,
+				"predicted_n":          10,
+				"predicted_ms":         17,
+				"predicted_per_second": 10,
+			},
 		})
 	}
 })
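For reference, `predicted_per_second` is in principle derivable from the other timing fields (tokens divided by elapsed seconds). A small sketch under that assumption; note the fixture above pins the value to 10 even though the arithmetic on its own fields would give roughly 588 tok/s, so these numbers are clearly synthetic test data:

```go
package main

import "fmt"

// derivedTokensPerSecond (hypothetical helper) computes the rate from
// predicted_n and predicted_ms, returning -1 when it cannot be derived.
func derivedTokensPerSecond(predictedN int, predictedMS float64) float64 {
	if predictedMS <= 0 {
		return -1 // avoid division by zero; signal "unknown"
	}
	return float64(predictedN) / (predictedMS / 1000.0)
}

func main() {
	// 10 tokens in 17 ms -> ~588.2 tok/s
	fmt.Printf("%.1f\n", derivedTokensPerSecond(10, 17))
}
```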