Support llama.cpp's cache_n in timings info (#287)

Capture prompt cache metrics and surface them on the Activities page in the UI.
Benson Wong
2025-09-06 13:58:02 -07:00
committed by GitHub
parent 954e2dee73
commit f58c8c8ec5
4 changed files with 72 additions and 11 deletions
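
For context, llama.cpp reports prompt cache usage as `timings.cache_n` in its response body. A minimal sketch of the parsing approach this commit takes, run against a hypothetical response payload (field values invented for illustration):

```go
package main

import (
	"fmt"

	"github.com/tidwall/gjson"
)

func main() {
	// Hypothetical llama.cpp response body; real responses carry more fields.
	body := `{"timings": {"prompt_n": 512, "cache_n": 480, "predicted_n": 64,
		"prompt_ms": 120.5, "predicted_ms": 900.2}}`

	cachedTokens := -1 // sentinel: cache data unknown or missing
	if v := gjson.Get(body, "timings.cache_n"); v.Exists() {
		cachedTokens = int(v.Int())
	}
	fmt.Println("cached tokens:", cachedTokens) // -> cached tokens: 480
}
```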


@@ -61,7 +61,6 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
 		} else {
 			writer.metricsRecorder.processNonStreamingResponse(writer.body)
 		}
 	}
 }
@@ -73,6 +72,7 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
 	}
 	// default values
+	cachedTokens := -1 // unknown or missing data
 	outputTokens := 0
 	inputTokens := 0
@@ -93,11 +93,16 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
 		promptPerSecond = jsonData.Get("timings.prompt_per_second").Float()
 		tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
 		durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
+		if cachedValue := jsonData.Get("timings.cache_n"); cachedValue.Exists() {
+			cachedTokens = int(cachedValue.Int())
+		}
 	}
 	rec.metricsMonitor.addMetrics(TokenMetrics{
 		Timestamp:       time.Now(),
 		Model:           rec.realModelName,
+		CachedTokens:    cachedTokens,
 		InputTokens:     inputTokens,
 		OutputTokens:    outputTokens,
 		PromptPerSecond: promptPerSecond,
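
The UI-side changes (the other files in this commit) are not shown in this excerpt. As a hedged sketch of how a consumer of `TokenMetrics` might use the new field, treating the -1 sentinel as "unknown" (the helper name is hypothetical, not part of this commit):

```go
package main

import "fmt"

// cacheHitRatio is a hypothetical helper: it returns the fraction of input
// tokens served from llama.cpp's prompt cache, or -1 when the upstream
// server did not report cache_n (cachedTokens < 0).
func cacheHitRatio(cachedTokens, inputTokens int) float64 {
	if cachedTokens < 0 || inputTokens <= 0 {
		return -1
	}
	return float64(cachedTokens) / float64(inputTokens)
}

func main() {
	fmt.Printf("%.0f%%\n", 100*cacheHitRatio(480, 512)) // -> 94%
}
```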