Support llama.cpp's cache_n in timings info (#287)
Capture prompt cache metrics and surface them on Activities page in UI
This commit is contained in:
@@ -61,7 +61,6 @@ func MetricsMiddleware(pm *ProxyManager) gin.HandlerFunc {
|
|||||||
} else {
|
} else {
|
||||||
writer.metricsRecorder.processNonStreamingResponse(writer.body)
|
writer.metricsRecorder.processNonStreamingResponse(writer.body)
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -73,6 +72,7 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// default values
|
// default values
|
||||||
|
cachedTokens := -1 // unknown or missing data
|
||||||
outputTokens := 0
|
outputTokens := 0
|
||||||
inputTokens := 0
|
inputTokens := 0
|
||||||
|
|
||||||
@@ -93,11 +93,16 @@ func (rec *MetricsRecorder) parseAndRecordMetrics(jsonData gjson.Result) bool {
|
|||||||
promptPerSecond = jsonData.Get("timings.prompt_per_second").Float()
|
promptPerSecond = jsonData.Get("timings.prompt_per_second").Float()
|
||||||
tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
|
tokensPerSecond = jsonData.Get("timings.predicted_per_second").Float()
|
||||||
durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
|
durationMs = int(jsonData.Get("timings.prompt_ms").Float() + jsonData.Get("timings.predicted_ms").Float())
|
||||||
|
|
||||||
|
if cachedValue := jsonData.Get("timings.cache_n"); cachedValue.Exists() {
|
||||||
|
cachedTokens = int(cachedValue.Int())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
rec.metricsMonitor.addMetrics(TokenMetrics{
|
rec.metricsMonitor.addMetrics(TokenMetrics{
|
||||||
Timestamp: time.Now(),
|
Timestamp: time.Now(),
|
||||||
Model: rec.realModelName,
|
Model: rec.realModelName,
|
||||||
|
CachedTokens: cachedTokens,
|
||||||
InputTokens: inputTokens,
|
InputTokens: inputTokens,
|
||||||
OutputTokens: outputTokens,
|
OutputTokens: outputTokens,
|
||||||
PromptPerSecond: promptPerSecond,
|
PromptPerSecond: promptPerSecond,
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ type TokenMetrics struct {
|
|||||||
ID int `json:"id"`
|
ID int `json:"id"`
|
||||||
Timestamp time.Time `json:"timestamp"`
|
Timestamp time.Time `json:"timestamp"`
|
||||||
Model string `json:"model"`
|
Model string `json:"model"`
|
||||||
|
CachedTokens int `json:"cache_tokens"`
|
||||||
InputTokens int `json:"input_tokens"`
|
InputTokens int `json:"input_tokens"`
|
||||||
OutputTokens int `json:"output_tokens"`
|
OutputTokens int `json:"output_tokens"`
|
||||||
PromptPerSecond float64 `json:"prompt_per_second"`
|
PromptPerSecond float64 `json:"prompt_per_second"`
|
||||||
@@ -61,7 +62,6 @@ func (mp *MetricsMonitor) addMetrics(metric TokenMetrics) {
|
|||||||
if len(mp.metrics) > mp.maxMetrics {
|
if len(mp.metrics) > mp.maxMetrics {
|
||||||
mp.metrics = mp.metrics[len(mp.metrics)-mp.maxMetrics:]
|
mp.metrics = mp.metrics[len(mp.metrics)-mp.maxMetrics:]
|
||||||
}
|
}
|
||||||
|
|
||||||
event.Emit(TokenMetricsEvent{Metrics: metric})
|
event.Emit(TokenMetricsEvent{Metrics: metric})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -28,6 +28,7 @@ interface Metrics {
|
|||||||
id: number;
|
id: number;
|
||||||
timestamp: string;
|
timestamp: string;
|
||||||
model: string;
|
model: string;
|
||||||
|
cache_tokens: number;
|
||||||
input_tokens: number;
|
input_tokens: number;
|
||||||
output_tokens: number;
|
output_tokens: number;
|
||||||
prompt_per_second: number;
|
prompt_per_second: number;
|
||||||
|
|||||||
@@ -1,10 +1,6 @@
|
|||||||
import { useMemo } from "react";
|
import { useMemo } from "react";
|
||||||
import { useAPI } from "../contexts/APIProvider";
|
import { useAPI } from "../contexts/APIProvider";
|
||||||
|
|
||||||
const formatTimestamp = (timestamp: string): string => {
|
|
||||||
return new Date(timestamp).toLocaleString();
|
|
||||||
};
|
|
||||||
|
|
||||||
const formatSpeed = (speed: number): string => {
|
const formatSpeed = (speed: number): string => {
|
||||||
return speed < 0 ? "unknown" : speed.toFixed(2) + " t/s";
|
return speed < 0 ? "unknown" : speed.toFixed(2) + " t/s";
|
||||||
};
|
};
|
||||||
@@ -13,6 +9,33 @@ const formatDuration = (ms: number): string => {
|
|||||||
return (ms / 1000).toFixed(2) + "s";
|
return (ms / 1000).toFixed(2) + "s";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
 * Renders a timestamp as a coarse "time ago" label relative to the current clock.
 *
 * Anything less than five seconds old, including a timestamp that lies in the
 * future, is shown as "now". Older values step through whole seconds, minutes
 * and hours; anything 24 hours or older is reported as "a while ago".
 *
 * @param timestamp - Date string accepted by the `Date` constructor.
 * @returns A short human-readable label such as "12s ago", "3m ago" or "5h ago".
 */
const formatRelativeTime = (timestamp: string): string => {
  const elapsedSeconds = Math.floor((new Date().getTime() - new Date(timestamp).getTime()) / 1000);

  // Negative differences (future timestamps) also land in this branch.
  if (elapsedSeconds < 5) return "now";
  if (elapsedSeconds < 60) return `${elapsedSeconds}s ago`;

  const elapsedMinutes = Math.floor(elapsedSeconds / 60);
  if (elapsedMinutes < 60) return `${elapsedMinutes}m ago`;

  const elapsedHours = Math.floor(elapsedMinutes / 60);
  return elapsedHours < 24 ? `${elapsedHours}h ago` : "a while ago";
};
|
||||||
|
|
||||||
const ActivityPage = () => {
|
const ActivityPage = () => {
|
||||||
const { metrics } = useAPI();
|
const { metrics } = useAPI();
|
||||||
const sortedMetrics = useMemo(() => {
|
const sortedMetrics = useMemo(() => {
|
||||||
@@ -32,11 +55,16 @@ const ActivityPage = () => {
|
|||||||
<table className="min-w-full divide-y">
|
<table className="min-w-full divide-y">
|
||||||
<thead>
|
<thead>
|
||||||
<tr>
|
<tr>
|
||||||
<th className="px-4 py-3 text-left text-xs font-medium uppercase tracking-wider">Id</th>
|
<th className="px-4 py-3 text-left text-xs font-medium uppercase tracking-wider">ID</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Timestamp</th>
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Time</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Model</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Input Tokens</th>
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Output Tokens</th>
|
Cached <Tooltip content="prompt tokens from cache" />
|
||||||
|
</th>
|
||||||
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">
|
||||||
|
Prompt <Tooltip content="new prompt tokens processed" />
|
||||||
|
</th>
|
||||||
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generated</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Prompt Processing</th>
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Prompt Processing</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Generation Speed</th>
|
||||||
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
|
<th className="px-6 py-3 text-left text-xs font-medium uppercase tracking-wider">Duration</th>
|
||||||
@@ -46,8 +74,11 @@ const ActivityPage = () => {
|
|||||||
{sortedMetrics.map((metric) => (
|
{sortedMetrics.map((metric) => (
|
||||||
<tr key={`metric_${metric.id}`}>
|
<tr key={`metric_${metric.id}`}>
|
||||||
<td className="px-4 py-4 whitespace-nowrap text-sm">{metric.id + 1 /* un-zero index */}</td>
|
<td className="px-4 py-4 whitespace-nowrap text-sm">{metric.id + 1 /* un-zero index */}</td>
|
||||||
<td className="px-6 py-4 whitespace-nowrap text-sm">{formatTimestamp(metric.timestamp)}</td>
|
<td className="px-6 py-4 whitespace-nowrap text-sm">{formatRelativeTime(metric.timestamp)}</td>
|
||||||
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.model}</td>
|
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.model}</td>
|
||||||
|
<td className="px-6 py-4 whitespace-nowrap text-sm">
|
||||||
|
{metric.cache_tokens > 0 ? metric.cache_tokens.toLocaleString() : "-"}
|
||||||
|
</td>
|
||||||
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.input_tokens.toLocaleString()}</td>
|
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.input_tokens.toLocaleString()}</td>
|
||||||
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.output_tokens.toLocaleString()}</td>
|
<td className="px-6 py-4 whitespace-nowrap text-sm">{metric.output_tokens.toLocaleString()}</td>
|
||||||
<td className="px-6 py-4 whitespace-nowrap text-sm">{formatSpeed(metric.prompt_per_second)}</td>
|
<td className="px-6 py-4 whitespace-nowrap text-sm">{formatSpeed(metric.prompt_per_second)}</td>
|
||||||
@@ -63,4 +94,28 @@ const ActivityPage = () => {
|
|||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/** Props for the hover tooltip rendered next to a column heading. */
interface TooltipProps {
  /** Text shown inside the tooltip bubble. */
  content: string;
}

/**
 * Inline "ⓘ" marker that reveals `content` in a small bubble below it on hover.
 *
 * The bubble is always rendered; the `group` / `group-hover:opacity-100` class
 * pairing only toggles its opacity, and `pointer-events-none` keeps it from
 * intercepting the mouse. `normal-case` is applied to the bubble text.
 */
const Tooltip: React.FC<TooltipProps> = ({ content }) => (
  <div className="relative group inline-block">
    ⓘ
    <div
      className="absolute top-full left-1/2 transform -translate-x-1/2 mt-2
                 px-3 py-2 bg-gray-900 text-white text-sm rounded-md
                 opacity-0 group-hover:opacity-100 transition-opacity
                 duration-200 pointer-events-none whitespace-nowrap z-50 normal-case"
    >
      {content}
      {/* Small arrow on the bubble's top edge, built from transparent borders. */}
      <div
        className="absolute bottom-full left-1/2 transform -translate-x-1/2
                   border-4 border-transparent border-b-gray-900"
      />
    </div>
  </div>
);
|
||||||
|
|
||||||
export default ActivityPage;
|
export default ActivityPage;
|
||||||
|
|||||||
Reference in New Issue
Block a user