Add /upstream endpoint (#30)
* remove catch-all route to upstream proxy (it was broken anyway)
* add /upstream/:model_id to swap models and route to the upstream path
* add /upstream HTML endpoint that lists the available models
* add `unlisted` configuration option to omit a model from the /v1/models and /upstream lists
* add favicon.ico
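As a usage sketch (not part of the commit): the snippet below exercises the two new routes with Go's standard net/http client. The base URL and the model name "llama" are assumptions for illustration, not values defined by this change.

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	base := "http://localhost:8080" // assumed llama-swap listen address

	// HTML index of available (listed) models
	resp, err := http.Get(base + "/upstream")
	if err != nil {
		panic(err)
	}
	body, _ := io.ReadAll(resp.Body)
	resp.Body.Close()
	fmt.Println(string(body))

	// anything under /upstream/:model_id/... swaps to that model and
	// forwards the remainder of the path to the upstream server
	resp, err = http.Get(base + "/upstream/llama/health") // "llama" and /health are assumed
	if err != nil {
		panic(err)
	}
	body, _ = io.ReadAll(resp.Body)
	resp.Body.Close()
	fmt.Println(string(body))
}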
README.md
@@ -8,7 +8,7 @@ llama-swap is an OpenAI API compatible server that gives you complete control ov
 Features:
 
 - ✅ Easy to deploy: single binary with no dependencies
-- ✅ Single yaml configuration file
+- ✅ Easy to config: single yaml file
 - ✅ On-demand model switching
 - ✅ Full control over server settings per model
 - ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
@@ -16,7 +16,8 @@ Features:
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/log`
 - ✅ Automatic unloading of models from GPUs after timeout
-- ✅ Use any local server that provides an OpenAI compatible API (llama.cpp, vllm, tabblyAPI, etc)
+- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabblyAPI, etc)
+- ✅ Direct access to proxied upstream HTTP server via `/upstream/:model_id`
 
 ## Releases
 
@@ -73,6 +74,12 @@ models:
       --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
     proxy: http://127.0.0.1:8999
 
+  # unlisted models do not show up in /v1/models or /upstream lists
+  # but they can still be requested as normal
+  "qwen-unlisted":
+    cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
+    unlisted: true
+
 # profiles make it easy to managing multi model (and gpu) configurations.
 #
 # Tips:
@@ -33,6 +33,7 @@ models:
       - env1=hello
     cmd: build/simple-responder --port 8999
     proxy: http://127.0.0.1:8999
+    unlisted: true
 
   # use "none" to skip check. Caution this may cause some requests to fail
   # until the upstream server is ready for traffic
@@ -42,9 +43,11 @@ models:
   "broken":
     cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
     proxy: http://127.0.0.1:8999
+    unlisted: true
   "broken_timeout":
     cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
     proxy: http://127.0.0.1:9000
+    unlisted: true
 
 # creating a coding profile with models for code generation and general questions
 profiles:
misc/assets/favicon-raw.png (new binary file, 51 KiB; not shown)
@@ -16,6 +16,7 @@ type ModelConfig struct {
 	Env           []string `yaml:"env"`
 	CheckEndpoint string   `yaml:"checkEndpoint"`
 	UnloadAfter   int      `yaml:"ttl"`
+	Unlisted      bool     `yaml:"unlisted"`
 }
 
 func (m *ModelConfig) SanitizedCommand() ([]string, error) {
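A minimal sketch of how the new field behaves under gopkg.in/yaml.v3: `unlisted` parses from the config and defaults to false when omitted, so existing configs stay listed. The struct here is trimmed to a few fields for illustration.

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// trimmed-down stand-in for ModelConfig; the real struct has more fields
type ModelConfig struct {
	Cmd      string `yaml:"cmd"`
	Proxy    string `yaml:"proxy"`
	Unlisted bool   `yaml:"unlisted"`
}

func main() {
	hidden := []byte("cmd: llama-server --port 9999\nunlisted: true\n")
	visible := []byte("cmd: llama-server --port 9999\n")

	var a, b ModelConfig
	if err := yaml.Unmarshal(hidden, &a); err != nil {
		panic(err)
	}
	if err := yaml.Unmarshal(visible, &b); err != nil {
		panic(err)
	}
	fmt.Println(a.Unlisted, b.Unlisted) // true false — omitting the key means listed
}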
proxy/html/favicon.ico (new binary file, 15 KiB; not shown)
@@ -2,10 +2,12 @@ package proxy
 
 import (
 	"bytes"
+	"embed"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -18,6 +20,15 @@ const (
 	PROFILE_SPLIT_CHAR = ":"
 )
 
+//go:embed html/favicon.ico
+var faviconData []byte
+
+//go:embed html/logs.html
+var logsHTML []byte
+
+// make sure embed is kept there by the IDE auto-package importer
+var _ = embed.FS{}
+
 type ProxyManager struct {
 	sync.Mutex
 
@@ -48,7 +59,12 @@ func New(config *Config) *ProxyManager {
 	pm.ginEngine.GET("/logs/stream", pm.streamLogsHandler)
 	pm.ginEngine.GET("/logs/streamSSE", pm.streamLogsHandlerSSE)
 
-	pm.ginEngine.NoRoute(pm.proxyNoRouteHandler)
+	pm.ginEngine.GET("/upstream", pm.upstreamIndex)
+	pm.ginEngine.Any("/upstream/:model_id/*upstreamPath", pm.proxyToUpstream)
+
+	pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) {
+		c.Data(http.StatusOK, "image/x-icon", faviconData)
+	})
 
 	// Disable console color for testing
 	gin.DisableConsoleColor()
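For reference, a self-contained sketch of how gin resolves the two parameters in the new route; the wildcard value keeps its leading slash, which is why proxyToUpstream (further down) can assign it directly to URL.Path. The port is an arbitrary choice for the example.

package main

import (
	"net/http"

	"github.com/gin-gonic/gin"
)

func main() {
	r := gin.New()
	r.Any("/upstream/:model_id/*upstreamPath", func(c *gin.Context) {
		// e.g. GET /upstream/llama/v1/models -> model_id "llama", upstreamPath "/v1/models"
		c.String(http.StatusOK, "model=%s path=%s", c.Param("model_id"), c.Param("upstreamPath"))
	})
	r.Run(":8080") // assumed port, for illustration only
}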
@@ -86,7 +102,11 @@ func (pm *ProxyManager) stopProcesses() {
 
 func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
 	data := []interface{}{}
-	for id := range pm.config.Models {
+	for id, modelConfig := range pm.config.Models {
+		if modelConfig.Unlisted {
+			continue
+		}
+
 		data = append(data, map[string]interface{}{
 			"id":     id,
 			"object": "model",
@@ -113,7 +133,7 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
 	pm.Lock()
 	defer pm.Unlock()
 
-	// Check if requestedModel contains a /
+	// Check if requestedModel contains a PROFILE_SPLIT_CHAR
 	profileName, modelName := "", requestedModel
 	if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 {
 		profileName = requestedModel[:idx]
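The split convention in standalone form, with an assumed request name: "coding:qwen" selects profile "coding" and model "qwen", while a bare "qwen" leaves the profile empty. A minimal sketch mirroring the logic above:

package main

import (
	"fmt"
	"strings"
)

const PROFILE_SPLIT_CHAR = ":" // same constant as in the proxy package

func main() {
	requestedModel := "coding:qwen" // assumed example name
	profileName, modelName := "", requestedModel
	if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 {
		profileName = requestedModel[:idx]
		modelName = requestedModel[idx+1:]
	}
	fmt.Println(profileName, modelName) // coding qwen
}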
@@ -170,6 +190,48 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
 	return pm.currentProcesses[requestedProcessKey], nil
 }
 
+func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
+	requestedModel := c.Param("model_id")
+
+	if requestedModel == "" {
+		c.AbortWithError(http.StatusBadRequest, fmt.Errorf("model id required in path"))
+		return
+	}
+
+	if process, err := pm.swapModel(requestedModel); err != nil {
+		c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
+	} else {
+		// rewrite the path
+		c.Request.URL.Path = c.Param("upstreamPath")
+		process.ProxyRequest(c.Writer, c.Request)
+	}
+}
+
+func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
+	var html strings.Builder
+
+	html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
+
+	// Extract keys and sort them
+	var modelIDs []string
+	for modelID, modelConfig := range pm.config.Models {
+		if modelConfig.Unlisted {
+			continue
+		}
+
+		modelIDs = append(modelIDs, modelID)
+	}
+	sort.Strings(modelIDs)
+
+	// Iterate over sorted keys
+	for _, modelID := range modelIDs {
+		html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a></li>", modelID, modelID))
+	}
+	html.WriteString("</ul></body></html>")
+	c.Header("Content-Type", "text/html")
+	c.String(http.StatusOK, html.String())
+}
+
 func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
 	bodyBytes, err := io.ReadAll(c.Request.Body)
 	if err != nil {
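To make the rewrite concrete, here is a standalone sketch (httptest plus httputil.ReverseProxy, not the repo's Process.ProxyRequest) showing that the upstream server sees its native path once the /upstream/:model_id prefix is stripped; the model name "llama" is assumed:

package main

import (
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"
	"net/http/httputil"
	"net/url"
	"strings"
)

func main() {
	// stand-in for a model's upstream server
	upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintf(w, "upstream saw: %s", r.URL.Path)
	}))
	defer upstream.Close()

	target, _ := url.Parse(upstream.URL)
	proxy := httputil.NewSingleHostReverseProxy(target)

	// front door: strip "/upstream/<model_id>", the equivalent of
	// c.Request.URL.Path = c.Param("upstreamPath") in proxyToUpstream above
	front := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		rest := strings.SplitN(strings.TrimPrefix(r.URL.Path, "/upstream/"), "/", 2)
		r.URL.Path = "/"
		if len(rest) == 2 {
			r.URL.Path += rest[1]
		}
		proxy.ServeHTTP(w, r)
	}))
	defer front.Close()

	resp, err := http.Get(front.URL + "/upstream/llama/v1/models")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body)) // upstream saw: /v1/models
}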
@@ -201,16 +263,6 @@ func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
 	}
 }
 
-func (pm *ProxyManager) proxyNoRouteHandler(c *gin.Context) {
-	// since maps are unordered, just use the first available process if one exists
-	for _, process := range pm.currentProcesses {
-		process.ProxyRequest(c.Writer, c.Request)
-		return
-	}
-
-	c.AbortWithError(http.StatusBadRequest, fmt.Errorf("no strategy to handle request"))
-}
-
 func ProcessKeyName(groupName, modelName string) string {
 	return groupName + PROFILE_SPLIT_CHAR + modelName
 }
@@ -1,7 +1,6 @@
 package proxy
 
 import (
-	"embed"
 	"fmt"
 	"net/http"
 	"strings"
@@ -9,12 +8,6 @@ import (
 	"github.com/gin-gonic/gin"
 )
 
-//go:embed html/logs.html
-var logsHTML []byte
-
-// make sure embed is kept there by the IDE auto-package importer
-var _ = embed.FS{}
-
 func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
 
 	accept := c.GetHeader("Accept")