Add /upstream endpoint (#30)

* remove catch-all route to the upstream proxy (it was broken anyway)
* add /upstream/:model_id to swap models and route requests to the upstream path (usage sketch below)
* add /upstream HTML endpoint that lists the available models
* add `unlisted` configuration option to omit a model from the /v1/models and /upstream lists
* add favicon.ico
Author: Benson Wong
Date: 2024-12-17 14:37:44 -08:00 (committed by GitHub)
Parent: 7183f6b43d
Commit: 891f6a5b5a
7 changed files with 78 additions and 22 deletions
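As a rough usage sketch of the new endpoint (not part of this commit): the address localhost:8080, the model id "llama", and the upstream /health route below are all assumptions; substitute values from your own config.

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	// Assumed llama-swap address and a hypothetical model id "llama".
	// GET /upstream/llama/health swaps to "llama" if needed, then proxies
	// the request to the upstream server's /health endpoint.
	resp, err := http.Get("http://localhost:8080/upstream/llama/health")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		panic(err)
	}
	fmt.Println(resp.Status, string(body))
}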


@@ -8,7 +8,7 @@ llama-swap is an OpenAI API compatible server that gives you complete control ov
 Features:
 - ✅ Easy to deploy: single binary with no dependencies
-- ✅ Single yaml configuration file
+- ✅ Easy to config: single yaml file
 - ✅ On-demand model switching
 - ✅ Full control over server settings per model
 - ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
@@ -16,7 +16,8 @@ Features:
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/log`
 - ✅ Automatic unloading of models from GPUs after timeout
-- ✅ Use any local server that provides an OpenAI compatible API (llama.cpp, vllm, tabbyAPI, etc)
+- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabbyAPI, etc)
+- ✅ Direct access to proxied upstream HTTP server via `/upstream/:model_id`
 
 ## Releases
@@ -73,6 +74,12 @@ models:
       --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
     proxy: http://127.0.0.1:8999
 
+  # unlisted models do not show up in /v1/models or /upstream lists
+  # but they can still be requested as normal
+  "qwen-unlisted":
+    cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
+    unlisted: true
+
 # profiles make it easy to manage multi model (and gpu) configurations.
 #
 # Tips:
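To make the `unlisted` behavior above concrete, a minimal sketch (the address localhost:8080 is an assumption; the model id mirrors the example config): the model is absent from the /v1/models listing, yet still serves requests.

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	base := "http://localhost:8080" // assumed llama-swap address

	// "qwen-unlisted" will not appear in this listing...
	resp, err := http.Get(base + "/v1/models")
	if err != nil {
		panic(err)
	}
	models, _ := io.ReadAll(resp.Body)
	resp.Body.Close()
	fmt.Println(string(models))

	// ...but it can still be requested as normal.
	payload := []byte(`{"model": "qwen-unlisted", "messages": [{"role": "user", "content": "hi"}]}`)
	resp, err = http.Post(base+"/v1/chat/completions", "application/json", bytes.NewReader(payload))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	fmt.Println("chat status:", resp.Status)
}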


@@ -33,6 +33,7 @@ models:
       - env1=hello
     cmd: build/simple-responder --port 8999
     proxy: http://127.0.0.1:8999
+    unlisted: true
 
   # use "none" to skip check. Caution this may cause some requests to fail
   # until the upstream server is ready for traffic
@@ -42,9 +43,11 @@ models:
   "broken":
     cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
     proxy: http://127.0.0.1:8999
+    unlisted: true
 
   "broken_timeout":
     cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
     proxy: http://127.0.0.1:9000
+    unlisted: true
 
 # creating a coding profile with models for code generation and general questions
 profiles:

BIN misc/assets/favicon-raw.png (new file, 51 KiB)


@@ -16,6 +16,7 @@ type ModelConfig struct {
 	Env           []string `yaml:"env"`
 	CheckEndpoint string   `yaml:"checkEndpoint"`
 	UnloadAfter   int      `yaml:"ttl"`
+	Unlisted      bool     `yaml:"unlisted"`
 }
 
 func (m *ModelConfig) SanitizedCommand() ([]string, error) {
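To show how the new field is picked up from configuration, a minimal decoding sketch; it assumes a YAML library such as gopkg.in/yaml.v3 (the repo's actual dependency is not shown in this diff) and mirrors only the fields visible above.

package main

import (
	"fmt"

	"gopkg.in/yaml.v3" // assumed YAML library, not confirmed by this diff
)

// illustrative mirror of the fields shown in the hunk above
type modelConfig struct {
	Env           []string `yaml:"env"`
	CheckEndpoint string   `yaml:"checkEndpoint"`
	UnloadAfter   int      `yaml:"ttl"`
	Unlisted      bool     `yaml:"unlisted"`
}

func main() {
	raw := []byte("checkEndpoint: /health\nttl: 60\nunlisted: true\n")

	var mc modelConfig
	if err := yaml.Unmarshal(raw, &mc); err != nil {
		panic(err)
	}
	fmt.Printf("%+v\n", mc) // {Env:[] CheckEndpoint:/health UnloadAfter:60 Unlisted:true}
}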

BIN proxy/html/favicon.ico (new file, 15 KiB)


@@ -2,10 +2,12 @@ package proxy
 
 import (
 	"bytes"
+	"embed"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -18,6 +20,15 @@ const (
 	PROFILE_SPLIT_CHAR = ":"
 )
 
+//go:embed html/favicon.ico
+var faviconData []byte
+
+//go:embed html/logs.html
+var logsHTML []byte
+
+// make sure embed is kept there by the IDE auto-package importer
+var _ = embed.FS{}
+
 type ProxyManager struct {
 	sync.Mutex
@@ -48,7 +59,12 @@ func New(config *Config) *ProxyManager {
 	pm.ginEngine.GET("/logs/stream", pm.streamLogsHandler)
 	pm.ginEngine.GET("/logs/streamSSE", pm.streamLogsHandlerSSE)
-	pm.ginEngine.NoRoute(pm.proxyNoRouteHandler)
+
+	pm.ginEngine.GET("/upstream", pm.upstreamIndex)
+	pm.ginEngine.Any("/upstream/:model_id/*upstreamPath", pm.proxyToUpstream)
+
+	pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) {
+		c.Data(http.StatusOK, "image/x-icon", faviconData)
+	})
 
 	// Disable console color for testing
 	gin.DisableConsoleColor()
@@ -86,7 +102,11 @@ func (pm *ProxyManager) stopProcesses() {
 func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
 	data := []interface{}{}
 
-	for id := range pm.config.Models {
+	for id, modelConfig := range pm.config.Models {
+		if modelConfig.Unlisted {
+			continue
+		}
+
 		data = append(data, map[string]interface{}{
 			"id":     id,
 			"object": "model",
@@ -113,7 +133,7 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
 	pm.Lock()
 	defer pm.Unlock()
 
-	// Check if requestedModel contains a /
+	// Check if requestedModel contains a PROFILE_SPLIT_CHAR
 	profileName, modelName := "", requestedModel
 	if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 {
 		profileName = requestedModel[:idx]
@@ -170,6 +190,48 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
 	return pm.currentProcesses[requestedProcessKey], nil
 }
 
+func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
+	requestedModel := c.Param("model_id")
+
+	if requestedModel == "" {
+		c.AbortWithError(http.StatusBadRequest, fmt.Errorf("model id required in path"))
+		return
+	}
+
+	if process, err := pm.swapModel(requestedModel); err != nil {
+		c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
+	} else {
+		// rewrite the path
+		c.Request.URL.Path = c.Param("upstreamPath")
+		process.ProxyRequest(c.Writer, c.Request)
+	}
+}
+
+func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
+	var html strings.Builder
+
+	html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
+
+	// Extract keys and sort them
+	var modelIDs []string
+	for modelID, modelConfig := range pm.config.Models {
+		if modelConfig.Unlisted {
+			continue
+		}
+		modelIDs = append(modelIDs, modelID)
+	}
+	sort.Strings(modelIDs)
+
+	// Iterate over sorted keys
+	for _, modelID := range modelIDs {
+		html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a></li>", modelID, modelID))
+	}
+	html.WriteString("</ul></body></html>")
+	c.Header("Content-Type", "text/html")
+	c.String(http.StatusOK, html.String())
+}
+
 func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
 	bodyBytes, err := io.ReadAll(c.Request.Body)
 	if err != nil {
@@ -201,16 +263,6 @@ func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
 	}
 }
 
-func (pm *ProxyManager) proxyNoRouteHandler(c *gin.Context) {
-	// since maps are unordered, just use the first available process if one exists
-	for _, process := range pm.currentProcesses {
-		process.ProxyRequest(c.Writer, c.Request)
-		return
-	}
-
-	c.AbortWithError(http.StatusBadRequest, fmt.Errorf("no strategy to handle request"))
-}
-
 func ProcessKeyName(groupName, modelName string) string {
 	return groupName + PROFILE_SPLIT_CHAR + modelName
 }


@@ -1,7 +1,6 @@
 package proxy
 
 import (
-	"embed"
 	"fmt"
 	"net/http"
 	"strings"
@@ -9,12 +8,6 @@ import (
 	"github.com/gin-gonic/gin"
 )
 
-//go:embed html/logs.html
-var logsHTML []byte
-
-// make sure embed is kept there by the IDE auto-package importer
-var _ = embed.FS{}
-
 func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
 	accept := c.GetHeader("Accept")