Add /upstream endpoint (#30)
* remove catch-all route to upstream proxy (it was broken anyway)
* add /upstream/:model_id to swap models and route to the upstream path
* add /upstream HTML endpoint that lists the available models
* add `unlisted` configuration option to omit a model from the /v1/models and /upstream lists
* add favicon.ico
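As a usage sketch (not part of the commit): the snippet below exercises the two new routes with Go's standard net/http client. The base URL and the model name "llama" are assumptions for illustration, not values defined by this change.

package main

import (
	"fmt"
	"io"
	"net/http"
)

func main() {
	base := "http://localhost:8080" // assumed llama-swap listen address

	// HTML index of available (listed) models
	resp, err := http.Get(base + "/upstream")
	if err != nil {
		panic(err)
	}
	body, _ := io.ReadAll(resp.Body)
	resp.Body.Close()
	fmt.Println(string(body))

	// anything under /upstream/:model_id/... swaps to that model and
	// forwards the remainder of the path to the upstream server
	resp, err = http.Get(base + "/upstream/llama/health") // "llama" and /health are assumed
	if err != nil {
		panic(err)
	}
	body, _ = io.ReadAll(resp.Body)
	resp.Body.Close()
	fmt.Println(string(body))
}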
README.md
@@ -8,7 +8,7 @@ llama-swap is an OpenAI API compatible server that gives you complete control ov
 Features:
 
 - ✅ Easy to deploy: single binary with no dependencies
-- ✅ Single yaml configuration file
+- ✅ Easy to config: single yaml file
 - ✅ On-demand model switching
 - ✅ Full control over server settings per model
 - ✅ OpenAI API support (`v1/completions` and `v1/chat/completions`)
@@ -16,7 +16,8 @@ Features:
 - ✅ Run multiple models at once with `profiles`
 - ✅ Remote log monitoring at `/log`
 - ✅ Automatic unloading of models from GPUs after timeout
-- ✅ Use any local server that provides an OpenAI compatible API (llama.cpp, vllm, tabblyAPI, etc)
+- ✅ Use any local OpenAI compatible server (llama.cpp, vllm, tabblyAPI, etc)
+- ✅ Direct access to proxied upstream HTTP server via `/upstream/:model_id`
 
 ## Releases
 
@@ -73,6 +74,12 @@ models:
       --model path/to/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf
     proxy: http://127.0.0.1:8999
 
+  # unlisted models do not show up in /v1/models or /upstream lists
+  # but they can still be requested as normal
+  "qwen-unlisted":
+    cmd: llama-server --port 9999 -m Llama-3.2-1B-Instruct-Q4_K_M.gguf -ngl 0
+    unlisted: true
+
 # profiles make it easy to managing multi model (and gpu) configurations.
 #
 # Tips:
@@ -33,6 +33,7 @@ models:
       - env1=hello
     cmd: build/simple-responder --port 8999
     proxy: http://127.0.0.1:8999
+    unlisted: true
 
   # use "none" to skip check. Caution this may cause some requests to fail
   # until the upstream server is ready for traffic
@@ -42,9 +43,11 @@ models:
   "broken":
     cmd: models/llama-server-osx --port 8999 -m models/doesnotexist.gguf
     proxy: http://127.0.0.1:8999
+    unlisted: true
   "broken_timeout":
     cmd: models/llama-server-osx --port 8999 -m models/qwen2.5-0.5b-instruct-q8_0.gguf
     proxy: http://127.0.0.1:9000
+    unlisted: true
 
 # creating a coding profile with models for code generation and general questions
 profiles:
misc/assets/favicon-raw.png (new binary file, 51 KiB; not shown)
@@ -16,6 +16,7 @@ type ModelConfig struct {
 	Env           []string `yaml:"env"`
 	CheckEndpoint string   `yaml:"checkEndpoint"`
 	UnloadAfter   int      `yaml:"ttl"`
+	Unlisted      bool     `yaml:"unlisted"`
 }
 
 func (m *ModelConfig) SanitizedCommand() ([]string, error) {
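A minimal sketch of how the new field behaves under gopkg.in/yaml.v3: `unlisted` parses from the config and defaults to false when omitted, so existing configs stay listed. The struct here is trimmed to a few fields for illustration.

package main

import (
	"fmt"

	"gopkg.in/yaml.v3"
)

// trimmed-down stand-in for ModelConfig; the real struct has more fields
type ModelConfig struct {
	Cmd      string `yaml:"cmd"`
	Proxy    string `yaml:"proxy"`
	Unlisted bool   `yaml:"unlisted"`
}

func main() {
	hidden := []byte("cmd: llama-server --port 9999\nunlisted: true\n")
	visible := []byte("cmd: llama-server --port 9999\n")

	var a, b ModelConfig
	if err := yaml.Unmarshal(hidden, &a); err != nil {
		panic(err)
	}
	if err := yaml.Unmarshal(visible, &b); err != nil {
		panic(err)
	}
	fmt.Println(a.Unlisted, b.Unlisted) // true false — omitting the key means listed
}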
proxy/html/favicon.ico (new binary file, 15 KiB; not shown)
@@ -2,10 +2,12 @@ package proxy
 
 import (
 	"bytes"
+	"embed"
 	"encoding/json"
 	"fmt"
 	"io"
 	"net/http"
+	"sort"
 	"strconv"
 	"strings"
 	"sync"
@@ -18,6 +20,15 @@ const (
 	PROFILE_SPLIT_CHAR = ":"
 )
 
+//go:embed html/favicon.ico
+var faviconData []byte
+
+//go:embed html/logs.html
+var logsHTML []byte
+
+// make sure embed is kept there by the IDE auto-package importer
+var _ = embed.FS{}
+
 type ProxyManager struct {
 	sync.Mutex
 
@@ -48,7 +59,12 @@ func New(config *Config) *ProxyManager {
 	pm.ginEngine.GET("/logs/stream", pm.streamLogsHandler)
 	pm.ginEngine.GET("/logs/streamSSE", pm.streamLogsHandlerSSE)
 
-	pm.ginEngine.NoRoute(pm.proxyNoRouteHandler)
+	pm.ginEngine.GET("/upstream", pm.upstreamIndex)
+	pm.ginEngine.Any("/upstream/:model_id/*upstreamPath", pm.proxyToUpstream)
+
+	pm.ginEngine.GET("/favicon.ico", func(c *gin.Context) {
+		c.Data(http.StatusOK, "image/x-icon", faviconData)
+	})
 
 	// Disable console color for testing
 	gin.DisableConsoleColor()
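For reference, a self-contained sketch of how gin resolves the two parameters in the new route; the wildcard value keeps its leading slash, which is why proxyToUpstream (further down) can assign it directly to URL.Path. The port is an arbitrary choice for the example.

package main

import (
	"net/http"

	"github.com/gin-gonic/gin"
)

func main() {
	r := gin.New()
	r.Any("/upstream/:model_id/*upstreamPath", func(c *gin.Context) {
		// e.g. GET /upstream/llama/v1/models -> model_id "llama", upstreamPath "/v1/models"
		c.String(http.StatusOK, "model=%s path=%s", c.Param("model_id"), c.Param("upstreamPath"))
	})
	r.Run(":8080") // assumed port, for illustration only
}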
@@ -86,7 +102,11 @@ func (pm *ProxyManager) stopProcesses() {
 
 func (pm *ProxyManager) listModelsHandler(c *gin.Context) {
 	data := []interface{}{}
-	for id := range pm.config.Models {
+	for id, modelConfig := range pm.config.Models {
+		if modelConfig.Unlisted {
+			continue
+		}
+
 		data = append(data, map[string]interface{}{
 			"id":     id,
 			"object": "model",
@@ -113,7 +133,7 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
 	pm.Lock()
 	defer pm.Unlock()
 
-	// Check if requestedModel contains a /
+	// Check if requestedModel contains a PROFILE_SPLIT_CHAR
 	profileName, modelName := "", requestedModel
 	if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 {
 		profileName = requestedModel[:idx]
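The split convention in standalone form, with an assumed request name: "coding:qwen" selects profile "coding" and model "qwen", while a bare "qwen" leaves the profile empty. A minimal sketch mirroring the logic above:

package main

import (
	"fmt"
	"strings"
)

const PROFILE_SPLIT_CHAR = ":" // same constant as in the proxy package

func main() {
	requestedModel := "coding:qwen" // assumed example name
	profileName, modelName := "", requestedModel
	if idx := strings.Index(requestedModel, PROFILE_SPLIT_CHAR); idx != -1 {
		profileName = requestedModel[:idx]
		modelName = requestedModel[idx+1:]
	}
	fmt.Println(profileName, modelName) // coding qwen
}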
@@ -170,6 +190,48 @@ func (pm *ProxyManager) swapModel(requestedModel string) (*Process, error) {
 	return pm.currentProcesses[requestedProcessKey], nil
 }
 
+func (pm *ProxyManager) proxyToUpstream(c *gin.Context) {
+	requestedModel := c.Param("model_id")
+
+	if requestedModel == "" {
+		c.AbortWithError(http.StatusBadRequest, fmt.Errorf("model id required in path"))
+		return
+	}
+
+	if process, err := pm.swapModel(requestedModel); err != nil {
+		c.AbortWithError(http.StatusNotFound, fmt.Errorf("unable to swap to model, %s", err.Error()))
+	} else {
+		// rewrite the path
+		c.Request.URL.Path = c.Param("upstreamPath")
+		process.ProxyRequest(c.Writer, c.Request)
+	}
+}
+
+func (pm *ProxyManager) upstreamIndex(c *gin.Context) {
+	var html strings.Builder
+
+	html.WriteString("<!doctype HTML>\n<html><body><h1>Available Models</h1><ul>")
+
+	// Extract keys and sort them
+	var modelIDs []string
+	for modelID, modelConfig := range pm.config.Models {
+		if modelConfig.Unlisted {
+			continue
+		}
+
+		modelIDs = append(modelIDs, modelID)
+	}
+	sort.Strings(modelIDs)
+
+	// Iterate over sorted keys
+	for _, modelID := range modelIDs {
+		html.WriteString(fmt.Sprintf("<li><a href=\"/upstream/%s\">%s</a></li>", modelID, modelID))
+	}
+	html.WriteString("</ul></body></html>")
+	c.Header("Content-Type", "text/html")
+	c.String(http.StatusOK, html.String())
+}
+
 func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
 	bodyBytes, err := io.ReadAll(c.Request.Body)
 	if err != nil {
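To make the rewrite concrete, here is a standalone sketch (httptest plus httputil.ReverseProxy, not the repo's Process.ProxyRequest) showing that the upstream server sees its native path once the /upstream/:model_id prefix is stripped; the model name "llama" is assumed:

package main

import (
	"fmt"
	"io"
	"net/http"
	"net/http/httptest"
	"net/http/httputil"
	"net/url"
	"strings"
)

func main() {
	// stand-in for a model's upstream server
	upstream := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprintf(w, "upstream saw: %s", r.URL.Path)
	}))
	defer upstream.Close()

	target, _ := url.Parse(upstream.URL)
	proxy := httputil.NewSingleHostReverseProxy(target)

	// front door: strip "/upstream/<model_id>", the equivalent of
	// c.Request.URL.Path = c.Param("upstreamPath") in proxyToUpstream above
	front := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		rest := strings.SplitN(strings.TrimPrefix(r.URL.Path, "/upstream/"), "/", 2)
		r.URL.Path = "/"
		if len(rest) == 2 {
			r.URL.Path += rest[1]
		}
		proxy.ServeHTTP(w, r)
	}))
	defer front.Close()

	resp, err := http.Get(front.URL + "/upstream/llama/v1/models")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	body, _ := io.ReadAll(resp.Body)
	fmt.Println(string(body)) // upstream saw: /v1/models
}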
@@ -201,16 +263,6 @@ func (pm *ProxyManager) proxyChatRequestHandler(c *gin.Context) {
 	}
 }
 
-func (pm *ProxyManager) proxyNoRouteHandler(c *gin.Context) {
-	// since maps are unordered, just use the first available process if one exists
-	for _, process := range pm.currentProcesses {
-		process.ProxyRequest(c.Writer, c.Request)
-		return
-	}
-
-	c.AbortWithError(http.StatusBadRequest, fmt.Errorf("no strategy to handle request"))
-}
-
 func ProcessKeyName(groupName, modelName string) string {
 	return groupName + PROFILE_SPLIT_CHAR + modelName
 }
@@ -1,7 +1,6 @@
 package proxy
 
 import (
-	"embed"
 	"fmt"
 	"net/http"
 	"strings"
@@ -9,12 +8,6 @@ import (
 	"github.com/gin-gonic/gin"
 )
 
-//go:embed html/logs.html
-var logsHTML []byte
-
-// make sure embed is kept there by the IDE auto-package importer
-var _ = embed.FS{}
-
 func (pm *ProxyManager) sendLogsHandlers(c *gin.Context) {
 
 	accept := c.GetHeader("Accept")