first commit
.gitignore (vendored, new file, +3)
@@ -0,0 +1,3 @@
.aider*
.env
build/
bin/simple-responder/simple-responder.go (new file, +37)
@@ -0,0 +1,37 @@
package main

import (
    "flag"
    "fmt"
    "net/http"
)

func main() {
    // Define a command-line flag for the port
    port := flag.String("port", "8080", "port to listen on")

    // Define a command-line flag for the response message
    responseMessage := flag.String("respond", "hi", "message to respond with")

    flag.Parse() // Parse the command-line flags

    // Set up the handler function using the provided response message
    http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
        fmt.Fprintln(w, *responseMessage)
    })

    // Set up the /health endpoint handler function
    http.HandleFunc("/health", func(w http.ResponseWriter, r *http.Request) {
        w.Header().Set("Content-Type", "application/json")
        response := `{"status": "ok"}`
        w.Write([]byte(response))
    })

    address := ":" + *port // Address with the specified port
    fmt.Printf("Server is listening on port %s\n", *port)

    // Start the server and log any error if it occurs
    if err := http.ListenAndServe(address, nil); err != nil {
        fmt.Printf("Error starting server: %s\n", err)
    }
}
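The responder serves any path with a fixed message and answers /health with a 200 OK, the same check the proxy polls, so it can stand in for a real llama-server during testing. A quick smoke test, assuming Go is installed and the repo root is the working directory (port and message values are arbitrary):

$ go run ./bin/simple-responder -port 8999 -respond "hello"
$ curl http://127.0.0.1:8999/health
{"status": "ok"}

The two commands need separate terminals, since the server blocks.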
config.example.yaml (new file, +7)
@@ -0,0 +1,7 @@
models:
  "llama":
    cmd: "models/llama-server-osx --port 8999 -m models/Llama-3.2-1B-Instruct-Q4_K_M.gguf"
    proxy: "http://127.0.0.1:8999"
  "qwen":
    cmd: "models/llama-server-osx --port 8999 -m models/Qwen2.5-1.5B-Instruct-Q4_K_M.gguf"
    proxy: "http://127.0.0.1:8999"
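Each key under `models:` is matched against the "model" field of incoming requests; `cmd` is the command the proxy launches for that model, and `proxy` is the base URL requests are forwarded to, so the port in `proxy` must agree with the `--port` flag in `cmd`. Both entries can reuse port 8999 because only one server runs at a time. A sketch of a third entry, with a hypothetical model file that is not part of this commit:

  "smollm":
    cmd: "models/llama-server-osx --port 8999 -m models/SmolLM-360M-Instruct-Q4_K_M.gguf"
    proxy: "http://127.0.0.1:8999"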
go.mod (new file, +5)
@@ -0,0 +1,5 @@
module golang-llama-cpp-proxy

go 1.23.0

require gopkg.in/yaml.v3 v3.0.1
go.sum (new file, +3)
@@ -0,0 +1,3 @@
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
llama-proxy.go (new file, +215)
@@ -0,0 +1,215 @@
package main

import (
    "bytes"
    "context"
    "encoding/json"
    "flag"
    "fmt"
    "io"
    "net/http"
    "os"
    "os/exec"
    "strings"
    "sync"
    "syscall"
    "time"

    "gopkg.in/yaml.v3"
)

type ModelConfig struct {
    Cmd   string `yaml:"cmd"`
    Proxy string `yaml:"proxy"`
}

type Config struct {
    Models map[string]ModelConfig `yaml:"models"`
}

type ServiceState struct {
    sync.Mutex
    currentCmd   *exec.Cmd
    currentModel string
}

func loadConfig(path string) (*Config, error) {
    data, err := os.ReadFile(path)
    if err != nil {
        return nil, err
    }

    var config Config
    err = yaml.Unmarshal(data, &config)
    if err != nil {
        return nil, err
    }

    return &config, nil
}

func startService(command string) (*exec.Cmd, error) {
    args := strings.Fields(command)
    cmd := exec.Command(args[0], args[1:]...)

    // write it to the stdout/stderr of the proxy
    cmd.Stdout = os.Stdout
    cmd.Stderr = os.Stderr

    err := cmd.Start()
    if err != nil {
        return nil, err
    }

    return cmd, nil
}

func checkHealthEndpoint(client *http.Client, healthURL string, maxDuration time.Duration) error {
    startTime := time.Now()
    for {
        req, err := http.NewRequest("GET", healthURL, nil)
        if err != nil {
            return err
        }

        // Set a short per-request timeout; cancel explicitly rather than
        // deferring, so contexts do not pile up across loop iterations
        ctx, cancel := context.WithTimeout(req.Context(), 250*time.Millisecond)
        req = req.WithContext(ctx)

        // Execute the request with the context
        resp, err := client.Do(req)
        if err != nil {
            cancel()

            // Check elapsed time before retrying
            if time.Since(startTime) >= maxDuration {
                return fmt.Errorf("failed to get a healthy response from: %s", healthURL)
            }

            // Wait a second before retrying
            time.Sleep(time.Second)
            continue
        }

        // Close the response body and release the context
        resp.Body.Close()
        cancel()

        // Check if we got a 200 OK response
        if resp.StatusCode == http.StatusOK {
            return nil // Health check succeeded
        }

        // Check elapsed time before retrying
        if time.Since(startTime) >= maxDuration {
            return fmt.Errorf("failed to get a healthy response from: %s", healthURL)
        }

        // Wait a second before retrying
        time.Sleep(time.Second)
    }
}

func proxyRequest(w http.ResponseWriter, r *http.Request, config *Config, state *ServiceState) {
    client := &http.Client{}

    // Read the original request body
    bodyBytes, err := io.ReadAll(r.Body)
    if err != nil {
        http.Error(w, "Unable to read request body", http.StatusBadRequest)
        return
    }

    var requestBody map[string]interface{}
    if err := json.Unmarshal(bodyBytes, &requestBody); err != nil {
        http.Error(w, "Invalid JSON", http.StatusBadRequest)
        return
    }

    model, ok := requestBody["model"].(string)
    if !ok {
        http.Error(w, "Missing or invalid 'model' key", http.StatusBadRequest)
        return
    }

    modelConfig, ok := config.Models[model]
    if !ok {
        http.Error(w, "Model not found in configuration", http.StatusNotFound)
        return
    }

    state.Lock()
    defer state.Unlock()

    if state.currentModel != model {
        if state.currentCmd != nil {
            state.currentCmd.Process.Signal(syscall.SIGTERM)
        }
        state.currentCmd, err = startService(modelConfig.Cmd)
        if err != nil {
            http.Error(w, err.Error(), http.StatusInternalServerError)
            return
        }
        state.currentModel = model

        // Check the /health endpoint
        healthURL := modelConfig.Proxy + "/health"
        err = checkHealthEndpoint(client, healthURL, 30*time.Second)
        if err != nil {
            http.Error(w, err.Error(), http.StatusServiceUnavailable)
            return
        }
    }

    req, err := http.NewRequest(r.Method, modelConfig.Proxy+r.URL.String(), bytes.NewReader(bodyBytes))
    if err != nil {
        http.Error(w, err.Error(), http.StatusInternalServerError)
        return
    }

    req.Header = r.Header
    resp, err := client.Do(req)
    if err != nil {
        http.Error(w, err.Error(), http.StatusBadGateway)
        return
    }
    defer resp.Body.Close()

    for k, vv := range resp.Header {
        for _, v := range vv {
            w.Header().Add(k, v)
        }
    }
    w.WriteHeader(resp.StatusCode)

    io.Copy(w, resp.Body)
}

func main() {
    // Define command-line flags for the config file and listen address
    configPath := flag.String("config", "config.yaml", "config file name")
    listenStr := flag.String("listen", ":8080", "listen ip/port")

    flag.Parse() // Parse the command-line flags

    config, err := loadConfig(*configPath)
    if err != nil {
        fmt.Printf("Error loading config: %v\n", err)
        os.Exit(1)
    }

    serviceState := &ServiceState{}

    http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
        if r.URL.Path == "/v1/chat/completions" {
            proxyRequest(w, r, config, serviceState)
        } else {
            http.Error(w, "Endpoint not supported", http.StatusNotFound)
        }
    })

    fmt.Printf("Proxy server started on %s\n", *listenStr)
    if err := http.ListenAndServe(*listenStr, nil); err != nil {
        fmt.Printf("Error starting server: %v\n", err)
        os.Exit(1)
    }
}
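Putting the pieces together, a sketch of a first run, assuming a config.yaml based on config.example.yaml (the chat payload shape assumes llama-server's OpenAI-compatible API; it is not defined by this commit):

$ go run . -config config.yaml -listen :8080
$ curl http://localhost:8080/v1/chat/completions -d '{"model": "llama", "messages": [{"role": "user", "content": "hi"}]}'

The first request launches the `llama` entry's cmd and blocks until its /health endpoint returns 200 OK, giving up after 30 seconds.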
models/.gitignore (vendored, new file, +3)
@@ -0,0 +1,3 @@
*
!.gitignore
!README.md
models/README.md (new file, +7)
@@ -0,0 +1,7 @@
TODO improve these docs

1. Download a llama-server suitable for your architecture
1. Fetch some small models for testing / swapping between
   - `huggingface-cli download bartowski/Qwen2.5-1.5B-Instruct-GGUF --include "Qwen2.5-1.5B-Instruct-Q4_K_M.gguf" --local-dir ./`
   - `huggingface-cli download bartowski/Llama-3.2-1B-Instruct-GGUF --include "Llama-3.2-1B-Instruct-Q4_K_M.gguf" --local-dir ./`
1. Create a new config.yaml file (see `config.example.yaml`) pointing to the models
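Once both models are downloaded and config.yaml points at them, swapping is driven entirely by the "model" value in each request; a sketch (payload shape as above, assuming llama-server's OpenAI-compatible chat API):

$ curl http://localhost:8080/v1/chat/completions -d '{"model": "qwen", "messages": [{"role": "user", "content": "hi"}]}'
$ curl http://localhost:8080/v1/chat/completions -d '{"model": "llama", "messages": [{"role": "user", "content": "hi"}]}'

The second request sends SIGTERM to the running qwen server, starts the llama command, and waits for its health check before forwarding.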