add more runner params

jmorganca 2024-05-28 00:02:01 -07:00
parent 72f3fe4b94
commit 20afaae020
2 changed files with 52 additions and 16 deletions

View File

@@ -31,6 +31,7 @@ package llama
 // #include "sampling_ext.h"
 import "C"
 import (
+	"errors"
 	"fmt"
 	"runtime"
 	"strings"
@@ -49,13 +50,14 @@ type ContextParams struct {
 	c C.struct_llama_context_params
 }
 
-func NewContextParams() ContextParams {
+func NewContextParams(numCtx int, threads int, flashAttention bool) ContextParams {
 	params := C.llama_context_default_params()
-	params.seed = C.uint(1234)
-	params.n_ctx = C.uint(2048)
+	params.n_ctx = C.uint(numCtx)
 	params.n_threads = C.uint(runtime.NumCPU())
 	params.n_threads_batch = params.n_threads
 	params.embeddings = C.bool(true)
+	params.flash_attn = C.bool(flashAttention)
+	params.n_threads = C.uint(threads)
 	return ContextParams{c: params}
 }
@@ -63,9 +65,10 @@ type ModelParams struct {
 	c C.struct_llama_model_params
 }
 
-func NewModelParams() ModelParams {
+func NewModelParams(numGpuLayers int, mainGpu int) ModelParams {
 	params := C.llama_model_default_params()
-	params.n_gpu_layers = 999
+	params.n_gpu_layers = C.int(numGpuLayers)
+	params.main_gpu = C.int32_t(mainGpu)
 	return ModelParams{c: params}
 }
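
With these signatures, model and context parameters are supplied by the caller instead of being hardcoded. A minimal sketch of the intended call pattern (not part of the commit; values are placeholders for the flag values the runner below passes in):

	// Sketch: driving the new constructors from caller-supplied values.
	params := llama.NewModelParams(32 /* numGpuLayers */, 0 /* mainGpu */)
	model := llama.LoadModelFromFile("model.gguf", params)

	ctxParams := llama.NewContextParams(2048 /* numCtx */, runtime.NumCPU() /* threads */, false /* flashAttention */)
	lc := llama.NewContextWithModel(model, ctxParams)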
@@ -155,6 +158,23 @@ func (m *Model) TokenIsEog(token int) bool {
 	return bool(C.llama_token_is_eog(m.c, C.llama_token(token)))
 }
 
+func (m *Model) ApplyLoraFromFile(loraPath string, scale float32, baseModelPath string, threads int) error {
+	cLoraPath := C.CString(loraPath)
+	defer C.free(unsafe.Pointer(cLoraPath))
+
+	var cBaseModelPath *C.char
+	if baseModelPath != "" {
+		cBaseModelPath = C.CString(baseModelPath)
+	}
+
+	code := int(C.llama_model_apply_lora_from_file(m.c, cLoraPath, C.float(scale), cBaseModelPath, C.int32_t(threads)))
+	if code != 0 {
+		return errors.New("error applying lora from file")
+	}
+
+	return nil
+}
+
 type Batch struct {
 	c C.struct_llama_batch
 }

View File

@@ -9,6 +9,7 @@ import (
 	"log/slog"
 	"net"
 	"net/http"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -73,6 +74,8 @@ type Server struct {
 	lc *llama.Context
 	cc *llama.ClipContext
 
+	batchSize int
+
 	// parallel is the number of parallel requests to handle
 	parallel int
@@ -154,7 +157,7 @@ func truncateStop(pieces []string, stop string) []string {
 }
 
 func (s *Server) run(ctx context.Context) {
-	batch := llama.NewBatch(512, 0, s.parallel)
+	batch := llama.NewBatch(s.batchSize, 0, s.parallel)
 	defer batch.Free()
 
 	// build up stop sequences as we recognize them
@@ -182,7 +185,7 @@ func (s *Server) run(ctx context.Context) {
 		for j, t := range seq.tokens {
 			// todo: make this n_batch
-			if j > 512 {
+			if j > s.batchSize {
 				break
 			}
@@ -207,10 +210,10 @@ func (s *Server) run(ctx context.Context) {
 			// don't sample prompt processing
 			if seq.prompt() {
-				if len(seq.tokens) < 512 {
+				if len(seq.tokens) < s.batchSize {
 					seq.tokens = []int{}
 				} else {
-					seq.tokens = seq.tokens[512:]
+					seq.tokens = seq.tokens[s.batchSize:]
 				}
 
 				continue
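
In effect, prompt processing now consumes tokens s.batchSize at a time rather than a fixed 512: when fewer than batchSize prompt tokens remain they are all submitted and the pending list is cleared, otherwise the first batchSize tokens are dropped and the rest wait for the next pass of run().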
@@ -412,14 +415,26 @@ func main() {
 	mpath := flag.String("model", "", "Path to model binary file")
 	ppath := flag.String("projector", "", "Path to projector binary file")
 	parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously")
+	batchSize := flag.Int("batch-size", 512, "Batch size")
+	nGpuLayers := flag.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
+	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
+	flashAttention := flag.Bool("flash-attention", false, "Enable flash attention")
+	numCtx := flag.Int("num-ctx", 2048, "Context (or KV cache) size")
+	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
+	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	flag.Parse()
 
 	// load the model
 	llama.BackendInit()
-	params := llama.NewModelParams()
+	params := llama.NewModelParams(*nGpuLayers, *mainGpu)
 	model := llama.LoadModelFromFile(*mpath, params)
-	ctxParams := llama.NewContextParams()
+
+	if *lpath != "" {
+		model.ApplyLoraFromFile(*lpath, 1.0, "", *threads)
+	}
+
+	ctxParams := llama.NewContextParams(*numCtx, *threads, *flashAttention)
 	lc := llama.NewContextWithModel(model, ctxParams)
 	if lc == nil {
 		panic("Failed to create context")
@@ -437,6 +452,7 @@ func main() {
 		model:     model,
 		lc:        lc,
 		cc:        cc,
+		batchSize: *batchSize,
 		parallel:  *parallel,
 		seqs:      make([]*Sequence, *parallel),
 	}
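
Taken together, the runner can now be launched with the new options. A hypothetical invocation (binary name and values are illustrative, not from the commit):

	./runner -model model.gguf -port 8080 -parallel 2 -batch-size 512 -num-ctx 4096 -n-gpu-layers 33 -main-gpu 0 -flash-attention -threads 8 -lora adapter.bin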