add more runner params
This commit is contained in: parent 72f3fe4b94, commit 20afaae020
@@ -31,6 +31,7 @@ package llama
 // #include "sampling_ext.h"
 import "C"
 import (
+	"errors"
 	"fmt"
 	"runtime"
 	"strings"
@@ -49,13 +50,14 @@ type ContextParams struct {
 	c C.struct_llama_context_params
 }

-func NewContextParams() ContextParams {
+func NewContextParams(numCtx int, threads int, flashAttention bool) ContextParams {
 	params := C.llama_context_default_params()
-	params.seed = C.uint(1234)
-	params.n_ctx = C.uint(2048)
+	params.n_ctx = C.uint(numCtx)
 	params.n_threads = C.uint(runtime.NumCPU())
 	params.n_threads_batch = params.n_threads
 	params.embeddings = C.bool(true)
+	params.flash_attn = C.bool(flashAttention)
+	params.n_threads = C.uint(threads)
 	return ContextParams{c: params}
 }

@@ -63,9 +65,10 @@ type ModelParams struct {
 	c C.struct_llama_model_params
 }

-func NewModelParams() ModelParams {
+func NewModelParams(numGpuLayers int, mainGpu int) ModelParams {
 	params := C.llama_model_default_params()
-	params.n_gpu_layers = 999
+	params.n_gpu_layers = C.int(numGpuLayers)
+	params.main_gpu = C.int32_t(mainGpu)
 	return ModelParams{c: params}
 }

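Taken together, the two constructors above now thread runner-level settings straight into llama.cpp's default parameter structs instead of hardcoding them. A minimal usage sketch, not part of this commit; the import path and the literal values are assumptions, only the function names come from the diff:

package main

import (
	"runtime"

	"github.com/ollama/ollama/llama"
)

func main() {
	llama.BackendInit()

	// Offload 33 layers to GPU 0 (illustrative values, not defaults from this change).
	params := llama.NewModelParams(33, 0)
	model := llama.LoadModelFromFile("/path/to/model.gguf", params)

	// 2048-token context, all CPU threads, flash attention disabled.
	ctxParams := llama.NewContextParams(2048, runtime.NumCPU(), false)
	if lc := llama.NewContextWithModel(model, ctxParams); lc == nil {
		panic("failed to create context")
	}
	_ = model
}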
@@ -155,6 +158,23 @@ func (m *Model) TokenIsEog(token int) bool {
 	return bool(C.llama_token_is_eog(m.c, C.llama_token(token)))
 }

+func (m *Model) ApplyLoraFromFile(loraPath string, scale float32, baseModelPath string, threads int) error {
+	cLoraPath := C.CString(loraPath)
+	defer C.free(unsafe.Pointer(cLoraPath))
+
+	var cBaseModelPath *C.char
+	if baseModelPath != "" {
+		cBaseModelPath = C.CString(baseModelPath)
+	}
+
+	code := int(C.llama_model_apply_lora_from_file(m.c, cLoraPath, C.float(scale), cBaseModelPath, C.int32_t(threads)))
+	if code != 0 {
+		return errors.New("error applying lora from file")
+	}
+
+	return nil
+}
+
 type Batch struct {
 	c C.struct_llama_batch
 }
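The new ApplyLoraFromFile helper wraps llama_model_apply_lora_from_file: the base model path may be left empty, and any non-zero return code surfaces as a generic error. A brief calling sketch, not from the diff; it assumes model is a *llama.Model already returned by LoadModelFromFile and that the adapter path is a placeholder:

	// Apply a LoRA adapter at full scale using all available CPU threads (placeholder path).
	if err := model.ApplyLoraFromFile("/path/to/adapter.bin", 1.0, "", runtime.NumCPU()); err != nil {
		log.Fatal(err)
	}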
@@ -9,6 +9,7 @@ import (
 	"log/slog"
 	"net"
 	"net/http"
+	"runtime"
 	"strconv"
 	"strings"
 	"sync"
@@ -73,6 +74,8 @@ type Server struct {
 	lc *llama.Context
 	cc *llama.ClipContext

+	batchSize int
+
 	// parallel is the number of parallel requests to handle
 	parallel int

@@ -154,7 +157,7 @@ func truncateStop(pieces []string, stop string) []string {
 }

 func (s *Server) run(ctx context.Context) {
-	batch := llama.NewBatch(512, 0, s.parallel)
+	batch := llama.NewBatch(s.batchSize, 0, s.parallel)
 	defer batch.Free()

 	// build up stop sequences as we recognize them
@@ -182,7 +185,7 @@ func (s *Server) run(ctx context.Context) {

 		for j, t := range seq.tokens {
 			// todo: make this n_batch
-			if j > 512 {
+			if j > s.batchSize {
 				break
 			}

@@ -207,10 +210,10 @@ func (s *Server) run(ctx context.Context) {

 			// don't sample prompt processing
 			if seq.prompt() {
-				if len(seq.tokens) < 512 {
+				if len(seq.tokens) < s.batchSize {
 					seq.tokens = []int{}
 				} else {
-					seq.tokens = seq.tokens[512:]
+					seq.tokens = seq.tokens[s.batchSize:]
 				}

 				continue
@@ -412,14 +415,26 @@ func main() {
 	mpath := flag.String("model", "", "Path to model binary file")
 	ppath := flag.String("projector", "", "Path to projector binary file")
 	parallel := flag.Int("parallel", 1, "Number of sequences to handle simultaneously")
+	batchSize := flag.Int("batch-size", 512, "Batch size")
+	nGpuLayers := flag.Int("n-gpu-layers", 0, "Number of layers to offload to GPU")
+	mainGpu := flag.Int("main-gpu", 0, "Main GPU")
+	flashAttention := flag.Bool("flash-attention", false, "Enable flash attention")
+	numCtx := flag.Int("num-ctx", 2048, "Context (or KV cache) size")
+	lpath := flag.String("lora", "", "Path to lora layer file")
 	port := flag.Int("port", 8080, "Port to expose the server on")
+	threads := flag.Int("threads", runtime.NumCPU(), "Number of threads to use during generation")
 	flag.Parse()

 	// load the model
 	llama.BackendInit()
-	params := llama.NewModelParams()
+	params := llama.NewModelParams(*nGpuLayers, *mainGpu)
 	model := llama.LoadModelFromFile(*mpath, params)
-	ctxParams := llama.NewContextParams()
+
+	if *lpath != "" {
+		model.ApplyLoraFromFile(*lpath, 1.0, "", *threads)
+	}
+
+	ctxParams := llama.NewContextParams(*numCtx, *threads, *flashAttention)
 	lc := llama.NewContextWithModel(model, ctxParams)
 	if lc == nil {
 		panic("Failed to create context")
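With the hunk above, the runner now exposes its batch size, GPU offload, flash attention, context size, LoRA path, and thread count as command-line flags. For illustration only, a Go sketch that launches the runner with the new flags; the binary name, paths, and values are assumptions, not taken from the commit:

package main

import (
	"log"
	"os/exec"
)

func main() {
	// Start the runner with the new parameters; Go's flag package accepts --name or -name.
	cmd := exec.Command("./runner",
		"--model", "/path/to/model.gguf",
		"--num-ctx", "4096",
		"--batch-size", "512",
		"--n-gpu-layers", "33",
		"--main-gpu", "0",
		"--threads", "8",
		"--flash-attention",
		"--lora", "/path/to/adapter.bin",
		"--port", "8080",
	)
	cmd.Stdout = log.Writer()
	cmd.Stderr = log.Writer()
	if err := cmd.Run(); err != nil {
		log.Fatal(err)
	}
}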
@@ -434,11 +449,12 @@ func main() {
 	}

 	server := &Server{
 		model: model,
 		lc:    lc,
 		cc:    cc,
-		parallel: *parallel,
-		seqs:     make([]*Sequence, *parallel),
+		batchSize: *batchSize,
+		parallel:  *parallel,
+		seqs:      make([]*Sequence, *parallel),
 	}

 	server.cond = sync.NewCond(&server.mu)