mirror of https://github.com/tcsenpai/ollama.git
synced 2025-06-07 11:45:21 +00:00
Increase minimum CUDA memory allocation overhead and fix minimum overhead for multi-gpu (#1896)
* increase minimum cuda overhead and fix minimum overhead for multi-gpu
* fix multi gpu overhead
* limit overhead to 10% of all gpus
* better wording
* allocate fixed amount before layers
* fixed only includes graph alloc
parent f83881390f
commit b24e8d17b2
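The multi-GPU fix is easiest to see with numbers. Under the old formula the reserve was FreeMemory * DeviceCount / 10, so a hypothetical host with 4 GPUs and 96 GiB of free VRAM in total would set aside 96 * 4 / 10 = 38.4 GiB. The new formula reserves 10% of the total free VRAM (9.6 GiB here), but never less than 512 MiB per GPU (2 GiB here), whichever is larger. These figures are illustrative and not taken from the commit.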
@@ -184,10 +184,11 @@ func getCPUMem() (memInfo, error) {
 func CheckVRAM() (int64, error) {
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 384Mi of VRAM free for unaccounted for overhead
-		overhead := gpuInfo.FreeMemory * uint64(gpuInfo.DeviceCount) / 10
-		if overhead < 384*1024*1024 {
-			overhead = 384 * 1024 * 1024
+		// leave 10% or 512MiB of VRAM free per GPU to handle unaccounted for overhead
+		overhead := gpuInfo.FreeMemory / 10
+		gpus := uint64(gpuInfo.DeviceCount)
+		if overhead < gpus*512*1024*1024 {
+			overhead = gpus * 512 * 1024 * 1024
 		}
 		return int64(gpuInfo.FreeMemory - overhead), nil
 	}
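For reference, a minimal standalone sketch of the new CheckVRAM arithmetic, assuming FreeMemory is the free VRAM summed across all devices (as the surrounding code suggests); the function name and the sample figures below are hypothetical:

package main

import "fmt"

// usableVRAM mirrors the reservation introduced by this commit: keep 10% of the
// total free VRAM, or 512 MiB per GPU, whichever is larger, and report the rest
// as usable. Illustrative sketch only, not the function from the commit.
func usableVRAM(freeMemory uint64, deviceCount int) int64 {
	overhead := freeMemory / 10
	gpus := uint64(deviceCount)
	if overhead < gpus*512*1024*1024 {
		overhead = gpus * 512 * 1024 * 1024
	}
	return int64(freeMemory - overhead)
}

func main() {
	// Hypothetical: two GPUs, 16 GiB free in total. 10% (1.6 GiB) beats the
	// 2*512 MiB floor, so about 14.4 GiB is reported as usable.
	fmt.Println(usableVRAM(16*1024*1024*1024, 2))
}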
llm/llm.go (22 changed lines)
@@ -95,20 +95,26 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 			break
 		}
 
-		// no offloading required
-		if requiredTotal <= available {
-			break
-		}
-
-		// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
-		if requiredAlloc > available {
+		// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
+		// TODO: find the largest GPU and only reserve memory there
+		avgAvailable := available / int64(info.DeviceCount)
+		if requiredAlloc > avgAvailable {
 			log.Printf("not enough vram available, falling back to CPU only")
 			library = "cpu"
 			opts.NumGPU = 0
 			break
 		}
 
-		available -= requiredAlloc
+		// we don't know which GPU will be used, so estimate
+		// the scratch buffer space on all of them
+		// TODO: allocate less layers to the GPU with the scratch buffer
+		// and more to the others (based on their available memory)
+		available -= requiredAlloc * int64(info.DeviceCount)
+
+		// no offloading required
+		if requiredModel+requiredKv <= available {
+			break
+		}
 
 		// fill remaining vram with layers
 		log.Println("splitting", available, "of available memory bytes into layers")
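The llm/llm.go hunk changes when the CPU fallback triggers and how much VRAM is left for layers: the fixed graph/alloc buffer must fit in the average per-GPU share, it is then reserved once per GPU, and only the remainder is split into layers. A rough self-contained sketch of that budgeting follows; the function, its parameters, and the numbers in main are assumptions for illustration, not code from the repository:

package main

import "fmt"

// planOffload follows the budgeting this commit introduces: the scratch/alloc
// buffer must fit in the average per-GPU share, it is reserved on every GPU
// (since we don't know which one will host it), and the remaining VRAM is
// filled with layers. Sketch only; the real logic lives in llm/llm.go.
func planOffload(available, requiredAlloc, requiredModel, requiredKv, layerSize, numLayers, deviceCount int64) (gpuLayers int64, useGPU bool) {
	avgAvailable := available / deviceCount
	if requiredAlloc > avgAvailable {
		// the fixed allocation does not fit on a single GPU: fall back to CPU
		return 0, false
	}

	// reserve the scratch buffer on every GPU
	available -= requiredAlloc * deviceCount

	// everything fits, so no layer limit is needed
	if requiredModel+requiredKv <= available {
		return numLayers, true
	}

	// otherwise fill the remaining VRAM with as many layers as fit
	n := available / layerSize
	if n > numLayers {
		n = numLayers
	}
	return n, true
}

func main() {
	// Hypothetical figures, in bytes: 16 GiB free across 2 GPUs, a 1 GiB graph
	// allocation, a 26 GiB model with a 2 GiB kv cache, 512 MiB per layer.
	layers, ok := planOffload(16<<30, 1<<30, 26<<30, 2<<30, 512<<20, 60, 2)
	fmt.Println(layers, ok) // 28 true
}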