Increase minimum CUDA memory allocation overhead and fix minimum overhead for multi-gpu (#1896)

* increase minimum cuda overhead and fix minimum overhead for multi-gpu

* fix multi-gpu overhead

* limit overhead to 10% of all gpus

* better wording

* allocate fixed amount before layers

* fixed only includes graph alloc
commit b24e8d17b2 (parent f83881390f)
Author: Jeffrey Morgan, committed by GitHub
Date: 2024-01-10 19:08:51 -05:00
2 changed files with 19 additions and 12 deletions


@@ -184,10 +184,11 @@ func getCPUMem() (memInfo, error) {
 func CheckVRAM() (int64, error) {
 	gpuInfo := GetGPUInfo()
 	if gpuInfo.FreeMemory > 0 && (gpuInfo.Library == "cuda" || gpuInfo.Library == "rocm") {
-		// leave 10% or 384Mi of VRAM free for unaccounted for overhead
-		overhead := gpuInfo.FreeMemory * uint64(gpuInfo.DeviceCount) / 10
-		if overhead < 384*1024*1024 {
-			overhead = 384 * 1024 * 1024
+		// leave 10% or 512MiB of VRAM free per GPU to handle unaccounted for overhead
+		overhead := gpuInfo.FreeMemory / 10
+		gpus := uint64(gpuInfo.DeviceCount)
+		if overhead < gpus*512*1024*1024 {
+			overhead = gpus * 512 * 1024 * 1024
 		}
 		return int64(gpuInfo.FreeMemory - overhead), nil
 	}
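
For reference, the new reservation rule holds back whichever is larger: 10% of the reported free VRAM, or 512 MiB per GPU. A minimal standalone sketch of that arithmetic (the freeMemory and deviceCount parameters are hypothetical stand-ins for the gpuInfo fields used above):

	package main

	import "fmt"

	// usableVRAM mirrors the reservation rule in CheckVRAM: hold back
	// max(10% of free VRAM, 512 MiB per device) and report the rest.
	func usableVRAM(freeMemory, deviceCount uint64) int64 {
		overhead := freeMemory / 10
		if floor := deviceCount * 512 * 1024 * 1024; overhead < floor {
			overhead = floor
		}
		return int64(freeMemory - overhead)
	}

	func main() {
		// two GPUs with 8 GiB free in total: 10% is ~819 MiB, which is
		// below the 2*512 MiB floor, so a full 1 GiB is held back
		fmt.Println(usableVRAM(8<<30, 2)) // 7516192768
	}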


@@ -95,20 +95,26 @@ func New(workDir, model string, adapters, projectors []string, opts api.Options)
 			break
 		}
 
-		// no offloading required
-		if requiredTotal <= available {
-			break
-		}
-
-		// requiredAlloc is always loaded for the CUDA runner, so don't load it if it won't fit
-		if requiredAlloc > available {
+		// alloc buffer and kv cache is allocated as a fixed amount on the main gpu
+		// TODO: find the largest GPU and only reserve memory there
+		avgAvailable := available / int64(info.DeviceCount)
+		if requiredAlloc > avgAvailable {
 			log.Printf("not enough vram available, falling back to CPU only")
 			library = "cpu"
 			opts.NumGPU = 0
 			break
 		}
 
-		available -= requiredAlloc
+		// we don't know which GPU will be used, so estimate
+		// the scratch buffer space on all of them
+		// TODO: allocate less layers to the GPU with the scratch buffer
+		// and more to the others (based on their available memory)
+		available -= requiredAlloc * int64(info.DeviceCount)
+
+		// no offloading required
+		if requiredModel+requiredKv <= available {
+			break
+		}
 
 		// fill remaining vram with layers
 		log.Println("splitting", available, "of available memory bytes into layers")