Mirror of https://github.com/tcsenpai/ollama.git (synced 2025-06-08 12:15:22 +00:00)

commit 69207b4987 (parent e04c7012c2)
fix memory
@@ -83,7 +83,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	var memoryLayerOutput uint64

 	// The sizes of a layer
-	var layerSize uint64
+	var baseLayerSize uint64

 	// The sum of all the layer sizes (just for logging)
 	var memoryWeights uint64
@@ -110,27 +110,27 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	layers := ggml.Tensors().Layers()

 	// add one layer worth of memory as a buffer
 	if blk0, ok := layers["blk.0"]; ok {
-		layerSize = blk0.size()
+		baseLayerSize = blk0.size()
 	} else {
 		slog.Warn("model missing blk.0 layer size")
 	}

 	// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
-	var kv uint64 = 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()
+	kv := 2 * uint64(opts.NumCtx) * ggml.KV().BlockCount() * (ggml.KV().EmbeddingHeadCountK() + ggml.KV().EmbeddingHeadCountV()) * ggml.KV().HeadCountKV()

-	// KV is proportional to the number of layers
-	layerSize += kv / ggml.KV().BlockCount()
+	layerKV := kv / ggml.KV().BlockCount()
+	baseLayerSize += layerKV

 	graphPartialOffload, graphFullOffload = ggml.GraphSize(uint64(opts.NumCtx), uint64(min(opts.NumCtx, opts.NumBatch)))
 	if graphPartialOffload == 0 {
 		graphPartialOffload = ggml.KV().GQA() * kv / 6
 	}

 	if graphFullOffload == 0 {
 		graphFullOffload = graphPartialOffload
 	}

-	// on metal there's no partial offload overhead
 	if gpus[0].Library == "metal" {
+		// there's no partial offload overhead on metal
 		graphPartialOffload = graphFullOffload
 	} else if len(gpus) > 1 {
 		// multigpu should always use the partial graph size
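Note on the hunk above: the fp16 KV-cache formula is easier to sanity-check with concrete numbers. A minimal, self-contained sketch with hypothetical 7B-class dimensions (the constants are illustrative, not taken from this commit):

package main

import "fmt"

func main() {
	// Hypothetical model dimensions (illustrative only).
	var (
		numCtx     uint64 = 2048 // opts.NumCtx
		blockCount uint64 = 32   // n_layer
		headDimK   uint64 = 128  // n_embd_head_k
		headDimV   uint64 = 128  // n_embd_head_v
		headKV     uint64 = 32   // n_head_kv (no grouped-query attention here)
	)

	// fp16 k,v = sizeof(float16) * n_ctx * n_layer * (n_embd_head_k + n_embd_head_v) * n_head_kv
	kv := 2 * numCtx * blockCount * (headDimK + headDimV) * headKV

	// The new layerKV is one layer's share of the cache.
	layerKV := kv / blockCount

	fmt.Println(kv>>20, "MiB total KV")   // 1024 MiB total KV
	fmt.Println(layerKV>>20, "MiB/layer") // 32 MiB/layer
}

Splitting out layerKV lets the placement loop below charge the KV cost per layer instead of baking it into a single uniform layer size.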
@@ -140,6 +140,7 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.size()
 	}

 	if layer, ok := layers["output"]; ok {
 		memoryLayerOutput += layer.size()
 	} else if layer, ok := layers["token_embd"]; ok {
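For context on these lookups: the non-repeating memory is output_norm plus the output projection, and models that tie the output projection to the input embeddings ship no separate "output" tensor, which is what the "token_embd" fallback covers.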
@@ -164,12 +165,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			gzo = gpuZeroOverhead
 		}
 		// Only include GPUs that can fit the graph, gpu minimum, the layer buffer and at least more layer
-		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*layerSize {
+		if gpus[i].FreeMemory < gzo+max(graphPartialOffload, graphFullOffload)+gpus[i].MinimumMemory+2*baseLayerSize {
 			slog.Debug("gpu has too little memory to allocate any layers", "gpu", gpus[i])
 			continue
 		}
 		gpusWithSpace = append(gpusWithSpace, gs{i, &gpus[i]})
-		gpuAllocations[i] += gpus[i].MinimumMemory + layerSize // We hold off on graph until we know partial vs. full
+		gpuAllocations[i] += gpus[i].MinimumMemory + baseLayerSize // We hold off on graph until we know partial vs. full
 	}

 	var gpuZeroID int
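The renamed buffer feeds the admission check above, which reads as a single inequality. A standalone sketch of that predicate (names follow the diff; the helper function itself is hypothetical):

// canFitAnyLayers reports whether a GPU has room for the worst-case
// graph, its driver/runtime minimum, the GPU-zero overhead (gzo is
// nonzero only for the primary GPU), and a two-layer buffer.
func canFitAnyLayers(free, gzo, graphPartial, graphFull, minimum, baseLayerSize uint64) bool {
	return free >= gzo+max(graphPartial, graphFull)+minimum+2*baseLayerSize
}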
@@ -180,11 +181,14 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts

 	// For all the layers, find where they can fit on the GPU(s)
 	for i := range int(ggml.KV().BlockCount()) {
-		// Some models have inconsistent layer sizes
+		var layerSize uint64
 		if blk, ok := layers[fmt.Sprintf("blk.%d", i)]; ok {
 			layerSize = blk.size()
-			layerSize += kv / ggml.KV().BlockCount()
+		} else {
+			slog.Error("missing layer", "blk", i)
+			continue
 		}

 		memoryWeights += layerSize

 		if opts.NumGPU >= 0 && layerCount >= opts.NumGPU {
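Moving the declaration of layerSize inside the loop is the substantive fix in this hunk: previously the variable outlived each iteration, so a missing blk.%d entry silently reused the previous layer's size. A minimal illustration of that hazard, with made-up sizes:

package main

import "fmt"

func main() {
	// Hypothetical layer sizes; blk.1 is absent from the file.
	sizes := map[string]uint64{"blk.0": 100, "blk.2": 300}

	var layerSize uint64 // old style: one variable shared across iterations
	for i := 0; i < 3; i++ {
		if s, ok := sizes[fmt.Sprintf("blk.%d", i)]; ok {
			layerSize = s
		}
		fmt.Println(i, layerSize) // 0 100, then 1 100 (stale reuse), then 2 300
	}
}

The new code instead logs an error and skips the layer, and it no longer folds the KV share into layerSize; that share is charged explicitly as layerKV during placement.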
@@ -196,8 +200,8 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		for j := len(gpusWithSpace); j > 0; j-- {
 			g := gpusWithSpace[i%j]
 			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > used+layerSize {
-				gpuAllocations[g.i] += layerSize
+			if g.g.FreeMemory > used+layerSize+layerKV {
+				gpuAllocations[g.i] += layerSize + layerKV
 				layerCounts[g.i]++
 				layerCount++
 				break
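Both sides of the fit check now carry layerKV, so a layer is only placed where its weights and its KV-cache share fit together. A simplified, self-contained sketch of this round-robin placement (types and the shrinking-j search are approximations of the diff, not code from it):

// placeLayer tries GPUs in round-robin order, falling back to other
// candidates as j shrinks, and charges weights plus KV share to the
// chosen GPU's running allocation.
func placeLayer(free, alloc []uint64, i int, layerSize, layerKV, graph uint64) bool {
	for j := len(free); j > 0; j-- {
		g := i % j
		used := alloc[g] + graph
		if free[g] > used+layerSize+layerKV {
			alloc[g] += layerSize + layerKV
			return true
		}
	}
	return false
}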
@@ -206,11 +210,12 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 			}
 		}
 	}

 	if layerCount >= int(ggml.KV().BlockCount()) {
 		fullyLoaded = true
 	} else {
 		for i := layerCount; i < int(ggml.KV().BlockCount()); i++ {
-			overflow += layerSize
+			overflow += baseLayerSize
 		}
 	}
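Per-layer sizes are no longer in scope for the layers that never got placed, so the overflow estimate falls back to the blk.0-derived buffer. The loop is equivalent to this closed form (a sketch, not code from the commit):

// estimateOverflow approximates each unplaced layer by baseLayerSize
// (blk.0 weights plus one layer's KV share).
func estimateOverflow(blockCount, layerCount int, baseLayerSize uint64) uint64 {
	if layerCount >= blockCount {
		return 0 // fully loaded
	}
	return uint64(blockCount-layerCount) * baseLayerSize
}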
@@ -265,9 +270,10 @@ func EstimateGPULayers(gpus []gpu.GpuInfo, ggml *GGML, projectors []string, opts
 		}
 		tensorSplit = strings.Join(splits, ",")
 	}
-	allocationsList := []string{}
-	for _, a := range gpuAllocations {
-		allocationsList = append(allocationsList, format.HumanBytes2(a))
+	allocationsList := make([]string, len(gpuAllocations))
+	for i, a := range gpuAllocations {
+		allocationsList[i] = format.HumanBytes2(a)
 	}

 	estimate := MemoryEstimate{
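The allocationsList rewrite is behavior-preserving, but worth a caution: make with a length plus indexed assignment is equivalent to append into an empty slice, while mixing the two forms is not. A small demonstration with hypothetical values:

package main

import "fmt"

func main() {
	vals := []uint64{1 << 30, 2 << 30, 512 << 20} // hypothetical per-GPU allocations

	// New form: preallocate to the exact length and assign by index.
	list := make([]string, len(vals))
	for i, a := range vals {
		list[i] = fmt.Sprintf("%d MiB", a>>20)
	}
	fmt.Println(list) // [1024 MiB 2048 MiB 512 MiB]

	// Mixing make(len) with append would prepend three empty strings,
	// which is why the declaration and the loop body change together here.
}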
@@ -337,9 +343,9 @@ func (m MemoryEstimate) log() {
 		slog.Group(
 			"weights",
 			// memory of the weights
-			"total", format.HumanBytes2(m.memoryWeights),
+			"total", format.HumanBytes2(m.memoryWeights+m.memoryLayerOutput),
 			// memory of repeating layers
-			"repeating", format.HumanBytes2(m.memoryWeights-m.memoryLayerOutput),
+			"repeating", format.HumanBytes2(m.memoryWeights),
 			// memory of non-repeating layers
 			"nonrepeating", format.HumanBytes2(m.memoryLayerOutput),
 		),
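This logging change tracks the new bookkeeping: m.memoryWeights now holds only the repeating layers, so the total is reconstructed as repeating + nonrepeating instead of deriving repeating as total minus output. With the dummy model used in the tests below, that is 20 + 4 = 24 bytes total, 20 repeating, 4 non-repeating.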
@@ -62,6 +62,15 @@ func TestEstimateGPULayers(t *testing.T) {
 		estimate := EstimateGPULayers(gpus, ggml, projectors, opts)
 		assert.Equal(t, 0, estimate.Layers)
 		assert.Equal(t, uint64(0), estimate.Graph)
+
+		// 5 layers * 4 bytes per layer
+		if estimate.memoryWeights != 20 {
+			t.Errorf("expected memoryWeights 20, got %d", estimate.memoryWeights)
+		}
+
+		if estimate.memoryLayerOutput != 4 {
+			t.Errorf("expected memoryLayerOutput 4, got %d", estimate.memoryLayerOutput)
+		}
 	})

 	// derived from the dummy ggml file above
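These assertions pin the new split: the dummy model has five repeating layers of 4 bytes each (memoryWeights = 20) and a 4-byte output layer counted separately (memoryLayerOutput = 4). Since both fields are unexported, the test necessarily lives in the same package as EstimateGPULayers.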
@@ -124,6 +133,15 @@ func TestEstimateGPULayers(t *testing.T) {
 			assert.Equal(t, estimate.VRAMSize, estimate.TotalSize, "scenario %d: %v %+v", i, s, estimate)
 			assert.Equal(t, estimate.TotalSize, layerSums, "scenario %d: %v %+v", i, s, estimate)
 		}
+
+		// 5 layers * 4 bytes per layer
+		if estimate.memoryWeights != 20 {
+			t.Errorf("expected memoryWeights 20, got %d", estimate.memoryWeights)
+		}
+
+		if estimate.memoryLayerOutput != 4 {
+			t.Errorf("expected memoryLayerOutput 4, got %d", estimate.memoryLayerOutput)
+		}
 		})
 	}
 }