diff --git a/llm/server.go b/llm/server.go
index 9b5d0f06..97aa2a15 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -191,35 +191,38 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		params = append(params, "--memory-f32")
 	}
 
-	if opts.UseMLock {
-		params = append(params, "--mlock")
+	flashAttnEnabled := envconfig.FlashAttention
+
+	for _, g := range gpus {
+		// only cuda (compute capability 7+) and metal support flash attention
+		if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
+			flashAttnEnabled = false
+		}
+
+		// mmap has issues with partial offloading on metal
+		if g.Library == "metal" &&
+			uint64(opts.NumGPU) > 0 &&
+			uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
+			opts.UseMMap = false
+		}
+	}
+
+	if flashAttnEnabled {
+		params = append(params, "--flash-attn")
 	}
 
 	if !opts.UseMMap {
 		params = append(params, "--no-mmap")
 	}
 
+	if opts.UseMLock {
+		params = append(params, "--mlock")
+	}
+
 	if opts.UseNUMA {
 		params = append(params, "--numa")
 	}
 
-	flashAttnEnabled := envconfig.FlashAttention
-
-	// partial offloading does not support flash attention
-	if uint64(opts.NumGPU) < ggml.KV().BlockCount()+1 {
-		flashAttnEnabled = false
-	}
-
-	// only cuda (compute capability 7+) and metal support flash attention
-	for _, g := range gpus {
-		if g.Library != "metal" && (g.Library != "cuda" || g.DriverMajor < 7) {
-			flashAttnEnabled = false
-		}
-	}
-	if flashAttnEnabled {
-		params = append(params, "--flash-attn")
-	}
-
 	numParallel := envconfig.NumParallel
 
 	// TODO (jmorganca): multimodal models don't support parallel yet