Skip to content

Commit bedaf34

Browse files
ggerganovmglambda
authored andcommitted
metal : use residency sets (ggml-org#11427)
* metal : use residency sets ggml-ci * metal : restore commandBufferWithUnretainedReferences calls [no ci] * metal : release descriptors ggml-ci * metal : check env GGML_METAL_NO_RESIDENCY ggml-ci * metal : fix build + clean-up ggml-ci
1 parent d4bf8f4 commit bedaf34

File tree

1 file changed

+119
-17
lines changed

1 file changed

+119
-17
lines changed

ggml/src/ggml-metal/ggml-metal.m

+119-17
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919
// max number of MTLCommandBuffer used to submit a graph for processing
2020
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
2121

22-
#define UNUSED(x) (void)(x)
22+
// create residency sets only on macOS >= 15.0
23+
#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
24+
#define GGML_METAL_HAS_RESIDENCY_SETS 1
25+
#endif
2326

2427
// globals
2528

@@ -39,6 +42,7 @@
3942

4043
bool has_simdgroup_reduction;
4144
bool has_simdgroup_mm;
45+
bool has_residency_sets;
4246
bool has_bfloat;
4347
bool use_bfloat;
4448

@@ -48,6 +52,7 @@
4852
/*.mtl_device_ref_count =*/ 0,
4953
/*.has_simdgroup_reduction =*/ false,
5054
/*.has_simdgroup_mm =*/ false,
55+
/*.has_residency_sets =*/ false,
5156
/*.has_bfloat =*/ false,
5257
/*.use_bfloat =*/ false,
5358
/*.name =*/ "",
@@ -65,6 +70,10 @@
6570

6671
ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
6772

73+
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
74+
ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
75+
#endif
76+
6877
ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
6978
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];
7079

@@ -483,6 +492,11 @@ @implementation GGMLMetalClass
483492
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
484493

485494
ctx->queue = [device newCommandQueue];
495+
if (ctx->queue == nil) {
496+
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
497+
return NULL;
498+
}
499+
486500
ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
487501

488502
id<MTLLibrary> metal_library;
@@ -649,6 +663,7 @@ @implementation GGMLMetalClass
649663

650664
GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
651665
GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false");
666+
GGML_LOG_INFO("%s: has residency sets = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false");
652667
GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false");
653668
GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false");
654669
GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
@@ -1035,8 +1050,70 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
10351050
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
10361051
int n_buffers;
10371052
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1053+
1054+
// optional MTLResidencySet
1055+
id rset;
10381056
};
10391057

1058+
// rset init
1059+
// Creates a residency set covering all Metal buffers held by `ctx`, commits it and
// requests residency.
//
// ctx     - buffer context; ctx->rset is reset to nil on entry and is only left non-nil
//           when a residency set was successfully created
// ctx_dev - device context; when ctx_dev->has_residency_sets is false nothing is created
//           and the function returns true
// device  - Metal device used to create the residency set
//
// Returns false only when creating the residency set fails. Returns true otherwise,
// including when residency sets are disabled, not compiled in, or not available at runtime.
static bool ggml_backend_metal_buffer_rset_init(
        struct ggml_backend_metal_buffer_context * ctx,
        struct ggml_backend_metal_device_context * ctx_dev,
        id<MTLDevice> device) {
    ctx->rset = nil;

    if (!ctx_dev->has_residency_sets) {
        return true;
    }

#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
        desc.label = @"ggml_backend_metal";
        desc.initialCapacity = ctx->n_buffers;

        // this file uses explicit -release calls (manual reference counting), so a local
        // object pointer is not implicitly zeroed - start from nil so it never holds garbage
        NSError * error = nil;
        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];

        // the descriptor is no longer needed, regardless of the outcome
        [desc release];

        // the returned object - not the error out-parameter - tells whether the call succeeded
        if (ctx->rset == nil) {
            GGML_LOG_ERROR("%s: error: %s\n", __func__, error ? [[error description] UTF8String] : "unknown error");
            return false;
        }

        for (int i = 0; i < ctx->n_buffers; i++) {
            [ctx->rset addAllocation:ctx->buffers[i].metal];
        }

        [ctx->rset commit];
        [ctx->rset requestResidency];

        return true;
    }
#else
    GGML_UNUSED(ctx_dev);
    GGML_UNUSED(device);
#endif

    return true;
}
1101+
1102+
// rset free
1103+
// Ends residency for the residency set stored in `ctx` (if any), removes all of its
// allocations and releases it.
//
// ctx - buffer context; ctx->rset may be nil, in which case nothing is done
//
// After the call ctx->rset is nil, so calling this function again on the same context
// is a no-op.
static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        if (ctx->rset) {
            [ctx->rset endResidency];
            [ctx->rset removeAllAllocations];
            [ctx->rset release];

            // do not keep a pointer to the released object around
            ctx->rset = nil;
        }
    }
#else
    GGML_UNUSED(ctx);
#endif
}
1116+
10401117
// finds the Metal buffer that contains the tensor data on the GPU device
10411118
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
10421119
// Metal buffer based on the host memory pointer
@@ -4176,6 +4253,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
41764253
for (int i = 0; i < ctx->n_buffers; i++) {
41774254
[ctx->buffers[i].metal release];
41784255
}
4256+
4257+
ggml_backend_metal_buffer_rset_free(ctx);
41794258
ggml_backend_metal_device_rel(buffer->buft->device->context);
41804259

41814260
if (ctx->owned) {
@@ -4198,19 +4277,19 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
41984277
static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
41994278
memset((char *)tensor->data + offset, value, size);
42004279

4201-
UNUSED(buffer);
4280+
GGML_UNUSED(buffer);
42024281
}
42034282

42044283
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
42054284
memcpy((char *)tensor->data + offset, data, size);
42064285

4207-
UNUSED(buffer);
4286+
GGML_UNUSED(buffer);
42084287
}
42094288

42104289
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
42114290
memcpy(data, (const char *)tensor->data + offset, size);
42124291

4213-
UNUSED(buffer);
4292+
GGML_UNUSED(buffer);
42144293
}
42154294

42164295
static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@@ -4220,7 +4299,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
42204299
}
42214300
return false;
42224301

4223-
UNUSED(buffer);
4302+
GGML_UNUSED(buffer);
42244303
}
42254304

42264305
static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
@@ -4246,7 +4325,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
42464325
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
42474326
return "Metal";
42484327

4249-
UNUSED(buft);
4328+
GGML_UNUSED(buft);
42504329
}
42514330

42524331
static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
@@ -4270,8 +4349,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
42704349
}
42714350
#endif
42724351
#endif
4273-
UNUSED(device);
4274-
UNUSED(size_aligned);
4352+
GGML_UNUSED(device);
4353+
GGML_UNUSED(size_aligned);
42754354
}
42764355

42774356
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
@@ -4284,7 +4363,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
42844363
size_aligned += (size_page - (size_aligned % size_page));
42854364
}
42864365

4287-
id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
4366+
struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
4367+
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
42884368

42894369
ctx->all_data = ggml_metal_host_malloc(size_aligned);
42904370
ctx->all_size = size_aligned;
@@ -4307,7 +4387,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
43074387
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
43084388
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
43094389
free(ctx);
4310-
ggml_backend_metal_device_rel(buft->device->context);
4390+
ggml_backend_metal_device_rel(ctx_dev);
4391+
return NULL;
4392+
}
4393+
4394+
if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
4395+
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
4396+
free(ctx);
4397+
ggml_backend_metal_device_rel(ctx_dev);
43114398
return NULL;
43124399
}
43134400

@@ -4318,7 +4405,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
43184405

43194406
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
43204407
return 32;
4321-
UNUSED(buft);
4408+
GGML_UNUSED(buft);
43224409
}
43234410

43244411
static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
@@ -4328,13 +4415,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
43284415

43294416
return max_size;
43304417

4331-
UNUSED(buft);
4418+
GGML_UNUSED(buft);
43324419
}
43334420

43344421
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
43354422
return true;
43364423

4337-
UNUSED(buft);
4424+
GGML_UNUSED(buft);
43384425
}
43394426

43404427
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
@@ -4357,7 +4444,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
43574444
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
43584445
return "Metal_Mapped";
43594446

4360-
UNUSED(buft);
4447+
GGML_UNUSED(buft);
43614448
}
43624449

43634450
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
@@ -4400,7 +4487,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44004487
size_aligned += (size_page - (size_aligned % size_page));
44014488
}
44024489

4403-
id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
4490+
struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4491+
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);
44044492

44054493
// the buffer fits into the max buffer size allowed by the device
44064494
if (size_aligned <= device.maxBufferLength) {
@@ -4453,6 +4541,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44534541
}
44544542
}
44554543

4544+
if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
4545+
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
4546+
free(ctx);
4547+
ggml_backend_metal_device_rel(ctx_dev);
4548+
return NULL;
4549+
}
4550+
44564551
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
44574552
}
44584553

@@ -4461,7 +4556,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
44614556
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
44624557
return "Metal";
44634558

4464-
UNUSED(backend);
4559+
GGML_UNUSED(backend);
44654560
}
44664561

44674562
static void ggml_backend_metal_free(ggml_backend_t backend) {
@@ -4766,6 +4861,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
47664861
}
47674862
}
47684863

4864+
if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
4865+
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
4866+
free(ctx);
4867+
ggml_backend_metal_device_rel(ctx_dev);
4868+
return NULL;
4869+
}
4870+
47694871
return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
47704872
}
47714873

@@ -4779,7 +4881,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
47794881
return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
47804882
buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
47814883

4782-
UNUSED(dev);
4884+
GGML_UNUSED(dev);
47834885
}
47844886

47854887
static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {

0 commit comments

Comments
 (0)