19
19
// max number of MTLCommandBuffer used to submit a graph for processing
20
20
#define GGML_METAL_MAX_COMMAND_BUFFERS 8
21
21
22
- #define UNUSED (x ) (void )(x)
22
+ // create residency sets only on macOS >= 15.0
23
+ #if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
24
+ #define GGML_METAL_HAS_RESIDENCY_SETS 1
25
+ #endif
23
26
24
27
// globals
25
28
39
42
40
43
bool has_simdgroup_reduction;
41
44
bool has_simdgroup_mm;
45
+ bool has_residency_sets;
42
46
bool has_bfloat;
43
47
bool use_bfloat;
44
48
48
52
/* .mtl_device_ref_count =*/ 0 ,
49
53
/* .has_simdgroup_reduction =*/ false ,
50
54
/* .has_simdgroup_mm =*/ false ,
55
+ /* .has_residency_sets =*/ false ,
51
56
/* .has_bfloat =*/ false ,
52
57
/* .use_bfloat =*/ false ,
53
58
/* .name =*/ " " ,
65
70
66
71
ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily: MTLGPUFamilyApple7];
67
72
73
+ #if defined(GGML_METAL_HAS_RESIDENCY_SETS)
74
+ ctx->has_residency_sets = getenv (" GGML_METAL_NO_RESIDENCY" ) == NULL ;
75
+ #endif
76
+
68
77
ctx->has_bfloat = [ctx->mtl_device supportsFamily: MTLGPUFamilyMetal3_GGML];
69
78
ctx->has_bfloat |= [ctx->mtl_device supportsFamily: MTLGPUFamilyApple6];
70
79
@@ -483,6 +492,11 @@ @implementation GGMLMetalClass
483
492
GGML_LOG_INFO (" %s : picking default device: %s \n " , __func__, [[device name ] UTF8String ]);
484
493
485
494
ctx->queue = [device newCommandQueue ];
495
+ if (ctx->queue == nil ) {
496
+ GGML_LOG_ERROR (" %s : error: failed to create command queue\n " , __func__);
497
+ return NULL ;
498
+ }
499
+
486
500
ctx->d_queue = dispatch_queue_create (" ggml-metal" , DISPATCH_QUEUE_CONCURRENT);
487
501
488
502
id <MTLLibrary > metal_library;
@@ -649,6 +663,7 @@ @implementation GGMLMetalClass
649
663
650
664
GGML_LOG_INFO (" %s : simdgroup reduction = %s \n " , __func__, ctx_dev->has_simdgroup_reduction ? " true" : " false" );
651
665
GGML_LOG_INFO (" %s : simdgroup matrix mul. = %s \n " , __func__, ctx_dev->has_simdgroup_mm ? " true" : " false" );
666
+ GGML_LOG_INFO (" %s : has residency sets = %s \n " , __func__, ctx_dev->has_residency_sets ? " true" : " false" );
652
667
GGML_LOG_INFO (" %s : has bfloat = %s \n " , __func__, ctx_dev->has_bfloat ? " true" : " false" );
653
668
GGML_LOG_INFO (" %s : use bfloat = %s \n " , __func__, ctx_dev->use_bfloat ? " true" : " false" );
654
669
GGML_LOG_INFO (" %s : hasUnifiedMemory = %s \n " , __func__, ctx_dev->mtl_device .hasUnifiedMemory ? " true" : " false" );
@@ -1035,8 +1050,70 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
1035
1050
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
1036
1051
int n_buffers;
1037
1052
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
1053
+
1054
+ // optional MTLResidencySet
1055
+ id rset;
1038
1056
};
1039
1057
1058
// rset init
//
// Resets ctx->rset to nil and, when residency sets are enabled for the device, creates a
// residency set, adds ctx->buffers[0 .. n_buffers) to it, commits it and requests residency.
//
//   ctx     - buffer context whose `rset` field is (re)initialized
//   ctx_dev - device context; nothing is created when `has_residency_sets` is false
//   device  - Metal device used to create the residency set
//
// Returns true on success, including every path where no residency set is created (feature
// disabled, compiled out, or the runtime availability check fails) - ctx->rset stays nil then.
// Returns false only when the residency set could not be created.
static bool ggml_backend_metal_buffer_rset_init(
        struct ggml_backend_metal_buffer_context * ctx,
        struct ggml_backend_metal_device_context * ctx_dev,
        id<MTLDevice> device) {
    ctx->rset = nil;

    if (!ctx_dev->has_residency_sets) {
        return true;
    }

#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
        desc.label = @" ggml_backend_metal";
        desc.initialCapacity = ctx->n_buffers;

        // must be initialized explicitly: this file manages memory manually, so a local object
        // pointer is not zeroed for us and would otherwise hold an indeterminate value
        NSError * error = nil;
        ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];

        // the descriptor is no longer needed, regardless of the outcome
        [desc release];

        // failure is signalled by the nil return value, not by the state of the error out-parameter
        if (ctx->rset == nil) {
            GGML_LOG_ERROR(" %s : error: %s \n ", __func__, error ? [[error description] UTF8String] : "unknown error");
            return false;
        }

        for (int i = 0; i < ctx->n_buffers; i++) {
            [ctx->rset addAllocation:ctx->buffers[i].metal];
        }

        [ctx->rset commit];
        [ctx->rset requestResidency];

        return true;
    }
#else
    GGML_UNUSED(ctx_dev);
    GGML_UNUSED(device);
#endif

    return true;
}
1101
+
1102
// rset free
//
// Tears down the residency set stored in ctx->rset, if there is one: ends residency, removes all
// allocations from the set and releases it. Does nothing when ctx->rset is nil, when the runtime
// availability check fails, or when residency-set support is compiled out.
static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
    if (@available(macOS 15.0, *)) {
        if (!ctx->rset) {
            // no residency set was created for this buffer context
            return;
        }

        [ctx->rset endResidency];
        [ctx->rset removeAllAllocations];
        [ctx->rset release];
    }
#else
    GGML_UNUSED(ctx);
#endif
}
1116
+
1040
1117
// finds the Metal buffer that contains the tensor data on the GPU device
1041
1118
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
1042
1119
// Metal buffer based on the host memory pointer
@@ -4176,6 +4253,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
4176
4253
for (int i = 0 ; i < ctx->n_buffers ; i++) {
4177
4254
[ctx->buffers[i].metal release ];
4178
4255
}
4256
+
4257
+ ggml_backend_metal_buffer_rset_free (ctx);
4179
4258
ggml_backend_metal_device_rel (buffer->buft ->device ->context );
4180
4259
4181
4260
if (ctx->owned ) {
@@ -4198,19 +4277,19 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
4198
4277
// Fills `size` bytes of the tensor's data, starting `offset` bytes in, with `value`.
// Operates directly on tensor->data via memset; `buffer` is not used.
static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    char * dst = (char *) tensor->data + offset;
    memset(dst, value, size);
}
4203
4282
4204
4283
// Copies `size` bytes from `data` into the tensor's data, starting `offset` bytes in.
// Operates directly on tensor->data via memcpy; `buffer` is not used.
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    char * dst = (char *) tensor->data + offset;
    memcpy(dst, data, size);
}
4209
4288
4210
4289
// Copies `size` bytes out of the tensor's data, starting `offset` bytes in, into `data`.
// Operates directly on tensor->data via memcpy; `buffer` is not used.
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    const char * src = (const char *) tensor->data + offset;
    memcpy(data, src, size);
}
4215
4294
4216
4295
static bool ggml_backend_metal_buffer_cpy_tensor (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
@@ -4220,7 +4299,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
4220
4299
}
4221
4300
return false ;
4222
4301
4223
- UNUSED (buffer);
4302
+ GGML_UNUSED (buffer);
4224
4303
}
4225
4304
4226
4305
static void ggml_backend_metal_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value) {
@@ -4246,7 +4325,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
4246
4325
// Returns the constant name of this buffer type; `buft` is not used.
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return " Metal";
}
4251
4330
4252
4331
static void ggml_backend_metal_log_allocated_size (id <MTLDevice > device, size_t size_aligned) {
@@ -4270,8 +4349,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
4270
4349
}
4271
4350
#endif
4272
4351
#endif
4273
- UNUSED (device);
4274
- UNUSED (size_aligned);
4352
+ GGML_UNUSED (device);
4353
+ GGML_UNUSED (size_aligned);
4275
4354
}
4276
4355
4277
4356
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size) {
@@ -4284,7 +4363,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4284
4363
size_aligned += (size_page - (size_aligned % size_page));
4285
4364
}
4286
4365
4287
- id <MTLDevice > device = ggml_backend_metal_device_acq (buft->device ->context );
4366
+ struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device ->context ;
4367
+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
4288
4368
4289
4369
ctx->all_data = ggml_metal_host_malloc (size_aligned);
4290
4370
ctx->all_size = size_aligned;
@@ -4307,7 +4387,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4307
4387
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers [0 ].metal == nil )) {
4308
4388
GGML_LOG_ERROR (" %s : error: failed to allocate buffer, size = %8.2f MiB\n " , __func__, size_aligned / 1024.0 / 1024.0 );
4309
4389
free (ctx);
4310
- ggml_backend_metal_device_rel (buft->device ->context );
4390
+ ggml_backend_metal_device_rel (ctx_dev);
4391
+ return NULL ;
4392
+ }
4393
+
4394
+ if (!ggml_backend_metal_buffer_rset_init (ctx, ctx_dev, device)) {
4395
+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4396
+ free (ctx);
4397
+ ggml_backend_metal_device_rel (ctx_dev);
4311
4398
return NULL ;
4312
4399
}
4313
4400
@@ -4318,7 +4405,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
4318
4405
4319
4406
// Returns the fixed alignment value reported for this buffer type (32); `buft` is not used.
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    const size_t alignment = 32;
    return alignment;
}
4323
4410
4324
4411
static size_t ggml_backend_metal_buffer_type_get_max_size (ggml_backend_buffer_type_t buft) {
@@ -4328,13 +4415,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty
4328
4415
4329
4416
return max_size;
4330
4417
4331
- UNUSED (buft);
4418
+ GGML_UNUSED (buft);
4332
4419
}
4333
4420
4334
4421
// Always reports true for this buffer type; `buft` is not used.
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return true;
}
4339
4426
4340
4427
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type (void ) {
@@ -4357,7 +4444,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
4357
4444
// Returns the constant name of the from-pointer buffer type; `buft` is not used.
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return " Metal_Mapped";
}
4362
4449
4363
4450
static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type (void ) {
@@ -4400,7 +4487,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4400
4487
size_aligned += (size_page - (size_aligned % size_page));
4401
4488
}
4402
4489
4403
- id <MTLDevice > device = ggml_backend_metal_device_acq (&g_ggml_ctx_dev_main);
4490
+ struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
4491
+ id <MTLDevice > device = ggml_backend_metal_device_acq (ctx_dev);
4404
4492
4405
4493
// the buffer fits into the max buffer size allowed by the device
4406
4494
if (size_aligned <= device.maxBufferLength ) {
@@ -4453,6 +4541,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4453
4541
}
4454
4542
}
4455
4543
4544
+ if (!ggml_backend_metal_buffer_rset_init (ctx, ctx_dev, device)) {
4545
+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4546
+ free (ctx);
4547
+ ggml_backend_metal_device_rel (ctx_dev);
4548
+ return NULL ;
4549
+ }
4550
+
4456
4551
return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
4457
4552
}
4458
4553
@@ -4461,7 +4556,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
4461
4556
// Returns the constant name of the backend; `backend` is not used.
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
    GGML_UNUSED(backend);

    return " Metal";
}
4466
4561
4467
4562
static void ggml_backend_metal_free (ggml_backend_t backend) {
@@ -4766,6 +4861,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
4766
4861
}
4767
4862
}
4768
4863
4864
+ if (!ggml_backend_metal_buffer_rset_init (ctx, ctx_dev, device)) {
4865
+ GGML_LOG_ERROR (" %s : error: failed to initialize residency set\n " , __func__);
4866
+ free (ctx);
4867
+ ggml_backend_metal_device_rel (ctx_dev);
4868
+ return NULL ;
4869
+ }
4870
+
4769
4871
return ggml_backend_buffer_init (ggml_backend_metal_buffer_from_ptr_type (), ggml_backend_metal_buffer_i, ctx, size);
4770
4872
}
4771
4873
@@ -4779,7 +4881,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
4779
4881
return buft->iface .get_name == ggml_backend_metal_buffer_type_get_name ||
4780
4882
buft->iface .get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;
4781
4883
4782
- UNUSED (dev);
4884
+ GGML_UNUSED (dev);
4783
4885
}
4784
4886
4785
4887
static bool ggml_backend_metal_device_offload_op (ggml_backend_dev_t dev, const struct ggml_tensor * op) {
0 commit comments