Skip to content

metal : use residency sets #11427

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 26, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 119 additions & 17 deletions ggml/src/ggml-metal/ggml-metal.m
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
// max number of MTLCommandBuffer used to submit a graph for processing
#define GGML_METAL_MAX_COMMAND_BUFFERS 8

#define UNUSED(x) (void)(x)
// create residency sets only on macOS >= 15.0
#if TARGET_OS_OSX && __MAC_OS_X_VERSION_MAX_ALLOWED >= 150000
#define GGML_METAL_HAS_RESIDENCY_SETS 1
#endif

// globals

Expand All @@ -39,6 +42,7 @@

bool has_simdgroup_reduction;
bool has_simdgroup_mm;
bool has_residency_sets;
bool has_bfloat;
bool use_bfloat;

Expand All @@ -48,6 +52,7 @@
/*.mtl_device_ref_count =*/ 0,
/*.has_simdgroup_reduction =*/ false,
/*.has_simdgroup_mm =*/ false,
/*.has_residency_sets =*/ false,
/*.has_bfloat =*/ false,
/*.use_bfloat =*/ false,
/*.name =*/ "",
Expand All @@ -65,6 +70,10 @@

ctx->has_simdgroup_mm = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];

#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
ctx->has_residency_sets = getenv("GGML_METAL_NO_RESIDENCY") == NULL;
#endif

ctx->has_bfloat = [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
ctx->has_bfloat |= [ctx->mtl_device supportsFamily:MTLGPUFamilyApple6];

Expand Down Expand Up @@ -483,6 +492,11 @@ @implementation GGMLMetalClass
GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

ctx->queue = [device newCommandQueue];
if (ctx->queue == nil) {
GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
return NULL;
}

ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);

id<MTLLibrary> metal_library;
Expand Down Expand Up @@ -649,6 +663,7 @@ @implementation GGMLMetalClass

GGML_LOG_INFO("%s: simdgroup reduction = %s\n", __func__, ctx_dev->has_simdgroup_reduction ? "true" : "false");
GGML_LOG_INFO("%s: simdgroup matrix mul. = %s\n", __func__, ctx_dev->has_simdgroup_mm ? "true" : "false");
GGML_LOG_INFO("%s: has residency sets = %s\n", __func__, ctx_dev->has_residency_sets ? "true" : "false");
GGML_LOG_INFO("%s: has bfloat = %s\n", __func__, ctx_dev->has_bfloat ? "true" : "false");
GGML_LOG_INFO("%s: use bfloat = %s\n", __func__, ctx_dev->use_bfloat ? "true" : "false");
GGML_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx_dev->mtl_device.hasUnifiedMemory ? "true" : "false");
Expand Down Expand Up @@ -1035,8 +1050,70 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) {
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
int n_buffers;
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];

// optional MTLResidencySet
id rset;
};

// rset init
static bool ggml_backend_metal_buffer_rset_init(
struct ggml_backend_metal_buffer_context * ctx,
struct ggml_backend_metal_device_context * ctx_dev,
id<MTLDevice> device) {
ctx->rset = nil;

if (!ctx_dev->has_residency_sets) {
return true;
}

#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
if (@available(macOS 15.0, *)) {
MTLResidencySetDescriptor * desc = [[MTLResidencySetDescriptor alloc] init];
desc.label = @"ggml_backend_metal";
desc.initialCapacity = ctx->n_buffers;

NSError * error;
ctx->rset = [device newResidencySetWithDescriptor:desc error:&error];
if (error) {
GGML_LOG_ERROR("%s: error: %s\n", __func__, [[error description] UTF8String]);
[desc release];
return false;
}

[desc release];

for (int i = 0; i < ctx->n_buffers; i++) {
[ctx->rset addAllocation:ctx->buffers[i].metal];
}

[ctx->rset commit];
[ctx->rset requestResidency];

return true;
}
#else
GGML_UNUSED(ctx_dev);
GGML_UNUSED(device);
#endif

return true;
}

// rset free
static void ggml_backend_metal_buffer_rset_free(struct ggml_backend_metal_buffer_context * ctx) {
#if defined(GGML_METAL_HAS_RESIDENCY_SETS)
if (@available(macOS 15.0, *)) {
if (ctx->rset) {
[ctx->rset endResidency];
[ctx->rset removeAllAllocations];
[ctx->rset release];
}
}
#else
GGML_UNUSED(ctx);
#endif
}

// finds the Metal buffer that contains the tensor data on the GPU device
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
// Metal buffer based on the host memory pointer
Expand Down Expand Up @@ -4176,6 +4253,8 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
for (int i = 0; i < ctx->n_buffers; i++) {
[ctx->buffers[i].metal release];
}

ggml_backend_metal_buffer_rset_free(ctx);
ggml_backend_metal_device_rel(buffer->buft->device->context);

if (ctx->owned) {
Expand All @@ -4198,19 +4277,19 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer)
static void ggml_backend_metal_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
memset((char *)tensor->data + offset, value, size);

UNUSED(buffer);
GGML_UNUSED(buffer);
}

static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
memcpy((char *)tensor->data + offset, data, size);

UNUSED(buffer);
GGML_UNUSED(buffer);
}

static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
memcpy(data, (const char *)tensor->data + offset, size);

UNUSED(buffer);
GGML_UNUSED(buffer);
}

static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
Expand All @@ -4220,7 +4299,7 @@ static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, c
}
return false;

UNUSED(buffer);
GGML_UNUSED(buffer);
}

static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
Expand All @@ -4246,7 +4325,7 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_
static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "Metal";

UNUSED(buft);
GGML_UNUSED(buft);
}

static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t size_aligned) {
Expand All @@ -4270,8 +4349,8 @@ static void ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_t s
}
#endif
#endif
UNUSED(device);
UNUSED(size_aligned);
GGML_UNUSED(device);
GGML_UNUSED(size_aligned);
}

static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
Expand All @@ -4284,7 +4363,8 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
size_aligned += (size_page - (size_aligned % size_page));
}

id<MTLDevice> device = ggml_backend_metal_device_acq(buft->device->context);
struct ggml_backend_metal_device_context * ctx_dev = (struct ggml_backend_metal_device_context *)buft->device->context;
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);

ctx->all_data = ggml_metal_host_malloc(size_aligned);
ctx->all_size = size_aligned;
Expand All @@ -4307,7 +4387,14 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
free(ctx);
ggml_backend_metal_device_rel(buft->device->context);
ggml_backend_metal_device_rel(ctx_dev);
return NULL;
}

if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
free(ctx);
ggml_backend_metal_device_rel(ctx_dev);
return NULL;
}

Expand All @@ -4318,7 +4405,7 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba

static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return 32;
UNUSED(buft);
GGML_UNUSED(buft);
}

static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
Expand All @@ -4328,13 +4415,13 @@ static size_t ggml_backend_metal_buffer_type_get_max_size(ggml_backend_buffer_ty

return max_size;

UNUSED(buft);
GGML_UNUSED(buft);
}

static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return true;

UNUSED(buft);
GGML_UNUSED(buft);
}

ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
Expand All @@ -4357,7 +4444,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
return "Metal_Mapped";

UNUSED(buft);
GGML_UNUSED(buft);
}

static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
Expand Down Expand Up @@ -4400,7 +4487,8 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
size_aligned += (size_page - (size_aligned % size_page));
}

id<MTLDevice> device = ggml_backend_metal_device_acq(&g_ggml_ctx_dev_main);
struct ggml_backend_metal_device_context * ctx_dev = &g_ggml_ctx_dev_main;
id<MTLDevice> device = ggml_backend_metal_device_acq(ctx_dev);

// the buffer fits into the max buffer size allowed by the device
if (size_aligned <= device.maxBufferLength) {
Expand Down Expand Up @@ -4453,6 +4541,13 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
}
}

if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
free(ctx);
ggml_backend_metal_device_rel(ctx_dev);
return NULL;
}

return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
}

Expand All @@ -4461,7 +4556,7 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz
static const char * ggml_backend_metal_name(ggml_backend_t backend) {
return "Metal";

UNUSED(backend);
GGML_UNUSED(backend);
}

static void ggml_backend_metal_free(ggml_backend_t backend) {
Expand Down Expand Up @@ -4766,6 +4861,13 @@ static ggml_backend_buffer_t ggml_backend_metal_device_buffer_from_ptr(ggml_back
}
}

if (!ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
free(ctx);
ggml_backend_metal_device_rel(ctx_dev);
return NULL;
}

return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
}

Expand All @@ -4779,7 +4881,7 @@ static bool ggml_backend_metal_device_supports_buft(ggml_backend_dev_t dev, ggml
return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name ||
buft->iface.get_name == ggml_backend_metal_buffer_from_ptr_type_get_name;

UNUSED(dev);
GGML_UNUSED(dev);
}

static bool ggml_backend_metal_device_offload_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
Expand Down
Loading