Commit 5cb04db

ggerganov and slaren authored
llama : remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD (#5240)
* llama : remove LLAMA_MAX_DEVICES from llama.h (ggml-ci)
* Update llama.cpp (Co-authored-by: slaren <[email protected]>)
* server : remove LLAMA_MAX_DEVICES (ggml-ci)
* llama : remove LLAMA_SUPPORTS_GPU_OFFLOAD (ggml-ci)
* train : remove LLAMA_SUPPORTS_GPU_OFFLOAD
* readme : add deprecation notice
* readme : change deprecation notice to "remove" and fix url
* llama : remove gpu includes from llama.h (ggml-ci)

Co-authored-by: slaren <[email protected]>
1 parent efb7bdb commit 5cb04db
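The net effect of the change: the compile-time constants LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD, and the old llama_mmap_supported() / llama_mlock_supported() helpers, give way to runtime queries — llama_max_devices(), llama_supports_gpu_offload(), llama_supports_mmap() and llama_supports_mlock(). Below is a minimal sketch of how a caller might probe them at startup; the llama.h side of the change is not among the hunks shown on this page, so the return types here are inferred from how the call sites in the diffs below use them (llama_max_devices() is compared against a size_t, the others are used as booleans).

```cpp
// Minimal capability probe, assuming the runtime query functions used by the
// call sites in this commit (llama_max_devices, llama_supports_*).
#include <cstdio>
#include "llama.h"

int main() {
    fprintf(stderr, "max devices : %zu\n", (size_t) llama_max_devices());
    fprintf(stderr, "mmap        : %s\n", llama_supports_mmap()        ? "yes" : "no");
    fprintf(stderr, "mlock       : %s\n", llama_supports_mlock()       ? "yes" : "no");
    fprintf(stderr, "gpu offload : %s\n", llama_supports_gpu_offload() ? "yes" : "no");
    return 0;
}
```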

File tree: 9 files changed (+144, -125 lines)


README.md (+2, -1)
@@ -10,7 +10,8 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
 ### Hot topics
 
-- ⚠️ Incoming backends: https://github.com./ggerganov/llama.cpp/discussions/5138
+- Remove LLAMA_MAX_DEVICES and LLAMA_SUPPORTS_GPU_OFFLOAD: https://github.com./ggerganov/llama.cpp/pull/5240
+- Incoming backends: https://github.com./ggerganov/llama.cpp/discussions/5138
 - [SYCL backend](README-sycl.md) is ready (1/28/2024), support Linux/Windows in Intel GPUs (iGPU, Arc/Flex/Max series)
 - New SOTA quantized models, including pure 2-bits: https://huggingface.co/ikawrakow
 - Collecting Apple Silicon performance stats:

common/common.cpp (+28, -28)
@@ -583,20 +583,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_gpu_layers = std::stoi(argv[i]);
-#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
         } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_gpu_layers_draft = std::stoi(argv[i]);
-#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
         } else if (arg == "--main-gpu" || arg == "-mg") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -637,11 +637,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
            const std::regex regex{R"([,/]+)"};
            std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
            std::vector<std::string> split_arg{it, {}};
-           if (split_arg.size() >= LLAMA_MAX_DEVICES) {
+           if (split_arg.size() >= llama_max_devices()) {
                invalid_param = true;
                break;
            }
-           for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+           for (size_t i = 0; i < llama_max_devices(); ++i) {
                if (i < split_arg.size()) {
                    params.tensor_split[i] = std::stof(split_arg[i]);
                } else {
@@ -989,30 +989,30 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
     printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
-    if (llama_mlock_supported()) {
+    if (llama_supports_mlock()) {
         printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
     }
-    if (llama_mmap_supported()) {
+    if (llama_supports_mmap()) {
         printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
     printf(" --numa attempt optimizations that help on some NUMA systems\n");
     printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
     printf(" see https://github.com./ggerganov/llama.cpp/issues/1437\n");
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    printf(" -ngl N, --n-gpu-layers N\n");
-    printf(" number of layers to store in VRAM\n");
-    printf(" -ngld N, --n-gpu-layers-draft N\n");
-    printf(" number of layers to store in VRAM for the draft model\n");
-    printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
-    printf(" how to split the model across multiple GPUs, one of:\n");
-    printf(" - none: use one GPU only\n");
-    printf(" - layer (default): split layers and KV across GPUs\n");
-    printf(" - row: split rows across GPUs\n");
-    printf(" -ts SPLIT, --tensor-split SPLIT\n");
-    printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
-    printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
-    printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
-#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
+    if (llama_supports_gpu_offload()) {
+        printf(" -ngl N, --n-gpu-layers N\n");
+        printf(" number of layers to store in VRAM\n");
+        printf(" -ngld N, --n-gpu-layers-draft N\n");
+        printf(" number of layers to store in VRAM for the draft model\n");
+        printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+        printf(" how to split the model across multiple GPUs, one of:\n");
+        printf(" - none: use one GPU only\n");
+        printf(" - layer (default): split layers and KV across GPUs\n");
+        printf(" - row: split rows across GPUs\n");
+        printf(" -ts SPLIT, --tensor-split SPLIT\n");
+        printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
+        printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
+        printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
+    }
     printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf(" -gan N, --grp-attn-n N\n");
@@ -1651,7 +1651,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
     fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
 
-    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
+    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);

common/common.h (+34, -34)
@@ -43,40 +43,40 @@ extern char const *LLAMA_BUILD_TARGET;
 int32_t get_num_physical_cores();
 
 struct gpt_params {
-    uint32_t seed = -1; // RNG seed
-
-    int32_t n_threads = get_num_physical_cores();
-    int32_t n_threads_draft = -1;
-    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
-    int32_t n_threads_batch_draft = -1;
-    int32_t n_predict = -1; // new tokens to predict
-    int32_t n_ctx = 512; // context size
-    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
-    int32_t n_keep = 0; // number of tokens to keep from initial prompt
-    int32_t n_draft = 8; // number of tokens to draft during speculative decoding
-    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
-    int32_t n_parallel = 1; // number of parallel sequences to decode
-    int32_t n_sequences = 1; // number of sequences to decode
-    float p_accept = 0.5f; // speculative decoding accept probability
-    float p_split = 0.1f; // speculative decoding split probability
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
-    llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
-    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
-    float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-    int32_t n_beams = 0; // if non-zero then use beam search of given width.
-    int32_t grp_attn_n = 1; // group-attention factor
-    int32_t grp_attn_w = 512; // group-attention width
-    int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
-    float rope_freq_base = 0.0f; // RoPE base frequency
-    float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
-    float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
-    float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
-    float yarn_beta_fast = 32.0f; // YaRN low correction dim
-    float yarn_beta_slow = 1.0f; // YaRN high correction dim
-    int32_t yarn_orig_ctx = 0; // YaRN original context length
-    int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
-    // pinging @cebtenzzre
+    uint32_t seed = -1; // RNG seed
+
+    int32_t n_threads = get_num_physical_cores();
+    int32_t n_threads_draft = -1;
+    int32_t n_threads_batch = -1; // number of threads to use for batch processing (-1 = use n_threads)
+    int32_t n_threads_batch_draft = -1;
+    int32_t n_predict = -1; // new tokens to predict
+    int32_t n_ctx = 512; // context size
+    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_keep = 0; // number of tokens to keep from initial prompt
+    int32_t n_draft = 8; // number of tokens to draft during speculative decoding
+    int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)
+    int32_t n_parallel = 1; // number of parallel sequences to decode
+    int32_t n_sequences = 1; // number of sequences to decode
+    float p_accept = 0.5f; // speculative decoding accept probability
+    float p_split = 0.1f; // speculative decoding split probability
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
+    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+    llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs
+    int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
+    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
+    int32_t n_beams = 0; // if non-zero then use beam search of given width.
+    int32_t grp_attn_n = 1; // group-attention factor
+    int32_t grp_attn_w = 512; // group-attention width
+    int32_t n_print = -1; // print token count every n tokens (-1 = disabled)
+    float rope_freq_base = 0.0f; // RoPE base frequency
+    float rope_freq_scale = 0.0f; // RoPE frequency scaling factor
+    float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor
+    float yarn_attn_factor = 1.0f; // YaRN magnitude scaling factor
+    float yarn_beta_fast = 32.0f; // YaRN low correction dim
+    float yarn_beta_slow = 1.0f; // YaRN high correction dim
+    int32_t yarn_orig_ctx = 0; // YaRN original context length
+    int8_t rope_scaling_type = LLAMA_ROPE_SCALING_UNSPECIFIED; // TODO: better to be int32_t for alignment
+    // pinging @cebtenzzre
 
     // // sampling parameters
     struct llama_sampling_params sparams;
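The only substantive change in this hunk is tensor_split: with the macro gone the array extent can no longer be LLAMA_MAX_DEVICES, so gpt_params keeps a fixed 128-float buffer and the call sites in common.cpp read at most llama_max_devices() entries from it. A small defensive check a caller could add is sketched below; it is illustrative only and not part of this commit.

```cpp
// Verify that the fixed-size buffer is at least as large as the number of
// devices the backend reports at runtime.
#include <cassert>
#include <cstddef>
#include "llama.h"

// Mirror of the relevant gpt_params field, for illustration only.
struct gpt_params_excerpt {
    float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
};

int main() {
    // The literal 128 is just an upper bound; only the first
    // llama_max_devices() entries are ever read by the parsing code.
    constexpr size_t capacity = sizeof(gpt_params_excerpt::tensor_split) / sizeof(float);
    assert(llama_max_devices() <= capacity);
    return 0;
}
```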

common/train.cpp (+6, -6)
@@ -1363,12 +1363,12 @@ bool consume_common_train_arg(
                *invalid_param = true;
                return true;
            }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-           params->n_gpu_layers = std::stoi(argv[i]);
-#else
-           fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-           fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+           if (llama_supports_gpu_offload()) {
+               params->n_gpu_layers = std::stoi(argv[i]);
+           } else {
+               fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+               fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+           }
        } else if (arg == "-h" || arg == "--help") {
            params->print_usage = true;
            return true;

examples/batched-bench/batched-bench.cpp (+1, -1)
@@ -88,7 +88,7 @@ int main(int argc, char ** argv) {
 
    llama_model_params model_params = llama_model_default_params();
 
-   const std::vector<float> t_split (LLAMA_MAX_DEVICES, 0.0f);
+   const std::vector<float> t_split(llama_max_devices(), 0.0f);
 
    model_params.n_gpu_layers = n_gpu_layers;
    model_params.tensor_split = t_split.data();

examples/llama-bench/llama-bench.cpp (+8, -8)
@@ -160,7 +160,7 @@ struct cmd_params {
    std::vector<int> main_gpu;
    std::vector<bool> no_kv_offload;
    std::vector<bool> mul_mat_q;
-   std::vector<std::array<float, LLAMA_MAX_DEVICES>> tensor_split;
+   std::vector<std::vector<float>> tensor_split;
    int reps;
    bool verbose;
    output_formats output_format;
@@ -179,7 +179,7 @@ static const cmd_params cmd_params_defaults = {
    /* main_gpu */ {0},
    /* no_kv_offload */ {false},
    /* mul_mat_q */ {true},
-   /* tensor_split */ {{}},
+   /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
    /* reps */ 5,
    /* verbose */ false,
    /* output_format */ MARKDOWN
@@ -380,10 +380,10 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
            const std::regex regex{R"([;/]+)"};
            std::sregex_token_iterator it{ts.begin(), ts.end(), regex, -1};
            std::vector<std::string> split_arg{it, {}};
-           GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
+           GGML_ASSERT(split_arg.size() <= llama_max_devices());
 
-           std::array<float, LLAMA_MAX_DEVICES> tensor_split;
-           for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+           std::vector<float> tensor_split(llama_max_devices());
+           for (size_t i = 0; i < llama_max_devices(); ++i) {
                if (i < split_arg.size()) {
                    tensor_split[i] = std::stof(split_arg[i]);
                } else {
@@ -459,7 +459,7 @@ struct cmd_params_instance {
    int main_gpu;
    bool no_kv_offload;
    bool mul_mat_q;
-   std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+   std::vector<float> tensor_split;
 
    llama_model_params to_llama_mparams() const {
        llama_model_params mparams = llama_model_default_params();
@@ -582,7 +582,7 @@ struct test {
    int main_gpu;
    bool no_kv_offload;
    bool mul_mat_q;
-   std::array<float, LLAMA_MAX_DEVICES> tensor_split;
+   std::vector<float> tensor_split;
    int n_prompt;
    int n_gen;
    std::string test_time;
@@ -704,7 +704,7 @@ struct test {
    std::vector<std::string> get_values() const {
        std::string tensor_split_str;
        int max_nonzero = 0;
-       for (int i = 0; i < LLAMA_MAX_DEVICES; i++) {
+       for (size_t i = 0; i < llama_max_devices(); i++) {
            if (tensor_split[i] > 0) {
                max_nonzero = i;
            }