@@ -583,20 +583,20 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_gpu_layers = std::stoi(argv[i]);
-#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
         } else if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
             params.n_gpu_layers_draft = std::stoi(argv[i]);
-#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD
-            fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
-            fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
-#endif
+            if (!llama_supports_gpu_offload()) {
+                fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n");
+                fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n");
+            }
         } else if (arg == "--main-gpu" || arg == "-mg") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -637,11 +637,11 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             const std::regex regex{R"([,/]+)"};
             std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1};
             std::vector<std::string> split_arg{it, {}};
-            if (split_arg.size() >= LLAMA_MAX_DEVICES) {
+            if (split_arg.size() >= llama_max_devices()) {
                 invalid_param = true;
                 break;
             }
-            for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) {
+            for (size_t i = 0; i < llama_max_devices(); ++i) {
                 if (i < split_arg.size()) {
                     params.tensor_split[i] = std::stof(split_arg[i]);
                 } else {
@@ -989,30 +989,30 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" -cb, --cont-batching enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
     printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
     printf(" --image IMAGE_FILE path to an image file. use with multimodal models\n");
-    if (llama_mlock_supported()) {
+    if (llama_supports_mlock()) {
         printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
     }
-    if (llama_mmap_supported()) {
+    if (llama_supports_mmap()) {
         printf(" --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n");
     }
     printf(" --numa attempt optimizations that help on some NUMA systems\n");
     printf(" if run without this previously, it is recommended to drop the system page cache before using this\n");
     printf(" see https://github.com./ggerganov/llama.cpp/issues/1437\n");
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-    printf(" -ngl N, --n-gpu-layers N\n");
-    printf(" number of layers to store in VRAM\n");
-    printf(" -ngld N, --n-gpu-layers-draft N\n");
-    printf(" number of layers to store in VRAM for the draft model\n");
-    printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
-    printf(" how to split the model across multiple GPUs, one of:\n");
-    printf("   - none: use one GPU only\n");
-    printf("   - layer (default): split layers and KV across GPUs\n");
-    printf("   - row: split rows across GPUs\n");
-    printf(" -ts SPLIT, --tensor-split SPLIT\n");
-    printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
-    printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
-    printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
-#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
+    if (llama_supports_gpu_offload()) {
+        printf(" -ngl N, --n-gpu-layers N\n");
+        printf(" number of layers to store in VRAM\n");
+        printf(" -ngld N, --n-gpu-layers-draft N\n");
+        printf(" number of layers to store in VRAM for the draft model\n");
+        printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n");
+        printf(" how to split the model across multiple GPUs, one of:\n");
+        printf("   - none: use one GPU only\n");
+        printf("   - layer (default): split layers and KV across GPUs\n");
+        printf("   - row: split rows across GPUs\n");
+        printf(" -ts SPLIT, --tensor-split SPLIT\n");
+        printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
+        printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n");
+        printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
+    }
     printf(" --verbose-prompt print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf(" --no-display-prompt don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf(" -gan N, --grp-attn-n N\n");
@@ -1651,7 +1651,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
     fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
 
-    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + LLAMA_MAX_DEVICES);
+    const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());
     dump_vector_float_yaml(stream, "tensor_split", tensor_split_vector);
 
     fprintf(stream, "tfs: %f # default: 1.0\n", sparams.tfs_z);
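
The common thread in these hunks is the switch from compile-time feature macros (LLAMA_SUPPORTS_GPU_OFFLOAD, LLAMA_MAX_DEVICES) to runtime queries. Below is a minimal sketch, not part of the commit, of how calling code might probe these capabilities; it assumes the llama_supports_* helpers declared in llama.h return bool and llama_max_devices() returns a size_t-compatible count, as the hunks above suggest.

// Minimal sketch (assumption: llama_supports_* return bool, llama_max_devices() returns size_t).
#include <cstdio>
#include <vector>

#include "llama.h"

int main() {
    // Query build capabilities at run time instead of branching on preprocessor macros.
    std::printf("gpu offload: %s\n", llama_supports_gpu_offload() ? "yes" : "no");
    std::printf("mmap:        %s\n", llama_supports_mmap()        ? "yes" : "no");
    std::printf("mlock:       %s\n", llama_supports_mlock()       ? "yes" : "no");

    // Buffers that used to be sized with LLAMA_MAX_DEVICES can now be sized at run time.
    std::vector<float> tensor_split(llama_max_devices(), 0.0f);
    std::printf("max devices: %zu\n", tensor_split.size());
    return 0;
}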