 
 #include "ggml.h"
 #ifdef GGML_USE_CUBLAS
+#include <cuda_runtime.h>
 #include "ggml-cuda.h"
 #endif
 
@@ -641,7 +642,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
         auto it = tensors_map.name_to_idx.find(name);
         if (it == tensors_map.name_to_idx.end()) {
             throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +653,10 @@ struct llama_model_loader {
                          name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
         }
 
-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
     }
 
-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
         struct ggml_tensor * tensor;
         if (lt.ne.size() == 2) {
             tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +666,7 @@ struct llama_model_loader {
         }
         ggml_set_name(tensor, lt.name.c_str());
         LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
         lt.ggml_tensor = tensor;
         num_ggml_tensors_created++;
         return tensor;
@@ -683,7 +685,7 @@ struct llama_model_loader {
         }
 
         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, false));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
@@ -696,6 +698,9 @@ struct llama_model_loader {
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
@@ -944,26 +949,6 @@ static void llama_model_load_internal(
     ml->calc_sizes(&ctx_size, &mmapped_size);
     fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
 
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
-
     // create the ggml context
     {
         lctx.model.buf.resize(ctx_size);
@@ -985,79 +970,131 @@ static void llama_model_load_internal(
     }
 
     // prepare memory for the weights
+    size_t vram_total = 0;
     {
         const uint32_t n_embd = hparams.n_embd;
         const uint32_t n_layer = hparams.n_layer;
         const uint32_t n_vocab = hparams.n_vocab;
 
         ml->ggml_ctx = ctx;
 
-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm = ml->get_tensor("norm.weight", {n_embd});
-        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab});
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm = ml->get_tensor("norm.weight", {n_embd}, GGML_BACKEND_CPU);
+        ggml_backend backend_output;
+        if (n_gpu_layers > int(n_layer)) {
+            backend_output = GGML_BACKEND_CUDA;
+        } else {
+            backend_output = GGML_BACKEND_CPU;
+        }
+        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
 
         model.layers.resize(n_layer);
+        const int i_gpu_start = n_layer - n_gpu_layers;
         for (uint32_t i = 0; i < n_layer; ++i) {
             auto & layer = model.layers[i];
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : GGML_BACKEND_CUDA;
 
             std::string layers_i = "layers." + std::to_string(i);
 
-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
 
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
 
-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
 
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend);
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total += ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)
+                            + ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm)
+                            + ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
         }
     }
 
     ml->done_getting_tensors();
 
+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
+
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
+
+#ifdef GGML_USE_CUBLAS
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#else
+        (void) n_gpu_layers;
+#endif
+    }
+
     // populate `tensors_by_name`
     for (llama_load_tensor & lt : ml->tensors_map.tensors) {
         model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
     }
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-    model.mapping = std::move(ml->mapping);
 #ifdef GGML_USE_CUBLAS
     {
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+        std::vector<struct ggml_tensor *> tensors;
+        std::vector<size_t> offsets;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            tensors.emplace_back(lt.ggml_tensor);
+            LLAMA_ASSERT(lt.shards.size() == 1);
+            offsets.emplace_back(lt.shards.at(0).file_off);
+        }
+        bool cufile_success = ggml_cuda_load_data_cufile(fname.c_str(), tensors.data(), tensors.size(), offsets.data());
 
-        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (!cufile_success) {
+            for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+                if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                    continue;
+                }
+                size_t actual_size;
+                void * buf = ggml_cuda_pool_malloc(lt.size, &actual_size);
+                void * buf_host = ggml_cuda_host_malloc(lt.size);
 
-        size_t vram_total = 0;
+                llama_file & file = ml->file_loaders.at(lt.shards.at(0).file_idx)->file;
+                file.seek(lt.shards.at(0).file_off, SEEK_SET);
+                file.read_raw(buf_host, lt.size);
 
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
+                cudaMemcpy(buf, buf_host, lt.size, cudaMemcpyHostToDevice);
+                cudaDeviceSynchronize();
 
-            ggml_cuda_transform_tensor(layer.attention_norm); vram_total += ggml_nbytes(layer.attention_norm);
-            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cuda_transform_tensor(layer.ffn_norm); vram_total += ggml_nbytes(layer.ffn_norm);
-            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
-        }
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+                lt.ggml_tensor->data = buf;
+                ggml_cuda_host_free(buf_host);
+            }
         }
-
-        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
     }
-#else
-    (void) n_gpu_layers;
-#endif
+#endif // GGML_USE_CUBLAS
+
+    model.mapping = std::move(ml->mapping);
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
@@ -2395,7 +2432,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             }
             size_t idx = model_loader->tensors_map.name_to_idx[base_name];
            llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-            base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+            base_t = model_loader->get_tensor(
+                base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
             lt.data = (uint8_t *) lt.ggml_tensor->data;
             model_loader->load_data_for(lt);
             lt.ggml_tensor->data = lt.data;