Skip to content

Commit 53ae0dd

Browse files
committed
Use a host buffer for the CPU compute buffer for faster copies to the GPU
1 parent 458674c commit 53ae0dd

File tree

5 files changed

+35
-11
lines changed

5 files changed

+35
-11
lines changed

ggml-alloc.c

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -265,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
265265
return alloc;
266266
}
267267

268-
ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
268+
ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
269269
// create a backend buffer to get the correct tensor allocation sizes
270-
ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1);
270+
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
271271

272272
// TODO: move alloc initialization to a common ggml_tallocr_new_impl function
273273
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
@@ -277,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe
277277
return alloc;
278278
}
279279

280-
ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
281-
ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
280+
ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
281+
return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
282+
}
283+
284+
ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
285+
// allocate a backend buffer of the requested size from the given buffer type; the allocator is marked as owning it
286+
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
282287
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
283288
alloc->buffer_owned = true;
284289
return alloc;
285290
}
286291

292+
ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
293+
return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
294+
}
295+
287296
ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
288297
ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
289298

ggml-alloc.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t;
5252

5353
GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
5454
GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
55-
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
55+
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
5656
GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
57+
GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
58+
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
5759
GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
5860

5961
GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);

ggml-backend.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,7 @@ struct ggml_backend_sched {
776776

777777
int n_backends;
778778
ggml_backend_t backends[GGML_MAX_BACKENDS];
779+
ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
779780
ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
780781

781782
ggml_gallocr_t galloc;
@@ -1334,7 +1335,7 @@ static void sched_reset(ggml_backend_sched_t sched) {
13341335
sched->is_reset = true;
13351336
}
13361337

1337-
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size) {
1338+
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
13381339
GGML_ASSERT(n_backends > 0);
13391340
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
13401341

@@ -1348,13 +1349,14 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_bac
13481349
sched->n_backends = n_backends;
13491350
for (int i = 0; i < n_backends; i++) {
13501351
sched->backends[i] = backends[i];
1352+
sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
13511353
}
13521354

13531355
sched->galloc = ggml_gallocr_new();
13541356

13551357
// init measure allocs for each backend
13561358
for (int i = 0; i < n_backends; i++) {
1357-
sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
1359+
sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
13581360
}
13591361

13601362
sched_reset(sched);
@@ -1387,7 +1389,7 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr
13871389
for (int i = 0; i < sched->n_backends; i++) {
13881390
size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
13891391
ggml_tallocr_free(sched->tallocs[i]);
1390-
sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
1392+
sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
13911393
}
13921394

13931395
sched_reset(sched);

ggml-backend.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ extern "C" {
149149
typedef struct ggml_backend_sched * ggml_backend_sched_t;
150150

151151
// Initialize a backend scheduler
152-
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends, size_t graph_size);
152+
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
153153
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
154154
// Initialize backend buffers from a measure graph
155155
GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);

llama.cpp

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9368,10 +9368,21 @@ struct llama_context * llama_new_context_with_model(
93689368
}
93699369

93709370
{
9371+
// buffer types used for the compute buffer of each backend
9372+
std::vector<ggml_backend_buffer_type_t> backend_buft;
9373+
for (auto * backend : ctx->backends) {
9374+
if (ggml_backend_is_cpu(backend)) {
9375+
// use host buffers for the CPU backend compute buffer
9376+
backend_buft.push_back(llama_default_buffer_type_cpu(true));
9377+
} else {
9378+
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
9379+
}
9380+
}
9381+
93719382
// buffer used to store the computation graph and the tensor meta data
93729383
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
93739384

9374-
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), ctx->backends.size(), LLAMA_MAX_NODES);
9385+
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
93759386
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
93769387

93779388
// build worst-case graph
@@ -9390,7 +9401,7 @@ struct llama_context * llama_new_context_with_model(
93909401
for (ggml_backend_t backend : ctx->backends) {
93919402
ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
93929403
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
9393-
ggml_backend_name(backend),
9404+
ggml_backend_buffer_name(buf),
93949405
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
93959406
}
93969407
}

0 commit comments

Comments
 (0)