@@ -388,7 +388,7 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im
     const int n_layer = hparams.n_layer;
     const float eps = hparams.eps;
 
-    GGML_ASSERT(imgs.size() == 1); // batch_size == 1
+    GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1
 
     struct ggml_init_params params = {
         /*.mem_size =*/ ctx->buf_compute_meta.size(),
@@ -540,16 +540,16 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
         image_size_width  = load_image_size.width;
         image_size_height = load_image_size.height;
         if (is_inf) {
-            image_size_width  = imgs[0]->nx;
-            image_size_height = imgs[0]->ny;
+            image_size_width  = imgs.entries[0]->nx;
+            image_size_height = imgs.entries[0]->ny;
         }
     }
     else if (ctx->has_qwen2vl_merger) {
         // use the image's native resolution when image is avaible
         if (is_inf) {
             // if (imgs->data->nx && imgs->data->ny) {
-            image_size_width  = imgs[0]->nx;
-            image_size_height = imgs[0]->ny;
+            image_size_width  = imgs.entries[0]->nx;
+            image_size_height = imgs.entries[0]->ny;
         }
     }
     const int patch_size = hparams.patch_size;
@@ -564,7 +564,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im
     const float eps = hparams.eps;
     int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4};
 
-    const int batch_size = imgs.size();
+    const int batch_size = imgs.entries.size();
 
     if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) {
         GGML_ASSERT(batch_size == 1);
@@ -1477,15 +1477,15 @@ struct clip_model_loader {
 
         // create a fake batch
         clip_image_f32_batch batch;
-        clip_image_f32_ptr img;
+        clip_image_f32_ptr img(clip_image_f32_init());
         clip_image_size image_size;
         image_size.width  = clip_get_image_size(&ctx_clip);
         image_size.height = clip_get_image_size(&ctx_clip);
         int n_patches = clip_get_image_size(&ctx_clip) / image_size.width;
         img->nx = n_patches;
         img->ny = n_patches;
         img->buf.resize(n_patches * image_size.width * image_size.height * 3);
-        batch.push_back(std::move(img));
+        batch.entries.push_back(std::move(img));
 
         ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false);
         ggml_backend_sched_reserve(ctx_clip.sched.get(), gf);
@@ -1626,31 +1626,31 @@ void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) { if (batch) d
 void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) { if (batch) delete batch; }
 
 size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch) {
-    return batch->size();
+    return batch->entries.size();
 }
 
 size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx) {
-    if (idx < 0 || idx >= (int)batch->size()) {
+    if (idx < 0 || idx >= (int)batch->entries.size()) {
         LOG_ERR("%s: invalid index %d\n", __func__, idx);
         return 0;
     }
-    return batch->at(idx)->nx;
+    return batch->entries[idx]->nx;
 }
 
 size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx) {
-    if (idx < 0 || idx >= (int)batch->size()) {
+    if (idx < 0 || idx >= (int)batch->entries.size()) {
         LOG_ERR("%s: invalid index %d\n", __func__, idx);
         return 0;
     }
-    return batch->at(idx)->ny;
+    return batch->entries[idx]->ny;
 }
 
 clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx) {
-    if (idx < 0 || idx >= (int)batch->size()) {
+    if (idx < 0 || idx >= (int)batch->entries.size()) {
         LOG_ERR("%s: invalid index %d\n", __func__, idx);
         return nullptr;
     }
-    return batch->at(idx).get();
+    return batch->entries[idx].get();
 }
 
 void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, clip_image_u8 * img) {
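
The batch accessors above pin down the shape of the refactor: `clip_image_f32_batch` no longer behaves like a `std::vector` itself but owns its images through an `entries` member, and the `_ptr` types are smart pointers (hence the `.get()` and `std::move` calls). Below is a minimal sketch of the implied types; the field names, alias names, and `entries` member are taken from the diff, while the deleter structs and bodies are assumptions, not the verbatim clip.h:

#include <cstdint>
#include <memory>
#include <vector>

// sketch only -- reconstructed from the call sites in this diff
struct clip_image_u8 {
    int nx = 0;
    int ny = 0;
    std::vector<uint8_t> buf; // interleaved RGB, 3 bytes per pixel
};

struct clip_image_f32 {
    int nx = 0;
    int ny = 0;
    std::vector<float> buf;   // normalized RGB, 3 floats per pixel
};

// assumed deleters; the real ones presumably call the matching clip_image_*_free()
struct clip_image_u8_deleter  { void operator()(clip_image_u8  * p) const { delete p; } };
struct clip_image_f32_deleter { void operator()(clip_image_f32 * p) const { delete p; } };

using clip_image_u8_ptr  = std::unique_ptr<clip_image_u8,  clip_image_u8_deleter>;
using clip_image_f32_ptr = std::unique_ptr<clip_image_f32, clip_image_f32_deleter>;

// the batch wraps its storage in `entries` instead of exposing vector methods
// directly, which is why every call site moves from imgs.size() to
// imgs.entries.size()
struct clip_image_f32_batch {
    std::vector<clip_image_f32_ptr> entries;
};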
@@ -1884,7 +1884,7 @@ static std::vector<clip_image_u8_ptr> divide_to_patches_u8(const clip_image_u8 &
     int height = image.ny;
     for (int i = 0; i < height; i += patch_size) {
         for (int j = 0; j < width; j += patch_size) {
-            clip_image_u8_ptr patch;
+            clip_image_u8_ptr patch(clip_image_u8_init());
             patch->nx = std::min(patch_size, width - j);
             patch->ny = std::min(patch_size, height - i);
             patch->buf.resize(3 * patch->nx * patch->ny);
@@ -1990,14 +1990,14 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_im
 
     if (multiple <= 1) {
         auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size, true);
-        clip_image_u8_ptr source_image;
+        clip_image_u8_ptr source_image(clip_image_u8_init());
         bicubic_resize(*img, *source_image, best_size.first, best_size.second);
         // source_image = image.resize(best_size, Image.Resampling.BICUBIC)
         images.back().push_back(std::move(source_image));
     }
     else if (multiple > 1) {
         auto best_size = uhd_find_best_resize(original_size, scale_resolution, patch_size);
-        clip_image_u8_ptr source_image;
+        clip_image_u8_ptr source_image(clip_image_u8_init());
         bicubic_resize(*img, *source_image, best_size.first, best_size.second);
         // source_image = image.copy().resize(best_resize, Image.Resampling.BICUBIC)
         LOG_DBG("%s: image_size: %d %d; source_image size: %d %d\n", __func__, img->nx, img->ny, best_size.first, best_size.second);
@@ -2007,7 +2007,7 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_im
         LOG_DBG("%s: image_size: %d %d; best_grid: %d %d\n", __func__, img->nx, img->ny, best_grid.first, best_grid.second);
 
         auto refine_size = uhd_get_refine_size(original_size, best_grid, scale_resolution, patch_size, true);
-        clip_image_u8_ptr refine_image;
+        clip_image_u8_ptr refine_image(clip_image_u8_init());
         bicubic_resize(*img, *refine_image, refine_size.first, refine_size.second);
 
         LOG_DBG("%s: refine_image_size: %d %d; refine_size: %d %d\n", __func__, refine_image->nx, refine_image->ny, refine_size.first, refine_size.second);
@@ -2020,7 +2020,7 @@ static std::vector<std::vector<clip_image_u8_ptr>> uhd_slice_image(const clip_im
         for (int patches_i = 0, ic = 0; patches_i < height && ic < best_grid.second; patches_i += grid_y, ic += 1) {
             images.push_back(std::vector<clip_image_u8_ptr>());
             for (int patches_j = 0, jc = 0; patches_j < width && jc < best_grid.first; patches_j += grid_x, jc += 1) {
-                clip_image_u8_ptr patch;
+                clip_image_u8_ptr patch(clip_image_u8_init());
                 patch->nx = grid_x;
                 patch->ny = grid_y;
                 patch->buf.resize(3 * patch->nx * patch->ny);
@@ -2062,36 +2062,36 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         for (size_t i = 0; i < imgs.size(); ++i) {
             for (size_t j = 0; j < imgs[i].size(); ++j) {
                 LOG_DBG("%s: %d %d\n", __func__, imgs[i][j]->nx, imgs[i][j]->ny);
-                clip_image_f32_ptr res;
+                clip_image_f32_ptr res(clip_image_f32_init());
                 normalize_image_u8_to_f32(*imgs[i][j], *res, ctx->image_mean, ctx->image_std);
-                res_imgs->push_back(std::move(res));
+                res_imgs->entries.push_back(std::move(res));
             }
         }
         return true;
     }
     else if (ctx->has_qwen2vl_merger) {
-        clip_image_u8_ptr resized;
+        clip_image_u8 resized;
         auto patch_size = clip_get_patch_size(ctx) * 2;
         int nx = ceil((float)img->nx / patch_size) * patch_size;
         int ny = ceil((float)img->ny / patch_size) * patch_size;
-        bicubic_resize(*img, *resized, nx, ny);
+        bicubic_resize(*img, resized, nx, ny);
 
-        clip_image_f32_ptr img_f32;
-        // clip_image_f32_ptr res;
-        normalize_image_u8_to_f32(*resized, *img_f32, ctx->image_mean, ctx->image_std);
+        clip_image_f32_ptr img_f32(clip_image_f32_init());
+        // clip_image_f32_ptr res(clip_image_f32_init());
+        normalize_image_u8_to_f32(resized, *img_f32, ctx->image_mean, ctx->image_std);
         // res_imgs->data[0] = *res;
-        res_imgs->push_back(std::move(img_f32));
+        res_imgs->entries.push_back(std::move(img_f32));
         return true;
     }
 
     if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
         clip_image_u8 resized_image;
         int32_t sz = ctx->vision_model.hparams.image_size;
         bicubic_resize(*img, resized_image, sz, sz);
-        clip_image_f32_ptr img_f32;
+        clip_image_f32_ptr img_f32(clip_image_f32_init());
         // clip_image_save_to_bmp(resized_image, "resized.bmp");
         normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
-        res_imgs->push_back(std::move(img_f32));
+        res_imgs->entries.push_back(std::move(img_f32));
         return true;
     }
 
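
The qwen2vl branch above is the clearest instance of the bug class this commit removes: `clip_image_u8_ptr resized;` default-constructs a null smart pointer, so the subsequent `*resized` dereferenced null. The fix either switches to a stack object (`clip_image_u8 resized;`) or constructs the pointer from the init factory. A two-line contrast, assuming the unique_ptr alias sketched earlier:

clip_image_f32_ptr bad;                          // default-constructed: holds nullptr, so *bad is undefined behavior
clip_image_f32_ptr good(clip_image_f32_init());  // owns a freshly allocated image, so *good is safe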
@@ -2106,12 +2106,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         pad_to_square = false;
     }
     // free the previous res_imgs if any set
-    res_imgs->clear();
+    res_imgs->entries.clear();
 
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
     // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
 
-    clip_image_u8_ptr temp; // we will keep the input image data here temporarily
+    clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
     if (pad_to_square && img->nx != img->ny) {
         int longer_side = std::max(img->nx, img->ny);
         temp->nx = longer_side;
@@ -2156,14 +2156,14 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
 
         std::vector<clip_image_u8_ptr> patches = divide_to_patches_u8(*temp, params.image_size); // prepare spatial sorted main patches of image_size each (336 in llava-1.6)
 
-        clip_image_u8_ptr image_original_resize;
+        clip_image_u8_ptr image_original_resize(clip_image_u8_init());
         // bilinear_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
         bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
         patches.insert(patches.begin(), std::move(image_original_resize));
         for (auto & patch : patches) {
-            clip_image_f32_ptr res;
+            clip_image_f32_ptr res(clip_image_f32_init());
             normalize_image_u8_to_f32(*patch, *res, ctx->image_mean, ctx->image_std);
-            res_imgs->push_back(std::move(res));
+            res_imgs->entries.push_back(std::move(res));
         }
 
         return true;
@@ -2181,7 +2181,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
 
     const int nx2 = ctx->vision_model.hparams.image_size;
     const int ny2 = ctx->vision_model.hparams.image_size;
-    clip_image_f32_ptr res;
+    clip_image_f32_ptr res(clip_image_f32_init());
     res->nx = nx2;
     res->ny = ny2;
     res->buf.resize(3 * nx2 * ny2);
@@ -2242,7 +2242,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
     // }
     // res_imgs.push_back(res);
 
-    res_imgs->push_back(std::move(res));
+    res_imgs->entries.push_back(std::move(res));
 
     return true;
 }
@@ -2424,9 +2424,9 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3
     }
 
     clip_image_f32_batch imgs;
-    clip_image_f32_ptr img_copy;
+    clip_image_f32_ptr img_copy(clip_image_f32_init());
     *img_copy = *img;
-    imgs.push_back(std::move(img_copy));
+    imgs.entries.push_back(std::move(img_copy));
 
     return clip_image_batch_encode(ctx, n_threads, &imgs, vec);
 }
@@ -2439,7 +2439,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return false;
     }
 
-    int batch_size = imgs.size();
+    int batch_size = imgs.entries.size();
     if (ctx->has_llava_projector) {
         GGML_ASSERT(batch_size == 1); // TODO: support multiple images
     }
@@ -2466,8 +2466,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     int image_size_width  = image_size;
     int image_size_height = image_size;
     if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) {
-        image_size_width  = imgs[0]->nx;
-        image_size_height = imgs[0]->ny;
+        image_size_width  = imgs.entries[0]->nx;
+        image_size_height = imgs.entries[0]->ny;
     }
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
@@ -2479,9 +2479,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
     float * data = (float *)malloc(ggml_nbytes(inp_raw));
 
-    for (size_t i = 0; i < imgs.size(); i++) {
-        const int nx = imgs[i]->nx;
-        const int ny = imgs[i]->ny;
+    for (size_t i = 0; i < imgs.entries.size(); i++) {
+        const int nx = imgs.entries[i]->nx;
+        const int ny = imgs.entries[i]->ny;
         if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) {
             GGML_ASSERT(nx == image_size && ny == image_size);
         }
@@ -2492,7 +2492,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         for (int k = 0; k < 3; k++) {
             for (int y = 0; y < ny; y++) {
                 for (int x = 0; x < nx; x++) {
-                    data[(b * 3 * n) + k * n + y * nx + x] = imgs[b]->buf[3 * (y * nx + x) + k];
+                    data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k];
                 }
             }
         }
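
Every call site in the commit follows the same ownership pattern. A hedged usage sketch of that pattern: the `clip_image_f32_init()` factory and the `entries` member are taken from the diff, while the dimensions here are made up for illustration:

// build a one-image batch the way clip_image_encode() now does;
// clip_image_f32_init() is the factory used throughout this diff
clip_image_f32_batch batch;

clip_image_f32_ptr img(clip_image_f32_init()); // construct from the factory, never default-construct
img->nx = 336;
img->ny = 336;
img->buf.resize(3 * img->nx * img->ny);       // interleaved RGB floats

batch.entries.push_back(std::move(img));      // batch takes ownership; img is null afterwards

// reads also go through entries, mirroring the batch accessors above
const size_t n_images = batch.entries.size();
const int nx0 = n_images > 0 ? batch.entries[0]->nx : 0;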