@@ -2239,36 +2239,44 @@ ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
 }
 #endif
 
-#ifdef GGML_USE_CPU_AARCH64
-
 // buffer type AARCH64
 
+#ifdef __GNUC__
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+
 #include "ggml-aarch64.h"
 
+#ifdef __GNUC__
+#pragma GCC diagnostic pop
+#endif
+
+static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+    tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
+
+    GGML_UNUSED(buffer);
+}
+
 static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
-    bool quantize = tensor->type == GGML_TYPE_Q4_0 &&
-                    tensor->op == GGML_OP_NONE &&
-                    strcmp(tensor->name, "token_embd.weight") != 0;
+    GGML_ASSERT(offset == 0);
+    GGML_ASSERT(size == ggml_nbytes(tensor));
 
-    if (quantize) {
-        GGML_ASSERT(offset == 0);
-        if (ggml_prepare_optimal_kernel(tensor, data, size) == 0) {
-            return;
-        }
-    }
-    memcpy((char *)tensor->data + offset, data, size);
+    enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;
+
+    ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
 
     GGML_UNUSED(buffer);
 }
 
 static const struct ggml_backend_buffer_i ggml_backend_cpu_aarch64_buffer_i = {
     /* .free_buffer     = */ ggml_backend_cpu_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cpu_buffer_get_base,
-    /* .init_tensor     = */ NULL, // no initialization required
+    /* .init_tensor     = */ ggml_backend_cpu_aarch64_buffer_init_tensor,
     /* .memset_tensor   = */ ggml_backend_cpu_buffer_memset_tensor,
     /* .set_tensor      = */ ggml_backend_cpu_aarch64_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cpu_buffer_get_tensor,
-    /* .cpy_tensor      = */ ggml_backend_cpu_buffer_cpy_tensor,
+    /* .get_tensor      = */ NULL,
+    /* .cpy_tensor      = */ NULL,
     /* .clear           = */ ggml_backend_cpu_buffer_clear,
     /* .reset           = */ NULL,
 };
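Note on the init_tensor/set_tensor pair above: the optimal repack type is computed once in init_tensor and stashed in the tensor's extra pointer, then recovered in set_tensor by casting back through intptr_t. A minimal sketch of that enum-in-a-void-pointer round trip, using made-up names (example_type, example_tensor) instead of the real ggml types:

    // Sketch only: 'example_type' and 'example_tensor' are hypothetical stand-ins
    // for ggml_type and ggml_tensor; the cast pattern mirrors the diff above.
    #include <cassert>
    #include <cstdint>

    enum example_type { EXAMPLE_TYPE_PLAIN = 0, EXAMPLE_TYPE_REPACKED = 1 };

    struct example_tensor {
        void * extra; // opaque per-tensor slot, as in ggml_tensor
    };

    static void example_init_tensor(example_tensor * t, example_type repack_type) {
        // store the enum value in the pointer-sized slot
        t->extra = (void *)(intptr_t) repack_type;
    }

    static example_type example_get_repack_type(const example_tensor * t) {
        // recover the enum when the tensor data arrives
        return (example_type)(intptr_t) t->extra;
    }

    int main() {
        example_tensor t = { nullptr };
        example_init_tensor(&t, EXAMPLE_TYPE_REPACKED);
        assert(example_get_repack_type(&t) == EXAMPLE_TYPE_REPACKED);
        return 0;
    }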
@@ -2298,33 +2306,37 @@ ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
         /* .get_alignment   = */ ggml_backend_cpu_buffer_type_get_alignment,
         /* .get_max_size    = */ NULL, // defaults to SIZE_MAX
         /* .get_alloc_size  = */ NULL, // defaults to ggml_nbytes
-        /* .is_host         = */ ggml_backend_cpu_buffer_type_is_host,
+        /* .is_host         = */ NULL,
         },
         /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
-        /* .context = */ NULL,
+        /* .context = */ NULL,
     };
 
     return &ggml_backend_cpu_buffer_type_aarch64;
 }
-#endif
+
+bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
+    return buft == ggml_backend_cpu_aarch64_buffer_type();
+}
 
 static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
-    static ggml_backend_buffer_type_t bufts[3];
-    int index = 0;
+    static std::vector<ggml_backend_buffer_type_t> bufts = []() {
+        std::vector<ggml_backend_buffer_type_t> bufts;
 
 #ifdef GGML_USE_CPU_HBM
-    bufts[index++] = ggml_backend_cpu_hbm_buffer_type();
+        bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
#endif
 
 #ifdef GGML_USE_CPU_AARCH64
-    if (ggml_cpu_has_neon() || ggml_cpu_has_matmul_int8() || ggml_cpu_has_sve()) {
-        bufts[index++] = ggml_backend_cpu_aarch64_buffer_type();
-    }
+        bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
 #endif
 
-    bufts[index] = NULL; // Terminate the list
+        bufts.push_back(NULL);
+
+        return bufts;
+    }();
 
-    return bufts;
+    return bufts.data();
 
     GGML_UNUSED(device);
 }
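The rewritten ggml_backend_cpu_get_extra_bufts above builds the list once in a lambda-initialized static std::vector and terminates it with NULL, so callers walk it until the sentinel. A hedged usage sketch (the caller function name is made up; ggml_backend_buft_name is the public ggml-backend API):

    // Hypothetical caller: prints every extra buffer type exposed by a backend.
    // Assumes ggml-backend.h is on the include path; only the loop shape matters here.
    #include "ggml-backend.h"
    #include <cstdio>

    static void example_print_extra_bufts(ggml_backend_buffer_type_t * bufts) {
        // 'bufts' is NULL-terminated, as built by the lambda above
        for (ggml_backend_buffer_type_t * it = bufts; it != NULL && *it != NULL; ++it) {
            printf("extra buft: %s\n", ggml_backend_buft_name(*it));
        }
    }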
@@ -2635,15 +2647,21 @@ static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_b
 }
 
 static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
-#ifdef GGML_USE_CPU_AARCH64
-    const struct ggml_tensor *tensor = op->src[0];
-    if (tensor && tensor->buffer && (strcmp(tensor->buffer->buft->iface.get_name(tensor->buffer->buft),"CPU_AARCH64") == 0)) {
-        if (op->op == GGML_OP_MUL_MAT && tensor->type == GGML_TYPE_Q4_0) {
-            return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_get_type_traits_cpu(tensor->type)->vec_dot_type;
+    const struct ggml_tensor * src0 = op->src[0];
+    const struct ggml_tensor * src1 = op->src[1];
+
+    if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
+        if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
+            return false;
         }
-        return false;
     }
-#endif
+
+    for (int i = 1; i < GGML_MAX_SRC; i++) {
+        if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
+            return false;
+        }
+    }
+
     switch (op->op) {
         case GGML_OP_CPY:
             return
@@ -2652,13 +2670,13 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
                 op->type != GGML_TYPE_IQ1_S &&
                 op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case GGML_OP_MUL_MAT:
-            return op->src[1]->type == GGML_TYPE_F32; // FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
+            return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
         case GGML_OP_ROPE_BACK:
            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
         case GGML_OP_IM2COL_BACK:
-            return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
+            return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
         case GGML_OP_OUT_PROD:
-            return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
+            return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
         default:
             return true;
     }
@@ -2667,7 +2685,7 @@ static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const st
 }
 
 static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
-    return ggml_backend_buft_is_host(buft);
+    return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
 
     GGML_UNUSED(dev);
 }
@@ -2721,7 +2739,7 @@ static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const ch
     if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
        return (void *)ggml_backend_cpu_set_n_threads;
     }
-    if (strcmp(name, "ggml_backend_cpu_get_extra_bufts") == 0) {
+    if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
        return (void *)ggml_backend_cpu_get_extra_bufts;
     }
 
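With the proc-address key renamed above, a frontend would now query the hook under "ggml_backend_dev_get_extra_bufts". A sketch of that lookup, assuming the public ggml_backend_reg_get_proc_address API; the typedef name is hypothetical, its signature copied from ggml_backend_cpu_get_extra_bufts in the diff:

    // Hypothetical lookup of the renamed extra-bufts hook.
    #include "ggml-backend.h"

    typedef ggml_backend_buffer_type_t * (*example_get_extra_bufts_fn)(ggml_backend_dev_t dev);

    static example_get_extra_bufts_fn example_lookup_extra_bufts(ggml_backend_reg_t reg) {
        // returns NULL if the backend does not expose the hook
        return (example_get_extra_bufts_fn) ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
    }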
@@ -2738,6 +2756,9 @@ static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
 };
 
 ggml_backend_reg_t ggml_backend_cpu_reg(void) {
+    // init CPU feature detection
+    ggml_cpu_init();
+
     static struct ggml_backend_reg ggml_backend_cpu_reg = {
         /* .iface   = */ ggml_backend_cpu_reg_i,
         /* .context = */ NULL,