@@ -2198,27 +2198,32 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             printf("size = %8.3f MB\n", tensor.size/1024.0/1024.0);
         } else {
             new_type = quantized_type;
-            if (tensor.name == "output.weight") new_type = GGML_TYPE_Q6_K;
-            else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            // TODO: temporary disabled until Metal / OpenCL support is available
+            // ref: https://github.com/ggerganov/llama.cpp/issues/1711
+            // if (tensor.name == "output.weight") {
+            //     new_type = GGML_TYPE_Q6_K;
+            // }
+            if (tensor.name.find("attention.wv.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8 ||
                         (i_attention_wv - n_attention_wv/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_attention_wv;
             }
-            else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+            if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
                         (i_feed_forward_w2 < n_feed_forward_w2/8 || i_feed_forward_w2 >= 7*n_feed_forward_w2/8 ||
                         (i_feed_forward_w2 - n_feed_forward_w2/8)%3 == 2)) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
             }
-            else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            if (tensor.name.find("attention.wo.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
+
             float * f32_data;
             size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
             llama_buffer f32_conv_buf;
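
For reference, a minimal standalone sketch (not part of the patch) of the per-layer selection rule that this change keeps: under Q4_K_M / Q5_K_M, the first eighth and last eighth of the attention.wv (and feed_forward.w2) tensors, plus every third layer in between, are promoted to Q6_K. The helper name wants_q6_k and the 32-layer count below are illustrative assumptions, not identifiers from the diff.

// sketch.cpp: mirrors the (i < n/8 || i >= 7*n/8 || (i - n/8) % 3 == 2) condition above
#include <cstdio>

// True when 0-based layer i out of n should get the higher-bit quant type.
static bool wants_q6_k(int i_layer, int n_layers) {
    return i_layer < n_layers/8 || i_layer >= 7*n_layers/8 ||
           (i_layer - n_layers/8) % 3 == 2;
}

int main() {
    const int n_layers = 32; // e.g. a 7B LLaMA model (assumed layer count)
    for (int i = 0; i < n_layers; ++i) {
        printf("attention.wv layer %2d -> %s\n", i,
               wants_q6_k(i, n_layers) ? "Q6_K" : "base k-quant");
    }
    return 0;
}

With n_layers = 32 this promotes layers 0-3, 28-31, and 6, 9, 12, ..., 27 to Q6_K, i.e. the layers the heuristic treats as most sensitive to quantization error.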