Commit 2acd1b6

ngxson authored and danbev committed
img pre processing
1 parent 863a4ca commit 2acd1b6

File tree

8 files changed: +568 -25 lines changed


convert_hf_to_gguf.py

Lines changed: 5 additions & 0 deletions
@@ -1651,8 +1651,13 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_vision_clip_feed_forward_length(self.vparams["intermediate_size"])
         self.gguf_writer.add_vision_clip_head_count(self.vparams["num_attention_heads"])
         # TODO: should not hardcode these, but they are currently missing from config.json
+        self.gguf_writer.add_vision_clip_projector_type(gguf.constants.CLIPProjectorType.MLP)
         self.gguf_writer.add_vision_clip_max_position_embeddings(577)
         self.gguf_writer.add_vision_clip_layer_norm_epsilon(1e-05)
+        default_image_mean = [0.48145466, 0.4578275, 0.40821073]
+        default_image_std = [0.26862954, 0.26130258, 0.27577711]
+        self.gguf_writer.add_vision_clip_image_mean(default_image_mean)
+        self.gguf_writer.add_vision_clip_image_std(default_image_std)

     @staticmethod
     def permute(weights: Tensor, n_head: int, n_head_kv: int | None):
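Not part of the diff, but for context: these defaults are the standard CLIP normalization constants, and 577 corresponds to 1 class token plus (336/14)^2 patches for a ViT-L/14 encoder at 336x336. A minimal sketch of how a preprocessor might apply the stored mean/std, assuming an RGB image already resized and scaled to [0, 1] (numpy only; the helper name is hypothetical):

import numpy as np

# Same values as written to the GGUF metadata above (standard CLIP constants).
IMAGE_MEAN = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
IMAGE_STD  = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)

def normalize_image(pixels: np.ndarray) -> np.ndarray:
    # pixels: H x W x 3, float32, values in [0, 1]
    return (pixels - IMAGE_MEAN) / IMAGE_STD

# Hypothetical usage with a dummy 336x336 RGB image:
img = np.random.rand(336, 336, 3).astype(np.float32)
print(normalize_image(img).shape)  # (336, 336, 3)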

gguf-py/gguf/constants.py

Lines changed: 5 additions & 0 deletions
@@ -229,6 +229,7 @@ class Clip:
             PROJECTION_DIM = "vision.clip.projection_dim"
             USE_GELU = "vision.clip.use_gelu"
             MAX_POS_EMBEDDING = "vision.clip.max_position_embeddings"
+            PROJECTOR_TYPE = "vision.clip.projector_type"
             HEAD_COUNT = "vision.clip.attention.head_count"
             LAYERNORM_EPS = "vision.clip.attention.layer_norm_epsilon"

@@ -1689,6 +1690,10 @@ class PoolingType(IntEnum):
     CLS = 2


+class CLIPProjectorType(Enum):
+    MLP = 'mlp'
+
+
 class GGMLQuantizationType(IntEnum):
     F32 = 0
     F16 = 1

gguf-py/gguf/gguf_writer.py

Lines changed: 10 additions & 0 deletions
@@ -27,6 +27,7 @@
     PoolingType,
     TokenType,
     ExpertGatingFuncType,
+    CLIPProjectorType,
 )

 from .quants import quant_shape_from_byte_shape

@@ -905,9 +906,18 @@ def add_vision_clip_head_count(self, value: int) -> None:
     def add_vision_clip_max_position_embeddings(self, value: int) -> None:
         self.add_uint32(Keys.Vision.Clip.MAX_POS_EMBEDDING, value)

+    def add_vision_clip_projector_type(self, value: CLIPProjectorType) -> None:
+        self.add_string(Keys.Vision.Clip.PROJECTOR_TYPE, value.value)
+
     def add_vision_clip_layer_norm_epsilon(self, value: float) -> None:
         self.add_float32(Keys.Vision.Clip.LAYERNORM_EPS, value)

+    def add_vision_clip_image_mean(self, value: Sequence[float]) -> None:
+        self.add_array(Keys.Vision.IMAGE_MEAN, value)
+
+    def add_vision_clip_image_std(self, value: Sequence[float]) -> None:
+        self.add_array(Keys.Vision.IMAGE_STD, value)
+
     def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
         if not isinstance(value, str):
             template_default = None

src/llama-arch.h

Lines changed: 1 addition & 0 deletions
@@ -215,6 +215,7 @@ enum llm_kv {
     LLM_KV_VISION_CLIP_PROJECTION_TYPE,
     LLM_KV_VISION_CLIP_PROJECTION_DIM,
     LLM_KV_VISION_CLIP_USE_GELU,
+    LLM_KV_VISION_CLIP_PROJECTOR_TYPE,
     LLM_KV_VISION_CLIP_HEAD_COUNT,
     LLM_KV_VISION_CLIP_MAX_POS_EMBD,
     LLM_KV_VISION_CLIP_LAYERNORM_EPS,

src/llama-model-loader.cpp

Lines changed: 1 addition & 0 deletions
@@ -412,6 +412,7 @@ namespace GGUFMeta {
 // TODO: this is not very clever - figure out something better
 template bool llama_model_loader::get_key_or_arr<std::array<int, 4>>(enum llm_kv kid, std::array<int, 4> & result, uint32_t n, bool required);
 template bool llama_model_loader::get_key_or_arr<std::array<uint32_t, 512>>(enum llm_kv kid, std::array<uint32_t, 512> & result, uint32_t n, bool required);
+template bool llama_model_loader::get_key_or_arr<std::array<float, 3>>(enum llm_kv kid, std::array<float, 3> & result, uint32_t n, bool required);

 llama_model_loader::llama_model_loader(const std::string & fname, bool use_mmap, bool check_tensors, const struct llama_model_kv_override * param_overrides_p) {
     int trace = 0;

src/llama-model.cpp

Lines changed: 33 additions & 24 deletions
@@ -501,30 +501,6 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     uint32_t n_vocab = 0;
     ml.get_key(LLM_KV_VOCAB_SIZE, n_vocab, false) || ml.get_arr_n(LLM_KV_TOKENIZER_LIST, n_vocab, false);

-    std::string vision_type;
-    ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
-    if (vision_type == "clip") {
-        hparams.has_vision = true;
-        ml.get_key(LLM_KV_VISION_IMAGE_SIZE, hparams.clip.image_size, true);
-        ml.get_key(LLM_KV_VISION_PATCH_SIZE, hparams.clip.patch_size, true);
-        ml.get_key(LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, hparams.clip.hidden_size, true);
-        ml.get_key(LLM_KV_VISION_CLIP_BLOCK_COUNT, hparams.clip.n_layer, true);
-        ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, hparams.clip.n_intermediate, true);
-        ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, hparams.clip.n_head, true);
-        ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, hparams.clip.eps, true);
-        // TODO: add image_std
-        std::string arch;
-        ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
-        for (auto & it : VISION_ARCH_NAMES) {
-            if (arch == it.second) {
-                hparams.clip.arch = it.first;
-                break;
-            }
-        }
-    } else if (!vision_type.empty()) {
-        throw std::runtime_error(format("unsupported vision type: %s", vision_type.c_str()));
-    }
-
     using e_model = llm_type; // TMP

     // arch-specific KVs

@@ -1262,6 +1238,39 @@ void llama_model::load_hparams(llama_model_loader & ml) {
         default: throw std::runtime_error("unsupported model architecture");
     }

+    // vision model
+    std::string vision_type;
+    ml.get_key(LLM_KV_VISION_TYPE, vision_type, false);
+    if (vision_type == "clip") {
+        hparams.has_vision = true;
+        std::string proj_type;
+        ml.get_key(LLM_KV_VISION_IMAGE_SIZE, hparams.clip.image_size, true);
+        ml.get_key(LLM_KV_VISION_PATCH_SIZE, hparams.clip.patch_size, true);
+        ml.get_key_or_arr(LLM_KV_VISION_IMAGE_MEAN, hparams.clip.image_mean, 3, true);
+        ml.get_key_or_arr(LLM_KV_VISION_IMAGE_STD, hparams.clip.image_std, 3, true);
+        ml.get_key(LLM_KV_VISION_CLIP_EMBEDDING_LENGTH, hparams.clip.hidden_size, true);
+        ml.get_key(LLM_KV_VISION_CLIP_BLOCK_COUNT, hparams.clip.n_layer, true);
+        ml.get_key(LLM_KV_VISION_CLIP_FEED_FORWARD_LENGTH, hparams.clip.n_intermediate, true);
+        ml.get_key(LLM_KV_VISION_CLIP_HEAD_COUNT, hparams.clip.n_head, true);
+        ml.get_key(LLM_KV_VISION_CLIP_LAYERNORM_EPS, hparams.clip.eps, true);
+        ml.get_key(LLM_KV_VISION_CLIP_PROJECTOR_TYPE, proj_type, true);
+        if (proj_type == "mlp") {
+            hparams.clip.proj_type = CLIP_PROJECTOR_TYPE_MLP;
+        } else {
+            throw std::runtime_error(format("unsupported clip projector type: %s", proj_type.c_str()));
+        }
+        std::string arch;
+        ml.get_key(LLM_KV_VISION_CLIP_ARCHITECTURE, arch, true);
+        for (auto & it : VISION_ARCH_NAMES) {
+            if (arch == it.second) {
+                hparams.clip.arch = it.first;
+                break;
+            }
+        }
+    } else if (!vision_type.empty()) {
+        throw std::runtime_error(format("unsupported vision type: %s", vision_type.c_str()));
+    }
+
     // arch-specific CLIP hparams
     switch (hparams.clip.arch) {
         case VISION_ARCH_LLAVA:
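For illustration only (also not part of the commit): since the converter and the C++ loader must agree on these metadata keys, one quick sanity check is to read a converted file back with gguf-py's GGUFReader and confirm that the keys the updated load_hparams() requires are present. The file path below is hypothetical and assumes a model converted on this branch.

from gguf import GGUFReader
from gguf.constants import Keys

reader = GGUFReader("llava-test.gguf")  # hypothetical converted model

# Keys the updated load_hparams() reads for CLIP vision models.
for key in (Keys.Vision.Clip.PROJECTOR_TYPE, Keys.Vision.IMAGE_MEAN, Keys.Vision.IMAGE_STD):
    field = reader.get_field(key)
    print(key, "present" if field is not None else "MISSING")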
