From 06e7340eceb7f96363453adea68850a6dbcde6b6 Mon Sep 17 00:00:00 2001
From: Wagner Bruna
Date: Thu, 14 Aug 2025 11:12:53 -0300
Subject: [PATCH 1/2] refactor: remove clip_skip persistent attribute

All handlers are constructed with the default clip_skip value, and it
is always overridden at inference time, so there is little point in
keeping it as a persistent attribute. Instead, just propagate the
parameter value down from get_learned_condition*.
---
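Reviewer note (not part of the commit message): a minimal standalone
sketch of the shape of this refactor, with hypothetical simplified types
(the real code paths are CLIPTextModel, CLIPTextModelRunner and the
Conditioner structs). The point is replacing mutate-then-run state with a
value threaded through the call:

  #include <cstdio>

  // Before: clip_skip was persistent, mutable state on the model,
  // set via set_clip_skip() before every run.
  struct ModelBefore {
      int clip_skip = -1;
      void set_clip_skip(int skip) { clip_skip = (skip <= 0) ? -1 : skip; }
      int forward() const { return clip_skip; }  // reads the stored value
  };

  // After: clip_skip is a defaulted parameter, passed down per call.
  struct ModelAfter {
      int forward(int clip_skip = -1) const { return clip_skip; }
  };

  int main() {
      ModelBefore before;
      before.set_clip_skip(2);                // two steps: mutate, then run
      std::printf("%d\n", before.forward());

      ModelAfter after;
      std::printf("%d\n", after.forward(2));  // one step: value rides the call
      return 0;
  }
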
 clip.hpp             | 37 +++++++++----------------
 conditioner.hpp      | 65 ++++++++++++++------------------------
 stable-diffusion.cpp |  1 -
 3 files changed, 34 insertions(+), 69 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index 7ca565d9..121583a2 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -674,12 +674,10 @@ class CLIPTextModel : public GGMLBlock {
     int32_t n_head = 12;
     int32_t n_layer = 12;           // num_hidden_layers
     int32_t projection_dim = 1280;  // only for OPEN_CLIP_VIT_BIGG_14
-    int32_t clip_skip = -1;
     bool with_final_ln = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  bool with_final_ln = true,
-                  int clip_skip_value = -1)
+                  bool with_final_ln = true)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -692,20 +690,12 @@ class CLIPTextModel : public GGMLBlock {
             n_head = 20;
             n_layer = 32;
         }
-        set_clip_skip(clip_skip_value);
 
         blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
 
-    void set_clip_skip(int skip) {
-        if (skip <= 0) {
-            skip = -1;
-        }
-        clip_skip = skip;
-    }
-
     struct ggml_tensor* get_token_embed_weight() {
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         return embeddings->get_token_embed_weight();
     }
@@ -715,7 +705,8 @@
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* tkn_embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         // input_ids: [N, n_token]
         auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]);
         auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]);
@@ -871,9 +862,8 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        bool with_final_ln = true,
-                        int clip_skip_value = -1)
-        : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
+                        bool with_final_ln = true)
+        : GGMLRunner(backend), model(version, with_final_ln) {
         model.init(params_ctx, tensor_types, prefix);
     }
@@ -881,10 +871,6 @@
         return "clip";
     }
 
-    void set_clip_skip(int clip_skip) {
-        model.set_clip_skip(clip_skip);
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
         model.get_param_tensors(tensors, prefix);
     }
@@ -893,7 +879,8 @@
                                 struct ggml_tensor* input_ids,
                                 struct ggml_tensor* embeddings,
                                 size_t max_token_idx = 0,
-                                bool return_pooled = false) {
+                                bool return_pooled = false,
+                                int clip_skip = -1) {
         size_t N = input_ids->ne[1];
         size_t n_token = input_ids->ne[0];
         if (input_ids->ne[0] > model.n_token) {
             input_ids = ggml_reshape_2d(ctx, input_ids, model.n_token, input_ids->ne[0] / model.n_token);
         }
-        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
     }
 
     struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids,
                                     int num_custom_embeddings = 0,
                                     void* custom_embeddings_data = NULL,
                                     size_t max_token_idx = 0,
-                                    bool return_pooled = false) {
+                                    bool return_pooled = false,
+                                    int clip_skip = -1) {
         struct ggml_cgraph* gf = ggml_new_graph(compute_ctx);
 
         input_ids = to_backend(input_ids);
@@ -927,7 +915,7 @@ struct CLIPTextModelRunner : public GGMLRunner {
             embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1);
         }
 
-        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled);
+        struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip);
 
         ggml_build_forward_expand(gf, hidden_states);
 
@@ -940,10 +928,11 @@
                  void* custom_embeddings_data,
                  size_t max_token_idx,
                  bool return_pooled,
+                 int clip_skip,
                  ggml_tensor** output,
                  ggml_context* output_ctx = NULL) {
         auto get_graph = [&]() -> struct ggml_cgraph* {
-            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled);
+            return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip);
         };
         GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx);
     }
diff --git a/conditioner.hpp b/conditioner.hpp
index 6a51dce8..8f3032e4 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -60,8 +60,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       const String2GGMLType& tensor_types,
                                       const std::string& embd_dir,
                                       SDVersion version = VERSION_SD1,
-                                      PMVersion pv = PM_VERSION_1,
-                                      int clip_skip = -1)
+                                      PMVersion pv = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (sd_version_is_sd1(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
@@ -71,20 +70,6 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
             text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         }
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 1;
-            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
-                clip_skip = 2;
-            }
-        }
-        text_model->set_clip_skip(clip_skip);
-        if (sd_version_is_sdxl(version)) {
-            text_model2->set_clip_skip(clip_skip);
-        }
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -411,7 +396,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                          int height,
                                          int adm_in_channels = -1,
                                          bool force_zero_embeddings = false) {
-        set_clip_skip(clip_skip);
+        if (clip_skip <= 0) {
+            clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 2 : 1;
+        }
+
         int64_t t0 = ggml_time_ms();
         struct ggml_tensor* hidden_states = NULL;        // [N, n_token, hidden_size]
         struct ggml_tensor* chunk_hidden_states = NULL;  // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2]
@@ -454,6 +442,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                  token_embed_custom.data(),
                                  max_token_idx,
                                  false,
+                                 clip_skip,
                                  &chunk_hidden_states1,
                                  work_ctx);
             if (sd_version_is_sdxl(version)) {
@@ -463,6 +452,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       token_embed_custom.data(),
                                       max_token_idx,
                                       false,
+                                      clip_skip,
                                       &chunk_hidden_states2, work_ctx);
                 // concat
                 chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0);
@@ -474,6 +464,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       token_embed_custom.data(),
                                       max_token_idx,
                                       true,
+                                      clip_skip,
                                       &pooled,
                                       work_ctx);
             }
@@ -663,21 +654,11 @@ struct SD3CLIPEmbedder : public Conditioner {
     std::shared_ptr<T5Runner> t5;
 
     SD3CLIPEmbedder(ggml_backend_t backend,
-                    const String2GGMLType& tensor_types = {},
-                    int clip_skip = -1)
+                    const String2GGMLType& tensor_types = {})
         : clip_g_tokenizer(0) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
         clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
-        clip_g->set_clip_skip(clip_skip);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -774,7 +755,9 @@ struct SD3CLIPEmbedder : public Conditioner {
                                          std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                                          int clip_skip,
                                          bool force_zero_embeddings = false) {
-        set_clip_skip(clip_skip);
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
         auto& clip_l_tokens = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& clip_g_tokens = token_and_weights[1].first;
@@ -812,6 +795,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                  NULL,
                                  max_token_idx,
                                  false,
+                                 clip_skip,
                                  &chunk_hidden_states_l,
                                  work_ctx);
             {
@@ -839,6 +823,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                  NULL,
                                  max_token_idx,
                                  true,
+                                 clip_skip,
                                  &pooled_l,
                                  work_ctx);
             }
@@ -860,6 +845,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                  NULL,
                                  max_token_idx,
                                  false,
+                                 clip_skip,
                                  &chunk_hidden_states_g,
                                  work_ctx);
 
@@ -888,6 +874,7 @@ struct SD3CLIPEmbedder : public Conditioner {
                                  NULL,
                                  max_token_idx,
                                  true,
+                                 clip_skip,
                                  &pooled_g,
                                  work_ctx);
             }
@@ -1010,18 +997,9 @@ struct FluxCLIPEmbedder : public Conditioner {
     size_t chunk_len = 256;
 
     FluxCLIPEmbedder(ggml_backend_t backend,
-                     const String2GGMLType& tensor_types = {},
-                     int clip_skip = -1) {
+                     const String2GGMLType& tensor_types = {}) {
         clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
-        set_clip_skip(clip_skip);
-    }
-
-    void set_clip_skip(int clip_skip) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l->set_clip_skip(clip_skip);
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
@@ -1102,7 +1080,9 @@ struct FluxCLIPEmbedder : public Conditioner {
                                          std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights,
                                          int clip_skip,
                                          bool force_zero_embeddings = false) {
-        set_clip_skip(clip_skip);
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
         auto& clip_l_tokens = token_and_weights[0].first;
         auto& clip_l_weights = token_and_weights[0].second;
         auto& t5_tokens = token_and_weights[1].first;
@@ -1136,6 +1116,7 @@ struct FluxCLIPEmbedder : public Conditioner {
                                  NULL,
                                  max_token_idx,
                                  true,
+                                 clip_skip,
                                  &pooled,
                                  work_ctx);
             }
@@ -1232,16 +1213,12 @@ struct PixArtCLIPEmbedder : public Conditioner {
 
     PixArtCLIPEmbedder(ggml_backend_t backend,
                        const String2GGMLType& tensor_types = {},
-                       int clip_skip = -1,
                        bool use_mask = false,
                        int mask_pad = 1)
         : use_mask(use_mask), mask_pad(mask_pad) {
         t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
     }
 
-    void set_clip_skip(int clip_skip) {
-    }
-
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer");
     }
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index c5448f92..7a896774 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -346,7 +346,6 @@ class StableDiffusionGGML {
             if (is_chroma) {
                 cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend,
                                                                         model_loader.tensor_storages_types,
-                                                                        -1,
                                                                         sd_ctx_params->chroma_use_t5_mask,
                                                                         sd_ctx_params->chroma_t5_mask_pad);
             } else {

From 42f2fa662c91beea9fd36a00aecedf58625d4e30 Mon Sep 17 00:00:00 2001
From: Wagner Bruna
Date: Thu, 14 Aug 2025 11:50:00 -0300
Subject: [PATCH 2/2] feat: reduce CLIP memory usage with no embeddings

The CLIP weights need to be converted to f32 to support textual
inversions (fbd42b6fc16d14fbd362993fa1d083740a05f113), but that
conversion increases the amount of allocated VRAM even when no
embeddings are being used. Only force the f32 conversion when an
embeddings directory is actually configured.
---
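Reviewer note (not part of the commit message): a rough standalone sketch
of the type-selection idea, with simplified, hypothetical names (the real
logic lives in CLIPEmbeddings::init_params and keys off tensor_types plus
the tensor name prefix):

  #include <cstdio>
  #include <map>
  #include <string>

  // Stand-in for ggml's tensor type enum.
  enum wtype { WTYPE_F32, WTYPE_F16, WTYPE_Q8_0 };

  wtype pick_token_embed_type(const std::map<std::string, wtype>& tensor_types,
                              const std::string& prefix,
                              bool force_clip_f32) {
      // Textual inversion concatenates custom embedding rows onto the
      // token embedding table, so the table must be f32 in that case.
      if (force_clip_f32) {
          return WTYPE_F32;
      }
      // Otherwise keep whatever type the checkpoint already stores,
      // avoiding an f32 copy of the whole table in VRAM.
      auto it = tensor_types.find(prefix + "token_embedding.weight");
      if (it != tensor_types.end()) {
          return it->second;
      }
      return WTYPE_F32;  // fallback when the tensor isn't listed
  }

  int main() {
      std::map<std::string, wtype> types = {{"te.token_embedding.weight", WTYPE_F16}};
      // No embeddings dir: keep the checkpoint's f16 table.
      std::printf("%d\n", pick_token_embed_type(types, "te.", false));
      // Embeddings dir set: force f32.
      std::printf("%d\n", pick_token_embed_type(types, "te.", true));
      return 0;
  }
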
 clip.hpp        | 22 ++++++++++++++++------
 conditioner.hpp |  9 +++++----
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index 121583a2..beecd88f 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -544,9 +544,15 @@ class CLIPEmbeddings : public GGMLBlock {
     int64_t embed_dim;
     int64_t vocab_size;
     int64_t num_positions;
+    bool force_clip_f32;
 
     void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") {
         enum ggml_type token_wtype = GGML_TYPE_F32;
+        if (!force_clip_f32) {
+            auto tensor_type = tensor_types.find(prefix + "token_embedding.weight");
+            if (tensor_type != tensor_types.end())
+                token_wtype = tensor_type->second;
+        }
         enum ggml_type position_wtype = GGML_TYPE_F32;
 
         params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size);
@@ -556,10 +562,12 @@ class CLIPEmbeddings : public GGMLBlock {
 public:
     CLIPEmbeddings(int64_t embed_dim,
                    int64_t vocab_size = 49408,
-                   int64_t num_positions = 77)
+                   int64_t num_positions = 77,
+                   bool force_clip_f32 = false)
         : embed_dim(embed_dim),
           vocab_size(vocab_size),
-          num_positions(num_positions) {
+          num_positions(num_positions),
+          force_clip_f32(force_clip_f32) {
     }
 
     struct ggml_tensor* get_token_embed_weight() {
@@ -677,7 +685,8 @@ class CLIPTextModel : public GGMLBlock {
     bool with_final_ln = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  bool with_final_ln = true)
+                  bool with_final_ln = true,
+                  bool force_clip_f32 = false)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -691,7 +700,7 @@ class CLIPTextModel : public GGMLBlock {
             n_layer = 32;
         }
 
-        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token));
+        blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32));
         blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size));
         blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size));
     }
@@ -862,8 +871,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         const String2GGMLType& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        bool with_final_ln = true)
-        : GGMLRunner(backend), model(version, with_final_ln) {
+                        bool with_final_ln = true,
+                        bool force_clip_f32 = false)
+        : GGMLRunner(backend), model(version, with_final_ln, force_clip_f32) {
         model.init(params_ctx, tensor_types, prefix);
     }
diff --git a/conditioner.hpp b/conditioner.hpp
index 8f3032e4..7c47670d 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -62,13 +62,14 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv = PM_VERSION_1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
+        bool force_clip_f32 = embd_dir.size() > 0;
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32);
         } else if (sd_version_is_sdxl(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32);
         }
     }
 