From 80013eafd775a8d7b437fff8f5d937017d4b028e Mon Sep 17 00:00:00 2001
From: Wagner Bruna
Date: Fri, 30 May 2025 16:20:46 -0300
Subject: [PATCH] fix: allow resetting clip_skip to its default value

CLIPTextModel currently ignores attempts to set clip_skip back to -1,
retaining the previously set value instead. While this is not an issue
for the sd command (which does not support changing clip_skip between
generations), it affects frontends that reuse model instances for
multiple images.

Since each model version's default clip_skip value is defined by its
respective Conditioner class, the default must be reapplied whenever a
non-positive clip_skip value is passed in, so move that logic from the
Conditioner constructors into their set_clip_skip methods.
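As an illustration, a frontend holding a Conditioner instance across
generations (sketch only; `cond` is a hypothetical handle, not code
from this repository) would expect:

    cond->set_clip_skip(2);   // override for one generation
    cond->set_clip_skip(-1);  // next generation: restore the version
                              // default (1 for SD1, 2 for
                              // SD2/SDXL/SD3/Flux)

Before this change the second call was a no-op, because
CLIPTextModel::set_clip_skip returned early for any value <= 0 and
kept the previously stored override.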
---
 clip.hpp        | 12 ++++++------
 conditioner.hpp | 43 +++++++++++++++++++++++--------------------
 2 files changed, 29 insertions(+), 26 deletions(-)

diff --git a/clip.hpp b/clip.hpp
index 2307ee3c5..d359f61cd 100644
--- a/clip.hpp
+++ b/clip.hpp
@@ -678,8 +678,8 @@ class CLIPTextModel : public GGMLBlock {
     bool with_final_ln = true;
 
     CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                  int clip_skip_value = -1,
-                  bool with_final_ln = true)
+                  bool with_final_ln = true,
+                  int clip_skip_value = -1)
         : version(version), with_final_ln(with_final_ln) {
         if (version == OPEN_CLIP_VIT_H_14) {
             hidden_size = 1024;
@@ -701,7 +701,7 @@ class CLIPTextModel : public GGMLBlock {
 
     void set_clip_skip(int skip) {
         if (skip <= 0) {
-            return;
+            skip = -1;
         }
         clip_skip = skip;
     }
@@ -871,9 +871,9 @@ struct CLIPTextModelRunner : public GGMLRunner {
                         std::map<std::string, enum ggml_type>& tensor_types,
                         const std::string prefix,
                         CLIPVersion version = OPENAI_CLIP_VIT_L_14,
-                        int clip_skip_value = 1,
-                        bool with_final_ln = true)
-        : GGMLRunner(backend), model(version, clip_skip_value, with_final_ln) {
+                        bool with_final_ln = true,
+                        int clip_skip_value = -1)
+        : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) {
         model.init(params_ctx, tensor_types, prefix);
     }
 
diff --git a/conditioner.hpp b/conditioner.hpp
index 6e9acdb19..ab87f18b9 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -63,23 +63,24 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       PMVersion pv = PM_VERSION_1,
                                       int clip_skip = -1)
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
-        if (clip_skip <= 0) {
-            clip_skip = 1;
-            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
-                clip_skip = 2;
-            }
-        }
         if (sd_version_is_sd1(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14);
         } else if (sd_version_is_sd2(version)) {
-            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
+            text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14);
         } else if (sd_version_is_sdxl(version)) {
-            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+            text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+            text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         }
+        set_clip_skip(clip_skip);
     }
 
     void set_clip_skip(int clip_skip) {
+        if (clip_skip <= 0) {
+            clip_skip = 1;
+            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
+                clip_skip = 2;
+            }
+        }
         text_model->set_clip_skip(clip_skip);
         if (sd_version_is_sdxl(version)) {
             text_model2->set_clip_skip(clip_skip);
@@ -665,15 +666,16 @@ struct SD3CLIPEmbedder : public Conditioner {
                     std::map<std::string, enum ggml_type>& tensor_types,
                     int clip_skip = -1)
         : clip_g_tokenizer(0) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
-        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false);
+        clip_g = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false);
         t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        set_clip_skip(clip_skip);
     }
 
     void set_clip_skip(int clip_skip) {
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
         clip_l->set_clip_skip(clip_skip);
         clip_g->set_clip_skip(clip_skip);
     }
@@ -1008,14 +1010,15 @@ struct FluxCLIPEmbedder : public Conditioner {
     FluxCLIPEmbedder(ggml_backend_t backend,
                      std::map<std::string, enum ggml_type>& tensor_types,
                      int clip_skip = -1) {
-        if (clip_skip <= 0) {
-            clip_skip = 2;
-        }
-        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, true);
+        clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true);
         t5     = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer");
+        set_clip_skip(clip_skip);
     }
 
     void set_clip_skip(int clip_skip) {
+        if (clip_skip <= 0) {
+            clip_skip = 2;
+        }
         clip_l->set_clip_skip(clip_skip);
     }
 
@@ -1218,4 +1221,4 @@ struct FluxCLIPEmbedder : public Conditioner {
     }
 };
 
-#endif
\ No newline at end of file
+#endif