diff --git a/clip.hpp b/clip.hpp index 7ca565d9..beecd88f 100644 --- a/clip.hpp +++ b/clip.hpp @@ -544,9 +544,15 @@ class CLIPEmbeddings : public GGMLBlock { int64_t embed_dim; int64_t vocab_size; int64_t num_positions; + bool force_clip_f32; void init_params(struct ggml_context* ctx, const String2GGMLType& tensor_types = {}, const std::string prefix = "") { enum ggml_type token_wtype = GGML_TYPE_F32; + if (!force_clip_f32) { + auto tensor_type = tensor_types.find(prefix + "token_embedding.weight"); + if (tensor_type != tensor_types.end()) + token_wtype = tensor_type->second; + } enum ggml_type position_wtype = GGML_TYPE_F32; params["token_embedding.weight"] = ggml_new_tensor_2d(ctx, token_wtype, embed_dim, vocab_size); @@ -556,10 +562,12 @@ class CLIPEmbeddings : public GGMLBlock { public: CLIPEmbeddings(int64_t embed_dim, int64_t vocab_size = 49408, - int64_t num_positions = 77) + int64_t num_positions = 77, + bool force_clip_f32 = false) : embed_dim(embed_dim), vocab_size(vocab_size), - num_positions(num_positions) { + num_positions(num_positions), + force_clip_f32(force_clip_f32) { } struct ggml_tensor* get_token_embed_weight() { @@ -674,12 +682,11 @@ class CLIPTextModel : public GGMLBlock { int32_t n_head = 12; int32_t n_layer = 12; // num_hidden_layers int32_t projection_dim = 1280; // only for OPEN_CLIP_VIT_BIGG_14 - int32_t clip_skip = -1; bool with_final_ln = true; CLIPTextModel(CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool with_final_ln = true, - int clip_skip_value = -1) + bool force_clip_f32 = false) : version(version), with_final_ln(with_final_ln) { if (version == OPEN_CLIP_VIT_H_14) { hidden_size = 1024; @@ -692,20 +699,12 @@ class CLIPTextModel : public GGMLBlock { n_head = 20; n_layer = 32; } - set_clip_skip(clip_skip_value); - blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token)); + blocks["embeddings"] = std::shared_ptr<GGMLBlock>(new CLIPEmbeddings(hidden_size, vocab_size, n_token, force_clip_f32)); 
blocks["encoder"] = std::shared_ptr<GGMLBlock>(new CLIPEncoder(n_layer, hidden_size, n_head, intermediate_size)); blocks["final_layer_norm"] = std::shared_ptr<GGMLBlock>(new LayerNorm(hidden_size)); } - void set_clip_skip(int skip) { - if (skip <= 0) { - skip = -1; - } - clip_skip = skip; - } - struct ggml_tensor* get_token_embed_weight() { auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]); return embeddings->get_token_embed_weight(); @@ -715,7 +714,8 @@ class CLIPTextModel : public GGMLBlock { struct ggml_tensor* input_ids, struct ggml_tensor* tkn_embeddings, size_t max_token_idx = 0, - bool return_pooled = false) { + bool return_pooled = false, + int clip_skip = -1) { // input_ids: [N, n_token] auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks["embeddings"]); auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks["encoder"]); @@ -872,8 +872,8 @@ struct CLIPTextModelRunner : public GGMLRunner { const std::string prefix, CLIPVersion version = OPENAI_CLIP_VIT_L_14, bool with_final_ln = true, - int clip_skip_value = -1) - : GGMLRunner(backend), model(version, with_final_ln, clip_skip_value) { + bool force_clip_f32 = false) + : GGMLRunner(backend), model(version, with_final_ln, force_clip_f32) { model.init(params_ctx, tensor_types, prefix); } @@ -881,10 +881,6 @@ struct CLIPTextModelRunner : public GGMLRunner { return "clip"; } - void set_clip_skip(int clip_skip) { - model.set_clip_skip(clip_skip); - } - void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) { model.get_param_tensors(tensors, prefix); } @@ -893,7 +889,8 @@ struct CLIPTextModelRunner : public GGMLRunner { struct ggml_tensor* input_ids, struct ggml_tensor* embeddings, size_t max_token_idx = 0, - bool return_pooled = false) { + bool return_pooled = false, + int clip_skip = -1) { size_t N = input_ids->ne[1]; size_t n_token = input_ids->ne[0]; if (input_ids->ne[0] > model.n_token) { @@ -901,14 +898,15 @@ struct CLIPTextModelRunner : public GGMLRunner { input_ids = ggml_reshape_2d(ctx, input_ids, 
model.n_token, input_ids->ne[0] / model.n_token); } - return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled); + return model.forward(ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip); } struct ggml_cgraph* build_graph(struct ggml_tensor* input_ids, int num_custom_embeddings = 0, void* custom_embeddings_data = NULL, size_t max_token_idx = 0, - bool return_pooled = false) { + bool return_pooled = false, + int clip_skip = -1) { struct ggml_cgraph* gf = ggml_new_graph(compute_ctx); input_ids = to_backend(input_ids); @@ -927,7 +925,7 @@ struct CLIPTextModelRunner : public GGMLRunner { embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 1); } - struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled); + struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, embeddings, max_token_idx, return_pooled, clip_skip); ggml_build_forward_expand(gf, hidden_states); @@ -940,10 +938,11 @@ struct CLIPTextModelRunner : public GGMLRunner { void* custom_embeddings_data, size_t max_token_idx, bool return_pooled, + int clip_skip, ggml_tensor** output, ggml_context* output_ctx = NULL) { auto get_graph = [&]() -> struct ggml_cgraph* { - return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled); + return build_graph(input_ids, num_custom_embeddings, custom_embeddings_data, max_token_idx, return_pooled, clip_skip); }; GGMLRunner::compute(get_graph, n_threads, true, output, output_ctx); } diff --git a/conditioner.hpp b/conditioner.hpp index 6a51dce8..7c47670d 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -60,30 +60,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { const String2GGMLType& tensor_types, const std::string& embd_dir, SDVersion version = VERSION_SD1, - PMVersion pv = PM_VERSION_1, - int clip_skip = -1) + PMVersion pv = PM_VERSION_1) : version(version), pm_version(pv), 
tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) { + bool force_clip_f32 = embd_dir.size() > 0; if (sd_version_is_sd1(version)) { - text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14); + text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, true, force_clip_f32); } else if (sd_version_is_sd2(version)) { - text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14); + text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, true, force_clip_f32); } else if (sd_version_is_sdxl(version)) { - text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); - text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); - } - set_clip_skip(clip_skip); - } - - void set_clip_skip(int clip_skip) { - if (clip_skip <= 0) { - clip_skip = 1; - if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) { - clip_skip = 2; - } - } - text_model->set_clip_skip(clip_skip); - if (sd_version_is_sdxl(version)) { - text_model2->set_clip_skip(clip_skip); + text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, false, force_clip_f32); + text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false, force_clip_f32); } } @@ -411,7 +397,10 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { int height, int adm_in_channels = -1, bool force_zero_embeddings = false) { - set_clip_skip(clip_skip); + if (clip_skip <= 0) { + clip_skip = (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) ? 
2 : 1; + } + int64_t t0 = ggml_time_ms(); struct ggml_tensor* hidden_states = NULL; // [N, n_token, hidden_size] struct ggml_tensor* chunk_hidden_states = NULL; // [n_token, hidden_size] or [n_token, hidden_size + hidden_size2] @@ -454,6 +443,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { token_embed_custom.data(), max_token_idx, false, + clip_skip, &chunk_hidden_states1, work_ctx); if (sd_version_is_sdxl(version)) { @@ -463,6 +453,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { token_embed_custom.data(), max_token_idx, false, + clip_skip, &chunk_hidden_states2, work_ctx); // concat chunk_hidden_states = ggml_tensor_concat(work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0); @@ -474,6 +465,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { token_embed_custom.data(), max_token_idx, true, + clip_skip, &pooled, work_ctx); } @@ -663,21 +655,11 @@ struct SD3CLIPEmbedder : public Conditioner { std::shared_ptr t5; SD3CLIPEmbedder(ggml_backend_t backend, - const String2GGMLType& tensor_types = {}, - int clip_skip = -1) + const String2GGMLType& tensor_types = {}) : clip_g_tokenizer(0) { clip_l = std::make_shared(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, false); clip_g = std::make_shared(backend, tensor_types, "text_encoders.clip_g.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, false); t5 = std::make_shared(backend, tensor_types, "text_encoders.t5xxl.transformer"); - set_clip_skip(clip_skip); - } - - void set_clip_skip(int clip_skip) { - if (clip_skip <= 0) { - clip_skip = 2; - } - clip_l->set_clip_skip(clip_skip); - clip_g->set_clip_skip(clip_skip); } void get_param_tensors(std::map& tensors) { @@ -774,7 +756,9 @@ struct SD3CLIPEmbedder : public Conditioner { std::vector, std::vector>> token_and_weights, int clip_skip, bool force_zero_embeddings = false) { - set_clip_skip(clip_skip); + if (clip_skip <= 0) { + clip_skip = 2; + } auto& clip_l_tokens = 
token_and_weights[0].first; auto& clip_l_weights = token_and_weights[0].second; auto& clip_g_tokens = token_and_weights[1].first; @@ -812,6 +796,7 @@ struct SD3CLIPEmbedder : public Conditioner { NULL, max_token_idx, false, + clip_skip, &chunk_hidden_states_l, work_ctx); { @@ -839,6 +824,7 @@ struct SD3CLIPEmbedder : public Conditioner { NULL, max_token_idx, true, + clip_skip, &pooled_l, work_ctx); } @@ -860,6 +846,7 @@ struct SD3CLIPEmbedder : public Conditioner { NULL, max_token_idx, false, + clip_skip, &chunk_hidden_states_g, work_ctx); @@ -888,6 +875,7 @@ struct SD3CLIPEmbedder : public Conditioner { NULL, max_token_idx, true, + clip_skip, &pooled_g, work_ctx); } @@ -1010,18 +998,9 @@ struct FluxCLIPEmbedder : public Conditioner { size_t chunk_len = 256; FluxCLIPEmbedder(ggml_backend_t backend, - const String2GGMLType& tensor_types = {}, - int clip_skip = -1) { + const String2GGMLType& tensor_types = {}) { clip_l = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "text_encoders.clip_l.transformer.text_model", OPENAI_CLIP_VIT_L_14, true); t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer"); - set_clip_skip(clip_skip); - } - - void set_clip_skip(int clip_skip) { - if (clip_skip <= 0) { - clip_skip = 2; - } - clip_l->set_clip_skip(clip_skip); } void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) { @@ -1102,7 +1081,9 @@ struct FluxCLIPEmbedder : public Conditioner { std::vector<std::pair<std::vector<int>, std::vector<float>>> token_and_weights, int clip_skip, bool force_zero_embeddings = false) { - set_clip_skip(clip_skip); + if (clip_skip <= 0) { + clip_skip = 2; + } auto& clip_l_tokens = token_and_weights[0].first; auto& clip_l_weights = token_and_weights[0].second; auto& t5_tokens = token_and_weights[1].first; @@ -1136,6 +1117,7 @@ struct FluxCLIPEmbedder : public Conditioner { NULL, max_token_idx, true, + clip_skip, &pooled, work_ctx); } @@ -1232,16 +1214,12 @@ struct PixArtCLIPEmbedder : public Conditioner { PixArtCLIPEmbedder(ggml_backend_t backend, const String2GGMLType& 
tensor_types = {}, - int clip_skip = -1, bool use_mask = false, int mask_pad = 1) : use_mask(use_mask), mask_pad(mask_pad) { t5 = std::make_shared<T5Runner>(backend, tensor_types, "text_encoders.t5xxl.transformer"); } - void set_clip_skip(int clip_skip) { - } - void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) { t5->get_param_tensors(tensors, "text_encoders.t5xxl.transformer"); } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index c5448f92..7a896774 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -346,7 +346,6 @@ class StableDiffusionGGML { if (is_chroma) { cond_stage_model = std::make_shared<PixArtCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types, - -1, sd_ctx_params->chroma_use_t5_mask, sd_ctx_params->chroma_t5_mask_pad); } else {