From 67919e6223bd582351a21f6e665bd85e3436f57e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 4 Dec 2024 02:33:49 +0100 Subject: [PATCH 01/19] Add support for loading SD1 inpaint models --- conditioner.hpp | 2 +- model.cpp | 8 +++++++- model.h | 1 + unet.hpp | 8 ++++++-- 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/conditioner.hpp b/conditioner.hpp index 5b3f20dd1..dab7e2c55 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -68,7 +68,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { clip_skip = 2; } } - if (version == VERSION_SD1) { + if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) { text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip); } else if (version == VERSION_SD2) { text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip); diff --git a/model.cpp b/model.cpp index c90918ad2..4dd6bc841 100644 --- a/model.cpp +++ b/model.cpp @@ -1458,7 +1458,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s } SDVersion ModelLoader::get_sd_version() { - TensorStorage token_embedding_weight; + TensorStorage token_embedding_weight, input_block_weight; for (auto& tensor_storage : tensor_storages) { if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) { return VERSION_FLUX; @@ -1485,9 +1485,15 @@ SDVersion ModelLoader::get_sd_version() { token_embedding_weight = tensor_storage; // break; } + if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight") { + input_block_weight = tensor_storage; + } } if (token_embedding_weight.ne[0] == 768) { + if(input_block_weight.ne[2]==9){ + return VERSION_SD1_INPAINT; + } return VERSION_SD1; } else if (token_embedding_weight.ne[0] == 1024) { return VERSION_SD2; diff --git a/model.h b/model.h index 29d46c192..06da04d9e 100644 --- a/model.h +++ b/model.h @@ -19,6 +19,7 @@ enum SDVersion { VERSION_SD1, + VERSION_SD1_INPAINT, VERSION_SD2, VERSION_SDXL, VERSION_SVD, diff --git a/unet.hpp b/unet.hpp index 2a7adb3d2..ae89ce037 100644 --- a/unet.hpp +++ b/unet.hpp @@ -166,6 +166,7 @@ class SpatialVideoTransformer : public SpatialTransformer { // ldm.modules.diffusionmodules.openaimodel.UNetModel class UnetModelBlock : public GGMLBlock { protected: + static std::map empty_tensor_types; SDVersion version = VERSION_SD1; // network hparams int in_channels = 4; @@ -183,7 +184,7 @@ class UnetModelBlock : public GGMLBlock { int model_channels = 320; int adm_in_channels = 2816; // only for VERSION_SDXL/SVD - UnetModelBlock(SDVersion version = VERSION_SD1, bool flash_attn = false) + UnetModelBlock(SDVersion version = VERSION_SD1, std::map& tensor_types = empty_tensor_types, bool flash_attn = false) : version(version) { if (version == VERSION_SD2) { context_dim = 1024; @@ -203,7 +204,10 @@ class UnetModelBlock : public GGMLBlock { adm_in_channels = 768; num_head_channels = 64; num_heads = -1; + } else if (version == VERSION_SD1_INPAINT) { + in_channels = 9; } + // dims is always 2 // use_temporal_attention is always True for SVD @@ -536,7 +540,7 @@ struct UNetModelRunner : public GGMLRunner { const std::string prefix, SDVersion version = VERSION_SD1, bool flash_attn = false) - : GGMLRunner(backend), unet(version, flash_attn) { + : GGMLRunner(backend), unet(version, tensor_types, flash_attn) { unet.init(params_ctx, tensor_types, prefix); } From 56746011f8634eee95c1485a895b4cc73456df15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 4 Dec 2024 02:56:47 +0100 Subject: [PATCH 02/19] inpaint: try things --- stable-diffusion.cpp | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 5abc29507..ba3e00dfc 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -26,6 +26,7 @@ const char* model_version_to_str[] = { "SD 1.x", + "SD 1.x Inpaint", "SD 2.x", "SDXL", "SVD", @@ -1359,8 +1360,18 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed); sd_ctx->sd->rng->manual_seed(cur_seed); - struct ggml_tensor* x_t = init_latent; - struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + struct ggml_tensor* x_t; + struct ggml_tensor* noise; + if (sd_ctx->sd->version == VERSION_SD1_INPAINT) { + struct ggml_tensor* mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, 1, 1); + struct ggml_tensor* masked_image = ggml_dup_tensor(work_ctx, init_latent); + + x_t = ggml_concat(work_ctx, ggml_concat(work_ctx, init_latent, masked_image, 2), mask, 2); + noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C * 2 + 1, 1); + } else { + x_t = init_latent; + noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); + } ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); int start_merge_step = -1; From 048b8f73317c04548aec66543fc9aefc0bede208 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 4 Dec 2024 21:27:10 +0100 Subject: [PATCH 03/19] Actually Support Inpaint models --- ggml_extend.hpp | 37 ++++++++++++++++++++++++- stable-diffusion.cpp | 65 +++++++++++++++++++++++++++++++++----------- 2 files changed, 85 insertions(+), 17 deletions(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 8afcd367c..3366aa6dc 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -290,6 +290,42 @@ __STATIC_INLINE__ void sd_image_to_tensor(const uint8_t* image_data, } } +__STATIC_INLINE__ void sd_mask_to_tensor(const uint8_t* image_data, + struct ggml_tensor* output, + bool scale = true) { + int64_t width = output->ne[0]; + int64_t height = output->ne[1]; + int64_t channels = output->ne[2]; + GGML_ASSERT(channels == 1 && output->type == GGML_TYPE_F32); + for (int iy = 0; iy < height; iy++) { + for (int ix = 0; ix < width; ix++) { + float value = *(image_data + iy * width * channels + ix); + if (scale) { + value /= 255.f; + } + ggml_tensor_set_f32(output, value, ix, iy); + } + } +} + +__STATIC_INLINE__ void sd_apply_mask(struct ggml_tensor* image_data, + struct ggml_tensor* mask, + struct ggml_tensor* output) { + int64_t width = output->ne[0]; + int64_t height = output->ne[1]; + int64_t channels = output->ne[2]; + GGML_ASSERT(output->type == GGML_TYPE_F32); + for (int ix = 0; ix < width; ix++) { + for (int iy = 0; iy < height; iy++) { + float m = ggml_tensor_get_f32(mask, ix, iy); + for (int k = 0; k < channels; k++) { + float value = (1 - m) * (ggml_tensor_get_f32(image_data, ix, iy, k) - .5) + .5; + ggml_tensor_set_f32(output, value, ix, iy, k); + } + } + } +} + __STATIC_INLINE__ void sd_mul_images_to_tensor(const uint8_t* image_data, struct ggml_tensor* output, int idx, @@ -1144,7 +1180,6 @@ struct GGMLRunner { } #endif ggml_backend_graph_compute(backend, gf); - #ifdef GGML_PERF ggml_graph_print(gf); #endif diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ba3e00dfc..ac322d40a 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1164,10 +1164,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, std::string input_id_images_path, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { + std::vector skip_layers = {}, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2, + ggml_tensor* masked_image = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1360,20 +1361,14 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed); sd_ctx->sd->rng->manual_seed(cur_seed); - struct ggml_tensor* x_t; - struct ggml_tensor* noise; - if (sd_ctx->sd->version == VERSION_SD1_INPAINT) { - struct ggml_tensor* mask = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, 1, 1); - struct ggml_tensor* masked_image = ggml_dup_tensor(work_ctx, init_latent); - - x_t = ggml_concat(work_ctx, ggml_concat(work_ctx, init_latent, masked_image, 2), mask, 2); - noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C * 2 + 1, 1); - } else { - x_t = init_latent; - noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); - } + struct ggml_tensor* x_t = init_latent; + struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); + if (sd_ctx->sd->version == VERSION_SD1_INPAINT) { + cond.c_concat = masked_image; + uncond.c_concat = masked_image; + } int start_merge_step = -1; if (sd_ctx->sd->stacked_id) { start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps); @@ -1614,7 +1609,19 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_ctx->sd->rng->manual_seed(seed); ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1); + + // sd_image_to_tensor(mask.data, mask_img); + for (int ix = 0; ix < width; ix++) { + for (int iy = 0; iy < height; iy++) { + ggml_tensor_set_f32(mask_img, (iy < height / 3 && ix > width / 4 && ix < 3 * width / 4) ? 1 : 0, ix, iy); + } + } + sd_image_to_tensor(init_image.data, init_img); + ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + sd_apply_mask(init_img, mask_img, masked_img); + ggml_tensor* init_latent = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img); @@ -1622,12 +1629,38 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } else { init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); } + + ggml_tensor* masked_image_0 = NULL; + if (!sd_ctx->sd->use_tiny_autoencoder) { + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); + masked_image_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + } else { + masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); + } + ggml_tensor* masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2] + 1, 1); + LOG_INFO("shape: [%d,%d,%d,%d]", masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2], masked_image_0->ne[3]); + LOG_INFO("shape: [%d,%d,%d,%d]", masked_image->ne[0], masked_image->ne[1], masked_image->ne[2], masked_image->ne[3]); + for (int ix = 0; ix < masked_image_0->ne[0]; ix++) { + for (int iy = 0; iy < masked_image_0->ne[1]; iy++) { + for (int k = 0; k < masked_image_0->ne[2]; k++) { + float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k); + ggml_tensor_set_f32(masked_image, v, ix, iy, k + 1); + } + int mx = ix * 8; + int my = iy * 8; + float m = ggml_tensor_get_f32(mask_img, mx, my); + ggml_tensor_set_f32(masked_image, m, ix, iy, 0); + } + } + print_ggml_tensor(init_latent, true); size_t t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); std::vector sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps); size_t t_enc = static_cast(sample_steps * strength); + if (t_enc == sample_steps) + t_enc--; LOG_INFO("target t_enc is %zu steps", t_enc); std::vector sigma_sched; sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end()); From f1e6b0d71524d797bb8f9f2d70066ef8d7fa495b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 4 Dec 2024 21:27:24 +0100 Subject: [PATCH 04/19] unet: Fix c_concat --- unet.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/unet.hpp b/unet.hpp index ae89ce037..ee90285ec 100644 --- a/unet.hpp +++ b/unet.hpp @@ -570,6 +570,7 @@ struct UNetModelRunner : public GGMLRunner { context = to_backend(context); y = to_backend(y); timesteps = to_backend(timesteps); + c_concat = to_backend(c_concat); for (int i = 0; i < controls.size(); i++) { controls[i] = to_backend(controls[i]); From 5a0391508ed2071734cd89f03fd3e02839a377db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 4 Dec 2024 22:12:38 +0100 Subject: [PATCH 05/19] support image mask input --- examples/cli/main.cpp | 23 +++++++++++++++++++++++ stable-diffusion.cpp | 8 ++------ stable-diffusion.h | 1 + 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 4b47286f4..5a48b3d61 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -85,6 +85,7 @@ struct SDParams { std::string lora_model_dir; std::string output_path = "output.png"; std::string input_path; + std::string mask_path; std::string control_image_path; std::string prompt; @@ -148,6 +149,7 @@ void print_params(SDParams params) { printf(" normalize input image : %s\n", params.normalize_input ? "true" : "false"); printf(" output_path: %s\n", params.output_path.c_str()); printf(" init_img: %s\n", params.input_path.c_str()); + printf(" mask_img: %s\n", params.mask_path.c_str()); printf(" control_image: %s\n", params.control_image_path.c_str()); printf(" clip on cpu: %s\n", params.clip_on_cpu ? "true" : "false"); printf(" controlnet cpu: %s\n", params.control_net_cpu ? "true" : "false"); @@ -384,6 +386,12 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.input_path = argv[i]; + } else if (arg == "--mask") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.mask_path = argv[i]; } else if (arg == "--control-image") { if (++i >= argc) { invalid_arg = true; @@ -803,6 +811,8 @@ int main(int argc, const char* argv[]) { bool vae_decode_only = true; uint8_t* input_image_buffer = NULL; uint8_t* control_image_buffer = NULL; + uint8_t* mask_image_buffer = NULL; + if (params.mode == IMG2IMG || params.mode == IMG2VID) { vae_decode_only = false; @@ -907,6 +917,18 @@ int main(int argc, const char* argv[]) { } } + if (params.mask_path != "") { + int c = 0; + mask_image_buffer = stbi_load(params.mask_path.c_str(), ¶ms.width, ¶ms.height, &c, 1); + } else { + std::vector arr(params.width * params.height, 255); + mask_image_buffer = arr.data(); + } + sd_image_t mask_image = {(uint32_t)params.width, + (uint32_t)params.height, + 1, + mask_image_buffer}; + sd_image_t* results; if (params.mode == TXT2IMG) { results = txt2img(sd_ctx, @@ -976,6 +998,7 @@ int main(int argc, const char* argv[]) { } else { results = img2img(sd_ctx, input_image, + mask_image, params.prompt.c_str(), params.negative_prompt.c_str(), params.clip_skip, diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ac322d40a..9e44bb1b0 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1549,6 +1549,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, + sd_image_t mask, const char* prompt_c_str, const char* negative_prompt_c_str, int clip_skip, @@ -1611,12 +1612,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1); - // sd_image_to_tensor(mask.data, mask_img); - for (int ix = 0; ix < width; ix++) { - for (int iy = 0; iy < height; iy++) { - ggml_tensor_set_f32(mask_img, (iy < height / 3 && ix > width / 4 && ix < 3 * width / 4) ? 1 : 0, ix, iy); - } - } + sd_mask_to_tensor(mask.data, mask_img); sd_image_to_tensor(init_image.data, init_img); ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); diff --git a/stable-diffusion.h b/stable-diffusion.h index c67bc8a32..5a758df66 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -174,6 +174,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, + sd_image_t mask_image, const char* prompt, const char* negative_prompt, int clip_skip, From a0400949f2240cbc270170a0a74927c95785019d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 4 Dec 2024 22:13:06 +0100 Subject: [PATCH 06/19] Do not crash in txt2img with Inpaint model --- stable-diffusion.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 9e44bb1b0..6d9e6f838 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1366,6 +1366,18 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); if (sd_ctx->sd->version == VERSION_SD1_INPAINT) { + if (masked_image == NULL) { + // no mask, set the whole image as masked + masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2] + 1, 1); + for (int64_t x = 0; x < masked_image->ne[0]; x++) { + for (int64_t y = 0; y < masked_image->ne[1]; y++) { + ggml_tensor_set_f32(masked_image, 1, x, y, 0); + for (int64_t c = 1; c < masked_image->ne[2]; c++) { + ggml_tensor_set_f32(masked_image, 0, x, y, c); + } + } + } + } cond.c_concat = masked_image; uncond.c_concat = masked_image; } @@ -1516,6 +1528,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, ggml_set_f32(init_latent, 0.f); } + if (sd_ctx->sd->version == VERSION_SD1_INPAINT) { + LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask"); + } + sd_image_t* result_images = generate_image(sd_ctx, work_ctx, init_latent, From 8ee3512c4a241299a064677dab511bd4670db527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 4 Dec 2024 23:09:16 +0100 Subject: [PATCH 07/19] Fix issue introduced by rebase --- stable-diffusion.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 6d9e6f838..66acdeec5 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1164,11 +1164,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, std::string input_id_images_path, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL) { + std::vector skip_layers = {}, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2, + ggml_tensor* masked_image = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1368,6 +1368,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, if (sd_ctx->sd->version == VERSION_SD1_INPAINT) { if (masked_image == NULL) { // no mask, set the whole image as masked + LOG_INFO("Missing image mask, using whole frame..."); masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2] + 1, 1); for (int64_t x = 0; x < masked_image->ne[0]; x++) { for (int64_t y = 0; y < masked_image->ne[1]; y++) { @@ -1699,7 +1700,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, skip_layers_vec, slg_scale, skip_layer_start, - skip_layer_end); + skip_layer_end, + masked_image); size_t t2 = ggml_time_ms(); From 10c16da0e1dc7b1c894a196a229c9390bbca78db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 5 Dec 2024 17:57:09 +0100 Subject: [PATCH 08/19] Add "inpaint" support for diffusion models in img2img via denoise mask --- stable-diffusion.cpp | 140 ++++++++++++++++++++++++++++--------------- 1 file changed, 92 insertions(+), 48 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 66acdeec5..08065e7f4 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -782,10 +782,22 @@ class StableDiffusionGGML { const std::vector& sigmas, int start_merge_step, SDCondition id_cond, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { + std::vector skip_layers = {}, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2, + ggml_tensor* noise_mask = nullptr) { + struct ggml_init_params params; + size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); + for (int i = 1; i < 4; i++) { + data_size *= init_latent->ne[i]; + } + data_size += 1024; + params.mem_size = data_size * 3; + params.mem_buffer = NULL; + params.no_alloc = false; + ggml_context* tmp_ctx = ggml_init(params); + size_t steps = sigmas.size() - 1; // noise = load_tensor_from_file(work_ctx, "./rand0.bin"); // print_ggml_tensor(noise); @@ -944,6 +956,19 @@ class StableDiffusionGGML { pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); } + if (noise_mask != nullptr) { + for (int64_t x = 0; x < denoised->ne[0]; x++) { + for (int64_t y = 0; y < denoised->ne[1]; y++) { + float mask = ggml_tensor_get_f32(noise_mask, x, y); + for (int64_t k = 0; k < denoised->ne[2]; k++) { + float init = ggml_tensor_get_f32(init_latent, x, y, k); + float den = ggml_tensor_get_f32(denoised, x, y, k); + ggml_tensor_set_f32(denoised, init + mask * (den - init), x, y, k); + } + } + } + } + return denoised; }; @@ -1355,6 +1380,25 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, int W = width / 8; int H = height / 8; LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); + ggml_tensor* noise_mask = nullptr; + if (sd_ctx->sd->version == VERSION_SD1_INPAINT) { + if (masked_image == NULL) { + // no mask, set the whole image as masked + masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2] + 1, 1); + for (int64_t x = 0; x < masked_image->ne[0]; x++) { + for (int64_t y = 0; y < masked_image->ne[1]; y++) { + ggml_tensor_set_f32(masked_image, 1, x, y, 0); + for (int64_t c = 1; c < masked_image->ne[2]; c++) { + ggml_tensor_set_f32(masked_image, 0, x, y, c); + } + } + } + } + cond.c_concat = masked_image; + uncond.c_concat = masked_image; + } else { + noise_mask = masked_image; + } for (int b = 0; b < batch_count; b++) { int64_t sampling_start = ggml_time_ms(); int64_t cur_seed = seed + b; @@ -1365,23 +1409,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1); ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng); - if (sd_ctx->sd->version == VERSION_SD1_INPAINT) { - if (masked_image == NULL) { - // no mask, set the whole image as masked - LOG_INFO("Missing image mask, using whole frame..."); - masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2] + 1, 1); - for (int64_t x = 0; x < masked_image->ne[0]; x++) { - for (int64_t y = 0; y < masked_image->ne[1]; y++) { - ggml_tensor_set_f32(masked_image, 1, x, y, 0); - for (int64_t c = 1; c < masked_image->ne[2]; c++) { - ggml_tensor_set_f32(masked_image, 0, x, y, c); - } - } - } - } - cond.c_concat = masked_image; - uncond.c_concat = masked_image; - } int start_merge_step = -1; if (sd_ctx->sd->stacked_id) { start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps); @@ -1407,7 +1434,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, skip_layers, slg_scale, skip_layer_start, - skip_layer_end); + skip_layer_end, + noise_mask); + // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); int64_t sampling_end = ggml_time_ms(); @@ -1606,7 +1635,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, if (sd_ctx->sd->stacked_id) { params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB } - params.mem_size += width * height * 3 * sizeof(float) * 2; + params.mem_size += width * height * 3 * sizeof(float) * 3; params.mem_size *= batch_count; params.mem_buffer = NULL; params.no_alloc = false; @@ -1632,8 +1661,46 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_mask_to_tensor(mask.data, mask_img); sd_image_to_tensor(init_image.data, init_img); - ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); - sd_apply_mask(init_img, mask_img, masked_img); + + ggml_tensor* masked_image; + + if (sd_ctx->sd->version == VERSION_SD1_INPAINT) { + ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); + sd_apply_mask(init_img, mask_img, masked_img); + ggml_tensor* masked_image_0 = NULL; + if (!sd_ctx->sd->use_tiny_autoencoder) { + ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); + masked_image_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); + } else { + masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); + } + masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2] + 1, 1); + LOG_INFO("shape: [%d,%d,%d,%d]", masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2], masked_image_0->ne[3]); + LOG_INFO("shape: [%d,%d,%d,%d]", masked_image->ne[0], masked_image->ne[1], masked_image->ne[2], masked_image->ne[3]); + for (int ix = 0; ix < masked_image_0->ne[0]; ix++) { + for (int iy = 0; iy < masked_image_0->ne[1]; iy++) { + for (int k = 0; k < masked_image_0->ne[2]; k++) { + float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k); + ggml_tensor_set_f32(masked_image, v, ix, iy, k + 1); + } + int mx = ix * 8; + int my = iy * 8; + float m = ggml_tensor_get_f32(mask_img, mx, my); + ggml_tensor_set_f32(masked_image, m, ix, iy, 0); + } + } + } else { + // LOG_WARN("Inpainting with a base model is not great"); + masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1); + for (int ix = 0; ix < masked_image->ne[0]; ix++) { + for (int iy = 0; iy < masked_image->ne[1]; iy++) { + int mx = ix * 8; + int my = iy * 8; + float m = ggml_tensor_get_f32(mask_img, mx, my); + ggml_tensor_set_f32(masked_image, m, ix, iy); + } + } + } ggml_tensor* init_latent = NULL; if (!sd_ctx->sd->use_tiny_autoencoder) { @@ -1643,29 +1710,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img); } - ggml_tensor* masked_image_0 = NULL; - if (!sd_ctx->sd->use_tiny_autoencoder) { - ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); - masked_image_0 = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments); - } else { - masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); - } - ggml_tensor* masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2] + 1, 1); - LOG_INFO("shape: [%d,%d,%d,%d]", masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2], masked_image_0->ne[3]); - LOG_INFO("shape: [%d,%d,%d,%d]", masked_image->ne[0], masked_image->ne[1], masked_image->ne[2], masked_image->ne[3]); - for (int ix = 0; ix < masked_image_0->ne[0]; ix++) { - for (int iy = 0; iy < masked_image_0->ne[1]; iy++) { - for (int k = 0; k < masked_image_0->ne[2]; k++) { - float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k); - ggml_tensor_set_f32(masked_image, v, ix, iy, k + 1); - } - int mx = ix * 8; - int my = iy * 8; - float m = ggml_tensor_get_f32(mask_img, mx, my); - ggml_tensor_set_f32(masked_image, m, ix, iy, 0); - } - } - print_ggml_tensor(init_latent, true); size_t t1 = ggml_time_ms(); LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000); From a40f4c0fb3b852e29b547e60564ec5cc90468a41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 5 Dec 2024 20:26:15 +0100 Subject: [PATCH 09/19] inpaint: fix halo on blurry masks --- ggml_extend.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index 3366aa6dc..5f1db9152 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -319,7 +319,7 @@ __STATIC_INLINE__ void sd_apply_mask(struct ggml_tensor* image_data, for (int iy = 0; iy < height; iy++) { float m = ggml_tensor_get_f32(mask, ix, iy); for (int k = 0; k < channels; k++) { - float value = (1 - m) * (ggml_tensor_get_f32(image_data, ix, iy, k) - .5) + .5; + float value = ((float)(m < 254.5/255)) * (ggml_tensor_get_f32(image_data, ix, iy, k) - .5) + .5; ggml_tensor_set_f32(output, value, ix, iy, k); } } From 148731da8e3f7985fe8a051c39278a46dec47848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 6 Dec 2024 02:43:31 +0100 Subject: [PATCH 10/19] sd2 inpaint support --- conditioner.hpp | 8 ++++---- control.hpp | 2 +- model.cpp | 7 +++++-- model.h | 23 +++++++++++++++++++++++ stable-diffusion.cpp | 15 ++++++++++----- unet.hpp | 5 +++-- 6 files changed, 46 insertions(+), 14 deletions(-) diff --git a/conditioner.hpp b/conditioner.hpp index dab7e2c55..cc515dc2f 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -61,16 +61,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { SDVersion version = VERSION_SD1, PMVersion pv = PM_VERSION_1, int clip_skip = -1) - : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir) { + : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) { if (clip_skip <= 0) { clip_skip = 1; - if (version == VERSION_SD2 || version == VERSION_SDXL) { + if (sd_version_is_sd2(version) || version == VERSION_SDXL) { clip_skip = 2; } } - if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) { + if (sd_version_is_sd1(version)) { text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip); - } else if (version == VERSION_SD2) { + } else if (sd_version_is_sd2(version)) { text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip); } else if (version == VERSION_SDXL) { text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false); diff --git a/control.hpp b/control.hpp index ed36db280..41c94bdad 100644 --- a/control.hpp +++ b/control.hpp @@ -34,7 +34,7 @@ class ControlNetBlock : public GGMLBlock { ControlNetBlock(SDVersion version = VERSION_SD1) : version(version) { - if (version == VERSION_SD2) { + if (sd_version_is_sd2(version)) { context_dim = 1024; num_head_channels = 64; num_heads = -1; diff --git a/model.cpp b/model.cpp index 4dd6bc841..f1c70bb38 100644 --- a/model.cpp +++ b/model.cpp @@ -1489,13 +1489,16 @@ SDVersion ModelLoader::get_sd_version() { input_block_weight = tensor_storage; } } - + bool is_inpaint = input_block_weight.ne[2] == 9; if (token_embedding_weight.ne[0] == 768) { - if(input_block_weight.ne[2]==9){ + if (is_inpaint) { return VERSION_SD1_INPAINT; } return VERSION_SD1; } else if (token_embedding_weight.ne[0] == 1024) { + if (is_inpaint) { + return VERSION_SD2_INPAINT; + } return VERSION_SD2; } return VERSION_COUNT; diff --git a/model.h b/model.h index 06da04d9e..dd8f05d9a 100644 --- a/model.h +++ b/model.h @@ -21,6 +21,7 @@ enum SDVersion { VERSION_SD1, VERSION_SD1_INPAINT, VERSION_SD2, + VERSION_SD2_INPAINT, VERSION_SDXL, VERSION_SVD, VERSION_SD3, @@ -42,6 +43,27 @@ static inline bool sd_version_is_sd3(SDVersion version) { return false; } +static inline bool sd_version_is_sd1(SDVersion version) { + if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) { + return true; + } + return false; +} + +static inline bool sd_version_is_sd2(SDVersion version) { + if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT) { + return true; + } + return false; +} + +static inline bool sd_version_is_inpaint(SDVersion version) { + if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT) { + return true; + } + return false; +} + static inline bool sd_version_is_dit(SDVersion version) { if (sd_version_is_flux(version) || sd_version_is_sd3(version)) { return true; @@ -49,6 +71,7 @@ static inline bool sd_version_is_dit(SDVersion version) { return false; } + enum PMVersion { PM_VERSION_1, PM_VERSION_2, diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 08065e7f4..ef987b948 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -28,6 +28,7 @@ const char* model_version_to_str[] = { "SD 1.x", "SD 1.x Inpaint", "SD 2.x", + "SD 2.x Inpaint", "SDXL", "SVD", "SD3.x", @@ -518,7 +519,7 @@ class StableDiffusionGGML { // check is_using_v_parameterization_for_sd2 bool is_using_v_parameterization = false; - if (version == VERSION_SD2) { + if (sd_version_is_sd2(version)) { if (is_using_v_parameterization_for_sd2(ctx)) { is_using_v_parameterization = true; } @@ -601,9 +602,13 @@ class StableDiffusionGGML { struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); ggml_set_f32(timesteps, 999); + + struct ggml_tensor* concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1); + ggml_set_f32(timesteps, 0); + int64_t t0 = ggml_time_ms(); struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); - diffusion_model->compute(n_threads, x_t, timesteps, c, NULL, NULL, NULL, -1, {}, 0.f, &out); + diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, &out); diffusion_model->free_compute_buffer(); double result = 0.f; @@ -1381,7 +1386,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, int H = height / 8; LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]); ggml_tensor* noise_mask = nullptr; - if (sd_ctx->sd->version == VERSION_SD1_INPAINT) { + if (sd_version_is_inpaint(sd_ctx->sd->version)) { if (masked_image == NULL) { // no mask, set the whole image as masked masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2] + 1, 1); @@ -1558,7 +1563,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, ggml_set_f32(init_latent, 0.f); } - if (sd_ctx->sd->version == VERSION_SD1_INPAINT) { + if (sd_version_is_inpaint(sd_ctx->sd->version)) { LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask"); } @@ -1664,7 +1669,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, ggml_tensor* masked_image; - if (sd_ctx->sd->version == VERSION_SD1_INPAINT) { + if (sd_version_is_inpaint(sd_ctx->sd->version)) { ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_apply_mask(init_img, mask_img, masked_img); ggml_tensor* masked_image_0 = NULL; diff --git a/unet.hpp b/unet.hpp index ee90285ec..22b1a3eb1 100644 --- a/unet.hpp +++ b/unet.hpp @@ -186,7 +186,7 @@ class UnetModelBlock : public GGMLBlock { UnetModelBlock(SDVersion version = VERSION_SD1, std::map& tensor_types = empty_tensor_types, bool flash_attn = false) : version(version) { - if (version == VERSION_SD2) { + if (sd_version_is_sd2(version)) { context_dim = 1024; num_head_channels = 64; num_heads = -1; @@ -204,7 +204,8 @@ class UnetModelBlock : public GGMLBlock { adm_in_channels = 768; num_head_channels = 64; num_heads = -1; - } else if (version == VERSION_SD1_INPAINT) { + } + if (sd_version_is_inpaint(version)) { in_channels = 9; } From 8085e4804a705528a7199904102a9c9d38f6fe00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 6 Dec 2024 03:07:04 +0100 Subject: [PATCH 11/19] sdxl inpaint support --- conditioner.hpp | 20 ++++++++++---------- control.hpp | 4 ++-- model.cpp | 13 +++++++++++-- model.h | 11 +++++++++-- stable-diffusion.cpp | 6 +++--- unet.hpp | 4 ++-- 6 files changed, 37 insertions(+), 21 deletions(-) diff --git a/conditioner.hpp b/conditioner.hpp index cc515dc2f..8d1ec31bc 100644 --- a/conditioner.hpp +++ b/conditioner.hpp @@ -64,7 +64,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) { if (clip_skip <= 0) { clip_skip = 1; - if (sd_version_is_sd2(version) || version == VERSION_SDXL) { + if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) { clip_skip = 2; } } @@ -72,7 +72,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip); } else if (sd_version_is_sd2(version)) { text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip); - } else if (version == VERSION_SDXL) { + } else if (sd_version_is_sdxl(version)) { text_model = std::make_shared(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false); text_model2 = std::make_shared(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false); } @@ -80,35 +80,35 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { void set_clip_skip(int clip_skip) { text_model->set_clip_skip(clip_skip); - if (version == VERSION_SDXL) { + if (sd_version_is_sdxl(version)) { text_model2->set_clip_skip(clip_skip); } } void get_param_tensors(std::map& tensors) { text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model"); - if (version == VERSION_SDXL) { + if (sd_version_is_sdxl(version)) { text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model"); } } void alloc_params_buffer() { text_model->alloc_params_buffer(); - if (version == VERSION_SDXL) { + if (sd_version_is_sdxl(version)) { text_model2->alloc_params_buffer(); } } void free_params_buffer() { text_model->free_params_buffer(); - if (version == VERSION_SDXL) { + if (sd_version_is_sdxl(version)) { text_model2->free_params_buffer(); } } size_t get_params_buffer_size() { size_t buffer_size = text_model->get_params_buffer_size(); - if (version == VERSION_SDXL) { + if (sd_version_is_sdxl(version)) { buffer_size += text_model2->get_params_buffer_size(); } return buffer_size; @@ -402,7 +402,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { auto input_ids = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens); struct ggml_tensor* input_ids2 = NULL; size_t max_token_idx = 0; - if (version == VERSION_SDXL) { + if (sd_version_is_sdxl(version)) { auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID); if (it != chunk_tokens.end()) { std::fill(std::next(it), chunk_tokens.end(), 0); @@ -427,7 +427,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { false, &chunk_hidden_states1, work_ctx); - if (version == VERSION_SDXL) { + if (sd_version_is_sdxl(version)) { text_model2->compute(n_threads, input_ids2, 0, @@ -486,7 +486,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner { ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]); ggml_tensor* vec = NULL; - if (version == VERSION_SDXL) { + if (sd_version_is_sdxl(version)) { int out_dim = 256; vec = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels); // [0:1280] diff --git a/control.hpp b/control.hpp index 41c94bdad..23b75feff 100644 --- a/control.hpp +++ b/control.hpp @@ -38,7 +38,7 @@ class ControlNetBlock : public GGMLBlock { context_dim = 1024; num_head_channels = 64; num_heads = -1; - } else if (version == VERSION_SDXL) { + } else if (sd_version_is_sdxl(version)) { context_dim = 2048; attention_resolutions = {4, 2}; channel_mult = {1, 2, 4}; @@ -58,7 +58,7 @@ class ControlNetBlock : public GGMLBlock { // time_embed_1 is nn.SiLU() blocks["time_embed.2"] = std::shared_ptr(new Linear(time_embed_dim, time_embed_dim)); - if (version == VERSION_SDXL || version == VERSION_SVD) { + if (sd_version_is_sdxl(version) || version == VERSION_SVD) { blocks["label_emb.0.0"] = std::shared_ptr(new Linear(adm_in_channels, time_embed_dim)); // label_emb_1 is nn.SiLU() blocks["label_emb.0.2"] = std::shared_ptr(new Linear(time_embed_dim, time_embed_dim)); diff --git a/model.cpp b/model.cpp index f1c70bb38..bd4f0188a 100644 --- a/model.cpp +++ b/model.cpp @@ -1459,6 +1459,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s SDVersion ModelLoader::get_sd_version() { TensorStorage token_embedding_weight, input_block_weight; + bool is_xl = false; for (auto& tensor_storage : tensor_storages) { if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) { return VERSION_FLUX; @@ -1467,10 +1468,10 @@ SDVersion ModelLoader::get_sd_version() { return VERSION_SD3; } if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) { - return VERSION_SDXL; + is_xl = true; } if (tensor_storage.name.find("cond_stage_model.1") != std::string::npos) { - return VERSION_SDXL; + is_xl = true; } if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) { return VERSION_SVD; @@ -1487,9 +1488,17 @@ SDVersion ModelLoader::get_sd_version() { } if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight") { input_block_weight = tensor_storage; + if (is_xl) + break; } } bool is_inpaint = input_block_weight.ne[2] == 9; + if (is_xl) { + if (is_inpaint) { + return VERSION_SDXL_INPAINT; + } + return VERSION_SDXL; + } if (token_embedding_weight.ne[0] == 768) { if (is_inpaint) { return VERSION_SD1_INPAINT; diff --git a/model.h b/model.h index dd8f05d9a..cfa60d992 100644 --- a/model.h +++ b/model.h @@ -23,6 +23,7 @@ enum SDVersion { VERSION_SD2, VERSION_SD2_INPAINT, VERSION_SDXL, + VERSION_SDXL_INPAINT, VERSION_SVD, VERSION_SD3, VERSION_FLUX, @@ -57,8 +58,15 @@ static inline bool sd_version_is_sd2(SDVersion version) { return false; } +static inline bool sd_version_is_sdxl(SDVersion version) { + if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT) { + return true; + } + return false; +} + static inline bool sd_version_is_inpaint(SDVersion version) { - if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT) { + if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT) { return true; } return false; @@ -71,7 +79,6 @@ static inline bool sd_version_is_dit(SDVersion version) { return false; } - enum PMVersion { PM_VERSION_1, PM_VERSION_2, diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ef987b948..f691b9d47 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -265,7 +265,7 @@ class StableDiffusionGGML { model_loader.set_wtype_override(wtype); } - if (version == VERSION_SDXL) { + if (sd_version_is_sdxl(version)) { vae_wtype = GGML_TYPE_F32; model_loader.set_wtype_override(GGML_TYPE_F32, "vae."); } @@ -277,7 +277,7 @@ class StableDiffusionGGML { LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor)); - if (version == VERSION_SDXL) { + if (sd_version_is_sdxl(version)) { scale_factor = 0.13025f; if (vae_path.size() == 0 && taesd_path.size() == 0) { LOG_WARN( @@ -1348,7 +1348,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, SDCondition uncond; if (cfg_scale != 1.0) { bool force_zero_embeddings = false; - if (sd_ctx->sd->version == VERSION_SDXL && negative_prompt.size() == 0) { + if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0) { force_zero_embeddings = true; } uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx, diff --git a/unet.hpp b/unet.hpp index 22b1a3eb1..31b7fe986 100644 --- a/unet.hpp +++ b/unet.hpp @@ -190,7 +190,7 @@ class UnetModelBlock : public GGMLBlock { context_dim = 1024; num_head_channels = 64; num_heads = -1; - } else if (version == VERSION_SDXL) { + } else if (sd_version_is_sdxl(version)) { context_dim = 2048; attention_resolutions = {4, 2}; channel_mult = {1, 2, 4}; @@ -216,7 +216,7 @@ class UnetModelBlock : public GGMLBlock { // time_embed_1 is nn.SiLU() blocks["time_embed.2"] = std::shared_ptr(new Linear(time_embed_dim, time_embed_dim)); - if (version == VERSION_SDXL || version == VERSION_SVD) { + if (sd_version_is_sdxl(version) || version == VERSION_SVD) { blocks["label_emb.0.0"] = std::shared_ptr(new Linear(adm_in_channels, time_embed_dim)); // label_emb_1 is nn.SiLU() blocks["label_emb.0.2"] = std::shared_ptr(new Linear(time_embed_dim, time_embed_dim)); From a9e7ee6634b9178508936532fc8951579d427cca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 6 Dec 2024 03:07:16 +0100 Subject: [PATCH 12/19] remove some logging --- stable-diffusion.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index f691b9d47..d538501fd 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1680,8 +1680,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); } masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2] + 1, 1); - LOG_INFO("shape: [%d,%d,%d,%d]", masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2], masked_image_0->ne[3]); - LOG_INFO("shape: [%d,%d,%d,%d]", masked_image->ne[0], masked_image->ne[1], masked_image->ne[2], masked_image->ne[3]); for (int ix = 0; ix < masked_image_0->ne[0]; ix++) { for (int iy = 0; iy < masked_image_0->ne[1]; iy++) { for (int k = 0; k < masked_image_0->ne[2]; k++) { From ed41e759d5863986eb214e4e3e7fbd73ed853078 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 6 Dec 2024 03:28:55 +0100 Subject: [PATCH 13/19] Fix non inpaint sd2 --- stable-diffusion.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index d538501fd..9b4f6e15a 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -520,7 +520,7 @@ class StableDiffusionGGML { // check is_using_v_parameterization_for_sd2 bool is_using_v_parameterization = false; if (sd_version_is_sd2(version)) { - if (is_using_v_parameterization_for_sd2(ctx)) { + if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) { is_using_v_parameterization = true; } } else if (version == VERSION_SVD) { @@ -594,7 +594,7 @@ class StableDiffusionGGML { return true; } - bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx) { + bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false) { struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1); ggml_set_f32(x_t, 0.5); struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1); @@ -603,7 +603,7 @@ class StableDiffusionGGML { struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1); ggml_set_f32(timesteps, 999); - struct ggml_tensor* concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1); + struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : NULL; ggml_set_f32(timesteps, 0); int64_t t0 = ggml_time_ms(); @@ -787,11 +787,11 @@ class StableDiffusionGGML { const std::vector& sigmas, int start_merge_step, SDCondition id_cond, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr) { + std::vector skip_layers = {}, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2, + ggml_tensor* noise_mask = nullptr) { struct ggml_init_params params; size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); for (int i = 1; i < 4; i++) { From d741e2d924b16faa9874a1ed125d353e6d8e56cc Mon Sep 17 00:00:00 2001 From: stduhpf Date: Fri, 6 Dec 2024 18:52:43 +0100 Subject: [PATCH 14/19] Fix model_version _str --- stable-diffusion.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 9b4f6e15a..b99c11daf 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -30,6 +30,7 @@ const char* model_version_to_str[] = { "SD 2.x", "SD 2.x Inpaint", "SDXL", + "SDXL Inpaint", "SVD", "SD3.x", "Flux"}; From 26fab5a4e392fa9a39730fd6583bfbc832fb0802 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 6 Dec 2024 18:35:37 +0100 Subject: [PATCH 15/19] Detect Flux fill models --- model.cpp | 58 +++++++++++++++++++++++++++++--------------- model.h | 5 ++-- stable-diffusion.cpp | 3 ++- 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/model.cpp b/model.cpp index bd4f0188a..5985acb2e 100644 --- a/model.cpp +++ b/model.cpp @@ -1459,24 +1459,33 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s SDVersion ModelLoader::get_sd_version() { TensorStorage token_embedding_weight, input_block_weight; - bool is_xl = false; + bool input_block_checked = false; + + bool is_xl = false; + bool is_flux = false; + +#define found_family (is_xl || is_flux) for (auto& tensor_storage : tensor_storages) { - if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) { - return VERSION_FLUX; - } - if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) { - return VERSION_SD3; - } - if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) { - is_xl = true; - } - if (tensor_storage.name.find("cond_stage_model.1") != std::string::npos) { - is_xl = true; - } - if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) { - return VERSION_SVD; + if (!found_family) { + if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) { + is_flux = true; + if (input_block_checked) { + break; + } + } + if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) { + return VERSION_SD3; + } + if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos) { + is_xl = true; + if (input_block_checked) { + break; + } + } + if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) { + return VERSION_SVD; + } } - if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" || tensor_storage.name == "cond_stage_model.model.token_embedding.weight" || tensor_storage.name == "text_model.embeddings.token_embedding.weight" || @@ -1486,10 +1495,12 @@ SDVersion ModelLoader::get_sd_version() { token_embedding_weight = tensor_storage; // break; } - if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight") { - input_block_weight = tensor_storage; - if (is_xl) + if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight" || tensor_storage.name == "model.diffusion_model.img_in.weight") { + input_block_weight = tensor_storage; + input_block_checked = true; + if (found_family) { break; + } } } bool is_inpaint = input_block_weight.ne[2] == 9; @@ -1499,6 +1510,15 @@ SDVersion ModelLoader::get_sd_version() { } return VERSION_SDXL; } + + if (is_flux) { + is_inpaint = input_block_weight.ne[0] == 384; + if (is_inpaint) { + return VERSION_FLUX_INPAINT; + } + return VERSION_FLUX; + } + if (token_embedding_weight.ne[0] == 768) { if (is_inpaint) { return VERSION_SD1_INPAINT; diff --git a/model.h b/model.h index cfa60d992..69136431e 100644 --- a/model.h +++ b/model.h @@ -27,11 +27,12 @@ enum SDVersion { VERSION_SVD, VERSION_SD3, VERSION_FLUX, + VERSION_FLUX_INPAINT, VERSION_COUNT, }; static inline bool sd_version_is_flux(SDVersion version) { - if (version == VERSION_FLUX) { + if (version == VERSION_FLUX || version == VERSION_FLUX_INPAINT) { return true; } return false; @@ -66,7 +67,7 @@ static inline bool sd_version_is_sdxl(SDVersion version) { } static inline bool sd_version_is_inpaint(SDVersion version) { - if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT) { + if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_INPAINT) { return true; } return false; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index b99c11daf..1a3f57072 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -33,7 +33,8 @@ const char* model_version_to_str[] = { "SDXL Inpaint", "SVD", "SD3.x", - "Flux"}; + "Flux", + "Flux Fill"}; const char* sampling_methods_str[] = { "Euler A", From 29b6fd8cf9fd2c83c29454d31e167b6bb75785fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 6 Dec 2024 19:01:48 +0100 Subject: [PATCH 16/19] Flux fill load --- diffusion_model.hpp | 5 +++-- flux.hpp | 8 ++++++-- stable-diffusion.cpp | 6 +++++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/diffusion_model.hpp b/diffusion_model.hpp index cbc0cd4c1..c44d147ba 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -133,8 +133,9 @@ struct FluxModel : public DiffusionModel { FluxModel(ggml_backend_t backend, std::map& tensor_types, - bool flash_attn = false) - : flux(backend, tensor_types, "model.diffusion_model", flash_attn) { + SDVersion version = VERSION_FLUX, + bool flash_attn = false) + : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn) { } void alloc_params_buffer() { diff --git a/flux.hpp b/flux.hpp index fdd00ebcb..498ecdbc7 100644 --- a/flux.hpp +++ b/flux.hpp @@ -490,6 +490,7 @@ namespace Flux { struct FluxParams { int64_t in_channels = 64; + int64_t out_channels = 64; int64_t vec_in_dim = 768; int64_t context_in_dim = 4096; int64_t hidden_size = 3072; @@ -642,7 +643,6 @@ namespace Flux { Flux() {} Flux(FluxParams params) : params(params) { - int64_t out_channels = params.in_channels; int64_t pe_dim = params.hidden_size / params.num_heads; blocks["img_in"] = std::shared_ptr(new Linear(params.in_channels, params.hidden_size, true)); @@ -669,7 +669,7 @@ namespace Flux { params.flash_attn)); } - blocks["final_layer"] = std::shared_ptr(new LastLayer(params.hidden_size, 1, out_channels)); + blocks["final_layer"] = std::shared_ptr(new LastLayer(params.hidden_size, 1, params.out_channels)); } struct ggml_tensor* patchify(struct ggml_context* ctx, @@ -834,12 +834,16 @@ namespace Flux { FluxRunner(ggml_backend_t backend, std::map& tensor_types = empty_tensor_types, const std::string prefix = "", + SDVersion version = VERSION_FLUX, bool flash_attn = false) : GGMLRunner(backend) { flux_params.flash_attn = flash_attn; flux_params.guidance_embed = false; flux_params.depth = 0; flux_params.depth_single_blocks = 0; + if (version == VERSION_FLUX_INPAINT) { + flux_params.in_channels = 384; + } for (auto pair : tensor_types) { std::string tensor_name = pair.first; if (tensor_name.find("model.diffusion_model.") == std::string::npos) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 1a3f57072..31751eb51 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -333,7 +333,11 @@ class StableDiffusionGGML { diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types); } else if (sd_version_is_flux(version)) { cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types); - diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types, diffusion_flash_attn); + diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn); + } else if (version == VERSION_LTXV) { + // TODO: cond for T5 only + cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types); + diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types, diffusion_flash_attn); } else { if (id_embeddings_path.find("v2") != std::string::npos) { cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2); From 0683c03120a711296a17da3d5b28d604553764da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 6 Dec 2024 20:48:39 +0100 Subject: [PATCH 17/19] Flux Fill working!! --- diffusion_model.hpp | 2 +- flux.hpp | 34 +++++++++++++++++++----- model.cpp | 2 +- model.h | 6 ++--- stable-diffusion.cpp | 63 +++++++++++++++++++++++++++++++++----------- 5 files changed, 80 insertions(+), 27 deletions(-) diff --git a/diffusion_model.hpp b/diffusion_model.hpp index c44d147ba..ee4d88f0c 100644 --- a/diffusion_model.hpp +++ b/diffusion_model.hpp @@ -175,7 +175,7 @@ struct FluxModel : public DiffusionModel { struct ggml_tensor** output = NULL, struct ggml_context* output_ctx = NULL, std::vector skip_layers = std::vector()) { - return flux.compute(n_threads, x, timesteps, context, y, guidance, output, output_ctx, skip_layers); + return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, output, output_ctx, skip_layers); } }; diff --git a/flux.hpp b/flux.hpp index 498ecdbc7..20ff41096 100644 --- a/flux.hpp +++ b/flux.hpp @@ -643,7 +643,7 @@ namespace Flux { Flux() {} Flux(FluxParams params) : params(params) { - int64_t pe_dim = params.hidden_size / params.num_heads; + int64_t pe_dim = params.hidden_size / params.num_heads; blocks["img_in"] = std::shared_ptr(new Linear(params.in_channels, params.hidden_size, true)); blocks["time_in"] = std::shared_ptr(new MLPEmbedder(256, params.hidden_size)); @@ -789,6 +789,7 @@ namespace Flux { struct ggml_tensor* x, struct ggml_tensor* timestep, struct ggml_tensor* context, + struct ggml_tensor* c_concat, struct ggml_tensor* y, struct ggml_tensor* guidance, struct ggml_tensor* pe, @@ -797,6 +798,7 @@ namespace Flux { // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images) // timestep: (N,) tensor of diffusion timesteps // context: (N, L, D) + // c_concat: NULL, or for (N,C+M, H, W) for Fill // y: (N, adm_in_channels) tensor of class labels // guidance: (N,) // pe: (L, d_head/2, 2, 2) @@ -806,6 +808,7 @@ namespace Flux { int64_t W = x->ne[0]; int64_t H = x->ne[1]; + int64_t C = x->ne[2]; int64_t patch_size = 2; int pad_h = (patch_size - H % patch_size) % patch_size; int pad_w = (patch_size - W % patch_size) % patch_size; @@ -814,6 +817,19 @@ namespace Flux { // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size) auto img = patchify(ctx, x, patch_size); // [N, h*w, C * patch_size * patch_size] + if (c_concat != NULL) { + ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0); + ggml_tensor* mask = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C); + + masked = ggml_pad(ctx, masked, pad_w, pad_h, 0, 0); + mask = ggml_pad(ctx, mask, pad_w, pad_h, 0, 0); + + masked = patchify(ctx, masked, patch_size); + mask = patchify(ctx, mask, patch_size); + + img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0); + } + auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, skip_layers); // [N, h*w, C * patch_size * patch_size] // rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2) @@ -841,7 +857,7 @@ namespace Flux { flux_params.guidance_embed = false; flux_params.depth = 0; flux_params.depth_single_blocks = 0; - if (version == VERSION_FLUX_INPAINT) { + if (version == VERSION_FLUX_FILL) { flux_params.in_channels = 384; } for (auto pair : tensor_types) { @@ -890,14 +906,18 @@ namespace Flux { struct ggml_cgraph* build_graph(struct ggml_tensor* x, struct ggml_tensor* timesteps, struct ggml_tensor* context, + struct ggml_tensor* c_concat, struct ggml_tensor* y, struct ggml_tensor* guidance, std::vector skip_layers = std::vector()) { GGML_ASSERT(x->ne[3] == 1); struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false); - x = to_backend(x); - context = to_backend(context); + x = to_backend(x); + context = to_backend(context); + if (c_concat != NULL) { + c_concat = to_backend(c_concat); + } y = to_backend(y); timesteps = to_backend(timesteps); if (flux_params.guidance_embed) { @@ -917,6 +937,7 @@ namespace Flux { x, timesteps, context, + c_concat, y, guidance, pe, @@ -931,6 +952,7 @@ namespace Flux { struct ggml_tensor* x, struct ggml_tensor* timesteps, struct ggml_tensor* context, + struct ggml_tensor* c_concat, struct ggml_tensor* y, struct ggml_tensor* guidance, struct ggml_tensor** output = NULL, @@ -942,7 +964,7 @@ namespace Flux { // y: [N, adm_in_channels] or [1, adm_in_channels] // guidance: [N, ] auto get_graph = [&]() -> struct ggml_cgraph* { - return build_graph(x, timesteps, context, y, guidance, skip_layers); + return build_graph(x, timesteps, context, c_concat, y, guidance, skip_layers); }; GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx); @@ -982,7 +1004,7 @@ namespace Flux { struct ggml_tensor* out = NULL; int t0 = ggml_time_ms(); - compute(8, x, timesteps, context, y, guidance, &out, work_ctx); + compute(8, x, timesteps, context, NULL, y, guidance, &out, work_ctx); int t1 = ggml_time_ms(); print_ggml_tensor(out); diff --git a/model.cpp b/model.cpp index 5985acb2e..767a8b822 100644 --- a/model.cpp +++ b/model.cpp @@ -1514,7 +1514,7 @@ SDVersion ModelLoader::get_sd_version() { if (is_flux) { is_inpaint = input_block_weight.ne[0] == 384; if (is_inpaint) { - return VERSION_FLUX_INPAINT; + return VERSION_FLUX_FILL; } return VERSION_FLUX; } diff --git a/model.h b/model.h index 69136431e..95bbf1da2 100644 --- a/model.h +++ b/model.h @@ -27,12 +27,12 @@ enum SDVersion { VERSION_SVD, VERSION_SD3, VERSION_FLUX, - VERSION_FLUX_INPAINT, + VERSION_FLUX_FILL, VERSION_COUNT, }; static inline bool sd_version_is_flux(SDVersion version) { - if (version == VERSION_FLUX || version == VERSION_FLUX_INPAINT) { + if (version == VERSION_FLUX || version == VERSION_FLUX_FILL) { return true; } return false; @@ -67,7 +67,7 @@ static inline bool sd_version_is_sdxl(SDVersion version) { } static inline bool sd_version_is_inpaint(SDVersion version) { - if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_INPAINT) { + if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL) { return true; } return false; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 31751eb51..26772f85e 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -334,10 +334,6 @@ class StableDiffusionGGML { } else if (sd_version_is_flux(version)) { cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types); diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn); - } else if (version == VERSION_LTXV) { - // TODO: cond for T5 only - cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types); - diffusion_model = std::make_shared(backend, model_loader.tensor_storages_types, diffusion_flash_attn); } else { if (id_embeddings_path.find("v2") != std::string::npos) { cond_stage_model = std::make_shared(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2); @@ -798,6 +794,7 @@ class StableDiffusionGGML { float skip_layer_start = 0.01, float skip_layer_end = 0.2, ggml_tensor* noise_mask = nullptr) { + LOG_DEBUG("Sample"); struct ggml_init_params params; size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); for (int i = 1; i < 4; i++) { @@ -1394,13 +1391,27 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, ggml_tensor* noise_mask = nullptr; if (sd_version_is_inpaint(sd_ctx->sd->version)) { if (masked_image == NULL) { + int64_t mask_channels = 1; + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + mask_channels = 8 * 8; // flatten the whole mask + } // no mask, set the whole image as masked - masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2] + 1, 1); + masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1); for (int64_t x = 0; x < masked_image->ne[0]; x++) { for (int64_t y = 0; y < masked_image->ne[1]; y++) { - ggml_tensor_set_f32(masked_image, 1, x, y, 0); - for (int64_t c = 1; c < masked_image->ne[2]; c++) { - ggml_tensor_set_f32(masked_image, 0, x, y, c); + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + // TODO: this might be wrong + for (int64_t c = 0; c < init_latent->ne[2]; c++) { + ggml_tensor_set_f32(masked_image, 0, x, y, c); + } + for (int64_t c = init_latent->ne[2]; c < masked_image->ne[2]; c++) { + ggml_tensor_set_f32(masked_image, 1, x, y, c); + } + } else { + ggml_tensor_set_f32(masked_image, 1, x, y, 0); + for (int64_t c = 1; c < masked_image->ne[2]; c++) { + ggml_tensor_set_f32(masked_image, 0, x, y, c); + } } } } @@ -1676,6 +1687,10 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, ggml_tensor* masked_image; if (sd_version_is_inpaint(sd_ctx->sd->version)) { + int64_t mask_channels = 1; + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + mask_channels = 8 * 8; // flatten the whole mask + } ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1); sd_apply_mask(init_img, mask_img, masked_img); ggml_tensor* masked_image_0 = NULL; @@ -1685,17 +1700,33 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, } else { masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img); } - masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2] + 1, 1); + masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], mask_channels + masked_image_0->ne[2], 1); for (int ix = 0; ix < masked_image_0->ne[0]; ix++) { for (int iy = 0; iy < masked_image_0->ne[1]; iy++) { - for (int k = 0; k < masked_image_0->ne[2]; k++) { - float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k); - ggml_tensor_set_f32(masked_image, v, ix, iy, k + 1); + int mx = ix * 8; + int my = iy * 8; + if (sd_ctx->sd->version == VERSION_FLUX_FILL) { + for (int k = 0; k < masked_image_0->ne[2]; k++) { + float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k); + ggml_tensor_set_f32(masked_image, v, ix, iy, k); + } + // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image + for (int x = 0; x < 8; x++) { + for (int y = 0; y < 8; y++) { + float m = ggml_tensor_get_f32(mask_img, mx + x, my + y); + // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?) + // python code was using "b (h 8) (w 8) -> b (8 8) h w" + ggml_tensor_set_f32(masked_image, m, ix, iy, masked_image_0->ne[2] + x * 8 + y); + } + } + } else { + float m = ggml_tensor_get_f32(mask_img, mx, my); + ggml_tensor_set_f32(masked_image, m, ix, iy, 0); + for (int k = 0; k < masked_image_0->ne[2]; k++) { + float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k); + ggml_tensor_set_f32(masked_image, v, ix, iy, k + mask_channels); + } } - int mx = ix * 8; - int my = iy * 8; - float m = ggml_tensor_get_f32(mask_img, mx, my); - ggml_tensor_set_f32(masked_image, m, ix, iy, 0); } } } else { From 17b4fc5054fefbeea4af08aa671f1612019d0fb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 26 Dec 2024 19:57:11 +0100 Subject: [PATCH 18/19] fix mistake in sd2 mode check --- stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 26772f85e..6e6d4f628 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -606,7 +606,7 @@ class StableDiffusionGGML { ggml_set_f32(timesteps, 999); struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : NULL; - ggml_set_f32(timesteps, 0); + ggml_set_f32(concat, 0); int64_t t0 = ggml_time_ms(); struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t); From fe35689e3fc681657a81fac4d473e8e2c41aabf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 27 Dec 2024 00:21:13 +0100 Subject: [PATCH 19/19] more specific sdxl fingerprint --- model.cpp | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/model.cpp b/model.cpp index 767a8b822..dae1e0d56 100644 --- a/model.cpp +++ b/model.cpp @@ -1461,7 +1461,10 @@ SDVersion ModelLoader::get_sd_version() { TensorStorage token_embedding_weight, input_block_weight; bool input_block_checked = false; - bool is_xl = false; + bool has_multiple_encoders = false; + bool is_unet = false; + + bool is_xl = false; bool is_flux = false; #define found_family (is_xl || is_flux) @@ -1476,10 +1479,22 @@ SDVersion ModelLoader::get_sd_version() { if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) { return VERSION_SD3; } + if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos) { + is_unet = true; + if(has_multiple_encoders){ + is_xl = true; + if (input_block_checked) { + break; + } + } + } if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos) { - is_xl = true; - if (input_block_checked) { - break; + has_multiple_encoders = true; + if(is_unet){ + is_xl = true; + if (input_block_checked) { + break; + } } } if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {