From 67919e6223bd582351a21f6e665bd85e3436f57e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 4 Dec 2024 02:33:49 +0100
Subject: [PATCH 01/19] Add support for loading SD1 inpaint models

---
 conditioner.hpp | 2 +-
 model.cpp       | 8 +++++++-
 model.h         | 1 +
 unet.hpp        | 8 ++++++--
 4 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/conditioner.hpp b/conditioner.hpp
index 5b3f20dd1..dab7e2c55 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -68,7 +68,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                 clip_skip = 2;
             }
         }
-        if (version == VERSION_SD1) {
+        if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
         } else if (version == VERSION_SD2) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
diff --git a/model.cpp b/model.cpp
index c90918ad2..4dd6bc841 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1458,7 +1458,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
 }
 
 SDVersion ModelLoader::get_sd_version() {
-    TensorStorage token_embedding_weight;
+    TensorStorage token_embedding_weight, input_block_weight;
     for (auto& tensor_storage : tensor_storages) {
         if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
             return VERSION_FLUX;
@@ -1485,9 +1485,15 @@ SDVersion ModelLoader::get_sd_version() {
             token_embedding_weight = tensor_storage;
             // break;
         }
+        if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight") {
+            input_block_weight = tensor_storage;
+        }
     }
 
     if (token_embedding_weight.ne[0] == 768) {
+        if(input_block_weight.ne[2]==9){
+            return VERSION_SD1_INPAINT;
+        }
         return VERSION_SD1;
     } else if (token_embedding_weight.ne[0] == 1024) {
         return VERSION_SD2;
diff --git a/model.h b/model.h
index 29d46c192..06da04d9e 100644
--- a/model.h
+++ b/model.h
@@ -19,6 +19,7 @@
 
 enum SDVersion {
     VERSION_SD1,
+    VERSION_SD1_INPAINT,
     VERSION_SD2,
     VERSION_SDXL,
     VERSION_SVD,
diff --git a/unet.hpp b/unet.hpp
index 2a7adb3d2..ae89ce037 100644
--- a/unet.hpp
+++ b/unet.hpp
@@ -166,6 +166,7 @@ class SpatialVideoTransformer : public SpatialTransformer {
 // ldm.modules.diffusionmodules.openaimodel.UNetModel
 class UnetModelBlock : public GGMLBlock {
 protected:
+    static std::map<std::string, enum ggml_type> empty_tensor_types;
     SDVersion version = VERSION_SD1;
     // network hparams
     int in_channels                        = 4;
@@ -183,7 +184,7 @@ class UnetModelBlock : public GGMLBlock {
     int model_channels  = 320;
     int adm_in_channels = 2816;  // only for VERSION_SDXL/SVD
 
-    UnetModelBlock(SDVersion version = VERSION_SD1, bool flash_attn = false)
+    UnetModelBlock(SDVersion version = VERSION_SD1, std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types, bool flash_attn = false)
         : version(version) {
         if (version == VERSION_SD2) {
             context_dim       = 1024;
@@ -203,7 +204,10 @@ class UnetModelBlock : public GGMLBlock {
             adm_in_channels   = 768;
             num_head_channels = 64;
             num_heads         = -1;
+        } else if (version == VERSION_SD1_INPAINT) {
+            in_channels = 9;
         }
+
         // dims is always 2
         // use_temporal_attention is always True for SVD
 
@@ -536,7 +540,7 @@ struct UNetModelRunner : public GGMLRunner {
                     const std::string prefix,
                     SDVersion version = VERSION_SD1,
                     bool flash_attn   = false)
-        : GGMLRunner(backend), unet(version, flash_attn) {
+        : GGMLRunner(backend), unet(version, tensor_types, flash_attn) {
         unet.init(params_ctx, tensor_types, prefix);
     }
 

From 56746011f8634eee95c1485a895b4cc73456df15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 4 Dec 2024 02:56:47 +0100
Subject: [PATCH 02/19] inpaint: try things

---
 stable-diffusion.cpp | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 5abc29507..ba3e00dfc 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -26,6 +26,7 @@
 
 const char* model_version_to_str[] = {
     "SD 1.x",
+    "SD 1.x Inpaint",
     "SD 2.x",
     "SDXL",
     "SVD",
@@ -1359,8 +1360,18 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
         LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);
 
         sd_ctx->sd->rng->manual_seed(cur_seed);
-        struct ggml_tensor* x_t   = init_latent;
-        struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
+        struct ggml_tensor* x_t;
+        struct ggml_tensor* noise;
+        if (sd_ctx->sd->version == VERSION_SD1_INPAINT) {
+            struct ggml_tensor* mask         = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, 1, 1);
+            struct ggml_tensor* masked_image = ggml_dup_tensor(work_ctx, init_latent);
+
+            x_t   = ggml_concat(work_ctx, ggml_concat(work_ctx, init_latent, masked_image, 2), mask, 2);
+            noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C * 2 + 1, 1);
+        } else {
+            x_t   = init_latent;
+            noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
+        }
         ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
 
         int start_merge_step = -1;

From 048b8f73317c04548aec66543fc9aefc0bede208 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 4 Dec 2024 21:27:10 +0100
Subject: [PATCH 03/19] Actually Support Inpaint models

---
 ggml_extend.hpp      | 37 ++++++++++++++++++++++++-
 stable-diffusion.cpp | 65 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 85 insertions(+), 17 deletions(-)

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 8afcd367c..3366aa6dc 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -290,6 +290,42 @@ __STATIC_INLINE__ void sd_image_to_tensor(const uint8_t* image_data,
     }
 }
 
+__STATIC_INLINE__ void sd_mask_to_tensor(const uint8_t* image_data,  // 8-bit mask samples, indexed as image_data[iy * width + ix]
+                                         struct ggml_tensor* output,  // destination; must be F32 with ne[2] == 1
+                                         bool scale = true) {         // when true, divide each sample by 255
+    int64_t width    = output->ne[0];
+    int64_t height   = output->ne[1];
+    int64_t channels = output->ne[2];
+    GGML_ASSERT(channels == 1 && output->type == GGML_TYPE_F32);  // the mask is written as a single F32 plane
+    for (int iy = 0; iy < height; iy++) {
+        for (int ix = 0; ix < width; ix++) {
+            float value = *(image_data + iy * width * channels + ix);  // channels is 1 here, so the offset is iy * width + ix
+            if (scale) {
+                value /= 255.f;  // bytes 0..255 become 0.0..1.0
+            }
+            ggml_tensor_set_f32(output, value, ix, iy);
+        }
+    }
+}
+
+__STATIC_INLINE__ void sd_apply_mask(struct ggml_tensor* image_data,  // source image, read at (ix, iy, k)
+                                     struct ggml_tensor* mask,        // per-pixel weight m, read at (ix, iy)
+                                     struct ggml_tensor* output) {    // destination; must be F32, its ne[0..2] set the loop bounds
+    int64_t width    = output->ne[0];
+    int64_t height   = output->ne[1];
+    int64_t channels = output->ne[2];
+    GGML_ASSERT(output->type == GGML_TYPE_F32);
+    for (int ix = 0; ix < width; ix++) {
+        for (int iy = 0; iy < height; iy++) {
+            float m = ggml_tensor_get_f32(mask, ix, iy);  // one mask value is shared by every channel of this pixel
+            for (int k = 0; k < channels; k++) {
+                float value = (1 - m) * (ggml_tensor_get_f32(image_data, ix, iy, k) - .5) + .5;  // blend toward 0.5: m == 0 keeps the pixel, m == 1 yields 0.5
+                ggml_tensor_set_f32(output, value, ix, iy, k);
+            }
+        }
+    }
+}
+
 __STATIC_INLINE__ void sd_mul_images_to_tensor(const uint8_t* image_data,
                                                struct ggml_tensor* output,
                                                int idx,
@@ -1144,7 +1180,6 @@ struct GGMLRunner {
         }
 #endif
         ggml_backend_graph_compute(backend, gf);
-
 #ifdef GGML_PERF
         ggml_graph_print(gf);
 #endif
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index ba3e00dfc..ac322d40a 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1164,10 +1164,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                            float style_ratio,
                            bool normalize_input,
                            std::string input_id_images_path,
-                           std::vector<int> skip_layers = {},
-                           float slg_scale              = 0,
-                           float skip_layer_start       = 0.01,
-                           float skip_layer_end         = 0.2) {
+                           std::vector<int> skip_layers                                    = {},
+                           float slg_scale                                                 = 0,
+                           float skip_layer_start                                          = 0.01,
+                           float skip_layer_end                                            = 0.2,
+                           ggml_tensor* masked_image                                       = NULL) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1360,20 +1361,14 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
         LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);
 
         sd_ctx->sd->rng->manual_seed(cur_seed);
-        struct ggml_tensor* x_t;
-        struct ggml_tensor* noise;
-        if (sd_ctx->sd->version == VERSION_SD1_INPAINT) {
-            struct ggml_tensor* mask         = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, 1, 1);
-            struct ggml_tensor* masked_image = ggml_dup_tensor(work_ctx, init_latent);
-
-            x_t   = ggml_concat(work_ctx, ggml_concat(work_ctx, init_latent, masked_image, 2), mask, 2);
-            noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C * 2 + 1, 1);
-        } else {
-            x_t   = init_latent;
-            noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
-        }
+        struct ggml_tensor* x_t   = init_latent;
+        struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
         ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
 
+        if (sd_ctx->sd->version == VERSION_SD1_INPAINT) {
+            cond.c_concat   = masked_image;
+            uncond.c_concat = masked_image;
+        }
         int start_merge_step = -1;
         if (sd_ctx->sd->stacked_id) {
             start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps);
@@ -1614,7 +1609,19 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
     sd_ctx->sd->rng->manual_seed(seed);
 
     ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
+    ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
+
+    // sd_image_to_tensor(mask.data, mask_img);
+    for (int ix = 0; ix < width; ix++) {
+        for (int iy = 0; iy < height; iy++) {
+            ggml_tensor_set_f32(mask_img, (iy < height / 3 && ix > width / 4 && ix < 3 * width / 4) ? 1 : 0, ix, iy);
+        }
+    }
+
     sd_image_to_tensor(init_image.data, init_img);
+    ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
+    sd_apply_mask(init_img, mask_img, masked_img);
+
     ggml_tensor* init_latent = NULL;
     if (!sd_ctx->sd->use_tiny_autoencoder) {
         ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
@@ -1622,12 +1629,38 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
     } else {
         init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
     }
+
+    ggml_tensor* masked_image_0 = NULL;
+    if (!sd_ctx->sd->use_tiny_autoencoder) {
+        ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
+        masked_image_0       = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+    } else {
+        masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
+    }
+    ggml_tensor* masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2] + 1, 1);
+    LOG_INFO("shape: [%d,%d,%d,%d]", masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2], masked_image_0->ne[3]);
+    LOG_INFO("shape: [%d,%d,%d,%d]", masked_image->ne[0], masked_image->ne[1], masked_image->ne[2], masked_image->ne[3]);
+    for (int ix = 0; ix < masked_image_0->ne[0]; ix++) {
+        for (int iy = 0; iy < masked_image_0->ne[1]; iy++) {
+            for (int k = 0; k < masked_image_0->ne[2]; k++) {
+                float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
+                ggml_tensor_set_f32(masked_image, v, ix, iy, k + 1);
+            }
+            int mx  = ix * 8;
+            int my  = iy * 8;
+            float m = ggml_tensor_get_f32(mask_img, mx, my);
+            ggml_tensor_set_f32(masked_image, m, ix, iy, 0);
+        }
+    }
+
     print_ggml_tensor(init_latent, true);
     size_t t1 = ggml_time_ms();
     LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);
 
     std::vector<float> sigmas = sd_ctx->sd->denoiser->get_sigmas(sample_steps);
     size_t t_enc              = static_cast<size_t>(sample_steps * strength);
+    if (t_enc == sample_steps)
+        t_enc--;
     LOG_INFO("target t_enc is %zu steps", t_enc);
     std::vector<float> sigma_sched;
     sigma_sched.assign(sigmas.begin() + sample_steps - t_enc - 1, sigmas.end());

From f1e6b0d71524d797bb8f9f2d70066ef8d7fa495b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 4 Dec 2024 21:27:24 +0100
Subject: [PATCH 04/19] unet: Fix c_concat

---
 unet.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/unet.hpp b/unet.hpp
index ae89ce037..ee90285ec 100644
--- a/unet.hpp
+++ b/unet.hpp
@@ -570,6 +570,7 @@ struct UNetModelRunner : public GGMLRunner {
         context   = to_backend(context);
         y         = to_backend(y);
         timesteps = to_backend(timesteps);
+        c_concat  = to_backend(c_concat);
 
         for (int i = 0; i < controls.size(); i++) {
             controls[i] = to_backend(controls[i]);

From 5a0391508ed2071734cd89f03fd3e02839a377db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 4 Dec 2024 22:12:38 +0100
Subject: [PATCH 05/19] support image mask input

---
 examples/cli/main.cpp | 23 +++++++++++++++++++++++
 stable-diffusion.cpp  |  8 ++------
 stable-diffusion.h    |  1 +
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 4b47286f4..5a48b3d61 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -85,6 +85,7 @@ struct SDParams {
     std::string lora_model_dir;
     std::string output_path = "output.png";
     std::string input_path;
+    std::string mask_path;
     std::string control_image_path;
 
     std::string prompt;
@@ -148,6 +149,7 @@ void print_params(SDParams params) {
     printf("    normalize input image :  %s\n", params.normalize_input ? "true" : "false");
     printf("    output_path:       %s\n", params.output_path.c_str());
     printf("    init_img:          %s\n", params.input_path.c_str());
+    printf("    mask_img:          %s\n", params.mask_path.c_str());
     printf("    control_image:     %s\n", params.control_image_path.c_str());
     printf("    clip on cpu:       %s\n", params.clip_on_cpu ? "true" : "false");
     printf("    controlnet cpu:    %s\n", params.control_net_cpu ? "true" : "false");
@@ -384,6 +386,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.input_path = argv[i];
+        } else if (arg == "--mask") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.mask_path = argv[i];
         } else if (arg == "--control-image") {
             if (++i >= argc) {
                 invalid_arg = true;
@@ -803,6 +811,8 @@ int main(int argc, const char* argv[]) {
     bool vae_decode_only          = true;
     uint8_t* input_image_buffer   = NULL;
     uint8_t* control_image_buffer = NULL;
+    uint8_t* mask_image_buffer    = NULL;
+
     if (params.mode == IMG2IMG || params.mode == IMG2VID) {
         vae_decode_only = false;
 
@@ -907,6 +917,18 @@ int main(int argc, const char* argv[]) {
         }
     }
 
+    std::vector<uint8_t> default_mask_image_vec(params.mask_path.empty() ? params.width * params.height : 0, 255);  // all-255 fallback mask
+    if (params.mask_path != "") {
+        int c             = 0;  // receives the file's channel count from stbi_load
+        mask_image_buffer = stbi_load(params.mask_path.c_str(), &params.width, &params.height, &c, 1);
+    } else {
+        mask_image_buffer = default_mask_image_vec.data();  // vector lives at function scope, so this pointer stays valid for img2img
+    }
+    sd_image_t mask_image = {(uint32_t)params.width,
+                             (uint32_t)params.height,
+                             1,
+                             mask_image_buffer};
+
     sd_image_t* results;
     if (params.mode == TXT2IMG) {
         results = txt2img(sd_ctx,
@@ -976,6 +998,7 @@ int main(int argc, const char* argv[]) {
         } else {
             results = img2img(sd_ctx,
                               input_image,
+                              mask_image,
                               params.prompt.c_str(),
                               params.negative_prompt.c_str(),
                               params.clip_skip,
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index ac322d40a..9e44bb1b0 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1549,6 +1549,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
 
 sd_image_t* img2img(sd_ctx_t* sd_ctx,
                     sd_image_t init_image,
+                    sd_image_t mask,
                     const char* prompt_c_str,
                     const char* negative_prompt_c_str,
                     int clip_skip,
@@ -1611,12 +1612,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
     ggml_tensor* init_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
     ggml_tensor* mask_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 1, 1);
 
-    // sd_image_to_tensor(mask.data, mask_img);
-    for (int ix = 0; ix < width; ix++) {
-        for (int iy = 0; iy < height; iy++) {
-            ggml_tensor_set_f32(mask_img, (iy < height / 3 && ix > width / 4 && ix < 3 * width / 4) ? 1 : 0, ix, iy);
-        }
-    }
+    sd_mask_to_tensor(mask.data, mask_img);
 
     sd_image_to_tensor(init_image.data, init_img);
     ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
diff --git a/stable-diffusion.h b/stable-diffusion.h
index c67bc8a32..5a758df66 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -174,6 +174,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
 
 SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
                            sd_image_t init_image,
+                           sd_image_t mask_image,
                            const char* prompt,
                            const char* negative_prompt,
                            int clip_skip,

From a0400949f2240cbc270170a0a74927c95785019d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 4 Dec 2024 22:13:06 +0100
Subject: [PATCH 06/19] Do not crash in txt2img with Inpaint model

---
 stable-diffusion.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 9e44bb1b0..6d9e6f838 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1366,6 +1366,18 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
         ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
 
         if (sd_ctx->sd->version == VERSION_SD1_INPAINT) {
+            if (masked_image == NULL) {
+                // no mask, set the whole image as masked
+                masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2] + 1, 1);
+                for (int64_t x = 0; x < masked_image->ne[0]; x++) {
+                    for (int64_t y = 0; y < masked_image->ne[1]; y++) {
+                        ggml_tensor_set_f32(masked_image, 1, x, y, 0);
+                        for (int64_t c = 1; c < masked_image->ne[2]; c++) {
+                            ggml_tensor_set_f32(masked_image, 0, x, y, c);
+                        }
+                    }
+                }
+            }
             cond.c_concat   = masked_image;
             uncond.c_concat = masked_image;
         }
@@ -1516,6 +1528,10 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
         ggml_set_f32(init_latent, 0.f);
     }
 
+    if (sd_ctx->sd->version == VERSION_SD1_INPAINT) {
+        LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask");
+    }
+
     sd_image_t* result_images = generate_image(sd_ctx,
                                                work_ctx,
                                                init_latent,

From 8ee3512c4a241299a064677dab511bd4670db527 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Wed, 4 Dec 2024 23:09:16 +0100
Subject: [PATCH 07/19] Fix issue introduced by rebase

---
 stable-diffusion.cpp | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 6d9e6f838..66acdeec5 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1164,11 +1164,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                            float style_ratio,
                            bool normalize_input,
                            std::string input_id_images_path,
-                           std::vector<int> skip_layers                                    = {},
-                           float slg_scale                                                 = 0,
-                           float skip_layer_start                                          = 0.01,
-                           float skip_layer_end                                            = 0.2,
-                           ggml_tensor* masked_image                                       = NULL) {
+                           std::vector<int> skip_layers = {},
+                           float slg_scale              = 0,
+                           float skip_layer_start       = 0.01,
+                           float skip_layer_end         = 0.2,
+                           ggml_tensor* masked_image    = NULL) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1368,6 +1368,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
         if (sd_ctx->sd->version == VERSION_SD1_INPAINT) {
             if (masked_image == NULL) {
                 // no mask, set the whole image as masked
+                LOG_INFO("Missing image mask, using whole frame...");
                 masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2] + 1, 1);
                 for (int64_t x = 0; x < masked_image->ne[0]; x++) {
                     for (int64_t y = 0; y < masked_image->ne[1]; y++) {
@@ -1699,7 +1700,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                                                skip_layers_vec,
                                                slg_scale,
                                                skip_layer_start,
-                                               skip_layer_end);
+                                               skip_layer_end,
+                                               masked_image);
 
     size_t t2 = ggml_time_ms();
 

From 10c16da0e1dc7b1c894a196a229c9390bbca78db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 5 Dec 2024 17:57:09 +0100
Subject: [PATCH 08/19] Add "inpaint" support for diffusion models in img2img
 via denoise mask

---
 stable-diffusion.cpp | 140 ++++++++++++++++++++++++++++---------------
 1 file changed, 92 insertions(+), 48 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 66acdeec5..08065e7f4 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -782,10 +782,22 @@ class StableDiffusionGGML {
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
-                        std::vector<int> skip_layers = {},
-                        float slg_scale              = 0,
-                        float skip_layer_start       = 0.01,
-                        float skip_layer_end         = 0.2) {
+                        std::vector<int> skip_layers                                    = {},
+                        float slg_scale                                                 = 0,
+                        float skip_layer_start                                          = 0.01,
+                        float skip_layer_end                                            = 0.2,
+                        ggml_tensor* noise_mask                                         = nullptr) {
+        struct ggml_init_params params;
+        size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
+        for (int i = 1; i < 4; i++) {
+            data_size *= init_latent->ne[i];
+        }
+        data_size += 1024;
+        params.mem_size       = data_size * 3;
+        params.mem_buffer     = NULL;
+        params.no_alloc       = false;
+        ggml_context* tmp_ctx = ggml_init(params);
+
         size_t steps = sigmas.size() - 1;
         // noise = load_tensor_from_file(work_ctx, "./rand0.bin");
         // print_ggml_tensor(noise);
@@ -944,6 +956,19 @@ class StableDiffusionGGML {
                 pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f);
                 // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000);
             }
+            if (noise_mask != nullptr) {
+                for (int64_t x = 0; x < denoised->ne[0]; x++) {
+                    for (int64_t y = 0; y < denoised->ne[1]; y++) {
+                        float mask = ggml_tensor_get_f32(noise_mask, x, y);
+                        for (int64_t k = 0; k < denoised->ne[2]; k++) {
+                            float init = ggml_tensor_get_f32(init_latent, x, y, k);
+                            float den  = ggml_tensor_get_f32(denoised, x, y, k);
+                            ggml_tensor_set_f32(denoised, init + mask * (den - init), x, y, k);
+                        }
+                    }
+                }
+            }
+
             return denoised;
         };
 
@@ -1355,6 +1380,25 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     int W = width / 8;
     int H = height / 8;
     LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
+    ggml_tensor* noise_mask = nullptr;
+    if (sd_ctx->sd->version == VERSION_SD1_INPAINT) {
+        if (masked_image == NULL) {
+            // no mask, set the whole image as masked
+            masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2] + 1, 1);
+            for (int64_t x = 0; x < masked_image->ne[0]; x++) {
+                for (int64_t y = 0; y < masked_image->ne[1]; y++) {
+                    ggml_tensor_set_f32(masked_image, 1, x, y, 0);
+                    for (int64_t c = 1; c < masked_image->ne[2]; c++) {
+                        ggml_tensor_set_f32(masked_image, 0, x, y, c);
+                    }
+                }
+            }
+        }
+        cond.c_concat   = masked_image;
+        uncond.c_concat = masked_image;
+    } else {
+        noise_mask = masked_image;
+    }
     for (int b = 0; b < batch_count; b++) {
         int64_t sampling_start = ggml_time_ms();
         int64_t cur_seed       = seed + b;
@@ -1365,23 +1409,6 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
         struct ggml_tensor* noise = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, 1);
         ggml_tensor_set_f32_randn(noise, sd_ctx->sd->rng);
 
-        if (sd_ctx->sd->version == VERSION_SD1_INPAINT) {
-            if (masked_image == NULL) {
-                // no mask, set the whole image as masked
-                LOG_INFO("Missing image mask, using whole frame...");
-                masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2] + 1, 1);
-                for (int64_t x = 0; x < masked_image->ne[0]; x++) {
-                    for (int64_t y = 0; y < masked_image->ne[1]; y++) {
-                        ggml_tensor_set_f32(masked_image, 1, x, y, 0);
-                        for (int64_t c = 1; c < masked_image->ne[2]; c++) {
-                            ggml_tensor_set_f32(masked_image, 0, x, y, c);
-                        }
-                    }
-                }
-            }
-            cond.c_concat   = masked_image;
-            uncond.c_concat = masked_image;
-        }
         int start_merge_step = -1;
         if (sd_ctx->sd->stacked_id) {
             start_merge_step = int(sd_ctx->sd->pmid_model->style_strength / 100.f * sample_steps);
@@ -1407,7 +1434,9 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                                      skip_layers,
                                                      slg_scale,
                                                      skip_layer_start,
-                                                     skip_layer_end);
+                                                     skip_layer_end,
+                                                     noise_mask);
+
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
         int64_t sampling_end = ggml_time_ms();
@@ -1606,7 +1635,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
     if (sd_ctx->sd->stacked_id) {
         params.mem_size += static_cast<size_t>(10 * 1024 * 1024);  // 10 MB
     }
-    params.mem_size += width * height * 3 * sizeof(float) * 2;
+    params.mem_size += width * height * 3 * sizeof(float) * 3;
     params.mem_size *= batch_count;
     params.mem_buffer = NULL;
     params.no_alloc   = false;
@@ -1632,8 +1661,46 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
     sd_mask_to_tensor(mask.data, mask_img);
 
     sd_image_to_tensor(init_image.data, init_img);
-    ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
-    sd_apply_mask(init_img, mask_img, masked_img);
+
+    ggml_tensor* masked_image;
+
+    if (sd_ctx->sd->version == VERSION_SD1_INPAINT) {
+        ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
+        sd_apply_mask(init_img, mask_img, masked_img);
+        ggml_tensor* masked_image_0 = NULL;
+        if (!sd_ctx->sd->use_tiny_autoencoder) {
+            ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
+            masked_image_0       = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
+        } else {
+            masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
+        }
+        masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2] + 1, 1);
+        LOG_INFO("shape: [%d,%d,%d,%d]", masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2], masked_image_0->ne[3]);
+        LOG_INFO("shape: [%d,%d,%d,%d]", masked_image->ne[0], masked_image->ne[1], masked_image->ne[2], masked_image->ne[3]);
+        for (int ix = 0; ix < masked_image_0->ne[0]; ix++) {
+            for (int iy = 0; iy < masked_image_0->ne[1]; iy++) {
+                for (int k = 0; k < masked_image_0->ne[2]; k++) {
+                    float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
+                    ggml_tensor_set_f32(masked_image, v, ix, iy, k + 1);
+                }
+                int mx  = ix * 8;
+                int my  = iy * 8;
+                float m = ggml_tensor_get_f32(mask_img, mx, my);
+                ggml_tensor_set_f32(masked_image, m, ix, iy, 0);
+            }
+        }
+    } else {
+        // LOG_WARN("Inpainting with a base model is not great");
+        masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width / 8, height / 8, 1, 1);
+        for (int ix = 0; ix < masked_image->ne[0]; ix++) {
+            for (int iy = 0; iy < masked_image->ne[1]; iy++) {
+                int mx  = ix * 8;
+                int my  = iy * 8;
+                float m = ggml_tensor_get_f32(mask_img, mx, my);
+                ggml_tensor_set_f32(masked_image, m, ix, iy);
+            }
+        }
+    }
 
     ggml_tensor* init_latent = NULL;
     if (!sd_ctx->sd->use_tiny_autoencoder) {
@@ -1643,29 +1710,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
         init_latent = sd_ctx->sd->encode_first_stage(work_ctx, init_img);
     }
 
-    ggml_tensor* masked_image_0 = NULL;
-    if (!sd_ctx->sd->use_tiny_autoencoder) {
-        ggml_tensor* moments = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
-        masked_image_0       = sd_ctx->sd->get_first_stage_encoding(work_ctx, moments);
-    } else {
-        masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
-    }
-    ggml_tensor* masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2] + 1, 1);
-    LOG_INFO("shape: [%d,%d,%d,%d]", masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2], masked_image_0->ne[3]);
-    LOG_INFO("shape: [%d,%d,%d,%d]", masked_image->ne[0], masked_image->ne[1], masked_image->ne[2], masked_image->ne[3]);
-    for (int ix = 0; ix < masked_image_0->ne[0]; ix++) {
-        for (int iy = 0; iy < masked_image_0->ne[1]; iy++) {
-            for (int k = 0; k < masked_image_0->ne[2]; k++) {
-                float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
-                ggml_tensor_set_f32(masked_image, v, ix, iy, k + 1);
-            }
-            int mx  = ix * 8;
-            int my  = iy * 8;
-            float m = ggml_tensor_get_f32(mask_img, mx, my);
-            ggml_tensor_set_f32(masked_image, m, ix, iy, 0);
-        }
-    }
-
     print_ggml_tensor(init_latent, true);
     size_t t1 = ggml_time_ms();
     LOG_INFO("encode_first_stage completed, taking %.2fs", (t1 - t0) * 1.0f / 1000);

From a40f4c0fb3b852e29b547e60564ec5cc90468a41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Thu, 5 Dec 2024 20:26:15 +0100
Subject: [PATCH 09/19] inpaint: fix halo on blurry masks

---
 ggml_extend.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 3366aa6dc..5f1db9152 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -319,7 +319,7 @@ __STATIC_INLINE__ void sd_apply_mask(struct ggml_tensor* image_data,
         for (int iy = 0; iy < height; iy++) {
             float m = ggml_tensor_get_f32(mask, ix, iy);
             for (int k = 0; k < channels; k++) {
-                float value = (1 - m) * (ggml_tensor_get_f32(image_data, ix, iy, k) - .5) + .5;
+                float value = ((float)(m < 254.5/255)) * (ggml_tensor_get_f32(image_data, ix, iy, k) - .5) + .5;
                 ggml_tensor_set_f32(output, value, ix, iy, k);
             }
         }

From 148731da8e3f7985fe8a051c39278a46dec47848 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Fri, 6 Dec 2024 02:43:31 +0100
Subject: [PATCH 10/19] sd2 inpaint support

---
 conditioner.hpp      |  8 ++++----
 control.hpp          |  2 +-
 model.cpp            |  7 +++++--
 model.h              | 23 +++++++++++++++++++++++
 stable-diffusion.cpp | 15 ++++++++++-----
 unet.hpp             |  5 +++--
 6 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/conditioner.hpp b/conditioner.hpp
index dab7e2c55..cc515dc2f 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -61,16 +61,16 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                       SDVersion version = VERSION_SD1,
                                       PMVersion pv      = PM_VERSION_1,
                                       int clip_skip     = -1)
-        : version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407), embd_dir(embd_dir) {
+        : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (clip_skip <= 0) {
             clip_skip = 1;
-            if (version == VERSION_SD2 || version == VERSION_SDXL) {
+            if (sd_version_is_sd2(version) || version == VERSION_SDXL) {
                 clip_skip = 2;
             }
         }
-        if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) {
+        if (sd_version_is_sd1(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
-        } else if (version == VERSION_SD2) {
+        } else if (sd_version_is_sd2(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
         } else if (version == VERSION_SDXL) {
             text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
diff --git a/control.hpp b/control.hpp
index ed36db280..41c94bdad 100644
--- a/control.hpp
+++ b/control.hpp
@@ -34,7 +34,7 @@ class ControlNetBlock : public GGMLBlock {
 
     ControlNetBlock(SDVersion version = VERSION_SD1)
         : version(version) {
-        if (version == VERSION_SD2) {
+        if (sd_version_is_sd2(version)) {
             context_dim       = 1024;
             num_head_channels = 64;
             num_heads         = -1;
diff --git a/model.cpp b/model.cpp
index 4dd6bc841..f1c70bb38 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1489,13 +1489,16 @@ SDVersion ModelLoader::get_sd_version() {
             input_block_weight = tensor_storage;
         }
     }
-
+    bool is_inpaint = input_block_weight.ne[2] == 9;
     if (token_embedding_weight.ne[0] == 768) {
-        if(input_block_weight.ne[2]==9){
+        if (is_inpaint) {
             return VERSION_SD1_INPAINT;
         }
         return VERSION_SD1;
     } else if (token_embedding_weight.ne[0] == 1024) {
+        if (is_inpaint) {
+            return VERSION_SD2_INPAINT;
+        }
         return VERSION_SD2;
     }
     return VERSION_COUNT;
diff --git a/model.h b/model.h
index 06da04d9e..dd8f05d9a 100644
--- a/model.h
+++ b/model.h
@@ -21,6 +21,7 @@ enum SDVersion {
     VERSION_SD1,
     VERSION_SD1_INPAINT,
     VERSION_SD2,
+    VERSION_SD2_INPAINT,
     VERSION_SDXL,
     VERSION_SVD,
     VERSION_SD3,
@@ -42,6 +43,27 @@ static inline bool sd_version_is_sd3(SDVersion version) {
     return false;
 }
 
+static inline bool sd_version_is_sd1(SDVersion version) {
+    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT) {
+        return true;
+    }
+    return false;
+}
+
+static inline bool sd_version_is_sd2(SDVersion version) {
+    if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT) {
+        return true;
+    }
+    return false;
+}
+
+static inline bool sd_version_is_inpaint(SDVersion version) {
+    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT) {
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_is_dit(SDVersion version) {
     if (sd_version_is_flux(version) || sd_version_is_sd3(version)) {
         return true;
@@ -49,6 +71,7 @@ static inline bool sd_version_is_dit(SDVersion version) {
     return false;
 }
 
+
 enum PMVersion {
     PM_VERSION_1,
     PM_VERSION_2,
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 08065e7f4..ef987b948 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -28,6 +28,7 @@ const char* model_version_to_str[] = {
     "SD 1.x",
     "SD 1.x Inpaint",
     "SD 2.x",
+    "SD 2.x Inpaint",
     "SDXL",
     "SVD",
     "SD3.x",
@@ -518,7 +519,7 @@ class StableDiffusionGGML {
 
         // check is_using_v_parameterization_for_sd2
         bool is_using_v_parameterization = false;
-        if (version == VERSION_SD2) {
+        if (sd_version_is_sd2(version)) {
             if (is_using_v_parameterization_for_sd2(ctx)) {
                 is_using_v_parameterization = true;
             }
@@ -601,9 +602,13 @@ class StableDiffusionGGML {
 
         struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1);
         ggml_set_f32(timesteps, 999);
+
+        struct ggml_tensor* concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1);
+        ggml_set_f32(timesteps, 0);
+
         int64_t t0              = ggml_time_ms();
         struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);
-        diffusion_model->compute(n_threads, x_t, timesteps, c, NULL, NULL, NULL, -1, {}, 0.f, &out);
+        diffusion_model->compute(n_threads, x_t, timesteps, c, concat, NULL, NULL, -1, {}, 0.f, &out);
         diffusion_model->free_compute_buffer();
 
         double result = 0.f;
@@ -1381,7 +1386,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     int H = height / 8;
     LOG_INFO("sampling using %s method", sampling_methods_str[sample_method]);
     ggml_tensor* noise_mask = nullptr;
-    if (sd_ctx->sd->version == VERSION_SD1_INPAINT) {
+    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
         if (masked_image == NULL) {
             // no mask, set the whole image as masked
             masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2] + 1, 1);
@@ -1558,7 +1563,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
         ggml_set_f32(init_latent, 0.f);
     }
 
-    if (sd_ctx->sd->version == VERSION_SD1_INPAINT) {
+    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
         LOG_WARN("This is an inpainting model, this should only be used in img2img mode with a mask");
     }
 
@@ -1664,7 +1669,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
 
     ggml_tensor* masked_image;
 
-    if (sd_ctx->sd->version == VERSION_SD1_INPAINT) {
+    if (sd_version_is_inpaint(sd_ctx->sd->version)) {
         ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
         sd_apply_mask(init_img, mask_img, masked_img);
         ggml_tensor* masked_image_0 = NULL;
diff --git a/unet.hpp b/unet.hpp
index ee90285ec..22b1a3eb1 100644
--- a/unet.hpp
+++ b/unet.hpp
@@ -186,7 +186,7 @@ class UnetModelBlock : public GGMLBlock {
 
     UnetModelBlock(SDVersion version = VERSION_SD1, std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types, bool flash_attn = false)
         : version(version) {
-        if (version == VERSION_SD2) {
+        if (sd_version_is_sd2(version)) {
             context_dim       = 1024;
             num_head_channels = 64;
             num_heads         = -1;
@@ -204,7 +204,8 @@ class UnetModelBlock : public GGMLBlock {
             adm_in_channels   = 768;
             num_head_channels = 64;
             num_heads         = -1;
-        } else if (version == VERSION_SD1_INPAINT) {
+        }
+        if (sd_version_is_inpaint(version)) {
             in_channels = 9;
         }
 

From 8085e4804a705528a7199904102a9c9d38f6fe00 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Fri, 6 Dec 2024 03:07:04 +0100
Subject: [PATCH 11/19] sdxl inpaint support

---
 conditioner.hpp      | 20 ++++++++++----------
 control.hpp          |  4 ++--
 model.cpp            | 13 +++++++++++--
 model.h              | 11 +++++++++--
 stable-diffusion.cpp |  6 +++---
 unet.hpp             |  4 ++--
 6 files changed, 37 insertions(+), 21 deletions(-)

diff --git a/conditioner.hpp b/conditioner.hpp
index cc515dc2f..8d1ec31bc 100644
--- a/conditioner.hpp
+++ b/conditioner.hpp
@@ -64,7 +64,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
         : version(version), pm_version(pv), tokenizer(sd_version_is_sd2(version) ? 0 : 49407), embd_dir(embd_dir) {
         if (clip_skip <= 0) {
             clip_skip = 1;
-            if (sd_version_is_sd2(version) || version == VERSION_SDXL) {
+            if (sd_version_is_sd2(version) || sd_version_is_sdxl(version)) {
                 clip_skip = 2;
             }
         }
@@ -72,7 +72,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip);
         } else if (sd_version_is_sd2(version)) {
             text_model = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPEN_CLIP_VIT_H_14, clip_skip);
-        } else if (version == VERSION_SDXL) {
+        } else if (sd_version_is_sdxl(version)) {
             text_model  = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.transformer.text_model", OPENAI_CLIP_VIT_L_14, clip_skip, false);
             text_model2 = std::make_shared<CLIPTextModelRunner>(backend, tensor_types, "cond_stage_model.1.transformer.text_model", OPEN_CLIP_VIT_BIGG_14, clip_skip, false);
         }
@@ -80,35 +80,35 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
 
     void set_clip_skip(int clip_skip) {
         text_model->set_clip_skip(clip_skip);
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->set_clip_skip(clip_skip);
         }
     }
 
     void get_param_tensors(std::map<std::string, struct ggml_tensor*>& tensors) {
         text_model->get_param_tensors(tensors, "cond_stage_model.transformer.text_model");
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->get_param_tensors(tensors, "cond_stage_model.1.transformer.text_model");
         }
     }
 
     void alloc_params_buffer() {
         text_model->alloc_params_buffer();
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->alloc_params_buffer();
         }
     }
 
     void free_params_buffer() {
         text_model->free_params_buffer();
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             text_model2->free_params_buffer();
         }
     }
 
     size_t get_params_buffer_size() {
         size_t buffer_size = text_model->get_params_buffer_size();
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             buffer_size += text_model2->get_params_buffer_size();
         }
         return buffer_size;
@@ -402,7 +402,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
             auto input_ids                 = vector_to_ggml_tensor_i32(work_ctx, chunk_tokens);
             struct ggml_tensor* input_ids2 = NULL;
             size_t max_token_idx           = 0;
-            if (version == VERSION_SDXL) {
+            if (sd_version_is_sdxl(version)) {
                 auto it = std::find(chunk_tokens.begin(), chunk_tokens.end(), tokenizer.EOS_TOKEN_ID);
                 if (it != chunk_tokens.end()) {
                     std::fill(std::next(it), chunk_tokens.end(), 0);
@@ -427,7 +427,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                     false,
                                     &chunk_hidden_states1,
                                     work_ctx);
-                if (version == VERSION_SDXL) {
+                if (sd_version_is_sdxl(version)) {
                     text_model2->compute(n_threads,
                                          input_ids2,
                                          0,
@@ -486,7 +486,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
                                         ggml_nelements(hidden_states) / chunk_hidden_states->ne[0]);
 
         ggml_tensor* vec = NULL;
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             int out_dim = 256;
             vec         = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, adm_in_channels);
             // [0:1280]
diff --git a/control.hpp b/control.hpp
index 41c94bdad..23b75feff 100644
--- a/control.hpp
+++ b/control.hpp
@@ -38,7 +38,7 @@ class ControlNetBlock : public GGMLBlock {
             context_dim       = 1024;
             num_head_channels = 64;
             num_heads         = -1;
-        } else if (version == VERSION_SDXL) {
+        } else if (sd_version_is_sdxl(version)) {
             context_dim           = 2048;
             attention_resolutions = {4, 2};
             channel_mult          = {1, 2, 4};
@@ -58,7 +58,7 @@ class ControlNetBlock : public GGMLBlock {
         // time_embed_1 is nn.SiLU()
         blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
 
-        if (version == VERSION_SDXL || version == VERSION_SVD) {
+        if (sd_version_is_sdxl(version) || version == VERSION_SVD) {
             blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
             // label_emb_1 is nn.SiLU()
             blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
diff --git a/model.cpp b/model.cpp
index f1c70bb38..bd4f0188a 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1459,6 +1459,7 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
 
 SDVersion ModelLoader::get_sd_version() {
     TensorStorage token_embedding_weight, input_block_weight;
+    bool is_xl = false;
     for (auto& tensor_storage : tensor_storages) {
         if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
             return VERSION_FLUX;
@@ -1467,10 +1468,10 @@ SDVersion ModelLoader::get_sd_version() {
             return VERSION_SD3;
         }
         if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) {
-            return VERSION_SDXL;
+            is_xl = true;
         }
         if (tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
-            return VERSION_SDXL;
+            is_xl = true;
         }
         if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {
             return VERSION_SVD;
@@ -1487,9 +1488,17 @@ SDVersion ModelLoader::get_sd_version() {
         }
         if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight") {
             input_block_weight = tensor_storage;
+            if (is_xl)
+                break;
         }
     }
     bool is_inpaint = input_block_weight.ne[2] == 9;
+    if (is_xl) {
+        if (is_inpaint) {
+            return VERSION_SDXL_INPAINT;
+        }
+        return VERSION_SDXL;
+    }
     if (token_embedding_weight.ne[0] == 768) {
         if (is_inpaint) {
             return VERSION_SD1_INPAINT;
diff --git a/model.h b/model.h
index dd8f05d9a..cfa60d992 100644
--- a/model.h
+++ b/model.h
@@ -23,6 +23,7 @@ enum SDVersion {
     VERSION_SD2,
     VERSION_SD2_INPAINT,
     VERSION_SDXL,
+    VERSION_SDXL_INPAINT,
     VERSION_SVD,
     VERSION_SD3,
     VERSION_FLUX,
@@ -57,8 +58,15 @@ static inline bool sd_version_is_sd2(SDVersion version) {
     return false;
 }
 
+static inline bool sd_version_is_sdxl(SDVersion version) {
+    if (version == VERSION_SDXL || version == VERSION_SDXL_INPAINT) {
+        return true;
+    }
+    return false;
+}
+
 static inline bool sd_version_is_inpaint(SDVersion version) {
-    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT) {
+    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT) {
         return true;
     }
     return false;
@@ -71,7 +79,6 @@ static inline bool sd_version_is_dit(SDVersion version) {
     return false;
 }
 
-
 enum PMVersion {
     PM_VERSION_1,
     PM_VERSION_2,
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index ef987b948..f691b9d47 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -265,7 +265,7 @@ class StableDiffusionGGML {
             model_loader.set_wtype_override(wtype);
         }
 
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             vae_wtype = GGML_TYPE_F32;
             model_loader.set_wtype_override(GGML_TYPE_F32, "vae.");
         }
@@ -277,7 +277,7 @@ class StableDiffusionGGML {
 
         LOG_DEBUG("ggml tensor size = %d bytes", (int)sizeof(ggml_tensor));
 
-        if (version == VERSION_SDXL) {
+        if (sd_version_is_sdxl(version)) {
             scale_factor = 0.13025f;
             if (vae_path.size() == 0 && taesd_path.size() == 0) {
                 LOG_WARN(
@@ -1348,7 +1348,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     SDCondition uncond;
     if (cfg_scale != 1.0) {
         bool force_zero_embeddings = false;
-        if (sd_ctx->sd->version == VERSION_SDXL && negative_prompt.size() == 0) {
+        if (sd_version_is_sdxl(sd_ctx->sd->version) && negative_prompt.size() == 0) {
             force_zero_embeddings = true;
         }
         uncond = sd_ctx->sd->cond_stage_model->get_learned_condition(work_ctx,
diff --git a/unet.hpp b/unet.hpp
index 22b1a3eb1..31b7fe986 100644
--- a/unet.hpp
+++ b/unet.hpp
@@ -190,7 +190,7 @@ class UnetModelBlock : public GGMLBlock {
             context_dim       = 1024;
             num_head_channels = 64;
             num_heads         = -1;
-        } else if (version == VERSION_SDXL) {
+        } else if (sd_version_is_sdxl(version)) {
             context_dim           = 2048;
             attention_resolutions = {4, 2};
             channel_mult          = {1, 2, 4};
@@ -216,7 +216,7 @@ class UnetModelBlock : public GGMLBlock {
         // time_embed_1 is nn.SiLU()
         blocks["time_embed.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));
 
-        if (version == VERSION_SDXL || version == VERSION_SVD) {
+        if (sd_version_is_sdxl(version) || version == VERSION_SVD) {
             blocks["label_emb.0.0"] = std::shared_ptr<GGMLBlock>(new Linear(adm_in_channels, time_embed_dim));
             // label_emb_1 is nn.SiLU()
             blocks["label_emb.0.2"] = std::shared_ptr<GGMLBlock>(new Linear(time_embed_dim, time_embed_dim));

From a9e7ee6634b9178508936532fc8951579d427cca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Fri, 6 Dec 2024 03:07:16 +0100
Subject: [PATCH 12/19] remove some logging

---
 stable-diffusion.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index f691b9d47..d538501fd 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1680,8 +1680,6 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
             masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
         }
         masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2] + 1, 1);
-        LOG_INFO("shape: [%d,%d,%d,%d]", masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2], masked_image_0->ne[3]);
-        LOG_INFO("shape: [%d,%d,%d,%d]", masked_image->ne[0], masked_image->ne[1], masked_image->ne[2], masked_image->ne[3]);
         for (int ix = 0; ix < masked_image_0->ne[0]; ix++) {
             for (int iy = 0; iy < masked_image_0->ne[1]; iy++) {
                 for (int k = 0; k < masked_image_0->ne[2]; k++) {

From ed41e759d5863986eb214e4e3e7fbd73ed853078 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Fri, 6 Dec 2024 03:28:55 +0100
Subject: [PATCH 13/19] Fix non-inpaint sd2

---
 stable-diffusion.cpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index d538501fd..9b4f6e15a 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -520,7 +520,7 @@ class StableDiffusionGGML {
         // check is_using_v_parameterization_for_sd2
         bool is_using_v_parameterization = false;
         if (sd_version_is_sd2(version)) {
-            if (is_using_v_parameterization_for_sd2(ctx)) {
+            if (is_using_v_parameterization_for_sd2(ctx, sd_version_is_inpaint(version))) {
                 is_using_v_parameterization = true;
             }
         } else if (version == VERSION_SVD) {
@@ -594,7 +594,7 @@ class StableDiffusionGGML {
         return true;
     }
 
-    bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx) {
+    bool is_using_v_parameterization_for_sd2(ggml_context* work_ctx, bool is_inpaint = false) {
         struct ggml_tensor* x_t = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 4, 1);
         ggml_set_f32(x_t, 0.5);
         struct ggml_tensor* c = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 1024, 2, 1, 1);
@@ -603,7 +603,7 @@ class StableDiffusionGGML {
         struct ggml_tensor* timesteps = ggml_new_tensor_1d(work_ctx, GGML_TYPE_F32, 1);
         ggml_set_f32(timesteps, 999);
 
-        struct ggml_tensor* concat = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1);
+        struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : NULL;
         ggml_set_f32(timesteps, 0);
 
         int64_t t0              = ggml_time_ms();
@@ -787,11 +787,11 @@ class StableDiffusionGGML {
                         const std::vector<float>& sigmas,
                         int start_merge_step,
                         SDCondition id_cond,
-                        std::vector<int> skip_layers                                    = {},
-                        float slg_scale                                                 = 0,
-                        float skip_layer_start                                          = 0.01,
-                        float skip_layer_end                                            = 0.2,
-                        ggml_tensor* noise_mask                                         = nullptr) {
+                        std::vector<int> skip_layers = {},
+                        float slg_scale              = 0,
+                        float skip_layer_start       = 0.01,
+                        float skip_layer_end         = 0.2,
+                        ggml_tensor* noise_mask      = nullptr) {
         struct ggml_init_params params;
         size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
         for (int i = 1; i < 4; i++) {

From d741e2d924b16faa9874a1ed125d353e6d8e56cc Mon Sep 17 00:00:00 2001
From: stduhpf <stephduh@live.fr>
Date: Fri, 6 Dec 2024 18:52:43 +0100
Subject: [PATCH 14/19] Fix model_version_to_str

---
 stable-diffusion.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 9b4f6e15a..b99c11daf 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -30,6 +30,7 @@ const char* model_version_to_str[] = {
     "SD 2.x",
     "SD 2.x Inpaint",
     "SDXL",
+    "SDXL Inpaint",
     "SVD",
     "SD3.x",
     "Flux"};

From 26fab5a4e392fa9a39730fd6583bfbc832fb0802 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Fri, 6 Dec 2024 18:35:37 +0100
Subject: [PATCH 15/19] Detect Flux fill models

---
 model.cpp            | 58 +++++++++++++++++++++++++++++---------------
 model.h              |  5 ++--
 stable-diffusion.cpp |  3 ++-
 3 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/model.cpp b/model.cpp
index bd4f0188a..5985acb2e 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1459,24 +1459,33 @@ bool ModelLoader::init_from_ckpt_file(const std::string& file_path, const std::s
 
 SDVersion ModelLoader::get_sd_version() {
     TensorStorage token_embedding_weight, input_block_weight;
-    bool is_xl = false;
+    bool input_block_checked = false;
+
+    bool is_xl   = false;
+    bool is_flux = false;
+
+#define found_family (is_xl || is_flux)
     for (auto& tensor_storage : tensor_storages) {
-        if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
-            return VERSION_FLUX;
-        }
-        if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) {
-            return VERSION_SD3;
-        }
-        if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos) {
-            is_xl = true;
-        }
-        if (tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
-            is_xl = true;
-        }
-        if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {
-            return VERSION_SVD;
+        if (!found_family) {
+            if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
+                is_flux = true;
+                if (input_block_checked) {
+                    break;
+                }
+            }
+            if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) {
+                return VERSION_SD3;
+            }
+            if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
+                is_xl = true;
+                if (input_block_checked) {
+                    break;
+                }
+            }
+            if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {
+                return VERSION_SVD;
+            }
         }
-
         if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
             tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
             tensor_storage.name == "text_model.embeddings.token_embedding.weight" ||
@@ -1486,10 +1495,12 @@ SDVersion ModelLoader::get_sd_version() {
             token_embedding_weight = tensor_storage;
             // break;
         }
-        if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight") {
-            input_block_weight = tensor_storage;
-            if (is_xl)
+        if (tensor_storage.name == "model.diffusion_model.input_blocks.0.0.weight" || tensor_storage.name == "model.diffusion_model.img_in.weight") {
+            input_block_weight  = tensor_storage;
+            input_block_checked = true;
+            if (found_family) {
                 break;
+            }
         }
     }
     bool is_inpaint = input_block_weight.ne[2] == 9;
@@ -1499,6 +1510,15 @@ SDVersion ModelLoader::get_sd_version() {
         }
         return VERSION_SDXL;
     }
+
+    if (is_flux) {
+        is_inpaint = input_block_weight.ne[0] == 384;
+        if (is_inpaint) {
+            return VERSION_FLUX_INPAINT;
+        }
+        return VERSION_FLUX;
+    }
+
     if (token_embedding_weight.ne[0] == 768) {
         if (is_inpaint) {
             return VERSION_SD1_INPAINT;
diff --git a/model.h b/model.h
index cfa60d992..69136431e 100644
--- a/model.h
+++ b/model.h
@@ -27,11 +27,12 @@ enum SDVersion {
     VERSION_SVD,
     VERSION_SD3,
     VERSION_FLUX,
+    VERSION_FLUX_INPAINT,
     VERSION_COUNT,
 };
 
 static inline bool sd_version_is_flux(SDVersion version) {
-    if (version == VERSION_FLUX) {
+    if (version == VERSION_FLUX || version == VERSION_FLUX_INPAINT) {
         return true;
     }
     return false;
@@ -66,7 +67,7 @@ static inline bool sd_version_is_sdxl(SDVersion version) {
 }
 
 static inline bool sd_version_is_inpaint(SDVersion version) {
-    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT) {
+    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_INPAINT) {
         return true;
     }
     return false;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index b99c11daf..1a3f57072 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -33,7 +33,8 @@ const char* model_version_to_str[] = {
     "SDXL Inpaint",
     "SVD",
     "SD3.x",
-    "Flux"};
+    "Flux",
+    "Flux Fill"};
 
 const char* sampling_methods_str[] = {
     "Euler A",

From 29b6fd8cf9fd2c83c29454d31e167b6bb75785fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Fri, 6 Dec 2024 19:01:48 +0100
Subject: [PATCH 16/19] Flux fill load

---
 diffusion_model.hpp  | 5 +++--
 flux.hpp             | 8 ++++++--
 stable-diffusion.cpp | 6 +++++-
 3 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/diffusion_model.hpp b/diffusion_model.hpp
index cbc0cd4c1..c44d147ba 100644
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@@ -133,8 +133,9 @@ struct FluxModel : public DiffusionModel {
 
     FluxModel(ggml_backend_t backend,
               std::map<std::string, enum ggml_type>& tensor_types,
-              bool flash_attn = false)
-        : flux(backend, tensor_types, "model.diffusion_model", flash_attn) {
+              SDVersion version = VERSION_FLUX,
+              bool flash_attn   = false)
+        : flux(backend, tensor_types, "model.diffusion_model", version, flash_attn) {
     }
 
     void alloc_params_buffer() {
diff --git a/flux.hpp b/flux.hpp
index fdd00ebcb..498ecdbc7 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -490,6 +490,7 @@ namespace Flux {
 
     struct FluxParams {
         int64_t in_channels         = 64;
+        int64_t out_channels        = 64;
         int64_t vec_in_dim          = 768;
         int64_t context_in_dim      = 4096;
         int64_t hidden_size         = 3072;
@@ -642,7 +643,6 @@ namespace Flux {
         Flux() {}
         Flux(FluxParams params)
             : params(params) {
-            int64_t out_channels = params.in_channels;
             int64_t pe_dim       = params.hidden_size / params.num_heads;
 
             blocks["img_in"]    = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size, true));
@@ -669,7 +669,7 @@ namespace Flux {
                                                                                                                 params.flash_attn));
             }
 
-            blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new LastLayer(params.hidden_size, 1, out_channels));
+            blocks["final_layer"] = std::shared_ptr<GGMLBlock>(new LastLayer(params.hidden_size, 1, params.out_channels));
         }
 
         struct ggml_tensor* patchify(struct ggml_context* ctx,
@@ -834,12 +834,16 @@ namespace Flux {
         FluxRunner(ggml_backend_t backend,
                    std::map<std::string, enum ggml_type>& tensor_types = empty_tensor_types,
                    const std::string prefix                            = "",
+                   SDVersion version                                   = VERSION_FLUX,
                    bool flash_attn                                     = false)
             : GGMLRunner(backend) {
             flux_params.flash_attn          = flash_attn;
             flux_params.guidance_embed      = false;
             flux_params.depth               = 0;
             flux_params.depth_single_blocks = 0;
+            if (version == VERSION_FLUX_INPAINT) {
+                flux_params.in_channels = 384;
+            }
             for (auto pair : tensor_types) {
                 std::string tensor_name = pair.first;
                 if (tensor_name.find("model.diffusion_model.") == std::string::npos)
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 1a3f57072..31751eb51 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -333,7 +333,11 @@ class StableDiffusionGGML {
                 diffusion_model  = std::make_shared<MMDiTModel>(backend, model_loader.tensor_storages_types);
             } else if (sd_version_is_flux(version)) {
                 cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
-                diffusion_model  = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, diffusion_flash_attn);
+                diffusion_model  = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
+            } else if (version == VERSION_LTXV) {
+                // TODO: cond for T5 only
+                cond_stage_model = std::make_shared<SimpleT5Embedder>(clip_backend, model_loader.tensor_storages_types);
+                diffusion_model  = std::make_shared<LTXModel>(backend, model_loader.tensor_storages_types, diffusion_flash_attn);
             } else {
                 if (id_embeddings_path.find("v2") != std::string::npos) {
                     cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);

From 0683c03120a711296a17da3d5b28d604553764da Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephduh@live.fr>
Date: Fri, 6 Dec 2024 20:48:39 +0100
Subject: [PATCH 17/19] Flux Fill working!!

---
 diffusion_model.hpp  |  2 +-
 flux.hpp             | 34 +++++++++++++++++++-----
 model.cpp            |  2 +-
 model.h              |  6 ++---
 stable-diffusion.cpp | 63 +++++++++++++++++++++++++++++++++-----------
 5 files changed, 80 insertions(+), 27 deletions(-)

diff --git a/diffusion_model.hpp b/diffusion_model.hpp
index c44d147ba..ee4d88f0c 100644
--- a/diffusion_model.hpp
+++ b/diffusion_model.hpp
@@ -175,7 +175,7 @@ struct FluxModel : public DiffusionModel {
                  struct ggml_tensor** output               = NULL,
                  struct ggml_context* output_ctx           = NULL,
                  std::vector<int> skip_layers              = std::vector<int>()) {
-        return flux.compute(n_threads, x, timesteps, context, y, guidance, output, output_ctx, skip_layers);
+        return flux.compute(n_threads, x, timesteps, context, c_concat, y, guidance, output, output_ctx, skip_layers);
     }
 };
 
diff --git a/flux.hpp b/flux.hpp
index 498ecdbc7..20ff41096 100644
--- a/flux.hpp
+++ b/flux.hpp
@@ -643,7 +643,7 @@ namespace Flux {
         Flux() {}
         Flux(FluxParams params)
             : params(params) {
-            int64_t pe_dim       = params.hidden_size / params.num_heads;
+            int64_t pe_dim = params.hidden_size / params.num_heads;
 
             blocks["img_in"]    = std::shared_ptr<GGMLBlock>(new Linear(params.in_channels, params.hidden_size, true));
             blocks["time_in"]   = std::shared_ptr<GGMLBlock>(new MLPEmbedder(256, params.hidden_size));
@@ -789,6 +789,7 @@ namespace Flux {
                                     struct ggml_tensor* x,
                                     struct ggml_tensor* timestep,
                                     struct ggml_tensor* context,
+                                    struct ggml_tensor* c_concat,
                                     struct ggml_tensor* y,
                                     struct ggml_tensor* guidance,
                                     struct ggml_tensor* pe,
@@ -797,6 +798,7 @@ namespace Flux {
             // x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)
             // timestep: (N,) tensor of diffusion timesteps
             // context: (N, L, D)
+            // c_concat: NULL, or (N, C+M, H, W) for Fill (C masked-latent channels, then M = 8*8 mask channels)
             // y: (N, adm_in_channels) tensor of class labels
             // guidance: (N,)
             // pe: (L, d_head/2, 2, 2)
@@ -806,6 +808,7 @@ namespace Flux {
 
             int64_t W          = x->ne[0];
             int64_t H          = x->ne[1];
+            int64_t C          = x->ne[2];
             int64_t patch_size = 2;
             int pad_h          = (patch_size - H % patch_size) % patch_size;
             int pad_w          = (patch_size - W % patch_size) % patch_size;
@@ -814,6 +817,19 @@ namespace Flux {
             // img = rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=patch_size, pw=patch_size)
             auto img = patchify(ctx, x, patch_size);  // [N, h*w, C * patch_size * patch_size]
 
+            if (c_concat != NULL) {
+                ggml_tensor* masked = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], C, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], 0);
+                ggml_tensor* mask   = ggml_view_4d(ctx, c_concat, c_concat->ne[0], c_concat->ne[1], 8 * 8, 1, c_concat->nb[1], c_concat->nb[2], c_concat->nb[3], c_concat->nb[2] * C);
+
+                masked = ggml_pad(ctx, masked, pad_w, pad_h, 0, 0);
+                mask   = ggml_pad(ctx, mask, pad_w, pad_h, 0, 0);
+
+                masked = patchify(ctx, masked, patch_size);
+                mask   = patchify(ctx, mask, patch_size);
+
+                img = ggml_concat(ctx, img, ggml_concat(ctx, masked, mask, 0), 0);
+            }
+
             auto out = forward_orig(ctx, img, context, timestep, y, guidance, pe, skip_layers);  // [N, h*w, C * patch_size * patch_size]
 
             // rearrange(out, "b (h w) (c ph pw) -> b c (h ph) (w pw)", h=h_len, w=w_len, ph=2, pw=2)
@@ -841,7 +857,7 @@ namespace Flux {
             flux_params.guidance_embed      = false;
             flux_params.depth               = 0;
             flux_params.depth_single_blocks = 0;
-            if (version == VERSION_FLUX_INPAINT) {
+            if (version == VERSION_FLUX_FILL) {
                 flux_params.in_channels = 384;
             }
             for (auto pair : tensor_types) {
@@ -890,14 +906,18 @@ namespace Flux {
         struct ggml_cgraph* build_graph(struct ggml_tensor* x,
                                         struct ggml_tensor* timesteps,
                                         struct ggml_tensor* context,
+                                        struct ggml_tensor* c_concat,
                                         struct ggml_tensor* y,
                                         struct ggml_tensor* guidance,
                                         std::vector<int> skip_layers = std::vector<int>()) {
             GGML_ASSERT(x->ne[3] == 1);
             struct ggml_cgraph* gf = ggml_new_graph_custom(compute_ctx, FLUX_GRAPH_SIZE, false);
 
-            x         = to_backend(x);
-            context   = to_backend(context);
+            x       = to_backend(x);
+            context = to_backend(context);
+            if (c_concat != NULL) {
+                c_concat = to_backend(c_concat);
+            }
             y         = to_backend(y);
             timesteps = to_backend(timesteps);
             if (flux_params.guidance_embed) {
@@ -917,6 +937,7 @@ namespace Flux {
                                                    x,
                                                    timesteps,
                                                    context,
+                                                   c_concat,
                                                    y,
                                                    guidance,
                                                    pe,
@@ -931,6 +952,7 @@ namespace Flux {
                      struct ggml_tensor* x,
                      struct ggml_tensor* timesteps,
                      struct ggml_tensor* context,
+                     struct ggml_tensor* c_concat,
                      struct ggml_tensor* y,
                      struct ggml_tensor* guidance,
                      struct ggml_tensor** output     = NULL,
@@ -942,7 +964,7 @@ namespace Flux {
             // y: [N, adm_in_channels] or [1, adm_in_channels]
             // guidance: [N, ]
             auto get_graph = [&]() -> struct ggml_cgraph* {
-                return build_graph(x, timesteps, context, y, guidance, skip_layers);
+                return build_graph(x, timesteps, context, c_concat, y, guidance, skip_layers);
             };
 
             GGMLRunner::compute(get_graph, n_threads, false, output, output_ctx);
@@ -982,7 +1004,7 @@ namespace Flux {
                 struct ggml_tensor* out = NULL;
 
                 int t0 = ggml_time_ms();
-                compute(8, x, timesteps, context, y, guidance, &out, work_ctx);
+                compute(8, x, timesteps, context, NULL, y, guidance, &out, work_ctx);
                 int t1 = ggml_time_ms();
 
                 print_ggml_tensor(out);
diff --git a/model.cpp b/model.cpp
index 5985acb2e..767a8b822 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1514,7 +1514,7 @@ SDVersion ModelLoader::get_sd_version() {
     if (is_flux) {
         is_inpaint = input_block_weight.ne[0] == 384;
         if (is_inpaint) {
-            return VERSION_FLUX_INPAINT;
+            return VERSION_FLUX_FILL;
         }
         return VERSION_FLUX;
     }
diff --git a/model.h b/model.h
index 69136431e..95bbf1da2 100644
--- a/model.h
+++ b/model.h
@@ -27,12 +27,12 @@ enum SDVersion {
     VERSION_SVD,
     VERSION_SD3,
     VERSION_FLUX,
-    VERSION_FLUX_INPAINT,
+    VERSION_FLUX_FILL,
     VERSION_COUNT,
 };
 
 static inline bool sd_version_is_flux(SDVersion version) {
-    if (version == VERSION_FLUX || version == VERSION_FLUX_INPAINT) {
+    if (version == VERSION_FLUX || version == VERSION_FLUX_FILL) {
         return true;
     }
     return false;
@@ -67,7 +67,7 @@ static inline bool sd_version_is_sdxl(SDVersion version) {
 }
 
 static inline bool sd_version_is_inpaint(SDVersion version) {
-    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_INPAINT) {
+    if (version == VERSION_SD1_INPAINT || version == VERSION_SD2_INPAINT || version == VERSION_SDXL_INPAINT || version == VERSION_FLUX_FILL) {
         return true;
     }
     return false;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 31751eb51..26772f85e 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -334,10 +334,6 @@ class StableDiffusionGGML {
             } else if (sd_version_is_flux(version)) {
                 cond_stage_model = std::make_shared<FluxCLIPEmbedder>(clip_backend, model_loader.tensor_storages_types);
                 diffusion_model  = std::make_shared<FluxModel>(backend, model_loader.tensor_storages_types, version, diffusion_flash_attn);
-            } else if (version == VERSION_LTXV) {
-                // TODO: cond for T5 only
-                cond_stage_model = std::make_shared<SimpleT5Embedder>(clip_backend, model_loader.tensor_storages_types);
-                diffusion_model  = std::make_shared<LTXModel>(backend, model_loader.tensor_storages_types, diffusion_flash_attn);
             } else {
                 if (id_embeddings_path.find("v2") != std::string::npos) {
                     cond_stage_model = std::make_shared<FrozenCLIPEmbedderWithCustomWords>(clip_backend, model_loader.tensor_storages_types, embeddings_path, version, PM_VERSION_2);
@@ -798,6 +794,7 @@ class StableDiffusionGGML {
                         float skip_layer_start       = 0.01,
                         float skip_layer_end         = 0.2,
                         ggml_tensor* noise_mask      = nullptr) {
+        LOG_DEBUG("Sample");
         struct ggml_init_params params;
         size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
         for (int i = 1; i < 4; i++) {
@@ -1394,13 +1391,27 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     ggml_tensor* noise_mask = nullptr;
     if (sd_version_is_inpaint(sd_ctx->sd->version)) {
         if (masked_image == NULL) {
+            int64_t mask_channels = 1;
+            if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
+                mask_channels = 8 * 8;  // flatten the whole mask
+            }
             // no mask, set the whole image as masked
-            masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], init_latent->ne[2] + 1, 1);
+            masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, init_latent->ne[0], init_latent->ne[1], mask_channels + init_latent->ne[2], 1);
             for (int64_t x = 0; x < masked_image->ne[0]; x++) {
                 for (int64_t y = 0; y < masked_image->ne[1]; y++) {
-                    ggml_tensor_set_f32(masked_image, 1, x, y, 0);
-                    for (int64_t c = 1; c < masked_image->ne[2]; c++) {
-                        ggml_tensor_set_f32(masked_image, 0, x, y, c);
+                    if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
+                        // TODO: this might be wrong
+                        for (int64_t c = 0; c < init_latent->ne[2]; c++) {
+                            ggml_tensor_set_f32(masked_image, 0, x, y, c);
+                        }
+                        for (int64_t c = init_latent->ne[2]; c < masked_image->ne[2]; c++) {
+                            ggml_tensor_set_f32(masked_image, 1, x, y, c);
+                        }
+                    } else {
+                        ggml_tensor_set_f32(masked_image, 1, x, y, 0);
+                        for (int64_t c = 1; c < masked_image->ne[2]; c++) {
+                            ggml_tensor_set_f32(masked_image, 0, x, y, c);
+                        }
                     }
                 }
             }
@@ -1676,6 +1687,10 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
     ggml_tensor* masked_image;
 
     if (sd_version_is_inpaint(sd_ctx->sd->version)) {
+        int64_t mask_channels = 1;
+        if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
+            mask_channels = 8 * 8;  // flatten the whole mask
+        }
         ggml_tensor* masked_img = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, width, height, 3, 1);
         sd_apply_mask(init_img, mask_img, masked_img);
         ggml_tensor* masked_image_0 = NULL;
@@ -1685,17 +1700,33 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
         } else {
             masked_image_0 = sd_ctx->sd->encode_first_stage(work_ctx, masked_img);
         }
-        masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], masked_image_0->ne[2] + 1, 1);
+        masked_image = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, masked_image_0->ne[0], masked_image_0->ne[1], mask_channels + masked_image_0->ne[2], 1);
         for (int ix = 0; ix < masked_image_0->ne[0]; ix++) {
             for (int iy = 0; iy < masked_image_0->ne[1]; iy++) {
-                for (int k = 0; k < masked_image_0->ne[2]; k++) {
-                    float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
-                    ggml_tensor_set_f32(masked_image, v, ix, iy, k + 1);
+                int mx = ix * 8;
+                int my = iy * 8;
+                if (sd_ctx->sd->version == VERSION_FLUX_FILL) {
+                    for (int k = 0; k < masked_image_0->ne[2]; k++) {
+                        float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
+                        ggml_tensor_set_f32(masked_image, v, ix, iy, k);
+                    }
+                    // "Encode" 8x8 mask chunks into a flattened 1x64 vector, and concatenate to masked image
+                    for (int x = 0; x < 8; x++) {
+                        for (int y = 0; y < 8; y++) {
+                            float m = ggml_tensor_get_f32(mask_img, mx + x, my + y);
+                            // TODO: check if the way the mask is flattened is correct (is it supposed to be x*8+y or x+8*y?)
+                            // python code was using "b (h 8) (w 8) -> b (8 8) h w"
+                            ggml_tensor_set_f32(masked_image, m, ix, iy, masked_image_0->ne[2] + x * 8 + y);
+                        }
+                    }
+                } else {
+                    float m = ggml_tensor_get_f32(mask_img, mx, my);
+                    ggml_tensor_set_f32(masked_image, m, ix, iy, 0);
+                    for (int k = 0; k < masked_image_0->ne[2]; k++) {
+                        float v = ggml_tensor_get_f32(masked_image_0, ix, iy, k);
+                        ggml_tensor_set_f32(masked_image, v, ix, iy, k + mask_channels);
+                    }
                 }
-                int mx  = ix * 8;
-                int my  = iy * 8;
-                float m = ggml_tensor_get_f32(mask_img, mx, my);
-                ggml_tensor_set_f32(masked_image, m, ix, iy, 0);
             }
         }
     } else {

From 17b4fc5054fefbeea4af08aa671f1612019d0fb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephdham@gmail.com>
Date: Thu, 26 Dec 2024 19:57:11 +0100
Subject: [PATCH 18/19] fix mistake in sd2 mode check

---
 stable-diffusion.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 26772f85e..6e6d4f628 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -606,7 +606,7 @@ class StableDiffusionGGML {
         ggml_set_f32(timesteps, 999);
 
         struct ggml_tensor* concat = is_inpaint ? ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, 8, 8, 5, 1) : NULL;
-        ggml_set_f32(timesteps, 0);
+        if (concat != NULL) { ggml_set_f32(concat, 0); }  // concat is NULL unless is_inpaint; never pass NULL to ggml_set_f32
 
         int64_t t0              = ggml_time_ms();
         struct ggml_tensor* out = ggml_dup_tensor(work_ctx, x_t);

From fe35689e3fc681657a81fac4d473e8e2c41aabf5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= <stephdham@gmail.com>
Date: Fri, 27 Dec 2024 00:21:13 +0100
Subject: [PATCH 19/19] more specific sdxl fingerprint

---
 model.cpp | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/model.cpp b/model.cpp
index 767a8b822..dae1e0d56 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1461,7 +1461,10 @@ SDVersion ModelLoader::get_sd_version() {
     TensorStorage token_embedding_weight, input_block_weight;
     bool input_block_checked = false;
 
-    bool is_xl   = false;
+    bool has_multiple_encoders   = false;
+    bool is_unet = false;
+
+    bool is_xl = false;
     bool is_flux = false;
 
 #define found_family (is_xl || is_flux)
@@ -1476,10 +1479,22 @@ SDVersion ModelLoader::get_sd_version() {
             if (tensor_storage.name.find("model.diffusion_model.joint_blocks.") != std::string::npos) {
                 return VERSION_SD3;
             }
+            if (tensor_storage.name.find("model.diffusion_model.input_blocks.") != std::string::npos) {
+                is_unet = true;
+                if(has_multiple_encoders){
+                    is_xl = true;
+                    if (input_block_checked) {
+                        break;
+                    }
+                }
+            }
             if (tensor_storage.name.find("conditioner.embedders.1") != std::string::npos || tensor_storage.name.find("cond_stage_model.1") != std::string::npos) {
-                is_xl = true;
-                if (input_block_checked) {
-                    break;
+                has_multiple_encoders = true;
+                if(is_unet){
+                    is_xl = true;
+                    if (input_block_checked) {
+                        break;
+                    }
                 }
             }
             if (tensor_storage.name.find("model.diffusion_model.input_blocks.8.0.time_mixer.mix_factor") != std::string::npos) {