From 19d40d0a4c406196e9876e80abbcac6c35825025 Mon Sep 17 00:00:00 2001 From: rmatif <66360289+rmatif@users.noreply.github.com> Date: Sun, 13 Apr 2025 03:43:40 +0000 Subject: [PATCH 1/9] fix tensors name --- model.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/model.cpp b/model.cpp index 24da39f6d..30e5f1bcf 100644 --- a/model.cpp +++ b/model.cpp @@ -175,6 +175,12 @@ std::unordered_map pmid_v2_name_map = { }; std::string convert_open_clip_to_hf_clip(const std::string& name) { + // Specific fix for ComfyUI-style SDXL CLIP-G text projection name + // Check this *before* any other modifications + if (name == "conditioner.embedders.1.model.text_projection.weight") { + return "cond_stage_model.1.transformer.text_model.text_projection"; + } + std::string new_name = name; std::string prefix; if (starts_with(new_name, "conditioner.embedders.0.open_clip.")) { @@ -185,7 +191,7 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "conditioner.embedders.1.")) { prefix = "cond_stage_model.1."; - new_name = new_name.substr(strlen("conditioner.embedders.0.")); + new_name = new_name.substr(strlen("conditioner.embedders.1.")); // Fix bug: use correct length for prefix 1 } else if (starts_with(new_name, "cond_stage_model.")) { prefix = "cond_stage_model."; new_name = new_name.substr(strlen("cond_stage_model.")); From 205164dba818292ea5cd00fa545964a67662bfc1 Mon Sep 17 00:00:00 2001 From: rmatif <66360289+rmatif@users.noreply.github.com> Date: Sun, 13 Apr 2025 04:52:09 +0000 Subject: [PATCH 2/9] implement timestep shift first attempt --- .gitignore | 3 ++- denoiser.hpp | 3 ++- examples/cli/main.cpp | 20 +++++++++++++++++--- stable-diffusion.cpp | 33 +++++++++++++++++++++++++-------- stable-diffusion.h | 7 +++++-- 5 files changed, 51 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 38fe570df..7986ce6b4 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ test/ *.bin *.exe *.gguf +*.pdf output*.png models* -*.log \ No newline at end of file +*.log diff --git a/denoiser.hpp b/denoiser.hpp index 66799109d..880dc22ff 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -971,7 +971,8 @@ static void sample_k_diffusion(sample_method_t method, d_cur = ggml_dup_tensor(work_ctx, x_next); } } break; - case LCM: // Latent Consistency Models + case LCM: // Latent Consistency Models + case TIMESTEP_SHIFT_LCM: // Timestep Shift LCM (uses same core logic as LCM here) { struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index af6b2bbdb..626e37544 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -41,6 +41,7 @@ const char* sample_method_str[] = { "lcm", "ddim_trailing", "tcd", + "timestep_shift_lcm", }; // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h @@ -101,6 +102,7 @@ struct SDParams { int width = 512; int height = 512; int batch_count = 1; + int shifted_timestep = -1; // for timestep_shift_lcm int video_frames = 6; int motion_bucket_id = 127; @@ -178,6 +180,9 @@ void print_params(SDParams params) { printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? 
"true" : "false"); printf(" upscale_repeats: %d\n", params.upscale_repeats); + if (params.shifted_timestep > 0) { + printf(" shifted_timestep: %d\n", params.shifted_timestep); + } } void print_usage(int argc, const char* argv[]) { @@ -226,7 +231,7 @@ void print_usage(int argc, const char* argv[]) { printf(" 1.0 corresponds to full destruction of information in init image\n"); printf(" -H, --height H image height, in pixel space (default: 512)\n"); printf(" -W, --width W image width, in pixel space (default: 512)\n"); - printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n"); + printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, timestep_shift_lcm}\n"); printf(" sampling method (default: \"euler_a\")\n"); printf(" --steps STEPS number of sample steps (default: 20)\n"); printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); @@ -244,6 +249,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color Colors the logging tags according to level\n"); + printf(" --shifted-timestep N Timestep shift value for timestep_shift_lcm sampler (default: -1, disabled)\n"); printf(" -v, --verbose print extra info\n"); } @@ -629,6 +635,12 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.skip_layer_end = std::stof(argv[i]); + } else if (arg == "--shifted-timestep") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.shifted_timestep = std::stoi(argv[i]); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv); @@ -967,7 +979,8 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end); + params.skip_layer_end, + params.shifted_timestep); } else { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, @@ -1036,7 +1049,8 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end); + params.skip_layer_end, + params.shifted_timestep); } } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index e38a6101f..39864b8a5 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -804,7 +804,8 @@ class StableDiffusionGGML { float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr) { + ggml_tensor* noise_mask = nullptr, + int shifted_timestep = -1) { LOG_DEBUG("Sample"); struct ggml_init_params params; size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); @@ -860,7 +861,16 @@ class StableDiffusionGGML { float c_in = scaling[2]; float t = denoiser->sigma_to_t(sigma); - std::vector timesteps_vec(x->ne[3], t); // [N, ] + float t_for_model = t; + if (method == TIMESTEP_SHIFT_LCM && shifted_timestep > 0) { + // Apply timestep shift: t_shifted = t * shifted_timestep / TIMESTEPS + // TIMESTEPS is defined in denoiser.hpp as 1000 + t_for_model = t * (float)shifted_timestep / (float)TIMESTEPS; + // Ensure t_for_model stays within valid range [0, TIMESTEPS-1] + t_for_model = std::max(0.f, std::min(t_for_model, (float)TIMESTEPS - 1.f)); + LOG_DEBUG("Timestep Shift: original t=%.2f, shifted t=%.2f (shifted_timestep=%d)", t, t_for_model, shifted_timestep); + } + std::vector 
timesteps_vec(x->ne[3], t_for_model); // Use t_for_model for the diffusion model call auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); std::vector guidance_vec(x->ne[3], guidance); auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); @@ -1213,7 +1223,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL) { + ggml_tensor* masked_image = NULL, + int shifted_timestep = -1) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1470,7 +1481,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - noise_mask); + noise_mask, + shifted_timestep); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1543,7 +1555,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, size_t skip_layers_count = 0, float slg_scale = 0, float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { + float skip_layer_end = 0.2, + int shifted_timestep = -1) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1621,7 +1634,9 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, skip_layers_vec, slg_scale, skip_layer_start, - skip_layer_end); + skip_layer_end, + NULL, // masked_image is NULL for txt2img + shifted_timestep); size_t t1 = ggml_time_ms(); @@ -1655,7 +1670,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, size_t skip_layers_count = 0, float slg_scale = 0, float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { + float skip_layer_end = 0.2, + int shifted_timestep = -1) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1802,7 +1818,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - masked_image); + masked_image, // Pass the actual masked_image for img2img + shifted_timestep); size_t t2 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index 52dcc848a..cb3c0bafb 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -46,6 +46,7 @@ enum sample_method_t { LCM, DDIM_TRAILING, TCD, + TIMESTEP_SHIFT_LCM, N_SAMPLE_METHODS }; @@ -176,7 +177,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, size_t skip_layers_count, float slg_scale, float skip_layer_start, - float skip_layer_end); + float skip_layer_end, + int shifted_timestep); SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, @@ -203,7 +205,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, size_t skip_layers_count, float slg_scale, float skip_layer_start, - float skip_layer_end); + float skip_layer_end, + int shifted_timestep); SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, sd_image_t init_image, From a67e93f893f96dd27c4c9cea52648665cec479c9 Mon Sep 17 00:00:00 2001 From: rmatif Date: Wed, 30 Apr 2025 03:54:26 +0000 Subject: [PATCH 3/9] timestep-shift fix, works in 1 step --- denoiser.hpp | 3 +- examples/cli/main.cpp | 43 +++++------ model.cpp | 17 +++-- stable-diffusion.cpp | 172 ++++++++++++++++++++++++++++++++---------- stable-diffusion.h | 1 - 5 files changed, 163 insertions(+), 73 deletions(-) diff --git a/denoiser.hpp b/denoiser.hpp index 880dc22ff..66799109d 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -971,8 +971,7 @@ static void 
sample_k_diffusion(sample_method_t method, d_cur = ggml_dup_tensor(work_ctx, x_next); } } break; - case LCM: // Latent Consistency Models - case TIMESTEP_SHIFT_LCM: // Timestep Shift LCM (uses same core logic as LCM here) + case LCM: // Latent Consistency Models { struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 626e37544..5988a7423 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -41,7 +41,6 @@ const char* sample_method_str[] = { "lcm", "ddim_trailing", "tcd", - "timestep_shift_lcm", }; // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h @@ -102,7 +101,6 @@ struct SDParams { int width = 512; int height = 512; int batch_count = 1; - int shifted_timestep = -1; // for timestep_shift_lcm int video_frames = 6; int motion_bucket_id = 127; @@ -128,9 +126,10 @@ struct SDParams { int upscale_repeats = 1; std::vector skip_layers = {7, 8, 9}; - float slg_scale = 0.f; + float slg_scale = 0.f; // Removed duplicate line float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; + int shifted_timestep = -1; // Keep the added parameter from previous step }; void print_params(SDParams params) { @@ -180,9 +179,7 @@ void print_params(SDParams params) { printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false"); printf(" upscale_repeats: %d\n", params.upscale_repeats); - if (params.shifted_timestep > 0) { - printf(" shifted_timestep: %d\n", params.shifted_timestep); - } + printf(" timestep_shift: %d\n", params.shifted_timestep); } void print_usage(int argc, const char* argv[]) { @@ -231,7 +228,7 @@ void print_usage(int argc, const char* argv[]) { printf(" 1.0 corresponds to full destruction of information in init image\n"); printf(" -H, --height H image height, in pixel space (default: 512)\n"); printf(" -W, --width W image width, in pixel space (default: 512)\n"); - printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, timestep_shift_lcm}\n"); + printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n"); printf(" sampling method (default: \"euler_a\")\n"); printf(" --steps STEPS number of sample steps (default: 20)\n"); printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); @@ -249,7 +246,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color Colors the logging tags according to level\n"); - printf(" --shifted-timestep N Timestep shift value for timestep_shift_lcm sampler (default: -1, disabled)\n"); + printf(" --timestep-shift N shift timestep for SDXL models (NitroFusion paper, default: -1 off, N between 1 and 1000)\n"); printf(" -v, --verbose print extra info\n"); } @@ -635,12 +632,16 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.skip_layer_end = std::stof(argv[i]); - } else if (arg == "--shifted-timestep") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.shifted_timestep = std::stoi(argv[i]); + } else if (arg == "--timestep-shift") { // Added block + if (++i >= argc) { + invalid_arg = true; + break; + } + params.shifted_timestep = std::stoi(argv[i]); + if (params.shifted_timestep != -1 && 
(params.shifted_timestep < 1 || params.shifted_timestep > 1000)) { + fprintf(stderr, "error: timestep-shift must be between 1 and 1000, or -1 to disable\n"); + exit(1); + } } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv); @@ -980,10 +981,10 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end, - params.shifted_timestep); - } else { - sd_image_t input_image = {(uint32_t)params.width, - (uint32_t)params.height, + params.shifted_timestep); // Passed parameter + } else { + sd_image_t input_image = {(uint32_t)params.width, + (uint32_t)params.height, 3, input_image_buffer}; @@ -1050,9 +1051,9 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end, - params.shifted_timestep); - } - } + params.shifted_timestep); // Passed parameter + } + } if (results == NULL) { printf("generate failed\n"); diff --git a/model.cpp b/model.cpp index 30e5f1bcf..0be264704 100644 --- a/model.cpp +++ b/model.cpp @@ -175,12 +175,6 @@ std::unordered_map pmid_v2_name_map = { }; std::string convert_open_clip_to_hf_clip(const std::string& name) { - // Specific fix for ComfyUI-style SDXL CLIP-G text projection name - // Check this *before* any other modifications - if (name == "conditioner.embedders.1.model.text_projection.weight") { - return "cond_stage_model.1.transformer.text_model.text_projection"; - } - std::string new_name = name; std::string prefix; if (starts_with(new_name, "conditioner.embedders.0.open_clip.")) { @@ -191,7 +185,8 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "conditioner.embedders.1.")) { prefix = "cond_stage_model.1."; - new_name = new_name.substr(strlen("conditioner.embedders.1.")); // Fix bug: use correct length for prefix 1 + // Corrected the substring length to match the prefix being checked + new_name = new_name.substr(strlen("conditioner.embedders.1.")); } else if (starts_with(new_name, "cond_stage_model.")) { prefix = "cond_stage_model."; new_name = new_name.substr(strlen("cond_stage_model.")); @@ -199,6 +194,8 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { prefix = new_name.substr(0, new_name.size() - strlen("vision_model.visual_projection.weight")); new_name = prefix + "visual_projection.weight"; return new_name; + // This specific case seems less common or might be handled implicitly later, + // but we keep the original logic for now. If issues arise, review if this mapping is needed. } else if (ends_with(new_name, "transformer.text_projection.weight")) { prefix = new_name.substr(0, new_name.size() - strlen("transformer.text_projection.weight")); new_name = prefix + "transformer.text_model.text_projection"; @@ -207,9 +204,13 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { return new_name; } - if (open_clip_to_hf_clip_model.find(new_name) != open_clip_to_hf_clip_model.end()) { + // Specific handling for text_projection variants before generic map lookup + if (new_name == "model.text_projection.weight" || new_name == "model.text_projection") { + new_name = "transformer.text_model.text_projection"; + } else if (open_clip_to_hf_clip_model.count(new_name)) { // Use .count() for safety new_name = open_clip_to_hf_clip_model[new_name]; } + // Note: The specific handling above takes precedence over the map for this tensor. 
std::string open_clip_resblock_prefix = "model.transformer.resblocks."; std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 39864b8a5..f1e6fb72c 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -804,9 +804,13 @@ class StableDiffusionGGML { float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr, - int shifted_timestep = -1) { + int shifted_timestep = -1, // Added parameter + ggml_tensor* noise_mask = nullptr) { LOG_DEBUG("Sample"); + if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) { + LOG_WARN("Timestep shifting is only supported for SDXL models. Ignoring --timestep-shift."); + shifted_timestep = -1; + } struct ggml_init_params params; size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); for (int i = 1; i < 4; i++) { @@ -848,7 +852,27 @@ class StableDiffusionGGML { } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { + // Capture necessary variables for the denoising step lambda + auto denoise = [this, // Capture 'this' to access members like version, rng, n_threads etc. + &work_ctx, // Context for tensor creation + denoiser = this->denoiser, // Capture shared_ptr by value (copies the pointer) + steps, // Capture steps by value + &x, // Capture latent tensor by reference (modified in loop) + &sigmas, // Capture sigmas vector by reference (read-only needed) + &cond, &uncond, &id_cond, // Capture conditions by reference + control_hint, // Capture control hint pointer by value + control_strength, min_cfg, cfg_scale, guidance, eta, // Capture floats by value + start_merge_step, // Capture int by value + &skip_layers, slg_scale, skip_layer_start, skip_layer_end, // Capture skip layer params + shifted_timestep, // Capture shifted timestep by value + &noise_mask, // Capture noise mask pointer by reference/value + &noise, // Capture noise tensor by reference (modified in loop) + &init_latent, // Capture initial latent tensor by reference (read-only for mask) + &denoised, // Capture output denoised tensor by reference + &noised_input, &out_cond, &out_uncond, &out_skip, // Capture intermediate tensors by reference + has_unconditioned, has_skiplayer // Capture bools by value + ] + (ggml_tensor* input, float sigma, int step) -> ggml_tensor* { if (step == 1) { pretty_progress(0, (int)steps, 0); } @@ -861,22 +885,32 @@ class StableDiffusionGGML { float c_in = scaling[2]; float t = denoiser->sigma_to_t(sigma); - float t_for_model = t; - if (method == TIMESTEP_SHIFT_LCM && shifted_timestep > 0) { - // Apply timestep shift: t_shifted = t * shifted_timestep / TIMESTEPS - // TIMESTEPS is defined in denoiser.hpp as 1000 - t_for_model = t * (float)shifted_timestep / (float)TIMESTEPS; - // Ensure t_for_model stays within valid range [0, TIMESTEPS-1] - t_for_model = std::max(0.f, std::min(t_for_model, (float)TIMESTEPS - 1.f)); - LOG_DEBUG("Timestep Shift: original t=%.2f, shifted t=%.2f (shifted_timestep=%d)", t, t_for_model, shifted_timestep); + std::vector timesteps_vec; + // --- Timestep Shifting Logic --- + if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { + // Calculate the shifted timestep value based on the original t + // Python: (original_index * (shifted_timestep / total_timesteps)).long() + // Assuming TIMESTEPS (1000) is the total_timesteps for SDXL + float shifted_t_float = t * (float(shifted_timestep) / 
float(TIMESTEPS));
+                // Clamp and round to nearest integer timestep index
+                int64_t shifted_t = static_cast<int64_t>(roundf(shifted_t_float));
+                shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t));
+                LOG_DEBUG("Shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma);
+                timesteps_vec.assign(x->ne[3], (float)shifted_t);
+            } else {
+                // Use original timestep if shifting is disabled or model is not SDXL
+                timesteps_vec.assign(x->ne[3], t);
             }
-            std::vector<float> timesteps_vec(x->ne[3], t_for_model); // Use t_for_model for the diffusion model call
+            // --- End Timestep Shifting Logic ---
+
         auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
         std::vector<float> guidance_vec(x->ne[3], guidance);
         auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec);
         copy_ggml_tensor(noised_input, input);
         // noised_input = noised_input * c_in
+        // NOTE: c_in is calculated based on the *original* sigma, which seems
+        // consistent with how the Python code uses xc derived from original sigma.
         ggml_tensor_scale(noised_input, c_in);
         std::vector<struct ggml_tensor*> controls;
@@ -960,26 +994,83 @@ class StableDiffusionGGML {
             float* vec_denoised = (float*)denoised->data;
             float* vec_input = (float*)input->data;
             float* positive_data = (float*)out_cond->data;
+            float* negative_data_ptr = has_unconditioned ? (float*)out_uncond->data : nullptr; // Get pointer if needed
+            float* skip_layer_data_ptr = is_skiplayer_step ? (float*)out_skip->data : nullptr; // Get pointer if needed
             int ne_elements = (int)ggml_nelements(denoised);
-            for (int i = 0; i < ne_elements; i++) {
-                float latent_result = positive_data[i];
-                if (has_unconditioned) {
-                    // out_uncond + cfg_scale * (out_cond - out_uncond)
-                    int64_t ne3 = out_cond->ne[3];
-                    if (min_cfg != cfg_scale && ne3 != 1) {
-                        int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2];
-                        float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3);
-                    } else {
-                        latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
+
+            // --- Select Calculation Path based on Timestep Shifting ---
+            if (shifted_timestep > 0 && sd_version_is_sdxl(version)) {
+                // --- Shifted Timestep Final Calculation ---
+
+                // Retrieve the integer shifted timestep calculated earlier
+                // Assuming shifted_t is the float representation of the index
+                int64_t shifted_t_idx = static_cast<int64_t>(roundf(timesteps_vec[0])); // Get the index back
+
+                float shifted_sigma = denoiser->t_to_sigma((float)shifted_t_idx);
+                std::vector<float> shifted_scaling = denoiser->get_scalings(shifted_sigma);
+                float shifted_c_skip = shifted_scaling[0];
+                float shifted_c_out = shifted_scaling[1];
+                // shifted_c_in is scaling[2] if needed, but we adjust input instead
+
+                // Need sigma_data from the denoiser (assuming CompVis type for SDXL)
+                auto compvis_denoiser_ptr = std::dynamic_pointer_cast<CompVisDenoiser>(denoiser);
+                float sigma_data = compvis_denoiser_ptr ? compvis_denoiser_ptr->sigma_data : 1.0f; // Default needed? SDXL uses CompVis.
+ + float sigma_sq = sigma * sigma; // Original sigma for this step + float shifted_sigma_sq = shifted_sigma * shifted_sigma; + float sigma_data_sq = sigma_data * sigma_data; + + // Calculate the scaling factor needed to adjust the input `x` (vec_input) + // Equivalent to Python: sqrt(denoised_sigma^2 + sigma_data^2) / sqrt(sigma^2 + sigma_data^2) + float input_scale_factor = sqrtf((shifted_sigma_sq + sigma_data_sq) / (sigma_sq + sigma_data_sq)); + + LOG_DEBUG("Shifted calc [Step %d]: sigma=%.4f, shifted_t_idx=%ld, shifted_sigma=%.4f, input_scale=%.4f, shifted_c_skip=%.4f, shifted_c_out=%.4f", + step, sigma, shifted_t_idx, shifted_sigma, input_scale_factor, shifted_c_skip, shifted_c_out); + + for (int i = 0; i < ne_elements; i++) { + // CFG and SLG apply to the raw model output *before* the final scaling + float model_output_result = positive_data[i]; // Start with positive prediction + if (has_unconditioned) { + // Apply CFG scale: uncond + cfg_scale * (cond - uncond) + model_output_result = negative_data_ptr[i] + cfg_scale * (positive_data[i] - negative_data_ptr[i]); + // TODO: Add min_cfg logic if necessary } + if (is_skiplayer_step) { + // Apply SLG: result + slg_scale * (cond - skip) + model_output_result = model_output_result + slg_scale * (positive_data[i] - skip_layer_data_ptr[i]); + } + + // Recalculate input term based on Python logic: x_recalc = x * input_scale_factor + float adjusted_input = vec_input[i] * input_scale_factor; + + // Final calculation using shifted sigma scales and adjusted input + // Equivalent to Python: calculate_denoised(shifted_sigma, model_output_result, adjusted_input) + // denoised = adjusted_input * shifted_c_skip + model_output_result * shifted_c_out; + vec_denoised[i] = adjusted_input * shifted_c_skip + model_output_result * shifted_c_out; } - if (is_skiplayer_step) { - latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; - } - // v = latent_result, eps = latent_result - // denoised = (v * c_out + input * c_skip) or (input + eps * c_out) - vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; + + } else { + // --- Original Final Calculation --- + LOG_DEBUG("Original calc [Step %d]: sigma=%.4f, c_skip=%.4f, c_out=%.4f", step, sigma, c_skip, c_out); + for (int i = 0; i < ne_elements; i++) { + // CFG and SLG apply to the raw model output *before* the final scaling + float model_output_result = positive_data[i]; // Start with positive prediction + if (has_unconditioned) { + // Apply CFG scale: uncond + cfg_scale * (cond - uncond) + model_output_result = negative_data_ptr[i] + cfg_scale * (positive_data[i] - negative_data_ptr[i]); + // TODO: Add min_cfg logic if necessary + } + if (is_skiplayer_step) { + // Apply SLG: result + slg_scale * (cond - skip) + model_output_result = model_output_result + slg_scale * (positive_data[i] - skip_layer_data_ptr[i]); + } + + // Original calculation: denoised = input * c_skip + model_output * c_out; + vec_denoised[i] = vec_input[i] * c_skip + model_output_result * c_out; + } } + // --- End Calculation Path Selection --- + int64_t t1 = ggml_time_us(); if (step > 0) { pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); @@ -1002,7 +1093,7 @@ class StableDiffusionGGML { }; sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta); - + // No changes needed for inverse_noise_scaling as it depends on the final sigma, not the intermediate timesteps used. 
x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); if (control_net) { @@ -1223,8 +1314,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL, - int shifted_timestep = -1) { + int shifted_timestep = -1, // Added parameter + ggml_tensor* masked_image = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1481,8 +1572,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - noise_mask, - shifted_timestep); + shifted_timestep, // Passed parameter + noise_mask); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1556,7 +1647,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - int shifted_timestep = -1) { + int shifted_timestep = -1) { // Added parameter to definition std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1635,8 +1726,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - NULL, // masked_image is NULL for txt2img - shifted_timestep); + shifted_timestep); // Passed parameter size_t t1 = ggml_time_ms(); @@ -1671,9 +1761,9 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - int shifted_timestep = -1) { - std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); - LOG_DEBUG("img2img %dx%d", width, height); + int shifted_timestep = -1) { // Added parameter +std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); +LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { return NULL; } @@ -1818,8 +1908,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - masked_image, // Pass the actual masked_image for img2img - shifted_timestep); + shifted_timestep, // Passed parameter + masked_image); size_t t2 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index cb3c0bafb..89e6ade5b 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -46,7 +46,6 @@ enum sample_method_t { LCM, DDIM_TRAILING, TCD, - TIMESTEP_SHIFT_LCM, N_SAMPLE_METHODS }; From 6e4b5e46d8550f5bdf760dfb8d65f1a4684a220b Mon Sep 17 00:00:00 2001 From: rmatif Date: Wed, 7 May 2025 08:01:40 +0000 Subject: [PATCH 4/9] add sgm_uniform --- denoiser.hpp | 65 +++++++++++++++++++++++++++++++++++++++++-- examples/cli/main.cpp | 10 ++++--- stable-diffusion.cpp | 5 ++++ stable-diffusion.h | 1 + 4 files changed, 75 insertions(+), 6 deletions(-) diff --git a/denoiser.hpp b/denoiser.hpp index 66799109d..bf00568a6 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -235,6 +235,31 @@ struct GITSSchedule : SigmaSchedule { } }; +struct SGMUniformSchedule : SigmaSchedule { + std::vector get_sigmas(uint32_t n, float sigma_min_in, float sigma_max_in, t_to_sigma_t t_to_sigma_func) override { + // This schedule's core logic is now handled directly in Denoiser::get_sigmas + // to ensure correct access to both sigma_to_t and t_to_sigma. + // This method is overridden to fulfill the virtual contract but ideally should not be + // the primary execution path for SGMUniform when called from Denoiser::get_sigmas. 
+ // If it IS called, it means the Denoiser::get_sigmas logic wasn't triggered, which is unexpected. + LOG_WARN("SGMUniformSchedule::get_sigmas was called directly. This might indicate an issue with Denoiser dispatch."); + // Provide a default (potentially incorrect for SGMUniform's intent) or empty schedule to avoid crashes. + // For safety, returning a simple discrete-like schedule in t-space if this is ever hit. + std::vector result; + if (n == 0) { + result.push_back(0.0f); + return result; + } + result.reserve(n + 1); + int t_max = TIMESTEPS -1; // A common max t value + float step = static_cast(t_max) / static_cast(n > 1 ? (n -1) : 1) ; + for(uint32_t i=0; i get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { // These *COULD* be function arguments here, @@ -265,8 +290,44 @@ struct Denoiser { virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) = 0; virtual std::vector get_sigmas(uint32_t n) { - auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1); - return schedule->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma); + // Check if the current schedule is SGMUniformSchedule + if (std::dynamic_pointer_cast(schedule)) { + LOG_DEBUG("Denoiser::get_sigmas - Using SGM_UNIFORM specific logic"); + std::vector sigs; + sigs.reserve(n + 1); + + if (n == 0) { + sigs.push_back(0.0f); + return sigs; + } + + // Use the Denoiser's own sigma_to_t and t_to_sigma methods + float start_t_val = this->sigma_to_t(this->sigma_max()); + float end_t_val = this->sigma_to_t(this->sigma_min()); + + // Python: torch.linspace(start, end, n + 1)[:-1] + // This creates n points. The k-th point (0-indexed) is start_t_val + k * (end_t_val - start_t_val) / n. + float dt_per_step; + if (n > 0) { // Avoid division by zero if n=0, though covered by earlier check + dt_per_step = (end_t_val - start_t_val) / static_cast(n); + } else { + dt_per_step = 0.0f; + } + + + for (uint32_t i = 0; i < n; ++i) { + float current_t = start_t_val + static_cast(i) * dt_per_step; + sigs.push_back(this->t_to_sigma(current_t)); + } + + sigs.push_back(0.0f); // Append the final zero sigma + return sigs; + + } else { // For all other schedules, use the existing virtual dispatch + LOG_DEBUG("Denoiser::get_sigmas - Using general schedule dispatch for %s", typeid(*schedule.get()).name()); + auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1); + return schedule->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma); + } } }; diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 5988a7423..e09516994 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -51,6 +51,7 @@ const char* schedule_str[] = { "exponential", "ays", "gits", + "sgm_uniform", }; const char* modes_str[] = { @@ -234,7 +235,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); printf(" -b, --batch-count COUNT number of images to generate\n"); - printf(" --schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)\n"); + printf(" --schedule {discrete, karras, exponential, ays, gits, sgm_uniform} Denoiser sigma schedule (default: discrete)\n"); printf(" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"); printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n"); printf(" --vae-tiling 
process vae in tiles to reduce memory usage\n"); @@ -537,14 +538,15 @@ void parse_args(int argc, const char** argv, SDParams& params) { } const char* schedule_selected = argv[i]; int schedule_found = -1; - for (int d = 0; d < N_SCHEDULES; d++) { + // N_SCHEDULES will be updated by the .h change, so this loop limit is fine + for (int d = 0; d < N_SCHEDULES; d++) { if (!strcmp(schedule_selected, schedule_str[d])) { schedule_found = d; } } if (schedule_found == -1) { - invalid_arg = true; - break; + fprintf(stderr, "error: invalid schedule %s, must be one of [discrete, karras, exponential, ays, gits, sgm_uniform]\n", schedule_selected); + exit(1); // Exit directly as invalid_arg only triggers at the end } params.schedule = (schedule_t)schedule_found; } else if (arg == "-s" || arg == "--seed") { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index f1e6fb72c..589494740 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -580,6 +580,11 @@ class StableDiffusionGGML { denoiser->schedule = std::make_shared(); denoiser->schedule->version = version; break; + case SGM_UNIFORM: + LOG_INFO("Running with SGM Uniform schedule"); + denoiser->schedule = std::make_shared(); + denoiser->schedule->version = version; // version might not be used by SGMUniform but good to keep pattern + break; case DEFAULT: // Don't touch anything. break; diff --git a/stable-diffusion.h b/stable-diffusion.h index 89e6ade5b..d01674968 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -56,6 +56,7 @@ enum schedule_t { EXPONENTIAL, AYS, GITS, + SGM_UNIFORM, N_SCHEDULES }; From 6f81e6bc0ce691c9260f1752f7bd9c45b54a8273 Mon Sep 17 00:00:00 2001 From: rmatif Date: Wed, 7 May 2025 18:59:22 +0000 Subject: [PATCH 5/9] add simple schedule --- denoiser.hpp | 42 ++++++++++++++++++++++++++++++++++++++++++ examples/cli/main.cpp | 5 +++-- stable-diffusion.cpp | 5 +++++ stable-diffusion.h | 1 + 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/denoiser.hpp b/denoiser.hpp index bf00568a6..6c6021b9d 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -279,6 +279,48 @@ struct KarrasSchedule : SigmaSchedule { } }; +struct SimpleSchedule : SigmaSchedule { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { + std::vector result_sigmas; + + if (n == 0) { + return result_sigmas; // Return empty for n=0, consistent with DiscreteSchedule + } + + result_sigmas.reserve(n + 1); + + // TIMESTEPS is the length of the model's internal sigmas array, typically 1000. + // t_to_sigma(t) maps a timestep t (0 to TIMESTEPS-1) to its sigma value. + int model_sigmas_len = TIMESTEPS; + + // ss = len(s.sigmas) / steps in Python + float step_factor = static_cast(model_sigmas_len) / static_cast(n); + + for (uint32_t i = 0; i < n; ++i) { + // Python: s.sigmas[-(1 + int(x * ss))] + // x corresponds to i (0 to n-1) + // int(x * ss) in Python is static_cast(static_cast(i) * step_factor) + // The index -(1 + offset) means (model_sigmas_len - 1 - offset) from the start of a 0-indexed array. + int offset_from_start_of_py_array = static_cast(static_cast(i) * step_factor); + int timestep_index = model_sigmas_len - 1 - offset_from_start_of_py_array; + + // Ensure the index is within valid bounds [0, model_sigmas_len - 1] + if (timestep_index < 0) { + timestep_index = 0; + } + // No need for upper bound check like `timestep_index >= model_sigmas_len` because + // max offset is for i=n-1: int((n-1)/n * model_sigmas_len) which is < model_sigmas_len. 
+ // So, model_sigmas_len - 1 - max_offset is >= 0 if model_sigmas_len/n >= 1. + // If n > model_sigmas_len, then model_sigmas_len/n < 1, resulting in timestep_index potentially being <0, + // which is handled by the clamp above. + + result_sigmas.push_back(t_to_sigma(static_cast(timestep_index))); + } + result_sigmas.push_back(0.0f); // Append the final zero sigma + return result_sigmas; + } +}; + struct Denoiser { std::shared_ptr schedule = std::make_shared(); virtual float sigma_min() = 0; diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index e09516994..fc95c70b4 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -52,6 +52,7 @@ const char* schedule_str[] = { "ays", "gits", "sgm_uniform", + "simple", }; const char* modes_str[] = { @@ -235,7 +236,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); printf(" -b, --batch-count COUNT number of images to generate\n"); - printf(" --schedule {discrete, karras, exponential, ays, gits, sgm_uniform} Denoiser sigma schedule (default: discrete)\n"); + printf(" --schedule {discrete, karras, exponential, ays, gits, sgm_uniform, simple} Denoiser sigma schedule (default: discrete)\n"); printf(" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"); printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n"); printf(" --vae-tiling process vae in tiles to reduce memory usage\n"); @@ -545,7 +546,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { } } if (schedule_found == -1) { - fprintf(stderr, "error: invalid schedule %s, must be one of [discrete, karras, exponential, ays, gits, sgm_uniform]\n", schedule_selected); + fprintf(stderr, "error: invalid schedule %s, must be one of [discrete, karras, exponential, ays, gits, sgm_uniform, simple]\n", schedule_selected); exit(1); // Exit directly as invalid_arg only triggers at the end } params.schedule = (schedule_t)schedule_found; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 589494740..6f33b00b2 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -585,6 +585,11 @@ class StableDiffusionGGML { denoiser->schedule = std::make_shared(); denoiser->schedule->version = version; // version might not be used by SGMUniform but good to keep pattern break; + case SIMPLE: + LOG_INFO("Running with Simple schedule"); + denoiser->schedule = std::make_shared(); + denoiser->schedule->version = version; // version might not be used by Simple but good to keep pattern + break; case DEFAULT: // Don't touch anything. 
break; diff --git a/stable-diffusion.h b/stable-diffusion.h index d01674968..e29384d34 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -57,6 +57,7 @@ enum schedule_t { AYS, GITS, SGM_UNIFORM, + SIMPLE, N_SCHEDULES }; From c242dd4e621684306e90d18a3965cb2ca5da3f9b Mon Sep 17 00:00:00 2001 From: rmatif Date: Fri, 9 May 2025 11:11:22 +0000 Subject: [PATCH 6/9] remove debug comments --- denoiser.hpp | 38 ++++--------------- examples/cli/main.cpp | 15 ++++---- model.cpp | 7 +--- stable-diffusion.cpp | 85 ++++++++++++++----------------------------- 4 files changed, 43 insertions(+), 102 deletions(-) diff --git a/denoiser.hpp b/denoiser.hpp index 6c6021b9d..87405fbd4 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -237,21 +237,14 @@ struct GITSSchedule : SigmaSchedule { struct SGMUniformSchedule : SigmaSchedule { std::vector get_sigmas(uint32_t n, float sigma_min_in, float sigma_max_in, t_to_sigma_t t_to_sigma_func) override { - // This schedule's core logic is now handled directly in Denoiser::get_sigmas - // to ensure correct access to both sigma_to_t and t_to_sigma. - // This method is overridden to fulfill the virtual contract but ideally should not be - // the primary execution path for SGMUniform when called from Denoiser::get_sigmas. - // If it IS called, it means the Denoiser::get_sigmas logic wasn't triggered, which is unexpected. - LOG_WARN("SGMUniformSchedule::get_sigmas was called directly. This might indicate an issue with Denoiser dispatch."); - // Provide a default (potentially incorrect for SGMUniform's intent) or empty schedule to avoid crashes. - // For safety, returning a simple discrete-like schedule in t-space if this is ever hit. + std::vector result; if (n == 0) { result.push_back(0.0f); return result; } result.reserve(n + 1); - int t_max = TIMESTEPS -1; // A common max t value + int t_max = TIMESTEPS -1; float step = static_cast(t_max) / static_cast(n > 1 ? (n -1) : 1) ; for(uint32_t i=0; i result_sigmas; if (n == 0) { - return result_sigmas; // Return empty for n=0, consistent with DiscreteSchedule + return result_sigmas; } result_sigmas.reserve(n + 1); - // TIMESTEPS is the length of the model's internal sigmas array, typically 1000. - // t_to_sigma(t) maps a timestep t (0 to TIMESTEPS-1) to its sigma value. int model_sigmas_len = TIMESTEPS; - // ss = len(s.sigmas) / steps in Python float step_factor = static_cast(model_sigmas_len) / static_cast(n); for (uint32_t i = 0; i < n; ++i) { - // Python: s.sigmas[-(1 + int(x * ss))] - // x corresponds to i (0 to n-1) - // int(x * ss) in Python is static_cast(static_cast(i) * step_factor) - // The index -(1 + offset) means (model_sigmas_len - 1 - offset) from the start of a 0-indexed array. + int offset_from_start_of_py_array = static_cast(static_cast(i) * step_factor); int timestep_index = model_sigmas_len - 1 - offset_from_start_of_py_array; - // Ensure the index is within valid bounds [0, model_sigmas_len - 1] if (timestep_index < 0) { timestep_index = 0; } - // No need for upper bound check like `timestep_index >= model_sigmas_len` because - // max offset is for i=n-1: int((n-1)/n * model_sigmas_len) which is < model_sigmas_len. - // So, model_sigmas_len - 1 - max_offset is >= 0 if model_sigmas_len/n >= 1. - // If n > model_sigmas_len, then model_sigmas_len/n < 1, resulting in timestep_index potentially being <0, - // which is handled by the clamp above. 
result_sigmas.push_back(t_to_sigma(static_cast(timestep_index))); } - result_sigmas.push_back(0.0f); // Append the final zero sigma + result_sigmas.push_back(0.0f); return result_sigmas; } }; @@ -334,7 +315,6 @@ struct Denoiser { virtual std::vector get_sigmas(uint32_t n) { // Check if the current schedule is SGMUniformSchedule if (std::dynamic_pointer_cast(schedule)) { - LOG_DEBUG("Denoiser::get_sigmas - Using SGM_UNIFORM specific logic"); std::vector sigs; sigs.reserve(n + 1); @@ -347,26 +327,22 @@ struct Denoiser { float start_t_val = this->sigma_to_t(this->sigma_max()); float end_t_val = this->sigma_to_t(this->sigma_min()); - // Python: torch.linspace(start, end, n + 1)[:-1] - // This creates n points. The k-th point (0-indexed) is start_t_val + k * (end_t_val - start_t_val) / n. float dt_per_step; - if (n > 0) { // Avoid division by zero if n=0, though covered by earlier check + if (n > 0) { dt_per_step = (end_t_val - start_t_val) / static_cast(n); } else { dt_per_step = 0.0f; } - for (uint32_t i = 0; i < n; ++i) { float current_t = start_t_val + static_cast(i) * dt_per_step; sigs.push_back(this->t_to_sigma(current_t)); } - sigs.push_back(0.0f); // Append the final zero sigma + sigs.push_back(0.0f); return sigs; } else { // For all other schedules, use the existing virtual dispatch - LOG_DEBUG("Denoiser::get_sigmas - Using general schedule dispatch for %s", typeid(*schedule.get()).name()); auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1); return schedule->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma); } diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index fc95c70b4..064d8c949 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -128,10 +128,10 @@ struct SDParams { int upscale_repeats = 1; std::vector skip_layers = {7, 8, 9}; - float slg_scale = 0.f; // Removed duplicate line + float slg_scale = 0.f; float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; - int shifted_timestep = -1; // Keep the added parameter from previous step + int shifted_timestep = -1; }; void print_params(SDParams params) { @@ -248,7 +248,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color Colors the logging tags according to level\n"); - printf(" --timestep-shift N shift timestep for SDXL models (NitroFusion paper, default: -1 off, N between 1 and 1000)\n"); + printf(" --timestep-shift N shift timestep for NitroFusion models, default: -1 off, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant\n"); printf(" -v, --verbose print extra info\n"); } @@ -539,7 +539,6 @@ void parse_args(int argc, const char** argv, SDParams& params) { } const char* schedule_selected = argv[i]; int schedule_found = -1; - // N_SCHEDULES will be updated by the .h change, so this loop limit is fine for (int d = 0; d < N_SCHEDULES; d++) { if (!strcmp(schedule_selected, schedule_str[d])) { schedule_found = d; @@ -547,7 +546,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { } if (schedule_found == -1) { fprintf(stderr, "error: invalid schedule %s, must be one of [discrete, karras, exponential, ays, gits, sgm_uniform, simple]\n", schedule_selected); - exit(1); // Exit directly as invalid_arg only triggers at the end + exit(1); } params.schedule = (schedule_t)schedule_found; } else if (arg == "-s" || arg == "--seed") { @@ -635,7 +634,7 @@ void parse_args(int argc, const 
char** argv, SDParams& params) { break; } params.skip_layer_end = std::stof(argv[i]); - } else if (arg == "--timestep-shift") { // Added block + } else if (arg == "--timestep-shift") { if (++i >= argc) { invalid_arg = true; break; @@ -984,7 +983,7 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end, - params.shifted_timestep); // Passed parameter + params.shifted_timestep); } else { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, @@ -1054,7 +1053,7 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end, - params.shifted_timestep); // Passed parameter + params.shifted_timestep); } } diff --git a/model.cpp b/model.cpp index 0be264704..0d0c43fcb 100644 --- a/model.cpp +++ b/model.cpp @@ -185,7 +185,6 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "conditioner.embedders.1.")) { prefix = "cond_stage_model.1."; - // Corrected the substring length to match the prefix being checked new_name = new_name.substr(strlen("conditioner.embedders.1.")); } else if (starts_with(new_name, "cond_stage_model.")) { prefix = "cond_stage_model."; @@ -194,8 +193,6 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { prefix = new_name.substr(0, new_name.size() - strlen("vision_model.visual_projection.weight")); new_name = prefix + "visual_projection.weight"; return new_name; - // This specific case seems less common or might be handled implicitly later, - // but we keep the original logic for now. If issues arise, review if this mapping is needed. } else if (ends_with(new_name, "transformer.text_projection.weight")) { prefix = new_name.substr(0, new_name.size() - strlen("transformer.text_projection.weight")); new_name = prefix + "transformer.text_model.text_projection"; @@ -204,13 +201,11 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { return new_name; } - // Specific handling for text_projection variants before generic map lookup if (new_name == "model.text_projection.weight" || new_name == "model.text_projection") { new_name = "transformer.text_model.text_projection"; - } else if (open_clip_to_hf_clip_model.count(new_name)) { // Use .count() for safety + } else if (open_clip_to_hf_clip_model.count(new_name)) { new_name = open_clip_to_hf_clip_model[new_name]; } - // Note: The specific handling above takes precedence over the map for this tensor. std::string open_clip_resblock_prefix = "model.transformer.resblocks."; std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 6f33b00b2..ca71a8416 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -583,15 +583,14 @@ class StableDiffusionGGML { case SGM_UNIFORM: LOG_INFO("Running with SGM Uniform schedule"); denoiser->schedule = std::make_shared(); - denoiser->schedule->version = version; // version might not be used by SGMUniform but good to keep pattern + denoiser->schedule->version = version; break; case SIMPLE: LOG_INFO("Running with Simple schedule"); denoiser->schedule = std::make_shared(); - denoiser->schedule->version = version; // version might not be used by Simple but good to keep pattern + denoiser->schedule->version = version; break; case DEFAULT: - // Don't touch anything. 
break; default: LOG_ERROR("Unknown schedule %i", schedule); @@ -814,7 +813,7 @@ class StableDiffusionGGML { float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - int shifted_timestep = -1, // Added parameter + int shifted_timestep = -1, ggml_tensor* noise_mask = nullptr) { LOG_DEBUG("Sample"); if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) { @@ -861,26 +860,24 @@ class StableDiffusionGGML { } } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - - // Capture necessary variables for the denoising step lambda - auto denoise = [this, // Capture 'this' to access members like version, rng, n_threads etc. - &work_ctx, // Context for tensor creation - denoiser = this->denoiser, // Capture shared_ptr by value (copies the pointer) - steps, // Capture steps by value - &x, // Capture latent tensor by reference (modified in loop) - &sigmas, // Capture sigmas vector by reference (read-only needed) - &cond, &uncond, &id_cond, // Capture conditions by reference - control_hint, // Capture control hint pointer by value - control_strength, min_cfg, cfg_scale, guidance, eta, // Capture floats by value - start_merge_step, // Capture int by value - &skip_layers, slg_scale, skip_layer_start, skip_layer_end, // Capture skip layer params - shifted_timestep, // Capture shifted timestep by value - &noise_mask, // Capture noise mask pointer by reference/value - &noise, // Capture noise tensor by reference (modified in loop) - &init_latent, // Capture initial latent tensor by reference (read-only for mask) - &denoised, // Capture output denoised tensor by reference - &noised_input, &out_cond, &out_uncond, &out_skip, // Capture intermediate tensors by reference - has_unconditioned, has_skiplayer // Capture bools by value + auto denoise = [this, + &work_ctx, + denoiser = this->denoiser, + steps, + &x, + &sigmas, + &cond, &uncond, &id_cond, + control_hint, + control_strength, min_cfg, cfg_scale, guidance, eta, + start_merge_step, + &skip_layers, slg_scale, skip_layer_start, skip_layer_end, + shifted_timestep, + &noise_mask, + &noise, + &init_latent, + &denoised, + &noised_input, &out_cond, &out_uncond, &out_skip, + has_unconditioned, has_skiplayer ] (ggml_tensor* input, float sigma, int step) -> ggml_tensor* { if (step == 1) { @@ -896,31 +893,23 @@ class StableDiffusionGGML { float t = denoiser->sigma_to_t(sigma); std::vector timesteps_vec; - // --- Timestep Shifting Logic --- + if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { - // Calculate the shifted timestep value based on the original t - // Python: (original_index * (shifted_timestep / total_timesteps)).long() - // Assuming TIMESTEPS (1000) is the total_timesteps for SDXL + float shifted_t_float = t * (float(shifted_timestep) / float(TIMESTEPS)); - // Clamp and round to nearest integer timestep index int64_t shifted_t = static_cast(roundf(shifted_t_float)); shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t)); LOG_DEBUG("Shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma); timesteps_vec.assign(x->ne[3], (float)shifted_t); } else { - // Use original timestep if shifting is disabled or model is not SDXL timesteps_vec.assign(x->ne[3], t); } - // --- End Timestep Shifting Logic --- auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); std::vector guidance_vec(x->ne[3], guidance); auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); copy_ggml_tensor(noised_input, input); - // noised_input = noised_input * c_in - // NOTE: c_in is 
calculated based on the *original* sigma, which seems - // consistent with how the Python code uses xc derived from original sigma. ggml_tensor_scale(noised_input, c_in); std::vector controls; @@ -928,8 +917,6 @@ class StableDiffusionGGML { if (control_hint != NULL) { control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector); controls = control_net->controls; - // print_ggml_tensor(controls[12]); - // GGML_ASSERT(0); } if (start_merge_step == -1 || step <= start_merge_step) { @@ -1008,10 +995,7 @@ class StableDiffusionGGML { float* skip_layer_data_ptr = is_skiplayer_step ? (float*)out_skip->data : nullptr; // Get pointer if needed int ne_elements = (int)ggml_nelements(denoised); - // --- Select Calculation Path based on Timestep Shifting --- if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { - // --- Shifted Timestep Final Calculation --- - // Retrieve the integer shifted timestep calculated earlier // Assuming shifted_t is the float representation of the index int64_t shifted_t_idx = static_cast(roundf(timesteps_vec[0])); // Get the index back @@ -1021,7 +1005,6 @@ class StableDiffusionGGML { float shifted_c_skip = shifted_scaling[0]; float shifted_c_out = shifted_scaling[1]; // shifted_c_in is scaling[2] if needed, but we adjust input instead - // Need sigma_data from the denoiser (assuming CompVis type for SDXL) auto compvis_denoiser_ptr = std::dynamic_pointer_cast(denoiser); float sigma_data = compvis_denoiser_ptr ? compvis_denoiser_ptr->sigma_data : 1.0f; // Default needed? SDXL uses CompVis. @@ -1034,33 +1017,24 @@ class StableDiffusionGGML { // Equivalent to Python: sqrt(denoised_sigma^2 + sigma_data^2) / sqrt(sigma^2 + sigma_data^2) float input_scale_factor = sqrtf((shifted_sigma_sq + sigma_data_sq) / (sigma_sq + sigma_data_sq)); - LOG_DEBUG("Shifted calc [Step %d]: sigma=%.4f, shifted_t_idx=%ld, shifted_sigma=%.4f, input_scale=%.4f, shifted_c_skip=%.4f, shifted_c_out=%.4f", - step, sigma, shifted_t_idx, shifted_sigma, input_scale_factor, shifted_c_skip, shifted_c_out); - for (int i = 0; i < ne_elements; i++) { // CFG and SLG apply to the raw model output *before* the final scaling float model_output_result = positive_data[i]; // Start with positive prediction if (has_unconditioned) { // Apply CFG scale: uncond + cfg_scale * (cond - uncond) model_output_result = negative_data_ptr[i] + cfg_scale * (positive_data[i] - negative_data_ptr[i]); - // TODO: Add min_cfg logic if necessary } if (is_skiplayer_step) { // Apply SLG: result + slg_scale * (cond - skip) model_output_result = model_output_result + slg_scale * (positive_data[i] - skip_layer_data_ptr[i]); } - // Recalculate input term based on Python logic: x_recalc = x * input_scale_factor float adjusted_input = vec_input[i] * input_scale_factor; - // Final calculation using shifted sigma scales and adjusted input - // Equivalent to Python: calculate_denoised(shifted_sigma, model_output_result, adjusted_input) - // denoised = adjusted_input * shifted_c_skip + model_output_result * shifted_c_out; vec_denoised[i] = adjusted_input * shifted_c_skip + model_output_result * shifted_c_out; } } else { - // --- Original Final Calculation --- LOG_DEBUG("Original calc [Step %d]: sigma=%.4f, c_skip=%.4f, c_out=%.4f", step, sigma, c_skip, c_out); for (int i = 0; i < ne_elements; i++) { // CFG and SLG apply to the raw model output *before* the final scaling @@ -1068,18 +1042,15 @@ class StableDiffusionGGML { if (has_unconditioned) { // Apply CFG scale: uncond + cfg_scale * (cond - uncond) 
model_output_result = negative_data_ptr[i] + cfg_scale * (positive_data[i] - negative_data_ptr[i]); - // TODO: Add min_cfg logic if necessary } if (is_skiplayer_step) { // Apply SLG: result + slg_scale * (cond - skip) model_output_result = model_output_result + slg_scale * (positive_data[i] - skip_layer_data_ptr[i]); } - // Original calculation: denoised = input * c_skip + model_output * c_out; vec_denoised[i] = vec_input[i] * c_skip + model_output_result * c_out; } } - // --- End Calculation Path Selection --- int64_t t1 = ggml_time_us(); if (step > 0) { @@ -1103,7 +1074,7 @@ class StableDiffusionGGML { }; sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta); - // No changes needed for inverse_noise_scaling as it depends on the final sigma, not the intermediate timesteps used. + x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); if (control_net) { @@ -1324,7 +1295,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - int shifted_timestep = -1, // Added parameter + int shifted_timestep = -1, ggml_tensor* masked_image = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. @@ -1657,7 +1628,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - int shifted_timestep = -1) { // Added parameter to definition + int shifted_timestep = -1) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1771,7 +1742,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - int shifted_timestep = -1) { // Added parameter + int shifted_timestep = -1) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1918,7 +1889,7 @@ LOG_DEBUG("img2img %dx%d", width, height); slg_scale, skip_layer_start, skip_layer_end, - shifted_timestep, // Passed parameter + shifted_timestep, masked_image); size_t t2 = ggml_time_ms(); From 4e85dfb0c60166d58e16dbf3d750f5ea7776ba5e Mon Sep 17 00:00:00 2001 From: rmatif Date: Fri, 9 May 2025 11:18:40 +0000 Subject: [PATCH 7/9] restore original comments --- stable-diffusion.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ca71a8416..201881a8a 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -591,6 +591,7 @@ class StableDiffusionGGML { denoiser->schedule->version = version; break; case DEFAULT: + // Don't touch anything. 
break; default: LOG_ERROR("Unknown schedule %i", schedule); @@ -910,6 +911,7 @@ class StableDiffusionGGML { auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); copy_ggml_tensor(noised_input, input); + // noised_input = noised_input * c_in ggml_tensor_scale(noised_input, c_in); std::vector controls; @@ -917,6 +919,8 @@ class StableDiffusionGGML { if (control_hint != NULL) { control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector); controls = control_net->controls; + // print_ggml_tensor(controls[12]); + // GGML_ASSERT(0); } if (start_merge_step == -1 || step <= start_merge_step) { @@ -1553,7 +1557,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - shifted_timestep, // Passed parameter + shifted_timestep, noise_mask); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); From eca2089d6f3634c940b9bf235540a952c039d3af Mon Sep 17 00:00:00 2001 From: rmatif Date: Fri, 9 May 2025 11:32:54 +0000 Subject: [PATCH 8/9] update readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 553fb7f8f..d824ee705 100644 --- a/README.md +++ b/README.md @@ -256,7 +256,7 @@ arguments: --rng {std_default, cuda} RNG (default: cuda) -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0) -b, --batch-count COUNT number of images to generate - --schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete) + --schedule {discrete, karras, exponential, ays, gits, sgm_uniform, simple} Denoiser sigma schedule (default: discrete) --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1) <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x --vae-tiling process vae in tiles to reduce memory usage @@ -268,6 +268,7 @@ arguments: --control-net-cpu keep controlnet in cpu (for low vram) --canny apply canny preprocessor (edge detection) --color Colors the logging tags according to level + --timestep-shift N shift timestep for NitroFusion models, default: -1 off, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant -v, --verbose print extra info ``` From e907b858a12dfe04499365eaf2f0d0f65e409973 Mon Sep 17 00:00:00 2001 From: rmatif Date: Fri, 9 May 2025 16:08:53 +0000 Subject: [PATCH 9/9] remove debug log --- stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 201881a8a..81e9bd0e1 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1039,7 +1039,7 @@ class StableDiffusionGGML { } } else { - LOG_DEBUG("Original calc [Step %d]: sigma=%.4f, c_skip=%.4f, c_out=%.4f", step, sigma, c_skip, c_out); + for (int i = 0; i < ne_elements; i++) { // CFG and SLG apply to the raw model output *before* the final scaling float model_output_result = positive_data[i]; // Start with positive prediction
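
Illustrative sketch (not part of the patches above): the shifted-timestep step added in PATCH 3/9 reduces to a small amount of per-element math -- query the model at t_shifted = round(t * N / TIMESTEPS), rescale the latent to the noise level of the shifted sigma, then recombine using the scalings of that shifted sigma. In the sketch below, kTimesteps, kSigmaData, the toy t_to_sigma() table and the EDM-style get_scalings() are stand-ins chosen for illustration; the real code uses the denoiser's own tables and c_skip/c_out.

// Standalone sketch of the shifted-timestep denoising arithmetic (assumptions noted above).
#include <algorithm>
#include <cmath>
#include <cstdio>

static const int   kTimesteps = 1000; // assumed TIMESTEPS
static const float kSigmaData = 1.0f; // assumed sigma_data (the patch falls back to 1.0f)

// Toy log-linear sigma schedule standing in for denoiser->t_to_sigma().
static float t_to_sigma(float t) {
    const float sigma_min = 0.0292f, sigma_max = 14.61f; // illustrative bounds
    float u = t / (float)(kTimesteps - 1);
    return sigma_min * std::pow(sigma_max / sigma_min, u);
}

// Illustrative EDM-style scalings; the real code takes c_skip/c_out from denoiser->get_scalings().
static void get_scalings(float sigma, float* c_skip, float* c_out) {
    float sd2 = kSigmaData * kSigmaData;
    *c_skip = sd2 / (sigma * sigma + sd2);
    *c_out  = -sigma * kSigmaData / std::sqrt(sigma * sigma + sd2);
}

// One shifted-timestep denoising step for a single latent element x with raw model output model_out.
static float denoise_shifted(float x, float model_out, float t, float sigma, int shift_n) {
    // t_shifted = round(t * N / TIMESTEPS), clamped to [0, TIMESTEPS - 1]
    float t_shifted = std::round(t * (float)shift_n / (float)kTimesteps);
    t_shifted = std::max(0.0f, std::min((float)(kTimesteps - 1), t_shifted));

    float shifted_sigma = t_to_sigma(t_shifted);
    float c_skip, c_out;
    get_scalings(shifted_sigma, &c_skip, &c_out);

    // Rescale the input to the shifted noise level:
    // sqrt(shifted_sigma^2 + sigma_data^2) / sqrt(sigma^2 + sigma_data^2)
    float sd2   = kSigmaData * kSigmaData;
    float scale = std::sqrt((shifted_sigma * shifted_sigma + sd2) / (sigma * sigma + sd2));

    // denoised = adjusted_input * shifted_c_skip + model_output * shifted_c_out
    return x * scale * c_skip + model_out * c_out;
}

int main() {
    float t     = 999.0f;          // first (and only) step of a 1-step sample
    float sigma = t_to_sigma(t);
    float d     = denoise_shifted(0.8f, -0.1f, t, sigma, 250); // N = 250, the value suggested for NitroSD-Realism
    std::printf("denoised element: %f\n", d);
    return 0;
}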