Implement timestep shift (first attempt)

rmatif · rmatif · commit 205164dba818 · 2025-04-13T04:52:09.000Z
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,7 @@ test/
 *.bin
 *.exe
 *.gguf
+*.pdf
 output*.png
 models*
-*.log
+*.log
diff --git a/denoiser.hpp b/denoiser.hpp
@@ -971,7 +971,8 @@ static void sample_k_diffusion(sample_method_t method,
                 d_cur = ggml_dup_tensor(work_ctx, x_next);
             }
         } break;
-        case LCM:  // Latent Consistency Models
+        case LCM:             // Latent Consistency Models
+        case TIMESTEP_SHIFT_LCM: // Timestep Shift LCM (uses same core logic as LCM here)
         {
             struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);
             struct ggml_tensor* d     = ggml_dup_tensor(work_ctx, x);
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
@@ -41,6 +41,7 @@ const char* sample_method_str[] = {
     "lcm",
     "ddim_trailing",
     "tcd",
+    "timestep_shift_lcm",
 };
 
 // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
@@ -101,6 +102,7 @@ struct SDParams {
     int width         = 512;
     int height        = 512;
     int batch_count   = 1;
+    int shifted_timestep = -1; // for timestep_shift_lcm
 
     int video_frames         = 6;
     int motion_bucket_id     = 127;
@@ -178,6 +180,9 @@ void print_params(SDParams params) {
     printf("    batch_count:       %d\n", params.batch_count);
     printf("    vae_tiling:        %s\n", params.vae_tiling ? "true" : "false");
     printf("    upscale_repeats:   %d\n", params.upscale_repeats);
+    if (params.shifted_timestep > 0) {
+        printf("    shifted_timestep:  %d\n", params.shifted_timestep);
+    }
 }
 
 void print_usage(int argc, const char* argv[]) {
@@ -226,7 +231,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("                                     1.0 corresponds to full destruction of information in init image\n");
     printf("  -H, --height H                     image height, in pixel space (default: 512)\n");
     printf("  -W, --width W                      image width, in pixel space (default: 512)\n");
-    printf("  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n");
+    printf("  --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, timestep_shift_lcm}\n");
     printf("                                     sampling method (default: \"euler_a\")\n");
     printf("  --steps  STEPS                     number of sample steps (default: 20)\n");
     printf("  --rng {std_default, cuda}          RNG (default: cuda)\n");
@@ -244,6 +249,7 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
     printf("  --canny                            apply canny preprocessor (edge detection)\n");
     printf("  --color                            Colors the logging tags according to level\n");
+    printf("  --shifted-timestep N               Timestep shift value for timestep_shift_lcm sampler (default: -1, disabled)\n");
     printf("  -v, --verbose                      print extra info\n");
 }
 
@@ -629,6 +635,12 @@ void parse_args(int argc, const char** argv, SDParams& params) {
                 break;
             }
             params.skip_layer_end = std::stof(argv[i]);
+        } else if (arg == "--shifted-timestep") {
+            if (++i >= argc) {
+                invalid_arg = true;
+                break;
+            }
+            params.shifted_timestep = std::stoi(argv[i]);
         } else {
             fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
             print_usage(argc, argv);
@@ -967,7 +979,8 @@ int main(int argc, const char* argv[]) {
                           params.skip_layers.size(),
                           params.slg_scale,
                           params.skip_layer_start,
-                          params.skip_layer_end);
+                          params.skip_layer_end,
+                          params.shifted_timestep);
     } else {
         sd_image_t input_image = {(uint32_t)params.width,
                                   (uint32_t)params.height,
@@ -1036,7 +1049,8 @@ int main(int argc, const char* argv[]) {
                               params.skip_layers.size(),
                               params.slg_scale,
                               params.skip_layer_start,
-                              params.skip_layer_end);
+                              params.skip_layer_end,
+                              params.shifted_timestep);
         }
     }
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
@@ -804,7 +804,8 @@ class StableDiffusionGGML {
                         float slg_scale              = 0,
                         float skip_layer_start       = 0.01,
                         float skip_layer_end         = 0.2,
-                        ggml_tensor* noise_mask      = nullptr) {
+                        ggml_tensor* noise_mask      = nullptr,
+                        int shifted_timestep         = -1) {
         LOG_DEBUG("Sample");
         struct ggml_init_params params;
         size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]);
@@ -860,7 +861,16 @@ class StableDiffusionGGML {
             float c_in   = scaling[2];
 
             float t = denoiser->sigma_to_t(sigma);
-            std::vector<float> timesteps_vec(x->ne[3], t);  // [N, ]
+            float t_for_model = t;
+            if (method == TIMESTEP_SHIFT_LCM && shifted_timestep > 0) {
+                // Apply timestep shift: t_shifted = t * shifted_timestep / TIMESTEPS
+                // TIMESTEPS is defined in denoiser.hpp as 1000
+                t_for_model = t * (float)shifted_timestep / (float)TIMESTEPS;
+                // Ensure t_for_model stays within valid range [0, TIMESTEPS-1]
+                t_for_model = std::max(0.f, std::min(t_for_model, (float)TIMESTEPS - 1.f));
+                LOG_DEBUG("Timestep Shift: original t=%.2f, shifted t=%.2f (shifted_timestep=%d)", t, t_for_model, shifted_timestep);
+            }
+            std::vector<float> timesteps_vec(x->ne[3], t_for_model);  // Use t_for_model for the diffusion model call
             auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
             std::vector<float> guidance_vec(x->ne[3], guidance);
             auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec);
@@ -1213,7 +1223,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                            float slg_scale              = 0,
                            float skip_layer_start       = 0.01,
                            float skip_layer_end         = 0.2,
-                           ggml_tensor* masked_image    = NULL) {
+                           ggml_tensor* masked_image    = NULL,
+                           int shifted_timestep         = -1) {
     if (seed < 0) {
         // Generally, when using the provided command line, the seed is always >0.
         // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library
@@ -1470,7 +1481,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
                                                      slg_scale,
                                                      skip_layer_start,
                                                      skip_layer_end,
-                                                     noise_mask);
+                                                     noise_mask,
+                                                     shifted_timestep);
 
         // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin");
         // print_ggml_tensor(x_0);
@@ -1543,7 +1555,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                     size_t skip_layers_count = 0,
                     float slg_scale          = 0,
                     float skip_layer_start   = 0.01,
-                    float skip_layer_end     = 0.2) {
+                    float skip_layer_end     = 0.2,
+                    int shifted_timestep     = -1) {
     std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
     LOG_DEBUG("txt2img %dx%d", width, height);
     if (sd_ctx == NULL) {
@@ -1621,7 +1634,9 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                                                skip_layers_vec,
                                                slg_scale,
                                                skip_layer_start,
-                                               skip_layer_end);
+                                               skip_layer_end,
+                                               NULL, // masked_image is NULL for txt2img
+                                               shifted_timestep);
 
     size_t t1 = ggml_time_ms();
 
@@ -1655,7 +1670,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                     size_t skip_layers_count = 0,
                     float slg_scale          = 0,
                     float skip_layer_start   = 0.01,
-                    float skip_layer_end     = 0.2) {
+                    float skip_layer_end     = 0.2,
+                    int shifted_timestep     = -1) {
     std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count);
     LOG_DEBUG("img2img %dx%d", width, height);
     if (sd_ctx == NULL) {
@@ -1802,7 +1818,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx,
                                                slg_scale,
                                                skip_layer_start,
                                                skip_layer_end,
-                                               masked_image);
+                                               masked_image, // Pass the actual masked_image for img2img
+                                               shifted_timestep);
 
     size_t t2 = ggml_time_ms();
 
diff --git a/stable-diffusion.h b/stable-diffusion.h
@@ -46,6 +46,7 @@ enum sample_method_t {
     LCM,
     DDIM_TRAILING,
     TCD,
+    TIMESTEP_SHIFT_LCM,
     N_SAMPLE_METHODS
 };
 
@@ -176,7 +177,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx,
                            size_t skip_layers_count,
                            float slg_scale,
                            float skip_layer_start,
-                           float skip_layer_end);
+                           float skip_layer_end,
+                           int shifted_timestep);
 
 SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
                            sd_image_t init_image,
@@ -203,7 +205,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx,
                            size_t skip_layers_count,
                            float slg_scale,
                            float skip_layer_start,
-                           float skip_layer_end);
+                           float skip_layer_end,
+                           int shifted_timestep);
 
 SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx,
                            sd_image_t init_image,

-Original file line number
+Diff line change
 *.bin
 *.exe
 *.gguf
 +*.pdf
 output*.png
 models*
 -*.log
 +*.log
Original file line number	Diff line number	Diff line change
`@@ -971,7 +971,8 @@ static void sample_k_diffusion(sample_method_t method,`
`971`	`971`	`d_cur = ggml_dup_tensor(work_ctx, x_next);`
`972`	`972`	`}`
`973`	`973`	`} break;`
`974`		`- case LCM: // Latent Consistency Models`
	`974`	`+ case LCM: // Latent Consistency Models`
	`975`	`+ case TIMESTEP_SHIFT_LCM: // Timestep Shift LCM (uses same core logic as LCM here)`
`975`	`976`	`{`
`976`	`977`	`struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x);`
`977`	`978`	`struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x);`