From 19d40d0a4c406196e9876e80abbcac6c35825025 Mon Sep 17 00:00:00 2001 From: rmatif <66360289+rmatif@users.noreply.github.com> Date: Sun, 13 Apr 2025 03:43:40 +0000 Subject: [PATCH 1/9] fix tensors name --- model.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/model.cpp b/model.cpp index 24da39f6d..30e5f1bcf 100644 --- a/model.cpp +++ b/model.cpp @@ -175,6 +175,12 @@ std::unordered_map pmid_v2_name_map = { }; std::string convert_open_clip_to_hf_clip(const std::string& name) { + // Specific fix for ComfyUI-style SDXL CLIP-G text projection name + // Check this *before* any other modifications + if (name == "conditioner.embedders.1.model.text_projection.weight") { + return "cond_stage_model.1.transformer.text_model.text_projection"; + } + std::string new_name = name; std::string prefix; if (starts_with(new_name, "conditioner.embedders.0.open_clip.")) { @@ -185,7 +191,7 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "conditioner.embedders.1.")) { prefix = "cond_stage_model.1."; - new_name = new_name.substr(strlen("conditioner.embedders.0.")); + new_name = new_name.substr(strlen("conditioner.embedders.1.")); // Fix bug: use correct length for prefix 1 } else if (starts_with(new_name, "cond_stage_model.")) { prefix = "cond_stage_model."; new_name = new_name.substr(strlen("cond_stage_model.")); From 205164dba818292ea5cd00fa545964a67662bfc1 Mon Sep 17 00:00:00 2001 From: rmatif <66360289+rmatif@users.noreply.github.com> Date: Sun, 13 Apr 2025 04:52:09 +0000 Subject: [PATCH 2/9] implement timestep shift first attempt --- .gitignore | 3 ++- denoiser.hpp | 3 ++- examples/cli/main.cpp | 20 +++++++++++++++++--- stable-diffusion.cpp | 33 +++++++++++++++++++++++++-------- stable-diffusion.h | 7 +++++-- 5 files changed, 51 insertions(+), 15 deletions(-) diff --git a/.gitignore b/.gitignore index 38fe570df..7986ce6b4 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ test/ *.bin *.exe *.gguf +*.pdf output*.png models* -*.log \ No newline at end of file +*.log diff --git a/denoiser.hpp b/denoiser.hpp index 66799109d..880dc22ff 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -971,7 +971,8 @@ static void sample_k_diffusion(sample_method_t method, d_cur = ggml_dup_tensor(work_ctx, x_next); } } break; - case LCM: // Latent Consistency Models + case LCM: // Latent Consistency Models + case TIMESTEP_SHIFT_LCM: // Timestep Shift LCM (uses same core logic as LCM here) { struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index af6b2bbdb..626e37544 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -41,6 +41,7 @@ const char* sample_method_str[] = { "lcm", "ddim_trailing", "tcd", + "timestep_shift_lcm", }; // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h @@ -101,6 +102,7 @@ struct SDParams { int width = 512; int height = 512; int batch_count = 1; + int shifted_timestep = -1; // for timestep_shift_lcm int video_frames = 6; int motion_bucket_id = 127; @@ -178,6 +180,9 @@ void print_params(SDParams params) { printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? 
"true" : "false"); printf(" upscale_repeats: %d\n", params.upscale_repeats); + if (params.shifted_timestep > 0) { + printf(" shifted_timestep: %d\n", params.shifted_timestep); + } } void print_usage(int argc, const char* argv[]) { @@ -226,7 +231,7 @@ void print_usage(int argc, const char* argv[]) { printf(" 1.0 corresponds to full destruction of information in init image\n"); printf(" -H, --height H image height, in pixel space (default: 512)\n"); printf(" -W, --width W image width, in pixel space (default: 512)\n"); - printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n"); + printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, timestep_shift_lcm}\n"); printf(" sampling method (default: \"euler_a\")\n"); printf(" --steps STEPS number of sample steps (default: 20)\n"); printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); @@ -244,6 +249,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color Colors the logging tags according to level\n"); + printf(" --shifted-timestep N Timestep shift value for timestep_shift_lcm sampler (default: -1, disabled)\n"); printf(" -v, --verbose print extra info\n"); } @@ -629,6 +635,12 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.skip_layer_end = std::stof(argv[i]); + } else if (arg == "--shifted-timestep") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.shifted_timestep = std::stoi(argv[i]); } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv); @@ -967,7 +979,8 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end); + params.skip_layer_end, + params.shifted_timestep); } else { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, @@ -1036,7 +1049,8 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end); + params.skip_layer_end, + params.shifted_timestep); } } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index e38a6101f..39864b8a5 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -804,7 +804,8 @@ class StableDiffusionGGML { float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr) { + ggml_tensor* noise_mask = nullptr, + int shifted_timestep = -1) { LOG_DEBUG("Sample"); struct ggml_init_params params; size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); @@ -860,7 +861,16 @@ class StableDiffusionGGML { float c_in = scaling[2]; float t = denoiser->sigma_to_t(sigma); - std::vector timesteps_vec(x->ne[3], t); // [N, ] + float t_for_model = t; + if (method == TIMESTEP_SHIFT_LCM && shifted_timestep > 0) { + // Apply timestep shift: t_shifted = t * shifted_timestep / TIMESTEPS + // TIMESTEPS is defined in denoiser.hpp as 1000 + t_for_model = t * (float)shifted_timestep / (float)TIMESTEPS; + // Ensure t_for_model stays within valid range [0, TIMESTEPS-1] + t_for_model = std::max(0.f, std::min(t_for_model, (float)TIMESTEPS - 1.f)); + LOG_DEBUG("Timestep Shift: original t=%.2f, shifted t=%.2f (shifted_timestep=%d)", t, t_for_model, shifted_timestep); + } + std::vector 
timesteps_vec(x->ne[3], t_for_model); // Use t_for_model for the diffusion model call auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); std::vector guidance_vec(x->ne[3], guidance); auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); @@ -1213,7 +1223,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL) { + ggml_tensor* masked_image = NULL, + int shifted_timestep = -1) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1470,7 +1481,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - noise_mask); + noise_mask, + shifted_timestep); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1543,7 +1555,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, size_t skip_layers_count = 0, float slg_scale = 0, float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { + float skip_layer_end = 0.2, + int shifted_timestep = -1) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1621,7 +1634,9 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, skip_layers_vec, slg_scale, skip_layer_start, - skip_layer_end); + skip_layer_end, + NULL, // masked_image is NULL for txt2img + shifted_timestep); size_t t1 = ggml_time_ms(); @@ -1655,7 +1670,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, size_t skip_layers_count = 0, float slg_scale = 0, float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { + float skip_layer_end = 0.2, + int shifted_timestep = -1) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1802,7 +1818,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - masked_image); + masked_image, // Pass the actual masked_image for img2img + shifted_timestep); size_t t2 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index 52dcc848a..cb3c0bafb 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -46,6 +46,7 @@ enum sample_method_t { LCM, DDIM_TRAILING, TCD, + TIMESTEP_SHIFT_LCM, N_SAMPLE_METHODS }; @@ -176,7 +177,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, size_t skip_layers_count, float slg_scale, float skip_layer_start, - float skip_layer_end); + float skip_layer_end, + int shifted_timestep); SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, @@ -203,7 +205,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, size_t skip_layers_count, float slg_scale, float skip_layer_start, - float skip_layer_end); + float skip_layer_end, + int shifted_timestep); SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, sd_image_t init_image, From a67e93f893f96dd27c4c9cea52648665cec479c9 Mon Sep 17 00:00:00 2001 From: rmatif Date: Wed, 30 Apr 2025 03:54:26 +0000 Subject: [PATCH 3/9] timestep-shift fix, works in 1 step --- denoiser.hpp | 3 +- examples/cli/main.cpp | 43 +++++------ model.cpp | 17 +++-- stable-diffusion.cpp | 172 ++++++++++++++++++++++++++++++++---------- stable-diffusion.h | 1 - 5 files changed, 163 insertions(+), 73 deletions(-) diff --git a/denoiser.hpp b/denoiser.hpp index 880dc22ff..66799109d 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -971,8 +971,7 @@ static void 
sample_k_diffusion(sample_method_t method, d_cur = ggml_dup_tensor(work_ctx, x_next); } } break; - case LCM: // Latent Consistency Models - case TIMESTEP_SHIFT_LCM: // Timestep Shift LCM (uses same core logic as LCM here) + case LCM: // Latent Consistency Models { struct ggml_tensor* noise = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* d = ggml_dup_tensor(work_ctx, x); diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 626e37544..5988a7423 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -41,7 +41,6 @@ const char* sample_method_str[] = { "lcm", "ddim_trailing", "tcd", - "timestep_shift_lcm", }; // Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h @@ -102,7 +101,6 @@ struct SDParams { int width = 512; int height = 512; int batch_count = 1; - int shifted_timestep = -1; // for timestep_shift_lcm int video_frames = 6; int motion_bucket_id = 127; @@ -128,9 +126,10 @@ struct SDParams { int upscale_repeats = 1; std::vector skip_layers = {7, 8, 9}; - float slg_scale = 0.f; + float slg_scale = 0.f; // Removed duplicate line float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; + int shifted_timestep = -1; // Keep the added parameter from previous step }; void print_params(SDParams params) { @@ -180,9 +179,7 @@ void print_params(SDParams params) { printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false"); printf(" upscale_repeats: %d\n", params.upscale_repeats); - if (params.shifted_timestep > 0) { - printf(" shifted_timestep: %d\n", params.shifted_timestep); - } + printf(" timestep_shift: %d\n", params.shifted_timestep); } void print_usage(int argc, const char* argv[]) { @@ -231,7 +228,7 @@ void print_usage(int argc, const char* argv[]) { printf(" 1.0 corresponds to full destruction of information in init image\n"); printf(" -H, --height H image height, in pixel space (default: 512)\n"); printf(" -W, --width W image width, in pixel space (default: 512)\n"); - printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd, timestep_shift_lcm}\n"); + printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n"); printf(" sampling method (default: \"euler_a\")\n"); printf(" --steps STEPS number of sample steps (default: 20)\n"); printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); @@ -249,7 +246,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color Colors the logging tags according to level\n"); - printf(" --shifted-timestep N Timestep shift value for timestep_shift_lcm sampler (default: -1, disabled)\n"); + printf(" --timestep-shift N shift timestep for SDXL models (NitroFusion paper, default: -1 off, N between 1 and 1000)\n"); printf(" -v, --verbose print extra info\n"); } @@ -635,12 +632,16 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.skip_layer_end = std::stof(argv[i]); - } else if (arg == "--shifted-timestep") { - if (++i >= argc) { - invalid_arg = true; - break; - } - params.shifted_timestep = std::stoi(argv[i]); + } else if (arg == "--timestep-shift") { // Added block + if (++i >= argc) { + invalid_arg = true; + break; + } + params.shifted_timestep = std::stoi(argv[i]); + if (params.shifted_timestep != -1 && 
(params.shifted_timestep < 1 || params.shifted_timestep > 1000)) { + fprintf(stderr, "error: timestep-shift must be between 1 and 1000, or -1 to disable\n"); + exit(1); + } } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv); @@ -980,10 +981,10 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end, - params.shifted_timestep); - } else { - sd_image_t input_image = {(uint32_t)params.width, - (uint32_t)params.height, + params.shifted_timestep); // Passed parameter + } else { + sd_image_t input_image = {(uint32_t)params.width, + (uint32_t)params.height, 3, input_image_buffer}; @@ -1050,9 +1051,9 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end, - params.shifted_timestep); - } - } + params.shifted_timestep); // Passed parameter + } + } if (results == NULL) { printf("generate failed\n"); diff --git a/model.cpp b/model.cpp index 30e5f1bcf..0be264704 100644 --- a/model.cpp +++ b/model.cpp @@ -175,12 +175,6 @@ std::unordered_map pmid_v2_name_map = { }; std::string convert_open_clip_to_hf_clip(const std::string& name) { - // Specific fix for ComfyUI-style SDXL CLIP-G text projection name - // Check this *before* any other modifications - if (name == "conditioner.embedders.1.model.text_projection.weight") { - return "cond_stage_model.1.transformer.text_model.text_projection"; - } - std::string new_name = name; std::string prefix; if (starts_with(new_name, "conditioner.embedders.0.open_clip.")) { @@ -191,7 +185,8 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "conditioner.embedders.1.")) { prefix = "cond_stage_model.1."; - new_name = new_name.substr(strlen("conditioner.embedders.1.")); // Fix bug: use correct length for prefix 1 + // Corrected the substring length to match the prefix being checked + new_name = new_name.substr(strlen("conditioner.embedders.1.")); } else if (starts_with(new_name, "cond_stage_model.")) { prefix = "cond_stage_model."; new_name = new_name.substr(strlen("cond_stage_model.")); @@ -199,6 +194,8 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { prefix = new_name.substr(0, new_name.size() - strlen("vision_model.visual_projection.weight")); new_name = prefix + "visual_projection.weight"; return new_name; + // This specific case seems less common or might be handled implicitly later, + // but we keep the original logic for now. If issues arise, review if this mapping is needed. } else if (ends_with(new_name, "transformer.text_projection.weight")) { prefix = new_name.substr(0, new_name.size() - strlen("transformer.text_projection.weight")); new_name = prefix + "transformer.text_model.text_projection"; @@ -207,9 +204,13 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { return new_name; } - if (open_clip_to_hf_clip_model.find(new_name) != open_clip_to_hf_clip_model.end()) { + // Specific handling for text_projection variants before generic map lookup + if (new_name == "model.text_projection.weight" || new_name == "model.text_projection") { + new_name = "transformer.text_model.text_projection"; + } else if (open_clip_to_hf_clip_model.count(new_name)) { // Use .count() for safety new_name = open_clip_to_hf_clip_model[new_name]; } + // Note: The specific handling above takes precedence over the map for this tensor. 
std::string open_clip_resblock_prefix = "model.transformer.resblocks."; std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 39864b8a5..f1e6fb72c 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -804,9 +804,13 @@ class StableDiffusionGGML { float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr, - int shifted_timestep = -1) { + int shifted_timestep = -1, // Added parameter + ggml_tensor* noise_mask = nullptr) { LOG_DEBUG("Sample"); + if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) { + LOG_WARN("Timestep shifting is only supported for SDXL models. Ignoring --timestep-shift."); + shifted_timestep = -1; + } struct ggml_init_params params; size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); for (int i = 1; i < 4; i++) { @@ -848,7 +852,27 @@ class StableDiffusionGGML { } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { + // Capture necessary variables for the denoising step lambda + auto denoise = [this, // Capture 'this' to access members like version, rng, n_threads etc. + &work_ctx, // Context for tensor creation + denoiser = this->denoiser, // Capture shared_ptr by value (copies the pointer) + steps, // Capture steps by value + &x, // Capture latent tensor by reference (modified in loop) + &sigmas, // Capture sigmas vector by reference (read-only needed) + &cond, &uncond, &id_cond, // Capture conditions by reference + control_hint, // Capture control hint pointer by value + control_strength, min_cfg, cfg_scale, guidance, eta, // Capture floats by value + start_merge_step, // Capture int by value + &skip_layers, slg_scale, skip_layer_start, skip_layer_end, // Capture skip layer params + shifted_timestep, // Capture shifted timestep by value + &noise_mask, // Capture noise mask pointer by reference/value + &noise, // Capture noise tensor by reference (modified in loop) + &init_latent, // Capture initial latent tensor by reference (read-only for mask) + &denoised, // Capture output denoised tensor by reference + &noised_input, &out_cond, &out_uncond, &out_skip, // Capture intermediate tensors by reference + has_unconditioned, has_skiplayer // Capture bools by value + ] + (ggml_tensor* input, float sigma, int step) -> ggml_tensor* { if (step == 1) { pretty_progress(0, (int)steps, 0); } @@ -861,22 +885,32 @@ class StableDiffusionGGML { float c_in = scaling[2]; float t = denoiser->sigma_to_t(sigma); - float t_for_model = t; - if (method == TIMESTEP_SHIFT_LCM && shifted_timestep > 0) { - // Apply timestep shift: t_shifted = t * shifted_timestep / TIMESTEPS - // TIMESTEPS is defined in denoiser.hpp as 1000 - t_for_model = t * (float)shifted_timestep / (float)TIMESTEPS; - // Ensure t_for_model stays within valid range [0, TIMESTEPS-1] - t_for_model = std::max(0.f, std::min(t_for_model, (float)TIMESTEPS - 1.f)); - LOG_DEBUG("Timestep Shift: original t=%.2f, shifted t=%.2f (shifted_timestep=%d)", t, t_for_model, shifted_timestep); + std::vector timesteps_vec; + // --- Timestep Shifting Logic --- + if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { + // Calculate the shifted timestep value based on the original t + // Python: (original_index * (shifted_timestep / total_timesteps)).long() + // Assuming TIMESTEPS (1000) is the total_timesteps for SDXL + float shifted_t_float = t * (float(shifted_timestep) / 
float(TIMESTEPS));
+                // Clamp and round to nearest integer timestep index
+                int64_t shifted_t = static_cast<int64_t>(roundf(shifted_t_float));
+                shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t));
+                LOG_DEBUG("Shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma);
+                timesteps_vec.assign(x->ne[3], (float)shifted_t);
+            } else {
+                // Use original timestep if shifting is disabled or model is not SDXL
+                timesteps_vec.assign(x->ne[3], t);
             }
-            std::vector<float> timesteps_vec(x->ne[3], t_for_model); // Use t_for_model for the diffusion model call
+            // --- End Timestep Shifting Logic ---
+
         auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec);
         std::vector<float> guidance_vec(x->ne[3], guidance);
         auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec);
         copy_ggml_tensor(noised_input, input);
         // noised_input = noised_input * c_in
+        // NOTE: c_in is calculated based on the *original* sigma, which seems
+        // consistent with how the Python code uses xc derived from original sigma.
         ggml_tensor_scale(noised_input, c_in);
         std::vector<struct ggml_tensor*> controls;
@@ -960,26 +994,83 @@ class StableDiffusionGGML {
             float* vec_denoised = (float*)denoised->data;
             float* vec_input = (float*)input->data;
             float* positive_data = (float*)out_cond->data;
+            float* negative_data_ptr = has_unconditioned ? (float*)out_uncond->data : nullptr; // Get pointer if needed
+            float* skip_layer_data_ptr = is_skiplayer_step ? (float*)out_skip->data : nullptr; // Get pointer if needed
             int ne_elements = (int)ggml_nelements(denoised);
-            for (int i = 0; i < ne_elements; i++) {
-                float latent_result = positive_data[i];
-                if (has_unconditioned) {
-                    // out_uncond + cfg_scale * (out_cond - out_uncond)
-                    int64_t ne3 = out_cond->ne[3];
-                    if (min_cfg != cfg_scale && ne3 != 1) {
-                        int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2];
-                        float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3);
-                    } else {
-                        latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]);
+
+            // --- Select Calculation Path based on Timestep Shifting ---
+            if (shifted_timestep > 0 && sd_version_is_sdxl(version)) {
+                // --- Shifted Timestep Final Calculation ---
+
+                // Retrieve the integer shifted timestep calculated earlier
+                // Assuming shifted_t is the float representation of the index
+                int64_t shifted_t_idx = static_cast<int64_t>(roundf(timesteps_vec[0])); // Get the index back
+
+                float shifted_sigma = denoiser->t_to_sigma((float)shifted_t_idx);
+                std::vector<float> shifted_scaling = denoiser->get_scalings(shifted_sigma);
+                float shifted_c_skip = shifted_scaling[0];
+                float shifted_c_out = shifted_scaling[1];
+                // shifted_c_in is scaling[2] if needed, but we adjust input instead
+
+                // Need sigma_data from the denoiser (assuming CompVis type for SDXL)
+                auto compvis_denoiser_ptr = std::dynamic_pointer_cast<CompVisDenoiser>(denoiser);
+                float sigma_data = compvis_denoiser_ptr ? compvis_denoiser_ptr->sigma_data : 1.0f; // Default needed? SDXL uses CompVis.
+ + float sigma_sq = sigma * sigma; // Original sigma for this step + float shifted_sigma_sq = shifted_sigma * shifted_sigma; + float sigma_data_sq = sigma_data * sigma_data; + + // Calculate the scaling factor needed to adjust the input `x` (vec_input) + // Equivalent to Python: sqrt(denoised_sigma^2 + sigma_data^2) / sqrt(sigma^2 + sigma_data^2) + float input_scale_factor = sqrtf((shifted_sigma_sq + sigma_data_sq) / (sigma_sq + sigma_data_sq)); + + LOG_DEBUG("Shifted calc [Step %d]: sigma=%.4f, shifted_t_idx=%ld, shifted_sigma=%.4f, input_scale=%.4f, shifted_c_skip=%.4f, shifted_c_out=%.4f", + step, sigma, shifted_t_idx, shifted_sigma, input_scale_factor, shifted_c_skip, shifted_c_out); + + for (int i = 0; i < ne_elements; i++) { + // CFG and SLG apply to the raw model output *before* the final scaling + float model_output_result = positive_data[i]; // Start with positive prediction + if (has_unconditioned) { + // Apply CFG scale: uncond + cfg_scale * (cond - uncond) + model_output_result = negative_data_ptr[i] + cfg_scale * (positive_data[i] - negative_data_ptr[i]); + // TODO: Add min_cfg logic if necessary } + if (is_skiplayer_step) { + // Apply SLG: result + slg_scale * (cond - skip) + model_output_result = model_output_result + slg_scale * (positive_data[i] - skip_layer_data_ptr[i]); + } + + // Recalculate input term based on Python logic: x_recalc = x * input_scale_factor + float adjusted_input = vec_input[i] * input_scale_factor; + + // Final calculation using shifted sigma scales and adjusted input + // Equivalent to Python: calculate_denoised(shifted_sigma, model_output_result, adjusted_input) + // denoised = adjusted_input * shifted_c_skip + model_output_result * shifted_c_out; + vec_denoised[i] = adjusted_input * shifted_c_skip + model_output_result * shifted_c_out; } - if (is_skiplayer_step) { - latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; - } - // v = latent_result, eps = latent_result - // denoised = (v * c_out + input * c_skip) or (input + eps * c_out) - vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; + + } else { + // --- Original Final Calculation --- + LOG_DEBUG("Original calc [Step %d]: sigma=%.4f, c_skip=%.4f, c_out=%.4f", step, sigma, c_skip, c_out); + for (int i = 0; i < ne_elements; i++) { + // CFG and SLG apply to the raw model output *before* the final scaling + float model_output_result = positive_data[i]; // Start with positive prediction + if (has_unconditioned) { + // Apply CFG scale: uncond + cfg_scale * (cond - uncond) + model_output_result = negative_data_ptr[i] + cfg_scale * (positive_data[i] - negative_data_ptr[i]); + // TODO: Add min_cfg logic if necessary + } + if (is_skiplayer_step) { + // Apply SLG: result + slg_scale * (cond - skip) + model_output_result = model_output_result + slg_scale * (positive_data[i] - skip_layer_data_ptr[i]); + } + + // Original calculation: denoised = input * c_skip + model_output * c_out; + vec_denoised[i] = vec_input[i] * c_skip + model_output_result * c_out; + } } + // --- End Calculation Path Selection --- + int64_t t1 = ggml_time_us(); if (step > 0) { pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); @@ -1002,7 +1093,7 @@ class StableDiffusionGGML { }; sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta); - + // No changes needed for inverse_noise_scaling as it depends on the final sigma, not the intermediate timesteps used. 
x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); if (control_net) { @@ -1223,8 +1314,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL, - int shifted_timestep = -1) { + int shifted_timestep = -1, // Added parameter + ggml_tensor* masked_image = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1481,8 +1572,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - noise_mask, - shifted_timestep); + shifted_timestep, // Passed parameter + noise_mask); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1556,7 +1647,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - int shifted_timestep = -1) { + int shifted_timestep = -1) { // Added parameter to definition std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1635,8 +1726,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - NULL, // masked_image is NULL for txt2img - shifted_timestep); + shifted_timestep); // Passed parameter size_t t1 = ggml_time_ms(); @@ -1671,9 +1761,9 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - int shifted_timestep = -1) { - std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); - LOG_DEBUG("img2img %dx%d", width, height); + int shifted_timestep = -1) { // Added parameter +std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); +LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { return NULL; } @@ -1818,8 +1908,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - masked_image, // Pass the actual masked_image for img2img - shifted_timestep); + shifted_timestep, // Passed parameter + masked_image); size_t t2 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index cb3c0bafb..89e6ade5b 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -46,7 +46,6 @@ enum sample_method_t { LCM, DDIM_TRAILING, TCD, - TIMESTEP_SHIFT_LCM, N_SAMPLE_METHODS }; From 6e4b5e46d8550f5bdf760dfb8d65f1a4684a220b Mon Sep 17 00:00:00 2001 From: rmatif Date: Wed, 7 May 2025 08:01:40 +0000 Subject: [PATCH 4/9] add sgm_uniform --- denoiser.hpp | 65 +++++++++++++++++++++++++++++++++++++++++-- examples/cli/main.cpp | 10 ++++--- stable-diffusion.cpp | 5 ++++ stable-diffusion.h | 1 + 4 files changed, 75 insertions(+), 6 deletions(-) diff --git a/denoiser.hpp b/denoiser.hpp index 66799109d..bf00568a6 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -235,6 +235,31 @@ struct GITSSchedule : SigmaSchedule { } }; +struct SGMUniformSchedule : SigmaSchedule { + std::vector get_sigmas(uint32_t n, float sigma_min_in, float sigma_max_in, t_to_sigma_t t_to_sigma_func) override { + // This schedule's core logic is now handled directly in Denoiser::get_sigmas + // to ensure correct access to both sigma_to_t and t_to_sigma. + // This method is overridden to fulfill the virtual contract but ideally should not be + // the primary execution path for SGMUniform when called from Denoiser::get_sigmas. 
+ // If it IS called, it means the Denoiser::get_sigmas logic wasn't triggered, which is unexpected. + LOG_WARN("SGMUniformSchedule::get_sigmas was called directly. This might indicate an issue with Denoiser dispatch."); + // Provide a default (potentially incorrect for SGMUniform's intent) or empty schedule to avoid crashes. + // For safety, returning a simple discrete-like schedule in t-space if this is ever hit. + std::vector result; + if (n == 0) { + result.push_back(0.0f); + return result; + } + result.reserve(n + 1); + int t_max = TIMESTEPS -1; // A common max t value + float step = static_cast(t_max) / static_cast(n > 1 ? (n -1) : 1) ; + for(uint32_t i=0; i get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { // These *COULD* be function arguments here, @@ -265,8 +290,44 @@ struct Denoiser { virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) = 0; virtual std::vector get_sigmas(uint32_t n) { - auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1); - return schedule->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma); + // Check if the current schedule is SGMUniformSchedule + if (std::dynamic_pointer_cast(schedule)) { + LOG_DEBUG("Denoiser::get_sigmas - Using SGM_UNIFORM specific logic"); + std::vector sigs; + sigs.reserve(n + 1); + + if (n == 0) { + sigs.push_back(0.0f); + return sigs; + } + + // Use the Denoiser's own sigma_to_t and t_to_sigma methods + float start_t_val = this->sigma_to_t(this->sigma_max()); + float end_t_val = this->sigma_to_t(this->sigma_min()); + + // Python: torch.linspace(start, end, n + 1)[:-1] + // This creates n points. The k-th point (0-indexed) is start_t_val + k * (end_t_val - start_t_val) / n. + float dt_per_step; + if (n > 0) { // Avoid division by zero if n=0, though covered by earlier check + dt_per_step = (end_t_val - start_t_val) / static_cast(n); + } else { + dt_per_step = 0.0f; + } + + + for (uint32_t i = 0; i < n; ++i) { + float current_t = start_t_val + static_cast(i) * dt_per_step; + sigs.push_back(this->t_to_sigma(current_t)); + } + + sigs.push_back(0.0f); // Append the final zero sigma + return sigs; + + } else { // For all other schedules, use the existing virtual dispatch + LOG_DEBUG("Denoiser::get_sigmas - Using general schedule dispatch for %s", typeid(*schedule.get()).name()); + auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1); + return schedule->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma); + } } }; diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 5988a7423..e09516994 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -51,6 +51,7 @@ const char* schedule_str[] = { "exponential", "ays", "gits", + "sgm_uniform", }; const char* modes_str[] = { @@ -234,7 +235,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); printf(" -b, --batch-count COUNT number of images to generate\n"); - printf(" --schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)\n"); + printf(" --schedule {discrete, karras, exponential, ays, gits, sgm_uniform} Denoiser sigma schedule (default: discrete)\n"); printf(" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"); printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n"); printf(" --vae-tiling 
process vae in tiles to reduce memory usage\n"); @@ -537,14 +538,15 @@ void parse_args(int argc, const char** argv, SDParams& params) { } const char* schedule_selected = argv[i]; int schedule_found = -1; - for (int d = 0; d < N_SCHEDULES; d++) { + // N_SCHEDULES will be updated by the .h change, so this loop limit is fine + for (int d = 0; d < N_SCHEDULES; d++) { if (!strcmp(schedule_selected, schedule_str[d])) { schedule_found = d; } } if (schedule_found == -1) { - invalid_arg = true; - break; + fprintf(stderr, "error: invalid schedule %s, must be one of [discrete, karras, exponential, ays, gits, sgm_uniform]\n", schedule_selected); + exit(1); // Exit directly as invalid_arg only triggers at the end } params.schedule = (schedule_t)schedule_found; } else if (arg == "-s" || arg == "--seed") { diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index f1e6fb72c..589494740 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -580,6 +580,11 @@ class StableDiffusionGGML { denoiser->schedule = std::make_shared(); denoiser->schedule->version = version; break; + case SGM_UNIFORM: + LOG_INFO("Running with SGM Uniform schedule"); + denoiser->schedule = std::make_shared(); + denoiser->schedule->version = version; // version might not be used by SGMUniform but good to keep pattern + break; case DEFAULT: // Don't touch anything. break; diff --git a/stable-diffusion.h b/stable-diffusion.h index 89e6ade5b..d01674968 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -56,6 +56,7 @@ enum schedule_t { EXPONENTIAL, AYS, GITS, + SGM_UNIFORM, N_SCHEDULES }; From 6f81e6bc0ce691c9260f1752f7bd9c45b54a8273 Mon Sep 17 00:00:00 2001 From: rmatif Date: Wed, 7 May 2025 18:59:22 +0000 Subject: [PATCH 5/9] add simple schedule --- denoiser.hpp | 42 ++++++++++++++++++++++++++++++++++++++++++ examples/cli/main.cpp | 5 +++-- stable-diffusion.cpp | 5 +++++ stable-diffusion.h | 1 + 4 files changed, 51 insertions(+), 2 deletions(-) diff --git a/denoiser.hpp b/denoiser.hpp index bf00568a6..6c6021b9d 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -279,6 +279,48 @@ struct KarrasSchedule : SigmaSchedule { } }; +struct SimpleSchedule : SigmaSchedule { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { + std::vector result_sigmas; + + if (n == 0) { + return result_sigmas; // Return empty for n=0, consistent with DiscreteSchedule + } + + result_sigmas.reserve(n + 1); + + // TIMESTEPS is the length of the model's internal sigmas array, typically 1000. + // t_to_sigma(t) maps a timestep t (0 to TIMESTEPS-1) to its sigma value. + int model_sigmas_len = TIMESTEPS; + + // ss = len(s.sigmas) / steps in Python + float step_factor = static_cast(model_sigmas_len) / static_cast(n); + + for (uint32_t i = 0; i < n; ++i) { + // Python: s.sigmas[-(1 + int(x * ss))] + // x corresponds to i (0 to n-1) + // int(x * ss) in Python is static_cast(static_cast(i) * step_factor) + // The index -(1 + offset) means (model_sigmas_len - 1 - offset) from the start of a 0-indexed array. + int offset_from_start_of_py_array = static_cast(static_cast(i) * step_factor); + int timestep_index = model_sigmas_len - 1 - offset_from_start_of_py_array; + + // Ensure the index is within valid bounds [0, model_sigmas_len - 1] + if (timestep_index < 0) { + timestep_index = 0; + } + // No need for upper bound check like `timestep_index >= model_sigmas_len` because + // max offset is for i=n-1: int((n-1)/n * model_sigmas_len) which is < model_sigmas_len. 
+ // So, model_sigmas_len - 1 - max_offset is >= 0 if model_sigmas_len/n >= 1. + // If n > model_sigmas_len, then model_sigmas_len/n < 1, resulting in timestep_index potentially being <0, + // which is handled by the clamp above. + + result_sigmas.push_back(t_to_sigma(static_cast(timestep_index))); + } + result_sigmas.push_back(0.0f); // Append the final zero sigma + return result_sigmas; + } +}; + struct Denoiser { std::shared_ptr schedule = std::make_shared(); virtual float sigma_min() = 0; diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index e09516994..fc95c70b4 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -52,6 +52,7 @@ const char* schedule_str[] = { "ays", "gits", "sgm_uniform", + "simple", }; const char* modes_str[] = { @@ -235,7 +236,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); printf(" -b, --batch-count COUNT number of images to generate\n"); - printf(" --schedule {discrete, karras, exponential, ays, gits, sgm_uniform} Denoiser sigma schedule (default: discrete)\n"); + printf(" --schedule {discrete, karras, exponential, ays, gits, sgm_uniform, simple} Denoiser sigma schedule (default: discrete)\n"); printf(" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"); printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n"); printf(" --vae-tiling process vae in tiles to reduce memory usage\n"); @@ -545,7 +546,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { } } if (schedule_found == -1) { - fprintf(stderr, "error: invalid schedule %s, must be one of [discrete, karras, exponential, ays, gits, sgm_uniform]\n", schedule_selected); + fprintf(stderr, "error: invalid schedule %s, must be one of [discrete, karras, exponential, ays, gits, sgm_uniform, simple]\n", schedule_selected); exit(1); // Exit directly as invalid_arg only triggers at the end } params.schedule = (schedule_t)schedule_found; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 589494740..6f33b00b2 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -585,6 +585,11 @@ class StableDiffusionGGML { denoiser->schedule = std::make_shared(); denoiser->schedule->version = version; // version might not be used by SGMUniform but good to keep pattern break; + case SIMPLE: + LOG_INFO("Running with Simple schedule"); + denoiser->schedule = std::make_shared(); + denoiser->schedule->version = version; // version might not be used by Simple but good to keep pattern + break; case DEFAULT: // Don't touch anything. 
break; diff --git a/stable-diffusion.h b/stable-diffusion.h index d01674968..e29384d34 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -57,6 +57,7 @@ enum schedule_t { AYS, GITS, SGM_UNIFORM, + SIMPLE, N_SCHEDULES }; From c242dd4e621684306e90d18a3965cb2ca5da3f9b Mon Sep 17 00:00:00 2001 From: rmatif Date: Fri, 9 May 2025 11:11:22 +0000 Subject: [PATCH 6/9] remove debug comments --- denoiser.hpp | 38 ++++--------------- examples/cli/main.cpp | 15 ++++---- model.cpp | 7 +--- stable-diffusion.cpp | 85 ++++++++++++++----------------------------- 4 files changed, 43 insertions(+), 102 deletions(-) diff --git a/denoiser.hpp b/denoiser.hpp index 6c6021b9d..87405fbd4 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -237,21 +237,14 @@ struct GITSSchedule : SigmaSchedule { struct SGMUniformSchedule : SigmaSchedule { std::vector get_sigmas(uint32_t n, float sigma_min_in, float sigma_max_in, t_to_sigma_t t_to_sigma_func) override { - // This schedule's core logic is now handled directly in Denoiser::get_sigmas - // to ensure correct access to both sigma_to_t and t_to_sigma. - // This method is overridden to fulfill the virtual contract but ideally should not be - // the primary execution path for SGMUniform when called from Denoiser::get_sigmas. - // If it IS called, it means the Denoiser::get_sigmas logic wasn't triggered, which is unexpected. - LOG_WARN("SGMUniformSchedule::get_sigmas was called directly. This might indicate an issue with Denoiser dispatch."); - // Provide a default (potentially incorrect for SGMUniform's intent) or empty schedule to avoid crashes. - // For safety, returning a simple discrete-like schedule in t-space if this is ever hit. + std::vector result; if (n == 0) { result.push_back(0.0f); return result; } result.reserve(n + 1); - int t_max = TIMESTEPS -1; // A common max t value + int t_max = TIMESTEPS -1; float step = static_cast(t_max) / static_cast(n > 1 ? (n -1) : 1) ; for(uint32_t i=0; i result_sigmas; if (n == 0) { - return result_sigmas; // Return empty for n=0, consistent with DiscreteSchedule + return result_sigmas; } result_sigmas.reserve(n + 1); - // TIMESTEPS is the length of the model's internal sigmas array, typically 1000. - // t_to_sigma(t) maps a timestep t (0 to TIMESTEPS-1) to its sigma value. int model_sigmas_len = TIMESTEPS; - // ss = len(s.sigmas) / steps in Python float step_factor = static_cast(model_sigmas_len) / static_cast(n); for (uint32_t i = 0; i < n; ++i) { - // Python: s.sigmas[-(1 + int(x * ss))] - // x corresponds to i (0 to n-1) - // int(x * ss) in Python is static_cast(static_cast(i) * step_factor) - // The index -(1 + offset) means (model_sigmas_len - 1 - offset) from the start of a 0-indexed array. + int offset_from_start_of_py_array = static_cast(static_cast(i) * step_factor); int timestep_index = model_sigmas_len - 1 - offset_from_start_of_py_array; - // Ensure the index is within valid bounds [0, model_sigmas_len - 1] if (timestep_index < 0) { timestep_index = 0; } - // No need for upper bound check like `timestep_index >= model_sigmas_len` because - // max offset is for i=n-1: int((n-1)/n * model_sigmas_len) which is < model_sigmas_len. - // So, model_sigmas_len - 1 - max_offset is >= 0 if model_sigmas_len/n >= 1. - // If n > model_sigmas_len, then model_sigmas_len/n < 1, resulting in timestep_index potentially being <0, - // which is handled by the clamp above. 
result_sigmas.push_back(t_to_sigma(static_cast(timestep_index))); } - result_sigmas.push_back(0.0f); // Append the final zero sigma + result_sigmas.push_back(0.0f); return result_sigmas; } }; @@ -334,7 +315,6 @@ struct Denoiser { virtual std::vector get_sigmas(uint32_t n) { // Check if the current schedule is SGMUniformSchedule if (std::dynamic_pointer_cast(schedule)) { - LOG_DEBUG("Denoiser::get_sigmas - Using SGM_UNIFORM specific logic"); std::vector sigs; sigs.reserve(n + 1); @@ -347,26 +327,22 @@ struct Denoiser { float start_t_val = this->sigma_to_t(this->sigma_max()); float end_t_val = this->sigma_to_t(this->sigma_min()); - // Python: torch.linspace(start, end, n + 1)[:-1] - // This creates n points. The k-th point (0-indexed) is start_t_val + k * (end_t_val - start_t_val) / n. float dt_per_step; - if (n > 0) { // Avoid division by zero if n=0, though covered by earlier check + if (n > 0) { dt_per_step = (end_t_val - start_t_val) / static_cast(n); } else { dt_per_step = 0.0f; } - for (uint32_t i = 0; i < n; ++i) { float current_t = start_t_val + static_cast(i) * dt_per_step; sigs.push_back(this->t_to_sigma(current_t)); } - sigs.push_back(0.0f); // Append the final zero sigma + sigs.push_back(0.0f); return sigs; } else { // For all other schedules, use the existing virtual dispatch - LOG_DEBUG("Denoiser::get_sigmas - Using general schedule dispatch for %s", typeid(*schedule.get()).name()); auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1); return schedule->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma); } diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index fc95c70b4..064d8c949 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -128,10 +128,10 @@ struct SDParams { int upscale_repeats = 1; std::vector skip_layers = {7, 8, 9}; - float slg_scale = 0.f; // Removed duplicate line + float slg_scale = 0.f; float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; - int shifted_timestep = -1; // Keep the added parameter from previous step + int shifted_timestep = -1; }; void print_params(SDParams params) { @@ -248,7 +248,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color Colors the logging tags according to level\n"); - printf(" --timestep-shift N shift timestep for SDXL models (NitroFusion paper, default: -1 off, N between 1 and 1000)\n"); + printf(" --timestep-shift N shift timestep for NitroFusion models, default: -1 off, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant\n"); printf(" -v, --verbose print extra info\n"); } @@ -539,7 +539,6 @@ void parse_args(int argc, const char** argv, SDParams& params) { } const char* schedule_selected = argv[i]; int schedule_found = -1; - // N_SCHEDULES will be updated by the .h change, so this loop limit is fine for (int d = 0; d < N_SCHEDULES; d++) { if (!strcmp(schedule_selected, schedule_str[d])) { schedule_found = d; @@ -547,7 +546,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { } if (schedule_found == -1) { fprintf(stderr, "error: invalid schedule %s, must be one of [discrete, karras, exponential, ays, gits, sgm_uniform, simple]\n", schedule_selected); - exit(1); // Exit directly as invalid_arg only triggers at the end + exit(1); } params.schedule = (schedule_t)schedule_found; } else if (arg == "-s" || arg == "--seed") { @@ -635,7 +634,7 @@ void parse_args(int argc, const 
char** argv, SDParams& params) { break; } params.skip_layer_end = std::stof(argv[i]); - } else if (arg == "--timestep-shift") { // Added block + } else if (arg == "--timestep-shift") { if (++i >= argc) { invalid_arg = true; break; @@ -984,7 +983,7 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end, - params.shifted_timestep); // Passed parameter + params.shifted_timestep); } else { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, @@ -1054,7 +1053,7 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end, - params.shifted_timestep); // Passed parameter + params.shifted_timestep); } } diff --git a/model.cpp b/model.cpp index 0be264704..0d0c43fcb 100644 --- a/model.cpp +++ b/model.cpp @@ -185,7 +185,6 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "conditioner.embedders.1.")) { prefix = "cond_stage_model.1."; - // Corrected the substring length to match the prefix being checked new_name = new_name.substr(strlen("conditioner.embedders.1.")); } else if (starts_with(new_name, "cond_stage_model.")) { prefix = "cond_stage_model."; @@ -194,8 +193,6 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { prefix = new_name.substr(0, new_name.size() - strlen("vision_model.visual_projection.weight")); new_name = prefix + "visual_projection.weight"; return new_name; - // This specific case seems less common or might be handled implicitly later, - // but we keep the original logic for now. If issues arise, review if this mapping is needed. } else if (ends_with(new_name, "transformer.text_projection.weight")) { prefix = new_name.substr(0, new_name.size() - strlen("transformer.text_projection.weight")); new_name = prefix + "transformer.text_model.text_projection"; @@ -204,13 +201,11 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { return new_name; } - // Specific handling for text_projection variants before generic map lookup if (new_name == "model.text_projection.weight" || new_name == "model.text_projection") { new_name = "transformer.text_model.text_projection"; - } else if (open_clip_to_hf_clip_model.count(new_name)) { // Use .count() for safety + } else if (open_clip_to_hf_clip_model.count(new_name)) { new_name = open_clip_to_hf_clip_model[new_name]; } - // Note: The specific handling above takes precedence over the map for this tensor. std::string open_clip_resblock_prefix = "model.transformer.resblocks."; std::string hf_clip_resblock_prefix = "transformer.text_model.encoder.layers."; diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 6f33b00b2..ca71a8416 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -583,15 +583,14 @@ class StableDiffusionGGML { case SGM_UNIFORM: LOG_INFO("Running with SGM Uniform schedule"); denoiser->schedule = std::make_shared(); - denoiser->schedule->version = version; // version might not be used by SGMUniform but good to keep pattern + denoiser->schedule->version = version; break; case SIMPLE: LOG_INFO("Running with Simple schedule"); denoiser->schedule = std::make_shared(); - denoiser->schedule->version = version; // version might not be used by Simple but good to keep pattern + denoiser->schedule->version = version; break; case DEFAULT: - // Don't touch anything. 
break; default: LOG_ERROR("Unknown schedule %i", schedule); @@ -814,7 +813,7 @@ class StableDiffusionGGML { float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - int shifted_timestep = -1, // Added parameter + int shifted_timestep = -1, ggml_tensor* noise_mask = nullptr) { LOG_DEBUG("Sample"); if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) { @@ -861,26 +860,24 @@ class StableDiffusionGGML { } } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - - // Capture necessary variables for the denoising step lambda - auto denoise = [this, // Capture 'this' to access members like version, rng, n_threads etc. - &work_ctx, // Context for tensor creation - denoiser = this->denoiser, // Capture shared_ptr by value (copies the pointer) - steps, // Capture steps by value - &x, // Capture latent tensor by reference (modified in loop) - &sigmas, // Capture sigmas vector by reference (read-only needed) - &cond, &uncond, &id_cond, // Capture conditions by reference - control_hint, // Capture control hint pointer by value - control_strength, min_cfg, cfg_scale, guidance, eta, // Capture floats by value - start_merge_step, // Capture int by value - &skip_layers, slg_scale, skip_layer_start, skip_layer_end, // Capture skip layer params - shifted_timestep, // Capture shifted timestep by value - &noise_mask, // Capture noise mask pointer by reference/value - &noise, // Capture noise tensor by reference (modified in loop) - &init_latent, // Capture initial latent tensor by reference (read-only for mask) - &denoised, // Capture output denoised tensor by reference - &noised_input, &out_cond, &out_uncond, &out_skip, // Capture intermediate tensors by reference - has_unconditioned, has_skiplayer // Capture bools by value + auto denoise = [this, + &work_ctx, + denoiser = this->denoiser, + steps, + &x, + &sigmas, + &cond, &uncond, &id_cond, + control_hint, + control_strength, min_cfg, cfg_scale, guidance, eta, + start_merge_step, + &skip_layers, slg_scale, skip_layer_start, skip_layer_end, + shifted_timestep, + &noise_mask, + &noise, + &init_latent, + &denoised, + &noised_input, &out_cond, &out_uncond, &out_skip, + has_unconditioned, has_skiplayer ] (ggml_tensor* input, float sigma, int step) -> ggml_tensor* { if (step == 1) { @@ -896,31 +893,23 @@ class StableDiffusionGGML { float t = denoiser->sigma_to_t(sigma); std::vector timesteps_vec; - // --- Timestep Shifting Logic --- + if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { - // Calculate the shifted timestep value based on the original t - // Python: (original_index * (shifted_timestep / total_timesteps)).long() - // Assuming TIMESTEPS (1000) is the total_timesteps for SDXL + float shifted_t_float = t * (float(shifted_timestep) / float(TIMESTEPS)); - // Clamp and round to nearest integer timestep index int64_t shifted_t = static_cast(roundf(shifted_t_float)); shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t)); LOG_DEBUG("Shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma); timesteps_vec.assign(x->ne[3], (float)shifted_t); } else { - // Use original timestep if shifting is disabled or model is not SDXL timesteps_vec.assign(x->ne[3], t); } - // --- End Timestep Shifting Logic --- auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); std::vector guidance_vec(x->ne[3], guidance); auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); copy_ggml_tensor(noised_input, input); - // noised_input = noised_input * c_in - // NOTE: c_in is 
calculated based on the *original* sigma, which seems - // consistent with how the Python code uses xc derived from original sigma. ggml_tensor_scale(noised_input, c_in); std::vector controls; @@ -928,8 +917,6 @@ class StableDiffusionGGML { if (control_hint != NULL) { control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector); controls = control_net->controls; - // print_ggml_tensor(controls[12]); - // GGML_ASSERT(0); } if (start_merge_step == -1 || step <= start_merge_step) { @@ -1008,10 +995,7 @@ class StableDiffusionGGML { float* skip_layer_data_ptr = is_skiplayer_step ? (float*)out_skip->data : nullptr; // Get pointer if needed int ne_elements = (int)ggml_nelements(denoised); - // --- Select Calculation Path based on Timestep Shifting --- if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { - // --- Shifted Timestep Final Calculation --- - // Retrieve the integer shifted timestep calculated earlier // Assuming shifted_t is the float representation of the index int64_t shifted_t_idx = static_cast(roundf(timesteps_vec[0])); // Get the index back @@ -1021,7 +1005,6 @@ class StableDiffusionGGML { float shifted_c_skip = shifted_scaling[0]; float shifted_c_out = shifted_scaling[1]; // shifted_c_in is scaling[2] if needed, but we adjust input instead - // Need sigma_data from the denoiser (assuming CompVis type for SDXL) auto compvis_denoiser_ptr = std::dynamic_pointer_cast(denoiser); float sigma_data = compvis_denoiser_ptr ? compvis_denoiser_ptr->sigma_data : 1.0f; // Default needed? SDXL uses CompVis. @@ -1034,33 +1017,24 @@ class StableDiffusionGGML { // Equivalent to Python: sqrt(denoised_sigma^2 + sigma_data^2) / sqrt(sigma^2 + sigma_data^2) float input_scale_factor = sqrtf((shifted_sigma_sq + sigma_data_sq) / (sigma_sq + sigma_data_sq)); - LOG_DEBUG("Shifted calc [Step %d]: sigma=%.4f, shifted_t_idx=%ld, shifted_sigma=%.4f, input_scale=%.4f, shifted_c_skip=%.4f, shifted_c_out=%.4f", - step, sigma, shifted_t_idx, shifted_sigma, input_scale_factor, shifted_c_skip, shifted_c_out); - for (int i = 0; i < ne_elements; i++) { // CFG and SLG apply to the raw model output *before* the final scaling float model_output_result = positive_data[i]; // Start with positive prediction if (has_unconditioned) { // Apply CFG scale: uncond + cfg_scale * (cond - uncond) model_output_result = negative_data_ptr[i] + cfg_scale * (positive_data[i] - negative_data_ptr[i]); - // TODO: Add min_cfg logic if necessary } if (is_skiplayer_step) { // Apply SLG: result + slg_scale * (cond - skip) model_output_result = model_output_result + slg_scale * (positive_data[i] - skip_layer_data_ptr[i]); } - // Recalculate input term based on Python logic: x_recalc = x * input_scale_factor float adjusted_input = vec_input[i] * input_scale_factor; - // Final calculation using shifted sigma scales and adjusted input - // Equivalent to Python: calculate_denoised(shifted_sigma, model_output_result, adjusted_input) - // denoised = adjusted_input * shifted_c_skip + model_output_result * shifted_c_out; vec_denoised[i] = adjusted_input * shifted_c_skip + model_output_result * shifted_c_out; } } else { - // --- Original Final Calculation --- LOG_DEBUG("Original calc [Step %d]: sigma=%.4f, c_skip=%.4f, c_out=%.4f", step, sigma, c_skip, c_out); for (int i = 0; i < ne_elements; i++) { // CFG and SLG apply to the raw model output *before* the final scaling @@ -1068,18 +1042,15 @@ class StableDiffusionGGML { if (has_unconditioned) { // Apply CFG scale: uncond + cfg_scale * (cond - uncond) 
model_output_result = negative_data_ptr[i] + cfg_scale * (positive_data[i] - negative_data_ptr[i]); - // TODO: Add min_cfg logic if necessary } if (is_skiplayer_step) { // Apply SLG: result + slg_scale * (cond - skip) model_output_result = model_output_result + slg_scale * (positive_data[i] - skip_layer_data_ptr[i]); } - // Original calculation: denoised = input * c_skip + model_output * c_out; vec_denoised[i] = vec_input[i] * c_skip + model_output_result * c_out; } } - // --- End Calculation Path Selection --- int64_t t1 = ggml_time_us(); if (step > 0) { @@ -1103,7 +1074,7 @@ class StableDiffusionGGML { }; sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta); - // No changes needed for inverse_noise_scaling as it depends on the final sigma, not the intermediate timesteps used. + x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); if (control_net) { @@ -1324,7 +1295,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - int shifted_timestep = -1, // Added parameter + int shifted_timestep = -1, ggml_tensor* masked_image = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. @@ -1657,7 +1628,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - int shifted_timestep = -1) { // Added parameter to definition + int shifted_timestep = -1) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1771,7 +1742,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - int shifted_timestep = -1) { // Added parameter + int shifted_timestep = -1) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1918,7 +1889,7 @@ LOG_DEBUG("img2img %dx%d", width, height); slg_scale, skip_layer_start, skip_layer_end, - shifted_timestep, // Passed parameter + shifted_timestep, masked_image); size_t t2 = ggml_time_ms(); From 4e85dfb0c60166d58e16dbf3d750f5ea7776ba5e Mon Sep 17 00:00:00 2001 From: rmatif Date: Fri, 9 May 2025 11:18:40 +0000 Subject: [PATCH 7/9] restore original comments --- stable-diffusion.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ca71a8416..201881a8a 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -591,6 +591,7 @@ class StableDiffusionGGML { denoiser->schedule->version = version; break; case DEFAULT: + // Don't touch anything. 
break; default: LOG_ERROR("Unknown schedule %i", schedule); @@ -910,6 +911,7 @@ class StableDiffusionGGML { auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); copy_ggml_tensor(noised_input, input); + // noised_input = noised_input * c_in ggml_tensor_scale(noised_input, c_in); std::vector controls; @@ -917,6 +919,8 @@ class StableDiffusionGGML { if (control_hint != NULL) { control_net->compute(n_threads, noised_input, control_hint, timesteps, cond.c_crossattn, cond.c_vector); controls = control_net->controls; + // print_ggml_tensor(controls[12]); + // GGML_ASSERT(0); } if (start_merge_step == -1 || step <= start_merge_step) { @@ -1553,7 +1557,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - shifted_timestep, // Passed parameter + shifted_timestep, noise_mask); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); From eca2089d6f3634c940b9bf235540a952c039d3af Mon Sep 17 00:00:00 2001 From: rmatif Date: Fri, 9 May 2025 11:32:54 +0000 Subject: [PATCH 8/9] update readme --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 553fb7f8f..d824ee705 100644 --- a/README.md +++ b/README.md @@ -256,7 +256,7 @@ arguments: --rng {std_default, cuda} RNG (default: cuda) -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0) -b, --batch-count COUNT number of images to generate - --schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete) + --schedule {discrete, karras, exponential, ays, gits, sgm_uniform, simple} Denoiser sigma schedule (default: discrete) --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1) <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x --vae-tiling process vae in tiles to reduce memory usage @@ -268,6 +268,7 @@ arguments: --control-net-cpu keep controlnet in cpu (for low vram) --canny apply canny preprocessor (edge detection) --color Colors the logging tags according to level + --timestep-shift N shift timestep for NitroFusion models, default: -1 off, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant -v, --verbose print extra info ``` From e907b858a12dfe04499365eaf2f0d0f65e409973 Mon Sep 17 00:00:00 2001 From: rmatif Date: Fri, 9 May 2025 16:08:53 +0000 Subject: [PATCH 9/9] remove debug log --- stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 201881a8a..81e9bd0e1 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1039,7 +1039,7 @@ class StableDiffusionGGML { } } else { - LOG_DEBUG("Original calc [Step %d]: sigma=%.4f, c_skip=%.4f, c_out=%.4f", step, sigma, c_skip, c_out); + for (int i = 0; i < ne_elements; i++) { // CFG and SLG apply to the raw model output *before* the final scaling float model_output_result = positive_data[i]; // Start with positive prediction
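
Illustrative sketch (not part of the patches above): the shifted-timestep step added in PATCH 3/9 reduces to a small amount of per-element math -- query the model at t_shifted = round(t * N / TIMESTEPS), rescale the latent to the noise level of the shifted sigma, then recombine using the scalings of that shifted sigma. In the sketch below, kTimesteps, kSigmaData, the toy t_to_sigma() table and the EDM-style get_scalings() are stand-ins chosen for illustration; the real code uses the denoiser's own tables and c_skip/c_out.

// Standalone sketch of the shifted-timestep denoising arithmetic (assumptions noted above).
#include <algorithm>
#include <cmath>
#include <cstdio>

static const int   kTimesteps = 1000; // assumed TIMESTEPS
static const float kSigmaData = 1.0f; // assumed sigma_data (the patch falls back to 1.0f)

// Toy log-linear sigma schedule standing in for denoiser->t_to_sigma().
static float t_to_sigma(float t) {
    const float sigma_min = 0.0292f, sigma_max = 14.61f; // illustrative bounds
    float u = t / (float)(kTimesteps - 1);
    return sigma_min * std::pow(sigma_max / sigma_min, u);
}

// Illustrative EDM-style scalings; the real code takes c_skip/c_out from denoiser->get_scalings().
static void get_scalings(float sigma, float* c_skip, float* c_out) {
    float sd2 = kSigmaData * kSigmaData;
    *c_skip = sd2 / (sigma * sigma + sd2);
    *c_out  = -sigma * kSigmaData / std::sqrt(sigma * sigma + sd2);
}

// One shifted-timestep denoising step for a single latent element x with raw model output model_out.
static float denoise_shifted(float x, float model_out, float t, float sigma, int shift_n) {
    // t_shifted = round(t * N / TIMESTEPS), clamped to [0, TIMESTEPS - 1]
    float t_shifted = std::round(t * (float)shift_n / (float)kTimesteps);
    t_shifted = std::max(0.0f, std::min((float)(kTimesteps - 1), t_shifted));

    float shifted_sigma = t_to_sigma(t_shifted);
    float c_skip, c_out;
    get_scalings(shifted_sigma, &c_skip, &c_out);

    // Rescale the input to the shifted noise level:
    // sqrt(shifted_sigma^2 + sigma_data^2) / sqrt(sigma^2 + sigma_data^2)
    float sd2   = kSigmaData * kSigmaData;
    float scale = std::sqrt((shifted_sigma * shifted_sigma + sd2) / (sigma * sigma + sd2));

    // denoised = adjusted_input * shifted_c_skip + model_output * shifted_c_out
    return x * scale * c_skip + model_out * c_out;
}

int main() {
    float t     = 999.0f;          // first (and only) step of a 1-step sample
    float sigma = t_to_sigma(t);
    float d     = denoise_shifted(0.8f, -0.1f, t, sigma, 250); // N = 250, the value suggested for NitroSD-Realism
    std::printf("denoised element: %f\n", d);
    return 0;
}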