diff --git a/.gitignore b/.gitignore index 38fe570d..7986ce6b 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ test/ *.bin *.exe *.gguf +*.pdf output*.png models* -*.log \ No newline at end of file +*.log diff --git a/README.md b/README.md index 553fb7f8..d824ee70 100644 --- a/README.md +++ b/README.md @@ -256,7 +256,7 @@ arguments: --rng {std_default, cuda} RNG (default: cuda) -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0) -b, --batch-count COUNT number of images to generate - --schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete) + --schedule {discrete, karras, exponential, ays, gits, sgm_uniform, simple} Denoiser sigma schedule (default: discrete) --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1) <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x --vae-tiling process vae in tiles to reduce memory usage @@ -268,6 +268,7 @@ arguments: --control-net-cpu keep controlnet in cpu (for low vram) --canny apply canny preprocessor (edge detection) --color Colors the logging tags according to level + --timestep-shift N shift timestep for NitroFusion models, default: -1 off, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant -v, --verbose print extra info ``` diff --git a/denoiser.hpp b/denoiser.hpp index 66799109..87405fbd 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -235,6 +235,24 @@ struct GITSSchedule : SigmaSchedule { } }; +struct SGMUniformSchedule : SigmaSchedule { + std::vector get_sigmas(uint32_t n, float sigma_min_in, float sigma_max_in, t_to_sigma_t t_to_sigma_func) override { + + std::vector result; + if (n == 0) { + result.push_back(0.0f); + return result; + } + result.reserve(n + 1); + int t_max = TIMESTEPS -1; + float step = static_cast(t_max) / static_cast(n > 1 ? (n -1) : 1) ; + for(uint32_t i=0; i get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { // These *COULD* be function arguments here, @@ -254,6 +272,36 @@ struct KarrasSchedule : SigmaSchedule { } }; +struct SimpleSchedule : SigmaSchedule { + std::vector get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { + std::vector result_sigmas; + + if (n == 0) { + return result_sigmas; + } + + result_sigmas.reserve(n + 1); + + int model_sigmas_len = TIMESTEPS; + + float step_factor = static_cast(model_sigmas_len) / static_cast(n); + + for (uint32_t i = 0; i < n; ++i) { + + int offset_from_start_of_py_array = static_cast(static_cast(i) * step_factor); + int timestep_index = model_sigmas_len - 1 - offset_from_start_of_py_array; + + if (timestep_index < 0) { + timestep_index = 0; + } + + result_sigmas.push_back(t_to_sigma(static_cast(timestep_index))); + } + result_sigmas.push_back(0.0f); + return result_sigmas; + } +}; + struct Denoiser { std::shared_ptr schedule = std::make_shared(); virtual float sigma_min() = 0; @@ -265,8 +313,39 @@ struct Denoiser { virtual ggml_tensor* inverse_noise_scaling(float sigma, ggml_tensor* latent) = 0; virtual std::vector get_sigmas(uint32_t n) { - auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1); - return schedule->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma); + // Check if the current schedule is SGMUniformSchedule + if (std::dynamic_pointer_cast(schedule)) { + std::vector sigs; + sigs.reserve(n + 1); + + if (n == 0) { + sigs.push_back(0.0f); + return sigs; + } + + // Use the Denoiser's own sigma_to_t and t_to_sigma methods + float start_t_val = this->sigma_to_t(this->sigma_max()); + float end_t_val = this->sigma_to_t(this->sigma_min()); + + float dt_per_step; + if (n > 0) { + dt_per_step = (end_t_val - start_t_val) / static_cast(n); + } else { + dt_per_step = 0.0f; + } + + for (uint32_t i = 0; i < n; ++i) { + float current_t = start_t_val + static_cast(i) * dt_per_step; + sigs.push_back(this->t_to_sigma(current_t)); + } + + sigs.push_back(0.0f); + return sigs; + + } else { // For all other schedules, use the existing virtual dispatch + auto bound_t_to_sigma = std::bind(&Denoiser::t_to_sigma, this, std::placeholders::_1); + return schedule->get_sigmas(n, sigma_min(), sigma_max(), bound_t_to_sigma); + } } }; diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index af6b2bbd..064d8c94 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -51,6 +51,8 @@ const char* schedule_str[] = { "exponential", "ays", "gits", + "sgm_uniform", + "simple", }; const char* modes_str[] = { @@ -129,6 +131,7 @@ struct SDParams { float slg_scale = 0.f; float skip_layer_start = 0.01f; float skip_layer_end = 0.2f; + int shifted_timestep = -1; }; void print_params(SDParams params) { @@ -178,6 +181,7 @@ void print_params(SDParams params) { printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false"); printf(" upscale_repeats: %d\n", params.upscale_repeats); + printf(" timestep_shift: %d\n", params.shifted_timestep); } void print_usage(int argc, const char* argv[]) { @@ -232,7 +236,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); printf(" -b, --batch-count COUNT number of images to generate\n"); - printf(" --schedule {discrete, karras, exponential, ays, gits} Denoiser sigma schedule (default: discrete)\n"); + printf(" --schedule {discrete, karras, exponential, ays, gits, sgm_uniform, simple} Denoiser sigma schedule (default: discrete)\n"); printf(" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"); printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n"); printf(" --vae-tiling process vae in tiles to reduce memory usage\n"); @@ -244,6 +248,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); printf(" --color Colors the logging tags according to level\n"); + printf(" --timestep-shift N shift timestep for NitroFusion models, default: -1 off, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant\n"); printf(" -v, --verbose print extra info\n"); } @@ -534,14 +539,14 @@ void parse_args(int argc, const char** argv, SDParams& params) { } const char* schedule_selected = argv[i]; int schedule_found = -1; - for (int d = 0; d < N_SCHEDULES; d++) { + for (int d = 0; d < N_SCHEDULES; d++) { if (!strcmp(schedule_selected, schedule_str[d])) { schedule_found = d; } } if (schedule_found == -1) { - invalid_arg = true; - break; + fprintf(stderr, "error: invalid schedule %s, must be one of [discrete, karras, exponential, ays, gits, sgm_uniform, simple]\n", schedule_selected); + exit(1); } params.schedule = (schedule_t)schedule_found; } else if (arg == "-s" || arg == "--seed") { @@ -629,6 +634,16 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.skip_layer_end = std::stof(argv[i]); + } else if (arg == "--timestep-shift") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.shifted_timestep = std::stoi(argv[i]); + if (params.shifted_timestep != -1 && (params.shifted_timestep < 1 || params.shifted_timestep > 1000)) { + fprintf(stderr, "error: timestep-shift must be between 1 and 1000, or -1 to disable\n"); + exit(1); + } } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv); @@ -967,10 +982,11 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end); - } else { - sd_image_t input_image = {(uint32_t)params.width, - (uint32_t)params.height, + params.skip_layer_end, + params.shifted_timestep); + } else { + sd_image_t input_image = {(uint32_t)params.width, + (uint32_t)params.height, 3, input_image_buffer}; @@ -1036,9 +1052,10 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end); - } - } + params.skip_layer_end, + params.shifted_timestep); + } + } if (results == NULL) { printf("generate failed\n"); diff --git a/model.cpp b/model.cpp index 24da39f6..0d0c43fc 100644 --- a/model.cpp +++ b/model.cpp @@ -185,7 +185,7 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { new_name = new_name.substr(strlen("conditioner.embedders.0.")); } else if (starts_with(new_name, "conditioner.embedders.1.")) { prefix = "cond_stage_model.1."; - new_name = new_name.substr(strlen("conditioner.embedders.0.")); + new_name = new_name.substr(strlen("conditioner.embedders.1.")); } else if (starts_with(new_name, "cond_stage_model.")) { prefix = "cond_stage_model."; new_name = new_name.substr(strlen("cond_stage_model.")); @@ -201,7 +201,9 @@ std::string convert_open_clip_to_hf_clip(const std::string& name) { return new_name; } - if (open_clip_to_hf_clip_model.find(new_name) != open_clip_to_hf_clip_model.end()) { + if (new_name == "model.text_projection.weight" || new_name == "model.text_projection") { + new_name = "transformer.text_model.text_projection"; + } else if (open_clip_to_hf_clip_model.count(new_name)) { new_name = open_clip_to_hf_clip_model[new_name]; } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index e38a6101..81e9bd0e 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -580,6 +580,16 @@ class StableDiffusionGGML { denoiser->schedule = std::make_shared(); denoiser->schedule->version = version; break; + case SGM_UNIFORM: + LOG_INFO("Running with SGM Uniform schedule"); + denoiser->schedule = std::make_shared(); + denoiser->schedule->version = version; + break; + case SIMPLE: + LOG_INFO("Running with Simple schedule"); + denoiser->schedule = std::make_shared(); + denoiser->schedule->version = version; + break; case DEFAULT: // Don't touch anything. break; @@ -804,8 +814,13 @@ class StableDiffusionGGML { float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, + int shifted_timestep = -1, ggml_tensor* noise_mask = nullptr) { LOG_DEBUG("Sample"); + if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) { + LOG_WARN("Timestep shifting is only supported for SDXL models. Ignoring --timestep-shift."); + shifted_timestep = -1; + } struct ggml_init_params params; size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); for (int i = 1; i < 4; i++) { @@ -846,8 +861,26 @@ class StableDiffusionGGML { } } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); - - auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { + auto denoise = [this, + &work_ctx, + denoiser = this->denoiser, + steps, + &x, + &sigmas, + &cond, &uncond, &id_cond, + control_hint, + control_strength, min_cfg, cfg_scale, guidance, eta, + start_merge_step, + &skip_layers, slg_scale, skip_layer_start, skip_layer_end, + shifted_timestep, + &noise_mask, + &noise, + &init_latent, + &denoised, + &noised_input, &out_cond, &out_uncond, &out_skip, + has_unconditioned, has_skiplayer + ] + (ggml_tensor* input, float sigma, int step) -> ggml_tensor* { if (step == 1) { pretty_progress(0, (int)steps, 0); } @@ -860,7 +893,19 @@ class StableDiffusionGGML { float c_in = scaling[2]; float t = denoiser->sigma_to_t(sigma); - std::vector timesteps_vec(x->ne[3], t); // [N, ] + std::vector timesteps_vec; + + if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { + + float shifted_t_float = t * (float(shifted_timestep) / float(TIMESTEPS)); + int64_t shifted_t = static_cast(roundf(shifted_t_float)); + shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t)); + LOG_DEBUG("Shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma); + timesteps_vec.assign(x->ne[3], (float)shifted_t); + } else { + timesteps_vec.assign(x->ne[3], t); + } + auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); std::vector guidance_vec(x->ne[3], guidance); auto guidance_tensor = vector_to_ggml_tensor(work_ctx, guidance_vec); @@ -950,26 +995,67 @@ class StableDiffusionGGML { float* vec_denoised = (float*)denoised->data; float* vec_input = (float*)input->data; float* positive_data = (float*)out_cond->data; + float* negative_data_ptr = has_unconditioned ? (float*)out_uncond->data : nullptr; // Get pointer if needed + float* skip_layer_data_ptr = is_skiplayer_step ? (float*)out_skip->data : nullptr; // Get pointer if needed int ne_elements = (int)ggml_nelements(denoised); - for (int i = 0; i < ne_elements; i++) { - float latent_result = positive_data[i]; - if (has_unconditioned) { - // out_uncond + cfg_scale * (out_cond - out_uncond) - int64_t ne3 = out_cond->ne[3]; - if (min_cfg != cfg_scale && ne3 != 1) { - int64_t i3 = i / out_cond->ne[0] * out_cond->ne[1] * out_cond->ne[2]; - float scale = min_cfg + (cfg_scale - min_cfg) * (i3 * 1.0f / ne3); - } else { - latent_result = negative_data[i] + cfg_scale * (positive_data[i] - negative_data[i]); + + if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { + // Retrieve the integer shifted timestep calculated earlier + // Assuming shifted_t is the float representation of the index + int64_t shifted_t_idx = static_cast(roundf(timesteps_vec[0])); // Get the index back + + float shifted_sigma = denoiser->t_to_sigma((float)shifted_t_idx); + std::vector shifted_scaling = denoiser->get_scalings(shifted_sigma); + float shifted_c_skip = shifted_scaling[0]; + float shifted_c_out = shifted_scaling[1]; + // shifted_c_in is scaling[2] if needed, but we adjust input instead + // Need sigma_data from the denoiser (assuming CompVis type for SDXL) + auto compvis_denoiser_ptr = std::dynamic_pointer_cast(denoiser); + float sigma_data = compvis_denoiser_ptr ? compvis_denoiser_ptr->sigma_data : 1.0f; // Default needed? SDXL uses CompVis. + + float sigma_sq = sigma * sigma; // Original sigma for this step + float shifted_sigma_sq = shifted_sigma * shifted_sigma; + float sigma_data_sq = sigma_data * sigma_data; + + // Calculate the scaling factor needed to adjust the input `x` (vec_input) + // Equivalent to Python: sqrt(denoised_sigma^2 + sigma_data^2) / sqrt(sigma^2 + sigma_data^2) + float input_scale_factor = sqrtf((shifted_sigma_sq + sigma_data_sq) / (sigma_sq + sigma_data_sq)); + + for (int i = 0; i < ne_elements; i++) { + // CFG and SLG apply to the raw model output *before* the final scaling + float model_output_result = positive_data[i]; // Start with positive prediction + if (has_unconditioned) { + // Apply CFG scale: uncond + cfg_scale * (cond - uncond) + model_output_result = negative_data_ptr[i] + cfg_scale * (positive_data[i] - negative_data_ptr[i]); } + if (is_skiplayer_step) { + // Apply SLG: result + slg_scale * (cond - skip) + model_output_result = model_output_result + slg_scale * (positive_data[i] - skip_layer_data_ptr[i]); + } + + float adjusted_input = vec_input[i] * input_scale_factor; + + vec_denoised[i] = adjusted_input * shifted_c_skip + model_output_result * shifted_c_out; } - if (is_skiplayer_step) { - latent_result = latent_result + (positive_data[i] - skip_layer_data[i]) * slg_scale; - } - // v = latent_result, eps = latent_result - // denoised = (v * c_out + input * c_skip) or (input + eps * c_out) - vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; + + } else { + + for (int i = 0; i < ne_elements; i++) { + // CFG and SLG apply to the raw model output *before* the final scaling + float model_output_result = positive_data[i]; // Start with positive prediction + if (has_unconditioned) { + // Apply CFG scale: uncond + cfg_scale * (cond - uncond) + model_output_result = negative_data_ptr[i] + cfg_scale * (positive_data[i] - negative_data_ptr[i]); + } + if (is_skiplayer_step) { + // Apply SLG: result + slg_scale * (cond - skip) + model_output_result = model_output_result + slg_scale * (positive_data[i] - skip_layer_data_ptr[i]); + } + + vec_denoised[i] = vec_input[i] * c_skip + model_output_result * c_out; + } } + int64_t t1 = ggml_time_us(); if (step > 0) { pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); @@ -992,7 +1078,7 @@ class StableDiffusionGGML { }; sample_k_diffusion(method, denoise, work_ctx, x, sigmas, rng, eta); - + x = denoiser->inverse_noise_scaling(sigmas[sigmas.size() - 1], x); if (control_net) { @@ -1213,6 +1299,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, + int shifted_timestep = -1, ggml_tensor* masked_image = NULL) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. @@ -1470,6 +1557,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, + shifted_timestep, noise_mask); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); @@ -1543,7 +1631,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, size_t skip_layers_count = 0, float slg_scale = 0, float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { + float skip_layer_end = 0.2, + int shifted_timestep = -1) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1621,7 +1710,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, skip_layers_vec, slg_scale, skip_layer_start, - skip_layer_end); + skip_layer_end, + shifted_timestep); // Passed parameter size_t t1 = ggml_time_ms(); @@ -1655,9 +1745,10 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, size_t skip_layers_count = 0, float slg_scale = 0, float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { - std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); - LOG_DEBUG("img2img %dx%d", width, height); + float skip_layer_end = 0.2, + int shifted_timestep = -1) { +std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); +LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { return NULL; } @@ -1802,6 +1893,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, + shifted_timestep, masked_image); size_t t2 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index 52dcc848..e29384d3 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -56,6 +56,8 @@ enum schedule_t { EXPONENTIAL, AYS, GITS, + SGM_UNIFORM, + SIMPLE, N_SCHEDULES }; @@ -176,7 +178,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, size_t skip_layers_count, float slg_scale, float skip_layer_start, - float skip_layer_end); + float skip_layer_end, + int shifted_timestep); SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, @@ -203,7 +206,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, size_t skip_layers_count, float slg_scale, float skip_layer_start, - float skip_layer_end); + float skip_layer_end, + int shifted_timestep); SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, sd_image_t init_image,