diff --git a/README.md b/README.md index 451388aa..41c7ba68 100644 --- a/README.md +++ b/README.md @@ -326,9 +326,10 @@ arguments: --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9]) --skip-layer-start START SLG enabling point: (default: 0.01) --skip-layer-end END SLG disabling point: (default: 0.2) - --scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete) + --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete) --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd} sampling method (default: "euler" for Flux/SD3/Wan, "euler_a" otherwise) + --timestep-shift N shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant --steps STEPS number of sample steps (default: 20) --high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0) --high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale) @@ -339,7 +340,7 @@ arguments: --high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9]) --high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01) --high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2) - --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete) + --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete) --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd} (high noise) sampling method (default: "euler_a") --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto) 
@@ -352,7 +353,7 @@ arguments: --rng {std_default, cuda} RNG (default: cuda) -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0) -b, --batch-count COUNT number of images to generate - --clip-skip N ignore last_dot_pos layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1) + --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1) <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x --vae-tiling process vae in tiles to reduce memory usage --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32) diff --git a/denoiser.hpp b/denoiser.hpp index 20d5f726..e64b292d 100644 --- a/denoiser.hpp +++ b/denoiser.hpp @@ -232,6 +232,25 @@ struct GITSSchedule : SigmaSchedule { } }; +struct SGMUniformSchedule : SigmaSchedule { + std::vector<float> get_sigmas(uint32_t n, float sigma_min_in, float sigma_max_in, t_to_sigma_t t_to_sigma_func) override { + std::vector<float> result; + if (n == 0) { + result.push_back(0.0f); + return result; + } + result.reserve(n + 1); + int t_max = TIMESTEPS - 1; + int t_min = 0; + std::vector<float> timesteps = linear_space(static_cast<float>(t_max), static_cast<float>(t_min), n + 1); + for (int i = 0; i < n; i++) { + result.push_back(t_to_sigma_func(timesteps[i])); + } + result.push_back(0.0f); + return result; + } +}; + struct KarrasSchedule : SigmaSchedule { std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) { // These *COULD* be function arguments here, @@ -251,6 +270,35 @@ struct KarrasSchedule : SigmaSchedule { } }; +struct SimpleSchedule : SigmaSchedule { + std::vector<float> get_sigmas(uint32_t n, float sigma_min, float sigma_max, t_to_sigma_t t_to_sigma) override { + std::vector<float> result_sigmas; + + if (n == 0) { + return result_sigmas; + } + + result_sigmas.reserve(n + 1); + + int model_sigmas_len = TIMESTEPS; + + float step_factor = static_cast<float>(model_sigmas_len) / static_cast<float>(n); + + for (uint32_t i = 0; i < n; ++i) { + int 
offset_from_start_of_py_array = static_cast<int>(static_cast<float>(i) * step_factor); + int timestep_index = model_sigmas_len - 1 - offset_from_start_of_py_array; + + if (timestep_index < 0) { + timestep_index = 0; + } + + result_sigmas.push_back(t_to_sigma(static_cast<float>(timestep_index))); + } + result_sigmas.push_back(0.0f); + return result_sigmas; + } +}; + // Close to Beta Schedule, but increadably simple in code. struct SmoothStepSchedule : SigmaSchedule { static constexpr float smoothstep(float x) { diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 0ba3acb7..274a25a1 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -248,9 +248,10 @@ void print_usage(int argc, const char* argv[]) { printf(" --skip-layers LAYERS Layers to skip for SLG steps: (default: [7,8,9])\n"); printf(" --skip-layer-start START SLG enabling point: (default: 0.01)\n"); printf(" --skip-layer-end END SLG disabling point: (default: 0.2)\n"); - printf(" --scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete)\n"); + printf(" --scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)\n"); printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n"); printf(" sampling method (default: \"euler\" for Flux/SD3/Wan, \"euler_a\" otherwise)\n"); + printf(" --timestep-shift N shift timestep for NitroFusion models, default: 0, recommended N for NitroSD-Realism around 250 and 500 for NitroSD-Vibrant\n"); printf(" --steps STEPS number of sample steps (default: 20)\n"); printf(" --high-noise-cfg-scale SCALE (high noise) unconditional guidance scale: (default: 7.0)\n"); printf(" --high-noise-img-cfg-scale SCALE (high noise) image guidance scale for inpaint or instruct-pix2pix models: (default: same as --cfg-scale)\n"); @@ -261,7 +262,7 @@ void print_usage(int argc, const char* argv[]) { printf(" 
--high-noise-skip-layers LAYERS (high noise) Layers to skip for SLG steps: (default: [7,8,9])\n"); printf(" --high-noise-skip-layer-start (high noise) SLG enabling point: (default: 0.01)\n"); printf(" --high-noise-skip-layer-end END (high noise) SLG disabling point: (default: 0.2)\n"); - printf(" --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep} Denoiser sigma scheduler (default: discrete)\n"); + printf(" --high-noise-scheduler {discrete, karras, exponential, ays, gits, smoothstep, sgm_uniform, simple} Denoiser sigma scheduler (default: discrete)\n"); printf(" --high-noise-sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing, tcd}\n"); printf(" (high noise) sampling method (default: \"euler_a\")\n"); printf(" --high-noise-steps STEPS (high noise) number of sample steps (default: -1 = auto)\n"); @@ -274,7 +275,7 @@ void print_usage(int argc, const char* argv[]) { printf(" --rng {std_default, cuda} RNG (default: cuda)\n"); printf(" -s SEED, --seed SEED RNG seed (default: 42, use random seed for < 0)\n"); printf(" -b, --batch-count COUNT number of images to generate\n"); - printf(" --clip-skip N ignore last_dot_pos layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"); + printf(" --clip-skip N ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n"); printf(" <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n"); printf(" --vae-tiling process vae in tiles to reduce memory usage\n"); printf(" --vae-tile-size [X]x[Y] tile size for vae tiling (default: 32x32)\n"); @@ -520,6 +521,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { {"", "--chroma-t5-mask-pad", "", &params.chroma_t5_mask_pad}, {"", "--video-frames", "", &params.video_frames}, {"", "--fps", "", &params.fps}, + {"", "--timestep-shift", "", &params.sample_params.shifted_timestep}, }; options.float_options = { @@ -875,6 +877,11 @@ void parse_args(int argc, 
const char** argv, SDParams& params) { exit(1); } + if (params.sample_params.shifted_timestep < 0 || params.sample_params.shifted_timestep > 1000) { + fprintf(stderr, "error: timestep-shift must be between 0 and 1000\n"); + exit(1); + } + if (params.upscale_repeats < 1) { fprintf(stderr, "error: upscale multiplier must be at least 1\n"); exit(1); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index ccd90a00..97455c2c 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -747,6 +747,16 @@ class StableDiffusionGGML { denoiser->scheduler = std::make_shared<GITSSchedule>(); denoiser->scheduler->version = version; break; + case SGM_UNIFORM: + LOG_INFO("Running with SGM Uniform schedule"); + denoiser->scheduler = std::make_shared<SGMUniformSchedule>(); + denoiser->scheduler->version = version; + break; + case SIMPLE: + LOG_INFO("Running with Simple schedule"); + denoiser->scheduler = std::make_shared<SimpleSchedule>(); + denoiser->scheduler->version = version; + break; case SMOOTHSTEP: LOG_INFO("Running with SmoothStep scheduler"); denoiser->scheduler = std::make_shared<SmoothStepSchedule>(); @@ -1033,6 +1043,7 @@ class StableDiffusionGGML { float control_strength, sd_guidance_params_t guidance, float eta, + int shifted_timestep, sample_method_t method, const std::vector<float>& sigmas, int start_merge_step, @@ -1042,6 +1053,10 @@ class StableDiffusionGGML { ggml_tensor* denoise_mask = NULL, ggml_tensor* vace_context = NULL, float vace_strength = 1.f) { + if (shifted_timestep > 0 && !sd_version_is_sdxl(version)) { + LOG_WARN("timestep shifting is only supported for SDXL models!"); + shifted_timestep = 0; + } std::vector<int> skip_layers(guidance.slg.layers, guidance.slg.layers + guidance.slg.layer_count); float cfg_scale = guidance.txt_cfg; @@ -1102,7 +1117,17 @@ class StableDiffusionGGML { float c_in = scaling[2]; float t = denoiser->sigma_to_t(sigma); - std::vector<float> timesteps_vec(1, t); // [N, ] + std::vector<float> timesteps_vec; + if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { + float shifted_t_float = t * 
(float(shifted_timestep) / float(TIMESTEPS)); + int64_t shifted_t = static_cast<int64_t>(roundf(shifted_t_float)); + shifted_t = std::max((int64_t)0, std::min((int64_t)(TIMESTEPS - 1), shifted_t)); + LOG_DEBUG("shifting timestep from %.2f to %" PRId64 " (sigma: %.4f)", t, shifted_t, sigma); + timesteps_vec.assign(1, (float)shifted_t); + } else { + timesteps_vec.assign(1, t); + } + timesteps_vec = process_timesteps(timesteps_vec, init_latent, denoise_mask); auto timesteps = vector_to_ggml_tensor(work_ctx, timesteps_vec); std::vector<float> guidance_vec(1, guidance.distilled_guidance); @@ -1200,6 +1225,19 @@ class StableDiffusionGGML { float* vec_input = (float*)input->data; float* positive_data = (float*)out_cond->data; int ne_elements = (int)ggml_nelements(denoised); + + if (shifted_timestep > 0 && sd_version_is_sdxl(version)) { + int64_t shifted_t_idx = static_cast<int64_t>(roundf(timesteps_vec[0])); + float shifted_sigma = denoiser->t_to_sigma((float)shifted_t_idx); + std::vector<float> shifted_scaling = denoiser->get_scalings(shifted_sigma); + float shifted_c_skip = shifted_scaling[0]; + float shifted_c_out = shifted_scaling[1]; + float shifted_c_in = shifted_scaling[2]; + + c_skip = shifted_c_skip * c_in / shifted_c_in; + c_out = shifted_c_out; + } + for (int i = 0; i < ne_elements; i++) { float latent_result = positive_data[i]; if (has_unconditioned) { @@ -1222,6 +1260,7 @@ class StableDiffusionGGML { // denoised = (v * c_out + input * c_skip) or (input + eps * c_out) vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; } + int64_t t1 = ggml_time_us(); if (step > 0) { pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); @@ -1588,6 +1627,8 @@ const char* schedule_to_str[] = { "exponential", "ays", "gits", + "sgm_uniform", + "simple", "smoothstep", }; @@ -1720,7 +1761,8 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { "scheduler: %s, " "sample_method: %s, " "sample_steps: %d, " - "eta: %.2f)", + "eta: %.2f, " + "shifted_timestep: %d)", 
sample_params->guidance.txt_cfg, sample_params->guidance.img_cfg, sample_params->guidance.distilled_guidance, @@ -1731,7 +1773,8 @@ char* sd_sample_params_to_str(const sd_sample_params_t* sample_params) { sd_schedule_name(sample_params->scheduler), sd_sample_method_name(sample_params->sample_method), sample_params->sample_steps, - sample_params->eta); + sample_params->eta, + sample_params->shifted_timestep); return buf; } @@ -1863,6 +1906,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, int clip_skip, sd_guidance_params_t guidance, float eta, + int shifted_timestep, int width, int height, enum sample_method_t sample_method, @@ -2101,6 +2145,7 @@ sd_image_t* generate_image_internal(sd_ctx_t* sd_ctx, control_strength, guidance, eta, + shifted_timestep, sample_method, sigmas, start_merge_step, @@ -2394,6 +2439,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g sd_img_gen_params->clip_skip, sd_img_gen_params->sample_params.guidance, sd_img_gen_params->sample_params.eta, + sd_img_gen_params->sample_params.shifted_timestep, width, height, sample_method, @@ -2734,6 +2780,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s 0, sd_vid_gen_params->high_noise_sample_params.guidance, sd_vid_gen_params->high_noise_sample_params.eta, + sd_vid_gen_params->high_noise_sample_params.shifted_timestep, sd_vid_gen_params->high_noise_sample_params.sample_method, high_noise_sigmas, -1, @@ -2769,6 +2816,7 @@ SD_API sd_image_t* generate_video(sd_ctx_t* sd_ctx, const sd_vid_gen_params_t* s 0, sd_vid_gen_params->sample_params.guidance, sd_vid_gen_params->sample_params.eta, + sd_vid_gen_params->sample_params.shifted_timestep, sd_vid_gen_params->sample_params.sample_method, sigmas, -1, diff --git a/stable-diffusion.h b/stable-diffusion.h index d1c3c717..80f1f6e7 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -58,6 +58,8 @@ enum scheduler_t { EXPONENTIAL, AYS, GITS, + SGM_UNIFORM, + SIMPLE, SMOOTHSTEP, 
SCHEDULE_COUNT }; @@ -183,6 +185,7 @@ typedef struct { enum sample_method_t sample_method; int sample_steps; float eta; + int shifted_timestep; } sd_sample_params_t; typedef struct {