Skip to content

Commit bf353f1

Browse files
committed
Implement DDIM with the "trailing" timestep spacing
1 parent dcf91f9 commit bf353f1

File tree

4 files changed

+172
-1
lines changed

4 files changed

+172
-1
lines changed

denoiser.hpp

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,6 +1005,174 @@ static void sample_k_diffusion(sample_method_t method,
10051005
}
10061006
}
10071007
} break;
1008+
case DDIM_TRAILING: // Denoising Diffusion Implicit Models
1009+
// with the "trailing" timestep spacing
1010+
{
1011+
// DDIM itself needs alphas_cumprod (DDPM, Ho et al.,
1012+
// arXiv:2006.11239 [cs.LG] with k-diffusion's start and
1013+
// end beta) (which unfortunately k-diffusion's data
1014+
// structure hides from the denoiser), and the sigmas are
1015+
// also needed to invert the behavior of CompVisDenoiser
1016+
// (k-diffusion's LMSDiscreteScheduler)
1017+
std::vector<double> alphas_cumprod;
1018+
std::vector<double> compvis_sigmas;
1019+
1020+
alphas_cumprod.reserve(TIMESTEPS);
1021+
compvis_sigmas.reserve(TIMESTEPS);
1022+
for (int i = 0; i < TIMESTEPS; i++) {
1023+
alphas_cumprod[i] =
1024+
(i == 0 ? 1.0f : alphas_cumprod[i - 1]) *
1025+
(1.0f -
1026+
std::pow(sqrtf(0.00085f) +
1027+
(sqrtf(0.0120f) - sqrtf(0.00085f)) *
1028+
((float)i / (TIMESTEPS - 1)), 2));
1029+
compvis_sigmas[i] =
1030+
std::sqrt((1 - alphas_cumprod[i]) /
1031+
alphas_cumprod[i]);
1032+
}
1033+
for (int i = 0; i < steps; i++) {
1034+
// The "trailing" DDIM timestep, see S. Lin et al.,
1035+
// "Common Diffusion Noise Schedules and Sample Steps
1036+
// are Flawed", arXiv:2305.08891 [cs], p. 4, Table
1037+
// 2. Most variables below follow Diffusers naming.
1038+
int timestep =
1039+
roundf(TIMESTEPS -
1040+
i * ((float)TIMESTEPS / steps)) - 1;
1041+
int prev_timestep = timestep - TIMESTEPS / steps;
1042+
// The sigma here is chosen to cause the
1043+
// CompVisDenoiser to produce t = timestep
1044+
float sigma = compvis_sigmas[timestep];
1045+
if (i == 0) {
1046+
// The function add_noise initializes x to
1047+
// Diffusers' latents * sigma (as in Diffusers'
1048+
// pipeline) or sample * sigma (Diffusers'
1049+
// scheduler), where this sigma = init_noise_sigma
1050+
// in Diffusers. For DDPM and DDIM however,
1051+
// init_noise_sigma = 1. But the k-diffusion
1052+
// model() also evaluates F_theta(c_in(sigma) x;
1053+
// ...) instead of the bare U-net F_theta, with
1054+
// c_in = 1 / sqrt(sigma^2 + 1), as defined in
1055+
// T. Karras et al., "Elucidating the Design Space
1056+
// of Diffusion-Based Generative Models",
1057+
// arXiv:2206.00364 [cs.CV], p. 3, Table 1. Hence
1058+
// the first call has to be prescaled as x <- x /
1059+
// (c_in * sigma) with the k-diffusion pipeline
1060+
// and CompVisDenoiser.
1061+
float* vec_x = (float*)x->data;
1062+
for (int j = 0; j < ggml_nelements(x); j++) {
1063+
vec_x[j] *= std::sqrt(sigma * sigma + 1) /
1064+
sigma;
1065+
}
1066+
}
1067+
else {
1068+
// For the subsequent steps after the first one,
1069+
// at this point x = latents (pipeline) or x =
1070+
// sample (scheduler), and needs to be prescaled
1071+
// with x <- latents / c_in to compensate for
1072+
// model() applying the scale c_in before the
1073+
// U-net F_theta
1074+
float* vec_x = (float*)x->data;
1075+
for (int j = 0; j < ggml_nelements(x); j++) {
1076+
vec_x[j] *= std::sqrt(sigma * sigma + 1);
1077+
}
1078+
}
1079+
// Note model() is the D(x, sigma) as defined in
1080+
// T. Karras et al., arXiv:2206.00364, p. 3, Table 1
1081+
// and p. 8 (7)
1082+
struct ggml_tensor* noise_pred =
1083+
model(x, sigma, i + 1);
1084+
// Here noise_pred is still the k-diffusion denoiser
1085+
// output, not the U-net output F_theta(c_in(sigma) x;
1086+
// ...) in Karras et al. (2022), whereas Diffusers'
1087+
// noise_pred is F_theta(...). Recover the actual
1088+
// noise_pred, which is also referred to as the
1089+
// "Karras ODE derivative" d or d_cur in several
1090+
// samplers above.
1091+
{
1092+
float* vec_x = (float*)x->data;
1093+
float* vec_noise_pred = (float*)noise_pred->data;
1094+
for (int j = 0; j < ggml_nelements(x); j++) {
1095+
vec_noise_pred[j] =
1096+
(vec_x[j] - vec_noise_pred[j]) *
1097+
(1 / sigma);
1098+
}
1099+
}
1100+
// 2. compute alphas, betas
1101+
float alpha_prod_t = alphas_cumprod[timestep];
1102+
// Note final_alpha_cumprod = alphas_cumprod[0]
1103+
float alpha_prod_t_prev = prev_timestep >= 0 ?
1104+
alphas_cumprod[prev_timestep] : alphas_cumprod[0];
1105+
float beta_prod_t = 1 - alpha_prod_t;
1106+
// 3. compute predicted original sample from predicted
1107+
// noise also called "predicted x_0" of formula (12)
1108+
// from https://arxiv.org/pdf/2010.02502.pdf
1109+
struct ggml_tensor* pred_original_sample =
1110+
ggml_dup_tensor(work_ctx, x);
1111+
{
1112+
float* vec_x = (float*)x->data;
1113+
float* vec_noise_pred = (float*)noise_pred->data;
1114+
float* vec_pred_original_sample =
1115+
(float*)pred_original_sample->data;
1116+
// Note the substitution of latents or sample = x
1117+
// * c_in = x / sqrt(sigma^2 + 1)
1118+
for (int j = 0; j < ggml_nelements(x); j++) {
1119+
vec_pred_original_sample[j] =
1120+
(vec_x[j] / std::sqrt(sigma * sigma + 1) -
1121+
std::sqrt(beta_prod_t) *
1122+
vec_noise_pred[j]) *
1123+
(1 / std::sqrt(alpha_prod_t));
1124+
}
1125+
}
1126+
// Assuming the "epsilon" prediction type, where below
1127+
// pred_epsilon = noise_pred is inserted, and is not
1128+
// defined/copied explicitly.
1129+
//
1130+
// 5. compute variance: "sigma_t(eta)" -> see formula
1131+
// (16)
1132+
//
1133+
// sigma_t = sqrt((1 - alpha_t-1)/(1 - alpha_t)) *
1134+
// sqrt(1 - alpha_t/alpha_t-1)
1135+
float beta_prod_t_prev = 1 - alpha_prod_t_prev;
1136+
float variance = (beta_prod_t_prev / beta_prod_t) *
1137+
(1 - alpha_prod_t / alpha_prod_t_prev);
1138+
float std_dev_t = 0 * std::sqrt(variance);
1139+
// 6. compute "direction pointing to x_t" of formula
1140+
// (12) from https://arxiv.org/pdf/2010.02502.pdf
1141+
struct ggml_tensor* pred_sample_direction =
1142+
ggml_dup_tensor(work_ctx, noise_pred);
1143+
{
1144+
float* vec_noise_pred = (float*)noise_pred->data;
1145+
float* vec_pred_sample_direction =
1146+
(float*)pred_sample_direction->data;
1147+
for (int j = 0; j < ggml_nelements(x); j++) {
1148+
vec_pred_sample_direction[j] =
1149+
std::sqrt(1 - alpha_prod_t_prev -
1150+
std::pow(std_dev_t, 2)) *
1151+
vec_noise_pred[j];
1152+
}
1153+
}
1154+
// 7. compute x_t without "random noise" of formula
1155+
// (12) from https://arxiv.org/pdf/2010.02502.pdf
1156+
{
1157+
float* vec_pred_original_sample =
1158+
(float*)pred_original_sample->data;
1159+
float* vec_pred_sample_direction =
1160+
(float*)pred_sample_direction->data;
1161+
float* vec_x = (float*)x->data;
1162+
for (int j = 0; j < ggml_nelements(x); j++) {
1163+
vec_x[j] = std::sqrt(alpha_prod_t_prev) *
1164+
vec_pred_original_sample[j] +
1165+
vec_pred_sample_direction[j];
1166+
}
1167+
}
1168+
// See the note above: x = latents or sample here, and
1169+
// is not scaled by the c_in. For the final output
1170+
// this is correct, but for subsequent iterations, x
1171+
// needs to be prescaled again, since k-diffusion's
1172+
// model() differs from the bare U-net F_theta by the
1173+
// factor c_in.
1174+
}
1175+
} break;
10081176

10091177
default:
10101178
LOG_ERROR("Attempting to sample with nonexisting sample method %i", method);

examples/cli/main.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ const char* sample_method_str[] = {
3939
"ipndm",
4040
"ipndm_v",
4141
"lcm",
42+
"ddim_trailing",
4243
};
4344

4445
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
@@ -219,7 +220,7 @@ void print_usage(int argc, const char* argv[]) {
219220
printf(" 1.0 corresponds to full destruction of information in init image\n");
220221
printf(" -H, --height H image height, in pixel space (default: 512)\n");
221222
printf(" -W, --width W image width, in pixel space (default: 512)\n");
222-
printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm}\n");
223+
printf(" --sampling-method {euler, euler_a, heun, dpm2, dpm++2s_a, dpm++2m, dpm++2mv2, ipndm, ipndm_v, lcm, ddim_trailing}\n");
223224
printf(" sampling method (default: \"euler_a\")\n");
224225
printf(" --steps STEPS number of sample steps (default: 20)\n");
225226
printf(" --rng {std_default, cuda} RNG (default: cuda)\n");

stable-diffusion.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ const char* sampling_methods_str[] = {
4747
"iPNDM",
4848
"iPNDM_v",
4949
"LCM",
50+
"DDIM \"trailing\""
5051
};
5152

5253
/*================================================== Helper Functions ================================================*/

stable-diffusion.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ enum sample_method_t {
4444
IPNDM,
4545
IPNDM_V,
4646
LCM,
47+
DDIM_TRAILING,
4748
N_SAMPLE_METHODS
4849
};
4950

0 commit comments

Comments
 (0)