@@ -1005,6 +1005,174 @@ static void sample_k_diffusion(sample_method_t method,
1005
1005
}
1006
1006
}
1007
1007
} break ;
1008
+ case DDIM_TRAILING: // Denoising Diffusion Implicit Models
1009
+ // with the "trailing" timestep spacing
1010
+ {
1011
+ // DDIM itself needs alphas_cumprod (DDPM, Ho et al.,
1012
+ // arXiv:2006.11239 [cs.LG] with k-diffusion's start and
1013
+ // end beta) (which unfortunately k-diffusion's data
1014
+ // structure hides from the denoiser), and the sigmas are
1015
+ // also needed to invert the behavior of CompVisDenoiser
1016
+ // (k-diffusion's LMSDiscreteScheduler)
1017
+ std::vector<double > alphas_cumprod;
1018
+ std::vector<double > compvis_sigmas;
1019
+
1020
+ alphas_cumprod.reserve (TIMESTEPS);
1021
+ compvis_sigmas.reserve (TIMESTEPS);
1022
+ for (int i = 0 ; i < TIMESTEPS; i++) {
1023
+ alphas_cumprod[i] =
1024
+ (i == 0 ? 1 .0f : alphas_cumprod[i - 1 ]) *
1025
+ (1 .0f -
1026
+ std::pow (sqrtf (0 .00085f ) +
1027
+ (sqrtf (0 .0120f ) - sqrtf (0 .00085f )) *
1028
+ ((float )i / (TIMESTEPS - 1 )), 2 ));
1029
+ compvis_sigmas[i] =
1030
+ std::sqrt ((1 - alphas_cumprod[i]) /
1031
+ alphas_cumprod[i]);
1032
+ }
1033
+ for (int i = 0 ; i < steps; i++) {
1034
+ // The "trailing" DDIM timestep, see S. Lin et al.,
1035
+ // "Common Diffusion Noise Schedules and Sample Steps
1036
+ // are Flawed", arXiv:2305.08891 [cs], p. 4, Table
1037
+ // 2. Most variables below follow Diffusers naming.
1038
+ int timestep =
1039
+ roundf (TIMESTEPS -
1040
+ i * ((float )TIMESTEPS / steps)) - 1 ;
1041
+ int prev_timestep = timestep - TIMESTEPS / steps;
1042
+ // The sigma here is chosen to cause the
1043
+ // CompVisDenoiser to produce t = timestep
1044
+ float sigma = compvis_sigmas[timestep];
1045
+ if (i == 0 ) {
1046
+ // The function add_noise intializes x to
1047
+ // Diffusers' latents * sigma (as in Diffusers'
1048
+ // pipeline) or sample * sigma (Diffusers'
1049
+ // scheduler), where this sigma = init_noise_sigma
1050
+ // in Diffusers. For DDPM and DDIM however,
1051
+ // init_noise_sigma = 1. But the k-diffusion
1052
+ // model() also evaluates F_theta(c_in(sigma) x;
1053
+ // ...) instead of the bare U-net F_theta, with
1054
+ // c_in = 1 / sqrt(sigma^2 + 1), as defined in
1055
+ // T. Karras et al., "Elucidating the Design Space
1056
+ // of Diffusion-Based Generative Models",
1057
+ // arXiv:2206.00364 [cs.CV], p. 3, Table 1. Hence
1058
+ // the first call has to be prescaled as x <- x /
1059
+ // (c_in * sigma) with the k-diffusion pipeline
1060
+ // and CompVisDenoiser.
1061
+ float * vec_x = (float *)x->data ;
1062
+ for (int j = 0 ; j < ggml_nelements (x); j++) {
1063
+ vec_x[j] *= std::sqrt (sigma * sigma + 1 ) /
1064
+ sigma;
1065
+ }
1066
+ }
1067
+ else {
1068
+ // For the subsequent steps after the first one,
1069
+ // at this point x = latents (pipeline) or x =
1070
+ // sample (scheduler), and needs to be prescaled
1071
+ // with x <- latents / c_in to compensate for
1072
+ // model() applying the scale c_in before the
1073
+ // U-net F_theta
1074
+ float * vec_x = (float *)x->data ;
1075
+ for (int j = 0 ; j < ggml_nelements (x); j++) {
1076
+ vec_x[j] *= std::sqrt (sigma * sigma + 1 );
1077
+ }
1078
+ }
1079
+ // Note model() is the D(x, sigma) as defined in
1080
+ // T. Karras et al., arXiv:2206.00364, p. 3, Table 1
1081
+ // and p. 8 (7)
1082
+ struct ggml_tensor * noise_pred =
1083
+ model (x, sigma, i + 1 );
1084
+ // Here noise_pred is still the k-diffusion denoiser
1085
+ // output, not the U-net output F_theta(c_in(sigma) x;
1086
+ // ...) in Karras et al. (2022), whereas Diffusers'
1087
+ // noise_pred is F_theta(...). Recover the actual
1088
+ // noise_pred, which is also referred to as the
1089
+ // "Karras ODE derivative" d or d_cur in several
1090
+ // samplers above.
1091
+ {
1092
+ float * vec_x = (float *)x->data ;
1093
+ float * vec_noise_pred = (float *)noise_pred->data ;
1094
+ for (int j = 0 ; j < ggml_nelements (x); j++) {
1095
+ vec_noise_pred[j] =
1096
+ (vec_x[j] - vec_noise_pred[j]) *
1097
+ (1 / sigma);
1098
+ }
1099
+ }
1100
+ // 2. compute alphas, betas
1101
+ float alpha_prod_t = alphas_cumprod[timestep];
1102
+ // Note final_alpha_cumprod = alphas_cumprod[0]
1103
+ float alpha_prod_t_prev = prev_timestep >= 0 ?
1104
+ alphas_cumprod[prev_timestep] : alphas_cumprod[0 ];
1105
+ float beta_prod_t = 1 - alpha_prod_t ;
1106
+ // 3. compute predicted original sample from predicted
1107
+ // noise also called "predicted x_0" of formula (12)
1108
+ // from https://arxiv.org/pdf/2010.02502.pdf
1109
+ struct ggml_tensor * pred_original_sample =
1110
+ ggml_dup_tensor (work_ctx, x);
1111
+ {
1112
+ float * vec_x = (float *)x->data ;
1113
+ float * vec_noise_pred = (float *)noise_pred->data ;
1114
+ float * vec_pred_original_sample =
1115
+ (float *)pred_original_sample->data ;
1116
+ // Note the substitution of latents or sample = x
1117
+ // * c_in = x / sqrt(sigma^2 + 1)
1118
+ for (int j = 0 ; j < ggml_nelements (x); j++) {
1119
+ vec_pred_original_sample[j] =
1120
+ (vec_x[j] / std::sqrt (sigma * sigma + 1 ) -
1121
+ std::sqrt (beta_prod_t ) *
1122
+ vec_noise_pred[j]) *
1123
+ (1 / std::sqrt (alpha_prod_t ));
1124
+ }
1125
+ }
1126
+ // Assuming the "epsilon" prediction type, where below
1127
+ // pred_epsilon = noise_pred is inserted, and is not
1128
+ // defined/copied explicitly.
1129
+ //
1130
+ // 5. compute variance: "sigma_t(eta)" -> see formula
1131
+ // (16)
1132
+ //
1133
+ // sigma_t = sqrt((1 - alpha_t-1)/(1 - alpha_t)) *
1134
+ // sqrt(1 - alpha_t/alpha_t-1)
1135
+ float beta_prod_t_prev = 1 - alpha_prod_t_prev;
1136
+ float variance = (beta_prod_t_prev / beta_prod_t ) *
1137
+ (1 - alpha_prod_t / alpha_prod_t_prev);
1138
+ float std_dev_t = 0 * std::sqrt (variance);
1139
+ // 6. compute "direction pointing to x_t" of formula
1140
+ // (12) from https://arxiv.org/pdf/2010.02502.pdf
1141
+ struct ggml_tensor * pred_sample_direction =
1142
+ ggml_dup_tensor (work_ctx, noise_pred);
1143
+ {
1144
+ float * vec_noise_pred = (float *)noise_pred->data ;
1145
+ float * vec_pred_sample_direction =
1146
+ (float *)pred_sample_direction->data ;
1147
+ for (int j = 0 ; j < ggml_nelements (x); j++) {
1148
+ vec_pred_sample_direction[j] =
1149
+ std::sqrt (1 - alpha_prod_t_prev -
1150
+ std::pow (std_dev_t , 2 )) *
1151
+ vec_noise_pred[j];
1152
+ }
1153
+ }
1154
+ // 7. compute x_t without "random noise" of formula
1155
+ // (12) from https://arxiv.org/pdf/2010.02502.pdf
1156
+ {
1157
+ float * vec_pred_original_sample =
1158
+ (float *)pred_original_sample->data ;
1159
+ float * vec_pred_sample_direction =
1160
+ (float *)pred_sample_direction->data ;
1161
+ float * vec_x = (float *)x->data ;
1162
+ for (int j = 0 ; j < ggml_nelements (x); j++) {
1163
+ vec_x[j] = std::sqrt (alpha_prod_t_prev) *
1164
+ vec_pred_original_sample[j] +
1165
+ vec_pred_sample_direction[j];
1166
+ }
1167
+ }
1168
+ // See the note above: x = latents or sample here, and
1169
+ // is not scaled by the c_in. For the final output
1170
+ // this is correct, but for subsequent iterations, x
1171
+ // needs to be prescaled again, since k-diffusion's
1172
+ // model() differes from the bare U-net F_theta by the
1173
+ // factor c_in.
1174
+ }
1175
+ } break ;
1008
1176
1009
1177
default :
1010
1178
LOG_ERROR (" Attempting to sample with nonexisting sample method %i" , method);
0 commit comments