@@ -112,9 +112,9 @@ class SpatialVideoTransformer : public SpatialTransformer {
112
112
x = ggml_cont (ctx, ggml_permute (ctx, x, 1 , 2 , 0 , 3 )); // [N, h, w, inner_dim]
113
113
x = ggml_reshape_3d (ctx, x, inner_dim, w * h, n); // [N, h * w, inner_dim]
114
114
115
- std::vector< float > num_frames = arange ( 0 , timesteps);
115
+ auto num_frames = ggml_arange (ctx, 0 , timesteps, 1 );
116
116
// since b is 1, no need to do repeat
117
- auto t_emb = new_timestep_embedding (ctx, allocr , num_frames, in_channels, max_time_embed_period); // [N, in_channels]
117
+ auto t_emb = ggml_nn_timestep_embedding (ctx, num_frames, in_channels, max_time_embed_period); // [N, in_channels]
118
118
119
119
auto emb = time_pos_embed_0->forward (ctx, t_emb);
120
120
emb = ggml_silu_inplace (ctx, emb);
@@ -377,7 +377,7 @@ class UnetModelBlock : public GGMLBlock {
377
377
struct ggml_tensor * forward (struct ggml_context * ctx,
378
378
struct ggml_allocr * allocr,
379
379
struct ggml_tensor * x,
380
- std::vector< float > timesteps,
380
+ struct ggml_tensor * timesteps,
381
381
struct ggml_tensor * context,
382
382
struct ggml_tensor * c_concat = NULL ,
383
383
struct ggml_tensor * y = NULL ,
@@ -386,7 +386,6 @@ class UnetModelBlock : public GGMLBlock {
386
386
float control_strength = 0 .f) {
387
387
// x: [N, in_channels, h, w] or [N, in_channels/2, h, w]
388
388
// timesteps: [N,]
389
- // t_emb: [N, model_channels] timestep_embedding(timesteps, model_channels)
390
389
// context: [N, max_position, hidden_size] or [1, max_position, hidden_size]. for example, [N, 77, 768]
391
390
// c_concat: [N, in_channels, h, w] or [1, in_channels, h, w]
392
391
// y: [N, adm_in_channels] or [1, adm_in_channels]
@@ -417,7 +416,7 @@ class UnetModelBlock : public GGMLBlock {
417
416
auto out_0 = std::dynamic_pointer_cast<GroupNorm32>(blocks[" out.0" ]);
418
417
auto out_2 = std::dynamic_pointer_cast<Conv2d>(blocks[" out.2" ]);
419
418
420
- auto t_emb = new_timestep_embedding (ctx, allocr , timesteps, model_channels); // [N, model_channels]
419
+ auto t_emb = ggml_nn_timestep_embedding (ctx, timesteps, model_channels); // [N, model_channels]
421
420
422
421
auto emb = time_embed_0->forward (ctx, t_emb);
423
422
emb = ggml_silu_inplace (ctx, emb);
@@ -561,7 +560,7 @@ struct UNetModel : public GGMLModule {
561
560
}
562
561
563
562
struct ggml_cgraph * build_graph (struct ggml_tensor * x,
564
- std::vector< float > timesteps,
563
+ struct ggml_tensor * timesteps,
565
564
struct ggml_tensor * context,
566
565
struct ggml_tensor * c_concat = NULL ,
567
566
struct ggml_tensor * y = NULL ,
@@ -577,6 +576,7 @@ struct UNetModel : public GGMLModule {
577
576
x = to_backend (x);
578
577
context = to_backend (context);
579
578
y = to_backend (y);
579
+ timesteps = to_backend (timesteps);
580
580
581
581
for (int i = 0 ; i < controls.size (); i++) {
582
582
controls[i] = to_backend (controls[i]);
@@ -600,7 +600,7 @@ struct UNetModel : public GGMLModule {
600
600
601
601
void compute (int n_threads,
602
602
struct ggml_tensor * x,
603
- std::vector< float > timesteps,
603
+ struct ggml_tensor * timesteps,
604
604
struct ggml_tensor * context,
605
605
struct ggml_tensor * c_concat,
606
606
struct ggml_tensor * y,
@@ -638,7 +638,8 @@ struct UNetModel : public GGMLModule {
638
638
int num_video_frames = 3 ;
639
639
640
640
auto x = ggml_new_tensor_4d (work_ctx, GGML_TYPE_F32, 8 , 8 , 8 , num_video_frames);
641
- std::vector<float > timesteps (num_video_frames, 999 .f );
641
+ std::vector<float > timesteps_vec (num_video_frames, 999 .f );
642
+ auto timesteps = vector_to_ggml_tensor (work_ctx, timesteps_vec);
642
643
ggml_set_f32 (x, 0 .5f );
643
644
// print_ggml_tensor(x);
644
645
0 commit comments