@@ -65,7 +65,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
65
65
: version(version), pm_version(pv), tokenizer(version == VERSION_SD2 ? 0 : 49407 ), embd_dir(embd_dir), wtype(wtype) {
66
66
if (clip_skip <= 0 ) {
67
67
clip_skip = 1 ;
68
- if (version == VERSION_SD2 || version == VERSION_SDXL) {
68
+ if (version == VERSION_SD2 || version == VERSION_SDXL || version == VERSION_SDXL_REFINER ) {
69
69
clip_skip = 2 ;
70
70
}
71
71
}
@@ -76,40 +76,53 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
76
76
} else if (version == VERSION_SDXL) {
77
77
text_model = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPENAI_CLIP_VIT_L_14, clip_skip, false );
78
78
text_model2 = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false );
79
+ } else if (version == VERSION_SDXL_REFINER) {
80
+ text_model2 = std::make_shared<CLIPTextModelRunner>(backend, wtype, OPEN_CLIP_VIT_BIGG_14, clip_skip, false );
79
81
}
80
82
}
81
83
82
84
void set_clip_skip (int clip_skip) {
83
- text_model->set_clip_skip (clip_skip);
84
- if (version == VERSION_SDXL) {
85
+ if (version != VERSION_SDXL_REFINER) {
86
+ text_model->set_clip_skip (clip_skip);
87
+ }
88
+ if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
85
89
text_model2->set_clip_skip (clip_skip);
86
90
}
87
91
}
88
92
89
93
void get_param_tensors (std::map<std::string, struct ggml_tensor *>& tensors) {
90
- text_model->get_param_tensors (tensors, " cond_stage_model.transformer.text_model" );
91
- if (version == VERSION_SDXL) {
94
+ if (version != VERSION_SDXL_REFINER) {
95
+ text_model->get_param_tensors (tensors, " cond_stage_model.transformer.text_model" );
96
+ }
97
+ if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
92
98
text_model2->get_param_tensors (tensors, " cond_stage_model.1.transformer.text_model" );
93
99
}
94
100
}
95
101
96
102
void alloc_params_buffer () {
97
- text_model->alloc_params_buffer ();
98
- if (version == VERSION_SDXL) {
103
+ if (version != VERSION_SDXL_REFINER) {
104
+ text_model->alloc_params_buffer ();
105
+ }
106
+ if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
99
107
text_model2->alloc_params_buffer ();
100
108
}
101
109
}
102
110
103
111
void free_params_buffer () {
104
- text_model->free_params_buffer ();
105
- if (version == VERSION_SDXL) {
112
+ if (version != VERSION_SDXL_REFINER) {
113
+ text_model->free_params_buffer ();
114
+ }
115
+ if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
106
116
text_model2->free_params_buffer ();
107
117
}
108
118
}
109
119
110
120
size_t get_params_buffer_size () {
111
- size_t buffer_size = text_model->get_params_buffer_size ();
112
- if (version == VERSION_SDXL) {
121
+ size_t buffer_size = 0 ;
122
+ if (version != VERSION_SDXL_REFINER) {
123
+ buffer_size = text_model->get_params_buffer_size ();
124
+ }
125
+ if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER) {
113
126
buffer_size += text_model2->get_params_buffer_size ();
114
127
}
115
128
return buffer_size;
@@ -132,8 +145,13 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
132
145
params.no_alloc = false ;
133
146
struct ggml_context * embd_ctx = ggml_init (params);
134
147
struct ggml_tensor * embd = NULL ;
135
- int64_t hidden_size = text_model->model .hidden_size ;
136
- auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
148
+ int64_t hidden_size = 0 ;
149
+ if (version != VERSION_SDXL_REFINER) {
150
+ hidden_size = text_model->model .hidden_size ;
151
+ } else {
152
+ hidden_size = text_model2->model .hidden_size ;
153
+ }
154
+ auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
137
155
if (tensor_storage.ne [0 ] != hidden_size) {
138
156
LOG_DEBUG (" embedding wrong hidden size, got %i, expected %i" , tensor_storage.ne [0 ], hidden_size);
139
157
return false ;
@@ -149,7 +167,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
149
167
embd->data ,
150
168
ggml_nbytes (embd));
151
169
for (int i = 0 ; i < embd->ne [1 ]; i++) {
152
- bpe_tokens.push_back (text_model->model .vocab_size + num_custom_embeddings);
170
+ if (version != VERSION_SDXL_REFINER) {
171
+ bpe_tokens.push_back (text_model->model .vocab_size + num_custom_embeddings);
172
+ } else {
173
+ bpe_tokens.push_back (text_model2->model .vocab_size + num_custom_embeddings);
174
+ }
153
175
// LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
154
176
num_custom_embeddings++;
155
177
}
@@ -163,7 +185,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
163
185
int32_t image_token,
164
186
bool padding = false ) {
165
187
return tokenize_with_trigger_token (text, num_input_imgs, image_token,
166
- text_model->model .n_token , padding);
188
+ version != VERSION_SDXL_REFINER ? text_model-> model . n_token : text_model2 ->model .n_token , padding);
167
189
}
168
190
169
191
std::vector<int > convert_token_to_id (std::string text) {
@@ -312,7 +334,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
312
334
313
335
std::pair<std::vector<int >, std::vector<float >> tokenize (std::string text,
314
336
bool padding = false ) {
315
- return tokenize (text, text_model->model .n_token , padding);
337
+ return tokenize (text, version != VERSION_SDXL_REFINER ? text_model-> model . n_token : text_model2 ->model .n_token , padding);
316
338
}
317
339
318
340
std::pair<std::vector<int >, std::vector<float >> tokenize (std::string text,
@@ -403,7 +425,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
403
425
auto input_ids = vector_to_ggml_tensor_i32 (work_ctx, chunk_tokens);
404
426
struct ggml_tensor * input_ids2 = NULL ;
405
427
size_t max_token_idx = 0 ;
406
- if (version == VERSION_SDXL) {
428
+ if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER ) {
407
429
auto it = std::find (chunk_tokens.begin (), chunk_tokens.end (), tokenizer.EOS_TOKEN_ID );
408
430
if (it != chunk_tokens.end ()) {
409
431
std::fill (std::next (it), chunk_tokens.end (), 0 );
@@ -428,16 +450,20 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
428
450
false ,
429
451
&chunk_hidden_states1,
430
452
work_ctx);
431
- if (version == VERSION_SDXL) {
453
+ if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER ) {
432
454
text_model2->compute (n_threads,
433
455
input_ids2,
434
456
0 ,
435
457
NULL ,
436
458
max_token_idx,
437
459
false ,
438
460
&chunk_hidden_states2, work_ctx);
439
- // concat
440
- chunk_hidden_states = ggml_tensor_concat (work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0 );
461
+ if (version == VERSION_SDXL) {
462
+ // concat
463
+ chunk_hidden_states = ggml_tensor_concat (work_ctx, chunk_hidden_states1, chunk_hidden_states2, 0 );
464
+ } else {
465
+ chunk_hidden_states = chunk_hidden_states2;
466
+ }
441
467
442
468
if (chunk_idx == 0 ) {
443
469
text_model2->compute (n_threads,
@@ -487,7 +513,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public Conditioner {
487
513
ggml_nelements (hidden_states) / chunk_hidden_states->ne [0 ]);
488
514
489
515
ggml_tensor* vec = NULL ;
490
- if (version == VERSION_SDXL) {
516
+ if (version == VERSION_SDXL || version == VERSION_SDXL_REFINER ) {
491
517
int out_dim = 256 ;
492
518
vec = ggml_new_tensor_1d (work_ctx, GGML_TYPE_F32, adm_in_channels);
493
519
// [0:1280]
0 commit comments