2
2
#define __CLIP_HPP__
3
3
4
4
#include " ggml_extend.hpp"
5
+ #include " model.h"
5
6
6
7
/* ================================================== CLIPTokenizer ===================================================*/
7
8
@@ -67,6 +68,9 @@ std::vector<std::pair<int, std::u32string>> bytes_to_unicode() {
67
68
}
68
69
69
70
// Ref: https://github.com/openai/CLIP/blob/main/clip/simple_tokenizer.py
71
+
72
// Callback type consulted by CLIPTokenizer::encode at the start of every
// regex-search iteration. It receives the not-yet-tokenized remainder of the
// text and the token list built so far, and may modify both. Returning true
// makes encode skip its normal handling of the current match and continue
// with the (possibly shortened) text; returning false lets encode proceed.
using on_new_token_cb_t = std::function<bool(std::string&, std::vector<int32_t>&)>;
73
+
70
74
class CLIPTokenizer {
71
75
private:
72
76
SDVersion version = VERSION_1_x;
@@ -234,8 +238,11 @@ class CLIPTokenizer {
234
238
return result;
235
239
}
236
240
237
- std::vector<int > tokenize (std::string text, size_t max_length = 0 , bool padding = false ) {
238
- std::vector<int32_t > tokens = encode (text);
241
+ std::vector<int > tokenize (std::string text,
242
+ on_new_token_cb_t on_new_token_cb,
243
+ size_t max_length = 0 ,
244
+ bool padding = false ) {
245
+ std::vector<int32_t > tokens = encode (text, on_new_token_cb);
239
246
tokens.insert (tokens.begin (), BOS_TOKEN_ID);
240
247
if (max_length > 0 ) {
241
248
if (tokens.size () > max_length - 1 ) {
@@ -255,7 +262,7 @@ class CLIPTokenizer {
255
262
return tokens;
256
263
}
257
264
258
- std::vector<int > encode (std::string text) {
265
+ std::vector<int > encode (std::string text, on_new_token_cb_t on_new_token_cb ) {
259
266
std::string original_text = text;
260
267
std::vector<int32_t > bpe_tokens;
261
268
text = whitespace_clean (text);
@@ -268,6 +275,10 @@ class CLIPTokenizer {
268
275
std::string str = text;
269
276
std::vector<std::string> token_strs;
270
277
while (std::regex_search (str, matches, pat)) {
278
+ bool skip = on_new_token_cb (str, bpe_tokens);
279
+ if (skip) {
280
+ continue ;
281
+ }
271
282
for (auto & token : matches) {
272
283
std::string token_str = token.str ();
273
284
std::u32string utf32_token;
@@ -536,7 +547,13 @@ class CLIPEmbeddings : public GGMLBlock {
536
547
num_positions(num_positions) {
537
548
}
538
549
539
- struct ggml_tensor * forward (struct ggml_context * ctx, struct ggml_tensor * input_ids) {
550
+ struct ggml_tensor * get_token_embed_weight () {
551
+ return params[" token_embedding.weight" ];
552
+ }
553
+
554
+ struct ggml_tensor * forward (struct ggml_context * ctx,
555
+ struct ggml_tensor * input_ids,
556
+ struct ggml_tensor * custom_embed_weight) {
540
557
// input_ids: [N, n_token]
541
558
auto token_embed_weight = params[" token_embedding.weight" ];
542
559
auto position_embed_weight = params[" position_embedding.weight" ];
@@ -545,7 +562,7 @@ class CLIPEmbeddings : public GGMLBlock {
545
562
546
563
// token_embedding + position_embedding
547
564
auto x = ggml_add (ctx,
548
- ggml_get_rows (ctx, token_embed_weight, input_ids),
565
+ ggml_get_rows (ctx, custom_embed_weight != NULL ? custom_embed_weight : token_embed_weight, input_ids),
549
566
position_embed_weight); // [N, n_token, embed_dim]
550
567
return x;
551
568
}
@@ -667,14 +684,23 @@ class CLIPTextModel : public GGMLBlock {
667
684
clip_skip = skip;
668
685
}
669
686
670
- struct ggml_tensor * forward (struct ggml_context * ctx, struct ggml_tensor * input_ids, size_t max_token_idx = 0 , bool return_pooled = false ) {
687
+ struct ggml_tensor * get_token_embed_weight () {
688
+ auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks[" embeddings" ]);
689
+ return embeddings->get_token_embed_weight ();
690
+ }
691
+
692
+ struct ggml_tensor * forward (struct ggml_context * ctx,
693
+ struct ggml_tensor * input_ids,
694
+ struct ggml_tensor * tkn_embeddings,
695
+ size_t max_token_idx = 0 ,
696
+ bool return_pooled = false ) {
671
697
// input_ids: [N, n_token]
672
698
auto embeddings = std::dynamic_pointer_cast<CLIPEmbeddings>(blocks[" embeddings" ]);
673
699
auto encoder = std::dynamic_pointer_cast<CLIPEncoder>(blocks[" encoder" ]);
674
700
auto final_layer_norm = std::dynamic_pointer_cast<LayerNorm>(blocks[" final_layer_norm" ]);
675
701
676
- auto x = embeddings->forward (ctx, input_ids); // [N, n_token, hidden_size]
677
- x = encoder->forward (ctx, x, return_pooled ? -1 : clip_skip, true );
702
+ auto x = embeddings->forward (ctx, input_ids, tkn_embeddings ); // [N, n_token, hidden_size]
703
+ x = encoder->forward (ctx, x, return_pooled ? -1 : clip_skip, true );
678
704
if (return_pooled || with_final_ln) {
679
705
x = final_layer_norm->forward (ctx, x);
680
706
}
@@ -695,6 +721,7 @@ class CLIPVisionModel : public GGMLBlock {
695
721
void init_params (struct ggml_context * ctx, ggml_type wtype) {
696
722
params[" visual_projection" ] = ggml_new_tensor_2d (ctx, GGML_TYPE_F32, projection_dim, hidden_size);
697
723
}
724
+
698
725
public:
699
726
// network hparams
700
727
int32_t num_channels = 3 ;
@@ -742,10 +769,10 @@ class CLIPVisionModel : public GGMLBlock {
742
769
x = post_layernorm->forward (ctx, x); // [N, n_token, hidden_size]
743
770
744
771
GGML_ASSERT (x->ne [2 ] == 1 );
745
- int64_t max_token_idx = 0 ;
746
- ggml_tensor* pooled = ggml_view_1d (ctx, x, x->ne [0 ], x->nb [1 ] * max_token_idx); // assert N == 1
772
+ int64_t max_token_idx = 0 ;
773
+ ggml_tensor* pooled = ggml_view_1d (ctx, x, x->ne [0 ], x->nb [1 ] * max_token_idx); // assert N == 1
747
774
auto visual_projection = params[" visual_projection" ];
748
- pooled = ggml_mul_mat (ctx, ggml_cont (ctx, ggml_transpose (ctx, visual_projection)), pooled);
775
+ pooled = ggml_mul_mat (ctx, ggml_cont (ctx, ggml_transpose (ctx, visual_projection)), pooled);
749
776
return pooled; // [N, projection_dim]
750
777
}
751
778
};
@@ -790,6 +817,11 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
790
817
CLIPTextModel text_model;
791
818
CLIPTextModel text_model2;
792
819
820
+ std::string embd_dir;
821
+ int32_t num_custom_embeddings = 0 ;
822
+ std::vector<uint8_t > token_embed_custom;
823
+ std::vector<std::string> readed_embeddings;
824
+
793
825
FrozenCLIPEmbedderWithCustomWords (ggml_backend_t backend,
794
826
ggml_type wtype,
795
827
SDVersion version = VERSION_1_x,
@@ -849,15 +881,53 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
849
881
}
850
882
}
851
883
884
+ bool load_embedding (std::string embd_name, std::string embd_path, std::vector<int32_t >& bpe_tokens) {
885
+ // the order matters
886
+ ModelLoader model_loader;
887
+ if (!model_loader.init_from_file (embd_path)) {
888
+ LOG_ERROR (" embedding '%s' failed" , embd_name.c_str ());
889
+ return false ;
890
+ }
891
+ struct ggml_init_params params;
892
+ params.mem_size = 32 * 1024 ; // max for custom embeddings 32 KB
893
+ params.mem_buffer = NULL ;
894
+ params.no_alloc = false ;
895
+ struct ggml_context * embd_ctx = ggml_init (params);
896
+ struct ggml_tensor * embd = NULL ;
897
+ auto on_load = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) {
898
+ if (tensor_storage.ne [0 ] != text_model.hidden_size ) {
899
+ LOG_DEBUG (" embedding wrong hidden size, got %i, expected %i" , tensor_storage.ne [0 ], text_model.hidden_size );
900
+ return false ;
901
+ }
902
+ embd = ggml_new_tensor_2d (embd_ctx, wtype, text_model.hidden_size , tensor_storage.n_dims > 1 ? tensor_storage.ne [1 ] : 1 );
903
+ *dst_tensor = embd;
904
+ return true ;
905
+ };
906
+ model_loader.load_tensors (on_load, NULL );
907
+ readed_embeddings.push_back (embd_name);
908
+ token_embed_custom.resize (token_embed_custom.size () + ggml_nbytes (embd));
909
+ memcpy ((void *)(token_embed_custom.data () + num_custom_embeddings * text_model.hidden_size * ggml_type_size (wtype)),
910
+ embd->data ,
911
+ ggml_nbytes (embd));
912
+ for (int i = 0 ; i < embd->ne [1 ]; i++) {
913
+ bpe_tokens.push_back (text_model.vocab_size + num_custom_embeddings);
914
+ // LOG_DEBUG("new custom token: %i", text_model.vocab_size + num_custom_embeddings);
915
+ num_custom_embeddings++;
916
+ }
917
+ LOG_DEBUG (" embedding '%s' applied, custom embeddings: %i" , embd_name.c_str (), num_custom_embeddings);
918
+ return true ;
919
+ }
920
+
852
921
struct ggml_tensor * forward (struct ggml_context * ctx,
853
922
struct ggml_tensor * input_ids,
854
923
struct ggml_tensor * input_ids2,
924
+ struct ggml_tensor * embeddings,
855
925
size_t max_token_idx = 0 ,
856
926
bool return_pooled = false ) {
857
927
if (return_pooled) {
858
- return text_model2.forward (ctx, input_ids2, max_token_idx, return_pooled);
928
+ return text_model2.forward (ctx, input_ids2, NULL , max_token_idx, return_pooled);
859
929
}
860
- auto hidden_states = text_model.forward (ctx, input_ids); // [N, n_token, hidden_size]
930
+ auto hidden_states = text_model.forward (ctx, input_ids, embeddings ); // [N, n_token, hidden_size]
861
931
// LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
862
932
if (version == VERSION_XL) {
863
933
hidden_states = ggml_reshape_4d (ctx,
@@ -868,7 +938,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
868
938
hidden_states->ne [3 ]);
869
939
hidden_states = ggml_cont (ctx, ggml_permute (ctx, hidden_states, 2 , 0 , 1 , 3 ));
870
940
871
- auto hidden_states2 = text_model2.forward (ctx, input_ids2); // [N, n_token, hidden_size2]
941
+ auto hidden_states2 = text_model2.forward (ctx, input_ids2, NULL ); // [N, n_token, hidden_size2]
872
942
// LOG_DEBUG("hidden_states: %d %d %d %d", hidden_states->ne[0], hidden_states->ne[1], hidden_states->ne[2], hidden_states->ne[3]);
873
943
hidden_states2 = ggml_reshape_4d (ctx,
874
944
hidden_states2,
@@ -919,7 +989,34 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
919
989
}
920
990
}
921
991
922
- struct ggml_tensor * hidden_states = forward (compute_ctx, input_ids, input_ids2, max_token_idx, return_pooled);
992
+ struct ggml_tensor * embeddings = NULL ;
993
+
994
+ if (num_custom_embeddings > 0 && version != VERSION_XL) {
995
+ embeddings = ggml_new_tensor_2d (compute_ctx,
996
+ wtype,
997
+ text_model.hidden_size ,
998
+ text_model.vocab_size + num_custom_embeddings /* custom placeholder */ );
999
+ ggml_allocr_alloc (allocr, embeddings);
1000
+ if (!ggml_allocr_is_measure (allocr)) {
1001
+ // really bad, there is memory inflexibility (this is for host<->device memory conflicts)
1002
+ auto token_embed_weight = text_model.get_token_embed_weight ();
1003
+ void * freeze_data = malloc (ggml_nbytes (token_embed_weight));
1004
+ ggml_backend_tensor_get_and_sync (backend,
1005
+ token_embed_weight,
1006
+ freeze_data,
1007
+ 0 ,
1008
+ ggml_nbytes (token_embed_weight));
1009
+ ggml_backend_tensor_set (embeddings, freeze_data, 0 , ggml_nbytes (token_embed_weight));
1010
+ free (freeze_data);
1011
+ // concatenate custom embeddings
1012
+ ggml_backend_tensor_set (embeddings,
1013
+ (const void *)token_embed_custom.data (),
1014
+ ggml_nbytes (token_embed_weight),
1015
+ num_custom_embeddings * text_model.hidden_size * ggml_type_size (wtype));
1016
+ }
1017
+ }
1018
+
1019
+ struct ggml_tensor * hidden_states = forward (compute_ctx, input_ids, input_ids2, embeddings, max_token_idx, return_pooled);
923
1020
924
1021
ggml_build_forward_expand (gf, hidden_states);
925
1022
@@ -957,12 +1054,36 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
957
1054
LOG_DEBUG (" parse '%s' to %s" , text.c_str (), ss.str ().c_str ());
958
1055
}
959
1056
1057
+ auto on_new_token_cb = [&](std::string& str, std::vector<int32_t >& bpe_tokens) -> bool {
1058
+ size_t word_end = str.find (" ," );
1059
+ std::string embd_name = word_end == std::string::npos ? str : str.substr (0 , word_end);
1060
+ embd_name = trim (embd_name);
1061
+ std::string embd_path = get_full_path (embd_dir, embd_name + " .pt" );
1062
+ if (embd_path.size () == 0 ) {
1063
+ embd_path = get_full_path (embd_dir, embd_name + " .ckpt" );
1064
+ }
1065
+ if (embd_path.size () == 0 ) {
1066
+ embd_path = get_full_path (embd_dir, embd_name + " .safetensors" );
1067
+ }
1068
+ if (embd_path.size () > 0 ) {
1069
+ if (load_embedding (embd_name, embd_path, bpe_tokens)) {
1070
+ if (word_end != std::string::npos) {
1071
+ str = str.substr (word_end);
1072
+ } else {
1073
+ str = " " ;
1074
+ }
1075
+ return true ;
1076
+ }
1077
+ }
1078
+ return false ;
1079
+ };
1080
+
960
1081
std::vector<int > tokens;
961
1082
std::vector<float > weights;
962
1083
for (const auto & item : parsed_attention) {
963
1084
const std::string& curr_text = item.first ;
964
1085
float curr_weight = item.second ;
965
- std::vector<int > curr_tokens = tokenizer.encode (curr_text);
1086
+ std::vector<int > curr_tokens = tokenizer.encode (curr_text, on_new_token_cb );
966
1087
tokens.insert (tokens.end (), curr_tokens.begin (), curr_tokens.end ());
967
1088
weights.insert (weights.end (), curr_tokens.size (), curr_weight);
968
1089
}
0 commit comments