
Commit b59b867
Merge branch 'master' of github.com:fxlin/stable-diffusion.cpp
2 parents: c23b3f9 + 4187a43

12 files changed: +9587 -15 lines

.gitignore

Lines changed: 2 additions & 1 deletion
@@ -1,3 +1,4 @@
+*.ckpt
 build*/
 test/
 .vscode/
@@ -10,4 +11,4 @@ test/
 *.gguf
 output*.png
 models*
-*.log
+*.log

CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -58,6 +58,8 @@ if(SD_FLASH_ATTN)
     add_definitions(-DSD_USE_FLASH_ATTENTION)
 endif()
 
+add_definitions(-DGGML_PERF) # xzl
+
 set(SD_LIB stable-diffusion)
 
 file(GLOB SD_LIB_SOURCES
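
Note: GGML_PERF enables ggml's per-operator performance counters, which is what the ggml_graph_print call guarded in ggml_extend.hpp below reports. Because add_definitions applies globally, the flag only takes effect after a full rebuild; a standard CMake flow (build directory assumed) would be:

# rebuild so -DGGML_PERF reaches every translation unit (build dir assumed)
cmake -B build
cmake --build build --config Release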

clip.dot

Lines changed: 1469 additions & 0 deletions
Large diffs are not rendered by default.

clip.dot.png

1.03 MB binary image (not rendered)

clip.hpp

Lines changed: 6 additions & 3 deletions
@@ -262,6 +262,7 @@ class CLIPTokenizer {
         return tokens;
     }
 
+    // xzl: encode textual prompt
     std::vector<int> encode(std::string text, on_new_token_cb_t on_new_token_cb) {
         std::string original_text = text;
         std::vector<int32_t> bpe_tokens;
@@ -423,7 +424,8 @@ std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::str
 /*================================================ FrozenCLIPEmbedder ================================================*/
 
 // Ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py
-
+// xzl: text to embedding...
+// construct the embedder... from bottom up
 struct CLIPMLP : public GGMLBlock {
 protected:
     bool use_gelu;
@@ -814,7 +816,7 @@ class CLIPVisionModelProjection : public GGMLBlock {
 
 // ldm.modules.encoders.modules.FrozenCLIPEmbedder
 // Ref: https://github.com/AUTOMATIC1111/stable-diffusion-webui/blob/cad87bf4e3e0b0a759afa94e933527c3123d59bc/modules/sd_hijack_clip.py#L283
-// xzl: a "module" ... can be evaluated....
+// xzl: a "module" ... can be evaluated.... xzl: used as "condition" model...
 struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
     SDVersion version = VERSION_1_x;
     CLIPTokenizer tokenizer;
@@ -1008,7 +1010,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
         struct ggml_tensor* hidden_states = forward(compute_ctx, input_ids, input_ids2, embeddings, max_token_idx, return_pooled);
 
         ggml_build_forward_expand(gf, hidden_states);
-
+        ggml_graph_dump_dot(gf, NULL, "clip.dot"); // xzladd
         return gf;
     }
 
@@ -1169,6 +1171,7 @@ struct FrozenCLIPVisionEmbedder : public GGMLModule {
         return gf;
     }
 
+    // lazily construct graph when compute()
     void compute(const int n_threads,
                  ggml_tensor* pixel_values,
                  ggml_tensor** output,
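
The ggml_graph_dump_dot call above writes the CLIP text-encoder compute graph to clip.dot, which this commit checks in alongside clip.dot.png. The PNG was presumably rendered from the dot file with Graphviz, along these lines (command assumed, not part of the commit):

# render the dumped compute graph to an image (requires Graphviz)
dot -Tpng clip.dot -o clip.dot.png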

ggml_extend.hpp

Lines changed: 7 additions & 1 deletion
@@ -721,6 +721,8 @@ struct GGMLModule {
     ggml_type wtype = GGML_TYPE_F32;
     ggml_backend_t backend = NULL;
 
+    int graph_print_count = 0; // xzladd
+
     void alloc_params_ctx() {
         struct ggml_init_params params;
         params.mem_size = static_cast<size_t>(MAX_PARAMS_TENSOR_NUM * ggml_tensor_overhead());
@@ -886,7 +888,11 @@ struct GGMLModule {
         ggml_backend_graph_compute(backend, gf);
 
 #ifdef GGML_PERF
-        ggml_graph_print(gf);
+        // xzladd
+        if (this->graph_print_count == 0) {
+            ggml_graph_print(gf);
+            this->graph_print_count++;
+        }
 #endif
 
         if (output != NULL) {

model.h

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ enum SDVersion {
     VERSION_1_x,
     VERSION_2_x,
     VERSION_XL,
-    VERSION_SVD,
+    VERSION_SVD, // xzl: video diffusion
     VERSION_COUNT,
 };
 

run.sh

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+build/bin/sd -m assets/sd-v1-4.ckpt -p "a lovely cat" --threads 10

sd-1.5-graph-print-by-FL.txt

Lines changed: 1009 additions & 0 deletions
Large diffs are not rendered by default.
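
Taken together, run.sh plus the graph_print_count guard in ggml_extend.hpp produce a one-time graph/perf dump on the first compute() call (guarded, presumably, so the dump is not repeated on every denoising step); sd-1.5-graph-print-by-FL.txt appears to be such captured output. A hedged way to reproduce the capture, assuming the run.sh invocation above and merging both output streams since ggml's logging destination may vary:

# capture stdout and stderr together so the one-shot graph print is kept
build/bin/sd -m assets/sd-v1-4.ckpt -p "a lovely cat" --threads 10 2>&1 | tee sd-1.5-graph-print-by-FL.txt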

stable-diffusion.cpp

Lines changed: 5 additions & 2 deletions
@@ -67,8 +67,8 @@ class StableDiffusionGGML {
     int n_threads = -1;
     float scale_factor = 0.18215f;
 
-    std::shared_ptr<FrozenCLIPEmbedderWithCustomWords> cond_stage_model;
-    std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision; // for svd
+    std::shared_ptr<FrozenCLIPEmbedderWithCustomWords> cond_stage_model; // xzl: for condition (pos, eg...)
+    std::shared_ptr<FrozenCLIPVisionEmbedder> clip_vision; // for svd xzl: svd - video
     std::shared_ptr<UNetModel> diffusion_model; // xzl: THE model...
     std::shared_ptr<AutoEncoderKL> first_stage_model;
     std::shared_ptr<TinyAutoEncoder> tae_first_stage;
@@ -205,6 +205,7 @@ class StableDiffusionGGML {
         diffusion_model->alloc_params_buffer();
         diffusion_model->get_param_tensors(tensors, "model.diffusion_model");
 
+        // xzl: 1st stage... auto encoder...
         ggml_type vae_type = model_data_type;
         if (version == VERSION_XL) {
             vae_type = GGML_TYPE_F32; // avoid nan, not work...
@@ -218,6 +219,7 @@ class StableDiffusionGGML {
            tae_first_stage = std::make_shared<TinyAutoEncoder>(backend, model_data_type, vae_decode_only);
         }
 
+        // xzl: control net.... (optional)
         if (control_net_path.size() > 0) {
             ggml_backend_t cn_backend = NULL;
             if (control_net_cpu && !ggml_backend_is_cpu(backend)) {
@@ -651,6 +653,7 @@ class StableDiffusionGGML {
         return {c_crossattn, c_concat, y};
     }
 
+    // xzl: the main sample loop...
     ggml_tensor* sample(ggml_context* work_ctx,
                         ggml_tensor* x_t,
                         ggml_tensor* noise,
