leejet · leejet · Jan 5, 2024 · Jan 1, 2024 · Jan 1, 2024 · Jan 5, 2024
diff --git a/clip.hpp b/clip.hpp
@@ -443,16 +443,13 @@ struct ResidualAttentionBlock {
     struct ggml_tensor* ln2_w;  // [hidden_size, ]
     struct ggml_tensor* ln2_b;  // [hidden_size, ]
 
-    struct ggml_tensor* attn_scale;  // [hidden_size, ]
-
     size_t calculate_mem_size(ggml_type wtype) {
         double mem_size = 0;
         mem_size += 4 * hidden_size * hidden_size * ggml_type_sizef(wtype);        // q_w/k_w/v_w/out_w
         mem_size += 8 * hidden_size * ggml_type_sizef(GGML_TYPE_F32);              // q_b/k_b/v_b/out_b/ln1_w/ln1_b/ln2_w/ln2_b
         mem_size += 2 * hidden_size * intermediate_size * ggml_type_sizef(wtype);  // fc1_w/fc2_w
         mem_size += intermediate_size * ggml_type_sizef(GGML_TYPE_F32);            // fc1_b
         mem_size += hidden_size * ggml_type_sizef(GGML_TYPE_F32);                  // fc2_b
-        mem_size += ggml_type_sizef(GGML_TYPE_F32);                                // attn_scale
         return static_cast<size_t>(mem_size);
     }
 
@@ -479,10 +476,6 @@ struct ResidualAttentionBlock {
         ln2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
         ln2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, hidden_size);
 
-        attn_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-        ggml_allocr_alloc(alloc, attn_scale);
-        float scale = 1.0f / sqrt((float)d_model);
-        ggml_backend_tensor_set(attn_scale, &scale, 0, sizeof(scale));
     }
 
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@@ -521,7 +514,7 @@ struct ResidualAttentionBlock {
         // self-attention
         {
             struct ggml_tensor* q = ggml_nn_linear(ctx, x, q_w, q_b);
-            q                     = ggml_scale_inplace(ctx, q, attn_scale);
+            q                     = ggml_scale_inplace(ctx, q, 1.0f / sqrt((float)d_model));
             q                     = ggml_reshape_4d(ctx, q, d_model, n_head, n_token, N);   // [N, n_token, n_head, d_model]
             q                     = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));       // [N, n_head, n_token, d_model]
             q                     = ggml_reshape_3d(ctx, q, d_model, n_token, n_head * N);  // [N * n_head, n_token, d_model]

diff --git a/esrgan.hpp b/esrgan.hpp
@@ -91,7 +91,7 @@ struct ResidualDenseBlock {
         tensors[prefix + "conv5.bias"]   = conv5_b;
     }
 
-    ggml_tensor* forward(ggml_context* ctx, ggml_tensor* out_scale, ggml_tensor* x /* feat */) {
+    ggml_tensor* forward(ggml_context* ctx, float out_scale, ggml_tensor* x /* feat */) {
         // x1 = self.lrelu(self.conv1(x))
         ggml_tensor* x1 = ggml_nn_conv_2d(ctx, x, conv1_w, conv1_b, 1, 1, 1, 1);
         x1              = ggml_leaky_relu(ctx, x1, 0.2f, true);
@@ -161,7 +161,7 @@ struct EsrganBlock {
         }
     }
 
-    ggml_tensor* forward(ggml_context* ctx, ggml_tensor* out_scale, ggml_tensor* x) {
+    ggml_tensor* forward(ggml_context* ctx, float out_scale, ggml_tensor* x) {
         ggml_tensor* out = x;
         for (int i = 0; i < num_residual_blocks; i++) {
             // out = self.rdb...(x)
@@ -325,7 +325,7 @@ struct ESRGAN : public GGMLModule {
         tensors["conv_last.bias"]   = conv_last_b;
     }
 
-    ggml_tensor* forward(ggml_context* ctx0, ggml_tensor* out_scale, ggml_tensor* x /* feat */) {
+    ggml_tensor* forward(ggml_context* ctx0, float out_scale, ggml_tensor* x /* feat */) {
         // feat = self.conv_first(feat)
         auto h = ggml_nn_conv_2d(ctx0, x, conv_first_w, conv_first_b, 1, 1, 1, 1);
 
@@ -376,12 +376,7 @@ struct ESRGAN : public GGMLModule {
         struct ggml_cgraph* gf = ggml_new_graph(ctx0);
 
         struct ggml_tensor* x_ = NULL;
-        struct ggml_tensor* os = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-        ggml_allocr_alloc(compute_allocr, os);
-        if (!ggml_allocr_is_measure(compute_allocr)) {
-            float scale = 0.2f;
-            ggml_backend_tensor_set(os, &scale, 0, sizeof(scale));
-        }
+        float out_scale = 0.2f;
 
         // it's performing a compute, check if backend isn't cpu
         if (!ggml_backend_is_cpu(backend)) {
@@ -397,7 +392,7 @@ struct ESRGAN : public GGMLModule {
             x_ = x;
         }
 
-        struct ggml_tensor* out = forward(ctx0, os, x);
+        struct ggml_tensor* out = forward(ctx0, out_scale, x);
 
         ggml_build_forward_expand(gf, out);
         ggml_free(ctx0);

diff --git a/ggml b/ggml
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
@@ -449,7 +449,7 @@ __STATIC_INLINE__ struct ggml_tensor* ggml_nn_group_norm(struct ggml_context* ct
                                                          struct ggml_tensor* w,
                                                          struct ggml_tensor* b,
                                                          int num_groups = 32) {
-    if (x->n_dims == 4) {
+    if (ggml_n_dims(x) >= 3) {
         w = ggml_reshape_4d(ctx, w, 1, 1, w->ne[0], 1);
         b = ggml_reshape_4d(ctx, b, 1, 1, b->ne[0], 1);
     }

diff --git a/lora.hpp b/lora.hpp
@@ -113,7 +113,7 @@ struct LoraModel : public GGMLModule {
             applied_lora_tensors.insert(scale_name);
 
             // calc_cale
-            int64_t dim       = lora_down->ne[lora_down->n_dims - 1];
+            int64_t dim       = lora_down->ne[ggml_n_dims(lora_down) - 1];
             float scale_value = 1.0f;
             if (lora_tensors.find(scale_name) != lora_tensors.end()) {
                 scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
@@ -123,17 +123,10 @@ struct LoraModel : public GGMLModule {
             }
             scale_value *= multiplier;
 
-            ggml_tensor* lora_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-
-            ggml_allocr_alloc(compute_allocr, lora_scale);
-            if (!ggml_allocr_is_measure(compute_allocr)) {
-                ggml_backend_tensor_set(lora_scale, &scale_value, 0, ggml_nbytes(lora_scale));
-            }
-
             // flat lora tensors to multiply it
-            int64_t lora_up_rows   = lora_up->ne[lora_up->n_dims - 1];
+            int64_t lora_up_rows   = lora_up->ne[ggml_n_dims(lora_up) - 1];
             lora_up                = ggml_reshape_2d(ctx0, lora_up, ggml_nelements(lora_up) / lora_up_rows, lora_up_rows);
-            int64_t lora_down_rows = lora_down->ne[lora_down->n_dims - 1];
+            int64_t lora_down_rows = lora_down->ne[ggml_n_dims(lora_down) - 1];
             lora_down              = ggml_reshape_2d(ctx0, lora_down, ggml_nelements(lora_down) / lora_down_rows, lora_down_rows);
 
             // ggml_mul_mat requires tensor b transposed
@@ -142,7 +135,7 @@ struct LoraModel : public GGMLModule {
             updown                     = ggml_cont(ctx0, ggml_transpose(ctx0, updown));
             updown                     = ggml_reshape(ctx0, updown, weight);
             GGML_ASSERT(ggml_nelements(updown) == ggml_nelements(weight));
-            updown = ggml_scale_inplace(ctx0, updown, lora_scale);
+            updown = ggml_scale_inplace(ctx0, updown, scale_value);
             ggml_tensor* final_weight;
             // if (weight->type != GGML_TYPE_F32 && weight->type != GGML_TYPE_F16) {
             //     final_weight = ggml_new_tensor(ctx0, GGML_TYPE_F32, weight->n_dims, weight->ne);

diff --git a/model.cpp b/model.cpp
@@ -673,7 +673,7 @@ bool ModelLoader::init_from_gguf_file(const std::string& file_path, const std::s
 
         // LOG_DEBUG("%s", name.c_str());
 
-        TensorStorage tensor_storage(prefix + name, dummy->type, dummy->ne, dummy->n_dims, file_index, offset);
+        TensorStorage tensor_storage(prefix + name, dummy->type, dummy->ne, ggml_n_dims(dummy), file_index, offset);
 
         GGML_ASSERT(ggml_nbytes(dummy) == tensor_storage.nbytes());
 
@@ -1417,6 +1417,9 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
         if (pair.first.find("cond_stage_model.transformer.text_model.encoder.layers.23") != std::string::npos) {
             continue;
         }
+        if (pair.first.find("alphas_cumprod") != std::string::npos) {
+            continue;
+        }
 
         if (pair.first.find("alphas_cumprod") != std::string::npos) {
             continue;

diff --git a/tae.hpp b/tae.hpp
@@ -278,9 +278,6 @@ struct TinyDecoder {
     ggml_tensor* conv_final_w;  // [output_channels, channels, 3, 3]
     ggml_tensor* conv_final_b;  // [output_channels]
 
-    ggml_tensor* in_scale_1d3;  // [1]
-    ggml_tensor* in_scale_3;    // [1]
-
     TinyDecoder() {
         for (int i = 0; i < num_blocks; i++) {
             input_blocks[i].in_channels  = channels;
@@ -351,16 +348,6 @@ struct TinyDecoder {
         }
 
         final_block.init_params(ctx);
-
-        // initialize constants scales
-        in_scale_1d3 = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-        in_scale_3   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-        ggml_allocr_alloc(alloc, in_scale_1d3);
-        float scale_1d3 = 1.0f / 3.0f;
-        ggml_backend_tensor_set(in_scale_1d3, &scale_1d3, 0, sizeof(scale_1d3));
-        ggml_allocr_alloc(alloc, in_scale_3);
-        float scale_3 = 3.0f;
-        ggml_backend_tensor_set(in_scale_3, &scale_3, 0, sizeof(scale_3));
     }
 
     void map_by_name(std::map<std::string, ggml_tensor*>& tensors, std::string prefix) {
@@ -391,9 +378,9 @@ struct TinyDecoder {
 
     ggml_tensor* forward(ggml_context* ctx, ggml_tensor* z) {
         // torch.tanh(x / 3) * 3
-        auto h = ggml_scale(ctx, z, in_scale_1d3);
+        auto h = ggml_scale(ctx, z, 1.0f / 3.0f);
         h      = ggml_tanh_inplace(ctx, h);
-        h      = ggml_scale(ctx, h, in_scale_3);
+        h      = ggml_scale(ctx, h, 3.0f);
 
         // conv(4, 64)
         h = ggml_nn_conv_2d(ctx, h, conv_input_w, conv_input_b, 1, 1, 1, 1);

diff --git a/unet.hpp b/unet.hpp
@@ -182,8 +182,6 @@ struct SpatialTransformer {
 
     std::vector<Transformer> transformers;
 
-    struct ggml_tensor* attn_scale;
-
     // proj_out
     struct ggml_tensor* proj_out_w;  // [in_channels, in_channels, 1, 1]
     struct ggml_tensor* proj_out_b;  // [in_channels,]
@@ -202,7 +200,6 @@ struct SpatialTransformer {
         mem_size += 2 * in_channels * ggml_type_sizef(GGML_TYPE_F32);                        // norm_w/norm_b
         mem_size += 2 * in_channels * in_channels * 1 * 1 * ggml_type_sizef(GGML_TYPE_F16);  // proj_in_w/proj_out_w
         mem_size += 2 * in_channels * ggml_type_sizef(GGML_TYPE_F32);                        // proj_in_b/proj_out_b
-        mem_size += 1 * ggml_type_sizef(GGML_TYPE_F32);                                      // attn_scale
 
         // transformer
         for (auto& transformer : transformers) {
@@ -226,11 +223,6 @@ struct SpatialTransformer {
         proj_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
         proj_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
 
-        attn_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-        ggml_allocr_alloc(alloc, attn_scale);
-        float scale = 1.0f / sqrt((float)d_head);
-        ggml_backend_tensor_set(attn_scale, &scale, 0, sizeof(scale));
-
         // transformer
         for (auto& transformer : transformers) {
             transformer.norm1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
@@ -332,7 +324,7 @@ struct SpatialTransformer {
                 x                     = ggml_reshape_2d(ctx, x, c, h * w * n);        // [N * h * w, in_channels]
                 struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn1_q_w, x);  // [N * h * w, in_channels]
 #if !defined(SD_USE_FLASH_ATTENTION) || defined(SD_USE_CUBLAS) || defined(SD_USE_METAL)
-                q = ggml_scale_inplace(ctx, q, attn_scale);
+                q = ggml_scale_inplace(ctx, q, 1.0f / sqrt((float)d_head));
 #endif
                 q = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
                 q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));    // [N, n_head, h * w, d_head]
@@ -380,7 +372,7 @@ struct SpatialTransformer {
                 context               = ggml_reshape_2d(ctx, context, context->ne[0], context->ne[1] * context->ne[2]);  // [N * max_position, hidden_size]
                 struct ggml_tensor* q = ggml_mul_mat(ctx, transformer.attn2_q_w, x);                                     // [N * h * w, in_channels]
 #if !defined(SD_USE_FLASH_ATTENTION) || defined(SD_USE_CUBLAS) || defined(SD_USE_METAL)
-                q = ggml_scale_inplace(ctx, q, attn_scale);
+                q = ggml_scale_inplace(ctx, q, 1.0f / sqrt((float)d_head));
 #endif
                 q = ggml_reshape_4d(ctx, q, d_head, n_head, h * w, n);   // [N, h * w, n_head, d_head]
                 q = ggml_cont(ctx, ggml_permute(ctx, q, 0, 2, 1, 3));    // [N, n_head, h * w, d_head]

diff --git a/vae.hpp b/vae.hpp
@@ -118,8 +118,6 @@ struct AttnBlock {
     struct ggml_tensor* proj_out_w;  // [in_channels, in_channels, 1, 1]
     struct ggml_tensor* proj_out_b;  // [in_channels,]
 
-    struct ggml_tensor* attn_scale;
-
     size_t calculate_mem_size(ggml_type wtype) {
         double mem_size = 0;
         mem_size += 6 * in_channels * ggml_type_sizef(GGML_TYPE_F32);                        // norm_w/norm_b/q_b/k_v/v_b/proj_out_b
@@ -140,11 +138,6 @@ struct AttnBlock {
 
         proj_out_w = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, 1, 1, in_channels, in_channels);
         proj_out_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, in_channels);
-
-        attn_scale = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-        ggml_allocr_alloc(alloc, attn_scale);
-        float scale = 1.0f / sqrt((float)in_channels);
-        ggml_backend_tensor_set(attn_scale, &scale, 0, sizeof(scale));
     }
 
     void map_by_name(std::map<std::string, struct ggml_tensor*>& tensors, const std::string prefix) {
@@ -181,7 +174,7 @@ struct AttnBlock {
         k = ggml_reshape_3d(ctx, k, c, h * w, n);              // [N, h * w, in_channels]
 
         auto w_ = ggml_mul_mat(ctx, k, q);  // [N, h * w, h * w]
-        w_      = ggml_scale_inplace(ctx, w_, attn_scale);
+        w_      = ggml_scale_inplace(ctx, w_, 1.0f / sqrt((float)in_channels));
         w_      = ggml_soft_max_inplace(ctx, w_);
 
         v  = ggml_reshape_3d(ctx, v, h * w, c, n);               // [N, in_channels, h * w]
+49 −0		Package.swift
+37 −1		README.md
+20 −21		examples/dolly-v2/main.cpp
+20 −26		examples/gpt-2/main-alloc.cpp
+1 −8		examples/gpt-2/main-backend.cpp
+18 −25		examples/gpt-2/main-batched.cpp
+20 −24		examples/gpt-2/main-ctx.cpp
+3 −14		examples/gpt-2/main.cpp
+18 −19		examples/gpt-j/main.cpp
+20 −21		examples/gpt-neox/main.cpp
+4 −4		examples/mnist/main.cpp
+14 −11		examples/mpt/main.cpp
+14 −11		examples/replit/main.cpp
+66 −70		examples/sam/main.cpp
+20 −23		examples/starcoder/main.cpp
+1 −2		examples/starcoder/starcoder-mmap.cpp
+62 −74		examples/whisper/whisper.cpp
+8 −1		include/ggml/ggml-backend.h
+40 −16		include/ggml/ggml.h
+155 −0		scripts/sync-llama-am.sh
+1 −0		scripts/sync-llama.last
+165 −0		scripts/sync-whisper-am.sh
+1 −0		scripts/sync-whisper.last
+8 −1		src/CMakeLists.txt
+13 −5		src/ggml-alloc.c
+13 −9		src/ggml-backend-impl.h
+94 −20		src/ggml-backend.c
+830 −554		src/ggml-cuda.cu
+4 −1		src/ggml-metal.h
+236 −66		src/ggml-metal.m
+475 −247		src/ggml-metal.metal
+9 −9		src/ggml-opencl.h
+84 −399		src/ggml-quants.c
+1 −1		src/ggml-quants.h
+500 −161		src/ggml.c
+8 −0		tests/CMakeLists.txt
+57 −20		tests/test-backend-ops.cpp
+2 −2		tests/test-conv1d.cpp
+2 −2		tests/test-conv2d.cpp
+110 −0		tests/test-dup.c
+5 −8		tests/test-grad0.cpp
+2 −2		tests/test-mul-mat.cpp
+5 −5		tests/test-quantize-perf.cpp
+3 −3		tests/test0.c