
Commit 47406da

Reformat TransformerComputeKernelsLayered by adjusting Javadoc annotations, reorganizing line breaks, and improving parameter alignment for enhanced readability and consistency.
1 parent e60f478 commit 47406da

File tree

1 file changed: +115 -89 lines changed


src/main/java/com/example/tornadovm/TransformerComputeKernelsLayered.java

Lines changed: 115 additions & 89 deletions
@@ -16,22 +16,23 @@ public TransformerComputeKernelsLayered() {
     }
 
     /**
-     * Performs RMS (Root Mean Square) normalization using parallel reduction.
-     * This is the first phase of RMS normalization that computes the variance
-     * and scaling factor across all work groups.
+     * Performs RMS (Root Mean Square) normalization using parallel reduction. This is the first phase of RMS normalization that computes the variance and scaling factor across all work groups.
      *
-     * Algorithm:
-     * 1. Each thread computes square of its input element
-     * 2. Work group performs parallel reduction of squares
-     * 3. Partial sums stored per work group
-     * 4. First thread combines all partial sums and computes normalization factor
+     * Algorithm: 1. Each thread computes square of its input element 2. Work group performs parallel reduction of squares 3. Partial sums stored per work group 4. First thread combines all partial
+     * sums and computes normalization factor
      *
-     * @param context Kernel execution context
-     * @param output Array to store partial sums and final normalization factor
-     * @param x Input array to normalize
-     * @param size Number of elements to process
-     * @param ermsNorm Epsilon value squared for numerical stability
-     * @param localMemSize Size of local memory allocation (must match work group size)
+     * @param context
+     *            Kernel execution context
+     * @param output
+     *            Array to store partial sums and final normalization factor
+     * @param x
+     *            Input array to normalize
+     * @param size
+     *            Number of elements to process
+     * @param ermsNorm
+     *            Epsilon value squared for numerical stability
+     * @param localMemSize
+     *            Size of local memory allocation (must match work group size)
      */
     public static void reductionOneBlockWithLayer(KernelContext context, FloatArray output, FloatArray x, int size, float ermsNorm, int localMemSize) {
         int gid = context.globalIdx;
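
For reference, the factor this two-phase reduction produces can be emulated sequentially on the host. The sketch below is not the TornadoVM kernel: it uses a plain float[] instead of FloatArray and a hypothetical helper name, and it assumes the usual RMSNorm definition (factor = 1 / sqrt(mean of squares + epsilon)) implied by the Javadoc.

// Hypothetical host-side reference for phase 1, not the GPU kernel above.
// Each GPU thread squares one element; work groups reduce the partial sums.
static float rmsNormFactor(float[] x, int size, float ermsNorm) {
    float sumOfSquares = 0.0f;
    for (int i = 0; i < size; i++) {
        sumOfSquares += x[i] * x[i];
    }
    // Assumed RMSNorm scaling: 1 / sqrt(mean(x^2) + eps)
    return 1.0f / (float) Math.sqrt(sumOfSquares / size + ermsNorm);
}
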
@@ -80,16 +81,20 @@ public static void reductionOneBlockWithLayer(KernelContext context, FloatArray
     }
 
     /**
-     * Applies the computed normalization factor to input and weight elements.
-     * This is the second phase of RMS normalization.
+     * Applies the computed normalization factor to input and weight elements. This is the second phase of RMS normalization.
      *
      * Formula: output[i] = weight[i] * (normalizationFactor * x[i])
      *
-     * @param context Kernel execution context
-     * @param output Array for normalized output
-     * @param x Input values to normalize
-     * @param weights Weight values for each element
-     * @param temp Temporary array containing normalization factor at index 0
+     * @param context
+     *            Kernel execution context
+     * @param output
+     *            Array for normalized output
+     * @param x
+     *            Input values to normalize
+     * @param weights
+     *            Weight values for each element
+     * @param temp
+     *            Temporary array containing normalization factor at index 0
      */
     public static void reductionOneBlock2WithLayer(KernelContext context, FloatArray output, FloatArray x, FloatArray weights, FloatArray temp) {
         int gid = context.globalIdx;
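
The second phase is a plain element-wise scale. A minimal sketch of the documented formula, again with plain float[] and a hypothetical helper name:

// Sketch of the documented formula: output[i] = weights[i] * (normalizationFactor * x[i])
static void applyRmsNorm(float[] output, float[] x, float[] weights, float normalizationFactor) {
    for (int i = 0; i < x.length; i++) {
        output[i] = weights[i] * (normalizationFactor * x[i]);
    }
}
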
@@ -99,21 +104,26 @@ public static void reductionOneBlock2WithLayer(KernelContext context, FloatArray
     }
 
     /**
-     * Copies keys and values into the key-value cache for attention computation.
-     * Enables efficient access to past key-value pairs during autoregressive generation.
+     * Copies keys and values into the key-value cache for attention computation. Enables efficient access to past key-value pairs during autoregressive generation.
      *
-     * Cache layout: [layer][position][dimension]
-     * - Each layer has its own key and value cache
-     * - Each position in sequence has a key and value vector
+     * Cache layout: [layer][position][dimension] - Each layer has its own key and value cache - Each position in sequence has a key and value vector
      *
-     * @param destKeyCache Destination array for key cache
-     * @param srcKey Source keys to copy
-     * @param destValueCache Destination array for value cache
-     * @param srcValue Source values to copy
-     * @param positioNlayer Array containing current position
-     * @param kvDim Dimension of key/value vectors
-     * @param layer Current transformer layer index
-     * @param contextLength Maximum sequence length
+     * @param destKeyCache
+     *            Destination array for key cache
+     * @param srcKey
+     *            Source keys to copy
+     * @param destValueCache
+     *            Destination array for value cache
+     * @param srcValue
+     *            Source values to copy
+     * @param positioNlayer
+     *            Array containing current position
+     * @param kvDim
+     *            Dimension of key/value vectors
+     * @param layer
+     *            Current transformer layer index
+     * @param contextLength
+     *            Maximum sequence length
      */
     public static void copyToCache(FloatArray destKeyCache, FloatArray srcKey, FloatArray destValueCache, FloatArray srcValue, IntArray positioNlayer, int kvDim, int layer, int contextLength) {
 
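Under the [layer][position][dimension] layout described in the Javadoc, one (layer, position) slot would start at offset layer * contextLength * kvDim + position * kvDim in a flat cache. The sketch below is a hedged host-side illustration of that layout with plain arrays and a hypothetical helper name; the kernel's actual offset arithmetic may differ.

// Hypothetical reference for the documented cache layout [layer][position][dimension].
// Copies one position's key and value vectors into flat caches.
static void copyToCacheReference(float[] keyCache, float[] key, float[] valueCache, float[] value,
        int position, int kvDim, int layer, int contextLength) {
    int base = layer * contextLength * kvDim + position * kvDim;  // start of this (layer, position) slot
    for (int i = 0; i < kvDim; i++) {
        keyCache[base + i] = key[i];
        valueCache[base + i] = value[i];
    }
}
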
@@ -127,7 +137,6 @@ public static void copyToCache(FloatArray destKeyCache, FloatArray srcKey, Float
         }
     }
 
-
     public static void copyTo(FloatArray src, int srcOffset, FloatArray dest, int destOffset, int size) {
         // Generic copy: src[srcOffset:srcOffset+size] -> dest[destOffset:destOffset+size]
         for (@Parallel int i = 0; i < size; i++) {
@@ -144,20 +153,23 @@ public static void copyChunk(FloatArray in, FloatArray out, int dim1In, int dim1
     }
 
     /**
-     * Applies Rotary Position Encoding (RoPE) to query and key vectors.
-     * RoPE rotates pairs of dimensions based on their position in the sequence,
-     * enabling the model to learn relative positional information.
+     * Applies Rotary Position Encoding (RoPE) to query and key vectors. RoPE rotates pairs of dimensions based on their position in the sequence, enabling the model to learn relative positional
+     * information.
      *
-     * For each pair of dimensions (2*i, 2*i+1):
-     * - Compute rotation angle based on position and frequency
-     * - Apply 2D rotation to the pair
+     * For each pair of dimensions (2*i, 2*i+1): - Compute rotation angle based on position and frequency - Apply 2D rotation to the pair
      *
-     * @param context Kernel execution context
-     * @param positionHolder Array containing current position
-     * @param sq Query vectors to rotate
-     * @param sk Key vectors to rotate
-     * @param kv_dim Dimension of key/value vectors
-     * @param head_size Dimension of each attention head
+     * @param context
+     *            Kernel execution context
+     * @param positionHolder
+     *            Array containing current position
+     * @param sq
+     *            Query vectors to rotate
+     * @param sk
+     *            Key vectors to rotate
+     * @param kv_dim
+     *            Dimension of key/value vectors
+     * @param head_size
+     *            Dimension of each attention head
      */
    public static void ropeRotation(KernelContext context, IntArray positionHolder, FloatArray sq, FloatArray sk, int kv_dim, int head_size) {
        int i = context.globalIdx * 2;
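
The pairwise rotation this Javadoc describes (dimensions 2*i and 2*i+1 rotated by a position-dependent angle) can be written out as a short single-vector reference. The frequency base 10000 is the common LLaMA-style choice and is an assumption here, not something stated in the diff; vec.length is assumed even and a multiple of headSize.

// Hedged reference for RoPE on one vector; the kernel applies this to sq and sk in parallel.
static void ropeRotateReference(float[] vec, int position, int headSize) {
    for (int i = 0; i < vec.length; i += 2) {
        int dimInHead = i % headSize;                                          // pair's index within its head
        double freq = 1.0 / Math.pow(10000.0, (double) dimInHead / headSize);  // assumed base 10000
        double angle = position * freq;
        float cos = (float) Math.cos(angle);
        float sin = (float) Math.sin(angle);
        float v0 = vec[i];
        float v1 = vec[i + 1];
        vec[i] = v0 * cos - v1 * sin;        // 2D rotation of the pair (2*i, 2*i+1)
        vec[i + 1] = v0 * sin + v1 * cos;
    }
}
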
@@ -194,8 +206,9 @@ public static void ropeRotationPhi3(KernelContext context, IntArray positionHold
         int dimHalf = head_size / 2;
 
         // Each thread processes one dimension pair
-        if (idx >= dimHalf)
+        if (idx >= dimHalf) {
             return;
+        }
 
         int position = positionHolder.get(0);
 
@@ -209,8 +222,9 @@ public static void ropeRotationPhi3(KernelContext context, IntArray positionHold
         int totalDim = sq.getSize();
         for (int base = 0; base < totalDim; base += head_size) {
             // Skip if we're beyond the bounds
-            if (base + idx >= totalDim || base + idx + dimHalf >= totalDim)
+            if (base + idx >= totalDim || base + idx + dimHalf >= totalDim) {
                 break;
+            }
 
             // Rotate query
             float v0 = sq.get(base + idx);
@@ -719,18 +733,24 @@ public static void matrixVectorGeneric(
     // @formatter:on
 
     /**
-     * Matrix-vector multiplication with residual connection.
-     * Combines regular matrix multiplication with addition of existing values.
+     * Matrix-vector multiplication with residual connection. Combines regular matrix multiplication with addition of existing values.
      *
      * Formula: hb[i] = hb[i] + w[i]·x
      *
-     * @param context Kernel execution context
-     * @param x Input vector
-     * @param hb Input/output vector (contains residual, receives result)
-     * @param w Weight matrix
-     * @param n Input dimension
-     * @param d Output dimension
-     * @param localWorkGroupSize Work group size
+     * @param context
+     *            Kernel execution context
+     * @param x
+     *            Input vector
+     * @param hb
+     *            Input/output vector (contains residual, receives result)
+     * @param w
+     *            Weight matrix
+     * @param n
+     *            Input dimension
+     * @param d
+     *            Output dimension
+     * @param localWorkGroupSize
+     *            Work group size
      */
     public static void matrixVectorGenericWithResidual(KernelContext context, FloatArray x, FloatArray hb, HalfFloatArray w, int n, int d, int localWorkGroupSize) {
         // One row per workgroup (not per thread)
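
A sequential sketch of the documented formula hb[i] = hb[i] + w[i]·x, using plain float[] instead of KernelContext/HalfFloatArray and assuming row-major weight storage (row i at w[i * n .. i * n + n - 1]):

// Reference for the residual matrix-vector product: hb[row] += dot(row of w, x)
static void matVecWithResidualReference(float[] x, float[] hb, float[] w, int n, int d) {
    for (int row = 0; row < d; row++) {        // one work group per row in the kernel
        float dot = 0.0f;
        for (int col = 0; col < n; col++) {    // threads split this loop, then reduce
            dot += w[row * n + col] * x[col];
        }
        hb[row] += dot;                        // add to the residual already stored in hb
    }
}
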
@@ -753,20 +773,26 @@ public static void matrixVectorGenericWithResidual(KernelContext context, FloatA
     }
 
     /**
-     * Fused feed-forward network with SiLU activation and GLU gating.
-     * Implements the SwiGLU variant used in LLaMA-style models.
+     * Fused feed-forward network with SiLU activation and GLU gating. Implements the SwiGLU variant used in LLaMA-style models.
      *
-     * Formula: FFN(x) = SiLU(x·W1) ⊙ (x·W3)
-     * where ⊙ denotes element-wise multiplication
+     * Formula: FFN(x) = SiLU(x·W1) ⊙ (x·W3) where ⊙ denotes element-wise multiplication
      *
-     * @param context Kernel execution context
-     * @param x Input vector
-     * @param hb Output buffer
-     * @param w1 First feed-forward weight matrix
-     * @param w3 Third feed-forward weight matrix (gate)
-     * @param n Input dimension
-     * @param d Hidden dimension
-     * @param localWorkGroupSize Work group size
+     * @param context
+     *            Kernel execution context
+     * @param x
+     *            Input vector
+     * @param hb
+     *            Output buffer
+     * @param w1
+     *            First feed-forward weight matrix
+     * @param w3
+     *            Third feed-forward weight matrix (gate)
+     * @param n
+     *            Input dimension
+     * @param d
+     *            Hidden dimension
+     * @param localWorkGroupSize
+     *            Work group size
      */
     public static void fusedFeedForwardWithSiLUAndGLUActivation(KernelContext context, FloatArray x, FloatArray hb, HalfFloatArray w1, HalfFloatArray w3, int n, int d, int localWorkGroupSize) {
         // One row per workgroup (not per thread)
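
The fused form computes both projections for a row and gates them before writing. A minimal plain-Java sketch of FFN(x) = SiLU(x·W1) ⊙ (x·W3), assuming row-major weights and float[] buffers (the kernel itself works on HalfFloatArray with a per-row work group):

// Sketch of the fused SwiGLU feed-forward step, one output element per row.
static void fusedFfnReference(float[] x, float[] hb, float[] w1, float[] w3, int n, int d) {
    for (int row = 0; row < d; row++) {
        float a = 0.0f;   // x · W1[row]
        float b = 0.0f;   // x · W3[row] (gate)
        for (int col = 0; col < n; col++) {
            a += w1[row * n + col] * x[col];
            b += w3[row * n + col] * x[col];
        }
        float silu = a / (1.0f + (float) Math.exp(-a));   // SiLU(a) = a * sigmoid(a)
        hb[row] = silu * b;                                // element-wise gating
    }
}
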
@@ -789,10 +815,10 @@ public static void fusedFeedForwardWithSiLUAndGLUActivation(KernelContext contex
     }
 
     /**
-     * Gaussian Error Linear Unit (GELU) activation function.
-     * Approximation formula: GELU(x) ≈ 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))
+     * Gaussian Error Linear Unit (GELU) activation function. Approximation formula: GELU(x) ≈ 0.5 * x * (1 + tanh(√(2/π) * (x + 0.044715 * x³)))
      *
-     * @param x Input value
+     * @param x
+     *            Input value
      * @return Activated value
      */
     public static float geluActivation(float x) {
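
Written out in plain Java, the tanh approximation quoted in the Javadoc corresponds to the following sketch; it uses java.lang.Math for illustration, whereas the kernel would use TornadoMath.

// GELU(x) ≈ 0.5 * x * (1 + tanh(sqrt(2/PI) * (x + 0.044715 * x^3)))
static float geluReference(float x) {
    float sqrtTwoOverPi = (float) Math.sqrt(2.0 / Math.PI);
    float inner = sqrtTwoOverPi * (x + 0.044715f * x * x * x);
    return 0.5f * x * (1.0f + (float) Math.tanh(inner));
}
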
@@ -801,33 +827,33 @@ public static float geluActivation(float x) {
     }
 
     /**
-     * Sigmoid-weighted Linear Unit (SiLU) activation function.
-     * Also known as Swish activation.
+     * Sigmoid-weighted Linear Unit (SiLU) activation function. Also known as Swish activation.
      *
      * Formula: SiLU(x) = x * σ(x) = x / (1 + e^(-x))
      *
-     * @param x Input value
+     * @param x
+     *            Input value
      * @return Activated value
      */
     public static float siluActivation(float x) {
         return x * (1.0f / (1.0f + TornadoMath.exp(-x)));
     }
 
     /**
-     * Optimized row-major matrix-vector multiplication for a single row.
-     * Uses parallel reduction within a work group to compute one dot product.
+     * Optimized row-major matrix-vector multiplication for a single row. Uses parallel reduction within a work group to compute one dot product.
      *
-     * Algorithm:
-     * 1. Each thread computes partial dot product
-     * 2. Partial results stored in local memory
-     * 3. Tree-based reduction combines partial results
-     * 4. Returns final dot product for the row
+     * Algorithm: 1. Each thread computes partial dot product 2. Partial results stored in local memory 3. Tree-based reduction combines partial results 4. Returns final dot product for the row
      *
-     * @param context Kernel execution context
-     * @param localSize Work group size
-     * @param x Input vector
-     * @param w Weight matrix row
-     * @param n Input dimension
+     * @param context
+     *            Kernel execution context
+     * @param localSize
+     *            Work group size
+     * @param x
+     *            Input vector
+     * @param w
+     *            Weight matrix row
+     * @param n
+     *            Input dimension
      * @return Dot product result for this row
      */
     public static float matrixVectorRowMajorOptimized(KernelContext context, int localSize, FloatArray x, FloatArray w, int n) {
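
The four-step algorithm in this Javadoc (partial dot products, local memory, tree reduction, final result) can be emulated on the host with an explicit "local" buffer. This is a hedged sketch, not the kernel: it assumes localSize is a power of two, uses plain float[] in place of FloatArray, and serializes what the GPU does with barriers between reduction steps.

// Host-side emulation of the work-group dot-product reduction described above.
static float dotProductReductionReference(float[] x, float[] w, int n, int localSize) {
    float[] local = new float[localSize];              // stands in for local/shared memory
    for (int t = 0; t < localSize; t++) {              // steps 1-2: strided partial dot products
        float partial = 0.0f;
        for (int i = t; i < n; i += localSize) {
            partial += w[i] * x[i];
        }
        local[t] = partial;
    }
    for (int stride = localSize / 2; stride > 0; stride /= 2) {   // step 3: tree-based reduction
        for (int t = 0; t < stride; t++) {
            local[t] += local[t + stride];             // a barrier separates these rounds on the GPU
        }
    }
    return local[0];                                   // step 4: final dot product for the row
}
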
@@ -915,7 +941,7 @@ public static void siluInPlace(FloatArray array, int size) {
         // SiLU activation: silu(x) = x * sigmoid(x) = x / (1 + exp(-x))
         for (@Parallel int i = 0; i < size; i++) {
             float x = array.get(i);
-            float silu = x / (1.0f + TornadoMath.exp(-x));
+            float silu = x / (1.0f + TornadoMath.exp(-x));
             array.set(i, silu);
         }
     }
