@@ -80,15 +80,13 @@ def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=N
         (img_mod1, img_mod2), (txt_mod1, txt_mod2) = vec

         # prepare image for attention
-        img_modulated = self.img_norm1(img)
-        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
+        img_modulated = torch.addcmul(img_mod1.shift, 1 + img_mod1.scale, self.img_norm1(img))
         img_qkv = self.img_attn.qkv(img_modulated)
         img_q, img_k, img_v = img_qkv.view(img_qkv.shape[0], img_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)

         # prepare txt for attention
-        txt_modulated = self.txt_norm1(txt)
-        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
+        txt_modulated = torch.addcmul(txt_mod1.shift, 1 + txt_mod1.scale, self.txt_norm1(txt))
         txt_qkv = self.txt_attn.qkv(txt_modulated)
         txt_q, txt_k, txt_v = txt_qkv.view(txt_qkv.shape[0], txt_qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
         txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
@@ -102,12 +100,12 @@ def forward(self, img: Tensor, txt: Tensor, pe: Tensor, vec: Tensor, attn_mask=N
         txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]

         # calculate the img bloks
-        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
-        img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
+        img.addcmul_(img_mod1.gate, self.img_attn.proj(img_attn))
+        img.addcmul_(img_mod2.gate, self.img_mlp(torch.addcmul(img_mod2.shift, 1 + img_mod2.scale, self.img_norm2(img))))

         # calculate the txt bloks
-        txt += txt_mod1.gate * self.txt_attn.proj(txt_attn)
-        txt += txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
+        txt.addcmul_(txt_mod1.gate, self.txt_attn.proj(txt_attn))
+        txt.addcmul_(txt_mod2.gate, self.txt_mlp(torch.addcmul(txt_mod2.shift, 1 + txt_mod2.scale, self.txt_norm2(txt))))

         if txt.dtype == torch.float16:
             txt = torch.nan_to_num(txt, nan=0.0, posinf=65504, neginf=-65504)
@@ -152,7 +150,7 @@ def __init__(

     def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None) -> Tensor:
         mod = vec
-        x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
+        x_mod = torch.addcmul(mod.shift, 1 + mod.scale, self.pre_norm(x))
         qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)

         q, k, v = qkv.view(qkv.shape[0], qkv.shape[1], 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
@@ -162,7 +160,7 @@ def forward(self, x: Tensor, pe: Tensor, vec: Tensor, attn_mask=None) -> Tensor:
         attn = attention(q, k, v, pe=pe, mask=attn_mask)
         # compute activation in mlp stream, cat again and run second linear layer
         output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
-        x += mod.gate * output
+        x.addcmul_(mod.gate, output)
         if x.dtype == torch.float16:
             x = torch.nan_to_num(x, nan=0.0, posinf=65504, neginf=-65504)
         return x
@@ -178,6 +176,6 @@ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
         shift, scale = vec
         shift = shift.squeeze(1)
         scale = scale.squeeze(1)
-        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
+        x = torch.addcmul(shift[:, None, :], 1 + scale[:, None, :], self.norm_final(x))
         x = self.linear(x)
         return x
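
For reference, a minimal sketch (not part of the diff, with made-up shapes) of the identity this change relies on: `torch.addcmul(input, tensor1, tensor2)` computes `input + tensor1 * tensor2` in one fused call, and the in-place `Tensor.addcmul_(tensor1, tensor2)` accumulates `tensor1 * tensor2` into the tensor it is called on, so the rewritten modulation and gating expressions produce the same values while avoiding extra temporaries.

```python
import torch

# Hypothetical shapes, just to check the rewrite is value-preserving.
shift, scale, gate = torch.randn(3, 1, 1, 64).unbind(0)  # stand-ins for mod.shift / mod.scale / mod.gate
x = torch.randn(2, 16, 64)                               # stand-in for the img/txt token stream
y = torch.randn(2, 16, 64)                               # stand-in for an attention/MLP branch output

# out-of-place form: (1 + scale) * x + shift  ==  torch.addcmul(shift, 1 + scale, x)
assert torch.allclose((1 + scale) * x + shift,
                      torch.addcmul(shift, 1 + scale, x))

# in-place form: x = x + gate * y  ==  x.addcmul_(gate, y)
ref = x + gate * y
x.addcmul_(gate, y)
assert torch.allclose(x, ref)
```

Since `addcmul_` mutates its tensor in place, the substitution for `x = x + gate * y` is only safe when nothing else still needs the pre-update value of `x`.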