
Commit aa682bb (1 parent: f4d57f2)

Avoid CUDA stream sync

Signed-off-by: cyy <cyyever@outlook.com>

File tree

1 file changed: +3 −3 lines


src/transformers/modeling_flash_attention_utils.py

Lines changed: 3 additions & 3 deletions
@@ -243,12 +243,12 @@ def _prepare_from_posids(query, key, value, position_ids, query_length):
         cu_seq_lens_q = torch.cat([torch.zeros(1, **tensor_kws), q_len.cumsum(0).to(torch.int32)], 0)
         max_length_q = int(q_len.max())
     else:
-        position_ids = position_ids.flatten()
-        indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)
+        position_ids = position_ids.view(-1)
+        indices_q = (position_ids == 0).nonzero().view(-1)
 
         cu_seq_lens_q = torch.cat(
             (
-                indices_q[position_ids == 0],
+                indices_q,
                 torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
             )
         )
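
For context, here is a minimal runnable sketch of what the rewritten branch computes; the toy position_ids values and the print are illustrative only, not part of the commit. With packed (flattened) sequences, each sequence restarts its positions at 0, so the zero positions mark sequence starts and, together with the total length, give the cumulative sequence lengths (cu_seqlens) that FlashAttention's varlen kernels consume:

    import torch

    # Three packed sequences of lengths 3, 2, and 4; position_ids restarts
    # at 0 at the start of each sequence (toy data, for illustration only).
    position_ids = torch.tensor([0, 1, 2, 0, 1, 0, 1, 2, 3])

    position_ids = position_ids.view(-1)                # flatten without copying
    indices_q = (position_ids == 0).nonzero().view(-1)  # sequence starts: [0, 3, 5]

    cu_seq_lens_q = torch.cat(
        (
            indices_q,
            torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
        )
    )
    print(cu_seq_lens_q)  # tensor([0, 3, 5, 9]): boundaries of the three sequences

The result is the same as before the change: the old code materialized a full torch.arange over every position and then boolean-indexed it with `position_ids == 0`, while the new code obtains the start offsets directly from a single nonzero() call, dropping the arange allocation and the extra masked gather.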
