Skip to content

Suboptimal code for switch with bit tests #155953

@efriedma-quic

Description

@efriedma-quic

Consider the following (borrowed from #155910):

define i32 @foo(i32 noundef signext %x) "no-jump-tables"="true" {
entry:
  switch i32 %x, label %return [
  i32 7, label %sw.bb
  i32 10, label %sw.bb
  i32 8, label %sw.bb1
  i32 11, label %sw.bb1
  i32 9, label %sw.bb2
  i32 12, label %sw.bb2
  i32 4, label %sw.bb3
  ]

sw.bb:                                            ; preds = %entry, %entry
  tail call void @foo1(i32 noundef signext %x)
  br label %return

sw.bb1:                                           ; preds = %entry, %entry
  tail call void @foo2(i32 noundef signext %x)
  br label %return

sw.bb2:                                           ; preds = %entry, %entry
  tail call void @foo3(i32 noundef signext %x)
  br label %return

sw.bb3:                                           ; preds = %entry
  tail call void @foo4(i32 noundef signext 4)
  br label %return

return:                                           ; preds = %sw.bb, %sw.bb1, %sw.bb2, %sw.bb3, %entry
  %retval.0 = phi i32 [ 0, %entry ], [ 4, %sw.bb3 ], [ %x, %sw.bb2 ], [ %x, %sw.bb1 ], [ %x, %sw.bb ]
  ret i32 %retval.0
}
declare void @foo1(i32 noundef signext)
declare void @foo2(i32 noundef signext)
declare void @foo3(i32 noundef signext)
declare void @foo4(i32 noundef signext)

llc -mtriple=aarch64 produces (cut down to the relevant bit)

[...]
        mov     w19, w0
        cmp     w0, #12
        b.hi    .LBB0_6
// %bb.1:                               // %entry
        mov     w8, #1                          // =0x1
        mov     w9, #1152                       // =0x480
        lsl     w8, w8, w19
        tst     w8, w9
        b.ne    .LBB0_8
// %bb.2:                               // %entry
        mov     w9, #2304                       // =0x900
        tst     w8, w9
        b.ne    .LBB0_5
// %bb.3:                               // %entry
        mov     w8, #1                          // =0x1
        mov     w9, #4608                       // =0x1200
        lsl     w8, w8, w19
        tst     w8, w9
        b.eq    .LBB0_6
[...]
.LBB0_6:                                // %entry
        cmp     w19, #4
        b.ne    .LBB0_9

There are two issues here:

  • We generate a redundant shift. Both lsl w8, w8, w19 instructions produce the same value.
  • The initial branch to .LBB0_6 leads to a redundant check: when we reach .LBB0_6 via the b.hi (i.e. w0 is greater than 12), the cmp w19, #4 there can never succeed, so that comparison is dead on this path.

32-bit Arm, PowerPC, and MIPS generate something similar. SPARC somehow manages to hoist the shift. RISC-V transforms the shift-left into a shift-right, which is probably worse than hoisting the shift. x86 has a dedicated bit-test instruction, so it doesn't use any shifts.

CC @fhahn @scui-ibm

Metadata

Metadata

Assignees

No one assigned

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions