Skip to content

Conversation

yonghong-song
Copy link
Contributor

@yonghong-song yonghong-song commented Jul 20, 2025

Add jump table (switch statement and computed goto) support for BPF backend.
A gotox <reg> insn is implemented and the <reg> holds the target insn where the gotox will go.

For a switch statement like

...
            switch (ctx->x) {
            case 1: ret_user = 18; break;
            case 20: ret_user = 6; break;
            case 16: ret_user = 9; break;
            case 6: ret_user = 16; break;
            case 8: ret_user = 14; break;
            case 30: ret_user = 2; break;
            default: ret_user = 1; break;
            }
...

and the final binary

    The final binary:
       4:       67 01 00 00 03 00 00 00 r1 <<= 0x3                                                           
       5:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll                                  
                0000000000000028:  R_BPF_64_64  BPF.JT.0.0                                                   
       7:       0f 12 00 00 00 00 00 00 r2 += r1
        ...
    Symbol table:
     4: 0000000000000000   240 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.0
     5: 0000000000000000     4 OBJECT  GLOBAL DEFAULT     6 ret_user
     6: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND bar
     7: 00000000000000f0   256 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.1
    and
  [ 4] .jumptables       PROGBITS        0000000000000000 0001c8 0001f0 00      0   0  1

Note that for the above example, -mllvm -bpf-min-jump-table-entries=5 should be in compilation flags as the current default bpf-min-jump-table-entries is 13. For example.

clang --target=bpf -mcpu=v4 -O2 -mllvm -bpf-min-jump-table-entries=5 -S -g test.c

For computed goto like

      int foo(int a, int b) {
        __label__ l1, l2, l3, l4;
        void *jt1[] = {[0]=&&l1, [1]=&&l2};
        void *jt2[] = {[0]=&&l3, [1]=&&l4};
        int ret = 0;
    
        goto *jt1[a % 2];
        l1: ret += 1;
        l2: ret += 3;
        goto *jt2[b % 2];
        l3: ret += 5;
        l4: ret += 7;
        return ret;
      }

The final binary:

      12:       bf 23 20 00 00 00 00 00 r3 = (s32)r2
      13:       67 03 00 00 03 00 00 00 r3 <<= 0x3
      14:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
                0000000000000070:  R_BPF_64_64  BPF.JT.0.0
      16:       0f 32 00 00 00 00 00 00 r2 += r3
      17:       bf 11 20 00 00 00 00 00 r1 = (s32)r1
      18:       67 01 00 00 03 00 00 00 r1 <<= 0x3
      19:       18 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r3 = 0x0 ll
                0000000000000098:  R_BPF_64_64  BPF.JT.0.1
      21:       0f 13 00 00 00 00 00 00 r3 += r1
    
  [ 4] .jumptables       PROGBITS        0000000000000000 000160 000020 00      0   0  1

     4: 0000000000000000    16 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.0
     5: 0000000000000010    16 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.1

A more complicated test with both a switch-statement-triggered jump table and computed gotos:

$ cat test3.c
struct simple_ctx {
        int x;
        int y;
        int z;
};

int ret_user, ret_user2;
void bar(void);
int foo(struct simple_ctx *ctx, struct simple_ctx *ctx2, int a, int b)
{
        __label__ l1, l2, l3, l4;
        void *jt1[] = {[0]=&&l1, [1]=&&l2};
        void *jt2[] = {[0]=&&l3, [1]=&&l4};
        int ret = 0;

        goto *jt1[a % 2];
        l1: ret += 1;
        l2: ret += 3;
        goto *jt2[b % 2];
        l3: ret += 5;
        l4: ret += 7;

        bar();

        switch (ctx->x) {
        case 1: ret_user = 18; break;
        case 20: ret_user = 6; break;
        case 16: ret_user = 9; break;
        case 6: ret_user = 16; break;
        case 8: ret_user = 14; break;
        case 30: ret_user = 2; break;
        default: ret_user = 1; break;
        }

        return ret;
}

Compile with

  clang --target=bpf -mcpu=v4 -O2 -S test3.c
  clang --target=bpf -mcpu=v4 -O2 -c test3.c

The binary:

     /* For computed goto */
      13:       bf 42 20 00 00 00 00 00 r2 = (s32)r4                                                         
      14:       67 02 00 00 03 00 00 00 r2 <<= 0x3                                                           
      15:       18 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x0 ll                                  
                0000000000000078:  R_BPF_64_64  BPF.JT.0.1                                                   
      17:       0f 21 00 00 00 00 00 00 r1 += r2                                                             
      18:       bf 32 20 00 00 00 00 00 r2 = (s32)r3                                                         
      19:       67 02 00 00 03 00 00 00 r2 <<= 0x3                                                           
      20:       18 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r3 = 0x0 ll                                  
                00000000000000a0:  R_BPF_64_64  BPF.JT.0.2                                                   
      22:       0f 23 00 00 00 00 00 00 r3 += r2

      /* For switch statement */
      39:       67 01 00 00 03 00 00 00 r1 <<= 0x3
      40:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
                0000000000000140:  R_BPF_64_64  BPF.JT.0.0
      42:       0f 12 00 00 00 00 00 00 r2 += r1

You can see jump table symbols are all different.

Copy link

github-actions bot commented Jul 20, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@aspsk
Copy link

aspsk commented Jul 21, 2025

Thanks! (Building this && rebasing my branch.)

@aspsk
Copy link

aspsk commented Jul 21, 2025

All looks to be compiling properly, I should be able to make this work altogether.

I have two questions & nits below, will split them in two comments.

Question 1: For switches and for computed goto relocations look a bit different. Switch:

       4:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
                0000000000000020:  R_BPF_64_64  BPF.JT.0.0

computed gotos:

      55:       18 02 00 00 28 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x28 ll
                00000000000001b8:  R_BPF_64_64  .jumptables
...
      63:       18 01 00 00 38 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x38 ll
                00000000000001f8:  R_BPF_64_64  .jumptables

The latter two point to properly defined symbols:

     3: 0000000000000028    16 OBJECT  LOCAL  DEFAULT     5 __const.simple_test.jt1
     4: 0000000000000038    16 OBJECT  LOCAL  DEFAULT     5 __const.simple_test.jt2

but this is another step to find those? Can relocation point to the symbol directly as with the switch?

@aspsk
Copy link

aspsk commented Jul 21, 2025

Currently, jump tables contain 8 bytes per entry, is this intentional? (the offsets will never be greater than 4 bytes):

Hex dump of section '.jumptables':
0x00000000 48000000 00000000 c8000000 00000000 H...............
0x00000010 88000000 00000000 a8000000 00000000 ................
etc.

One related bug: the size of BPF.JT.0.0 is computed as if it points to 4-byte entries, while the __const.simple_test.jt* sizes are computed as if they point to 8-byte entries:

Symbol table '.symtab' contains 20 entries:
   Num:    Value          Size Type    Bind   Vis       Ndx Name
     0: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT   UND
     1: 0000000000000000     0 FILE    LOCAL  DEFAULT   ABS aspsk_play.c
     2: 0000000000000000     0 SECTION LOCAL  DEFAULT     3 syscall
     3: 0000000000000028    16 OBJECT  LOCAL  DEFAULT     5 __const.simple_test.jt1
     4: 0000000000000038    16 OBJECT  LOCAL  DEFAULT     5 __const.simple_test.jt2
...
    16: 0000000000000000    20 NOTYPE  GLOBAL DEFAULT     5 BPF.JT.0.0

^ here the first two have size 2 (as in your example above), the BPF.JT.0.0 actually has size 5 (a switch from my test). However, symbol size of it is 20 = 5*4, while for __const.simple_test.jt1 it is 16 = 2*8

aspsk added a commit to aspsk/bpf-next that referenced this pull request Jul 21, 2025
Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
@aspsk
Copy link

aspsk commented Jul 21, 2025

Ok, my bpf_goto_x tests from RFC pass now with this branch (my kernel branch is here). I will continue with adding support/tests for computed goto and addressing the comments from RFC (main ones left is on register liveness and removing min,max->index). [Just in case, I am on PTO in mountains this and the next week, and may be 100% offline on some days.]

@yonghong-song
Copy link
Contributor Author

All looks to be compiling properly, I should be able to make this work altogether.

I have two questions & nits below, will split them in two comments.

Question 1: For switches and for computed goto relocations look a bit different. Switch:

       4:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
                0000000000000020:  R_BPF_64_64  BPF.JT.0.0

computed gotos:

      55:       18 02 00 00 28 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x28 ll
                00000000000001b8:  R_BPF_64_64  .jumptables
...
      63:       18 01 00 00 38 00 00 00 00 00 00 00 00 00 00 00 r1 = 0x38 ll
                00000000000001f8:  R_BPF_64_64  .jumptables

The latter two point to properly defined symbols:

     3: 0000000000000028    16 OBJECT  LOCAL  DEFAULT     5 __const.simple_test.jt1
     4: 0000000000000038    16 OBJECT  LOCAL  DEFAULT     5 __const.simple_test.jt2

but this is another step to find those? Can relocation point to the symbol directly as with the switch?

This part is generated by the compiler directly and the bpf backend is not involved. I need to do some investigation to find out why and whether we could make a change to relocate to the symbol directly or not.

@yonghong-song
Copy link
Contributor Author

Currently, jump tables contain 8 bytes per entry, is this intentional? (the offsets will never be greater than 4 bytes):

Hex dump of section '.jumptables':
0x00000000 48000000 00000000 c8000000 00000000 H...............
0x00000010 88000000 00000000 a8000000 00000000 ................
etc.

One related bug: the size of BPF.JT.0.0 is computed as if it points to 4-byte entries, while the __const.simple_test.jt* sizes are computed as if they point to 8-byte entries:

Symbol table '.symtab' contains 20 entries:
   Num:    Value          Size Type    Bind   Vis       Ndx Name
     0: 0000000000000000     0 NOTYPE  LOCAL  DEFAULT   UND
     1: 0000000000000000     0 FILE    LOCAL  DEFAULT   ABS aspsk_play.c
     2: 0000000000000000     0 SECTION LOCAL  DEFAULT     3 syscall
     3: 0000000000000028    16 OBJECT  LOCAL  DEFAULT     5 __const.simple_test.jt1
     4: 0000000000000038    16 OBJECT  LOCAL  DEFAULT     5 __const.simple_test.jt2
...
    16: 0000000000000000    20 NOTYPE  GLOBAL DEFAULT     5 BPF.JT.0.0

^ here the first two have size 2 (as in your example above), the BPF.JT.0.0 actually has size 5 (a switch from my test). However, symbol size of it is 20 = 5*4, while for __const.simple_test.jt1 it is 16 = 2*8

Thanks. I will fix the bug. Regarding why we have 8 byte jump table entry. I guess this is probably due to the address is calculated from the start of the section.

@yonghong-song
Copy link
Contributor Author

Ok, my bpf_goto_x tests from RFC pass now with this branch (my kernel branch is here). I will continue with adding support/tests for computed goto and addressing the comments from RFC (main ones left is on register liveness and removing min,max->index). [Just in case, I am on PTO in mountains this and the next week, and may be 100% offline on some days.]

Great. I will try to address the bug you found and the relocation difference between switch-statement vs. computed goto ASAP.

@yonghong-song
Copy link
Contributor Author

Just pushed a fix (BPF.JT.0.0/1 size) discovered by @aspsk in the above.

@eddyz87
Copy link
Contributor

eddyz87 commented Jul 21, 2025

Currently JT offsets are calculated in bytes, but I think it still would be simpler for libbpf/kernel if offsets would be calculated in instructions. Also, there would be no need to track offsets as 8 bytes, 4 bytes would suffice. The following part was responsible for this in old pr:

void BPFAsmPrinter::emitJumpTableInfo()
    ...
    SmallPtrSet<const MachineBasicBlock *, 16> EmittedSets;
     auto *Base: const MCSymbolRefExpr * = MCSymbolRefExpr::create(Symbol: getJXAnchorSymbol(JTI), &Ctx: OutContext);
     for (const MachineBasicBlock *MBB : JTBBs) {
       if (!EmittedSets.insert(Ptr: MBB).second)
         continue;

       // Offset from gotox to target basic block expressed in number
       // of instructions, e.g.:
       //
       //   .L0_0_set_4 = ((LBB0_4 - .LBPF.JX.0.0) >> 3) - 1
       const MCExpr *LHS = MCSymbolRefExpr::create(Symbol: MBB->getSymbol(), &Ctx: OutContext);
       OutStreamer->emitAssignment(
           Symbol: GetJTSetSymbol(UID: JTI, MBBID: MBB->getNumber()),
           Value: MCBinaryExpr::createSub(
               LHS: MCBinaryExpr::createAShr(
                   LHS: MCBinaryExpr::createSub(LHS, RHS: Base, &Ctx: OutContext),
                   RHS: MCConstantExpr::create(Value: 3, &Ctx: OutContext), &Ctx: OutContext),
               RHS: MCConstantExpr::create(Value: 1, &Ctx: OutContext), &Ctx: OutContext));
     }
     // BPF.JT.0.0:
     //    .long   .L0_0_set_4
     //    .long   .L0_0_set_2
     //    ...
     //    .size   BPF.JT.0.0, 128
     MCSymbol *JTStart = getJTPublicSymbol(JTI);
     OutStreamer->emitLabel(Symbol: JTStart);
     for (const MachineBasicBlock *MBB : JTBBs) {
       MCSymbol *SetSymbol = GetJTSetSymbol(UID: JTI, MBBID: MBB->getNumber());
       const MCExpr *V = MCSymbolRefExpr::create(Symbol: SetSymbol, &Ctx: OutContext);
       OutStreamer->emitValue(Value: V, Size: EntrySize);
     }
     const MCExpr *JTSize = MCConstantExpr::create(Value: JTBBs.size() * 4, &Ctx: OutContext);
     OutStreamer->emitELFSize(Symbol: JTStart, Value: JTSize);
   }
   ...

The expression for _set_ labels would probably be simpler in this pr.

@eddyz87
Copy link
Contributor

eddyz87 commented Jul 21, 2025

The following example is not handled:

int bar(int a) {
  __label__ l1, l2;
  void * volatile tgt;
  int ret = 0;
  if (a)
    tgt = &&l1;
  else
    tgt = &&l2;
  goto *tgt;
 l1: ret += 1;
 l2: ret += 2;
 return ret;
}

Currently the following code is produced:

$ (use-my-llvm ; clang -O2 -S -o - --target=bpf jt-vars.c )
        .file   "jt-vars.c"
        .text
        .globl  bar                             # -- Begin function bar
        .p2align        3
        .type   bar,@function
bar:                                    # @bar
# %bb.0:                                # %entry
        r2 = .Ltmp0 ll
        if w1 == 0 goto LBB0_2
# %bb.1:                                # %entry
        r2 = .Ltmp1 ll
LBB0_2:                                 # %entry
        *(u64 *)(r10 - 8) = r2
        r1 = *(u64 *)(r10 - 8)
        gotox r1
.Ltmp1:                                 # Block address taken
LBB0_3:                                 # %l1
        w0 = 3
        goto LBB0_5
.Ltmp0:                                 # Block address taken
LBB0_4:                                 # %l2
        w0 = 2
LBB0_5:                                 # %.split
        exit
.Lfunc_end0:
        .size   bar, .Lfunc_end0-bar
                                        # -- End function
        .addrsig

As discussed previously, r2 = .Ltmp0 ll should be converted to an access to a single element jump table. Otherwise kernel won't be able to adjust .Ltmp0 properly in verifier.c:do_misc_fixups.

@yonghong-song
Copy link
Contributor Author

Currently JT offsets are calculated in bytes, but I think it still would be simpler for libbpf/kernel if offsets would be calculated in instructions. Also, there would be no need to track offsets as 8 bytes, 4 bytes would suffice. The following part was responsible for this in old pr:

void BPFAsmPrinter::emitJumpTableInfo()
    ...
    SmallPtrSet<const MachineBasicBlock *, 16> EmittedSets;
     auto *Base: const MCSymbolRefExpr * = MCSymbolRefExpr::create(Symbol: getJXAnchorSymbol(JTI), &Ctx: OutContext);
     for (const MachineBasicBlock *MBB : JTBBs) {
       if (!EmittedSets.insert(Ptr: MBB).second)
         continue;

       // Offset from gotox to target basic block expressed in number
       // of instructions, e.g.:
       //
       //   .L0_0_set_4 = ((LBB0_4 - .LBPF.JX.0.0) >> 3) - 1
       const MCExpr *LHS = MCSymbolRefExpr::create(Symbol: MBB->getSymbol(), &Ctx: OutContext);
       OutStreamer->emitAssignment(
           Symbol: GetJTSetSymbol(UID: JTI, MBBID: MBB->getNumber()),
           Value: MCBinaryExpr::createSub(
               LHS: MCBinaryExpr::createAShr(
                   LHS: MCBinaryExpr::createSub(LHS, RHS: Base, &Ctx: OutContext),
                   RHS: MCConstantExpr::create(Value: 3, &Ctx: OutContext), &Ctx: OutContext),
               RHS: MCConstantExpr::create(Value: 1, &Ctx: OutContext), &Ctx: OutContext));
     }
     // BPF.JT.0.0:
     //    .long   .L0_0_set_4
     //    .long   .L0_0_set_2
     //    ...
     //    .size   BPF.JT.0.0, 128
     MCSymbol *JTStart = getJTPublicSymbol(JTI);
     OutStreamer->emitLabel(Symbol: JTStart);
     for (const MachineBasicBlock *MBB : JTBBs) {
       MCSymbol *SetSymbol = GetJTSetSymbol(UID: JTI, MBBID: MBB->getNumber());
       const MCExpr *V = MCSymbolRefExpr::create(Symbol: SetSymbol, &Ctx: OutContext);
       OutStreamer->emitValue(Value: V, Size: EntrySize);
     }
     const MCExpr *JTSize = MCConstantExpr::create(Value: JTBBs.size() * 4, &Ctx: OutContext);
     OutStreamer->emitELFSize(Symbol: JTStart, Value: JTSize);
   }
   ...

The expression for _set_ labels would probably be simpler in this pr.

For computed goto, for 'const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();' the MJTI will be nullptr. I didn't use the above in order to be consistent with the computed-goto jump table.

@yonghong-song
Copy link
Contributor Author

The following example is not handled:

int bar(int a) {
  __label__ l1, l2;
  void * volatile tgt;
  int ret = 0;
  if (a)
    tgt = &&l1;
  else
    tgt = &&l2;
  goto *tgt;
 l1: ret += 1;
 l2: ret += 2;
 return ret;
}

Currently the following code is produced:

$ (use-my-llvm ; clang -O2 -S -o - --target=bpf jt-vars.c )
        .file   "jt-vars.c"
        .text
        .globl  bar                             # -- Begin function bar
        .p2align        3
        .type   bar,@function
bar:                                    # @bar
# %bb.0:                                # %entry
        r2 = .Ltmp0 ll
        if w1 == 0 goto LBB0_2
# %bb.1:                                # %entry
        r2 = .Ltmp1 ll
LBB0_2:                                 # %entry
        *(u64 *)(r10 - 8) = r2
        r1 = *(u64 *)(r10 - 8)
        gotox r1
.Ltmp1:                                 # Block address taken
LBB0_3:                                 # %l1
        w0 = 3
        goto LBB0_5
.Ltmp0:                                 # Block address taken
LBB0_4:                                 # %l2
        w0 = 2
LBB0_5:                                 # %.split
        exit
.Lfunc_end0:
        .size   bar, .Lfunc_end0-bar
                                        # -- End function
        .addrsig

As discussed previously, r2 = .Ltmp0 ll should be converted to an access to a single element jump table. Otherwise kernel won't be able to adjust .Ltmp0 properly in verifier.c:do_misc_fixups.

Yes, make sense. Missed this one.

@yonghong-song
Copy link
Contributor Author

One more thing to check: all labels, e.g. BPF.JT.0.0, are global. What if two bpf progs both have a BPF.JT.0.0 and they need to be linked together? Can libbpf handle this properly? We need to double check with libbpf for this.

@yonghong-song
Copy link
Contributor Author

For the new version, computed goto has the same relocation mechanism based on symbols.

@eddyz87
Copy link
Contributor

eddyz87 commented Jul 21, 2025

One more thing to check: all labels, e.g. BPF.JT.0.0, are global. What if two bpf progs both have a BPF.JT.0.0 and they need to be linked together? Can libbpf handle this properly? We need to double check with libbpf for this.

Good point, I think libbpf linker should be modified to take care of this.

@eddyz87
Copy link
Contributor

eddyz87 commented Jul 21, 2025

For computed goto, for 'const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();' the MJTI will be nullptr. I didn't use the above in order to be consistent with the computed-goto jump table.

Well, arrays of labels declared in ".jumptables" sections can be lowered as jump tables, this way MF->getJumpTableInfo() won't be empty. A slight complication on llvm side is worth it if there would be simplification on kernel/libbpf side, imo.

@yonghong-song
Copy link
Contributor Author

For computed goto, for 'const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();' the MJTI will be nullptr. I didn't use the above in order to be consistent with the computed-goto jump table.

Well, arrays of labels declared in ".jumptables" sections can be lowered as jump tables, this way MF->getJumpTableInfo() won't be empty. A slight complication on llvm side is worth it if there would be simplification on kernel/libbpf side, imo.

Let me take a look.

@aspsk
Copy link

aspsk commented Jul 22, 2025

Ok. With this current patch (and my dev kernel branch) smth like this

__label__ l1, l2, l3, l4;
void *jt1[] = {[0]=&&l1, [1]=&&l2};
void *jt2[] = {[0]=&&l3, [1]=&&l4};

...

if (ctx->x % 2)
        goto *jt1[a];
else
        goto *jt2[b];

l1: ret += 1;
l2: ret += 3;
l3: ret += 5;
l4: ret += 7;

compiles into two map loads and one gotox:

# bpftool prog dump x id 42
...
; if (ctx->x % 2)
  10: (67) r3 <<= 3
  11: (18) r4 = 0xffff8881beef5f90
  13: (0f) r4 += r3
  14: (57) r2 &= 1
  15: (15) if r2 == 0x0 goto pc+4
  16: (67) r1 <<= 3
  17: (18) r4 = 0xffff888beefeef10
  19: (0f) r4 += r1
  20: (79) r1 = *(u64 *)(r4 +0)
  21: (0d) gotox r1
...

(and it verifies and run properly).

So far all looks good for me to start working on cleaning things up & more examples of computed gotos.

@yonghong-song yonghong-song force-pushed the jumptable branch 2 times, most recently from 92d1b62 to a26a805 Compare July 22, 2025 16:01
@yonghong-song
Copy link
Contributor Author

But the question remains, @aspsk , do we need relocations in the .jumptables section for libbpf linker?

So far, I don't see that we do need them @eddyz87

Regardless whether we will need it or not, thanks @eddyz87 for the work. Previously without any implementation I looked at asm code and find for each file we have:
file "test4.c"
.text
.globl foo1
... <code, jump table sections, etc.>
.text
.globl foo2
... <code, jump tables sections, etc.>
I thought that inside each function, we might get the address for '.text' in the beginning of that function.
Looks like this is not correct. We can certainly get an address at the beginning of merged section.
I will look at Eduard's patch tomorrow.

@@ -750,6 +753,54 @@ bool BPFMIPreEmitPeephole::addExitAfterUnreachable() {
return true;
}

bool BPFMIPreEmitPeephole::convertBAToConstantArray() {
Copy link
Contributor

@eddyz87 eddyz87 Jul 25, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: below works for me and avoids introducing temporary rodata variable.
I wonder, if the same logic can be moved to BPFTargetLowering::LowerBlockAddress?

--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/Support/Debug.h"
@@ -754,9 +755,15 @@ bool BPFMIPreEmitPeephole::addExitAfterUnreachable() {
 }
 
 bool BPFMIPreEmitPeephole::convertBAToConstantArray() {
+  DenseMap<const BasicBlock *, MachineBasicBlock *> AddressTakenBBs;
   bool Changed = false;
 
-  unsigned gv_idx = 0;
+  for (MachineBasicBlock &MBB : *MF) {
+    if (!MBB.getBasicBlock()->hasAddressTaken())
+      continue;
+    AddressTakenBBs[MBB.getBasicBlock()] = &MBB;
+  }
+
   for (MachineBasicBlock &MBB : *MF) {
     for (MachineInstr &MI : make_early_inc_range(MBB)) {
       if (MI.getOpcode() != BPF::LD_imm64)
@@ -772,22 +779,18 @@ bool BPFMIPreEmitPeephole::convertBAToConstantArray() {
      // BlockAddress, and then generate a global variable based on this array.
       // This will allow generating a jump table later.
       const BlockAddress *OldBA = MO.getBlockAddress();
-      BlockAddress *NewBA =
-          BlockAddress::get(OldBA->getFunction(), OldBA->getBasicBlock());
-      ArrayType *ArrTy = ArrayType::get(NewBA->getType(), 1);
-      SmallVector<Constant *, 4> Elems;
-      Elems.push_back((Constant *)NewBA);
-      Constant *Init = ConstantArray::get(ArrTy, Elems);
-      GlobalVariable *GV = new GlobalVariable(
-          *MF->getFunction().getParent(), ArrTy, true,
-          GlobalValue::PrivateLinkage, Init, "ba2gv_" + std::to_string(gv_idx));
-      gv_idx++;
+      MachineBasicBlock *TgtMBB = AddressTakenBBs[OldBA->getBasicBlock()];
+      std::vector<MachineBasicBlock *> Targets;
+      Targets.push_back(TgtMBB);
+      unsigned JTI =
+        MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_LabelDifference64)
+          ->createJumpTableIndex(Targets);
 
       // From 'reg = LD_imm64 <blockaddress>' to 'reg = LD_imm64 ba2gv_<idx>;
       // reg = *(u64 *)(reg + 0)'.
       BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::LD_imm64))
           .addReg(ResultReg)
-          .addGlobalAddress(GV);
+          .addJumpTableIndex(JTI);
       BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::LDD))
           .addReg(ResultReg)
           .addReg(ResultReg)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. This does remove the 'global variable' which will not be used later.
I just updated one version with this change.

BTW, let me check whether we can move the logic to lowering stage.

@yonghong-song yonghong-song force-pushed the jumptable branch 4 times, most recently from 22f9b85 to c2022dd Compare July 27, 2025 16:19
@yonghong-song
Copy link
Contributor Author

Ok. I updated a new version to handle blockaddress and global variables in bpf lowering phase. Basically all jump tables are created at lowering phase.

@aspsk
Copy link

aspsk commented Aug 13, 2025

So, it takes a bit longer than I thought to address all the comments & properly implement the new approach (where the map is not hardcoded in gotox, but all jump tables within a subprogram are collected). But it works now, and all tests pass (including the "extended" set -- normal selftests which contain gotox, compiled using this compiler version).

I still have a few comments from RFC not addressed (mainly, to get rid of min_index/max_index and properly use regsafe instead of hacking), and a non-zero amount of squashing / cleaning up, but this is now all converging, and I am planning to send v1 "soon".

@aspsk
Copy link

aspsk commented Aug 16, 2025

planning to send v1 "soon"

All looks ok now from my side, now fixing warnings triggered by CI, then sending out v1

@aspsk
Copy link

aspsk commented Aug 16, 2025

@yonghong-song
Copy link
Contributor Author

Thanks @aspsk for the update! I will try to remove unneeded globals and add some tests from llvm side to make the pull request complete.

aspsk added a commit to aspsk/bpf-next that referenced this pull request Aug 19, 2025
Signed-off-by: Anton Protopopov <a.s.protopopov@gmail.com>
ret i32 %ret.0
}

; CHECK: w1 &= 1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use CHECK-NEXT: whenever feasible so that FileCheck diagnostics will be better when an instruction is changed.
Consider using update_llc_test_checks.py

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use CHECK-NEXT: whenever feasible so that FileCheck diagnostics will be better when an instruction is changed. Consider using update_llc_test_checks.py

Thanks for the suggestion! I also want to dump '.jumptables' section and that is why I didn't use update_llc_test_checks.py.

Yonghong Song added 5 commits August 20, 2025 10:16
NOTE 1: We probably need cpu v5 or other flags to enable this feature.
We can add it later when necessary. Let us use cpu v4 for now.
NOTE 2: An option -bpf-min-jump-table-entries is implemented to control the minimum
number of entries to use a jump table on BPF. The default value 5 and this is
to make it easy to test. Eventually we will increase min jump table entries to be 13.

This patch adds jump table support. A new insn 'gotox <reg>' is
added to allow goto through a register. The register represents
the address in the current section.

Example 1 (switch statement):
=============================

Code:
struct simple_ctx {
        int x;
        int y;
        int z;
};

int ret_user, ret_user2;
void bar(void);
int foo(struct simple_ctx *ctx, struct simple_ctx *ctx2)
{
        switch (ctx->x) {
        case 1: ret_user = 18; break;
        case 20: ret_user = 6; break;
        case 16: ret_user = 9; break;
        case 6: ret_user = 16; break;
        case 8: ret_user = 14; break;
        case 30: ret_user = 2; break;
        default: ret_user = 1; break;
        }

        bar();

        switch (ctx2->x) {
        case 0: ret_user2 = 8; break;
        case 31: ret_user2 = 5; break;
        case 13: ret_user2 = 8; break;
        case 1: ret_user2 = 3; break;
        case 11: ret_user2 = 4; break;
        default: ret_user2 = 29; break;
        }

        return 0;
}

Run: clang --target=bpf -mcpu=v4 -O2 -S test.c
The assembly code:
        ...
    # %bb.1:                                # %entry
        r1 <<= 3
        r2 = .LJTI0_0 ll
        r2 += r1
        r1 = *(u64 *)(r2 + 0)
        gotox r1
    LBB0_2:
        w1 = 18
        goto LBB0_9
        ...
    # %bb.10:                               # %sw.epilog
        r1 <<= 3
        r2 = .LJTI0_1 ll
        r2 += r1
        r1 = *(u64 *)(r2 + 0)
        gotox r1
    LBB0_11:
        w1 = 8
        goto LBB0_16
        ...
        .section        .rodata,"a",@progbits
        .p2align        3, 0x0
    .LJTI0_0:
        .quad   LBB0_2
        .quad   LBB0_8
        ...
        .quad   LBB0_7
    .LJTI0_1:
        .quad   LBB0_11
        .quad   LBB0_13
         ...

Although we do have labels .LJTI0_0 and .LJTI0_1, since they have the
prefix '.L' they won't appear in the .o file like other symbols.

Run: llvm-objdump -Sr test.o
       ...
       4:       67 01 00 00 03 00 00 00 r1 <<= 0x3
       5:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
                0000000000000028:  R_BPF_64_64  .rodata
       7:       0f 12 00 00 00 00 00 00 r2 += r1
       ...
      29:       67 01 00 00 03 00 00 00 r1 <<= 0x3
      30:       18 02 00 00 f0 00 00 00 00 00 00 00 00 00 00 00 r2 = 0xf0 ll
                00000000000000f0:  R_BPF_64_64  .rodata
      32:       0f 12 00 00 00 00 00 00 r2 += r1

The size of the jump table is not obvious. libbpf needs to check all relocations
against the .rodata section in order to get the precise size needed to construct
bpf maps.

Example 2 (Simple computed goto):
=================================

Code:
     int bar(int a) {
        __label__ l1, l2;
        void * volatile tgt;
        int ret = 0;
        if (a)
          tgt = &&l1; // synthetic jump table generated here
        else
          tgt = &&l2; // another synthetic jump table
        goto *tgt;
    l1: ret += 1;
    l2: ret += 2;
        return ret;
      }

Compile: clang --target=bpf -mcpu=v4 -O2 -c test1.c
Objdump: llvm-objdump -Sr test1.o

       0:       18 02 00 00 50 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x50 ll
                0000000000000000:  R_BPF_64_64  .text
       2:       16 01 02 00 00 00 00 00 if w1 == 0x0 goto +0x2 <bar+0x28>
       3:       18 02 00 00 40 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x40 ll
                0000000000000018:  R_BPF_64_64  .text
       5:       7b 2a f8 ff 00 00 00 00 *(u64 *)(r10 - 0x8) = r2
       6:       79 a1 f8 ff 00 00 00 00 r1 = *(u64 *)(r10 - 0x8)
       7:       0d 01 00 00 00 00 00 00 gotox r1
       8:       b4 00 00 00 03 00 00 00 w0 = 0x3
       9:       05 00 01 00 00 00 00 00 goto +0x1 <bar+0x58>
      10:       b4 00 00 00 02 00 00 00 w0 = 0x2
      11:       95 00 00 00 00 00 00 00 exit

For this case, there is no jump table, so it would be hard to track the offset
during verification, especially when the offset needs adjustment. So practically
we need to create two jump tables for '&&l1' and '&&l2' respectively.

Example 3 (More complicated computed goto):
===========================================

Code:
  int foo(int a, int b) {
    __label__ l1, l2, l3, l4;
    void *jt1[] = {[0]=&&l1, [1]=&&l2};
    void *jt2[] = {[0]=&&l3, [1]=&&l4};
    int ret = 0;

    goto *jt1[a % 2];
    l1: ret += 1;
    l2: ret += 3;
    goto *jt2[b % 2];
    l3: ret += 5;
    l4: ret += 7;
    return ret;
  }

Compile: clang --target=bpf -mcpu=v4 -O2 -S test2.c
Asm code:
        ...
        r3 = (s32)r2
        r3 <<= 3
        r2 = .L__const.foo.jt2 ll
        r2 += r3
        r1 = (s32)r1
        r1 <<= 3
        r3 = .L__const.foo.jt1 ll
        r3 += r1
        w0 = 0
        r1 = *(u64 *)(r3 + 0)
        gotox r1
    .Ltmp0:                                 # Block address taken
    LBB0_1:                                 # %l1
                                        # =>This Inner Loop Header: Depth=1
        w0 += 1
        w0 += 3
        r1 = *(u64 *)(r2 + 0)
        gotox r1
    .Ltmp1:                                 # Block address taken
    LBB0_2:                                 # %l2
        ...
        .type   .L__const.foo.jt1,@object       # @__const.foo.jt1
        .section        .rodata,"a",@progbits
        .p2align        3, 0x0
    .L__const.foo.jt1:
        .quad   .Ltmp0
        .quad   .Ltmp1
        .size   .L__const.foo.jt1, 16

        .type   .L__const.foo.jt2,@object       # @__const.foo.jt2
        .p2align        3, 0x0
    .L__const.foo.jt2:
        .quad   .Ltmp2
        .quad   .Ltmp3
        .size   .L__const.foo.jt2, 16

Similar to switch statement case, for the binary, the symbols
.L__const.foo.jt* will not show up in the symbol table and jump table
will be in .rodata section.

We need to resolve Example 2 case.

Also with more libbpf work (dealing with .rodata sections etc.),
everything should work fine for Examples 1 and 3. But we could do
better by
  - Replacing symbols like .L<...> with symbols appearing in
    symbol table.
  - Add jump tables to .jumptables section instead of .rodata section.
This should make things easier for libbpf. User can also benefit
from this as relocation/section will be easy to check.

Next two patches will fix Example 2 and improve all of them as
mentioned in the above.
Example 2, Asm code:
        ...
    # %bb.0:                                # %entry
        r2 = .LJTI0_0 ll
        r2 = *(u64 *)(r2 + 0)
        r3 = .LJTI0_1 ll
        r3 = *(u64 *)(r3 + 0)
        if w1 == 0 goto LBB0_2
    # %bb.1:                                # %entry
        r3 = r2
    LBB0_2:                                 # %entry
        *(u64 *)(r10 - 8) = r3
        r1 = *(u64 *)(r10 - 8)
        gotox r1
    .Ltmp0:                                 # Block address taken
    LBB0_3:                                 # %l1
        w0 = 3
        goto LBB0_5
    .Ltmp1:                                 # Block address taken
    LBB0_4:                                 # %l2
        w0 = 2
    LBB0_5:                                 # %.split
        exit
        ...
        .section        .rodata,"a",@progbits
        .p2align        3, 0x0
    .LJTI0_0:
        .quad   LBB0_3
    .LJTI0_1:
        .quad   LBB0_4

Example 3, Asm Code:

        r3 = (s32)r2
        r3 <<= 3
        r2 = .LJTI0_0 ll
        r2 += r3
        r1 = (s32)r1
        r1 <<= 3
        r3 = .LJTI0_1 ll
        r3 += r1
        w0 = 0
        r1 = *(u64 *)(r3 + 0)
        gotox r1
    .Ltmp0:                                 # Block address taken
    LBB0_1:                                 # %l1
                                            # =>This Inner Loop Header: Depth=1
        w0 += 1
        w0 += 3
        r1 = *(u64 *)(r2 + 0)
        gotox r1
    .Ltmp1:                                 # Block address taken
    LBB0_2:                                 # %l2
                                            # =>This Inner Loop Header: Depth=1
        w0 += 3
        r1 = *(u64 *)(r2 + 0)
        gotox r1
    .Ltmp2:                                 # Block address taken
    LBB0_3:                                 # %l3
        w0 += 5
        goto LBB0_5
    .Ltmp3:                                 # Block address taken
    LBB0_4:                                 # %l4
    LBB0_5:                                 # %.split17
        w0 += 7
        exit
        ...
        .section        .rodata,"a",@progbits
        .p2align        3, 0x0
    .LJTI0_0:
        .quad   LBB0_3
        .quad   LBB0_4
    .LJTI0_1:
        .quad   LBB0_1
        .quad   LBB0_2
                                        # -- End function
        .type   .L__const.foo.jt1,@object       # @__const.foo.jt1
        .p2align        3, 0x0
    .L__const.foo.jt1:
        .quad   .Ltmp0
        .quad   .Ltmp1
        .size   .L__const.foo.jt1, 16

        .type   .L__const.foo.jt2,@object       # @__const.foo.jt2
        .p2align        3, 0x0
    .L__const.foo.jt2:
        .quad   .Ltmp2
        .quad   .Ltmp3
        .size   .L__const.foo.jt2, 16

Note that for both above examples, the jump table section is '.rodata'
and labels have '.L' prefix which means labels won't show up in the
symbol table. As mentioned in previous patch, we want to
  - Move jump tables to '.jumptables' section
  - Rename '.L*' labels with proper labels which are visible in symbol table.

Note that for Example 3, there are extra global variables like
  .L__const.foo.jt1 and .L__const.foo.jt2
and we are not able to remove them. But they won't show up in the symbol
table either.
For jumptables from switch statements, generate 'llvm-readelf -s' visible
symbols and put jumptables into a dedicated section. Most work from
Eduard.

For the previous example 1,
Compile: clang --target=bpf -mcpu=v4 -O2 -S test.c
Asm code:
        ...
    # %bb.1:                                # %entry
        r1 <<= 3
        r2 = BPF.JT.0.0 ll
        r2 += r1
        r1 = *(u64 *)(r2 + 0)
        gotox r1
    LBB0_2:
        w1 = 18
        goto LBB0_9
        ...
    # %bb.10:                               # %sw.epilog
        r1 <<= 3
        r2 = BPF.JT.0.1 ll
        r2 += r1
        r1 = *(u64 *)(r2 + 0)
        gotox r1
    LBB0_11:
        w1 = 8
        goto LBB0_16
        ...
        .section        .jumptables,"",@progbits
    BPF.JT.0.0:
        .quad   LBB0_2
        .quad   LBB0_8
        ...
        .quad   LBB0_7
        .size   BPF.JT.0.0, 240
    BPF.JT.0.1:
        .quad   LBB0_11
        .quad   LBB0_13
        ...
        .quad   LBB0_12
        .size   BPF.JT.0.1, 256

And symbols BPF.JT.0.{0,1} will be in symbol table.
The final binary:
       4:       67 01 00 00 03 00 00 00 r1 <<= 0x3
       5:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
                0000000000000028:  R_BPF_64_64  BPF.JT.0.0
       7:       0f 12 00 00 00 00 00 00 r2 += r1
       ...
      29:       67 01 00 00 03 00 00 00 r1 <<= 0x3
      30:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
                00000000000000f0:  R_BPF_64_64  BPF.JT.0.1
      32:       0f 12 00 00 00 00 00 00 r2 += r1
       ...
Symbol table:
     4: 0000000000000000   240 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.0
     5: 0000000000000000     4 OBJECT  GLOBAL DEFAULT     6 ret_user
     6: 0000000000000000     0 NOTYPE  GLOBAL DEFAULT   UND bar
     7: 00000000000000f0   256 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.1
and
  [ 4] .jumptables       PROGBITS        0000000000000000 0001c8 0001f0 00      0   0  1

For the previous example 2,
    Compile: clang --target=bpf -mcpu=v4 -O2 -S test1.c
    Asm code:
        ...
    # %bb.0:                                # %entry
        r2 = BPF.JT.0.0 ll
        r2 = *(u64 *)(r2 + 0)
        r3 = BPF.JT.0.1 ll
        r3 = *(u64 *)(r3 + 0)
        if w1 == 0 goto LBB0_2
    # %bb.1:                                # %entry
        r3 = r2
    LBB0_2:                                 # %entry
        *(u64 *)(r10 - 8) = r3
        r1 = *(u64 *)(r10 - 8)
        gotox r1
        ...
        .section        .jumptables,"",@progbits
    BPF.JT.0.0:
        .quad   LBB0_3
        .size   BPF.JT.0.0, 8
    BPF.JT.0.1:
        .quad   LBB0_4
        .size   BPF.JT.0.1, 8

    The binary: clang --target=bpf -mcpu=v4 -O2 -c test1.c

       0:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
                0000000000000000:  R_BPF_64_64  BPF.JT.0.0
       2:       79 22 00 00 00 00 00 00 r2 = *(u64 *)(r2 + 0x0)
       3:       18 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r3 = 0x0 ll
                0000000000000018:  R_BPF_64_64  BPF.JT.0.1
       5:       79 33 00 00 00 00 00 00 r3 = *(u64 *)(r3 + 0x0)
       6:       16 01 01 00 00 00 00 00 if w1 == 0x0 goto +0x1 <bar+0x40>
       7:       bf 23 00 00 00 00 00 00 r3 = r2
       8:       7b 3a f8 ff 00 00 00 00 *(u64 *)(r10 - 0x8) = r3
       9:       79 a1 f8 ff 00 00 00 00 r1 = *(u64 *)(r10 - 0x8)
      10:       0d 01 00 00 00 00 00 00 gotox r1

       4: 0000000000000000     8 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.0
       5: 0000000000000008     8 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.1

      [ 4] .jumptables       PROGBITS        0000000000000000 0000b8 000010 00      0   0  1

For the previous example 3,
    Compile: clang --target=bpf -mcpu=v4 -O2 -S test.c
    Asm code:
        ...
        r3 = (s32)r2
        r3 <<= 3
        r2 = BPF.JT.0.0 ll
        r2 += r3
        r1 = (s32)r1
        r1 <<= 3
        r3 = BPF.JT.0.1 ll
        r3 += r1
        w0 = 0
        r1 = *(u64 *)(r3 + 0)
        gotox r1
    .Ltmp0:                                 # Block address taken
    LBB0_1:                                 # %l1
                                            # =>This Inner Loop Header: Depth=1
        w0 += 1                                            # =>This Inner Loop Header: Depth=1
        ...
        .section        .jumptables,"",@progbits
    BPF.JT.0.0:
        .quad   LBB0_3
        .quad   LBB0_4
        .size   BPF.JT.0.0, 16
    BPF.JT.0.1:
        .quad   LBB0_1
        .quad   LBB0_2
        .size   BPF.JT.0.1, 16

    The binary: clang --target=bpf -mcpu=v4 -O2 -c test2.c

          12:       bf 23 20 00 00 00 00 00 r3 = (s32)r2
          13:       67 03 00 00 03 00 00 00 r3 <<= 0x3
          14:       18 02 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r2 = 0x0 ll
                    0000000000000070:  R_BPF_64_64  BPF.JT.0.0
          16:       0f 32 00 00 00 00 00 00 r2 += r3
          17:       bf 11 20 00 00 00 00 00 r1 = (s32)r1
          18:       67 01 00 00 03 00 00 00 r1 <<= 0x3
          19:       18 03 00 00 00 00 00 00 00 00 00 00 00 00 00 00 r3 = 0x0 ll
                    0000000000000098:  R_BPF_64_64  BPF.JT.0.1
          21:       0f 13 00 00 00 00 00 00 r3 += r1

         4: 0000000000000000    16 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.0
         5: 0000000000000010    16 OBJECT  GLOBAL DEFAULT     4 BPF.JT.0.1

      [ 4] .jumptables       PROGBITS        0000000000000000 000160 000020 00      0   0  1
This is temporary and it makes it easy to run bpf selftests.
Once kernel side is ready, we will implement CPU V5 which
will support jump tables.
To adjust, add '-mllvm -bpf-min-jump-table-entries=<n>' to the
compilation flags.
@yonghong-song
Copy link
Contributor Author

yonghong-song commented Aug 20, 2025

@aspsk @eddyz87 Just update the patch with following:

  • Change minimum jump-table-entries (for switch statements) to be 13, so if you intend to test with fewer switch cases, you can add the option '-mllvm -bpf-min-jump-table-entries=<n>' (e.g., n = 3).
  • I added three tests to cover switch statement, blockaddress and globalvar.

I also rebased on top of the current main branch and fixed one issue during the update.

diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
index 269d3b9566cd..535af3a46d4a 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -138,7 +138,7 @@ MCSymbol *BPFAsmPrinter::getJTPublicSymbol(unsigned JTI) {
 raw_svector_ostream(Name)
     << "BPF.JT." << MF->getFunctionNumber() << '.' << JTI;
 MCSymbol *S = OutContext.getOrCreateSymbol(Name);
-  if (auto *ES = dyn_cast<MCSymbolELF>(S)) {
+  if (auto *ES = static_cast<MCSymbolELF *>(S)) {
   ES->setBinding(ELF::STB_GLOBAL);
   ES->setType(ELF::STT_OBJECT);
 }

@yonghong-song
Copy link
Contributor Author

I also tried to remove those global variables (with blockaddresses) when gotox is available at pretty late optimizations:

diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
index 6275d5b7721c..91768eb1ae3f 100644
--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include <set>
 
@@ -308,6 +309,8 @@ struct BPFMIPreEmitPeephole : public MachineFunctionPass {
   const TargetRegisterInfo *TRI;
   const BPFInstrInfo *TII;
   bool SupportGotol;
+  bool SupportGotox;
+  bool doneRemoveUnneededGV = false;
 
   BPFMIPreEmitPeephole() : MachineFunctionPass(ID) {}
 
@@ -321,6 +324,7 @@ private:
   bool insertMissingCallerSavedSpills();
   bool removeMayGotoZero();
   bool addExitAfterUnreachable();
+  bool removeUnneededGV();
 
 public:
 
@@ -338,6 +342,8 @@ public:
     Changed |= insertMissingCallerSavedSpills();
     Changed |= removeMayGotoZero();
     Changed |= addExitAfterUnreachable();
+    if (SupportGotox && !doneRemoveUnneededGV)
+      Changed |= removeUnneededGV();
     return Changed;
   }
 };
@@ -348,6 +354,7 @@ void BPFMIPreEmitPeephole::initialize(MachineFunction &MFParm) {
   TII = MF->getSubtarget<BPFSubtarget>().getInstrInfo();
   TRI = MF->getSubtarget<BPFSubtarget>().getRegisterInfo();
   SupportGotol = MF->getSubtarget<BPFSubtarget>().hasGotol();
+  SupportGotox = MF->getSubtarget<BPFSubtarget>().hasGotox();
   LLVM_DEBUG(dbgs() << "*** BPF PreEmit peephole pass ***\n\n");
 }
 
@@ -750,6 +757,40 @@ bool BPFMIPreEmitPeephole::addExitAfterUnreachable() {
   return true;
 }

+// Remove global variables which has blockaddress since those global variables
+// have been converted to proper jump tables.
+bool BPFMIPreEmitPeephole::removeUnneededGV() {
+  bool Changed = false;
+  Module *M = MF->getFunction().getParent();
+  std::vector<GlobalVariable *> Targets;
+  for (GlobalVariable &Global : M->globals()) {
+    if (Global.getLinkage() != GlobalValue::PrivateLinkage)
+      continue;
+    if (!Global.isConstant() || !Global.hasInitializer())
+      continue;
+
+    Constant *CV = dyn_cast<Constant>(Global.getInitializer());
+    if (!CV)
+      continue;
+    ConstantArray *CA = dyn_cast<ConstantArray>(CV);
+    if (!CA)
+      continue;
+
+    for (unsigned i = 1, e = CA->getNumOperands(); i != e; ++i) {
+      if (!dyn_cast<BlockAddress>(CA->getOperand(i)))
+        continue;
+    }
+    Targets.push_back(&Global);
+    Changed = true;
+  }
+
+  for (GlobalVariable *GV : Targets)
+    GV->eraseFromParent();
+
+  doneRemoveUnneededGV = true;
+  return Changed;
+}
+
 } // end default namespace
 
 INITIALIZE_PASS(BPFMIPreEmitPeephole, "bpf-mi-pemit-peephole",

Unfortunately, tried a few examples, e.g.,

struct simple_ctx {          
        int x;                                                         
        int y;                                                         
        int z;  
};
                                   
int ret_user, ret_user2;
void bar(void);  
int foo1(struct simple_ctx *ctx, struct simple_ctx *ctx2)
{                
        switch (ctx->x) {
        case 1: ret_user = 18; break;
        case 20: ret_user = 6; break;
        case 16: ret_user = 9; break;     
        case 6: ret_user = 16; break;
        case 8: ret_user = 14; break;
        case 30: ret_user = 2; break;
        default: ret_user = 1; break;
        }

        bar();

        switch (ctx2->x) {
        case 0: ret_user2 = 8; break;
        case 31: ret_user2 = 5; break;
        case 13: ret_user2 = 8; break;
        case 1: ret_user2 = 3; break;
        case 11: ret_user2 = 4; break;
        default: ret_user2 = 29; break;
        }

        return 0;
}

int foo2(int a, int b) {
    __label__ l1, l2, l3, l4;
    void *jt1[] = {[0]=&&l1, [1]=&&l2};
    void *jt2[] = {[0]=&&l3, [1]=&&l4};
    int ret = 0;

    goto *jt1[a % 2];
    l1: ret += 1;
    l2: ret += 3;
    goto *jt2[b % 2];
    l3: ret += 5;
    l4: ret += 7;
    return ret;
}

It does not work.
As we discussed earlier, I think it is okay to leave these unused globals; they do not impact anything except a tiny increase in space.

@eddyz87
Copy link
Contributor

eddyz87 commented Aug 21, 2025

@yonghong-song ,

When I try the example from a previous comment, I get the following error:

fatal error: error in backend: Cannot select: <U+001B>[0;36mt6<U+001B>[0m: ch = brind <U+001B>[0;30mt0<U+001B>[0m, <U+001B>[0;35mt5<U+001B>[0m                                                                                                                 
  <U+001B>[0;35mt5<U+001B>[0m: i64,ch = load<(invariant load (s64) from %ir.indirect.goto.dest.in, !tbaa !9)> <U+001B>[0;30mt0<U+001B>[0m, <U+001B>[0;32mt2<U+001B>[0m, undef:i64                                                                              
    <U+001B>[0;32mt2<U+001B>[0m: i64,ch = CopyFromReg <U+001B>[0;30mt0<U+001B>[0m, Register:i64 %8                                                                                                                                                             
In function: foo2 

I'm at cc70bda3c145 (HEAD -> yhs-jumptable, yhs/jumptable) ("[BPF] Add some selftests").

@eddyz87
Copy link
Contributor

eddyz87 commented Aug 21, 2025

@yonghong-song ,

When I try the example from a previous comment, I get the following error:

fatal error: error in backend: Cannot select: <U+001B>[0;36mt6<U+001B>[0m: ch = brind <U+001B>[0;30mt0<U+001B>[0m, <U+001B>[0;35mt5<U+001B>[0m                                                                                                                 
  <U+001B>[0;35mt5<U+001B>[0m: i64,ch = load<(invariant load (s64) from %ir.indirect.goto.dest.in, !tbaa !9)> <U+001B>[0;30mt0<U+001B>[0m, <U+001B>[0;32mt2<U+001B>[0m, undef:i64                                                                              
    <U+001B>[0;32mt2<U+001B>[0m: i64,ch = CopyFromReg <U+001B>[0;30mt0<U+001B>[0m, Register:i64 %8                                                                                                                                                             
In function: foo2 

I'm at cc70bda3c145 (HEAD -> yhs-jumptable, yhs/jumptable) ("[BPF] Add some selftests").

Uh-oh, that's because I forgot to pass the -mcpu=v4 option.
We should probably do better error reporting in such cases.

@eddyz87
Copy link
Contributor

eddyz87 commented Aug 21, 2025

Below slight modification of what Yonghong tried handles removal of leftover globals for the foo2 example. But it needs to be changed to a proper module pass and checks should be duplicated between lowering phase and this pass.

--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -30,6 +30,9 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include <set>

@@ -321,6 +324,7 @@ private:
   bool insertMissingCallerSavedSpills();
   bool removeMayGotoZero();
   bool addExitAfterUnreachable();
+  bool removeUnusedGV();

 public:

@@ -338,6 +342,7 @@ public:
     Changed |= insertMissingCallerSavedSpills();
     Changed |= removeMayGotoZero();
     Changed |= addExitAfterUnreachable();
+    Changed |= removeUnusedGV();
     return Changed;
   }
 };
@@ -750,6 +755,29 @@ bool BPFMIPreEmitPeephole::addExitAfterUnreachable() {
   return true;
 }

+bool BPFMIPreEmitPeephole::removeUnusedGV() {
+  Module *M = MF->getFunction().getParent();
+  std::vector<GlobalVariable *> Targets;
+  for (GlobalVariable &Global : M->globals()) {
+    if (Global.getLinkage() != GlobalValue::PrivateLinkage)
+      continue;
+    if (!Global.isConstant() || !Global.hasInitializer())
+      continue;
+    Constant *CV = dyn_cast<Constant>(Global.getInitializer());
+    if (!CV)
+      continue;
+    ConstantArray *CA = dyn_cast<ConstantArray>(CV);
+    if (!CA)
+      continue;
+    Targets.push_back(&Global);
+  }
+  for (auto *G: Targets) {
+    G->replaceAllUsesWith(PoisonValue::get(G->getType())); // <----- Key change
+    G->eraseFromParent();
+  }
+  return true;
+}
+
 } // end default namespace

 INITIALIZE_PASS(BPFMIPreEmitPeephole, "bpf-mi-pemit-peephole",

@yonghong-song
Copy link
Contributor Author

Below slight modification of what Yonghong tried handles removal of leftover globals for the foo2 example. But it needs to be changed to a proper module pass and checks should be duplicated between lowering phase and this pass.

--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -30,6 +30,9 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include <set>

@@ -321,6 +324,7 @@ private:
   bool insertMissingCallerSavedSpills();
   bool removeMayGotoZero();
   bool addExitAfterUnreachable();
+  bool removeUnusedGV();

 public:

@@ -338,6 +342,7 @@ public:
     Changed |= insertMissingCallerSavedSpills();
     Changed |= removeMayGotoZero();
     Changed |= addExitAfterUnreachable();
+    Changed |= removeUnusedGV();
     return Changed;
   }
 };
@@ -750,6 +755,29 @@ bool BPFMIPreEmitPeephole::addExitAfterUnreachable() {
   return true;
 }

+bool BPFMIPreEmitPeephole::removeUnusedGV() {
+  Module *M = MF->getFunction().getParent();
+  std::vector<GlobalVariable *> Targets;
+  for (GlobalVariable &Global : M->globals()) {
+    if (Global.getLinkage() != GlobalValue::PrivateLinkage)
+      continue;
+    if (!Global.isConstant() || !Global.hasInitializer())
+      continue;
+    Constant *CV = dyn_cast<Constant>(Global.getInitializer());
+    if (!CV)
+      continue;
+    ConstantArray *CA = dyn_cast<ConstantArray>(CV);
+    if (!CA)
+      continue;
+    Targets.push_back(&Global);
+  }
+  for (auto *G: Targets) {
+    G->replaceAllUsesWith(PoisonValue::get(G->getType())); // <----- Key change
+    G->eraseFromParent();
+  }
+  return true;
+}
+
 } // end default namespace

 INITIALIZE_PASS(BPFMIPreEmitPeephole, "bpf-mi-pemit-peephole",

Just updated the pull request to address unused global variable issue. I moved the above code to doFinalization() which handles at module level.

@aspsk
Copy link

aspsk commented Aug 25, 2025

Thanks, tested it with my current branch.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants