 #define rH	r31
 
 #ifdef __LITTLE_ENDIAN__
+#define LH	lhbrx
+#define LW	lwbrx
 #define LD	ldbrx
 #else
+#define LH	lhzx
+#define LW	lwzx
 #define LD	ldx
 #endif
 
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst have the same offset from the 8-byte boundary; the handlers
+ *    are named .Lsameoffset_xxxx.
+ * 2) src/dst have different offsets from the 8-byte boundary; the handlers
+ *    are named .Ldiffoffset_xxxx.
+ */
 _GLOBAL(memcmp)
 	cmpdi	cr1,r5,0
 
-	/* Use the short loop if both strings are not 8B aligned */
-	or	r6,r3,r4
+	/* Use the short loop if the src/dst addresses are not at the
+	 * same offset from an 8-byte alignment boundary.
+	 */
+	xor	r6,r3,r4
 	andi.	r6,r6,7
 
-	/* Use the short loop if length is less than 32B */
-	cmpdi	cr6,r5,31
+	/* Fall back to the short loop when comparing fewer than 8 bytes
+	 * at aligned addresses.
+	 */
+	cmpdi	cr6,r5,7
 
 	beq	cr1,.Lzero
-	bne	.Lshort
-	bgt	cr6,.Llong
+	bgt	cr6,.Lno_short
 
 .Lshort:
 	mtctr	r5
-
 1:	lbz	rA,0(r3)
 	lbz	rB,0(r4)
 	subf.	rC,rB,rA
@@ -78,11 +91,89 @@ _GLOBAL(memcmp)
 	li	r3,0
 	blr
 
+.Lno_short:
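+	/* touch the first cache line of each buffer before the main loop */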
+	dcbt	0,r3
+	dcbt	0,r4
+	bne	.Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+	/* Compare the leading bytes that are not 8-byte aligned, so that
+	 * the rest of the comparison can run on 8-byte alignment.
+	 */
+	andi.	r6,r3,7
+
+	/* Try to compare the first doubleword, which is not 8-byte aligned:
+	 * load the first doubleword at (src & ~7UL) and shift left by the
+	 * appropriate number of bits before the comparison.
+	 */
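+	/* r6 = (src & 7) * 8: bit offset of the first valid byte */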
+	rlwinm	r6,r3,3,26,28
+	beq	.Lsameoffset_8bytes_aligned
+	clrrdi	r3,r3,3
+	clrrdi	r4,r4,3
+	LD	rA,0,r3
+	LD	rB,0,r4
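+	/* LD puts the lowest-addressed byte in the MSBs (ldx on BE, ldbrx
+	 * on LE), so shifting left by r6 bits discards the bytes below
+	 * the start address.
+	 */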
+	sld	rA,rA,r6
+	sld	rB,rB,r6
+	cmpld	cr0,rA,rB
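+	/* back from bits to bytes: the doubleword held 8 - offset valid bytes */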
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+	subfic	r6,r6,8
+	subf.	r5,r6,r5
+	addi	r3,r3,8
+	addi	r4,r4,8
+	beq	.Lzero
+
+.Lsameoffset_8bytes_aligned:
+	/* Now we are aligned on 8 bytes.
+	 * Use the .Llong loop if 32 or more bytes remain to compare.
+	 */
+	cmpdi	cr6,r5,31
+	bgt	cr6,.Llong
+
+.Lcmp_lt32bytes:
+	/* Compare 1 ~ 31 bytes; at least the r3 addr is 8-byte aligned now. */
+	cmpdi	cr5,r5,7
+	srdi	r0,r5,3
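+	/* r0 = number of whole doublewords to compare */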
+	ble	cr5,.Lcmp_rest_lt8bytes
+
+	/* handle 8 ~ 31 bytes */
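+	/* r5 = length & 7: tail bytes left after the doubleword loop */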
+	clrldi	r5,r5,61
+	mtctr	r0
+2:
+	LD	rA,0,r3
+	LD	rB,0,r4
+	cmpld	cr0,rA,rB
+	addi	r3,r3,8
+	addi	r4,r4,8
+	bne	cr0,.LcmpAB_lightweight
+	bdnz	2b
+
+	cmpwi	r5,0
+	beq	.Lzero
+
+.Lcmp_rest_lt8bytes:
+	/* Here fewer than 8 bytes remain to compare; at least the s1
+	 * address is 8-byte aligned. Load the next doublewords and shift
+	 * right by the appropriate number of bits.
+	 */
+	subfic	r6,r5,8
+	slwi	r6,r6,3
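+	/* r6 = (8 - len) * 8: the bytes beyond len land in the low-order
+	 * bits after LD, and the srd below discards them.
+	 */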
+	LD	rA,0,r3
+	LD	rB,0,r4
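+	/* note: the loads above fetch full doublewords and may read up to
+	 * 7 bytes past the end of the buffers.
+	 */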
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	bne	cr0,.LcmpAB_lightweight
+	b	.Lzero
+
 .Lnon_zero:
 	mr	r3,rC
 	blr
 
 .Llong:
+	/* At least the s1 addr is aligned on 8 bytes */
 	li	off8,8
 	li	off16,16
 	li	off24,24
@@ -232,4 +323,39 @@ _GLOBAL(memcmp)
 	ld	r28,-32(r1)
 	ld	r27,-40(r1)
 	blr
 
+
+.LcmpAB_lightweight:	/* skip NV GPRS restore */
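+	/* cr0 still holds the last cmpld result: rA > rB returns 1, else -1 */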
+	li	r3,1
+	bgtlr
+	li	r3,-1
+	blr
+
+.Ldiffoffset_8bytes_make_align_start:
+	/* now try to align s1 on 8 bytes */
+	rlwinm	r6,r3,3,26,28
+	beq	.Ldiffoffset_align_s1_8bytes
+
+	clrrdi	r3,r3,3
+	LD	rA,0,r3
+	LD	rB,0,r4	/* unaligned load */
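+	/* clear rA's leading bytes (below the original s1) and drop the
+	 * same count of trailing bytes from rB, so both registers hold
+	 * the first 8 - offset bytes of their buffers.
+	 */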
+	sld	rA,rA,r6
+	srd	rA,rA,r6
+	srd	rB,rB,r6
+	cmpld	cr0,rA,rB
+	srwi	r6,r6,3
+	bne	cr0,.LcmpAB_lightweight
+
+	subfic	r6,r6,8
+	subf.	r5,r6,r5
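+	/* r3 was rounded down, so +8 steps past the compared bytes;
+	 * r4 advances by exactly r6 = 8 - offset.
+	 */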
+	addi	r3,r3,8
+	add	r4,r4,r6
+
+	beq	.Lzero
+
+.Ldiffoffset_align_s1_8bytes:
+	/* now s1 is aligned on 8 bytes */
+	cmpdi	cr5,r5,31
+	ble	cr5,.Lcmp_lt32bytes
+	b	.Llong
+
 EXPORT_SYMBOL(memcmp)
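
For readers who prefer to follow the control flow in C: below is a minimal
sketch of the strategy this patch implements (same-offset vs. different-offset
paths, aligning first, then comparing a doubleword at a time). It is
illustrative only: memcmp_sketch is a hypothetical name, the byte loops stand
in for the assembly's shift-and-mask handling of unaligned heads and tails,
and mismatches are resolved with a portable byte compare instead of cmpld.

#include <stdint.h>
#include <string.h>

static int memcmp_sketch(const void *s1, const void *s2, size_t n)
{
	const unsigned char *p = s1, *q = s2;

	/* .Ldiffoffset_*: offsets from the 8-byte boundary differ */
	if ((((uintptr_t)p ^ (uintptr_t)q) & 7) != 0) {
		for (; n; n--, p++, q++)
			if (*p != *q)
				return *p < *q ? -1 : 1;
		return 0;
	}

	/* .Lsameoffset_*: walk both pointers up to an 8-byte boundary */
	for (; n && ((uintptr_t)p & 7); n--, p++, q++)
		if (*p != *q)
			return *p < *q ? -1 : 1;

	/* main loop: one doubleword (8 bytes) per iteration */
	for (; n >= 8; n -= 8, p += 8, q += 8) {
		uint64_t a, b;
		memcpy(&a, p, 8);
		memcpy(&b, q, 8);
		if (a != b)
			return memcmp(p, q, 8); /* find the differing byte */
	}

	/* tail: fewer than 8 bytes remain */
	for (; n; n--, p++, q++)
		if (*p != *q)
			return *p < *q ? -1 : 1;
	return 0;
}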