19
19
*/
20
20
21
21
#include <linux/linkage.h>
22
+ #include <asm/assembler.h>
23
+ #include <asm/cache.h>
22
24
23
25
.text
24
26
. align 6
36
38
*/
37
39
chacha_permute:
38
40
39
- adr x10 , ROT8
41
+ adr_l x10 , ROT8
40
42
ld1 {v12.4s} , [ x10 ]
41
43
42
44
.Ldoubleround:
@@ -169,6 +171,12 @@ ENTRY(chacha_4block_xor_neon)
169
171
// x1: 4 data blocks output, o
170
172
// x2: 4 data blocks input , i
171
173
// w3: nrounds
174
+ // x4: byte count
175
+
176
+ adr_l x10 , .Lpermute
177
+ and x5 , x4 , # 63
178
+ add x10 , x10 , x5
179
+ add x11 , x10 , # 64
172
180
173
181
//
174
182
// This function encrypts four consecutive ChaCha blocks by loading
@@ -178,15 +186,15 @@ ENTRY(chacha_4block_xor_neon)
178
186
// matrix by interleaving 32- and then 64-bit words, which allows us to
179
187
// do XOR in NEON registers.
180
188
//
181
- adr x9 , CTRINC // ... and ROT8
189
+ adr_l x9 , CTRINC // ... and ROT8
182
190
ld1 {v30.4s - v31.4s} , [ x9 ]
183
191
184
192
// x0..15[0-3] = s0..3[0..3]
185
- mov x4 , x0
186
- ld4r { v0.4s - v3.4s} , [ x4 ], # 16
187
- ld4r { v4.4s - v7.4s} , [ x4 ], # 16
188
- ld4r { v8.4s - v11.4s} , [ x4 ], # 16
189
- ld4r {v12.4s - v15.4s} , [ x4 ]
193
+ add x8 , x0 , # 16
194
+ ld4r { v0.4s - v3.4s} , [ x0 ]
195
+ ld4r { v4.4s - v7.4s} , [ x8 ], # 16
196
+ ld4r { v8.4s - v11.4s} , [ x8 ], # 16
197
+ ld4r {v12.4s - v15.4s} , [ x8 ]
190
198
191
199
// x12 += counter values 0-3
192
200
add v12.4s , v12.4s , v30.4s
@@ -430,24 +438,47 @@ ENTRY(chacha_4block_xor_neon)
430
438
zip1 v30.4s , v14.4s , v15.4s
431
439
zip2 v31.4s , v14.4s , v15.4s
432
440
441
+ mov x3 , # 64
442
+ subs x5 , x4 , # 64
443
+ add x6 , x5 , x2
444
+ csel x3 , x3 , xzr , ge
445
+ csel x2 , x2 , x6 , ge
446
+
433
447
// interleave 64-bit words in state n, n+2
434
448
zip1 v0.2d , v16.2d , v18.2d
435
449
zip2 v4.2d , v16.2d , v18.2d
436
450
zip1 v8.2d , v17.2d , v19.2d
437
451
zip2 v12.2d , v17.2d , v19.2d
438
- ld1 {v16.16b - v19.16b} , [ x2 ], # 64
452
+ ld1 {v16.16b - v19.16b} , [ x2 ], x3
453
+
454
+ subs x6 , x4 , # 128
455
+ ccmp x3 , xzr , # 4 , lt
456
+ add x7 , x6 , x2
457
+ csel x3 , x3 , xzr , eq
458
+ csel x2 , x2 , x7 , eq
439
459
440
460
zip1 v1.2d , v20.2d , v22.2d
441
461
zip2 v5.2d , v20.2d , v22.2d
442
462
zip1 v9.2d , v21.2d , v23.2d
443
463
zip2 v13.2d , v21.2d , v23.2d
444
- ld1 {v20.16b - v23.16b} , [ x2 ], # 64
464
+ ld1 {v20.16b - v23.16b} , [ x2 ], x3
465
+
466
+ subs x7 , x4 , # 192
467
+ ccmp x3 , xzr , # 4 , lt
468
+ add x8 , x7 , x2
469
+ csel x3 , x3 , xzr , eq
470
+ csel x2 , x2 , x8 , eq
445
471
446
472
zip1 v2.2d , v24.2d , v26.2d
447
473
zip2 v6.2d , v24.2d , v26.2d
448
474
zip1 v10.2d , v25.2d , v27.2d
449
475
zip2 v14.2d , v25.2d , v27.2d
450
- ld1 {v24.16b - v27.16b} , [ x2 ], # 64
476
+ ld1 {v24.16b - v27.16b} , [ x2 ], x3
477
+
478
+ subs x8 , x4 , # 256
479
+ ccmp x3 , xzr , # 4 , lt
480
+ add x9 , x8 , x2
481
+ csel x2 , x2 , x9 , eq
451
482
452
483
zip1 v3.2d , v28.2d , v30.2d
453
484
zip2 v7.2d , v28.2d , v30.2d
@@ -456,29 +487,155 @@ ENTRY(chacha_4block_xor_neon)
456
487
ld1 {v28.16b - v31.16b} , [ x2 ]
457
488
458
489
// xor with corresponding input , write to output
490
+ tbnz x5 , # 63 , 0f
459
491
eor v16.16b , v16.16b , v0.16b
460
492
eor v17.16b , v17.16b , v1.16b
461
493
eor v18.16b , v18.16b , v2.16b
462
494
eor v19.16b , v19.16b , v3.16b
495
+ st1 {v16.16b - v19.16b} , [ x1 ], # 64
496
+
497
+ tbnz x6 , # 63 , 1f
463
498
eor v20.16b , v20.16b , v4.16b
464
499
eor v21.16b , v21.16b , v5.16b
465
- st1 {v16.16b - v19.16b} , [ x1 ], # 64
466
500
eor v22.16b , v22.16b , v6.16b
467
501
eor v23.16b , v23.16b , v7.16b
502
+ st1 {v20.16b - v23.16b} , [ x1 ], # 64
503
+
504
+ tbnz x7 , # 63 , 2f
468
505
eor v24.16b , v24.16b , v8.16b
469
506
eor v25.16b , v25.16b , v9.16b
470
- st1 {v20.16b - v23.16b} , [ x1 ], # 64
471
507
eor v26.16b , v26.16b , v10.16b
472
508
eor v27.16b , v27.16b , v11.16b
473
- eor v28.16b , v28.16b , v12.16b
474
509
st1 {v24.16b - v27.16b} , [ x1 ], # 64
510
+
511
+ tbnz x8 , # 63 , 3f
512
+ eor v28.16b , v28.16b , v12.16b
475
513
eor v29.16b , v29.16b , v13.16b
476
514
eor v30.16b , v30.16b , v14.16b
477
515
eor v31.16b , v31.16b , v15.16b
478
516
st1 {v28.16b - v31.16b} , [ x1 ]
479
517
480
518
ret
519
+
520
+ // fewer than 64 bytes of in/output
//
// Reached from "tbnz x5 , # 63 , 0f", i.e. when x5 (= x4 - 64, the byte
// count minus 64) is negative and nothing has been stored yet.
//
// On entry:
//   x10      = .Lpermute + (x4 & 63)   -> index bytes (x4 & 63) - 64 + j
//   x11      = x10 + 64                -> index bytes (x4 & 63) + j
//   v0-v3    = the vectors the full-block path XORs into the first 64 bytes
//   v20-v23  = data loaded earlier through x2 by the main path
//
// Strategy: emit one full 64-byte store that ENDS at the end of the output
// (x1 + x5 = output + byte count - 64). Lanes that land before the start of
// the real output are refilled with the bytes already in memory there, so
// the store leaves them unchanged.
//
// NOTE(review): x1 has not been advanced on this path, so "sub x2 , x1 , # 64"
// loads from the 64 bytes below the output pointer and the final st1 also
// starts below it. Confirm callers guarantee that region is accessible (or
// never pass a byte count below 64).
521
+ 0 : ld1 {v8.16b} , [ x10 ]
522
+ ld1 {v9.16b} , [ x11 ]
// v10 = per-vector index step: each following tbl/tbx pair handles the next
// 16 lanes of the 64-byte window.
523
+ movi v10.16b , # 16
524
+ sub x2 , x1 , # 64
// x5 is negative here, so this moves x1 back to (output + byte count - 64).
525
+ add x1 , x1 , x5
// v16-v19 = current memory contents of the 64 bytes below the output pointer.
526
+ ld1 {v16.16b - v19.16b} , [ x2 ]
// tbl yields 0 for indices outside the 64-byte table, so v4-v7 become the
// v0-v3 bytes shifted to the end of the window, zero elsewhere.
527
+ tbl v4.16b , {v0.16b - v3.16b} , v8.16b
// tbx keeps the destination byte for out-of-range indices: in-range lanes
// take the preserved memory bytes from v16-v19, the rest keep v20-v23.
528
+ tbx v20.16b , {v16.16b - v19.16b} , v9.16b
529
+ add v8.16b , v8.16b , v10.16b
530
+ add v9.16b , v9.16b , v10.16b
531
+ tbl v5.16b , {v0.16b - v3.16b} , v8.16b
532
+ tbx v21.16b , {v16.16b - v19.16b} , v9.16b
533
+ add v8.16b , v8.16b , v10.16b
534
+ add v9.16b , v9.16b , v10.16b
535
+ tbl v6.16b , {v0.16b - v3.16b} , v8.16b
536
+ tbx v22.16b , {v16.16b - v19.16b} , v9.16b
537
+ add v8.16b , v8.16b , v10.16b
538
+ add v9.16b , v9.16b , v10.16b
539
+ tbl v7.16b , {v0.16b - v3.16b} , v8.16b
540
+ tbx v23.16b , {v16.16b - v19.16b} , v9.16b
541
+
// Preserved lanes are XORed with zero (unchanged); the remaining lanes are
// XORed with the shifted v0-v3 bytes.
542
+ eor v20.16b , v20.16b , v4.16b
543
+ eor v21.16b , v21.16b , v5.16b
544
+ eor v22.16b , v22.16b , v6.16b
545
+ eor v23.16b , v23.16b , v7.16b
546
+ st1 {v20.16b - v23.16b} , [ x1 ]
547
+ ret
548
+
549
+ // fewer than 128 bytes of in/output
//
// Reached from "tbnz x6 , # 63 , 1f" after the first 64 bytes were stored
// (x1 already post-incremented by 64). x6 = x4 - 128 is negative.
//
// On entry:
//   x10/x11  = .Lpermute + (x4 & 63) and that address + 64
//   v16-v19  = first 64 bytes, already XORed and stored
//   v4-v7    = the vectors the full-block path XORs into the second 64 bytes
//   v20-v23  = data loaded earlier through x2 by the main path
//
// The final 64-byte store is positioned to end at the end of the output, so
// it overlaps the tail of the block just written; the overlapping lanes are
// refilled from v16-v19 so the overlap is rewritten with identical bytes.
550
+ 1 : ld1 {v8.16b} , [ x10 ]
551
+ ld1 {v9.16b} , [ x11 ]
// v10 = index step of 16 between successive tbl/tbx pairs.
552
+ movi v10.16b , # 16
// x6 is negative: x1 becomes (original output + byte count - 64).
553
+ add x1 , x1 , x6
// v0-v3 are free to reuse here (their XOR into v16-v19 is done); tbl gives 0
// for out-of-range indices, leaving v4-v7 bytes only in the trailing lanes.
554
+ tbl v0.16b , {v4.16b - v7.16b} , v8.16b
// tbx: in-range lanes take already-output bytes from v16-v19, out-of-range
// lanes keep the v20-v23 contents.
555
+ tbx v20.16b , {v16.16b - v19.16b} , v9.16b
556
+ add v8.16b , v8.16b , v10.16b
557
+ add v9.16b , v9.16b , v10.16b
558
+ tbl v1.16b , {v4.16b - v7.16b} , v8.16b
559
+ tbx v21.16b , {v16.16b - v19.16b} , v9.16b
560
+ add v8.16b , v8.16b , v10.16b
561
+ add v9.16b , v9.16b , v10.16b
562
+ tbl v2.16b , {v4.16b - v7.16b} , v8.16b
563
+ tbx v22.16b , {v16.16b - v19.16b} , v9.16b
564
+ add v8.16b , v8.16b , v10.16b
565
+ add v9.16b , v9.16b , v10.16b
566
+ tbl v3.16b , {v4.16b - v7.16b} , v8.16b
567
+ tbx v23.16b , {v16.16b - v19.16b} , v9.16b
568
+
// Overlap lanes are XORed with zero; the remaining lanes get the shifted
// v4-v7 bytes.
569
+ eor v20.16b , v20.16b , v0.16b
570
+ eor v21.16b , v21.16b , v1.16b
571
+ eor v22.16b , v22.16b , v2.16b
572
+ eor v23.16b , v23.16b , v3.16b
573
+ st1 {v20.16b - v23.16b} , [ x1 ]
574
+ ret
575
+
576
+ // fewer than 192 bytes of in/output
//
// Reached from "tbnz x7 , # 63 , 2f" after the first 128 bytes were stored
// (x1 post-incremented twice by 64). x7 = x4 - 192 is negative.
//
// On entry:
//   x10/x11  = .Lpermute + (x4 & 63) and that address + 64
//   v20-v23  = second 64 bytes, already XORed and stored
//   v8-v11   = the vectors the full-block path XORs into the third 64 bytes
//   v24-v27  = data loaded earlier through x2 by the main path
//
// v8-v10 are live table data here, so this path uses v4/v5 for the index
// vectors and v6 for the step instead (v4-v7 were already consumed above).
577
+ 2 : ld1 {v4.16b} , [ x10 ]
578
+ ld1 {v5.16b} , [ x11 ]
// v6 = index step of 16 between successive tbl/tbx pairs.
579
+ movi v6.16b , # 16
// x7 is negative: x1 becomes (original output + byte count - 64), so the
// final store ends exactly at the end of the output.
580
+ add x1 , x1 , x7
// tbl: zero for out-of-range indices, v8-v11 bytes in the trailing lanes.
581
+ tbl v0.16b , {v8.16b - v11.16b} , v4.16b
// tbx: in-range lanes take already-output bytes from v20-v23, out-of-range
// lanes keep the v24-v27 contents.
582
+ tbx v24.16b , {v20.16b - v23.16b} , v5.16b
583
+ add v4.16b , v4.16b , v6.16b
584
+ add v5.16b , v5.16b , v6.16b
585
+ tbl v1.16b , {v8.16b - v11.16b} , v4.16b
586
+ tbx v25.16b , {v20.16b - v23.16b} , v5.16b
587
+ add v4.16b , v4.16b , v6.16b
588
+ add v5.16b , v5.16b , v6.16b
589
+ tbl v2.16b , {v8.16b - v11.16b} , v4.16b
590
+ tbx v26.16b , {v20.16b - v23.16b} , v5.16b
591
+ add v4.16b , v4.16b , v6.16b
592
+ add v5.16b , v5.16b , v6.16b
593
+ tbl v3.16b , {v8.16b - v11.16b} , v4.16b
594
+ tbx v27.16b , {v20.16b - v23.16b} , v5.16b
595
+
// Overlap lanes are XORed with zero; the remaining lanes get the shifted
// v8-v11 bytes.
596
+ eor v24.16b , v24.16b , v0.16b
597
+ eor v25.16b , v25.16b , v1.16b
598
+ eor v26.16b , v26.16b , v2.16b
599
+ eor v27.16b , v27.16b , v3.16b
600
+ st1 {v24.16b - v27.16b} , [ x1 ]
601
+ ret
602
+
603
+ // fewer than 256 bytes of in/output
//
// Reached from "tbnz x8 , # 63 , 3f" after the first 192 bytes were stored
// (x1 post-incremented three times by 64). x8 = x4 - 256 is negative.
//
// On entry:
//   x10/x11  = .Lpermute + (x4 & 63) and that address + 64
//   v24-v27  = third 64 bytes, already XORed and stored
//   v12-v15  = the vectors the full-block path XORs into the fourth 64 bytes
//   v28-v31  = data loaded earlier through x2 by the main path
//
// v4/v5 serve as the index vectors and v6 as the step, as in the 2: path.
604
+ 3 : ld1 {v4.16b} , [ x10 ]
605
+ ld1 {v5.16b} , [ x11 ]
// v6 = index step of 16 between successive tbl/tbx pairs.
606
+ movi v6.16b , # 16
// x8 is negative: x1 becomes (original output + byte count - 64), so the
// final store ends exactly at the end of the output.
607
+ add x1 , x1 , x8
// tbl: zero for out-of-range indices, v12-v15 bytes in the trailing lanes.
608
+ tbl v0.16b , {v12.16b - v15.16b} , v4.16b
// tbx: in-range lanes take already-output bytes from v24-v27, out-of-range
// lanes keep the v28-v31 contents.
609
+ tbx v28.16b , {v24.16b - v27.16b} , v5.16b
610
+ add v4.16b , v4.16b , v6.16b
611
+ add v5.16b , v5.16b , v6.16b
612
+ tbl v1.16b , {v12.16b - v15.16b} , v4.16b
613
+ tbx v29.16b , {v24.16b - v27.16b} , v5.16b
614
+ add v4.16b , v4.16b , v6.16b
615
+ add v5.16b , v5.16b , v6.16b
616
+ tbl v2.16b , {v12.16b - v15.16b} , v4.16b
617
+ tbx v30.16b , {v24.16b - v27.16b} , v5.16b
618
+ add v4.16b , v4.16b , v6.16b
619
+ add v5.16b , v5.16b , v6.16b
620
+ tbl v3.16b , {v12.16b - v15.16b} , v4.16b
621
+ tbx v31.16b , {v24.16b - v27.16b} , v5.16b
622
+
// Overlap lanes are XORed with zero; the remaining lanes get the shifted
// v12-v15 bytes.
623
+ eor v28.16b , v28.16b , v0.16b
624
+ eor v29.16b , v29.16b , v1.16b
625
+ eor v30.16b , v30.16b , v2.16b
626
+ eor v31.16b , v31.16b , v3.16b
627
+ st1 {v28.16b - v31.16b} , [ x1 ]
628
+ ret
481
629
ENDPROC(chacha_4block_xor_neon)
482
630
631
// Read-only lookup tables. Because the constants below now live in .rodata
// rather than next to the code, the code references them with adr_l instead
// of adr.
+ . section ".rodata" , "a" , %progbits
632
// NOTE(review): on AArch64 GNU as the .align operand is a power-of-two
// exponent, so this presumably aligns to one cache line; L1_CACHE_SHIFT is
// defined outside this file.
+ . align L1_CACHE_SHIFT
633
// .Lpermute: 192 consecutive bytes holding the values -64, -63, ..., 127
// (each emitted as a single byte). The tail paths of chacha_4block_xor_neon
// load 16-byte windows from .Lpermute + (byte count & 63) and from 64 bytes
// beyond that, and use them as tbl/tbx index vectors; indices outside 0..63
// fall outside the 64-byte (four-register) tables those instructions use.
+ .Lpermute:
634
// .Li is an assembly-time counter, not a runtime value.
+ .set .Li , 0
635
+ .rept 192
636
+ . byte (.Li - 64 )
637
+ .set .Li , .Li + 1
638
+ .endr
639
+
483
640
// CTRINC and ROT8 must stay adjacent and in this order: the 4-block routine
// loads both with a single "ld1 {v30.4s - v31.4s} , [ x9 ]" from CTRINC.
// CTRINC is added to v12 there ("counter values 0 - 3").
CTRINC: .word 0 , 1 , 2 , 3
484
641
// NOTE(review): presumably a tbl byte-index pattern that rotates each 32-bit
// lane by 8 bits; its use is not visible in this excerpt - confirm.
ROT8: .word 0x02010003 , 0x06050407 , 0x0a09080b , 0x0e0d0c0f
0 commit comments