# NOTE(review): the header name of the directive below is missing in this
# copy of the file; restore it before running the file through cpp.
#include
.text

#-----------------------------------------------------------------------
# ChaCha20_ctr32_v_zbb_zvkb
#
# XORs a byte stream with ChaCha20 keystream.  Each pass of .Lblock_loop
# handles VL blocks of 64 bytes on the vector unit (one block per e32
# lane) plus one further block, possibly partial, on the scalar ALU.
#
# Vector, Zbb and Zvkb instructions are emitted as raw .word encodings;
# the decoded mnemonic is given in the trailing comment of each one.
#
# Arguments, as used by the code below (the C prototype is not visible
# here -- NOTE(review): confirm against the C declaration):
#   a0  output pointer, advanced as data is written
#   a1  input pointer, advanced as data is consumed
#   a2  remaining length in bytes
#   a3  pointer to 8 x 32-bit words loaded into state words 4~11
#   a4  pointer to 4 x 32-bit words: word 0 is the starting block
#       counter, words 1~3 are loaded into state words 13~15
#
# Register roles inside the loop:
#   t1                 VL returned by the most recent vsetvli
#   t2                 counter of the first block of the current pass
#   t3~t6              scratch
#   a5, a6, a7, s0     scalar state words 0~3
#   s1~s8              scalar state words 4~11
#   s9, s10, s11, t0   scalar state words 12~15
#   v0~v15             vector state words 0~15 (one block per lane)
#   v16~v31            input data / temporaries
#
# Stack frame (160 bytes):
#   sp+0  .. sp+63     keystream of the scalar block
#   sp+64 .. sp+159    saved s0~s11
# NOTE(review): the 64-byte keystream block is not cleared before the
# frame is released.
#-----------------------------------------------------------------------
.p2align 3
.globl ChaCha20_ctr32_v_zbb_zvkb
.type ChaCha20_ctr32_v_zbb_zvkb,@function
ChaCha20_ctr32_v_zbb_zvkb:
    # Nothing to do for an empty request.  Without this check a length
    # of 0 reaches `addi t4, t1, -1` with VL == 0, requests AVL = -1,
    # gets VL = VLMAX back and then reads and writes whole blocks
    # outside the buffers.
    beqz a2, 2f

    # Save the callee-saved registers s0~s11.
    addi sp, sp, -96
    sd s0, 0(sp)
    sd s1, 8(sp)
    sd s2, 16(sp)
    sd s3, 24(sp)
    sd s4, 32(sp)
    sd s5, 40(sp)
    sd s6, 48(sp)
    sd s7, 56(sp)
    sd s8, 64(sp)
    sd s9, 72(sp)
    sd s10, 80(sp)
    sd s11, 88(sp)
    # Reserve 64 bytes for the keystream of the scalar block.
    addi sp, sp, -64

    # t2 = starting block counter
    lw t2, 0(a4)

.Lblock_loop:
    # We will use the scalar ALU for 1 chacha block.
    srli t3, a2, 6                      # t3 = number of whole blocks left
    .word 219050839                     # vsetvli t1, t3, e32, m1, ta, ma
    slli t4, t1, 6                      # t4 = 64 * VL
    bltu t4, a2, 1f
    # The vector part would consume all of the remaining data and leave
    # nothing for the scalar block, so take one block back from the
    # vector ALU.
    addi t4, t1, -1
    .word 219083607                     # vsetvli t1, t4, e32, m1, ta, ma
1:

    #### chacha block data
    # init chacha const states into v0~v3
    # "expa" little endian
    li a5, 0x61707865
    .word 1577566295                    # vmv.v.x v0, a5
    # "nd 3" little endian
    li a6, 0x3320646e
    .word 1577599191                    # vmv.v.x v1, a6
    # "2-by" little endian
    li a7, 0x79622d32
    .word 1577632087                    # vmv.v.x v2, a7
    # "te k" little endian
    li s0, 0x6b206574
    lw s1, 0(a3)
    .word 1577337303                    # vmv.v.x v3, s0

    # init chacha key states into v4~v11
    lw s2, 4(a3)
    .word 1577370199                    # vmv.v.x v4, s1
    lw s3, 8(a3)
    .word 1577665239                    # vmv.v.x v5, s2
    lw s4, 12(a3)
    .word 1577698135                    # vmv.v.x v6, s3
    lw s5, 16(a3)
    .word 1577731031                    # vmv.v.x v7, s4
    lw s6, 20(a3)
    .word 1577763927                    # vmv.v.x v8, s5
    lw s7, 24(a3)
    .word 1577796823                    # vmv.v.x v9, s6
    lw s8, 28(a3)
    .word 1577829719                    # vmv.v.x v10, s7
    .word 1577862615                    # vmv.v.x v11, s8

    # init state words 12~13 into v12~v13:
    # v12 = t2 + lane index (per-lane block counter), v13 = word 1 of a4
    lw s10, 4(a4)
    .word 1376298583                    # vid.v v12
    lw s11, 8(a4)
    .word 46384727                      # vadd.vx v12, v12, t2
    lw t0, 12(a4)
    .word 1577928407                    # vmv.v.x v13, s10
    # the scalar block comes after the VL vector blocks
    add s9, t2, t1

    # init state words 14~15 (words 2~3 of a4) into v14~v15
    .word 1577961303                    # vmv.v.x v14, s11
    .word 1577240535                    # vmv.v.x v15, t0

    li t3, 64
    # load the top-half of input data into v16~v23
    # (stride 64 bytes: lane i receives words 0~7 of block i)
    .word 3955615751                    # vlsseg8e32.v v16, (a1), t3

    # till now in block_loop, we used:
    # - v0~v15 for chacha states.
    # - v16~v23 for top-half of input data.
    # - v24~v31 haven't been used yet.

    # 20 rounds: 10 iterations of (column round + diagonal round)
    li t3, 10
.Lround_loop:
    # we can use v24~v31 as temporary registers in round_loop.
    addi t3, t3, -1

    #### column round: (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15)
    # a += b; d ^= a; d <<<= 16;
    .word 33685591                      # vadd.vv v0, v0, v4
    add a5, a5, s1
    .word 34767063                      # vadd.vv v1, v1, v5
    add a6, a6, s2
    .word 35848535                      # vadd.vv v2, v2, v6
    add a7, a7, s3
    .word 36930007                      # vadd.vv v3, v3, v7
    add s0, s0, s4
    .word 784336471                     # vxor.vv v12, v12, v0
    xor s9, s9, a5
    .word 785417943                     # vxor.vv v13, v13, v1
    xor s10, s10, a6
    .word 786499415                     # vxor.vv v14, v14, v2
    xor s11, s11, a7
    .word 787580887                     # vxor.vv v15, v15, v3
    xor t0, t0, s0
    .word 1388852823                    # vror.vi v12, v12, 16
    .word 1628232859                    # roriw s9, s9, 16
    .word 1389901527                    # vror.vi v13, v13, 16
    .word 1628265755                    # roriw s10, s10, 16
    .word 1390950231                    # vror.vi v14, v14, 16
    .word 1628298651                    # roriw s11, s11, 16
    .word 1391998935                    # vror.vi v15, v15, 16
    .word 1627574939                    # roriw t0, t0, 16
    # c += d; b ^= c; b <<<= 12;
    .word 42337367                      # vadd.vv v8, v8, v12
    add s5, s5, s9
    .word 43418839                      # vadd.vv v9, v9, v13
    add s6, s6, s10
    .word 44500311                      # vadd.vv v10, v10, v14
    add s7, s7, s11
    .word 45581783                      # vadd.vv v11, v11, v15
    add s8, s8, t0
    .word 776208983                     # vxor.vv v4, v4, v8
    xor s1, s1, s5
    .word 777290455                     # vxor.vv v5, v5, v9
    xor s2, s2, s6
    .word 778371927                     # vxor.vv v6, v6, v10
    xor s3, s3, s7
    .word 779453399                     # vxor.vv v7, v7, v11
    xor s4, s4, s8
    .word 1380594263                    # vror.vi v4, v4, 20
    .word 1631900827                    # roriw s1, s1, 20
    .word 1381642967                    # vror.vi v5, v5, 20
    .word 1632196891                    # roriw s2, s2, 20
    .word 1382691671                    # vror.vi v6, v6, 20
    .word 1632229787                    # roriw s3, s3, 20
    .word 1383740375                    # vror.vi v7, v7, 20
    .word 1632262683                    # roriw s4, s4, 20
    # a += b; d ^= a; d <<<= 8;
    .word 33685591                      # vadd.vv v0, v0, v4
    add a5, a5, s1
    .word 34767063                      # vadd.vv v1, v1, v5
    add a6, a6, s2
    .word 35848535                      # vadd.vv v2, v2, v6
    add a7, a7, s3
    .word 36930007                      # vadd.vv v3, v3, v7
    add s0, s0, s4
    .word 784336471                     # vxor.vv v12, v12, v0
    xor s9, s9, a5
    .word 785417943                     # vxor.vv v13, v13, v1
    xor s10, s10, a6
    .word 786499415                     # vxor.vv v14, v14, v2
    xor s11, s11, a7
    .word 787580887                     # vxor.vv v15, v15, v3
    xor t0, t0, s0
    .word 1389114967                    # vror.vi v12, v12, 24
    .word 1636621467                    # roriw s9, s9, 24
    .word 1390163671                    # vror.vi v13, v13, 24
    .word 1636654363                    # roriw s10, s10, 24
    .word 1391212375                    # vror.vi v14, v14, 24
    .word 1636687259                    # roriw s11, s11, 24
    .word 1392261079                    # vror.vi v15, v15, 24
    .word 1635963547                    # roriw t0, t0, 24
    # c += d; b ^= c; b <<<= 7;
    .word 42337367                      # vadd.vv v8, v8, v12
    add s5, s5, s9
    .word 43418839                      # vadd.vv v9, v9, v13
    add s6, s6, s10
    .word 44500311                      # vadd.vv v10, v10, v14
    add s7, s7, s11
    .word 45581783                      # vadd.vv v11, v11, v15
    add s8, s8, t0
    .word 776208983                     # vxor.vv v4, v4, v8
    xor s1, s1, s5
    .word 777290455                     # vxor.vv v5, v5, v9
    xor s2, s2, s6
    .word 778371927                     # vxor.vv v6, v6, v10
    xor s3, s3, s7
    .word 779453399                     # vxor.vv v7, v7, v11
    xor s4, s4, s8
    .word 1380758103                    # vror.vi v4, v4, 25
    .word 1637143707                    # roriw s1, s1, 25
    .word 1381806807                    # vror.vi v5, v5, 25
    .word 1637439771                    # roriw s2, s2, 25
    .word 1382855511                    # vror.vi v6, v6, 25
    .word 1637472667                    # roriw s3, s3, 25
    .word 1383904215                    # vror.vi v7, v7, 25
    .word 1637505563                    # roriw s4, s4, 25

    #### diagonal round: (3,4,9,14) (0,5,10,15) (1,6,11,12) (2,7,8,13)
    # a += b; d ^= a; d <<<= 16;
    .word 36831703                      # vadd.vv v3, v3, v4
    add s0, s0, s1
    .word 33718359                      # vadd.vv v0, v0, v5
    add a5, a5, s2
    .word 34799831                      # vadd.vv v1, v1, v6
    add a6, a6, s3
    .word 35881303                      # vadd.vv v2, v2, v7
    add a7, a7, s4
    .word 786532183                     # vxor.vv v14, v14, v3
    xor s11, s11, s0
    .word 787482583                     # vxor.vv v15, v15, v0
    xor t0, t0, a5
    .word 784369239                     # vxor.vv v12, v12, v1
    xor s9, s9, a6
    .word 785450711                     # vxor.vv v13, v13, v2
    xor s10, s10, a7
    .word 1390950231                    # vror.vi v14, v14, 16
    .word 1628298651                    # roriw s11, s11, 16
    .word 1391998935                    # vror.vi v15, v15, 16
    .word 1627574939                    # roriw t0, t0, 16
    .word 1388852823                    # vror.vi v12, v12, 16
    .word 1628232859                    # roriw s9, s9, 16
    .word 1389901527                    # vror.vi v13, v13, 16
    .word 1628265755                    # roriw s10, s10, 16
    # c += d; b ^= c; b <<<= 12;
    .word 43451607                      # vadd.vv v9, v9, v14
    add s6, s6, s11
    .word 44533079                      # vadd.vv v10, v10, v15
    add s7, s7, t0
    .word 45483479                      # vadd.vv v11, v11, v12
    add s8, s8, s9
    .word 42370135                      # vadd.vv v8, v8, v13
    add s5, s5, s10
    .word 776241751                     # vxor.vv v4, v4, v9
    xor s1, s1, s6
    .word 777323223                     # vxor.vv v5, v5, v10
    xor s2, s2, s7
    .word 778404695                     # vxor.vv v6, v6, v11
    xor s3, s3, s8
    .word 779355095                     # vxor.vv v7, v7, v8
    xor s4, s4, s5
    .word 1380594263                    # vror.vi v4, v4, 20
    .word 1631900827                    # roriw s1, s1, 20
    .word 1381642967                    # vror.vi v5, v5, 20
    .word 1632196891                    # roriw s2, s2, 20
    .word 1382691671                    # vror.vi v6, v6, 20
    .word 1632229787                    # roriw s3, s3, 20
    .word 1383740375                    # vror.vi v7, v7, 20
    .word 1632262683                    # roriw s4, s4, 20
    # a += b; d ^= a; d <<<= 8;
    .word 36831703                      # vadd.vv v3, v3, v4
    add s0, s0, s1
    .word 33718359                      # vadd.vv v0, v0, v5
    add a5, a5, s2
    .word 34799831                      # vadd.vv v1, v1, v6
    add a6, a6, s3
    .word 35881303                      # vadd.vv v2, v2, v7
    add a7, a7, s4
    .word 786532183                     # vxor.vv v14, v14, v3
    xor s11, s11, s0
    .word 787482583                     # vxor.vv v15, v15, v0
    xor t0, t0, a5
    .word 784369239                     # vxor.vv v12, v12, v1
    xor s9, s9, a6
    .word 785450711                     # vxor.vv v13, v13, v2
    xor s10, s10, a7
    .word 1391212375                    # vror.vi v14, v14, 24
    .word 1636687259                    # roriw s11, s11, 24
    .word 1392261079                    # vror.vi v15, v15, 24
    .word 1635963547                    # roriw t0, t0, 24
    .word 1389114967                    # vror.vi v12, v12, 24
    .word 1636621467                    # roriw s9, s9, 24
    .word 1390163671                    # vror.vi v13, v13, 24
    .word 1636654363                    # roriw s10, s10, 24
    # c += d; b ^= c; b <<<= 7;
    .word 43451607                      # vadd.vv v9, v9, v14
    add s6, s6, s11
    .word 44533079                      # vadd.vv v10, v10, v15
    add s7, s7, t0
    .word 45483479                      # vadd.vv v11, v11, v12
    add s8, s8, s9
    .word 42370135                      # vadd.vv v8, v8, v13
    add s5, s5, s10
    .word 776241751                     # vxor.vv v4, v4, v9
    xor s1, s1, s6
    .word 777323223                     # vxor.vv v5, v5, v10
    xor s2, s2, s7
    .word 778404695                     # vxor.vv v6, v6, v11
    xor s3, s3, s8
    .word 779355095                     # vxor.vv v7, v7, v8
    xor s4, s4, s5
    .word 1380758103                    # vror.vi v4, v4, 25
    .word 1637143707                    # roriw s1, s1, 25
    .word 1381806807                    # vror.vi v5, v5, 25
    .word 1637439771                    # roriw s2, s2, 25
    .word 1382855511                    # vror.vi v6, v6, 25
    .word 1637472667                    # roriw s3, s3, 25
    .word 1383904215                    # vror.vi v7, v7, 25
    .word 1637505563                    # roriw s4, s4, 25

    bnez t3, .Lround_loop

    li t3, 64
    # load the bottom-half of input data into v24~v31
    addi t4, a1, 32
    .word 3956206599                    # vlsseg8e32.v v24, (t4), t3

    # now, there are no free vector registers until the top-half of the
    # output has been stored.

    # add chacha top-half initial block states
    # "expa" little endian
    li t3, 0x61707865
    .word 34488407                      # vadd.vx v0, v0, t3
    add a5, a5, t3
    # "nd 3" little endian
    li t4, 0x3320646e
    .word 35569879                      # vadd.vx v1, v1, t4
    add a6, a6, t4
    lw t3, 0(a3)
    # "2-by" little endian
    li t5, 0x79622d32
    .word 36651351                      # vadd.vx v2, v2, t5
    add a7, a7, t5
    lw t4, 4(a3)
    # "te k" little endian
    li t6, 0x6b206574
    .word 37732823                      # vadd.vx v3, v3, t6
    add s0, s0, t6
    lw t5, 8(a3)
    .word 38683223                      # vadd.vx v4, v4, t3
    add s1, s1, t3
    lw t6, 12(a3)
    .word 39764695                      # vadd.vx v5, v5, t4
    add s2, s2, t4
    .word 40846167                      # vadd.vx v6, v6, t5
    add s3, s3, t5
    .word 41927639                      # vadd.vx v7, v7, t6
    add s4, s4, t6

    # xor with the top-half input; the scalar keystream words 0~7 are
    # spilled to the stack buffer in between
    .word 788531287                     # vxor.vv v16, v16, v0
    sw a5, 0(sp)
    sw a6, 4(sp)
    .word 789612759                     # vxor.vv v17, v17, v1
    sw a7, 8(sp)
    sw s0, 12(sp)
    .word 790694231                     # vxor.vv v18, v18, v2
    sw s1, 16(sp)
    sw s2, 20(sp)
    .word 791775703                     # vxor.vv v19, v19, v3
    sw s3, 24(sp)
    sw s4, 28(sp)
    .word 792857175                     # vxor.vv v20, v20, v4
    lw t3, 16(a3)
    .word 793938647                     # vxor.vv v21, v21, v5
    lw t4, 20(a3)
    .word 795020119                     # vxor.vv v22, v22, v6
    lw t5, 24(a3)
    .word 796101591                     # vxor.vv v23, v23, v7

    # save the top-half of output from v16~v23
    li t6, 64
    .word 3958728743                    # vssseg8e32.v v16, (a0), t6

    # add chacha bottom-half initial block states
    .word 42878039                      # vadd.vx v8, v8, t3
    add s5, s5, t3
    lw t6, 28(a3)
    .word 43959511                      # vadd.vx v9, v9, t4
    add s6, s6, t4
    lw t3, 4(a4)
    .word 45040983                      # vadd.vx v10, v10, t5
    add s7, s7, t5
    lw t4, 8(a4)
    .word 46122455                      # vadd.vx v11, v11, t6
    add s8, s8, t6
    lw t5, 12(a4)
    # v0 is free again: rebuild the per-lane counter offsets in it
    .word 1376297047                    # vid.v v0
    add s9, s9, t2
    .word 46384727                      # vadd.vx v12, v12, t2
    add s9, s9, t1
    .word 48121559                      # vadd.vx v13, v13, t3
    add s10, s10, t3
    .word 49203031                      # vadd.vx v14, v14, t4
    add s11, s11, t4
    .word 50284503                      # vadd.vx v15, v15, t5
    add t0, t0, t5
    .word 46138967                      # vadd.vv v12, v12, v0

    # xor with the bottom-half input; the scalar keystream words 8~15
    # are spilled to the stack buffer in between.  v29 is handled before
    # v28 because v12 is the last state word to be completed.
    .word 797183063                     # vxor.vv v24, v24, v8
    sw s5, 32(sp)
    .word 798264535                     # vxor.vv v25, v25, v9
    sw s6, 36(sp)
    .word 799346007                     # vxor.vv v26, v26, v10
    sw s7, 40(sp)
    .word 800427479                     # vxor.vv v27, v27, v11
    sw s8, 44(sp)
    .word 802590423                     # vxor.vv v29, v29, v13
    sw s9, 48(sp)
    .word 801508951                     # vxor.vv v28, v28, v12
    sw s10, 52(sp)
    .word 803671895                     # vxor.vv v30, v30, v14
    sw s11, 56(sp)
    .word 804753367                     # vxor.vv v31, v31, v15
    sw t0, 60(sp)

    # save the bottom-half of output from v24~v31
    li t3, 64
    addi t4, a0, 32
    .word 3956206631                    # vssseg8e32.v v24, (t4), t3

    # the computed vector parts: `64 * VL`
    slli t3, t1, 6

    add a1, a1, t3
    add a0, a0, t3
    sub a2, a2, t3
    add t2, t2, t1

    # process the scalar data block: t4 = min(remaining length, 64)
    addi t2, t2, 1
    li t3, 64
    .word 197549747                     # minu t4, a2, t3
    sub a2, a2, t4
    mv t5, sp                           # t5 walks the keystream buffer
.Lscalar_data_loop:
    .word 205452119                     # vsetvli t1, t4, e8, m8, ta, ma
    # from this on, vector registers are grouped with lmul = 8
    .word 33915911                      # vle8.v v8, (a1)
    .word 34539527                      # vle8.v v16, (t5)
    .word 780665943                     # vxor.vv v8, v8, v16
    .word 33883175                      # vse8.v v8, (a0)
    add a1, a1, t1
    add a0, a0, t1
    add t5, t5, t1
    sub t4, t4, t1
    bnez t4, .Lscalar_data_loop

    bnez a2, .Lblock_loop

    # Release the keystream buffer and restore s0~s11.
    addi sp, sp, 64
    ld s0, 0(sp)
    ld s1, 8(sp)
    ld s2, 16(sp)
    ld s3, 24(sp)
    ld s4, 32(sp)
    ld s5, 40(sp)
    ld s6, 48(sp)
    ld s7, 56(sp)
    ld s8, 64(sp)
    ld s9, 72(sp)
    ld s10, 80(sp)
    ld s11, 88(sp)
    addi sp, sp, 96

2:
    ret
.size ChaCha20_ctr32_v_zbb_zvkb,.-ChaCha20_ctr32_v_zbb_zvkb