lib/crypto: x86/blake2s: Improve readability

Various cleanups for readability.  No change to the generated code:

- Add some comments
- Add #defines for arguments
- Rename some labels
- Use decimal constants instead of hex where it makes sense.
  (The pshufd immediates intentionally remain as hex.)
- Add blank lines when there's a logical break

The round loop still could use some work, but this is at least a start.

Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20251102234209.62133-5-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
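
For reference, the per-block flow that the new comments describe, written out as a plain C sketch of BLAKE2s compression (per RFC 7693). This is illustrative only, not the kernel's implementation: the struct is trimmed to the three fields the assembly actually uses, the names blake2s_ctx_sketch, blake2s_compress_sketch, and g are made up here, message words are assumed little-endian, and the spec's sigma table is used directly (the assembly's .Lsigma/.Lsigma2 tables are instead pre-arranged to match its vector load order). The CTX/DATA/NBLOCKS/INC #defines in the diff simply name the registers that the x86-64 SysV calling convention assigns to the four arguments (%rdi, %rsi, %rdx, %ecx).

#include <stdint.h>
#include <stddef.h>
#include <string.h>

struct blake2s_ctx_sketch {	/* only these three fields are used */
	uint32_t h[8];		/* chaining value (inout) */
	uint32_t t[2];		/* 64-bit message counter (inout) */
	uint32_t f[2];		/* finalization flags (in) */
};

static const uint32_t blake2s_iv[8] = {
	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
};

static const uint8_t sigma[10][16] = {
	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
};

static inline uint32_t ror32(uint32_t x, unsigned int n)	/* 0 < n < 32 */
{
	return (x >> n) | (x << (32 - n));
}

/* The G function: the rotation amounts 16, 12, 8, 7 match the vprord
 * immediates in the AVX-512 code and the shift pairs in the SSSE3 code. */
static void g(uint32_t v[16], int a, int b, int c, int d,
	      uint32_t m0, uint32_t m1)
{
	v[a] += v[b] + m0;  v[d] = ror32(v[d] ^ v[a], 16);
	v[c] += v[d];       v[b] = ror32(v[b] ^ v[c], 12);
	v[a] += v[b] + m1;  v[d] = ror32(v[d] ^ v[a], 8);
	v[c] += v[d];       v[b] = ror32(v[b] ^ v[c], 7);
}

static void blake2s_compress_sketch(struct blake2s_ctx_sketch *ctx,
				    const uint8_t *data, size_t nblocks,
				    uint32_t inc)
{
	while (nblocks--) {		/* one 64-byte block per iteration */
		uint32_t m[16], v[16];

		ctx->t[0] += inc;	/* t += inc (64-bit addition) */
		ctx->t[1] += (ctx->t[0] < inc);

		memcpy(m, data, 64);	/* little-endian message words */
		memcpy(v, ctx->h, 32);			/* v[0..7] = h[0..7] */
		memcpy(v + 8, blake2s_iv, 16);		/* v[8..11] = iv[0..3] */
		v[12] = blake2s_iv[4] ^ ctx->t[0];	/* v[12..15] = iv[4..7] ^ [t, f] */
		v[13] = blake2s_iv[5] ^ ctx->t[1];
		v[14] = blake2s_iv[6] ^ ctx->f[0];
		v[15] = blake2s_iv[7] ^ ctx->f[1];

		for (int r = 0; r < 10; r++) {
			const uint8_t *s = sigma[r];

			/* columns */
			g(v, 0, 4,  8, 12, m[s[0]],  m[s[1]]);
			g(v, 1, 5,  9, 13, m[s[2]],  m[s[3]]);
			g(v, 2, 6, 10, 14, m[s[4]],  m[s[5]]);
			g(v, 3, 7, 11, 15, m[s[6]],  m[s[7]]);
			/* diagonals; the pshufd/vpshufd lane rotations in
			 * the asm re-align the rows for this half-round */
			g(v, 0, 5, 10, 15, m[s[8]],  m[s[9]]);
			g(v, 1, 6, 11, 12, m[s[10]], m[s[11]]);
			g(v, 2, 7,  8, 13, m[s[12]], m[s[13]]);
			g(v, 3, 4,  9, 14, m[s[14]], m[s[15]]);
		}

		for (int i = 0; i < 8; i++)
			ctx->h[i] ^= v[i] ^ v[i + 8];	/* h ^= v[0..7] ^ v[8..15] */
		data += 64;
	}
}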
@@ -50,34 +50,52 @@
 	.byte 15, 5, 4, 13, 10, 7, 3, 11, 12, 2, 0, 6, 9, 8, 1, 14
 	.byte 8, 7, 14, 11, 13, 15, 0, 12, 10, 4, 5, 6, 3, 2, 1, 9
 
+#define CTX	%rdi
+#define DATA	%rsi
+#define NBLOCKS	%rdx
+#define INC	%ecx
+
 .text
+
+//
+// void blake2s_compress_ssse3(struct blake2s_ctx *ctx,
+//			       const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+//	u32 h[8];	(inout)
+//	u32 t[2];	(inout)
+//	u32 f[2];	(in)
+//
 SYM_FUNC_START(blake2s_compress_ssse3)
-	movdqu (%rdi),%xmm0
-	movdqu 0x10(%rdi),%xmm1
+	movdqu (CTX),%xmm0		// Load h[0..3]
+	movdqu 16(CTX),%xmm1		// Load h[4..7]
 	movdqa .Lror16(%rip),%xmm12
 	movdqa .Lror8(%rip),%xmm13
-	movdqu 0x20(%rdi),%xmm14
-	movd %ecx,%xmm15
-	leaq .Lsigma+0xa0(%rip),%r8
-	jmp .Lbeginofloop
+	movdqu 32(CTX),%xmm14		// Load t and f
+	movd INC,%xmm15			// Load inc
+	leaq .Lsigma+160(%rip),%r8
+	jmp .Lssse3_mainloop
+
 	.align 32
-.Lbeginofloop:
-	movdqa %xmm0,%xmm10
-	movdqa %xmm1,%xmm11
-	paddq %xmm15,%xmm14
-	movdqa .Liv(%rip),%xmm2
+.Lssse3_mainloop:
+	// Main loop: each iteration processes one 64-byte block.
+	movdqa %xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
+	movdqa %xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
+	paddq %xmm15,%xmm14		// t += inc (64-bit addition)
+	movdqa .Liv(%rip),%xmm2		// v[8..11] = iv[0..3]
 	movdqa %xmm14,%xmm3
-	pxor .Liv+0x10(%rip),%xmm3
+	pxor .Liv+16(%rip),%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
 	leaq .Lsigma(%rip),%rcx
-.Lroundloop:
+.Lssse3_roundloop:
+	// Round loop: each iteration does 1 round (of 10 rounds total).
 	movzbl (%rcx),%eax
-	movd (%rsi,%rax,4),%xmm4
-	movzbl 0x1(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm5
-	movzbl 0x2(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm6
-	movzbl 0x3(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm7
+	movd (DATA,%rax,4),%xmm4
+	movzbl 1(%rcx),%eax
+	movd (DATA,%rax,4),%xmm5
+	movzbl 2(%rcx),%eax
+	movd (DATA,%rax,4),%xmm6
+	movzbl 3(%rcx),%eax
+	movd (DATA,%rax,4),%xmm7
 	punpckldq %xmm5,%xmm4
 	punpckldq %xmm7,%xmm6
 	punpcklqdq %xmm6,%xmm4
@@ -88,17 +106,17 @@ SYM_FUNC_START(blake2s_compress_ssse3)
 	paddd %xmm3,%xmm2
 	pxor %xmm2,%xmm1
 	movdqa %xmm1,%xmm8
-	psrld $0xc,%xmm1
-	pslld $0x14,%xmm8
+	psrld $12,%xmm1
+	pslld $20,%xmm8
 	por %xmm8,%xmm1
-	movzbl 0x4(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm5
-	movzbl 0x5(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm6
-	movzbl 0x6(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm7
-	movzbl 0x7(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm4
+	movzbl 4(%rcx),%eax
+	movd (DATA,%rax,4),%xmm5
+	movzbl 5(%rcx),%eax
+	movd (DATA,%rax,4),%xmm6
+	movzbl 6(%rcx),%eax
+	movd (DATA,%rax,4),%xmm7
+	movzbl 7(%rcx),%eax
+	movd (DATA,%rax,4),%xmm4
 	punpckldq %xmm6,%xmm5
 	punpckldq %xmm4,%xmm7
 	punpcklqdq %xmm7,%xmm5
@@ -109,20 +127,20 @@ SYM_FUNC_START(blake2s_compress_ssse3)
 	paddd %xmm3,%xmm2
 	pxor %xmm2,%xmm1
 	movdqa %xmm1,%xmm8
-	psrld $0x7,%xmm1
-	pslld $0x19,%xmm8
+	psrld $7,%xmm1
+	pslld $25,%xmm8
 	por %xmm8,%xmm1
 	pshufd $0x93,%xmm0,%xmm0
 	pshufd $0x4e,%xmm3,%xmm3
 	pshufd $0x39,%xmm2,%xmm2
-	movzbl 0x8(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm6
-	movzbl 0x9(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm7
-	movzbl 0xa(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm4
-	movzbl 0xb(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm5
+	movzbl 8(%rcx),%eax
+	movd (DATA,%rax,4),%xmm6
+	movzbl 9(%rcx),%eax
+	movd (DATA,%rax,4),%xmm7
+	movzbl 10(%rcx),%eax
+	movd (DATA,%rax,4),%xmm4
+	movzbl 11(%rcx),%eax
+	movd (DATA,%rax,4),%xmm5
 	punpckldq %xmm7,%xmm6
 	punpckldq %xmm5,%xmm4
 	punpcklqdq %xmm4,%xmm6
@@ -133,17 +151,17 @@ SYM_FUNC_START(blake2s_compress_ssse3)
 	paddd %xmm3,%xmm2
 	pxor %xmm2,%xmm1
 	movdqa %xmm1,%xmm8
-	psrld $0xc,%xmm1
-	pslld $0x14,%xmm8
+	psrld $12,%xmm1
+	pslld $20,%xmm8
 	por %xmm8,%xmm1
-	movzbl 0xc(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm7
-	movzbl 0xd(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm4
-	movzbl 0xe(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm5
-	movzbl 0xf(%rcx),%eax
-	movd (%rsi,%rax,4),%xmm6
+	movzbl 12(%rcx),%eax
+	movd (DATA,%rax,4),%xmm7
+	movzbl 13(%rcx),%eax
+	movd (DATA,%rax,4),%xmm4
+	movzbl 14(%rcx),%eax
+	movd (DATA,%rax,4),%xmm5
+	movzbl 15(%rcx),%eax
+	movd (DATA,%rax,4),%xmm6
 	punpckldq %xmm4,%xmm7
 	punpckldq %xmm6,%xmm5
 	punpcklqdq %xmm5,%xmm7
@@ -154,52 +172,68 @@ SYM_FUNC_START(blake2s_compress_ssse3)
 	paddd %xmm3,%xmm2
 	pxor %xmm2,%xmm1
 	movdqa %xmm1,%xmm8
-	psrld $0x7,%xmm1
-	pslld $0x19,%xmm8
+	psrld $7,%xmm1
+	pslld $25,%xmm8
 	por %xmm8,%xmm1
 	pshufd $0x39,%xmm0,%xmm0
 	pshufd $0x4e,%xmm3,%xmm3
 	pshufd $0x93,%xmm2,%xmm2
-	addq $0x10,%rcx
+	addq $16,%rcx
 	cmpq %r8,%rcx
-	jnz .Lroundloop
+	jnz .Lssse3_roundloop
+
+	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
 	pxor %xmm2,%xmm0
 	pxor %xmm3,%xmm1
 	pxor %xmm10,%xmm0
 	pxor %xmm11,%xmm1
-	addq $0x40,%rsi
-	decq %rdx
-	jnz .Lbeginofloop
-	movdqu %xmm0,(%rdi)
-	movdqu %xmm1,0x10(%rdi)
-	movdqu %xmm14,0x20(%rdi)
+	addq $64,DATA
+	decq NBLOCKS
+	jnz .Lssse3_mainloop
+
+	movdqu %xmm0,(CTX)		// Store new h[0..3]
+	movdqu %xmm1,16(CTX)		// Store new h[4..7]
+	movdqu %xmm14,32(CTX)		// Store new t and f
 	RET
 SYM_FUNC_END(blake2s_compress_ssse3)
 
+//
+// void blake2s_compress_avx512(struct blake2s_ctx *ctx,
+//				const u8 *data, size_t nblocks, u32 inc);
+//
+// Only the first three fields of struct blake2s_ctx are used:
+//	u32 h[8];	(inout)
+//	u32 t[2];	(inout)
+//	u32 f[2];	(in)
+//
 SYM_FUNC_START(blake2s_compress_avx512)
-	vmovdqu (%rdi),%xmm0
-	vmovdqu 0x10(%rdi),%xmm1
-	vmovdqu 0x20(%rdi),%xmm4
-	vmovd %ecx,%xmm5
-	vmovdqa .Liv(%rip),%xmm14
-	vmovdqa .Liv+16(%rip),%xmm15
-	jmp .Lblake2s_compress_avx512_mainloop
-	.align 32
-.Lblake2s_compress_avx512_mainloop:
-	vmovdqa %xmm0,%xmm10
-	vmovdqa %xmm1,%xmm11
-	vpaddq %xmm5,%xmm4,%xmm4
-	vmovdqa %xmm14,%xmm2
-	vpxor %xmm15,%xmm4,%xmm3
-	vmovdqu (%rsi),%ymm6
-	vmovdqu 0x20(%rsi),%ymm7
-	addq $0x40,%rsi
+	vmovdqu (CTX),%xmm0		// Load h[0..3]
+	vmovdqu 16(CTX),%xmm1		// Load h[4..7]
+	vmovdqu 32(CTX),%xmm4		// Load t and f
+	vmovd INC,%xmm5			// Load inc
+	vmovdqa .Liv(%rip),%xmm14	// Load iv[0..3]
+	vmovdqa .Liv+16(%rip),%xmm15	// Load iv[4..7]
+	jmp .Lavx512_mainloop
+
+	.align 32
+.Lavx512_mainloop:
+	// Main loop: each iteration processes one 64-byte block.
+	vmovdqa %xmm0,%xmm10		// Save h[0..3] and let v[0..3] = h[0..3]
+	vmovdqa %xmm1,%xmm11		// Save h[4..7] and let v[4..7] = h[4..7]
+	vpaddq %xmm5,%xmm4,%xmm4	// t += inc (64-bit addition)
+	vmovdqa %xmm14,%xmm2		// v[8..11] = iv[0..3]
+	vpxor %xmm15,%xmm4,%xmm3	// v[12..15] = iv[4..7] ^ [t, f]
+	vmovdqu (DATA),%ymm6		// Load first 8 data words
+	vmovdqu 32(DATA),%ymm7		// Load second 8 data words
+	addq $64,DATA
+
 	leaq .Lsigma2(%rip),%rax
-	movb $0xa,%cl
-.Lblake2s_compress_avx512_roundloop:
+	movb $10,%cl			// Set num rounds remaining
+.Lavx512_roundloop:
+	// Round loop: each iteration does 1 round (of 10 rounds total).
 	vpmovzxbd (%rax),%ymm8
-	vpmovzxbd 0x8(%rax),%ymm9
-	addq $0x10,%rax
+	vpmovzxbd 8(%rax),%ymm9
+	addq $16,%rax
 	vpermi2d %ymm7,%ymm6,%ymm8
 	vpermi2d %ymm7,%ymm6,%ymm9
 	vmovdqa %ymm8,%ymm6
@@ -207,50 +241,53 @@ SYM_FUNC_START(blake2s_compress_avx512)
 	vpaddd %xmm8,%xmm0,%xmm0
 	vpaddd %xmm1,%xmm0,%xmm0
 	vpxor %xmm0,%xmm3,%xmm3
-	vprord $0x10,%xmm3,%xmm3
+	vprord $16,%xmm3,%xmm3
 	vpaddd %xmm3,%xmm2,%xmm2
 	vpxor %xmm2,%xmm1,%xmm1
-	vprord $0xc,%xmm1,%xmm1
-	vextracti128 $0x1,%ymm8,%xmm8
+	vprord $12,%xmm1,%xmm1
+	vextracti128 $1,%ymm8,%xmm8
 	vpaddd %xmm8,%xmm0,%xmm0
 	vpaddd %xmm1,%xmm0,%xmm0
 	vpxor %xmm0,%xmm3,%xmm3
-	vprord $0x8,%xmm3,%xmm3
+	vprord $8,%xmm3,%xmm3
 	vpaddd %xmm3,%xmm2,%xmm2
 	vpxor %xmm2,%xmm1,%xmm1
-	vprord $0x7,%xmm1,%xmm1
+	vprord $7,%xmm1,%xmm1
 	vpshufd $0x93,%xmm0,%xmm0
 	vpshufd $0x4e,%xmm3,%xmm3
 	vpshufd $0x39,%xmm2,%xmm2
 	vpaddd %xmm9,%xmm0,%xmm0
 	vpaddd %xmm1,%xmm0,%xmm0
 	vpxor %xmm0,%xmm3,%xmm3
-	vprord $0x10,%xmm3,%xmm3
+	vprord $16,%xmm3,%xmm3
 	vpaddd %xmm3,%xmm2,%xmm2
 	vpxor %xmm2,%xmm1,%xmm1
-	vprord $0xc,%xmm1,%xmm1
-	vextracti128 $0x1,%ymm9,%xmm9
+	vprord $12,%xmm1,%xmm1
+	vextracti128 $1,%ymm9,%xmm9
 	vpaddd %xmm9,%xmm0,%xmm0
 	vpaddd %xmm1,%xmm0,%xmm0
 	vpxor %xmm0,%xmm3,%xmm3
-	vprord $0x8,%xmm3,%xmm3
+	vprord $8,%xmm3,%xmm3
 	vpaddd %xmm3,%xmm2,%xmm2
 	vpxor %xmm2,%xmm1,%xmm1
-	vprord $0x7,%xmm1,%xmm1
+	vprord $7,%xmm1,%xmm1
 	vpshufd $0x39,%xmm0,%xmm0
 	vpshufd $0x4e,%xmm3,%xmm3
 	vpshufd $0x93,%xmm2,%xmm2
 	decb %cl
-	jne .Lblake2s_compress_avx512_roundloop
+	jne .Lavx512_roundloop
+
+	// Compute the new h: h[0..7] ^= v[0..7] ^ v[8..15]
 	vpxor %xmm10,%xmm0,%xmm0
 	vpxor %xmm11,%xmm1,%xmm1
 	vpxor %xmm2,%xmm0,%xmm0
 	vpxor %xmm3,%xmm1,%xmm1
-	decq %rdx
-	jne .Lblake2s_compress_avx512_mainloop
-	vmovdqu %xmm0,(%rdi)
-	vmovdqu %xmm1,0x10(%rdi)
-	vmovdqu %xmm4,0x20(%rdi)
+	decq NBLOCKS
+	jne .Lavx512_mainloop
+
+	vmovdqu %xmm0,(CTX)		// Store new h[0..3]
+	vmovdqu %xmm1,16(CTX)		// Store new h[4..7]
+	vmovdqu %xmm4,32(CTX)		// Store new t and f
 	vzeroupper
 	RET
 SYM_FUNC_END(blake2s_compress_avx512)
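
A note on the constants that the hex-to-decimal change makes more legible: the G function's four rotations are by 16, 12, 8, and 7 bits. The AVX-512 code rotates directly with vprord, while SSSE3 has no 32-bit rotate instruction, so the shift pairs above synthesize it (and, going by the names, the .Lror16/.Lror8 constants loaded at function entry are presumably byte-shuffle masks for the 16- and 8-bit rotates). A minimal check of the shift-pair identity, per 32-bit lane:

#include <assert.h>
#include <stdint.h>

static inline uint32_t ror32(uint32_t x, unsigned int n)	/* 0 < n < 32 */
{
	return (x >> n) | (x << (32 - n));
}

int main(void)
{
	uint32_t x = 0xDEADBEEF;

	/* psrld $12; pslld $20; por  ==  ror32(x, 12), since 20 == 32 - 12 */
	assert(((x >> 12) | (x << 20)) == ror32(x, 12));
	/* psrld $7; pslld $25; por   ==  ror32(x, 7),  since 25 == 32 - 7 */
	assert(((x >> 7) | (x << 25)) == ror32(x, 7));
	return 0;
}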