Files
linux/lib/crc/x86/crc32.h
Eric Biggers 5ff74f5f71 lib/crc: Drop inline from all *_mod_init_arch() functions
Drop 'inline' from all the *_mod_init_arch() functions so that the
compiler will warn about any bugs where they are unused due to not being
wired up properly.  (There are no such bugs currently, so this just
establishes a more robust convention for the future.  Of course, these
functions also tend to get inlined anyway, regardless of the keyword.)

Link: https://lore.kernel.org/r/20250816020240.431545-1-ebiggers@kernel.org
Signed-off-by: Eric Biggers <ebiggers@kernel.org>
2025-08-15 19:06:08 -07:00

138 lines
4.3 KiB
C

// SPDX-License-Identifier: GPL-2.0-only
/*
* x86-optimized CRC32 functions
*
* Copyright (C) 2008 Intel Corporation
* Copyright 2012 Xyratex Technology Limited
* Copyright 2024 Google LLC
*/
#include "crc-pclmul-template.h"

/*
 * CPU feature flags, set once at boot by crc32_mod_init_arch() and read via
 * static branches on the fast paths:
 *  - have_crc32: CPU has the crc32 instruction (SSE4.2 / X86_FEATURE_XMM4_2)
 *  - have_pclmulqdq: CPU has carryless multiply (PCLMULQDQ)
 *  - have_vpclmul_avx512: VPCLMULQDQ with a usable AVX-512 implementation
 */
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_crc32);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_pclmulqdq);
static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_vpclmul_avx512);

/* Static-call dispatch point for the PCLMUL-based lsb-first CRC32 variants */
DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32);
/*
 * Little-endian CRC32 (polynomial 0xedb88320).
 *
 * NOTE(review): CRC_PCLMUL() is a macro from crc-pclmul-template.h; it
 * presumably handles the accelerated PCLMULQDQ path (and returns from this
 * function when that path is taken) — confirm against the template header.
 * If it falls through, the generic implementation finishes the job.
 */
static inline u32 crc32_le_arch(u32 crc, const u8 *p, size_t len)
{
	CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts,
		   have_pclmulqdq);
	return crc32_le_base(crc, p, len);
}
/*
 * Inline-asm template for one word-at-a-time crc32 step: 64-bit kernels
 * consume 8 bytes per crc32q, 32-bit kernels 4 bytes per crc32l.
 */
#ifdef CONFIG_X86_64
#define CRC32_INST "crc32q %1, %q0"
#else
#define CRC32_INST "crc32l %1, %0"
#endif

/*
 * Use carryless multiply version of crc32c when buffer size is >= 512 to
 * account for FPU state save/restore overhead.
 */
#define CRC32C_PCLMUL_BREAKEVEN 512

/* 3-stream interleaved crc32q implementation, written in assembly */
asmlinkage u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
/*
 * CRC32C (Castagnoli).  Strategy, in order of preference:
 *  1. No crc32 instruction -> generic table-based fallback.
 *  2. Long buffer on x86_64 with PCLMULQDQ and usable FPU -> vectorized
 *     multi-stream implementation (AVX-512 VPCLMULQDQ or 3-way asm).
 *  3. Otherwise -> a single inline stream of crc32 instructions.
 */
static inline u32 crc32c_arch(u32 crc, const u8 *p, size_t len)
{
	size_t num_longs;

	if (!static_branch_likely(&have_crc32))
		return crc32c_base(crc, p, len);

	if (IS_ENABLED(CONFIG_X86_64) && len >= CRC32C_PCLMUL_BREAKEVEN &&
	    static_branch_likely(&have_pclmulqdq) && likely(irq_fpu_usable())) {
		/*
		 * Long length, the vector registers are usable, and the CPU is
		 * 64-bit and supports both CRC32 and PCLMULQDQ instructions.
		 * It is worthwhile to divide the data into multiple streams,
		 * CRC them independently, and combine them using PCLMULQDQ.
		 * crc32c_x86_3way() does this using 3 streams, which is the
		 * most that x86_64 CPUs have traditionally been capable of.
		 *
		 * However, due to improved VPCLMULQDQ performance on newer
		 * CPUs, use crc32_lsb_vpclmul_avx512() instead of
		 * crc32c_x86_3way() when the CPU supports VPCLMULQDQ and has a
		 * "good" implementation of AVX-512.
		 *
		 * Future work: the optimal strategy on Zen 3--5 is actually to
		 * use both crc32q and VPCLMULQDQ in parallel. Unfortunately,
		 * different numbers of streams and vector lengths are optimal
		 * on each CPU microarchitecture, making it challenging to take
		 * advantage of this. (Zen 5 even supports 7 parallel crc32q, a
		 * major upgrade.) For now, just choose between
		 * crc32c_x86_3way() and crc32_lsb_vpclmul_avx512(). The latter
		 * is needed anyway for crc32_le(), so we just reuse it here.
		 */
		kernel_fpu_begin();
		if (static_branch_likely(&have_vpclmul_avx512))
			crc = crc32_lsb_vpclmul_avx512(crc, p, len,
				crc32_lsb_0x82f63b78_consts.fold_across_128_bits_consts);
		else
			crc = crc32c_x86_3way(crc, p, len);
		kernel_fpu_end();
		return crc;
	}

	/*
	 * Short length, XMM registers unusable, or the CPU is 32-bit; but the
	 * CPU supports CRC32 instructions. Just issue a single stream of CRC32
	 * instructions inline. While this doesn't use the CPU's CRC32
	 * throughput very well, it avoids the need to combine streams. Stream
	 * combination would be inefficient here.
	 */
	/* Word-sized chunks first... */
	for (num_longs = len / sizeof(unsigned long);
	     num_longs != 0; num_longs--, p += sizeof(unsigned long))
		asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p));

	/* ...then the 4-, 2-, and 1-byte tail (the 4-byte step only on 64-bit). */
	if (sizeof(unsigned long) > 4 && (len & 4)) {
		asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p));
		p += 4;
	}
	if (len & 2) {
		asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p));
		p += 2;
	}
	if (len & 1)
		asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p));
	return crc;
}
/* No x86-optimized big-endian CRC32; fall through to the generic version. */
#define crc32_be_arch crc32_be_base /* not implemented on this arch */

/* Signal to the generic lib/crc code that this arch has an init hook. */
#define crc32_mod_init_arch crc32_mod_init_arch
/*
 * Boot-time CPU feature detection: enable the static keys the fast paths
 * branch on, and retarget the crc32_lsb static call to the widest carryless
 * multiply implementation the CPU supports (AVX-512 > AVX2 > plain PCLMUL).
 */
static void crc32_mod_init_arch(void)
{
	if (boot_cpu_has(X86_FEATURE_XMM4_2))
		static_branch_enable(&have_crc32);

	if (!boot_cpu_has(X86_FEATURE_PCLMULQDQ))
		return;
	static_branch_enable(&have_pclmulqdq);

	if (!have_vpclmul())
		return;
	if (have_avx512()) {
		static_call_update(crc32_lsb_pclmul,
				   crc32_lsb_vpclmul_avx512);
		static_branch_enable(&have_vpclmul_avx512);
	} else {
		static_call_update(crc32_lsb_pclmul,
				   crc32_lsb_vpclmul_avx2);
	}
}
/*
 * Report which CRC32 variants have a hardware-accelerated implementation
 * enabled on this CPU, as a bitmask of CRC32*_OPTIMIZATION flags.
 */
static inline u32 crc32_optimizations_arch(void)
{
	return (static_key_enabled(&have_crc32) ?
			CRC32C_OPTIMIZATION : 0) |
	       (static_key_enabled(&have_pclmulqdq) ?
			CRC32_LE_OPTIMIZATION : 0);
}