linux/arch/x86/include/asm/div64.h
David Laight 630f96a687 lib: mul_u64_u64_div_u64(): optimise multiply on 32bit x86
gcc generates horrid code for both ((u64)u32_a * u32_b) and (u64_a +
u32_b).  As well as the extra instructions it can generate a lot of spills
to stack (including spills of constant zeros and even multiplies by
constant zero).

mul_u32_u32() already exists to optimise the multiply.  Add a similar
add_u64_u32() for the addition.  Disable both for clang - it generates
better code without them.

Move the 64x64 => 128 multiply into a static inline helper function for
code clarity.  No need for the a/b_hi/lo variables; the implicit casts on
the function calls do the work for us.  Should have minimal effect on the
generated code.

Use mul_u32_u32() and add_u64_u32() in the 64x64 => 128 multiply in
mul_u64_add_u64_div_u64().
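For illustration only (a sketch, not the actual lib/math/div64.c code), the
64x64 => 128 multiply built from mul_u32_u32() and add_u64_u32() looks
roughly like the following; the helper name and carry handling are
illustrative:

	/*
	 * Sketch: rely on the implicit u64 -> u32 casts at the call sites
	 * to pick out the 32-bit halves, then accumulate the four partial
	 * products.  mul_u64_u64_to_128() is an illustrative name only.
	 */
	static inline void mul_u64_u64_to_128(u64 a, u64 b, u64 *lo, u64 *hi)
	{
		u64 ll = mul_u32_u32(a, b);		/* a_lo * b_lo */
		u64 lh = mul_u32_u32(a, b >> 32);	/* a_lo * b_hi */
		u64 hl = mul_u32_u32(a >> 32, b);	/* a_hi * b_lo */
		u64 hh = mul_u32_u32(a >> 32, b >> 32);	/* a_hi * b_hi */

		lh = add_u64_u32(lh, ll >> 32);	/* cannot overflow */
		lh += hl;			/* may wrap past 2^64 */
		hh += (u64)(lh < hl) << 32;	/* fold the carry back in */

		*lo = lh << 32 | (u32)ll;
		*hi = hh + (lh >> 32);
	}

Relying on the implicit casts is what removes the separate a/b_hi/lo
temporaries mentioned above.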

Link: https://lkml.kernel.org/r/20251105201035.64043-8-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-11-20 14:03:42 -08:00


/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_DIV64_H
#define _ASM_X86_DIV64_H

#ifdef CONFIG_X86_32

#include <linux/types.h>
#include <linux/log2.h>

/*
 * do_div() is NOT a C function. It wants to return
 * two values (the quotient and the remainder), but
 * since that doesn't work very well in C, what it
 * does is:
 *
 * - modifies the 64-bit dividend _in_place_
 * - returns the 32-bit remainder
 *
 * This ends up being the most efficient "calling
 * convention" on x86.
 */
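/*
 * The slow path below is a long division in base 2^32: divide the high
 * word first, then feed its remainder together with the low word to a
 * second divl, so each 32-bit quotient is guaranteed to fit.
 */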
#define do_div(n, base)						\
({								\
	unsigned long __upper, __low, __high, __mod, __base;	\
	__base = (base);					\
	if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \
		__mod = n & (__base - 1);			\
		n >>= ilog2(__base);				\
	} else {						\
		asm("" : "=a" (__low), "=d" (__high) : "A" (n));\
		__upper = __high;				\
		if (__high) {					\
			__upper = __high % (__base);		\
			__high = __high / (__base);		\
		}						\
		asm("divl %2" : "=a" (__low), "=d" (__mod)	\
			: "rm" (__base), "0" (__low), "1" (__upper)); \
		asm("" : "=A" (n) : "a" (__low), "d" (__high));	\
	}							\
	__mod;							\
})
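/*
 * Same two-step divide as the do_div() slow path above; the union gives
 * direct access to the 32-bit halves of the dividend (v32[1] is the
 * high word on little-endian x86).
 */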
static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
{
	union {
		u64 v64;
		u32 v32[2];
	} d = { dividend };
	u32 upper;

	upper = d.v32[1];
	d.v32[1] = 0;
	if (upper >= divisor) {
		d.v32[1] = upper / divisor;
		upper %= divisor;
	}
	asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
		"rm" (divisor), "0" (d.v32[0]), "1" (upper));
	return d.v64;
}
#define div_u64_rem div_u64_rem
/*
 * gcc tends to zero extend 32bit values and do full 64bit maths.
 * Define asm functions that avoid this.
 * (clang generates better code for the C versions.)
 */
#ifndef __clang__
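/*
 * mull multiplies %eax by its operand and leaves the 64-bit product
 * in %edx:%eax.
 */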
static inline u64 mul_u32_u32(u32 a, u32 b)
{
	u32 high, low;

	asm ("mull %[b]" : "=a" (low), "=d" (high)
			 : [a] "a" (a), [b] "rm" (b) );

	return low | ((u64)high) << 32;
}
#define mul_u32_u32 mul_u32_u32
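/*
 * 64 + 32 bit add: addl the low word, then adcl propagates the carry
 * into the high word.
 */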
static inline u64 add_u64_u32(u64 a, u32 b)
{
	u32 high = a >> 32, low = a;

	asm ("addl %[b], %[low]; adcl $0, %[high]"
		: [low] "+r" (low), [high] "+r" (high)
		: [b] "rm" (b) );

	return low | (u64)high << 32;
}
#define add_u64_u32 add_u64_u32
#endif
/*
 * __div64_32() is never called on x86, so prevent the
 * generic definition from getting built.
 */
#define __div64_32
#else
# include <asm-generic/div64.h>
/*
 * Will generate an #DE when the result doesn't fit u64, could fix with an
 * __ex_table[] entry when it becomes an issue.
 */
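/*
 * mulq leaves the 128-bit product in %rdx:%rax, the optional addq/adcq
 * pair folds 'add' into it, and divq divides %rdx:%rax by 'div'.
 */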
static inline u64 mul_u64_add_u64_div_u64(u64 rax, u64 mul, u64 add, u64 div)
{
	u64 rdx;

	asm ("mulq %[mul]" : "+a" (rax), "=d" (rdx) : [mul] "rm" (mul));
	if (!statically_true(!add))
		asm ("addq %[add], %[lo]; adcq $0, %[hi]" :
			[lo] "+r" (rax), [hi] "+r" (rdx) : [add] "irm" (add));
	asm ("divq %[div]" : "+a" (rax), "+d" (rdx) : [div] "rm" (div));
	return rax;
}
#define mul_u64_add_u64_div_u64 mul_u64_add_u64_div_u64
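/* The add == 0 special case of the helper above. */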
static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 div)
{
	return mul_u64_add_u64_div_u64(a, mul, 0, div);
}
#define mul_u64_u32_div mul_u64_u32_div
#endif /* CONFIG_X86_32 */
#endif /* _ASM_X86_DIV64_H */