Replace the bit-by-bit algorithm with one that generates 16 bits per
iteration on 32bit architectures and 32 bits on 64bit ones.
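For context, mul_u64_add_u64_div_u64(a, b, c, d) returns (a * b + c) / d
computed with a full 128-bit intermediate product. A hypothetical caller
(not part of this patch) might use it to rescale a counter without the
intermediate overflowing 64 bits:

	/* Hypothetical example: convert 'cycles' at 'rate' Hz to nanoseconds,
	 * rounding to nearest; the 128-bit intermediate avoids overflow. */
	static u64 cycles_to_ns(u64 cycles, u64 rate)
	{
		return mul_u64_add_u64_div_u64(cycles, NSEC_PER_SEC, rate / 2, rate);
	}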
On my zen 5 this reduces the time for the tests (using the generic code)
from ~3350ns to ~1000ns.
Running the 32bit algorithm on 64bit x86 takes ~1500ns. It'll be slightly
slower on a real 32bit system, mostly due to register pressure.
The savings for 32bit x86 are much higher (tested in userspace). The
worst case (lots of bits in the quotient) drops from ~900 clocks to ~130
(pretty much independent of the arguments). Other 32bit architectures may
see better savings.
It is possible to optimise for divisors that span less than
__LONG_WIDTH__/2 bits. However I suspect they don't happen that often and
it doesn't remove any slow cpu divide instructions which dominate the
result.
Typical improvements for 64bit random divides:
                 old    new
  sandy bridge:  470    150
  haswell:       400    144
  piledriver:    960    467   (I think rdpmc is very slow)
  zen5:          244     80
(Timing is 'rdpmc; mul_div(); rdpmc' with the multiply depending on the
first rdpmc and the second rdpmc depending on the quotient.)
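As a rough illustration only (not the exact harness used for the numbers
above), a 64bit x86 userspace measurement with that dependency chaining
might look like the sketch below. It assumes the generic
mul_u64_add_u64_div_u64() and the kernel-style u32/u64 typedefs are
available in the test build, that a performance counter has already been
programmed, and that userspace rdpmc is permitted.

	/* Sketch only: force data dependencies so the divide cannot start
	 * before the first counter read, nor the second read before the
	 * quotient is available. */
	static inline u64 rdpmc(u32 counter)
	{
		u32 lo, hi;

		asm volatile("rdpmc" : "=a" (lo), "=d" (hi) : "c" (counter));
		return (u64)hi << 32 | lo;
	}

	static inline u64 depend_on(u64 x, u64 y)
	{
		/* 'and $0' zeroes 'y' but keeps a real register dependency on it,
		 * so the 'add' ties 'x' to 'y' without changing its value. */
		asm("and $0, %1; add %1, %0" : "+r" (x), "+r" (y));
		return x;
	}

	static u64 time_one(u64 a, u64 b, u64 c, u64 d)
	{
		u64 start, end, q;

		start = rdpmc(0);
		q = mul_u64_add_u64_div_u64(a, b, depend_on(c, start), d);
		end = rdpmc(depend_on(0, q));
		return end - start;
	}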
Object code size (64bit x86 test program): old 0x173, new 0x141.
Link: https://lkml.kernel.org/r/20251105201035.64043-9-david.laight.linux@gmail.com
Signed-off-by: David Laight <david.laight.linux@gmail.com>
Reviewed-by: Nicolas Pitre <npitre@baylibre.com>
Cc: Biju Das <biju.das.jz@bp.renesas.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Li RongQing <lirongqing@baidu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Uwe Kleine-König <u.kleine-koenig@baylibre.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2003 Bernardo Innocenti <bernie@develer.com>
 *
 * Based on former do_div() implementation from asm-parisc/div64.h:
 *	Copyright (C) 1999 Hewlett-Packard Co
 *	Copyright (C) 1999 David Mosberger-Tang <davidm@hpl.hp.com>
 *
 *
 * Generic C version of 64bit/32bit division and modulo, with
 * 64bit result and 32bit remainder.
 *
 * The fast case for (n>>32 == 0) is handled inline by do_div().
 *
 * Code generated for this function might be very inefficient
 * for some CPUs. __div64_32() can be overridden by linking arch-specific
 * assembly versions such as arch/ppc/lib/div64.S and arch/sh/lib/div64.S
 * or by defining a preprocessor macro in arch/include/asm/div64.h.
 */

#include <linux/bitops.h>
#include <linux/export.h>
#include <linux/math.h>
#include <linux/math64.h>
#include <linux/minmax.h>
#include <linux/log2.h>

/* Not needed on 64bit architectures */
#if BITS_PER_LONG == 32

#ifndef __div64_32
uint32_t __attribute__((weak)) __div64_32(uint64_t *n, uint32_t base)
{
	uint64_t rem = *n;
	uint64_t b = base;
	uint64_t res, d = 1;
	uint32_t high = rem >> 32;

	/* Reduce the thing a bit first */
	res = 0;
	if (high >= base) {
		high /= base;
		res = (uint64_t) high << 32;
		rem -= (uint64_t) (high*base) << 32;
	}

	while ((int64_t)b > 0 && b < rem) {
		b = b+b;
		d = d+d;
	}

	do {
		if (rem >= b) {
			rem -= b;
			res += d;
		}
		b >>= 1;
		d >>= 1;
	} while (d);

	*n = res;
	return rem;
}
EXPORT_SYMBOL(__div64_32);
#endif

#ifndef div_s64_rem
s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder)
{
	u64 quotient;

	if (dividend < 0) {
		quotient = div_u64_rem(-dividend, abs(divisor), (u32 *)remainder);
		*remainder = -*remainder;
		if (divisor > 0)
			quotient = -quotient;
	} else {
		quotient = div_u64_rem(dividend, abs(divisor), (u32 *)remainder);
		if (divisor < 0)
			quotient = -quotient;
	}
	return quotient;
}
EXPORT_SYMBOL(div_s64_rem);
#endif

/*
 * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder
 * @dividend:	64bit dividend
 * @divisor:	64bit divisor
 * @remainder:	64bit remainder
 *
 * This implementation is comparable to the algorithm used by div64_u64.
 * But this operation, which includes math for calculating the remainder,
 * is kept distinct to avoid slowing down the div64_u64 operation on 32bit
 * systems.
 */
#ifndef div64_u64_rem
u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		u32 rem32;
		quot = div_u64_rem(dividend, divisor, &rem32);
		*remainder = rem32;
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;

		*remainder = dividend - quot * divisor;
		if (*remainder >= divisor) {
			quot++;
			*remainder -= divisor;
		}
	}

	return quot;
}
EXPORT_SYMBOL(div64_u64_rem);
#endif

/*
 * div64_u64 - unsigned 64bit divide with 64bit divisor
 * @dividend:	64bit dividend
 * @divisor:	64bit divisor
 *
 * This implementation is a modified version of the algorithm proposed
 * by the book 'Hacker's Delight'. The original source and full proof
 * can be found here and is available for use without restriction.
 *
 * 'http://www.hackersdelight.org/hdcodetxt/divDouble.c.txt'
 */
#ifndef div64_u64
u64 div64_u64(u64 dividend, u64 divisor)
{
	u32 high = divisor >> 32;
	u64 quot;

	if (high == 0) {
		quot = div_u64(dividend, divisor);
	} else {
		int n = fls(high);
		quot = div_u64(dividend >> n, divisor >> n);

		if (quot != 0)
			quot--;
		if ((dividend - quot * divisor) >= divisor)
			quot++;
	}

	return quot;
}
EXPORT_SYMBOL(div64_u64);
#endif

#ifndef div64_s64
s64 div64_s64(s64 dividend, s64 divisor)
{
	s64 quot, t;

	quot = div64_u64(abs(dividend), abs(divisor));
	t = (dividend ^ divisor) >> 63;

	return (quot ^ t) - t;
}
EXPORT_SYMBOL(div64_s64);
#endif

#endif /* BITS_PER_LONG == 32 */

/*
 * Iterative div/mod for use when dividend is not expected to be much
 * bigger than divisor.
 */
#ifndef iter_div_u64_rem
u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder)
{
	return __iter_div_u64_rem(dividend, divisor, remainder);
}
EXPORT_SYMBOL(iter_div_u64_rem);
#endif

#if !defined(mul_u64_add_u64_div_u64) || defined(test_mul_u64_add_u64_div_u64)

#define mul_add(a, b, c) add_u64_u32(mul_u32_u32(a, b), c)

#if defined(__SIZEOF_INT128__) && !defined(test_mul_u64_add_u64_div_u64)
static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
{
	/* native 64x64=128 bits multiplication */
	u128 prod = (u128)a * b + c;

	*p_lo = prod;
	return prod >> 64;
}
#else
static inline u64 mul_u64_u64_add_u64(u64 *p_lo, u64 a, u64 b, u64 c)
{
	/* perform a 64x64=128 bits multiplication in 32bit chunks */
	u64 x, y, z;

	/* Since (x-1)(x-1) + 2(x-1) == x.x - 1 two u32 can be added to a u64 */
	x = mul_add(a, b, c);
	y = mul_add(a, b >> 32, c >> 32);
	y = add_u64_u32(y, x >> 32);
	z = mul_add(a >> 32, b >> 32, y >> 32);
	y = mul_add(a >> 32, b, y);
	*p_lo = (y << 32) + (u32)x;
	return add_u64_u32(z, y >> 32);
}
#endif

#ifndef BITS_PER_ITER
#define BITS_PER_ITER (__LONG_WIDTH__ >= 64 ? 32 : 16)
#endif

#if BITS_PER_ITER == 32
#define mul_u64_long_add_u64(p_lo, a, b, c) mul_u64_u64_add_u64(p_lo, a, b, c)
#define add_u64_long(a, b) ((a) + (b))
#else
#undef BITS_PER_ITER
#define BITS_PER_ITER 16
static inline u32 mul_u64_long_add_u64(u64 *p_lo, u64 a, u32 b, u64 c)
{
	u64 n_lo = mul_add(a, b, c);
	u64 n_med = mul_add(a >> 32, b, c >> 32);

	n_med = add_u64_u32(n_med, n_lo >> 32);
	*p_lo = n_med << 32 | (u32)n_lo;
	return n_med >> 32;
}

#define add_u64_long(a, b) add_u64_u32(a, b)
#endif

u64 mul_u64_add_u64_div_u64(u64 a, u64 b, u64 c, u64 d)
{
	unsigned long d_msig, q_digit;
	unsigned int reps, d_z_hi;
	u64 quotient, n_lo, n_hi;
	u32 overflow;

	n_hi = mul_u64_u64_add_u64(&n_lo, a, b, c);

	if (!n_hi)
		return div64_u64(n_lo, d);

	if (unlikely(n_hi >= d)) {
		/* trigger runtime exception if divisor is zero */
		if (d == 0) {
			unsigned long zero = 0;

			OPTIMIZER_HIDE_VAR(zero);
			return ~0UL/zero;
		}
		/* overflow: result is unrepresentable in a u64 */
		return ~0ULL;
	}

	/* Left align the divisor, shifting the dividend to match */
	d_z_hi = __builtin_clzll(d);
	if (d_z_hi) {
		d <<= d_z_hi;
		n_hi = n_hi << d_z_hi | n_lo >> (64 - d_z_hi);
		n_lo <<= d_z_hi;
	}

	reps = 64 / BITS_PER_ITER;
	/* Optimise loop count for small dividends */
	if (!(u32)(n_hi >> 32)) {
		reps -= 32 / BITS_PER_ITER;
		n_hi = n_hi << 32 | n_lo >> 32;
		n_lo <<= 32;
	}
#if BITS_PER_ITER == 16
	if (!(u32)(n_hi >> 48)) {
		reps--;
		n_hi = add_u64_u32(n_hi << 16, n_lo >> 48);
		n_lo <<= 16;
	}
#endif

	/* Invert the dividend so we can use add instead of subtract. */
	n_lo = ~n_lo;
	n_hi = ~n_hi;

	/*
	 * Get the most significant BITS_PER_ITER bits of the divisor.
	 * This is used to get a low 'guestimate' of the quotient digit.
	 */
	d_msig = (d >> (64 - BITS_PER_ITER)) + 1;

	/*
	 * Now do a 'long division' with BITS_PER_ITER bit 'digits'.
	 * The 'guess' quotient digit can be low and BITS_PER_ITER+1 bits.
	 * The worst case is dividing ~0 by 0x8000 which requires two subtracts.
	 */
	quotient = 0;
	while (reps--) {
		q_digit = (unsigned long)(~n_hi >> (64 - 2 * BITS_PER_ITER)) / d_msig;
		/* Shift 'n' left to align with the product q_digit * d */
		overflow = n_hi >> (64 - BITS_PER_ITER);
		n_hi = add_u64_u32(n_hi << BITS_PER_ITER, n_lo >> (64 - BITS_PER_ITER));
		n_lo <<= BITS_PER_ITER;
		/* Add product to negated dividend */
		overflow += mul_u64_long_add_u64(&n_hi, d, q_digit, n_hi);
		/* Adjust for the q_digit 'guestimate' being low */
		while (overflow < 0xffffffff >> (32 - BITS_PER_ITER)) {
			q_digit++;
			n_hi += d;
			overflow += n_hi < d;
		}
		quotient = add_u64_long(quotient << BITS_PER_ITER, q_digit);
	}

	/*
	 * The above only ensures the remainder doesn't overflow,
	 * it can still be possible to add (aka subtract) another copy
	 * of the divisor.
	 */
	if ((n_hi + d) > n_hi)
		quotient++;

	return quotient;
}
#if !defined(test_mul_u64_add_u64_div_u64)
EXPORT_SYMBOL(mul_u64_add_u64_div_u64);
#endif
#endif