timers/migration: Fix imbalanced NUMA trees

When a CPU from a new node boots, the old root may happen to be
connected to the new root even if their nodes mismatch, as depicted in
the following scenario:

1) CPU 0 boots and creates the first group for node 0.

   [GRP0:0]
    node 0
      |
    CPU 0

2) CPU 1 from node 1 boots and creates a new top-level group that
   corresponds to node 1, but it also mistakenly connects the old root
   from node 0 to the new root from node 1.

             [GRP1:0]
              node 1
            /        \
           /          \
   [GRP0:0]             [GRP0:1]
    node 0               node 1
      |                    |
    CPU 0                CPU 1

3) This eventually leads to an imbalanced tree where some node 0 CPUs
   migrate node 1 timers (and vice versa) long before reaching the
   crossnode groups, resulting in more frequent remote memory accesses
   than expected.

                      [GRP2:0]
                      NUMA_NO_NODE
                     /             \
             [GRP1:0]              [GRP1:1]
              node 1               node 0
            /        \                |
           /          \             [...]
   [GRP0:0]             [GRP0:1]
    node 0               node 1
      |                    |
    CPU 0...              CPU 1...

A balanced tree must only contain groups whose children all belong to
the same node:

                      [GRP2:0]
                      NUMA_NO_NODE
                     /             \
              [GRP1:0]              [GRP1:1]
              node 0               node 1
            /        \             /      \
           /          \           /        \
   [GRP0:0]          [...]      [...]    [GRP0:1]
    node 0                                node 1
      |                                     |
    CPU 0...                              CPU 1...
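
For illustration only (not part of this patch), that invariant can be
expressed as a hypothetical debug check walking the existing level
lists. tmigr_check_balanced() is a made-up helper name; the globals and
fields it touches all appear in the diff below:

	/*
	 * Hypothetical check: every group below the crossnode level
	 * must share its NUMA node with its parent.
	 */
	static void tmigr_check_balanced(void)
	{
		struct tmigr_group *group;
		unsigned int lvl;

		for (lvl = 0; lvl < tmigr_crossnode_level; lvl++) {
			list_for_each_entry(group, &tmigr_level_list[lvl], list) {
				if (group->parent && group->parent->level < tmigr_crossnode_level)
					WARN_ON_ONCE(group->parent->numa_node != group->numa_node);
			}
		}
	}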

In order to fix this, the hierarchy must be unfolded up to the crossnode
level as soon as a node mismatch is detected. For example, stage 2
above should instead lead to this layout:

                      [GRP2:0]
                      NUMA_NO_NODE
                     /             \
             [GRP1:0]              [GRP1:1]
              node 0               node 1
              /                         \
             /                           \
        [GRP0:0]                        [GRP0:1]
        node 0                           node 1
          |                                |
       CPU 0                             CPU 1

This means that not only GRP1:0 must be created but also GRP1:1 and
GRP2:0, in order to prepare a balanced tree for the next CPUs to boot.
The two-pass flow implementing this is sketched below.
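
Condensed from the hunks below, and only as a sketch rather than the
literal code, tmigr_add_cpu() becomes a two-pass setup: a first
non-activating pass builds the new CPU's branch (growing it up to
tmigr_crossnode_level on node mismatch), then a second activating pass
reconnects the old root under the new crossnode root:

	struct tmigr_group *old_root = tmigr_root;

	/* Pass 1: build the branch for the upcoming CPU, still inactive. */
	ret = tmigr_setup_groups(cpu, node, NULL, false);

	/*
	 * Pass 2: the root changed due to a node mismatch? Connect and
	 * activate the old root's branch below the new crossnode root.
	 */
	if (ret >= 0 && old_root && old_root != tmigr_root)
		ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);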

Fixes: 7ee9887703 ("timers: Implement the hierarchical pull model")
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Link: https://patch.msgid.link/20251024132536.39841-4-frederic@kernel.org
commit 5eb579dfd4 (parent fa9620355d)
Author:    Frederic Weisbecker
Date:      2025-10-24 15:25:33 +02:00
Committer: Thomas Gleixner

--- a/kernel/time/timer_migration.c
+++ b/kernel/time/timer_migration.c
@@ -420,6 +420,8 @@ static struct list_head *tmigr_level_list __read_mostly;
 static unsigned int tmigr_hierarchy_levels __read_mostly;
 static unsigned int tmigr_crossnode_level __read_mostly;
 
+static struct tmigr_group *tmigr_root;
+
 static DEFINE_PER_CPU(struct tmigr_cpu, tmigr_cpu);
 
 #define TMIGR_NONE	0xFF
@@ -522,11 +524,9 @@ struct tmigr_walk {
 
 typedef bool (*up_f)(struct tmigr_group *, struct tmigr_group *, struct tmigr_walk *);
 
-static void __walk_groups(up_f up, struct tmigr_walk *data,
-			  struct tmigr_cpu *tmc)
+static void __walk_groups_from(up_f up, struct tmigr_walk *data,
+			       struct tmigr_group *child, struct tmigr_group *group)
 {
-	struct tmigr_group *child = NULL, *group = tmc->tmgroup;
-
 	do {
 		WARN_ON_ONCE(group->level >= tmigr_hierarchy_levels);
 
@@ -544,6 +544,12 @@ static void __walk_groups(up_f up, struct tmigr_walk *data,
 	} while (group);
 }
 
+static void __walk_groups(up_f up, struct tmigr_walk *data,
+			  struct tmigr_cpu *tmc)
+{
+	__walk_groups_from(up, data, NULL, tmc->tmgroup);
+}
+
 static void walk_groups(up_f up, struct tmigr_walk *data, struct tmigr_cpu *tmc)
 {
 	lockdep_assert_held(&tmc->lock);
@@ -1498,21 +1504,6 @@ static void tmigr_init_group(struct tmigr_group *group, unsigned int lvl,
 	s.seq = 0;
 	atomic_set(&group->migr_state, s.state);
 
-	/*
-	 * If this is a new top-level, prepare its groupmask in advance.
-	 * This avoids accidents where yet another new top-level is
-	 * created in the future and made visible before the current groupmask.
-	 */
-	if (list_empty(&tmigr_level_list[lvl])) {
-		group->groupmask = BIT(0);
-		/*
-		 * The previous top level has prepared its groupmask already,
-		 * simply account it as the first child.
-		 */
-		if (lvl > 0)
-			group->num_children = 1;
-	}
-
 	timerqueue_init_head(&group->events);
 	timerqueue_init(&group->groupevt.nextevt);
 	group->groupevt.nextevt.expires = KTIME_MAX;
@@ -1567,22 +1558,51 @@ static struct tmigr_group *tmigr_get_group(unsigned int cpu, int node,
 	return group;
 }
 
+static bool tmigr_init_root(struct tmigr_group *group, bool activate)
+{
+	if (!group->parent && group != tmigr_root) {
+		/*
+		 * This is the new top-level, prepare its groupmask in advance
+		 * to avoid accidents where yet another new top-level is
+		 * created in the future and made visible before this groupmask.
+		 */
+		group->groupmask = BIT(0);
+		WARN_ON_ONCE(activate);
+		return true;
+	}
+
+	return false;
+}
+
 static void tmigr_connect_child_parent(struct tmigr_group *child,
 				       struct tmigr_group *parent,
 				       bool activate)
 {
-	struct tmigr_walk data;
-
-	if (activate) {
+	if (tmigr_init_root(parent, activate)) {
 		/*
-		 * @child is the old top and @parent the new one. In this
-		 * case groupmask is pre-initialized and @child already
-		 * accounted, along with its new sibling corresponding to the
-		 * CPU going up.
+		 * The previous top level had prepared its groupmask already,
+		 * simply account it in advance as the first child. If some groups
+		 * have been created between the old and new root due to node
+		 * mismatch, the new root's child will be initialized accordingly.
 		 */
-		WARN_ON_ONCE(child->groupmask != BIT(0) || parent->num_children != 2);
+		parent->num_children = 1;
+	}
+
+	/* Connecting old root to new root ? */
+	if (!parent->parent && activate) {
+		/*
+		 * @child is the old top, or in case of node mismatch, some
+		 * intermediate group between the old top and the new one in
+		 * @parent. In this case the @child must be pre-accounted above
+		 * as the first child. Its new inactive sibling corresponding
+		 * to the CPU going up has been accounted as the second child.
+		 */
+		WARN_ON_ONCE(parent->num_children != 2);
+		child->groupmask = BIT(0);
 	} else {
-		/* Adding @child for the CPU going up to @parent. */
+		/* Common case adding @child for the CPU going up to @parent. */
 		child->groupmask = BIT(parent->num_children++);
 	}
@@ -1594,56 +1614,28 @@ static void tmigr_connect_child_parent(struct tmigr_group *child,
 	smp_store_release(&child->parent, parent);
 
 	trace_tmigr_connect_child_parent(child);
-
-	if (!activate)
-		return;
-
-	/*
-	 * To prevent inconsistent states, active children need to be active in
-	 * the new parent as well. Inactive children are already marked inactive
-	 * in the parent group:
-	 *
-	 * * When new groups were created by tmigr_setup_groups() starting from
-	 *   the lowest level (and not higher then one level below the current
-	 *   top level), then they are not active. They will be set active when
-	 *   the new online CPU comes active.
-	 *
-	 * * But if a new group above the current top level is required, it is
-	 *   mandatory to propagate the active state of the already existing
-	 *   child to the new parent. So tmigr_connect_child_parent() is
-	 *   executed with the formerly top level group (child) and the newly
-	 *   created group (parent).
-	 *
-	 * * It is ensured that the child is active, as this setup path is
-	 *   executed in hotplug prepare callback. This is exectued by an
-	 *   already connected and !idle CPU. Even if all other CPUs go idle,
-	 *   the CPU executing the setup will be responsible up to current top
-	 *   level group. And the next time it goes inactive, it will release
-	 *   the new childmask and parent to subsequent walkers through this
-	 *   @child. Therefore propagate active state unconditionally.
-	 */
-	data.childmask = child->groupmask;
-
-	/*
-	 * There is only one new level per time (which is protected by
-	 * tmigr_mutex). When connecting the child and the parent and set the
-	 * child active when the parent is inactive, the parent needs to be the
-	 * uppermost level. Otherwise there went something wrong!
-	 */
-	WARN_ON(!tmigr_active_up(parent, child, &data) && parent->parent);
 }
 
-static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
+static int tmigr_setup_groups(unsigned int cpu, unsigned int node,
+			      struct tmigr_group *start, bool activate)
 {
 	struct tmigr_group *group, *child, **stack;
-	int i, top = 0, err = 0;
-	struct list_head *lvllist;
+	int i, top = 0, err = 0, start_lvl = 0;
+	bool root_mismatch = false;
 
 	stack = kcalloc(tmigr_hierarchy_levels, sizeof(*stack), GFP_KERNEL);
 	if (!stack)
 		return -ENOMEM;
 
-	for (i = 0; i < tmigr_hierarchy_levels; i++) {
+	if (start) {
+		stack[start->level] = start;
+		start_lvl = start->level + 1;
+	}
+
+	if (tmigr_root)
+		root_mismatch = tmigr_root->numa_node != node;
+
+	for (i = start_lvl; i < tmigr_hierarchy_levels; i++) {
 		group = tmigr_get_group(cpu, node, i);
 		if (IS_ERR(group)) {
 			err = PTR_ERR(group);
@@ -1656,23 +1648,25 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
 
 		/*
 		 * When booting only less CPUs of a system than CPUs are
-		 * available, not all calculated hierarchy levels are required.
+		 * available, not all calculated hierarchy levels are required,
+		 * unless a node mismatch is detected.
 		 *
 		 * The loop is aborted as soon as the highest level, which might
 		 * be different from tmigr_hierarchy_levels, contains only a
-		 * single group.
+		 * single group, unless the nodes mismatch below tmigr_crossnode_level.
 		 */
-		if (group->parent || list_is_singular(&tmigr_level_list[i]))
+		if (group->parent)
 			break;
+
+		if ((!root_mismatch || i >= tmigr_crossnode_level) &&
+		    list_is_singular(&tmigr_level_list[i]))
+			break;
 	}
 
 	/* Assert single root without parent */
-	if (WARN_ON_ONCE(i >= tmigr_hierarchy_levels))
+	if (WARN_ON_ONCE(!err && !group->parent && !list_is_singular(&tmigr_level_list[top])))
 		return -EINVAL;
 
-	for (; i >= 0; i--) {
+	for (; i >= start_lvl; i--) {
 		group = stack[i];
 
 		if (err < 0) {
@@ -1692,48 +1686,63 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
 			tmc->tmgroup = group;
 			tmc->groupmask = BIT(group->num_children++);
 
+			tmigr_init_root(group, activate);
+
 			trace_tmigr_connect_cpu_parent(tmc);
 
 			/* There are no children that need to be connected */
 			continue;
 		} else {
 			child = stack[i - 1];
-			/* Will be activated at online time */
-			tmigr_connect_child_parent(child, group, false);
-		}
-
-		/* check if uppermost level was newly created */
-		if (top != i)
-			continue;
-
-		WARN_ON_ONCE(top == 0);
-
-		lvllist = &tmigr_level_list[top];
-		/*
-		 * Newly created root level should have accounted the upcoming
-		 * CPU's child group and pre-accounted the old root.
-		 */
-		if (group->num_children == 2 && list_is_singular(lvllist)) {
-			/*
-			 * The target CPU must never do the prepare work, except
-			 * on early boot when the boot CPU is the target. Otherwise
-			 * it may spuriously activate the old top level group inside
-			 * the new one (nevertheless whether old top level group is
-			 * active or not) and/or release an uninitialized childmask.
-			 */
-			WARN_ON_ONCE(cpu == raw_smp_processor_id());
-
-			lvllist = &tmigr_level_list[top - 1];
-			list_for_each_entry(child, lvllist, list) {
-				if (child->parent)
-					continue;
-
-				tmigr_connect_child_parent(child, group, true);
-			}
+			tmigr_connect_child_parent(child, group, activate);
 		}
 	}
 
+	if (err < 0)
+		goto out;
+
+	if (activate) {
+		struct tmigr_walk data;
+
+		/*
+		 * To prevent inconsistent states, active children need to be active in
+		 * the new parent as well. Inactive children are already marked inactive
+		 * in the parent group:
+		 *
+		 * * When new groups were created by tmigr_setup_groups() starting from
+		 *   the lowest level, then they are not active. They will be set active
+		 *   when the new online CPU comes active.
+		 *
+		 * * But if new groups above the current top level are required, it is
+		 *   mandatory to propagate the active state of the already existing
+		 *   child to the new parents. So tmigr_active_up() activates the
+		 *   new parents while walking up from the old root to the new.
+		 *
+		 * * It is ensured that @start is active, as this setup path is
+		 *   executed in hotplug prepare callback. This is executed by an
+		 *   already connected and !idle CPU. Even if all other CPUs go idle,
+		 *   the CPU executing the setup will be responsible up to current top
+		 *   level group. And the next time it goes inactive, it will release
+		 *   the new childmask and parent to subsequent walkers through this
+		 *   @child. Therefore propagate active state unconditionally.
+		 */
+		WARN_ON_ONCE(!start->parent);
+		data.childmask = start->groupmask;
+		__walk_groups_from(tmigr_active_up, &data, start, start->parent);
+	}
+
+	/* Root update */
+	if (list_is_singular(&tmigr_level_list[top])) {
+		group = list_first_entry(&tmigr_level_list[top],
+					 typeof(*group), list);
+		WARN_ON_ONCE(group->parent);
+		if (tmigr_root) {
+			/* Old root should be the same or below */
+			WARN_ON_ONCE(tmigr_root->level > top);
+		}
+		tmigr_root = group;
+	}
+out:
 	kfree(stack);
 
 	return err;
@@ -1741,12 +1750,26 @@ static int tmigr_setup_groups(unsigned int cpu, unsigned int node)
 
 static int tmigr_add_cpu(unsigned int cpu)
 {
+	struct tmigr_group *old_root = tmigr_root;
 	int node = cpu_to_node(cpu);
 	int ret;
 
-	mutex_lock(&tmigr_mutex);
-	ret = tmigr_setup_groups(cpu, node);
-	mutex_unlock(&tmigr_mutex);
+	guard(mutex)(&tmigr_mutex);
+
+	ret = tmigr_setup_groups(cpu, node, NULL, false);
+
+	/* Root has changed? Connect the old one to the new */
+	if (ret >= 0 && old_root && old_root != tmigr_root) {
+		/*
+		 * The target CPU must never do the prepare work, except
+		 * on early boot when the boot CPU is the target. Otherwise
+		 * it may spuriously activate the old top level group inside
+		 * the new one (nevertheless whether old top level group is
+		 * active or not) and/or release an uninitialized childmask.
+		 */
+		WARN_ON_ONCE(cpu == raw_smp_processor_id());
+		ret = tmigr_setup_groups(-1, old_root->numa_node, old_root, true);
+	}
 
 	return ret;
 }