
Update v6.2 patches

Changes:
 - Add patches for Intel Thread Director to improve performance on
   Alder Lake devices.

Links:
 - kernel: https://github.com/linux-surface/kernel/commit/cb08b65716c8c6bb1e5891323ada483cbfce6bb2
 - Upstream submission for Thread Director:
     https://lore.kernel.org/lkml/20230207051105.11575-1-ricardo.neri-calderon@linux.intel.com/
 - Patches required as prerequisites for the thread-director patchset:
     https://lore.kernel.org/lkml/20230207045838.11243-1-ricardo.neri-calderon@linux.intel.com/
Maximilian Luz · 2 years ago · commit ca2ddd30a1

1 changed file with 3268 additions and 0 deletions

patches/6.2/0014-intel-thread-director.patch (+3268, -0)

@@ -0,0 +1,3268 @@
+From bd2bba4036cb8c95f83e45cd4d8b22369fe6d0cb Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 20:58:29 -0800
+Subject: [PATCH] sched/fair: Generalize asym_packing logic for SMT cores
+
+When doing asym_packing load balancing between cores, all we care about is that
+the destination core is fully idle (including SMT siblings, if any) and
+that the busiest candidate scheduling group has exactly one busy CPU. It is
+irrelevant whether the candidate busiest core is non-SMT, SMT2, SMT4, SMT8,
+etc.
+
+Do not handle the candidate busiest non-SMT vs SMT cases separately. Simply
+do the two checks described above. Let find_busiest_group() handle bigger
+imbalances in the number of idle CPUs.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-kernel@vger.kernel.org
+Reviewed-by: Len Brown <len.brown@intel.com>
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Tested-by: Zhang Rui <rui.zhang@intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/fair.c | 41 ++++++++++++++---------------------------
+ 1 file changed, 14 insertions(+), 27 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 0f8736991427..4509086a60a0 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -9124,13 +9124,11 @@ group_type group_classify(unsigned int imbalance_pct,
+  * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
+  * only if @dst_cpu has higher priority.
+  *
+- * If both @dst_cpu and @sg have SMT siblings, and @sg has exactly one more
+- * busy CPU than @sds::local, let @dst_cpu pull tasks if it has higher priority.
+- * Bigger imbalances in the number of busy CPUs will be dealt with in
+- * update_sd_pick_busiest().
+- *
+- * If @sg does not have SMT siblings, only pull tasks if all of the SMT siblings
+- * of @dst_cpu are idle and @sg has lower priority.
++ * If @dst_cpu has SMT siblings, check if there are no running tasks in
++ * @sds::local. In such case, decide based on the priority of @sg. Do it only
++ * if @sg has exactly one busy CPU (i.e., one more than @sds::local). Bigger
++ * imbalances in the number of busy CPUs will be dealt with in
++ * find_busiest_group().
+  *
+  * Return: true if @dst_cpu can pull tasks, false otherwise.
+  */
+@@ -9139,12 +9137,10 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
+ 				    struct sched_group *sg)
+ {
+ #ifdef CONFIG_SCHED_SMT
+-	bool local_is_smt, sg_is_smt;
++	bool local_is_smt;
+ 	int sg_busy_cpus;
+ 
+ 	local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
+-	sg_is_smt = sg->flags & SD_SHARE_CPUCAPACITY;
+-
+ 	sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
+ 
+ 	if (!local_is_smt) {
+@@ -9165,25 +9161,16 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
+ 		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+ 	}
+ 
+-	/* @dst_cpu has SMT siblings. */
+-
+-	if (sg_is_smt) {
+-		int local_busy_cpus = sds->local->group_weight -
+-				      sds->local_stat.idle_cpus;
+-		int busy_cpus_delta = sg_busy_cpus - local_busy_cpus;
+-
+-		if (busy_cpus_delta == 1)
+-			return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+-
+-		return false;
+-	}
+-
+ 	/*
+-	 * @sg does not have SMT siblings. Ensure that @sds::local does not end
+-	 * up with more than one busy SMT sibling and only pull tasks if there
+-	 * are not busy CPUs (i.e., no CPU has running tasks).
++	 * @dst_cpu has SMT siblings. Do asym_packing load balancing only if
++	 * all its siblings are idle (moving tasks between physical cores in
++	 * which some SMT siblings are busy results in the same throughput).
++	 *
++	 * If the difference in the number of busy CPUs is two or more, let
++	 * find_busiest_group() take care of it. We only care if @sg has
++	 * exactly one busy CPU. This covers SMT and non-SMT sched groups.
+ 	 */
+-	if (!sds->local_stat.sum_nr_running)
++	if (sg_busy_cpus == 1 && !sds->local_stat.sum_nr_running)
+ 		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+ 
+ 	return false;
+-- 
+2.39.2
+
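A minimal standalone sketch of the pull condition described above (the struct
and helper names are hypothetical and do not exist in the kernel):

    #include <stdbool.h>

    /* Stand-in for the load-balancing statistics of the candidate group. */
    struct candidate_group {
            unsigned int weight;    /* number of CPUs in the group */
            unsigned int idle_cpus; /* how many of them are idle */
    };

    /*
     * Generalized asym_packing decision: the destination core must be fully
     * idle (all SMT siblings, if any) and the candidate group must have
     * exactly one busy CPU; then priorities decide. Bigger imbalances are
     * left to find_busiest_group().
     */
    static bool asym_can_pull(bool dst_core_fully_idle,
                              bool dst_has_higher_prio,
                              const struct candidate_group *sg)
    {
            unsigned int busy = sg->weight - sg->idle_cpus;

            if (!dst_core_fully_idle)
                    return false;

            if (busy == 1)
                    return dst_has_higher_prio;

            return false;
    }
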
+From ee23d606abde99fbab94fa15ce3ef701b430d8a7 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 20:58:30 -0800
+Subject: [PATCH] sched/fair: Move is_core_idle() out of CONFIG_NUMA
+
+asym_packing needs this function to determine whether an SMT core is a
+suitable destination for load balancing.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Tested-by: Zhang Rui <rui.zhang@intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/fair.c | 34 +++++++++++++++++-----------------
+ 1 file changed, 17 insertions(+), 17 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 4509086a60a0..d58df9c6a88c 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -1064,6 +1064,23 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+  * Scheduling class queueing methods:
+  */
+ 
++static inline bool is_core_idle(int cpu)
++{
++#ifdef CONFIG_SCHED_SMT
++	int sibling;
++
++	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
++		if (cpu == sibling)
++			continue;
++
++		if (!idle_cpu(sibling))
++			return false;
++	}
++#endif
++
++	return true;
++}
++
+ #ifdef CONFIG_NUMA
+ #define NUMA_IMBALANCE_MIN 2
+ 
+@@ -1700,23 +1717,6 @@ struct numa_stats {
+ 	int idle_cpu;
+ };
+ 
+-static inline bool is_core_idle(int cpu)
+-{
+-#ifdef CONFIG_SCHED_SMT
+-	int sibling;
+-
+-	for_each_cpu(sibling, cpu_smt_mask(cpu)) {
+-		if (cpu == sibling)
+-			continue;
+-
+-		if (!idle_cpu(sibling))
+-			return false;
+-	}
+-#endif
+-
+-	return true;
+-}
+-
+ struct task_numa_env {
+ 	struct task_struct *p;
+ 
+-- 
+2.39.2
+
+From 995477b05ed2c85c3b3b796118468c1c66edb37e Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 20:58:31 -0800
+Subject: [PATCH] sched/fair: Only do asym_packing load balancing from fully
+ idle SMT cores
+
+When balancing load between cores, all the SMT siblings of the destination
+CPU, if any, must be idle. Otherwise, pulling new tasks degrades the
+throughput of the busy SMT siblings. The overall throughput of the system
+remains the same.
+
+When balancing load within an SMT core this consideration is not
+relevant. Follow the priorities that hardware indicates.
+
+Using is_core_idle() renders checking !sds->local_stat.sum_nr_running
+redundant. Remove it.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-kernel@vger.kernel.org
+Suggested-by: Valentin Schneider <vschneid@redhat.com>
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Tested-by: Zhang Rui <rui.zhang@intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/fair.c | 34 +++++++++++++++++++++++++---------
+ 1 file changed, 25 insertions(+), 9 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index d58df9c6a88c..1b134a2f0585 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -9120,12 +9120,14 @@ group_type group_classify(unsigned int imbalance_pct,
+  * Check the state of the SMT siblings of both @sds::local and @sg and decide
+  * if @dst_cpu can pull tasks.
+  *
++ * This function must be called only if all the SMT siblings of @dst_cpu are
++ * idle, if any.
++ *
+  * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
+  * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
+  * only if @dst_cpu has higher priority.
+  *
+- * If @dst_cpu has SMT siblings, check if there are no running tasks in
+- * @sds::local. In such case, decide based on the priority of @sg. Do it only
++ * If @dst_cpu has SMT siblings, decide based on the priority of @sg. Do it only
+  * if @sg has exactly one busy CPU (i.e., one more than @sds::local). Bigger
+  * imbalances in the number of busy CPUs will be dealt with in
+  * find_busiest_group().
+@@ -9162,15 +9164,13 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
+ 	}
+ 
+ 	/*
+-	 * @dst_cpu has SMT siblings. Do asym_packing load balancing only if
+-	 * all its siblings are idle (moving tasks between physical cores in
+-	 * which some SMT siblings are busy results in the same throughput).
++	 * @dst_cpu has SMT siblings and they are also idle.
+ 	 *
+ 	 * If the difference in the number of busy CPUs is two or more, let
+ 	 * find_busiest_group() take care of it. We only care if @sg has
+ 	 * exactly one busy CPU. This covers SMT and non-SMT sched groups.
+ 	 */
+-	if (sg_busy_cpus == 1 && !sds->local_stat.sum_nr_running)
++	if (sg_busy_cpus == 1)
+ 		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+ 
+ 	return false;
+@@ -9184,7 +9184,14 @@ static inline bool
+ sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats *sgs,
+ 	   struct sched_group *group)
+ {
+-	/* Only do SMT checks if either local or candidate have SMT siblings */
++	/*
++	 * If the destination CPU has SMT siblings, env->idle != CPU_NOT_IDLE
++	 * is not sufficient. We need to make sure the whole core is idle.
++	 */
++	if (sds->local->flags & SD_SHARE_CPUCAPACITY && !is_core_idle(env->dst_cpu))
++		return false;
++
++	/* Only do SMT checks if either local or candidate have SMT siblings. */
+ 	if ((sds->local->flags & SD_SHARE_CPUCAPACITY) ||
+ 	    (group->flags & SD_SHARE_CPUCAPACITY))
+ 		return asym_smt_can_pull_tasks(env->dst_cpu, sds, sgs, group);
+@@ -11131,8 +11138,17 @@ static void nohz_balancer_kick(struct rq *rq)
+ 		 */
+ 		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
+ 			if (sched_asym_prefer(i, cpu)) {
+-				flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
+-				goto unlock;
++				/*
++				 * Always do ASYM_PACKING balance in the SMT
++				 * domain. In upper domains, the core must be
++				 * fully idle.
++				 */
++				if (sd->flags & SD_SHARE_CPUCAPACITY ||
++				    (!(sd->flags & SD_SHARE_CPUCAPACITY) &&
++				     is_core_idle(i))) {
++					flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
++					goto unlock;
++				}
+ 			}
+ 		}
+ 	}
+-- 
+2.39.2
+
+From 9941162cdf50901818e53975e116f317cb38173d Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 20:58:32 -0800
+Subject: [PATCH] sched/fair: Let low-priority cores help high-priority busy
+ SMT cores
+
+Using asym_packing priorities within an SMT core is straightforward. Just
+follow the priorities that hardware indicates.
+
+When balancing load from an SMT core, also consider the idle state of its
+siblings. Priorities do not reflect that an SMT core divides its throughput
+among all its busy siblings. They only make sense when exactly one sibling
+is busy.
+
+Indicate that active balance is needed if the destination CPU has lower
+priority than the source CPU but the latter has busy SMT siblings.
+
+Make find_busiest_queue() not skip higher-priority SMT cores with more than
+one busy sibling.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-kernel@vger.kernel.org
+Suggested-by: Valentin Schneider <vschneid@redhat.com>
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Tested-by: Zhang Rui <rui.zhang@intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/fair.c | 31 ++++++++++++++++++++++++++-----
+ 1 file changed, 26 insertions(+), 5 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 1b134a2f0585..1255d99877fe 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -10306,11 +10306,20 @@ static struct rq *find_busiest_queue(struct lb_env *env,
+ 		    nr_running == 1)
+ 			continue;
+ 
+-		/* Make sure we only pull tasks from a CPU of lower priority */
++		/*
++		 * Make sure we only pull tasks from a CPU of lower priority
++		 * when balancing between SMT siblings.
++		 *
++		 * If balancing between cores, let lower priority CPUs help
++		 * SMT cores with more than one busy sibling.
++		 */
+ 		if ((env->sd->flags & SD_ASYM_PACKING) &&
+ 		    sched_asym_prefer(i, env->dst_cpu) &&
+-		    nr_running == 1)
+-			continue;
++		    nr_running == 1) {
++			if (env->sd->flags & SD_SHARE_CPUCAPACITY ||
++			    (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && is_core_idle(i)))
++				continue;
++		}
+ 
+ 		switch (env->migration_type) {
+ 		case migrate_load:
+@@ -10400,8 +10409,20 @@ asym_active_balance(struct lb_env *env)
+ 	 * lower priority CPUs in order to pack all tasks in the
+ 	 * highest priority CPUs.
+ 	 */
+-	return env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING) &&
+-	       sched_asym_prefer(env->dst_cpu, env->src_cpu);
++	if (env->idle != CPU_NOT_IDLE && (env->sd->flags & SD_ASYM_PACKING)) {
++		/* Always obey priorities between SMT siblings. */
++		if (env->sd->flags & SD_SHARE_CPUCAPACITY)
++			return sched_asym_prefer(env->dst_cpu, env->src_cpu);
++
++		/*
++		 * A lower priority CPU can help an SMT core with more than one
++		 * busy sibling.
++		 */
++		return sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
++		       !is_core_idle(env->src_cpu);
++	}
++
++	return false;
+ }
+ 
+ static inline bool
+-- 
+2.39.2
+
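A condensed restatement of the asym_active_balance() condition introduced
above, as a sketch with boolean parameters standing in for the kernel's
checks (illustrative only, not the patch's code):

    #include <stdbool.h>

    static bool asym_active_balance_sketch(bool dst_idle, bool asym_packing,
                                           bool smt_domain,
                                           bool dst_prefers_over_src,
                                           bool src_core_fully_idle)
    {
            if (!dst_idle || !asym_packing)
                    return false;

            /* Between SMT siblings, priorities alone decide. */
            if (smt_domain)
                    return dst_prefers_over_src;

            /*
             * Between cores, a lower-priority destination may still help a
             * source core whose SMT siblings are busy.
             */
            return dst_prefers_over_src || !src_core_fully_idle;
    }
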
+From b6fe3b340efe48625bcd5d6f9080a77e39be6a3f Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 20:58:33 -0800
+Subject: [PATCH] sched/fair: Keep a fully_busy SMT sched group as busiest
+
+When comparing two fully_busy scheduling groups, keep the current busiest
+group if it represents an SMT core. Tasks in such scheduling group share
+CPU resources and need more help than tasks in a non-SMT fully_busy group.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Tested-by: Zhang Rui <rui.zhang@intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/fair.c | 16 ++++++++++++++--
+ 1 file changed, 14 insertions(+), 2 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 1255d99877fe..ed1f13fa32f8 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -9384,10 +9384,22 @@ static bool update_sd_pick_busiest(struct lb_env *env,
+ 		 * contention when accessing shared HW resources.
+ 		 *
+ 		 * XXX for now avg_load is not computed and always 0 so we
+-		 * select the 1st one.
++		 * select the 1st one, except if @sg is composed of SMT
++		 * siblings.
+ 		 */
+-		if (sgs->avg_load <= busiest->avg_load)
++
++		if (sgs->avg_load < busiest->avg_load)
+ 			return false;
++
++		if (sgs->avg_load == busiest->avg_load) {
++			/*
++			 * SMT sched groups need more help than non-SMT groups.
++			 * If @sg happens to also be SMT, either choice is good.
++			 */
++			if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
++				return false;
++		}
++
+ 		break;
+ 
+ 	case group_has_spare:
+-- 
+2.39.2
+
+From 33b193a8846ec229414b71da7d26977fdfb3c9b3 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 20:58:34 -0800
+Subject: [PATCH] sched/fair: Use the prefer_sibling flag of the current sched
+ domain
+
+SD_PREFER_SIBLING is set from the SMT scheduling domain up to the first
+non-NUMA domain (the exception is systems with SD_ASYM_CPUCAPACITY).
+
+Above the SMT sched domain, all domains have a child. The SD_PREFER_SIBLING
+flag is always honored regardless of the scheduling domain at which the
+load balance takes place.
+
+There are cases, however, in which the busiest CPU's sched domain has a
+child but the destination CPU's does not. Consider, for instance, a non-SMT
+core (or an SMT core with only one online sibling) doing load balance with
+an SMT core at the MC level. SD_PREFER_SIBLING will not be honored. We are
+left with a fully busy SMT core and an idle non-SMT core.
+
+Avoid inconsistent behavior. Use the prefer_sibling behavior at the current
+scheduling domain, not its child.
+
+The NUMA sched domain does not have the SD_PREFER_SIBLING flag. Thus, we
+will not spread load among NUMA sched groups, as desired.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-kernel@vger.kernel.org
+Suggested-by: Valentin Schneider <vschneid@redhat.com>
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Tested-by: Zhang Rui <rui.zhang@intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/fair.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index ed1f13fa32f8..9d94ba3f6726 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -9874,7 +9874,6 @@ static void update_idle_cpu_scan(struct lb_env *env,
+ 
+ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
+ {
+-	struct sched_domain *child = env->sd->child;
+ 	struct sched_group *sg = env->sd->groups;
+ 	struct sg_lb_stats *local = &sds->local_stat;
+ 	struct sg_lb_stats tmp_sgs;
+@@ -9915,9 +9914,11 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
+ 		sg = sg->next;
+ 	} while (sg != env->sd->groups);
+ 
+-	/* Tag domain that child domain prefers tasks go to siblings first */
+-	sds->prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+-
++	/*
++	 * Tag domain that @env::sd prefers to spread excess tasks among
++	 * sibling sched groups.
++	 */
++	sds->prefer_sibling = env->sd->flags & SD_PREFER_SIBLING;
+ 
+ 	if (env->sd->flags & SD_NUMA)
+ 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+@@ -10216,7 +10217,6 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
+ 			goto out_balanced;
+ 	}
+ 
+-	/* Try to move all excess tasks to child's sibling domain */
+ 	if (sds.prefer_sibling && local->group_type == group_has_spare &&
+ 	    busiest->sum_nr_running > local->sum_nr_running + 1)
+ 		goto force_balance;
+-- 
+2.39.2
+
+From 3cada1dc1aaa1bdbbacb9973c3ed69851a9a8054 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 20:58:35 -0800
+Subject: [PATCH] sched/fair: Do not even the number of busy CPUs via
+ asym_packing
+
+Now that find_busiest_group() triggers load balancing between a fully_busy
+SMT2 core and an idle non-SMT core, it is no longer needed to force
+balancing via asym_packing. Use asym_packing only as intended: when there
+is a high-priority CPU that is idle.
+
+After this change, the same logic applies to SMT and non-SMT local groups.
+Simplify asym_smt_can_pull_tasks() accordingly.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Tested-by: Zhang Rui <rui.zhang@intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/fair.c | 37 +++++--------------------------------
+ 1 file changed, 5 insertions(+), 32 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 9d94ba3f6726..e5079ee882ff 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -9117,20 +9117,15 @@ group_type group_classify(unsigned int imbalance_pct,
+  * @sgs:	Load-balancing statistics of the candidate busiest group
+  * @sg:		The candidate busiest group
+  *
+- * Check the state of the SMT siblings of both @sds::local and @sg and decide
+- * if @dst_cpu can pull tasks.
++ * Check the state of the SMT siblings of @sg and decide if @dst_cpu can pull
++ * tasks.
+  *
+  * This function must be called only if all the SMT siblings of @dst_cpu are
+  * idle, if any.
+  *
+- * If @dst_cpu does not have SMT siblings, it can pull tasks if two or more of
+- * the SMT siblings of @sg are busy. If only one CPU in @sg is busy, pull tasks
+- * only if @dst_cpu has higher priority.
+- *
+- * If @dst_cpu has SMT siblings, decide based on the priority of @sg. Do it only
+- * if @sg has exactly one busy CPU (i.e., one more than @sds::local). Bigger
+- * imbalances in the number of busy CPUs will be dealt with in
+- * find_busiest_group().
++ * @dst_cpu can pull tasks if @sg has exactly one busy CPU (i.e., one more than
++ * @sds::local) and has lower group priority than @sds::local. Bigger imbalances
++ * in the number of busy CPUs will be dealt with in find_busiest_group().
+  *
+  * Return: true if @dst_cpu can pull tasks, false otherwise.
+  */
+@@ -9139,33 +9134,11 @@ static bool asym_smt_can_pull_tasks(int dst_cpu, struct sd_lb_stats *sds,
+ 				    struct sched_group *sg)
+ {
+ #ifdef CONFIG_SCHED_SMT
+-	bool local_is_smt;
+ 	int sg_busy_cpus;
+ 
+-	local_is_smt = sds->local->flags & SD_SHARE_CPUCAPACITY;
+ 	sg_busy_cpus = sgs->group_weight - sgs->idle_cpus;
+ 
+-	if (!local_is_smt) {
+-		/*
+-		 * If we are here, @dst_cpu is idle and does not have SMT
+-		 * siblings. Pull tasks if candidate group has two or more
+-		 * busy CPUs.
+-		 */
+-		if (sg_busy_cpus >= 2) /* implies sg_is_smt */
+-			return true;
+-
+-		/*
+-		 * @dst_cpu does not have SMT siblings. @sg may have SMT
+-		 * siblings and only one is busy. In such case, @dst_cpu
+-		 * can help if it has higher priority and is idle (i.e.,
+-		 * it has no running tasks).
+-		 */
+-		return sched_asym_prefer(dst_cpu, sg->asym_prefer_cpu);
+-	}
+-
+ 	/*
+-	 * @dst_cpu has SMT siblings and they are also idle.
+-	 *
+ 	 * If the difference in the number of busy CPUs is two or more, let
+ 	 * find_busiest_group() take care of it. We only care if @sg has
+ 	 * exactly one busy CPU. This covers SMT and non-SMT sched groups.
+-- 
+2.39.2
+
+From 9502629c285b133622a66eafae6983fe717906cb Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 20:58:36 -0800
+Subject: [PATCH] sched/topology: Remove SHARED_CHILD from ASYM_PACKING
+
+Only x86 and Power7 use ASYM_PACKING. They use it differently.
+
+Power7 has cores of equal priority, but the SMT siblings of a core have
+different priorities. Parent scheduling domains do not need (nor have) the
+ASYM_PACKING flag. SHARED_CHILD is not needed. Using SHARED_PARENT would
+cause the topology debug code to complain.
+
+X86 has cores of different priority, but all the SMT siblings of the core
+have equal priority. It needs ASYM_PACKING at the MC level, but not at the
+SMT level (it also needs it at upper levels if they have scheduling groups
+of different priority). Removing ASYM_PACKING from the SMT domain causes
+the topology debug code to complain.
+
+Remove SHARED_CHILD for now. We still need a topology check that satisfies
+both architectures.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-kernel@vger.kernel.org
+Suggested-by: Valentin Schneider <vschneid@redhat.com>
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Tested-by: Zhang Rui <rui.zhang@intel.com>
+Patchset: intel-thread-director
+---
+ include/linux/sched/sd_flags.h | 5 +----
+ 1 file changed, 1 insertion(+), 4 deletions(-)
+
+diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
+index 57bde66d95f7..800238854ba5 100644
+--- a/include/linux/sched/sd_flags.h
++++ b/include/linux/sched/sd_flags.h
+@@ -132,12 +132,9 @@ SD_FLAG(SD_SERIALIZE, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+ /*
+  * Place busy tasks earlier in the domain
+  *
+- * SHARED_CHILD: Usually set on the SMT level. Technically could be set further
+- *               up, but currently assumed to be set from the base domain
+- *               upwards (see update_top_cache_domain()).
+  * NEEDS_GROUPS: Load balancing flag.
+  */
+-SD_FLAG(SD_ASYM_PACKING, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
++SD_FLAG(SD_ASYM_PACKING,  SDF_NEEDS_GROUPS)
+ 
+ /*
+  * Prefer to place tasks in a sibling domain
+-- 
+2.39.2
+
+From 503eed0aa6bc93d5bbae5c0ecb5dd98221ac70d3 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 20:58:37 -0800
+Subject: [PATCH] x86/sched: Remove SD_ASYM_PACKING from the SMT domain flags
+
+There is no difference between any of the SMT siblings of a physical core.
+Do not do asym_packing load balancing at this level.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Tested-by: Zhang Rui <rui.zhang@intel.com>
+Patchset: intel-thread-director
+---
+ arch/x86/kernel/smpboot.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
+index 55cad72715d9..0213d066a9a9 100644
+--- a/arch/x86/kernel/smpboot.c
++++ b/arch/x86/kernel/smpboot.c
+@@ -547,7 +547,7 @@ static int x86_core_flags(void)
+ #ifdef CONFIG_SCHED_SMT
+ static int x86_smt_flags(void)
+ {
+-	return cpu_smt_flags() | x86_sched_itmt_flags();
++	return cpu_smt_flags();
+ }
+ #endif
+ #ifdef CONFIG_SCHED_CLUSTER
+-- 
+2.39.2
+
+From 1344221f62b96498586051f3e2a6c1e9524eebf3 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 20:58:38 -0800
+Subject: [PATCH] x86/sched/itmt: Give all SMT siblings of a core the same
+ priority
+
+X86 does not have the SD_ASYM_PACKING flag in the SMT domain. The scheduler
+knows how to handle SMT and non-SMT cores of different priority. There is
+no reason for SMT siblings of a core to have different priorities.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-kernel@vger.kernel.org
+Reviewed-by: Len Brown <len.brown@intel.com>
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Tested-by: Zhang Rui <rui.zhang@intel.com>
+Patchset: intel-thread-director
+---
+ arch/x86/kernel/itmt.c | 23 +++++------------------
+ 1 file changed, 5 insertions(+), 18 deletions(-)
+
+diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
+index 9ff480e94511..6510883c5e81 100644
+--- a/arch/x86/kernel/itmt.c
++++ b/arch/x86/kernel/itmt.c
+@@ -174,32 +174,19 @@ int arch_asym_cpu_priority(int cpu)
+ 
+ /**
+  * sched_set_itmt_core_prio() - Set CPU priority based on ITMT
+- * @prio:	Priority of cpu core
+- * @core_cpu:	The cpu number associated with the core
++ * @prio:	Priority of @cpu
++ * @cpu:	The CPU number
+  *
+  * The pstate driver will find out the max boost frequency
+  * and call this function to set a priority proportional
+- * to the max boost frequency. CPU with higher boost
++ * to the max boost frequency. CPUs with higher boost
+  * frequency will receive higher priority.
+  *
+  * No need to rebuild sched domain after updating
+  * the CPU priorities. The sched domains have no
+  * dependency on CPU priorities.
+  */
+-void sched_set_itmt_core_prio(int prio, int core_cpu)
++void sched_set_itmt_core_prio(int prio, int cpu)
+ {
+-	int cpu, i = 1;
+-
+-	for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) {
+-		int smt_prio;
+-
+-		/*
+-		 * Ensure that the siblings are moved to the end
+-		 * of the priority chain and only used when
+-		 * all other high priority cpus are out of capacity.
+-		 */
+-		smt_prio = prio * smp_num_siblings / (i * i);
+-		per_cpu(sched_core_priority, cpu) = smt_prio;
+-		i++;
+-	}
++	per_cpu(sched_core_priority, cpu) = prio;
+ }
+-- 
+2.39.2
+
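A worked example of the change (the numbers are illustrative, not from the
patch): with a boost-derived prio of 100 and smp_num_siblings == 2, the
removed loop assigned prio * smp_num_siblings / (i * i), i.e. 100 * 2 / 1 =
200 to the first sibling and 100 * 2 / 4 = 50 to the second, so one sibling
always outranked the other. After this patch both siblings simply get
per_cpu(sched_core_priority, cpu) = 100.
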
+From 25de1f88b45889ab6b7d03acc4638c93f978e427 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:42 -0800
+Subject: [PATCH] sched/task_struct: Introduce IPC classes of tasks
+
+On hybrid processors, the architecture differences between the types of
+CPUs lead to different instructions-per-cycle (IPC) on each type of CPU.
+IPCs may differ further by the type of instructions. Instructions can be
+grouped into classes of similar IPCs.
+
+Hence, tasks can be classified into groups based on the type of
+instructions they execute.
+
+Add a new member task_struct::ipcc to associate a particular task to
+an IPC class that depends on the instructions it executes.
+
+The scheduler may use the IPC class of a task and data about the
+performance among CPUs of a given IPC class to improve throughput. It
+may, for instance, place certain classes of tasks on CPUs of higher
+performance.
+
+The methods to determine the classification of a task and its relative
+IPC score are specific to each CPU architecture.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ include/linux/sched.h | 10 ++++++++++
+ init/Kconfig          | 12 ++++++++++++
+ 2 files changed, 22 insertions(+)
+
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 853d08f7562b..f29294217885 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -127,6 +127,8 @@ struct task_group;
+ 					 __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
+ 					 TASK_PARKED)
+ 
++#define IPC_CLASS_UNCLASSIFIED		0
++
+ #define task_is_running(task)		(READ_ONCE((task)->__state) == TASK_RUNNING)
+ 
+ #define task_is_traced(task)		((READ_ONCE(task->jobctl) & JOBCTL_TRACED) != 0)
+@@ -1522,6 +1524,14 @@ struct task_struct {
+ 	union rv_task_monitor		rv[RV_PER_TASK_MONITORS];
+ #endif
+ 
++#ifdef CONFIG_IPC_CLASSES
++	/*
++	 * A hardware-defined classification of task that reflects but is
++	 * not identical to the number of instructions per cycle.
++	 */
++	unsigned short			ipcc;
++#endif
++
+ 	/*
+ 	 * New fields for task_struct should be added above here, so that
+ 	 * they are included in the randomized portion of task_struct.
+diff --git a/init/Kconfig b/init/Kconfig
+index 44e90b28a30f..24c5eec9d22e 100644
+--- a/init/Kconfig
++++ b/init/Kconfig
+@@ -867,6 +867,18 @@ config UCLAMP_BUCKETS_COUNT
+ 
+ 	  If in doubt, use the default value.
+ 
++config IPC_CLASSES
++	bool "IPC classes of tasks"
++	depends on SMP
++	help
++	  If selected, each task is assigned a classification value that
++	  reflects the type of instructions that the task executes. This
++	  classification reflects but is not equal to the number of
++	  instructions retired per cycle.
++
++	  The scheduler uses the classification value to improve the placement
++	  of tasks.
++
+ endmenu
+ 
+ #
+-- 
+2.39.2
+
+From a0e3326c33d45e7c433635bc1d620b086731c1cf Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:43 -0800
+Subject: [PATCH] sched: Add interfaces for IPC classes
+
+Add the interfaces that architectures shall implement to convey the data
+to support IPC classes.
+
+arch_update_ipcc() updates the IPC classification of the current task as
+given by hardware.
+
+arch_get_ipcc_score() provides a performance score for a given IPC class
+when placed on a specific CPU. Higher scores indicate higher performance.
+
+When a driver or equivalent enablement code has configured the necessary
+hardware to support IPC classes, it should call sched_enable_ipc_classes()
+to notify the scheduler that it can start using IPC classes data.
+
+The number of classes and the score of each class of task are determined
+by hardware.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ include/linux/sched/topology.h |  6 ++++
+ kernel/sched/sched.h           | 66 ++++++++++++++++++++++++++++++++++
+ kernel/sched/topology.c        |  9 +++++
+ 3 files changed, 81 insertions(+)
+
+diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
+index 816df6cc444e..5b084d3c9ad1 100644
+--- a/include/linux/sched/topology.h
++++ b/include/linux/sched/topology.h
+@@ -280,4 +280,10 @@ static inline int task_node(const struct task_struct *p)
+ 	return cpu_to_node(task_cpu(p));
+ }
+ 
++#ifdef CONFIG_IPC_CLASSES
++extern void sched_enable_ipc_classes(void);
++#else
++static inline void sched_enable_ipc_classes(void) { }
++#endif
++
+ #endif /* _LINUX_SCHED_TOPOLOGY_H */
+diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
+index 771f8ddb7053..7ab65d3feaa1 100644
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -2526,6 +2526,72 @@ void arch_scale_freq_tick(void)
+ }
+ #endif
+ 
++#ifdef CONFIG_IPC_CLASSES
++DECLARE_STATIC_KEY_FALSE(sched_ipcc);
++
++static inline bool sched_ipcc_enabled(void)
++{
++	return static_branch_unlikely(&sched_ipcc);
++}
++
++#ifndef arch_update_ipcc
++/**
++ * arch_update_ipcc() - Update the IPC class of the current task
++ * @curr:		The current task
++ *
++ * Request that the IPC classification of @curr is updated.
++ *
++ * Returns: none
++ */
++static __always_inline
++void arch_update_ipcc(struct task_struct *curr)
++{
++}
++#endif
++
++#ifndef arch_get_ipcc_score
++
++#define SCHED_IPCC_SCORE_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)
++/**
++ * arch_get_ipcc_score() - Get the IPC score of a class of task
++ * @ipcc:	The IPC class
++ * @cpu:	A CPU number
++ *
++ * The IPC performance score reflects (but is not identical to) the number
++ * of instructions retired per cycle for a given IPC class. It is a linear and
++ * abstract metric. Higher scores reflect better performance.
++ *
++ * The IPC score can be normalized with respect to the class, i, with the
++ * highest IPC score on the CPU, c, with highest performance:
++ *
++ *            IPC(i, c)
++ *  ------------------------------------ * SCHED_IPCC_SCORE_SCALE
++ *     max(IPC(i, c) : (i, c))
++ *
++ * Scheduling schemes that want to use the IPC score along with other
++ * normalized metrics for scheduling (e.g., CPU capacity) may need to normalize
++ * it.
++ *
++ * Other scheduling schemes (e.g., asym_packing) do not need normalization.
++ *
++ * Returns the performance score of an IPC class, @ipcc, when running on @cpu.
++ * Error when either @ipcc or @cpu are invalid.
++ */
++static __always_inline
++unsigned long arch_get_ipcc_score(unsigned short ipcc, int cpu)
++{
++	return SCHED_IPCC_SCORE_SCALE;
++}
++#endif
++#else /* CONFIG_IPC_CLASSES */
++
++#define arch_get_ipcc_score(ipcc, cpu) (-EINVAL)
++#define arch_update_ipcc(curr)
++
++static inline bool sched_ipcc_enabled(void) { return false; }
++
++#endif /* CONFIG_IPC_CLASSES */
++
+ #ifndef arch_scale_freq_capacity
+ /**
+  * arch_scale_freq_capacity - get the frequency scale factor of a given CPU.
+diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
+index 8739c2a5a54e..60e03d15f58c 100644
+--- a/kernel/sched/topology.c
++++ b/kernel/sched/topology.c
+@@ -670,6 +670,15 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
+ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+ DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+ 
++#ifdef CONFIG_IPC_CLASSES
++DEFINE_STATIC_KEY_FALSE(sched_ipcc);
++
++void sched_enable_ipc_classes(void)
++{
++	static_branch_enable_cpuslocked(&sched_ipcc);
++}
++#endif
++
+ static void update_top_cache_domain(int cpu)
+ {
+ 	struct sched_domain_shared *sds = NULL;
+-- 
+2.39.2
+
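The normalization described in the arch_get_ipcc_score() comment can be
checked with a small userspace sketch (the IPC table and its values are made
up; only the formula mirrors the patch):

    #include <stdio.h>

    #define SCHED_IPCC_SCORE_SCALE (1L << 10)  /* 1 << SCHED_FIXEDPOINT_SHIFT */

    /* Hypothetical per-class IPC on two CPU types: {small core, big core}. */
    static const double ipc[2][2] = {
            { 1.0, 1.3 },   /* class 0 */
            { 1.1, 2.0 },   /* class 1 */
    };

    /* IPC(i, c) / max(IPC(i, c)) * SCHED_IPCC_SCORE_SCALE */
    static long ipcc_score(int cls, int cpu)
    {
            const double max_ipc = 2.0;     /* highest entry in the table */

            return (long)(ipc[cls][cpu] / max_ipc * SCHED_IPCC_SCORE_SCALE);
    }

    int main(void)
    {
            printf("class 1, big core:   %ld\n", ipcc_score(1, 1)); /* 1024 */
            printf("class 1, small core: %ld\n", ipcc_score(1, 0)); /* 563 */
            return 0;
    }
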
+From c18e80caa66e108ad250a79ee9688e07705830cf Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:44 -0800
+Subject: [PATCH] sched/core: Initialize the IPC class of a new task
+
+New tasks shall start life as unclassified. They will be classified by
+hardware when they run.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/core.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 2a4918a1faa9..325b1d3cf7a8 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -4424,6 +4424,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+ 	p->se.prev_sum_exec_runtime	= 0;
+ 	p->se.nr_migrations		= 0;
+ 	p->se.vruntime			= 0;
++#ifdef CONFIG_IPC_CLASSES
++	p->ipcc				= IPC_CLASS_UNCLASSIFIED;
++#endif
+ 	INIT_LIST_HEAD(&p->se.group_node);
+ 
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+-- 
+2.39.2
+
+From b98df1322d063aee5015bf6fc751cf612151183c Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:45 -0800
+Subject: [PATCH] sched/core: Add user_tick as argument to scheduler_tick()
+
+Differentiate between user and kernel ticks so that the scheduler updates
+the IPC class of the current task during the former.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ include/linux/sched.h | 2 +-
+ kernel/sched/core.c   | 2 +-
+ kernel/time/timer.c   | 2 +-
+ 3 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index f29294217885..4f96c3dd59d0 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -293,7 +293,7 @@ enum {
+ 	TASK_COMM_LEN = 16,
+ };
+ 
+-extern void scheduler_tick(void);
++extern void scheduler_tick(bool user_tick);
+ 
+ #define	MAX_SCHEDULE_TIMEOUT		LONG_MAX
+ 
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 325b1d3cf7a8..b438fc79f868 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -5550,7 +5550,7 @@ static inline u64 cpu_resched_latency(struct rq *rq) { return 0; }
+  * This function gets called by the timer code, with HZ frequency.
+  * We call it with interrupts disabled.
+  */
+-void scheduler_tick(void)
++void scheduler_tick(bool user_tick)
+ {
+ 	int cpu = smp_processor_id();
+ 	struct rq *rq = cpu_rq(cpu);
+diff --git a/kernel/time/timer.c b/kernel/time/timer.c
+index 63a8ce7177dd..e15e24105891 100644
+--- a/kernel/time/timer.c
++++ b/kernel/time/timer.c
+@@ -2073,7 +2073,7 @@ void update_process_times(int user_tick)
+ 	if (in_irq())
+ 		irq_work_tick();
+ #endif
+-	scheduler_tick();
++	scheduler_tick(user_tick);
+ 	if (IS_ENABLED(CONFIG_POSIX_TIMERS))
+ 		run_posix_cpu_timers();
+ }
+-- 
+2.39.2
+
+From 736249a61b243746519f78008913237317180313 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:46 -0800
+Subject: [PATCH] sched/core: Update the IPC class of the current task
+
+When supported, hardware monitors the instruction stream to classify the
+current task. Hence, at userspace tick, we are ready to read the most
+recent classification result for the current task.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/core.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index b438fc79f868..0ab39cc055c7 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -5562,6 +5562,9 @@ void scheduler_tick(bool user_tick)
+ 	if (housekeeping_cpu(cpu, HK_TYPE_TICK))
+ 		arch_scale_freq_tick();
+ 
++	if (sched_ipcc_enabled() && user_tick)
++		arch_update_ipcc(curr);
++
+ 	sched_clock_tick();
+ 
+ 	rq_lock(rq, &rf);
+-- 
+2.39.2
+
+From e466ceec97170f0038327d9402d1a7287bdfda01 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:47 -0800
+Subject: [PATCH] sched/fair: Collect load-balancing stats for IPC classes
+
+When selecting a busiest scheduling group, the IPC class of the current
+task can be used to select between two scheduling groups of types
+asym_packing or fully_busy that are otherwise identical.
+
+Compute the IPC class performance score for a scheduling group. It
+is the sum of the scores of the current tasks of all the runqueues.
+
+Also, keep track of the class of the task with the lowest IPC class score
+in the scheduling group.
+
+These two metrics will be used during idle load balancing to compute the
+current and the prospective IPC class score of a scheduling group.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/fair.c | 61 +++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 61 insertions(+)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index e5079ee882ff..a418164953c3 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -8767,6 +8767,11 @@ struct sg_lb_stats {
+ 	unsigned int nr_numa_running;
+ 	unsigned int nr_preferred_running;
+ #endif
++#ifdef CONFIG_IPC_CLASSES
++	unsigned long min_score; /* Min(score(rq->curr->ipcc)) */
++	unsigned short min_ipcc; /* Class of the task with the minimum IPCC score in the rq */
++	unsigned long sum_score; /* Sum(score(rq->curr->ipcc)) */
++#endif
+ };
+ 
+ /*
+@@ -9110,6 +9115,59 @@ group_type group_classify(unsigned int imbalance_pct,
+ 	return group_has_spare;
+ }
+ 
++#ifdef CONFIG_IPC_CLASSES
++static void init_rq_ipcc_stats(struct sg_lb_stats *sgs)
++{
++	/* All IPCC stats have been set to zero in update_sg_lb_stats(). */
++	sgs->min_score = ULONG_MAX;
++}
++
++/* Called only if cpu_of(@rq) is not idle and has tasks running. */
++static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
++				    struct rq *rq)
++{
++	struct task_struct *curr;
++	unsigned short ipcc;
++	unsigned long score;
++
++	if (!sched_ipcc_enabled())
++		return;
++
++	curr = rcu_dereference(rq->curr);
++	if (!curr || (curr->flags & PF_EXITING) || is_idle_task(curr) ||
++	    task_is_realtime(curr) ||
++	    !cpumask_test_cpu(dst_cpu, curr->cpus_ptr))
++		return;
++
++	ipcc = curr->ipcc;
++	score = arch_get_ipcc_score(ipcc, cpu_of(rq));
++
++	/*
++	 * Ignore tasks with invalid scores. When finding the busiest group, we
++	 * prefer those with higher sum_score. This group will not be selected.
++	 */
++	if (IS_ERR_VALUE(score))
++		return;
++
++	sgs->sum_score += score;
++
++	if (score < sgs->min_score) {
++		sgs->min_score = score;
++		sgs->min_ipcc = ipcc;
++	}
++}
++
++#else /* CONFIG_IPC_CLASSES */
++static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
++				    struct rq *rq)
++{
++}
++
++static void init_rq_ipcc_stats(struct sg_lb_stats *sgs)
++{
++}
++#endif /* CONFIG_IPC_CLASSES */
++
+ /**
+  * asym_smt_can_pull_tasks - Check whether the load balancing CPU can pull tasks
+  * @dst_cpu:	Destination CPU of the load balancing
+@@ -9202,6 +9260,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
+ 	int i, nr_running, local_group;
+ 
+ 	memset(sgs, 0, sizeof(*sgs));
++	init_rq_ipcc_stats(sgs);
+ 
+ 	local_group = group == sds->local;
+ 
+@@ -9251,6 +9310,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
+ 			if (sgs->group_misfit_task_load < load)
+ 				sgs->group_misfit_task_load = load;
+ 		}
++
++		update_sg_lb_ipcc_stats(env->dst_cpu, sgs, rq);
+ 	}
+ 
+ 	sgs->group_capacity = group->sgc->capacity;
+-- 
+2.39.2
+
+From 493a3d6568c0ae6aa677dbcaa4f623b03a5feae0 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:48 -0800
+Subject: [PATCH] sched/fair: Compute IPC class scores for load balancing
+
+Compute the joint total (both current and prospective) IPC class score of
+a scheduling group and the local scheduling group.
+
+These IPCC statistics are used during idle load balancing. The candidate
+scheduling group will have one fewer busy CPU after load balancing. This
+observation is important for cores with SMT support.
+
+The IPCC score of scheduling groups composed of SMT siblings needs to
+consider that the siblings share CPU resources. When computing the total
+IPCC score of the scheduling group, divide the score of each sibling by the
+number of busy siblings.
+
+Collect IPCC statistics for asym_packing and fully_busy scheduling groups.
+When picking a busiest group, they are used to break ties between otherwise
+identical groups.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/fair.c | 68 +++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 68 insertions(+)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index a418164953c3..ae0c908be707 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -8771,6 +8771,8 @@ struct sg_lb_stats {
+ 	unsigned long min_score; /* Min(score(rq->curr->ipcc)) */
+ 	unsigned short min_ipcc; /* Class of the task with the minimum IPCC score in the rq */
+ 	unsigned long sum_score; /* Sum(score(rq->curr->ipcc)) */
++	long ipcc_score_after; /* Prospective IPCC score after load balancing */
++	unsigned long ipcc_score_before; /* IPCC score before load balancing */
+ #endif
+ };
+ 
+@@ -9157,6 +9159,62 @@ static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
+ 	}
+ }
+ 
++static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs,
++				      struct sched_group *sg,
++				      struct lb_env *env)
++{
++	unsigned long score_on_dst_cpu, before;
++	int busy_cpus;
++	long after;
++
++	if (!sched_ipcc_enabled())
++		return;
++
++	/*
++	 * IPCC scores are only useful during idle load balancing. For now,
++	 * only asym_packing uses IPCC scores.
++	 */
++	if (!(env->sd->flags & SD_ASYM_PACKING) ||
++	    env->idle == CPU_NOT_IDLE)
++		return;
++
++	/*
++	 * IPCC scores are used to break ties only between these types of
++	 * groups.
++	 */
++	if (sgs->group_type != group_fully_busy &&
++	    sgs->group_type != group_asym_packing)
++		return;
++
++	busy_cpus = sgs->group_weight - sgs->idle_cpus;
++
++	/* No busy CPUs in the group. No tasks to move. */
++	if (!busy_cpus)
++		return;
++
++	score_on_dst_cpu = arch_get_ipcc_score(sgs->min_ipcc, env->dst_cpu);
++
++	/*
++	 * Do not use IPC scores. sgs::ipcc_score_{after, before} will be zero
++	 * and not used.
++	 */
++	if (IS_ERR_VALUE(score_on_dst_cpu))
++		return;
++
++	before = sgs->sum_score;
++	after = before - sgs->min_score;
++
++	/* SMT siblings share throughput. */
++	if (busy_cpus > 1 && sg->flags & SD_SHARE_CPUCAPACITY) {
++		before /= busy_cpus;
++		/* One sibling will become idle after load balance. */
++		after /= busy_cpus - 1;
++	}
++
++	sgs->ipcc_score_after = after + score_on_dst_cpu;
++	sgs->ipcc_score_before = before;
++}
++
+ #else /* CONFIG_IPC_CLASSES */
+ static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
+ 				    struct rq *rq)
+@@ -9166,6 +9224,13 @@ static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
+ static void init_rq_ipcc_stats(struct sg_lb_stats *sgs)
+ {
+ }
++
++static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs,
++				      struct sched_group *sg,
++				      struct lb_env *env)
++{
++}
++
+ #endif /* CONFIG_IPC_CLASSES */
+ 
+ /**
+@@ -9327,6 +9392,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
+ 
+ 	sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs);
+ 
++	if (!local_group)
++		update_sg_lb_stats_scores(sgs, group, env);
++
+ 	/* Computing avg_load makes sense only when group is overloaded */
+ 	if (sgs->group_type == group_overloaded)
+ 		sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) /
+-- 
+2.39.2
+
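A worked example of the prospective score computed above (numbers are
illustrative): take an SMT candidate group with two busy siblings,
sum_score = 1200, min_score = 500, and a score of 800 for that class on the
destination CPU. The siblings share throughput, so ipcc_score_before =
1200 / 2 = 600; after the balance one sibling becomes idle, so the remaining
score is (1200 - 500) / (2 - 1) = 700 and ipcc_score_after = 700 + 800 = 1500.
A group with a higher ipcc_score_after is preferred as busiest in the next
patch.
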
+From e93c0032e04663397da64d2fb501ddc3de9c961d Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:49 -0800
+Subject: [PATCH] sched/fair: Use IPCC stats to break ties between asym_packing
+ sched groups
+
+As it iterates, update_sd_pick_busiest() keeps on selecting as busiest
+sched groups of identical priority. Since both groups have the same
+priority, either group is a good choice. The IPCC statistics provide a
+measure of the throughput before and after load balance. Use them to
+pick a busiest scheduling group from otherwise identical asym_packing
+scheduling groups.
+
+Pick as busiest the scheduling group that yields a higher IPCC score
+after load balancing.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/fair.c | 72 +++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 72 insertions(+)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index ae0c908be707..cffb435e2b1c 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -9215,6 +9215,60 @@ static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs,
+ 	sgs->ipcc_score_before = before;
+ }
+ 
++/**
++ * sched_asym_ipcc_prefer - Select a sched group based on its IPCC score
++ * @a:	Load balancing statistics of a sched group
++ * @b:	Load balancing statistics of a second sched group
++ *
++ * Returns: true if @a has a higher IPCC score than @b after load balance.
++ * False otherwise.
++ */
++static bool sched_asym_ipcc_prefer(struct sg_lb_stats *a,
++				   struct sg_lb_stats *b)
++{
++	if (!sched_ipcc_enabled())
++		return false;
++
++	/* @a increases overall throughput after load balance. */
++	if (a->ipcc_score_after > b->ipcc_score_after)
++		return true;
++
++	/*
++	 * If @a and @b yield the same overall throughput, pick @a if
++	 * its current throughput is lower than that of @b.
++	 */
++	if (a->ipcc_score_after == b->ipcc_score_after)
++		return a->ipcc_score_before < b->ipcc_score_before;
++
++	return false;
++}
++
++/**
++ * sched_asym_ipcc_pick - Select a sched group based on its IPCC score
++ * @a:		A scheduling group
++ * @b:		A second scheduling group
++ * @a_stats:	Load balancing statistics of @a
++ * @b_stats:	Load balancing statistics of @b
++ *
++ * Returns: true if @a has the same priority and @a has tasks with IPC classes
++ * that yield higher overall throughput after load balance. False otherwise.
++ */
++static bool sched_asym_ipcc_pick(struct sched_group *a,
++				 struct sched_group *b,
++				 struct sg_lb_stats *a_stats,
++				 struct sg_lb_stats *b_stats)
++{
++	/*
++	 * Only use the class-specific preference selection if both sched
++	 * groups have the same priority.
++	 */
++	if (arch_asym_cpu_priority(a->asym_prefer_cpu) !=
++	    arch_asym_cpu_priority(b->asym_prefer_cpu))
++		return false;
++
++	return sched_asym_ipcc_prefer(a_stats, b_stats);
++}
++
+ #else /* CONFIG_IPC_CLASSES */
+ static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
+ 				    struct rq *rq)
+@@ -9231,6 +9285,14 @@ static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs,
+ {
+ }
+ 
++static bool sched_asym_ipcc_pick(struct sched_group *a,
++				 struct sched_group *b,
++				 struct sg_lb_stats *a_stats,
++				 struct sg_lb_stats *b_stats)
++{
++	return false;
++}
++
+ #endif /* CONFIG_IPC_CLASSES */
+ 
+ /**
+@@ -9466,6 +9528,16 @@ static bool update_sd_pick_busiest(struct lb_env *env,
+ 		/* Prefer to move from lowest priority CPU's work */
+ 		if (sched_asym_prefer(sg->asym_prefer_cpu, sds->busiest->asym_prefer_cpu))
+ 			return false;
++
++		/*
++		 * Unlike other callers of sched_asym_prefer(), here both @sg
++		 * and @sds::busiest have tasks running. When they have equal
++		 * priority, their IPC class scores can be used to select a
++		 * better busiest.
++		 */
++		if (sched_asym_ipcc_pick(sds->busiest, sg, &sds->busiest_stat, sgs))
++			return false;
++
+ 		break;
+ 
+ 	case group_misfit_task:
+-- 
+2.39.2
+
+From 6e3ab209c9551934abd38dedffa499ee7d7902d0 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:50 -0800
+Subject: [PATCH] sched/fair: Use IPCC stats to break ties between fully_busy
+ SMT groups
+
+IPCC statistics are used during idle load balancing. After balancing, one
+of the siblings of an SMT core will become idle. The rest of the busy
+siblings will enjoy increased throughput. The IPCC statistics provide
+a measure of the increased throughput. Use them to pick a busiest group
+from otherwise identical fully_busy scheduling groups (whose avg_load is
+equal - and zero).
+
+Using IPCC scores to break ties with non-SMT fully_busy sched groups
+is not necessary. SMT sched groups always need more help.
+
+Add a stub sched_asym_ipcc_prefer() for !CONFIG_IPC_CLASSES.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/fair.c | 23 ++++++++++++++++++++---
+ 1 file changed, 20 insertions(+), 3 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index cffb435e2b1c..0996339df429 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -9285,6 +9285,12 @@ static void update_sg_lb_stats_scores(struct sg_lb_stats *sgs,
+ {
+ }
+ 
++static bool sched_asym_ipcc_prefer(struct sg_lb_stats *a,
++				   struct sg_lb_stats *b)
++{
++	return false;
++}
++
+ static bool sched_asym_ipcc_pick(struct sched_group *a,
+ 				 struct sched_group *b,
+ 				 struct sg_lb_stats *a_stats,
+@@ -9568,10 +9574,21 @@ static bool update_sd_pick_busiest(struct lb_env *env,
+ 		if (sgs->avg_load == busiest->avg_load) {
+ 			/*
+ 			 * SMT sched groups need more help than non-SMT groups.
+-			 * If @sg happens to also be SMT, either choice is good.
+ 			 */
+-			if (sds->busiest->flags & SD_SHARE_CPUCAPACITY)
+-				return false;
++			if (sds->busiest->flags & SD_SHARE_CPUCAPACITY) {
++				if (!(sg->flags & SD_SHARE_CPUCAPACITY))
++					return false;
++
++				/*
++				 * Between two SMT groups, use IPCC scores to pick the
++				 * one that would improve throughput the most (only
++				 * asym_packing uses IPCC scores for now).
++				 */
++				if (sched_ipcc_enabled() &&
++				    env->sd->flags & SD_ASYM_PACKING &&
++				    sched_asym_ipcc_prefer(busiest, sgs))
++					return false;
++			}
+ 		}
+ 
+ 		break;
+-- 
+2.39.2
+
+From a293954b9b5f0b273e5acd5cbfa0ba0d70d9c139 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:51 -0800
+Subject: [PATCH] sched/fair: Use IPCC scores to select a busiest runqueue
+
+For two runqueues of equal priority and an equal number of running tasks,
+select the one whose current task would have the highest IPC class score
+if placed on the destination CPU.
+
+For now, use IPCC scores only for scheduling domains with the
+SD_ASYM_PACKING flag.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/fair.c | 64 +++++++++++++++++++++++++++++++++++++++++++++
+ 1 file changed, 64 insertions(+)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index 0996339df429..a9a105092e7c 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -9269,6 +9269,37 @@ static bool sched_asym_ipcc_pick(struct sched_group *a,
+ 	return sched_asym_ipcc_prefer(a_stats, b_stats);
+ }
+ 
++/**
++ * ipcc_score_delta - Get the IPCC score delta wrt the load balance's dst_cpu
++ * @p:		A task
++ * @env:	Load balancing environment
++ *
++ * Returns: The IPCC score delta that @p would get if placed in the destination
++ * CPU of @env. LONG_MIN to indicate that the delta should not be used.
++ */
++static long ipcc_score_delta(struct task_struct *p, struct lb_env *env)
++{
++	unsigned long score_src, score_dst;
++	unsigned short ipcc = p->ipcc;
++
++	if (!sched_ipcc_enabled())
++		return LONG_MIN;
++
++	/* Only asym_packing uses IPCC scores at the moment. */
++	if (!(env->sd->flags & SD_ASYM_PACKING))
++		return LONG_MIN;
++
++	score_dst = arch_get_ipcc_score(ipcc, env->dst_cpu);
++	if (IS_ERR_VALUE(score_dst))
++		return LONG_MIN;
++
++	score_src = arch_get_ipcc_score(ipcc, task_cpu(p));
++	if (IS_ERR_VALUE(score_src))
++		return LONG_MIN;
++
++	return score_dst - score_src;
++}
++
+ #else /* CONFIG_IPC_CLASSES */
+ static void update_sg_lb_ipcc_stats(int dst_cpu, struct sg_lb_stats *sgs,
+ 				    struct rq *rq)
+@@ -9299,6 +9330,11 @@ static bool sched_asym_ipcc_pick(struct sched_group *a,
+ 	return false;
+ }
+ 
++static long ipcc_score_delta(struct task_struct *p, struct lb_env *env)
++{
++	return LONG_MIN;
++}
++
+ #endif /* CONFIG_IPC_CLASSES */
+ 
+ /**
+@@ -10459,6 +10495,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
+ {
+ 	struct rq *busiest = NULL, *rq;
+ 	unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1;
++	long busiest_ipcc_delta = LONG_MIN;
+ 	unsigned int busiest_nr = 0;
+ 	int i;
+ 
+@@ -10575,8 +10612,35 @@ static struct rq *find_busiest_queue(struct lb_env *env,
+ 
+ 		case migrate_task:
+ 			if (busiest_nr < nr_running) {
++				struct task_struct *curr;
++
+ 				busiest_nr = nr_running;
+ 				busiest = rq;
++
++				/*
++				 * Remember the IPCC score delta of busiest::curr.
++				 * We may need it to break a tie with other queues
++				 * with equal nr_running.
++				 */
++				curr = rcu_dereference(busiest->curr);
++				busiest_ipcc_delta = ipcc_score_delta(curr, env);
++			/*
++			 * If rq and busiest have the same number of running
++			 * tasks and IPC classes are supported, pick rq if doing
++			 * so would give rq::curr a bigger IPC boost on dst_cpu.
++			 */
++			} else if (busiest_nr == nr_running) {
++				struct task_struct *curr;
++				long delta;
++
++				curr = rcu_dereference(rq->curr);
++				delta = ipcc_score_delta(curr, env);
++
++				if (busiest_ipcc_delta < delta) {
++					busiest_ipcc_delta = delta;
++					busiest_nr = nr_running;
++					busiest = rq;
++				}
+ 			}
+ 			break;
+ 
+-- 
+2.39.2
+
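The migrate_task tie-break added to find_busiest_queue() above boils down to keeping the runqueue whose current task gains the most from moving to dst_cpu. A small userspace model with an invented score table (the kernel obtains these values from arch_get_ipcc_score()):

#include <stdio.h>
#include <limits.h>

/* One entry per candidate runqueue; all of them have the same nr_running. */
struct rq_model {
	int cpu;
	long score_src;		/* IPCC score of rq->curr on its own CPU */
	long score_dst;		/* IPCC score of rq->curr on dst_cpu */
};

int main(void)
{
	struct rq_model rqs[] = {
		{ .cpu = 2, .score_src = 180, .score_dst = 200 },
		{ .cpu = 4, .score_src = 150, .score_dst = 240 },
		{ .cpu = 6, .score_src = 210, .score_dst = 210 },
	};
	long best_delta = LONG_MIN;
	int busiest = -1;

	for (int i = 0; i < 3; i++) {
		long delta = rqs[i].score_dst - rqs[i].score_src;

		/* Same nr_running everywhere: break the tie on the IPCC delta. */
		if (delta > best_delta) {
			best_delta = delta;
			busiest = rqs[i].cpu;
		}
	}

	printf("picked CPU %d (delta %ld)\n", busiest, best_delta);	/* CPU 4, 90 */
	return 0;
}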
+From 8c517b81e0894d90b440d862bc1704259a94cf46 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:52 -0800
+Subject: [PATCH] thermal: intel: hfi: Introduce Intel Thread Director classes
+
+On Intel hybrid parts, each type of CPU has specific performance and
+energy efficiency capabilities. The Intel Thread Director technology
+extends the Hardware Feedback Interface (HFI) to provide performance and
+energy efficiency data for advanced classes of instructions.
+
+Add support to parse per-class capabilities.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ drivers/thermal/intel/intel_hfi.c | 30 ++++++++++++++++++++++++------
+ 1 file changed, 24 insertions(+), 6 deletions(-)
+
+diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
+index 6e604bda2b93..2527ae3836c7 100644
+--- a/drivers/thermal/intel/intel_hfi.c
++++ b/drivers/thermal/intel/intel_hfi.c
+@@ -77,7 +77,7 @@ union cpuid6_edx {
+  * @ee_cap:		Energy efficiency capability
+  *
+  * Capabilities of a logical processor in the HFI table. These capabilities are
+- * unitless.
++ * unitless and specific to each HFI class.
+  */
+ struct hfi_cpu_data {
+ 	u8	perf_cap;
+@@ -89,7 +89,8 @@ struct hfi_cpu_data {
+  * @perf_updated:	Hardware updated performance capabilities
+  * @ee_updated:		Hardware updated energy efficiency capabilities
+  *
+- * Properties of the data in an HFI table.
++ * Properties of the data in an HFI table. There exists one header per each
++ * HFI class.
+  */
+ struct hfi_hdr {
+ 	u8	perf_updated;
+@@ -127,16 +128,21 @@ struct hfi_instance {
+ 
+ /**
+  * struct hfi_features - Supported HFI features
++ * @nr_classes:		Number of classes supported
+  * @nr_table_pages:	Size of the HFI table in 4KB pages
+  * @cpu_stride:		Stride size to locate the capability data of a logical
+  *			processor within the table (i.e., row stride)
++ * @class_stride:	Stride size to locate a class within the capability
++ *			data of a logical processor or the HFI table header
+  * @hdr_size:		Size of the table header
+  *
+  * Parameters and supported features that are common to all HFI instances
+  */
+ struct hfi_features {
++	unsigned int	nr_classes;
+ 	size_t		nr_table_pages;
+ 	unsigned int	cpu_stride;
++	unsigned int	class_stride;
+ 	unsigned int	hdr_size;
+ };
+ 
+@@ -333,8 +339,8 @@ static void init_hfi_cpu_index(struct hfi_cpu_info *info)
+ }
+ 
+ /*
+- * The format of the HFI table depends on the number of capabilities that the
+- * hardware supports. Keep a data structure to navigate the table.
++ * The format of the HFI table depends on the number of capabilities and classes
++ * that the hardware supports. Keep a data structure to navigate the table.
+  */
+ static void init_hfi_instance(struct hfi_instance *hfi_instance)
+ {
+@@ -515,18 +521,30 @@ static __init int hfi_parse_features(void)
+ 	/* The number of 4KB pages required by the table */
+ 	hfi_features.nr_table_pages = edx.split.table_pages + 1;
+ 
++	/*
++	 * Capability fields of an HFI class are grouped together. Classes are
++	 * contiguous in memory.  Hence, use the number of supported features to
++	 * locate a specific class.
++	 */
++	hfi_features.class_stride = nr_capabilities;
++
++	/* For now, use only one class of the HFI table */
++	hfi_features.nr_classes = 1;
++
+ 	/*
+ 	 * The header contains change indications for each supported feature.
+ 	 * The size of the table header is rounded up to be a multiple of 8
+ 	 * bytes.
+ 	 */
+-	hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities, 8) * 8;
++	hfi_features.hdr_size = DIV_ROUND_UP(nr_capabilities *
++					     hfi_features.nr_classes, 8) * 8;
+ 
+ 	/*
+ 	 * Data of each logical processor is also rounded up to be a multiple
+ 	 * of 8 bytes.
+ 	 */
+-	hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities, 8) * 8;
++	hfi_features.cpu_stride = DIV_ROUND_UP(nr_capabilities *
++					       hfi_features.nr_classes, 8) * 8;
+ 
+ 	return 0;
+ }
+-- 
+2.39.2
+
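The stride arithmetic introduced above (class_stride, hdr_size, cpu_stride) is easiest to see with concrete numbers. A standalone sketch assuming a hypothetical table with 2 capability fields per class and 4 classes (the real counts come from CPUID leaf 6):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned int nr_capabilities = 2;	/* e.g. perf_cap + ee_cap (assumed) */
	unsigned int nr_classes = 4;		/* assumed ITD class count */

	/* Capability fields of one class are contiguous; classes follow each other. */
	unsigned int class_stride = nr_capabilities;
	/* Header and per-CPU rows are rounded up to a multiple of 8 bytes. */
	unsigned int hdr_size = DIV_ROUND_UP(nr_capabilities * nr_classes, 8) * 8;
	unsigned int cpu_stride = DIV_ROUND_UP(nr_capabilities * nr_classes, 8) * 8;

	printf("class_stride = %u\n", class_stride);		/* 2 */
	printf("hdr_size     = %u bytes\n", hdr_size);		/* 8 */
	printf("cpu_stride   = %u bytes\n", cpu_stride);	/* 8 */
	return 0;
}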
+From 258fdd38eadf1a4b1cff687dcc99a834ca97095f Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:53 -0800
+Subject: [PATCH] x86/cpufeatures: Add the Intel Thread Director feature
+ definitions
+
+Intel Thread Director (ITD) provides hardware resources to classify
+the current task. The classification reflects the type of instructions that
+a task currently executes.
+
+ITD extends the Hardware Feedback Interface table to provide performance
+and energy efficiency capabilities for each of the supported classes of
+tasks.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ arch/x86/include/asm/cpufeatures.h       | 1 +
+ arch/x86/include/asm/disabled-features.h | 8 +++++++-
+ arch/x86/kernel/cpu/cpuid-deps.c         | 1 +
+ 3 files changed, 9 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
+index 8f39c46197b8..a2f2730737ae 100644
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -345,6 +345,7 @@
+ #define X86_FEATURE_HWP_EPP		(14*32+10) /* HWP Energy Perf. Preference */
+ #define X86_FEATURE_HWP_PKG_REQ		(14*32+11) /* HWP Package Level Request */
+ #define X86_FEATURE_HFI			(14*32+19) /* Hardware Feedback Interface */
++#define X86_FEATURE_ITD			(14*32+23) /* Intel Thread Director */
+ 
+ /* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
+ #define X86_FEATURE_NPT			(15*32+ 0) /* Nested Page Table support */
+diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
+index c44b56f7ffba..0edd9bef7f2e 100644
+--- a/arch/x86/include/asm/disabled-features.h
++++ b/arch/x86/include/asm/disabled-features.h
+@@ -99,6 +99,12 @@
+ # define DISABLE_TDX_GUEST	(1 << (X86_FEATURE_TDX_GUEST & 31))
+ #endif
+ 
++#ifdef CONFIG_IPC_CLASSES
++# define DISABLE_ITD	0
++#else
++# define DISABLE_ITD	(1 << (X86_FEATURE_ITD & 31))
++#endif
++
+ /*
+  * Make sure to add features to the correct mask
+  */
+@@ -117,7 +123,7 @@
+ 			 DISABLE_CALL_DEPTH_TRACKING)
+ #define DISABLED_MASK12	0
+ #define DISABLED_MASK13	0
+-#define DISABLED_MASK14	0
++#define DISABLED_MASK14	(DISABLE_ITD)
+ #define DISABLED_MASK15	0
+ #define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
+ 			 DISABLE_ENQCMD)
+diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
+index d95221117129..277f157e067e 100644
+--- a/arch/x86/kernel/cpu/cpuid-deps.c
++++ b/arch/x86/kernel/cpu/cpuid-deps.c
+@@ -79,6 +79,7 @@ static const struct cpuid_dep cpuid_deps[] = {
+ 	{ X86_FEATURE_XFD,			X86_FEATURE_XSAVES    },
+ 	{ X86_FEATURE_XFD,			X86_FEATURE_XGETBV1   },
+ 	{ X86_FEATURE_AMX_TILE,			X86_FEATURE_XFD       },
++	{ X86_FEATURE_ITD,			X86_FEATURE_HFI       },
+ 	{}
+ };
+ 
+-- 
+2.39.2
+
+From b2c8d8d2cf45125c1b3be140385979a1cadcc4ca Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:54 -0800
+Subject: [PATCH] thermal: intel: hfi: Store per-CPU IPCC scores
+
+The scheduler reads the IPCC scores when balancing load. These reads can
+be quite frequent. Hardware can also update the HFI table frequently.
+Concurrent access may cause a lot of lock contention. It gets worse as the
+number of CPUs increases.
+
+Instead, create separate per-CPU IPCC scores that the scheduler can read
+without the HFI table lock.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Suggested-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ drivers/thermal/intel/intel_hfi.c | 46 +++++++++++++++++++++++++++++++
+ 1 file changed, 46 insertions(+)
+
+diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
+index 2527ae3836c7..b06021828892 100644
+--- a/drivers/thermal/intel/intel_hfi.c
++++ b/drivers/thermal/intel/intel_hfi.c
+@@ -29,6 +29,7 @@
+ #include <linux/kernel.h>
+ #include <linux/math.h>
+ #include <linux/mutex.h>
++#include <linux/percpu.h>
+ #include <linux/percpu-defs.h>
+ #include <linux/printk.h>
+ #include <linux/processor.h>
+@@ -170,6 +171,43 @@ static struct workqueue_struct *hfi_updates_wq;
+ #define HFI_UPDATE_INTERVAL		HZ
+ #define HFI_MAX_THERM_NOTIFY_COUNT	16
+ 
++#ifdef CONFIG_IPC_CLASSES
++static int __percpu *hfi_ipcc_scores;
++
++static int alloc_hfi_ipcc_scores(void)
++{
++	if (!cpu_feature_enabled(X86_FEATURE_ITD))
++		return 0;
++
++	hfi_ipcc_scores = __alloc_percpu(sizeof(*hfi_ipcc_scores) *
++					 hfi_features.nr_classes,
++					 sizeof(*hfi_ipcc_scores));
++
++	return !hfi_ipcc_scores;
++}
++
++static void set_hfi_ipcc_score(void *caps, int cpu)
++{
++	int i, *hfi_class;
++
++	if (!cpu_feature_enabled(X86_FEATURE_ITD))
++		return;
++
++	hfi_class = per_cpu_ptr(hfi_ipcc_scores, cpu);
++
++	for (i = 0;  i < hfi_features.nr_classes; i++) {
++		struct hfi_cpu_data *class_caps;
++
++		class_caps = caps + i * hfi_features.class_stride;
++		WRITE_ONCE(hfi_class[i], class_caps->perf_cap);
++	}
++}
++
++#else
++static int alloc_hfi_ipcc_scores(void) { return 0; }
++static void set_hfi_ipcc_score(void *caps, int cpu) { }
++#endif /* CONFIG_IPC_CLASSES */
++
+ static void get_hfi_caps(struct hfi_instance *hfi_instance,
+ 			 struct thermal_genl_cpu_caps *cpu_caps)
+ {
+@@ -192,6 +230,8 @@ static void get_hfi_caps(struct hfi_instance *hfi_instance,
+ 		cpu_caps[i].efficiency = caps->ee_cap << 2;
+ 
+ 		++i;
++
++		set_hfi_ipcc_score(caps, cpu);
+ 	}
+ 	raw_spin_unlock_irq(&hfi_instance->table_lock);
+ }
+@@ -580,8 +620,14 @@ void __init intel_hfi_init(void)
+ 	if (!hfi_updates_wq)
+ 		goto err_nomem;
+ 
++	if (alloc_hfi_ipcc_scores())
++		goto err_ipcc;
++
+ 	return;
+ 
++err_ipcc:
++	destroy_workqueue(hfi_updates_wq);
++
+ err_nomem:
+ 	for (j = 0; j < i; ++j) {
+ 		hfi_instance = &hfi_instances[j];
+-- 
+2.39.2
+
+From 55930531b4e99582a7b9969e810178c0317f196a Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:55 -0800
+Subject: [PATCH] thermal: intel: hfi: Update the IPC class of the current task
+
+Use Intel Thread Director classification to update the IPC class of a
+task. Implement the arch_update_ipcc() interface of the scheduler.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ arch/x86/include/asm/topology.h   |  6 ++++++
+ drivers/thermal/intel/intel_hfi.c | 32 +++++++++++++++++++++++++++++++
+ 2 files changed, 38 insertions(+)
+
+diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
+index 458c891a8273..ffcdac3f398f 100644
+--- a/arch/x86/include/asm/topology.h
++++ b/arch/x86/include/asm/topology.h
+@@ -227,4 +227,10 @@ void init_freq_invariance_cppc(void);
+ #define arch_init_invariance_cppc init_freq_invariance_cppc
+ #endif
+ 
++#if defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL)
++void intel_hfi_update_ipcc(struct task_struct *curr);
++
++#define arch_update_ipcc intel_hfi_update_ipcc
++#endif /* defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL) */
++
+ #endif /* _ASM_X86_TOPOLOGY_H */
+diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
+index b06021828892..530dcf57e06e 100644
+--- a/drivers/thermal/intel/intel_hfi.c
++++ b/drivers/thermal/intel/intel_hfi.c
+@@ -72,6 +72,17 @@ union cpuid6_edx {
+ 	u32 full;
+ };
+ 
++#ifdef CONFIG_IPC_CLASSES
++union hfi_thread_feedback_char_msr {
++	struct {
++		u64	classid : 8;
++		u64	__reserved : 55;
++		u64	valid : 1;
++	} split;
++	u64 full;
++};
++#endif
++
+ /**
+  * struct hfi_cpu_data - HFI capabilities per CPU
+  * @perf_cap:		Performance capability
+@@ -174,6 +185,27 @@ static struct workqueue_struct *hfi_updates_wq;
+ #ifdef CONFIG_IPC_CLASSES
+ static int __percpu *hfi_ipcc_scores;
+ 
++void intel_hfi_update_ipcc(struct task_struct *curr)
++{
++	union hfi_thread_feedback_char_msr msr;
++
++	/* We should not be here if ITD is not supported. */
++	if (!cpu_feature_enabled(X86_FEATURE_ITD)) {
++		pr_warn_once("task classification requested but not supported!");
++		return;
++	}
++
++	rdmsrl(MSR_IA32_HW_FEEDBACK_CHAR, msr.full);
++	if (!msr.split.valid)
++		return;
++
++	/*
++	 * 0 is a valid classification for Intel Thread Director. A scheduler
++	 * IPCC class of 0 means that the task is unclassified. Adjust.
++	 */
++	curr->ipcc = msr.split.classid + 1;
++}
++
+ static int alloc_hfi_ipcc_scores(void)
+ {
+ 	if (!cpu_feature_enabled(X86_FEATURE_ITD))
+-- 
+2.39.2
+
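The classid-to-IPCC adjustment in intel_hfi_update_ipcc() above (hardware class N becomes scheduler class N + 1, since 0 means unclassified) can be checked with the same bit-field layout. A sketch that substitutes a made-up constant for the rdmsrl() read:

#include <stdio.h>
#include <stdint.h>

/* Mirrors union hfi_thread_feedback_char_msr from the patch. */
union thread_feedback_char {
	struct {
		uint64_t classid    : 8;
		uint64_t __reserved : 55;
		uint64_t valid      : 1;
	} split;
	uint64_t full;
};

int main(void)
{
	/* Pretend MSR_IA32_HW_FEEDBACK_CHAR read back valid=1, classid=2. */
	union thread_feedback_char msr = { .full = (1ULL << 63) | 2 };

	if (!msr.split.valid) {
		puts("classification not valid; task class left unchanged");
		return 0;
	}

	/* HFI classes 0..N-1 map to scheduler IPCC 1..N. */
	unsigned short ipcc = (unsigned short)(msr.split.classid + 1);

	printf("hardware class %u -> scheduler ipcc %u\n",
	       (unsigned int)msr.split.classid, ipcc);	/* 2 -> 3 */
	return 0;
}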
+From 3ace3fa2778cce8d16caec8e828145b4dc7f2532 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:56 -0800
+Subject: [PATCH] thermal: intel: hfi: Report the IPC class score of a CPU
+
+Implement the arch_get_ipcc_score() interface of the scheduler. Use the
+performance capabilities of the extended Hardware Feedback Interface table
+as the IPC score.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ arch/x86/include/asm/topology.h   |  2 ++
+ drivers/thermal/intel/intel_hfi.c | 27 +++++++++++++++++++++++++++
+ 2 files changed, 29 insertions(+)
+
+diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
+index ffcdac3f398f..c4fcd9c3c634 100644
+--- a/arch/x86/include/asm/topology.h
++++ b/arch/x86/include/asm/topology.h
+@@ -229,8 +229,10 @@ void init_freq_invariance_cppc(void);
+ 
+ #if defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL)
+ void intel_hfi_update_ipcc(struct task_struct *curr);
++unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu);
+ 
+ #define arch_update_ipcc intel_hfi_update_ipcc
++#define arch_get_ipcc_score intel_hfi_get_ipcc_score
+ #endif /* defined(CONFIG_IPC_CLASSES) && defined(CONFIG_INTEL_HFI_THERMAL) */
+ 
+ #endif /* _ASM_X86_TOPOLOGY_H */
+diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
+index 530dcf57e06e..fa9b4a678d92 100644
+--- a/drivers/thermal/intel/intel_hfi.c
++++ b/drivers/thermal/intel/intel_hfi.c
+@@ -206,6 +206,33 @@ void intel_hfi_update_ipcc(struct task_struct *curr)
+ 	curr->ipcc = msr.split.classid + 1;
+ }
+ 
++unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu)
++{
++	unsigned short hfi_class;
++	int *scores;
++
++	if (cpu < 0 || cpu >= nr_cpu_ids)
++		return -EINVAL;
++
++	if (ipcc == IPC_CLASS_UNCLASSIFIED)
++		return -EINVAL;
++
++	/*
++	 * Scheduler IPC classes start at 1. HFI classes start at 0.
++	 * See note intel_hfi_update_ipcc().
++	 */
++	hfi_class = ipcc - 1;
++
++	if (hfi_class >= hfi_features.nr_classes)
++		return -EINVAL;
++
++	scores = per_cpu_ptr(hfi_ipcc_scores, cpu);
++	if (!scores)
++		return -ENODEV;
++
++	return READ_ONCE(scores[hfi_class]);
++}
++
+ static int alloc_hfi_ipcc_scores(void)
+ {
+ 	if (!cpu_feature_enabled(X86_FEATURE_ITD))
+-- 
+2.39.2
+
+From 7637b8a5d201d49ef56d31f22af30531d0193538 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:57 -0800
+Subject: [PATCH] thermal: intel: hfi: Define a default class for unclassified
+ tasks
+
+A task may be unclassified if it has been recently created, spend most of
+its lifetime sleeping, or hardware has not provided a classification.
+
+Most tasks will be eventually classified as scheduler's IPC class 1
+(HFI class 0). This class corresponds to the capabilities in the legacy,
+classless, HFI table.
+
+IPC class 1 is a reasonable choice until hardware provides an actual
+classification. Meanwhile, the scheduler will place classes of tasks with
+higher IPC scores on higher-performance CPUs.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ drivers/thermal/intel/intel_hfi.c | 15 ++++++++++++++-
+ 1 file changed, 14 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
+index fa9b4a678d92..7ea6acce7107 100644
+--- a/drivers/thermal/intel/intel_hfi.c
++++ b/drivers/thermal/intel/intel_hfi.c
+@@ -185,6 +185,19 @@ static struct workqueue_struct *hfi_updates_wq;
+ #ifdef CONFIG_IPC_CLASSES
+ static int __percpu *hfi_ipcc_scores;
+ 
++/*
++ * A task may be unclassified if it has been recently created, spends most of
++ * its lifetime sleeping, or hardware has not provided a classification.
++ *
++ * Most tasks will be classified as scheduler's IPC class 1 (HFI class 0)
++ * eventually. Meanwhile, the scheduler will place classes of tasks with higher
++ * IPC scores on higher-performance CPUs.
++ *
++ * IPC class 1 is a reasonable choice. It matches the performance capability
++ * of the legacy, classless, HFI table.
++ */
++#define HFI_UNCLASSIFIED_DEFAULT 1
++
+ void intel_hfi_update_ipcc(struct task_struct *curr)
+ {
+ 	union hfi_thread_feedback_char_msr msr;
+@@ -215,7 +228,7 @@ unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu)
+ 		return -EINVAL;
+ 
+ 	if (ipcc == IPC_CLASS_UNCLASSIFIED)
+-		return -EINVAL;
++		ipcc = HFI_UNCLASSIFIED_DEFAULT;
+ 
+ 	/*
+ 	 * Scheduler IPC classes start at 1. HFI classes start at 0.
+-- 
+2.39.2
+
+From 9ddcae3ee191e5e27247d7ea9456d768919ac21f Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:58 -0800
+Subject: [PATCH] thermal: intel: hfi: Enable the Intel Thread Director
+
+Enable Intel Thread Director from the CPU hotplug callback: globally from
+CPU0 and then enable the thread-classification hardware in each logical
+processor individually.
+
+Also, initialize the number of classes supported.
+
+Let the scheduler know that it can start using IPC classes.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ arch/x86/include/asm/msr-index.h  |  2 ++
+ drivers/thermal/intel/intel_hfi.c | 40 +++++++++++++++++++++++++++++--
+ 2 files changed, 40 insertions(+), 2 deletions(-)
+
+diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
+index d3fe82c5d6b6..d83437d3473d 100644
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -1095,6 +1095,8 @@
+ /* Hardware Feedback Interface */
+ #define MSR_IA32_HW_FEEDBACK_PTR        0x17d0
+ #define MSR_IA32_HW_FEEDBACK_CONFIG     0x17d1
++#define MSR_IA32_HW_FEEDBACK_THREAD_CONFIG 0x17d4
++#define MSR_IA32_HW_FEEDBACK_CHAR	0x17d2
+ 
+ /* x2APIC locked status */
+ #define MSR_IA32_XAPIC_DISABLE_STATUS	0xBD
+diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
+index 7ea6acce7107..35d947f47550 100644
+--- a/drivers/thermal/intel/intel_hfi.c
++++ b/drivers/thermal/intel/intel_hfi.c
+@@ -48,6 +48,8 @@
+ /* Hardware Feedback Interface MSR configuration bits */
+ #define HW_FEEDBACK_PTR_VALID_BIT		BIT(0)
+ #define HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT	BIT(0)
++#define HW_FEEDBACK_CONFIG_ITD_ENABLE_BIT	BIT(1)
++#define HW_FEEDBACK_THREAD_CONFIG_ENABLE_BIT	BIT(0)
+ 
+ /* CPUID detection and enumeration definitions for HFI */
+ 
+@@ -72,6 +74,15 @@ union cpuid6_edx {
+ 	u32 full;
+ };
+ 
++union cpuid6_ecx {
++	struct {
++		u32	dont_care0:8;
++		u32	nr_classes:8;
++		u32	dont_care1:16;
++	} split;
++	u32 full;
++};
++
+ #ifdef CONFIG_IPC_CLASSES
+ union hfi_thread_feedback_char_msr {
+ 	struct {
+@@ -506,6 +517,11 @@ void intel_hfi_online(unsigned int cpu)
+ 
+ 	init_hfi_cpu_index(info);
+ 
++	if (cpu_feature_enabled(X86_FEATURE_ITD)) {
++		msr_val = HW_FEEDBACK_THREAD_CONFIG_ENABLE_BIT;
++		wrmsrl(MSR_IA32_HW_FEEDBACK_THREAD_CONFIG, msr_val);
++	}
++
+ 	/*
+ 	 * Now check if the HFI instance of the package/die of @cpu has been
+ 	 * initialized (by checking its header). In such case, all we have to
+@@ -561,8 +577,22 @@ void intel_hfi_online(unsigned int cpu)
+ 	 */
+ 	rdmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val);
+ 	msr_val |= HW_FEEDBACK_CONFIG_HFI_ENABLE_BIT;
++
++	if (cpu_feature_enabled(X86_FEATURE_ITD))
++		msr_val |= HW_FEEDBACK_CONFIG_ITD_ENABLE_BIT;
++
+ 	wrmsrl(MSR_IA32_HW_FEEDBACK_CONFIG, msr_val);
+ 
++	/*
++	 * We have all we need to support IPC classes. Task classification is
++	 * now working.
++	 *
++	 * All class scores are zero until after the first HFI update. That is
++	 * OK. The scheduler queries these scores at every load balance.
++	 */
++	if (cpu_feature_enabled(X86_FEATURE_ITD))
++		sched_enable_ipc_classes();
++
+ unlock:
+ 	mutex_unlock(&hfi_instance_lock);
+ 	return;
+@@ -640,8 +670,14 @@ static __init int hfi_parse_features(void)
+ 	 */
+ 	hfi_features.class_stride = nr_capabilities;
+ 
+-	/* For now, use only one class of the HFI table */
+-	hfi_features.nr_classes = 1;
++	if (cpu_feature_enabled(X86_FEATURE_ITD)) {
++		union cpuid6_ecx ecx;
++
++		ecx.full = cpuid_ecx(CPUID_HFI_LEAF);
++		hfi_features.nr_classes = ecx.split.nr_classes;
++	} else {
++		hfi_features.nr_classes = 1;
++	}
+ 
+ 	/*
+ 	 * The header contains change indications for each supported feature.
+-- 
+2.39.2
+
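The union cpuid6_ecx added above pulls the ITD class count out of bits 15:8 of CPUID(6).ECX; the enable path then sets HW_FEEDBACK_THREAD_CONFIG_ENABLE_BIT per CPU and the ITD bit in the package-level config MSR. A tiny decode of a hypothetical raw ECX value:

#include <stdio.h>
#include <stdint.h>

/* Same layout as union cpuid6_ecx in the patch. */
union cpuid6_ecx {
	struct {
		uint32_t dont_care0 : 8;
		uint32_t nr_classes : 8;
		uint32_t dont_care1 : 16;
	} split;
	uint32_t full;
};

int main(void)
{
	union cpuid6_ecx ecx = { .full = 0x00000401 };	/* invented CPUID value */

	printf("nr_classes = %u\n", ecx.split.nr_classes);	/* bits 15:8 -> 4 */
	return 0;
}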
+From aeb2e2fb157001cdd6c10d261fe006c8aa22bf06 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:10:59 -0800
+Subject: [PATCH] sched/task_struct: Add helpers for IPC classification
+
+The unprocessed classification that hardware provides for a task may not
+be usable by the scheduler: the classification may change too frequently or
+architectures may want to consider extra factors. For instance, some
+processors with Intel Thread Director need to consider the state of the SMT
+siblings of a core.
+
+Provide per-task helper variables that architectures can use to post-
+process the classification that hardware provides.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ include/linux/sched.h | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 4f96c3dd59d0..582e14cf3f76 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1529,7 +1529,17 @@ struct task_struct {
+ 	 * A hardware-defined classification of task that reflects but is
+ 	 * not identical to the number of instructions per cycle.
+ 	 */
+-	unsigned short			ipcc;
++	unsigned int			ipcc : 9;
++	/*
++	 * A candidate classification that arch-specific implementations
++	 * qualify for correctness.
++	 */
++	unsigned int			ipcc_tmp : 9;
++	/*
++	 * Counter to filter out transient candidate classifications
++	 * of a task.
++	 */
++	unsigned int			ipcc_cntr : 14;
+ #endif
+ 
+ 	/*
+-- 
+2.39.2
+
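The patch above converts ipcc into a 9-bit field and adds two helpers; the three bit-fields (9 + 9 + 14 bits) pack into a single 32-bit word. A quick standalone check of the packing and value ranges (only the widths are taken from the patch):

#include <stdio.h>

/* Same widths as the task_struct fields above. */
struct ipcc_fields {
	unsigned int ipcc      : 9;	/* committed IPC class */
	unsigned int ipcc_tmp  : 9;	/* candidate class awaiting debounce */
	unsigned int ipcc_cntr : 14;	/* debounce counter */
};

int main(void)
{
	printf("sizeof(struct ipcc_fields) = %zu bytes\n",
	       sizeof(struct ipcc_fields));			/* 4 */
	printf("max ipcc / ipcc_tmp = %u\n", (1u << 9) - 1);	/* 511 */
	printf("max ipcc_cntr       = %u\n", (1u << 14) - 1);	/* 16383 */
	return 0;
}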
+From fd936723a40205d2b47336596468dba9c59a4287 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:11:00 -0800
+Subject: [PATCH] sched/core: Initialize helpers of task classification
+
+Just as tasks start life unclassified, initialize the classification
+auxiliary variables.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ kernel/sched/core.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/kernel/sched/core.c b/kernel/sched/core.c
+index 0ab39cc055c7..2a942fc3c309 100644
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -4426,6 +4426,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
+ 	p->se.vruntime			= 0;
+ #ifdef CONFIG_IPC_CLASSES
+ 	p->ipcc				= IPC_CLASS_UNCLASSIFIED;
++	p->ipcc_tmp			= IPC_CLASS_UNCLASSIFIED;
++	p->ipcc_cntr			= 0;
+ #endif
+ 	INIT_LIST_HEAD(&p->se.group_node);
+ 
+-- 
+2.39.2
+
+From b98db691b522d6b2ed0dc1bd17e77165b7531ba9 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:11:01 -0800
+Subject: [PATCH] sched/fair: Introduce sched_smt_siblings_idle()
+
+X86 needs to know the idle state of the SMT siblings of a CPU to improve
+the accuracy of IPCC classification. X86 implements support for IPC classes
+in the thermal HFI driver.
+
+Rename is_core_idle() as sched_smt_siblings_idle() and make it available
+outside the scheduler code.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ include/linux/sched.h |  2 ++
+ kernel/sched/fair.c   | 21 +++++++++++++++------
+ 2 files changed, 17 insertions(+), 6 deletions(-)
+
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 582e14cf3f76..f2adf662eda8 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -2440,4 +2440,6 @@ static inline void sched_core_fork(struct task_struct *p) { }
+ 
+ extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+ 
++extern bool sched_smt_siblings_idle(int cpu);
++
+ #endif
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index a9a105092e7c..97c574d5fa57 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -1064,7 +1064,14 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
+  * Scheduling class queueing methods:
+  */
+ 
+-static inline bool is_core_idle(int cpu)
++/**
++ * sched_smt_siblings_idle - Check whether SMT siblings of a CPU are idle
++ * @cpu:	The CPU to check
++ *
++ * Returns true if all the SMT siblings of @cpu are idle or @cpu does not have
++ * SMT siblings. The idle state of @cpu is not considered.
++ */
++bool sched_smt_siblings_idle(int cpu)
+ {
+ #ifdef CONFIG_SCHED_SMT
+ 	int sibling;
+@@ -1767,7 +1774,7 @@ static inline int numa_idle_core(int idle_core, int cpu)
+ 	 * Prefer cores instead of packing HT siblings
+ 	 * and triggering future load balancing.
+ 	 */
+-	if (is_core_idle(cpu))
++	if (sched_smt_siblings_idle(cpu))
+ 		idle_core = cpu;
+ 
+ 	return idle_core;
+@@ -9388,7 +9395,8 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds,  struct sg_lb_stats *sgs
+ 	 * If the destination CPU has SMT siblings, env->idle != CPU_NOT_IDLE
+ 	 * is not sufficient. We need to make sure the whole core is idle.
+ 	 */
+-	if (sds->local->flags & SD_SHARE_CPUCAPACITY && !is_core_idle(env->dst_cpu))
++	if (sds->local->flags & SD_SHARE_CPUCAPACITY &&
++	    !sched_smt_siblings_idle(env->dst_cpu))
+ 		return false;
+ 
+ 	/* Only do SMT checks if either local or candidate have SMT siblings. */
+@@ -10557,7 +10565,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
+ 		    sched_asym_prefer(i, env->dst_cpu) &&
+ 		    nr_running == 1) {
+ 			if (env->sd->flags & SD_SHARE_CPUCAPACITY ||
+-			    (!(env->sd->flags & SD_SHARE_CPUCAPACITY) && is_core_idle(i)))
++			    (!(env->sd->flags & SD_SHARE_CPUCAPACITY) &&
++			     sched_smt_siblings_idle(i)))
+ 				continue;
+ 		}
+ 
+@@ -10686,7 +10695,7 @@ asym_active_balance(struct lb_env *env)
+ 		 * busy sibling.
+ 		 */
+ 		return sched_asym_prefer(env->dst_cpu, env->src_cpu) ||
+-		       !is_core_idle(env->src_cpu);
++		       !sched_smt_siblings_idle(env->src_cpu);
+ 	}
+ 
+ 	return false;
+@@ -11433,7 +11442,7 @@ static void nohz_balancer_kick(struct rq *rq)
+ 				 */
+ 				if (sd->flags & SD_SHARE_CPUCAPACITY ||
+ 				    (!(sd->flags & SD_SHARE_CPUCAPACITY) &&
+-				     is_core_idle(i))) {
++				     sched_smt_siblings_idle(i))) {
+ 					flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK;
+ 					goto unlock;
+ 				}
+-- 
+2.39.2
+
+From 7acc78f51465e7ea2b876136a1d99632f3f4ec46 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:11:02 -0800
+Subject: [PATCH] thermal: intel: hfi: Implement model-specific checks for task
+ classification
+
+In Alder Lake and Raptor Lake, the result of thread classification is more
+accurate when only one SMT sibling is busy. Classification results for
+classes 2 and 3 are always reliable.
+
+To avoid unnecessary migrations, only update the class of a task if it has
+been the same for 4 consecutive user ticks.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ drivers/thermal/intel/intel_hfi.c | 60 ++++++++++++++++++++++++++++++-
+ 1 file changed, 59 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/thermal/intel/intel_hfi.c b/drivers/thermal/intel/intel_hfi.c
+index 35d947f47550..fdb53e4cabc1 100644
+--- a/drivers/thermal/intel/intel_hfi.c
++++ b/drivers/thermal/intel/intel_hfi.c
+@@ -40,6 +40,7 @@
+ #include <linux/workqueue.h>
+ 
+ #include <asm/msr.h>
++#include <asm/intel-family.h>
+ 
+ #include "../thermal_core.h"
+ #include "intel_hfi.h"
+@@ -209,9 +210,64 @@ static int __percpu *hfi_ipcc_scores;
+  */
+ #define HFI_UNCLASSIFIED_DEFAULT 1
+ 
++#define CLASS_DEBOUNCER_SKIPS 4
++
++/**
++ * debounce_and_update_class() - Process and update a task's classification
++ *
++ * @p:		The task of which the classification will be updated
++ * @new_ipcc:	The new IPC classification
++ *
++ * Update the classification of @p with the new value that hardware provides.
++ * Only update the classification of @p if it has been the same during
++ * CLASS_DEBOUNCER_SKIPS consecutive ticks.
++ */
++static void debounce_and_update_class(struct task_struct *p, u8 new_ipcc)
++{
++	u16 debounce_skip;
++
++	/* The class of @p changed. Only restart the debounce counter. */
++	if (p->ipcc_tmp != new_ipcc) {
++		p->ipcc_cntr = 1;
++		goto out;
++	}
++
++	/*
++	 * The class of @p did not change. Update it if it has been the same
++	 * for CLASS_DEBOUNCER_SKIPS user ticks.
++	 */
++	debounce_skip = p->ipcc_cntr + 1;
++	if (debounce_skip < CLASS_DEBOUNCER_SKIPS)
++		p->ipcc_cntr++;
++	else
++		p->ipcc = new_ipcc;
++
++out:
++	p->ipcc_tmp = new_ipcc;
++}
++
++static bool classification_is_accurate(u8 hfi_class, bool smt_siblings_idle)
++{
++	switch (boot_cpu_data.x86_model) {
++	case INTEL_FAM6_ALDERLAKE:
++	case INTEL_FAM6_ALDERLAKE_L:
++	case INTEL_FAM6_RAPTORLAKE:
++	case INTEL_FAM6_RAPTORLAKE_P:
++	case INTEL_FAM6_RAPTORLAKE_S:
++		if (hfi_class == 3 || hfi_class == 2 || smt_siblings_idle)
++			return true;
++
++		return false;
++
++	default:
++		return true;
++	}
++}
++
+ void intel_hfi_update_ipcc(struct task_struct *curr)
+ {
+ 	union hfi_thread_feedback_char_msr msr;
++	bool idle;
+ 
+ 	/* We should not be here if ITD is not supported. */
+ 	if (!cpu_feature_enabled(X86_FEATURE_ITD)) {
+@@ -227,7 +283,9 @@ void intel_hfi_update_ipcc(struct task_struct *curr)
+ 	 * 0 is a valid classification for Intel Thread Director. A scheduler
+ 	 * IPCC class of 0 means that the task is unclassified. Adjust.
+ 	 */
+-	curr->ipcc = msr.split.classid + 1;
++	idle = sched_smt_siblings_idle(task_cpu(curr));
++	if (classification_is_accurate(msr.split.classid, idle))
++		debounce_and_update_class(curr, msr.split.classid + 1);
+ }
+ 
+ unsigned long intel_hfi_get_ipcc_score(unsigned short ipcc, int cpu)
+-- 
+2.39.2
+
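debounce_and_update_class() above only commits a new class once hardware has reported it for CLASS_DEBOUNCER_SKIPS consecutive ticks. The toy trace below replays the same logic on an invented sequence of classifications to show when the committed class actually flips:

#include <stdio.h>

#define CLASS_DEBOUNCER_SKIPS 4

struct task_model {
	unsigned short ipcc;		/* committed class */
	unsigned short ipcc_tmp;	/* last candidate class */
	unsigned short ipcc_cntr;	/* consecutive sightings of ipcc_tmp */
};

/* Same control flow as debounce_and_update_class() in the patch. */
static void debounce(struct task_model *p, unsigned char new_ipcc)
{
	if (p->ipcc_tmp != new_ipcc)
		p->ipcc_cntr = 1;			/* class changed: restart */
	else if (p->ipcc_cntr + 1 < CLASS_DEBOUNCER_SKIPS)
		p->ipcc_cntr++;				/* same class: keep counting */
	else
		p->ipcc = new_ipcc;			/* stable long enough: commit */

	p->ipcc_tmp = new_ipcc;
}

int main(void)
{
	struct task_model p = { .ipcc = 1, .ipcc_tmp = 1, .ipcc_cntr = 0 };
	unsigned char hw[] = { 3, 3, 2, 3, 3, 3, 3 };	/* invented classifications */

	for (unsigned int i = 0; i < sizeof(hw); i++) {
		debounce(&p, hw[i]);
		printf("tick %u: hw=%d committed=%d\n", i, hw[i], p.ipcc);
	}
	/* The committed class only becomes 3 on the 4th consecutive sighting. */
	return 0;
}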
+From a7d1ce079429314c7c2c287a0de5930a90134bb4 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:11:03 -0800
+Subject: [PATCH] x86/cpufeatures: Add feature bit for HRESET
+
+The HRESET instruction prevents the classification of the current task
+from influencing the classification of the next task when running serially
+on the same logical processor.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ arch/x86/include/asm/cpufeatures.h | 1 +
+ arch/x86/include/asm/msr-index.h   | 4 +++-
+ arch/x86/kernel/cpu/scattered.c    | 1 +
+ 3 files changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
+index a2f2730737ae..0a64e6bc67b1 100644
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -307,6 +307,7 @@
+ #define X86_FEATURE_SGX_EDECCSSA	(11*32+18) /* "" SGX EDECCSSA user leaf function */
+ #define X86_FEATURE_CALL_DEPTH		(11*32+19) /* "" Call depth tracking for RSB stuffing */
+ #define X86_FEATURE_MSR_TSX_CTRL	(11*32+20) /* "" MSR IA32_TSX_CTRL (Intel) implemented */
++#define X86_FEATURE_HRESET		(11*32+23) /* Hardware history reset instruction */
+ 
+ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+ #define X86_FEATURE_AVX_VNNI		(12*32+ 4) /* AVX VNNI instructions */
+diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
+index d83437d3473d..ce8b78d77588 100644
+--- a/arch/x86/include/asm/msr-index.h
++++ b/arch/x86/include/asm/msr-index.h
+@@ -1098,6 +1098,9 @@
+ #define MSR_IA32_HW_FEEDBACK_THREAD_CONFIG 0x17d4
+ #define MSR_IA32_HW_FEEDBACK_CHAR	0x17d2
+ 
++/* Hardware History Reset  */
++#define MSR_IA32_HW_HRESET_ENABLE	0x17da
++
+ /* x2APIC locked status */
+ #define MSR_IA32_XAPIC_DISABLE_STATUS	0xBD
+ #define LEGACY_XAPIC_DISABLED		BIT(0) /*
+@@ -1105,5 +1108,4 @@
+ 						* disabling x2APIC will cause
+ 						* a #GP
+ 						*/
+-
+ #endif /* _ASM_X86_MSR_INDEX_H */
+diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
+index f53944fb8f7f..66bc5713644d 100644
+--- a/arch/x86/kernel/cpu/scattered.c
++++ b/arch/x86/kernel/cpu/scattered.c
+@@ -28,6 +28,7 @@ static const struct cpuid_bit cpuid_bits[] = {
+ 	{ X86_FEATURE_EPB,		CPUID_ECX,  3, 0x00000006, 0 },
+ 	{ X86_FEATURE_INTEL_PPIN,	CPUID_EBX,  0, 0x00000007, 1 },
+ 	{ X86_FEATURE_RRSBA_CTRL,	CPUID_EDX,  2, 0x00000007, 2 },
++	{ X86_FEATURE_HRESET,		CPUID_EAX, 22, 0x00000007, 1 },
+ 	{ X86_FEATURE_CQM_LLC,		CPUID_EDX,  1, 0x0000000f, 0 },
+ 	{ X86_FEATURE_CQM_OCCUP_LLC,	CPUID_EDX,  0, 0x0000000f, 1 },
+ 	{ X86_FEATURE_CQM_MBM_TOTAL,	CPUID_EDX,  1, 0x0000000f, 1 },
+-- 
+2.39.2
+
+From 8ee8e3c510cb4a284738d65df270e9d8ddbfc67f Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:11:04 -0800
+Subject: [PATCH] x86/hreset: Configure history reset
+
+Configure the MSR that controls the behavior of HRESET on each logical
+processor.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ arch/x86/kernel/cpu/common.c | 23 ++++++++++++++++++++++-
+ 1 file changed, 22 insertions(+), 1 deletion(-)
+
+diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
+index f3cc7699e1e1..a2de5736099e 100644
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -412,6 +412,26 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c)
+ 	cr4_clear_bits(X86_CR4_UMIP);
+ }
+ 
++static u32 hardware_history_features __ro_after_init;
++
++static __always_inline void setup_hreset(struct cpuinfo_x86 *c)
++{
++	if (!cpu_feature_enabled(X86_FEATURE_HRESET))
++		return;
++
++	/*
++	 * Use on all CPUs the hardware history features that the boot
++	 * CPU supports.
++	 */
++	if (c == &boot_cpu_data)
++		hardware_history_features = cpuid_ebx(0x20);
++
++	if (!hardware_history_features)
++		return;
++
++	wrmsrl(MSR_IA32_HW_HRESET_ENABLE, hardware_history_features);
++}
++
+ /* These bits should not change their value after CPU init is finished. */
+ static const unsigned long cr4_pinned_mask =
+ 	X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP |
+@@ -1849,10 +1869,11 @@ static void identify_cpu(struct cpuinfo_x86 *c)
+ 	/* Disable the PN if appropriate */
+ 	squash_the_stupid_serial_number(c);
+ 
+-	/* Set up SMEP/SMAP/UMIP */
++	/* Set up SMEP/SMAP/UMIP/HRESET */
+ 	setup_smep(c);
+ 	setup_smap(c);
+ 	setup_umip(c);
++	setup_hreset(c);
+ 
+ 	/* Enable FSGSBASE instructions if available. */
+ 	if (cpu_has(c, X86_FEATURE_FSGSBASE)) {
+-- 
+2.39.2
+
+From 4a25b2ad89edfc72bf3f3d1b2cc96916a229ac60 Mon Sep 17 00:00:00 2001
+From: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Date: Mon, 6 Feb 2023 21:11:05 -0800
+Subject: [PATCH] x86/process: Reset hardware history in context switch
+
+Reset the classification history of the current task when switching to the
+next task. Hardware will start the classification of the next task from
+scratch.
+
+Cc: Ben Segall <bsegall@google.com>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Ionela Voinescu <ionela.voinescu@arm.com>
+Cc: Joel Fernandes (Google) <joel@joelfernandes.org>
+Cc: Len Brown <len.brown@intel.com>
+Cc: Lukasz Luba <lukasz.luba@arm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Cc: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Tim C. Chen <tim.c.chen@intel.com>
+Cc: Valentin Schneider <vschneid@redhat.com>
+Cc: x86@kernel.org
+Cc: linux-pm@vger.kernel.org
+Cc: linux-kernel@vger.kernel.org
+Signed-off-by: Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+Patchset: intel-thread-director
+---
+ arch/x86/include/asm/hreset.h | 30 ++++++++++++++++++++++++++++++
+ arch/x86/kernel/cpu/common.c  |  7 +++++++
+ arch/x86/kernel/process_32.c  |  3 +++
+ arch/x86/kernel/process_64.c  |  3 +++
+ 4 files changed, 43 insertions(+)
+ create mode 100644 arch/x86/include/asm/hreset.h
+
+diff --git a/arch/x86/include/asm/hreset.h b/arch/x86/include/asm/hreset.h
+new file mode 100644
+index 000000000000..d68ca2fb8642
+--- /dev/null
++++ b/arch/x86/include/asm/hreset.h
+@@ -0,0 +1,30 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++#ifndef _ASM_X86_HRESET_H
++
++/**
++ * HRESET - History reset. Available since binutils v2.36.
++ *
++ * Request the processor to reset the history of task classification on the
++ * current logical processor. The history components to be
++ * reset are specified in %eax. Only bits specified in CPUID(0x20).EBX
++ * and enabled in the IA32_HRESET_ENABLE MSR can be selected.
++ *
++ * The assembly code looks like:
++ *
++ *	hreset %eax
++ *
++ * The corresponding machine code looks like:
++ *
++ *	F3 0F 3A F0 ModRM Imm
++ *
++ * The value of ModRM is 0xc0 to specify %eax register addressing.
++ * The ignored immediate operand is set to 0.
++ *
++ * The instruction is documented in the Intel SDM.
++ */
++
++#define __ASM_HRESET  ".byte 0xf3, 0xf, 0x3a, 0xf0, 0xc0, 0x0"
++
++void reset_hardware_history(void);
++
++#endif /* _ASM_X86_HRESET_H */
+diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
+index a2de5736099e..2aaf2320b149 100644
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -53,6 +53,7 @@
+ #include <asm/mce.h>
+ #include <asm/msr.h>
+ #include <asm/cacheinfo.h>
++#include <asm/hreset.h>
+ #include <asm/memtype.h>
+ #include <asm/microcode.h>
+ #include <asm/microcode_intel.h>
+@@ -414,6 +415,12 @@ static __always_inline void setup_umip(struct cpuinfo_x86 *c)
+ 
+ static u32 hardware_history_features __ro_after_init;
+ 
++void reset_hardware_history(void)
++{
++	asm_inline volatile (ALTERNATIVE("", __ASM_HRESET, X86_FEATURE_HRESET)
++			     : : "a" (hardware_history_features) : "memory");
++}
++
+ static __always_inline void setup_hreset(struct cpuinfo_x86 *c)
+ {
+ 	if (!cpu_feature_enabled(X86_FEATURE_HRESET))
+diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
+index 470c128759ea..397a6e6f4e61 100644
+--- a/arch/x86/kernel/process_32.c
++++ b/arch/x86/kernel/process_32.c
+@@ -52,6 +52,7 @@
+ #include <asm/switch_to.h>
+ #include <asm/vm86.h>
+ #include <asm/resctrl.h>
++#include <asm/hreset.h>
+ #include <asm/proto.h>
+ 
+ #include "process.h"
+@@ -214,6 +215,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+ 	/* Load the Intel cache allocation PQR MSR. */
+ 	resctrl_sched_in();
+ 
++	reset_hardware_history();
++
+ 	return prev_p;
+ }
+ 
+diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
+index 4e34b3b68ebd..6176044ecc16 100644
+--- a/arch/x86/kernel/process_64.c
++++ b/arch/x86/kernel/process_64.c
+@@ -53,6 +53,7 @@
+ #include <asm/xen/hypervisor.h>
+ #include <asm/vdso.h>
+ #include <asm/resctrl.h>
++#include <asm/hreset.h>
+ #include <asm/unistd.h>
+ #include <asm/fsgsbase.h>
+ #ifdef CONFIG_IA32_EMULATION
+@@ -658,6 +659,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
+ 	/* Load the Intel cache allocation PQR MSR. */
+ 	resctrl_sched_in();
+ 
++	reset_hardware_history();
++
+ 	return prev_p;
+ }
+ 
+-- 
+2.39.2
+
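For reference, the opcode bytes hard-coded in __ASM_HRESET above encode `hreset %eax` exactly as the header comment describes (F3 0F 3A F0, ModRM 0xC0 for %eax, ignored imm8 0). A trivial sketch that only prints the byte sequence the ALTERNATIVE patches in; it does not execute the instruction:

#include <stdio.h>

int main(void)
{
	/* Byte-for-byte the string behind __ASM_HRESET in the patch. */
	unsigned char hreset_eax[] = { 0xf3, 0x0f, 0x3a, 0xf0, 0xc0, 0x00 };

	for (unsigned int i = 0; i < sizeof(hreset_eax); i++)
		printf("%02x ", hreset_eax[i]);
	printf("\n");
	return 0;
}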