This is based on kernel 5.x; the code is kernel/sched/cpufreq_schedutil.c. The Kconfig help text:

config CPU_FREQ_GOV_SCHEDUTIL
    bool "'schedutil' cpufreq policy governor"
    depends on CPU_FREQ && SMP
    select CPU_FREQ_GOV_ATTR_SET
    select IRQ_WORK
    help
      This governor makes decisions based on the utilization data provided
      by the scheduler. It sets the CPU frequency to be proportional to
      the utilization/capacity ratio coming from the scheduler. If the
      utilization is frequency-invariant, the new frequency is also
      proportional to the maximum available frequency. If that is not the
      case, it is proportional to the current frequency of the CPU. The
      frequency tipping point is at utilization/capacity equal to 80% in
      both cases.

In short, the governor adjusts the frequency as the CPU utilization seen by the scheduler changes. EAS only uses schedutil, because EAS also makes its decisions from util; as the documentation puts it:

   ... told to do, for example), schedutil as opposed to other CPUFreq governors at
   least requests frequencies calculated using the utilization signals.
   Consequently, the only sane governor to use together with EAS is schedutil,
   because it is the only one providing some degree of consistency between
   frequency requests and energy predictions.

Update Utilization

The entry point is cpufreq_update_util(), which is called whenever util changes; for example, from CFS:

static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
{
    struct rq *rq = rq_of(cfs_rq);

    if (&rq->cfs == cfs_rq) {
        /*
         * There are a few boundary cases this might miss but it should
         * get called often enough that that should (hopefully) not be
         * a real problem.
         *
         * It will not get called when we go idle, because the idle
         * thread is a different class (!fair), nor will the utilization
         * number include things like RT tasks.
         *
         * As is, the util number is not freq-invariant (we'd have to
         * implement arch_scale_freq_capacity() for that).
         *
         * See cpu_util().
         */
        cpufreq_update_util(rq, flags); //tj: here
    }
}

static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
{
    struct update_util_data *data;

    data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data,
                                              cpu_of(rq)));
    if (data)
        data->func(data, rq_clock(rq), flags);
}
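
For reference, the per-CPU hook is nothing more than a wrapped function pointer, as defined in include/linux/sched/cpufreq.h:

struct update_util_data {
    void (*func)(struct update_util_data *data, u64 time, unsigned int flags);
};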

The ->func here is registered in sugov_start():

    if (policy_is_shared(policy))
        uu = sugov_update_shared;
    else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf())
        uu = sugov_update_single_perf;
    else
        uu = sugov_update_single_freq;

    for_each_cpu(cpu, policy->cpus) {
        struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);

        cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu); //tj: here
    }

void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
            void (*func)(struct update_util_data *data, u64 time,
                         unsigned int flags))
{
    if (WARN_ON(!data || !func))
        return;

    if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
        return;

    data->func = func; //tj: here
    rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
}

Note that updates distinguish between two kinds of policy: shared (multiple CPUs share the policy) and single (the policy covers only one CPU). The shared case is sketched below.
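
In the shared case, the governor evaluates every CPU covered by the policy and keeps the highest util/max ratio before computing a frequency. A sketch of sugov_next_freq_shared(), lightly abridged from the 5.x sources (exact signatures vary between 5.x versions):

static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
    struct sugov_policy *sg_policy = sg_cpu->sg_policy;
    struct cpufreq_policy *policy = sg_policy->policy;
    unsigned long util = 0, max = 1;
    unsigned int j;

    for_each_cpu(j, policy->cpus) {
        struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
        unsigned long j_util, j_max;

        sugov_get_util(j_sg_cpu);
        sugov_iowait_apply(j_sg_cpu, time);
        j_util = j_sg_cpu->util;
        j_max = j_sg_cpu->max;

        /* Keep the (util, max) pair with the largest util/max ratio */
        if (j_util * max > j_max * util) {
            util = j_util;
            max = j_max;
        }
    }

    return get_next_freq(sg_policy, util, max);
}

Taking the max rather than the sum matches how a shared frequency domain works: the most demanding CPU dictates the clock for all of them.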

There are also fast and slow update paths; for example, in sugov_update_single_freq():

    if (sg_policy->policy->fast_switch_enabled) {
        sugov_fast_switch(sg_policy, time, next_f); //tj: fast path
    } else {
        raw_spin_lock(&sg_policy->update_lock);
        sugov_deferred_update(sg_policy, time, next_f); //tj: slow path
        raw_spin_unlock(&sg_policy->update_lock);
    }

sugov_fast_switch():

static void sugov_fast_switch(struct sugov_policy *sg_policy, u64 time,
                              unsigned int next_freq)
{
    if (sugov_update_next_freq(sg_policy, time, next_freq))
        cpufreq_driver_fast_switch(sg_policy->policy, next_freq);
}

The slow path is completed via a kthread work item, which calls __cpufreq_driver_target() and needs a mutex:

static void sugov_work(struct kthread_work *work)
{
    struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
    unsigned int freq;
    unsigned long flags;

    /*
     * Hold sg_policy->update_lock shortly to handle the case where:
     * in case sg_policy->next_freq is read here, and then updated by
     * sugov_deferred_update() just before work_in_progress is set to false
     * here, we may miss queueing the new update.
     *
     * Note: If a work was queued after the update_lock is released,
     * sugov_work() will just be called again by kthread_work code; and the
     * request will be processed before the sugov thread sleeps.
     */
    raw_spin_lock_irqsave(&sg_policy->update_lock, flags);
    freq = sg_policy->next_freq;
    sg_policy->work_in_progress = false;
    raw_spin_unlock_irqrestore(&sg_policy->update_lock, flags);

    mutex_lock(&sg_policy->work_lock);
    __cpufreq_driver_target(sg_policy->policy, freq, CPUFREQ_RELATION_L); //tj: here
    mutex_unlock(&sg_policy->work_lock);
}
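
The work item is queued from sugov_deferred_update(). A sketch matching the three-argument call above (the signature differs slightly across 5.x versions):

static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time,
                                  unsigned int next_freq)
{
    if (!sugov_update_next_freq(sg_policy, time, next_freq))
        return;

    /* Only one request in flight; sugov_work() clears the flag */
    if (!sg_policy->work_in_progress) {
        sg_policy->work_in_progress = true;
        irq_work_queue(&sg_policy->irq_work);
    }
}

The irq_work handler simply does kthread_queue_work(&sg_policy->worker, &sg_policy->work), waking the dedicated sugov kthread that runs sugov_work() above; the detour through irq_work is needed because the update hooks run in scheduler context, where sleeping is not allowed.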

Before the switch there is also the if (sugov_update_next_freq()) check:

static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time,
                                   unsigned int next_freq)
{
    if (sg_policy->need_freq_update)
        sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS);
    else if (sg_policy->next_freq == next_freq)
        return false;

    sg_policy->next_freq = next_freq;
    sg_policy->last_freq_update_time = time;

    return true;
}

The logic: whenever the need_freq_update flag is set, update unconditionally (true to the flag's name, right? :)). Otherwise, if the requested freq is the same as the previous one, skip the update.

need_freq_update is derived from limits_changed:

    if (unlikely(sg_policy->limits_changed)) {
        sg_policy->limits_changed = false;
        sg_policy->need_freq_update = true;

limits_changed is set in two places:

One is sugov_limits(), i.e. the policy's min/max changed. The other is ignore_dl_rate_limit():

/*
 * Make sugov_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
    if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
        sg_cpu->sg_policy->limits_changed = true;
}

If deadline tasks have increased the CPU utilization (their bandwidth grew), the rate limit is ignored and an update is forced.
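
For reference, the baseline sg_cpu->bw_dl is sampled in sugov_get_util() below, and cpu_bw_dl() converts the DL runqueue's reserved bandwidth into capacity units; from kernel/sched/sched.h in the 5.x sources:

static inline unsigned long cpu_bw_dl(struct rq *rq)
{
    return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
}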

sugov_get_util():

static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
    struct rq *rq = cpu_rq(sg_cpu->cpu);
    unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu);

    sg_cpu->max = max;
    sg_cpu->bw_dl = cpu_bw_dl(rq);
    sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max, //tj: here
                                      FREQUENCY_UTIL, NULL);
}

The current CPU utilization is obtained via effective_cpu_util(), which aggregates util across the scheduling classes. That is scheduler-side territory, so we skip the details here; a rough sketch follows.
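
Roughly, for FREQUENCY_UTIL it stacks RT utilization and the DL bandwidth on top of the CFS utilization, clamped to the CPU capacity. The following is a hypothetical, heavily simplified outline (the real effective_cpu_util() in kernel/sched/core.c also deals with IRQ time scaling, uclamp and the ENERGY_UTIL variant):

/* Hypothetical outline only; not the actual kernel function */
static unsigned long effective_cpu_util_sketch(int cpu, unsigned long util_cfs,
                                               unsigned long max)
{
    struct rq *rq = cpu_rq(cpu);
    unsigned long util;

    /* CFS and RT running time both demand frequency */
    util = util_cfs + cpu_util_rt(rq);

    /* DL contributes its reserved bandwidth, not its PELT average */
    util += cpu_bw_dl(rq);

    /* Never request more than the CPU can deliver */
    return min(util, max);
}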

Computing the Frequency

get_next_freq():

/**
 * get_next_freq - Compute a new frequency for a given cpufreq policy.
 * @sg_policy: schedutil policy object to compute the new frequency for.
 * @util: Current CPU utilization.
 * @max: CPU capacity.
 *
 * If the utilization is frequency-invariant, choose the new frequency to be
 * proportional to it, that is
 *
 * next_freq = C * max_freq * util / max
 *
 * Otherwise, approximate the would-be frequency-invariant utilization by
 * util_raw * (curr_freq / max_freq) which leads to
 *
 * next_freq = C * curr_freq * util_raw / max
 *
 * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
 *
 * The lowest driver-supported frequency which is equal or greater than the raw
 * next_freq (as calculated above) is returned, subject to policy min/max and
 * cpufreq driver limitations.
 */
static unsigned int get_next_freq(struct sugov_policy *sg_policy,
                                  unsigned long util, unsigned long max)
{
    struct cpufreq_policy *policy = sg_policy->policy;
    unsigned int freq = arch_scale_freq_invariant() ?
                            policy->cpuinfo.max_freq : policy->cur;

    freq = map_util_freq(util, freq, max);
    ...

The comment says it all: next_freq is simply 1.25 * freq * util / max, where C = 1.25 = 1 / 0.8 comes from the 80% tipping point.
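
That factor is implemented with an add-and-shift in map_util_freq() (include/linux/sched/cpufreq.h); freq + (freq >> 2) is exactly 1.25 * freq:

static inline unsigned long map_util_freq(unsigned long util,
                                          unsigned long freq, unsigned long cap)
{
    return (freq + (freq >> 2)) * util / cap;
}

So at util/max = 0.8 the raw request equals the reference frequency (1.25 * 0.8 = 1), and anything above the tipping point pushes past it.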

IO-wait boosting

If a task has recently spent time waiting on I/O, the frequency is (step by step) boosted toward the maximum, so that a stream of back-to-back I/O requests does not starve for throughput.

Two functions do the work: sugov_iowait_boost() and sugov_iowait_apply().

sugov_iowait_boost():

/**
 * sugov_iowait_boost() - Updates the IO boost status of a CPU.
 * @sg_cpu: the sugov data for the CPU to boost
 * @time: the update time from the caller
 * @flags: SCHED_CPUFREQ_IOWAIT if the task is waking up after an IO wait
 *
 * Each time a task wakes up after an IO operation, the CPU utilization can be
 * boosted to a certain utilization which doubles at each "frequent and
 * successive" wakeup from IO, ranging from IOWAIT_BOOST_MIN to the utilization
 * of the maximum OPP.
 *
 * To keep doubling, an IO boost has to be requested at least once per tick,
 * otherwise we restart from the utilization of the minimum OPP.
 */

Note the last paragraph: to keep doubling, an IO boost must be requested at least once per tick, otherwise the boost restarts from scratch.

static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
                               unsigned int flags)
{
    bool set_iowait_boost = flags & SCHED_CPUFREQ_IOWAIT;

    /* Reset boost if the CPU appears to have been idle enough */
    if (sg_cpu->iowait_boost &&
        sugov_iowait_reset(sg_cpu, time, set_iowait_boost))
        return;

If iowait_boost is already non-zero but more than one tick has passed since the last update, this run resets the boost and we start over:

static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time,
                               bool set_iowait_boost)
{
    s64 delta_ns = time - sg_cpu->last_update;

    /* Reset boost only if a tick has elapsed since last request */
    if (delta_ns <= TICK_NSEC) //tj: here, successive IO waits must arrive within one tick
        return false;

    sg_cpu->iowait_boost = set_iowait_boost ? IOWAIT_BOOST_MIN : 0;
    sg_cpu->iowait_boost_pending = set_iowait_boost;

    return true;
}

    /* Boost only tasks waking up after IO */
    if (!set_iowait_boost)
        return;

SCHED_CPUFREQ_IOWAIT is passed in by the scheduler, in enqueue_task_fair(); if the flag is absent there is nothing to boost:

    /*
     * If in_iowait is set, the code below may not trigger any cpufreq
     * utilization updates, so do it here explicitly with the IOWAIT flag
     * passed.
     */
    if (p->in_iowait)
        cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT);

Continuing in sugov_iowait_boost():

    /* Ensure boost doubles only one time at each request */
    if (sg_cpu->iowait_boost_pending)
        return;
    sg_cpu->iowait_boost_pending = true;

    /* Double the boost at each request */
    if (sg_cpu->iowait_boost) {
        sg_cpu->iowait_boost =
            min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE);
        return;
    }

Each request doubles the boost only once.

    /* First wakeup after IO: start with minimum boost */
    sg_cpu->iowait_boost = IOWAIT_BOOST_MIN;
}

The very first boost starts from IOWAIT_BOOST_MIN.
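
The consumer side is sugov_iowait_apply(): at frequency-selection time it folds the boost into the util value, and halves the boost whenever no new IO wakeup has arrived since the last update. A sketch along the lines of the 5.x sources (the signature moved around a bit between 5.x versions):

static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
{
    unsigned long boost;

    /* No boost currently required */
    if (!sg_cpu->iowait_boost)
        return;

    /* Reset boost if the CPU appears to have been idle enough */
    if (sugov_iowait_reset(sg_cpu, time, false))
        return;

    if (!sg_cpu->iowait_boost_pending) {
        /* No new IO wakeup since last time; decay the boost */
        sg_cpu->iowait_boost >>= 1;
        if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) {
            sg_cpu->iowait_boost = 0;
            return;
        }
    }

    sg_cpu->iowait_boost_pending = false;

    /*
     * sg_cpu->util is already in capacity scale; convert iowait_boost
     * into the same scale so we can compare.
     */
    boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
    sg_cpu->util = max(boost, sg_cpu->util);
}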

Rate Limit

This bounds how often governor computations run, reducing the overhead of overly frequent frequency updates. From the rate_limit_us tunable documentation:

   Minimum time (in microseconds) that has to pass between two consecutive
   runs of governor computations (default: 1000 times the scaling driver's
   transition latency).

   The purpose of this tunable is to reduce the scheduler context overhead
   of the governor which might be excessive without it.

static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
{
    ...
    delta_ns = time - sg_policy->last_freq_update_time;

    return delta_ns >= sg_policy->freq_update_delay_ns;
}

There is also the DL rate-limit bypass, ignore_dl_rate_limit():

/*
 * Make sugov_should_update_freq() ignore the rate limit when DL
 * has increased the utilization.
 */
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
{
    if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
        sg_cpu->sg_policy->limits_changed = true;
}

Now look at the call site, in the sugov update handlers:

    ignore_dl_rate_limit(sg_cpu);

    if (!sugov_should_update_freq(sg_cpu->sg_policy, time))
        return false;

And inside sugov_should_update_freq():

    if (unlikely(sg_policy->limits_changed)) {
        sg_policy->limits_changed = false;
        sg_policy->need_freq_update = true;
        return true;
    }

    delta_ns = time - sg_policy->last_freq_update_time;

    return delta_ns >= sg_policy->freq_update_delay_ns;
}

With need_freq_update set, the update is forced regardless of the rate limit.

References

  • Documentation/admin-guide/pm/cpufreq.rst (schedutil)
  • Documentation/scheduler/sched-energy.rst