参考4.9,主要了解几个结构成员:

struct cpufreq_policy {
/* CPUs sharing clock, require sw coordination */
cpumask_var_t cpus; /* Online CPUs only */
cpumask_var_t related_cpus; /* Online + Offline CPUs */
cpumask_var_t real_cpus; /* Related and present */
...
unsigned int policy; /* see above */
unsigned int last_policy; /* policy before unplug */
struct cpufreq_governor *governor; /* see below */
void *governor_data;
char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */

offline有两种情况:逻辑offline和物理offline(for NUMA arch)。related_cpus里的offline CPU这两种情况都包括,而real_cpus只包含related_cpus中物理存在(present)的CPU。

看下文档描述:

SMP systems normally have same clock source for a group of cpus. For these the
.init() would be called only once for the first online cpu. Here the .init()
routine must initialize policy->cpus with mask of all possible cpus (Online +
Offline) that share the clock. Then the core would copy this mask onto
policy->related_cpus and will reset policy->cpus to carry only online cpus.

driver会初始化->cpus,core会copy到->related_cpus上,比如高通平台:

static struct cpufreq_driver msm_cpufreq_driver = {
...
.init = msm_cpufreq_init,

static int msm_cpufreq_init(struct cpufreq_policy *policy)
{
...
/*
* In some SoC, some cores are clocked by same source, and their
* frequencies can not be changed independently. Find all other
* CPUs that share same clock, and mark them as controlled by
* same policy.
*/
for_each_possible_cpu(cpu)
if (cpu_clk[cpu] == cpu_clk[policy->cpu])
cpumask_set_cpu(cpu, policy->cpus);

共用同一个clock的cpu core都会被set进policy->cpus,所以它们共用同一个调频策略(policy)。

static int cpufreq_online(unsigned int cpu)
{
...
if (new_policy) {
/* related_cpus should at least include policy->cpus. */
cpumask_copy(policy->related_cpus, policy->cpus);
}

CPU online(新建policy)时会把->cpus copy到->related_cpus;CPU offline时会把该CPU从->cpus里clear掉。

static int cpufreq_offline(unsigned int cpu)
{
struct cpufreq_policy *policy;
int ret;

pr_debug("%s: unregistering CPU %u\n", __func__, cpu);

policy = cpufreq_cpu_get_raw(cpu);
if (!policy) {
pr_debug("%s: No cpu_data found\n", __func__);
return 0;
}

down_write(&policy->rwsem);
if (has_target())
cpufreq_stop_governor(policy);

cpumask_clear_cpu(cpu, policy->cpus); //tj: here

real_cpus的引入是为了cpufreq symlink,我们在每个cpu的节点下能看到cpufreq这个symlink:

xxx:/sys/devices/system/cpu/cpu0 # ls -l
total 0
drwxr-xr-x 6 root root 0 2009-01-03 06:40 cache
lrwxrwxrwx 1 root root 0 1970-04-05 05:56 cpufreq -> ../cpufreq/policy0
xxx:/sys/devices/system/cpu/cpufreq/policy0 # ls -l
total 0
-r--r--r-- 1 root root 4096 2009-01-03 06:41 affected_cpus
-r-------- 1 root root 4096 2009-01-03 06:41 cpuinfo_cur_freq
-r--r--r-- 1 root root 4096 2019-01-30 16:43 cpuinfo_max_freq
-r--r--r-- 1 root root 4096 2009-01-03 06:41 cpuinfo_min_freq
-r--r--r-- 1 root root 4096 2009-01-03 06:41 cpuinfo_transition_latency
-r--r--r-- 1 root root 4096 2009-01-03 06:41 related_cpus
-r--r--r-- 1 root root 4096 2019-01-30 16:43 scaling_available_frequencies
-r--r--r-- 1 root root 4096 2009-01-03 06:41 scaling_available_governors
-r--r--r-- 1 root root 4096 2009-01-03 06:41 scaling_cur_freq
-r--r--r-- 1 root root 4096 2009-01-03 06:41 scaling_driver
-rw-r--r-- 1 root root 4096 2008-12-31 16:00 scaling_governor
-rw-rw-r-- 1 system system 4096 1970-04-05 05:56 scaling_max_freq
-rw-rw-r-- 1 system system 4096 2008-12-31 16:00 scaling_min_freq
-rw-r--r-- 1 root root 4096 2009-01-03 06:41 scaling_setspeed

为什么要创建policy0,然后让所有cpu的cpufreq都symlink过去?原因:

commit 96bdda61f58b70431bbe8a3e49794c8210f7691b
Author: Viresh Kumar <viresh.kumar@linaro.org>
Date: Thu Oct 15 21:35:24 2015 +0530

cpufreq: create cpu/cpufreq/policyX directories

The cpufreq sysfs interface had been a bit inconsistent as one of the
CPUs for a policy had a real directory within its sysfs 'cpuX' directory
and all other CPUs had links to it. That also made the code a bit
complex as we need to take care of moving the sysfs directory if the CPU
containing the real directory is getting physically hot-unplugged.

Solve this by creating 'policyX' directories (per-policy) in
/sys/devices/system/cpu/cpufreq/ directory, where X is the CPU for which
the policy was first created.

ok,就是为了简化代码逻辑,在policy初始化分配时创建:

static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
{
...
if (!zalloc_cpumask_var(&policy->real_cpus, GFP_KERNEL))
goto err_free_rcpumask;

ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
cpufreq_global_kobject, "policy%u", cpu);

real_cpus就是related_cpus中物理存在(present)的CPU(不论online还是offline),这个symlink仅仅存在于real_cpus里的CPU节点下:

static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu)
{
struct device *dev = get_cpu_device(cpu);

if (!dev)
return;

if (cpumask_test_and_set_cpu(cpu, policy->real_cpus)) //tj: set if not
return;

CPU online(新建policy)时,会给related_cpus里的每个CPU add link;此时related_cpus来自driver的init,里面的CPU都是物理存在的:

static int cpufreq_online(unsigned int cpu)
{
...
if (new_policy) {
policy->user_policy.min = policy->min;
policy->user_policy.max = policy->max;

for_each_cpu(j, policy->related_cpus) {
per_cpu(cpufreq_cpu_data, j) = policy;
add_cpu_dev_symlink(policy, j);
}
}

在remove这个cpu时clear这个real_cpus mask:

static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
{
unsigned int cpu = dev->id;
struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);

if (!policy)
return;

if (cpu_online(cpu))
cpufreq_offline(cpu);

cpumask_clear_cpu(cpu, policy->real_cpus);
remove_cpu_dev_symlink(policy, dev);

ok,继续看:

unsigned int		policy; /* see above */
struct cpufreq_governor *governor; /* see below */

policy是留给->setpolicy driver用的,governor是给->target || ->target_index driver用的。

/*
* If (cpufreq_driver->target) exists, the ->governor decides what frequency
* within the limits is used. If (cpufreq_driver->setpolicy> exists, these
* two generic policies are available:
*/
#define CPUFREQ_POLICY_POWERSAVE (1)
#define CPUFREQ_POLICY_PERFORMANCE (2)

解析governor string时赋值:

/**
* cpufreq_parse_governor - parse a governor string
*/
static int cpufreq_parse_governor(char *str_governor, unsigned int *policy,
struct cpufreq_governor **governor)
{
int err = -EINVAL;

if (cpufreq_driver->setpolicy) {
if (!strncasecmp(str_governor, "performance", CPUFREQ_NAME_LEN)) {
*policy = CPUFREQ_POLICY_PERFORMANCE;
err = 0;
} else if (!strncasecmp(str_governor, "powersave",
CPUFREQ_NAME_LEN)) {
*policy = CPUFREQ_POLICY_POWERSAVE;
err = 0;
}
}

一是初始化policy会set:

static int cpufreq_init_policy(struct cpufreq_policy *policy)
{
...
/* Use the default policy if there is no last_policy. */
if (cpufreq_driver->setpolicy) {
if (policy->last_policy)
new_policy.policy = policy->last_policy;
else
cpufreq_parse_governor(gov->name, &new_policy.policy,
NULL);
}

另一个是sysfs store_scaling_governor会call。

ok,而governor应该就是for ->target || ->target_index driver。可用的scaling governor:

xxx:/sys/devices/system/cpu/cpufreq/policy0 # cat scaling_available_governors
conservative ondemand userspace powersave performance schedutil

通过core提供的cpufreq_register_governor注册到core。4.9高通平台默认的schedutil (check it later)。

last_policy和last_governor是给热插拔用的:

static int cpufreq_offline(unsigned int cpu)
{
struct cpufreq_policy *policy;
int ret;

...
if (policy_is_inactive(policy)) {
if (has_target())
strncpy(policy->last_governor, policy->governor->name,
CPUFREQ_NAME_LEN);
else
policy->last_policy = policy->policy;
static inline bool policy_is_inactive(struct cpufreq_policy *policy)
{
return cpumask_empty(policy->cpus);
}

static inline bool has_target(void)
{
return cpufreq_driver->target_index || cpufreq_driver->target;
}

inactive policy就是这个策略的->cpus都下线喽。

Done.