参考4.9,主要了解几个结构成员:

struct cpufreq_policy {
    /* CPUs sharing clock, require sw coordination */
    cpumask_var_t        cpus;    /* Online CPUs only */
    cpumask_var_t        related_cpus; /* Online + Offline CPUs */
    cpumask_var_t        real_cpus; /* Related and present */
    ...
    unsigned int        policy; /* see above */
    unsigned int        last_policy; /* policy before unplug */
    struct cpufreq_governor    *governor; /* see below */
    void            *governor_data;
    char            last_governor[CPUFREQ_NAME_LEN]; /* last governor used */

offline有两种情况:逻辑offline和物理offline(for NUMA arch)。related_cpus里的offline就有这两个,而real_cpus必须是物理存在的所有CPU。

看下文档描述:

SMP systems normally have same clock source for a group of cpus. For these the
.init() would be called only once for the first online cpu. Here the .init()
routine must initialize policy->cpus with mask of all possible cpus (Online +
Offline) that share the clock. Then the core would copy this mask onto
policy->related_cpus and will reset policy->cpus to carry only online cpus.

driver会初始化->cpus,core会copy到->related_cpus上,比如高通平台:

static struct cpufreq_driver msm_cpufreq_driver = {
    ...
    .init        = msm_cpufreq_init,

static int msm_cpufreq_init(struct cpufreq_policy *policy)
{
    ...
    /*
     * In some SoC, some cores are clocked by same source, and their
     * frequencies can not be changed independently. Find all other
     * CPUs that share same clock, and mark them as controlled by
     * same policy.
     */
    for_each_possible_cpu(cpu)
        if (cpu_clk[cpu] == cpu_clk[policy->cpu])
            cpumask_set_cpu(cpu, policy->cpus);

所有的cpu core共用同一个clock,所以共用一个调频策略。

static int cpufreq_online(unsigned int cpu)
{
    ...
    if (new_policy) {
        /* related_cpus should at least include policy->cpus. */
        cpumask_copy(policy->related_cpus, policy->cpus);
    }

bringing CPU online时会copy。CPU offline时会把这个CPU从->cpus里clear掉。

static int cpufreq_offline(unsigned int cpu)
{
    struct cpufreq_policy *policy;
    int ret;

    pr_debug("%s: unregistering CPU %u\n", __func__, cpu);

    policy = cpufreq_cpu_get_raw(cpu);
    if (!policy) {
        pr_debug("%s: No cpu_data found\n", __func__);
        return 0;
    }

    down_write(&policy->rwsem);
    if (has_target())
        cpufreq_stop_governor(policy);

    cpumask_clear_cpu(cpu, policy->cpus); //tj: here

real_cpus的引入是为了cpufreq symlink,我们在每个cpu的节点下能看到cpufreq这个symlink:

xxx:/sys/devices/system/cpu/cpu0 # ls -l
total 0
drwxr-xr-x 6 root root    0 2009-01-03 06:40 cache
lrwxrwxrwx 1 root root    0 1970-04-05 05:56 cpufreq -> ../cpufreq/policy0
xxx:/sys/devices/system/cpu/cpufreq/policy0 # ls -l
total 0
-r--r--r-- 1 root   root   4096 2009-01-03 06:41 affected_cpus
-r-------- 1 root   root   4096 2009-01-03 06:41 cpuinfo_cur_freq
-r--r--r-- 1 root   root   4096 2019-01-30 16:43 cpuinfo_max_freq
-r--r--r-- 1 root   root   4096 2009-01-03 06:41 cpuinfo_min_freq
-r--r--r-- 1 root   root   4096 2009-01-03 06:41 cpuinfo_transition_latency
-r--r--r-- 1 root   root   4096 2009-01-03 06:41 related_cpus
-r--r--r-- 1 root   root   4096 2019-01-30 16:43 scaling_available_frequencies
-r--r--r-- 1 root   root   4096 2009-01-03 06:41 scaling_available_governors
-r--r--r-- 1 root   root   4096 2009-01-03 06:41 scaling_cur_freq
-r--r--r-- 1 root   root   4096 2009-01-03 06:41 scaling_driver
-rw-r--r-- 1 root   root   4096 2008-12-31 16:00 scaling_governor
-rw-rw-r-- 1 system system 4096 1970-04-05 05:56 scaling_max_freq
-rw-rw-r-- 1 system system 4096 2008-12-31 16:00 scaling_min_freq
-rw-r--r-- 1 root   root   4096 2009-01-03 06:41 scaling_setspeed

为什么要创建policy0?然后所有cpu的cpufreq symlink过去?原因:

commit 96bdda61f58b70431bbe8a3e49794c8210f7691b
Author: Viresh Kumar <viresh.kumar@linaro.org>
Date:   Thu Oct 15 21:35:24 2015 +0530

    cpufreq: create cpu/cpufreq/policyX directories
    
    The cpufreq sysfs interface had been a bit inconsistent as one of the
    CPUs for a policy had a real directory within its sysfs 'cpuX' directory
    and all other CPUs had links to it. That also made the code a bit
    complex as we need to take care of moving the sysfs directory if the CPU
    containing the real directory is getting physically hot-unplugged.
    
    Solve this by creating 'policyX' directories (per-policy) in
    /sys/devices/system/cpu/cpufreq/ directory, where X is the CPU for which
    the policy was first created.

ok,就是为了简化代码逻辑,在policy初始化分配时创建:

static struct cpufreq_policy *cpufreq_policy_alloc(unsigned int cpu)
{
    ...
    if (!zalloc_cpumask_var(&policy->real_cpus, GFP_KERNEL))
        goto err_free_rcpumask;

    ret = kobject_init_and_add(&policy->kobj, &ktype_cpufreq,
                   cpufreq_global_kobject, "policy%u", cpu);

real_cpus就是related_cpus中物理存在(present)的CPU(不管online还是offline),这个link仅仅存在于real_cpus下:

static void add_cpu_dev_symlink(struct cpufreq_policy *policy, unsigned int cpu)
{
    struct device *dev = get_cpu_device(cpu);

    if (!dev)
        return;

    if (cpumask_test_and_set_cpu(cpu, policy->real_cpus)) //tj: set if not
        return;

在CPU online时,给related_cpus里的每个CPU add link;此时这些CPU都是从driver init而来,都是物理存在的:

static int cpufreq_online(unsigned int cpu)
{
    ...
    if (new_policy) {
        policy->user_policy.min = policy->min;
        policy->user_policy.max = policy->max;

        for_each_cpu(j, policy->related_cpus) {
            per_cpu(cpufreq_cpu_data, j) = policy;
            add_cpu_dev_symlink(policy, j);
        }
    }

在remove这个cpu时clear这个real_cpus mask:

static void cpufreq_remove_dev(struct device *dev, struct subsys_interface *sif)
{
    unsigned int cpu = dev->id;
    struct cpufreq_policy *policy = per_cpu(cpufreq_cpu_data, cpu);

    if (!policy)
        return;

    if (cpu_online(cpu))
        cpufreq_offline(cpu);

    cpumask_clear_cpu(cpu, policy->real_cpus);
    remove_cpu_dev_symlink(policy, dev);

ok,继续看:

    unsigned int        policy; /* see above */
    struct cpufreq_governor    *governor; /* see below */

policy是留给->setpolicy driver用的,governor是给->target || ->target_index driver用的。

/*
 * If (cpufreq_driver->target) exists, the ->governor decides what frequency
 * within the limits is used. If (cpufreq_driver->setpolicy) exists, these
 * two generic policies are available:
 */
#define CPUFREQ_POLICY_POWERSAVE    (1)
#define CPUFREQ_POLICY_PERFORMANCE    (2)

解析governor string时赋值:

/**
 * cpufreq_parse_governor - parse a governor string
 */
static int cpufreq_parse_governor(char *str_governor, unsigned int *policy,
                struct cpufreq_governor **governor)
{
    int err = -EINVAL;

    if (cpufreq_driver->setpolicy) {
        if (!strncasecmp(str_governor, "performance", CPUFREQ_NAME_LEN)) {
            *policy = CPUFREQ_POLICY_PERFORMANCE;
            err = 0;
        } else if (!strncasecmp(str_governor, "powersave",
                        CPUFREQ_NAME_LEN)) {
            *policy = CPUFREQ_POLICY_POWERSAVE;
            err = 0;
        }
    } 

一是初始化policy会set:

static int cpufreq_init_policy(struct cpufreq_policy *policy)
{
    ...
    /* Use the default policy if there is no last_policy. */
    if (cpufreq_driver->setpolicy) {
        if (policy->last_policy)
            new_policy.policy = policy->last_policy;
        else
            cpufreq_parse_governor(gov->name, &new_policy.policy,
                           NULL);
    }

另一个是sysfs store_scaling_governor会call。

ok,而governor应该就是for ->target || ->target_index driver。可用的scaling governor:

xxx:/sys/devices/system/cpu/cpufreq/policy0 # cat scaling_available_governors
conservative ondemand userspace powersave performance schedutil

通过core提供的cpufreq_register_governor注册到core。4.9高通平台默认的schedutil (check it later)。

last_policy和last_governor是给热插拔用的。

static int cpufreq_offline(unsigned int cpu)
{
    struct cpufreq_policy *policy;
    int ret;

    ...
    if (policy_is_inactive(policy)) {
        if (has_target())
            strncpy(policy->last_governor, policy->governor->name,
                CPUFREQ_NAME_LEN);
        else
            policy->last_policy = policy->policy;
/*
 * policy_is_inactive - report whether a policy has no CPUs left in ->cpus.
 * @policy: policy to inspect.
 *
 * Returns true when policy->cpus is an empty mask, false otherwise.
 */
static inline bool policy_is_inactive(struct cpufreq_policy *policy)
{
    return cpumask_empty(policy->cpus);
}

/*
 * has_target - report whether the registered driver has a target callback.
 *
 * Returns true when cpufreq_driver provides ->target_index or ->target,
 * false when neither callback is set.
 */
static inline bool has_target(void)
{
    return cpufreq_driver->target_index || cpufreq_driver->target;
}

inactive policy就是这个策略的->cpus都下线喽。

Done.