如何使 perf sched 支持 workqueue name

问题

workqueue name 就是 alloc_workqueue 的第一个参数：

/**
 * alloc_workqueue - allocate a workqueue
 * @fmt: printf format for the name of the workqueue
[...]
#define alloc_workqueue(fmt, flags, max_active, args...)                \

前文已经说过如何使用 perf sched 来统计 task 调度延迟，但是如下图 workqueue name 被剪掉了：

-----------------------------------------------------------------------------------------------------------------
 Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |
-----------------------------------------------------------------------------------------------------------------
 kworker/4:2-eve:404   |     88.386 ms |      207 | avg:    0.248 ms | max:   12.380 ms | max at:    271.002611 s
 kworker/u16:18-:1499  |     36.641 ms |      681 | avg:    0.081 ms | max:    4.086 ms | max at:    269.173878 s
 kworker/u16:17-:1498  |     21.918 ms |      511 | avg:    0.080 ms | max:    2.706 ms | max at:    272.811798 s
 kworker/u16:19-:1500  |     41.742 ms |      713 | avg:    0.077 ms | max:    4.098 ms | max at:    272.342235 s

那如何显示完整name了？我们直接看 perf 源码(kernel 4.19)。

perf 源码分析

打印入口：

static int perf_sched__lat(struct perf_sched *sched)
{
        struct rb_node *next;

        setup_pager();

        if (perf_sched__read_events(sched))
                return -1;

        perf_sched__merge_lat(sched);
        perf_sched__sort_lat(sched);

        printf("\n -----------------------------------------------------------------------------------------------------------------\n");
        printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms | Maximum delay at       |\n");
        printf(" -----------------------------------------------------------------------------------------------------------------\n");

        next = rb_first(&sched->sorted_atom_root);

        while (next) {
                struct work_atoms *work_list;

                work_list = rb_entry(next, struct work_atoms, node);
                output_lat_thread(sched, work_list);
                next = rb_next(next);
                thread__zput(work_list->thread);
        }

应该是 output_lat_thread():

static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
{
        int i;
        int ret;
        u64 avg;
        char max_lat_at[32];

        if (!work_list->nb_atoms)
                return;
        /*
         * Ignore idle threads:
         */
        if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
                return;

        sched->all_runtime += work_list->total_runtime;
        sched->all_count   += work_list->nb_atoms;

        if (work_list->num_merged > 1)
                ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
        else
                ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);

        for (i = 0; i < 24 - ret; i++)

所以，那个 workqueue name 应该是来自 thread__comm_str(work_list->thread)。

const char *thread__comm_str(const struct thread *thread)
{
        const char *str;

        down_read((struct rw_semaphore *)&thread->comm_lock);
        str = __thread__comm_str(thread);
        up_read((struct rw_semaphore *)&thread->comm_lock);

        return str;
}

static const char *__thread__comm_str(const struct thread *thread)
{
        const struct comm *comm = thread__comm(thread);

        if (!comm)
                return NULL;

        return comm__str(comm);
}

struct comm *thread__comm(const struct thread *thread)
{
        if (list_empty(&thread->comm_list))
                return NULL;

        return list_first_entry(&thread->comm_list, struct comm, list);
}

啊，原来是thread里有个链表 comm_list，从这个表里取出comm -> comm__str(comm):

const char *comm__str(const struct comm *comm)
{
        return comm->comm_str->str;
}

大概看下 struct comm:

struct comm_str;

struct comm {
        struct comm_str *comm_str;
        u64 start;
        struct list_head list;
        bool exec;
        union { /* Tool specific area */
                void    *priv;
                u64     db_id;
        };
};

util/comm.c里：

struct comm_str {
        char *str;
        struct rb_node rb_node;
        refcount_t refcnt;
};

能看到 str 是一个 char * 的指针，那啥时候分配的？分配里不是有大小吗？这个comm.c文件里就有分配实现：

static struct comm_str *comm_str__alloc(const char *str)
{
        struct comm_str *cs;

        cs = zalloc(sizeof(*cs));
        if (!cs)
                return NULL;

        cs->str = strdup(str); //tj: here
        if (!cs->str) {
                free(cs);
                return NULL;
        }

        refcount_set(&cs->refcnt, 1);

        return cs;
}

是按照入参str的长度来分配的，继续跟。

有两个caller: comm__new 和 comm__override，看下 comm__new:

struct thread *thread__new(pid_t pid, pid_t tid)
{
        char *comm_str;
        struct comm *comm;
        struct thread *thread = zalloc(sizeof(*thread));

        if (thread != NULL) {
                thread->pid_ = pid;
                thread->tid = tid;
                thread->ppid = -1;
                thread->cpu = -1;
                INIT_LIST_HEAD(&thread->namespaces_list);
                INIT_LIST_HEAD(&thread->comm_list);
                init_rwsem(&thread->namespaces_lock);
                init_rwsem(&thread->comm_lock);

                comm_str = malloc(32);
                if (!comm_str)
                        goto err_thread;

                snprintf(comm_str, 32, ":%d", tid); //tj: here
                comm = comm__new(comm_str, 0, false);
                free(comm_str);
                if (!comm)
                        goto err_thread;

                list_add(&comm->list, &thread->comm_list);

分配了32个字节，应该够了，那实际用了多少？也就是这个 str 是如何被填入的，那就是 sched record喽。回头再看下相关code：

struct work_atom {
        struct list_head        list;
        enum thread_state       state;
        u64                     sched_out_time;
        u64                     wake_up_time;
        u64                     sched_in_time;
        u64                     runtime;
};

struct work_atoms {
        struct list_head        work_list;
        struct thread           *thread;
        struct rb_node          node;
        u64                     max_lat;
        u64                     max_lat_at;
        u64                     total_lat;
        u64                     nb_atoms;
        u64                     total_runtime;
        int                     num_merged;
};

每一个 struct work_atom 定义的work都有相应的行为成员，work_atom的list用来关联到work_atoms。

list_add_tail(&atom->list, &atoms->work_list);

看下work_atoms里的thread，这个 thread 里有个 comm_list，除了上文提到的thread_new()会add 一个comm到这个list以外，还有：

static int ____thread__set_comm(struct thread *thread, const char *str,
                                u64 timestamp, bool exec)
{
        struct comm *new, *curr = thread__comm(thread);

        /* Override the default :tid entry */
        if (!thread->comm_set) {
                int err = comm__override(curr, str, timestamp, exec);
                if (err)
                        return err;
        } else {
                new = comm__new(str, timestamp, exec);
                if (!new)
                        return -ENOMEM;
                list_add(&new->list, &thread->comm_list); //tj: here

                if (exec)
                        unwind__flush_access(thread);
        }

        thread->comm_set = true;

        return 0;
}

if comm_set过了，就new一个comm。read这个set code的分支有点多，我们借用dump_stack()看下运行时：

+++ b/tools/perf/util/thread.c
@@ -218,6 +218,8 @@ static int ____thread__set_comm(struct thread *thread, const char *str,
 {
        struct comm *new, *curr = thread__comm(thread);

+       dump_stack();

能追到：

cmd_record > perf_session__process_events -> perf_session__deliver_event ->machines__deliver_event-> machine__process_comm_event

前面几个都ok，中间有个machines__deliver_event是怎么调到machine__process_comm_event的呢？来看下。

static int machines__deliver_event(struct machines *machines,
                                   struct perf_evlist *evlist,
                                   union perf_event *event,
                                   struct perf_sample *sample,
                                   struct perf_tool *tool, u64 file_offset)
{
[...]
        switch (event->header.type) {
[...]
        case PERF_RECORD_COMM:
                return tool->comm(tool, event, sample, machine);

应该是这里，用了一个callback，看下builtin-sched.c/cmd sched:

int cmd_sched(int argc, const char **argv)
{
        const char default_sort_order[] = "avg, max, switch, runtime";
        struct perf_sched sched = {
                .tool = {
                        .sample          = perf_sched__process_tracepoint_sample,
                        .comm            = perf_sched__process_comm, //tj: here

see? 这里有个tool的comm (perf_sched__process_comm)，直觉告诉我就是它:)

static int perf_sched__process_comm(struct perf_tool *tool __maybe_unused,
                                    union perf_event *event,
                                    struct perf_sample *sample,
                                    struct machine *machine)
{
        struct thread *thread;
        struct thread_runtime *tr;
        int err;

        err = perf_event__process_comm(tool, event, sample, machine);
        if (err)

int perf_event__process_comm(struct perf_tool *tool __maybe_unused,
                             union perf_event *event,
                             struct perf_sample *sample,
                             struct machine *machine)
{
        return machine__process_comm_event(machine, event, sample);
}

rt? 到这里整个call stack就清楚了。现在应该知道怎么改了，如果还不知道，check below: