// SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
// Copyright (c) 2022 Google
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>
/* task->flags for off-cpu analysis */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
/* task->state for off-cpu analysis */
#define TASK_INTERRUPTIBLE 0x0001
#define TASK_UNINTERRUPTIBLE 0x0002
/* create a new thread */
#define CLONE_THREAD 0x10000
#define MAX_STACKS 32
#define MAX_ENTRIES 102400
struct tstamp_data {
__u32 stack_id;
__u32 state;
__u64 timestamp;
};
struct offcpu_key {
__u32 pid;
__u32 tgid;
__u32 stack_id;
__u32 state;
__u64 cgroup_id;
};
struct {
__uint(type, BPF_MAP_TYPE_STACK_TRACE);
__uint(key_size, sizeof(__u32));
__uint(value_size, MAX_STACKS * sizeof(__u64));
__uint(max_entries, MAX_ENTRIES);
} stacks SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
__uint(map_flags, BPF_F_NO_PREALLOC);
__type(key, int);
__type(value, struct tstamp_data);
} tstamp SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(key_size, sizeof(struct offcpu_key));
__uint(value_size, sizeof(__u64));
__uint(max_entries, MAX_ENTRIES);
} off_cpu SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(__u8));
__uint(max_entries, 1);
} cpu_filter SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(key_size, sizeof(__u32));
__uint(value_size, sizeof(__u8));
__uint(max_entries, 1);
} task_filter SEC(".maps");
struct {
__uint(type, BPF_MAP_TYPE_HASH);
__uint(key_size, sizeof(__u64));
__uint(value_size, sizeof(__u8));
__uint(max_entries, 1);
} cgroup_filter SEC(".maps");
/* new kernel task_struct definition */
struct task_struct___new {
long __state;
} __attribute__((preserve_access_index));
/* old kernel task_struct definition */
struct task_struct___old {
long state;
} __attribute__((preserve_access_index));
int enabled = 0;
const volatile int has_cpu = 0;
const volatile int has_task = 0;
const volatile int has_cgroup = 0;
const volatile int uses_tgid = 0;
const volatile bool has_prev_state = false;
const volatile bool needs_cgroup = false;
const volatile bool uses_cgroup_v1 = false;
int perf_subsys_id = -1;
/*
* Old kernel used to call it task_struct->state and now it's '__state'.
* Use BPF CO-RE "ignored suffix rule" to deal with it like below:
*
* https://nakryiko.com/posts/bpf-core-reference-guide/#handling-incompatible-field-and-type-changes
*/
static inline int get_task_state(struct task_struct *t)
{
/* recast pointer to capture new type for compiler */
struct task_struct___new *t_new = (void *)t;
if (bpf_core_field_exists(t_new->__state)) {
return BPF_CORE_READ(t_new, __state);
} else {
/* recast pointer to capture old type for compiler */
struct task_struct___old *t_old = (void *)t;
return BPF_CORE_READ(t_old, state);
}
}
static inline __u64 get_cgroup_id(struct task_struct *t)
{
struct cgroup *cgrp;
if (!uses_cgroup_v1)
return BPF_CORE_READ(t, cgroups, dfl_cgrp, kn, id);
if (perf_subsys_id == -1) {
#if __has_builtin(__builtin_preserve_enum_value)
perf_subsys_id = bpf_core_enum_value(enum cgroup_subsys_id,
perf_event_cgrp_id);
#else
perf_subsys_id = perf_event_cgrp_id;
#endif
}
cgrp = BPF_CORE_READ(t, cgroups, subsys[perf_subsys_id], cgroup);
return BPF_CORE_READ(cgrp, kn, id);
}
static inline int can_record(struct task_struct *t, int state)
{
/* kernel threads don't have user stack */
if (t->flags & PF_KTHREAD)
return 0;
if (state != TASK_INTERRUPTIBLE &&
state != TASK_UNINTERRUPTIBLE)
return 0;
if (has_cpu) {
__u32 cpu = bpf_get_smp_processor_id();
__u8 *ok;
ok = bpf_map_lookup_elem(&cpu_filter, &cpu);
if (!ok)
return 0;
}
if (has_task) {
__u8 *ok;
__u32 pid;
if (uses_tgid)
pid = t->tgid;
else
pid = t->pid;
ok = bpf_map_lookup_elem(&task_filter, &pid);
if (!ok)
return 0;
}
if (has_cgroup) {
__u8 *ok;
__u64 cgrp_id = get_cgroup_id(t);
ok = bpf_map_lookup_elem(&cgroup_filter, &cgrp_id);
if (!ok)
return 0;
}
return 1;
}
static int off_cpu_stat(u64 *ctx, struct task_struct *prev,
struct task_struct *next, int state)
{
__u64 ts;
__u32 stack_id;
struct tstamp_data *pelem;
ts = bpf_ktime_get_ns();
if (!can_record(prev, state))
goto next;
stack_id = bpf_get_stackid(ctx, &stacks,
BPF_F_FAST_STACK_CMP | BPF_F_USER_STACK);
pelem = bpf_task_storage_get(&tstamp, prev, NULL,
BPF_LOCAL_STORAGE_GET_F_CREATE);
if (!pelem)
goto next;
pelem->timestamp = ts;
pelem->state = state;
pelem->stack_id = stack_id;
next:
pelem = bpf_task_storage_get(&tstamp, next, NULL, 0);
if (pelem && pelem->timestamp) {
struct offcpu_key key = {
.pid = next->pid,
.tgid = next->tgid,
.stack_id = pelem->stack_id,
.state = pelem->state,
.cgroup_id = needs_cgroup ? get_cgroup_id(next) : 0,
};
__u64 delta = ts - pelem->timestamp;
__u64 *total;
total = bpf_map_lookup_elem(&off_cpu, &key);
if (total)
*total += delta;
else
bpf_map_update_elem(&off_cpu, &key, &delta, BPF_ANY);
/* prevent to reuse the timestamp later */
pelem->timestamp = 0;
}
return 0;
}
SEC("tp_btf/task_newtask")
int on_newtask(u64 *ctx)
{
struct task_struct *task;
u64 clone_flags;
u32 pid;
u8 val = 1;
if (!uses_tgid)
return 0;
task = (struct task_struct *)bpf_get_current_task();
pid = BPF_CORE_READ(task, tgid);
if (!bpf_map_lookup_elem(&task_filter, &pid))
return 0;
task = (struct task_struct *)ctx[0];
clone_flags = ctx[1];
pid = task->tgid;
if (!(clone_flags & CLONE_THREAD))
bpf_map_update_elem(&task_filter, &pid, &val, BPF_NOEXIST);
return 0;
}
SEC("tp_btf/sched_switch")
int on_switch(u64 *ctx)
{
struct task_struct *prev, *next;
int prev_state;
if (!enabled)
return 0;
prev = (struct task_struct *)ctx[1];
next = (struct task_struct *)ctx[2];
if (has_prev_state)
prev_state = (int)ctx[3];
else
prev_state = get_task_state(prev);
return off_cpu_stat(ctx, prev, next, prev_state & 0xff);
}
char LICENSE[] SEC("license") = "Dual BSD/GPL";