1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0
2*4882a593Smuzhiyun
3*4882a593Smuzhiyun #include <linux/version.h>
4*4882a593Smuzhiyun #include <linux/ptrace.h>
5*4882a593Smuzhiyun #include <uapi/linux/bpf.h>
6*4882a593Smuzhiyun #include <bpf/bpf_helpers.h>
7*4882a593Smuzhiyun
8*4882a593Smuzhiyun /*
9*4882a593Smuzhiyun * The CPU number, cstate number and pstate number are based
10*4882a593Smuzhiyun * on 96boards Hikey with octa CA53 CPUs.
11*4882a593Smuzhiyun *
 * Every CPU has three idle states for cstate:
13*4882a593Smuzhiyun * WFI, CPU_OFF, CLUSTER_OFF
14*4882a593Smuzhiyun *
 * Every CPU has 5 operating points:
16*4882a593Smuzhiyun * 208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
17*4882a593Smuzhiyun *
 * This code is based on these assumptions; other platforms
 * need to adjust these definitions.
20*4882a593Smuzhiyun */
21*4882a593Smuzhiyun #define MAX_CPU 8
22*4882a593Smuzhiyun #define MAX_PSTATE_ENTRIES 5
23*4882a593Smuzhiyun #define MAX_CSTATE_ENTRIES 3
24*4882a593Smuzhiyun
/*
 * Supported operating points, lowest first.  The position of an entry
 * in this table is the pstate index used to key pstate_duration.
 */
static int cpu_opps[] = {
	208000,		/* 208MHz */
	432000,		/* 432MHz */
	729000,		/* 729MHz */
	960000,		/* 960MHz */
	1200000,	/* 1200MHz */
};
26*4882a593Smuzhiyun
27*4882a593Smuzhiyun /*
28*4882a593Smuzhiyun * my_map structure is used to record cstate and pstate index and
29*4882a593Smuzhiyun * timestamp (Idx, Ts), when new event incoming we need to update
30*4882a593Smuzhiyun * combination for new state index and timestamp (Idx`, Ts`).
31*4882a593Smuzhiyun *
32*4882a593Smuzhiyun * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
33*4882a593Smuzhiyun * interval for the previous state: Duration(Idx) = Ts` - Ts.
34*4882a593Smuzhiyun *
 * Every CPU has one array as below for recording state index and
 * timestamp; cstate and pstate are recorded separately:
37*4882a593Smuzhiyun *
38*4882a593Smuzhiyun * +--------------------------+
39*4882a593Smuzhiyun * | cstate timestamp |
40*4882a593Smuzhiyun * +--------------------------+
41*4882a593Smuzhiyun * | cstate index |
42*4882a593Smuzhiyun * +--------------------------+
43*4882a593Smuzhiyun * | pstate timestamp |
44*4882a593Smuzhiyun * +--------------------------+
45*4882a593Smuzhiyun * | pstate index |
46*4882a593Smuzhiyun * +--------------------------+
47*4882a593Smuzhiyun */
48*4882a593Smuzhiyun #define MAP_OFF_CSTATE_TIME 0
49*4882a593Smuzhiyun #define MAP_OFF_CSTATE_IDX 1
50*4882a593Smuzhiyun #define MAP_OFF_PSTATE_TIME 2
51*4882a593Smuzhiyun #define MAP_OFF_PSTATE_IDX 3
52*4882a593Smuzhiyun #define MAP_OFF_NUM 4
53*4882a593Smuzhiyun
54*4882a593Smuzhiyun struct {
55*4882a593Smuzhiyun __uint(type, BPF_MAP_TYPE_ARRAY);
56*4882a593Smuzhiyun __type(key, u32);
57*4882a593Smuzhiyun __type(value, u64);
58*4882a593Smuzhiyun __uint(max_entries, MAX_CPU * MAP_OFF_NUM);
59*4882a593Smuzhiyun } my_map SEC(".maps");
60*4882a593Smuzhiyun
61*4882a593Smuzhiyun /* cstate_duration records duration time for every idle state per CPU */
62*4882a593Smuzhiyun struct {
63*4882a593Smuzhiyun __uint(type, BPF_MAP_TYPE_ARRAY);
64*4882a593Smuzhiyun __type(key, u32);
65*4882a593Smuzhiyun __type(value, u64);
66*4882a593Smuzhiyun __uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES);
67*4882a593Smuzhiyun } cstate_duration SEC(".maps");
68*4882a593Smuzhiyun
69*4882a593Smuzhiyun /* pstate_duration records duration time for every operating point per CPU */
70*4882a593Smuzhiyun struct {
71*4882a593Smuzhiyun __uint(type, BPF_MAP_TYPE_ARRAY);
72*4882a593Smuzhiyun __type(key, u32);
73*4882a593Smuzhiyun __type(value, u64);
74*4882a593Smuzhiyun __uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES);
75*4882a593Smuzhiyun } pstate_duration SEC(".maps");
76*4882a593Smuzhiyun
77*4882a593Smuzhiyun /*
78*4882a593Smuzhiyun * The trace events for cpu_idle and cpu_frequency are taken from:
79*4882a593Smuzhiyun * /sys/kernel/debug/tracing/events/power/cpu_idle/format
80*4882a593Smuzhiyun * /sys/kernel/debug/tracing/events/power/cpu_frequency/format
81*4882a593Smuzhiyun *
82*4882a593Smuzhiyun * These two events have same format, so define one common structure.
83*4882a593Smuzhiyun */
84*4882a593Smuzhiyun struct cpu_args {
85*4882a593Smuzhiyun u64 pad;
86*4882a593Smuzhiyun u32 state;
87*4882a593Smuzhiyun u32 cpu_id;
88*4882a593Smuzhiyun };
89*4882a593Smuzhiyun
90*4882a593Smuzhiyun /* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */
find_cpu_pstate_idx(u32 frequency)91*4882a593Smuzhiyun static u32 find_cpu_pstate_idx(u32 frequency)
92*4882a593Smuzhiyun {
93*4882a593Smuzhiyun u32 i;
94*4882a593Smuzhiyun
95*4882a593Smuzhiyun for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) {
96*4882a593Smuzhiyun if (frequency == cpu_opps[i])
97*4882a593Smuzhiyun return i;
98*4882a593Smuzhiyun }
99*4882a593Smuzhiyun
100*4882a593Smuzhiyun return i;
101*4882a593Smuzhiyun }
102*4882a593Smuzhiyun
103*4882a593Smuzhiyun SEC("tracepoint/power/cpu_idle")
bpf_prog1(struct cpu_args * ctx)104*4882a593Smuzhiyun int bpf_prog1(struct cpu_args *ctx)
105*4882a593Smuzhiyun {
106*4882a593Smuzhiyun u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta;
107*4882a593Smuzhiyun u32 key, cpu, pstate_idx;
108*4882a593Smuzhiyun u64 *val;
109*4882a593Smuzhiyun
110*4882a593Smuzhiyun if (ctx->cpu_id > MAX_CPU)
111*4882a593Smuzhiyun return 0;
112*4882a593Smuzhiyun
113*4882a593Smuzhiyun cpu = ctx->cpu_id;
114*4882a593Smuzhiyun
115*4882a593Smuzhiyun key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME;
116*4882a593Smuzhiyun cts = bpf_map_lookup_elem(&my_map, &key);
117*4882a593Smuzhiyun if (!cts)
118*4882a593Smuzhiyun return 0;
119*4882a593Smuzhiyun
120*4882a593Smuzhiyun key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
121*4882a593Smuzhiyun cstate = bpf_map_lookup_elem(&my_map, &key);
122*4882a593Smuzhiyun if (!cstate)
123*4882a593Smuzhiyun return 0;
124*4882a593Smuzhiyun
125*4882a593Smuzhiyun key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
126*4882a593Smuzhiyun pts = bpf_map_lookup_elem(&my_map, &key);
127*4882a593Smuzhiyun if (!pts)
128*4882a593Smuzhiyun return 0;
129*4882a593Smuzhiyun
130*4882a593Smuzhiyun key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
131*4882a593Smuzhiyun pstate = bpf_map_lookup_elem(&my_map, &key);
132*4882a593Smuzhiyun if (!pstate)
133*4882a593Smuzhiyun return 0;
134*4882a593Smuzhiyun
135*4882a593Smuzhiyun prev_state = *cstate;
136*4882a593Smuzhiyun *cstate = ctx->state;
137*4882a593Smuzhiyun
138*4882a593Smuzhiyun if (!*cts) {
139*4882a593Smuzhiyun *cts = bpf_ktime_get_ns();
140*4882a593Smuzhiyun return 0;
141*4882a593Smuzhiyun }
142*4882a593Smuzhiyun
143*4882a593Smuzhiyun cur_ts = bpf_ktime_get_ns();
144*4882a593Smuzhiyun delta = cur_ts - *cts;
145*4882a593Smuzhiyun *cts = cur_ts;
146*4882a593Smuzhiyun
147*4882a593Smuzhiyun /*
148*4882a593Smuzhiyun * When state doesn't equal to (u32)-1, the cpu will enter
149*4882a593Smuzhiyun * one idle state; for this case we need to record interval
150*4882a593Smuzhiyun * for the pstate.
151*4882a593Smuzhiyun *
152*4882a593Smuzhiyun * OPP2
153*4882a593Smuzhiyun * +---------------------+
154*4882a593Smuzhiyun * OPP1 | |
155*4882a593Smuzhiyun * ---------+ |
156*4882a593Smuzhiyun * | Idle state
157*4882a593Smuzhiyun * +---------------
158*4882a593Smuzhiyun *
159*4882a593Smuzhiyun * |<- pstate duration ->|
160*4882a593Smuzhiyun * ^ ^
161*4882a593Smuzhiyun * pts cur_ts
162*4882a593Smuzhiyun */
163*4882a593Smuzhiyun if (ctx->state != (u32)-1) {
164*4882a593Smuzhiyun
165*4882a593Smuzhiyun /* record pstate after have first cpu_frequency event */
166*4882a593Smuzhiyun if (!*pts)
167*4882a593Smuzhiyun return 0;
168*4882a593Smuzhiyun
169*4882a593Smuzhiyun delta = cur_ts - *pts;
170*4882a593Smuzhiyun
171*4882a593Smuzhiyun pstate_idx = find_cpu_pstate_idx(*pstate);
172*4882a593Smuzhiyun if (pstate_idx >= MAX_PSTATE_ENTRIES)
173*4882a593Smuzhiyun return 0;
174*4882a593Smuzhiyun
175*4882a593Smuzhiyun key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
176*4882a593Smuzhiyun val = bpf_map_lookup_elem(&pstate_duration, &key);
177*4882a593Smuzhiyun if (val)
178*4882a593Smuzhiyun __sync_fetch_and_add((long *)val, delta);
179*4882a593Smuzhiyun
180*4882a593Smuzhiyun /*
181*4882a593Smuzhiyun * When state equal to (u32)-1, the cpu just exits from one
182*4882a593Smuzhiyun * specific idle state; for this case we need to record
183*4882a593Smuzhiyun * interval for the pstate.
184*4882a593Smuzhiyun *
185*4882a593Smuzhiyun * OPP2
186*4882a593Smuzhiyun * -----------+
187*4882a593Smuzhiyun * | OPP1
188*4882a593Smuzhiyun * | +-----------
189*4882a593Smuzhiyun * | Idle state |
190*4882a593Smuzhiyun * +---------------------+
191*4882a593Smuzhiyun *
192*4882a593Smuzhiyun * |<- cstate duration ->|
193*4882a593Smuzhiyun * ^ ^
194*4882a593Smuzhiyun * cts cur_ts
195*4882a593Smuzhiyun */
196*4882a593Smuzhiyun } else {
197*4882a593Smuzhiyun
198*4882a593Smuzhiyun key = cpu * MAX_CSTATE_ENTRIES + prev_state;
199*4882a593Smuzhiyun val = bpf_map_lookup_elem(&cstate_duration, &key);
200*4882a593Smuzhiyun if (val)
201*4882a593Smuzhiyun __sync_fetch_and_add((long *)val, delta);
202*4882a593Smuzhiyun }
203*4882a593Smuzhiyun
204*4882a593Smuzhiyun /* Update timestamp for pstate as new start time */
205*4882a593Smuzhiyun if (*pts)
206*4882a593Smuzhiyun *pts = cur_ts;
207*4882a593Smuzhiyun
208*4882a593Smuzhiyun return 0;
209*4882a593Smuzhiyun }
210*4882a593Smuzhiyun
211*4882a593Smuzhiyun SEC("tracepoint/power/cpu_frequency")
bpf_prog2(struct cpu_args * ctx)212*4882a593Smuzhiyun int bpf_prog2(struct cpu_args *ctx)
213*4882a593Smuzhiyun {
214*4882a593Smuzhiyun u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta;
215*4882a593Smuzhiyun u32 key, cpu, pstate_idx;
216*4882a593Smuzhiyun u64 *val;
217*4882a593Smuzhiyun
218*4882a593Smuzhiyun cpu = ctx->cpu_id;
219*4882a593Smuzhiyun
220*4882a593Smuzhiyun key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME;
221*4882a593Smuzhiyun pts = bpf_map_lookup_elem(&my_map, &key);
222*4882a593Smuzhiyun if (!pts)
223*4882a593Smuzhiyun return 0;
224*4882a593Smuzhiyun
225*4882a593Smuzhiyun key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX;
226*4882a593Smuzhiyun pstate = bpf_map_lookup_elem(&my_map, &key);
227*4882a593Smuzhiyun if (!pstate)
228*4882a593Smuzhiyun return 0;
229*4882a593Smuzhiyun
230*4882a593Smuzhiyun key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX;
231*4882a593Smuzhiyun cstate = bpf_map_lookup_elem(&my_map, &key);
232*4882a593Smuzhiyun if (!cstate)
233*4882a593Smuzhiyun return 0;
234*4882a593Smuzhiyun
235*4882a593Smuzhiyun prev_state = *pstate;
236*4882a593Smuzhiyun *pstate = ctx->state;
237*4882a593Smuzhiyun
238*4882a593Smuzhiyun if (!*pts) {
239*4882a593Smuzhiyun *pts = bpf_ktime_get_ns();
240*4882a593Smuzhiyun return 0;
241*4882a593Smuzhiyun }
242*4882a593Smuzhiyun
243*4882a593Smuzhiyun cur_ts = bpf_ktime_get_ns();
244*4882a593Smuzhiyun delta = cur_ts - *pts;
245*4882a593Smuzhiyun *pts = cur_ts;
246*4882a593Smuzhiyun
247*4882a593Smuzhiyun /* When CPU is in idle, bail out to skip pstate statistics */
248*4882a593Smuzhiyun if (*cstate != (u32)(-1))
249*4882a593Smuzhiyun return 0;
250*4882a593Smuzhiyun
251*4882a593Smuzhiyun /*
252*4882a593Smuzhiyun * The cpu changes to another different OPP (in below diagram
253*4882a593Smuzhiyun * change frequency from OPP3 to OPP1), need recording interval
254*4882a593Smuzhiyun * for previous frequency OPP3 and update timestamp as start
255*4882a593Smuzhiyun * time for new frequency OPP1.
256*4882a593Smuzhiyun *
257*4882a593Smuzhiyun * OPP3
258*4882a593Smuzhiyun * +---------------------+
259*4882a593Smuzhiyun * OPP2 | |
260*4882a593Smuzhiyun * ---------+ |
261*4882a593Smuzhiyun * | OPP1
262*4882a593Smuzhiyun * +---------------
263*4882a593Smuzhiyun *
264*4882a593Smuzhiyun * |<- pstate duration ->|
265*4882a593Smuzhiyun * ^ ^
266*4882a593Smuzhiyun * pts cur_ts
267*4882a593Smuzhiyun */
268*4882a593Smuzhiyun pstate_idx = find_cpu_pstate_idx(*pstate);
269*4882a593Smuzhiyun if (pstate_idx >= MAX_PSTATE_ENTRIES)
270*4882a593Smuzhiyun return 0;
271*4882a593Smuzhiyun
272*4882a593Smuzhiyun key = cpu * MAX_PSTATE_ENTRIES + pstate_idx;
273*4882a593Smuzhiyun val = bpf_map_lookup_elem(&pstate_duration, &key);
274*4882a593Smuzhiyun if (val)
275*4882a593Smuzhiyun __sync_fetch_and_add((long *)val, delta);
276*4882a593Smuzhiyun
277*4882a593Smuzhiyun return 0;
278*4882a593Smuzhiyun }
279*4882a593Smuzhiyun
280*4882a593Smuzhiyun char _license[] SEC("license") = "GPL";
281*4882a593Smuzhiyun u32 _version SEC("version") = LINUX_VERSION_CODE;
282