// SPDX-License-Identifier: GPL-2.0
/*
 * Precise Delay Loops for i386
 *
 * Copyright (C) 1993 Linus Torvalds
 * Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
 * Copyright (C) 2008 Jiri Hladky <hladky _dot_ jiri _at_ gmail _dot_ com>
 *
 * The __delay function must _NOT_ be inlined as its execution time
 * depends wildly on alignment on many x86 processors. The additional
 * jump magic is needed to get the timing stable on all the CPUs
 * we have to worry about.
 */

#include <linux/export.h>
#include <linux/sched.h>
#include <linux/timex.h>
#include <linux/preempt.h>
#include <linux/delay.h>

#include <asm/processor.h>
#include <asm/delay.h>
#include <asm/timer.h>
#include <asm/mwait.h>

#ifdef CONFIG_SMP
# include <asm/smp.h>
#endif

static void delay_loop(u64 __loops);

/*
 * Calibration and selection of the delay mechanism happens only once
 * during boot.
 */
static void (*delay_fn)(u64) __ro_after_init = delay_loop;
static void (*delay_halt_fn)(u64 start, u64 cycles) __ro_after_init;

/* simple loop based delay: */
static void delay_loop(u64 __loops)
{
	unsigned long loops = (unsigned long)__loops;

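	/*
	 * The initial jumps land on 16-byte aligned labels so that the
	 * timing of the countdown loop does not depend on where this
	 * function body happens to be placed (see the comment at the top
	 * of this file). The loop itself just decrements the count in a
	 * register until it reaches zero.
	 */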
	asm volatile(
		"	test %0,%0	\n"
		"	jz 3f		\n"
		"	jmp 1f		\n"

		".align 16		\n"
		"1:	jmp 2f		\n"

		".align 16		\n"
		"2:	dec %0		\n"
		"	jnz 2b		\n"
		"3:	dec %0		\n"

		: "+a" (loops)
		:
	);
}

/* TSC based delay: */
static void delay_tsc(u64 cycles)
{
	u64 bclock, now;
	int cpu;

	preempt_disable();
	cpu = smp_processor_id();
	bclock = rdtsc_ordered();
	for (;;) {
		now = rdtsc_ordered();
		if ((now - bclock) >= cycles)
			break;

		/* Allow RT tasks to run */
		preempt_enable();
		rep_nop();
		preempt_disable();

		/*
		 * It is possible that we moved to another CPU, and since
		 * TSCs are per-CPU we need to account for that. The delay
		 * must guarantee that we wait "at least" the requested
		 * amount of time. Being moved to another CPU could make
		 * the wait longer, but we just need to make sure we waited
		 * long enough. Rebalance the counter for this CPU.
		 */
		if (unlikely(cpu != smp_processor_id())) {
			cycles -= (now - bclock);
			cpu = smp_processor_id();
			bclock = rdtsc_ordered();
		}
	}
	preempt_enable();
}

/*
 * On Intel the TPAUSE instruction waits until any of:
 * 1) the TSC counter exceeds the value provided in EDX:EAX
 * 2) the global timeout in IA32_UMWAIT_CONTROL is exceeded
 * 3) an external interrupt occurs
 */
static void delay_halt_tpause(u64 start, u64 cycles)
{
	u64 until = start + cycles;
	u32 eax, edx;

	eax = lower_32_bits(until);
	edx = upper_32_bits(until);

	/*
	 * Hard code the deeper (C0.2) sleep state because exit latency is
	 * small compared to the "microseconds" that usleep() will delay.
	 */
	__tpause(TPAUSE_C02_STATE, edx, eax);
}

/*
 * On some AMD platforms, MWAITX has a configurable 32-bit timer that
 * counts at TSC frequency. The input value is the number of TSC cycles
 * to wait. MWAITX will also exit when the timer expires.
 */
static void delay_halt_mwaitx(u64 unused, u64 cycles)
{
	u64 delay;

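	/*
	 * The MWAITX timer is only 32 bits wide, so clamp the request and
	 * let the retry loop in delay_halt() handle any remainder.
	 */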
	delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles);

	/*
	 * Use cpu_tss_rw as a cacheline-aligned, seldom accessed per-cpu
	 * variable as the monitor target.
	 */
	__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);

	/*
	 * AMD, like Intel, supports the EAX hint; EAX=0xf means do not
	 * enter any deep C-state. Use it here to minimize wakeup latency.
	 */
	__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
}

/*
 * Call a vendor-specific function to delay for a given amount of time.
 * Because these functions may return earlier than requested, check the
 * actual elapsed time and call again until done.
 */
static void delay_halt(u64 __cycles)
{
	u64 start, end, cycles = __cycles;

	/*
	 * A timer value of 0 causes MWAITX to wait indefinitely, unless
	 * there is a store to the memory monitored by MONITORX.
	 */
	if (!cycles)
		return;

	start = rdtsc_ordered();

	for (;;) {
		delay_halt_fn(start, cycles);
		end = rdtsc_ordered();

		if (cycles <= end - start)
			break;

		cycles -= end - start;
		start = end;
	}
}

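/*
 * Switch to the TSC based delay once the TSC is usable. Only replace the
 * default loop based delay; if a halt based method (TPAUSE/MWAITX) was
 * already installed, keep it.
 */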
void __init use_tsc_delay(void)
{
	if (delay_fn == delay_loop)
		delay_fn = delay_tsc;
}

void __init use_tpause_delay(void)
{
	delay_halt_fn = delay_halt_tpause;
	delay_fn = delay_halt;
}

void use_mwaitx_delay(void)
{
	delay_halt_fn = delay_halt_mwaitx;
	delay_fn = delay_halt;
}

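/*
 * Let the delay calibration code read the TSC directly when the TSC
 * based delay is in use; otherwise report failure so the caller falls
 * back to loop counting.
 */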
int read_current_timer(unsigned long *timer_val)
{
	if (delay_fn == delay_tsc) {
		*timer_val = rdtsc();
		return 0;
	}
	return -1;
}

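/*
 * The unit of 'loops' depends on the delay implementation selected at
 * boot: plain loop iterations for delay_loop(), TSC cycles for the TSC
 * and halt based variants. loops_per_jiffy is calibrated accordingly.
 */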
void __delay(unsigned long loops)
{
	delay_fn(loops);
}
EXPORT_SYMBOL(__delay);

noinline void __const_udelay(unsigned long xloops)
{
	unsigned long lpj = this_cpu_read(cpu_info.loops_per_jiffy) ? : loops_per_jiffy;
	int d0;

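	/*
	 * xloops is the requested delay scaled by 2^32 (e.g. usecs *
	 * 2^32/10^6 from __udelay()). The required number of delay units
	 * is xloops * loops_per_jiffy * HZ / 2^32: the 32x32->64 bit
	 * multiply below computes xloops * (lpj * HZ) and keeps only the
	 * high 32 bits of the product in %edx, which divides by 2^32.
	 * Splitting the factor as 4 * (HZ / 4) avoids overflowing the
	 * 32-bit lpj * HZ operand, and the final increment rounds up so
	 * we never delay for less than requested.
	 */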
	xloops *= 4;
	asm("mull %%edx"
		:"=d" (xloops), "=&a" (d0)
		:"1" (xloops), "0" (lpj * (HZ / 4)));

	__delay(++xloops);
}
EXPORT_SYMBOL(__const_udelay);

void __udelay(unsigned long usecs)
{
	__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
	__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);