xref: /OK3568_Linux_fs/kernel/arch/x86/lib/delay.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
// SPDX-License-Identifier: GPL-2.0
/*
 *	Precise Delay Loops for i386
 *
 *	Copyright (C) 1993 Linus Torvalds
 *	Copyright (C) 1997 Martin Mares <mj@atrey.karlin.mff.cuni.cz>
 *	Copyright (C) 2008 Jiri Hladky <hladky _dot_ jiri _at_ gmail _dot_ com>
 *
 *	The __delay function must _NOT_ be inlined as its execution time
 *	depends wildly on alignment on many x86 processors. The additional
 *	jump magic is needed to get the timing stable on all the CPUs
 *	we have to worry about.
 */

#include <linux/export.h>
#include <linux/sched.h>
#include <linux/timex.h>
#include <linux/preempt.h>
#include <linux/delay.h>

#include <asm/processor.h>
#include <asm/delay.h>
#include <asm/timer.h>
#include <asm/mwait.h>

#ifdef CONFIG_SMP
# include <asm/smp.h>
#endif

static void delay_loop(u64 __loops);

/*
 * Calibration and selection of the delay mechanism happens only once
 * during boot.
 */
static void (*delay_fn)(u64) __ro_after_init = delay_loop;
static void (*delay_halt_fn)(u64 start, u64 cycles) __ro_after_init;

/* simple loop based delay: */
static void delay_loop(u64 __loops)
{
	unsigned long loops = (unsigned long)__loops;

	asm volatile(
		"	test %0,%0	\n"
		"	jz 3f		\n"
		"	jmp 1f		\n"

		".align 16		\n"
		"1:	jmp 2f		\n"

		".align 16		\n"
		"2:	dec %0		\n"
		"	jnz 2b		\n"
		"3:	dec %0		\n"

		: "+a" (loops)
		:
	);
}
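
/*
 * The count handed to delay_loop() is in calibrated loop units, not time:
 * __const_udelay() below derives it from loops_per_jiffy, so one pass
 * through the dec/jnz pair above corresponds to one calibrated loop.
 */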

/* TSC based delay: */
static void delay_tsc(u64 cycles)
{
	u64 bclock, now;
	int cpu;

	preempt_disable();
	cpu = smp_processor_id();
	bclock = rdtsc_ordered();
	for (;;) {
		now = rdtsc_ordered();
		if ((now - bclock) >= cycles)
			break;

		/* Allow RT tasks to run */
		preempt_enable();
		rep_nop();
		preempt_disable();

		/*
		 * It is possible that we moved to another CPU, and
		 * since TSCs are per-CPU we need to account for
		 * that. The delay must guarantee that we wait "at
		 * least" the amount of time. Being moved to another
		 * CPU could make the wait longer but we just need to
		 * make sure we waited long enough. Rebalance the
		 * counter for this CPU.
		 */
		if (unlikely(cpu != smp_processor_id())) {
			cycles -= (now - bclock);
			cpu = smp_processor_id();
			bclock = rdtsc_ordered();
		}
	}
	preempt_enable();
}
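
/*
 * delay_tsc() uses rdtsc_ordered() rather than a plain TSC read so the
 * timestamps cannot be reordered relative to the surrounding code, which
 * keeps the measured interval meaningful.
 */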

/*
 * On Intel the TPAUSE instruction waits until any of:
 * 1) the TSC counter exceeds the value provided in EDX:EAX
 * 2) global timeout in IA32_UMWAIT_CONTROL is exceeded
 * 3) an external interrupt occurs
 */
static void delay_halt_tpause(u64 start, u64 cycles)
{
	u64 until = start + cycles;
	u32 eax, edx;

	eax = lower_32_bits(until);
	edx = upper_32_bits(until);

	/*
	 * Hard code the deeper (C0.2) sleep state because exit latency is
	 * small compared to the "microseconds" that usleep() will delay.
	 */
	__tpause(TPAUSE_C02_STATE, edx, eax);
}
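
/*
 * The value in EDX:EAX is an absolute TSC deadline (start + cycles), and
 * TPAUSE may return early, e.g. on an interrupt or when the global
 * IA32_UMWAIT_CONTROL timeout hits; delay_halt() below re-checks the
 * elapsed time and issues another TPAUSE for any remainder.
 */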

/*
 * On some AMD platforms, MWAITX has a configurable 32-bit timer that
 * counts with TSC frequency. The input value is the number of TSC cycles
 * to wait. MWAITX will also exit when the timer expires.
 */
static void delay_halt_mwaitx(u64 unused, u64 cycles)
{
	u64 delay;

	delay = min_t(u64, MWAITX_MAX_WAIT_CYCLES, cycles);
	/*
	 * Use cpu_tss_rw as a cacheline-aligned, seldom-accessed per-CPU
	 * variable as the monitor target.
	 */
	__monitorx(raw_cpu_ptr(&cpu_tss_rw), 0, 0);

	/*
	 * AMD, like Intel, supports the EAX hint, and EAX=0xf means: do not
	 * enter any deep C-state. We use it here in delay() to minimize
	 * wakeup latency.
	 */
	__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
}
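
/*
 * The wait is clamped to MWAITX_MAX_WAIT_CYCLES per call since the MWAITX
 * timer is only 32 bits wide; longer delays rely on delay_halt() calling
 * back in here for the remainder.
 */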

/*
 * Call a vendor-specific function to delay for a given amount of time. Because
 * these functions may return earlier than requested, check for actual elapsed
 * time and call again until done.
 */
static void delay_halt(u64 __cycles)
{
	u64 start, end, cycles = __cycles;

	/*
	 * Timer value of 0 causes MWAITX to wait indefinitely, unless there
	 * is a store on the memory monitored by MONITORX.
	 */
	if (!cycles)
		return;

	start = rdtsc_ordered();

	for (;;) {
		delay_halt_fn(start, cycles);
		end = rdtsc_ordered();

		if (cycles <= end - start)
			break;

		cycles -= end - start;
		start = end;
	}
}
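
/*
 * Illustrative numbers: if 1000 cycles are requested and delay_halt_fn()
 * returns after only 600 have elapsed, the loop above subtracts those 600,
 * advances start to the latest TSC read and waits again for the missing 400.
 */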

void __init use_tsc_delay(void)
{
	if (delay_fn == delay_loop)
		delay_fn = delay_tsc;
}

void __init use_tpause_delay(void)
{
	delay_halt_fn = delay_halt_tpause;
	delay_fn = delay_halt;
}

void use_mwaitx_delay(void)
{
	delay_halt_fn = delay_halt_mwaitx;
	delay_fn = delay_halt;
}
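
/*
 * Selection summary: delay_loop is the boot default, use_tsc_delay()
 * upgrades it to delay_tsc (the delay_fn == delay_loop check avoids
 * overriding an already installed halt-based method), and
 * use_tpause_delay()/use_mwaitx_delay() switch to delay_halt with the
 * matching vendor hook.
 */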

int read_current_timer(unsigned long *timer_val)
{
	if (delay_fn == delay_tsc) {
		*timer_val = rdtsc();
		return 0;
	}
	return -1;
}
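
/*
 * read_current_timer() only reports a value while the TSC-based delay is
 * in use; the generic boot-time delay calibration can then, presumably,
 * read the timer directly instead of timing a software loop.
 */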

void __delay(unsigned long loops)
{
	delay_fn(loops);
}
EXPORT_SYMBOL(__delay);

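/*
 * A rough sketch of the fixed-point math in __const_udelay() below
 * (illustrative numbers, not taken from this file):
 *
 *   xloops arrives pre-scaled by 2^32 (see __udelay()/__ndelay()), so
 *   loops ~= (xloops * 4) * (lpj * HZ / 4) / 2^32
 *         ~= requested_seconds * lpj * HZ
 *
 * e.g. udelay(100) with lpj == 4,000,000 and HZ == 250 gives about
 * 100 * 4,000,000 * 250 / 1,000,000 = 100,000 loops. The *4 / /4 split
 * appears to keep lpj * (HZ / 4) within 32 bits for the mull, and the
 * ++xloops rounds up so the delay never comes up short.
 */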
noinline void __const_udelay(unsigned long xloops)
{
	unsigned long lpj = this_cpu_read(cpu_info.loops_per_jiffy) ? : loops_per_jiffy;
	int d0;

	xloops *= 4;
	asm("mull %%edx"
		:"=d" (xloops), "=&a" (d0)
		:"1" (xloops), "0" (lpj * (HZ / 4)));

	__delay(++xloops);
}
EXPORT_SYMBOL(__const_udelay);

void __udelay(unsigned long usecs)
{
	__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
}
EXPORT_SYMBOL(__udelay);

void __ndelay(unsigned long nsecs)
{
	__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
}
EXPORT_SYMBOL(__ndelay);
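
/*
 * Both conversion constants above are rounded up (0x10c7 == 4295 vs. the
 * exact 2^32 / 10^6 ~= 4294.97, and 0x5 vs. 2^32 / 10^9 ~= 4.29), so the
 * computed delays err on the long side rather than the short side.
 */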