// SPDX-License-Identifier: GPL-2.0
/*
 * Xen time implementation.
 *
 * This is implemented in terms of a clocksource driver which uses
 * the hypervisor clock as a nanosecond timebase, and a clockevent
 * driver which uses the hypervisor's timer mechanism.
 *
 * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007
 */
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/clocksource.h>
#include <linux/clockchips.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/pvclock_gtod.h>
#include <linux/timekeeper_internal.h>

#include <asm/pvclock.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/hypercall.h>

#include <xen/events.h>
#include <xen/features.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>

#include "xen-ops.h"

/* Minimum amount of time (in ns) until the next clock event fires */
#define TIMER_SLOP	100000

static u64 xen_sched_clock_offset __read_mostly;

/* Get the TSC speed from Xen */
static unsigned long xen_tsc_khz(void)
{
	struct pvclock_vcpu_time_info *info =
		&HYPERVISOR_shared_info->vcpu_info[0].time;

	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
	return pvclock_tsc_khz(info);
}

static u64 xen_clocksource_read(void)
{
	struct pvclock_vcpu_time_info *src;
	u64 ret;

	preempt_disable_notrace();
	src = &__this_cpu_read(xen_vcpu)->time;
	ret = pvclock_clocksource_read(src);
	preempt_enable_notrace();
	return ret;
}

static u64 xen_clocksource_get_cycles(struct clocksource *cs)
{
	return xen_clocksource_read();
}

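/*
 * sched_clock is expected to count nanoseconds from boot, while the
 * hypervisor clock counts from VM start; xen_sched_clock_offset,
 * sampled at init time, bridges the two.
 */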
static u64 xen_sched_clock(void)
{
	return xen_clocksource_read() - xen_sched_clock_offset;
}

static void xen_read_wallclock(struct timespec64 *ts)
{
	struct shared_info *s = HYPERVISOR_shared_info;
	struct pvclock_wall_clock *wall_clock = &(s->wc);
	struct pvclock_vcpu_time_info *vcpu_time;

	vcpu_time = &get_cpu_var(xen_vcpu)->time;
	pvclock_read_wallclock(wall_clock, vcpu_time, ts);
	put_cpu_var(xen_vcpu);
}

static void xen_get_wallclock(struct timespec64 *now)
{
	xen_read_wallclock(now);
}

static int xen_set_wallclock(const struct timespec64 *now)
{
	return -ENODEV;
}

static int xen_pvclock_gtod_notify(struct notifier_block *nb,
				   unsigned long was_set, void *priv)
{
	/* Protected by the calling core code serialization */
	static struct timespec64 next_sync;

	struct xen_platform_op op;
	struct timespec64 now;
	struct timekeeper *tk = priv;
	static bool settime64_supported = true;
	int ret;

	now.tv_sec = tk->xtime_sec;
	now.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);

	/*
	 * Only take the expensive hypervisor call when the clock was set
	 * or when the 11-minute RTC synchronization period has elapsed.
	 */
	if (!was_set && timespec64_compare(&now, &next_sync) < 0)
		return NOTIFY_OK;

again:
	if (settime64_supported) {
		op.cmd = XENPF_settime64;
		op.u.settime64.mbz = 0;
		op.u.settime64.secs = now.tv_sec;
		op.u.settime64.nsecs = now.tv_nsec;
		op.u.settime64.system_time = xen_clocksource_read();
	} else {
		op.cmd = XENPF_settime32;
		op.u.settime32.secs = now.tv_sec;
		op.u.settime32.nsecs = now.tv_nsec;
		op.u.settime32.system_time = xen_clocksource_read();
	}

	ret = HYPERVISOR_platform_op(&op);

	if (ret == -ENOSYS && settime64_supported) {
		settime64_supported = false;
		goto again;
	}
	if (ret < 0)
		return NOTIFY_BAD;

	/*
	 * Move the next drift compensation time 11 minutes
	 * ahead. That's emulating the sync_cmos_clock() update for
	 * the hardware RTC.
	 */
	next_sync = now;
	next_sync.tv_sec += 11 * 60;

	return NOTIFY_OK;
}

static struct notifier_block xen_pvclock_gtod_notifier = {
	.notifier_call = xen_pvclock_gtod_notify,
};

static int xen_cs_enable(struct clocksource *cs)
{
	vclocks_set_used(VDSO_CLOCKMODE_PVCLOCK);
	return 0;
}

static struct clocksource xen_clocksource __read_mostly = {
	.name	= "xen",
	.rating	= 400,
	.read	= xen_clocksource_get_cycles,
	.mask	= CLOCKSOURCE_MASK(64),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	.enable = xen_cs_enable,
};

/*
 * Xen clockevent implementation
 *
 * Xen has two clockevent implementations:
 *
 * The old timer_op one works with all released versions of Xen prior
 * to version 3.0.4. This version of the hypervisor provides a
 * single-shot timer with nanosecond resolution. However, the same
 * event channel also carries a 100Hz tick which is delivered while
 * the vcpu is running. We don't care about or use this tick, but it
 * will cause the core time code to think the timer fired too soon
 * and end up resetting it each time. It could be filtered, but doing
 * so has complications when the ktime clocksource is not yet the xen
 * clocksource (i.e., at boot time).
 *
 * The new vcpu_op-based timer interface allows the tick timer period
 * to be changed or turned off. The tick timer is not useful as a
 * periodic timer because events are only delivered to running vcpus.
 * The one-shot timer can report when a timeout is in the past, so
 * set_next_event is capable of returning -ETIME when appropriate.
 * This interface is used when available.
 */
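
/*
 * Illustrative sketch (not compiled) contrasting how the two
 * interfaces are programmed for a timeout 1 ms from now; it only uses
 * calls that appear below, and the local variable names are made up:
 *
 *	s64 timeout = xen_clocksource_read() + NSEC_PER_MSEC;
 *
 *	// Old timer_op interface:
 *	HYPERVISOR_set_timer_op(timeout);
 *
 *	// New vcpu_op interface:
 *	struct vcpu_set_singleshot_timer single = {
 *		.timeout_abs_ns = timeout,
 *		.flags = 0,
 *	};
 *	HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer,
 *			   xen_vcpu_nr(smp_processor_id()), &single);
 */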

/*
 * Get a hypervisor absolute time. In theory we could maintain an
 * offset between the kernel's time and the hypervisor's time, and
 * apply that to a kernel's absolute timeout. Unfortunately the
 * hypervisor and kernel times can drift even if the kernel is using
 * the Xen clocksource, because ntp can warp the kernel's clocksource.
 */
static s64 get_abs_timeout(unsigned long delta)
{
	return xen_clocksource_read() + delta;
}

static int xen_timerop_shutdown(struct clock_event_device *evt)
{
	/* cancel timeout */
	HYPERVISOR_set_timer_op(0);

	return 0;
}

static int xen_timerop_set_next_event(unsigned long delta,
				      struct clock_event_device *evt)
{
	WARN_ON(!clockevent_state_oneshot(evt));

	if (HYPERVISOR_set_timer_op(get_abs_timeout(delta)) < 0)
		BUG();

	/*
	 * We may have missed the deadline, but there's no real way of
	 * knowing for sure. If the event was in the past, then we'll
	 * get an immediate interrupt.
	 */

	return 0;
}

static struct clock_event_device xen_timerop_clockevent __ro_after_init = {
	.name			= "xen",
	.features		= CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns		= 0xffffffff,
	.max_delta_ticks	= 0xffffffff,
	.min_delta_ns		= TIMER_SLOP,
	.min_delta_ticks	= TIMER_SLOP,

	.mult			= 1,
	.shift			= 0,
	.rating			= 500,

	.set_state_shutdown	= xen_timerop_shutdown,
	.set_next_event		= xen_timerop_set_next_event,
};
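
/*
 * With .mult = 1 and .shift = 0, the ns<->ticks conversion is the
 * identity, which is why the min/max_delta_ns and min/max_delta_ticks
 * fields carry the same values in both Xen clockevent devices.
 */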

static int xen_vcpuop_shutdown(struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_singleshot_timer, xen_vcpu_nr(cpu),
			       NULL) ||
	    HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
			       NULL))
		BUG();

	return 0;
}

static int xen_vcpuop_set_oneshot(struct clock_event_device *evt)
{
	int cpu = smp_processor_id();

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
			       NULL))
		BUG();

	return 0;
}

static int xen_vcpuop_set_next_event(unsigned long delta,
				     struct clock_event_device *evt)
{
	int cpu = smp_processor_id();
	struct vcpu_set_singleshot_timer single;
	int ret;

	WARN_ON(!clockevent_state_oneshot(evt));

	single.timeout_abs_ns = get_abs_timeout(delta);
	/* Get an event anyway, even if the timeout is already expired */
	single.flags = 0;

	ret = HYPERVISOR_vcpu_op(VCPUOP_set_singleshot_timer, xen_vcpu_nr(cpu),
				 &single);
	BUG_ON(ret != 0);

	return ret;
}

static struct clock_event_device xen_vcpuop_clockevent __ro_after_init = {
	.name			= "xen",
	.features		= CLOCK_EVT_FEAT_ONESHOT,

	.max_delta_ns		= 0xffffffff,
	.max_delta_ticks	= 0xffffffff,
	.min_delta_ns		= TIMER_SLOP,
	.min_delta_ticks	= TIMER_SLOP,

	.mult			= 1,
	.shift			= 0,
	.rating			= 500,

	.set_state_shutdown	= xen_vcpuop_shutdown,
	.set_state_oneshot	= xen_vcpuop_set_oneshot,
	.set_next_event		= xen_vcpuop_set_next_event,
};

static const struct clock_event_device *xen_clockevent =
	&xen_timerop_clockevent;

struct xen_clock_event_device {
	struct clock_event_device evt;
	char name[16];
};
static DEFINE_PER_CPU(struct xen_clock_event_device, xen_clock_events) = { .evt.irq = -1 };

static irqreturn_t xen_timer_interrupt(int irq, void *dev_id)
{
	struct clock_event_device *evt = this_cpu_ptr(&xen_clock_events.evt);
	irqreturn_t ret;

	ret = IRQ_NONE;
	if (evt->event_handler) {
		evt->event_handler(evt);
		ret = IRQ_HANDLED;
	}

	return ret;
}

void xen_teardown_timer(int cpu)
{
	struct clock_event_device *evt;

	evt = &per_cpu(xen_clock_events, cpu).evt;

	if (evt->irq >= 0) {
		unbind_from_irqhandler(evt->irq, NULL);
		evt->irq = -1;
	}
}

void xen_setup_timer(int cpu)
{
	struct xen_clock_event_device *xevt = &per_cpu(xen_clock_events, cpu);
	struct clock_event_device *evt = &xevt->evt;
	int irq;

	WARN(evt->irq >= 0, "IRQ%d for CPU%d is already allocated\n", evt->irq, cpu);
	if (evt->irq >= 0)
		xen_teardown_timer(cpu);

	printk(KERN_INFO "installing Xen timer for CPU %d\n", cpu);

	snprintf(xevt->name, sizeof(xevt->name), "timer%d", cpu);

	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
				      IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER|
				      IRQF_FORCE_RESUME|IRQF_EARLY_RESUME,
				      xevt->name, NULL);
	(void)xen_set_irq_priority(irq, XEN_IRQ_PRIORITY_MAX);

	memcpy(evt, xen_clockevent, sizeof(*evt));

	evt->cpumask = cpumask_of(cpu);
	evt->irq = irq;
}

void xen_setup_cpu_clockevents(void)
{
	clockevents_register_device(this_cpu_ptr(&xen_clock_events.evt));
}

void xen_timer_resume(void)
{
	int cpu;

	if (xen_clockevent != &xen_vcpuop_clockevent)
		return;

	for_each_online_cpu(cpu) {
		if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer,
				       xen_vcpu_nr(cpu), NULL))
			BUG();
	}
}

static const struct pv_time_ops xen_time_ops __initconst = {
	.sched_clock	= xen_sched_clock,
	.steal_clock	= xen_steal_clock,
};

static struct pvclock_vsyscall_time_info *xen_clock __read_mostly;
static u64 xen_clock_value_saved;

void xen_save_time_memory_area(void)
{
	struct vcpu_register_time_memory_area t;
	int ret;

	xen_clock_value_saved = xen_clocksource_read() - xen_sched_clock_offset;

	if (!xen_clock)
		return;

	t.addr.v = NULL;

	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
	if (ret != 0)
		pr_notice("Cannot save secondary vcpu_time_info (err %d)",
			  ret);
	else
		clear_page(xen_clock);
}

void xen_restore_time_memory_area(void)
{
	struct vcpu_register_time_memory_area t;
	int ret;

	if (!xen_clock)
		goto out;

	t.addr.v = &xen_clock->pvti;

	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);

	/*
	 * We don't disable VDSO_CLOCKMODE_PVCLOCK entirely if we fail to
	 * register the secondary time info with Xen or if we migrated to a
	 * host without the necessary flags. In either of these cases,
	 * processes see either a zeroed-out pvti or a pvti without
	 * PVCLOCK_TSC_STABLE_BIT set. Userspace checks the latter and, if
	 * the bit is clear, discards the data in pvti and falls back to a
	 * system call for a reliable timestamp (see the sketch after this
	 * function).
	 */
	if (ret != 0)
		pr_notice("Cannot restore secondary vcpu_time_info (err %d)",
			  ret);

out:
	/* Need pvclock_resume() before using xen_clocksource_read(). */
	pvclock_resume();
	xen_sched_clock_offset = xen_clocksource_read() - xen_clock_value_saved;
}
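
/*
 * Hedged sketch of the userspace check described above. The fields come
 * from struct pvclock_vcpu_time_info; the control flow is illustrative
 * rather than the actual vDSO source, and fall_back_to_syscall() is a
 * hypothetical helper:
 *
 *	u32 version, flags;
 *
 *	do {
 *		version = pvti->version;	// odd: update in progress
 *		smp_rmb();
 *		flags = pvti->flags;
 *		// ...read tsc_timestamp, system_time, mul, shift...
 *		smp_rmb();
 *	} while ((version & 1) || version != pvti->version);
 *
 *	if (!(flags & PVCLOCK_TSC_STABLE_BIT))
 *		return fall_back_to_syscall();
 */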

static void xen_setup_vsyscall_time_info(void)
{
	struct vcpu_register_time_memory_area t;
	struct pvclock_vsyscall_time_info *ti;
	int ret;

	ti = (struct pvclock_vsyscall_time_info *)get_zeroed_page(GFP_KERNEL);
	if (!ti)
		return;

	t.addr.v = &ti->pvti;

	ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area, 0, &t);
	if (ret) {
		pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (err %d)\n", ret);
		free_page((unsigned long)ti);
		return;
	}

	/*
	 * If the primary time info had this bit set, the secondary should
	 * too, since it is the same data in both, just in different memory
	 * regions. But we still check it in case the hypervisor is buggy.
	 */
	if (!(ti->pvti.flags & PVCLOCK_TSC_STABLE_BIT)) {
		t.addr.v = NULL;
		ret = HYPERVISOR_vcpu_op(VCPUOP_register_vcpu_time_memory_area,
					 0, &t);
		if (!ret)
			free_page((unsigned long)ti);

		pr_notice("xen: VDSO_CLOCKMODE_PVCLOCK not supported (tsc unstable)\n");
		return;
	}

	xen_clock = ti;
	pvclock_set_pvti_cpu0_va(xen_clock);

	xen_clocksource.vdso_clock_mode = VDSO_CLOCKMODE_PVCLOCK;
}

static void __init xen_time_init(void)
{
	struct pvclock_vcpu_time_info *pvti;
	int cpu = smp_processor_id();
	struct timespec64 tp;

	/* As Dom0 is never moved, no penalty on using TSC there */
	if (xen_initial_domain())
		xen_clocksource.rating = 275;

	clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);

	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, xen_vcpu_nr(cpu),
			       NULL) == 0) {
		/*
		 * Successfully turned off the 100Hz tick, so we have the
		 * vcpuop-based timer interface.
		 */
		printk(KERN_DEBUG "Xen: using vcpuop timer interface\n");
		xen_clockevent = &xen_vcpuop_clockevent;
	}

	/* Set initial system time with full resolution */
	xen_read_wallclock(&tp);
	do_settimeofday64(&tp);

	setup_force_cpu_cap(X86_FEATURE_TSC);

	/*
	 * Check the primary time info ahead of time for
	 * PVCLOCK_TSC_STABLE_BIT; if it is set, the faster vsyscall path
	 * for the Xen clocksource can be used.
	 */
	pvti = &__this_cpu_read(xen_vcpu)->time;
	if (pvti->flags & PVCLOCK_TSC_STABLE_BIT) {
		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
		xen_setup_vsyscall_time_info();
	}

	xen_setup_runstate_info(cpu);
	xen_setup_timer(cpu);
	xen_setup_cpu_clockevents();

	xen_time_setup_guest();

	if (xen_initial_domain())
		pvclock_gtod_register_notifier(&xen_pvclock_gtod_notifier);
}

void __init xen_init_time_ops(void)
{
	xen_sched_clock_offset = xen_clocksource_read();
	pv_ops.time = xen_time_ops;

	x86_init.timers.timer_init = xen_time_init;
	x86_init.timers.setup_percpu_clockev = x86_init_noop;
	x86_cpuinit.setup_percpu_clockev = x86_init_noop;

	x86_platform.calibrate_tsc = xen_tsc_khz;
	x86_platform.get_wallclock = xen_get_wallclock;
	/* Dom0 uses the native method to set the hardware RTC. */
	if (!xen_initial_domain())
		x86_platform.set_wallclock = xen_set_wallclock;
}

#ifdef CONFIG_XEN_PVHVM
static void xen_hvm_setup_cpu_clockevents(void)
{
	int cpu = smp_processor_id();

	xen_setup_runstate_info(cpu);
	/*
	 * xen_setup_timer(cpu) is not called here because snprintf is not
	 * safe in atomic context. It is done in xen_hvm_cpu_notify instead,
	 * which is called by smp_init during early bootup and also during
	 * CPU hotplug events.
	 */
	xen_setup_cpu_clockevents();
}

void __init xen_hvm_init_time_ops(void)
{
	static bool hvm_time_initialized;

	if (hvm_time_initialized)
		return;

	/*
	 * The vector callback is needed, otherwise we cannot receive
	 * interrupts on cpu > 0, and at this point we don't know how many
	 * cpus are available.
	 */
	if (!xen_have_vector_callback)
		return;

	if (!xen_feature(XENFEAT_hvm_safe_pvclock)) {
		pr_info_once("Xen doesn't support pvclock on HVM, disable pv timer");
		return;
	}

	/*
	 * Only MAX_VIRT_CPUS 'vcpu_info' are embedded inside 'shared_info'.
	 * __this_cpu_read(xen_vcpu) is still NULL when a Xen HVM guest
	 * boots on a vcpu >= MAX_VIRT_CPUS (e.g., after kexec), and
	 * accessing __this_cpu_read(xen_vcpu) via xen_clocksource_read()
	 * would panic.
	 *
	 * xen_hvm_init_time_ops() is therefore called again later, once
	 * __this_cpu_read(xen_vcpu) is available.
	 */
	if (!__this_cpu_read(xen_vcpu)) {
		pr_info("Delay xen_init_time_common() as kernel is running on vcpu=%d\n",
			xen_vcpu_nr(0));
		return;
	}

	xen_sched_clock_offset = xen_clocksource_read();
	pv_ops.time = xen_time_ops;
	x86_init.timers.setup_percpu_clockev = xen_time_init;
	x86_cpuinit.setup_percpu_clockev = xen_hvm_setup_cpu_clockevents;

	x86_platform.calibrate_tsc = xen_tsc_khz;
	x86_platform.get_wallclock = xen_get_wallclock;
	x86_platform.set_wallclock = xen_set_wallclock;

	hvm_time_initialized = true;
}
#endif

/* Kernel parameter to specify the Xen timer slop */
static int __init parse_xen_timer_slop(char *ptr)
{
	unsigned long slop = memparse(ptr, NULL);

	xen_timerop_clockevent.min_delta_ns = slop;
	xen_timerop_clockevent.min_delta_ticks = slop;
	xen_vcpuop_clockevent.min_delta_ns = slop;
	xen_vcpuop_clockevent.min_delta_ticks = slop;

	return 0;
}
early_param("xen_timer_slop", parse_xen_timer_slop);
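
/*
 * Usage sketch (the value is in nanoseconds; the figure is just an
 * example): booting with "xen_timer_slop=400000" raises the minimum
 * one-shot timeout to 400 us. memparse() also accepts K/M/G suffixes.
 */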