// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2012, Microsoft Corporation.
 *
 * Author:
 *   K. Y. Srinivasan <kys@microsoft.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/mman.h>
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/completion.h>
#include <linux/memory_hotplug.h>
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/percpu_counter.h>

#include <linux/hyperv.h>
#include <asm/hyperv-tlfs.h>

#include <asm/mshyperv.h>

#define CREATE_TRACE_POINTS
#include "hv_trace_balloon.h"

/*
 * We begin with definitions supporting the Dynamic Memory protocol
 * with the host.
 *
 * Begin protocol definitions.
 */


/*
 * Protocol versions. The low word is the minor version, the high word is
 * the major version.
 *
 * History:
 * Initial version 1.0
 * Changed to 0.1 on 2009/03/25
 * Changed to 0.2 on 2009/05/14
 * Changed to 0.3 on 2009/12/03
 * Changed to 1.0 on 2011/04/05
 */

#define DYNMEM_MAKE_VERSION(Major, Minor) ((__u32)(((Major) << 16) | (Minor)))
#define DYNMEM_MAJOR_VERSION(Version) ((__u32)(Version) >> 16)
#define DYNMEM_MINOR_VERSION(Version) ((__u32)(Version) & 0xff)
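/*
 * For example, DYNMEM_MAKE_VERSION(2, 0) yields 0x00020000:
 * DYNMEM_MAJOR_VERSION(0x00020000) == 2 and DYNMEM_MINOR_VERSION() == 0.
 * Note that DYNMEM_MINOR_VERSION() extracts only the low byte, which is
 * sufficient for the minor version numbers defined below.
 */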

enum {
	DYNMEM_PROTOCOL_VERSION_1 = DYNMEM_MAKE_VERSION(0, 3),
	DYNMEM_PROTOCOL_VERSION_2 = DYNMEM_MAKE_VERSION(1, 0),
	DYNMEM_PROTOCOL_VERSION_3 = DYNMEM_MAKE_VERSION(2, 0),

	DYNMEM_PROTOCOL_VERSION_WIN7 = DYNMEM_PROTOCOL_VERSION_1,
	DYNMEM_PROTOCOL_VERSION_WIN8 = DYNMEM_PROTOCOL_VERSION_2,
	DYNMEM_PROTOCOL_VERSION_WIN10 = DYNMEM_PROTOCOL_VERSION_3,

	DYNMEM_PROTOCOL_VERSION_CURRENT = DYNMEM_PROTOCOL_VERSION_WIN10
};


/*
 * Message Types
 */

enum dm_message_type {
	/*
	 * Version 0.3
	 */
	DM_ERROR = 0,
	DM_VERSION_REQUEST = 1,
	DM_VERSION_RESPONSE = 2,
	DM_CAPABILITIES_REPORT = 3,
	DM_CAPABILITIES_RESPONSE = 4,
	DM_STATUS_REPORT = 5,
	DM_BALLOON_REQUEST = 6,
	DM_BALLOON_RESPONSE = 7,
	DM_UNBALLOON_REQUEST = 8,
	DM_UNBALLOON_RESPONSE = 9,
	DM_MEM_HOT_ADD_REQUEST = 10,
	DM_MEM_HOT_ADD_RESPONSE = 11,
	DM_VERSION_03_MAX = 11,
	/*
	 * Version 1.0.
	 */
	DM_INFO_MESSAGE = 12,
	DM_VERSION_1_MAX = 12
};


/*
 * Structures defining the dynamic memory management
 * protocol.
 */

union dm_version {
	struct {
		__u16 minor_version;
		__u16 major_version;
	};
	__u32 version;
} __packed;


union dm_caps {
	struct {
		__u64 balloon:1;
		__u64 hot_add:1;
		/*
		 * To support guests that may have alignment
		 * limitations on hot-add, the guest can specify
		 * its alignment requirements; a value of n
		 * represents an alignment of 2^n in megabytes.
		 */
		__u64 hot_add_alignment:4;
		__u64 reservedz:58;
	} cap_bits;
	__u64 caps;
} __packed;
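/*
 * For example, a hot_add_alignment value of 7 would indicate a
 * 2^7 MB = 128 MB alignment requirement, which matches the HA_CHUNK
 * granularity this driver hot-adds memory in (see below).
 */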

union dm_mem_page_range {
	struct {
		/*
		 * The PFN number of the first page in the range.
		 * 40 bits is the architectural limit of a PFN
		 * number for AMD64.
		 */
		__u64 start_page:40;
		/*
		 * The number of pages in the range.
		 */
		__u64 page_cnt:24;
	} finfo;
	__u64 page_range;
} __packed;
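/*
 * As an illustration, a range starting at PFN 0x10000 and spanning 512
 * pages is carried as start_page = 0x10000 and page_cnt = 512, packed
 * together into the single 64-bit page_range value.
 */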


/*
 * The header for all dynamic memory messages:
 *
 * type: Type of the message.
 * size: Size of the message in bytes, including the header.
 * trans_id: The guest is responsible for manufacturing this ID.
 */

struct dm_header {
	__u16 type;
	__u16 size;
	__u32 trans_id;
} __packed;

/*
 * A generic message format for dynamic memory.
 * Specific message formats are defined later in the file.
 */

struct dm_message {
	struct dm_header hdr;
	__u8 data[]; /* enclosed message */
} __packed;


/*
 * Specific message types supporting the dynamic memory protocol.
 */

/*
 * Version negotiation message. Sent from the guest to the host.
 * The guest is free to try different versions until the host
 * accepts the version.
 *
 * dm_version: The protocol version requested.
 * is_last_attempt: If TRUE, this is the last version the guest will request.
 * reservedz: Reserved field, set to zero.
 */

struct dm_version_request {
	struct dm_header hdr;
	union dm_version version;
	__u32 is_last_attempt:1;
	__u32 reservedz:31;
} __packed;

/*
 * Version response message; sent from the host to the guest and indicates
 * if the host has accepted the version sent by the guest.
 *
 * is_accepted: If TRUE, the host has accepted the version and the guest
 * should proceed to the next stage of the protocol. FALSE indicates that
 * the guest should re-try with a different version.
 *
 * reservedz: Reserved field, set to zero.
 */

struct dm_version_response {
	struct dm_header hdr;
	__u64 is_accepted:1;
	__u64 reservedz:63;
} __packed;

/*
 * Message reporting capabilities. This is sent from the guest to the
 * host.
 */

struct dm_capabilities {
	struct dm_header hdr;
	union dm_caps caps;
	__u64 min_page_cnt;
	__u64 max_page_number;
} __packed;

/*
 * Response to the capabilities message. This is sent from the host to the
 * guest. This message notifies if the host has accepted the guest's
 * capabilities. If the host has not accepted, the guest must shut down
 * the service.
 *
 * is_accepted: Indicates if the host has accepted the guest's capabilities.
 * reservedz: Must be 0.
 */

struct dm_capabilities_resp_msg {
	struct dm_header hdr;
	__u64 is_accepted:1;
	__u64 reservedz:63;
} __packed;

/*
 * This message is used to report memory pressure from the guest.
 * This message is not part of any transaction and there is no
 * response to this message.
 *
 * num_avail: Available memory in pages.
 * num_committed: Committed memory in pages.
 * page_file_size: The accumulated size of all page files
 * in the system in pages.
 * zero_free: The number of zero and free pages.
 * page_file_writes: The writes to the page file in pages.
 * io_diff: An indicator of file cache efficiency or page file activity,
 * calculated as File Cache Page Fault Count - Page Read Count.
 * This value is in pages.
 *
 * Some of these metrics are Windows specific and fortunately
 * the algorithm on the host side that computes the guest memory
 * pressure only uses the num_committed value.
 */

struct dm_status {
	struct dm_header hdr;
	__u64 num_avail;
	__u64 num_committed;
	__u64 page_file_size;
	__u64 zero_free;
	__u32 page_file_writes;
	__u32 io_diff;
} __packed;


/*
 * Message to ask the guest to allocate memory - balloon up message.
 * This message is sent from the host to the guest. The guest may not be
 * able to allocate as much memory as requested.
 *
 * num_pages: number of pages to allocate.
 */

struct dm_balloon {
	struct dm_header hdr;
	__u32 num_pages;
	__u32 reservedz;
} __packed;


/*
 * Balloon response message; this message is sent from the guest
 * to the host in response to the balloon message.
 *
 * reservedz: Reserved; must be set to zero.
 * more_pages: If FALSE, this is the last message of the transaction.
 * If TRUE, there will be at least one more message from the guest.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_balloon_response {
	struct dm_header hdr;
	__u32 reservedz;
	__u32 more_pages:1;
	__u32 range_count:31;
	union dm_mem_page_range range_array[];
} __packed;
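/*
 * Responses are built in a single HV_HYP_PAGE_SIZE (4 KiB) send buffer, so
 * the 16-byte fixed part of this structure leaves room for roughly 510
 * dm_mem_page_range entries per message; alloc_balloon_pages() below stops
 * filling the array when the next entry would overflow that buffer.
 */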

/*
 * Un-balloon message; this message is sent from the host
 * to the guest to give the guest more memory.
 *
 * more_pages: If FALSE, this is the last message of the transaction.
 * If TRUE, there will be at least one more message from the host.
 *
 * reservedz: Reserved; must be set to zero.
 *
 * range_count: The number of ranges in the range array.
 *
 * range_array: An array of page ranges returned to the host.
 *
 */

struct dm_unballoon_request {
	struct dm_header hdr;
	__u32 more_pages:1;
	__u32 reservedz:31;
	__u32 range_count;
	union dm_mem_page_range range_array[];
} __packed;

/*
 * Un-balloon response message; this message is sent from the guest
 * to the host in response to an unballoon request.
 *
 */

struct dm_unballoon_response {
	struct dm_header hdr;
} __packed;


/*
 * Hot add request message. Message sent from the host to the guest.
 *
 * mem_range: Memory range to hot add.
 *
 */

struct dm_hot_add {
	struct dm_header hdr;
	union dm_mem_page_range range;
} __packed;

/*
 * Hot add response message.
 * This message is sent by the guest to report the status of a hot add request.
 * If page_count is less than the requested page count, then the host should
 * assume all further hot add requests will fail, since this indicates that
 * the guest has hit an upper physical memory barrier.
 *
 * Hot adds may also fail due to low resources; in this case, the guest must
 * not complete this message until the hot add can succeed, and the host must
 * not send a new hot add request until the response is sent.
 * If the VSC fails to hot add memory DYNMEM_NUMBER_OF_UNSUCCESSFUL_HOTADD_ATTEMPTS
 * times, it fails the request.
 *
 *
 * page_count: number of pages that were successfully hot added.
 *
 * result: result of the operation 1: success, 0: failure.
 *
 */

struct dm_hot_add_response {
	struct dm_header hdr;
	__u32 page_count;
	__u32 result;
} __packed;

/*
 * Types of information sent from the host to the guest.
 */

enum dm_info_type {
	INFO_TYPE_MAX_PAGE_CNT = 0,
	MAX_INFO_TYPE
};


/*
 * Header for the information message.
 */

struct dm_info_header {
	enum dm_info_type type;
	__u32 data_size;
} __packed;

/*
 * This message is sent from the host to the guest to pass
 * some relevant information (win8 addition).
 *
 * reserved: not used.
 * info_size: size of the information blob.
 * info: information blob.
 */

struct dm_info_msg {
	struct dm_header hdr;
	__u32 reserved;
	__u32 info_size;
	__u8 info[];
};

/*
 * End protocol definitions.
 */

/*
 * State to manage hot adding memory into the guest.
 * The range start_pfn : end_pfn specifies the range
 * that the host has asked us to hot add. The range
 * start_pfn : ha_end_pfn specifies the range that we have
 * currently hot added. We hot add in multiples of 128M
 * chunks; it is possible that we may not be able to bring
 * online all the pages in the region. The range
 * covered_start_pfn:covered_end_pfn defines the pages that can
 * be brought online.
 */

struct hv_hotadd_state {
	struct list_head list;
	unsigned long start_pfn;
	unsigned long covered_start_pfn;
	unsigned long covered_end_pfn;
	unsigned long ha_end_pfn;
	unsigned long end_pfn;
	/*
	 * A list of gaps.
	 */
	struct list_head gap_list;
};

struct hv_hotadd_gap {
	struct list_head list;
	unsigned long start_pfn;
	unsigned long end_pfn;
};
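/*
 * A rough sketch of how the per-region PFN ranges relate; gaps describe
 * sub-ranges of the covered range that are not backed by host pages:
 *
 *  start_pfn                        ha_end_pfn                 end_pfn
 *  |<------------- hot added ------------>|<---- not yet added ---->|
 *  |<-- covered -->|  gap  |<-- covered -->
 *  covered_start_pfn            covered_end_pfn
 */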

struct balloon_state {
	__u32 num_pages;
	struct work_struct wrk;
};

struct hot_add_wrk {
	union dm_mem_page_range ha_page_range;
	union dm_mem_page_range ha_region_range;
	struct work_struct wrk;
};

static bool allow_hibernation;
static bool hot_add = true;
static bool do_hot_add;
/*
 * Delay reporting memory pressure by
 * the specified number of seconds.
 */
static uint pressure_report_delay = 45;

/*
 * The last time we posted a pressure report to the host.
 */
static unsigned long last_post_time;

module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");

module_param(pressure_report_delay, uint, (S_IRUGO | S_IWUSR));
MODULE_PARM_DESC(pressure_report_delay, "Delay in secs in reporting pressure");
static atomic_t trans_id = ATOMIC_INIT(0);

static int dm_ring_size = 20 * 1024;

/*
 * Driver specific state.
 */

enum hv_dm_state {
	DM_INITIALIZING = 0,
	DM_INITIALIZED,
	DM_BALLOON_UP,
	DM_BALLOON_DOWN,
	DM_HOT_ADD,
	DM_INIT_ERROR
};


static __u8 recv_buffer[HV_HYP_PAGE_SIZE];
static __u8 balloon_up_send_buffer[HV_HYP_PAGE_SIZE];
#define PAGES_IN_2M	(2 * 1024 * 1024 / PAGE_SIZE)
#define HA_CHUNK	(128 * 1024 * 1024 / PAGE_SIZE)
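/*
 * With the common 4 KiB PAGE_SIZE this works out to PAGES_IN_2M == 512 and
 * HA_CHUNK == 32768 pages, i.e. memory is hot-added in 128 MB chunks and
 * ballooned out preferably in 2 MB units.
 */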

struct hv_dynmem_device {
	struct hv_device *dev;
	enum hv_dm_state state;
	struct completion host_event;
	struct completion config_event;

	/*
	 * Number of pages we have currently ballooned out.
	 */
	unsigned int num_pages_ballooned;
	unsigned int num_pages_onlined;
	unsigned int num_pages_added;

	/*
	 * State to manage the ballooning (up) operation.
	 */
	struct balloon_state balloon_wrk;

	/*
	 * State to execute the "hot-add" operation.
	 */
	struct hot_add_wrk ha_wrk;

	/*
	 * This state tracks if the host has specified a hot-add
	 * region.
	 */
	bool host_specified_ha_region;

	/*
	 * State to synchronize hot-add.
	 */
	struct completion ol_waitevent;
	/*
	 * This thread handles hot-add
	 * requests from the host as well as notifying
	 * the host with regards to memory pressure in
	 * the guest.
	 */
	struct task_struct *thread;

	/*
	 * Protects ha_region_list, num_pages_onlined counter and individual
	 * regions from ha_region_list.
	 */
	spinlock_t ha_lock;

	/*
	 * A list of hot-add regions.
	 */
	struct list_head ha_region_list;

	/*
	 * We start with the highest version we can support
	 * and downgrade based on the host; we save here the
	 * next version to try.
	 */
	__u32 next_version;

	/*
	 * The negotiated version agreed by the host.
	 */
	__u32 version;
};

static struct hv_dynmem_device dm_device;

static void post_status(struct hv_dynmem_device *dm);

#ifdef CONFIG_MEMORY_HOTPLUG
static inline bool has_pfn_is_backed(struct hv_hotadd_state *has,
				     unsigned long pfn)
{
	struct hv_hotadd_gap *gap;

	/* The page is not backed. */
	if ((pfn < has->covered_start_pfn) || (pfn >= has->covered_end_pfn))
		return false;

	/* Check for gaps. */
	list_for_each_entry(gap, &has->gap_list, list) {
		if ((pfn >= gap->start_pfn) && (pfn < gap->end_pfn))
			return false;
	}

	return true;
}

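/*
 * Count how many PFNs in [start_pfn, start_pfn + nr_pages) are backed by
 * one of our hot-add regions; only those were accounted in
 * num_pages_onlined and need to be subtracted again on MEM_OFFLINE.
 */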
static unsigned long hv_page_offline_check(unsigned long start_pfn,
					   unsigned long nr_pages)
{
	unsigned long pfn = start_pfn, count = 0;
	struct hv_hotadd_state *has;
	bool found;

	while (pfn < start_pfn + nr_pages) {
		/*
		 * Search for HAS which covers the pfn and when we find one
		 * count how many consecutive PFNs are covered.
		 */
		found = false;
		list_for_each_entry(has, &dm_device.ha_region_list, list) {
			while ((pfn >= has->start_pfn) &&
			       (pfn < has->end_pfn) &&
			       (pfn < start_pfn + nr_pages)) {
				found = true;
				if (has_pfn_is_backed(has, pfn))
					count++;
				pfn++;
			}
		}

		/*
		 * This PFN is not in any HAS (e.g. we're offlining a region
		 * which was present at boot), no need to account for it. Go
		 * to the next one.
		 */
		if (!found)
			pfn++;
	}

	return count;
}

static int hv_memory_notifier(struct notifier_block *nb, unsigned long val,
			      void *v)
{
	struct memory_notify *mem = (struct memory_notify *)v;
	unsigned long flags, pfn_count;

	switch (val) {
	case MEM_ONLINE:
	case MEM_CANCEL_ONLINE:
		complete(&dm_device.ol_waitevent);
		break;

	case MEM_OFFLINE:
		spin_lock_irqsave(&dm_device.ha_lock, flags);
		pfn_count = hv_page_offline_check(mem->start_pfn,
						  mem->nr_pages);
		if (pfn_count <= dm_device.num_pages_onlined) {
			dm_device.num_pages_onlined -= pfn_count;
		} else {
			/*
			 * We're offlining more pages than we managed to online.
			 * This is unexpected. In any case don't let
			 * num_pages_onlined wrap around zero.
			 */
			WARN_ON_ONCE(1);
			dm_device.num_pages_onlined = 0;
		}
		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
		break;
	case MEM_GOING_ONLINE:
	case MEM_GOING_OFFLINE:
	case MEM_CANCEL_OFFLINE:
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block hv_memory_nb = {
	.notifier_call = hv_memory_notifier,
	.priority = 0
};

/* Check if the particular page is backed and can be onlined and online it. */
static void hv_page_online_one(struct hv_hotadd_state *has, struct page *pg)
{
	if (!has_pfn_is_backed(has, page_to_pfn(pg))) {
		if (!PageOffline(pg))
			__SetPageOffline(pg);
		return;
	}
	if (PageOffline(pg))
		__ClearPageOffline(pg);

	/* This frame is currently backed; online the page. */
	generic_online_page(pg, 0);

	lockdep_assert_held(&dm_device.ha_lock);
	dm_device.num_pages_onlined++;
}

static void hv_bring_pgs_online(struct hv_hotadd_state *has,
				unsigned long start_pfn, unsigned long size)
{
	int i;

	pr_debug("Online %lu pages starting at pfn 0x%lx\n", size, start_pfn);
	for (i = 0; i < size; i++)
		hv_page_online_one(has, pfn_to_page(start_pfn + i));
}

static void hv_mem_hot_add(unsigned long start, unsigned long size,
			   unsigned long pfn_count,
			   struct hv_hotadd_state *has)
{
	int ret = 0;
	int i, nid;
	unsigned long start_pfn;
	unsigned long processed_pfn;
	unsigned long total_pfn = pfn_count;
	unsigned long flags;

	for (i = 0; i < (size/HA_CHUNK); i++) {
		start_pfn = start + (i * HA_CHUNK);

		spin_lock_irqsave(&dm_device.ha_lock, flags);
		has->ha_end_pfn += HA_CHUNK;

		if (total_pfn > HA_CHUNK) {
			processed_pfn = HA_CHUNK;
			total_pfn -= HA_CHUNK;
		} else {
			processed_pfn = total_pfn;
			total_pfn = 0;
		}

		has->covered_end_pfn += processed_pfn;
		spin_unlock_irqrestore(&dm_device.ha_lock, flags);

		reinit_completion(&dm_device.ol_waitevent);

		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
		ret = add_memory(nid, PFN_PHYS((start_pfn)),
				(HA_CHUNK << PAGE_SHIFT), MEMHP_MERGE_RESOURCE);

		if (ret) {
			pr_err("hot_add memory failed error is %d\n", ret);
			if (ret == -EEXIST) {
				/*
				 * This error indicates that the failure
				 * is not transient. This is the
				 * case where the guest's physical address map
				 * precludes hot adding memory. Stop all further
				 * memory hot-add.
				 */
				do_hot_add = false;
			}
			spin_lock_irqsave(&dm_device.ha_lock, flags);
			has->ha_end_pfn -= HA_CHUNK;
			has->covered_end_pfn -= processed_pfn;
			spin_unlock_irqrestore(&dm_device.ha_lock, flags);
			break;
		}

		/*
		 * Wait for memory to get onlined. If the kernel onlined the
		 * memory when adding it, this will return directly. Otherwise,
		 * it will wait for user space to online the memory. This helps
		 * to avoid adding memory faster than it is getting onlined. As
		 * adding succeeded, it is ok to proceed even if the memory was
		 * not onlined in time.
		 */
		wait_for_completion_timeout(&dm_device.ol_waitevent, 5 * HZ);
		post_status(&dm_device);
	}
}

static void hv_online_page(struct page *pg, unsigned int order)
{
	struct hv_hotadd_state *has;
	unsigned long flags;
	unsigned long pfn = page_to_pfn(pg);

	spin_lock_irqsave(&dm_device.ha_lock, flags);
	list_for_each_entry(has, &dm_device.ha_region_list, list) {
		/* The page belongs to a different HAS. */
		if ((pfn < has->start_pfn) ||
		    (pfn + (1UL << order) > has->end_pfn))
			continue;

		hv_bring_pgs_online(has, pfn, 1UL << order);
		break;
	}
	spin_unlock_irqrestore(&dm_device.ha_lock, flags);
}

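/*
 * Check if the range [start_pfn, start_pfn + pfn_cnt) falls within an
 * already known hot-add region, recording a gap for any unbacked hole and
 * extending the region (in HA_CHUNK multiples) if the request runs past its
 * current end. Returns 1 if a covering region was found, 0 if none matches,
 * and -ENOMEM if a gap descriptor could not be allocated.
 */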
static int pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
{
	struct hv_hotadd_state *has;
	struct hv_hotadd_gap *gap;
	unsigned long residual, new_inc;
	int ret = 0;
	unsigned long flags;

	spin_lock_irqsave(&dm_device.ha_lock, flags);
	list_for_each_entry(has, &dm_device.ha_region_list, list) {
		/*
		 * If the pfn range we are dealing with is not in the current
		 * "hot add block", move on.
		 */
		if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn)
			continue;

		/*
		 * If the current start pfn is not where the covered_end
		 * is, create a gap and update covered_end_pfn.
		 */
		if (has->covered_end_pfn != start_pfn) {
			gap = kzalloc(sizeof(struct hv_hotadd_gap), GFP_ATOMIC);
			if (!gap) {
				ret = -ENOMEM;
				break;
			}

			INIT_LIST_HEAD(&gap->list);
			gap->start_pfn = has->covered_end_pfn;
			gap->end_pfn = start_pfn;
			list_add_tail(&gap->list, &has->gap_list);

			has->covered_end_pfn = start_pfn;
		}

		/*
		 * If the current hot add-request extends beyond
		 * our current limit; extend it.
		 */
		if ((start_pfn + pfn_cnt) > has->end_pfn) {
			residual = (start_pfn + pfn_cnt - has->end_pfn);
			/*
			 * Extend the region by multiples of HA_CHUNK.
			 */
			new_inc = (residual / HA_CHUNK) * HA_CHUNK;
			if (residual % HA_CHUNK)
				new_inc += HA_CHUNK;

			has->end_pfn += new_inc;
		}

		ret = 1;
		break;
	}
	spin_unlock_irqrestore(&dm_device.ha_lock, flags);

	return ret;
}

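/*
 * Hot add and/or online the requested page range within its hot-add region:
 * pages that fall inside the already hot-added part are brought online
 * directly, and any remainder is hot-added in HA_CHUNK multiples. Returns
 * the number of pages newly covered, or 0 if the range belongs to no region.
 */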
static unsigned long handle_pg_range(unsigned long pg_start,
				     unsigned long pg_count)
{
	unsigned long start_pfn = pg_start;
	unsigned long pfn_cnt = pg_count;
	unsigned long size;
	struct hv_hotadd_state *has;
	unsigned long pgs_ol = 0;
	unsigned long old_covered_state;
	unsigned long res = 0, flags;

	pr_debug("Hot adding %lu pages starting at pfn 0x%lx.\n", pg_count,
		 pg_start);

	spin_lock_irqsave(&dm_device.ha_lock, flags);
	list_for_each_entry(has, &dm_device.ha_region_list, list) {
		/*
		 * If the pfn range we are dealing with is not in the current
		 * "hot add block", move on.
		 */
		if (start_pfn < has->start_pfn || start_pfn >= has->end_pfn)
			continue;

		old_covered_state = has->covered_end_pfn;

		if (start_pfn < has->ha_end_pfn) {
			/*
			 * This is the case where we are backing pages
			 * in an already hot added region. Bring
			 * these pages online first.
			 */
			pgs_ol = has->ha_end_pfn - start_pfn;
			if (pgs_ol > pfn_cnt)
				pgs_ol = pfn_cnt;

			has->covered_end_pfn += pgs_ol;
			pfn_cnt -= pgs_ol;
			/*
			 * Check if the corresponding memory block is already
			 * online. It is possible to observe struct pages still
			 * being uninitialized here so check section instead.
			 * In case the section is online we need to bring the
			 * rest of pfns (which were not backed previously)
			 * online too.
			 */
			if (start_pfn > has->start_pfn &&
			    online_section_nr(pfn_to_section_nr(start_pfn)))
				hv_bring_pgs_online(has, start_pfn, pgs_ol);

		}

		if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
			/*
			 * We have some residual hot add range
			 * that needs to be hot added; hot add
			 * it now. Hot add a multiple of
			 * HA_CHUNK that fully covers the pages
			 * we have.
			 */
			size = (has->end_pfn - has->ha_end_pfn);
			if (pfn_cnt <= size) {
				size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
				if (pfn_cnt % HA_CHUNK)
					size += HA_CHUNK;
			} else {
				pfn_cnt = size;
			}
			spin_unlock_irqrestore(&dm_device.ha_lock, flags);
			hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has);
			spin_lock_irqsave(&dm_device.ha_lock, flags);
		}
		/*
		 * If we managed to online any pages that were given to us,
		 * we declare success.
		 */
		res = has->covered_end_pfn - old_covered_state;
		break;
	}
	spin_unlock_irqrestore(&dm_device.ha_lock, flags);

	return res;
}

static unsigned long process_hot_add(unsigned long pg_start,
				     unsigned long pfn_cnt,
				     unsigned long rg_start,
				     unsigned long rg_size)
{
	struct hv_hotadd_state *ha_region = NULL;
	int covered;
	unsigned long flags;

	if (pfn_cnt == 0)
		return 0;

	if (!dm_device.host_specified_ha_region) {
		covered = pfn_covered(pg_start, pfn_cnt);
		if (covered < 0)
			return 0;

		if (covered)
			goto do_pg_range;
	}

	/*
	 * If the host has specified a hot-add range; deal with it first.
	 */

	if (rg_size != 0) {
		ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL);
		if (!ha_region)
			return 0;

		INIT_LIST_HEAD(&ha_region->list);
		INIT_LIST_HEAD(&ha_region->gap_list);

		ha_region->start_pfn = rg_start;
		ha_region->ha_end_pfn = rg_start;
		ha_region->covered_start_pfn = pg_start;
		ha_region->covered_end_pfn = pg_start;
		ha_region->end_pfn = rg_start + rg_size;

		spin_lock_irqsave(&dm_device.ha_lock, flags);
		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
		spin_unlock_irqrestore(&dm_device.ha_lock, flags);
	}

do_pg_range:
	/*
	 * Process the page range specified; bringing them
	 * online if possible.
	 */
	return handle_pg_range(pg_start, pfn_cnt);
}

#endif

static void hot_add_req(struct work_struct *dummy)
{
	struct dm_hot_add_response resp;
#ifdef CONFIG_MEMORY_HOTPLUG
	unsigned long pg_start, pfn_cnt;
	unsigned long rg_start, rg_sz;
#endif
	struct hv_dynmem_device *dm = &dm_device;

	memset(&resp, 0, sizeof(struct dm_hot_add_response));
	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
	resp.hdr.size = sizeof(struct dm_hot_add_response);

#ifdef CONFIG_MEMORY_HOTPLUG
	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;

	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;

	if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
		unsigned long region_size;
		unsigned long region_start;

		/*
		 * The host has not specified the hot-add region.
		 * Based on the hot-add page range being specified,
		 * compute a hot-add region that can cover the pages
		 * that need to be hot-added while ensuring the alignment
		 * and size requirements of Linux as it relates to hot-add.
		 */
		region_start = pg_start;
		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
		if (pfn_cnt % HA_CHUNK)
			region_size += HA_CHUNK;

		region_start = (pg_start / HA_CHUNK) * HA_CHUNK;

		rg_start = region_start;
		rg_sz = region_size;
	}

	if (do_hot_add)
		resp.page_count = process_hot_add(pg_start, pfn_cnt,
						  rg_start, rg_sz);

	dm->num_pages_added += resp.page_count;
#endif
	/*
	 * The result field of the response structure has the
	 * following semantics:
	 *
	 * 1. If all or some pages hot-added: Guest should return success.
	 *
	 * 2. If no pages could be hot-added:
	 *
	 * If the guest returns success, then the host
	 * will not attempt any further hot-add operations. This
	 * signifies a permanent failure.
	 *
	 * If the guest returns failure, then this failure will be
	 * treated as a transient failure and the host may retry the
	 * hot-add operation after some delay.
	 */
	if (resp.page_count > 0)
		resp.result = 1;
	else if (!do_hot_add)
		resp.result = 1;
	else
		resp.result = 0;

	if (!do_hot_add || resp.page_count == 0) {
		if (!allow_hibernation)
			pr_err("Memory hot add failed\n");
		else
			pr_info("Ignore hot-add request!\n");
	}

	dm->state = DM_INITIALIZED;
	resp.hdr.trans_id = atomic_inc_return(&trans_id);
	vmbus_sendpacket(dm->dev->channel, &resp,
			 sizeof(struct dm_hot_add_response),
			 (unsigned long)NULL,
			 VM_PKT_DATA_INBAND, 0);
}

static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
{
	struct dm_info_header *info_hdr;

	info_hdr = (struct dm_info_header *)msg->info;

	switch (info_hdr->type) {
	case INFO_TYPE_MAX_PAGE_CNT:
		if (info_hdr->data_size == sizeof(__u64)) {
			__u64 *max_page_count = (__u64 *)&info_hdr[1];

			pr_info("Max. dynamic memory size: %llu MB\n",
				(*max_page_count) >> (20 - HV_HYP_PAGE_SHIFT));
		}

		break;
	default:
		pr_warn("Received Unknown type: %d\n", info_hdr->type);
	}
}

static unsigned long compute_balloon_floor(void)
{
	unsigned long min_pages;
	unsigned long nr_pages = totalram_pages();
#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
	/* Simple continuous piecewise linear function:
	 *  max MiB -> min MiB  gradient
	 *       0         0
	 *      16        16
	 *      32        24
	 *     128        72    (1/2)
	 *     512       168    (1/4)
	 *    2048       360    (1/8)
	 *    8192       744    (1/16)
	 *   32768      1512    (1/32)
	 */
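	/*
	 * For example, a guest with 4096 MiB of RAM falls in the
	 * 2048..8192 MiB bracket below, giving a floor of
	 * 232 MiB + 4096/16 MiB = 488 MiB that ballooning will not go under.
	 */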
	if (nr_pages < MB2PAGES(128))
		min_pages = MB2PAGES(8) + (nr_pages >> 1);
	else if (nr_pages < MB2PAGES(512))
		min_pages = MB2PAGES(40) + (nr_pages >> 2);
	else if (nr_pages < MB2PAGES(2048))
		min_pages = MB2PAGES(104) + (nr_pages >> 3);
	else if (nr_pages < MB2PAGES(8192))
		min_pages = MB2PAGES(232) + (nr_pages >> 4);
	else
		min_pages = MB2PAGES(488) + (nr_pages >> 5);
#undef MB2PAGES
	return min_pages;
}

/*
 * Post our status as it relates to memory pressure to the
 * host. The host expects the guests to post this status
 * periodically at 1 second intervals.
 *
 * The metrics specified in this protocol are very Windows
 * specific and so we cook up numbers here to convey our memory
 * pressure.
 */

static void post_status(struct hv_dynmem_device *dm)
{
	struct dm_status status;
	unsigned long now = jiffies;
	unsigned long last_post = last_post_time;

	if (pressure_report_delay > 0) {
		--pressure_report_delay;
		return;
	}

	if (!time_after(now, (last_post_time + HZ)))
		return;

	memset(&status, 0, sizeof(struct dm_status));
	status.hdr.type = DM_STATUS_REPORT;
	status.hdr.size = sizeof(struct dm_status);
	status.hdr.trans_id = atomic_inc_return(&trans_id);

	/*
	 * The host expects the guest to report free and committed memory.
	 * Furthermore, the host expects the pressure information to include
	 * the ballooned out pages. For a given amount of memory that we are
	 * managing we need to compute a floor below which we should not
	 * balloon. Compute this and add it to the pressure report.
	 * We also need to report all offline pages (num_pages_added -
	 * num_pages_onlined) as committed to the host, otherwise it can try
	 * asking us to balloon them out.
	 */
	status.num_avail = si_mem_available();
	status.num_committed = vm_memory_committed() +
		dm->num_pages_ballooned +
		(dm->num_pages_added > dm->num_pages_onlined ?
		 dm->num_pages_added - dm->num_pages_onlined : 0) +
		compute_balloon_floor();

	trace_balloon_status(status.num_avail, status.num_committed,
			     vm_memory_committed(), dm->num_pages_ballooned,
			     dm->num_pages_added, dm->num_pages_onlined);
	/*
	 * If our transaction ID is no longer current, just don't
	 * send the status. This can happen if we were interrupted
	 * after we picked our transaction ID.
	 */
	if (status.hdr.trans_id != atomic_read(&trans_id))
		return;

	/*
	 * If the last post time that we sampled has changed,
	 * we have raced, don't post the status.
	 */
	if (last_post != last_post_time)
		return;

	last_post_time = jiffies;
	vmbus_sendpacket(dm->dev->channel, &status,
			 sizeof(struct dm_status),
			 (unsigned long)NULL,
			 VM_PKT_DATA_INBAND, 0);

}

static void free_balloon_pages(struct hv_dynmem_device *dm,
			       union dm_mem_page_range *range_array)
{
	int num_pages = range_array->finfo.page_cnt;
	__u64 start_frame = range_array->finfo.start_page;
	struct page *pg;
	int i;

	for (i = 0; i < num_pages; i++) {
		pg = pfn_to_page(i + start_frame);
		__ClearPageOffline(pg);
		__free_page(pg);
		dm->num_pages_ballooned--;
	}
}


alloc_balloon_pages(struct hv_dynmem_device * dm,unsigned int num_pages,struct dm_balloon_response * bl_resp,int alloc_unit)1206*4882a593Smuzhiyun static unsigned int alloc_balloon_pages(struct hv_dynmem_device *dm,
1207*4882a593Smuzhiyun unsigned int num_pages,
1208*4882a593Smuzhiyun struct dm_balloon_response *bl_resp,
1209*4882a593Smuzhiyun int alloc_unit)
1210*4882a593Smuzhiyun {
1211*4882a593Smuzhiyun unsigned int i, j;
1212*4882a593Smuzhiyun struct page *pg;
1213*4882a593Smuzhiyun
1214*4882a593Smuzhiyun for (i = 0; i < num_pages / alloc_unit; i++) {
1215*4882a593Smuzhiyun if (bl_resp->hdr.size + sizeof(union dm_mem_page_range) >
1216*4882a593Smuzhiyun HV_HYP_PAGE_SIZE)
1217*4882a593Smuzhiyun return i * alloc_unit;
1218*4882a593Smuzhiyun
1219*4882a593Smuzhiyun /*
1220*4882a593Smuzhiyun * We execute this code in a thread context. Furthermore,
1221*4882a593Smuzhiyun * we don't want the kernel to try too hard.
1222*4882a593Smuzhiyun */
1223*4882a593Smuzhiyun pg = alloc_pages(GFP_HIGHUSER | __GFP_NORETRY |
1224*4882a593Smuzhiyun __GFP_NOMEMALLOC | __GFP_NOWARN,
1225*4882a593Smuzhiyun get_order(alloc_unit << PAGE_SHIFT));
1226*4882a593Smuzhiyun
1227*4882a593Smuzhiyun if (!pg)
1228*4882a593Smuzhiyun return i * alloc_unit;
1229*4882a593Smuzhiyun
1230*4882a593Smuzhiyun dm->num_pages_ballooned += alloc_unit;
1231*4882a593Smuzhiyun
1232*4882a593Smuzhiyun /*
1233*4882a593Smuzhiyun * If we allocated 2M pages, split them so we
1234*4882a593Smuzhiyun * can free them in any order we get.
1235*4882a593Smuzhiyun */
1236*4882a593Smuzhiyun
1237*4882a593Smuzhiyun if (alloc_unit != 1)
1238*4882a593Smuzhiyun split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
1239*4882a593Smuzhiyun
1240*4882a593Smuzhiyun /* mark all pages offline */
1241*4882a593Smuzhiyun for (j = 0; j < (1 << get_order(alloc_unit << PAGE_SHIFT)); j++)
1242*4882a593Smuzhiyun __SetPageOffline(pg + j);
1243*4882a593Smuzhiyun
1244*4882a593Smuzhiyun bl_resp->range_count++;
1245*4882a593Smuzhiyun bl_resp->range_array[i].finfo.start_page =
1246*4882a593Smuzhiyun page_to_pfn(pg);
1247*4882a593Smuzhiyun bl_resp->range_array[i].finfo.page_cnt = alloc_unit;
1248*4882a593Smuzhiyun bl_resp->hdr.size += sizeof(union dm_mem_page_range);
1249*4882a593Smuzhiyun
1250*4882a593Smuzhiyun }
1251*4882a593Smuzhiyun
1252*4882a593Smuzhiyun return i * alloc_unit;
1253*4882a593Smuzhiyun }
1254*4882a593Smuzhiyun
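/*
 * balloon_up() - work handler for a host balloon-up request: allocate the
 * requested number of pages (2M chunks first, falling back to single pages)
 * and report them to the host in one or more DM_BALLOON_RESPONSE messages,
 * the last of which carries more_pages == 0.
 */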
1255*4882a593Smuzhiyun static void balloon_up(struct work_struct *dummy)
1256*4882a593Smuzhiyun {
1257*4882a593Smuzhiyun unsigned int num_pages = dm_device.balloon_wrk.num_pages;
1258*4882a593Smuzhiyun unsigned int num_ballooned = 0;
1259*4882a593Smuzhiyun struct dm_balloon_response *bl_resp;
1260*4882a593Smuzhiyun int alloc_unit;
1261*4882a593Smuzhiyun int ret;
1262*4882a593Smuzhiyun bool done = false;
1263*4882a593Smuzhiyun int i;
1264*4882a593Smuzhiyun long avail_pages;
1265*4882a593Smuzhiyun unsigned long floor;
1266*4882a593Smuzhiyun
1267*4882a593Smuzhiyun /*
1268*4882a593Smuzhiyun * We will attempt 2M allocations. However, if we fail to
1269*4882a593Smuzhiyun * allocate 2M chunks, we will go back to PAGE_SIZE allocations.
1270*4882a593Smuzhiyun */
1271*4882a593Smuzhiyun alloc_unit = PAGES_IN_2M;
1272*4882a593Smuzhiyun
1273*4882a593Smuzhiyun avail_pages = si_mem_available();
1274*4882a593Smuzhiyun floor = compute_balloon_floor();
1275*4882a593Smuzhiyun
1276*4882a593Smuzhiyun /* Refuse to balloon below the floor. */
1277*4882a593Smuzhiyun if (avail_pages < num_pages || avail_pages - num_pages < floor) {
1278*4882a593Smuzhiyun pr_info("Balloon request will be partially fulfilled. %s\n",
1279*4882a593Smuzhiyun avail_pages < num_pages ? "Not enough memory." :
1280*4882a593Smuzhiyun "Balloon floor reached.");
1281*4882a593Smuzhiyun
1282*4882a593Smuzhiyun num_pages = avail_pages > floor ? (avail_pages - floor) : 0;
1283*4882a593Smuzhiyun }
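/*
 * Illustrative numbers only: with avail_pages = 500000, floor = 300000 and
 * a request for 300000 pages, the request is trimmed to
 * 500000 - 300000 = 200000 pages.
 */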
1284*4882a593Smuzhiyun
1285*4882a593Smuzhiyun while (!done) {
1286*4882a593Smuzhiyun memset(balloon_up_send_buffer, 0, HV_HYP_PAGE_SIZE);
1287*4882a593Smuzhiyun bl_resp = (struct dm_balloon_response *)balloon_up_send_buffer;
1288*4882a593Smuzhiyun bl_resp->hdr.type = DM_BALLOON_RESPONSE;
1289*4882a593Smuzhiyun bl_resp->hdr.size = sizeof(struct dm_balloon_response);
1290*4882a593Smuzhiyun bl_resp->more_pages = 1;
1291*4882a593Smuzhiyun
1292*4882a593Smuzhiyun num_pages -= num_ballooned;
1293*4882a593Smuzhiyun num_ballooned = alloc_balloon_pages(&dm_device, num_pages,
1294*4882a593Smuzhiyun bl_resp, alloc_unit);
1295*4882a593Smuzhiyun
1296*4882a593Smuzhiyun if (alloc_unit != 1 && num_ballooned == 0) {
1297*4882a593Smuzhiyun alloc_unit = 1;
1298*4882a593Smuzhiyun continue;
1299*4882a593Smuzhiyun }
1300*4882a593Smuzhiyun
1301*4882a593Smuzhiyun if (num_ballooned == 0 || num_ballooned == num_pages) {
1302*4882a593Smuzhiyun pr_debug("Ballooned %u out of %u requested pages.\n",
1303*4882a593Smuzhiyun num_pages, dm_device.balloon_wrk.num_pages);
1304*4882a593Smuzhiyun
1305*4882a593Smuzhiyun bl_resp->more_pages = 0;
1306*4882a593Smuzhiyun done = true;
1307*4882a593Smuzhiyun dm_device.state = DM_INITIALIZED;
1308*4882a593Smuzhiyun }
1309*4882a593Smuzhiyun
1310*4882a593Smuzhiyun /*
1311*4882a593Smuzhiyun * We are pushing a lot of data through the channel;
1312*4882a593Smuzhiyun * deal with transient failures caused by the
1313*4882a593Smuzhiyun * lack of space in the ring buffer.
1314*4882a593Smuzhiyun */
1315*4882a593Smuzhiyun
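/*
 * post_status() is called on every pass of the retry loop below,
 * presumably so the periodic memory-pressure report to the host is not
 * starved while we wait for ring-buffer space.
 */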
1316*4882a593Smuzhiyun do {
1317*4882a593Smuzhiyun bl_resp->hdr.trans_id = atomic_inc_return(&trans_id);
1318*4882a593Smuzhiyun ret = vmbus_sendpacket(dm_device.dev->channel,
1319*4882a593Smuzhiyun bl_resp,
1320*4882a593Smuzhiyun bl_resp->hdr.size,
1321*4882a593Smuzhiyun (unsigned long)NULL,
1322*4882a593Smuzhiyun VM_PKT_DATA_INBAND, 0);
1323*4882a593Smuzhiyun
1324*4882a593Smuzhiyun if (ret == -EAGAIN)
1325*4882a593Smuzhiyun msleep(20);
1326*4882a593Smuzhiyun post_status(&dm_device);
1327*4882a593Smuzhiyun } while (ret == -EAGAIN);
1328*4882a593Smuzhiyun
1329*4882a593Smuzhiyun if (ret) {
1330*4882a593Smuzhiyun /*
1331*4882a593Smuzhiyun * Free up the memory we allocated.
1332*4882a593Smuzhiyun */
1333*4882a593Smuzhiyun pr_err("Balloon response failed\n");
1334*4882a593Smuzhiyun
1335*4882a593Smuzhiyun for (i = 0; i < bl_resp->range_count; i++)
1336*4882a593Smuzhiyun free_balloon_pages(&dm_device,
1337*4882a593Smuzhiyun &bl_resp->range_array[i]);
1338*4882a593Smuzhiyun
1339*4882a593Smuzhiyun done = true;
1340*4882a593Smuzhiyun }
1341*4882a593Smuzhiyun }
1342*4882a593Smuzhiyun
1343*4882a593Smuzhiyun }
1344*4882a593Smuzhiyun
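/*
 * balloon_down() - handle a host unballoon request: free every page range
 * handed back by the host and, once the final request (more_pages == 0)
 * arrives, acknowledge with a DM_UNBALLOON_RESPONSE and return the device
 * to the DM_INITIALIZED state.
 */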
1345*4882a593Smuzhiyun static void balloon_down(struct hv_dynmem_device *dm,
1346*4882a593Smuzhiyun struct dm_unballoon_request *req)
1347*4882a593Smuzhiyun {
1348*4882a593Smuzhiyun union dm_mem_page_range *range_array = req->range_array;
1349*4882a593Smuzhiyun int range_count = req->range_count;
1350*4882a593Smuzhiyun struct dm_unballoon_response resp;
1351*4882a593Smuzhiyun int i;
1352*4882a593Smuzhiyun unsigned int prev_pages_ballooned = dm->num_pages_ballooned;
1353*4882a593Smuzhiyun
1354*4882a593Smuzhiyun for (i = 0; i < range_count; i++) {
1355*4882a593Smuzhiyun free_balloon_pages(dm, &range_array[i]);
1356*4882a593Smuzhiyun complete(&dm_device.config_event);
1357*4882a593Smuzhiyun }
1358*4882a593Smuzhiyun
1359*4882a593Smuzhiyun pr_debug("Freed %u ballooned pages.\n",
1360*4882a593Smuzhiyun prev_pages_ballooned - dm->num_pages_ballooned);
1361*4882a593Smuzhiyun
1362*4882a593Smuzhiyun if (req->more_pages == 1)
1363*4882a593Smuzhiyun return;
1364*4882a593Smuzhiyun
1365*4882a593Smuzhiyun memset(&resp, 0, sizeof(struct dm_unballoon_response));
1366*4882a593Smuzhiyun resp.hdr.type = DM_UNBALLOON_RESPONSE;
1367*4882a593Smuzhiyun resp.hdr.trans_id = atomic_inc_return(&trans_id);
1368*4882a593Smuzhiyun resp.hdr.size = sizeof(struct dm_unballoon_response);
1369*4882a593Smuzhiyun
1370*4882a593Smuzhiyun vmbus_sendpacket(dm_device.dev->channel, &resp,
1371*4882a593Smuzhiyun sizeof(struct dm_unballoon_response),
1372*4882a593Smuzhiyun (unsigned long)NULL,
1373*4882a593Smuzhiyun VM_PKT_DATA_INBAND, 0);
1374*4882a593Smuzhiyun
1375*4882a593Smuzhiyun dm->state = DM_INITIALIZED;
1376*4882a593Smuzhiyun }
1377*4882a593Smuzhiyun
1378*4882a593Smuzhiyun static void balloon_onchannelcallback(void *context);
1379*4882a593Smuzhiyun
1380*4882a593Smuzhiyun static int dm_thread_func(void *dm_dev)
1381*4882a593Smuzhiyun {
1382*4882a593Smuzhiyun struct hv_dynmem_device *dm = dm_dev;
1383*4882a593Smuzhiyun
1384*4882a593Smuzhiyun while (!kthread_should_stop()) {
1385*4882a593Smuzhiyun wait_for_completion_interruptible_timeout(
1386*4882a593Smuzhiyun &dm_device.config_event, 1*HZ);
1387*4882a593Smuzhiyun /*
1388*4882a593Smuzhiyun * The host expects us to post information on the memory
1389*4882a593Smuzhiyun * pressure every second.
1390*4882a593Smuzhiyun */
1391*4882a593Smuzhiyun reinit_completion(&dm_device.config_event);
1392*4882a593Smuzhiyun post_status(dm);
1393*4882a593Smuzhiyun }
1394*4882a593Smuzhiyun
1395*4882a593Smuzhiyun return 0;
1396*4882a593Smuzhiyun }
1397*4882a593Smuzhiyun
1398*4882a593Smuzhiyun
1399*4882a593Smuzhiyun static void version_resp(struct hv_dynmem_device *dm,
1400*4882a593Smuzhiyun struct dm_version_response *vresp)
1401*4882a593Smuzhiyun {
1402*4882a593Smuzhiyun struct dm_version_request version_req;
1403*4882a593Smuzhiyun int ret;
1404*4882a593Smuzhiyun
1405*4882a593Smuzhiyun if (vresp->is_accepted) {
1406*4882a593Smuzhiyun /*
1407*4882a593Smuzhiyun * We are done; wake up the
1408*4882a593Smuzhiyun * context waiting for version
1409*4882a593Smuzhiyun * negotiation.
1410*4882a593Smuzhiyun */
1411*4882a593Smuzhiyun complete(&dm->host_event);
1412*4882a593Smuzhiyun return;
1413*4882a593Smuzhiyun }
1414*4882a593Smuzhiyun /*
1415*4882a593Smuzhiyun * If there are more versions to try, continue
1416*4882a593Smuzhiyun * with negotiations; if not, shut down
1417*4882a593Smuzhiyun * the service since we are not able
1418*4882a593Smuzhiyun * to negotiate a suitable version number
1419*4882a593Smuzhiyun * with the host.
1420*4882a593Smuzhiyun */
1421*4882a593Smuzhiyun if (dm->next_version == 0)
1422*4882a593Smuzhiyun goto version_error;
1423*4882a593Smuzhiyun
1424*4882a593Smuzhiyun memset(&version_req, 0, sizeof(struct dm_version_request));
1425*4882a593Smuzhiyun version_req.hdr.type = DM_VERSION_REQUEST;
1426*4882a593Smuzhiyun version_req.hdr.size = sizeof(struct dm_version_request);
1427*4882a593Smuzhiyun version_req.hdr.trans_id = atomic_inc_return(&trans_id);
1428*4882a593Smuzhiyun version_req.version.version = dm->next_version;
1429*4882a593Smuzhiyun dm->version = version_req.version.version;
1430*4882a593Smuzhiyun
1431*4882a593Smuzhiyun /*
1432*4882a593Smuzhiyun * Set the next version to try in case current version fails.
1433*4882a593Smuzhiyun * Win7 protocol ought to be the last one to try.
1434*4882a593Smuzhiyun */
1435*4882a593Smuzhiyun switch (version_req.version.version) {
1436*4882a593Smuzhiyun case DYNMEM_PROTOCOL_VERSION_WIN8:
1437*4882a593Smuzhiyun dm->next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
1438*4882a593Smuzhiyun version_req.is_last_attempt = 0;
1439*4882a593Smuzhiyun break;
1440*4882a593Smuzhiyun default:
1441*4882a593Smuzhiyun dm->next_version = 0;
1442*4882a593Smuzhiyun version_req.is_last_attempt = 1;
1443*4882a593Smuzhiyun }
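/*
 * The resulting fallback ladder is WIN10 -> WIN8 -> WIN7; once the WIN7
 * request is rejected there is nothing left to try.
 */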
1444*4882a593Smuzhiyun
1445*4882a593Smuzhiyun ret = vmbus_sendpacket(dm->dev->channel, &version_req,
1446*4882a593Smuzhiyun sizeof(struct dm_version_request),
1447*4882a593Smuzhiyun (unsigned long)NULL,
1448*4882a593Smuzhiyun VM_PKT_DATA_INBAND, 0);
1449*4882a593Smuzhiyun
1450*4882a593Smuzhiyun if (ret)
1451*4882a593Smuzhiyun goto version_error;
1452*4882a593Smuzhiyun
1453*4882a593Smuzhiyun return;
1454*4882a593Smuzhiyun
1455*4882a593Smuzhiyun version_error:
1456*4882a593Smuzhiyun dm->state = DM_INIT_ERROR;
1457*4882a593Smuzhiyun complete(&dm->host_event);
1458*4882a593Smuzhiyun }
1459*4882a593Smuzhiyun
1460*4882a593Smuzhiyun static void cap_resp(struct hv_dynmem_device *dm,
1461*4882a593Smuzhiyun struct dm_capabilities_resp_msg *cap_resp)
1462*4882a593Smuzhiyun {
1463*4882a593Smuzhiyun if (!cap_resp->is_accepted) {
1464*4882a593Smuzhiyun pr_err("Capabilities not accepted by host\n");
1465*4882a593Smuzhiyun dm->state = DM_INIT_ERROR;
1466*4882a593Smuzhiyun }
1467*4882a593Smuzhiyun complete(&dm->host_event);
1468*4882a593Smuzhiyun }
1469*4882a593Smuzhiyun
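/*
 * Channel callback: dispatch a single message from the host. Version and
 * capability responses wake up the context waiting in balloon_connect_vsp();
 * balloon-up and hot-add requests are deferred to work items, while
 * unballoon requests are handled inline.
 */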
1470*4882a593Smuzhiyun static void balloon_onchannelcallback(void *context)
1471*4882a593Smuzhiyun {
1472*4882a593Smuzhiyun struct hv_device *dev = context;
1473*4882a593Smuzhiyun u32 recvlen;
1474*4882a593Smuzhiyun u64 requestid;
1475*4882a593Smuzhiyun struct dm_message *dm_msg;
1476*4882a593Smuzhiyun struct dm_header *dm_hdr;
1477*4882a593Smuzhiyun struct hv_dynmem_device *dm = hv_get_drvdata(dev);
1478*4882a593Smuzhiyun struct dm_balloon *bal_msg;
1479*4882a593Smuzhiyun struct dm_hot_add *ha_msg;
1480*4882a593Smuzhiyun union dm_mem_page_range *ha_pg_range;
1481*4882a593Smuzhiyun union dm_mem_page_range *ha_region;
1482*4882a593Smuzhiyun
1483*4882a593Smuzhiyun memset(recv_buffer, 0, sizeof(recv_buffer));
1484*4882a593Smuzhiyun vmbus_recvpacket(dev->channel, recv_buffer,
1485*4882a593Smuzhiyun HV_HYP_PAGE_SIZE, &recvlen, &requestid);
1486*4882a593Smuzhiyun
1487*4882a593Smuzhiyun if (recvlen > 0) {
1488*4882a593Smuzhiyun dm_msg = (struct dm_message *)recv_buffer;
1489*4882a593Smuzhiyun dm_hdr = &dm_msg->hdr;
1490*4882a593Smuzhiyun
1491*4882a593Smuzhiyun switch (dm_hdr->type) {
1492*4882a593Smuzhiyun case DM_VERSION_RESPONSE:
1493*4882a593Smuzhiyun version_resp(dm,
1494*4882a593Smuzhiyun (struct dm_version_response *)dm_msg);
1495*4882a593Smuzhiyun break;
1496*4882a593Smuzhiyun
1497*4882a593Smuzhiyun case DM_CAPABILITIES_RESPONSE:
1498*4882a593Smuzhiyun cap_resp(dm,
1499*4882a593Smuzhiyun (struct dm_capabilities_resp_msg *)dm_msg);
1500*4882a593Smuzhiyun break;
1501*4882a593Smuzhiyun
1502*4882a593Smuzhiyun case DM_BALLOON_REQUEST:
1503*4882a593Smuzhiyun if (allow_hibernation) {
1504*4882a593Smuzhiyun pr_info("Ignore balloon-up request!\n");
1505*4882a593Smuzhiyun break;
1506*4882a593Smuzhiyun }
1507*4882a593Smuzhiyun
1508*4882a593Smuzhiyun if (dm->state == DM_BALLOON_UP)
1509*4882a593Smuzhiyun pr_warn("Currently ballooning\n");
1510*4882a593Smuzhiyun bal_msg = (struct dm_balloon *)recv_buffer;
1511*4882a593Smuzhiyun dm->state = DM_BALLOON_UP;
1512*4882a593Smuzhiyun dm_device.balloon_wrk.num_pages = bal_msg->num_pages;
1513*4882a593Smuzhiyun schedule_work(&dm_device.balloon_wrk.wrk);
1514*4882a593Smuzhiyun break;
1515*4882a593Smuzhiyun
1516*4882a593Smuzhiyun case DM_UNBALLOON_REQUEST:
1517*4882a593Smuzhiyun if (allow_hibernation) {
1518*4882a593Smuzhiyun pr_info("Ignore balloon-down request!\n");
1519*4882a593Smuzhiyun break;
1520*4882a593Smuzhiyun }
1521*4882a593Smuzhiyun
1522*4882a593Smuzhiyun dm->state = DM_BALLOON_DOWN;
1523*4882a593Smuzhiyun balloon_down(dm,
1524*4882a593Smuzhiyun (struct dm_unballoon_request *)recv_buffer);
1525*4882a593Smuzhiyun break;
1526*4882a593Smuzhiyun
1527*4882a593Smuzhiyun case DM_MEM_HOT_ADD_REQUEST:
1528*4882a593Smuzhiyun if (dm->state == DM_HOT_ADD)
1529*4882a593Smuzhiyun pr_warn("Currently hot-adding\n");
1530*4882a593Smuzhiyun dm->state = DM_HOT_ADD;
1531*4882a593Smuzhiyun ha_msg = (struct dm_hot_add *)recv_buffer;
1532*4882a593Smuzhiyun if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) {
1533*4882a593Smuzhiyun /*
1534*4882a593Smuzhiyun * This is a normal hot-add request specifying
1535*4882a593Smuzhiyun * hot-add memory.
1536*4882a593Smuzhiyun */
1537*4882a593Smuzhiyun dm->host_specified_ha_region = false;
1538*4882a593Smuzhiyun ha_pg_range = &ha_msg->range;
1539*4882a593Smuzhiyun dm->ha_wrk.ha_page_range = *ha_pg_range;
1540*4882a593Smuzhiyun dm->ha_wrk.ha_region_range.page_range = 0;
1541*4882a593Smuzhiyun } else {
1542*4882a593Smuzhiyun /*
1543*4882a593Smuzhiyun * Host is specifying that we first hot-add
1544*4882a593Smuzhiyun * a region and then partially populate this
1545*4882a593Smuzhiyun * region.
1546*4882a593Smuzhiyun */
1547*4882a593Smuzhiyun dm->host_specified_ha_region = true;
1548*4882a593Smuzhiyun ha_pg_range = &ha_msg->range;
1549*4882a593Smuzhiyun ha_region = &ha_pg_range[1];
1550*4882a593Smuzhiyun dm->ha_wrk.ha_page_range = *ha_pg_range;
1551*4882a593Smuzhiyun dm->ha_wrk.ha_region_range = *ha_region;
1552*4882a593Smuzhiyun }
1553*4882a593Smuzhiyun schedule_work(&dm_device.ha_wrk.wrk);
1554*4882a593Smuzhiyun break;
1555*4882a593Smuzhiyun
1556*4882a593Smuzhiyun case DM_INFO_MESSAGE:
1557*4882a593Smuzhiyun process_info(dm, (struct dm_info_msg *)dm_msg);
1558*4882a593Smuzhiyun break;
1559*4882a593Smuzhiyun
1560*4882a593Smuzhiyun default:
1561*4882a593Smuzhiyun pr_warn_ratelimited("Unhandled message: type: %d\n", dm_hdr->type);
1562*4882a593Smuzhiyun
1563*4882a593Smuzhiyun }
1564*4882a593Smuzhiyun }
1565*4882a593Smuzhiyun
1566*4882a593Smuzhiyun }
1567*4882a593Smuzhiyun
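/*
 * balloon_connect_vsp() - open the VMBus channel and perform the initial
 * handshake: negotiate a protocol version (starting with the WIN10 version)
 * and then report our capabilities. The channel is closed on any failure.
 */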
1568*4882a593Smuzhiyun static int balloon_connect_vsp(struct hv_device *dev)
1569*4882a593Smuzhiyun {
1570*4882a593Smuzhiyun struct dm_version_request version_req;
1571*4882a593Smuzhiyun struct dm_capabilities cap_msg;
1572*4882a593Smuzhiyun unsigned long t;
1573*4882a593Smuzhiyun int ret;
1574*4882a593Smuzhiyun
1575*4882a593Smuzhiyun ret = vmbus_open(dev->channel, dm_ring_size, dm_ring_size, NULL, 0,
1576*4882a593Smuzhiyun balloon_onchannelcallback, dev);
1577*4882a593Smuzhiyun if (ret)
1578*4882a593Smuzhiyun return ret;
1579*4882a593Smuzhiyun
1580*4882a593Smuzhiyun /*
1581*4882a593Smuzhiyun * Initiate the handshake with the host and negotiate
1582*4882a593Smuzhiyun * a version that the host can support. We start with the
1583*4882a593Smuzhiyun * highest version number and go down if the host cannot
1584*4882a593Smuzhiyun * support it.
1585*4882a593Smuzhiyun */
1586*4882a593Smuzhiyun memset(&version_req, 0, sizeof(struct dm_version_request));
1587*4882a593Smuzhiyun version_req.hdr.type = DM_VERSION_REQUEST;
1588*4882a593Smuzhiyun version_req.hdr.size = sizeof(struct dm_version_request);
1589*4882a593Smuzhiyun version_req.hdr.trans_id = atomic_inc_return(&trans_id);
1590*4882a593Smuzhiyun version_req.version.version = DYNMEM_PROTOCOL_VERSION_WIN10;
1591*4882a593Smuzhiyun version_req.is_last_attempt = 0;
1592*4882a593Smuzhiyun dm_device.version = version_req.version.version;
1593*4882a593Smuzhiyun
1594*4882a593Smuzhiyun ret = vmbus_sendpacket(dev->channel, &version_req,
1595*4882a593Smuzhiyun sizeof(struct dm_version_request),
1596*4882a593Smuzhiyun (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
1597*4882a593Smuzhiyun if (ret)
1598*4882a593Smuzhiyun goto out;
1599*4882a593Smuzhiyun
1600*4882a593Smuzhiyun t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
1601*4882a593Smuzhiyun if (t == 0) {
1602*4882a593Smuzhiyun ret = -ETIMEDOUT;
1603*4882a593Smuzhiyun goto out;
1604*4882a593Smuzhiyun }
1605*4882a593Smuzhiyun
1606*4882a593Smuzhiyun /*
1607*4882a593Smuzhiyun * If we could not negotiate a compatible version with the host,
1608*4882a593Smuzhiyun * fail the probe function.
1609*4882a593Smuzhiyun */
1610*4882a593Smuzhiyun if (dm_device.state == DM_INIT_ERROR) {
1611*4882a593Smuzhiyun ret = -EPROTO;
1612*4882a593Smuzhiyun goto out;
1613*4882a593Smuzhiyun }
1614*4882a593Smuzhiyun
1615*4882a593Smuzhiyun pr_info("Using Dynamic Memory protocol version %u.%u\n",
1616*4882a593Smuzhiyun DYNMEM_MAJOR_VERSION(dm_device.version),
1617*4882a593Smuzhiyun DYNMEM_MINOR_VERSION(dm_device.version));
1618*4882a593Smuzhiyun
1619*4882a593Smuzhiyun /*
1620*4882a593Smuzhiyun * Now submit our capabilities to the host.
1621*4882a593Smuzhiyun */
1622*4882a593Smuzhiyun memset(&cap_msg, 0, sizeof(struct dm_capabilities));
1623*4882a593Smuzhiyun cap_msg.hdr.type = DM_CAPABILITIES_REPORT;
1624*4882a593Smuzhiyun cap_msg.hdr.size = sizeof(struct dm_capabilities);
1625*4882a593Smuzhiyun cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);
1626*4882a593Smuzhiyun
1627*4882a593Smuzhiyun /*
1628*4882a593Smuzhiyun * When hibernation (i.e. virtual ACPI S4 state) is enabled, the host
1629*4882a593Smuzhiyun * currently still requires the bits to be set, so we have to add code
1630*4882a593Smuzhiyun * to fail the host's hot-add and balloon up/down requests, if any.
1631*4882a593Smuzhiyun */
1632*4882a593Smuzhiyun cap_msg.caps.cap_bits.balloon = 1;
1633*4882a593Smuzhiyun cap_msg.caps.cap_bits.hot_add = 1;
1634*4882a593Smuzhiyun
1635*4882a593Smuzhiyun /*
1636*4882a593Smuzhiyun * Specify our alignment requirements as they relate to
1637*4882a593Smuzhiyun * memory hot-add. Specify 128MB alignment.
1638*4882a593Smuzhiyun */
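/*
 * The encoding appears to be log2 of the alignment in megabytes,
 * so 7 corresponds to 2^7 MB = 128 MB.
 */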
1639*4882a593Smuzhiyun cap_msg.caps.cap_bits.hot_add_alignment = 7;
1640*4882a593Smuzhiyun
1641*4882a593Smuzhiyun /*
1642*4882a593Smuzhiyun * Currently the host does not use these
1643*4882a593Smuzhiyun * values and we set them to what is done in the
1644*4882a593Smuzhiyun * Windows driver.
1645*4882a593Smuzhiyun */
1646*4882a593Smuzhiyun cap_msg.min_page_cnt = 0;
1647*4882a593Smuzhiyun cap_msg.max_page_number = -1;
1648*4882a593Smuzhiyun
1649*4882a593Smuzhiyun ret = vmbus_sendpacket(dev->channel, &cap_msg,
1650*4882a593Smuzhiyun sizeof(struct dm_capabilities),
1651*4882a593Smuzhiyun (unsigned long)NULL, VM_PKT_DATA_INBAND, 0);
1652*4882a593Smuzhiyun if (ret)
1653*4882a593Smuzhiyun goto out;
1654*4882a593Smuzhiyun
1655*4882a593Smuzhiyun t = wait_for_completion_timeout(&dm_device.host_event, 5*HZ);
1656*4882a593Smuzhiyun if (t == 0) {
1657*4882a593Smuzhiyun ret = -ETIMEDOUT;
1658*4882a593Smuzhiyun goto out;
1659*4882a593Smuzhiyun }
1660*4882a593Smuzhiyun
1661*4882a593Smuzhiyun /*
1662*4882a593Smuzhiyun * If the host does not like our capabilities,
1663*4882a593Smuzhiyun * fail the probe function.
1664*4882a593Smuzhiyun */
1665*4882a593Smuzhiyun if (dm_device.state == DM_INIT_ERROR) {
1666*4882a593Smuzhiyun ret = -EPROTO;
1667*4882a593Smuzhiyun goto out;
1668*4882a593Smuzhiyun }
1669*4882a593Smuzhiyun
1670*4882a593Smuzhiyun return 0;
1671*4882a593Smuzhiyun out:
1672*4882a593Smuzhiyun vmbus_close(dev->channel);
1673*4882a593Smuzhiyun return ret;
1674*4882a593Smuzhiyun }
1675*4882a593Smuzhiyun
1676*4882a593Smuzhiyun static int balloon_probe(struct hv_device *dev,
1677*4882a593Smuzhiyun const struct hv_vmbus_device_id *dev_id)
1678*4882a593Smuzhiyun {
1679*4882a593Smuzhiyun int ret;
1680*4882a593Smuzhiyun
1681*4882a593Smuzhiyun allow_hibernation = hv_is_hibernation_supported();
1682*4882a593Smuzhiyun if (allow_hibernation)
1683*4882a593Smuzhiyun hot_add = false;
1684*4882a593Smuzhiyun
1685*4882a593Smuzhiyun #ifdef CONFIG_MEMORY_HOTPLUG
1686*4882a593Smuzhiyun do_hot_add = hot_add;
1687*4882a593Smuzhiyun #else
1688*4882a593Smuzhiyun do_hot_add = false;
1689*4882a593Smuzhiyun #endif
1690*4882a593Smuzhiyun dm_device.dev = dev;
1691*4882a593Smuzhiyun dm_device.state = DM_INITIALIZING;
1692*4882a593Smuzhiyun dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN8;
1693*4882a593Smuzhiyun init_completion(&dm_device.host_event);
1694*4882a593Smuzhiyun init_completion(&dm_device.config_event);
1695*4882a593Smuzhiyun INIT_LIST_HEAD(&dm_device.ha_region_list);
1696*4882a593Smuzhiyun spin_lock_init(&dm_device.ha_lock);
1697*4882a593Smuzhiyun INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
1698*4882a593Smuzhiyun INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
1699*4882a593Smuzhiyun dm_device.host_specified_ha_region = false;
1700*4882a593Smuzhiyun
1701*4882a593Smuzhiyun #ifdef CONFIG_MEMORY_HOTPLUG
1702*4882a593Smuzhiyun set_online_page_callback(&hv_online_page);
1703*4882a593Smuzhiyun init_completion(&dm_device.ol_waitevent);
1704*4882a593Smuzhiyun register_memory_notifier(&hv_memory_nb);
1705*4882a593Smuzhiyun #endif
1706*4882a593Smuzhiyun
1707*4882a593Smuzhiyun hv_set_drvdata(dev, &dm_device);
1708*4882a593Smuzhiyun
1709*4882a593Smuzhiyun ret = balloon_connect_vsp(dev);
1710*4882a593Smuzhiyun if (ret != 0)
1711*4882a593Smuzhiyun return ret;
1712*4882a593Smuzhiyun
1713*4882a593Smuzhiyun dm_device.state = DM_INITIALIZED;
1714*4882a593Smuzhiyun
1715*4882a593Smuzhiyun dm_device.thread =
1716*4882a593Smuzhiyun kthread_run(dm_thread_func, &dm_device, "hv_balloon");
1717*4882a593Smuzhiyun if (IS_ERR(dm_device.thread)) {
1718*4882a593Smuzhiyun ret = PTR_ERR(dm_device.thread);
1719*4882a593Smuzhiyun goto probe_error;
1720*4882a593Smuzhiyun }
1721*4882a593Smuzhiyun
1722*4882a593Smuzhiyun return 0;
1723*4882a593Smuzhiyun
1724*4882a593Smuzhiyun probe_error:
1725*4882a593Smuzhiyun dm_device.state = DM_INIT_ERROR;
1726*4882a593Smuzhiyun dm_device.thread = NULL;
1727*4882a593Smuzhiyun vmbus_close(dev->channel);
1728*4882a593Smuzhiyun #ifdef CONFIG_MEMORY_HOTPLUG
1729*4882a593Smuzhiyun unregister_memory_notifier(&hv_memory_nb);
1730*4882a593Smuzhiyun restore_online_page_callback(&hv_online_page);
1731*4882a593Smuzhiyun #endif
1732*4882a593Smuzhiyun return ret;
1733*4882a593Smuzhiyun }
1734*4882a593Smuzhiyun
1735*4882a593Smuzhiyun static int balloon_remove(struct hv_device *dev)
1736*4882a593Smuzhiyun {
1737*4882a593Smuzhiyun struct hv_dynmem_device *dm = hv_get_drvdata(dev);
1738*4882a593Smuzhiyun struct hv_hotadd_state *has, *tmp;
1739*4882a593Smuzhiyun struct hv_hotadd_gap *gap, *tmp_gap;
1740*4882a593Smuzhiyun unsigned long flags;
1741*4882a593Smuzhiyun
1742*4882a593Smuzhiyun if (dm->num_pages_ballooned != 0)
1743*4882a593Smuzhiyun pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);
1744*4882a593Smuzhiyun
1745*4882a593Smuzhiyun cancel_work_sync(&dm->balloon_wrk.wrk);
1746*4882a593Smuzhiyun cancel_work_sync(&dm->ha_wrk.wrk);
1747*4882a593Smuzhiyun
1748*4882a593Smuzhiyun kthread_stop(dm->thread);
1749*4882a593Smuzhiyun vmbus_close(dev->channel);
1750*4882a593Smuzhiyun #ifdef CONFIG_MEMORY_HOTPLUG
1751*4882a593Smuzhiyun unregister_memory_notifier(&hv_memory_nb);
1752*4882a593Smuzhiyun restore_online_page_callback(&hv_online_page);
1753*4882a593Smuzhiyun #endif
1754*4882a593Smuzhiyun spin_lock_irqsave(&dm_device.ha_lock, flags);
1755*4882a593Smuzhiyun list_for_each_entry_safe(has, tmp, &dm->ha_region_list, list) {
1756*4882a593Smuzhiyun list_for_each_entry_safe(gap, tmp_gap, &has->gap_list, list) {
1757*4882a593Smuzhiyun list_del(&gap->list);
1758*4882a593Smuzhiyun kfree(gap);
1759*4882a593Smuzhiyun }
1760*4882a593Smuzhiyun list_del(&has->list);
1761*4882a593Smuzhiyun kfree(has);
1762*4882a593Smuzhiyun }
1763*4882a593Smuzhiyun spin_unlock_irqrestore(&dm_device.ha_lock, flags);
1764*4882a593Smuzhiyun
1765*4882a593Smuzhiyun return 0;
1766*4882a593Smuzhiyun }
1767*4882a593Smuzhiyun
1768*4882a593Smuzhiyun static int balloon_suspend(struct hv_device *hv_dev)
1769*4882a593Smuzhiyun {
1770*4882a593Smuzhiyun struct hv_dynmem_device *dm = hv_get_drvdata(hv_dev);
1771*4882a593Smuzhiyun
1772*4882a593Smuzhiyun tasklet_disable(&hv_dev->channel->callback_event);
1773*4882a593Smuzhiyun
1774*4882a593Smuzhiyun cancel_work_sync(&dm->balloon_wrk.wrk);
1775*4882a593Smuzhiyun cancel_work_sync(&dm->ha_wrk.wrk);
1776*4882a593Smuzhiyun
1777*4882a593Smuzhiyun if (dm->thread) {
1778*4882a593Smuzhiyun kthread_stop(dm->thread);
1779*4882a593Smuzhiyun dm->thread = NULL;
1780*4882a593Smuzhiyun vmbus_close(hv_dev->channel);
1781*4882a593Smuzhiyun }
1782*4882a593Smuzhiyun
1783*4882a593Smuzhiyun tasklet_enable(&hv_dev->channel->callback_event);
1784*4882a593Smuzhiyun
1785*4882a593Smuzhiyun return 0;
1786*4882a593Smuzhiyun
1787*4882a593Smuzhiyun }
1788*4882a593Smuzhiyun
1789*4882a593Smuzhiyun static int balloon_resume(struct hv_device *dev)
1790*4882a593Smuzhiyun {
1791*4882a593Smuzhiyun int ret;
1792*4882a593Smuzhiyun
1793*4882a593Smuzhiyun dm_device.state = DM_INITIALIZING;
1794*4882a593Smuzhiyun
1795*4882a593Smuzhiyun ret = balloon_connect_vsp(dev);
1796*4882a593Smuzhiyun
1797*4882a593Smuzhiyun if (ret != 0)
1798*4882a593Smuzhiyun goto out;
1799*4882a593Smuzhiyun
1800*4882a593Smuzhiyun dm_device.thread =
1801*4882a593Smuzhiyun kthread_run(dm_thread_func, &dm_device, "hv_balloon");
1802*4882a593Smuzhiyun if (IS_ERR(dm_device.thread)) {
1803*4882a593Smuzhiyun ret = PTR_ERR(dm_device.thread);
1804*4882a593Smuzhiyun dm_device.thread = NULL;
1805*4882a593Smuzhiyun goto close_channel;
1806*4882a593Smuzhiyun }
1807*4882a593Smuzhiyun
1808*4882a593Smuzhiyun dm_device.state = DM_INITIALIZED;
1809*4882a593Smuzhiyun return 0;
1810*4882a593Smuzhiyun close_channel:
1811*4882a593Smuzhiyun vmbus_close(dev->channel);
1812*4882a593Smuzhiyun out:
1813*4882a593Smuzhiyun dm_device.state = DM_INIT_ERROR;
1814*4882a593Smuzhiyun #ifdef CONFIG_MEMORY_HOTPLUG
1815*4882a593Smuzhiyun unregister_memory_notifier(&hv_memory_nb);
1816*4882a593Smuzhiyun restore_online_page_callback(&hv_online_page);
1817*4882a593Smuzhiyun #endif
1818*4882a593Smuzhiyun return ret;
1819*4882a593Smuzhiyun }
1820*4882a593Smuzhiyun
1821*4882a593Smuzhiyun static const struct hv_vmbus_device_id id_table[] = {
1822*4882a593Smuzhiyun /* Dynamic Memory Class ID */
1823*4882a593Smuzhiyun /* 525074DC-8985-46e2-8057-A307DC18A502 */
1824*4882a593Smuzhiyun { HV_DM_GUID, },
1825*4882a593Smuzhiyun { },
1826*4882a593Smuzhiyun };
1827*4882a593Smuzhiyun
1828*4882a593Smuzhiyun MODULE_DEVICE_TABLE(vmbus, id_table);
1829*4882a593Smuzhiyun
1830*4882a593Smuzhiyun static struct hv_driver balloon_drv = {
1831*4882a593Smuzhiyun .name = "hv_balloon",
1832*4882a593Smuzhiyun .id_table = id_table,
1833*4882a593Smuzhiyun .probe = balloon_probe,
1834*4882a593Smuzhiyun .remove = balloon_remove,
1835*4882a593Smuzhiyun .suspend = balloon_suspend,
1836*4882a593Smuzhiyun .resume = balloon_resume,
1837*4882a593Smuzhiyun .driver = {
1838*4882a593Smuzhiyun .probe_type = PROBE_PREFER_ASYNCHRONOUS,
1839*4882a593Smuzhiyun },
1840*4882a593Smuzhiyun };
1841*4882a593Smuzhiyun
1842*4882a593Smuzhiyun static int __init init_balloon_drv(void)
1843*4882a593Smuzhiyun {
1844*4882a593Smuzhiyun
1845*4882a593Smuzhiyun return vmbus_driver_register(&balloon_drv);
1846*4882a593Smuzhiyun }
1847*4882a593Smuzhiyun
1848*4882a593Smuzhiyun module_init(init_balloon_drv);
1849*4882a593Smuzhiyun
1850*4882a593Smuzhiyun MODULE_DESCRIPTION("Hyper-V Balloon");
1851*4882a593Smuzhiyun MODULE_LICENSE("GPL");