/*
 * blkfront.c
 *
 * XenLinux virtual block device driver.
 *
 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
 * Copyright (c) 2004, Christian Limpach
 * Copyright (c) 2004, Andrew Warfield
 * Copyright (c) 2005, Christopher Clark
 * Copyright (c) 2005, XenSource Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License version 2
 * as published by the Free Software Foundation; or, when distributed
 * separately from the Linux kernel or incorporated into other
 * software packages, subject to the following license:
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this source file (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy, modify,
 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
 * and to permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <linux/interrupt.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/hdreg.h>
#include <linux/cdrom.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/scatterlist.h>
#include <linux/bitmap.h>
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/sched/mm.h>

#include <xen/xen.h>
#include <xen/xenbus.h>
#include <xen/grant_table.h>
#include <xen/events.h>
#include <xen/page.h>
#include <xen/platform_pci.h>

#include <xen/interface/grant_table.h>
#include <xen/interface/io/blkif.h>
#include <xen/interface/io/protocols.h>

#include <asm/xen/hypervisor.h>

/*
 * The minimal segment size supported by the block framework is PAGE_SIZE.
 * When Linux uses a different page size than Xen, it may not be possible
 * to put all the data in a single segment.
 * This can happen when the backend doesn't support indirect descriptors and
 * therefore the maximum amount of data that a request can carry is
 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
 *
 * Note that we only support one extra request. So the Linux page size
 * should be <= (2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
 * 88KB.
 */
#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
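/*
 * Illustrative example (not from the original file): with 64KB Linux pages
 * and 4KB Xen pages, XEN_PFN_PER_PAGE is 16, which exceeds the 11 segments a
 * single blkif request can carry, so HAS_EXTRA_REQ evaluates true and a block
 * I/O request may have to be split across two ring entries.
 */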

enum blkif_state {
	BLKIF_STATE_DISCONNECTED,
	BLKIF_STATE_CONNECTED,
	BLKIF_STATE_SUSPENDED,
	BLKIF_STATE_ERROR,
};

struct grant {
	grant_ref_t gref;
	struct page *page;
	struct list_head node;
};

enum blk_req_status {
	REQ_PROCESSING,
	REQ_WAITING,
	REQ_DONE,
	REQ_ERROR,
	REQ_EOPNOTSUPP,
};

struct blk_shadow {
	struct blkif_request req;
	struct request *request;
	struct grant **grants_used;
	struct grant **indirect_grants;
	struct scatterlist *sg;
	unsigned int num_sg;
	enum blk_req_status status;

#define NO_ASSOCIATED_ID ~0UL
	/*
	 * Id of the sibling if we ever need 2 requests when handling a
	 * block I/O request
	 */
	unsigned long associated_id;
};

struct blkif_req {
	blk_status_t error;
};

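/*
 * blkif_req() returns the driver-private per-request area that blk-mq
 * allocates alongside each struct request (sized via tag_set.cmd_size
 * below); it only holds the final completion status of the request.
 */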
static inline struct blkif_req *blkif_req(struct request *rq)
{
	return blk_mq_rq_to_pdu(rq);
}

static DEFINE_MUTEX(blkfront_mutex);
static const struct block_device_operations xlvbd_block_fops;
static struct delayed_work blkfront_work;
static LIST_HEAD(info_list);

/*
 * Maximum number of segments in indirect requests. The actual value used by
 * the frontend driver is the minimum of this value and the value provided
 * by the backend driver.
 */

static unsigned int xen_blkif_max_segments = 32;
module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444);
MODULE_PARM_DESC(max_indirect_segments,
		 "Maximum amount of segments in indirect requests (default is 32)");

static unsigned int xen_blkif_max_queues = 4;
module_param_named(max_queues, xen_blkif_max_queues, uint, 0444);
MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");

/*
 * Maximum order of pages to be used for the shared ring between front and
 * backend, 4KB page granularity is used.
 */
static unsigned int xen_blkif_max_ring_order;
module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");

static bool __read_mostly xen_blkif_trusted = true;
module_param_named(trusted, xen_blkif_trusted, bool, 0644);
MODULE_PARM_DESC(trusted, "Is the backend trusted");

#define BLK_RING_SIZE(info)	\
	__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages)

/*
 * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
 * characters are enough. Define to 20 to keep consistent with backend.
 */
#define RINGREF_NAME_LEN (20)
/*
 * queue-%u would take 7 + 10(UINT_MAX) = 17 characters.
 */
#define QUEUE_NAME_LEN (17)

/*
 * Per-ring info.
 * Every blkfront device can associate with one or more blkfront_ring_info,
 * depending on how many hardware queues/rings are used.
 */
struct blkfront_ring_info {
	/* Lock to protect data in every ring buffer. */
	spinlock_t ring_lock;
	struct blkif_front_ring ring;
	unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
	unsigned int evtchn, irq;
	struct work_struct work;
	struct gnttab_free_callback callback;
	struct list_head indirect_pages;
	struct list_head grants;
	unsigned int persistent_gnts_c;
	unsigned long shadow_free;
	struct blkfront_info *dev_info;
	struct blk_shadow shadow[];
};

/*
 * We have one of these per vbd, whether ide, scsi or 'other'. They
 * hang in private_data off the gendisk structure. We may end up
 * putting all kinds of interesting stuff here :-)
 */
struct blkfront_info
{
	struct mutex mutex;
	struct xenbus_device *xbdev;
	struct gendisk *gd;
	u16 sector_size;
	unsigned int physical_sector_size;
	int vdevice;
	blkif_vdev_t handle;
	enum blkif_state connected;
	/* Number of pages per ring buffer. */
	unsigned int nr_ring_pages;
	struct request_queue *rq;
	unsigned int feature_flush:1;
	unsigned int feature_fua:1;
	unsigned int feature_discard:1;
	unsigned int feature_secdiscard:1;
	/* Connect-time cached feature_persistent parameter */
	unsigned int feature_persistent_parm:1;
	/* Persistent grants feature negotiation result */
	unsigned int feature_persistent:1;
	unsigned int bounce:1;
	unsigned int discard_granularity;
	unsigned int discard_alignment;
	/* Number of 4KB segments handled */
	unsigned int max_indirect_segments;
	int is_ready;
	struct blk_mq_tag_set tag_set;
	struct blkfront_ring_info *rinfo;
	unsigned int nr_rings;
	unsigned int rinfo_size;
	/* Save incomplete reqs and bios for migration. */
	struct list_head requests;
	struct bio_list bio_list;
	struct list_head info_list;
};

static unsigned int nr_minors;
static unsigned long *minors;
static DEFINE_SPINLOCK(minor_lock);

#define GRANT_INVALID_REF	0

#define PARTS_PER_DISK		16
#define PARTS_PER_EXT_DISK	256

#define BLKIF_MAJOR(dev) ((dev)>>8)
#define BLKIF_MINOR(dev) ((dev) & 0xff)

#define EXT_SHIFT 28
#define EXTENDED (1<<EXT_SHIFT)
#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
#define EMULATED_HD_DISK_MINOR_OFFSET (0)
#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
#define EMULATED_SD_DISK_MINOR_OFFSET (0)
#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)

#define DEV_NAME	"xvd"	/* name in /dev */

/*
 * Grants are always the same size as a Xen page (i.e. 4KB).
 * A physical segment is always the same size as a Linux page.
 * Number of grants per physical segment
 */
#define GRANTS_PER_PSEG	(PAGE_SIZE / XEN_PAGE_SIZE)

#define GRANTS_PER_INDIRECT_FRAME \
	(XEN_PAGE_SIZE / sizeof(struct blkif_request_segment))

#define INDIRECT_GREFS(_grants)		\
	DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME)

static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
static void blkfront_gather_backend_features(struct blkfront_info *info);
static int negotiate_mq(struct blkfront_info *info);

#define for_each_rinfo(info, ptr, idx)		\
	for ((ptr) = (info)->rinfo, (idx) = 0;	\
	     (idx) < (info)->nr_rings;		\
	     (idx)++, (ptr) = (void *)(ptr) + (info)->rinfo_size)

static inline struct blkfront_ring_info *
get_rinfo(const struct blkfront_info *info, unsigned int i)
{
	BUG_ON(i >= info->nr_rings);
	return (void *)info->rinfo + i * info->rinfo_size;
}

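/*
 * The shadow array doubles as a free list: shadow_free holds the index of
 * the first free slot and each free slot's req.u.rw.id links to the next
 * one. get_id_from_freelist() pops a slot, add_id_to_freelist() pushes it
 * back once the backend has responded.
 */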
static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
{
	unsigned long free = rinfo->shadow_free;

	BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info));
	rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id;
	rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
	return free;
}

static int add_id_to_freelist(struct blkfront_ring_info *rinfo,
			      unsigned long id)
{
	if (rinfo->shadow[id].req.u.rw.id != id)
		return -EINVAL;
	if (rinfo->shadow[id].request == NULL)
		return -EINVAL;
	rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free;
	rinfo->shadow[id].request = NULL;
	rinfo->shadow_free = id;
	return 0;
}

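/*
 * Pre-allocate 'num' grant tracking structures (plus backing pages when
 * bounce buffering is enabled) and queue them on rinfo->grants. On memory
 * pressure everything allocated so far is rolled back and -ENOMEM returned.
 */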
static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
{
	struct blkfront_info *info = rinfo->dev_info;
	struct page *granted_page;
	struct grant *gnt_list_entry, *n;
	int i = 0;

	while (i < num) {
		gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
		if (!gnt_list_entry)
			goto out_of_memory;

		if (info->bounce) {
			granted_page = alloc_page(GFP_NOIO | __GFP_ZERO);
			if (!granted_page) {
				kfree(gnt_list_entry);
				goto out_of_memory;
			}
			gnt_list_entry->page = granted_page;
		}

		gnt_list_entry->gref = GRANT_INVALID_REF;
		list_add(&gnt_list_entry->node, &rinfo->grants);
		i++;
	}

	return 0;

out_of_memory:
	list_for_each_entry_safe(gnt_list_entry, n,
				 &rinfo->grants, node) {
		list_del(&gnt_list_entry->node);
		if (info->bounce)
			__free_page(gnt_list_entry->page);
		kfree(gnt_list_entry);
		i--;
	}
	BUG_ON(i != 0);
	return -ENOMEM;
}

static struct grant *get_free_grant(struct blkfront_ring_info *rinfo)
{
	struct grant *gnt_list_entry;

	BUG_ON(list_empty(&rinfo->grants));
	gnt_list_entry = list_first_entry(&rinfo->grants, struct grant,
					  node);
	list_del(&gnt_list_entry->node);

	if (gnt_list_entry->gref != GRANT_INVALID_REF)
		rinfo->persistent_gnts_c--;

	return gnt_list_entry;
}

static inline void grant_foreign_access(const struct grant *gnt_list_entry,
					const struct blkfront_info *info)
{
	gnttab_page_grant_foreign_access_ref_one(gnt_list_entry->gref,
						 info->xbdev->otherend_id,
						 gnt_list_entry->page,
						 0);
}

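/*
 * get_grant() and get_indirect_grant() hand out a grant for a data page or
 * for an indirect segment page respectively. If the entry pulled off
 * rinfo->grants still carries a valid gref it is a persistent grant and is
 * reused as-is; otherwise a fresh grant reference is claimed from gref_head
 * and foreign access is granted to the backend domain.
 */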
static struct grant *get_grant(grant_ref_t *gref_head,
			       unsigned long gfn,
			       struct blkfront_ring_info *rinfo)
{
	struct grant *gnt_list_entry = get_free_grant(rinfo);
	struct blkfront_info *info = rinfo->dev_info;

	if (gnt_list_entry->gref != GRANT_INVALID_REF)
		return gnt_list_entry;

	/* Assign a gref to this page */
	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
	BUG_ON(gnt_list_entry->gref == -ENOSPC);
	if (info->bounce)
		grant_foreign_access(gnt_list_entry, info);
	else {
		/* Grant access to the GFN passed by the caller */
		gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
						info->xbdev->otherend_id,
						gfn, 0);
	}

	return gnt_list_entry;
}

static struct grant *get_indirect_grant(grant_ref_t *gref_head,
					struct blkfront_ring_info *rinfo)
{
	struct grant *gnt_list_entry = get_free_grant(rinfo);
	struct blkfront_info *info = rinfo->dev_info;

	if (gnt_list_entry->gref != GRANT_INVALID_REF)
		return gnt_list_entry;

	/* Assign a gref to this page */
	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
	BUG_ON(gnt_list_entry->gref == -ENOSPC);
	if (!info->bounce) {
		struct page *indirect_page;

		/* Fetch a pre-allocated page to use for indirect grefs */
		BUG_ON(list_empty(&rinfo->indirect_pages));
		indirect_page = list_first_entry(&rinfo->indirect_pages,
						 struct page, lru);
		list_del(&indirect_page->lru);
		gnt_list_entry->page = indirect_page;
	}
	grant_foreign_access(gnt_list_entry, info);

	return gnt_list_entry;
}

static const char *op_name(int op)
{
	static const char *const names[] = {
		[BLKIF_OP_READ] = "read",
		[BLKIF_OP_WRITE] = "write",
		[BLKIF_OP_WRITE_BARRIER] = "barrier",
		[BLKIF_OP_FLUSH_DISKCACHE] = "flush",
		[BLKIF_OP_DISCARD] = "discard" };

	if (op < 0 || op >= ARRAY_SIZE(names))
		return "unknown";

	if (!names[op])
		return "reserved";

	return names[op];
}

static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
{
	unsigned int end = minor + nr;
	int rc;

	if (end > nr_minors) {
		unsigned long *bitmap, *old;

		bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
				 GFP_KERNEL);
		if (bitmap == NULL)
			return -ENOMEM;

		spin_lock(&minor_lock);
		if (end > nr_minors) {
			old = minors;
			memcpy(bitmap, minors,
			       BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
			minors = bitmap;
			nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
		} else
			old = bitmap;
		spin_unlock(&minor_lock);
		kfree(old);
	}

	spin_lock(&minor_lock);
	if (find_next_bit(minors, end, minor) >= end) {
		bitmap_set(minors, minor, nr);
		rc = 0;
	} else
		rc = -EBUSY;
	spin_unlock(&minor_lock);

	return rc;
}

static void xlbd_release_minors(unsigned int minor, unsigned int nr)
{
	unsigned int end = minor + nr;

	BUG_ON(end > nr_minors);
	spin_lock(&minor_lock);
	bitmap_clear(minors, minor, nr);
	spin_unlock(&minor_lock);
}

static void blkif_restart_queue_callback(void *arg)
{
	struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg;
	schedule_work(&rinfo->work);
}

static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
{
	/* We don't have real geometry info, but let's at least return
	   values consistent with the size of the device */
	sector_t nsect = get_capacity(bd->bd_disk);
	sector_t cylinders = nsect;

	hg->heads = 0xff;
	hg->sectors = 0x3f;
	sector_div(cylinders, hg->heads * hg->sectors);
	hg->cylinders = cylinders;
	if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
		hg->cylinders = 0xffff;
	return 0;
}

static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
		       unsigned command, unsigned long argument)
{
	struct blkfront_info *info = bdev->bd_disk->private_data;
	int i;

	dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n",
		command, (long)argument);

	switch (command) {
	case CDROMMULTISESSION:
		dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n");
		for (i = 0; i < sizeof(struct cdrom_multisession); i++)
			if (put_user(0, (char __user *)(argument + i)))
				return -EFAULT;
		return 0;

	case CDROM_GET_CAPABILITY: {
		struct gendisk *gd = info->gd;
		if (gd->flags & GENHD_FL_CD)
			return 0;
		return -EINVAL;
	}

	default:
		/*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
		  command);*/
		return -EINVAL; /* same return as native Linux */
	}

	return 0;
}

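/*
 * Reserve a slot in the ring and a matching shadow entry for 'req'. The
 * request is first assembled in the shadow copy and only copied into the
 * shared ring (and marked REQ_WAITING) once it is fully built, so the
 * backend never observes a partially written request.
 */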
static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
					    struct request *req,
					    struct blkif_request **ring_req)
{
	unsigned long id;

	*ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
	rinfo->ring.req_prod_pvt++;

	id = get_id_from_freelist(rinfo);
	rinfo->shadow[id].request = req;
	rinfo->shadow[id].status = REQ_PROCESSING;
	rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;

	rinfo->shadow[id].req.u.rw.id = id;

	return id;
}

static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo)
{
	struct blkfront_info *info = rinfo->dev_info;
	struct blkif_request *ring_req, *final_ring_req;
	unsigned long id;

	/* Fill out a communications ring structure. */
	id = blkif_ring_get_request(rinfo, req, &final_ring_req);
	ring_req = &rinfo->shadow[id].req;

	ring_req->operation = BLKIF_OP_DISCARD;
	ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
	ring_req->u.discard.id = id;
	ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
	if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard)
		ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
	else
		ring_req->u.discard.flag = 0;

	/* Copy the request to the ring page. */
	*final_ring_req = *ring_req;
	rinfo->shadow[id].status = REQ_WAITING;

	return 0;
}

struct setup_rw_req {
	unsigned int grant_idx;
	struct blkif_request_segment *segments;
	struct blkfront_ring_info *rinfo;
	struct blkif_request *ring_req;
	grant_ref_t gref_head;
	unsigned int id;
	/* Only used when persistent grant is used and it's a read request */
	bool need_copy;
	unsigned int bvec_off;
	char *bvec_data;

	bool require_extra_req;
	struct blkif_request *extra_ring_req;
};

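/*
 * Per-grant callback invoked by gnttab_foreach_grant_in_range() for every
 * grant-sized chunk of a scatterlist element. It fills in the segment
 * descriptor (directly in the request or in an indirect frame), and for
 * bounced writes copies the bio data into the granted page first.
 */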
static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
				     unsigned int len, void *data)
{
	struct setup_rw_req *setup = data;
	int n, ref;
	struct grant *gnt_list_entry;
	unsigned int fsect, lsect;
	/* Convenient aliases */
	unsigned int grant_idx = setup->grant_idx;
	struct blkif_request *ring_req = setup->ring_req;
	struct blkfront_ring_info *rinfo = setup->rinfo;
	/*
	 * We always use the shadow of the first request to store the list
	 * of grants associated with the block I/O request. This makes the
	 * completion easier to handle even if the block I/O request is
	 * split.
	 */
	struct blk_shadow *shadow = &rinfo->shadow[setup->id];

	if (unlikely(setup->require_extra_req &&
		     grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
		/*
		 * We are using the second request, setup grant_idx
		 * to be the index of the segment array.
		 */
		grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
		ring_req = setup->extra_ring_req;
	}

	if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
	    (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
		if (setup->segments)
			kunmap_atomic(setup->segments);

		n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
		gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo);
		shadow->indirect_grants[n] = gnt_list_entry;
		setup->segments = kmap_atomic(gnt_list_entry->page);
		ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
	}

	gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
	ref = gnt_list_entry->gref;
	/*
	 * All the grants are stored in the shadow of the first
	 * request. Therefore we have to use the global index.
	 */
	shadow->grants_used[setup->grant_idx] = gnt_list_entry;

	if (setup->need_copy) {
		void *shared_data;

		shared_data = kmap_atomic(gnt_list_entry->page);
		/*
		 * this does not wipe data stored outside the
		 * range sg->offset..sg->offset+sg->length.
		 * Therefore, blkback *could* see data from
		 * previous requests. This is OK as long as
		 * persistent grants are shared with just one
		 * domain. It may need refactoring if this
		 * changes
		 */
		memcpy(shared_data + offset,
		       setup->bvec_data + setup->bvec_off,
		       len);

		kunmap_atomic(shared_data);
		setup->bvec_off += len;
	}

	fsect = offset >> 9;
	lsect = fsect + (len >> 9) - 1;
	if (ring_req->operation != BLKIF_OP_INDIRECT) {
		ring_req->u.rw.seg[grant_idx] =
			(struct blkif_request_segment) {
				.gref       = ref,
				.first_sect = fsect,
				.last_sect  = lsect };
	} else {
		setup->segments[grant_idx % GRANTS_PER_INDIRECT_FRAME] =
			(struct blkif_request_segment) {
				.gref       = ref,
				.first_sect = fsect,
				.last_sect  = lsect };
	}

	(setup->grant_idx)++;
}

static void blkif_setup_extra_req(struct blkif_request *first,
				  struct blkif_request *second)
{
	uint16_t nr_segments = first->u.rw.nr_segments;

	/*
	 * The second request is only present when the first request uses
	 * all its segments. It's always the continuation of the first one.
	 */
	first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;

	second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
	second->u.rw.sector_number = first->u.rw.sector_number +
		(BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;

	second->u.rw.handle = first->u.rw.handle;
	second->operation = first->operation;
}

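/*
 * Build the ring request(s) for a read/write request. Depending on what was
 * negotiated this produces either a single indirect request, a single
 * regular request, or (only when HAS_EXTRA_REQ and indirect descriptors are
 * unavailable) a pair of chained requests sharing one scatterlist.
 */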
static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
{
	struct blkfront_info *info = rinfo->dev_info;
	struct blkif_request *ring_req, *extra_ring_req = NULL;
	struct blkif_request *final_ring_req, *final_extra_ring_req = NULL;
	unsigned long id, extra_id = NO_ASSOCIATED_ID;
	bool require_extra_req = false;
	int i;
	struct setup_rw_req setup = {
		.grant_idx = 0,
		.segments = NULL,
		.rinfo = rinfo,
		.need_copy = rq_data_dir(req) && info->bounce,
	};

	/*
	 * Used to store if we are able to queue the request by just using
	 * existing persistent grants, or if we have to get new grants,
	 * as there are not sufficiently many free.
	 */
	bool new_persistent_gnts = false;
	struct scatterlist *sg;
	int num_sg, max_grefs, num_grant;

	max_grefs = req->nr_phys_segments * GRANTS_PER_PSEG;
	if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
		/*
		 * If we are using indirect segments we need to account
		 * for the indirect grefs used in the request.
		 */
		max_grefs += INDIRECT_GREFS(max_grefs);

	/* Check if we have enough persistent grants to allocate a request */
	if (rinfo->persistent_gnts_c < max_grefs) {
		new_persistent_gnts = true;

		if (gnttab_alloc_grant_references(
		    max_grefs - rinfo->persistent_gnts_c,
		    &setup.gref_head) < 0) {
			gnttab_request_free_callback(
				&rinfo->callback,
				blkif_restart_queue_callback,
				rinfo,
				max_grefs - rinfo->persistent_gnts_c);
			return 1;
		}
	}

	/* Fill out a communications ring structure. */
	id = blkif_ring_get_request(rinfo, req, &final_ring_req);
	ring_req = &rinfo->shadow[id].req;

	num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
	num_grant = 0;
	/* Calculate the number of grants used */
	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
		num_grant += gnttab_count_grant(sg->offset, sg->length);

	require_extra_req = info->max_indirect_segments == 0 &&
		num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
	BUG_ON(!HAS_EXTRA_REQ && require_extra_req);

	rinfo->shadow[id].num_sg = num_sg;
	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
	    likely(!require_extra_req)) {
		/*
		 * The indirect operation can only be a BLKIF_OP_READ or
		 * BLKIF_OP_WRITE
		 */
		BUG_ON(req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA);
		ring_req->operation = BLKIF_OP_INDIRECT;
		ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
			BLKIF_OP_WRITE : BLKIF_OP_READ;
		ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
		ring_req->u.indirect.handle = info->handle;
		ring_req->u.indirect.nr_segments = num_grant;
	} else {
		ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
		ring_req->u.rw.handle = info->handle;
		ring_req->operation = rq_data_dir(req) ?
			BLKIF_OP_WRITE : BLKIF_OP_READ;
		if (req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA) {
			/*
			 * Ideally we can do an unordered flush-to-disk.
			 * In case the backend only supports barriers, use that.
			 * A barrier request is a superset of FUA, so we can
			 * implement it the same way.  (It's also a FLUSH+FUA,
			 * since it is guaranteed ordered WRT previous writes.)
			 */
			if (info->feature_flush && info->feature_fua)
				ring_req->operation =
					BLKIF_OP_WRITE_BARRIER;
			else if (info->feature_flush)
				ring_req->operation =
					BLKIF_OP_FLUSH_DISKCACHE;
			else
				ring_req->operation = 0;
		}
		ring_req->u.rw.nr_segments = num_grant;
		if (unlikely(require_extra_req)) {
			extra_id = blkif_ring_get_request(rinfo, req,
							  &final_extra_ring_req);
			extra_ring_req = &rinfo->shadow[extra_id].req;

			/*
			 * Only the first request contains the scatter-gather
			 * list.
			 */
			rinfo->shadow[extra_id].num_sg = 0;

			blkif_setup_extra_req(ring_req, extra_ring_req);

			/* Link the 2 requests together */
			rinfo->shadow[extra_id].associated_id = id;
			rinfo->shadow[id].associated_id = extra_id;
		}
	}

	setup.ring_req = ring_req;
	setup.id = id;

	setup.require_extra_req = require_extra_req;
	if (unlikely(require_extra_req))
		setup.extra_ring_req = extra_ring_req;

	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
		BUG_ON(sg->offset + sg->length > PAGE_SIZE);

		if (setup.need_copy) {
			setup.bvec_off = sg->offset;
			setup.bvec_data = kmap_atomic(sg_page(sg));
		}

		gnttab_foreach_grant_in_range(sg_page(sg),
					      sg->offset,
					      sg->length,
					      blkif_setup_rw_req_grant,
					      &setup);

		if (setup.need_copy)
			kunmap_atomic(setup.bvec_data);
	}
	if (setup.segments)
		kunmap_atomic(setup.segments);

	/* Copy request(s) to the ring page. */
	*final_ring_req = *ring_req;
	rinfo->shadow[id].status = REQ_WAITING;
	if (unlikely(require_extra_req)) {
		*final_extra_ring_req = *extra_ring_req;
		rinfo->shadow[extra_id].status = REQ_WAITING;
	}

	if (new_persistent_gnts)
		gnttab_free_grant_references(setup.gref_head);

	return 0;
}

/*
 * Generate a Xen blkfront IO request from a blk layer request. Reads
 * and writes are handled as expected.
 *
 * @req: a request struct
 */
static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo)
{
	if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED))
		return 1;

	if (unlikely(req_op(req) == REQ_OP_DISCARD ||
		     req_op(req) == REQ_OP_SECURE_ERASE))
		return blkif_queue_discard_req(req, rinfo);
	else
		return blkif_queue_rw_req(req, rinfo);
}

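/*
 * Push the private producer index out to the shared ring and, if the
 * backend asked for it, kick it through the per-ring event channel.
 */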
static inline void flush_requests(struct blkfront_ring_info *rinfo)
{
	int notify;

	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify);

	if (notify)
		notify_remote_via_irq(rinfo->irq);
}

static inline bool blkif_request_flush_invalid(struct request *req,
					       struct blkfront_info *info)
{
	return (blk_rq_is_passthrough(req) ||
		((req_op(req) == REQ_OP_FLUSH) &&
		 !info->feature_flush) ||
		((req->cmd_flags & REQ_FUA) &&
		 !info->feature_fua));
}

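/*
 * blk-mq .queue_rq handler. A full ring (or a transient grant shortage in
 * blkif_queue_request()) stops the hardware queue and returns
 * BLK_STS_DEV_RESOURCE so the block layer retries the request later;
 * flush/FUA requests the backend cannot honour fail with BLK_STS_IOERR.
 */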
static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
				   const struct blk_mq_queue_data *qd)
{
	unsigned long flags;
	int qid = hctx->queue_num;
	struct blkfront_info *info = hctx->queue->queuedata;
	struct blkfront_ring_info *rinfo = NULL;

	rinfo = get_rinfo(info, qid);
	blk_mq_start_request(qd->rq);
	spin_lock_irqsave(&rinfo->ring_lock, flags);
	if (RING_FULL(&rinfo->ring))
		goto out_busy;

	if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info))
		goto out_err;

	if (blkif_queue_request(qd->rq, rinfo))
		goto out_busy;

	flush_requests(rinfo);
	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
	return BLK_STS_OK;

out_err:
	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
	return BLK_STS_IOERR;

out_busy:
	blk_mq_stop_hw_queue(hctx);
	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
	return BLK_STS_DEV_RESOURCE;
}

static void blkif_complete_rq(struct request *rq)
{
	blk_mq_end_request(rq, blkif_req(rq)->error);
}

static const struct blk_mq_ops blkfront_mq_ops = {
	.queue_rq = blkif_queue_rq,
	.complete = blkif_complete_rq,
};

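/*
 * Apply the negotiated limits to the request queue. As a rough example
 * derived from the constants above: with the default of 32 indirect
 * segments a single request is capped at 32 * XEN_PAGE_SIZE = 128KB, and
 * at 44KB when the backend offers no indirect descriptors.
 */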
static void blkif_set_queue_limits(struct blkfront_info *info)
{
	struct request_queue *rq = info->rq;
	struct gendisk *gd = info->gd;
	unsigned int segments = info->max_indirect_segments ? :
				BLKIF_MAX_SEGMENTS_PER_REQUEST;

	blk_queue_flag_set(QUEUE_FLAG_VIRT, rq);

	if (info->feature_discard) {
		blk_queue_flag_set(QUEUE_FLAG_DISCARD, rq);
		blk_queue_max_discard_sectors(rq, get_capacity(gd));
		rq->limits.discard_granularity = info->discard_granularity ?:
						 info->physical_sector_size;
		rq->limits.discard_alignment = info->discard_alignment;
		if (info->feature_secdiscard)
			blk_queue_flag_set(QUEUE_FLAG_SECERASE, rq);
	}

	/* Hard sector size and max sectors impersonate the equiv. hardware. */
	blk_queue_logical_block_size(rq, info->sector_size);
	blk_queue_physical_block_size(rq, info->physical_sector_size);
	blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512);

	/* Each segment in a request is up to an aligned page in size. */
	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
	blk_queue_max_segment_size(rq, PAGE_SIZE);

	/* Ensure a merged request will fit in a single I/O ring slot. */
	blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG);

	/* Make sure buffer addresses are sector-aligned. */
	blk_queue_dma_alignment(rq, 511);
}

static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
				unsigned int physical_sector_size)
{
	struct request_queue *rq;
	struct blkfront_info *info = gd->private_data;

	memset(&info->tag_set, 0, sizeof(info->tag_set));
	info->tag_set.ops = &blkfront_mq_ops;
	info->tag_set.nr_hw_queues = info->nr_rings;
	if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
		/*
		 * When indirect descriptors are not supported, the I/O
		 * request will be split between multiple requests in the
		 * ring. To avoid problems when sending the request, halve
		 * the queue depth.
		 */
		info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2;
	} else
		info->tag_set.queue_depth = BLK_RING_SIZE(info);
	info->tag_set.numa_node = NUMA_NO_NODE;
	info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	info->tag_set.cmd_size = sizeof(struct blkif_req);
	info->tag_set.driver_data = info;

	if (blk_mq_alloc_tag_set(&info->tag_set))
		return -EINVAL;
	rq = blk_mq_init_queue(&info->tag_set);
	if (IS_ERR(rq)) {
		blk_mq_free_tag_set(&info->tag_set);
		return PTR_ERR(rq);
	}

	rq->queuedata = info;
	info->rq = gd->queue = rq;
	info->gd = gd;
	info->sector_size = sector_size;
	info->physical_sector_size = physical_sector_size;
	blkif_set_queue_limits(info);

	return 0;
}

static const char *flush_info(struct blkfront_info *info)
{
	if (info->feature_flush && info->feature_fua)
		return "barrier: enabled;";
	else if (info->feature_flush)
		return "flush diskcache: enabled;";
	else
		return "barrier or flush: disabled;";
}

static void xlvbd_flush(struct blkfront_info *info)
{
	blk_queue_write_cache(info->rq, info->feature_flush ? true : false,
			      info->feature_fua ? true : false);
	pr_info("blkfront: %s: %s %s %s %s %s %s %s\n",
		info->gd->disk_name, flush_info(info),
		"persistent grants:", info->feature_persistent ?
		"enabled;" : "disabled;", "indirect descriptors:",
		info->max_indirect_segments ? "enabled;" : "disabled;",
		"bounce buffer:", info->bounce ? "enabled" : "disabled");
}

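/*
 * Translate an emulated IDE/SCSI device number advertised by the toolstack
 * into the minor number and the name offset used for the corresponding
 * xvd device node; plain XENVBD_MAJOR devices only need the name offset.
 */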
xen_translate_vdev(int vdevice,int * minor,unsigned int * offset)1054*4882a593Smuzhiyun static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
1055*4882a593Smuzhiyun {
1056*4882a593Smuzhiyun int major;
1057*4882a593Smuzhiyun major = BLKIF_MAJOR(vdevice);
1058*4882a593Smuzhiyun *minor = BLKIF_MINOR(vdevice);
1059*4882a593Smuzhiyun switch (major) {
1060*4882a593Smuzhiyun case XEN_IDE0_MAJOR:
1061*4882a593Smuzhiyun *offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
1062*4882a593Smuzhiyun *minor = ((*minor / 64) * PARTS_PER_DISK) +
1063*4882a593Smuzhiyun EMULATED_HD_DISK_MINOR_OFFSET;
1064*4882a593Smuzhiyun break;
1065*4882a593Smuzhiyun case XEN_IDE1_MAJOR:
1066*4882a593Smuzhiyun *offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
1067*4882a593Smuzhiyun *minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
1068*4882a593Smuzhiyun EMULATED_HD_DISK_MINOR_OFFSET;
1069*4882a593Smuzhiyun break;
1070*4882a593Smuzhiyun case XEN_SCSI_DISK0_MAJOR:
1071*4882a593Smuzhiyun *offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
1072*4882a593Smuzhiyun *minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
1073*4882a593Smuzhiyun break;
1074*4882a593Smuzhiyun case XEN_SCSI_DISK1_MAJOR:
1075*4882a593Smuzhiyun case XEN_SCSI_DISK2_MAJOR:
1076*4882a593Smuzhiyun case XEN_SCSI_DISK3_MAJOR:
1077*4882a593Smuzhiyun case XEN_SCSI_DISK4_MAJOR:
1078*4882a593Smuzhiyun case XEN_SCSI_DISK5_MAJOR:
1079*4882a593Smuzhiyun case XEN_SCSI_DISK6_MAJOR:
1080*4882a593Smuzhiyun case XEN_SCSI_DISK7_MAJOR:
1081*4882a593Smuzhiyun *offset = (*minor / PARTS_PER_DISK) +
1082*4882a593Smuzhiyun ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
1083*4882a593Smuzhiyun EMULATED_SD_DISK_NAME_OFFSET;
1084*4882a593Smuzhiyun *minor = *minor +
1085*4882a593Smuzhiyun ((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
1086*4882a593Smuzhiyun EMULATED_SD_DISK_MINOR_OFFSET;
1087*4882a593Smuzhiyun break;
1088*4882a593Smuzhiyun case XEN_SCSI_DISK8_MAJOR:
1089*4882a593Smuzhiyun case XEN_SCSI_DISK9_MAJOR:
1090*4882a593Smuzhiyun case XEN_SCSI_DISK10_MAJOR:
1091*4882a593Smuzhiyun case XEN_SCSI_DISK11_MAJOR:
1092*4882a593Smuzhiyun case XEN_SCSI_DISK12_MAJOR:
1093*4882a593Smuzhiyun case XEN_SCSI_DISK13_MAJOR:
1094*4882a593Smuzhiyun case XEN_SCSI_DISK14_MAJOR:
1095*4882a593Smuzhiyun case XEN_SCSI_DISK15_MAJOR:
1096*4882a593Smuzhiyun *offset = (*minor / PARTS_PER_DISK) +
1097*4882a593Smuzhiyun ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
1098*4882a593Smuzhiyun EMULATED_SD_DISK_NAME_OFFSET;
1099*4882a593Smuzhiyun *minor = *minor +
1100*4882a593Smuzhiyun ((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
1101*4882a593Smuzhiyun EMULATED_SD_DISK_MINOR_OFFSET;
1102*4882a593Smuzhiyun break;
1103*4882a593Smuzhiyun case XENVBD_MAJOR:
1104*4882a593Smuzhiyun *offset = *minor / PARTS_PER_DISK;
1105*4882a593Smuzhiyun break;
1106*4882a593Smuzhiyun default:
1107*4882a593Smuzhiyun printk(KERN_WARNING "blkfront: your disk configuration is "
1108*4882a593Smuzhiyun "incorrect, please use an xvd device instead\n");
1109*4882a593Smuzhiyun return -ENODEV;
1110*4882a593Smuzhiyun }
1111*4882a593Smuzhiyun return 0;
1112*4882a593Smuzhiyun }
1113*4882a593Smuzhiyun
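/*
 * Append the base-26 suffix for disk index 'n' ("a".."z", "aa", "ab", ...)
 * and return a pointer just past the last character written.  For example,
 * n = 0 yields "a" (xvda) and n = 26 yields "aa" (xvdaa).
 */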
1114*4882a593Smuzhiyun static char *encode_disk_name(char *ptr, unsigned int n)
1115*4882a593Smuzhiyun {
1116*4882a593Smuzhiyun if (n >= 26)
1117*4882a593Smuzhiyun ptr = encode_disk_name(ptr, n / 26 - 1);
1118*4882a593Smuzhiyun *ptr = 'a' + n % 26;
1119*4882a593Smuzhiyun return ptr + 1;
1120*4882a593Smuzhiyun }
1121*4882a593Smuzhiyun
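/*
 * Allocate and initialise the gendisk and request queue for this virtual
 * block device: reserve a contiguous minor range, build the xvd* name,
 * set up the blk-mq queue and apply the read-only/removable/CD flags
 * reported by the backend.
 */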
1122*4882a593Smuzhiyun static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
1123*4882a593Smuzhiyun struct blkfront_info *info,
1124*4882a593Smuzhiyun u16 vdisk_info, u16 sector_size,
1125*4882a593Smuzhiyun unsigned int physical_sector_size)
1126*4882a593Smuzhiyun {
1127*4882a593Smuzhiyun struct gendisk *gd;
1128*4882a593Smuzhiyun int nr_minors = 1;
1129*4882a593Smuzhiyun int err;
1130*4882a593Smuzhiyun unsigned int offset;
1131*4882a593Smuzhiyun int minor;
1132*4882a593Smuzhiyun int nr_parts;
1133*4882a593Smuzhiyun char *ptr;
1134*4882a593Smuzhiyun
1135*4882a593Smuzhiyun BUG_ON(info->gd != NULL);
1136*4882a593Smuzhiyun BUG_ON(info->rq != NULL);
1137*4882a593Smuzhiyun
1138*4882a593Smuzhiyun if ((info->vdevice>>EXT_SHIFT) > 1) {
1139*4882a593Smuzhiyun /* this is above the extended range; something is wrong */
1140*4882a593Smuzhiyun printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
1141*4882a593Smuzhiyun return -ENODEV;
1142*4882a593Smuzhiyun }
1143*4882a593Smuzhiyun
1144*4882a593Smuzhiyun if (!VDEV_IS_EXTENDED(info->vdevice)) {
1145*4882a593Smuzhiyun err = xen_translate_vdev(info->vdevice, &minor, &offset);
1146*4882a593Smuzhiyun if (err)
1147*4882a593Smuzhiyun return err;
1148*4882a593Smuzhiyun nr_parts = PARTS_PER_DISK;
1149*4882a593Smuzhiyun } else {
1150*4882a593Smuzhiyun minor = BLKIF_MINOR_EXT(info->vdevice);
1151*4882a593Smuzhiyun nr_parts = PARTS_PER_EXT_DISK;
1152*4882a593Smuzhiyun offset = minor / nr_parts;
1153*4882a593Smuzhiyun if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
1154*4882a593Smuzhiyun printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
1155*4882a593Smuzhiyun "emulated IDE disks,\n\t choose an xvd device name "
1156*4882a593Smuzhiyun "from xvde on\n", info->vdevice);
1157*4882a593Smuzhiyun }
1158*4882a593Smuzhiyun if (minor >> MINORBITS) {
1159*4882a593Smuzhiyun pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
1160*4882a593Smuzhiyun info->vdevice, minor);
1161*4882a593Smuzhiyun return -ENODEV;
1162*4882a593Smuzhiyun }
1163*4882a593Smuzhiyun
1164*4882a593Smuzhiyun if ((minor % nr_parts) == 0)
1165*4882a593Smuzhiyun nr_minors = nr_parts;
1166*4882a593Smuzhiyun
1167*4882a593Smuzhiyun err = xlbd_reserve_minors(minor, nr_minors);
1168*4882a593Smuzhiyun if (err)
1169*4882a593Smuzhiyun goto out;
1170*4882a593Smuzhiyun err = -ENODEV;
1171*4882a593Smuzhiyun
1172*4882a593Smuzhiyun gd = alloc_disk(nr_minors);
1173*4882a593Smuzhiyun if (gd == NULL)
1174*4882a593Smuzhiyun goto release;
1175*4882a593Smuzhiyun
1176*4882a593Smuzhiyun strcpy(gd->disk_name, DEV_NAME);
1177*4882a593Smuzhiyun ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
1178*4882a593Smuzhiyun BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
1179*4882a593Smuzhiyun if (nr_minors > 1)
1180*4882a593Smuzhiyun *ptr = 0;
1181*4882a593Smuzhiyun else
1182*4882a593Smuzhiyun snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
1183*4882a593Smuzhiyun "%d", minor & (nr_parts - 1));
1184*4882a593Smuzhiyun
1185*4882a593Smuzhiyun gd->major = XENVBD_MAJOR;
1186*4882a593Smuzhiyun gd->first_minor = minor;
1187*4882a593Smuzhiyun gd->fops = &xlvbd_block_fops;
1188*4882a593Smuzhiyun gd->private_data = info;
1189*4882a593Smuzhiyun set_capacity(gd, capacity);
1190*4882a593Smuzhiyun
1191*4882a593Smuzhiyun if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size)) {
1192*4882a593Smuzhiyun del_gendisk(gd);
1193*4882a593Smuzhiyun goto release;
1194*4882a593Smuzhiyun }
1195*4882a593Smuzhiyun
1196*4882a593Smuzhiyun xlvbd_flush(info);
1197*4882a593Smuzhiyun
1198*4882a593Smuzhiyun if (vdisk_info & VDISK_READONLY)
1199*4882a593Smuzhiyun set_disk_ro(gd, 1);
1200*4882a593Smuzhiyun
1201*4882a593Smuzhiyun if (vdisk_info & VDISK_REMOVABLE)
1202*4882a593Smuzhiyun gd->flags |= GENHD_FL_REMOVABLE;
1203*4882a593Smuzhiyun
1204*4882a593Smuzhiyun if (vdisk_info & VDISK_CDROM)
1205*4882a593Smuzhiyun gd->flags |= GENHD_FL_CD;
1206*4882a593Smuzhiyun
1207*4882a593Smuzhiyun return 0;
1208*4882a593Smuzhiyun
1209*4882a593Smuzhiyun release:
1210*4882a593Smuzhiyun xlbd_release_minors(minor, nr_minors);
1211*4882a593Smuzhiyun out:
1212*4882a593Smuzhiyun return err;
1213*4882a593Smuzhiyun }
1214*4882a593Smuzhiyun
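/*
 * Undo xlvbd_alloc_gendisk(): stop the hardware queues, flush any pending
 * grant-table callback work, delete the gendisk, release the reserved
 * minors and free the request queue and tag set.
 */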
1215*4882a593Smuzhiyun static void xlvbd_release_gendisk(struct blkfront_info *info)
1216*4882a593Smuzhiyun {
1217*4882a593Smuzhiyun unsigned int minor, nr_minors, i;
1218*4882a593Smuzhiyun struct blkfront_ring_info *rinfo;
1219*4882a593Smuzhiyun
1220*4882a593Smuzhiyun if (info->rq == NULL)
1221*4882a593Smuzhiyun return;
1222*4882a593Smuzhiyun
1223*4882a593Smuzhiyun /* No more blkif_request(). */
1224*4882a593Smuzhiyun blk_mq_stop_hw_queues(info->rq);
1225*4882a593Smuzhiyun
1226*4882a593Smuzhiyun for_each_rinfo(info, rinfo, i) {
1227*4882a593Smuzhiyun /* No more gnttab callback work. */
1228*4882a593Smuzhiyun gnttab_cancel_free_callback(&rinfo->callback);
1229*4882a593Smuzhiyun
1230*4882a593Smuzhiyun /* Flush gnttab callback work. Must be done with no locks held. */
1231*4882a593Smuzhiyun flush_work(&rinfo->work);
1232*4882a593Smuzhiyun }
1233*4882a593Smuzhiyun
1234*4882a593Smuzhiyun del_gendisk(info->gd);
1235*4882a593Smuzhiyun
1236*4882a593Smuzhiyun minor = info->gd->first_minor;
1237*4882a593Smuzhiyun nr_minors = info->gd->minors;
1238*4882a593Smuzhiyun xlbd_release_minors(minor, nr_minors);
1239*4882a593Smuzhiyun
1240*4882a593Smuzhiyun blk_cleanup_queue(info->rq);
1241*4882a593Smuzhiyun blk_mq_free_tag_set(&info->tag_set);
1242*4882a593Smuzhiyun info->rq = NULL;
1243*4882a593Smuzhiyun
1244*4882a593Smuzhiyun put_disk(info->gd);
1245*4882a593Smuzhiyun info->gd = NULL;
1246*4882a593Smuzhiyun }
1247*4882a593Smuzhiyun
1248*4882a593Smuzhiyun /* Already hold rinfo->ring_lock. */
1249*4882a593Smuzhiyun static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
1250*4882a593Smuzhiyun {
1251*4882a593Smuzhiyun if (!RING_FULL(&rinfo->ring))
1252*4882a593Smuzhiyun blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
1253*4882a593Smuzhiyun }
1254*4882a593Smuzhiyun
1255*4882a593Smuzhiyun static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
1256*4882a593Smuzhiyun {
1257*4882a593Smuzhiyun unsigned long flags;
1258*4882a593Smuzhiyun
1259*4882a593Smuzhiyun spin_lock_irqsave(&rinfo->ring_lock, flags);
1260*4882a593Smuzhiyun kick_pending_request_queues_locked(rinfo);
1261*4882a593Smuzhiyun spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1262*4882a593Smuzhiyun }
1263*4882a593Smuzhiyun
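/* Deferred work used to restart the queues once ring space is available. */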
1264*4882a593Smuzhiyun static void blkif_restart_queue(struct work_struct *work)
1265*4882a593Smuzhiyun {
1266*4882a593Smuzhiyun struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work);
1267*4882a593Smuzhiyun
1268*4882a593Smuzhiyun if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
1269*4882a593Smuzhiyun kick_pending_request_queues(rinfo);
1270*4882a593Smuzhiyun }
1271*4882a593Smuzhiyun
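/*
 * Release all per-ring resources: indirect pages, persistent grants,
 * grants still referenced by in-flight shadow requests, the shadow arrays
 * themselves, the shared ring pages and the event channel/IRQ.
 */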
1272*4882a593Smuzhiyun static void blkif_free_ring(struct blkfront_ring_info *rinfo)
1273*4882a593Smuzhiyun {
1274*4882a593Smuzhiyun struct grant *persistent_gnt, *n;
1275*4882a593Smuzhiyun struct blkfront_info *info = rinfo->dev_info;
1276*4882a593Smuzhiyun int i, j, segs;
1277*4882a593Smuzhiyun
1278*4882a593Smuzhiyun /*
1279*4882a593Smuzhiyun * Remove indirect pages; this only happens when using indirect
1280*4882a593Smuzhiyun * descriptors without persistent grants.
1281*4882a593Smuzhiyun */
1282*4882a593Smuzhiyun if (!list_empty(&rinfo->indirect_pages)) {
1283*4882a593Smuzhiyun struct page *indirect_page, *n;
1284*4882a593Smuzhiyun
1285*4882a593Smuzhiyun BUG_ON(info->bounce);
1286*4882a593Smuzhiyun list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
1287*4882a593Smuzhiyun list_del(&indirect_page->lru);
1288*4882a593Smuzhiyun __free_page(indirect_page);
1289*4882a593Smuzhiyun }
1290*4882a593Smuzhiyun }
1291*4882a593Smuzhiyun
1292*4882a593Smuzhiyun /* Remove all persistent grants. */
1293*4882a593Smuzhiyun if (!list_empty(&rinfo->grants)) {
1294*4882a593Smuzhiyun list_for_each_entry_safe(persistent_gnt, n,
1295*4882a593Smuzhiyun &rinfo->grants, node) {
1296*4882a593Smuzhiyun list_del(&persistent_gnt->node);
1297*4882a593Smuzhiyun if (persistent_gnt->gref != GRANT_INVALID_REF) {
1298*4882a593Smuzhiyun gnttab_end_foreign_access(persistent_gnt->gref,
1299*4882a593Smuzhiyun 0, 0UL);
1300*4882a593Smuzhiyun rinfo->persistent_gnts_c--;
1301*4882a593Smuzhiyun }
1302*4882a593Smuzhiyun if (info->bounce)
1303*4882a593Smuzhiyun __free_page(persistent_gnt->page);
1304*4882a593Smuzhiyun kfree(persistent_gnt);
1305*4882a593Smuzhiyun }
1306*4882a593Smuzhiyun }
1307*4882a593Smuzhiyun BUG_ON(rinfo->persistent_gnts_c != 0);
1308*4882a593Smuzhiyun
1309*4882a593Smuzhiyun for (i = 0; i < BLK_RING_SIZE(info); i++) {
1310*4882a593Smuzhiyun /*
1311*4882a593Smuzhiyun * Clear persistent grants present in requests already
1312*4882a593Smuzhiyun * on the shared ring
1313*4882a593Smuzhiyun */
1314*4882a593Smuzhiyun if (!rinfo->shadow[i].request)
1315*4882a593Smuzhiyun goto free_shadow;
1316*4882a593Smuzhiyun
1317*4882a593Smuzhiyun segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
1318*4882a593Smuzhiyun rinfo->shadow[i].req.u.indirect.nr_segments :
1319*4882a593Smuzhiyun rinfo->shadow[i].req.u.rw.nr_segments;
1320*4882a593Smuzhiyun for (j = 0; j < segs; j++) {
1321*4882a593Smuzhiyun persistent_gnt = rinfo->shadow[i].grants_used[j];
1322*4882a593Smuzhiyun gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
1323*4882a593Smuzhiyun if (info->bounce)
1324*4882a593Smuzhiyun __free_page(persistent_gnt->page);
1325*4882a593Smuzhiyun kfree(persistent_gnt);
1326*4882a593Smuzhiyun }
1327*4882a593Smuzhiyun
1328*4882a593Smuzhiyun if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT)
1329*4882a593Smuzhiyun /*
1330*4882a593Smuzhiyun * If this is not an indirect operation, don't try to
1331*4882a593Smuzhiyun * free indirect segments.
1332*4882a593Smuzhiyun */
1333*4882a593Smuzhiyun goto free_shadow;
1334*4882a593Smuzhiyun
1335*4882a593Smuzhiyun for (j = 0; j < INDIRECT_GREFS(segs); j++) {
1336*4882a593Smuzhiyun persistent_gnt = rinfo->shadow[i].indirect_grants[j];
1337*4882a593Smuzhiyun gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
1338*4882a593Smuzhiyun __free_page(persistent_gnt->page);
1339*4882a593Smuzhiyun kfree(persistent_gnt);
1340*4882a593Smuzhiyun }
1341*4882a593Smuzhiyun
1342*4882a593Smuzhiyun free_shadow:
1343*4882a593Smuzhiyun kvfree(rinfo->shadow[i].grants_used);
1344*4882a593Smuzhiyun rinfo->shadow[i].grants_used = NULL;
1345*4882a593Smuzhiyun kvfree(rinfo->shadow[i].indirect_grants);
1346*4882a593Smuzhiyun rinfo->shadow[i].indirect_grants = NULL;
1347*4882a593Smuzhiyun kvfree(rinfo->shadow[i].sg);
1348*4882a593Smuzhiyun rinfo->shadow[i].sg = NULL;
1349*4882a593Smuzhiyun }
1350*4882a593Smuzhiyun
1351*4882a593Smuzhiyun /* No more gnttab callback work. */
1352*4882a593Smuzhiyun gnttab_cancel_free_callback(&rinfo->callback);
1353*4882a593Smuzhiyun
1354*4882a593Smuzhiyun /* Flush gnttab callback work. Must be done with no locks held. */
1355*4882a593Smuzhiyun flush_work(&rinfo->work);
1356*4882a593Smuzhiyun
1357*4882a593Smuzhiyun /* Free resources associated with old device channel. */
1358*4882a593Smuzhiyun for (i = 0; i < info->nr_ring_pages; i++) {
1359*4882a593Smuzhiyun if (rinfo->ring_ref[i] != GRANT_INVALID_REF) {
1360*4882a593Smuzhiyun gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0);
1361*4882a593Smuzhiyun rinfo->ring_ref[i] = GRANT_INVALID_REF;
1362*4882a593Smuzhiyun }
1363*4882a593Smuzhiyun }
1364*4882a593Smuzhiyun free_pages_exact(rinfo->ring.sring,
1365*4882a593Smuzhiyun info->nr_ring_pages * XEN_PAGE_SIZE);
1366*4882a593Smuzhiyun rinfo->ring.sring = NULL;
1367*4882a593Smuzhiyun
1368*4882a593Smuzhiyun if (rinfo->irq)
1369*4882a593Smuzhiyun unbind_from_irqhandler(rinfo->irq, rinfo);
1370*4882a593Smuzhiyun rinfo->evtchn = rinfo->irq = 0;
1371*4882a593Smuzhiyun }
1372*4882a593Smuzhiyun
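/* Tear down all rings and mark the device suspended or disconnected. */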
1373*4882a593Smuzhiyun static void blkif_free(struct blkfront_info *info, int suspend)
1374*4882a593Smuzhiyun {
1375*4882a593Smuzhiyun unsigned int i;
1376*4882a593Smuzhiyun struct blkfront_ring_info *rinfo;
1377*4882a593Smuzhiyun
1378*4882a593Smuzhiyun /* Prevent new requests being issued until we fix things up. */
1379*4882a593Smuzhiyun info->connected = suspend ?
1380*4882a593Smuzhiyun BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
1381*4882a593Smuzhiyun /* No more blkif_request(). */
1382*4882a593Smuzhiyun if (info->rq)
1383*4882a593Smuzhiyun blk_mq_stop_hw_queues(info->rq);
1384*4882a593Smuzhiyun
1385*4882a593Smuzhiyun for_each_rinfo(info, rinfo, i)
1386*4882a593Smuzhiyun blkif_free_ring(rinfo);
1387*4882a593Smuzhiyun
1388*4882a593Smuzhiyun kvfree(info->rinfo);
1389*4882a593Smuzhiyun info->rinfo = NULL;
1390*4882a593Smuzhiyun info->nr_rings = 0;
1391*4882a593Smuzhiyun }
1392*4882a593Smuzhiyun
1393*4882a593Smuzhiyun struct copy_from_grant {
1394*4882a593Smuzhiyun const struct blk_shadow *s;
1395*4882a593Smuzhiyun unsigned int grant_idx;
1396*4882a593Smuzhiyun unsigned int bvec_offset;
1397*4882a593Smuzhiyun char *bvec_data;
1398*4882a593Smuzhiyun };
1399*4882a593Smuzhiyun
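/*
 * Per-grant callback used on completion of a read when bouncing is enabled:
 * copy the data from the granted page back into the bio's segment.
 */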
1400*4882a593Smuzhiyun static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
1401*4882a593Smuzhiyun unsigned int len, void *data)
1402*4882a593Smuzhiyun {
1403*4882a593Smuzhiyun struct copy_from_grant *info = data;
1404*4882a593Smuzhiyun char *shared_data;
1405*4882a593Smuzhiyun /* Convenient aliases */
1406*4882a593Smuzhiyun const struct blk_shadow *s = info->s;
1407*4882a593Smuzhiyun
1408*4882a593Smuzhiyun shared_data = kmap_atomic(s->grants_used[info->grant_idx]->page);
1409*4882a593Smuzhiyun
1410*4882a593Smuzhiyun memcpy(info->bvec_data + info->bvec_offset,
1411*4882a593Smuzhiyun shared_data + offset, len);
1412*4882a593Smuzhiyun
1413*4882a593Smuzhiyun info->bvec_offset += len;
1414*4882a593Smuzhiyun info->grant_idx++;
1415*4882a593Smuzhiyun
1416*4882a593Smuzhiyun kunmap_atomic(shared_data);
1417*4882a593Smuzhiyun }
1418*4882a593Smuzhiyun
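/* Map a blkif response status onto the internal request status. */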
1419*4882a593Smuzhiyun static enum blk_req_status blkif_rsp_to_req_status(int rsp)
1420*4882a593Smuzhiyun {
1421*4882a593Smuzhiyun switch (rsp) {
1423*4882a593Smuzhiyun case BLKIF_RSP_OKAY:
1424*4882a593Smuzhiyun return REQ_DONE;
1425*4882a593Smuzhiyun case BLKIF_RSP_EOPNOTSUPP:
1426*4882a593Smuzhiyun return REQ_EOPNOTSUPP;
1427*4882a593Smuzhiyun case BLKIF_RSP_ERROR:
1428*4882a593Smuzhiyun default:
1429*4882a593Smuzhiyun return REQ_ERROR;
1430*4882a593Smuzhiyun }
1431*4882a593Smuzhiyun }
1432*4882a593Smuzhiyun
1433*4882a593Smuzhiyun /*
1434*4882a593Smuzhiyun * Get the final status of the block request based on two ring responses.
1435*4882a593Smuzhiyun */
1436*4882a593Smuzhiyun static int blkif_get_final_status(enum blk_req_status s1,
1437*4882a593Smuzhiyun enum blk_req_status s2)
1438*4882a593Smuzhiyun {
1439*4882a593Smuzhiyun BUG_ON(s1 < REQ_DONE);
1440*4882a593Smuzhiyun BUG_ON(s2 < REQ_DONE);
1441*4882a593Smuzhiyun
1442*4882a593Smuzhiyun if (s1 == REQ_ERROR || s2 == REQ_ERROR)
1443*4882a593Smuzhiyun return BLKIF_RSP_ERROR;
1444*4882a593Smuzhiyun else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
1445*4882a593Smuzhiyun return BLKIF_RSP_EOPNOTSUPP;
1446*4882a593Smuzhiyun return BLKIF_RSP_OKAY;
1447*4882a593Smuzhiyun }
1448*4882a593Smuzhiyun
1449*4882a593Smuzhiyun /*
1450*4882a593Smuzhiyun * Return values:
1451*4882a593Smuzhiyun * 1 response processed.
1452*4882a593Smuzhiyun * 0 waiting for further responses.
1453*4882a593Smuzhiyun * -1 error while processing.
1454*4882a593Smuzhiyun */
1455*4882a593Smuzhiyun static int blkif_completion(unsigned long *id,
1456*4882a593Smuzhiyun struct blkfront_ring_info *rinfo,
1457*4882a593Smuzhiyun struct blkif_response *bret)
1458*4882a593Smuzhiyun {
1459*4882a593Smuzhiyun int i = 0;
1460*4882a593Smuzhiyun struct scatterlist *sg;
1461*4882a593Smuzhiyun int num_sg, num_grant;
1462*4882a593Smuzhiyun struct blkfront_info *info = rinfo->dev_info;
1463*4882a593Smuzhiyun struct blk_shadow *s = &rinfo->shadow[*id];
1464*4882a593Smuzhiyun struct copy_from_grant data = {
1465*4882a593Smuzhiyun .grant_idx = 0,
1466*4882a593Smuzhiyun };
1467*4882a593Smuzhiyun
1468*4882a593Smuzhiyun num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
1469*4882a593Smuzhiyun s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
1470*4882a593Smuzhiyun
1471*4882a593Smuzhiyun /* The I/O request may be split in two. */
1472*4882a593Smuzhiyun if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
1473*4882a593Smuzhiyun struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
1474*4882a593Smuzhiyun
1475*4882a593Smuzhiyun /* Keep the status of the current response in shadow. */
1476*4882a593Smuzhiyun s->status = blkif_rsp_to_req_status(bret->status);
1477*4882a593Smuzhiyun
1478*4882a593Smuzhiyun /* Wait for the second response if it has not arrived yet. */
1479*4882a593Smuzhiyun if (s2->status < REQ_DONE)
1480*4882a593Smuzhiyun return 0;
1481*4882a593Smuzhiyun
1482*4882a593Smuzhiyun bret->status = blkif_get_final_status(s->status,
1483*4882a593Smuzhiyun s2->status);
1484*4882a593Smuzhiyun
1485*4882a593Smuzhiyun /*
1486*4882a593Smuzhiyun * All the grants are stored in the first shadow in order
1487*4882a593Smuzhiyun * to make the completion code simpler.
1488*4882a593Smuzhiyun */
1489*4882a593Smuzhiyun num_grant += s2->req.u.rw.nr_segments;
1490*4882a593Smuzhiyun
1491*4882a593Smuzhiyun /*
1492*4882a593Smuzhiyun * The two responses may not come in order. Only the
1493*4882a593Smuzhiyun * first request will store the scatter-gather list.
1494*4882a593Smuzhiyun */
1495*4882a593Smuzhiyun if (s2->num_sg != 0) {
1496*4882a593Smuzhiyun /* Update "id" with the ID of the first response. */
1497*4882a593Smuzhiyun *id = s->associated_id;
1498*4882a593Smuzhiyun s = s2;
1499*4882a593Smuzhiyun }
1500*4882a593Smuzhiyun
1501*4882a593Smuzhiyun /*
1502*4882a593Smuzhiyun * We no longer need the second request, so recycle
1503*4882a593Smuzhiyun * it now.
1504*4882a593Smuzhiyun */
1505*4882a593Smuzhiyun if (add_id_to_freelist(rinfo, s->associated_id))
1506*4882a593Smuzhiyun WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
1507*4882a593Smuzhiyun info->gd->disk_name, s->associated_id);
1508*4882a593Smuzhiyun }
1509*4882a593Smuzhiyun
1510*4882a593Smuzhiyun data.s = s;
1511*4882a593Smuzhiyun num_sg = s->num_sg;
1512*4882a593Smuzhiyun
1513*4882a593Smuzhiyun if (bret->operation == BLKIF_OP_READ && info->bounce) {
1514*4882a593Smuzhiyun for_each_sg(s->sg, sg, num_sg, i) {
1515*4882a593Smuzhiyun BUG_ON(sg->offset + sg->length > PAGE_SIZE);
1516*4882a593Smuzhiyun
1517*4882a593Smuzhiyun data.bvec_offset = sg->offset;
1518*4882a593Smuzhiyun data.bvec_data = kmap_atomic(sg_page(sg));
1519*4882a593Smuzhiyun
1520*4882a593Smuzhiyun gnttab_foreach_grant_in_range(sg_page(sg),
1521*4882a593Smuzhiyun sg->offset,
1522*4882a593Smuzhiyun sg->length,
1523*4882a593Smuzhiyun blkif_copy_from_grant,
1524*4882a593Smuzhiyun &data);
1525*4882a593Smuzhiyun
1526*4882a593Smuzhiyun kunmap_atomic(data.bvec_data);
1527*4882a593Smuzhiyun }
1528*4882a593Smuzhiyun }
1529*4882a593Smuzhiyun /* Add the persistent grant into the list of free grants */
1530*4882a593Smuzhiyun for (i = 0; i < num_grant; i++) {
1531*4882a593Smuzhiyun if (!gnttab_try_end_foreign_access(s->grants_used[i]->gref)) {
1532*4882a593Smuzhiyun /*
1533*4882a593Smuzhiyun * If the grant is still mapped by the backend (the
1534*4882a593Smuzhiyun * backend has chosen to make this grant persistent)
1535*4882a593Smuzhiyun * we add it at the head of the list, so it will be
1536*4882a593Smuzhiyun * reused first.
1537*4882a593Smuzhiyun */
1538*4882a593Smuzhiyun if (!info->feature_persistent) {
1539*4882a593Smuzhiyun pr_alert("backed has not unmapped grant: %u\n",
1540*4882a593Smuzhiyun s->grants_used[i]->gref);
1541*4882a593Smuzhiyun return -1;
1542*4882a593Smuzhiyun }
1543*4882a593Smuzhiyun list_add(&s->grants_used[i]->node, &rinfo->grants);
1544*4882a593Smuzhiyun rinfo->persistent_gnts_c++;
1545*4882a593Smuzhiyun } else {
1546*4882a593Smuzhiyun /*
1547*4882a593Smuzhiyun * If the grant is not mapped by the backend we add it
1548*4882a593Smuzhiyun * to the tail of the list, so it will not be picked
1549*4882a593Smuzhiyun * again unless we run out of persistent grants.
1550*4882a593Smuzhiyun */
1551*4882a593Smuzhiyun s->grants_used[i]->gref = GRANT_INVALID_REF;
1552*4882a593Smuzhiyun list_add_tail(&s->grants_used[i]->node, &rinfo->grants);
1553*4882a593Smuzhiyun }
1554*4882a593Smuzhiyun }
1555*4882a593Smuzhiyun if (s->req.operation == BLKIF_OP_INDIRECT) {
1556*4882a593Smuzhiyun for (i = 0; i < INDIRECT_GREFS(num_grant); i++) {
1557*4882a593Smuzhiyun if (!gnttab_try_end_foreign_access(s->indirect_grants[i]->gref)) {
1558*4882a593Smuzhiyun if (!info->feature_persistent) {
1559*4882a593Smuzhiyun pr_alert("backed has not unmapped grant: %u\n",
1560*4882a593Smuzhiyun s->indirect_grants[i]->gref);
1561*4882a593Smuzhiyun return -1;
1562*4882a593Smuzhiyun }
1563*4882a593Smuzhiyun list_add(&s->indirect_grants[i]->node, &rinfo->grants);
1564*4882a593Smuzhiyun rinfo->persistent_gnts_c++;
1565*4882a593Smuzhiyun } else {
1566*4882a593Smuzhiyun struct page *indirect_page;
1567*4882a593Smuzhiyun
1568*4882a593Smuzhiyun /*
1569*4882a593Smuzhiyun * Add the used indirect page back to the list of
1570*4882a593Smuzhiyun * available pages for indirect grefs.
1571*4882a593Smuzhiyun */
1572*4882a593Smuzhiyun if (!info->bounce) {
1573*4882a593Smuzhiyun indirect_page = s->indirect_grants[i]->page;
1574*4882a593Smuzhiyun list_add(&indirect_page->lru, &rinfo->indirect_pages);
1575*4882a593Smuzhiyun }
1576*4882a593Smuzhiyun s->indirect_grants[i]->gref = GRANT_INVALID_REF;
1577*4882a593Smuzhiyun list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants);
1578*4882a593Smuzhiyun }
1579*4882a593Smuzhiyun }
1580*4882a593Smuzhiyun }
1581*4882a593Smuzhiyun
1582*4882a593Smuzhiyun return 1;
1583*4882a593Smuzhiyun }
1584*4882a593Smuzhiyun
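/*
 * Interrupt handler for the per-ring event channel.  Consumes all pending
 * responses from the shared ring, validates them against the shadow state,
 * recycles grants and completes the corresponding blk-mq requests.  On a
 * malformed response the device is switched to BLKIF_STATE_ERROR and no
 * EOI is sent, so no further interrupts are delivered.
 */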
1585*4882a593Smuzhiyun static irqreturn_t blkif_interrupt(int irq, void *dev_id)
1586*4882a593Smuzhiyun {
1587*4882a593Smuzhiyun struct request *req;
1588*4882a593Smuzhiyun struct blkif_response bret;
1589*4882a593Smuzhiyun RING_IDX i, rp;
1590*4882a593Smuzhiyun unsigned long flags;
1591*4882a593Smuzhiyun struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
1592*4882a593Smuzhiyun struct blkfront_info *info = rinfo->dev_info;
1593*4882a593Smuzhiyun unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS;
1594*4882a593Smuzhiyun
1595*4882a593Smuzhiyun if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
1596*4882a593Smuzhiyun xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS);
1597*4882a593Smuzhiyun return IRQ_HANDLED;
1598*4882a593Smuzhiyun }
1599*4882a593Smuzhiyun
1600*4882a593Smuzhiyun spin_lock_irqsave(&rinfo->ring_lock, flags);
1601*4882a593Smuzhiyun again:
1602*4882a593Smuzhiyun rp = READ_ONCE(rinfo->ring.sring->rsp_prod);
1603*4882a593Smuzhiyun virt_rmb(); /* Ensure we see queued responses up to 'rp'. */
1604*4882a593Smuzhiyun if (RING_RESPONSE_PROD_OVERFLOW(&rinfo->ring, rp)) {
1605*4882a593Smuzhiyun pr_alert("%s: illegal number of responses %u\n",
1606*4882a593Smuzhiyun info->gd->disk_name, rp - rinfo->ring.rsp_cons);
1607*4882a593Smuzhiyun goto err;
1608*4882a593Smuzhiyun }
1609*4882a593Smuzhiyun
1610*4882a593Smuzhiyun for (i = rinfo->ring.rsp_cons; i != rp; i++) {
1611*4882a593Smuzhiyun unsigned long id;
1612*4882a593Smuzhiyun unsigned int op;
1613*4882a593Smuzhiyun
1614*4882a593Smuzhiyun eoiflag = 0;
1615*4882a593Smuzhiyun
1616*4882a593Smuzhiyun RING_COPY_RESPONSE(&rinfo->ring, i, &bret);
1617*4882a593Smuzhiyun id = bret.id;
1618*4882a593Smuzhiyun
1619*4882a593Smuzhiyun /*
1620*4882a593Smuzhiyun * The backend has messed up and given us an id that we would
1621*4882a593Smuzhiyun * never have given to it (we stamp it up to BLK_RING_SIZE;
1622*4882a593Smuzhiyun * see get_id_from_freelist()).
1623*4882a593Smuzhiyun */
1624*4882a593Smuzhiyun if (id >= BLK_RING_SIZE(info)) {
1625*4882a593Smuzhiyun pr_alert("%s: response has incorrect id (%ld)\n",
1626*4882a593Smuzhiyun info->gd->disk_name, id);
1627*4882a593Smuzhiyun goto err;
1628*4882a593Smuzhiyun }
1629*4882a593Smuzhiyun if (rinfo->shadow[id].status != REQ_WAITING) {
1630*4882a593Smuzhiyun pr_alert("%s: response references no pending request\n",
1631*4882a593Smuzhiyun info->gd->disk_name);
1632*4882a593Smuzhiyun goto err;
1633*4882a593Smuzhiyun }
1634*4882a593Smuzhiyun
1635*4882a593Smuzhiyun rinfo->shadow[id].status = REQ_PROCESSING;
1636*4882a593Smuzhiyun req = rinfo->shadow[id].request;
1637*4882a593Smuzhiyun
1638*4882a593Smuzhiyun op = rinfo->shadow[id].req.operation;
1639*4882a593Smuzhiyun if (op == BLKIF_OP_INDIRECT)
1640*4882a593Smuzhiyun op = rinfo->shadow[id].req.u.indirect.indirect_op;
1641*4882a593Smuzhiyun if (bret.operation != op) {
1642*4882a593Smuzhiyun pr_alert("%s: response has wrong operation (%u instead of %u)\n",
1643*4882a593Smuzhiyun info->gd->disk_name, bret.operation, op);
1644*4882a593Smuzhiyun goto err;
1645*4882a593Smuzhiyun }
1646*4882a593Smuzhiyun
1647*4882a593Smuzhiyun if (bret.operation != BLKIF_OP_DISCARD) {
1648*4882a593Smuzhiyun int ret;
1649*4882a593Smuzhiyun
1650*4882a593Smuzhiyun /*
1651*4882a593Smuzhiyun * We may need to wait for an extra response if the
1652*4882a593Smuzhiyun * I/O request is split in 2
1653*4882a593Smuzhiyun */
1654*4882a593Smuzhiyun ret = blkif_completion(&id, rinfo, &bret);
1655*4882a593Smuzhiyun if (!ret)
1656*4882a593Smuzhiyun continue;
1657*4882a593Smuzhiyun if (unlikely(ret < 0))
1658*4882a593Smuzhiyun goto err;
1659*4882a593Smuzhiyun }
1660*4882a593Smuzhiyun
1661*4882a593Smuzhiyun if (add_id_to_freelist(rinfo, id)) {
1662*4882a593Smuzhiyun WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
1663*4882a593Smuzhiyun info->gd->disk_name, op_name(bret.operation), id);
1664*4882a593Smuzhiyun continue;
1665*4882a593Smuzhiyun }
1666*4882a593Smuzhiyun
1667*4882a593Smuzhiyun if (bret.status == BLKIF_RSP_OKAY)
1668*4882a593Smuzhiyun blkif_req(req)->error = BLK_STS_OK;
1669*4882a593Smuzhiyun else
1670*4882a593Smuzhiyun blkif_req(req)->error = BLK_STS_IOERR;
1671*4882a593Smuzhiyun
1672*4882a593Smuzhiyun switch (bret.operation) {
1673*4882a593Smuzhiyun case BLKIF_OP_DISCARD:
1674*4882a593Smuzhiyun if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) {
1675*4882a593Smuzhiyun struct request_queue *rq = info->rq;
1676*4882a593Smuzhiyun
1677*4882a593Smuzhiyun pr_warn_ratelimited("blkfront: %s: %s op failed\n",
1678*4882a593Smuzhiyun info->gd->disk_name, op_name(bret.operation));
1679*4882a593Smuzhiyun blkif_req(req)->error = BLK_STS_NOTSUPP;
1680*4882a593Smuzhiyun info->feature_discard = 0;
1681*4882a593Smuzhiyun info->feature_secdiscard = 0;
1682*4882a593Smuzhiyun blk_queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
1683*4882a593Smuzhiyun blk_queue_flag_clear(QUEUE_FLAG_SECERASE, rq);
1684*4882a593Smuzhiyun }
1685*4882a593Smuzhiyun break;
1686*4882a593Smuzhiyun case BLKIF_OP_FLUSH_DISKCACHE:
1687*4882a593Smuzhiyun case BLKIF_OP_WRITE_BARRIER:
1688*4882a593Smuzhiyun if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) {
1689*4882a593Smuzhiyun pr_warn_ratelimited("blkfront: %s: %s op failed\n",
1690*4882a593Smuzhiyun info->gd->disk_name, op_name(bret.operation));
1691*4882a593Smuzhiyun blkif_req(req)->error = BLK_STS_NOTSUPP;
1692*4882a593Smuzhiyun }
1693*4882a593Smuzhiyun if (unlikely(bret.status == BLKIF_RSP_ERROR &&
1694*4882a593Smuzhiyun rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
1695*4882a593Smuzhiyun pr_warn_ratelimited("blkfront: %s: empty %s op failed\n",
1696*4882a593Smuzhiyun info->gd->disk_name, op_name(bret.operation));
1697*4882a593Smuzhiyun blkif_req(req)->error = BLK_STS_NOTSUPP;
1698*4882a593Smuzhiyun }
1699*4882a593Smuzhiyun if (unlikely(blkif_req(req)->error)) {
1700*4882a593Smuzhiyun if (blkif_req(req)->error == BLK_STS_NOTSUPP)
1701*4882a593Smuzhiyun blkif_req(req)->error = BLK_STS_OK;
1702*4882a593Smuzhiyun info->feature_fua = 0;
1703*4882a593Smuzhiyun info->feature_flush = 0;
1704*4882a593Smuzhiyun xlvbd_flush(info);
1705*4882a593Smuzhiyun }
1706*4882a593Smuzhiyun fallthrough;
1707*4882a593Smuzhiyun case BLKIF_OP_READ:
1708*4882a593Smuzhiyun case BLKIF_OP_WRITE:
1709*4882a593Smuzhiyun if (unlikely(bret.status != BLKIF_RSP_OKAY))
1710*4882a593Smuzhiyun dev_dbg_ratelimited(&info->xbdev->dev,
1711*4882a593Smuzhiyun "Bad return from blkdev data request: %#x\n",
1712*4882a593Smuzhiyun bret.status);
1713*4882a593Smuzhiyun
1714*4882a593Smuzhiyun break;
1715*4882a593Smuzhiyun default:
1716*4882a593Smuzhiyun BUG();
1717*4882a593Smuzhiyun }
1718*4882a593Smuzhiyun
1719*4882a593Smuzhiyun if (likely(!blk_should_fake_timeout(req->q)))
1720*4882a593Smuzhiyun blk_mq_complete_request(req);
1721*4882a593Smuzhiyun }
1722*4882a593Smuzhiyun
1723*4882a593Smuzhiyun rinfo->ring.rsp_cons = i;
1724*4882a593Smuzhiyun
1725*4882a593Smuzhiyun if (i != rinfo->ring.req_prod_pvt) {
1726*4882a593Smuzhiyun int more_to_do;
1727*4882a593Smuzhiyun RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do);
1728*4882a593Smuzhiyun if (more_to_do)
1729*4882a593Smuzhiyun goto again;
1730*4882a593Smuzhiyun } else
1731*4882a593Smuzhiyun rinfo->ring.sring->rsp_event = i + 1;
1732*4882a593Smuzhiyun
1733*4882a593Smuzhiyun kick_pending_request_queues_locked(rinfo);
1734*4882a593Smuzhiyun
1735*4882a593Smuzhiyun spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1736*4882a593Smuzhiyun
1737*4882a593Smuzhiyun xen_irq_lateeoi(irq, eoiflag);
1738*4882a593Smuzhiyun
1739*4882a593Smuzhiyun return IRQ_HANDLED;
1740*4882a593Smuzhiyun
1741*4882a593Smuzhiyun err:
1742*4882a593Smuzhiyun info->connected = BLKIF_STATE_ERROR;
1743*4882a593Smuzhiyun
1744*4882a593Smuzhiyun spin_unlock_irqrestore(&rinfo->ring_lock, flags);
1745*4882a593Smuzhiyun
1746*4882a593Smuzhiyun /* No EOI in order to avoid further interrupts. */
1747*4882a593Smuzhiyun
1748*4882a593Smuzhiyun pr_alert("%s disabled for further use\n", info->gd->disk_name);
1749*4882a593Smuzhiyun return IRQ_HANDLED;
1750*4882a593Smuzhiyun }
1751*4882a593Smuzhiyun
1752*4882a593Smuzhiyun
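/*
 * Allocate the shared ring for one queue, grant it to the backend and bind
 * an event channel (with late EOI) to blkif_interrupt().
 */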
1753*4882a593Smuzhiyun static int setup_blkring(struct xenbus_device *dev,
1754*4882a593Smuzhiyun struct blkfront_ring_info *rinfo)
1755*4882a593Smuzhiyun {
1756*4882a593Smuzhiyun struct blkif_sring *sring;
1757*4882a593Smuzhiyun int err, i;
1758*4882a593Smuzhiyun struct blkfront_info *info = rinfo->dev_info;
1759*4882a593Smuzhiyun unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
1760*4882a593Smuzhiyun grant_ref_t gref[XENBUS_MAX_RING_GRANTS];
1761*4882a593Smuzhiyun
1762*4882a593Smuzhiyun for (i = 0; i < info->nr_ring_pages; i++)
1763*4882a593Smuzhiyun rinfo->ring_ref[i] = GRANT_INVALID_REF;
1764*4882a593Smuzhiyun
1765*4882a593Smuzhiyun sring = alloc_pages_exact(ring_size, GFP_NOIO | __GFP_ZERO);
1766*4882a593Smuzhiyun if (!sring) {
1767*4882a593Smuzhiyun xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
1768*4882a593Smuzhiyun return -ENOMEM;
1769*4882a593Smuzhiyun }
1770*4882a593Smuzhiyun SHARED_RING_INIT(sring);
1771*4882a593Smuzhiyun FRONT_RING_INIT(&rinfo->ring, sring, ring_size);
1772*4882a593Smuzhiyun
1773*4882a593Smuzhiyun err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref);
1774*4882a593Smuzhiyun if (err < 0) {
1775*4882a593Smuzhiyun free_pages_exact(sring, ring_size);
1776*4882a593Smuzhiyun rinfo->ring.sring = NULL;
1777*4882a593Smuzhiyun goto fail;
1778*4882a593Smuzhiyun }
1779*4882a593Smuzhiyun for (i = 0; i < info->nr_ring_pages; i++)
1780*4882a593Smuzhiyun rinfo->ring_ref[i] = gref[i];
1781*4882a593Smuzhiyun
1782*4882a593Smuzhiyun err = xenbus_alloc_evtchn(dev, &rinfo->evtchn);
1783*4882a593Smuzhiyun if (err)
1784*4882a593Smuzhiyun goto fail;
1785*4882a593Smuzhiyun
1786*4882a593Smuzhiyun err = bind_evtchn_to_irqhandler_lateeoi(rinfo->evtchn, blkif_interrupt,
1787*4882a593Smuzhiyun 0, "blkif", rinfo);
1788*4882a593Smuzhiyun if (err <= 0) {
1789*4882a593Smuzhiyun xenbus_dev_fatal(dev, err,
1790*4882a593Smuzhiyun "bind_evtchn_to_irqhandler failed");
1791*4882a593Smuzhiyun goto fail;
1792*4882a593Smuzhiyun }
1793*4882a593Smuzhiyun rinfo->irq = err;
1794*4882a593Smuzhiyun
1795*4882a593Smuzhiyun return 0;
1796*4882a593Smuzhiyun fail:
1797*4882a593Smuzhiyun blkif_free(info, 0);
1798*4882a593Smuzhiyun return err;
1799*4882a593Smuzhiyun }
1800*4882a593Smuzhiyun
1801*4882a593Smuzhiyun /*
1802*4882a593Smuzhiyun * Write out the per-ring/queue nodes, including ring-ref and event-channel;
1803*4882a593Smuzhiyun * each ring buffer may span multiple pages, depending on ->nr_ring_pages.
1804*4882a593Smuzhiyun */
1805*4882a593Smuzhiyun static int write_per_ring_nodes(struct xenbus_transaction xbt,
1806*4882a593Smuzhiyun struct blkfront_ring_info *rinfo, const char *dir)
1807*4882a593Smuzhiyun {
1808*4882a593Smuzhiyun int err;
1809*4882a593Smuzhiyun unsigned int i;
1810*4882a593Smuzhiyun const char *message = NULL;
1811*4882a593Smuzhiyun struct blkfront_info *info = rinfo->dev_info;
1812*4882a593Smuzhiyun
1813*4882a593Smuzhiyun if (info->nr_ring_pages == 1) {
1814*4882a593Smuzhiyun err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]);
1815*4882a593Smuzhiyun if (err) {
1816*4882a593Smuzhiyun message = "writing ring-ref";
1817*4882a593Smuzhiyun goto abort_transaction;
1818*4882a593Smuzhiyun }
1819*4882a593Smuzhiyun } else {
1820*4882a593Smuzhiyun for (i = 0; i < info->nr_ring_pages; i++) {
1821*4882a593Smuzhiyun char ring_ref_name[RINGREF_NAME_LEN];
1822*4882a593Smuzhiyun
1823*4882a593Smuzhiyun snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
1824*4882a593Smuzhiyun err = xenbus_printf(xbt, dir, ring_ref_name,
1825*4882a593Smuzhiyun "%u", rinfo->ring_ref[i]);
1826*4882a593Smuzhiyun if (err) {
1827*4882a593Smuzhiyun message = "writing ring-ref";
1828*4882a593Smuzhiyun goto abort_transaction;
1829*4882a593Smuzhiyun }
1830*4882a593Smuzhiyun }
1831*4882a593Smuzhiyun }
1832*4882a593Smuzhiyun
1833*4882a593Smuzhiyun err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn);
1834*4882a593Smuzhiyun if (err) {
1835*4882a593Smuzhiyun message = "writing event-channel";
1836*4882a593Smuzhiyun goto abort_transaction;
1837*4882a593Smuzhiyun }
1838*4882a593Smuzhiyun
1839*4882a593Smuzhiyun return 0;
1840*4882a593Smuzhiyun
1841*4882a593Smuzhiyun abort_transaction:
1842*4882a593Smuzhiyun xenbus_transaction_end(xbt, 1);
1843*4882a593Smuzhiyun if (message)
1844*4882a593Smuzhiyun xenbus_dev_fatal(info->xbdev, err, "%s", message);
1845*4882a593Smuzhiyun
1846*4882a593Smuzhiyun return err;
1847*4882a593Smuzhiyun }
1848*4882a593Smuzhiyun
1849*4882a593Smuzhiyun static void free_info(struct blkfront_info *info)
1850*4882a593Smuzhiyun {
1851*4882a593Smuzhiyun list_del(&info->info_list);
1852*4882a593Smuzhiyun kfree(info);
1853*4882a593Smuzhiyun }
1854*4882a593Smuzhiyun
1855*4882a593Smuzhiyun /* Enable the persistent grants feature. */
1856*4882a593Smuzhiyun static bool feature_persistent = true;
1857*4882a593Smuzhiyun module_param(feature_persistent, bool, 0644);
1858*4882a593Smuzhiyun MODULE_PARM_DESC(feature_persistent,
1859*4882a593Smuzhiyun "Enables the persistent grants feature");
1860*4882a593Smuzhiyun
1861*4882a593Smuzhiyun /* Common code used when first setting up, and when resuming. */
1862*4882a593Smuzhiyun static int talk_to_blkback(struct xenbus_device *dev,
1863*4882a593Smuzhiyun struct blkfront_info *info)
1864*4882a593Smuzhiyun {
1865*4882a593Smuzhiyun const char *message = NULL;
1866*4882a593Smuzhiyun struct xenbus_transaction xbt;
1867*4882a593Smuzhiyun int err;
1868*4882a593Smuzhiyun unsigned int i, max_page_order;
1869*4882a593Smuzhiyun unsigned int ring_page_order;
1870*4882a593Smuzhiyun struct blkfront_ring_info *rinfo;
1871*4882a593Smuzhiyun
1872*4882a593Smuzhiyun if (!info)
1873*4882a593Smuzhiyun return -ENODEV;
1874*4882a593Smuzhiyun
1875*4882a593Smuzhiyun /* Check if backend is trusted. */
1876*4882a593Smuzhiyun info->bounce = !xen_blkif_trusted ||
1877*4882a593Smuzhiyun !xenbus_read_unsigned(dev->nodename, "trusted", 1);
1878*4882a593Smuzhiyun
1879*4882a593Smuzhiyun max_page_order = xenbus_read_unsigned(info->xbdev->otherend,
1880*4882a593Smuzhiyun "max-ring-page-order", 0);
1881*4882a593Smuzhiyun ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
1882*4882a593Smuzhiyun info->nr_ring_pages = 1 << ring_page_order;
1883*4882a593Smuzhiyun
1884*4882a593Smuzhiyun err = negotiate_mq(info);
1885*4882a593Smuzhiyun if (err)
1886*4882a593Smuzhiyun goto destroy_blkring;
1887*4882a593Smuzhiyun
1888*4882a593Smuzhiyun for_each_rinfo(info, rinfo, i) {
1889*4882a593Smuzhiyun /* Create shared ring, alloc event channel. */
1890*4882a593Smuzhiyun err = setup_blkring(dev, rinfo);
1891*4882a593Smuzhiyun if (err)
1892*4882a593Smuzhiyun goto destroy_blkring;
1893*4882a593Smuzhiyun }
1894*4882a593Smuzhiyun
1895*4882a593Smuzhiyun again:
1896*4882a593Smuzhiyun err = xenbus_transaction_start(&xbt);
1897*4882a593Smuzhiyun if (err) {
1898*4882a593Smuzhiyun xenbus_dev_fatal(dev, err, "starting transaction");
1899*4882a593Smuzhiyun goto destroy_blkring;
1900*4882a593Smuzhiyun }
1901*4882a593Smuzhiyun
1902*4882a593Smuzhiyun if (info->nr_ring_pages > 1) {
1903*4882a593Smuzhiyun err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u",
1904*4882a593Smuzhiyun ring_page_order);
1905*4882a593Smuzhiyun if (err) {
1906*4882a593Smuzhiyun message = "writing ring-page-order";
1907*4882a593Smuzhiyun goto abort_transaction;
1908*4882a593Smuzhiyun }
1909*4882a593Smuzhiyun }
1910*4882a593Smuzhiyun
1911*4882a593Smuzhiyun /* We already got the number of queues/rings in _probe */
1912*4882a593Smuzhiyun if (info->nr_rings == 1) {
1913*4882a593Smuzhiyun err = write_per_ring_nodes(xbt, info->rinfo, dev->nodename);
1914*4882a593Smuzhiyun if (err)
1915*4882a593Smuzhiyun goto destroy_blkring;
1916*4882a593Smuzhiyun } else {
1917*4882a593Smuzhiyun char *path;
1918*4882a593Smuzhiyun size_t pathsize;
1919*4882a593Smuzhiyun
1920*4882a593Smuzhiyun err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u",
1921*4882a593Smuzhiyun info->nr_rings);
1922*4882a593Smuzhiyun if (err) {
1923*4882a593Smuzhiyun message = "writing multi-queue-num-queues";
1924*4882a593Smuzhiyun goto abort_transaction;
1925*4882a593Smuzhiyun }
1926*4882a593Smuzhiyun
1927*4882a593Smuzhiyun pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN;
1928*4882a593Smuzhiyun path = kmalloc(pathsize, GFP_KERNEL);
1929*4882a593Smuzhiyun if (!path) {
1930*4882a593Smuzhiyun err = -ENOMEM;
1931*4882a593Smuzhiyun message = "ENOMEM while writing ring references";
1932*4882a593Smuzhiyun goto abort_transaction;
1933*4882a593Smuzhiyun }
1934*4882a593Smuzhiyun
1935*4882a593Smuzhiyun for_each_rinfo(info, rinfo, i) {
1936*4882a593Smuzhiyun memset(path, 0, pathsize);
1937*4882a593Smuzhiyun snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i);
1938*4882a593Smuzhiyun err = write_per_ring_nodes(xbt, rinfo, path);
1939*4882a593Smuzhiyun if (err) {
1940*4882a593Smuzhiyun kfree(path);
1941*4882a593Smuzhiyun goto destroy_blkring;
1942*4882a593Smuzhiyun }
1943*4882a593Smuzhiyun }
1944*4882a593Smuzhiyun kfree(path);
1945*4882a593Smuzhiyun }
1946*4882a593Smuzhiyun err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
1947*4882a593Smuzhiyun XEN_IO_PROTO_ABI_NATIVE);
1948*4882a593Smuzhiyun if (err) {
1949*4882a593Smuzhiyun message = "writing protocol";
1950*4882a593Smuzhiyun goto abort_transaction;
1951*4882a593Smuzhiyun }
1952*4882a593Smuzhiyun info->feature_persistent_parm = feature_persistent;
1953*4882a593Smuzhiyun err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u",
1954*4882a593Smuzhiyun info->feature_persistent_parm);
1955*4882a593Smuzhiyun if (err)
1956*4882a593Smuzhiyun dev_warn(&dev->dev,
1957*4882a593Smuzhiyun "writing persistent grants feature to xenbus");
1958*4882a593Smuzhiyun
1959*4882a593Smuzhiyun err = xenbus_transaction_end(xbt, 0);
1960*4882a593Smuzhiyun if (err) {
1961*4882a593Smuzhiyun if (err == -EAGAIN)
1962*4882a593Smuzhiyun goto again;
1963*4882a593Smuzhiyun xenbus_dev_fatal(dev, err, "completing transaction");
1964*4882a593Smuzhiyun goto destroy_blkring;
1965*4882a593Smuzhiyun }
1966*4882a593Smuzhiyun
1967*4882a593Smuzhiyun for_each_rinfo(info, rinfo, i) {
1968*4882a593Smuzhiyun unsigned int j;
1969*4882a593Smuzhiyun
1970*4882a593Smuzhiyun for (j = 0; j < BLK_RING_SIZE(info); j++)
1971*4882a593Smuzhiyun rinfo->shadow[j].req.u.rw.id = j + 1;
1972*4882a593Smuzhiyun rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
1973*4882a593Smuzhiyun }
1974*4882a593Smuzhiyun xenbus_switch_state(dev, XenbusStateInitialised);
1975*4882a593Smuzhiyun
1976*4882a593Smuzhiyun return 0;
1977*4882a593Smuzhiyun
1978*4882a593Smuzhiyun abort_transaction:
1979*4882a593Smuzhiyun xenbus_transaction_end(xbt, 1);
1980*4882a593Smuzhiyun if (message)
1981*4882a593Smuzhiyun xenbus_dev_fatal(dev, err, "%s", message);
1982*4882a593Smuzhiyun destroy_blkring:
1983*4882a593Smuzhiyun blkif_free(info, 0);
1984*4882a593Smuzhiyun
1985*4882a593Smuzhiyun mutex_lock(&blkfront_mutex);
1986*4882a593Smuzhiyun free_info(info);
1987*4882a593Smuzhiyun mutex_unlock(&blkfront_mutex);
1988*4882a593Smuzhiyun
1989*4882a593Smuzhiyun dev_set_drvdata(&dev->dev, NULL);
1990*4882a593Smuzhiyun
1991*4882a593Smuzhiyun return err;
1992*4882a593Smuzhiyun }
1993*4882a593Smuzhiyun
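/*
 * Negotiate the number of hardware queues/rings with the backend (bounded
 * by xen_blkif_max_queues) and allocate the per-ring blkfront_ring_info
 * structures.
 */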
1994*4882a593Smuzhiyun static int negotiate_mq(struct blkfront_info *info)
1995*4882a593Smuzhiyun {
1996*4882a593Smuzhiyun unsigned int backend_max_queues;
1997*4882a593Smuzhiyun unsigned int i;
1998*4882a593Smuzhiyun struct blkfront_ring_info *rinfo;
1999*4882a593Smuzhiyun
2000*4882a593Smuzhiyun BUG_ON(info->nr_rings);
2001*4882a593Smuzhiyun
2002*4882a593Smuzhiyun /* Check if backend supports multiple queues. */
2003*4882a593Smuzhiyun backend_max_queues = xenbus_read_unsigned(info->xbdev->otherend,
2004*4882a593Smuzhiyun "multi-queue-max-queues", 1);
2005*4882a593Smuzhiyun info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
2006*4882a593Smuzhiyun /* We need at least one ring. */
2007*4882a593Smuzhiyun if (!info->nr_rings)
2008*4882a593Smuzhiyun info->nr_rings = 1;
2009*4882a593Smuzhiyun
2010*4882a593Smuzhiyun info->rinfo_size = struct_size(info->rinfo, shadow,
2011*4882a593Smuzhiyun BLK_RING_SIZE(info));
2012*4882a593Smuzhiyun info->rinfo = kvcalloc(info->nr_rings, info->rinfo_size, GFP_KERNEL);
2013*4882a593Smuzhiyun if (!info->rinfo) {
2014*4882a593Smuzhiyun xenbus_dev_fatal(info->xbdev, -ENOMEM, "allocating ring_info structure");
2015*4882a593Smuzhiyun info->nr_rings = 0;
2016*4882a593Smuzhiyun return -ENOMEM;
2017*4882a593Smuzhiyun }
2018*4882a593Smuzhiyun
2019*4882a593Smuzhiyun for_each_rinfo(info, rinfo, i) {
2020*4882a593Smuzhiyun INIT_LIST_HEAD(&rinfo->indirect_pages);
2021*4882a593Smuzhiyun INIT_LIST_HEAD(&rinfo->grants);
2022*4882a593Smuzhiyun rinfo->dev_info = info;
2023*4882a593Smuzhiyun INIT_WORK(&rinfo->work, blkif_restart_queue);
2024*4882a593Smuzhiyun spin_lock_init(&rinfo->ring_lock);
2025*4882a593Smuzhiyun }
2026*4882a593Smuzhiyun return 0;
2027*4882a593Smuzhiyun }
2028*4882a593Smuzhiyun
2029*4882a593Smuzhiyun /**
2030*4882a593Smuzhiyun * Entry point to this code when a new device is created. Allocate the basic
2031*4882a593Smuzhiyun * structures and the ring buffer for communication with the backend, and
2032*4882a593Smuzhiyun * inform the backend of the appropriate details for those. Switch to
2033*4882a593Smuzhiyun * Initialised state.
2034*4882a593Smuzhiyun */
2035*4882a593Smuzhiyun static int blkfront_probe(struct xenbus_device *dev,
2036*4882a593Smuzhiyun const struct xenbus_device_id *id)
2037*4882a593Smuzhiyun {
2038*4882a593Smuzhiyun int err, vdevice;
2039*4882a593Smuzhiyun struct blkfront_info *info;
2040*4882a593Smuzhiyun
2041*4882a593Smuzhiyun /* FIXME: Use dynamic device id if this is not set. */
2042*4882a593Smuzhiyun err = xenbus_scanf(XBT_NIL, dev->nodename,
2043*4882a593Smuzhiyun "virtual-device", "%i", &vdevice);
2044*4882a593Smuzhiyun if (err != 1) {
2045*4882a593Smuzhiyun /* go looking in the extended area instead */
2046*4882a593Smuzhiyun err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
2047*4882a593Smuzhiyun "%i", &vdevice);
2048*4882a593Smuzhiyun if (err != 1) {
2049*4882a593Smuzhiyun xenbus_dev_fatal(dev, err, "reading virtual-device");
2050*4882a593Smuzhiyun return err;
2051*4882a593Smuzhiyun }
2052*4882a593Smuzhiyun }
2053*4882a593Smuzhiyun
2054*4882a593Smuzhiyun if (xen_hvm_domain()) {
2055*4882a593Smuzhiyun char *type;
2056*4882a593Smuzhiyun int len;
2057*4882a593Smuzhiyun /* no unplug has been done: do not hook devices != xen vbds */
2058*4882a593Smuzhiyun if (xen_has_pv_and_legacy_disk_devices()) {
2059*4882a593Smuzhiyun int major;
2060*4882a593Smuzhiyun
2061*4882a593Smuzhiyun if (!VDEV_IS_EXTENDED(vdevice))
2062*4882a593Smuzhiyun major = BLKIF_MAJOR(vdevice);
2063*4882a593Smuzhiyun else
2064*4882a593Smuzhiyun major = XENVBD_MAJOR;
2065*4882a593Smuzhiyun
2066*4882a593Smuzhiyun if (major != XENVBD_MAJOR) {
2067*4882a593Smuzhiyun printk(KERN_INFO
2068*4882a593Smuzhiyun "%s: HVM does not support vbd %d as xen block device\n",
2069*4882a593Smuzhiyun __func__, vdevice);
2070*4882a593Smuzhiyun return -ENODEV;
2071*4882a593Smuzhiyun }
2072*4882a593Smuzhiyun }
2073*4882a593Smuzhiyun /* do not create a PV cdrom device if we are an HVM guest */
2074*4882a593Smuzhiyun type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
2075*4882a593Smuzhiyun if (IS_ERR(type))
2076*4882a593Smuzhiyun return -ENODEV;
2077*4882a593Smuzhiyun if (strncmp(type, "cdrom", 5) == 0) {
2078*4882a593Smuzhiyun kfree(type);
2079*4882a593Smuzhiyun return -ENODEV;
2080*4882a593Smuzhiyun }
2081*4882a593Smuzhiyun kfree(type);
2082*4882a593Smuzhiyun }
2083*4882a593Smuzhiyun info = kzalloc(sizeof(*info), GFP_KERNEL);
2084*4882a593Smuzhiyun if (!info) {
2085*4882a593Smuzhiyun xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
2086*4882a593Smuzhiyun return -ENOMEM;
2087*4882a593Smuzhiyun }
2088*4882a593Smuzhiyun
2089*4882a593Smuzhiyun info->xbdev = dev;
2090*4882a593Smuzhiyun
2091*4882a593Smuzhiyun mutex_init(&info->mutex);
2092*4882a593Smuzhiyun info->vdevice = vdevice;
2093*4882a593Smuzhiyun info->connected = BLKIF_STATE_DISCONNECTED;
2094*4882a593Smuzhiyun
2095*4882a593Smuzhiyun /* Front end dir is a number, which is used as the id. */
2096*4882a593Smuzhiyun info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
2097*4882a593Smuzhiyun dev_set_drvdata(&dev->dev, info);
2098*4882a593Smuzhiyun
2099*4882a593Smuzhiyun mutex_lock(&blkfront_mutex);
2100*4882a593Smuzhiyun list_add(&info->info_list, &info_list);
2101*4882a593Smuzhiyun mutex_unlock(&blkfront_mutex);
2102*4882a593Smuzhiyun
2103*4882a593Smuzhiyun return 0;
2104*4882a593Smuzhiyun }
2105*4882a593Smuzhiyun
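/*
 * Rebuild the per-ring indirect/grant state after a resume, reconnect the
 * queues and requeue the requests and bios that were in flight when the
 * frontend was suspended.
 */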
2106*4882a593Smuzhiyun static int blkif_recover(struct blkfront_info *info)
2107*4882a593Smuzhiyun {
2108*4882a593Smuzhiyun unsigned int r_index;
2109*4882a593Smuzhiyun struct request *req, *n;
2110*4882a593Smuzhiyun int rc;
2111*4882a593Smuzhiyun struct bio *bio;
2112*4882a593Smuzhiyun unsigned int segs;
2113*4882a593Smuzhiyun struct blkfront_ring_info *rinfo;
2114*4882a593Smuzhiyun
2115*4882a593Smuzhiyun blkfront_gather_backend_features(info);
2116*4882a593Smuzhiyun /* Reset limits changed by blk_mq_update_nr_hw_queues(). */
2117*4882a593Smuzhiyun blkif_set_queue_limits(info);
2118*4882a593Smuzhiyun segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
2119*4882a593Smuzhiyun blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG);
2120*4882a593Smuzhiyun
2121*4882a593Smuzhiyun for_each_rinfo(info, rinfo, r_index) {
2122*4882a593Smuzhiyun rc = blkfront_setup_indirect(rinfo);
2123*4882a593Smuzhiyun if (rc)
2124*4882a593Smuzhiyun return rc;
2125*4882a593Smuzhiyun }
2126*4882a593Smuzhiyun xenbus_switch_state(info->xbdev, XenbusStateConnected);
2127*4882a593Smuzhiyun
2128*4882a593Smuzhiyun /* Now safe for us to use the shared ring */
2129*4882a593Smuzhiyun info->connected = BLKIF_STATE_CONNECTED;
2130*4882a593Smuzhiyun
2131*4882a593Smuzhiyun for_each_rinfo(info, rinfo, r_index) {
2132*4882a593Smuzhiyun /* Kick any other new requests queued since we resumed */
2133*4882a593Smuzhiyun kick_pending_request_queues(rinfo);
2134*4882a593Smuzhiyun }
2135*4882a593Smuzhiyun
2136*4882a593Smuzhiyun list_for_each_entry_safe(req, n, &info->requests, queuelist) {
2137*4882a593Smuzhiyun /* Requeue pending requests (flush or discard) */
2138*4882a593Smuzhiyun list_del_init(&req->queuelist);
2139*4882a593Smuzhiyun BUG_ON(req->nr_phys_segments > segs);
2140*4882a593Smuzhiyun blk_mq_requeue_request(req, false);
2141*4882a593Smuzhiyun }
2142*4882a593Smuzhiyun blk_mq_start_stopped_hw_queues(info->rq, true);
2143*4882a593Smuzhiyun blk_mq_kick_requeue_list(info->rq);
2144*4882a593Smuzhiyun
2145*4882a593Smuzhiyun while ((bio = bio_list_pop(&info->bio_list)) != NULL) {
2146*4882a593Smuzhiyun /* Traverse the list of pending bios and re-queue them */
2147*4882a593Smuzhiyun submit_bio(bio);
2148*4882a593Smuzhiyun }
2149*4882a593Smuzhiyun
2150*4882a593Smuzhiyun return 0;
2151*4882a593Smuzhiyun }
2152*4882a593Smuzhiyun
2153*4882a593Smuzhiyun /**
2154*4882a593Smuzhiyun * We are reconnecting to the backend, due to a suspend/resume, or a backend
2155*4882a593Smuzhiyun * driver restart. We tear down our blkif structure and recreate it, but
2156*4882a593Smuzhiyun * leave the device-layer structures intact so that this is transparent to the
2157*4882a593Smuzhiyun * rest of the kernel.
2158*4882a593Smuzhiyun */
2159*4882a593Smuzhiyun static int blkfront_resume(struct xenbus_device *dev)
2160*4882a593Smuzhiyun {
2161*4882a593Smuzhiyun struct blkfront_info *info = dev_get_drvdata(&dev->dev);
2162*4882a593Smuzhiyun int err = 0;
2163*4882a593Smuzhiyun unsigned int i, j;
2164*4882a593Smuzhiyun struct blkfront_ring_info *rinfo;
2165*4882a593Smuzhiyun
2166*4882a593Smuzhiyun dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
2167*4882a593Smuzhiyun
2168*4882a593Smuzhiyun bio_list_init(&info->bio_list);
2169*4882a593Smuzhiyun INIT_LIST_HEAD(&info->requests);
2170*4882a593Smuzhiyun for_each_rinfo(info, rinfo, i) {
2171*4882a593Smuzhiyun struct bio_list merge_bio;
2172*4882a593Smuzhiyun struct blk_shadow *shadow = rinfo->shadow;
2173*4882a593Smuzhiyun
2174*4882a593Smuzhiyun for (j = 0; j < BLK_RING_SIZE(info); j++) {
2175*4882a593Smuzhiyun /* Not in use? */
2176*4882a593Smuzhiyun if (!shadow[j].request)
2177*4882a593Smuzhiyun continue;
2178*4882a593Smuzhiyun
2179*4882a593Smuzhiyun /*
2180*4882a593Smuzhiyun * Get the bios in the request so we can re-queue them.
2181*4882a593Smuzhiyun */
2182*4882a593Smuzhiyun if (req_op(shadow[j].request) == REQ_OP_FLUSH ||
2183*4882a593Smuzhiyun req_op(shadow[j].request) == REQ_OP_DISCARD ||
2184*4882a593Smuzhiyun req_op(shadow[j].request) == REQ_OP_SECURE_ERASE ||
2185*4882a593Smuzhiyun shadow[j].request->cmd_flags & REQ_FUA) {
2186*4882a593Smuzhiyun /*
2187*4882a593Smuzhiyun * Flush operations don't contain bios, so
2188*4882a593Smuzhiyun * we need to requeue the whole request
2189*4882a593Smuzhiyun *
2190*4882a593Smuzhiyun * XXX: but this doesn't make any sense for a
2191*4882a593Smuzhiyun * write with the FUA flag set.
2192*4882a593Smuzhiyun */
2193*4882a593Smuzhiyun list_add(&shadow[j].request->queuelist, &info->requests);
2194*4882a593Smuzhiyun continue;
2195*4882a593Smuzhiyun }
2196*4882a593Smuzhiyun merge_bio.head = shadow[j].request->bio;
2197*4882a593Smuzhiyun merge_bio.tail = shadow[j].request->biotail;
2198*4882a593Smuzhiyun bio_list_merge(&info->bio_list, &merge_bio);
2199*4882a593Smuzhiyun shadow[j].request->bio = NULL;
2200*4882a593Smuzhiyun blk_mq_end_request(shadow[j].request, BLK_STS_OK);
2201*4882a593Smuzhiyun }
2202*4882a593Smuzhiyun }
2203*4882a593Smuzhiyun
2204*4882a593Smuzhiyun blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
2205*4882a593Smuzhiyun
2206*4882a593Smuzhiyun err = talk_to_blkback(dev, info);
2207*4882a593Smuzhiyun if (!err)
2208*4882a593Smuzhiyun blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings);
2209*4882a593Smuzhiyun
2210*4882a593Smuzhiyun /*
2211*4882a593Smuzhiyun * We have to wait for the backend to switch to
2212*4882a593Smuzhiyun * connected state, since we want to read which
2213*4882a593Smuzhiyun * features it supports.
2214*4882a593Smuzhiyun */
2215*4882a593Smuzhiyun
2216*4882a593Smuzhiyun return err;
2217*4882a593Smuzhiyun }
2218*4882a593Smuzhiyun
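/*
 * Handle a close of the device: if it is still open, only switch to the
 * Closing state and report -EBUSY; otherwise release the gendisk and
 * complete the frontend side of the close handshake.
 */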
2219*4882a593Smuzhiyun static void blkfront_closing(struct blkfront_info *info)
2220*4882a593Smuzhiyun {
2221*4882a593Smuzhiyun struct xenbus_device *xbdev = info->xbdev;
2222*4882a593Smuzhiyun struct block_device *bdev = NULL;
2223*4882a593Smuzhiyun
2224*4882a593Smuzhiyun mutex_lock(&info->mutex);
2225*4882a593Smuzhiyun
2226*4882a593Smuzhiyun if (xbdev->state == XenbusStateClosing) {
2227*4882a593Smuzhiyun mutex_unlock(&info->mutex);
2228*4882a593Smuzhiyun return;
2229*4882a593Smuzhiyun }
2230*4882a593Smuzhiyun
2231*4882a593Smuzhiyun if (info->gd)
2232*4882a593Smuzhiyun bdev = bdget_disk(info->gd, 0);
2233*4882a593Smuzhiyun
2234*4882a593Smuzhiyun mutex_unlock(&info->mutex);
2235*4882a593Smuzhiyun
2236*4882a593Smuzhiyun if (!bdev) {
2237*4882a593Smuzhiyun xenbus_frontend_closed(xbdev);
2238*4882a593Smuzhiyun return;
2239*4882a593Smuzhiyun }
2240*4882a593Smuzhiyun
2241*4882a593Smuzhiyun mutex_lock(&bdev->bd_mutex);
2242*4882a593Smuzhiyun
2243*4882a593Smuzhiyun if (bdev->bd_openers) {
2244*4882a593Smuzhiyun xenbus_dev_error(xbdev, -EBUSY,
2245*4882a593Smuzhiyun "Device in use; refusing to close");
2246*4882a593Smuzhiyun xenbus_switch_state(xbdev, XenbusStateClosing);
2247*4882a593Smuzhiyun } else {
2248*4882a593Smuzhiyun xlvbd_release_gendisk(info);
2249*4882a593Smuzhiyun xenbus_frontend_closed(xbdev);
2250*4882a593Smuzhiyun }
2251*4882a593Smuzhiyun
2252*4882a593Smuzhiyun mutex_unlock(&bdev->bd_mutex);
2253*4882a593Smuzhiyun bdput(bdev);
2254*4882a593Smuzhiyun }
2255*4882a593Smuzhiyun
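/* Read the discard/secure-discard properties advertised by the backend. */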
2256*4882a593Smuzhiyun static void blkfront_setup_discard(struct blkfront_info *info)
2257*4882a593Smuzhiyun {
2258*4882a593Smuzhiyun info->feature_discard = 1;
2259*4882a593Smuzhiyun info->discard_granularity = xenbus_read_unsigned(info->xbdev->otherend,
2260*4882a593Smuzhiyun "discard-granularity",
2261*4882a593Smuzhiyun 0);
2262*4882a593Smuzhiyun info->discard_alignment = xenbus_read_unsigned(info->xbdev->otherend,
2263*4882a593Smuzhiyun "discard-alignment", 0);
2264*4882a593Smuzhiyun info->feature_secdiscard =
2265*4882a593Smuzhiyun !!xenbus_read_unsigned(info->xbdev->otherend, "discard-secure",
2266*4882a593Smuzhiyun 0);
2267*4882a593Smuzhiyun }
2268*4882a593Smuzhiyun
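/*
 * Allocate everything a ring needs for I/O: the grant buffer, the pages
 * used for indirect grant references (when not bouncing) and the
 * per-request scatterlist and grant tracking arrays in the shadow entries.
 */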
2269*4882a593Smuzhiyun static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
2270*4882a593Smuzhiyun {
2271*4882a593Smuzhiyun unsigned int psegs, grants, memflags;
2272*4882a593Smuzhiyun int err, i;
2273*4882a593Smuzhiyun struct blkfront_info *info = rinfo->dev_info;
2274*4882a593Smuzhiyun
2275*4882a593Smuzhiyun memflags = memalloc_noio_save();
2276*4882a593Smuzhiyun
2277*4882a593Smuzhiyun if (info->max_indirect_segments == 0) {
2278*4882a593Smuzhiyun if (!HAS_EXTRA_REQ)
2279*4882a593Smuzhiyun grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
2280*4882a593Smuzhiyun else {
2281*4882a593Smuzhiyun /*
2282*4882a593Smuzhiyun * When an extra req is required, the maximum
2283*4882a593Smuzhiyun * grants supported is related to the size of the
2284*4882a593Smuzhiyun * Linux block segment.
2285*4882a593Smuzhiyun */
2286*4882a593Smuzhiyun grants = GRANTS_PER_PSEG;
2287*4882a593Smuzhiyun }
2288*4882a593Smuzhiyun } else
2290*4882a593Smuzhiyun grants = info->max_indirect_segments;
2291*4882a593Smuzhiyun psegs = DIV_ROUND_UP(grants, GRANTS_PER_PSEG);
2292*4882a593Smuzhiyun
2293*4882a593Smuzhiyun err = fill_grant_buffer(rinfo,
2294*4882a593Smuzhiyun (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
2295*4882a593Smuzhiyun if (err)
2296*4882a593Smuzhiyun goto out_of_memory;
2297*4882a593Smuzhiyun
2298*4882a593Smuzhiyun if (!info->bounce && info->max_indirect_segments) {
2299*4882a593Smuzhiyun /*
2300*4882a593Smuzhiyun * We are using indirect descriptors but don't have a bounce
2301*4882a593Smuzhiyun * buffer, so we need to allocate a set of pages that can be
2302*4882a593Smuzhiyun * used for mapping indirect grefs.
2303*4882a593Smuzhiyun */
2304*4882a593Smuzhiyun int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
2305*4882a593Smuzhiyun
2306*4882a593Smuzhiyun BUG_ON(!list_empty(&rinfo->indirect_pages));
2307*4882a593Smuzhiyun for (i = 0; i < num; i++) {
2308*4882a593Smuzhiyun struct page *indirect_page = alloc_page(GFP_KERNEL |
2309*4882a593Smuzhiyun __GFP_ZERO);
2310*4882a593Smuzhiyun if (!indirect_page)
2311*4882a593Smuzhiyun goto out_of_memory;
2312*4882a593Smuzhiyun list_add(&indirect_page->lru, &rinfo->indirect_pages);
2313*4882a593Smuzhiyun }
2314*4882a593Smuzhiyun }
2315*4882a593Smuzhiyun
2316*4882a593Smuzhiyun for (i = 0; i < BLK_RING_SIZE(info); i++) {
2317*4882a593Smuzhiyun rinfo->shadow[i].grants_used =
2318*4882a593Smuzhiyun kvcalloc(grants,
2319*4882a593Smuzhiyun sizeof(rinfo->shadow[i].grants_used[0]),
2320*4882a593Smuzhiyun GFP_KERNEL);
2321*4882a593Smuzhiyun rinfo->shadow[i].sg = kvcalloc(psegs,
2322*4882a593Smuzhiyun sizeof(rinfo->shadow[i].sg[0]),
2323*4882a593Smuzhiyun GFP_KERNEL);
2324*4882a593Smuzhiyun if (info->max_indirect_segments)
2325*4882a593Smuzhiyun rinfo->shadow[i].indirect_grants =
2326*4882a593Smuzhiyun kvcalloc(INDIRECT_GREFS(grants),
2327*4882a593Smuzhiyun sizeof(rinfo->shadow[i].indirect_grants[0]),
2328*4882a593Smuzhiyun GFP_KERNEL);
2329*4882a593Smuzhiyun if ((rinfo->shadow[i].grants_used == NULL) ||
2330*4882a593Smuzhiyun (rinfo->shadow[i].sg == NULL) ||
2331*4882a593Smuzhiyun (info->max_indirect_segments &&
2332*4882a593Smuzhiyun (rinfo->shadow[i].indirect_grants == NULL)))
2333*4882a593Smuzhiyun goto out_of_memory;
2334*4882a593Smuzhiyun sg_init_table(rinfo->shadow[i].sg, psegs);
2335*4882a593Smuzhiyun }
2336*4882a593Smuzhiyun
2337*4882a593Smuzhiyun memalloc_noio_restore(memflags);
2338*4882a593Smuzhiyun
2339*4882a593Smuzhiyun return 0;
2340*4882a593Smuzhiyun
2341*4882a593Smuzhiyun out_of_memory:
2342*4882a593Smuzhiyun for (i = 0; i < BLK_RING_SIZE(info); i++) {
2343*4882a593Smuzhiyun kvfree(rinfo->shadow[i].grants_used);
2344*4882a593Smuzhiyun rinfo->shadow[i].grants_used = NULL;
2345*4882a593Smuzhiyun kvfree(rinfo->shadow[i].sg);
2346*4882a593Smuzhiyun rinfo->shadow[i].sg = NULL;
2347*4882a593Smuzhiyun kvfree(rinfo->shadow[i].indirect_grants);
2348*4882a593Smuzhiyun rinfo->shadow[i].indirect_grants = NULL;
2349*4882a593Smuzhiyun }
2350*4882a593Smuzhiyun if (!list_empty(&rinfo->indirect_pages)) {
2351*4882a593Smuzhiyun struct page *indirect_page, *n;
2352*4882a593Smuzhiyun list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
2353*4882a593Smuzhiyun list_del(&indirect_page->lru);
2354*4882a593Smuzhiyun __free_page(indirect_page);
2355*4882a593Smuzhiyun }
2356*4882a593Smuzhiyun }
2357*4882a593Smuzhiyun
2358*4882a593Smuzhiyun memalloc_noio_restore(memflags);
2359*4882a593Smuzhiyun
2360*4882a593Smuzhiyun return -ENOMEM;
2361*4882a593Smuzhiyun }
2362*4882a593Smuzhiyun
2363*4882a593Smuzhiyun /*
2364*4882a593Smuzhiyun * Gather all backend feature-* nodes (flush/FUA, discard, persistent grants, indirect segments).
2365*4882a593Smuzhiyun */
2366*4882a593Smuzhiyun static void blkfront_gather_backend_features(struct blkfront_info *info)
2367*4882a593Smuzhiyun {
2368*4882a593Smuzhiyun unsigned int indirect_segments;
2369*4882a593Smuzhiyun
2370*4882a593Smuzhiyun info->feature_flush = 0;
2371*4882a593Smuzhiyun info->feature_fua = 0;
2372*4882a593Smuzhiyun
2373*4882a593Smuzhiyun /*
2374*4882a593Smuzhiyun * If there's no "feature-barrier" defined, then it means
2375*4882a593Smuzhiyun * we're dealing with a very old backend which writes
2376*4882a593Smuzhiyun * synchronously; nothing to do.
2377*4882a593Smuzhiyun *
2378*4882a593Smuzhiyun * If there are barriers, then we use flush.
2379*4882a593Smuzhiyun */
2380*4882a593Smuzhiyun if (xenbus_read_unsigned(info->xbdev->otherend, "feature-barrier", 0)) {
2381*4882a593Smuzhiyun info->feature_flush = 1;
2382*4882a593Smuzhiyun info->feature_fua = 1;
2383*4882a593Smuzhiyun }
2384*4882a593Smuzhiyun
2385*4882a593Smuzhiyun /*
2386*4882a593Smuzhiyun * And if there is "feature-flush-cache", use that in preference
2387*4882a593Smuzhiyun * to barriers.
2388*4882a593Smuzhiyun */
2389*4882a593Smuzhiyun if (xenbus_read_unsigned(info->xbdev->otherend, "feature-flush-cache",
2390*4882a593Smuzhiyun 0)) {
2391*4882a593Smuzhiyun info->feature_flush = 1;
2392*4882a593Smuzhiyun info->feature_fua = 0;
2393*4882a593Smuzhiyun }
2394*4882a593Smuzhiyun
2395*4882a593Smuzhiyun if (xenbus_read_unsigned(info->xbdev->otherend, "feature-discard", 0))
2396*4882a593Smuzhiyun blkfront_setup_discard(info);
2397*4882a593Smuzhiyun
2398*4882a593Smuzhiyun if (info->feature_persistent_parm)
2399*4882a593Smuzhiyun info->feature_persistent =
2400*4882a593Smuzhiyun !!xenbus_read_unsigned(info->xbdev->otherend,
2401*4882a593Smuzhiyun "feature-persistent", 0);
2402*4882a593Smuzhiyun if (info->feature_persistent)
2403*4882a593Smuzhiyun info->bounce = true;
2404*4882a593Smuzhiyun
2405*4882a593Smuzhiyun indirect_segments = xenbus_read_unsigned(info->xbdev->otherend,
2406*4882a593Smuzhiyun "feature-max-indirect-segments", 0);
2407*4882a593Smuzhiyun if (indirect_segments > xen_blkif_max_segments)
2408*4882a593Smuzhiyun indirect_segments = xen_blkif_max_segments;
2409*4882a593Smuzhiyun if (indirect_segments <= BLKIF_MAX_SEGMENTS_PER_REQUEST)
2410*4882a593Smuzhiyun indirect_segments = 0;
2411*4882a593Smuzhiyun info->max_indirect_segments = indirect_segments;
2412*4882a593Smuzhiyun
2413*4882a593Smuzhiyun if (info->feature_persistent) {
2414*4882a593Smuzhiyun mutex_lock(&blkfront_mutex);
2415*4882a593Smuzhiyun schedule_delayed_work(&blkfront_work, HZ * 10);
2416*4882a593Smuzhiyun mutex_unlock(&blkfront_mutex);
2417*4882a593Smuzhiyun }
2418*4882a593Smuzhiyun }
2419*4882a593Smuzhiyun
2420*4882a593Smuzhiyun /*
2421*4882a593Smuzhiyun * Invoked when the backend is finally 'ready' (and has provided
2422*4882a593Smuzhiyun * the details about the physical device - #sectors, size, etc).
2423*4882a593Smuzhiyun */
2424*4882a593Smuzhiyun static void blkfront_connect(struct blkfront_info *info)
2425*4882a593Smuzhiyun {
2426*4882a593Smuzhiyun unsigned long long sectors;
2427*4882a593Smuzhiyun unsigned long sector_size;
2428*4882a593Smuzhiyun unsigned int physical_sector_size;
2429*4882a593Smuzhiyun unsigned int binfo;
2430*4882a593Smuzhiyun int err, i;
2431*4882a593Smuzhiyun struct blkfront_ring_info *rinfo;
2432*4882a593Smuzhiyun
2433*4882a593Smuzhiyun switch (info->connected) {
2434*4882a593Smuzhiyun case BLKIF_STATE_CONNECTED:
2435*4882a593Smuzhiyun /*
2436*4882a593Smuzhiyun * Potentially, the back-end may be signalling
2437*4882a593Smuzhiyun * a capacity change; update the capacity.
2438*4882a593Smuzhiyun */
2439*4882a593Smuzhiyun err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
2440*4882a593Smuzhiyun "sectors", "%Lu", §ors);
2441*4882a593Smuzhiyun if (XENBUS_EXIST_ERR(err))
2442*4882a593Smuzhiyun return;
2443*4882a593Smuzhiyun printk(KERN_INFO "Setting capacity to %Lu\n",
2444*4882a593Smuzhiyun sectors);
2445*4882a593Smuzhiyun set_capacity_revalidate_and_notify(info->gd, sectors, true);
2446*4882a593Smuzhiyun
2447*4882a593Smuzhiyun return;
2448*4882a593Smuzhiyun case BLKIF_STATE_SUSPENDED:
2449*4882a593Smuzhiyun /*
2450*4882a593Smuzhiyun * If we are recovering from suspension, we need to wait
2451*4882a593Smuzhiyun * for the backend to announce its features before
2452*4882a593Smuzhiyun * reconnecting; at the very least we need to know whether
2453*4882a593Smuzhiyun * the backend supports indirect descriptors, and how many.
2454*4882a593Smuzhiyun */
2455*4882a593Smuzhiyun blkif_recover(info);
2456*4882a593Smuzhiyun return;
2457*4882a593Smuzhiyun
2458*4882a593Smuzhiyun default:
2459*4882a593Smuzhiyun break;
2460*4882a593Smuzhiyun }
2461*4882a593Smuzhiyun
2462*4882a593Smuzhiyun dev_dbg(&info->xbdev->dev, "%s:%s.\n",
2463*4882a593Smuzhiyun __func__, info->xbdev->otherend);
2464*4882a593Smuzhiyun
2465*4882a593Smuzhiyun err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
2466*4882a593Smuzhiyun "sectors", "%llu", §ors,
2467*4882a593Smuzhiyun "info", "%u", &binfo,
2468*4882a593Smuzhiyun "sector-size", "%lu", §or_size,
2469*4882a593Smuzhiyun NULL);
2470*4882a593Smuzhiyun if (err) {
2471*4882a593Smuzhiyun xenbus_dev_fatal(info->xbdev, err,
2472*4882a593Smuzhiyun "reading backend fields at %s",
2473*4882a593Smuzhiyun info->xbdev->otherend);
2474*4882a593Smuzhiyun return;
2475*4882a593Smuzhiyun }
2476*4882a593Smuzhiyun
2477*4882a593Smuzhiyun /*
2478*4882a593Smuzhiyun * physical-sector-size is a newer field, so old backends may not
2479*4882a593Smuzhiyun * provide this. Assume physical sector size to be the same as
2480*4882a593Smuzhiyun * sector_size in that case.
2481*4882a593Smuzhiyun */
2482*4882a593Smuzhiyun physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend,
2483*4882a593Smuzhiyun "physical-sector-size",
2484*4882a593Smuzhiyun sector_size);
2485*4882a593Smuzhiyun blkfront_gather_backend_features(info);
2486*4882a593Smuzhiyun for_each_rinfo(info, rinfo, i) {
2487*4882a593Smuzhiyun err = blkfront_setup_indirect(rinfo);
2488*4882a593Smuzhiyun if (err) {
2489*4882a593Smuzhiyun xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
2490*4882a593Smuzhiyun info->xbdev->otherend);
2491*4882a593Smuzhiyun blkif_free(info, 0);
2492*4882a593Smuzhiyun break;
2493*4882a593Smuzhiyun }
2494*4882a593Smuzhiyun }
2495*4882a593Smuzhiyun
2496*4882a593Smuzhiyun err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
2497*4882a593Smuzhiyun physical_sector_size);
2498*4882a593Smuzhiyun if (err) {
2499*4882a593Smuzhiyun xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
2500*4882a593Smuzhiyun info->xbdev->otherend);
2501*4882a593Smuzhiyun goto fail;
2502*4882a593Smuzhiyun }
2503*4882a593Smuzhiyun
2504*4882a593Smuzhiyun xenbus_switch_state(info->xbdev, XenbusStateConnected);
2505*4882a593Smuzhiyun
2506*4882a593Smuzhiyun /* Kick pending requests. */
2507*4882a593Smuzhiyun info->connected = BLKIF_STATE_CONNECTED;
2508*4882a593Smuzhiyun for_each_rinfo(info, rinfo, i)
2509*4882a593Smuzhiyun kick_pending_request_queues(rinfo);
2510*4882a593Smuzhiyun
2511*4882a593Smuzhiyun device_add_disk(&info->xbdev->dev, info->gd, NULL);
2512*4882a593Smuzhiyun
2513*4882a593Smuzhiyun info->is_ready = 1;
2514*4882a593Smuzhiyun return;
2515*4882a593Smuzhiyun
2516*4882a593Smuzhiyun fail:
2517*4882a593Smuzhiyun blkif_free(info, 0);
2518*4882a593Smuzhiyun return;
2519*4882a593Smuzhiyun }
2520*4882a593Smuzhiyun
2521*4882a593Smuzhiyun /**
2522*4882a593Smuzhiyun * Callback received when the backend's state changes.
2523*4882a593Smuzhiyun */
2524*4882a593Smuzhiyun static void blkback_changed(struct xenbus_device *dev,
2525*4882a593Smuzhiyun enum xenbus_state backend_state)
2526*4882a593Smuzhiyun {
2527*4882a593Smuzhiyun struct blkfront_info *info = dev_get_drvdata(&dev->dev);
2528*4882a593Smuzhiyun
2529*4882a593Smuzhiyun dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
2530*4882a593Smuzhiyun
2531*4882a593Smuzhiyun switch (backend_state) {
2532*4882a593Smuzhiyun case XenbusStateInitWait:
2533*4882a593Smuzhiyun if (dev->state != XenbusStateInitialising)
2534*4882a593Smuzhiyun break;
2535*4882a593Smuzhiyun if (talk_to_blkback(dev, info))
2536*4882a593Smuzhiyun break;
2537*4882a593Smuzhiyun case XenbusStateInitialising:
2538*4882a593Smuzhiyun case XenbusStateInitialised:
2539*4882a593Smuzhiyun case XenbusStateReconfiguring:
2540*4882a593Smuzhiyun case XenbusStateReconfigured:
2541*4882a593Smuzhiyun case XenbusStateUnknown:
2542*4882a593Smuzhiyun break;
2543*4882a593Smuzhiyun
2544*4882a593Smuzhiyun case XenbusStateConnected:
2545*4882a593Smuzhiyun /*
2546*4882a593Smuzhiyun * talk_to_blkback sets state to XenbusStateInitialised
2547*4882a593Smuzhiyun * and blkfront_connect sets it to XenbusStateConnected
2548*4882a593Smuzhiyun * (if connection went OK).
2549*4882a593Smuzhiyun *
2550*4882a593Smuzhiyun * If the backend (or toolstack) decides to poke at backend
2551*4882a593Smuzhiyun * state (and re-trigger the watch by setting the state repeatedly
2552*4882a593Smuzhiyun * to XenbusStateConnected (4)) we need to deal with this.
2553*4882a593Smuzhiyun * This is allowed, as it is how the backend communicates to the guest
2554*4882a593Smuzhiyun * that the size of the disk has changed!
2555*4882a593Smuzhiyun */
2556*4882a593Smuzhiyun if ((dev->state != XenbusStateInitialised) &&
2557*4882a593Smuzhiyun (dev->state != XenbusStateConnected)) {
2558*4882a593Smuzhiyun if (talk_to_blkback(dev, info))
2559*4882a593Smuzhiyun break;
2560*4882a593Smuzhiyun }
2561*4882a593Smuzhiyun
2562*4882a593Smuzhiyun blkfront_connect(info);
2563*4882a593Smuzhiyun break;
2564*4882a593Smuzhiyun
2565*4882a593Smuzhiyun case XenbusStateClosed:
2566*4882a593Smuzhiyun if (dev->state == XenbusStateClosed)
2567*4882a593Smuzhiyun break;
2568*4882a593Smuzhiyun fallthrough;
2569*4882a593Smuzhiyun case XenbusStateClosing:
2570*4882a593Smuzhiyun if (info)
2571*4882a593Smuzhiyun blkfront_closing(info);
2572*4882a593Smuzhiyun break;
2573*4882a593Smuzhiyun }
2574*4882a593Smuzhiyun }
2575*4882a593Smuzhiyun
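/*
 * The xenbus device is going away.  Tear down the ring and grant
 * resources, detach the blkfront_info from the xenbus device and, if
 * nobody holds the block device open any more, release the gendisk and
 * free the private data; otherwise blkif_release() does that on the
 * last close.
 */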
2576*4882a593Smuzhiyun static int blkfront_remove(struct xenbus_device *xbdev)
2577*4882a593Smuzhiyun {
2578*4882a593Smuzhiyun struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
2579*4882a593Smuzhiyun struct block_device *bdev = NULL;
2580*4882a593Smuzhiyun struct gendisk *disk;
2581*4882a593Smuzhiyun
2582*4882a593Smuzhiyun dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
2583*4882a593Smuzhiyun
2584*4882a593Smuzhiyun if (!info)
2585*4882a593Smuzhiyun return 0;
2586*4882a593Smuzhiyun
2587*4882a593Smuzhiyun blkif_free(info, 0);
2588*4882a593Smuzhiyun
2589*4882a593Smuzhiyun mutex_lock(&info->mutex);
2590*4882a593Smuzhiyun
2591*4882a593Smuzhiyun disk = info->gd;
2592*4882a593Smuzhiyun if (disk)
2593*4882a593Smuzhiyun bdev = bdget_disk(disk, 0);
2594*4882a593Smuzhiyun
2595*4882a593Smuzhiyun info->xbdev = NULL;
2596*4882a593Smuzhiyun mutex_unlock(&info->mutex);
2597*4882a593Smuzhiyun
2598*4882a593Smuzhiyun if (!bdev) {
2599*4882a593Smuzhiyun mutex_lock(&blkfront_mutex);
2600*4882a593Smuzhiyun free_info(info);
2601*4882a593Smuzhiyun mutex_unlock(&blkfront_mutex);
2602*4882a593Smuzhiyun return 0;
2603*4882a593Smuzhiyun }
2604*4882a593Smuzhiyun
2605*4882a593Smuzhiyun /*
2606*4882a593Smuzhiyun * The xbdev was removed before we reached the Closed
2607*4882a593Smuzhiyun * state. See if it's safe to remove the disk. If the bdev
2608*4882a593Smuzhiyun * isn't closed yet, we let release take care of it.
2609*4882a593Smuzhiyun */
2610*4882a593Smuzhiyun
2611*4882a593Smuzhiyun mutex_lock(&bdev->bd_mutex);
2612*4882a593Smuzhiyun info = disk->private_data;
2613*4882a593Smuzhiyun
2614*4882a593Smuzhiyun dev_warn(disk_to_dev(disk),
2615*4882a593Smuzhiyun "%s was hot-unplugged, %d stale handles\n",
2616*4882a593Smuzhiyun xbdev->nodename, bdev->bd_openers);
2617*4882a593Smuzhiyun
2618*4882a593Smuzhiyun if (info && !bdev->bd_openers) {
2619*4882a593Smuzhiyun xlvbd_release_gendisk(info);
2620*4882a593Smuzhiyun disk->private_data = NULL;
2621*4882a593Smuzhiyun mutex_lock(&blkfront_mutex);
2622*4882a593Smuzhiyun free_info(info);
2623*4882a593Smuzhiyun mutex_unlock(&blkfront_mutex);
2624*4882a593Smuzhiyun }
2625*4882a593Smuzhiyun
2626*4882a593Smuzhiyun mutex_unlock(&bdev->bd_mutex);
2627*4882a593Smuzhiyun bdput(bdev);
2628*4882a593Smuzhiyun
2629*4882a593Smuzhiyun return 0;
2630*4882a593Smuzhiyun }
2631*4882a593Smuzhiyun
2632*4882a593Smuzhiyun static int blkfront_is_ready(struct xenbus_device *dev)
2633*4882a593Smuzhiyun {
2634*4882a593Smuzhiyun struct blkfront_info *info = dev_get_drvdata(&dev->dev);
2635*4882a593Smuzhiyun
2636*4882a593Smuzhiyun return info->is_ready && info->xbdev;
2637*4882a593Smuzhiyun }
2638*4882a593Smuzhiyun
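/*
 * Open succeeds only while the xenbus device is still present and the
 * gendisk has not been torn down; otherwise fail with -ERESTARTSYS.
 */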
2639*4882a593Smuzhiyun static int blkif_open(struct block_device *bdev, fmode_t mode)
2640*4882a593Smuzhiyun {
2641*4882a593Smuzhiyun struct gendisk *disk = bdev->bd_disk;
2642*4882a593Smuzhiyun struct blkfront_info *info;
2643*4882a593Smuzhiyun int err = 0;
2644*4882a593Smuzhiyun
2645*4882a593Smuzhiyun mutex_lock(&blkfront_mutex);
2646*4882a593Smuzhiyun
2647*4882a593Smuzhiyun info = disk->private_data;
2648*4882a593Smuzhiyun if (!info) {
2649*4882a593Smuzhiyun /* xbdev gone */
2650*4882a593Smuzhiyun err = -ERESTARTSYS;
2651*4882a593Smuzhiyun goto out;
2652*4882a593Smuzhiyun }
2653*4882a593Smuzhiyun
2654*4882a593Smuzhiyun mutex_lock(&info->mutex);
2655*4882a593Smuzhiyun
2656*4882a593Smuzhiyun if (!info->gd)
2657*4882a593Smuzhiyun /* xbdev is closed */
2658*4882a593Smuzhiyun err = -ERESTARTSYS;
2659*4882a593Smuzhiyun
2660*4882a593Smuzhiyun mutex_unlock(&info->mutex);
2661*4882a593Smuzhiyun
2662*4882a593Smuzhiyun out:
2663*4882a593Smuzhiyun mutex_unlock(&blkfront_mutex);
2664*4882a593Smuzhiyun return err;
2665*4882a593Smuzhiyun }
2666*4882a593Smuzhiyun
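/*
 * Called on every release of the block device.  Once the last opener
 * is gone, honour a deferred close request from the backend, or finish
 * tearing down a device that was hot-unplugged while it was open.
 */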
2667*4882a593Smuzhiyun static void blkif_release(struct gendisk *disk, fmode_t mode)
2668*4882a593Smuzhiyun {
2669*4882a593Smuzhiyun struct blkfront_info *info = disk->private_data;
2670*4882a593Smuzhiyun struct block_device *bdev;
2671*4882a593Smuzhiyun struct xenbus_device *xbdev;
2672*4882a593Smuzhiyun
2673*4882a593Smuzhiyun mutex_lock(&blkfront_mutex);
2674*4882a593Smuzhiyun
2675*4882a593Smuzhiyun bdev = bdget_disk(disk, 0);
2676*4882a593Smuzhiyun
2677*4882a593Smuzhiyun if (!bdev) {
2678*4882a593Smuzhiyun WARN(1, "Block device %s yanked out from us!\n", disk->disk_name);
2679*4882a593Smuzhiyun goto out_mutex;
2680*4882a593Smuzhiyun }
2681*4882a593Smuzhiyun if (bdev->bd_openers)
2682*4882a593Smuzhiyun goto out;
2683*4882a593Smuzhiyun
2684*4882a593Smuzhiyun /*
2685*4882a593Smuzhiyun * Check if we have been instructed to close. We will have
2686*4882a593Smuzhiyun * deferred this request, because the bdev was still open.
2687*4882a593Smuzhiyun */
2688*4882a593Smuzhiyun
2689*4882a593Smuzhiyun mutex_lock(&info->mutex);
2690*4882a593Smuzhiyun xbdev = info->xbdev;
2691*4882a593Smuzhiyun
2692*4882a593Smuzhiyun if (xbdev && xbdev->state == XenbusStateClosing) {
2693*4882a593Smuzhiyun /* pending switch to state closed */
2694*4882a593Smuzhiyun dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
2695*4882a593Smuzhiyun xlvbd_release_gendisk(info);
2696*4882a593Smuzhiyun xenbus_frontend_closed(info->xbdev);
2697*4882a593Smuzhiyun }
2698*4882a593Smuzhiyun
2699*4882a593Smuzhiyun mutex_unlock(&info->mutex);
2700*4882a593Smuzhiyun
2701*4882a593Smuzhiyun if (!xbdev) {
2702*4882a593Smuzhiyun /* sudden device removal */
2703*4882a593Smuzhiyun dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
2704*4882a593Smuzhiyun xlvbd_release_gendisk(info);
2705*4882a593Smuzhiyun disk->private_data = NULL;
2706*4882a593Smuzhiyun free_info(info);
2707*4882a593Smuzhiyun }
2708*4882a593Smuzhiyun
2709*4882a593Smuzhiyun out:
2710*4882a593Smuzhiyun bdput(bdev);
2711*4882a593Smuzhiyun out_mutex:
2712*4882a593Smuzhiyun mutex_unlock(&blkfront_mutex);
2713*4882a593Smuzhiyun }
2714*4882a593Smuzhiyun
2715*4882a593Smuzhiyun static const struct block_device_operations xlvbd_block_fops =
2716*4882a593Smuzhiyun {
2717*4882a593Smuzhiyun .owner = THIS_MODULE,
2718*4882a593Smuzhiyun .open = blkif_open,
2719*4882a593Smuzhiyun .release = blkif_release,
2720*4882a593Smuzhiyun .getgeo = blkif_getgeo,
2721*4882a593Smuzhiyun .ioctl = blkif_ioctl,
2722*4882a593Smuzhiyun .compat_ioctl = blkdev_compat_ptr_ioctl,
2723*4882a593Smuzhiyun };
2724*4882a593Smuzhiyun
2725*4882a593Smuzhiyun
2726*4882a593Smuzhiyun static const struct xenbus_device_id blkfront_ids[] = {
2727*4882a593Smuzhiyun { "vbd" },
2728*4882a593Smuzhiyun { "" }
2729*4882a593Smuzhiyun };
2730*4882a593Smuzhiyun
2731*4882a593Smuzhiyun static struct xenbus_driver blkfront_driver = {
2732*4882a593Smuzhiyun .ids = blkfront_ids,
2733*4882a593Smuzhiyun .probe = blkfront_probe,
2734*4882a593Smuzhiyun .remove = blkfront_remove,
2735*4882a593Smuzhiyun .resume = blkfront_resume,
2736*4882a593Smuzhiyun .otherend_changed = blkback_changed,
2737*4882a593Smuzhiyun .is_ready = blkfront_is_ready,
2738*4882a593Smuzhiyun };
2739*4882a593Smuzhiyun
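/*
 * Revoke idle persistent grants on every ring: end foreign access where
 * the backend no longer has the page mapped, drop the entry from the
 * persistent count and mark its reference GRANT_INVALID_REF so a fresh
 * grant is set up on next use.  Grants still mapped by the backend are
 * skipped and retried on a later pass.
 */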
2740*4882a593Smuzhiyun static void purge_persistent_grants(struct blkfront_info *info)
2741*4882a593Smuzhiyun {
2742*4882a593Smuzhiyun unsigned int i;
2743*4882a593Smuzhiyun unsigned long flags;
2744*4882a593Smuzhiyun struct blkfront_ring_info *rinfo;
2745*4882a593Smuzhiyun
2746*4882a593Smuzhiyun for_each_rinfo(info, rinfo, i) {
2747*4882a593Smuzhiyun struct grant *gnt_list_entry, *tmp;
2748*4882a593Smuzhiyun
2749*4882a593Smuzhiyun spin_lock_irqsave(&rinfo->ring_lock, flags);
2750*4882a593Smuzhiyun
2751*4882a593Smuzhiyun if (rinfo->persistent_gnts_c == 0) {
2752*4882a593Smuzhiyun spin_unlock_irqrestore(&rinfo->ring_lock, flags);
2753*4882a593Smuzhiyun continue;
2754*4882a593Smuzhiyun }
2755*4882a593Smuzhiyun
2756*4882a593Smuzhiyun list_for_each_entry_safe(gnt_list_entry, tmp, &rinfo->grants,
2757*4882a593Smuzhiyun node) {
2758*4882a593Smuzhiyun if (gnt_list_entry->gref == GRANT_INVALID_REF ||
2759*4882a593Smuzhiyun !gnttab_try_end_foreign_access(gnt_list_entry->gref))
2760*4882a593Smuzhiyun continue;
2761*4882a593Smuzhiyun
2762*4882a593Smuzhiyun list_del(&gnt_list_entry->node);
2763*4882a593Smuzhiyun rinfo->persistent_gnts_c--;
2764*4882a593Smuzhiyun gnt_list_entry->gref = GRANT_INVALID_REF;
2765*4882a593Smuzhiyun list_add_tail(&gnt_list_entry->node, &rinfo->grants);
2766*4882a593Smuzhiyun }
2767*4882a593Smuzhiyun
2768*4882a593Smuzhiyun spin_unlock_irqrestore(&rinfo->ring_lock, flags);
2769*4882a593Smuzhiyun }
2770*4882a593Smuzhiyun }
2771*4882a593Smuzhiyun
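/*
 * Periodic housekeeping: purge idle persistent grants on every device
 * that uses them.  The work re-arms itself every 10 seconds for as
 * long as such a device exists.
 */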
2772*4882a593Smuzhiyun static void blkfront_delay_work(struct work_struct *work)
2773*4882a593Smuzhiyun {
2774*4882a593Smuzhiyun struct blkfront_info *info;
2775*4882a593Smuzhiyun bool need_schedule_work = false;
2776*4882a593Smuzhiyun
2777*4882a593Smuzhiyun /*
2778*4882a593Smuzhiyun * Note that when using bounce buffers but not persistent grants
2779*4882a593Smuzhiyun * there's no need to run blkfront_delay_work because grants are
2780*4882a593Smuzhiyun * revoked in blkif_completion or else an error is reported and the
2781*4882a593Smuzhiyun * connection is closed.
2782*4882a593Smuzhiyun */
2783*4882a593Smuzhiyun
2784*4882a593Smuzhiyun mutex_lock(&blkfront_mutex);
2785*4882a593Smuzhiyun
2786*4882a593Smuzhiyun list_for_each_entry(info, &info_list, info_list) {
2787*4882a593Smuzhiyun if (info->feature_persistent) {
2788*4882a593Smuzhiyun need_schedule_work = true;
2789*4882a593Smuzhiyun mutex_lock(&info->mutex);
2790*4882a593Smuzhiyun purge_persistent_grants(info);
2791*4882a593Smuzhiyun mutex_unlock(&info->mutex);
2792*4882a593Smuzhiyun }
2793*4882a593Smuzhiyun }
2794*4882a593Smuzhiyun
2795*4882a593Smuzhiyun if (need_schedule_work)
2796*4882a593Smuzhiyun schedule_delayed_work(&blkfront_work, HZ * 10);
2797*4882a593Smuzhiyun
2798*4882a593Smuzhiyun mutex_unlock(&blkfront_mutex);
2799*4882a593Smuzhiyun }
2800*4882a593Smuzhiyun
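/*
 * Module init: bail out unless running on Xen with PV disk devices,
 * grab the Xen virtual block device major, clamp the module parameters
 * (segments, ring order, queue count) to sane values and register the
 * xenbus frontend driver.
 */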
2801*4882a593Smuzhiyun static int __init xlblk_init(void)
2802*4882a593Smuzhiyun {
2803*4882a593Smuzhiyun int ret;
2804*4882a593Smuzhiyun int nr_cpus = num_online_cpus();
2805*4882a593Smuzhiyun
2806*4882a593Smuzhiyun if (!xen_domain())
2807*4882a593Smuzhiyun return -ENODEV;
2808*4882a593Smuzhiyun
2809*4882a593Smuzhiyun if (!xen_has_pv_disk_devices())
2810*4882a593Smuzhiyun return -ENODEV;
2811*4882a593Smuzhiyun
2812*4882a593Smuzhiyun if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
2813*4882a593Smuzhiyun pr_warn("xen_blk: can't get major %d with name %s\n",
2814*4882a593Smuzhiyun XENVBD_MAJOR, DEV_NAME);
2815*4882a593Smuzhiyun return -ENODEV;
2816*4882a593Smuzhiyun }
2817*4882a593Smuzhiyun
2818*4882a593Smuzhiyun if (xen_blkif_max_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST)
2819*4882a593Smuzhiyun xen_blkif_max_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
2820*4882a593Smuzhiyun
2821*4882a593Smuzhiyun if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
2822*4882a593Smuzhiyun pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
2823*4882a593Smuzhiyun xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
2824*4882a593Smuzhiyun xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
2825*4882a593Smuzhiyun }
2826*4882a593Smuzhiyun
2827*4882a593Smuzhiyun if (xen_blkif_max_queues > nr_cpus) {
2828*4882a593Smuzhiyun pr_info("Invalid max_queues (%d), will use default max: %d.\n",
2829*4882a593Smuzhiyun xen_blkif_max_queues, nr_cpus);
2830*4882a593Smuzhiyun xen_blkif_max_queues = nr_cpus;
2831*4882a593Smuzhiyun }
2832*4882a593Smuzhiyun
2833*4882a593Smuzhiyun INIT_DELAYED_WORK(&blkfront_work, blkfront_delay_work);
2834*4882a593Smuzhiyun
2835*4882a593Smuzhiyun ret = xenbus_register_frontend(&blkfront_driver);
2836*4882a593Smuzhiyun if (ret) {
2837*4882a593Smuzhiyun unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
2838*4882a593Smuzhiyun return ret;
2839*4882a593Smuzhiyun }
2840*4882a593Smuzhiyun
2841*4882a593Smuzhiyun return 0;
2842*4882a593Smuzhiyun }
2843*4882a593Smuzhiyun module_init(xlblk_init);
2844*4882a593Smuzhiyun
2845*4882a593Smuzhiyun
2846*4882a593Smuzhiyun static void __exit xlblk_exit(void)
2847*4882a593Smuzhiyun {
2848*4882a593Smuzhiyun cancel_delayed_work_sync(&blkfront_work);
2849*4882a593Smuzhiyun
2850*4882a593Smuzhiyun xenbus_unregister_driver(&blkfront_driver);
2851*4882a593Smuzhiyun unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
2852*4882a593Smuzhiyun kfree(minors);
2853*4882a593Smuzhiyun }
2854*4882a593Smuzhiyun module_exit(xlblk_exit);
2855*4882a593Smuzhiyun
2856*4882a593Smuzhiyun MODULE_DESCRIPTION("Xen virtual block device frontend");
2857*4882a593Smuzhiyun MODULE_LICENSE("GPL");
2858*4882a593Smuzhiyun MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
2859*4882a593Smuzhiyun MODULE_ALIAS("xen:vbd");
2860*4882a593Smuzhiyun MODULE_ALIAS("xenblk");
2861