xref: /OK3568_Linux_fs/kernel/drivers/block/nbd.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0-or-later
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun  * Network block device - make block devices work over TCP
4*4882a593Smuzhiyun  *
5*4882a593Smuzhiyun  * Note that you can not swap over this thing, yet. Seems to work but
6*4882a593Smuzhiyun  * deadlocks sometimes - you can not swap over TCP in general.
7*4882a593Smuzhiyun  *
8*4882a593Smuzhiyun  * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
9*4882a593Smuzhiyun  * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
10*4882a593Smuzhiyun  *
11*4882a593Smuzhiyun  * (part of code stolen from loop.c)
12*4882a593Smuzhiyun  */
13*4882a593Smuzhiyun 
14*4882a593Smuzhiyun #include <linux/major.h>
15*4882a593Smuzhiyun 
16*4882a593Smuzhiyun #include <linux/blkdev.h>
17*4882a593Smuzhiyun #include <linux/module.h>
18*4882a593Smuzhiyun #include <linux/init.h>
19*4882a593Smuzhiyun #include <linux/sched.h>
20*4882a593Smuzhiyun #include <linux/sched/mm.h>
21*4882a593Smuzhiyun #include <linux/fs.h>
22*4882a593Smuzhiyun #include <linux/bio.h>
23*4882a593Smuzhiyun #include <linux/stat.h>
24*4882a593Smuzhiyun #include <linux/errno.h>
25*4882a593Smuzhiyun #include <linux/file.h>
26*4882a593Smuzhiyun #include <linux/ioctl.h>
27*4882a593Smuzhiyun #include <linux/mutex.h>
28*4882a593Smuzhiyun #include <linux/compiler.h>
29*4882a593Smuzhiyun #include <linux/completion.h>
30*4882a593Smuzhiyun #include <linux/err.h>
31*4882a593Smuzhiyun #include <linux/kernel.h>
32*4882a593Smuzhiyun #include <linux/slab.h>
33*4882a593Smuzhiyun #include <net/sock.h>
34*4882a593Smuzhiyun #include <linux/net.h>
35*4882a593Smuzhiyun #include <linux/kthread.h>
36*4882a593Smuzhiyun #include <linux/types.h>
37*4882a593Smuzhiyun #include <linux/debugfs.h>
38*4882a593Smuzhiyun #include <linux/blk-mq.h>
39*4882a593Smuzhiyun 
40*4882a593Smuzhiyun #include <linux/uaccess.h>
41*4882a593Smuzhiyun #include <asm/types.h>
42*4882a593Smuzhiyun 
43*4882a593Smuzhiyun #include <linux/nbd.h>
44*4882a593Smuzhiyun #include <linux/nbd-netlink.h>
45*4882a593Smuzhiyun #include <net/genetlink.h>
46*4882a593Smuzhiyun 
47*4882a593Smuzhiyun #define CREATE_TRACE_POINTS
48*4882a593Smuzhiyun #include <trace/events/nbd.h>
49*4882a593Smuzhiyun 
50*4882a593Smuzhiyun static DEFINE_IDR(nbd_index_idr);
51*4882a593Smuzhiyun static DEFINE_MUTEX(nbd_index_mutex);
52*4882a593Smuzhiyun static int nbd_total_devices = 0;
53*4882a593Smuzhiyun 
54*4882a593Smuzhiyun struct nbd_sock {
55*4882a593Smuzhiyun 	struct socket *sock;
56*4882a593Smuzhiyun 	struct mutex tx_lock;
57*4882a593Smuzhiyun 	struct request *pending;
58*4882a593Smuzhiyun 	int sent;
59*4882a593Smuzhiyun 	bool dead;
60*4882a593Smuzhiyun 	int fallback_index;
61*4882a593Smuzhiyun 	int cookie;
62*4882a593Smuzhiyun };
63*4882a593Smuzhiyun 
64*4882a593Smuzhiyun struct recv_thread_args {
65*4882a593Smuzhiyun 	struct work_struct work;
66*4882a593Smuzhiyun 	struct nbd_device *nbd;
67*4882a593Smuzhiyun 	int index;
68*4882a593Smuzhiyun };
69*4882a593Smuzhiyun 
70*4882a593Smuzhiyun struct link_dead_args {
71*4882a593Smuzhiyun 	struct work_struct work;
72*4882a593Smuzhiyun 	int index;
73*4882a593Smuzhiyun };
74*4882a593Smuzhiyun 
75*4882a593Smuzhiyun #define NBD_RT_TIMEDOUT			0
76*4882a593Smuzhiyun #define NBD_RT_DISCONNECT_REQUESTED	1
77*4882a593Smuzhiyun #define NBD_RT_DISCONNECTED		2
78*4882a593Smuzhiyun #define NBD_RT_HAS_PID_FILE		3
79*4882a593Smuzhiyun #define NBD_RT_HAS_CONFIG_REF		4
80*4882a593Smuzhiyun #define NBD_RT_BOUND			5
81*4882a593Smuzhiyun #define NBD_RT_DISCONNECT_ON_CLOSE	6
82*4882a593Smuzhiyun 
83*4882a593Smuzhiyun #define NBD_DESTROY_ON_DISCONNECT	0
84*4882a593Smuzhiyun #define NBD_DISCONNECT_REQUESTED	1
85*4882a593Smuzhiyun 
86*4882a593Smuzhiyun struct nbd_config {
87*4882a593Smuzhiyun 	u32 flags;
88*4882a593Smuzhiyun 	unsigned long runtime_flags;
89*4882a593Smuzhiyun 	u64 dead_conn_timeout;
90*4882a593Smuzhiyun 
91*4882a593Smuzhiyun 	struct nbd_sock **socks;
92*4882a593Smuzhiyun 	int num_connections;
93*4882a593Smuzhiyun 	atomic_t live_connections;
94*4882a593Smuzhiyun 	wait_queue_head_t conn_wait;
95*4882a593Smuzhiyun 
96*4882a593Smuzhiyun 	atomic_t recv_threads;
97*4882a593Smuzhiyun 	wait_queue_head_t recv_wq;
98*4882a593Smuzhiyun 	loff_t blksize;
99*4882a593Smuzhiyun 	loff_t bytesize;
100*4882a593Smuzhiyun #if IS_ENABLED(CONFIG_DEBUG_FS)
101*4882a593Smuzhiyun 	struct dentry *dbg_dir;
102*4882a593Smuzhiyun #endif
103*4882a593Smuzhiyun };
104*4882a593Smuzhiyun 
105*4882a593Smuzhiyun struct nbd_device {
106*4882a593Smuzhiyun 	struct blk_mq_tag_set tag_set;
107*4882a593Smuzhiyun 
108*4882a593Smuzhiyun 	int index;
109*4882a593Smuzhiyun 	refcount_t config_refs;
110*4882a593Smuzhiyun 	refcount_t refs;
111*4882a593Smuzhiyun 	struct nbd_config *config;
112*4882a593Smuzhiyun 	struct mutex config_lock;
113*4882a593Smuzhiyun 	struct gendisk *disk;
114*4882a593Smuzhiyun 	struct workqueue_struct *recv_workq;
115*4882a593Smuzhiyun 
116*4882a593Smuzhiyun 	struct list_head list;
117*4882a593Smuzhiyun 	struct task_struct *task_recv;
118*4882a593Smuzhiyun 	struct task_struct *task_setup;
119*4882a593Smuzhiyun 
120*4882a593Smuzhiyun 	struct completion *destroy_complete;
121*4882a593Smuzhiyun 	unsigned long flags;
122*4882a593Smuzhiyun };
123*4882a593Smuzhiyun 
124*4882a593Smuzhiyun #define NBD_CMD_REQUEUED	1
125*4882a593Smuzhiyun 
126*4882a593Smuzhiyun struct nbd_cmd {
127*4882a593Smuzhiyun 	struct nbd_device *nbd;
128*4882a593Smuzhiyun 	struct mutex lock;
129*4882a593Smuzhiyun 	int index;
130*4882a593Smuzhiyun 	int cookie;
131*4882a593Smuzhiyun 	int retries;
132*4882a593Smuzhiyun 	blk_status_t status;
133*4882a593Smuzhiyun 	unsigned long flags;
134*4882a593Smuzhiyun 	u32 cmd_cookie;
135*4882a593Smuzhiyun };
136*4882a593Smuzhiyun 
137*4882a593Smuzhiyun #if IS_ENABLED(CONFIG_DEBUG_FS)
138*4882a593Smuzhiyun static struct dentry *nbd_dbg_dir;
139*4882a593Smuzhiyun #endif
140*4882a593Smuzhiyun 
141*4882a593Smuzhiyun #define nbd_name(nbd) ((nbd)->disk->disk_name)
142*4882a593Smuzhiyun 
143*4882a593Smuzhiyun #define NBD_MAGIC 0x68797548
144*4882a593Smuzhiyun 
145*4882a593Smuzhiyun #define NBD_DEF_BLKSIZE 1024
146*4882a593Smuzhiyun 
147*4882a593Smuzhiyun static unsigned int nbds_max = 16;
148*4882a593Smuzhiyun static int max_part = 16;
149*4882a593Smuzhiyun static int part_shift;
150*4882a593Smuzhiyun 
151*4882a593Smuzhiyun static int nbd_dev_dbg_init(struct nbd_device *nbd);
152*4882a593Smuzhiyun static void nbd_dev_dbg_close(struct nbd_device *nbd);
153*4882a593Smuzhiyun static void nbd_config_put(struct nbd_device *nbd);
154*4882a593Smuzhiyun static void nbd_connect_reply(struct genl_info *info, int index);
155*4882a593Smuzhiyun static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
156*4882a593Smuzhiyun static void nbd_dead_link_work(struct work_struct *work);
157*4882a593Smuzhiyun static void nbd_disconnect_and_put(struct nbd_device *nbd);
158*4882a593Smuzhiyun 
159*4882a593Smuzhiyun static inline struct device *nbd_to_dev(struct nbd_device *nbd)
160*4882a593Smuzhiyun {
161*4882a593Smuzhiyun 	return disk_to_dev(nbd->disk);
162*4882a593Smuzhiyun }
163*4882a593Smuzhiyun 
164*4882a593Smuzhiyun static void nbd_requeue_cmd(struct nbd_cmd *cmd)
165*4882a593Smuzhiyun {
166*4882a593Smuzhiyun 	struct request *req = blk_mq_rq_from_pdu(cmd);
167*4882a593Smuzhiyun 
168*4882a593Smuzhiyun 	if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
169*4882a593Smuzhiyun 		blk_mq_requeue_request(req, true);
170*4882a593Smuzhiyun }
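/*
 * NBD_CMD_REQUEUED acts as a one-shot latch: the first caller to set the
 * bit requeues the request, and any later caller (for example a racing
 * timeout) sees it already set and does nothing, so a command is never
 * put back on the dispatch list twice.  The bit is cleared again in
 * nbd_queue_rq() when the request is actually re-dispatched.
 */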
171*4882a593Smuzhiyun 
172*4882a593Smuzhiyun #define NBD_COOKIE_BITS 32
173*4882a593Smuzhiyun 
174*4882a593Smuzhiyun static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
175*4882a593Smuzhiyun {
176*4882a593Smuzhiyun 	struct request *req = blk_mq_rq_from_pdu(cmd);
177*4882a593Smuzhiyun 	u32 tag = blk_mq_unique_tag(req);
178*4882a593Smuzhiyun 	u64 cookie = cmd->cmd_cookie;
179*4882a593Smuzhiyun 
180*4882a593Smuzhiyun 	return (cookie << NBD_COOKIE_BITS) | tag;
181*4882a593Smuzhiyun }
182*4882a593Smuzhiyun 
183*4882a593Smuzhiyun static u32 nbd_handle_to_tag(u64 handle)
184*4882a593Smuzhiyun {
185*4882a593Smuzhiyun 	return (u32)handle;
186*4882a593Smuzhiyun }
187*4882a593Smuzhiyun 
188*4882a593Smuzhiyun static u32 nbd_handle_to_cookie(u64 handle)
189*4882a593Smuzhiyun {
190*4882a593Smuzhiyun 	return (u32)(handle >> NBD_COOKIE_BITS);
191*4882a593Smuzhiyun }
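/*
 * Layout of the 64-bit handle sent on the wire: the per-command
 * reconnection cookie sits in the upper NBD_COOKIE_BITS (32) bits and the
 * blk-mq unique tag in the lower 32 bits.  A minimal sketch with made-up
 * values (assuming the unique tag packs the hardware queue number in its
 * upper half, here queue 1, tag 5):
 *
 *	u64 handle = ((u64)3 << NBD_COOKIE_BITS) | 0x00010005;
 *	nbd_handle_to_tag(handle);	// 0x00010005
 *	nbd_handle_to_cookie(handle);	// 3
 *
 * A reply whose cookie no longer matches cmd->cmd_cookie is treated as
 * stale; see nbd_read_stat() below.
 */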
192*4882a593Smuzhiyun 
193*4882a593Smuzhiyun static const char *nbdcmd_to_ascii(int cmd)
194*4882a593Smuzhiyun {
195*4882a593Smuzhiyun 	switch (cmd) {
196*4882a593Smuzhiyun 	case  NBD_CMD_READ: return "read";
197*4882a593Smuzhiyun 	case NBD_CMD_WRITE: return "write";
198*4882a593Smuzhiyun 	case  NBD_CMD_DISC: return "disconnect";
199*4882a593Smuzhiyun 	case NBD_CMD_FLUSH: return "flush";
200*4882a593Smuzhiyun 	case  NBD_CMD_TRIM: return "trim/discard";
201*4882a593Smuzhiyun 	}
202*4882a593Smuzhiyun 	return "invalid";
203*4882a593Smuzhiyun }
204*4882a593Smuzhiyun 
205*4882a593Smuzhiyun static ssize_t pid_show(struct device *dev,
206*4882a593Smuzhiyun 			struct device_attribute *attr, char *buf)
207*4882a593Smuzhiyun {
208*4882a593Smuzhiyun 	struct gendisk *disk = dev_to_disk(dev);
209*4882a593Smuzhiyun 	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
210*4882a593Smuzhiyun 
211*4882a593Smuzhiyun 	return sprintf(buf, "%d\n", task_pid_nr(nbd->task_recv));
212*4882a593Smuzhiyun }
213*4882a593Smuzhiyun 
214*4882a593Smuzhiyun static const struct device_attribute pid_attr = {
215*4882a593Smuzhiyun 	.attr = { .name = "pid", .mode = 0444},
216*4882a593Smuzhiyun 	.show = pid_show,
217*4882a593Smuzhiyun };
218*4882a593Smuzhiyun 
219*4882a593Smuzhiyun static void nbd_dev_remove(struct nbd_device *nbd)
220*4882a593Smuzhiyun {
221*4882a593Smuzhiyun 	struct gendisk *disk = nbd->disk;
222*4882a593Smuzhiyun 	struct request_queue *q;
223*4882a593Smuzhiyun 
224*4882a593Smuzhiyun 	if (disk) {
225*4882a593Smuzhiyun 		q = disk->queue;
226*4882a593Smuzhiyun 		del_gendisk(disk);
227*4882a593Smuzhiyun 		blk_cleanup_queue(q);
228*4882a593Smuzhiyun 		blk_mq_free_tag_set(&nbd->tag_set);
229*4882a593Smuzhiyun 		disk->private_data = NULL;
230*4882a593Smuzhiyun 		put_disk(disk);
231*4882a593Smuzhiyun 	}
232*4882a593Smuzhiyun 
233*4882a593Smuzhiyun 	/*
234*4882a593Smuzhiyun 	 * Do this last, just before the nbd is freed, to make sure the
235*4882a593Smuzhiyun 	 * disk and its kobject are completely gone first, so that a
236*4882a593Smuzhiyun 	 * racing device creation cannot end up registering a duplicate
237*4882a593Smuzhiyun 	 * of the same disk.
238*4882a593Smuzhiyun 	 */
239*4882a593Smuzhiyun 	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) && nbd->destroy_complete)
240*4882a593Smuzhiyun 		complete(nbd->destroy_complete);
241*4882a593Smuzhiyun 
242*4882a593Smuzhiyun 	kfree(nbd);
243*4882a593Smuzhiyun }
244*4882a593Smuzhiyun 
245*4882a593Smuzhiyun static void nbd_put(struct nbd_device *nbd)
246*4882a593Smuzhiyun {
247*4882a593Smuzhiyun 	if (refcount_dec_and_mutex_lock(&nbd->refs,
248*4882a593Smuzhiyun 					&nbd_index_mutex)) {
249*4882a593Smuzhiyun 		idr_remove(&nbd_index_idr, nbd->index);
250*4882a593Smuzhiyun 		nbd_dev_remove(nbd);
251*4882a593Smuzhiyun 		mutex_unlock(&nbd_index_mutex);
252*4882a593Smuzhiyun 	}
253*4882a593Smuzhiyun }
254*4882a593Smuzhiyun 
255*4882a593Smuzhiyun static int nbd_disconnected(struct nbd_config *config)
256*4882a593Smuzhiyun {
257*4882a593Smuzhiyun 	return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) ||
258*4882a593Smuzhiyun 		test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
259*4882a593Smuzhiyun }
260*4882a593Smuzhiyun 
261*4882a593Smuzhiyun static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
262*4882a593Smuzhiyun 				int notify)
263*4882a593Smuzhiyun {
264*4882a593Smuzhiyun 	if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
265*4882a593Smuzhiyun 		struct link_dead_args *args;
266*4882a593Smuzhiyun 		args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
267*4882a593Smuzhiyun 		if (args) {
268*4882a593Smuzhiyun 			INIT_WORK(&args->work, nbd_dead_link_work);
269*4882a593Smuzhiyun 			args->index = nbd->index;
270*4882a593Smuzhiyun 			queue_work(system_wq, &args->work);
271*4882a593Smuzhiyun 		}
272*4882a593Smuzhiyun 	}
273*4882a593Smuzhiyun 	if (!nsock->dead) {
274*4882a593Smuzhiyun 		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
275*4882a593Smuzhiyun 		if (atomic_dec_return(&nbd->config->live_connections) == 0) {
276*4882a593Smuzhiyun 			if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED,
277*4882a593Smuzhiyun 					       &nbd->config->runtime_flags)) {
278*4882a593Smuzhiyun 				set_bit(NBD_RT_DISCONNECTED,
279*4882a593Smuzhiyun 					&nbd->config->runtime_flags);
280*4882a593Smuzhiyun 				dev_info(nbd_to_dev(nbd),
281*4882a593Smuzhiyun 					"Disconnected due to user request.\n");
282*4882a593Smuzhiyun 			}
283*4882a593Smuzhiyun 		}
284*4882a593Smuzhiyun 	}
285*4882a593Smuzhiyun 	nsock->dead = true;
286*4882a593Smuzhiyun 	nsock->pending = NULL;
287*4882a593Smuzhiyun 	nsock->sent = 0;
288*4882a593Smuzhiyun }
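/*
 * To summarize the bookkeeping above: the socket is shut down exactly
 * once, any partially sent request state is discarded, and when the last
 * live connection drops after a user-requested disconnect the whole
 * device flips to NBD_RT_DISCONNECTED.  With @notify set, a work item is
 * queued so the dead link can be reported asynchronously via
 * nbd_dead_link_work() (defined later in this file) instead of under the
 * caller's tx_lock.
 */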
289*4882a593Smuzhiyun 
290*4882a593Smuzhiyun static void nbd_size_clear(struct nbd_device *nbd)
291*4882a593Smuzhiyun {
292*4882a593Smuzhiyun 	if (nbd->config->bytesize) {
293*4882a593Smuzhiyun 		set_capacity(nbd->disk, 0);
294*4882a593Smuzhiyun 		kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
295*4882a593Smuzhiyun 	}
296*4882a593Smuzhiyun }
297*4882a593Smuzhiyun 
298*4882a593Smuzhiyun static void nbd_size_update(struct nbd_device *nbd, bool start)
299*4882a593Smuzhiyun {
300*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
301*4882a593Smuzhiyun 	struct block_device *bdev = bdget_disk(nbd->disk, 0);
302*4882a593Smuzhiyun 	sector_t nr_sectors = config->bytesize >> 9;
303*4882a593Smuzhiyun 
304*4882a593Smuzhiyun 	if (config->flags & NBD_FLAG_SEND_TRIM) {
305*4882a593Smuzhiyun 		nbd->disk->queue->limits.discard_granularity = config->blksize;
306*4882a593Smuzhiyun 		nbd->disk->queue->limits.discard_alignment = config->blksize;
307*4882a593Smuzhiyun 		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
308*4882a593Smuzhiyun 	}
309*4882a593Smuzhiyun 	blk_queue_logical_block_size(nbd->disk->queue, config->blksize);
310*4882a593Smuzhiyun 	blk_queue_physical_block_size(nbd->disk->queue, config->blksize);
311*4882a593Smuzhiyun 	set_capacity(nbd->disk, nr_sectors);
312*4882a593Smuzhiyun 	if (bdev) {
313*4882a593Smuzhiyun 		if (bdev->bd_disk) {
314*4882a593Smuzhiyun 			bd_set_nr_sectors(bdev, nr_sectors);
315*4882a593Smuzhiyun 			if (start)
316*4882a593Smuzhiyun 				set_blocksize(bdev, config->blksize);
317*4882a593Smuzhiyun 		} else
318*4882a593Smuzhiyun 			set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
319*4882a593Smuzhiyun 		bdput(bdev);
320*4882a593Smuzhiyun 	}
321*4882a593Smuzhiyun 	kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
322*4882a593Smuzhiyun }
323*4882a593Smuzhiyun 
324*4882a593Smuzhiyun static void nbd_size_set(struct nbd_device *nbd, loff_t blocksize,
325*4882a593Smuzhiyun 			 loff_t nr_blocks)
326*4882a593Smuzhiyun {
327*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
328*4882a593Smuzhiyun 	config->blksize = blocksize;
329*4882a593Smuzhiyun 	config->bytesize = blocksize * nr_blocks;
330*4882a593Smuzhiyun 	if (nbd->task_recv != NULL)
331*4882a593Smuzhiyun 		nbd_size_update(nbd, false);
332*4882a593Smuzhiyun }
333*4882a593Smuzhiyun 
334*4882a593Smuzhiyun static void nbd_complete_rq(struct request *req)
335*4882a593Smuzhiyun {
336*4882a593Smuzhiyun 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
337*4882a593Smuzhiyun 
338*4882a593Smuzhiyun 	dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
339*4882a593Smuzhiyun 		cmd->status ? "failed" : "done");
340*4882a593Smuzhiyun 
341*4882a593Smuzhiyun 	blk_mq_end_request(req, cmd->status);
342*4882a593Smuzhiyun }
343*4882a593Smuzhiyun 
344*4882a593Smuzhiyun /*
345*4882a593Smuzhiyun  * Forcibly shut down the socket, causing all listeners to error out
346*4882a593Smuzhiyun  */
347*4882a593Smuzhiyun static void sock_shutdown(struct nbd_device *nbd)
348*4882a593Smuzhiyun {
349*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
350*4882a593Smuzhiyun 	int i;
351*4882a593Smuzhiyun 
352*4882a593Smuzhiyun 	if (config->num_connections == 0)
353*4882a593Smuzhiyun 		return;
354*4882a593Smuzhiyun 	if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
355*4882a593Smuzhiyun 		return;
356*4882a593Smuzhiyun 
357*4882a593Smuzhiyun 	for (i = 0; i < config->num_connections; i++) {
358*4882a593Smuzhiyun 		struct nbd_sock *nsock = config->socks[i];
359*4882a593Smuzhiyun 		mutex_lock(&nsock->tx_lock);
360*4882a593Smuzhiyun 		nbd_mark_nsock_dead(nbd, nsock, 0);
361*4882a593Smuzhiyun 		mutex_unlock(&nsock->tx_lock);
362*4882a593Smuzhiyun 	}
363*4882a593Smuzhiyun 	dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
364*4882a593Smuzhiyun }
365*4882a593Smuzhiyun 
366*4882a593Smuzhiyun static u32 req_to_nbd_cmd_type(struct request *req)
367*4882a593Smuzhiyun {
368*4882a593Smuzhiyun 	switch (req_op(req)) {
369*4882a593Smuzhiyun 	case REQ_OP_DISCARD:
370*4882a593Smuzhiyun 		return NBD_CMD_TRIM;
371*4882a593Smuzhiyun 	case REQ_OP_FLUSH:
372*4882a593Smuzhiyun 		return NBD_CMD_FLUSH;
373*4882a593Smuzhiyun 	case REQ_OP_WRITE:
374*4882a593Smuzhiyun 		return NBD_CMD_WRITE;
375*4882a593Smuzhiyun 	case REQ_OP_READ:
376*4882a593Smuzhiyun 		return NBD_CMD_READ;
377*4882a593Smuzhiyun 	default:
378*4882a593Smuzhiyun 		return U32_MAX;
379*4882a593Smuzhiyun 	}
380*4882a593Smuzhiyun }
381*4882a593Smuzhiyun 
382*4882a593Smuzhiyun static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req,
383*4882a593Smuzhiyun 						 bool reserved)
384*4882a593Smuzhiyun {
385*4882a593Smuzhiyun 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
386*4882a593Smuzhiyun 	struct nbd_device *nbd = cmd->nbd;
387*4882a593Smuzhiyun 	struct nbd_config *config;
388*4882a593Smuzhiyun 
389*4882a593Smuzhiyun 	if (!mutex_trylock(&cmd->lock))
390*4882a593Smuzhiyun 		return BLK_EH_RESET_TIMER;
391*4882a593Smuzhiyun 
392*4882a593Smuzhiyun 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
393*4882a593Smuzhiyun 		cmd->status = BLK_STS_TIMEOUT;
394*4882a593Smuzhiyun 		mutex_unlock(&cmd->lock);
395*4882a593Smuzhiyun 		goto done;
396*4882a593Smuzhiyun 	}
397*4882a593Smuzhiyun 	config = nbd->config;
398*4882a593Smuzhiyun 
399*4882a593Smuzhiyun 	if (config->num_connections > 1 ||
400*4882a593Smuzhiyun 	    (config->num_connections == 1 && nbd->tag_set.timeout)) {
401*4882a593Smuzhiyun 		dev_err_ratelimited(nbd_to_dev(nbd),
402*4882a593Smuzhiyun 				    "Connection timed out, retrying (%d/%d alive)\n",
403*4882a593Smuzhiyun 				    atomic_read(&config->live_connections),
404*4882a593Smuzhiyun 				    config->num_connections);
405*4882a593Smuzhiyun 		/*
406*4882a593Smuzhiyun 		 * Hooray, we have more connections: requeue this IO and the
407*4882a593Smuzhiyun 		 * submit path will put it on a live connection. If only one
408*4882a593Smuzhiyun 		 * connection is configured, the submit path will wait until a
409*4882a593Smuzhiyun 		 * new connection is configured or the dead connection timeout expires.
410*4882a593Smuzhiyun 		 */
411*4882a593Smuzhiyun 		if (config->socks) {
412*4882a593Smuzhiyun 			if (cmd->index < config->num_connections) {
413*4882a593Smuzhiyun 				struct nbd_sock *nsock =
414*4882a593Smuzhiyun 					config->socks[cmd->index];
415*4882a593Smuzhiyun 				mutex_lock(&nsock->tx_lock);
416*4882a593Smuzhiyun 				/* We can have multiple outstanding requests, so
417*4882a593Smuzhiyun 				 * we don't want to mark the nsock dead if we've
418*4882a593Smuzhiyun 				 * already reconnected with a new socket, so
419*4882a593Smuzhiyun 				 * only mark it dead if it's the same socket this
420*4882a593Smuzhiyun 				 * request was sent out on.
421*4882a593Smuzhiyun 				 */
422*4882a593Smuzhiyun 				if (cmd->cookie == nsock->cookie)
423*4882a593Smuzhiyun 					nbd_mark_nsock_dead(nbd, nsock, 1);
424*4882a593Smuzhiyun 				mutex_unlock(&nsock->tx_lock);
425*4882a593Smuzhiyun 			}
426*4882a593Smuzhiyun 			mutex_unlock(&cmd->lock);
427*4882a593Smuzhiyun 			nbd_requeue_cmd(cmd);
428*4882a593Smuzhiyun 			nbd_config_put(nbd);
429*4882a593Smuzhiyun 			return BLK_EH_DONE;
430*4882a593Smuzhiyun 		}
431*4882a593Smuzhiyun 	}
432*4882a593Smuzhiyun 
433*4882a593Smuzhiyun 	if (!nbd->tag_set.timeout) {
434*4882a593Smuzhiyun 		/*
435*4882a593Smuzhiyun 		 * Userspace sets timeout=0 to disable socket disconnection,
436*4882a593Smuzhiyun 		 * so just warn and reset the timer.
437*4882a593Smuzhiyun 		 */
438*4882a593Smuzhiyun 		struct nbd_sock *nsock = config->socks[cmd->index];
439*4882a593Smuzhiyun 		cmd->retries++;
440*4882a593Smuzhiyun 		dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
441*4882a593Smuzhiyun 			req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
442*4882a593Smuzhiyun 			(unsigned long long)blk_rq_pos(req) << 9,
443*4882a593Smuzhiyun 			blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);
444*4882a593Smuzhiyun 
445*4882a593Smuzhiyun 		mutex_lock(&nsock->tx_lock);
446*4882a593Smuzhiyun 		if (cmd->cookie != nsock->cookie) {
447*4882a593Smuzhiyun 			nbd_requeue_cmd(cmd);
448*4882a593Smuzhiyun 			mutex_unlock(&nsock->tx_lock);
449*4882a593Smuzhiyun 			mutex_unlock(&cmd->lock);
450*4882a593Smuzhiyun 			nbd_config_put(nbd);
451*4882a593Smuzhiyun 			return BLK_EH_DONE;
452*4882a593Smuzhiyun 		}
453*4882a593Smuzhiyun 		mutex_unlock(&nsock->tx_lock);
454*4882a593Smuzhiyun 		mutex_unlock(&cmd->lock);
455*4882a593Smuzhiyun 		nbd_config_put(nbd);
456*4882a593Smuzhiyun 		return BLK_EH_RESET_TIMER;
457*4882a593Smuzhiyun 	}
458*4882a593Smuzhiyun 
459*4882a593Smuzhiyun 	dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
460*4882a593Smuzhiyun 	set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
461*4882a593Smuzhiyun 	cmd->status = BLK_STS_IOERR;
462*4882a593Smuzhiyun 	mutex_unlock(&cmd->lock);
463*4882a593Smuzhiyun 	sock_shutdown(nbd);
464*4882a593Smuzhiyun 	nbd_config_put(nbd);
465*4882a593Smuzhiyun done:
466*4882a593Smuzhiyun 	blk_mq_complete_request(req);
467*4882a593Smuzhiyun 	return BLK_EH_DONE;
468*4882a593Smuzhiyun }
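/*
 * The timeout handler above has three possible outcomes:
 *
 *	1. Other usable connections exist (or a single connection with a
 *	   timeout configured): mark the socket this command went out on
 *	   dead, requeue the command and let the submit path pick a
 *	   fallback -> BLK_EH_DONE.
 *	2. tag_set.timeout == 0, i.e. userspace disabled timeouts: warn
 *	   about the possibly stuck request and keep waiting
 *	   -> BLK_EH_RESET_TIMER (or requeue if the socket was replaced
 *	   in the meantime).
 *	3. Otherwise the device is declared timed out: the command fails
 *	   with BLK_STS_IOERR and every socket is shut down.
 */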
469*4882a593Smuzhiyun 
470*4882a593Smuzhiyun /*
471*4882a593Smuzhiyun  *  Send or receive packet.
472*4882a593Smuzhiyun  */
473*4882a593Smuzhiyun static int sock_xmit(struct nbd_device *nbd, int index, int send,
474*4882a593Smuzhiyun 		     struct iov_iter *iter, int msg_flags, int *sent)
475*4882a593Smuzhiyun {
476*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
477*4882a593Smuzhiyun 	struct socket *sock = config->socks[index]->sock;
478*4882a593Smuzhiyun 	int result;
479*4882a593Smuzhiyun 	struct msghdr msg;
480*4882a593Smuzhiyun 	unsigned int noreclaim_flag;
481*4882a593Smuzhiyun 
482*4882a593Smuzhiyun 	if (unlikely(!sock)) {
483*4882a593Smuzhiyun 		dev_err_ratelimited(disk_to_dev(nbd->disk),
484*4882a593Smuzhiyun 			"Attempted %s on closed socket in sock_xmit\n",
485*4882a593Smuzhiyun 			(send ? "send" : "recv"));
486*4882a593Smuzhiyun 		return -EINVAL;
487*4882a593Smuzhiyun 	}
488*4882a593Smuzhiyun 
489*4882a593Smuzhiyun 	msg.msg_iter = *iter;
490*4882a593Smuzhiyun 
491*4882a593Smuzhiyun 	noreclaim_flag = memalloc_noreclaim_save();
492*4882a593Smuzhiyun 	do {
493*4882a593Smuzhiyun 		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
494*4882a593Smuzhiyun 		msg.msg_name = NULL;
495*4882a593Smuzhiyun 		msg.msg_namelen = 0;
496*4882a593Smuzhiyun 		msg.msg_control = NULL;
497*4882a593Smuzhiyun 		msg.msg_controllen = 0;
498*4882a593Smuzhiyun 		msg.msg_flags = msg_flags | MSG_NOSIGNAL;
499*4882a593Smuzhiyun 
500*4882a593Smuzhiyun 		if (send)
501*4882a593Smuzhiyun 			result = sock_sendmsg(sock, &msg);
502*4882a593Smuzhiyun 		else
503*4882a593Smuzhiyun 			result = sock_recvmsg(sock, &msg, msg.msg_flags);
504*4882a593Smuzhiyun 
505*4882a593Smuzhiyun 		if (result <= 0) {
506*4882a593Smuzhiyun 			if (result == 0)
507*4882a593Smuzhiyun 				result = -EPIPE; /* short read */
508*4882a593Smuzhiyun 			break;
509*4882a593Smuzhiyun 		}
510*4882a593Smuzhiyun 		if (sent)
511*4882a593Smuzhiyun 			*sent += result;
512*4882a593Smuzhiyun 	} while (msg_data_left(&msg));
513*4882a593Smuzhiyun 
514*4882a593Smuzhiyun 	memalloc_noreclaim_restore(noreclaim_flag);
515*4882a593Smuzhiyun 
516*4882a593Smuzhiyun 	return result;
517*4882a593Smuzhiyun }
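/*
 * sock_xmit() can run on behalf of memory reclaim (writeback to an nbd
 * device), so it transmits under memalloc_noreclaim_save() and forces
 * GFP_NOIO | __GFP_MEMALLOC socket allocations; otherwise an allocation
 * inside the network stack could recurse into reclaim and deadlock on
 * the very I/O being transmitted.  Callers build an iov_iter first; a
 * minimal sketch mirroring nbd_send_cmd() below:
 *
 *	struct kvec iov = { .iov_base = &request, .iov_len = sizeof(request) };
 *	struct iov_iter from;
 *
 *	iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
 *	result = sock_xmit(nbd, index, 1, &from, MSG_MORE, &sent);
 */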
518*4882a593Smuzhiyun 
519*4882a593Smuzhiyun /*
520*4882a593Smuzhiyun  * Different settings for sk->sk_sndtimeo can result in different return values
521*4882a593Smuzhiyun  * if there is a signal pending when we enter sendmsg, because reasons?
522*4882a593Smuzhiyun  */
523*4882a593Smuzhiyun static inline int was_interrupted(int result)
524*4882a593Smuzhiyun {
525*4882a593Smuzhiyun 	return result == -ERESTARTSYS || result == -EINTR;
526*4882a593Smuzhiyun }
527*4882a593Smuzhiyun 
528*4882a593Smuzhiyun /* always call with the tx_lock held */
529*4882a593Smuzhiyun static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
530*4882a593Smuzhiyun {
531*4882a593Smuzhiyun 	struct request *req = blk_mq_rq_from_pdu(cmd);
532*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
533*4882a593Smuzhiyun 	struct nbd_sock *nsock = config->socks[index];
534*4882a593Smuzhiyun 	int result;
535*4882a593Smuzhiyun 	struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
536*4882a593Smuzhiyun 	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
537*4882a593Smuzhiyun 	struct iov_iter from;
538*4882a593Smuzhiyun 	unsigned long size = blk_rq_bytes(req);
539*4882a593Smuzhiyun 	struct bio *bio;
540*4882a593Smuzhiyun 	u64 handle;
541*4882a593Smuzhiyun 	u32 type;
542*4882a593Smuzhiyun 	u32 nbd_cmd_flags = 0;
543*4882a593Smuzhiyun 	int sent = nsock->sent, skip = 0;
544*4882a593Smuzhiyun 
545*4882a593Smuzhiyun 	iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
546*4882a593Smuzhiyun 
547*4882a593Smuzhiyun 	type = req_to_nbd_cmd_type(req);
548*4882a593Smuzhiyun 	if (type == U32_MAX)
549*4882a593Smuzhiyun 		return -EIO;
550*4882a593Smuzhiyun 
551*4882a593Smuzhiyun 	if (rq_data_dir(req) == WRITE &&
552*4882a593Smuzhiyun 	    (config->flags & NBD_FLAG_READ_ONLY)) {
553*4882a593Smuzhiyun 		dev_err_ratelimited(disk_to_dev(nbd->disk),
554*4882a593Smuzhiyun 				    "Write on read-only\n");
555*4882a593Smuzhiyun 		return -EIO;
556*4882a593Smuzhiyun 	}
557*4882a593Smuzhiyun 
558*4882a593Smuzhiyun 	if (req->cmd_flags & REQ_FUA)
559*4882a593Smuzhiyun 		nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
560*4882a593Smuzhiyun 
561*4882a593Smuzhiyun 	/* We did a partial send previously. If at least the whole request
562*4882a593Smuzhiyun 	 * struct went out, just send the remaining pages of the request;
563*4882a593Smuzhiyun 	 * otherwise resume inside the header.
564*4882a593Smuzhiyun 	 */
565*4882a593Smuzhiyun 	if (sent) {
566*4882a593Smuzhiyun 		if (sent >= sizeof(request)) {
567*4882a593Smuzhiyun 			skip = sent - sizeof(request);
568*4882a593Smuzhiyun 
569*4882a593Smuzhiyun 			/* initialize handle for tracing purposes */
570*4882a593Smuzhiyun 			handle = nbd_cmd_handle(cmd);
571*4882a593Smuzhiyun 
572*4882a593Smuzhiyun 			goto send_pages;
573*4882a593Smuzhiyun 		}
574*4882a593Smuzhiyun 		iov_iter_advance(&from, sent);
575*4882a593Smuzhiyun 	} else {
576*4882a593Smuzhiyun 		cmd->cmd_cookie++;
577*4882a593Smuzhiyun 	}
578*4882a593Smuzhiyun 	cmd->index = index;
579*4882a593Smuzhiyun 	cmd->cookie = nsock->cookie;
580*4882a593Smuzhiyun 	cmd->retries = 0;
581*4882a593Smuzhiyun 	request.type = htonl(type | nbd_cmd_flags);
582*4882a593Smuzhiyun 	if (type != NBD_CMD_FLUSH) {
583*4882a593Smuzhiyun 		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
584*4882a593Smuzhiyun 		request.len = htonl(size);
585*4882a593Smuzhiyun 	}
586*4882a593Smuzhiyun 	handle = nbd_cmd_handle(cmd);
587*4882a593Smuzhiyun 	memcpy(request.handle, &handle, sizeof(handle));
588*4882a593Smuzhiyun 
589*4882a593Smuzhiyun 	trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));
590*4882a593Smuzhiyun 
591*4882a593Smuzhiyun 	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
592*4882a593Smuzhiyun 		req, nbdcmd_to_ascii(type),
593*4882a593Smuzhiyun 		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
594*4882a593Smuzhiyun 	result = sock_xmit(nbd, index, 1, &from,
595*4882a593Smuzhiyun 			(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
596*4882a593Smuzhiyun 	trace_nbd_header_sent(req, handle);
597*4882a593Smuzhiyun 	if (result <= 0) {
598*4882a593Smuzhiyun 		if (was_interrupted(result)) {
599*4882a593Smuzhiyun 			/* If we haven't sent anything we can just return BUSY,
600*4882a593Smuzhiyun 			 * however if we have sent something we need to make
601*4882a593Smuzhiyun 			 * sure we only allow this req to be sent until we are
602*4882a593Smuzhiyun 			 * completely done.
603*4882a593Smuzhiyun 			 */
604*4882a593Smuzhiyun 			if (sent) {
605*4882a593Smuzhiyun 				nsock->pending = req;
606*4882a593Smuzhiyun 				nsock->sent = sent;
607*4882a593Smuzhiyun 			}
608*4882a593Smuzhiyun 			set_bit(NBD_CMD_REQUEUED, &cmd->flags);
609*4882a593Smuzhiyun 			return BLK_STS_RESOURCE;
610*4882a593Smuzhiyun 		}
611*4882a593Smuzhiyun 		dev_err_ratelimited(disk_to_dev(nbd->disk),
612*4882a593Smuzhiyun 			"Send control failed (result %d)\n", result);
613*4882a593Smuzhiyun 		return -EAGAIN;
614*4882a593Smuzhiyun 	}
615*4882a593Smuzhiyun send_pages:
616*4882a593Smuzhiyun 	if (type != NBD_CMD_WRITE)
617*4882a593Smuzhiyun 		goto out;
618*4882a593Smuzhiyun 
619*4882a593Smuzhiyun 	bio = req->bio;
620*4882a593Smuzhiyun 	while (bio) {
621*4882a593Smuzhiyun 		struct bio *next = bio->bi_next;
622*4882a593Smuzhiyun 		struct bvec_iter iter;
623*4882a593Smuzhiyun 		struct bio_vec bvec;
624*4882a593Smuzhiyun 
625*4882a593Smuzhiyun 		bio_for_each_segment(bvec, bio, iter) {
626*4882a593Smuzhiyun 			bool is_last = !next && bio_iter_last(bvec, iter);
627*4882a593Smuzhiyun 			int flags = is_last ? 0 : MSG_MORE;
628*4882a593Smuzhiyun 
629*4882a593Smuzhiyun 			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
630*4882a593Smuzhiyun 				req, bvec.bv_len);
631*4882a593Smuzhiyun 			iov_iter_bvec(&from, WRITE, &bvec, 1, bvec.bv_len);
632*4882a593Smuzhiyun 			if (skip) {
633*4882a593Smuzhiyun 				if (skip >= iov_iter_count(&from)) {
634*4882a593Smuzhiyun 					skip -= iov_iter_count(&from);
635*4882a593Smuzhiyun 					continue;
636*4882a593Smuzhiyun 				}
637*4882a593Smuzhiyun 				iov_iter_advance(&from, skip);
638*4882a593Smuzhiyun 				skip = 0;
639*4882a593Smuzhiyun 			}
640*4882a593Smuzhiyun 			result = sock_xmit(nbd, index, 1, &from, flags, &sent);
641*4882a593Smuzhiyun 			if (result <= 0) {
642*4882a593Smuzhiyun 				if (was_interrupted(result)) {
643*4882a593Smuzhiyun 					/* We've already sent the header, we
644*4882a593Smuzhiyun 					 * have no choice but to set pending and
645*4882a593Smuzhiyun 					 * return BUSY.
646*4882a593Smuzhiyun 					 */
647*4882a593Smuzhiyun 					nsock->pending = req;
648*4882a593Smuzhiyun 					nsock->sent = sent;
649*4882a593Smuzhiyun 					set_bit(NBD_CMD_REQUEUED, &cmd->flags);
650*4882a593Smuzhiyun 					return BLK_STS_RESOURCE;
651*4882a593Smuzhiyun 				}
652*4882a593Smuzhiyun 				dev_err(disk_to_dev(nbd->disk),
653*4882a593Smuzhiyun 					"Send data failed (result %d)\n",
654*4882a593Smuzhiyun 					result);
655*4882a593Smuzhiyun 				return -EAGAIN;
656*4882a593Smuzhiyun 			}
657*4882a593Smuzhiyun 			/*
658*4882a593Smuzhiyun 			 * The completion might already have come in,
659*4882a593Smuzhiyun 			 * so break for the last one instead of letting
660*4882a593Smuzhiyun 			 * the iterator do it. This prevents use-after-free
661*4882a593Smuzhiyun 			 * of the bio.
662*4882a593Smuzhiyun 			 */
663*4882a593Smuzhiyun 			if (is_last)
664*4882a593Smuzhiyun 				break;
665*4882a593Smuzhiyun 		}
666*4882a593Smuzhiyun 		bio = next;
667*4882a593Smuzhiyun 	}
668*4882a593Smuzhiyun out:
669*4882a593Smuzhiyun 	trace_nbd_payload_sent(req, handle);
670*4882a593Smuzhiyun 	nsock->pending = NULL;
671*4882a593Smuzhiyun 	nsock->sent = 0;
672*4882a593Smuzhiyun 	return 0;
673*4882a593Smuzhiyun }
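/*
 * On the wire each command starts with a fixed struct nbd_request (see
 * <linux/nbd.h>): a 32-bit magic, a 32-bit type, the 64-bit handle, a
 * 64-bit byte offset and a 32-bit length, all big-endian, followed by
 * the payload for writes.  Partial sends are resumed via nsock->sent:
 * once the header has gone out in full, the next attempt skips straight
 * into the payload (skip = sent - sizeof(request)), and cmd->cmd_cookie
 * is only bumped for a fresh send so the retransmitted bytes still
 * belong to the same handle.
 */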
674*4882a593Smuzhiyun 
675*4882a593Smuzhiyun /* NULL returned = something went wrong, inform userspace */
676*4882a593Smuzhiyun static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd, int index)
677*4882a593Smuzhiyun {
678*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
679*4882a593Smuzhiyun 	int result;
680*4882a593Smuzhiyun 	struct nbd_reply reply;
681*4882a593Smuzhiyun 	struct nbd_cmd *cmd;
682*4882a593Smuzhiyun 	struct request *req = NULL;
683*4882a593Smuzhiyun 	u64 handle;
684*4882a593Smuzhiyun 	u16 hwq;
685*4882a593Smuzhiyun 	u32 tag;
686*4882a593Smuzhiyun 	struct kvec iov = {.iov_base = &reply, .iov_len = sizeof(reply)};
687*4882a593Smuzhiyun 	struct iov_iter to;
688*4882a593Smuzhiyun 	int ret = 0;
689*4882a593Smuzhiyun 
690*4882a593Smuzhiyun 	reply.magic = 0;
691*4882a593Smuzhiyun 	iov_iter_kvec(&to, READ, &iov, 1, sizeof(reply));
692*4882a593Smuzhiyun 	result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
693*4882a593Smuzhiyun 	if (result <= 0) {
694*4882a593Smuzhiyun 		if (!nbd_disconnected(config))
695*4882a593Smuzhiyun 			dev_err(disk_to_dev(nbd->disk),
696*4882a593Smuzhiyun 				"Receive control failed (result %d)\n", result);
697*4882a593Smuzhiyun 		return ERR_PTR(result);
698*4882a593Smuzhiyun 	}
699*4882a593Smuzhiyun 
700*4882a593Smuzhiyun 	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
701*4882a593Smuzhiyun 		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
702*4882a593Smuzhiyun 				(unsigned long)ntohl(reply.magic));
703*4882a593Smuzhiyun 		return ERR_PTR(-EPROTO);
704*4882a593Smuzhiyun 	}
705*4882a593Smuzhiyun 
706*4882a593Smuzhiyun 	memcpy(&handle, reply.handle, sizeof(handle));
707*4882a593Smuzhiyun 	tag = nbd_handle_to_tag(handle);
708*4882a593Smuzhiyun 	hwq = blk_mq_unique_tag_to_hwq(tag);
709*4882a593Smuzhiyun 	if (hwq < nbd->tag_set.nr_hw_queues)
710*4882a593Smuzhiyun 		req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
711*4882a593Smuzhiyun 				       blk_mq_unique_tag_to_tag(tag));
712*4882a593Smuzhiyun 	if (!req || !blk_mq_request_started(req)) {
713*4882a593Smuzhiyun 		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
714*4882a593Smuzhiyun 			tag, req);
715*4882a593Smuzhiyun 		return ERR_PTR(-ENOENT);
716*4882a593Smuzhiyun 	}
717*4882a593Smuzhiyun 	trace_nbd_header_received(req, handle);
718*4882a593Smuzhiyun 	cmd = blk_mq_rq_to_pdu(req);
719*4882a593Smuzhiyun 
720*4882a593Smuzhiyun 	mutex_lock(&cmd->lock);
721*4882a593Smuzhiyun 	if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
722*4882a593Smuzhiyun 		dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
723*4882a593Smuzhiyun 			req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
724*4882a593Smuzhiyun 		ret = -ENOENT;
725*4882a593Smuzhiyun 		goto out;
726*4882a593Smuzhiyun 	}
727*4882a593Smuzhiyun 	if (cmd->status != BLK_STS_OK) {
728*4882a593Smuzhiyun 		dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n",
729*4882a593Smuzhiyun 			req);
730*4882a593Smuzhiyun 		ret = -ENOENT;
731*4882a593Smuzhiyun 		goto out;
732*4882a593Smuzhiyun 	}
733*4882a593Smuzhiyun 	if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
734*4882a593Smuzhiyun 		dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
735*4882a593Smuzhiyun 			req);
736*4882a593Smuzhiyun 		ret = -ENOENT;
737*4882a593Smuzhiyun 		goto out;
738*4882a593Smuzhiyun 	}
739*4882a593Smuzhiyun 	if (ntohl(reply.error)) {
740*4882a593Smuzhiyun 		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
741*4882a593Smuzhiyun 			ntohl(reply.error));
742*4882a593Smuzhiyun 		cmd->status = BLK_STS_IOERR;
743*4882a593Smuzhiyun 		goto out;
744*4882a593Smuzhiyun 	}
745*4882a593Smuzhiyun 
746*4882a593Smuzhiyun 	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
747*4882a593Smuzhiyun 	if (rq_data_dir(req) != WRITE) {
748*4882a593Smuzhiyun 		struct req_iterator iter;
749*4882a593Smuzhiyun 		struct bio_vec bvec;
750*4882a593Smuzhiyun 
751*4882a593Smuzhiyun 		rq_for_each_segment(bvec, req, iter) {
752*4882a593Smuzhiyun 			iov_iter_bvec(&to, READ, &bvec, 1, bvec.bv_len);
753*4882a593Smuzhiyun 			result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
754*4882a593Smuzhiyun 			if (result <= 0) {
755*4882a593Smuzhiyun 				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
756*4882a593Smuzhiyun 					result);
757*4882a593Smuzhiyun 				/*
758*4882a593Smuzhiyun 				 * If we've disconnected, we need to make sure we
759*4882a593Smuzhiyun 				 * complete this request, otherwise error out
760*4882a593Smuzhiyun 				 * and let the timeout stuff handle resubmitting
761*4882a593Smuzhiyun 				 * this request onto another connection.
762*4882a593Smuzhiyun 				 */
763*4882a593Smuzhiyun 				if (nbd_disconnected(config)) {
764*4882a593Smuzhiyun 					cmd->status = BLK_STS_IOERR;
765*4882a593Smuzhiyun 					goto out;
766*4882a593Smuzhiyun 				}
767*4882a593Smuzhiyun 				ret = -EIO;
768*4882a593Smuzhiyun 				goto out;
769*4882a593Smuzhiyun 			}
770*4882a593Smuzhiyun 			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
771*4882a593Smuzhiyun 				req, bvec.bv_len);
772*4882a593Smuzhiyun 		}
773*4882a593Smuzhiyun 	}
774*4882a593Smuzhiyun out:
775*4882a593Smuzhiyun 	trace_nbd_payload_received(req, handle);
776*4882a593Smuzhiyun 	mutex_unlock(&cmd->lock);
777*4882a593Smuzhiyun 	return ret ? ERR_PTR(ret) : cmd;
778*4882a593Smuzhiyun }
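/*
 * Reply matching: every struct nbd_reply carries back the 64-bit handle
 * from the request.  The low 32 bits are decoded into a hardware queue
 * and tag to look up the request with blk_mq_tag_to_rq(); the high 32
 * bits must still equal cmd->cmd_cookie, which rejects stale replies
 * that arrive after the command was re-sent on a reconnected socket.
 * Any mismatch, already-handled command or race with the timeout
 * handler yields -ENOENT, which the receive loop below treats as a dead
 * connection.
 */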
779*4882a593Smuzhiyun 
780*4882a593Smuzhiyun static void recv_work(struct work_struct *work)
781*4882a593Smuzhiyun {
782*4882a593Smuzhiyun 	struct recv_thread_args *args = container_of(work,
783*4882a593Smuzhiyun 						     struct recv_thread_args,
784*4882a593Smuzhiyun 						     work);
785*4882a593Smuzhiyun 	struct nbd_device *nbd = args->nbd;
786*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
787*4882a593Smuzhiyun 	struct nbd_cmd *cmd;
788*4882a593Smuzhiyun 	struct request *rq;
789*4882a593Smuzhiyun 
790*4882a593Smuzhiyun 	while (1) {
791*4882a593Smuzhiyun 		cmd = nbd_read_stat(nbd, args->index);
792*4882a593Smuzhiyun 		if (IS_ERR(cmd)) {
793*4882a593Smuzhiyun 			struct nbd_sock *nsock = config->socks[args->index];
794*4882a593Smuzhiyun 
795*4882a593Smuzhiyun 			mutex_lock(&nsock->tx_lock);
796*4882a593Smuzhiyun 			nbd_mark_nsock_dead(nbd, nsock, 1);
797*4882a593Smuzhiyun 			mutex_unlock(&nsock->tx_lock);
798*4882a593Smuzhiyun 			break;
799*4882a593Smuzhiyun 		}
800*4882a593Smuzhiyun 
801*4882a593Smuzhiyun 		rq = blk_mq_rq_from_pdu(cmd);
802*4882a593Smuzhiyun 		if (likely(!blk_should_fake_timeout(rq->q)))
803*4882a593Smuzhiyun 			blk_mq_complete_request(rq);
804*4882a593Smuzhiyun 	}
805*4882a593Smuzhiyun 	nbd_config_put(nbd);
806*4882a593Smuzhiyun 	atomic_dec(&config->recv_threads);
807*4882a593Smuzhiyun 	wake_up(&config->recv_wq);
808*4882a593Smuzhiyun 	kfree(args);
809*4882a593Smuzhiyun }
810*4882a593Smuzhiyun 
811*4882a593Smuzhiyun static bool nbd_clear_req(struct request *req, void *data, bool reserved)
812*4882a593Smuzhiyun {
813*4882a593Smuzhiyun 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
814*4882a593Smuzhiyun 
815*4882a593Smuzhiyun 	/* don't abort one completed request */
816*4882a593Smuzhiyun 	if (blk_mq_request_completed(req))
817*4882a593Smuzhiyun 		return true;
818*4882a593Smuzhiyun 
819*4882a593Smuzhiyun 	mutex_lock(&cmd->lock);
820*4882a593Smuzhiyun 	cmd->status = BLK_STS_IOERR;
821*4882a593Smuzhiyun 	mutex_unlock(&cmd->lock);
822*4882a593Smuzhiyun 
823*4882a593Smuzhiyun 	blk_mq_complete_request(req);
824*4882a593Smuzhiyun 	return true;
825*4882a593Smuzhiyun }
826*4882a593Smuzhiyun 
827*4882a593Smuzhiyun static void nbd_clear_que(struct nbd_device *nbd)
828*4882a593Smuzhiyun {
829*4882a593Smuzhiyun 	blk_mq_quiesce_queue(nbd->disk->queue);
830*4882a593Smuzhiyun 	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
831*4882a593Smuzhiyun 	blk_mq_unquiesce_queue(nbd->disk->queue);
832*4882a593Smuzhiyun 	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
833*4882a593Smuzhiyun }
834*4882a593Smuzhiyun 
835*4882a593Smuzhiyun static int find_fallback(struct nbd_device *nbd, int index)
836*4882a593Smuzhiyun {
837*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
838*4882a593Smuzhiyun 	int new_index = -1;
839*4882a593Smuzhiyun 	struct nbd_sock *nsock = config->socks[index];
840*4882a593Smuzhiyun 	int fallback = nsock->fallback_index;
841*4882a593Smuzhiyun 
842*4882a593Smuzhiyun 	if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
843*4882a593Smuzhiyun 		return new_index;
844*4882a593Smuzhiyun 
845*4882a593Smuzhiyun 	if (config->num_connections <= 1) {
846*4882a593Smuzhiyun 		dev_err_ratelimited(disk_to_dev(nbd->disk),
847*4882a593Smuzhiyun 				    "Dead connection, failed to find a fallback\n");
848*4882a593Smuzhiyun 		return new_index;
849*4882a593Smuzhiyun 	}
850*4882a593Smuzhiyun 
851*4882a593Smuzhiyun 	if (fallback >= 0 && fallback < config->num_connections &&
852*4882a593Smuzhiyun 	    !config->socks[fallback]->dead)
853*4882a593Smuzhiyun 		return fallback;
854*4882a593Smuzhiyun 
855*4882a593Smuzhiyun 	if (nsock->fallback_index < 0 ||
856*4882a593Smuzhiyun 	    nsock->fallback_index >= config->num_connections ||
857*4882a593Smuzhiyun 	    config->socks[nsock->fallback_index]->dead) {
858*4882a593Smuzhiyun 		int i;
859*4882a593Smuzhiyun 		for (i = 0; i < config->num_connections; i++) {
860*4882a593Smuzhiyun 			if (i == index)
861*4882a593Smuzhiyun 				continue;
862*4882a593Smuzhiyun 			if (!config->socks[i]->dead) {
863*4882a593Smuzhiyun 				new_index = i;
864*4882a593Smuzhiyun 				break;
865*4882a593Smuzhiyun 			}
866*4882a593Smuzhiyun 		}
867*4882a593Smuzhiyun 		nsock->fallback_index = new_index;
868*4882a593Smuzhiyun 		if (new_index < 0) {
869*4882a593Smuzhiyun 			dev_err_ratelimited(disk_to_dev(nbd->disk),
870*4882a593Smuzhiyun 					    "Dead connection, failed to find a fallback\n");
871*4882a593Smuzhiyun 			return new_index;
872*4882a593Smuzhiyun 		}
873*4882a593Smuzhiyun 	}
874*4882a593Smuzhiyun 	new_index = nsock->fallback_index;
875*4882a593Smuzhiyun 	return new_index;
876*4882a593Smuzhiyun }
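/*
 * Fallback selection order: reuse the cached nsock->fallback_index if
 * that socket is still alive, otherwise scan for the first live socket
 * other than the failed one and cache it.  A return of -1 means no
 * fallback exists, either because every connection is dead or because
 * the device has already been disconnected.
 */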
877*4882a593Smuzhiyun 
878*4882a593Smuzhiyun static int wait_for_reconnect(struct nbd_device *nbd)
879*4882a593Smuzhiyun {
880*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
881*4882a593Smuzhiyun 	if (!config->dead_conn_timeout)
882*4882a593Smuzhiyun 		return 0;
883*4882a593Smuzhiyun 
884*4882a593Smuzhiyun 	if (!wait_event_timeout(config->conn_wait,
885*4882a593Smuzhiyun 				test_bit(NBD_RT_DISCONNECTED,
886*4882a593Smuzhiyun 					 &config->runtime_flags) ||
887*4882a593Smuzhiyun 				atomic_read(&config->live_connections) > 0,
888*4882a593Smuzhiyun 				config->dead_conn_timeout))
889*4882a593Smuzhiyun 		return 0;
890*4882a593Smuzhiyun 
891*4882a593Smuzhiyun 	return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
892*4882a593Smuzhiyun }
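/*
 * Returns non-zero only if a live connection (re)appeared within
 * dead_conn_timeout and the device was not disconnected in the
 * meantime; zero tells the caller to give up and fail the command.  A
 * dead_conn_timeout of 0 disables the wait altogether.
 */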
893*4882a593Smuzhiyun 
894*4882a593Smuzhiyun static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
895*4882a593Smuzhiyun {
896*4882a593Smuzhiyun 	struct request *req = blk_mq_rq_from_pdu(cmd);
897*4882a593Smuzhiyun 	struct nbd_device *nbd = cmd->nbd;
898*4882a593Smuzhiyun 	struct nbd_config *config;
899*4882a593Smuzhiyun 	struct nbd_sock *nsock;
900*4882a593Smuzhiyun 	int ret;
901*4882a593Smuzhiyun 
902*4882a593Smuzhiyun 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
903*4882a593Smuzhiyun 		dev_err_ratelimited(disk_to_dev(nbd->disk),
904*4882a593Smuzhiyun 				    "Socks array is empty\n");
905*4882a593Smuzhiyun 		blk_mq_start_request(req);
906*4882a593Smuzhiyun 		return -EINVAL;
907*4882a593Smuzhiyun 	}
908*4882a593Smuzhiyun 	config = nbd->config;
909*4882a593Smuzhiyun 
910*4882a593Smuzhiyun 	if (index >= config->num_connections) {
911*4882a593Smuzhiyun 		dev_err_ratelimited(disk_to_dev(nbd->disk),
912*4882a593Smuzhiyun 				    "Attempted send on invalid socket\n");
913*4882a593Smuzhiyun 		nbd_config_put(nbd);
914*4882a593Smuzhiyun 		blk_mq_start_request(req);
915*4882a593Smuzhiyun 		return -EINVAL;
916*4882a593Smuzhiyun 	}
917*4882a593Smuzhiyun 	cmd->status = BLK_STS_OK;
918*4882a593Smuzhiyun again:
919*4882a593Smuzhiyun 	nsock = config->socks[index];
920*4882a593Smuzhiyun 	mutex_lock(&nsock->tx_lock);
921*4882a593Smuzhiyun 	if (nsock->dead) {
922*4882a593Smuzhiyun 		int old_index = index;
923*4882a593Smuzhiyun 		index = find_fallback(nbd, index);
924*4882a593Smuzhiyun 		mutex_unlock(&nsock->tx_lock);
925*4882a593Smuzhiyun 		if (index < 0) {
926*4882a593Smuzhiyun 			if (wait_for_reconnect(nbd)) {
927*4882a593Smuzhiyun 				index = old_index;
928*4882a593Smuzhiyun 				goto again;
929*4882a593Smuzhiyun 			}
930*4882a593Smuzhiyun 			/* All the sockets should already be down at this point,
931*4882a593Smuzhiyun 			 * we just want to make sure that DISCONNECTED is set so
932*4882a593Smuzhiyun 			 * any requests that come in that were queued waiting
933*4882a593Smuzhiyun 			 * for the reconnect timer don't trigger the timer again
934*4882a593Smuzhiyun 			 * and instead just error out.
935*4882a593Smuzhiyun 			 */
936*4882a593Smuzhiyun 			sock_shutdown(nbd);
937*4882a593Smuzhiyun 			nbd_config_put(nbd);
938*4882a593Smuzhiyun 			blk_mq_start_request(req);
939*4882a593Smuzhiyun 			return -EIO;
940*4882a593Smuzhiyun 		}
941*4882a593Smuzhiyun 		goto again;
942*4882a593Smuzhiyun 	}
943*4882a593Smuzhiyun 
944*4882a593Smuzhiyun 	/* Handle the case that we have a pending request that was partially
945*4882a593Smuzhiyun 	 * transmitted that _has_ to be serviced first.  We need to call requeue
946*4882a593Smuzhiyun 	 * here so that it gets put _after_ the request that is already on the
947*4882a593Smuzhiyun 	 * dispatch list.
948*4882a593Smuzhiyun 	 */
949*4882a593Smuzhiyun 	blk_mq_start_request(req);
950*4882a593Smuzhiyun 	if (unlikely(nsock->pending && nsock->pending != req)) {
951*4882a593Smuzhiyun 		nbd_requeue_cmd(cmd);
952*4882a593Smuzhiyun 		ret = 0;
953*4882a593Smuzhiyun 		goto out;
954*4882a593Smuzhiyun 	}
955*4882a593Smuzhiyun 	/*
956*4882a593Smuzhiyun 	 * Some failures are related to the link going down, so anything that
957*4882a593Smuzhiyun 	 * returns EAGAIN can be retried on a different socket.
958*4882a593Smuzhiyun 	 */
959*4882a593Smuzhiyun 	ret = nbd_send_cmd(nbd, cmd, index);
960*4882a593Smuzhiyun 	if (ret == -EAGAIN) {
961*4882a593Smuzhiyun 		dev_err_ratelimited(disk_to_dev(nbd->disk),
962*4882a593Smuzhiyun 				    "Request send failed, requeueing\n");
963*4882a593Smuzhiyun 		nbd_mark_nsock_dead(nbd, nsock, 1);
964*4882a593Smuzhiyun 		nbd_requeue_cmd(cmd);
965*4882a593Smuzhiyun 		ret = 0;
966*4882a593Smuzhiyun 	}
967*4882a593Smuzhiyun out:
968*4882a593Smuzhiyun 	mutex_unlock(&nsock->tx_lock);
969*4882a593Smuzhiyun 	nbd_config_put(nbd);
970*4882a593Smuzhiyun 	return ret;
971*4882a593Smuzhiyun }
972*4882a593Smuzhiyun 
973*4882a593Smuzhiyun static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
974*4882a593Smuzhiyun 			const struct blk_mq_queue_data *bd)
975*4882a593Smuzhiyun {
976*4882a593Smuzhiyun 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
977*4882a593Smuzhiyun 	int ret;
978*4882a593Smuzhiyun 
979*4882a593Smuzhiyun 	/*
980*4882a593Smuzhiyun 	 * Since we look at the bios to send the request over the network we
981*4882a593Smuzhiyun 	 * need to make sure the completion work doesn't mark this request done
982*4882a593Smuzhiyun 	 * before we are done doing our send.  This keeps us from dereferencing
983*4882a593Smuzhiyun 	 * freed data if we have particularly fast completions (i.e. we get the
984*4882a593Smuzhiyun 	 * completion before we exit sock_xmit on the last bvec) or in the case
985*4882a593Smuzhiyun 	 * that the server is misbehaving (or there was an error) before we're
986*4882a593Smuzhiyun 	 * done sending everything over the wire.
987*4882a593Smuzhiyun 	 */
988*4882a593Smuzhiyun 	mutex_lock(&cmd->lock);
989*4882a593Smuzhiyun 	clear_bit(NBD_CMD_REQUEUED, &cmd->flags);
990*4882a593Smuzhiyun 
991*4882a593Smuzhiyun 	/* We can be called directly from the user space process, which means we
992*4882a593Smuzhiyun 	 * could possibly have signals pending so our sendmsg will fail.  In
993*4882a593Smuzhiyun 	 * this case we need to return that we are busy, otherwise error out as
994*4882a593Smuzhiyun 	 * appropriate.
995*4882a593Smuzhiyun 	 */
996*4882a593Smuzhiyun 	ret = nbd_handle_cmd(cmd, hctx->queue_num);
997*4882a593Smuzhiyun 	if (ret < 0)
998*4882a593Smuzhiyun 		ret = BLK_STS_IOERR;
999*4882a593Smuzhiyun 	else if (!ret)
1000*4882a593Smuzhiyun 		ret = BLK_STS_OK;
1001*4882a593Smuzhiyun 	mutex_unlock(&cmd->lock);
1002*4882a593Smuzhiyun 
1003*4882a593Smuzhiyun 	return ret;
1004*4882a593Smuzhiyun }
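/*
 * Return-value plumbing: nbd_handle_cmd() mixes negative errnos with
 * blk_status_t values.  Negative errors become BLK_STS_IOERR, zero
 * becomes BLK_STS_OK, and the positive BLK_STS_RESOURCE produced by an
 * interrupted send in nbd_send_cmd() passes through unchanged so the
 * block layer will retry the request later.
 */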
1005*4882a593Smuzhiyun 
1006*4882a593Smuzhiyun static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
1007*4882a593Smuzhiyun 				     int *err)
1008*4882a593Smuzhiyun {
1009*4882a593Smuzhiyun 	struct socket *sock;
1010*4882a593Smuzhiyun 
1011*4882a593Smuzhiyun 	*err = 0;
1012*4882a593Smuzhiyun 	sock = sockfd_lookup(fd, err);
1013*4882a593Smuzhiyun 	if (!sock)
1014*4882a593Smuzhiyun 		return NULL;
1015*4882a593Smuzhiyun 
1016*4882a593Smuzhiyun 	if (sock->ops->shutdown == sock_no_shutdown) {
1017*4882a593Smuzhiyun 		dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
1018*4882a593Smuzhiyun 		*err = -EINVAL;
1019*4882a593Smuzhiyun 		sockfd_put(sock);
1020*4882a593Smuzhiyun 		return NULL;
1021*4882a593Smuzhiyun 	}
1022*4882a593Smuzhiyun 
1023*4882a593Smuzhiyun 	return sock;
1024*4882a593Smuzhiyun }
1025*4882a593Smuzhiyun 
1026*4882a593Smuzhiyun static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
1027*4882a593Smuzhiyun 			  bool netlink)
1028*4882a593Smuzhiyun {
1029*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
1030*4882a593Smuzhiyun 	struct socket *sock;
1031*4882a593Smuzhiyun 	struct nbd_sock **socks;
1032*4882a593Smuzhiyun 	struct nbd_sock *nsock;
1033*4882a593Smuzhiyun 	int err;
1034*4882a593Smuzhiyun 
1035*4882a593Smuzhiyun 	sock = nbd_get_socket(nbd, arg, &err);
1036*4882a593Smuzhiyun 	if (!sock)
1037*4882a593Smuzhiyun 		return err;
1038*4882a593Smuzhiyun 
1039*4882a593Smuzhiyun 	/*
1040*4882a593Smuzhiyun 	 * We need to make sure we don't get any errant requests while we're
1041*4882a593Smuzhiyun 	 * reallocating the ->socks array.
1042*4882a593Smuzhiyun 	 */
1043*4882a593Smuzhiyun 	blk_mq_freeze_queue(nbd->disk->queue);
1044*4882a593Smuzhiyun 
1045*4882a593Smuzhiyun 	if (!netlink && !nbd->task_setup &&
1046*4882a593Smuzhiyun 	    !test_bit(NBD_RT_BOUND, &config->runtime_flags))
1047*4882a593Smuzhiyun 		nbd->task_setup = current;
1048*4882a593Smuzhiyun 
1049*4882a593Smuzhiyun 	if (!netlink &&
1050*4882a593Smuzhiyun 	    (nbd->task_setup != current ||
1051*4882a593Smuzhiyun 	     test_bit(NBD_RT_BOUND, &config->runtime_flags))) {
1052*4882a593Smuzhiyun 		dev_err(disk_to_dev(nbd->disk),
1053*4882a593Smuzhiyun 			"Device being setup by another task");
1054*4882a593Smuzhiyun 		err = -EBUSY;
1055*4882a593Smuzhiyun 		goto put_socket;
1056*4882a593Smuzhiyun 	}
1057*4882a593Smuzhiyun 
1058*4882a593Smuzhiyun 	nsock = kzalloc(sizeof(*nsock), GFP_KERNEL);
1059*4882a593Smuzhiyun 	if (!nsock) {
1060*4882a593Smuzhiyun 		err = -ENOMEM;
1061*4882a593Smuzhiyun 		goto put_socket;
1062*4882a593Smuzhiyun 	}
1063*4882a593Smuzhiyun 
1064*4882a593Smuzhiyun 	socks = krealloc(config->socks, (config->num_connections + 1) *
1065*4882a593Smuzhiyun 			 sizeof(struct nbd_sock *), GFP_KERNEL);
1066*4882a593Smuzhiyun 	if (!socks) {
1067*4882a593Smuzhiyun 		kfree(nsock);
1068*4882a593Smuzhiyun 		err = -ENOMEM;
1069*4882a593Smuzhiyun 		goto put_socket;
1070*4882a593Smuzhiyun 	}
1071*4882a593Smuzhiyun 
1072*4882a593Smuzhiyun 	config->socks = socks;
1073*4882a593Smuzhiyun 
1074*4882a593Smuzhiyun 	nsock->fallback_index = -1;
1075*4882a593Smuzhiyun 	nsock->dead = false;
1076*4882a593Smuzhiyun 	mutex_init(&nsock->tx_lock);
1077*4882a593Smuzhiyun 	nsock->sock = sock;
1078*4882a593Smuzhiyun 	nsock->pending = NULL;
1079*4882a593Smuzhiyun 	nsock->sent = 0;
1080*4882a593Smuzhiyun 	nsock->cookie = 0;
1081*4882a593Smuzhiyun 	socks[config->num_connections++] = nsock;
1082*4882a593Smuzhiyun 	atomic_inc(&config->live_connections);
1083*4882a593Smuzhiyun 	blk_mq_unfreeze_queue(nbd->disk->queue);
1084*4882a593Smuzhiyun 
1085*4882a593Smuzhiyun 	return 0;
1086*4882a593Smuzhiyun 
1087*4882a593Smuzhiyun put_socket:
1088*4882a593Smuzhiyun 	blk_mq_unfreeze_queue(nbd->disk->queue);
1089*4882a593Smuzhiyun 	sockfd_put(sock);
1090*4882a593Smuzhiyun 	return err;
1091*4882a593Smuzhiyun }
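/*
 * nbd_add_socket() is reached from both configuration interfaces, whose
 * handlers appear later in this file: the legacy NBD_SET_SOCK ioctl and
 * the netlink NBD_CMD_CONNECT command.  A rough sketch of the legacy
 * userspace side, assuming a connected TCP socket in 'sk' and /dev/nbd0
 * opened as 'nbd' (illustrative only, not part of this driver):
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/nbd.h>
 *
 *	ioctl(nbd, NBD_SET_BLKSIZE, 4096);
 *	ioctl(nbd, NBD_SET_SIZE_BLOCKS, nr_blocks);
 *	ioctl(nbd, NBD_SET_SOCK, sk);	// ends up in nbd_add_socket()
 *	ioctl(nbd, NBD_DO_IT);		// blocks while the device runs
 */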
1092*4882a593Smuzhiyun 
1093*4882a593Smuzhiyun static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
1094*4882a593Smuzhiyun {
1095*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
1096*4882a593Smuzhiyun 	struct socket *sock, *old;
1097*4882a593Smuzhiyun 	struct recv_thread_args *args;
1098*4882a593Smuzhiyun 	int i;
1099*4882a593Smuzhiyun 	int err;
1100*4882a593Smuzhiyun 
1101*4882a593Smuzhiyun 	sock = nbd_get_socket(nbd, arg, &err);
1102*4882a593Smuzhiyun 	if (!sock)
1103*4882a593Smuzhiyun 		return err;
1104*4882a593Smuzhiyun 
1105*4882a593Smuzhiyun 	args = kzalloc(sizeof(*args), GFP_KERNEL);
1106*4882a593Smuzhiyun 	if (!args) {
1107*4882a593Smuzhiyun 		sockfd_put(sock);
1108*4882a593Smuzhiyun 		return -ENOMEM;
1109*4882a593Smuzhiyun 	}
1110*4882a593Smuzhiyun 
1111*4882a593Smuzhiyun 	for (i = 0; i < config->num_connections; i++) {
1112*4882a593Smuzhiyun 		struct nbd_sock *nsock = config->socks[i];
1113*4882a593Smuzhiyun 
1114*4882a593Smuzhiyun 		if (!nsock->dead)
1115*4882a593Smuzhiyun 			continue;
1116*4882a593Smuzhiyun 
1117*4882a593Smuzhiyun 		mutex_lock(&nsock->tx_lock);
1118*4882a593Smuzhiyun 		if (!nsock->dead) {
1119*4882a593Smuzhiyun 			mutex_unlock(&nsock->tx_lock);
1120*4882a593Smuzhiyun 			continue;
1121*4882a593Smuzhiyun 		}
1122*4882a593Smuzhiyun 		sk_set_memalloc(sock->sk);
1123*4882a593Smuzhiyun 		if (nbd->tag_set.timeout)
1124*4882a593Smuzhiyun 			sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
1125*4882a593Smuzhiyun 		atomic_inc(&config->recv_threads);
1126*4882a593Smuzhiyun 		refcount_inc(&nbd->config_refs);
1127*4882a593Smuzhiyun 		old = nsock->sock;
1128*4882a593Smuzhiyun 		nsock->fallback_index = -1;
1129*4882a593Smuzhiyun 		nsock->sock = sock;
1130*4882a593Smuzhiyun 		nsock->dead = false;
1131*4882a593Smuzhiyun 		INIT_WORK(&args->work, recv_work);
1132*4882a593Smuzhiyun 		args->index = i;
1133*4882a593Smuzhiyun 		args->nbd = nbd;
1134*4882a593Smuzhiyun 		nsock->cookie++;
1135*4882a593Smuzhiyun 		mutex_unlock(&nsock->tx_lock);
1136*4882a593Smuzhiyun 		sockfd_put(old);
1137*4882a593Smuzhiyun 
1138*4882a593Smuzhiyun 		clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
1139*4882a593Smuzhiyun 
1140*4882a593Smuzhiyun 		/* We take the tx_lock in an error path in recv_work, so we
1141*4882a593Smuzhiyun 		 * need to queue_work() outside of the tx_lock.
1142*4882a593Smuzhiyun 		 */
1143*4882a593Smuzhiyun 		queue_work(nbd->recv_workq, &args->work);
1144*4882a593Smuzhiyun 
1145*4882a593Smuzhiyun 		atomic_inc(&config->live_connections);
1146*4882a593Smuzhiyun 		wake_up(&config->conn_wait);
1147*4882a593Smuzhiyun 		return 0;
1148*4882a593Smuzhiyun 	}
1149*4882a593Smuzhiyun 	sockfd_put(sock);
1150*4882a593Smuzhiyun 	kfree(args);
1151*4882a593Smuzhiyun 	return -ENOSPC;
1152*4882a593Smuzhiyun }
1153*4882a593Smuzhiyun 
1154*4882a593Smuzhiyun static void nbd_bdev_reset(struct block_device *bdev)
1155*4882a593Smuzhiyun {
1156*4882a593Smuzhiyun 	if (bdev->bd_openers > 1)
1157*4882a593Smuzhiyun 		return;
1158*4882a593Smuzhiyun 	bd_set_nr_sectors(bdev, 0);
1159*4882a593Smuzhiyun }
1160*4882a593Smuzhiyun 
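/*
 * Translate the transmission flags advertised by the server into block
 * layer settings: read-only, discard support, and write cache / FUA.
 */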
1161*4882a593Smuzhiyun static void nbd_parse_flags(struct nbd_device *nbd)
1162*4882a593Smuzhiyun {
1163*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
1164*4882a593Smuzhiyun 	if (config->flags & NBD_FLAG_READ_ONLY)
1165*4882a593Smuzhiyun 		set_disk_ro(nbd->disk, true);
1166*4882a593Smuzhiyun 	else
1167*4882a593Smuzhiyun 		set_disk_ro(nbd->disk, false);
1168*4882a593Smuzhiyun 	if (config->flags & NBD_FLAG_SEND_TRIM)
1169*4882a593Smuzhiyun 		blk_queue_flag_set(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1170*4882a593Smuzhiyun 	if (config->flags & NBD_FLAG_SEND_FLUSH) {
1171*4882a593Smuzhiyun 		if (config->flags & NBD_FLAG_SEND_FUA)
1172*4882a593Smuzhiyun 			blk_queue_write_cache(nbd->disk->queue, true, true);
1173*4882a593Smuzhiyun 		else
1174*4882a593Smuzhiyun 			blk_queue_write_cache(nbd->disk->queue, true, false);
1175*4882a593Smuzhiyun 	}
1176*4882a593Smuzhiyun 	else
1177*4882a593Smuzhiyun 		blk_queue_write_cache(nbd->disk->queue, false, false);
1178*4882a593Smuzhiyun }
1179*4882a593Smuzhiyun 
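/*
 * Send an NBD_CMD_DISC request on every configured connection so the
 * server can tear down its side.  Send errors are only logged; the
 * disconnect continues regardless.
 */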
1180*4882a593Smuzhiyun static void send_disconnects(struct nbd_device *nbd)
1181*4882a593Smuzhiyun {
1182*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
1183*4882a593Smuzhiyun 	struct nbd_request request = {
1184*4882a593Smuzhiyun 		.magic = htonl(NBD_REQUEST_MAGIC),
1185*4882a593Smuzhiyun 		.type = htonl(NBD_CMD_DISC),
1186*4882a593Smuzhiyun 	};
1187*4882a593Smuzhiyun 	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
1188*4882a593Smuzhiyun 	struct iov_iter from;
1189*4882a593Smuzhiyun 	int i, ret;
1190*4882a593Smuzhiyun 
1191*4882a593Smuzhiyun 	for (i = 0; i < config->num_connections; i++) {
1192*4882a593Smuzhiyun 		struct nbd_sock *nsock = config->socks[i];
1193*4882a593Smuzhiyun 
1194*4882a593Smuzhiyun 		iov_iter_kvec(&from, WRITE, &iov, 1, sizeof(request));
1195*4882a593Smuzhiyun 		mutex_lock(&nsock->tx_lock);
1196*4882a593Smuzhiyun 		ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
1197*4882a593Smuzhiyun 		if (ret <= 0)
1198*4882a593Smuzhiyun 			dev_err(disk_to_dev(nbd->disk),
1199*4882a593Smuzhiyun 				"Send disconnect failed %d\n", ret);
1200*4882a593Smuzhiyun 		mutex_unlock(&nsock->tx_lock);
1201*4882a593Smuzhiyun 	}
1202*4882a593Smuzhiyun }
1203*4882a593Smuzhiyun 
1204*4882a593Smuzhiyun static int nbd_disconnect(struct nbd_device *nbd)
1205*4882a593Smuzhiyun {
1206*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
1207*4882a593Smuzhiyun 
1208*4882a593Smuzhiyun 	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
1209*4882a593Smuzhiyun 	set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
1210*4882a593Smuzhiyun 	set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
1211*4882a593Smuzhiyun 	send_disconnects(nbd);
1212*4882a593Smuzhiyun 	return 0;
1213*4882a593Smuzhiyun }
1214*4882a593Smuzhiyun 
1215*4882a593Smuzhiyun static void nbd_clear_sock(struct nbd_device *nbd)
1216*4882a593Smuzhiyun {
1217*4882a593Smuzhiyun 	sock_shutdown(nbd);
1218*4882a593Smuzhiyun 	nbd_clear_que(nbd);
1219*4882a593Smuzhiyun 	nbd->task_setup = NULL;
1220*4882a593Smuzhiyun }
1221*4882a593Smuzhiyun 
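/*
 * Drop a config reference.  When the last reference goes away, tear
 * everything down: debugfs entries, size, the pid attribute, all
 * sockets, the receive workqueue and the discard settings, and finally
 * release the device and module references taken when the config was
 * allocated.
 */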
1222*4882a593Smuzhiyun static void nbd_config_put(struct nbd_device *nbd)
1223*4882a593Smuzhiyun {
1224*4882a593Smuzhiyun 	if (refcount_dec_and_mutex_lock(&nbd->config_refs,
1225*4882a593Smuzhiyun 					&nbd->config_lock)) {
1226*4882a593Smuzhiyun 		struct nbd_config *config = nbd->config;
1227*4882a593Smuzhiyun 		nbd_dev_dbg_close(nbd);
1228*4882a593Smuzhiyun 		nbd_size_clear(nbd);
1229*4882a593Smuzhiyun 		if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
1230*4882a593Smuzhiyun 				       &config->runtime_flags))
1231*4882a593Smuzhiyun 			device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
1232*4882a593Smuzhiyun 		nbd->task_recv = NULL;
1233*4882a593Smuzhiyun 		nbd_clear_sock(nbd);
1234*4882a593Smuzhiyun 		if (config->num_connections) {
1235*4882a593Smuzhiyun 			int i;
1236*4882a593Smuzhiyun 			for (i = 0; i < config->num_connections; i++) {
1237*4882a593Smuzhiyun 				sockfd_put(config->socks[i]->sock);
1238*4882a593Smuzhiyun 				kfree(config->socks[i]);
1239*4882a593Smuzhiyun 			}
1240*4882a593Smuzhiyun 			kfree(config->socks);
1241*4882a593Smuzhiyun 		}
1242*4882a593Smuzhiyun 		kfree(nbd->config);
1243*4882a593Smuzhiyun 		nbd->config = NULL;
1244*4882a593Smuzhiyun 
1245*4882a593Smuzhiyun 		if (nbd->recv_workq)
1246*4882a593Smuzhiyun 			destroy_workqueue(nbd->recv_workq);
1247*4882a593Smuzhiyun 		nbd->recv_workq = NULL;
1248*4882a593Smuzhiyun 
1249*4882a593Smuzhiyun 		nbd->tag_set.timeout = 0;
1250*4882a593Smuzhiyun 		nbd->disk->queue->limits.discard_granularity = 0;
1251*4882a593Smuzhiyun 		nbd->disk->queue->limits.discard_alignment = 0;
1252*4882a593Smuzhiyun 		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
1253*4882a593Smuzhiyun 		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, nbd->disk->queue);
1254*4882a593Smuzhiyun 
1255*4882a593Smuzhiyun 		mutex_unlock(&nbd->config_lock);
1256*4882a593Smuzhiyun 		nbd_put(nbd);
1257*4882a593Smuzhiyun 		module_put(THIS_MODULE);
1258*4882a593Smuzhiyun 	}
1259*4882a593Smuzhiyun }
1260*4882a593Smuzhiyun 
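/*
 * Bring the device up once its sockets have been added: allocate the
 * receive workqueue, match the number of hardware queues to the number
 * of connections, and queue one recv_work per connection.  Called with
 * config_lock held.
 */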
1261*4882a593Smuzhiyun static int nbd_start_device(struct nbd_device *nbd)
1262*4882a593Smuzhiyun {
1263*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
1264*4882a593Smuzhiyun 	int num_connections = config->num_connections;
1265*4882a593Smuzhiyun 	int error = 0, i;
1266*4882a593Smuzhiyun 
1267*4882a593Smuzhiyun 	if (nbd->task_recv)
1268*4882a593Smuzhiyun 		return -EBUSY;
1269*4882a593Smuzhiyun 	if (!config->socks)
1270*4882a593Smuzhiyun 		return -EINVAL;
1271*4882a593Smuzhiyun 	if (num_connections > 1 &&
1272*4882a593Smuzhiyun 	    !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
1273*4882a593Smuzhiyun 		dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
1274*4882a593Smuzhiyun 		return -EINVAL;
1275*4882a593Smuzhiyun 	}
1276*4882a593Smuzhiyun 
1277*4882a593Smuzhiyun 	nbd->recv_workq = alloc_workqueue("knbd%d-recv",
1278*4882a593Smuzhiyun 					  WQ_MEM_RECLAIM | WQ_HIGHPRI |
1279*4882a593Smuzhiyun 					  WQ_UNBOUND, 0, nbd->index);
1280*4882a593Smuzhiyun 	if (!nbd->recv_workq) {
1281*4882a593Smuzhiyun 		dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
1282*4882a593Smuzhiyun 		return -ENOMEM;
1283*4882a593Smuzhiyun 	}
1284*4882a593Smuzhiyun 
1285*4882a593Smuzhiyun 	blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
1286*4882a593Smuzhiyun 	nbd->task_recv = current;
1287*4882a593Smuzhiyun 
1288*4882a593Smuzhiyun 	nbd_parse_flags(nbd);
1289*4882a593Smuzhiyun 
1290*4882a593Smuzhiyun 	error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
1291*4882a593Smuzhiyun 	if (error) {
1292*4882a593Smuzhiyun 		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
1293*4882a593Smuzhiyun 		return error;
1294*4882a593Smuzhiyun 	}
1295*4882a593Smuzhiyun 	set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);
1296*4882a593Smuzhiyun 
1297*4882a593Smuzhiyun 	nbd_dev_dbg_init(nbd);
1298*4882a593Smuzhiyun 	for (i = 0; i < num_connections; i++) {
1299*4882a593Smuzhiyun 		struct recv_thread_args *args;
1300*4882a593Smuzhiyun 
1301*4882a593Smuzhiyun 		args = kzalloc(sizeof(*args), GFP_KERNEL);
1302*4882a593Smuzhiyun 		if (!args) {
1303*4882a593Smuzhiyun 			sock_shutdown(nbd);
1304*4882a593Smuzhiyun 			/*
1305*4882a593Smuzhiyun 			 * If num_connections is m (m > 2) and the first n
1306*4882a593Smuzhiyun 			 * kzallocs (1 < n < m) succeed but allocation n + 1
1307*4882a593Smuzhiyun 			 * fails, we still have n recv threads running.
1308*4882a593Smuzhiyun 			 * Flush the workqueue here so those recv threads cannot
1309*4882a593Smuzhiyun 			 * drop the last config ref and try to destroy the
1310*4882a593Smuzhiyun 			 * workqueue from inside the workqueue.
1311*4882a593Smuzhiyun 			 */
1312*4882a593Smuzhiyun 			if (i)
1313*4882a593Smuzhiyun 				flush_workqueue(nbd->recv_workq);
1314*4882a593Smuzhiyun 			return -ENOMEM;
1315*4882a593Smuzhiyun 		}
1316*4882a593Smuzhiyun 		sk_set_memalloc(config->socks[i]->sock->sk);
1317*4882a593Smuzhiyun 		if (nbd->tag_set.timeout)
1318*4882a593Smuzhiyun 			config->socks[i]->sock->sk->sk_sndtimeo =
1319*4882a593Smuzhiyun 				nbd->tag_set.timeout;
1320*4882a593Smuzhiyun 		atomic_inc(&config->recv_threads);
1321*4882a593Smuzhiyun 		refcount_inc(&nbd->config_refs);
1322*4882a593Smuzhiyun 		INIT_WORK(&args->work, recv_work);
1323*4882a593Smuzhiyun 		args->nbd = nbd;
1324*4882a593Smuzhiyun 		args->index = i;
1325*4882a593Smuzhiyun 		queue_work(nbd->recv_workq, &args->work);
1326*4882a593Smuzhiyun 	}
1327*4882a593Smuzhiyun 	nbd_size_update(nbd, true);
1328*4882a593Smuzhiyun 	return error;
1329*4882a593Smuzhiyun }
1330*4882a593Smuzhiyun 
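/*
 * NBD_DO_IT: start the device, then drop config_lock and block until
 * every recv thread has exited (disconnect, timeout or signal) before
 * resetting the block device.
 */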
1331*4882a593Smuzhiyun static int nbd_start_device_ioctl(struct nbd_device *nbd, struct block_device *bdev)
1332*4882a593Smuzhiyun {
1333*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
1334*4882a593Smuzhiyun 	int ret;
1335*4882a593Smuzhiyun 
1336*4882a593Smuzhiyun 	ret = nbd_start_device(nbd);
1337*4882a593Smuzhiyun 	if (ret)
1338*4882a593Smuzhiyun 		return ret;
1339*4882a593Smuzhiyun 
1340*4882a593Smuzhiyun 	if (max_part)
1341*4882a593Smuzhiyun 		set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
1342*4882a593Smuzhiyun 	mutex_unlock(&nbd->config_lock);
1343*4882a593Smuzhiyun 	ret = wait_event_interruptible(config->recv_wq,
1344*4882a593Smuzhiyun 					 atomic_read(&config->recv_threads) == 0);
1345*4882a593Smuzhiyun 	if (ret) {
1346*4882a593Smuzhiyun 		sock_shutdown(nbd);
1347*4882a593Smuzhiyun 		nbd_clear_que(nbd);
1348*4882a593Smuzhiyun 	}
1349*4882a593Smuzhiyun 
1350*4882a593Smuzhiyun 	flush_workqueue(nbd->recv_workq);
1351*4882a593Smuzhiyun 	mutex_lock(&nbd->config_lock);
1352*4882a593Smuzhiyun 	nbd_bdev_reset(bdev);
1353*4882a593Smuzhiyun 	/* user requested, ignore socket errors */
1354*4882a593Smuzhiyun 	if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags))
1355*4882a593Smuzhiyun 		ret = 0;
1356*4882a593Smuzhiyun 	if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags))
1357*4882a593Smuzhiyun 		ret = -ETIMEDOUT;
1358*4882a593Smuzhiyun 	return ret;
1359*4882a593Smuzhiyun }
1360*4882a593Smuzhiyun 
1361*4882a593Smuzhiyun static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
1362*4882a593Smuzhiyun 				 struct block_device *bdev)
1363*4882a593Smuzhiyun {
1364*4882a593Smuzhiyun 	nbd_clear_sock(nbd);
1365*4882a593Smuzhiyun 	__invalidate_device(bdev, true);
1366*4882a593Smuzhiyun 	nbd_bdev_reset(bdev);
1367*4882a593Smuzhiyun 	if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
1368*4882a593Smuzhiyun 			       &nbd->config->runtime_flags))
1369*4882a593Smuzhiyun 		nbd_config_put(nbd);
1370*4882a593Smuzhiyun }
1371*4882a593Smuzhiyun 
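/* A valid block size is a power of two between 512 and PAGE_SIZE. */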
1372*4882a593Smuzhiyun static bool nbd_is_valid_blksize(unsigned long blksize)
1373*4882a593Smuzhiyun {
1374*4882a593Smuzhiyun 	if (!blksize || !is_power_of_2(blksize) || blksize < 512 ||
1375*4882a593Smuzhiyun 	    blksize > PAGE_SIZE)
1376*4882a593Smuzhiyun 		return false;
1377*4882a593Smuzhiyun 	return true;
1378*4882a593Smuzhiyun }
1379*4882a593Smuzhiyun 
1380*4882a593Smuzhiyun static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout)
1381*4882a593Smuzhiyun {
1382*4882a593Smuzhiyun 	nbd->tag_set.timeout = timeout * HZ;
1383*4882a593Smuzhiyun 	if (timeout)
1384*4882a593Smuzhiyun 		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
1385*4882a593Smuzhiyun 	else
1386*4882a593Smuzhiyun 		blk_queue_rq_timeout(nbd->disk->queue, 30 * HZ);
1387*4882a593Smuzhiyun }
1388*4882a593Smuzhiyun 
1389*4882a593Smuzhiyun /* Must be called with config_lock held */
1390*4882a593Smuzhiyun static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
1391*4882a593Smuzhiyun 		       unsigned int cmd, unsigned long arg)
1392*4882a593Smuzhiyun {
1393*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
1394*4882a593Smuzhiyun 
1395*4882a593Smuzhiyun 	switch (cmd) {
1396*4882a593Smuzhiyun 	case NBD_DISCONNECT:
1397*4882a593Smuzhiyun 		return nbd_disconnect(nbd);
1398*4882a593Smuzhiyun 	case NBD_CLEAR_SOCK:
1399*4882a593Smuzhiyun 		nbd_clear_sock_ioctl(nbd, bdev);
1400*4882a593Smuzhiyun 		return 0;
1401*4882a593Smuzhiyun 	case NBD_SET_SOCK:
1402*4882a593Smuzhiyun 		return nbd_add_socket(nbd, arg, false);
1403*4882a593Smuzhiyun 	case NBD_SET_BLKSIZE:
1404*4882a593Smuzhiyun 		if (!arg)
1405*4882a593Smuzhiyun 			arg = NBD_DEF_BLKSIZE;
1406*4882a593Smuzhiyun 		if (!nbd_is_valid_blksize(arg))
1407*4882a593Smuzhiyun 			return -EINVAL;
1408*4882a593Smuzhiyun 		nbd_size_set(nbd, arg,
1409*4882a593Smuzhiyun 			     div_s64(config->bytesize, arg));
1410*4882a593Smuzhiyun 		return 0;
1411*4882a593Smuzhiyun 	case NBD_SET_SIZE:
1412*4882a593Smuzhiyun 		nbd_size_set(nbd, config->blksize,
1413*4882a593Smuzhiyun 			     div_s64(arg, config->blksize));
1414*4882a593Smuzhiyun 		return 0;
1415*4882a593Smuzhiyun 	case NBD_SET_SIZE_BLOCKS:
1416*4882a593Smuzhiyun 		nbd_size_set(nbd, config->blksize, arg);
1417*4882a593Smuzhiyun 		return 0;
1418*4882a593Smuzhiyun 	case NBD_SET_TIMEOUT:
1419*4882a593Smuzhiyun 		nbd_set_cmd_timeout(nbd, arg);
1420*4882a593Smuzhiyun 		return 0;
1421*4882a593Smuzhiyun 
1422*4882a593Smuzhiyun 	case NBD_SET_FLAGS:
1423*4882a593Smuzhiyun 		config->flags = arg;
1424*4882a593Smuzhiyun 		return 0;
1425*4882a593Smuzhiyun 	case NBD_DO_IT:
1426*4882a593Smuzhiyun 		return nbd_start_device_ioctl(nbd, bdev);
1427*4882a593Smuzhiyun 	case NBD_CLEAR_QUE:
1428*4882a593Smuzhiyun 		/*
1429*4882a593Smuzhiyun 		 * This is for compatibility only.  The queue is always cleared
1430*4882a593Smuzhiyun 		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
1431*4882a593Smuzhiyun 		 */
1432*4882a593Smuzhiyun 		return 0;
1433*4882a593Smuzhiyun 	case NBD_PRINT_DEBUG:
1434*4882a593Smuzhiyun 		/*
1435*4882a593Smuzhiyun 		 * For compatibility only, we no longer keep a list of
1436*4882a593Smuzhiyun 		 * outstanding requests.
1437*4882a593Smuzhiyun 		 */
1438*4882a593Smuzhiyun 		return 0;
1439*4882a593Smuzhiyun 	}
1440*4882a593Smuzhiyun 	return -ENOTTY;
1441*4882a593Smuzhiyun }
1442*4882a593Smuzhiyun 
1443*4882a593Smuzhiyun static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
1444*4882a593Smuzhiyun 		     unsigned int cmd, unsigned long arg)
1445*4882a593Smuzhiyun {
1446*4882a593Smuzhiyun 	struct nbd_device *nbd = bdev->bd_disk->private_data;
1447*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
1448*4882a593Smuzhiyun 	int error = -EINVAL;
1449*4882a593Smuzhiyun 
1450*4882a593Smuzhiyun 	if (!capable(CAP_SYS_ADMIN))
1451*4882a593Smuzhiyun 		return -EPERM;
1452*4882a593Smuzhiyun 
1453*4882a593Smuzhiyun 	/* The block layer will pass back some non-nbd ioctls in case we have
1454*4882a593Smuzhiyun 	 * special handling for them, but we don't, so just return an error.
1455*4882a593Smuzhiyun 	 */
1456*4882a593Smuzhiyun 	if (_IOC_TYPE(cmd) != 0xab)
1457*4882a593Smuzhiyun 		return -EINVAL;
1458*4882a593Smuzhiyun 
1459*4882a593Smuzhiyun 	mutex_lock(&nbd->config_lock);
1460*4882a593Smuzhiyun 
1461*4882a593Smuzhiyun 	/* Don't allow ioctl operations on an nbd device that was created with
1462*4882a593Smuzhiyun 	 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
1463*4882a593Smuzhiyun 	 */
1464*4882a593Smuzhiyun 	if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
1465*4882a593Smuzhiyun 	    (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
1466*4882a593Smuzhiyun 		error = __nbd_ioctl(bdev, nbd, cmd, arg);
1467*4882a593Smuzhiyun 	else
1468*4882a593Smuzhiyun 		dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
1469*4882a593Smuzhiyun 	mutex_unlock(&nbd->config_lock);
1470*4882a593Smuzhiyun 	return error;
1471*4882a593Smuzhiyun }
1472*4882a593Smuzhiyun 
1473*4882a593Smuzhiyun static struct nbd_config *nbd_alloc_config(void)
1474*4882a593Smuzhiyun {
1475*4882a593Smuzhiyun 	struct nbd_config *config;
1476*4882a593Smuzhiyun 
1477*4882a593Smuzhiyun 	if (!try_module_get(THIS_MODULE))
1478*4882a593Smuzhiyun 		return ERR_PTR(-ENODEV);
1479*4882a593Smuzhiyun 
1480*4882a593Smuzhiyun 	config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
1481*4882a593Smuzhiyun 	if (!config) {
1482*4882a593Smuzhiyun 		module_put(THIS_MODULE);
1483*4882a593Smuzhiyun 		return ERR_PTR(-ENOMEM);
1484*4882a593Smuzhiyun 	}
1485*4882a593Smuzhiyun 
1486*4882a593Smuzhiyun 	atomic_set(&config->recv_threads, 0);
1487*4882a593Smuzhiyun 	init_waitqueue_head(&config->recv_wq);
1488*4882a593Smuzhiyun 	init_waitqueue_head(&config->conn_wait);
1489*4882a593Smuzhiyun 	config->blksize = NBD_DEF_BLKSIZE;
1490*4882a593Smuzhiyun 	atomic_set(&config->live_connections, 0);
1491*4882a593Smuzhiyun 	return config;
1492*4882a593Smuzhiyun }
1493*4882a593Smuzhiyun 
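/*
 * Opening the block device takes a device reference and, when no config
 * exists yet, allocates a fresh one so the legacy ioctl interface can
 * configure the device.
 */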
1494*4882a593Smuzhiyun static int nbd_open(struct block_device *bdev, fmode_t mode)
1495*4882a593Smuzhiyun {
1496*4882a593Smuzhiyun 	struct nbd_device *nbd;
1497*4882a593Smuzhiyun 	int ret = 0;
1498*4882a593Smuzhiyun 
1499*4882a593Smuzhiyun 	mutex_lock(&nbd_index_mutex);
1500*4882a593Smuzhiyun 	nbd = bdev->bd_disk->private_data;
1501*4882a593Smuzhiyun 	if (!nbd) {
1502*4882a593Smuzhiyun 		ret = -ENXIO;
1503*4882a593Smuzhiyun 		goto out;
1504*4882a593Smuzhiyun 	}
1505*4882a593Smuzhiyun 	if (!refcount_inc_not_zero(&nbd->refs)) {
1506*4882a593Smuzhiyun 		ret = -ENXIO;
1507*4882a593Smuzhiyun 		goto out;
1508*4882a593Smuzhiyun 	}
1509*4882a593Smuzhiyun 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
1510*4882a593Smuzhiyun 		struct nbd_config *config;
1511*4882a593Smuzhiyun 
1512*4882a593Smuzhiyun 		mutex_lock(&nbd->config_lock);
1513*4882a593Smuzhiyun 		if (refcount_inc_not_zero(&nbd->config_refs)) {
1514*4882a593Smuzhiyun 			mutex_unlock(&nbd->config_lock);
1515*4882a593Smuzhiyun 			goto out;
1516*4882a593Smuzhiyun 		}
1517*4882a593Smuzhiyun 		config = nbd_alloc_config();
1518*4882a593Smuzhiyun 		if (IS_ERR(config)) {
1519*4882a593Smuzhiyun 			ret = PTR_ERR(config);
1520*4882a593Smuzhiyun 			mutex_unlock(&nbd->config_lock);
1521*4882a593Smuzhiyun 			goto out;
1522*4882a593Smuzhiyun 		}
1523*4882a593Smuzhiyun 		nbd->config = config;
1524*4882a593Smuzhiyun 		refcount_set(&nbd->config_refs, 1);
1525*4882a593Smuzhiyun 		refcount_inc(&nbd->refs);
1526*4882a593Smuzhiyun 		mutex_unlock(&nbd->config_lock);
1527*4882a593Smuzhiyun 		set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
1528*4882a593Smuzhiyun 	} else if (nbd_disconnected(nbd->config)) {
1529*4882a593Smuzhiyun 		set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
1530*4882a593Smuzhiyun 	}
1531*4882a593Smuzhiyun out:
1532*4882a593Smuzhiyun 	mutex_unlock(&nbd_index_mutex);
1533*4882a593Smuzhiyun 	return ret;
1534*4882a593Smuzhiyun }
1535*4882a593Smuzhiyun 
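/*
 * On release, honour NBD_RT_DISCONNECT_ON_CLOSE when the last opener
 * goes away, then drop the config and device references taken in
 * nbd_open().
 */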
1536*4882a593Smuzhiyun static void nbd_release(struct gendisk *disk, fmode_t mode)
1537*4882a593Smuzhiyun {
1538*4882a593Smuzhiyun 	struct nbd_device *nbd = disk->private_data;
1539*4882a593Smuzhiyun 	struct block_device *bdev = bdget_disk(disk, 0);
1540*4882a593Smuzhiyun 
1541*4882a593Smuzhiyun 	if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
1542*4882a593Smuzhiyun 			bdev->bd_openers == 0)
1543*4882a593Smuzhiyun 		nbd_disconnect_and_put(nbd);
1544*4882a593Smuzhiyun 	bdput(bdev);
1545*4882a593Smuzhiyun 
1546*4882a593Smuzhiyun 	nbd_config_put(nbd);
1547*4882a593Smuzhiyun 	nbd_put(nbd);
1548*4882a593Smuzhiyun }
1549*4882a593Smuzhiyun 
1550*4882a593Smuzhiyun static const struct block_device_operations nbd_fops =
1551*4882a593Smuzhiyun {
1552*4882a593Smuzhiyun 	.owner =	THIS_MODULE,
1553*4882a593Smuzhiyun 	.open =		nbd_open,
1554*4882a593Smuzhiyun 	.release =	nbd_release,
1555*4882a593Smuzhiyun 	.ioctl =	nbd_ioctl,
1556*4882a593Smuzhiyun 	.compat_ioctl =	nbd_ioctl,
1557*4882a593Smuzhiyun };
1558*4882a593Smuzhiyun 
1559*4882a593Smuzhiyun #if IS_ENABLED(CONFIG_DEBUG_FS)
1560*4882a593Smuzhiyun 
1561*4882a593Smuzhiyun static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
1562*4882a593Smuzhiyun {
1563*4882a593Smuzhiyun 	struct nbd_device *nbd = s->private;
1564*4882a593Smuzhiyun 
1565*4882a593Smuzhiyun 	if (nbd->task_recv)
1566*4882a593Smuzhiyun 		seq_printf(s, "recv: %d\n", task_pid_nr(nbd->task_recv));
1567*4882a593Smuzhiyun 
1568*4882a593Smuzhiyun 	return 0;
1569*4882a593Smuzhiyun }
1570*4882a593Smuzhiyun 
1571*4882a593Smuzhiyun static int nbd_dbg_tasks_open(struct inode *inode, struct file *file)
1572*4882a593Smuzhiyun {
1573*4882a593Smuzhiyun 	return single_open(file, nbd_dbg_tasks_show, inode->i_private);
1574*4882a593Smuzhiyun }
1575*4882a593Smuzhiyun 
1576*4882a593Smuzhiyun static const struct file_operations nbd_dbg_tasks_ops = {
1577*4882a593Smuzhiyun 	.open = nbd_dbg_tasks_open,
1578*4882a593Smuzhiyun 	.read = seq_read,
1579*4882a593Smuzhiyun 	.llseek = seq_lseek,
1580*4882a593Smuzhiyun 	.release = single_release,
1581*4882a593Smuzhiyun };
1582*4882a593Smuzhiyun 
1583*4882a593Smuzhiyun static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
1584*4882a593Smuzhiyun {
1585*4882a593Smuzhiyun 	struct nbd_device *nbd = s->private;
1586*4882a593Smuzhiyun 	u32 flags = nbd->config->flags;
1587*4882a593Smuzhiyun 
1588*4882a593Smuzhiyun 	seq_printf(s, "Hex: 0x%08x\n\n", flags);
1589*4882a593Smuzhiyun 
1590*4882a593Smuzhiyun 	seq_puts(s, "Known flags:\n");
1591*4882a593Smuzhiyun 
1592*4882a593Smuzhiyun 	if (flags & NBD_FLAG_HAS_FLAGS)
1593*4882a593Smuzhiyun 		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
1594*4882a593Smuzhiyun 	if (flags & NBD_FLAG_READ_ONLY)
1595*4882a593Smuzhiyun 		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
1596*4882a593Smuzhiyun 	if (flags & NBD_FLAG_SEND_FLUSH)
1597*4882a593Smuzhiyun 		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
1598*4882a593Smuzhiyun 	if (flags & NBD_FLAG_SEND_FUA)
1599*4882a593Smuzhiyun 		seq_puts(s, "NBD_FLAG_SEND_FUA\n");
1600*4882a593Smuzhiyun 	if (flags & NBD_FLAG_SEND_TRIM)
1601*4882a593Smuzhiyun 		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
1602*4882a593Smuzhiyun 
1603*4882a593Smuzhiyun 	return 0;
1604*4882a593Smuzhiyun }
1605*4882a593Smuzhiyun 
1606*4882a593Smuzhiyun static int nbd_dbg_flags_open(struct inode *inode, struct file *file)
1607*4882a593Smuzhiyun {
1608*4882a593Smuzhiyun 	return single_open(file, nbd_dbg_flags_show, inode->i_private);
1609*4882a593Smuzhiyun }
1610*4882a593Smuzhiyun 
1611*4882a593Smuzhiyun static const struct file_operations nbd_dbg_flags_ops = {
1612*4882a593Smuzhiyun 	.open = nbd_dbg_flags_open,
1613*4882a593Smuzhiyun 	.read = seq_read,
1614*4882a593Smuzhiyun 	.llseek = seq_lseek,
1615*4882a593Smuzhiyun 	.release = single_release,
1616*4882a593Smuzhiyun };
1617*4882a593Smuzhiyun 
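/*
 * Create the per-device debugfs directory with read-only views of the
 * receiving task, size, timeout, block size and server flags.
 */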
1618*4882a593Smuzhiyun static int nbd_dev_dbg_init(struct nbd_device *nbd)
1619*4882a593Smuzhiyun {
1620*4882a593Smuzhiyun 	struct dentry *dir;
1621*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
1622*4882a593Smuzhiyun 
1623*4882a593Smuzhiyun 	if (!nbd_dbg_dir)
1624*4882a593Smuzhiyun 		return -EIO;
1625*4882a593Smuzhiyun 
1626*4882a593Smuzhiyun 	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
1627*4882a593Smuzhiyun 	if (!dir) {
1628*4882a593Smuzhiyun 		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
1629*4882a593Smuzhiyun 			nbd_name(nbd));
1630*4882a593Smuzhiyun 		return -EIO;
1631*4882a593Smuzhiyun 	}
1632*4882a593Smuzhiyun 	config->dbg_dir = dir;
1633*4882a593Smuzhiyun 
1634*4882a593Smuzhiyun 	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops);
1635*4882a593Smuzhiyun 	debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
1636*4882a593Smuzhiyun 	debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
1637*4882a593Smuzhiyun 	debugfs_create_u64("blocksize", 0444, dir, &config->blksize);
1638*4882a593Smuzhiyun 	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops);
1639*4882a593Smuzhiyun 
1640*4882a593Smuzhiyun 	return 0;
1641*4882a593Smuzhiyun }
1642*4882a593Smuzhiyun 
1643*4882a593Smuzhiyun static void nbd_dev_dbg_close(struct nbd_device *nbd)
1644*4882a593Smuzhiyun {
1645*4882a593Smuzhiyun 	debugfs_remove_recursive(nbd->config->dbg_dir);
1646*4882a593Smuzhiyun }
1647*4882a593Smuzhiyun 
1648*4882a593Smuzhiyun static int nbd_dbg_init(void)
1649*4882a593Smuzhiyun {
1650*4882a593Smuzhiyun 	struct dentry *dbg_dir;
1651*4882a593Smuzhiyun 
1652*4882a593Smuzhiyun 	dbg_dir = debugfs_create_dir("nbd", NULL);
1653*4882a593Smuzhiyun 	if (!dbg_dir)
1654*4882a593Smuzhiyun 		return -EIO;
1655*4882a593Smuzhiyun 
1656*4882a593Smuzhiyun 	nbd_dbg_dir = dbg_dir;
1657*4882a593Smuzhiyun 
1658*4882a593Smuzhiyun 	return 0;
1659*4882a593Smuzhiyun }
1660*4882a593Smuzhiyun 
1661*4882a593Smuzhiyun static void nbd_dbg_close(void)
1662*4882a593Smuzhiyun {
1663*4882a593Smuzhiyun 	debugfs_remove_recursive(nbd_dbg_dir);
1664*4882a593Smuzhiyun }
1665*4882a593Smuzhiyun 
1666*4882a593Smuzhiyun #else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
1667*4882a593Smuzhiyun 
1668*4882a593Smuzhiyun static int nbd_dev_dbg_init(struct nbd_device *nbd)
1669*4882a593Smuzhiyun {
1670*4882a593Smuzhiyun 	return 0;
1671*4882a593Smuzhiyun }
1672*4882a593Smuzhiyun 
1673*4882a593Smuzhiyun static void nbd_dev_dbg_close(struct nbd_device *nbd)
1674*4882a593Smuzhiyun {
1675*4882a593Smuzhiyun }
1676*4882a593Smuzhiyun 
1677*4882a593Smuzhiyun static int nbd_dbg_init(void)
1678*4882a593Smuzhiyun {
1679*4882a593Smuzhiyun 	return 0;
1680*4882a593Smuzhiyun }
1681*4882a593Smuzhiyun 
1682*4882a593Smuzhiyun static void nbd_dbg_close(void)
1683*4882a593Smuzhiyun {
1684*4882a593Smuzhiyun }
1685*4882a593Smuzhiyun 
1686*4882a593Smuzhiyun #endif
1687*4882a593Smuzhiyun 
1688*4882a593Smuzhiyun static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
1689*4882a593Smuzhiyun 			    unsigned int hctx_idx, unsigned int numa_node)
1690*4882a593Smuzhiyun {
1691*4882a593Smuzhiyun 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
1692*4882a593Smuzhiyun 	cmd->nbd = set->driver_data;
1693*4882a593Smuzhiyun 	cmd->flags = 0;
1694*4882a593Smuzhiyun 	mutex_init(&cmd->lock);
1695*4882a593Smuzhiyun 	return 0;
1696*4882a593Smuzhiyun }
1697*4882a593Smuzhiyun 
1698*4882a593Smuzhiyun static const struct blk_mq_ops nbd_mq_ops = {
1699*4882a593Smuzhiyun 	.queue_rq	= nbd_queue_rq,
1700*4882a593Smuzhiyun 	.complete	= nbd_complete_rq,
1701*4882a593Smuzhiyun 	.init_request	= nbd_init_request,
1702*4882a593Smuzhiyun 	.timeout	= nbd_xmit_timeout,
1703*4882a593Smuzhiyun };
1704*4882a593Smuzhiyun 
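/*
 * Allocate and register a new nbd device: reserve an index in the IDR
 * (the requested one, or the first free slot for index < 0), set up the
 * blk-mq tag set and request queue with the driver defaults, and add
 * the gendisk as nbd<index>.  Returns the index on success or a
 * negative errno.
 */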
1705*4882a593Smuzhiyun static int nbd_dev_add(int index)
1706*4882a593Smuzhiyun {
1707*4882a593Smuzhiyun 	struct nbd_device *nbd;
1708*4882a593Smuzhiyun 	struct gendisk *disk;
1709*4882a593Smuzhiyun 	struct request_queue *q;
1710*4882a593Smuzhiyun 	int err = -ENOMEM;
1711*4882a593Smuzhiyun 
1712*4882a593Smuzhiyun 	nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
1713*4882a593Smuzhiyun 	if (!nbd)
1714*4882a593Smuzhiyun 		goto out;
1715*4882a593Smuzhiyun 
1716*4882a593Smuzhiyun 	disk = alloc_disk(1 << part_shift);
1717*4882a593Smuzhiyun 	if (!disk)
1718*4882a593Smuzhiyun 		goto out_free_nbd;
1719*4882a593Smuzhiyun 
1720*4882a593Smuzhiyun 	if (index >= 0) {
1721*4882a593Smuzhiyun 		err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
1722*4882a593Smuzhiyun 				GFP_KERNEL);
1723*4882a593Smuzhiyun 		if (err == -ENOSPC)
1724*4882a593Smuzhiyun 			err = -EEXIST;
1725*4882a593Smuzhiyun 	} else {
1726*4882a593Smuzhiyun 		err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
1727*4882a593Smuzhiyun 		if (err >= 0)
1728*4882a593Smuzhiyun 			index = err;
1729*4882a593Smuzhiyun 	}
1730*4882a593Smuzhiyun 	if (err < 0)
1731*4882a593Smuzhiyun 		goto out_free_disk;
1732*4882a593Smuzhiyun 
1733*4882a593Smuzhiyun 	nbd->index = index;
1734*4882a593Smuzhiyun 	nbd->disk = disk;
1735*4882a593Smuzhiyun 	nbd->tag_set.ops = &nbd_mq_ops;
1736*4882a593Smuzhiyun 	nbd->tag_set.nr_hw_queues = 1;
1737*4882a593Smuzhiyun 	nbd->tag_set.queue_depth = 128;
1738*4882a593Smuzhiyun 	nbd->tag_set.numa_node = NUMA_NO_NODE;
1739*4882a593Smuzhiyun 	nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
1740*4882a593Smuzhiyun 	nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
1741*4882a593Smuzhiyun 		BLK_MQ_F_BLOCKING;
1742*4882a593Smuzhiyun 	nbd->tag_set.driver_data = nbd;
1743*4882a593Smuzhiyun 	nbd->destroy_complete = NULL;
1744*4882a593Smuzhiyun 
1745*4882a593Smuzhiyun 	err = blk_mq_alloc_tag_set(&nbd->tag_set);
1746*4882a593Smuzhiyun 	if (err)
1747*4882a593Smuzhiyun 		goto out_free_idr;
1748*4882a593Smuzhiyun 
1749*4882a593Smuzhiyun 	q = blk_mq_init_queue(&nbd->tag_set);
1750*4882a593Smuzhiyun 	if (IS_ERR(q)) {
1751*4882a593Smuzhiyun 		err = PTR_ERR(q);
1752*4882a593Smuzhiyun 		goto out_free_tags;
1753*4882a593Smuzhiyun 	}
1754*4882a593Smuzhiyun 	disk->queue = q;
1755*4882a593Smuzhiyun 
1756*4882a593Smuzhiyun 	/*
1757*4882a593Smuzhiyun 	 * Tell the block layer that we are not a rotational device
1758*4882a593Smuzhiyun 	 */
1759*4882a593Smuzhiyun 	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
1760*4882a593Smuzhiyun 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
1761*4882a593Smuzhiyun 	disk->queue->limits.discard_granularity = 0;
1762*4882a593Smuzhiyun 	disk->queue->limits.discard_alignment = 0;
1763*4882a593Smuzhiyun 	blk_queue_max_discard_sectors(disk->queue, 0);
1764*4882a593Smuzhiyun 	blk_queue_max_segment_size(disk->queue, UINT_MAX);
1765*4882a593Smuzhiyun 	blk_queue_max_segments(disk->queue, USHRT_MAX);
1766*4882a593Smuzhiyun 	blk_queue_max_hw_sectors(disk->queue, 65536);
1767*4882a593Smuzhiyun 	disk->queue->limits.max_sectors = 256;
1768*4882a593Smuzhiyun 
1769*4882a593Smuzhiyun 	mutex_init(&nbd->config_lock);
1770*4882a593Smuzhiyun 	refcount_set(&nbd->config_refs, 0);
1771*4882a593Smuzhiyun 	refcount_set(&nbd->refs, 1);
1772*4882a593Smuzhiyun 	INIT_LIST_HEAD(&nbd->list);
1773*4882a593Smuzhiyun 	disk->major = NBD_MAJOR;
1774*4882a593Smuzhiyun 	disk->first_minor = index << part_shift;
1775*4882a593Smuzhiyun 	disk->fops = &nbd_fops;
1776*4882a593Smuzhiyun 	disk->private_data = nbd;
1777*4882a593Smuzhiyun 	sprintf(disk->disk_name, "nbd%d", index);
1778*4882a593Smuzhiyun 	add_disk(disk);
1779*4882a593Smuzhiyun 	nbd_total_devices++;
1780*4882a593Smuzhiyun 	return index;
1781*4882a593Smuzhiyun 
1782*4882a593Smuzhiyun out_free_tags:
1783*4882a593Smuzhiyun 	blk_mq_free_tag_set(&nbd->tag_set);
1784*4882a593Smuzhiyun out_free_idr:
1785*4882a593Smuzhiyun 	idr_remove(&nbd_index_idr, index);
1786*4882a593Smuzhiyun out_free_disk:
1787*4882a593Smuzhiyun 	put_disk(disk);
1788*4882a593Smuzhiyun out_free_nbd:
1789*4882a593Smuzhiyun 	kfree(nbd);
1790*4882a593Smuzhiyun out:
1791*4882a593Smuzhiyun 	return err;
1792*4882a593Smuzhiyun }
1793*4882a593Smuzhiyun 
1794*4882a593Smuzhiyun static int find_free_cb(int id, void *ptr, void *data)
1795*4882a593Smuzhiyun {
1796*4882a593Smuzhiyun 	struct nbd_device *nbd = ptr;
1797*4882a593Smuzhiyun 	struct nbd_device **found = data;
1798*4882a593Smuzhiyun 
1799*4882a593Smuzhiyun 	if (!refcount_read(&nbd->config_refs)) {
1800*4882a593Smuzhiyun 		*found = nbd;
1801*4882a593Smuzhiyun 		return 1;
1802*4882a593Smuzhiyun 	}
1803*4882a593Smuzhiyun 	return 0;
1804*4882a593Smuzhiyun }
1805*4882a593Smuzhiyun 
1806*4882a593Smuzhiyun /* Netlink interface. */
1807*4882a593Smuzhiyun static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
1808*4882a593Smuzhiyun 	[NBD_ATTR_INDEX]		=	{ .type = NLA_U32 },
1809*4882a593Smuzhiyun 	[NBD_ATTR_SIZE_BYTES]		=	{ .type = NLA_U64 },
1810*4882a593Smuzhiyun 	[NBD_ATTR_BLOCK_SIZE_BYTES]	=	{ .type = NLA_U64 },
1811*4882a593Smuzhiyun 	[NBD_ATTR_TIMEOUT]		=	{ .type = NLA_U64 },
1812*4882a593Smuzhiyun 	[NBD_ATTR_SERVER_FLAGS]		=	{ .type = NLA_U64 },
1813*4882a593Smuzhiyun 	[NBD_ATTR_CLIENT_FLAGS]		=	{ .type = NLA_U64 },
1814*4882a593Smuzhiyun 	[NBD_ATTR_SOCKETS]		=	{ .type = NLA_NESTED},
1815*4882a593Smuzhiyun 	[NBD_ATTR_DEAD_CONN_TIMEOUT]	=	{ .type = NLA_U64 },
1816*4882a593Smuzhiyun 	[NBD_ATTR_DEVICE_LIST]		=	{ .type = NLA_NESTED},
1817*4882a593Smuzhiyun };
1818*4882a593Smuzhiyun 
1819*4882a593Smuzhiyun static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
1820*4882a593Smuzhiyun 	[NBD_SOCK_FD]			=	{ .type = NLA_U32 },
1821*4882a593Smuzhiyun };
1822*4882a593Smuzhiyun 
1823*4882a593Smuzhiyun /* We don't use this right now since we don't parse the incoming list, but we
1824*4882a593Smuzhiyun  * still want it here so userspace knows what to expect.
1825*4882a593Smuzhiyun  */
1826*4882a593Smuzhiyun static const struct nla_policy __attribute__((unused))
1827*4882a593Smuzhiyun nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
1828*4882a593Smuzhiyun 	[NBD_DEVICE_INDEX]		=	{ .type = NLA_U32 },
1829*4882a593Smuzhiyun 	[NBD_DEVICE_CONNECTED]		=	{ .type = NLA_U8 },
1830*4882a593Smuzhiyun };
1831*4882a593Smuzhiyun 
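/*
 * Apply NBD_ATTR_SIZE_BYTES and NBD_ATTR_BLOCK_SIZE_BYTES from a
 * netlink request, keeping the current values for attributes that are
 * absent and rejecting invalid block sizes.
 */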
1832*4882a593Smuzhiyun static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
1833*4882a593Smuzhiyun {
1834*4882a593Smuzhiyun 	struct nbd_config *config = nbd->config;
1835*4882a593Smuzhiyun 	u64 bsize = config->blksize;
1836*4882a593Smuzhiyun 	u64 bytes = config->bytesize;
1837*4882a593Smuzhiyun 
1838*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_SIZE_BYTES])
1839*4882a593Smuzhiyun 		bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
1840*4882a593Smuzhiyun 
1841*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]) {
1842*4882a593Smuzhiyun 		bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
1843*4882a593Smuzhiyun 		if (!bsize)
1844*4882a593Smuzhiyun 			bsize = NBD_DEF_BLKSIZE;
1845*4882a593Smuzhiyun 		if (!nbd_is_valid_blksize(bsize)) {
1846*4882a593Smuzhiyun 			printk(KERN_ERR "Invalid block size %llu\n", bsize);
1847*4882a593Smuzhiyun 			return -EINVAL;
1848*4882a593Smuzhiyun 		}
1849*4882a593Smuzhiyun 	}
1850*4882a593Smuzhiyun 
1851*4882a593Smuzhiyun 	if (bytes != config->bytesize || bsize != config->blksize)
1852*4882a593Smuzhiyun 		nbd_size_set(nbd, bsize, div64_u64(bytes, bsize));
1853*4882a593Smuzhiyun 	return 0;
1854*4882a593Smuzhiyun }
1855*4882a593Smuzhiyun 
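/*
 * NBD_CMD_CONNECT: find or create the device for the requested index,
 * allocate its config, apply the size, timeout and flag attributes,
 * add every socket in NBD_ATTR_SOCKETS and start the device, replying
 * to userspace with the index that was used.
 */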
1856*4882a593Smuzhiyun static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
1857*4882a593Smuzhiyun {
1858*4882a593Smuzhiyun 	DECLARE_COMPLETION_ONSTACK(destroy_complete);
1859*4882a593Smuzhiyun 	struct nbd_device *nbd = NULL;
1860*4882a593Smuzhiyun 	struct nbd_config *config;
1861*4882a593Smuzhiyun 	int index = -1;
1862*4882a593Smuzhiyun 	int ret;
1863*4882a593Smuzhiyun 	bool put_dev = false;
1864*4882a593Smuzhiyun 
1865*4882a593Smuzhiyun 	if (!netlink_capable(skb, CAP_SYS_ADMIN))
1866*4882a593Smuzhiyun 		return -EPERM;
1867*4882a593Smuzhiyun 
1868*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_INDEX])
1869*4882a593Smuzhiyun 		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
1870*4882a593Smuzhiyun 	if (!info->attrs[NBD_ATTR_SOCKETS]) {
1871*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: must specify at least one socket\n");
1872*4882a593Smuzhiyun 		return -EINVAL;
1873*4882a593Smuzhiyun 	}
1874*4882a593Smuzhiyun 	if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
1875*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: must specify a size in bytes for the device\n");
1876*4882a593Smuzhiyun 		return -EINVAL;
1877*4882a593Smuzhiyun 	}
1878*4882a593Smuzhiyun again:
1879*4882a593Smuzhiyun 	mutex_lock(&nbd_index_mutex);
1880*4882a593Smuzhiyun 	if (index == -1) {
1881*4882a593Smuzhiyun 		ret = idr_for_each(&nbd_index_idr, &find_free_cb, &nbd);
1882*4882a593Smuzhiyun 		if (ret == 0) {
1883*4882a593Smuzhiyun 			int new_index;
1884*4882a593Smuzhiyun 			new_index = nbd_dev_add(-1);
1885*4882a593Smuzhiyun 			if (new_index < 0) {
1886*4882a593Smuzhiyun 				mutex_unlock(&nbd_index_mutex);
1887*4882a593Smuzhiyun 				printk(KERN_ERR "nbd: failed to add new device\n");
1888*4882a593Smuzhiyun 				return new_index;
1889*4882a593Smuzhiyun 			}
1890*4882a593Smuzhiyun 			nbd = idr_find(&nbd_index_idr, new_index);
1891*4882a593Smuzhiyun 		}
1892*4882a593Smuzhiyun 	} else {
1893*4882a593Smuzhiyun 		nbd = idr_find(&nbd_index_idr, index);
1894*4882a593Smuzhiyun 		if (!nbd) {
1895*4882a593Smuzhiyun 			ret = nbd_dev_add(index);
1896*4882a593Smuzhiyun 			if (ret < 0) {
1897*4882a593Smuzhiyun 				mutex_unlock(&nbd_index_mutex);
1898*4882a593Smuzhiyun 				printk(KERN_ERR "nbd: failed to add new device\n");
1899*4882a593Smuzhiyun 				return ret;
1900*4882a593Smuzhiyun 			}
1901*4882a593Smuzhiyun 			nbd = idr_find(&nbd_index_idr, index);
1902*4882a593Smuzhiyun 		}
1903*4882a593Smuzhiyun 	}
1904*4882a593Smuzhiyun 	if (!nbd) {
1905*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: couldn't find device at index %d\n",
1906*4882a593Smuzhiyun 		       index);
1907*4882a593Smuzhiyun 		mutex_unlock(&nbd_index_mutex);
1908*4882a593Smuzhiyun 		return -EINVAL;
1909*4882a593Smuzhiyun 	}
1910*4882a593Smuzhiyun 
1911*4882a593Smuzhiyun 	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
1912*4882a593Smuzhiyun 	    test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) {
1913*4882a593Smuzhiyun 		nbd->destroy_complete = &destroy_complete;
1914*4882a593Smuzhiyun 		mutex_unlock(&nbd_index_mutex);
1915*4882a593Smuzhiyun 
1916*4882a593Smuzhiyun 		/* Wait until the nbd device is completely destroyed */
1917*4882a593Smuzhiyun 		wait_for_completion(&destroy_complete);
1918*4882a593Smuzhiyun 		goto again;
1919*4882a593Smuzhiyun 	}
1920*4882a593Smuzhiyun 
1921*4882a593Smuzhiyun 	if (!refcount_inc_not_zero(&nbd->refs)) {
1922*4882a593Smuzhiyun 		mutex_unlock(&nbd_index_mutex);
1923*4882a593Smuzhiyun 		if (index == -1)
1924*4882a593Smuzhiyun 			goto again;
1925*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: device at index %d is going down\n",
1926*4882a593Smuzhiyun 		       index);
1927*4882a593Smuzhiyun 		return -EINVAL;
1928*4882a593Smuzhiyun 	}
1929*4882a593Smuzhiyun 	mutex_unlock(&nbd_index_mutex);
1930*4882a593Smuzhiyun 
1931*4882a593Smuzhiyun 	mutex_lock(&nbd->config_lock);
1932*4882a593Smuzhiyun 	if (refcount_read(&nbd->config_refs)) {
1933*4882a593Smuzhiyun 		mutex_unlock(&nbd->config_lock);
1934*4882a593Smuzhiyun 		nbd_put(nbd);
1935*4882a593Smuzhiyun 		if (index == -1)
1936*4882a593Smuzhiyun 			goto again;
1937*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: nbd%d already in use\n", index);
1938*4882a593Smuzhiyun 		return -EBUSY;
1939*4882a593Smuzhiyun 	}
1940*4882a593Smuzhiyun 	if (WARN_ON(nbd->config)) {
1941*4882a593Smuzhiyun 		mutex_unlock(&nbd->config_lock);
1942*4882a593Smuzhiyun 		nbd_put(nbd);
1943*4882a593Smuzhiyun 		return -EINVAL;
1944*4882a593Smuzhiyun 	}
1945*4882a593Smuzhiyun 	config = nbd_alloc_config();
1946*4882a593Smuzhiyun 	if (IS_ERR(config)) {
1947*4882a593Smuzhiyun 		mutex_unlock(&nbd->config_lock);
1948*4882a593Smuzhiyun 		nbd_put(nbd);
1949*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: couldn't allocate config\n");
1950*4882a593Smuzhiyun 		return PTR_ERR(config);
1951*4882a593Smuzhiyun 	}
1952*4882a593Smuzhiyun 	nbd->config = config;
1953*4882a593Smuzhiyun 	refcount_set(&nbd->config_refs, 1);
1954*4882a593Smuzhiyun 	set_bit(NBD_RT_BOUND, &config->runtime_flags);
1955*4882a593Smuzhiyun 
1956*4882a593Smuzhiyun 	ret = nbd_genl_size_set(info, nbd);
1957*4882a593Smuzhiyun 	if (ret)
1958*4882a593Smuzhiyun 		goto out;
1959*4882a593Smuzhiyun 
1960*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_TIMEOUT])
1961*4882a593Smuzhiyun 		nbd_set_cmd_timeout(nbd,
1962*4882a593Smuzhiyun 				    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
1963*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
1964*4882a593Smuzhiyun 		config->dead_conn_timeout =
1965*4882a593Smuzhiyun 			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
1966*4882a593Smuzhiyun 		config->dead_conn_timeout *= HZ;
1967*4882a593Smuzhiyun 	}
1968*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_SERVER_FLAGS])
1969*4882a593Smuzhiyun 		config->flags =
1970*4882a593Smuzhiyun 			nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
1971*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
1972*4882a593Smuzhiyun 		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
1973*4882a593Smuzhiyun 		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
1974*4882a593Smuzhiyun 			/*
1975*4882a593Smuzhiyun 			 * We have 1 ref to keep the device around, and then 1
1976*4882a593Smuzhiyun 			 * ref for our current operation here, which will be
1977*4882a593Smuzhiyun 			 * inherited by the config.  If we already have
1978*4882a593Smuzhiyun 			 * DESTROY_ON_DISCONNECT set then we know we don't have
1979*4882a593Smuzhiyun 			 * that extra ref already held so we don't need the
1980*4882a593Smuzhiyun 			 * put_dev.
1981*4882a593Smuzhiyun 			 */
1982*4882a593Smuzhiyun 			if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
1983*4882a593Smuzhiyun 					      &nbd->flags))
1984*4882a593Smuzhiyun 				put_dev = true;
1985*4882a593Smuzhiyun 		} else {
1986*4882a593Smuzhiyun 			if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
1987*4882a593Smuzhiyun 					       &nbd->flags))
1988*4882a593Smuzhiyun 				refcount_inc(&nbd->refs);
1989*4882a593Smuzhiyun 		}
1990*4882a593Smuzhiyun 		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
1991*4882a593Smuzhiyun 			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
1992*4882a593Smuzhiyun 				&config->runtime_flags);
1993*4882a593Smuzhiyun 		}
1994*4882a593Smuzhiyun 	}
1995*4882a593Smuzhiyun 
1996*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_SOCKETS]) {
1997*4882a593Smuzhiyun 		struct nlattr *attr;
1998*4882a593Smuzhiyun 		int rem, fd;
1999*4882a593Smuzhiyun 
2000*4882a593Smuzhiyun 		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
2001*4882a593Smuzhiyun 				    rem) {
2002*4882a593Smuzhiyun 			struct nlattr *socks[NBD_SOCK_MAX+1];
2003*4882a593Smuzhiyun 
2004*4882a593Smuzhiyun 			if (nla_type(attr) != NBD_SOCK_ITEM) {
2005*4882a593Smuzhiyun 				printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
2006*4882a593Smuzhiyun 				ret = -EINVAL;
2007*4882a593Smuzhiyun 				goto out;
2008*4882a593Smuzhiyun 			}
2009*4882a593Smuzhiyun 			ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
2010*4882a593Smuzhiyun 							  attr,
2011*4882a593Smuzhiyun 							  nbd_sock_policy,
2012*4882a593Smuzhiyun 							  info->extack);
2013*4882a593Smuzhiyun 			if (ret != 0) {
2014*4882a593Smuzhiyun 				printk(KERN_ERR "nbd: error processing sock list\n");
2015*4882a593Smuzhiyun 				ret = -EINVAL;
2016*4882a593Smuzhiyun 				goto out;
2017*4882a593Smuzhiyun 			}
2018*4882a593Smuzhiyun 			if (!socks[NBD_SOCK_FD])
2019*4882a593Smuzhiyun 				continue;
2020*4882a593Smuzhiyun 			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
2021*4882a593Smuzhiyun 			ret = nbd_add_socket(nbd, fd, true);
2022*4882a593Smuzhiyun 			if (ret)
2023*4882a593Smuzhiyun 				goto out;
2024*4882a593Smuzhiyun 		}
2025*4882a593Smuzhiyun 	}
2026*4882a593Smuzhiyun 	ret = nbd_start_device(nbd);
2027*4882a593Smuzhiyun out:
2028*4882a593Smuzhiyun 	mutex_unlock(&nbd->config_lock);
2029*4882a593Smuzhiyun 	if (!ret) {
2030*4882a593Smuzhiyun 		set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
2031*4882a593Smuzhiyun 		refcount_inc(&nbd->config_refs);
2032*4882a593Smuzhiyun 		nbd_connect_reply(info, nbd->index);
2033*4882a593Smuzhiyun 	}
2034*4882a593Smuzhiyun 	nbd_config_put(nbd);
2035*4882a593Smuzhiyun 	if (put_dev)
2036*4882a593Smuzhiyun 		nbd_put(nbd);
2037*4882a593Smuzhiyun 	return ret;
2038*4882a593Smuzhiyun }
2039*4882a593Smuzhiyun 
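/*
 * Force a disconnect: send disconnect requests, shut the sockets down,
 * wait for the recv threads, clear the queue and drop the config
 * reference held on behalf of the netlink interface.
 */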
2040*4882a593Smuzhiyun static void nbd_disconnect_and_put(struct nbd_device *nbd)
2041*4882a593Smuzhiyun {
2042*4882a593Smuzhiyun 	mutex_lock(&nbd->config_lock);
2043*4882a593Smuzhiyun 	nbd_disconnect(nbd);
2044*4882a593Smuzhiyun 	sock_shutdown(nbd);
2045*4882a593Smuzhiyun 	wake_up(&nbd->config->conn_wait);
2046*4882a593Smuzhiyun 	/*
2047*4882a593Smuzhiyun 	 * Make sure recv thread has finished, so it does not drop the last
2048*4882a593Smuzhiyun 	 * config ref and try to destroy the workqueue from inside the work
2049*4882a593Smuzhiyun 	 * queue. This also ensures that we can safely call nbd_clear_que()
2050*4882a593Smuzhiyun 	 * to cancel the inflight I/Os.
2051*4882a593Smuzhiyun 	 */
2052*4882a593Smuzhiyun 	if (nbd->recv_workq)
2053*4882a593Smuzhiyun 		flush_workqueue(nbd->recv_workq);
2054*4882a593Smuzhiyun 	nbd_clear_que(nbd);
2055*4882a593Smuzhiyun 	nbd->task_setup = NULL;
2056*4882a593Smuzhiyun 	mutex_unlock(&nbd->config_lock);
2057*4882a593Smuzhiyun 
2058*4882a593Smuzhiyun 	if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
2059*4882a593Smuzhiyun 			       &nbd->config->runtime_flags))
2060*4882a593Smuzhiyun 		nbd_config_put(nbd);
2061*4882a593Smuzhiyun }
2062*4882a593Smuzhiyun 
2063*4882a593Smuzhiyun static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
2064*4882a593Smuzhiyun {
2065*4882a593Smuzhiyun 	struct nbd_device *nbd;
2066*4882a593Smuzhiyun 	int index;
2067*4882a593Smuzhiyun 
2068*4882a593Smuzhiyun 	if (!netlink_capable(skb, CAP_SYS_ADMIN))
2069*4882a593Smuzhiyun 		return -EPERM;
2070*4882a593Smuzhiyun 
2071*4882a593Smuzhiyun 	if (!info->attrs[NBD_ATTR_INDEX]) {
2072*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: must specify an index to disconnect\n");
2073*4882a593Smuzhiyun 		return -EINVAL;
2074*4882a593Smuzhiyun 	}
2075*4882a593Smuzhiyun 	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2076*4882a593Smuzhiyun 	mutex_lock(&nbd_index_mutex);
2077*4882a593Smuzhiyun 	nbd = idr_find(&nbd_index_idr, index);
2078*4882a593Smuzhiyun 	if (!nbd) {
2079*4882a593Smuzhiyun 		mutex_unlock(&nbd_index_mutex);
2080*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: couldn't find device at index %d\n",
2081*4882a593Smuzhiyun 		       index);
2082*4882a593Smuzhiyun 		return -EINVAL;
2083*4882a593Smuzhiyun 	}
2084*4882a593Smuzhiyun 	if (!refcount_inc_not_zero(&nbd->refs)) {
2085*4882a593Smuzhiyun 		mutex_unlock(&nbd_index_mutex);
2086*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: device at index %d is going down\n",
2087*4882a593Smuzhiyun 		       index);
2088*4882a593Smuzhiyun 		return -EINVAL;
2089*4882a593Smuzhiyun 	}
2090*4882a593Smuzhiyun 	mutex_unlock(&nbd_index_mutex);
2091*4882a593Smuzhiyun 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
2092*4882a593Smuzhiyun 		nbd_put(nbd);
2093*4882a593Smuzhiyun 		return 0;
2094*4882a593Smuzhiyun 	}
2095*4882a593Smuzhiyun 	nbd_disconnect_and_put(nbd);
2096*4882a593Smuzhiyun 	nbd_config_put(nbd);
2097*4882a593Smuzhiyun 	nbd_put(nbd);
2098*4882a593Smuzhiyun 	return 0;
2099*4882a593Smuzhiyun }
2100*4882a593Smuzhiyun 
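/*
 * NBD_CMD_RECONFIGURE: update timeouts and client flags on a running
 * device and, for each socket supplied, replace a dead connection via
 * nbd_reconnect_socket().  A full connection table (-ENOSPC) is not
 * treated as an error.
 */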
2101*4882a593Smuzhiyun static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
2102*4882a593Smuzhiyun {
2103*4882a593Smuzhiyun 	struct nbd_device *nbd = NULL;
2104*4882a593Smuzhiyun 	struct nbd_config *config;
2105*4882a593Smuzhiyun 	int index;
2106*4882a593Smuzhiyun 	int ret = 0;
2107*4882a593Smuzhiyun 	bool put_dev = false;
2108*4882a593Smuzhiyun 
2109*4882a593Smuzhiyun 	if (!netlink_capable(skb, CAP_SYS_ADMIN))
2110*4882a593Smuzhiyun 		return -EPERM;
2111*4882a593Smuzhiyun 
2112*4882a593Smuzhiyun 	if (!info->attrs[NBD_ATTR_INDEX]) {
2113*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: must specify a device to reconfigure\n");
2114*4882a593Smuzhiyun 		return -EINVAL;
2115*4882a593Smuzhiyun 	}
2116*4882a593Smuzhiyun 	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2117*4882a593Smuzhiyun 	mutex_lock(&nbd_index_mutex);
2118*4882a593Smuzhiyun 	nbd = idr_find(&nbd_index_idr, index);
2119*4882a593Smuzhiyun 	if (!nbd) {
2120*4882a593Smuzhiyun 		mutex_unlock(&nbd_index_mutex);
2121*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: couldn't find a device at index %d\n",
2122*4882a593Smuzhiyun 		       index);
2123*4882a593Smuzhiyun 		return -EINVAL;
2124*4882a593Smuzhiyun 	}
2125*4882a593Smuzhiyun 	if (!refcount_inc_not_zero(&nbd->refs)) {
2126*4882a593Smuzhiyun 		mutex_unlock(&nbd_index_mutex);
2127*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: device at index %d is going down\n",
2128*4882a593Smuzhiyun 		       index);
2129*4882a593Smuzhiyun 		return -EINVAL;
2130*4882a593Smuzhiyun 	}
2131*4882a593Smuzhiyun 	mutex_unlock(&nbd_index_mutex);
2132*4882a593Smuzhiyun 
2133*4882a593Smuzhiyun 	if (!refcount_inc_not_zero(&nbd->config_refs)) {
2134*4882a593Smuzhiyun 		dev_err(nbd_to_dev(nbd),
2135*4882a593Smuzhiyun 			"not configured, cannot reconfigure\n");
2136*4882a593Smuzhiyun 		nbd_put(nbd);
2137*4882a593Smuzhiyun 		return -EINVAL;
2138*4882a593Smuzhiyun 	}
2139*4882a593Smuzhiyun 
2140*4882a593Smuzhiyun 	mutex_lock(&nbd->config_lock);
2141*4882a593Smuzhiyun 	config = nbd->config;
2142*4882a593Smuzhiyun 	if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
2143*4882a593Smuzhiyun 	    !nbd->task_recv) {
2144*4882a593Smuzhiyun 		dev_err(nbd_to_dev(nbd),
2145*4882a593Smuzhiyun 			"not configured, cannot reconfigure\n");
2146*4882a593Smuzhiyun 		ret = -EINVAL;
2147*4882a593Smuzhiyun 		goto out;
2148*4882a593Smuzhiyun 	}
2149*4882a593Smuzhiyun 
2150*4882a593Smuzhiyun 	ret = nbd_genl_size_set(info, nbd);
2151*4882a593Smuzhiyun 	if (ret)
2152*4882a593Smuzhiyun 		goto out;
2153*4882a593Smuzhiyun 
2154*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_TIMEOUT])
2155*4882a593Smuzhiyun 		nbd_set_cmd_timeout(nbd,
2156*4882a593Smuzhiyun 				    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
2157*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
2158*4882a593Smuzhiyun 		config->dead_conn_timeout =
2159*4882a593Smuzhiyun 			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
2160*4882a593Smuzhiyun 		config->dead_conn_timeout *= HZ;
2161*4882a593Smuzhiyun 	}
2162*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
2163*4882a593Smuzhiyun 		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
2164*4882a593Smuzhiyun 		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
2165*4882a593Smuzhiyun 			if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
2166*4882a593Smuzhiyun 					      &nbd->flags))
2167*4882a593Smuzhiyun 				put_dev = true;
2168*4882a593Smuzhiyun 		} else {
2169*4882a593Smuzhiyun 			if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
2170*4882a593Smuzhiyun 					       &nbd->flags))
2171*4882a593Smuzhiyun 				refcount_inc(&nbd->refs);
2172*4882a593Smuzhiyun 		}
2173*4882a593Smuzhiyun 
2174*4882a593Smuzhiyun 		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
2175*4882a593Smuzhiyun 			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
2176*4882a593Smuzhiyun 					&config->runtime_flags);
2177*4882a593Smuzhiyun 		} else {
2178*4882a593Smuzhiyun 			clear_bit(NBD_RT_DISCONNECT_ON_CLOSE,
2179*4882a593Smuzhiyun 					&config->runtime_flags);
2180*4882a593Smuzhiyun 		}
2181*4882a593Smuzhiyun 	}
2182*4882a593Smuzhiyun 
2183*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_SOCKETS]) {
2184*4882a593Smuzhiyun 		struct nlattr *attr;
2185*4882a593Smuzhiyun 		int rem, fd;
2186*4882a593Smuzhiyun 
2187*4882a593Smuzhiyun 		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
2188*4882a593Smuzhiyun 				    rem) {
2189*4882a593Smuzhiyun 			struct nlattr *socks[NBD_SOCK_MAX+1];
2190*4882a593Smuzhiyun 
2191*4882a593Smuzhiyun 			if (nla_type(attr) != NBD_SOCK_ITEM) {
2192*4882a593Smuzhiyun 				printk(KERN_ERR "nbd: socks must be embedded in a SOCK_ITEM attr\n");
2193*4882a593Smuzhiyun 				ret = -EINVAL;
2194*4882a593Smuzhiyun 				goto out;
2195*4882a593Smuzhiyun 			}
2196*4882a593Smuzhiyun 			ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
2197*4882a593Smuzhiyun 							  attr,
2198*4882a593Smuzhiyun 							  nbd_sock_policy,
2199*4882a593Smuzhiyun 							  info->extack);
2200*4882a593Smuzhiyun 			if (ret != 0) {
2201*4882a593Smuzhiyun 				printk(KERN_ERR "nbd: error processing sock list\n");
2202*4882a593Smuzhiyun 				ret = -EINVAL;
2203*4882a593Smuzhiyun 				goto out;
2204*4882a593Smuzhiyun 			}
2205*4882a593Smuzhiyun 			if (!socks[NBD_SOCK_FD])
2206*4882a593Smuzhiyun 				continue;
2207*4882a593Smuzhiyun 			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
2208*4882a593Smuzhiyun 			ret = nbd_reconnect_socket(nbd, fd);
2209*4882a593Smuzhiyun 			if (ret) {
2210*4882a593Smuzhiyun 				if (ret == -ENOSPC)
2211*4882a593Smuzhiyun 					ret = 0;
2212*4882a593Smuzhiyun 				goto out;
2213*4882a593Smuzhiyun 			}
2214*4882a593Smuzhiyun 			dev_info(nbd_to_dev(nbd), "reconnected socket\n");
2215*4882a593Smuzhiyun 		}
2216*4882a593Smuzhiyun 	}
2217*4882a593Smuzhiyun out:
2218*4882a593Smuzhiyun 	mutex_unlock(&nbd->config_lock);
2219*4882a593Smuzhiyun 	nbd_config_put(nbd);
2220*4882a593Smuzhiyun 	nbd_put(nbd);
2221*4882a593Smuzhiyun 	if (put_dev)
2222*4882a593Smuzhiyun 		nbd_put(nbd);
2223*4882a593Smuzhiyun 	return ret;
2224*4882a593Smuzhiyun }
2225*4882a593Smuzhiyun 
2226*4882a593Smuzhiyun static const struct genl_small_ops nbd_connect_genl_ops[] = {
2227*4882a593Smuzhiyun 	{
2228*4882a593Smuzhiyun 		.cmd	= NBD_CMD_CONNECT,
2229*4882a593Smuzhiyun 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2230*4882a593Smuzhiyun 		.doit	= nbd_genl_connect,
2231*4882a593Smuzhiyun 	},
2232*4882a593Smuzhiyun 	{
2233*4882a593Smuzhiyun 		.cmd	= NBD_CMD_DISCONNECT,
2234*4882a593Smuzhiyun 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2235*4882a593Smuzhiyun 		.doit	= nbd_genl_disconnect,
2236*4882a593Smuzhiyun 	},
2237*4882a593Smuzhiyun 	{
2238*4882a593Smuzhiyun 		.cmd	= NBD_CMD_RECONFIGURE,
2239*4882a593Smuzhiyun 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2240*4882a593Smuzhiyun 		.doit	= nbd_genl_reconfigure,
2241*4882a593Smuzhiyun 	},
2242*4882a593Smuzhiyun 	{
2243*4882a593Smuzhiyun 		.cmd	= NBD_CMD_STATUS,
2244*4882a593Smuzhiyun 		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
2245*4882a593Smuzhiyun 		.doit	= nbd_genl_status,
2246*4882a593Smuzhiyun 	},
2247*4882a593Smuzhiyun };
2248*4882a593Smuzhiyun 
2249*4882a593Smuzhiyun static const struct genl_multicast_group nbd_mcast_grps[] = {
2250*4882a593Smuzhiyun 	{ .name = NBD_GENL_MCAST_GROUP_NAME, },
2251*4882a593Smuzhiyun };
2252*4882a593Smuzhiyun 
2253*4882a593Smuzhiyun static struct genl_family nbd_genl_family __ro_after_init = {
2254*4882a593Smuzhiyun 	.hdrsize	= 0,
2255*4882a593Smuzhiyun 	.name		= NBD_GENL_FAMILY_NAME,
2256*4882a593Smuzhiyun 	.version	= NBD_GENL_VERSION,
2257*4882a593Smuzhiyun 	.module		= THIS_MODULE,
2258*4882a593Smuzhiyun 	.small_ops	= nbd_connect_genl_ops,
2259*4882a593Smuzhiyun 	.n_small_ops	= ARRAY_SIZE(nbd_connect_genl_ops),
2260*4882a593Smuzhiyun 	.maxattr	= NBD_ATTR_MAX,
2261*4882a593Smuzhiyun 	.policy = nbd_attr_policy,
2262*4882a593Smuzhiyun 	.mcgrps		= nbd_mcast_grps,
2263*4882a593Smuzhiyun 	.n_mcgrps	= ARRAY_SIZE(nbd_mcast_grps),
2264*4882a593Smuzhiyun };
2265*4882a593Smuzhiyun 
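/*
 * Emit one NBD_DEVICE_ITEM (index plus connected flag) into a status
 * reply; "connected" simply means a config reference is currently held.
 */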
2266*4882a593Smuzhiyun static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
2267*4882a593Smuzhiyun {
2268*4882a593Smuzhiyun 	struct nlattr *dev_opt;
2269*4882a593Smuzhiyun 	u8 connected = 0;
2270*4882a593Smuzhiyun 	int ret;
2271*4882a593Smuzhiyun 
2272*4882a593Smuzhiyun 	/* This is a little racy, but for status it's ok.  The
2273*4882a593Smuzhiyun 	 * reason we don't take a ref here is because we can't
2274*4882a593Smuzhiyun 	 * take a ref in the index == -1 case as we would need
2275*4882a593Smuzhiyun 	 * to put under the nbd_index_mutex, which could
2276*4882a593Smuzhiyun 	 * deadlock if we are configured to remove ourselves
2277*4882a593Smuzhiyun 	 * once we're disconnected.
2278*4882a593Smuzhiyun 	 */
2279*4882a593Smuzhiyun 	if (refcount_read(&nbd->config_refs))
2280*4882a593Smuzhiyun 		connected = 1;
2281*4882a593Smuzhiyun 	dev_opt = nla_nest_start_noflag(reply, NBD_DEVICE_ITEM);
2282*4882a593Smuzhiyun 	if (!dev_opt)
2283*4882a593Smuzhiyun 		return -EMSGSIZE;
2284*4882a593Smuzhiyun 	ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
2285*4882a593Smuzhiyun 	if (ret)
2286*4882a593Smuzhiyun 		return -EMSGSIZE;
2287*4882a593Smuzhiyun 	ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
2288*4882a593Smuzhiyun 			 connected);
2289*4882a593Smuzhiyun 	if (ret)
2290*4882a593Smuzhiyun 		return -EMSGSIZE;
2291*4882a593Smuzhiyun 	nla_nest_end(reply, dev_opt);
2292*4882a593Smuzhiyun 	return 0;
2293*4882a593Smuzhiyun }
2294*4882a593Smuzhiyun 
2295*4882a593Smuzhiyun static int status_cb(int id, void *ptr, void *data)
2296*4882a593Smuzhiyun {
2297*4882a593Smuzhiyun 	struct nbd_device *nbd = ptr;
2298*4882a593Smuzhiyun 	return populate_nbd_status(nbd, (struct sk_buff *)data);
2299*4882a593Smuzhiyun }
2300*4882a593Smuzhiyun 
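/*
 * NBD_CMD_STATUS handler.  With NBD_ATTR_INDEX the reply describes a single
 * device; without it, every device in nbd_index_idr is walked via status_cb().
 * Both paths run under nbd_index_mutex.
 */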
2301*4882a593Smuzhiyun static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
2302*4882a593Smuzhiyun {
2303*4882a593Smuzhiyun 	struct nlattr *dev_list;
2304*4882a593Smuzhiyun 	struct sk_buff *reply;
2305*4882a593Smuzhiyun 	void *reply_head;
2306*4882a593Smuzhiyun 	size_t msg_size;
2307*4882a593Smuzhiyun 	int index = -1;
2308*4882a593Smuzhiyun 	int ret = -ENOMEM;
2309*4882a593Smuzhiyun 
2310*4882a593Smuzhiyun 	if (info->attrs[NBD_ATTR_INDEX])
2311*4882a593Smuzhiyun 		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
2312*4882a593Smuzhiyun 
2313*4882a593Smuzhiyun 	mutex_lock(&nbd_index_mutex);
2314*4882a593Smuzhiyun 
2315*4882a593Smuzhiyun 	msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
2316*4882a593Smuzhiyun 				  nla_attr_size(sizeof(u8)));
2317*4882a593Smuzhiyun 	msg_size *= (index == -1) ? nbd_total_devices : 1;
2318*4882a593Smuzhiyun 
2319*4882a593Smuzhiyun 	reply = genlmsg_new(msg_size, GFP_KERNEL);
2320*4882a593Smuzhiyun 	if (!reply)
2321*4882a593Smuzhiyun 		goto out;
2322*4882a593Smuzhiyun 	reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
2323*4882a593Smuzhiyun 				       NBD_CMD_STATUS);
2324*4882a593Smuzhiyun 	if (!reply_head) {
2325*4882a593Smuzhiyun 		nlmsg_free(reply);
2326*4882a593Smuzhiyun 		goto out;
2327*4882a593Smuzhiyun 	}
2328*4882a593Smuzhiyun 
2329*4882a593Smuzhiyun 	dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
2330*4882a593Smuzhiyun 	if (index == -1) {
2331*4882a593Smuzhiyun 		ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
2332*4882a593Smuzhiyun 		if (ret) {
2333*4882a593Smuzhiyun 			nlmsg_free(reply);
2334*4882a593Smuzhiyun 			goto out;
2335*4882a593Smuzhiyun 		}
2336*4882a593Smuzhiyun 	} else {
2337*4882a593Smuzhiyun 		struct nbd_device *nbd;
2338*4882a593Smuzhiyun 		nbd = idr_find(&nbd_index_idr, index);
2339*4882a593Smuzhiyun 		if (nbd) {
2340*4882a593Smuzhiyun 			ret = populate_nbd_status(nbd, reply);
2341*4882a593Smuzhiyun 			if (ret) {
2342*4882a593Smuzhiyun 				nlmsg_free(reply);
2343*4882a593Smuzhiyun 				goto out;
2344*4882a593Smuzhiyun 			}
2345*4882a593Smuzhiyun 		}
2346*4882a593Smuzhiyun 	}
2347*4882a593Smuzhiyun 	nla_nest_end(reply, dev_list);
2348*4882a593Smuzhiyun 	genlmsg_end(reply, reply_head);
2349*4882a593Smuzhiyun 	ret = genlmsg_reply(reply, info);
2350*4882a593Smuzhiyun out:
2351*4882a593Smuzhiyun 	mutex_unlock(&nbd_index_mutex);
2352*4882a593Smuzhiyun 	return ret;
2353*4882a593Smuzhiyun }
2354*4882a593Smuzhiyun 
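/*
 * Unicast the index of a newly configured device back to the sender of
 * NBD_CMD_CONNECT, so userspace learns which /dev/nbd<N> it was given.
 */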
2355*4882a593Smuzhiyun static void nbd_connect_reply(struct genl_info *info, int index)
2356*4882a593Smuzhiyun {
2357*4882a593Smuzhiyun 	struct sk_buff *skb;
2358*4882a593Smuzhiyun 	void *msg_head;
2359*4882a593Smuzhiyun 	int ret;
2360*4882a593Smuzhiyun 
2361*4882a593Smuzhiyun 	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2362*4882a593Smuzhiyun 	if (!skb)
2363*4882a593Smuzhiyun 		return;
2364*4882a593Smuzhiyun 	msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
2365*4882a593Smuzhiyun 				     NBD_CMD_CONNECT);
2366*4882a593Smuzhiyun 	if (!msg_head) {
2367*4882a593Smuzhiyun 		nlmsg_free(skb);
2368*4882a593Smuzhiyun 		return;
2369*4882a593Smuzhiyun 	}
2370*4882a593Smuzhiyun 	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2371*4882a593Smuzhiyun 	if (ret) {
2372*4882a593Smuzhiyun 		nlmsg_free(skb);
2373*4882a593Smuzhiyun 		return;
2374*4882a593Smuzhiyun 	}
2375*4882a593Smuzhiyun 	genlmsg_end(skb, msg_head);
2376*4882a593Smuzhiyun 	genlmsg_reply(skb, info);
2377*4882a593Smuzhiyun }
2378*4882a593Smuzhiyun 
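/*
 * Broadcast an NBD_CMD_LINK_DEAD notification carrying the device index on
 * the driver's multicast group, so listeners can react to a dead socket.
 */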
2379*4882a593Smuzhiyun static void nbd_mcast_index(int index)
2380*4882a593Smuzhiyun {
2381*4882a593Smuzhiyun 	struct sk_buff *skb;
2382*4882a593Smuzhiyun 	void *msg_head;
2383*4882a593Smuzhiyun 	int ret;
2384*4882a593Smuzhiyun 
2385*4882a593Smuzhiyun 	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
2386*4882a593Smuzhiyun 	if (!skb)
2387*4882a593Smuzhiyun 		return;
2388*4882a593Smuzhiyun 	msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
2389*4882a593Smuzhiyun 			       NBD_CMD_LINK_DEAD);
2390*4882a593Smuzhiyun 	if (!msg_head) {
2391*4882a593Smuzhiyun 		nlmsg_free(skb);
2392*4882a593Smuzhiyun 		return;
2393*4882a593Smuzhiyun 	}
2394*4882a593Smuzhiyun 	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
2395*4882a593Smuzhiyun 	if (ret) {
2396*4882a593Smuzhiyun 		nlmsg_free(skb);
2397*4882a593Smuzhiyun 		return;
2398*4882a593Smuzhiyun 	}
2399*4882a593Smuzhiyun 	genlmsg_end(skb, msg_head);
2400*4882a593Smuzhiyun 	genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
2401*4882a593Smuzhiyun }
2402*4882a593Smuzhiyun 
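/*
 * Work item wrapper around nbd_mcast_index(), presumably so callers that
 * cannot sleep can still queue the dead-link notification.
 */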
2403*4882a593Smuzhiyun static void nbd_dead_link_work(struct work_struct *work)
2404*4882a593Smuzhiyun {
2405*4882a593Smuzhiyun 	struct link_dead_args *args = container_of(work, struct link_dead_args,
2406*4882a593Smuzhiyun 						   work);
2407*4882a593Smuzhiyun 	nbd_mcast_index(args->index);
2408*4882a593Smuzhiyun 	kfree(args);
2409*4882a593Smuzhiyun }
2410*4882a593Smuzhiyun 
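/*
 * Module init: validate max_part and nbds_max, register the NBD_MAJOR block
 * device and the netlink family, set up debugfs, and pre-create nbds_max
 * devices under nbd_index_mutex.
 */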
2411*4882a593Smuzhiyun static int __init nbd_init(void)
2412*4882a593Smuzhiyun {
2413*4882a593Smuzhiyun 	int i;
2414*4882a593Smuzhiyun 
2415*4882a593Smuzhiyun 	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
2416*4882a593Smuzhiyun 
2417*4882a593Smuzhiyun 	if (max_part < 0) {
2418*4882a593Smuzhiyun 		printk(KERN_ERR "nbd: max_part must be >= 0\n");
2419*4882a593Smuzhiyun 		return -EINVAL;
2420*4882a593Smuzhiyun 	}
2421*4882a593Smuzhiyun 
2422*4882a593Smuzhiyun 	part_shift = 0;
2423*4882a593Smuzhiyun 	if (max_part > 0) {
2424*4882a593Smuzhiyun 		part_shift = fls(max_part);
2425*4882a593Smuzhiyun 
2426*4882a593Smuzhiyun 		/*
2427*4882a593Smuzhiyun 		 * Adjust max_part according to part_shift as it is exported
2428*4882a593Smuzhiyun 		 * to user space, so that users can know the maximum number
2429*4882a593Smuzhiyun 		 * of partitions the kernel is able to manage per device.
2430*4882a593Smuzhiyun 		 *
2431*4882a593Smuzhiyun 		 * Note that -1 is required because partition 0 is reserved
2432*4882a593Smuzhiyun 		 * for the whole disk.  For example, the default max_part of
2433*4882a593Smuzhiyun 		 * 16 gives part_shift = fls(16) = 5 and max_part = 31.
2434*4882a593Smuzhiyun 		 */
2434*4882a593Smuzhiyun 		max_part = (1UL << part_shift) - 1;
2435*4882a593Smuzhiyun 	}
2436*4882a593Smuzhiyun 
2437*4882a593Smuzhiyun 	if ((1UL << part_shift) > DISK_MAX_PARTS)
2438*4882a593Smuzhiyun 		return -EINVAL;
2439*4882a593Smuzhiyun 
2440*4882a593Smuzhiyun 	if (nbds_max > 1UL << (MINORBITS - part_shift))
2441*4882a593Smuzhiyun 		return -EINVAL;
2442*4882a593Smuzhiyun 
2443*4882a593Smuzhiyun 	if (register_blkdev(NBD_MAJOR, "nbd"))
2444*4882a593Smuzhiyun 		return -EIO;
2445*4882a593Smuzhiyun 
2446*4882a593Smuzhiyun 	if (genl_register_family(&nbd_genl_family)) {
2447*4882a593Smuzhiyun 		unregister_blkdev(NBD_MAJOR, "nbd");
2448*4882a593Smuzhiyun 		return -EINVAL;
2449*4882a593Smuzhiyun 	}
2450*4882a593Smuzhiyun 	nbd_dbg_init();
2451*4882a593Smuzhiyun 
2452*4882a593Smuzhiyun 	mutex_lock(&nbd_index_mutex);
2453*4882a593Smuzhiyun 	for (i = 0; i < nbds_max; i++)
2454*4882a593Smuzhiyun 		nbd_dev_add(i);
2455*4882a593Smuzhiyun 	mutex_unlock(&nbd_index_mutex);
2456*4882a593Smuzhiyun 	return 0;
2457*4882a593Smuzhiyun }
2458*4882a593Smuzhiyun 
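/* idr_for_each() callback: collect every device onto the caller's removal list. */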
2459*4882a593Smuzhiyun static int nbd_exit_cb(int id, void *ptr, void *data)
2460*4882a593Smuzhiyun {
2461*4882a593Smuzhiyun 	struct list_head *list = (struct list_head *)data;
2462*4882a593Smuzhiyun 	struct nbd_device *nbd = ptr;
2463*4882a593Smuzhiyun 
2464*4882a593Smuzhiyun 	list_add_tail(&nbd->list, list);
2465*4882a593Smuzhiyun 	return 0;
2466*4882a593Smuzhiyun }
2467*4882a593Smuzhiyun 
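/*
 * Module exit: unregister netlink first so no new commands can race with
 * teardown, then drop the final reference on each device collected from the
 * IDR, warning if config or device references appear to be leaking.
 */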
2468*4882a593Smuzhiyun static void __exit nbd_cleanup(void)
2469*4882a593Smuzhiyun {
2470*4882a593Smuzhiyun 	struct nbd_device *nbd;
2471*4882a593Smuzhiyun 	LIST_HEAD(del_list);
2472*4882a593Smuzhiyun 
2473*4882a593Smuzhiyun 	/*
2474*4882a593Smuzhiyun 	 * Unregister netlink interface prior to waiting
2475*4882a593Smuzhiyun 	 * for the completion of netlink commands.
2476*4882a593Smuzhiyun 	 */
2477*4882a593Smuzhiyun 	genl_unregister_family(&nbd_genl_family);
2478*4882a593Smuzhiyun 
2479*4882a593Smuzhiyun 	nbd_dbg_close();
2480*4882a593Smuzhiyun 
2481*4882a593Smuzhiyun 	mutex_lock(&nbd_index_mutex);
2482*4882a593Smuzhiyun 	idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
2483*4882a593Smuzhiyun 	mutex_unlock(&nbd_index_mutex);
2484*4882a593Smuzhiyun 
2485*4882a593Smuzhiyun 	while (!list_empty(&del_list)) {
2486*4882a593Smuzhiyun 		nbd = list_first_entry(&del_list, struct nbd_device, list);
2487*4882a593Smuzhiyun 		list_del_init(&nbd->list);
2488*4882a593Smuzhiyun 		if (refcount_read(&nbd->config_refs))
2489*4882a593Smuzhiyun 			printk(KERN_ERR "nbd: possibly leaking nbd_config (ref %d)\n",
2490*4882a593Smuzhiyun 					refcount_read(&nbd->config_refs));
2491*4882a593Smuzhiyun 		if (refcount_read(&nbd->refs) != 1)
2492*4882a593Smuzhiyun 			printk(KERN_ERR "nbd: possibly leaking a device\n");
2493*4882a593Smuzhiyun 		nbd_put(nbd);
2494*4882a593Smuzhiyun 	}
2495*4882a593Smuzhiyun 
2496*4882a593Smuzhiyun 	idr_destroy(&nbd_index_idr);
2497*4882a593Smuzhiyun 	unregister_blkdev(NBD_MAJOR, "nbd");
2498*4882a593Smuzhiyun }
2499*4882a593Smuzhiyun 
2500*4882a593Smuzhiyun module_init(nbd_init);
2501*4882a593Smuzhiyun module_exit(nbd_cleanup);
2502*4882a593Smuzhiyun 
2503*4882a593Smuzhiyun MODULE_DESCRIPTION("Network Block Device");
2504*4882a593Smuzhiyun MODULE_LICENSE("GPL");
2505*4882a593Smuzhiyun 
2506*4882a593Smuzhiyun module_param(nbds_max, int, 0444);
2507*4882a593Smuzhiyun MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
2508*4882a593Smuzhiyun module_param(max_part, int, 0444);
2509*4882a593Smuzhiyun MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
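/*
 * A minimal usage sketch (assuming the driver is built as nbd.ko and a
 * standard nbd-client userspace is installed; host and port are examples):
 *
 *   modprobe nbd nbds_max=4 max_part=8      # create /dev/nbd0..nbd3, partitionable
 *   nbd-client some.host 10809 /dev/nbd0    # attach the first device to a server
 *
 * Both parameters are read-only (0444) once the module is loaded.
 */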
2510