// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2020 Google, Inc
 * Copyright (C) 2020 Palmer Dabbelt <palmerdabbelt@google.com>
 */

#include <linux/device-mapper.h>
#include <uapi/linux/dm-user.h>

#include <linux/bio.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/uio.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

#define DM_MSG_PREFIX "user"

#define MAX_OUTSTANDING_MESSAGES 128

static unsigned int daemon_timeout_msec = 4000;
module_param_named(dm_user_daemon_timeout_msec, daemon_timeout_msec, uint,
		   0644);
MODULE_PARM_DESC(dm_user_daemon_timeout_msec,
		 "IO Timeout in msec if daemon does not process");

/*
 * dm-user uses four structures:
 *
 *  - "struct target", the outermost structure, corresponds to a single device
 *    mapper target.  This contains the set of outstanding BIOs that have been
 *    provided by DM and are not actively being processed by the user, along
 *    with a misc device that userspace can open to communicate with the
 *    kernel.  Each time userspace opens the misc device a new channel is
 *    created.
 *  - "struct channel", which represents a single active communication channel
 *    with userspace.  Userspace may choose arbitrary read/write sizes to use
 *    when processing messages; channels assemble these into logical accesses.
 *    When userspace responds to a full message the channel completes the BIO
 *    and obtains a new message to process from the target.
 *  - "struct message", which wraps a BIO with the additional information
 *    required by the kernel to sort out what to do with BIOs when they return
 *    from userspace.
 *  - "struct dm_user_message", which is the exact message format that
 *    userspace sees.
 *
 * The hot path contains three distinct operations:
 *
 *  - user_map(), which is provided a BIO from device mapper that is queued
 *    into the target.  This allocates and enqueues a new message.
 *  - dev_read(), which dequeues a message and copies it to userspace.
 *  - dev_write(), which looks up a message (keyed by sequence number) and
 *    completes the corresponding BIO.
 *
 * Lock ordering (outer to inner)
 *
 * 1) miscdevice's global lock.  This is held around dev_open, so it has to be
 *    the outermost lock.
 * 2) target->lock
 * 3) channel->lock
 */
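
/*
 * Purely illustrative sketch (not part of this driver) of the userspace side
 * of the flow described above: a daemon opens the misc device, reads one full
 * request per message, services it, and writes back a response carrying the
 * same seq.  The device path, buffer sizing, and the payload-length handling
 * below are assumptions for illustration only; the seq/type/flags fields and
 * the "payload follows the header" layout are what this file relies on.
 *
 *	int fd = open("/dev/dm-user/<name>", O_RDWR);
 *	char buf[sizeof(struct dm_user_message) + MAX_IO_BYTES];
 *
 *	for (;;) {
 *		struct dm_user_message *msg = (struct dm_user_message *)buf;
 *		ssize_t n = read(fd, buf, sizeof(buf));
 *
 *		if (n < (ssize_t)sizeof(*msg))
 *			break;
 *
 *		switch (msg->type) {
 *		case DM_USER_REQ_MAP_READ:
 *			// fill buf + sizeof(*msg) with the requested bytes,
 *			// then return the header plus that payload
 *			msg->type = DM_USER_RESP_SUCCESS;
 *			write(fd, buf, sizeof(*msg) + payload_bytes);
 *			break;
 *		case DM_USER_REQ_MAP_WRITE:
 *			// the data to persist starts at buf + sizeof(*msg)
 *			msg->type = DM_USER_RESP_SUCCESS;
 *			write(fd, msg, sizeof(*msg));
 *			break;
 *		default:
 *			msg->type = DM_USER_RESP_UNSUPPORTED;
 *			write(fd, msg, sizeof(*msg));
 *		}
 *	}
 *
 * MAX_IO_BYTES and payload_bytes are hypothetical placeholders; a real daemon
 * derives the transfer length from the request itself (see the uapi header).
 */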

struct message {
	/*
	 * Messages themselves do not need a lock, they're protected by either
	 * the target or channel's lock, depending on which can reference them
	 * directly.
	 */
	struct dm_user_message msg;
	struct bio *bio;
	size_t posn_to_user;
	size_t total_to_user;
	size_t posn_from_user;
	size_t total_from_user;

	struct list_head from_user;
	struct list_head to_user;

	/*
	 * These are written back from the user.  They live in the same spot in
	 * the message, but we need to either keep the old values around or
	 * call a bunch more BIO helpers.  These are only valid after write has
	 * adopted the message.
	 */
	u64 return_type;
	u64 return_flags;

	struct delayed_work work;
	bool delayed;
	struct target *t;
};

struct target {
	/*
	 * A target has a single lock, which protects everything in the target
	 * (but does not protect the channels associated with a target).
	 */
	struct mutex lock;

	/*
	 * There is only one point at which anything blocks: userspace blocks
	 * reading a new message, which is woken up by device mapper providing
	 * a new BIO to process (or tearing down the target).  The
	 * corresponding write side doesn't block; instead we treat userspace's
	 * response containing a message that has yet to be mapped as an
	 * invalid operation.
	 */
	struct wait_queue_head wq;

	/*
	 * Messages are delivered to userspace in order, but may be returned
	 * out of order.  This allows userspace to schedule IO if it wants to.
	 */
	mempool_t message_pool;
	u64 next_seq_to_map;
	u64 next_seq_to_user;
	struct list_head to_user;

	/*
	 * There is a misc device per target.  The name is selected by
	 * userspace (via a DM create ioctl argument), and each ends up in
	 * /dev/dm-user/.  A better way to do this might be to have a
	 * filesystem manage these, but this was more expedient.  The current
	 * mechanism is functional, but does result in an arbitrary number of
	 * dynamically created misc devices.
	 */
	struct miscdevice miscdev;

	/*
	 * Device mapper's target destructor triggers tearing this all down,
	 * but we can't actually free until every channel associated with this
	 * target has been destroyed.  Channels each have a reference to their
	 * target, and there is an additional single reference that corresponds
	 * to both DM and the misc device (both of which are destroyed by DM).
	 *
	 * In the common case userspace will be asleep waiting for a new
	 * message when device mapper decides to destroy the target, which
	 * means no new messages will appear.  The destroyed flag triggers a
	 * wakeup, which will end up removing the reference.
	 */
	struct kref references;
	int dm_destroyed;
	bool daemon_terminated;
};

struct channel {
	struct target *target;

	/*
	 * A channel has a single lock, which prevents multiple reads (or
	 * multiple writes) from conflicting with each other.
	 */
	struct mutex lock;

	struct message *cur_to_user;
	struct message *cur_from_user;
	ssize_t to_user_error;
	ssize_t from_user_error;

	/*
	 * Once a message has been forwarded to userspace on a channel it must
	 * be responded to on the same channel.  This allows us to error out
	 * the messages that have not yet been responded to by a channel when
	 * that channel closes, which makes handling errors more reasonable for
	 * fault-tolerant userspace daemons.  It also happens to make avoiding
	 * shared locks between user_map() and dev_read() a lot easier.
	 *
	 * This does preclude a multi-threaded work stealing userspace
	 * implementation (or at least, force a degree of head-of-line blocking
	 * on the response path).
	 */
	struct list_head from_user;

	/*
	 * Responses from userspace can arrive in arbitrarily small chunks.
	 * We need some place to buffer one up until we can find the
	 * corresponding kernel-side message to continue processing, so instead
	 * of allocating them we just keep one off to the side here.  This can
	 * only ever be pointed to by cur_from_user, and will never have a BIO.
	 */
	struct message scratch_message_from_user;
};

static void message_kill(struct message *m, mempool_t *pool)
{
	m->bio->bi_status = BLK_STS_IOERR;
	bio_endio(m->bio);
	mempool_free(m, pool);
}

static inline bool is_user_space_thread_present(struct target *t)
{
	lockdep_assert_held(&t->lock);
	return (kref_read(&t->references) > 1);
}

static void process_delayed_work(struct work_struct *work)
{
	struct delayed_work *del_work = to_delayed_work(work);
	struct message *msg = container_of(del_work, struct message, work);

	struct target *t = msg->t;

	mutex_lock(&t->lock);

	/*
	 * There is at least one thread to process the IO.
	 */
	if (is_user_space_thread_present(t)) {
		mutex_unlock(&t->lock);
		return;
	}

	/*
	 * Terminate the IO with an error
	 */
	list_del(&msg->to_user);
	pr_err("I/O error: sector %llu: no user-space daemon for %s target\n",
	       msg->bio->bi_iter.bi_sector,
	       t->miscdev.name);
	message_kill(msg, &t->message_pool);
	mutex_unlock(&t->lock);
}

static void enqueue_delayed_work(struct message *m, bool is_delay)
{
	unsigned long delay = 0;

	m->delayed = true;
	INIT_DELAYED_WORK(&m->work, process_delayed_work);

	/*
	 * The snapuserd daemon is the user-space process which services IO
	 * requests from dm-user while an OTA is applied.  Per the current
	 * design, when a dm-user target is created, the daemon attaches to
	 * the target and starts processing the IOs.  The daemon is terminated
	 * only when the dm-user target is destroyed.
	 *
	 * If for some reason the daemon crashes or terminates early, without
	 * destroying the dm-user target, then there is no mechanism to
	 * restart the daemon and resume processing IOs for the same target.
	 * Theoretically it is possible, but that infrastructure doesn't exist
	 * in the Android ecosystem.
	 *
	 * Thus, when the daemon terminates, there is no way the IOs issued on
	 * that target will be processed.  Hence, we set the delay to 0 and
	 * fail the IOs immediately.
	 *
	 * On the other hand, when a new dm-user target is created, we wait
	 * for the daemon to attach for the first time.  This primarily
	 * happens when first-stage init spins up the daemon.  At this point,
	 * since the snapshot device is mounted as the root filesystem, the
	 * dm-user target may receive IO requests even though the daemon is
	 * not fully launched.  We don't want to fail those IO requests
	 * immediately.  Thus, we queue these requests with a timeout so that
	 * the daemon has time to become ready to process them.  Again, if the
	 * daemon fails to launch within the timeout period, the IOs will be
	 * failed.
	 */
	if (is_delay)
		delay = msecs_to_jiffies(daemon_timeout_msec);

	queue_delayed_work(system_wq, &m->work, delay);
}
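
/*
 * Worked example of the policy above, assuming the default 4000 msec timeout:
 * a BIO mapped at t=0 while no daemon has attached yet (is_delay == true) is
 * queued with a 4 s delay.  If a channel opens before t=4 s, msg_get_to_user()
 * cancels the delayed work and the IO is handed to that daemon; otherwise
 * process_delayed_work() runs at t=4 s and fails the IO with BLK_STS_IOERR.
 * When an already-attached daemon later dies (is_delay == false), pending IOs
 * are re-queued with a 0 delay and therefore fail immediately.
 */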

static inline struct target *target_from_target(struct dm_target *target)
{
	WARN_ON(target->private == NULL);
	return target->private;
}

static inline struct target *target_from_miscdev(struct miscdevice *miscdev)
{
	return container_of(miscdev, struct target, miscdev);
}

static inline struct channel *channel_from_file(struct file *file)
{
	WARN_ON(file->private_data == NULL);
	return file->private_data;
}

static inline struct target *target_from_channel(struct channel *c)
{
	WARN_ON(c->target == NULL);
	return c->target;
}

static inline size_t bio_size(struct bio *bio)
{
	struct bio_vec bvec;
	struct bvec_iter iter;
	size_t out = 0;

	bio_for_each_segment (bvec, bio, iter)
		out += bio_iter_len(bio, iter);
	return out;
}

static inline size_t bio_bytes_needed_to_user(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		return sizeof(struct dm_user_message) + bio_size(bio);
	case REQ_OP_READ:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE_ZEROES:
		return sizeof(struct dm_user_message);

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline size_t bio_bytes_needed_from_user(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		return sizeof(struct dm_user_message) + bio_size(bio);
	case REQ_OP_WRITE:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE_ZEROES:
		return sizeof(struct dm_user_message);

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}
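
/*
 * Worked example of the two helpers above for a 4096-byte BIO:
 *
 *	REQ_OP_READ:	to_user   = sizeof(struct dm_user_message)
 *			from_user = sizeof(struct dm_user_message) + 4096
 *	REQ_OP_WRITE:	to_user   = sizeof(struct dm_user_message) + 4096
 *			from_user = sizeof(struct dm_user_message)
 *
 * i.e. WRITE payloads flow to userspace with the request, READ payloads flow
 * back from userspace with the response, and every message carries exactly
 * one header in each direction.
 */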

static inline long bio_type_to_user_type(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		return DM_USER_REQ_MAP_READ;
	case REQ_OP_WRITE:
		return DM_USER_REQ_MAP_WRITE;
	case REQ_OP_FLUSH:
		return DM_USER_REQ_MAP_FLUSH;
	case REQ_OP_DISCARD:
		return DM_USER_REQ_MAP_DISCARD;
	case REQ_OP_SECURE_ERASE:
		return DM_USER_REQ_MAP_SECURE_ERASE;
	case REQ_OP_WRITE_SAME:
		return DM_USER_REQ_MAP_WRITE_SAME;
	case REQ_OP_WRITE_ZEROES:
		return DM_USER_REQ_MAP_WRITE_ZEROES;

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline long bio_flags_to_user_flags(struct bio *bio)
{
	u64 out = 0;
	typeof(bio->bi_opf) opf = bio->bi_opf & ~REQ_OP_MASK;

	if (opf & REQ_FAILFAST_DEV) {
		opf &= ~REQ_FAILFAST_DEV;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DEV;
	}

	if (opf & REQ_FAILFAST_TRANSPORT) {
		opf &= ~REQ_FAILFAST_TRANSPORT;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_TRANSPORT;
	}

	if (opf & REQ_FAILFAST_DRIVER) {
		opf &= ~REQ_FAILFAST_DRIVER;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DRIVER;
	}

	if (opf & REQ_SYNC) {
		opf &= ~REQ_SYNC;
		out |= DM_USER_REQ_MAP_FLAG_SYNC;
	}

	if (opf & REQ_META) {
		opf &= ~REQ_META;
		out |= DM_USER_REQ_MAP_FLAG_META;
	}

	if (opf & REQ_PRIO) {
		opf &= ~REQ_PRIO;
		out |= DM_USER_REQ_MAP_FLAG_PRIO;
	}

	if (opf & REQ_NOMERGE) {
		opf &= ~REQ_NOMERGE;
		out |= DM_USER_REQ_MAP_FLAG_NOMERGE;
	}

	if (opf & REQ_IDLE) {
		opf &= ~REQ_IDLE;
		out |= DM_USER_REQ_MAP_FLAG_IDLE;
	}

	if (opf & REQ_INTEGRITY) {
		opf &= ~REQ_INTEGRITY;
		out |= DM_USER_REQ_MAP_FLAG_INTEGRITY;
	}

	if (opf & REQ_FUA) {
		opf &= ~REQ_FUA;
		out |= DM_USER_REQ_MAP_FLAG_FUA;
	}

	if (opf & REQ_PREFLUSH) {
		opf &= ~REQ_PREFLUSH;
		out |= DM_USER_REQ_MAP_FLAG_PREFLUSH;
	}

	if (opf & REQ_RAHEAD) {
		opf &= ~REQ_RAHEAD;
		out |= DM_USER_REQ_MAP_FLAG_RAHEAD;
	}

	if (opf & REQ_BACKGROUND) {
		opf &= ~REQ_BACKGROUND;
		out |= DM_USER_REQ_MAP_FLAG_BACKGROUND;
	}

	if (opf & REQ_NOWAIT) {
		opf &= ~REQ_NOWAIT;
		out |= DM_USER_REQ_MAP_FLAG_NOWAIT;
	}

	if (opf & REQ_NOUNMAP) {
		opf &= ~REQ_NOUNMAP;
		out |= DM_USER_REQ_MAP_FLAG_NOUNMAP;
	}

	if (unlikely(opf)) {
		pr_warn("unsupported BIO type %x\n", opf);
		return -EOPNOTSUPP;
	}
	WARN_ON(out < 0);
	return out;
}

/*
 * Not quite what's in blk-map.c, but instead what I thought the functions in
 * blk-map did.  This one seems more generally useful and I think we could
 * write the blk-map version in terms of this one.  The differences are that
 * this has a return value that counts, and blk-map uses the BIO _all iters.
 * Both helpers here advance the IOV iter but not the BIO iter, which is a bit
 * odd.
 */
static ssize_t bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
{
	struct bio_vec bvec;
	struct bvec_iter biter;
	ssize_t out = 0;

	bio_for_each_segment (bvec, bio, biter) {
		ssize_t ret;

		ret = copy_page_from_iter(bvec.bv_page, bvec.bv_offset,
					  bvec.bv_len, iter);

		/*
		 * FIXME: I thought that IOV copies had a mechanism for
		 * terminating early, if for example a signal came in while
		 * sleeping waiting for a page to be mapped, but I don't see
		 * where that would happen.
		 */
		WARN_ON(ret < 0);
		out += ret;

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec.bv_len)
			return ret;
	}

	return out;
}

static ssize_t bio_copy_to_iter(struct bio *bio, struct iov_iter *iter)
{
	struct bio_vec bvec;
	struct bvec_iter biter;
	ssize_t out = 0;

	bio_for_each_segment (bvec, bio, biter) {
		ssize_t ret;

		ret = copy_page_to_iter(bvec.bv_page, bvec.bv_offset,
					bvec.bv_len, iter);

		/* as above */
		WARN_ON(ret < 0);
		out += ret;

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec.bv_len)
			return ret;
	}

	return out;
}

static ssize_t msg_copy_to_iov(struct message *msg, struct iov_iter *to)
{
	ssize_t copied = 0;

	if (!iov_iter_count(to))
		return 0;

	if (msg->posn_to_user < sizeof(msg->msg)) {
		copied = copy_to_iter((char *)(&msg->msg) + msg->posn_to_user,
				      sizeof(msg->msg) - msg->posn_to_user, to);
	} else {
		copied = bio_copy_to_iter(msg->bio, to);
		if (copied > 0)
			bio_advance(msg->bio, copied);
	}

	if (copied < 0)
		return copied;

	msg->posn_to_user += copied;
	return copied;
}

static ssize_t msg_copy_from_iov(struct message *msg, struct iov_iter *from)
{
	ssize_t copied = 0;

	if (!iov_iter_count(from))
		return 0;

	if (msg->posn_from_user < sizeof(msg->msg)) {
		copied = copy_from_iter(
			(char *)(&msg->msg) + msg->posn_from_user,
			sizeof(msg->msg) - msg->posn_from_user, from);
	} else {
		copied = bio_copy_from_iter(msg->bio, from);
		if (copied > 0)
			bio_advance(msg->bio, copied);
	}

	if (copied < 0)
		return copied;

	msg->posn_from_user += copied;
	return copied;
}

static struct message *msg_get_map(struct target *t)
{
	struct message *m;

	lockdep_assert_held(&t->lock);

	m = mempool_alloc(&t->message_pool, GFP_NOIO);
	m->msg.seq = t->next_seq_to_map++;
	INIT_LIST_HEAD(&m->to_user);
	INIT_LIST_HEAD(&m->from_user);
	return m;
}

static struct message *msg_get_to_user(struct target *t)
{
	struct message *m;

	lockdep_assert_held(&t->lock);

	if (list_empty(&t->to_user))
		return NULL;

	m = list_first_entry(&t->to_user, struct message, to_user);

	list_del(&m->to_user);

	/*
	 * If the IO was queued to workqueue since there
	 * was no daemon to service the IO, then we
	 * will have to cancel the delayed work as the
	 * IO will be processed by this user-space thread.
	 *
	 * If the delayed work was already picked up for
	 * processing, then wait for it to complete. Note
	 * that the IO will not be terminated by the work
	 * queue thread.
	 */
	if (unlikely(m->delayed)) {
		mutex_unlock(&t->lock);
		cancel_delayed_work_sync(&m->work);
		mutex_lock(&t->lock);
	}
	return m;
}

static struct message *msg_get_from_user(struct channel *c, u64 seq)
{
	struct message *m;
	struct list_head *cur, *tmp;

	lockdep_assert_held(&c->lock);

	list_for_each_safe (cur, tmp, &c->from_user) {
		m = list_entry(cur, struct message, from_user);
		if (m->msg.seq == seq) {
			list_del(&m->from_user);
			return m;
		}
	}

	return NULL;
}

/*
 * Returns 0 when there is no work left to do.  This must be callable without
 * holding the target lock, as it is part of the waitqueue's check expression.
 * When called without the lock it may spuriously indicate there is remaining
 * work, but when called with the lock it must be accurate.
 */
int target_poll(struct target *t)
{
	return !list_empty(&t->to_user) || t->dm_destroyed;
}

void target_release(struct kref *ref)
{
	struct target *t = container_of(ref, struct target, references);
	struct list_head *cur, *tmp;

	/*
	 * There may be outstanding BIOs that have not yet been given to
	 * userspace.  At this point there's nothing we can do about them, as
	 * there are and will never be any channels.
	 */
	list_for_each_safe (cur, tmp, &t->to_user) {
		struct message *m = list_entry(cur, struct message, to_user);

		if (unlikely(m->delayed)) {
			bool ret;

			mutex_unlock(&t->lock);
			ret = cancel_delayed_work_sync(&m->work);
			mutex_lock(&t->lock);
			if (!ret)
				continue;
		}
		message_kill(m, &t->message_pool);
	}

	mempool_exit(&t->message_pool);
	mutex_unlock(&t->lock);
	mutex_destroy(&t->lock);
	kfree(t);
}

void target_put(struct target *t)
{
	/*
	 * This both releases a reference to the target and the lock.  We leave
	 * it up to the caller to hold the lock, as they probably needed it for
	 * something else.
	 */
	lockdep_assert_held(&t->lock);

	if (!kref_put(&t->references, target_release)) {
		/*
		 * The user-space thread is getting terminated.  We need to
		 * scan the list for all the pending IOs that have not been
		 * processed yet and put them back on the workqueue for
		 * delayed processing.
		 */
		if (!is_user_space_thread_present(t)) {
			struct list_head *cur, *tmp;

			list_for_each_safe(cur, tmp, &t->to_user) {
				struct message *m = list_entry(cur,
							       struct message,
							       to_user);
				if (!m->delayed)
					enqueue_delayed_work(m, false);
			}
			/*
			 * The daemon attached to this target is terminated.
			 */
			t->daemon_terminated = true;
		}
		mutex_unlock(&t->lock);
	}
}

static struct channel *channel_alloc(struct target *t)
{
	struct channel *c;

	lockdep_assert_held(&t->lock);

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (c == NULL)
		return NULL;

	kref_get(&t->references);
	c->target = t;
	c->cur_from_user = &c->scratch_message_from_user;
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->from_user);
	return c;
}

void channel_free(struct channel *c)
{
	struct list_head *cur, *tmp;

	lockdep_assert_held(&c->lock);

	/*
	 * There may be outstanding BIOs that have been given to userspace but
	 * have not yet been completed.  The channel has been shut down so
	 * there's no way to process the rest of those messages, so we just go
	 * ahead and error out the BIOs.  Hopefully whatever's on the other end
	 * can handle the errors.  One could imagine splitting the BIOs and
	 * completing as much as we got, but that seems like overkill here.
	 *
	 * Our only other options would be to let the BIO hang around (which
	 * seems way worse) or to resubmit it to userspace in the hope there's
	 * another channel.  I don't really like the idea of submitting a
	 * message twice.
	 */
	if (c->cur_to_user != NULL)
		message_kill(c->cur_to_user, &c->target->message_pool);
	if (c->cur_from_user != &c->scratch_message_from_user)
		message_kill(c->cur_from_user, &c->target->message_pool);
	list_for_each_safe (cur, tmp, &c->from_user)
		message_kill(list_entry(cur, struct message, from_user),
			     &c->target->message_pool);

	mutex_lock(&c->target->lock);
	target_put(c->target);
	mutex_unlock(&c->lock);
	mutex_destroy(&c->lock);
	kfree(c);
}

static int dev_open(struct inode *inode, struct file *file)
{
	struct channel *c;
	struct target *t;

	/*
	 * This is called by miscdev, which sets private_data to point to the
	 * struct miscdevice that was opened.  The rest of our file operations
	 * want to refer to the channel that's been opened, so we swap that
	 * pointer out with a fresh channel.
	 *
	 * This is called with the miscdev lock held, which is also held while
	 * registering/unregistering the miscdev.  The miscdev must be
	 * registered for this to get called, which means there must be an
	 * outstanding reference to the target, which means it cannot be freed
	 * out from under us despite us not holding a reference yet.
	 */
	t = container_of(file->private_data, struct target, miscdev);
	mutex_lock(&t->lock);
	file->private_data = c = channel_alloc(t);

	if (c == NULL) {
		mutex_unlock(&t->lock);
		return -ENOMEM;
	}

	mutex_unlock(&t->lock);
	return 0;
}

static ssize_t dev_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct channel *c = channel_from_file(iocb->ki_filp);
	ssize_t total_processed = 0;
	ssize_t processed;

	mutex_lock(&c->lock);

	if (unlikely(c->to_user_error)) {
		total_processed = c->to_user_error;
		goto cleanup_unlock;
	}

	if (c->cur_to_user == NULL) {
		struct target *t = target_from_channel(c);

		mutex_lock(&t->lock);

		while (!target_poll(t)) {
			int e;

			mutex_unlock(&t->lock);
			mutex_unlock(&c->lock);
			e = wait_event_interruptible(t->wq, target_poll(t));
			mutex_lock(&c->lock);
			mutex_lock(&t->lock);

			if (unlikely(e != 0)) {
				/*
				 * We haven't processed any bytes in either the
				 * BIO or the IOV, so we can just terminate
				 * right now.  Code elsewhere in the kernel
				 * handles restarting the syscall when
				 * appropriate.
				 */
				total_processed = e;
				mutex_unlock(&t->lock);
				goto cleanup_unlock;
			}
		}

		if (unlikely(t->dm_destroyed)) {
			/*
			 * DM has destroyed this target, so just lock
			 * the user out.  There's really nothing else
			 * we can do here.  Note that we don't actually
			 * tear anything down until userspace has
			 * closed the FD, as there may still be
			 * outstanding BIOs.
			 *
			 * This is kind of a wacky error code to
			 * return.  My goal was really just to try and
			 * find something that wasn't likely to be
			 * returned by anything else in the miscdev
			 * path.  The message "block device required"
			 * seems like a somewhat reasonable thing to
			 * say when the target has disappeared out from
			 * under us, but "not block" isn't sensible.
			 */
			c->to_user_error = total_processed = -ENOTBLK;
			mutex_unlock(&t->lock);
			goto cleanup_unlock;
		}

		/*
		 * Ensures that accesses to the message data are not ordered
		 * before the remote accesses that produce that message data.
		 *
		 * This pairs with the barrier in user_map(), via the
		 * conditional within the while loop above.  Also see the lack
		 * of barrier in user_dtr(), which is why this can be after the
		 * destroyed check.
		 */
		smp_rmb();

		c->cur_to_user = msg_get_to_user(t);
		WARN_ON(c->cur_to_user == NULL);
		mutex_unlock(&t->lock);
	}

	processed = msg_copy_to_iov(c->cur_to_user, to);
	total_processed += processed;

	WARN_ON(c->cur_to_user->posn_to_user > c->cur_to_user->total_to_user);
	if (c->cur_to_user->posn_to_user == c->cur_to_user->total_to_user) {
		struct message *m = c->cur_to_user;

		c->cur_to_user = NULL;
		list_add_tail(&m->from_user, &c->from_user);
	}

cleanup_unlock:
	mutex_unlock(&c->lock);
	return total_processed;
}

static ssize_t dev_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct channel *c = channel_from_file(iocb->ki_filp);
	ssize_t total_processed = 0;
	ssize_t processed;

	mutex_lock(&c->lock);

	if (unlikely(c->from_user_error)) {
		total_processed = c->from_user_error;
		goto cleanup_unlock;
	}

	/*
	 * cur_from_user can never be NULL.  If there's no real message it must
	 * point to the scratch space.
	 */
	WARN_ON(c->cur_from_user == NULL);
	if (c->cur_from_user->posn_from_user < sizeof(struct dm_user_message)) {
		struct message *msg, *old;

		processed = msg_copy_from_iov(c->cur_from_user, from);
		if (processed <= 0) {
			pr_warn("msg_copy_from_iov() returned %zd\n",
				processed);
			c->from_user_error = -EINVAL;
			goto cleanup_unlock;
		}
		total_processed += processed;

		/*
		 * In the unlikely event the user has provided us a very short
		 * write, not even big enough to fill a message, just succeed.
		 * We'll eventually build up enough bytes to do something.
		 */
		if (unlikely(c->cur_from_user->posn_from_user <
			     sizeof(struct dm_user_message)))
			goto cleanup_unlock;

		old = c->cur_from_user;
		mutex_lock(&c->target->lock);
		msg = msg_get_from_user(c, c->cur_from_user->msg.seq);
		if (msg == NULL) {
			pr_info("user provided an invalid message seq of %llx\n",
				old->msg.seq);
			mutex_unlock(&c->target->lock);
			c->from_user_error = -EINVAL;
			goto cleanup_unlock;
		}
		mutex_unlock(&c->target->lock);

		WARN_ON(old->posn_from_user != sizeof(struct dm_user_message));
		msg->posn_from_user = sizeof(struct dm_user_message);
		msg->return_type = old->msg.type;
		msg->return_flags = old->msg.flags;
		WARN_ON(msg->posn_from_user > msg->total_from_user);
		c->cur_from_user = msg;
		WARN_ON(old != &c->scratch_message_from_user);
	}

	/*
	 * Userspace can signal an error for individual requests by
	 * overwriting the type field in its response.
	 */
	switch (c->cur_from_user->return_type) {
	case DM_USER_RESP_SUCCESS:
		c->cur_from_user->bio->bi_status = BLK_STS_OK;
		break;
	case DM_USER_RESP_ERROR:
	case DM_USER_RESP_UNSUPPORTED:
	default:
		c->cur_from_user->bio->bi_status = BLK_STS_IOERR;
		goto finish_bio;
	}

	/*
	 * The op was a success as far as userspace is concerned, so process
	 * whatever data may come along with it.  The user may provide the BIO
	 * data in multiple chunks, in which case we don't need to finish the
	 * BIO.
	 */
	processed = msg_copy_from_iov(c->cur_from_user, from);
	total_processed += processed;

	if (c->cur_from_user->posn_from_user <
	    c->cur_from_user->total_from_user)
		goto cleanup_unlock;

finish_bio:
	/*
	 * When we set up this message the BIO's size matched the
	 * message size, if that's not still the case then something
	 * has gone off the rails.
	 */
	WARN_ON(bio_size(c->cur_from_user->bio) != 0);
	bio_endio(c->cur_from_user->bio);

	/*
	 * We don't actually need to take the target lock here, as all
	 * we're doing is freeing the message and mempools have their
	 * own lock.  Each channel has its own scratch message.
	 */
	WARN_ON(c->cur_from_user == &c->scratch_message_from_user);
	mempool_free(c->cur_from_user, &c->target->message_pool);
	c->scratch_message_from_user.posn_from_user = 0;
	c->cur_from_user = &c->scratch_message_from_user;

cleanup_unlock:
	mutex_unlock(&c->lock);
	return total_processed;
}
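
/*
 * Illustrative sequence of userspace calls accepted by dev_write() above for
 * a read request whose payload isn't written in one go (the fd, sizes and
 * buffer names are hypothetical; hdr carries the request's seq and has its
 * type set to DM_USER_RESP_SUCCESS):
 *
 *	write(fd, hdr, sizeof(struct dm_user_message));
 *	write(fd, data, 2048);
 *	write(fd, data + 2048, 2048);
 *
 * The first call adopts the message matching hdr->seq; the later calls carry
 * only payload bytes, and the BIO is completed once posn_from_user reaches
 * total_from_user.  All three calls must arrive on the same channel, since
 * dev_read() moved the message onto that channel's from_user list.
 */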

static int dev_release(struct inode *inode, struct file *file)
{
	struct channel *c;

	c = channel_from_file(file);
	mutex_lock(&c->lock);
	channel_free(c);

	return 0;
}

static const struct file_operations file_operations = {
	.owner = THIS_MODULE,
	.open = dev_open,
	.llseek = no_llseek,
	.read_iter = dev_read,
	.write_iter = dev_write,
	.release = dev_release,
};

static int user_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct target *t;
	int r;

	if (argc != 3) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto cleanup_none;
	}

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (t == NULL) {
		r = -ENOMEM;
		goto cleanup_none;
	}
	ti->private = t;

	/* Enable more BIO types. */
	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->num_flush_bios = 1;
	ti->flush_supported = true;

	/*
	 * We begin with a single reference to the target, which is miscdev's
	 * reference.  This ensures that the target won't be freed
	 * until after the miscdev has been unregistered and all extant
	 * channels have been closed.
	 */
	kref_init(&t->references);

	t->daemon_terminated = false;
	mutex_init(&t->lock);
	init_waitqueue_head(&t->wq);
	INIT_LIST_HEAD(&t->to_user);
	mempool_init_kmalloc_pool(&t->message_pool, MAX_OUTSTANDING_MESSAGES,
				  sizeof(struct message));

	t->miscdev.minor = MISC_DYNAMIC_MINOR;
	t->miscdev.fops = &file_operations;
	t->miscdev.name = kasprintf(GFP_KERNEL, "dm-user/%s", argv[2]);
	if (t->miscdev.name == NULL) {
		r = -ENOMEM;
		goto cleanup_message_pool;
	}

	/*
	 * Once the miscdev is registered it can be opened and therefore
	 * concurrent references to the channel can happen.  Holding the target
	 * lock during misc_register() could deadlock.  If registration
	 * succeeds then we will not access the target again so we just stick a
	 * barrier here, which pairs with taking the target lock everywhere
	 * else the target is accessed.
	 *
	 * I forgot where we ended up on the RCpc/RCsc locks.  IIUC, RCsc locks
	 * would mean that we could take the target lock earlier and release it
	 * here instead of the memory barrier.  I'm not sure that's any better,
	 * though, and this isn't on a hot path so it probably doesn't matter
	 * either way.
	 */
	smp_mb();

	r = misc_register(&t->miscdev);
	if (r) {
		DMERR("Unable to register miscdev %s for dm-user",
		      t->miscdev.name);
		r = -ENOMEM;
		goto cleanup_misc_name;
	}

	return 0;

cleanup_misc_name:
	kfree(t->miscdev.name);
cleanup_message_pool:
	mempool_exit(&t->message_pool);
	kfree(t);
cleanup_none:
	return r;
}

static void user_dtr(struct dm_target *ti)
{
	struct target *t = target_from_target(ti);

	/*
	 * Removes the miscdev.  This must be called without the target lock
	 * held to avoid a possible deadlock because our open implementation is
	 * called holding the miscdev lock and must later take the target lock.
	 *
	 * There is no race here because only DM can register/unregister the
	 * miscdev, and DM ensures that doesn't happen twice.  The internal
	 * miscdev lock is sufficient to ensure there are no races between
	 * deregistering the miscdev and open.
	 */
	misc_deregister(&t->miscdev);

	/*
	 * We are now free to take the target's lock and drop our reference to
	 * the target.  There are almost certainly tasks sleeping in read on at
	 * least one of the channels associated with this target; this
	 * explicitly wakes them up and terminates the read.
	 */
	mutex_lock(&t->lock);
	/*
	 * No barrier here, as wait/wake ensures that the flag visibility is
	 * correct WRT the wake/sleep state of the target tasks.
	 */
	t->dm_destroyed = true;
	wake_up_all(&t->wq);
	target_put(t);
}
1140*4882a593Smuzhiyun
1141*4882a593Smuzhiyun /*
1142*4882a593Smuzhiyun * Consumes a BIO from device mapper, queueing it up for userspace.
1143*4882a593Smuzhiyun */
user_map(struct dm_target * ti,struct bio * bio)1144*4882a593Smuzhiyun static int user_map(struct dm_target *ti, struct bio *bio)
1145*4882a593Smuzhiyun {
1146*4882a593Smuzhiyun struct target *t;
1147*4882a593Smuzhiyun struct message *entry;
1148*4882a593Smuzhiyun
1149*4882a593Smuzhiyun t = target_from_target(ti);
	/*
	 * FIXME
	 *
	 * This seems like a bad idea.  Specifically, here we're directly on
	 * the IO path when we take the target lock, which may also be taken
	 * from a user context.  The user context doesn't actively trigger
	 * anything that may sleep while holding the lock, but this still
	 * seems like a bad idea.
	 *
	 * The obvious way to fix this would be to use a proper queue, which
	 * would result in no shared locks between the direct IO path and user
	 * tasks.  I had a version that did this, but the head-of-line
	 * blocking from the circular buffer resulted in us needing a fairly
	 * large allocation in order to avoid situations in which the queue
	 * fills up and everything goes off the rails.
	 *
	 * I could jump through some hoops to avoid a shared lock while still
	 * allowing for a large queue, but I'm not actually sure that allowing
	 * for very large queues is the right thing to do here.  Intuitively
	 * it seems better to keep the queues small in here (essentially sized
	 * to the user latency, for performance reasons only) and rely on
	 * returning DM_MAPIO_REQUEUE regularly, as that would give the rest
	 * of the kernel more information.
	 *
	 * I'll spend some time trying to figure out what's going on with
	 * DM_MAPIO_REQUEUE, but if someone has a better idea of how to fix
	 * this I'm all ears.
	 */
	mutex_lock(&t->lock);

	/*
	 * FIXME
	 *
	 * The assumption here is that there's no benefit to returning
	 * DM_MAPIO_KILL as opposed to just erroring out the BIO, but I'm not
	 * sure that's actually true -- for example, I could imagine users
	 * expecting that submitted BIOs are unlikely to fail and therefore
	 * relying on submission failure to indicate an unsupported type.
	 *
	 * There are two ways I can think of to fix this:
	 *   - Add DM arguments that are parsed during the constructor that
	 *     allow various dm_target flags to be set that indicate the op
	 *     types supported by this target.  This may make sense for things
	 *     like discard, where DM can already transform the BIOs to a form
	 *     that's likely to be supported.
	 *   - Some sort of pre-filter that allows userspace to hook in here
	 *     and kill BIOs before marking them as submitted.  My guess would
	 *     be that a userspace round trip is a bad idea here, but a BPF
	 *     call seems reasonable.
	 *
	 * My guess is that we'd likely want to do both.  The first one is
	 * easy and gives DM the proper info, so it seems better.  The BPF
	 * call seems overly complex for just this, but one could imagine
	 * wanting to sometimes return _MAPPED and a BPF filter would be the
	 * way to do that.
	 *
	 * For example, in Android we have an in-kernel DM device called
	 * "dm-bow" that takes advantage of some portion of the space that has
	 * been discarded on a device to provide opportunistic block-level
	 * backups.  While one could imagine just implementing this entirely
	 * in userspace, that would come with an appreciable performance
	 * penalty.  Instead one could keep a BPF program that forwards most
	 * accesses directly to the backing block device while informing a
	 * userspace daemon of any discarded space and of writes to blocks
	 * that are to be backed up.
	 */
	if (unlikely((bio_type_to_user_type(bio) < 0) ||
		     (bio_flags_to_user_flags(bio) < 0))) {
		mutex_unlock(&t->lock);
		return DM_MAPIO_KILL;
	}

	entry = msg_get_map(t);
	if (unlikely(entry == NULL)) {
		mutex_unlock(&t->lock);
		return DM_MAPIO_REQUEUE;
	}

	entry->msg.type = bio_type_to_user_type(bio);
	entry->msg.flags = bio_flags_to_user_flags(bio);
	entry->msg.sector = bio->bi_iter.bi_sector;
	entry->msg.len = bio_size(bio);
	entry->bio = bio;
	entry->posn_to_user = 0;
	entry->total_to_user = bio_bytes_needed_to_user(bio);
	entry->posn_from_user = 0;
	entry->total_from_user = bio_bytes_needed_from_user(bio);
	entry->delayed = false;
	entry->t = t;
	/* Pairs with the barrier in dev_read() */
	smp_wmb();
	list_add_tail(&entry->to_user, &t->to_user);

	/*
	 * If there is no daemon to process the IOs, queue these messages
	 * into a workqueue with a timeout.
	 */
	if (!is_user_space_thread_present(t))
		enqueue_delayed_work(entry, !t->daemon_terminated);

	wake_up_interruptible(&t->wq);
	mutex_unlock(&t->lock);
	return DM_MAPIO_SUBMITTED;
}
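
/*
 * A minimal sketch of the userspace side that consumes the messages queued
 * above.  It is illustrative only and untested: it assumes the control node
 * appears under /dev/dm-user/<name>, that a single read() returns exactly one
 * message (header plus any write payload), and that the message layout and
 * the DM_USER_REQ_MAP_READ / DM_USER_RESP_SUCCESS constants match
 * <linux/dm-user.h>.  A real daemon would do actual I/O instead of returning
 * zeroes and would handle partial reads/writes and errors properly.
 */
#if 0
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <linux/dm-user.h>

#define PAYLOAD_MAX (1 << 20)	/* arbitrary per-message payload cap */

int main(void)
{
	/* "example" stands in for whatever name this target was given. */
	int ctl = open("/dev/dm-user/example", O_RDWR);
	char *raw = malloc(sizeof(struct dm_user_message) + PAYLOAD_MAX);
	struct dm_user_message *msg = (struct dm_user_message *)raw;

	if (ctl < 0 || raw == NULL)
		return 1;

	for (;;) {
		/* One message per read: header plus any write payload. */
		ssize_t n = read(ctl, raw, sizeof(*msg) + PAYLOAD_MAX);
		size_t reply = sizeof(*msg);

		if (n < (ssize_t)sizeof(*msg) || msg->len > PAYLOAD_MAX)
			break;

		if (msg->type == DM_USER_REQ_MAP_READ) {
			/* Read replies carry the data after the header. */
			memset(raw + sizeof(*msg), 0, msg->len);
			reply += msg->len;
		}

		/*
		 * Leave msg->seq untouched so the kernel can match this reply
		 * to the outstanding message, and flip the type to a response
		 * code before writing the reply back.
		 */
		msg->type = DM_USER_RESP_SUCCESS;
		if (write(ctl, raw, reply) < 0)
			break;
	}

	free(raw);
	close(ctl);
	return 0;
}
#endif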

static struct target_type user_target = {
	.name = "user",
	.version = { 1, 0, 0 },
	.module = THIS_MODULE,
	.ctr = user_ctr,
	.dtr = user_dtr,
	.map = user_map,
};

static int __init dm_user_init(void)
{
	int r;

	r = dm_register_target(&user_target);
	if (r) {
		DMERR("register failed %d", r);
		goto error;
	}

	return 0;

error:
	return r;
}

static void __exit dm_user_exit(void)
{
	dm_unregister_target(&user_target);
}

module_init(dm_user_init);
module_exit(dm_user_exit);
MODULE_AUTHOR("Palmer Dabbelt <palmerdabbelt@google.com>");
MODULE_DESCRIPTION(DM_NAME " target returning blocks from userspace");
MODULE_LICENSE("GPL");