// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2020 Google, Inc
 * Copyright (C) 2020 Palmer Dabbelt <palmerdabbelt@google.com>
 */

#include <linux/device-mapper.h>
#include <uapi/linux/dm-user.h>

#include <linux/bio.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/uio.h>
#include <linux/wait.h>
#include <linux/workqueue.h>

#define DM_MSG_PREFIX "user"

#define MAX_OUTSTANDING_MESSAGES 128

static unsigned int daemon_timeout_msec = 4000;
module_param_named(dm_user_daemon_timeout_msec, daemon_timeout_msec, uint,
		   0644);
MODULE_PARM_DESC(dm_user_daemon_timeout_msec,
		 "IO Timeout in msec if daemon does not process");

/*
 * dm-user uses four structures:
 *
 *  - "struct target", the outermost structure, corresponds to a single device
 *    mapper target.  This contains the set of outstanding BIOs that have been
 *    provided by DM and are not actively being processed by the user, along
 *    with a misc device that userspace can open to communicate with the
 *    kernel.  Each time userspace opens the misc device a new channel is
 *    created.
 *  - "struct channel", which represents a single active communication channel
 *    with userspace.  Userspace may choose arbitrary read/write sizes to use
 *    when processing messages; channels assemble these into logical accesses.
 *    When userspace responds to a full message the channel completes the BIO
 *    and obtains a new message to process from the target.
 *  - "struct message", which wraps a BIO with the additional information
 *    required by the kernel to sort out what to do with BIOs when they return
 *    from userspace.
 *  - "struct dm_user_message", which is the exact message format that
 *    userspace sees.
 *
 * The hot path contains three distinct operations:
 *
 *  - user_map(), which is provided a BIO from device mapper that is queued
 *    into the target.  This allocates and enqueues a new message.
 *  - dev_read(), which dequeues a message and copies it to userspace.
 *  - dev_write(), which looks up a message (keyed by sequence number) and
 *    completes the corresponding BIO.
 *
 * Lock ordering (outer to inner)
 *
 * 1) miscdevice's global lock.  This is held around dev_open, so it has to be
 *    the outermost lock.
 * 2) target->lock
 * 3) channel->lock
 */
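
/*
 * Purely illustrative sketch (not part of this driver) of the userspace side
 * of the flow described above: a daemon opens the misc device, reads one full
 * request per message, services it, and writes back a response carrying the
 * same seq.  The device path, buffer sizing, and the payload-length handling
 * below are assumptions for illustration only; the seq/type/flags fields and
 * the "payload follows the header" layout are what this file relies on.
 *
 *	int fd = open("/dev/dm-user/<name>", O_RDWR);
 *	char buf[sizeof(struct dm_user_message) + MAX_IO_BYTES];
 *
 *	for (;;) {
 *		struct dm_user_message *msg = (struct dm_user_message *)buf;
 *		ssize_t n = read(fd, buf, sizeof(buf));
 *
 *		if (n < (ssize_t)sizeof(*msg))
 *			break;
 *
 *		switch (msg->type) {
 *		case DM_USER_REQ_MAP_READ:
 *			// fill buf + sizeof(*msg) with the requested bytes,
 *			// then return the header plus that payload
 *			msg->type = DM_USER_RESP_SUCCESS;
 *			write(fd, buf, sizeof(*msg) + payload_bytes);
 *			break;
 *		case DM_USER_REQ_MAP_WRITE:
 *			// the data to persist starts at buf + sizeof(*msg)
 *			msg->type = DM_USER_RESP_SUCCESS;
 *			write(fd, msg, sizeof(*msg));
 *			break;
 *		default:
 *			msg->type = DM_USER_RESP_UNSUPPORTED;
 *			write(fd, msg, sizeof(*msg));
 *		}
 *	}
 *
 * MAX_IO_BYTES and payload_bytes are hypothetical placeholders; a real daemon
 * derives the transfer length from the request itself (see the uapi header).
 */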

struct message {
	/*
	 * Messages themselves do not need a lock, they're protected by either
	 * the target or channel's lock, depending on which can reference them
	 * directly.
	 */
	struct dm_user_message msg;
	struct bio *bio;
	size_t posn_to_user;
	size_t total_to_user;
	size_t posn_from_user;
	size_t total_from_user;

	struct list_head from_user;
	struct list_head to_user;

	/*
	 * These are written back from the user.  They live in the same spot in
	 * the message, but we need to either keep the old values around or
	 * call a bunch more BIO helpers.  These are only valid after write has
	 * adopted the message.
	 */
	u64 return_type;
	u64 return_flags;

	struct delayed_work work;
	bool delayed;
	struct target *t;
};

struct target {
	/*
	 * A target has a single lock, which protects everything in the target
	 * (but does not protect the channels associated with a target).
	 */
	struct mutex lock;

	/*
	 * There is only one point at which anything blocks: userspace blocks
	 * reading a new message, which is woken up by device mapper providing
	 * a new BIO to process (or tearing down the target).  The
	 * corresponding write side doesn't block; instead we treat userspace's
	 * response containing a message that has yet to be mapped as an
	 * invalid operation.
	 */
	struct wait_queue_head wq;

	/*
	 * Messages are delivered to userspace in order, but may be returned
	 * out of order.  This allows userspace to schedule IO if it wants to.
	 */
	mempool_t message_pool;
	u64 next_seq_to_map;
	u64 next_seq_to_user;
	struct list_head to_user;

	/*
	 * There is a misc device per target.  The name is selected by
	 * userspace (via a DM create ioctl argument), and each ends up in
	 * /dev/dm-user/.  A better way to do this might be to have a
	 * filesystem manage these, but this was more expedient.  The current
	 * mechanism is functional, but does result in an arbitrary number of
	 * dynamically created misc devices.
	 */
	struct miscdevice miscdev;

	/*
	 * Device mapper's target destructor triggers tearing this all down,
	 * but we can't actually free until every channel associated with this
	 * target has been destroyed.  Channels each have a reference to their
	 * target, and there is an additional single reference that corresponds
	 * to both DM and the misc device (both of which are destroyed by DM).
	 *
	 * In the common case userspace will be asleep waiting for a new
	 * message when device mapper decides to destroy the target, which
	 * means no new messages will appear.  The destroyed flag triggers a
	 * wakeup, which will end up removing the reference.
	 */
	struct kref references;
	int dm_destroyed;
	bool daemon_terminated;
};

struct channel {
	struct target *target;

	/*
	 * A channel has a single lock, which prevents multiple reads (or
	 * multiple writes) from conflicting with each other.
	 */
	struct mutex lock;

	struct message *cur_to_user;
	struct message *cur_from_user;
	ssize_t to_user_error;
	ssize_t from_user_error;

	/*
	 * Once a message has been forwarded to userspace on a channel it must
	 * be responded to on the same channel.  This allows us to error out
	 * the messages that have not yet been responded to by a channel when
	 * that channel closes, which makes handling errors more reasonable for
	 * fault-tolerant userspace daemons.  It also happens to make avoiding
	 * shared locks between user_map() and dev_read() a lot easier.
	 *
	 * This does preclude a multi-threaded work stealing userspace
	 * implementation (or at least, force a degree of head-of-line blocking
	 * on the response path).
	 */
	struct list_head from_user;

	/*
	 * Responses from userspace can arrive in arbitrarily small chunks.
	 * We need some place to buffer one up until we can find the
	 * corresponding kernel-side message to continue processing, so instead
	 * of allocating them we just keep one off to the side here.  This can
	 * only ever be pointed to by cur_from_user, and will never have a BIO.
	 */
	struct message scratch_message_from_user;
};

static void message_kill(struct message *m, mempool_t *pool)
{
	m->bio->bi_status = BLK_STS_IOERR;
	bio_endio(m->bio);
	mempool_free(m, pool);
}

static inline bool is_user_space_thread_present(struct target *t)
{
	lockdep_assert_held(&t->lock);
	return (kref_read(&t->references) > 1);
}

static void process_delayed_work(struct work_struct *work)
{
	struct delayed_work *del_work = to_delayed_work(work);
	struct message *msg = container_of(del_work, struct message, work);

	struct target *t = msg->t;

	mutex_lock(&t->lock);

	/*
	 * There is at least one thread to process the IO.
	 */
	if (is_user_space_thread_present(t)) {
		mutex_unlock(&t->lock);
		return;
	}

	/*
	 * Terminate the IO with an error
	 */
	list_del(&msg->to_user);
	pr_err("I/O error: sector %llu: no user-space daemon for %s target\n",
	       msg->bio->bi_iter.bi_sector,
	       t->miscdev.name);
	message_kill(msg, &t->message_pool);
	mutex_unlock(&t->lock);
}

static void enqueue_delayed_work(struct message *m, bool is_delay)
{
	unsigned long delay = 0;

	m->delayed = true;
	INIT_DELAYED_WORK(&m->work, process_delayed_work);

	/*
	 * The snapuserd daemon is the user-space process which services IO
	 * requests from dm-user while an OTA is applied.  Per the current
	 * design, when a dm-user target is created, the daemon attaches to
	 * the target and starts processing the IOs.  The daemon is terminated
	 * only when the dm-user target is destroyed.
	 *
	 * If for some reason the daemon crashes or terminates early, without
	 * destroying the dm-user target, then there is no mechanism to
	 * restart the daemon and resume processing IOs for the same target.
	 * Theoretically it is possible, but that infrastructure doesn't exist
	 * in the Android ecosystem.
	 *
	 * Thus, when the daemon terminates, there is no way the IOs issued on
	 * that target will be processed.  Hence, we set the delay to 0 and
	 * fail the IOs immediately.
	 *
	 * On the other hand, when a new dm-user target is created, we wait
	 * for the daemon to attach for the first time.  This primarily
	 * happens when first-stage init spins up the daemon.  At this point,
	 * since the snapshot device is mounted as the root filesystem, the
	 * dm-user target may receive IO requests even though the daemon is
	 * not fully launched.  We don't want to fail those IO requests
	 * immediately.  Thus, we queue these requests with a timeout so that
	 * the daemon has time to become ready to process them.  Again, if the
	 * daemon fails to launch within the timeout period, the IOs will be
	 * failed.
	 */
	if (is_delay)
		delay = msecs_to_jiffies(daemon_timeout_msec);

	queue_delayed_work(system_wq, &m->work, delay);
}
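
/*
 * Worked example of the policy above, assuming the default 4000 msec timeout:
 * a BIO mapped at t=0 while no daemon has attached yet (is_delay == true) is
 * queued with a 4 s delay.  If a channel opens before t=4 s, msg_get_to_user()
 * cancels the delayed work and the IO is handed to that daemon; otherwise
 * process_delayed_work() runs at t=4 s and fails the IO with BLK_STS_IOERR.
 * When an already-attached daemon later dies (is_delay == false), pending IOs
 * are re-queued with a 0 delay and therefore fail immediately.
 */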

static inline struct target *target_from_target(struct dm_target *target)
{
	WARN_ON(target->private == NULL);
	return target->private;
}

static inline struct target *target_from_miscdev(struct miscdevice *miscdev)
{
	return container_of(miscdev, struct target, miscdev);
}

static inline struct channel *channel_from_file(struct file *file)
{
	WARN_ON(file->private_data == NULL);
	return file->private_data;
}

static inline struct target *target_from_channel(struct channel *c)
{
	WARN_ON(c->target == NULL);
	return c->target;
}

static inline size_t bio_size(struct bio *bio)
{
	struct bio_vec bvec;
	struct bvec_iter iter;
	size_t out = 0;

	bio_for_each_segment (bvec, bio, iter)
		out += bio_iter_len(bio, iter);
	return out;
}

static inline size_t bio_bytes_needed_to_user(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		return sizeof(struct dm_user_message) + bio_size(bio);
	case REQ_OP_READ:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE_ZEROES:
		return sizeof(struct dm_user_message);

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline size_t bio_bytes_needed_from_user(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		return sizeof(struct dm_user_message) + bio_size(bio);
	case REQ_OP_WRITE:
	case REQ_OP_FLUSH:
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE_ZEROES:
		return sizeof(struct dm_user_message);

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}
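
/*
 * Worked example of the two helpers above for a 4096-byte BIO:
 *
 *	REQ_OP_READ:	to_user   = sizeof(struct dm_user_message)
 *			from_user = sizeof(struct dm_user_message) + 4096
 *	REQ_OP_WRITE:	to_user   = sizeof(struct dm_user_message) + 4096
 *			from_user = sizeof(struct dm_user_message)
 *
 * i.e. WRITE payloads flow to userspace with the request, READ payloads flow
 * back from userspace with the response, and every message carries exactly
 * one header in each direction.
 */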

static inline long bio_type_to_user_type(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_READ:
		return DM_USER_REQ_MAP_READ;
	case REQ_OP_WRITE:
		return DM_USER_REQ_MAP_WRITE;
	case REQ_OP_FLUSH:
		return DM_USER_REQ_MAP_FLUSH;
	case REQ_OP_DISCARD:
		return DM_USER_REQ_MAP_DISCARD;
	case REQ_OP_SECURE_ERASE:
		return DM_USER_REQ_MAP_SECURE_ERASE;
	case REQ_OP_WRITE_SAME:
		return DM_USER_REQ_MAP_WRITE_SAME;
	case REQ_OP_WRITE_ZEROES:
		return DM_USER_REQ_MAP_WRITE_ZEROES;

	/*
	 * These ops are not passed to userspace under the assumption that
	 * they're not going to be particularly useful in that context.
	 */
	default:
		return -EOPNOTSUPP;
	}
}

static inline long bio_flags_to_user_flags(struct bio *bio)
{
	u64 out = 0;
	typeof(bio->bi_opf) opf = bio->bi_opf & ~REQ_OP_MASK;

	if (opf & REQ_FAILFAST_DEV) {
		opf &= ~REQ_FAILFAST_DEV;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DEV;
	}

	if (opf & REQ_FAILFAST_TRANSPORT) {
		opf &= ~REQ_FAILFAST_TRANSPORT;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_TRANSPORT;
	}

	if (opf & REQ_FAILFAST_DRIVER) {
		opf &= ~REQ_FAILFAST_DRIVER;
		out |= DM_USER_REQ_MAP_FLAG_FAILFAST_DRIVER;
	}

	if (opf & REQ_SYNC) {
		opf &= ~REQ_SYNC;
		out |= DM_USER_REQ_MAP_FLAG_SYNC;
	}

	if (opf & REQ_META) {
		opf &= ~REQ_META;
		out |= DM_USER_REQ_MAP_FLAG_META;
	}

	if (opf & REQ_PRIO) {
		opf &= ~REQ_PRIO;
		out |= DM_USER_REQ_MAP_FLAG_PRIO;
	}

	if (opf & REQ_NOMERGE) {
		opf &= ~REQ_NOMERGE;
		out |= DM_USER_REQ_MAP_FLAG_NOMERGE;
	}

	if (opf & REQ_IDLE) {
		opf &= ~REQ_IDLE;
		out |= DM_USER_REQ_MAP_FLAG_IDLE;
	}

	if (opf & REQ_INTEGRITY) {
		opf &= ~REQ_INTEGRITY;
		out |= DM_USER_REQ_MAP_FLAG_INTEGRITY;
	}

	if (opf & REQ_FUA) {
		opf &= ~REQ_FUA;
		out |= DM_USER_REQ_MAP_FLAG_FUA;
	}

	if (opf & REQ_PREFLUSH) {
		opf &= ~REQ_PREFLUSH;
		out |= DM_USER_REQ_MAP_FLAG_PREFLUSH;
	}

	if (opf & REQ_RAHEAD) {
		opf &= ~REQ_RAHEAD;
		out |= DM_USER_REQ_MAP_FLAG_RAHEAD;
	}

	if (opf & REQ_BACKGROUND) {
		opf &= ~REQ_BACKGROUND;
		out |= DM_USER_REQ_MAP_FLAG_BACKGROUND;
	}

	if (opf & REQ_NOWAIT) {
		opf &= ~REQ_NOWAIT;
		out |= DM_USER_REQ_MAP_FLAG_NOWAIT;
	}

	if (opf & REQ_NOUNMAP) {
		opf &= ~REQ_NOUNMAP;
		out |= DM_USER_REQ_MAP_FLAG_NOUNMAP;
	}

	if (unlikely(opf)) {
		pr_warn("unsupported BIO type %x\n", opf);
		return -EOPNOTSUPP;
	}
	WARN_ON(out < 0);
	return out;
}

/*
 * Not quite what's in blk-map.c, but instead what I thought the functions in
 * blk-map did.  This one seems more generally useful and I think we could
 * write the blk-map version in terms of this one.  The differences are that
 * this has a return value that counts, and blk-map uses the BIO _all iters.
 * Both helpers here advance the IOV iter but not the BIO iter, which is a bit
 * odd.
 */
static ssize_t bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
{
	struct bio_vec bvec;
	struct bvec_iter biter;
	ssize_t out = 0;

	bio_for_each_segment (bvec, bio, biter) {
		ssize_t ret;

		ret = copy_page_from_iter(bvec.bv_page, bvec.bv_offset,
					  bvec.bv_len, iter);

		/*
		 * FIXME: I thought that IOV copies had a mechanism for
		 * terminating early, if for example a signal came in while
		 * sleeping waiting for a page to be mapped, but I don't see
		 * where that would happen.
		 */
		WARN_ON(ret < 0);
		out += ret;

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec.bv_len)
			return ret;
	}

	return out;
}

static ssize_t bio_copy_to_iter(struct bio *bio, struct iov_iter *iter)
{
	struct bio_vec bvec;
	struct bvec_iter biter;
	ssize_t out = 0;

	bio_for_each_segment (bvec, bio, biter) {
		ssize_t ret;

		ret = copy_page_to_iter(bvec.bv_page, bvec.bv_offset,
					bvec.bv_len, iter);

		/* as above */
		WARN_ON(ret < 0);
		out += ret;

		if (!iov_iter_count(iter))
			break;

		if (ret < bvec.bv_len)
			return ret;
	}

	return out;
}

static ssize_t msg_copy_to_iov(struct message *msg, struct iov_iter *to)
{
	ssize_t copied = 0;

	if (!iov_iter_count(to))
		return 0;

	if (msg->posn_to_user < sizeof(msg->msg)) {
		copied = copy_to_iter((char *)(&msg->msg) + msg->posn_to_user,
				      sizeof(msg->msg) - msg->posn_to_user, to);
	} else {
		copied = bio_copy_to_iter(msg->bio, to);
		if (copied > 0)
			bio_advance(msg->bio, copied);
	}

	if (copied < 0)
		return copied;

	msg->posn_to_user += copied;
	return copied;
}

static ssize_t msg_copy_from_iov(struct message *msg, struct iov_iter *from)
{
	ssize_t copied = 0;

	if (!iov_iter_count(from))
		return 0;

	if (msg->posn_from_user < sizeof(msg->msg)) {
		copied = copy_from_iter(
			(char *)(&msg->msg) + msg->posn_from_user,
			sizeof(msg->msg) - msg->posn_from_user, from);
	} else {
		copied = bio_copy_from_iter(msg->bio, from);
		if (copied > 0)
			bio_advance(msg->bio, copied);
	}

	if (copied < 0)
		return copied;

	msg->posn_from_user += copied;
	return copied;
}

static struct message *msg_get_map(struct target *t)
{
	struct message *m;

	lockdep_assert_held(&t->lock);

	m = mempool_alloc(&t->message_pool, GFP_NOIO);
	m->msg.seq = t->next_seq_to_map++;
	INIT_LIST_HEAD(&m->to_user);
	INIT_LIST_HEAD(&m->from_user);
	return m;
}

static struct message *msg_get_to_user(struct target *t)
{
	struct message *m;

	lockdep_assert_held(&t->lock);

	if (list_empty(&t->to_user))
		return NULL;

	m = list_first_entry(&t->to_user, struct message, to_user);

	list_del(&m->to_user);

	/*
	 * If the IO was queued to workqueue since there
	 * was no daemon to service the IO, then we
	 * will have to cancel the delayed work as the
	 * IO will be processed by this user-space thread.
	 *
	 * If the delayed work was already picked up for
	 * processing, then wait for it to complete. Note
	 * that the IO will not be terminated by the work
	 * queue thread.
	 */
	if (unlikely(m->delayed)) {
		mutex_unlock(&t->lock);
		cancel_delayed_work_sync(&m->work);
		mutex_lock(&t->lock);
	}
	return m;
}

static struct message *msg_get_from_user(struct channel *c, u64 seq)
{
	struct message *m;
	struct list_head *cur, *tmp;

	lockdep_assert_held(&c->lock);

	list_for_each_safe (cur, tmp, &c->from_user) {
		m = list_entry(cur, struct message, from_user);
		if (m->msg.seq == seq) {
			list_del(&m->from_user);
			return m;
		}
	}

	return NULL;
}

/*
 * Returns 0 when there is no work left to do.  This must be callable without
 * holding the target lock, as it is part of the waitqueue's check expression.
 * When called without the lock it may spuriously indicate there is remaining
 * work, but when called with the lock it must be accurate.
 */
int target_poll(struct target *t)
{
	return !list_empty(&t->to_user) || t->dm_destroyed;
}

void target_release(struct kref *ref)
{
	struct target *t = container_of(ref, struct target, references);
	struct list_head *cur, *tmp;

	/*
	 * There may be outstanding BIOs that have not yet been given to
	 * userspace.  At this point there's nothing we can do about them, as
	 * there are and will never be any channels.
	 */
	list_for_each_safe (cur, tmp, &t->to_user) {
		struct message *m = list_entry(cur, struct message, to_user);

		if (unlikely(m->delayed)) {
			bool ret;

			mutex_unlock(&t->lock);
			ret = cancel_delayed_work_sync(&m->work);
			mutex_lock(&t->lock);
			if (!ret)
				continue;
		}
		message_kill(m, &t->message_pool);
	}

	mempool_exit(&t->message_pool);
	mutex_unlock(&t->lock);
	mutex_destroy(&t->lock);
	kfree(t);
}

void target_put(struct target *t)
{
	/*
	 * This both releases a reference to the target and the lock.  We leave
	 * it up to the caller to hold the lock, as they probably needed it for
	 * something else.
	 */
	lockdep_assert_held(&t->lock);

	if (!kref_put(&t->references, target_release)) {
		/*
		 * The user-space thread is getting terminated.  We need to
		 * scan the list for all the pending IOs that have not been
		 * processed yet and put them back on the workqueue for
		 * delayed processing.
		 */
		if (!is_user_space_thread_present(t)) {
			struct list_head *cur, *tmp;

			list_for_each_safe(cur, tmp, &t->to_user) {
				struct message *m = list_entry(cur,
							       struct message,
							       to_user);
				if (!m->delayed)
					enqueue_delayed_work(m, false);
			}
			/*
			 * The daemon attached to this target is terminated.
			 */
			t->daemon_terminated = true;
		}
		mutex_unlock(&t->lock);
	}
}

static struct channel *channel_alloc(struct target *t)
{
	struct channel *c;

	lockdep_assert_held(&t->lock);

	c = kzalloc(sizeof(*c), GFP_KERNEL);
	if (c == NULL)
		return NULL;

	kref_get(&t->references);
	c->target = t;
	c->cur_from_user = &c->scratch_message_from_user;
	mutex_init(&c->lock);
	INIT_LIST_HEAD(&c->from_user);
	return c;
}

void channel_free(struct channel *c)
{
	struct list_head *cur, *tmp;

	lockdep_assert_held(&c->lock);

	/*
	 * There may be outstanding BIOs that have been given to userspace but
	 * have not yet been completed.  The channel has been shut down so
	 * there's no way to process the rest of those messages, so we just go
	 * ahead and error out the BIOs.  Hopefully whatever's on the other end
	 * can handle the errors.  One could imagine splitting the BIOs and
	 * completing as much as we got, but that seems like overkill here.
	 *
	 * Our only other options would be to let the BIO hang around (which
	 * seems way worse) or to resubmit it to userspace in the hope there's
	 * another channel.  I don't really like the idea of submitting a
	 * message twice.
	 */
	if (c->cur_to_user != NULL)
		message_kill(c->cur_to_user, &c->target->message_pool);
	if (c->cur_from_user != &c->scratch_message_from_user)
		message_kill(c->cur_from_user, &c->target->message_pool);
	list_for_each_safe (cur, tmp, &c->from_user)
		message_kill(list_entry(cur, struct message, from_user),
			     &c->target->message_pool);

	mutex_lock(&c->target->lock);
	target_put(c->target);
	mutex_unlock(&c->lock);
	mutex_destroy(&c->lock);
	kfree(c);
}

static int dev_open(struct inode *inode, struct file *file)
{
	struct channel *c;
	struct target *t;

	/*
	 * This is called by miscdev, which sets private_data to point to the
	 * struct miscdevice that was opened.  The rest of our file operations
	 * want to refer to the channel that's been opened, so we swap that
	 * pointer out with a fresh channel.
	 *
	 * This is called with the miscdev lock held, which is also held while
	 * registering/unregistering the miscdev.  The miscdev must be
	 * registered for this to get called, which means there must be an
	 * outstanding reference to the target, which means it cannot be freed
	 * out from under us despite us not holding a reference yet.
	 */
	t = container_of(file->private_data, struct target, miscdev);
	mutex_lock(&t->lock);
	file->private_data = c = channel_alloc(t);

	if (c == NULL) {
		mutex_unlock(&t->lock);
		return -ENOMEM;
	}

	mutex_unlock(&t->lock);
	return 0;
}

static ssize_t dev_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct channel *c = channel_from_file(iocb->ki_filp);
	ssize_t total_processed = 0;
	ssize_t processed;

	mutex_lock(&c->lock);

	if (unlikely(c->to_user_error)) {
		total_processed = c->to_user_error;
		goto cleanup_unlock;
	}

	if (c->cur_to_user == NULL) {
		struct target *t = target_from_channel(c);

		mutex_lock(&t->lock);

		while (!target_poll(t)) {
			int e;

			mutex_unlock(&t->lock);
			mutex_unlock(&c->lock);
			e = wait_event_interruptible(t->wq, target_poll(t));
			mutex_lock(&c->lock);
			mutex_lock(&t->lock);

			if (unlikely(e != 0)) {
				/*
				 * We haven't processed any bytes in either the
				 * BIO or the IOV, so we can just terminate
				 * right now.  Code elsewhere in the kernel
				 * handles restarting the syscall when
				 * appropriate.
				 */
				total_processed = e;
				mutex_unlock(&t->lock);
				goto cleanup_unlock;
			}
		}

		if (unlikely(t->dm_destroyed)) {
			/*
			 * DM has destroyed this target, so just lock
			 * the user out.  There's really nothing else
			 * we can do here.  Note that we don't actually
			 * tear anything down until userspace has
			 * closed the FD, as there may still be
			 * outstanding BIOs.
			 *
			 * This is kind of a wacky error code to
			 * return.  My goal was really just to try and
			 * find something that wasn't likely to be
			 * returned by anything else in the miscdev
			 * path.  The message "block device required"
			 * seems like a somewhat reasonable thing to
			 * say when the target has disappeared out from
			 * under us, but "not block" isn't sensible.
			 */
			c->to_user_error = total_processed = -ENOTBLK;
			mutex_unlock(&t->lock);
			goto cleanup_unlock;
		}

		/*
		 * Ensures that accesses to the message data are not ordered
		 * before the remote accesses that produce that message data.
		 *
		 * This pairs with the barrier in user_map(), via the
		 * conditional within the while loop above.  Also see the lack
		 * of barrier in user_dtr(), which is why this can be after the
		 * destroyed check.
		 */
		smp_rmb();

		c->cur_to_user = msg_get_to_user(t);
		WARN_ON(c->cur_to_user == NULL);
		mutex_unlock(&t->lock);
	}

	processed = msg_copy_to_iov(c->cur_to_user, to);
	total_processed += processed;

	WARN_ON(c->cur_to_user->posn_to_user > c->cur_to_user->total_to_user);
	if (c->cur_to_user->posn_to_user == c->cur_to_user->total_to_user) {
		struct message *m = c->cur_to_user;

		c->cur_to_user = NULL;
		list_add_tail(&m->from_user, &c->from_user);
	}

cleanup_unlock:
	mutex_unlock(&c->lock);
	return total_processed;
}

static ssize_t dev_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct channel *c = channel_from_file(iocb->ki_filp);
	ssize_t total_processed = 0;
	ssize_t processed;

	mutex_lock(&c->lock);

	if (unlikely(c->from_user_error)) {
		total_processed = c->from_user_error;
		goto cleanup_unlock;
	}

	/*
	 * cur_from_user can never be NULL.  If there's no real message it must
	 * point to the scratch space.
	 */
	WARN_ON(c->cur_from_user == NULL);
	if (c->cur_from_user->posn_from_user < sizeof(struct dm_user_message)) {
		struct message *msg, *old;

		processed = msg_copy_from_iov(c->cur_from_user, from);
		if (processed <= 0) {
			pr_warn("msg_copy_from_iov() returned %zd\n",
				processed);
			c->from_user_error = -EINVAL;
			goto cleanup_unlock;
		}
		total_processed += processed;

		/*
		 * In the unlikely event the user has provided us a very short
		 * write, not even big enough to fill a message, just succeed.
		 * We'll eventually build up enough bytes to do something.
		 */
		if (unlikely(c->cur_from_user->posn_from_user <
			     sizeof(struct dm_user_message)))
			goto cleanup_unlock;

		old = c->cur_from_user;
		mutex_lock(&c->target->lock);
		msg = msg_get_from_user(c, c->cur_from_user->msg.seq);
		if (msg == NULL) {
			pr_info("user provided an invalid message seq of %llx\n",
				old->msg.seq);
			mutex_unlock(&c->target->lock);
			c->from_user_error = -EINVAL;
			goto cleanup_unlock;
		}
		mutex_unlock(&c->target->lock);

		WARN_ON(old->posn_from_user != sizeof(struct dm_user_message));
		msg->posn_from_user = sizeof(struct dm_user_message);
		msg->return_type = old->msg.type;
		msg->return_flags = old->msg.flags;
		WARN_ON(msg->posn_from_user > msg->total_from_user);
		c->cur_from_user = msg;
		WARN_ON(old != &c->scratch_message_from_user);
	}

	/*
	 * Userspace can signal an error for individual requests by
	 * overwriting the type field in its response.
	 */
	switch (c->cur_from_user->return_type) {
	case DM_USER_RESP_SUCCESS:
		c->cur_from_user->bio->bi_status = BLK_STS_OK;
		break;
	case DM_USER_RESP_ERROR:
	case DM_USER_RESP_UNSUPPORTED:
	default:
		c->cur_from_user->bio->bi_status = BLK_STS_IOERR;
		goto finish_bio;
	}

	/*
	 * The op was a success as far as userspace is concerned, so process
	 * whatever data may come along with it.  The user may provide the BIO
	 * data in multiple chunks, in which case we don't need to finish the
	 * BIO.
	 */
	processed = msg_copy_from_iov(c->cur_from_user, from);
	total_processed += processed;

	if (c->cur_from_user->posn_from_user <
	    c->cur_from_user->total_from_user)
		goto cleanup_unlock;

finish_bio:
	/*
	 * When we set up this message the BIO's size matched the
	 * message size, if that's not still the case then something
	 * has gone off the rails.
	 */
	WARN_ON(bio_size(c->cur_from_user->bio) != 0);
	bio_endio(c->cur_from_user->bio);

	/*
	 * We don't actually need to take the target lock here, as all
	 * we're doing is freeing the message and mempools have their
	 * own lock.  Each channel has its own scratch message.
	 */
	WARN_ON(c->cur_from_user == &c->scratch_message_from_user);
	mempool_free(c->cur_from_user, &c->target->message_pool);
	c->scratch_message_from_user.posn_from_user = 0;
	c->cur_from_user = &c->scratch_message_from_user;

cleanup_unlock:
	mutex_unlock(&c->lock);
	return total_processed;
}
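
/*
 * Illustrative sequence of userspace calls accepted by dev_write() above for
 * a read request whose payload isn't written in one go (the fd, sizes and
 * buffer names are hypothetical; hdr carries the request's seq and has its
 * type set to DM_USER_RESP_SUCCESS):
 *
 *	write(fd, hdr, sizeof(struct dm_user_message));
 *	write(fd, data, 2048);
 *	write(fd, data + 2048, 2048);
 *
 * The first call adopts the message matching hdr->seq; the later calls carry
 * only payload bytes, and the BIO is completed once posn_from_user reaches
 * total_from_user.  All three calls must arrive on the same channel, since
 * dev_read() moved the message onto that channel's from_user list.
 */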

static int dev_release(struct inode *inode, struct file *file)
{
	struct channel *c;

	c = channel_from_file(file);
	mutex_lock(&c->lock);
	channel_free(c);

	return 0;
}

static const struct file_operations file_operations = {
	.owner = THIS_MODULE,
	.open = dev_open,
	.llseek = no_llseek,
	.read_iter = dev_read,
	.write_iter = dev_write,
	.release = dev_release,
};

static int user_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct target *t;
	int r;

	if (argc != 3) {
		ti->error = "Invalid argument count";
		r = -EINVAL;
		goto cleanup_none;
	}

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (t == NULL) {
		r = -ENOMEM;
		goto cleanup_none;
	}
	ti->private = t;

	/* Enable more BIO types. */
	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->num_flush_bios = 1;
	ti->flush_supported = true;

	/*
	 * We begin with a single reference to the target, which is miscdev's
	 * reference.  This ensures that the target won't be freed
	 * until after the miscdev has been unregistered and all extant
	 * channels have been closed.
	 */
	kref_init(&t->references);

	t->daemon_terminated = false;
	mutex_init(&t->lock);
	init_waitqueue_head(&t->wq);
	INIT_LIST_HEAD(&t->to_user);
	mempool_init_kmalloc_pool(&t->message_pool, MAX_OUTSTANDING_MESSAGES,
				  sizeof(struct message));

	t->miscdev.minor = MISC_DYNAMIC_MINOR;
	t->miscdev.fops = &file_operations;
	t->miscdev.name = kasprintf(GFP_KERNEL, "dm-user/%s", argv[2]);
	if (t->miscdev.name == NULL) {
		r = -ENOMEM;
		goto cleanup_message_pool;
	}

	/*
	 * Once the miscdev is registered it can be opened and therefore
	 * concurrent references to the channel can happen.  Holding the target
	 * lock during misc_register() could deadlock.  If registration
	 * succeeds then we will not access the target again so we just stick a
	 * barrier here, which pairs with taking the target lock everywhere
	 * else the target is accessed.
	 *
	 * I forgot where we ended up on the RCpc/RCsc locks.  IIUC, RCsc locks
	 * would mean that we could take the target lock earlier and release it
	 * here instead of the memory barrier.  I'm not sure that's any better,
	 * though, and this isn't on a hot path so it probably doesn't matter
	 * either way.
	 */
	smp_mb();

	r = misc_register(&t->miscdev);
	if (r) {
		DMERR("Unable to register miscdev %s for dm-user",
		      t->miscdev.name);
		r = -ENOMEM;
		goto cleanup_misc_name;
	}

	return 0;

cleanup_misc_name:
	kfree(t->miscdev.name);
cleanup_message_pool:
	mempool_exit(&t->message_pool);
	kfree(t);
cleanup_none:
	return r;
}

static void user_dtr(struct dm_target *ti)
{
	struct target *t = target_from_target(ti);

	/*
	 * Removes the miscdev.  This must be called without the target lock
	 * held to avoid a possible deadlock because our open implementation is
	 * called holding the miscdev lock and must later take the target lock.
	 *
	 * There is no race here because only DM can register/unregister the
	 * miscdev, and DM ensures that doesn't happen twice.  The internal
	 * miscdev lock is sufficient to ensure there are no races between
	 * deregistering the miscdev and open.
	 */
	misc_deregister(&t->miscdev);

	/*
	 * We are now free to take the target's lock and drop our reference to
	 * the target.  There are almost certainly tasks sleeping in read on at
	 * least one of the channels associated with this target; this
	 * explicitly wakes them up and terminates the read.
	 */
	mutex_lock(&t->lock);
	/*
	 * No barrier here, as wait/wake ensures that the flag visibility is
	 * correct WRT the wake/sleep state of the target tasks.
	 */
	t->dm_destroyed = true;
	wake_up_all(&t->wq);
	target_put(t);
}
1140*4882a593Smuzhiyun
1141*4882a593Smuzhiyun /*
1142*4882a593Smuzhiyun * Consumes a BIO from device mapper, queueing it up for userspace.
1143*4882a593Smuzhiyun */
user_map(struct dm_target * ti,struct bio * bio)1144*4882a593Smuzhiyun static int user_map(struct dm_target *ti, struct bio *bio)
1145*4882a593Smuzhiyun {
1146*4882a593Smuzhiyun struct target *t;
1147*4882a593Smuzhiyun struct message *entry;
1148*4882a593Smuzhiyun
1149*4882a593Smuzhiyun t = target_from_target(ti);
	/*
	 * FIXME
	 *
	 * This seems like a bad idea.  Specifically, here we're directly on
	 * the IO path when we take the target lock, which may also be taken
	 * from a user context.  The user context doesn't actively trigger
	 * anything that may sleep while holding the lock, but this still
	 * seems like a bad idea.
	 *
	 * The obvious way to fix this would be to use a proper queue, which
	 * would result in no shared locks between the direct IO path and user
	 * tasks.  I had a version that did this, but the head-of-line
	 * blocking from the circular buffer resulted in us needing a fairly
	 * large allocation in order to avoid situations in which the queue
	 * fills up and everything goes off the rails.
	 *
	 * I could jump through some hoops to avoid a shared lock while still
	 * allowing for a large queue, but I'm not actually sure that allowing
	 * for very large queues is the right thing to do here.  Intuitively
	 * it seems better to keep the queues small in here (essentially sized
	 * to the user latency, for performance reasons only) and rely on
	 * returning DM_MAPIO_REQUEUE regularly, as that would give the rest
	 * of the kernel more information.
	 *
	 * I'll spend some time trying to figure out what's going on with
	 * DM_MAPIO_REQUEUE, but if someone has a better idea of how to fix
	 * this I'm all ears.
	 */
	mutex_lock(&t->lock);

	/*
	 * FIXME
	 *
	 * The assumption here is that there's no benefit to returning
	 * DM_MAPIO_KILL as opposed to just erroring out the BIO, but I'm not
	 * sure that's actually true -- for example, I could imagine users
	 * expecting that submitted BIOs are unlikely to fail and therefore
	 * relying on submission failure to indicate an unsupported type.
	 *
	 * There are two ways I can think of to fix this:
	 *   - Add DM arguments that are parsed during the constructor that
	 *     allow various dm_target flags to be set that indicate the op
	 *     types supported by this target.  This may make sense for things
	 *     like discard, where DM can already transform the BIOs to a form
	 *     that's likely to be supported.
	 *   - Some sort of pre-filter that allows userspace to hook in here
	 *     and kill BIOs before marking them as submitted.  My guess would
	 *     be that a userspace round trip is a bad idea here, but a BPF
	 *     call seems reasonable.
	 *
	 * My guess is that we'd likely want to do both.  The first one is
	 * easy and gives DM the proper info, so it seems better.  The BPF
	 * call seems overly complex for just this, but one could imagine
	 * wanting to sometimes return _MAPPED and a BPF filter would be the
	 * way to do that.
	 *
	 * For example, in Android we have an in-kernel DM device called
	 * "dm-bow" that takes advantage of some portion of the space that has
	 * been discarded on a device to provide opportunistic block-level
	 * backups.  While one could imagine just implementing this entirely
	 * in userspace, that would come with an appreciable performance
	 * penalty.  Instead one could keep a BPF program that forwards most
	 * accesses directly to the backing block device while informing a
	 * userspace daemon of any discarded space and of writes to blocks
	 * that are to be backed up.
	 */
	if (unlikely((bio_type_to_user_type(bio) < 0) ||
		     (bio_flags_to_user_flags(bio) < 0))) {
		mutex_unlock(&t->lock);
		return DM_MAPIO_KILL;
	}

	entry = msg_get_map(t);
	if (unlikely(entry == NULL)) {
		mutex_unlock(&t->lock);
		return DM_MAPIO_REQUEUE;
	}

	entry->msg.type = bio_type_to_user_type(bio);
	entry->msg.flags = bio_flags_to_user_flags(bio);
	entry->msg.sector = bio->bi_iter.bi_sector;
	entry->msg.len = bio_size(bio);
	entry->bio = bio;
	entry->posn_to_user = 0;
	entry->total_to_user = bio_bytes_needed_to_user(bio);
	entry->posn_from_user = 0;
	entry->total_from_user = bio_bytes_needed_from_user(bio);
	entry->delayed = false;
	entry->t = t;
	/* Pairs with the barrier in dev_read() */
	smp_wmb();
	list_add_tail(&entry->to_user, &t->to_user);

	/*
	 * If there is no daemon to process the IOs, queue these messages
	 * into a workqueue with a timeout.
	 */
	if (!is_user_space_thread_present(t))
		enqueue_delayed_work(entry, !t->daemon_terminated);

	wake_up_interruptible(&t->wq);
	mutex_unlock(&t->lock);
	return DM_MAPIO_SUBMITTED;
}
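
/*
 * A minimal sketch of the userspace side that consumes the messages queued
 * above.  It is illustrative only and untested: it assumes the control node
 * appears under /dev/dm-user/<name>, that a single read() returns exactly one
 * message (header plus any write payload), and that the message layout and
 * the DM_USER_REQ_MAP_READ / DM_USER_RESP_SUCCESS constants match
 * <linux/dm-user.h>.  A real daemon would do actual I/O instead of returning
 * zeroes and would handle partial reads/writes and errors properly.
 */
#if 0
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <linux/dm-user.h>

#define PAYLOAD_MAX (1 << 20)	/* arbitrary per-message payload cap */

int main(void)
{
	/* "example" stands in for whatever name this target was given. */
	int ctl = open("/dev/dm-user/example", O_RDWR);
	char *raw = malloc(sizeof(struct dm_user_message) + PAYLOAD_MAX);
	struct dm_user_message *msg = (struct dm_user_message *)raw;

	if (ctl < 0 || raw == NULL)
		return 1;

	for (;;) {
		/* One message per read: header plus any write payload. */
		ssize_t n = read(ctl, raw, sizeof(*msg) + PAYLOAD_MAX);
		size_t reply = sizeof(*msg);

		if (n < (ssize_t)sizeof(*msg) || msg->len > PAYLOAD_MAX)
			break;

		if (msg->type == DM_USER_REQ_MAP_READ) {
			/* Read replies carry the data after the header. */
			memset(raw + sizeof(*msg), 0, msg->len);
			reply += msg->len;
		}

		/*
		 * Leave msg->seq untouched so the kernel can match this reply
		 * to the outstanding message, and flip the type to a response
		 * code before writing the reply back.
		 */
		msg->type = DM_USER_RESP_SUCCESS;
		if (write(ctl, raw, reply) < 0)
			break;
	}

	free(raw);
	close(ctl);
	return 0;
}
#endif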

static struct target_type user_target = {
	.name = "user",
	.version = { 1, 0, 0 },
	.module = THIS_MODULE,
	.ctr = user_ctr,
	.dtr = user_dtr,
	.map = user_map,
};

static int __init dm_user_init(void)
{
	int r;

	r = dm_register_target(&user_target);
	if (r) {
		DMERR("register failed %d", r);
		goto error;
	}

	return 0;

error:
	return r;
}

static void __exit dm_user_exit(void)
{
	dm_unregister_target(&user_target);
}

module_init(dm_user_init);
module_exit(dm_user_exit);
MODULE_AUTHOR("Palmer Dabbelt <palmerdabbelt@google.com>");
MODULE_DESCRIPTION(DM_NAME " target returning blocks from userspace");
MODULE_LICENSE("GPL");