// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */
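
/*
 * Illustrative sketch only, not kernel code: a minimal example of the
 * user-space I/O pattern that reaches this file.  O_DIRECT bypasses
 * the page cache; buffer, offset and length alignment requirements
 * are the usual direct-I/O ones.  Path and sizes below are made up,
 * and error handling is omitted for brevity.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *	#include <stdlib.h>
 *	#include <unistd.h>
 *
 *	int fd = open("/mnt/nfs/data", O_RDONLY | O_DIRECT);
 *	void *buf;
 *	posix_memalign(&buf, 4096, 4096);	// page-aligned buffer
 *	ssize_t n = read(fd, buf, 4096);	// serviced uncached by the NFS client
 *	free(buf);
 *	close(fd);
 */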

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/module.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <linux/uaccess.h>
#include <linux/atomic.h>

#include "internal.h"
#include "iostat.h"
#include "pnfs.h"

#define NFSDBG_FACILITY		NFSDBG_VFS

static struct kmem_cache *nfs_direct_cachep;

struct nfs_direct_req {
	struct kref		kref;		/* release manager */

	/* I/O parameters */
	struct nfs_open_context	*ctx;		/* file open context info */
	struct nfs_lock_context *l_ctx;		/* Lock context info */
	struct kiocb *		iocb;		/* controlling i/o request */
	struct inode *		inode;		/* target file of i/o */

	/* completion state */
	atomic_t		io_count;	/* i/os we're waiting for */
	spinlock_t		lock;		/* protect completion state */

	loff_t			io_start;	/* Start offset for I/O */
	ssize_t			count,		/* bytes actually processed */
				max_count,	/* max expected count */
				bytes_left,	/* bytes left to be sent */
				error;		/* any reported error */
	struct completion	completion;	/* wait for i/o completion */

	/* commit state */
	struct nfs_mds_commit_info mds_cinfo;	/* Storage for cinfo */
	struct pnfs_ds_commit_info ds_cinfo;	/* Storage for cinfo */
	struct work_struct	work;
	int			flags;
	/* for write */
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	/* for read */
#define NFS_ODIRECT_SHOULD_DIRTY	(3)	/* dirty user-space page after read */
#define NFS_ODIRECT_DONE		INT_MAX	/* request fully completed; no more retries */
};

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
static void nfs_direct_write_schedule_work(struct work_struct *work);

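/*
 * dreq->io_count tracks the number of in-flight I/Os, plus one
 * reference held by the scheduling function itself.  put_dreq()
 * returns true for whoever drops the final reference; that caller
 * is then responsible for completing the direct request.
 */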
static inline void get_dreq(struct nfs_direct_req *dreq)
{
	atomic_inc(&dreq->io_count);
}

static inline int put_dreq(struct nfs_direct_req *dreq)
{
	return atomic_dec_and_test(&dreq->io_count);
}

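/*
 * Clamp the request to @dreq_len when the server returned a short
 * read/write because of an error or EOF.  Hitting EOF clears any
 * error recorded by an earlier RPC that is now out of range.
 */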
static void
nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
			    const struct nfs_pgio_header *hdr,
			    ssize_t dreq_len)
{
	if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
	      test_bit(NFS_IOHDR_EOF, &hdr->flags)))
		return;
	if (dreq->max_count >= dreq_len) {
		dreq->max_count = dreq_len;
		if (dreq->count > dreq_len)
			dreq->count = dreq_len;

		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
			dreq->error = hdr->error;
		else /* Clear outstanding error if this is EOF */
			dreq->error = 0;
	}
}

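/*
 * Update dreq->count with the number of good bytes seen so far,
 * measured from the start of the direct request.  Called under
 * dreq->lock from the read and write completion handlers.
 */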
static void
nfs_direct_count_bytes(struct nfs_direct_req *dreq,
		       const struct nfs_pgio_header *hdr)
{
	loff_t hdr_end = hdr->io_start + hdr->good_bytes;
	ssize_t dreq_len = 0;

	if (hdr_end > dreq->io_start)
		dreq_len = hdr_end - dreq->io_start;

	nfs_direct_handle_truncated(dreq, hdr, dreq_len);

	if (dreq_len > dreq->max_count)
		dreq_len = dreq->max_count;

	if (dreq->count < dreq_len)
		dreq->count = dreq_len;
}

/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @iocb: target I/O control block
 * @iter: I/O buffer
 *
 * The presence of this routine in the address space ops vector means
 * the NFS client supports direct I/O. However, for most direct IO, we
 * shunt off direct read and write requests before the VFS gets them,
 * so this method is only ever called for swap.
 */
ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;

	/* we only support swap file calling nfs_direct_IO */
	if (!IS_SWAPFILE(inode))
		return 0;

	VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);

	if (iov_iter_rw(iter) == READ)
		return nfs_file_direct_read(iocb, iter, true);
	return nfs_file_direct_write(iocb, iter, true);
}

static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	unsigned int i;
	for (i = 0; i < npages; i++)
		put_page(pages[i]);
}

void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
			      struct nfs_direct_req *dreq)
{
	cinfo->inode = dreq->inode;
	cinfo->mds = &dreq->mds_cinfo;
	cinfo->ds = &dreq->ds_cinfo;
	cinfo->dreq = dreq;
	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
}

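/*
 * Note that a new dreq starts with two kref references: kref_init()
 * provides the one dropped when the I/O completes (see
 * nfs_direct_complete()), and kref_get() adds one for the issuing
 * caller, dropped via nfs_direct_req_release() in the read/write
 * entry points.
 */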
static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
	kref_get(&dreq->kref);
	init_completion(&dreq->completion);
	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
	pnfs_init_ds_commit_info(&dreq->ds_cinfo);
	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
	spin_lock_init(&dreq->lock);

	return dreq;
}

static void nfs_direct_req_free(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);

	pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
	if (dreq->l_ctx != NULL)
		nfs_put_lock_context(dreq->l_ctx);
	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
	kmem_cache_free(nfs_direct_cachep, dreq);
}

static void nfs_direct_req_release(struct nfs_direct_req *dreq)
{
	kref_put(&dreq->kref, nfs_direct_req_free);
}

ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
{
	return dreq->bytes_left;
}
EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);

/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
	ssize_t result = -EIOCBQUEUED;

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

	result = wait_for_completion_killable(&dreq->completion);

	if (!result) {
		result = dreq->count;
		WARN_ON_ONCE(dreq->count < 0);
	}
	if (!result)
		result = dreq->error;

out:
	return (ssize_t) result;
}

/*
 * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
 * the iocb is still valid here if this is a synchronous request.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
	struct inode *inode = dreq->inode;

	inode_dio_end(inode);

	if (dreq->iocb) {
		long res = (long) dreq->error;
		if (dreq->count != 0) {
			res = (long) dreq->count;
			WARN_ON_ONCE(dreq->count < 0);
		}
		dreq->iocb->ki_complete(dreq->iocb, res, 0);
	}

	complete(&dreq->completion);

	nfs_direct_req_release(dreq);
}

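/*
 * Completion for a direct READ: account the good bytes, dirty the
 * destination pages when reading into user memory
 * (NFS_ODIRECT_SHOULD_DIRTY), and release the nfs_page requests.
 * The last put_dreq() completes the whole direct request.
 */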
static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
{
	unsigned long bytes = 0;
	struct nfs_direct_req *dreq = hdr->dreq;

	spin_lock(&dreq->lock);
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		spin_unlock(&dreq->lock);
		goto out_put;
	}

	nfs_direct_count_bytes(dreq, hdr);
	spin_unlock(&dreq->lock);

	while (!list_empty(&hdr->pages)) {
		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
		struct page *page = req->wb_page;

		if (!PageCompound(page) && bytes < hdr->good_bytes &&
		    (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY))
			set_page_dirty(page);
		bytes += req->wb_bytes;
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
out_put:
	if (put_dreq(dreq))
		nfs_direct_complete(dreq);
	hdr->release(hdr);
}

static void nfs_read_sync_pgio_error(struct list_head *head, int error)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
}

static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
{
	get_dreq(hdr->dreq);
}

static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
	.error_cleanup = nfs_read_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_read_completion,
};

/*
 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
 * operation.  If iov_iter_get_pages_alloc() or nfs_create_request()
 * fails, bail and stop sending more reads.  Read length accounting is
 * handled automatically by nfs_direct_read_completion().  Otherwise,
 * if no requests have been sent, just return an error.
 */

static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
					      struct iov_iter *iter,
					      loff_t pos)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	ssize_t result = -EINVAL;
	size_t requested_bytes = 0;
	size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);

	nfs_pageio_init_read(&desc, dreq->inode, false,
			     &nfs_direct_read_completion_ops);
	get_dreq(dreq);
	desc.pg_dreq = dreq;
	inode_dio_begin(inode);

	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc(iter, &pagevec,
						  rsize, &pgbase);
		if (result < 0)
			break;

		bytes = result;
		iov_iter_advance(iter, bytes);
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
			/* XXX do we need to do the eof zeroing found in async_filler? */
			req = nfs_create_request(dreq->ctx, pagevec[i],
						 pgbase, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}
			req->wb_index = pos >> PAGE_SHIFT;
			req->wb_offset = pos & ~PAGE_MASK;
			if (!nfs_pageio_add_request(&desc, req)) {
				result = desc.pg_error;
				nfs_release_request(req);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;
			dreq->bytes_left -= req_len;
		}
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}

	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_end(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	if (put_dreq(dreq))
		nfs_direct_complete(dreq);
	return requested_bytes;
}

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @iter: vector of user buffers into which to read data
 * @swap: flag indicating this is swap IO, not O_DIRECT IO
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
 * READ where the file size could change.  Our preference is simply
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
 *
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
			     bool swap)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;
	ssize_t result, requested;
	size_t count = iov_iter_count(iter);

	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);

	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
		file, count, (long long) iocb->ki_pos);

	result = 0;
	if (!count)
		goto out;

	task_io_account_read(count);

	result = -ENOMEM;
	dreq = nfs_direct_req_alloc();
	if (dreq == NULL)
		goto out;

	dreq->inode = inode;
	dreq->bytes_left = dreq->max_count = count;
	dreq->io_start = iocb->ki_pos;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		nfs_direct_req_release(dreq);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	if (iter_is_iovec(iter))
		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;

	if (!swap)
		nfs_start_io_direct(inode);

	NFS_I(inode)->read_io += count;
	requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);

	if (!swap)
		nfs_end_io_direct(inode);

	if (requested > 0) {
		result = nfs_direct_wait(dreq);
		if (result > 0) {
			requested -= result;
			iocb->ki_pos += result;
		}
		iov_iter_revert(iter, requested);
	} else {
		result = requested;
	}

out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

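/*
 * Collapse each multi-page group on @list back into its head request
 * so the whole group can be resent as a unit: the subrequests are
 * dropped here and their byte ranges are folded back into the head
 * by nfs_join_page_group().
 */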
static void
nfs_direct_join_group(struct list_head *list, struct inode *inode)
{
	struct nfs_page *req, *next;

	list_for_each_entry(req, list, wb_list) {
		if (req->wb_head != req || req->wb_this_page == req)
			continue;
		for (next = req->wb_this_page;
				next != req->wb_head;
				next = next->wb_this_page) {
			nfs_list_remove_request(next);
			nfs_release_request(next);
		}
		nfs_join_page_group(req, inode);
	}
}

static void
nfs_direct_write_scan_commit_list(struct inode *inode,
				  struct list_head *list,
				  struct nfs_commit_info *cinfo)
{
	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
	pnfs_recover_commit_reqs(list, cinfo);
	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}

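/*
 * Resend every write on the commit list from scratch, typically after
 * a verifier mismatch suggests the server rebooted.  Requests that
 * cannot be re-queued are failed, and the first error is recorded.
 */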
static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct nfs_pageio_descriptor desc;
	struct nfs_page *req, *tmp;
	LIST_HEAD(reqs);
	struct nfs_commit_info cinfo;
	LIST_HEAD(failed);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);

	nfs_direct_join_group(&reqs, dreq->inode);

	dreq->count = 0;
	dreq->max_count = 0;
	list_for_each_entry(req, &reqs, wb_list)
		dreq->max_count += req->wb_bytes;
	nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
	get_dreq(dreq);

	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;

	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
		/* Bump the transmission count */
		req->wb_nio++;
		if (!nfs_pageio_add_request(&desc, req)) {
			nfs_list_move_request(req, &failed);
			spin_lock(&cinfo.inode->i_lock);
			dreq->flags = 0;
			if (desc.pg_error < 0)
				dreq->error = desc.pg_error;
			else
				dreq->error = -EIO;
			spin_unlock(&cinfo.inode->i_lock);
		}
		nfs_release_request(req);
	}
	nfs_pageio_complete(&desc);

	while (!list_empty(&failed)) {
		req = nfs_list_entry(failed.next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
}

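/*
 * COMMIT completion: a fatal RPC error fails the whole direct write,
 * while a verifier mismatch on any request schedules a resend of the
 * data (NFS_ODIRECT_RESCHED_WRITES).  Requests whose verifiers match
 * are simply released.
 */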
static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
	const struct nfs_writeverf *verf = data->res.verf;
	struct nfs_direct_req *dreq = data->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	int status = data->task.tk_status;

	if (status < 0) {
		/* Errors in commit are fatal */
		dreq->error = status;
		dreq->max_count = 0;
		dreq->count = 0;
		dreq->flags = NFS_ODIRECT_DONE;
	} else if (dreq->flags == NFS_ODIRECT_DONE)
		status = dreq->error;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);

	while (!list_empty(&data->pages)) {
		req = nfs_list_entry(data->pages.next);
		nfs_list_remove_request(req);
		if (status >= 0 && !nfs_write_match_verf(verf, req)) {
			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			/*
			 * Despite the reboot, the write was successful,
			 * so reset wb_nio.
			 */
			req->wb_nio = 0;
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		} else /* Error or match */
			nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}

	if (nfs_commit_end(cinfo.mds))
		nfs_direct_write_complete(dreq);
}

static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
		struct nfs_page *req)
{
	struct nfs_direct_req *dreq = cinfo->dreq;

	spin_lock(&dreq->lock);
	if (dreq->flags != NFS_ODIRECT_DONE)
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	spin_unlock(&dreq->lock);
	nfs_mark_request_commit(req, NULL, cinfo, 0);
}

static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
	.completion = nfs_direct_commit_complete,
	.resched_write = nfs_direct_resched_write,
};

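/*
 * Send COMMITs for everything on the dreq's commit list.  If the
 * commit list cannot even be set up (-ENOMEM), fall back to resending
 * the writes as stable WRITEs instead.
 */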
static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
	int res;
	struct nfs_commit_info cinfo;
	LIST_HEAD(mds_list);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
	if (res < 0) /* res == -ENOMEM */
		nfs_direct_write_reschedule(dreq);
}

static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
{
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	LIST_HEAD(reqs);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);

	while (!list_empty(&reqs)) {
		req = nfs_list_entry(reqs.next);
		nfs_list_remove_request(req);
		nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}
}

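/*
 * Workqueue hook that drives the direct-write state machine: commit
 * unstable data, reschedule failed writes, or, in the default case,
 * tear down any remaining requests, invalidate the mapping and
 * complete the request.
 */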
static void nfs_direct_write_schedule_work(struct work_struct *work)
{
	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
	int flags = dreq->flags;

	dreq->flags = 0;
	switch (flags) {
		case NFS_ODIRECT_DO_COMMIT:
			nfs_direct_commit_schedule(dreq);
			break;
		case NFS_ODIRECT_RESCHED_WRITES:
			nfs_direct_write_reschedule(dreq);
			break;
		default:
			nfs_direct_write_clear_reqs(dreq);
			nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
			nfs_direct_complete(dreq);
	}
}

static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
{
	queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */
}

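/*
 * Completion for a direct WRITE: account the bytes written, and if
 * the server replied unstable, move the requests to the commit list
 * so that a COMMIT (or a resend, if one is already pending) is
 * scheduled by the final put_dreq().
 */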
static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
	int flags = NFS_ODIRECT_DONE;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);

	spin_lock(&dreq->lock);
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		spin_unlock(&dreq->lock);
		goto out_put;
	}

	nfs_direct_count_bytes(dreq, hdr);
	if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) {
		if (!dreq->flags)
			dreq->flags = NFS_ODIRECT_DO_COMMIT;
		flags = dreq->flags;
	}
	spin_unlock(&dreq->lock);

	while (!list_empty(&hdr->pages)) {
		req = nfs_list_entry(hdr->pages.next);
		nfs_list_remove_request(req);
		if (flags == NFS_ODIRECT_DO_COMMIT) {
			kref_get(&req->wb_kref);
			memcpy(&req->wb_verf, &hdr->verf.verifier,
			       sizeof(req->wb_verf));
			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
				hdr->ds_commit_idx);
		} else if (flags == NFS_ODIRECT_RESCHED_WRITES) {
			kref_get(&req->wb_kref);
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		}
		nfs_unlock_and_release_request(req);
	}

out_put:
	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
	hdr->release(hdr);
}

static void nfs_write_sync_pgio_error(struct list_head *head, int error)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
	}
}

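/*
 * The pgio layer asked us to resend this header's pages.  Flag the
 * dreq for a full reschedule and mark the header as an unstable write
 * so the generic write path re-queues the pages rather than failing
 * them.
 */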
static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;

	spin_lock(&dreq->lock);
	if (dreq->error == 0) {
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
		/* fake unstable write to let common nfs resend pages */
		hdr->verf.committed = NFS_UNSTABLE;
		hdr->good_bytes = hdr->args.offset + hdr->args.count -
			hdr->io_start;
	}
	spin_unlock(&dreq->lock);
}

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
	.error_cleanup = nfs_write_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_write_completion,
	.reschedule_io = nfs_direct_write_reschedule_io,
};

/*
 * NB: Return the value of the first error return code.  Subsequent
 *     errors after the first one are ignored.
 */
/*
 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
 * operation.  If iov_iter_get_pages_alloc() or nfs_create_request()
 * fails, bail and stop sending more writes.  Write length accounting
 * is handled automatically by nfs_direct_write_completion().
 * Otherwise, if no requests have been sent, just return an error.
 */
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
					       struct iov_iter *iter,
					       loff_t pos, int ioflags)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	ssize_t result = 0;
	size_t requested_bytes = 0;
	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);

	nfs_pageio_init_write(&desc, inode, ioflags, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;
	get_dreq(dreq);
	inode_dio_begin(inode);

	NFS_I(inode)->write_io += iov_iter_count(iter);
	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc(iter, &pagevec,
						  wsize, &pgbase);
		if (result < 0)
			break;

		bytes = result;
		iov_iter_advance(iter, bytes);
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);

			req = nfs_create_request(dreq->ctx, pagevec[i],
						 pgbase, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}

			if (desc.pg_error < 0) {
				nfs_free_request(req);
				result = desc.pg_error;
				break;
			}

			nfs_lock_request(req);
			req->wb_index = pos >> PAGE_SHIFT;
			req->wb_offset = pos & ~PAGE_MASK;
			if (!nfs_pageio_add_request(&desc, req)) {
				result = desc.pg_error;
				nfs_unlock_and_release_request(req);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;
			dreq->bytes_left -= req_len;
		}
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}
	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_end(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
	return requested_bytes;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @iter: vector of user buffers from which to write data
 * @swap: flag indicating this is swap IO, not O_DIRECT IO
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
			      bool swap)
{
	ssize_t result, requested;
	size_t count;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;
	loff_t pos, end;

	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
		file, iov_iter_count(iter), (long long) iocb->ki_pos);

	if (swap)
		/* bypass generic checks */
		result = iov_iter_count(iter);
	else
		result = generic_write_checks(iocb, iter);
	if (result <= 0)
		return result;
	count = result;
	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);

	pos = iocb->ki_pos;
	end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;

	task_io_account_write(count);

	result = -ENOMEM;
	dreq = nfs_direct_req_alloc();
	if (!dreq)
		goto out;

	dreq->inode = inode;
	dreq->bytes_left = dreq->max_count = count;
	dreq->io_start = pos;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		nfs_direct_req_release(dreq);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;
	pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);

	if (swap) {
		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
							    FLUSH_STABLE);
	} else {
		nfs_start_io_direct(inode);

		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
							    FLUSH_COND_STABLE);

		if (mapping->nrpages) {
			invalidate_inode_pages2_range(mapping,
						      pos >> PAGE_SHIFT, end);
		}

		nfs_end_io_direct(inode);
	}

	if (requested > 0) {
		result = nfs_direct_wait(dreq);
		if (result > 0) {
			requested -= result;
			iocb->ki_pos = pos + result;
			/* XXX: should check the generic_write_sync retval */
			generic_write_sync(iocb, result);
		}
		iov_iter_revert(iter, requested);
	} else {
		result = requested;
	}
out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
int __init nfs_init_directcache(void)
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
						sizeof(struct nfs_direct_req),
						0, (SLAB_RECLAIM_ACCOUNT|
							SLAB_MEM_SPREAD),
						NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
void nfs_destroy_directcache(void)
{
	kmem_cache_destroy(nfs_direct_cachep);
}