// SPDX-License-Identifier: GPL-2.0-only
/*
 * linux/fs/nfs/direct.c
 *
 * Copyright (C) 2003 by Chuck Lever <cel@netapp.com>
 *
 * High-performance uncached I/O for the Linux NFS client
 *
 * There are important applications whose performance or correctness
 * depends on uncached access to file data.  Database clusters
 * (multiple copies of the same instance running on separate hosts)
 * implement their own cache coherency protocol that subsumes file
 * system cache protocols.  Applications that process datasets
 * considerably larger than the client's memory do not always benefit
 * from a local cache.  A streaming video server, for instance, has no
 * need to cache the contents of a file.
 *
 * When an application requests uncached I/O, all read and write requests
 * are made directly to the server; data stored or fetched via these
 * requests is not cached in the Linux page cache.  The client does not
 * correct unaligned requests from applications.  All requested bytes are
 * held on permanent storage before a direct write system call returns to
 * an application.
 *
 * Solaris implements an uncached I/O facility called directio() that
 * is used for backups and sequential I/O to very large files.  Solaris
 * also supports uncaching whole NFS partitions with "-o forcedirectio,"
 * an undocumented mount option.
 *
 * Designed by Jeff Kimmel, Chuck Lever, and Trond Myklebust, with
 * help from Andrew Morton.
 *
 * 18 Dec 2001	Initial implementation for 2.4  --cel
 * 08 Jul 2002	Version for 2.4.19, with bug fixes --trondmy
 * 08 Jun 2003	Port to 2.5 APIs  --cel
 * 31 Mar 2004	Handle direct I/O without VFS support  --cel
 * 15 Sep 2004	Parallel async reads  --cel
 * 04 May 2005	support O_DIRECT with aio  --cel
 *
 */
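
/*
 * For orientation, a minimal sketch of the userspace side of this path
 * (illustrative only; not part of the kernel build, and the path name
 * is hypothetical).  Opening with O_DIRECT steers reads and writes to
 * nfs_file_direct_read()/nfs_file_direct_write() below rather than
 * through the page cache:
 *
 *	int fd = open("/mnt/nfs/dataset", O_RDWR | O_DIRECT);
 *	char buf[4096];
 *	ssize_t n = pread(fd, buf, sizeof(buf), 0);
 *
 * As noted above, the client sends unaligned requests as-is; it does
 * not impose the alignment rules that block-based filesystems do.
 */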

#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/module.h>

#include <linux/nfs_fs.h>
#include <linux/nfs_page.h>
#include <linux/sunrpc/clnt.h>

#include <linux/uaccess.h>
#include <linux/atomic.h>

#include "internal.h"
#include "iostat.h"
#include "pnfs.h"

#define NFSDBG_FACILITY		NFSDBG_VFS

static struct kmem_cache *nfs_direct_cachep;

struct nfs_direct_req {
	struct kref		kref;		/* release manager */

	/* I/O parameters */
	struct nfs_open_context	*ctx;		/* file open context info */
	struct nfs_lock_context	*l_ctx;		/* Lock context info */
	struct kiocb *		iocb;		/* controlling i/o request */
	struct inode *		inode;		/* target file of i/o */

	/* completion state */
	atomic_t		io_count;	/* i/os we're waiting for */
	spinlock_t		lock;		/* protect completion state */

	loff_t			io_start;	/* Start offset for I/O */
	ssize_t			count,		/* bytes actually processed */
				max_count,	/* max expected count */
				bytes_left,	/* bytes left to be sent */
				error;		/* any reported error */
	struct completion	completion;	/* wait for i/o completion */

	/* commit state */
	struct nfs_mds_commit_info mds_cinfo;	/* Storage for cinfo */
	struct pnfs_ds_commit_info ds_cinfo;	/* Storage for cinfo */
	struct work_struct	work;
	int			flags;
	/* for write */
#define NFS_ODIRECT_DO_COMMIT		(1)	/* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES	(2)	/* write verification failed */
	/* for read */
#define NFS_ODIRECT_SHOULD_DIRTY	(3)	/* dirty user-space page after read */
#define NFS_ODIRECT_DONE		INT_MAX	/* write/commit has finished; do not reschedule */
};

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops;
static void nfs_direct_write_complete(struct nfs_direct_req *dreq);
static void nfs_direct_write_schedule_work(struct work_struct *work);

static inline void get_dreq(struct nfs_direct_req *dreq)
{
	atomic_inc(&dreq->io_count);
}

static inline int put_dreq(struct nfs_direct_req *dreq)
{
	return atomic_dec_and_test(&dreq->io_count);
}
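
/*
 * io_count tracks the number of outstanding pgio (and commit) operations.
 * A scheduling function takes one reference to cover its whole submission
 * loop, and the init_hdr callback (nfs_direct_pgio_init below) takes one
 * more for each nfs_pgio_header dispatched.  Whichever path drops io_count
 * to zero completes the request:
 *
 *	get_dreq(dreq);
 *	... dispatch I/O; each header takes its own reference ...
 *	if (put_dreq(dreq))
 *		nfs_direct_complete(dreq);
 */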

static void
nfs_direct_handle_truncated(struct nfs_direct_req *dreq,
			    const struct nfs_pgio_header *hdr,
			    ssize_t dreq_len)
{
	if (!(test_bit(NFS_IOHDR_ERROR, &hdr->flags) ||
	      test_bit(NFS_IOHDR_EOF, &hdr->flags)))
		return;
	if (dreq->max_count >= dreq_len) {
		dreq->max_count = dreq_len;
		if (dreq->count > dreq_len)
			dreq->count = dreq_len;

		if (test_bit(NFS_IOHDR_ERROR, &hdr->flags))
			dreq->error = hdr->error;
		else /* Clear outstanding error if this is EOF */
			dreq->error = 0;
	}
}

static void
nfs_direct_count_bytes(struct nfs_direct_req *dreq,
		       const struct nfs_pgio_header *hdr)
{
	loff_t hdr_end = hdr->io_start + hdr->good_bytes;
	ssize_t dreq_len = 0;

	if (hdr_end > dreq->io_start)
		dreq_len = hdr_end - dreq->io_start;

	nfs_direct_handle_truncated(dreq, hdr, dreq_len);

	if (dreq_len > dreq->max_count)
		dreq_len = dreq->max_count;

	if (dreq->count < dreq_len)
		dreq->count = dreq_len;
}
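
/*
 * A worked example of the accounting above (illustrative numbers): for a
 * direct read with max_count = 16384 starting at io_start = 4096, a header
 * reporting io_start = 8192 and good_bytes = 4096 gives hdr_end = 12288,
 * so dreq_len = 12288 - 4096 = 8192 and dreq->count is raised to 8192 if
 * it was smaller.  If that same header also carried NFS_IOHDR_EOF,
 * max_count would first be clamped down to 8192, so that later,
 * longer-reaching headers cannot extend the result past end-of-file.
 */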

/**
 * nfs_direct_IO - NFS address space operation for direct I/O
 * @iocb: target I/O control block
 * @iter: I/O buffer
 *
 * The presence of this routine in the address space ops vector means
 * the NFS client supports direct I/O.  However, for most direct IO, we
 * shunt off direct read and write requests before the VFS gets them,
 * so this method is only ever called for swap.
 */
ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;

	/* we only support swap files calling nfs_direct_IO */
	if (!IS_SWAPFILE(inode))
		return 0;

	VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);

	if (iov_iter_rw(iter) == READ)
		return nfs_file_direct_read(iocb, iter, true);
	return nfs_file_direct_write(iocb, iter, true);
}

static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
{
	unsigned int i;
	for (i = 0; i < npages; i++)
		put_page(pages[i]);
}

void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
			      struct nfs_direct_req *dreq)
{
	cinfo->inode = dreq->inode;
	cinfo->mds = &dreq->mds_cinfo;
	cinfo->ds = &dreq->ds_cinfo;
	cinfo->dreq = dreq;
	cinfo->completion_ops = &nfs_direct_commit_completion_ops;
}

static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
{
	struct nfs_direct_req *dreq;

	dreq = kmem_cache_zalloc(nfs_direct_cachep, GFP_KERNEL);
	if (!dreq)
		return NULL;

	kref_init(&dreq->kref);
	kref_get(&dreq->kref);
	init_completion(&dreq->completion);
	INIT_LIST_HEAD(&dreq->mds_cinfo.list);
	pnfs_init_ds_commit_info(&dreq->ds_cinfo);
	INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
	spin_lock_init(&dreq->lock);

	return dreq;
}

static void nfs_direct_req_free(struct kref *kref)
{
	struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);

	pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
	if (dreq->l_ctx != NULL)
		nfs_put_lock_context(dreq->l_ctx);
	if (dreq->ctx != NULL)
		put_nfs_open_context(dreq->ctx);
	kmem_cache_free(nfs_direct_cachep, dreq);
}

static void nfs_direct_req_release(struct nfs_direct_req *dreq)
{
	kref_put(&dreq->kref, nfs_direct_req_free);
}

ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
{
	return dreq->bytes_left;
}
EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);

/*
 * Collects and returns the final error value/byte-count.
 */
static ssize_t nfs_direct_wait(struct nfs_direct_req *dreq)
{
	ssize_t result = -EIOCBQUEUED;

	/* Async requests don't wait here */
	if (dreq->iocb)
		goto out;

	result = wait_for_completion_killable(&dreq->completion);

	if (!result) {
		result = dreq->count;
		WARN_ON_ONCE(dreq->count < 0);
	}
	if (!result)
		result = dreq->error;

out:
	return (ssize_t) result;
}
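
/*
 * For an async iocb, nfs_direct_wait() returns -EIOCBQUEUED immediately
 * and the final byte count (or error) is delivered later through
 * iocb->ki_complete() in nfs_direct_complete() below.  Synchronous
 * callers block here; a killable wait interrupted by a fatal signal
 * yields -ERESTARTSYS from wait_for_completion_killable().
 */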

/*
 * Synchronous I/O uses a stack-allocated iocb.  Thus we can't trust
 * the iocb is still valid here if this is a synchronous request.
 */
static void nfs_direct_complete(struct nfs_direct_req *dreq)
{
	struct inode *inode = dreq->inode;

	inode_dio_end(inode);

	if (dreq->iocb) {
		long res = (long) dreq->error;
		if (dreq->count != 0) {
			res = (long) dreq->count;
			WARN_ON_ONCE(dreq->count < 0);
		}
		dreq->iocb->ki_complete(dreq->iocb, res, 0);
	}

	complete(&dreq->completion);

	nfs_direct_req_release(dreq);
}

static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
{
	unsigned long bytes = 0;
	struct nfs_direct_req *dreq = hdr->dreq;

	spin_lock(&dreq->lock);
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		spin_unlock(&dreq->lock);
		goto out_put;
	}

	nfs_direct_count_bytes(dreq, hdr);
	spin_unlock(&dreq->lock);

	while (!list_empty(&hdr->pages)) {
		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
		struct page *page = req->wb_page;

		if (!PageCompound(page) && bytes < hdr->good_bytes &&
		    (dreq->flags == NFS_ODIRECT_SHOULD_DIRTY))
			set_page_dirty(page);
		bytes += req->wb_bytes;
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
out_put:
	if (put_dreq(dreq))
		nfs_direct_complete(dreq);
	hdr->release(hdr);
}

static void nfs_read_sync_pgio_error(struct list_head *head, int error)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_release_request(req);
	}
}

static void nfs_direct_pgio_init(struct nfs_pgio_header *hdr)
{
	get_dreq(hdr->dreq);
}

static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = {
	.error_cleanup = nfs_read_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_read_completion,
};

/*
 * For each rsize'd chunk of the user's buffer, dispatch an NFS READ
 * operation.  If nfs_create_request() or iov_iter_get_pages_alloc()
 * fails, bail and stop sending more reads.  Read length accounting is
 * handled automatically by nfs_direct_read_completion().  Otherwise, if
 * no requests have been sent, just return an error.
 */

static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
					      struct iov_iter *iter,
					      loff_t pos)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	ssize_t result = -EINVAL;
	size_t requested_bytes = 0;
	size_t rsize = max_t(size_t, NFS_SERVER(inode)->rsize, PAGE_SIZE);

	nfs_pageio_init_read(&desc, dreq->inode, false,
			     &nfs_direct_read_completion_ops);
	get_dreq(dreq);
	desc.pg_dreq = dreq;
	inode_dio_begin(inode);

	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc(iter, &pagevec,
						  rsize, &pgbase);
		if (result < 0)
			break;

		bytes = result;
		iov_iter_advance(iter, bytes);
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
			/* XXX do we need to do the eof zeroing found in async_filler? */
			req = nfs_create_request(dreq->ctx, pagevec[i],
						 pgbase, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}
			req->wb_index = pos >> PAGE_SHIFT;
			req->wb_offset = pos & ~PAGE_MASK;
			if (!nfs_pageio_add_request(&desc, req)) {
				result = desc.pg_error;
				nfs_release_request(req);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;
			dreq->bytes_left -= req_len;
		}
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}

	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_end(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	if (put_dreq(dreq))
		nfs_direct_complete(dreq);
	return requested_bytes;
}
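
/*
 * How the loop above carves a buffer into nfs_page requests, with
 * illustrative numbers: given PAGE_SIZE = 4096, a 6000-byte read at
 * pos = 1000 whose user buffer starts 1000 bytes into its first page
 * (pgbase = 1000) produces
 *
 *	page 0: pgbase 1000, req_len 3096, wb_index 0, wb_offset 1000
 *	page 1: pgbase    0, req_len 2904, wb_index 1, wb_offset 0
 *
 * after which pos = 7000 and bytes = 0.  The pageio descriptor then
 * coalesces these requests into as few rsize'd READs as possible.
 */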

/**
 * nfs_file_direct_read - file direct read operation for NFS files
 * @iocb: target I/O control block
 * @iter: vector of user buffers into which to read data
 * @swap: flag indicating this is swap IO, not O_DIRECT IO
 *
 * We use this function for direct reads instead of calling
 * generic_file_aio_read() in order to avoid gfar's check to see if
 * the request starts before the end of the file.  For that check
 * to work, we must generate a GETATTR before each direct read, and
 * even then there is a window between the GETATTR and the subsequent
 * READ where the file size could change.  Our preference is simply
 * to do all reads the application wants, and the server will take
 * care of managing the end of file boundary.
 *
 * This function also eliminates unnecessarily updating the file's
 * atime locally, as the NFS server sets the file's atime, and this
 * client must read the updated atime from the server back into its
 * cache.
 */
ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
			     bool swap)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;
	ssize_t result, requested;
	size_t count = iov_iter_count(iter);
	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);

	dfprintk(FILE, "NFS: direct read(%pD2, %zd@%Ld)\n",
		 file, count, (long long) iocb->ki_pos);

	result = 0;
	if (!count)
		goto out;

	task_io_account_read(count);

	result = -ENOMEM;
	dreq = nfs_direct_req_alloc();
	if (dreq == NULL)
		goto out;

	dreq->inode = inode;
	dreq->bytes_left = dreq->max_count = count;
	dreq->io_start = iocb->ki_pos;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		nfs_direct_req_release(dreq);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;

	if (iter_is_iovec(iter))
		dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;

	if (!swap)
		nfs_start_io_direct(inode);

	NFS_I(inode)->read_io += count;
	requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);

	if (!swap)
		nfs_end_io_direct(inode);

	if (requested > 0) {
		result = nfs_direct_wait(dreq);
		if (result > 0) {
			requested -= result;
			iocb->ki_pos += result;
		}
		iov_iter_revert(iter, requested);
	} else {
		result = requested;
	}

out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

static void
nfs_direct_join_group(struct list_head *list, struct inode *inode)
{
	struct nfs_page *req, *next;

	list_for_each_entry(req, list, wb_list) {
		if (req->wb_head != req || req->wb_this_page == req)
			continue;
		for (next = req->wb_this_page;
		     next != req->wb_head;
		     next = next->wb_this_page) {
			nfs_list_remove_request(next);
			nfs_release_request(next);
		}
		nfs_join_page_group(req, inode);
	}
}
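
/*
 * Why the joining above is needed: when a page's write was split into
 * subrequests (wb_this_page links the group, wb_head points at the head
 * request), rescheduling must operate on whole pages again.  The loop
 * drops every non-head subrequest and then has nfs_join_page_group()
 * merge the byte ranges back into the head request before the writes
 * are resent.
 */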

static void
nfs_direct_write_scan_commit_list(struct inode *inode,
				  struct list_head *list,
				  struct nfs_commit_info *cinfo)
{
	mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
	pnfs_recover_commit_reqs(list, cinfo);
	nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
	mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}

static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
{
	struct nfs_pageio_descriptor desc;
	struct nfs_page *req, *tmp;
	LIST_HEAD(reqs);
	struct nfs_commit_info cinfo;
	LIST_HEAD(failed);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);

	nfs_direct_join_group(&reqs, dreq->inode);

	dreq->count = 0;
	dreq->max_count = 0;
	list_for_each_entry(req, &reqs, wb_list)
		dreq->max_count += req->wb_bytes;
	nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
	get_dreq(dreq);

	nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;

	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
		/* Bump the transmission count */
		req->wb_nio++;
		if (!nfs_pageio_add_request(&desc, req)) {
			nfs_list_move_request(req, &failed);
			spin_lock(&cinfo.inode->i_lock);
			dreq->flags = 0;
			if (desc.pg_error < 0)
				dreq->error = desc.pg_error;
			else
				dreq->error = -EIO;
			spin_unlock(&cinfo.inode->i_lock);
		}
		nfs_release_request(req);
	}
	nfs_pageio_complete(&desc);

	while (!list_empty(&failed)) {
		req = nfs_list_entry(failed.next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
}

static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
	const struct nfs_writeverf *verf = data->res.verf;
	struct nfs_direct_req *dreq = data->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	int status = data->task.tk_status;

	if (status < 0) {
		/* Errors in commit are fatal */
		dreq->error = status;
		dreq->max_count = 0;
		dreq->count = 0;
		dreq->flags = NFS_ODIRECT_DONE;
	} else if (dreq->flags == NFS_ODIRECT_DONE)
		status = dreq->error;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);

	while (!list_empty(&data->pages)) {
		req = nfs_list_entry(data->pages.next);
		nfs_list_remove_request(req);
		if (status >= 0 && !nfs_write_match_verf(verf, req)) {
			dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
			/*
			 * Despite the reboot, the write was successful,
			 * so reset wb_nio.
			 */
			req->wb_nio = 0;
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		} else /* Error or match */
			nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}

	if (nfs_commit_end(cinfo.mds))
		nfs_direct_write_complete(dreq);
}

static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
				     struct nfs_page *req)
{
	struct nfs_direct_req *dreq = cinfo->dreq;

	spin_lock(&dreq->lock);
	if (dreq->flags != NFS_ODIRECT_DONE)
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
	spin_unlock(&dreq->lock);
	nfs_mark_request_commit(req, NULL, cinfo, 0);
}

static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
	.completion = nfs_direct_commit_complete,
	.resched_write = nfs_direct_resched_write,
};
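
/*
 * Background for the verifier check above: an UNSTABLE WRITE reply and
 * the COMMIT reply each carry the server's write verifier.  If the two
 * differ for a request (nfs_write_match_verf() fails), the server may
 * have rebooted and lost the uncommitted data, so the request is queued
 * again and NFS_ODIRECT_RESCHED_WRITES forces the WRITEs to be resent
 * rather than reported as stable.
 */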

static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
{
	int res;
	struct nfs_commit_info cinfo;
	LIST_HEAD(mds_list);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_scan_commit(dreq->inode, &mds_list, &cinfo);
	res = nfs_generic_commit_list(dreq->inode, &mds_list, 0, &cinfo);
	if (res < 0) /* res == -ENOMEM */
		nfs_direct_write_reschedule(dreq);
}

static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
{
	struct nfs_commit_info cinfo;
	struct nfs_page *req;
	LIST_HEAD(reqs);

	nfs_init_cinfo_from_dreq(&cinfo, dreq);
	nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);

	while (!list_empty(&reqs)) {
		req = nfs_list_entry(reqs.next);
		nfs_list_remove_request(req);
		nfs_release_request(req);
		nfs_unlock_and_release_request(req);
	}
}

static void nfs_direct_write_schedule_work(struct work_struct *work)
{
	struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
	int flags = dreq->flags;

	dreq->flags = 0;
	switch (flags) {
	case NFS_ODIRECT_DO_COMMIT:
		nfs_direct_commit_schedule(dreq);
		break;
	case NFS_ODIRECT_RESCHED_WRITES:
		nfs_direct_write_reschedule(dreq);
		break;
	default:
		nfs_direct_write_clear_reqs(dreq);
		nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
		nfs_direct_complete(dreq);
	}
}
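
/*
 * The write path is a small state machine driven by dreq->flags.  One
 * possible sequence, informally, as implemented by the handlers above:
 *
 *	WRITE gets an unstable reply	-> NFS_ODIRECT_DO_COMMIT
 *	COMMIT verifier mismatch	-> NFS_ODIRECT_RESCHED_WRITES,
 *					   WRITEs are resent
 *	COMMIT matches / stable write	-> default case: clear requests,
 *					   invalidate the mapping, complete
 *
 * Each pass through nfs_direct_write_schedule_work() resets flags to 0,
 * so a request loops until a round finishes without setting a new state.
 */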

static void nfs_direct_write_complete(struct nfs_direct_req *dreq)
{
	queue_work(nfsiod_workqueue, &dreq->work); /* Calls nfs_direct_write_schedule_work */
}

static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;
	struct nfs_commit_info cinfo;
	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
	int flags = NFS_ODIRECT_DONE;

	nfs_init_cinfo_from_dreq(&cinfo, dreq);

	spin_lock(&dreq->lock);
	if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		spin_unlock(&dreq->lock);
		goto out_put;
	}

	nfs_direct_count_bytes(dreq, hdr);
	if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) {
		if (!dreq->flags)
			dreq->flags = NFS_ODIRECT_DO_COMMIT;
		flags = dreq->flags;
	}
	spin_unlock(&dreq->lock);

	while (!list_empty(&hdr->pages)) {
		req = nfs_list_entry(hdr->pages.next);
		nfs_list_remove_request(req);
		if (flags == NFS_ODIRECT_DO_COMMIT) {
			kref_get(&req->wb_kref);
			memcpy(&req->wb_verf, &hdr->verf.verifier,
			       sizeof(req->wb_verf));
			nfs_mark_request_commit(req, hdr->lseg, &cinfo,
						hdr->ds_commit_idx);
		} else if (flags == NFS_ODIRECT_RESCHED_WRITES) {
			kref_get(&req->wb_kref);
			nfs_mark_request_commit(req, NULL, &cinfo, 0);
		}
		nfs_unlock_and_release_request(req);
	}

out_put:
	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
	hdr->release(hdr);
}

static void nfs_write_sync_pgio_error(struct list_head *head, int error)
{
	struct nfs_page *req;

	while (!list_empty(head)) {
		req = nfs_list_entry(head->next);
		nfs_list_remove_request(req);
		nfs_unlock_and_release_request(req);
	}
}

static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
{
	struct nfs_direct_req *dreq = hdr->dreq;

	spin_lock(&dreq->lock);
	if (dreq->error == 0) {
		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
		/* fake unstable write to let common nfs resend pages */
		hdr->verf.committed = NFS_UNSTABLE;
		hdr->good_bytes = hdr->args.offset + hdr->args.count -
				  hdr->io_start;
	}
	spin_unlock(&dreq->lock);
}

static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
	.error_cleanup = nfs_write_sync_pgio_error,
	.init_hdr = nfs_direct_pgio_init,
	.completion = nfs_direct_write_completion,
	.reschedule_io = nfs_direct_write_reschedule_io,
};


/*
 * NB: Return the value of the first error return code.  Subsequent
 * errors after the first one are ignored.
 */
/*
 * For each wsize'd chunk of the user's buffer, dispatch an NFS WRITE
 * operation.  If nfs_create_request() or iov_iter_get_pages_alloc()
 * fails, bail and stop sending more writes.  Write length accounting is
 * handled automatically by nfs_direct_write_completion().  Otherwise, if
 * no requests have been sent, just return an error.
 */
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
					       struct iov_iter *iter,
					       loff_t pos, int ioflags)
{
	struct nfs_pageio_descriptor desc;
	struct inode *inode = dreq->inode;
	ssize_t result = 0;
	size_t requested_bytes = 0;
	size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);

	nfs_pageio_init_write(&desc, inode, ioflags, false,
			      &nfs_direct_write_completion_ops);
	desc.pg_dreq = dreq;
	get_dreq(dreq);
	inode_dio_begin(inode);

	NFS_I(inode)->write_io += iov_iter_count(iter);
	while (iov_iter_count(iter)) {
		struct page **pagevec;
		size_t bytes;
		size_t pgbase;
		unsigned npages, i;

		result = iov_iter_get_pages_alloc(iter, &pagevec,
						  wsize, &pgbase);
		if (result < 0)
			break;

		bytes = result;
		iov_iter_advance(iter, bytes);
		npages = (result + pgbase + PAGE_SIZE - 1) / PAGE_SIZE;
		for (i = 0; i < npages; i++) {
			struct nfs_page *req;
			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);

			req = nfs_create_request(dreq->ctx, pagevec[i],
						 pgbase, req_len);
			if (IS_ERR(req)) {
				result = PTR_ERR(req);
				break;
			}

			if (desc.pg_error < 0) {
				nfs_free_request(req);
				result = desc.pg_error;
				break;
			}

			nfs_lock_request(req);
			req->wb_index = pos >> PAGE_SHIFT;
			req->wb_offset = pos & ~PAGE_MASK;
			if (!nfs_pageio_add_request(&desc, req)) {
				result = desc.pg_error;
				nfs_unlock_and_release_request(req);
				break;
			}
			pgbase = 0;
			bytes -= req_len;
			requested_bytes += req_len;
			pos += req_len;
			dreq->bytes_left -= req_len;
		}
		nfs_direct_release_pages(pagevec, npages);
		kvfree(pagevec);
		if (result < 0)
			break;
	}
	nfs_pageio_complete(&desc);

	/*
	 * If no bytes were started, return the error, and let the
	 * generic layer handle the completion.
	 */
	if (requested_bytes == 0) {
		inode_dio_end(inode);
		nfs_direct_req_release(dreq);
		return result < 0 ? result : -EIO;
	}

	if (put_dreq(dreq))
		nfs_direct_write_complete(dreq);
	return requested_bytes;
}

/**
 * nfs_file_direct_write - file direct write operation for NFS files
 * @iocb: target I/O control block
 * @iter: vector of user buffers from which to write data
 * @swap: flag indicating this is swap IO, not O_DIRECT IO
 *
 * We use this function for direct writes instead of calling
 * generic_file_aio_write() in order to avoid taking the inode
 * semaphore and updating the i_size.  The NFS server will set
 * the new i_size and this client must read the updated size
 * back into its cache.  We let the server do generic write
 * parameter checking and report problems.
 *
 * We eliminate local atime updates, see direct read above.
 *
 * We avoid unnecessary page cache invalidations for normal cached
 * readers of this file.
 *
 * Note that O_APPEND is not supported for NFS direct writes, as there
 * is no atomic O_APPEND write facility in the NFS protocol.
 */
ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
			      bool swap)
{
	ssize_t result, requested;
	size_t count;
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct nfs_direct_req *dreq;
	struct nfs_lock_context *l_ctx;
	loff_t pos, end;

	dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
		 file, iov_iter_count(iter), (long long) iocb->ki_pos);

	if (swap)
		/* bypass generic checks */
		result = iov_iter_count(iter);
	else
		result = generic_write_checks(iocb, iter);
	if (result <= 0)
		return result;
	count = result;
	nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);

	pos = iocb->ki_pos;
	end = (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT;

	task_io_account_write(count);

	result = -ENOMEM;
	dreq = nfs_direct_req_alloc();
	if (!dreq)
		goto out;

	dreq->inode = inode;
	dreq->bytes_left = dreq->max_count = count;
	dreq->io_start = pos;
	dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
	l_ctx = nfs_get_lock_context(dreq->ctx);
	if (IS_ERR(l_ctx)) {
		result = PTR_ERR(l_ctx);
		nfs_direct_req_release(dreq);
		goto out_release;
	}
	dreq->l_ctx = l_ctx;
	if (!is_sync_kiocb(iocb))
		dreq->iocb = iocb;
	pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);

	if (swap) {
		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
							    FLUSH_STABLE);
	} else {
		nfs_start_io_direct(inode);

		requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
							    FLUSH_COND_STABLE);

		if (mapping->nrpages) {
			invalidate_inode_pages2_range(mapping,
						      pos >> PAGE_SHIFT, end);
		}

		nfs_end_io_direct(inode);
	}

	if (requested > 0) {
		result = nfs_direct_wait(dreq);
		if (result > 0) {
			requested -= result;
			iocb->ki_pos = pos + result;
			/* XXX: should check the generic_write_sync retval */
			generic_write_sync(iocb, result);
		}
		iov_iter_revert(iter, requested);
	} else {
		result = requested;
	}
out_release:
	nfs_direct_req_release(dreq);
out:
	return result;
}

/**
 * nfs_init_directcache - create a slab cache for nfs_direct_req structures
 *
 */
int __init nfs_init_directcache(void)
{
	nfs_direct_cachep = kmem_cache_create("nfs_direct_cache",
					      sizeof(struct nfs_direct_req),
					      0, (SLAB_RECLAIM_ACCOUNT|
						  SLAB_MEM_SPREAD),
					      NULL);
	if (nfs_direct_cachep == NULL)
		return -ENOMEM;

	return 0;
}

/**
 * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures
 *
 */
void nfs_destroy_directcache(void)
{
	kmem_cache_destroy(nfs_direct_cachep);
}