xref: /OK3568_Linux_fs/kernel/fs/nfs/pnfs.c (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun /*
2*4882a593Smuzhiyun  *  pNFS functions to call and manage layout drivers.
3*4882a593Smuzhiyun  *
4*4882a593Smuzhiyun  *  Copyright (c) 2002 [year of first publication]
5*4882a593Smuzhiyun  *  The Regents of the University of Michigan
6*4882a593Smuzhiyun  *  All Rights Reserved
7*4882a593Smuzhiyun  *
8*4882a593Smuzhiyun  *  Dean Hildebrand <dhildebz@umich.edu>
9*4882a593Smuzhiyun  *
10*4882a593Smuzhiyun  *  Permission is granted to use, copy, create derivative works, and
11*4882a593Smuzhiyun  *  redistribute this software and such derivative works for any purpose,
12*4882a593Smuzhiyun  *  so long as the name of the University of Michigan is not used in
13*4882a593Smuzhiyun  *  any advertising or publicity pertaining to the use or distribution
14*4882a593Smuzhiyun  *  of this software without specific, written prior authorization. If
15*4882a593Smuzhiyun  *  the above copyright notice or any other identification of the
16*4882a593Smuzhiyun  *  University of Michigan is included in any copy of any portion of
17*4882a593Smuzhiyun  *  this software, then the disclaimer below must also be included.
18*4882a593Smuzhiyun  *
19*4882a593Smuzhiyun  *  This software is provided as is, without representation or warranty
20*4882a593Smuzhiyun  *  of any kind either express or implied, including without limitation
21*4882a593Smuzhiyun  *  the implied warranties of merchantability, fitness for a particular
22*4882a593Smuzhiyun  *  purpose, or noninfringement.  The Regents of the University of
23*4882a593Smuzhiyun  *  Michigan shall not be liable for any damages, including special,
24*4882a593Smuzhiyun  *  indirect, incidental, or consequential damages, with respect to any
25*4882a593Smuzhiyun  *  claim arising out of or in connection with the use of the software,
26*4882a593Smuzhiyun  *  even if it has been or is hereafter advised of the possibility of
27*4882a593Smuzhiyun  *  such damages.
28*4882a593Smuzhiyun  */
29*4882a593Smuzhiyun 
30*4882a593Smuzhiyun #include <linux/nfs_fs.h>
31*4882a593Smuzhiyun #include <linux/nfs_page.h>
32*4882a593Smuzhiyun #include <linux/module.h>
33*4882a593Smuzhiyun #include <linux/sort.h>
34*4882a593Smuzhiyun #include "internal.h"
35*4882a593Smuzhiyun #include "pnfs.h"
36*4882a593Smuzhiyun #include "iostat.h"
37*4882a593Smuzhiyun #include "nfs4trace.h"
38*4882a593Smuzhiyun #include "delegation.h"
39*4882a593Smuzhiyun #include "nfs42.h"
40*4882a593Smuzhiyun #include "nfs4_fs.h"
41*4882a593Smuzhiyun 
42*4882a593Smuzhiyun #define NFSDBG_FACILITY		NFSDBG_PNFS
43*4882a593Smuzhiyun #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
44*4882a593Smuzhiyun 
45*4882a593Smuzhiyun /* Locking:
46*4882a593Smuzhiyun  *
47*4882a593Smuzhiyun  * pnfs_spinlock:
48*4882a593Smuzhiyun  *      protects pnfs_modules_tbl.
49*4882a593Smuzhiyun  */
50*4882a593Smuzhiyun static DEFINE_SPINLOCK(pnfs_spinlock);
51*4882a593Smuzhiyun 
52*4882a593Smuzhiyun /*
53*4882a593Smuzhiyun  * pnfs_modules_tbl holds all pnfs modules
54*4882a593Smuzhiyun  */
55*4882a593Smuzhiyun static LIST_HEAD(pnfs_modules_tbl);
56*4882a593Smuzhiyun 
57*4882a593Smuzhiyun static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
58*4882a593Smuzhiyun static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
59*4882a593Smuzhiyun 		struct list_head *free_me,
60*4882a593Smuzhiyun 		const struct pnfs_layout_range *range,
61*4882a593Smuzhiyun 		u32 seq);
62*4882a593Smuzhiyun static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
63*4882a593Smuzhiyun 		                struct list_head *tmp_list);
64*4882a593Smuzhiyun 
65*4882a593Smuzhiyun /* Return the registered pnfs layout driver module matching given id */
66*4882a593Smuzhiyun static struct pnfs_layoutdriver_type *
find_pnfs_driver_locked(u32 id)67*4882a593Smuzhiyun find_pnfs_driver_locked(u32 id)
68*4882a593Smuzhiyun {
69*4882a593Smuzhiyun 	struct pnfs_layoutdriver_type *local;
70*4882a593Smuzhiyun 
71*4882a593Smuzhiyun 	list_for_each_entry(local, &pnfs_modules_tbl, pnfs_tblid)
72*4882a593Smuzhiyun 		if (local->id == id)
73*4882a593Smuzhiyun 			goto out;
74*4882a593Smuzhiyun 	local = NULL;
75*4882a593Smuzhiyun out:
76*4882a593Smuzhiyun 	dprintk("%s: Searching for id %u, found %p\n", __func__, id, local);
77*4882a593Smuzhiyun 	return local;
78*4882a593Smuzhiyun }
79*4882a593Smuzhiyun 
/*
 * Look up the layout driver matching @id and take a reference on its
 * owning module.  Returns NULL if no such driver is registered, or if
 * the module is going away (try_module_get() failed).  A non-NULL
 * result must be released with module_put().
 */
static struct pnfs_layoutdriver_type *
find_pnfs_driver(u32 id)
{
	struct pnfs_layoutdriver_type *local;

	spin_lock(&pnfs_spinlock);
	local = find_pnfs_driver_locked(id);
	if (local != NULL && !try_module_get(local->owner)) {
		dprintk("%s: Could not grab reference on module\n", __func__);
		local = NULL;
	}
	spin_unlock(&pnfs_spinlock);
	return local;
}
94*4882a593Smuzhiyun 
/*
 * Public wrapper around find_pnfs_driver(): returns the layout driver
 * registered for @id with a module reference held, or NULL.  Callers
 * must drop the reference via pnfs_put_layoutdriver().
 */
const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id)
{
	return find_pnfs_driver(id);
}
99*4882a593Smuzhiyun 
pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type * ld)100*4882a593Smuzhiyun void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld)
101*4882a593Smuzhiyun {
102*4882a593Smuzhiyun 	if (ld)
103*4882a593Smuzhiyun 		module_put(ld->owner);
104*4882a593Smuzhiyun }
105*4882a593Smuzhiyun 
/*
 * Detach the current pNFS layout driver from @nfss.
 *
 * Gives the driver a chance to clean up via ->clear_layoutdriver(),
 * drops this server's contribution to the client's MDS count (purging
 * the deviceid cache when the count reaches zero), and releases the
 * module reference taken in set_pnfs_layoutdriver().
 */
void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
	if (nfss->pnfs_curr_ld) {
		if (nfss->pnfs_curr_ld->clear_layoutdriver)
			nfss->pnfs_curr_ld->clear_layoutdriver(nfss);
		/* Decrement the MDS count. Purge the deviceid cache if zero */
		if (atomic_dec_and_test(&nfss->nfs_client->cl_mds_count))
			nfs4_deviceid_purge_client(nfss->nfs_client);
		module_put(nfss->pnfs_curr_ld->owner);
	}
	nfss->pnfs_curr_ld = NULL;
}
119*4882a593Smuzhiyun 
120*4882a593Smuzhiyun /*
121*4882a593Smuzhiyun  * When the server sends a list of layout types, we choose one in the order
122*4882a593Smuzhiyun  * given in the list below.
123*4882a593Smuzhiyun  *
124*4882a593Smuzhiyun  * FIXME: should this list be configurable in some fashion? module param?
125*4882a593Smuzhiyun  * 	  mount option? something else?
126*4882a593Smuzhiyun  */
/* Most-preferred layout type first; consumed by ld_cmp() below. */
static const u32 ld_prefs[] = {
	LAYOUT_SCSI,
	LAYOUT_BLOCK_VOLUME,
	LAYOUT_OSD2_OBJECTS,
	LAYOUT_FLEX_FILES,
	LAYOUT_NFSV4_1_FILES,
	0	/* sentinel: terminates the preference list */
};
135*4882a593Smuzhiyun 
136*4882a593Smuzhiyun static int
ld_cmp(const void * e1,const void * e2)137*4882a593Smuzhiyun ld_cmp(const void *e1, const void *e2)
138*4882a593Smuzhiyun {
139*4882a593Smuzhiyun 	u32 ld1 = *((u32 *)e1);
140*4882a593Smuzhiyun 	u32 ld2 = *((u32 *)e2);
141*4882a593Smuzhiyun 	int i;
142*4882a593Smuzhiyun 
143*4882a593Smuzhiyun 	for (i = 0; ld_prefs[i] != 0; i++) {
144*4882a593Smuzhiyun 		if (ld1 == ld_prefs[i])
145*4882a593Smuzhiyun 			return -1;
146*4882a593Smuzhiyun 
147*4882a593Smuzhiyun 		if (ld2 == ld_prefs[i])
148*4882a593Smuzhiyun 			return 1;
149*4882a593Smuzhiyun 	}
150*4882a593Smuzhiyun 	return 0;
151*4882a593Smuzhiyun }
152*4882a593Smuzhiyun 
153*4882a593Smuzhiyun /*
154*4882a593Smuzhiyun  * Try to set the server's pnfs module to the pnfs layout type specified by id.
155*4882a593Smuzhiyun  * Currently only one pNFS layout driver per filesystem is supported.
156*4882a593Smuzhiyun  *
157*4882a593Smuzhiyun  * @ids array of layout types supported by MDS.
158*4882a593Smuzhiyun  */
void
set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
		      struct nfs_fsinfo *fsinfo)
{
	struct pnfs_layoutdriver_type *ld_type = NULL;
	u32 id;
	int i;

	/* Nothing to do if the server advertised no layout types. */
	if (fsinfo->nlayouttypes == 0)
		goto out_no_driver;
	if (!(server->nfs_client->cl_exchange_flags &
		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
		printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n",
			__func__, server->nfs_client->cl_exchange_flags);
		goto out_no_driver;
	}

	/* Order the advertised types by our preference list (ld_prefs). */
	sort(fsinfo->layouttype, fsinfo->nlayouttypes,
		sizeof(*fsinfo->layouttype), ld_cmp, NULL);

	/*
	 * Pick the first advertised type with a registered driver,
	 * attempting to autoload the matching module on a miss.
	 */
	for (i = 0; i < fsinfo->nlayouttypes; i++) {
		id = fsinfo->layouttype[i];
		ld_type = find_pnfs_driver(id);
		if (!ld_type) {
			request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX,
					id);
			ld_type = find_pnfs_driver(id);
		}
		if (ld_type)
			break;
	}

	if (!ld_type) {
		dprintk("%s: No pNFS module found!\n", __func__);
		goto out_no_driver;
	}

	server->pnfs_curr_ld = ld_type;
	if (ld_type->set_layoutdriver
	    && ld_type->set_layoutdriver(server, mntfh)) {
		printk(KERN_ERR "NFS: %s: Error initializing pNFS layout "
			"driver %u.\n", __func__, id);
		/* Drop the module reference taken by find_pnfs_driver(). */
		module_put(ld_type->owner);
		goto out_no_driver;
	}
	/* Bump the MDS count */
	atomic_inc(&server->nfs_client->cl_mds_count);

	dprintk("%s: pNFS module for %u set\n", __func__, id);
	return;

out_no_driver:
	/* Fall back to plain NFSv4 I/O through the MDS. */
	dprintk("%s: Using NFSv4 I/O\n", __func__);
	server->pnfs_curr_ld = NULL;
}
214*4882a593Smuzhiyun 
215*4882a593Smuzhiyun int
pnfs_register_layoutdriver(struct pnfs_layoutdriver_type * ld_type)216*4882a593Smuzhiyun pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
217*4882a593Smuzhiyun {
218*4882a593Smuzhiyun 	int status = -EINVAL;
219*4882a593Smuzhiyun 	struct pnfs_layoutdriver_type *tmp;
220*4882a593Smuzhiyun 
221*4882a593Smuzhiyun 	if (ld_type->id == 0) {
222*4882a593Smuzhiyun 		printk(KERN_ERR "NFS: %s id 0 is reserved\n", __func__);
223*4882a593Smuzhiyun 		return status;
224*4882a593Smuzhiyun 	}
225*4882a593Smuzhiyun 	if (!ld_type->alloc_lseg || !ld_type->free_lseg) {
226*4882a593Smuzhiyun 		printk(KERN_ERR "NFS: %s Layout driver must provide "
227*4882a593Smuzhiyun 		       "alloc_lseg and free_lseg.\n", __func__);
228*4882a593Smuzhiyun 		return status;
229*4882a593Smuzhiyun 	}
230*4882a593Smuzhiyun 
231*4882a593Smuzhiyun 	spin_lock(&pnfs_spinlock);
232*4882a593Smuzhiyun 	tmp = find_pnfs_driver_locked(ld_type->id);
233*4882a593Smuzhiyun 	if (!tmp) {
234*4882a593Smuzhiyun 		list_add(&ld_type->pnfs_tblid, &pnfs_modules_tbl);
235*4882a593Smuzhiyun 		status = 0;
236*4882a593Smuzhiyun 		dprintk("%s Registering id:%u name:%s\n", __func__, ld_type->id,
237*4882a593Smuzhiyun 			ld_type->name);
238*4882a593Smuzhiyun 	} else {
239*4882a593Smuzhiyun 		printk(KERN_ERR "NFS: %s Module with id %d already loaded!\n",
240*4882a593Smuzhiyun 			__func__, ld_type->id);
241*4882a593Smuzhiyun 	}
242*4882a593Smuzhiyun 	spin_unlock(&pnfs_spinlock);
243*4882a593Smuzhiyun 
244*4882a593Smuzhiyun 	return status;
245*4882a593Smuzhiyun }
246*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(pnfs_register_layoutdriver);
247*4882a593Smuzhiyun 
/*
 * Remove a layout driver from the global registry.  The driver module
 * is responsible for ensuring it is no longer in use before calling.
 */
void
pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *ld_type)
{
	dprintk("%s Deregistering id:%u\n", __func__, ld_type->id);
	spin_lock(&pnfs_spinlock);
	list_del(&ld_type->pnfs_tblid);
	spin_unlock(&pnfs_spinlock);
}
EXPORT_SYMBOL_GPL(pnfs_unregister_layoutdriver);
257*4882a593Smuzhiyun 
258*4882a593Smuzhiyun /*
259*4882a593Smuzhiyun  * pNFS client layout cache
260*4882a593Smuzhiyun  */
261*4882a593Smuzhiyun 
262*4882a593Smuzhiyun /* Need to hold i_lock if caller does not already hold reference */
/* Take a reference on the layout header. */
void
pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo)
{
	refcount_inc(&lo->plh_refcount);
}
268*4882a593Smuzhiyun 
269*4882a593Smuzhiyun static struct pnfs_layout_hdr *
pnfs_alloc_layout_hdr(struct inode * ino,gfp_t gfp_flags)270*4882a593Smuzhiyun pnfs_alloc_layout_hdr(struct inode *ino, gfp_t gfp_flags)
271*4882a593Smuzhiyun {
272*4882a593Smuzhiyun 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
273*4882a593Smuzhiyun 	return ld->alloc_layout_hdr(ino, gfp_flags);
274*4882a593Smuzhiyun }
275*4882a593Smuzhiyun 
/*
 * Final teardown of a layout header: unhash it from the per-client
 * layouts list (if hashed), drop the credential reference, and hand
 * the structure back to the layout driver that allocated it.
 */
static void
pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct nfs_server *server = NFS_SERVER(lo->plh_inode);
	struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;

	if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
		struct nfs_client *clp = server->nfs_client;

		/* Removal from the RCU list must happen under cl_lock. */
		spin_lock(&clp->cl_lock);
		list_del_rcu(&lo->plh_layouts);
		spin_unlock(&clp->cl_lock);
	}
	put_cred(lo->plh_lc_cred);
	return ld->free_layout_hdr(lo);
}
292*4882a593Smuzhiyun 
/*
 * Detach @lo from its inode.  Caller holds inode->i_lock; the header
 * itself is freed later by pnfs_free_layout_hdr().
 */
static void
pnfs_detach_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct nfs_inode *nfsi = NFS_I(lo->plh_inode);
	dprintk("%s: freeing layout cache %p\n", __func__, lo);
	nfsi->layout = NULL;
	/* Reset MDS Threshold I/O counters */
	nfsi->write_io = 0;
	nfsi->read_io = 0;
}
303*4882a593Smuzhiyun 
/*
 * Drop a reference on the layout header.  On the final put, detach the
 * header from its inode and free it.  A pending layoutreturn may be
 * sent first via pnfs_layoutreturn_before_put_layout_hdr().
 */
void
pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct inode *inode;
	unsigned long i_state;

	if (!lo)
		return;
	inode = lo->plh_inode;
	pnfs_layoutreturn_before_put_layout_hdr(lo);

	if (refcount_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
		if (!list_empty(&lo->plh_segs))
			WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
		pnfs_detach_layout_hdr(lo);
		/* Snapshot i_state before dropping the lock. */
		i_state = inode->i_state;
		spin_unlock(&inode->i_lock);
		pnfs_free_layout_hdr(lo);
		/* Notify pnfs_destroy_layout_final() that we're done */
		if (i_state & (I_FREEING | I_CLEAR))
			wake_up_var(lo);
	}
}
327*4882a593Smuzhiyun 
328*4882a593Smuzhiyun static struct inode *
pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr * lo)329*4882a593Smuzhiyun pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo)
330*4882a593Smuzhiyun {
331*4882a593Smuzhiyun 	struct inode *inode = igrab(lo->plh_inode);
332*4882a593Smuzhiyun 	if (inode)
333*4882a593Smuzhiyun 		return inode;
334*4882a593Smuzhiyun 	set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
335*4882a593Smuzhiyun 	return NULL;
336*4882a593Smuzhiyun }
337*4882a593Smuzhiyun 
338*4882a593Smuzhiyun /*
339*4882a593Smuzhiyun  * Compare 2 layout stateid sequence ids, to see which is newer,
340*4882a593Smuzhiyun  * taking into account wraparound issues.
341*4882a593Smuzhiyun  */
pnfs_seqid_is_newer(u32 s1,u32 s2)342*4882a593Smuzhiyun static bool pnfs_seqid_is_newer(u32 s1, u32 s2)
343*4882a593Smuzhiyun {
344*4882a593Smuzhiyun 	return (s32)(s1 - s2) > 0;
345*4882a593Smuzhiyun }
346*4882a593Smuzhiyun 
static void pnfs_barrier_update(struct pnfs_layout_hdr *lo, u32 newseq)
{
	/* Advance the barrier, or initialize it if it was never set. */
	if (!lo->plh_barrier || pnfs_seqid_is_newer(newseq, lo->plh_barrier))
		lo->plh_barrier = newseq;
}
352*4882a593Smuzhiyun 
/*
 * Record that (part of) the layout should be returned to the server.
 * If mixed I/O modes get marked, the recorded mode widens to
 * IOMODE_ANY; the newest sequence id involved is tracked as well.
 */
static void
pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
			 u32 seq)
{
	if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode)
		iomode = IOMODE_ANY;
	lo->plh_return_iomode = iomode;
	set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
	/*
	 * We must set lo->plh_return_seq to avoid livelocks with
	 * pnfs_layout_need_return()
	 */
	if (seq == 0)
		seq = be32_to_cpu(lo->plh_stateid.seqid);
	if (!lo->plh_return_seq || pnfs_seqid_is_newer(seq, lo->plh_return_seq))
		lo->plh_return_seq = seq;
	pnfs_barrier_update(lo, seq);
}
371*4882a593Smuzhiyun 
/*
 * Reset the layoutreturn bookkeeping on @lo, then re-mark any segments
 * that still carry NFS_LSEG_LAYOUTRETURN so they are not forgotten.
 */
static void
pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo)
{
	struct pnfs_layout_segment *lseg;
	lo->plh_return_iomode = 0;
	lo->plh_return_seq = 0;
	clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
		if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
			continue;
		pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
	}
}
385*4882a593Smuzhiyun 
/*
 * Clear the LAYOUTRETURN-in-progress state and wake up all waiters.
 * The memory barrier orders the bit clears before the wakeups.
 */
static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
{
	clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
	clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags);
	smp_mb__after_atomic();
	wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
	rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
}
394*4882a593Smuzhiyun 
/*
 * Strip per-segment state bits from @lseg, dropping the segment
 * reference each cleared VALID/LAYOUTCOMMIT bit accounted for
 * (presumably one ref per bit — see pnfs_init_lseg()).  Segments that
 * reach refcount zero are collected on @free_me.
 */
static void
pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg,
		struct list_head *free_me)
{
	clear_bit(NFS_LSEG_ROC, &lseg->pls_flags);
	clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
	if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags))
		pnfs_lseg_dec_and_remove_zero(lseg, free_me);
	if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
		pnfs_lseg_dec_and_remove_zero(lseg, free_me);
}
406*4882a593Smuzhiyun 
407*4882a593Smuzhiyun /*
408*4882a593Smuzhiyun  * Update the seqid of a layout stateid after receiving
409*4882a593Smuzhiyun  * NFS4ERR_OLD_STATEID
410*4882a593Smuzhiyun  */
/*
 * Returns true if @dst was refreshed to a usable seqid, false if the
 * layout no longer matches or the update could not be performed.
 */
bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
		struct pnfs_layout_range *dst_range,
		struct inode *inode)
{
	struct pnfs_layout_hdr *lo;
	struct pnfs_layout_range range = {
		.iomode = IOMODE_ANY,
		.offset = 0,
		.length = NFS4_MAX_UINT64,
	};
	bool ret = false;
	LIST_HEAD(head);
	int err;

	spin_lock(&inode->i_lock);
	lo = NFS_I(inode)->layout;
	if (lo &&  pnfs_layout_is_valid(lo) &&
	    nfs4_stateid_match_other(dst, &lo->plh_stateid)) {
		/* Is our call using the most recent seqid? If so, bump it */
		if (!nfs4_stateid_is_newer(&lo->plh_stateid, dst)) {
			nfs4_stateid_seqid_inc(dst);
			ret = true;
			goto out;
		}
		/* Try to update the seqid to the most recent */
		err = pnfs_mark_matching_lsegs_return(lo, &head, &range, 0);
		if (err != -EBUSY) {
			dst->seqid = lo->plh_stateid.seqid;
			*dst_range = range;
			ret = true;
		}
	}
out:
	spin_unlock(&inode->i_lock);
	/* Free any segments collected above, outside the inode lock. */
	pnfs_free_lseg_list(&head);
	return ret;
}
448*4882a593Smuzhiyun 
449*4882a593Smuzhiyun /*
450*4882a593Smuzhiyun  * Mark a pnfs_layout_hdr and all associated layout segments as invalid
451*4882a593Smuzhiyun  *
452*4882a593Smuzhiyun  * In order to continue using the pnfs_layout_hdr, a full recovery
453*4882a593Smuzhiyun  * is required.
454*4882a593Smuzhiyun  * Note that caller must hold inode->i_lock.
455*4882a593Smuzhiyun  */
int
pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
		struct list_head *lseg_list)
{
	struct pnfs_layout_range range = {
		.iomode = IOMODE_ANY,
		.offset = 0,
		.length = NFS4_MAX_UINT64,
	};
	struct pnfs_layout_segment *lseg, *next;

	set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
	/* Invalidate every segment, collecting them on @lseg_list. */
	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
		pnfs_clear_lseg_state(lseg, lseg_list);
	pnfs_clear_layoutreturn_info(lo);
	pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
	set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
	/* If a layoutreturn is in flight, release its waiters now. */
	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
	    !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
		pnfs_clear_layoutreturn_waitbit(lo);
	/* Nonzero return means layout segments are still in use. */
	return !list_empty(&lo->plh_segs);
}
478*4882a593Smuzhiyun 
479*4882a593Smuzhiyun static int
pnfs_iomode_to_fail_bit(u32 iomode)480*4882a593Smuzhiyun pnfs_iomode_to_fail_bit(u32 iomode)
481*4882a593Smuzhiyun {
482*4882a593Smuzhiyun 	return iomode == IOMODE_RW ?
483*4882a593Smuzhiyun 		NFS_LAYOUT_RW_FAILED : NFS_LAYOUT_RO_FAILED;
484*4882a593Smuzhiyun }
485*4882a593Smuzhiyun 
/*
 * Mark layoutgets for this mode as failed and stamp the time so the
 * failure can expire (see pnfs_layout_io_test_failed()).  The first
 * setter takes an extra header reference; it is dropped when the bit
 * is cleared in pnfs_layout_clear_fail_bit().
 */
static void
pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
	lo->plh_retry_timestamp = jiffies;
	if (!test_and_set_bit(fail_bit, &lo->plh_flags))
		refcount_inc(&lo->plh_refcount);
}
493*4882a593Smuzhiyun 
/* Clear a layout failure bit, dropping the reference it pinned. */
static void
pnfs_layout_clear_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit)
{
	if (test_and_clear_bit(fail_bit, &lo->plh_flags))
		refcount_dec(&lo->plh_refcount);
}
500*4882a593Smuzhiyun 
/*
 * Record a layoutget failure for @iomode: set the fail bit and
 * invalidate all cached segments matching that I/O mode.
 */
static void
pnfs_layout_io_set_failed(struct pnfs_layout_hdr *lo, u32 iomode)
{
	struct inode *inode = lo->plh_inode;
	struct pnfs_layout_range range = {
		.iomode = iomode,
		.offset = 0,
		.length = NFS4_MAX_UINT64,
	};
	LIST_HEAD(head);

	spin_lock(&inode->i_lock);
	pnfs_layout_set_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
	pnfs_mark_matching_lsegs_invalid(lo, &head, &range, 0);
	spin_unlock(&inode->i_lock);
	/* Free the invalidated segments outside the inode lock. */
	pnfs_free_lseg_list(&head);
	dprintk("%s Setting layout IOMODE_%s fail bit\n", __func__,
			iomode == IOMODE_RW ?  "RW" : "READ");
}
520*4882a593Smuzhiyun 
521*4882a593Smuzhiyun static bool
pnfs_layout_io_test_failed(struct pnfs_layout_hdr * lo,u32 iomode)522*4882a593Smuzhiyun pnfs_layout_io_test_failed(struct pnfs_layout_hdr *lo, u32 iomode)
523*4882a593Smuzhiyun {
524*4882a593Smuzhiyun 	unsigned long start, end;
525*4882a593Smuzhiyun 	int fail_bit = pnfs_iomode_to_fail_bit(iomode);
526*4882a593Smuzhiyun 
527*4882a593Smuzhiyun 	if (test_bit(fail_bit, &lo->plh_flags) == 0)
528*4882a593Smuzhiyun 		return false;
529*4882a593Smuzhiyun 	end = jiffies;
530*4882a593Smuzhiyun 	start = end - PNFS_LAYOUTGET_RETRY_TIMEOUT;
531*4882a593Smuzhiyun 	if (!time_in_range(lo->plh_retry_timestamp, start, end)) {
532*4882a593Smuzhiyun 		/* It is time to retry the failed layoutgets */
533*4882a593Smuzhiyun 		pnfs_layout_clear_fail_bit(lo, fail_bit);
534*4882a593Smuzhiyun 		return false;
535*4882a593Smuzhiyun 	}
536*4882a593Smuzhiyun 	return true;
537*4882a593Smuzhiyun }
538*4882a593Smuzhiyun 
539*4882a593Smuzhiyun static void
pnfs_init_lseg(struct pnfs_layout_hdr * lo,struct pnfs_layout_segment * lseg,const struct pnfs_layout_range * range,const nfs4_stateid * stateid)540*4882a593Smuzhiyun pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
541*4882a593Smuzhiyun 		const struct pnfs_layout_range *range,
542*4882a593Smuzhiyun 		const nfs4_stateid *stateid)
543*4882a593Smuzhiyun {
544*4882a593Smuzhiyun 	INIT_LIST_HEAD(&lseg->pls_list);
545*4882a593Smuzhiyun 	INIT_LIST_HEAD(&lseg->pls_lc_list);
546*4882a593Smuzhiyun 	INIT_LIST_HEAD(&lseg->pls_commits);
547*4882a593Smuzhiyun 	refcount_set(&lseg->pls_refcount, 1);
548*4882a593Smuzhiyun 	set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
549*4882a593Smuzhiyun 	lseg->pls_layout = lo;
550*4882a593Smuzhiyun 	lseg->pls_range = *range;
551*4882a593Smuzhiyun 	lseg->pls_seq = be32_to_cpu(stateid->seqid);
552*4882a593Smuzhiyun }
553*4882a593Smuzhiyun 
pnfs_free_lseg(struct pnfs_layout_segment * lseg)554*4882a593Smuzhiyun static void pnfs_free_lseg(struct pnfs_layout_segment *lseg)
555*4882a593Smuzhiyun {
556*4882a593Smuzhiyun 	if (lseg != NULL) {
557*4882a593Smuzhiyun 		struct inode *inode = lseg->pls_layout->plh_inode;
558*4882a593Smuzhiyun 		NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg);
559*4882a593Smuzhiyun 	}
560*4882a593Smuzhiyun }
561*4882a593Smuzhiyun 
/*
 * Unlink an invalidated segment from its layout header.  Caller holds
 * inode->i_lock.  If this empties the layout and no layoutreturn is
 * requested or in flight, the stateid is marked invalid so a fresh
 * LAYOUTGET is needed before the layout can be used again.
 */
static void
pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
		struct pnfs_layout_segment *lseg)
{
	WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
	list_del_init(&lseg->pls_list);
	/* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */
	refcount_dec(&lo->plh_refcount);
	if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
		return;
	if (list_empty(&lo->plh_segs) &&
	    !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
	    !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
		if (atomic_read(&lo->plh_outstanding) == 0)
			set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
		clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
	}
}
580*4882a593Smuzhiyun 
/*
 * If @lseg was marked for layoutreturn and the layout is still valid,
 * move it onto lo->plh_return_segs instead of freeing it.  Returns
 * true when the segment was cached (caller must not free it).
 */
static bool
pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo,
		struct pnfs_layout_segment *lseg)
{
	if (test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
	    pnfs_layout_is_valid(lo)) {
		pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
		list_move_tail(&lseg->pls_list, &lo->plh_return_segs);
		return true;
	}
	return false;
}
593*4882a593Smuzhiyun 
/*
 * Release a reference on a layout segment.  On the final put of a
 * segment that is no longer valid, unlink it from its layout and free
 * it — unless it is cached for a pending layoutreturn instead.
 */
void
pnfs_put_lseg(struct pnfs_layout_segment *lseg)
{
	struct pnfs_layout_hdr *lo;
	struct inode *inode;

	if (!lseg)
		return;

	dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
		refcount_read(&lseg->pls_refcount),
		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));

	lo = lseg->pls_layout;
	inode = lo->plh_inode;

	if (refcount_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
		/* A still-valid segment stays cached; nothing to free. */
		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
			spin_unlock(&inode->i_lock);
			return;
		}
		/* Pin the header across the final segment teardown. */
		pnfs_get_layout_hdr(lo);
		pnfs_layout_remove_lseg(lo, lseg);
		if (pnfs_cache_lseg_for_layoutreturn(lo, lseg))
			lseg = NULL;
		spin_unlock(&inode->i_lock);
		pnfs_free_lseg(lseg);
		pnfs_put_layout_hdr(lo);
	}
}
EXPORT_SYMBOL_GPL(pnfs_put_lseg);
625*4882a593Smuzhiyun 
626*4882a593Smuzhiyun /*
627*4882a593Smuzhiyun  * is l2 fully contained in l1?
628*4882a593Smuzhiyun  *   start1                             end1
629*4882a593Smuzhiyun  *   [----------------------------------)
630*4882a593Smuzhiyun  *           start2           end2
631*4882a593Smuzhiyun  *           [----------------)
632*4882a593Smuzhiyun  */
633*4882a593Smuzhiyun static bool
pnfs_lseg_range_contained(const struct pnfs_layout_range * l1,const struct pnfs_layout_range * l2)634*4882a593Smuzhiyun pnfs_lseg_range_contained(const struct pnfs_layout_range *l1,
635*4882a593Smuzhiyun 		 const struct pnfs_layout_range *l2)
636*4882a593Smuzhiyun {
637*4882a593Smuzhiyun 	u64 start1 = l1->offset;
638*4882a593Smuzhiyun 	u64 end1 = pnfs_end_offset(start1, l1->length);
639*4882a593Smuzhiyun 	u64 start2 = l2->offset;
640*4882a593Smuzhiyun 	u64 end2 = pnfs_end_offset(start2, l2->length);
641*4882a593Smuzhiyun 
642*4882a593Smuzhiyun 	return (start1 <= start2) && (end1 >= end2);
643*4882a593Smuzhiyun }
644*4882a593Smuzhiyun 
/*
 * Drop one reference on @lseg; if that was the last one, unlink it from
 * its layout header and queue it on @tmp_list for later freeing outside
 * the lock.  Returns true if the lseg was queued.
 */
static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg,
		struct list_head *tmp_list)
{
	if (!refcount_dec_and_test(&lseg->pls_refcount))
		return false;
	pnfs_layout_remove_lseg(lseg->pls_layout, lseg);
	list_add(&lseg->pls_list, tmp_list);
	return true;
}
654*4882a593Smuzhiyun 
655*4882a593Smuzhiyun /* Returns 1 if lseg is removed from list, 0 otherwise */
mark_lseg_invalid(struct pnfs_layout_segment * lseg,struct list_head * tmp_list)656*4882a593Smuzhiyun static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
657*4882a593Smuzhiyun 			     struct list_head *tmp_list)
658*4882a593Smuzhiyun {
659*4882a593Smuzhiyun 	int rv = 0;
660*4882a593Smuzhiyun 
661*4882a593Smuzhiyun 	if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
662*4882a593Smuzhiyun 		/* Remove the reference keeping the lseg in the
663*4882a593Smuzhiyun 		 * list.  It will now be removed when all
664*4882a593Smuzhiyun 		 * outstanding io is finished.
665*4882a593Smuzhiyun 		 */
666*4882a593Smuzhiyun 		dprintk("%s: lseg %p ref %d\n", __func__, lseg,
667*4882a593Smuzhiyun 			refcount_read(&lseg->pls_refcount));
668*4882a593Smuzhiyun 		if (pnfs_lseg_dec_and_remove_zero(lseg, tmp_list))
669*4882a593Smuzhiyun 			rv = 1;
670*4882a593Smuzhiyun 	}
671*4882a593Smuzhiyun 	return rv;
672*4882a593Smuzhiyun }
673*4882a593Smuzhiyun 
674*4882a593Smuzhiyun static bool
pnfs_should_free_range(const struct pnfs_layout_range * lseg_range,const struct pnfs_layout_range * recall_range)675*4882a593Smuzhiyun pnfs_should_free_range(const struct pnfs_layout_range *lseg_range,
676*4882a593Smuzhiyun 		 const struct pnfs_layout_range *recall_range)
677*4882a593Smuzhiyun {
678*4882a593Smuzhiyun 	return (recall_range->iomode == IOMODE_ANY ||
679*4882a593Smuzhiyun 		lseg_range->iomode == recall_range->iomode) &&
680*4882a593Smuzhiyun 	       pnfs_lseg_range_intersecting(lseg_range, recall_range);
681*4882a593Smuzhiyun }
682*4882a593Smuzhiyun 
683*4882a593Smuzhiyun static bool
pnfs_match_lseg_recall(const struct pnfs_layout_segment * lseg,const struct pnfs_layout_range * recall_range,u32 seq)684*4882a593Smuzhiyun pnfs_match_lseg_recall(const struct pnfs_layout_segment *lseg,
685*4882a593Smuzhiyun 		const struct pnfs_layout_range *recall_range,
686*4882a593Smuzhiyun 		u32 seq)
687*4882a593Smuzhiyun {
688*4882a593Smuzhiyun 	if (seq != 0 && pnfs_seqid_is_newer(lseg->pls_seq, seq))
689*4882a593Smuzhiyun 		return false;
690*4882a593Smuzhiyun 	if (recall_range == NULL)
691*4882a593Smuzhiyun 		return true;
692*4882a593Smuzhiyun 	return pnfs_should_free_range(&lseg->pls_range, recall_range);
693*4882a593Smuzhiyun }
694*4882a593Smuzhiyun 
/**
 * pnfs_mark_matching_lsegs_invalid - tear down lsegs or mark them for later
 * @lo: layout header containing the lsegs
 * @tmp_list: list head where doomed lsegs should go
 * @recall_range: optional recall range argument to match (may be NULL)
 * @seq: only invalidate lsegs obtained prior to this sequence (may be 0)
 *
 * Walk the list of lsegs in the layout header, and tear down any that should
 * be destroyed. If "recall_range" is specified then the segment must match
 * that range. If "seq" is non-zero, then only match segments that were handed
 * out at or before that sequence.
 *
 * Returns number of matching invalid lsegs remaining in list after scanning
 * it and purging them.
 */
int
pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
			    struct list_head *tmp_list,
			    const struct pnfs_layout_range *recall_range,
			    u32 seq)
{
	struct pnfs_layout_segment *lseg, *next;
	int remaining = 0;

	dprintk("%s:Begin lo %p\n", __func__, lo);

	if (list_empty(&lo->plh_segs))
		return 0;
	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
		if (pnfs_match_lseg_recall(lseg, recall_range, seq)) {
			dprintk("%s: freeing lseg %p iomode %d seq %u "
				"offset %llu length %llu\n", __func__,
				lseg, lseg->pls_range.iomode, lseg->pls_seq,
				lseg->pls_range.offset, lseg->pls_range.length);
			/* Busy lsegs stay listed until their last reference */
			if (!mark_lseg_invalid(lseg, tmp_list))
				remaining++;
		}
	dprintk("%s:Return %i\n", __func__, remaining);
	return remaining;
}
735*4882a593Smuzhiyun 
736*4882a593Smuzhiyun static void
pnfs_free_returned_lsegs(struct pnfs_layout_hdr * lo,struct list_head * free_me,const struct pnfs_layout_range * range,u32 seq)737*4882a593Smuzhiyun pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo,
738*4882a593Smuzhiyun 		struct list_head *free_me,
739*4882a593Smuzhiyun 		const struct pnfs_layout_range *range,
740*4882a593Smuzhiyun 		u32 seq)
741*4882a593Smuzhiyun {
742*4882a593Smuzhiyun 	struct pnfs_layout_segment *lseg, *next;
743*4882a593Smuzhiyun 
744*4882a593Smuzhiyun 	list_for_each_entry_safe(lseg, next, &lo->plh_return_segs, pls_list) {
745*4882a593Smuzhiyun 		if (pnfs_match_lseg_recall(lseg, range, seq))
746*4882a593Smuzhiyun 			list_move_tail(&lseg->pls_list, free_me);
747*4882a593Smuzhiyun 	}
748*4882a593Smuzhiyun }
749*4882a593Smuzhiyun 
750*4882a593Smuzhiyun /* note free_me must contain lsegs from a single layout_hdr */
751*4882a593Smuzhiyun void
pnfs_free_lseg_list(struct list_head * free_me)752*4882a593Smuzhiyun pnfs_free_lseg_list(struct list_head *free_me)
753*4882a593Smuzhiyun {
754*4882a593Smuzhiyun 	struct pnfs_layout_segment *lseg, *tmp;
755*4882a593Smuzhiyun 
756*4882a593Smuzhiyun 	if (list_empty(free_me))
757*4882a593Smuzhiyun 		return;
758*4882a593Smuzhiyun 
759*4882a593Smuzhiyun 	list_for_each_entry_safe(lseg, tmp, free_me, pls_list) {
760*4882a593Smuzhiyun 		list_del(&lseg->pls_list);
761*4882a593Smuzhiyun 		pnfs_free_lseg(lseg);
762*4882a593Smuzhiyun 	}
763*4882a593Smuzhiyun }
764*4882a593Smuzhiyun 
/*
 * Tear down the entire layout attached to @nfsi: invalidate the layout
 * stateid, free all lsegs and flush outstanding commits.
 *
 * Returns the layout header pointer (now possibly stale) so callers such
 * as pnfs_destroy_layout_final() can wait for it to be detached from the
 * inode, or NULL if the inode had no layout.
 */
static struct pnfs_layout_hdr *__pnfs_destroy_layout(struct nfs_inode *nfsi)
{
	struct pnfs_layout_hdr *lo;
	LIST_HEAD(tmp_list);

	spin_lock(&nfsi->vfs_inode.i_lock);
	lo = nfsi->layout;
	if (lo) {
		pnfs_get_layout_hdr(lo);
		pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
		/* Allow future layoutgets to be retried for both iomodes */
		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
		spin_unlock(&nfsi->vfs_inode.i_lock);
		pnfs_free_lseg_list(&tmp_list);
		nfs_commit_inode(&nfsi->vfs_inode, 0);
		pnfs_put_layout_hdr(lo);
	} else
		spin_unlock(&nfsi->vfs_inode.i_lock);
	return lo;
}
785*4882a593Smuzhiyun 
/* Destroy the inode's layout without waiting for it to be detached */
void pnfs_destroy_layout(struct nfs_inode *nfsi)
{
	__pnfs_destroy_layout(nfsi);
}
EXPORT_SYMBOL_GPL(pnfs_destroy_layout);
791*4882a593Smuzhiyun 
pnfs_layout_removed(struct nfs_inode * nfsi,struct pnfs_layout_hdr * lo)792*4882a593Smuzhiyun static bool pnfs_layout_removed(struct nfs_inode *nfsi,
793*4882a593Smuzhiyun 				struct pnfs_layout_hdr *lo)
794*4882a593Smuzhiyun {
795*4882a593Smuzhiyun 	bool ret;
796*4882a593Smuzhiyun 
797*4882a593Smuzhiyun 	spin_lock(&nfsi->vfs_inode.i_lock);
798*4882a593Smuzhiyun 	ret = nfsi->layout != lo;
799*4882a593Smuzhiyun 	spin_unlock(&nfsi->vfs_inode.i_lock);
800*4882a593Smuzhiyun 	return ret;
801*4882a593Smuzhiyun }
802*4882a593Smuzhiyun 
/*
 * Destroy the inode's layout and wait until the layout header has been
 * detached from the inode.
 */
void pnfs_destroy_layout_final(struct nfs_inode *nfsi)
{
	struct pnfs_layout_hdr *lo = __pnfs_destroy_layout(nfsi);

	if (lo)
		wait_var_event(lo, pnfs_layout_removed(nfsi, lo));
}
810*4882a593Smuzhiyun 
/*
 * Try to queue @inode's layout header on @layout_list for bulk destruction,
 * taking a reference on the header.  Returns false if the inode has no
 * layout or the header is already on a bulk-destroy list.
 */
static bool
pnfs_layout_add_bulk_destroy_list(struct inode *inode,
		struct list_head *layout_list)
{
	struct pnfs_layout_hdr *lo;
	bool ret = false;

	spin_lock(&inode->i_lock);
	lo = NFS_I(inode)->layout;
	if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) {
		pnfs_get_layout_hdr(lo);
		list_add(&lo->plh_bulk_destroy, layout_list);
		ret = true;
	}
	spin_unlock(&inode->i_lock);
	return ret;
}
828*4882a593Smuzhiyun 
/* Caller must hold rcu_read_lock and clp->cl_lock */
static int
pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
		struct nfs_server *server,
		struct list_head *layout_list)
	__must_hold(&clp->cl_lock)
	__must_hold(RCU)
{
	struct pnfs_layout_hdr *lo, *next;
	struct inode *inode;

	list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) {
		/* Skip layouts that are already dead, freeing, or queued */
		if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) ||
		    test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) ||
		    !list_empty(&lo->plh_bulk_destroy))
			continue;
		/* If the sb is being destroyed, just bail */
		if (!nfs_sb_active(server->super))
			break;
		inode = pnfs_grab_inode_layout_hdr(lo);
		if (inode != NULL) {
			if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags))
				list_del_rcu(&lo->plh_layouts);
			if (pnfs_layout_add_bulk_destroy_list(inode,
						layout_list))
				continue;
			/*
			 * Queueing failed: drop the locks so the inode
			 * reference can be released safely.
			 */
			rcu_read_unlock();
			spin_unlock(&clp->cl_lock);
			iput(inode);
		} else {
			rcu_read_unlock();
			spin_unlock(&clp->cl_lock);
		}
		nfs_sb_deactive(server->super);
		spin_lock(&clp->cl_lock);
		rcu_read_lock();
		/* Locks were dropped, so the caller must restart its scan */
		return -EAGAIN;
	}
	return 0;
}
869*4882a593Smuzhiyun 
/*
 * Free every layout queued on @layout_list: layoutcommit outstanding data,
 * invalidate the layout stateid and release all lsegs.  With
 * @is_bulk_recall set, layouts that could not be fully torn down are
 * flagged NFS_LAYOUT_BULK_RECALL.
 *
 * Returns -EAGAIN if any layout still had busy lsegs, 0 otherwise.
 */
static int
pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
		bool is_bulk_recall)
{
	struct pnfs_layout_hdr *lo;
	struct inode *inode;
	LIST_HEAD(lseg_list);
	int ret = 0;

	while (!list_empty(layout_list)) {
		lo = list_entry(layout_list->next, struct pnfs_layout_hdr,
				plh_bulk_destroy);
		dprintk("%s freeing layout for inode %lu\n", __func__,
			lo->plh_inode->i_ino);
		inode = lo->plh_inode;

		pnfs_layoutcommit_inode(inode, false);

		spin_lock(&inode->i_lock);
		list_del_init(&lo->plh_bulk_destroy);
		if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
			if (is_bulk_recall)
				set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
			ret = -EAGAIN;
		}
		spin_unlock(&inode->i_lock);
		pnfs_free_lseg_list(&lseg_list);
		/* Free all lsegs that are attached to commit buckets */
		nfs_commit_inode(inode, 0);
		/* Drop the references taken when the layout was queued */
		pnfs_put_layout_hdr(lo);
		nfs_iput_and_deactive(inode);
	}
	return ret;
}
904*4882a593Smuzhiyun 
/*
 * Destroy all layouts held by @clp that belong to the filesystem @fsid.
 * The scan restarts from the top whenever the helper had to drop
 * clp->cl_lock (signalled by a non-zero return).
 */
int
pnfs_destroy_layouts_byfsid(struct nfs_client *clp,
		struct nfs_fsid *fsid,
		bool is_recall)
{
	struct nfs_server *server;
	LIST_HEAD(layout_list);

	spin_lock(&clp->cl_lock);
	rcu_read_lock();
restart:
	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
		if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0)
			continue;
		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
				server,
				&layout_list) != 0)
			goto restart;
	}
	rcu_read_unlock();
	spin_unlock(&clp->cl_lock);

	if (list_empty(&layout_list))
		return 0;
	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}
931*4882a593Smuzhiyun 
/*
 * Destroy all layouts held by @clp across every superblock.  As with
 * pnfs_destroy_layouts_byfsid(), the scan restarts whenever the helper
 * had to drop clp->cl_lock.
 */
int
pnfs_destroy_layouts_byclid(struct nfs_client *clp,
		bool is_recall)
{
	struct nfs_server *server;
	LIST_HEAD(layout_list);

	spin_lock(&clp->cl_lock);
	rcu_read_lock();
restart:
	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
		if (pnfs_layout_bulk_destroy_byserver_locked(clp,
					server,
					&layout_list) != 0)
			goto restart;
	}
	rcu_read_unlock();
	spin_unlock(&clp->cl_lock);

	if (list_empty(&layout_list))
		return 0;
	return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall);
}
955*4882a593Smuzhiyun 
/*
 * Called by the state manager to remove all layouts established under an
 * expired lease.
 */
void
pnfs_destroy_all_layouts(struct nfs_client *clp)
{
	/* Invalidate and purge the client's device ID cache first */
	nfs4_deviceid_mark_client_invalid(clp);
	nfs4_deviceid_purge_client(clp);

	pnfs_destroy_layouts_byclid(clp, false);
}
968*4882a593Smuzhiyun 
/*
 * Replace the layout credential if @cred differs from the cached one:
 * swap in a new reference with xchg() and drop the old one.
 */
static void
pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred)
{
	const struct cred *old;

	if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) {
		old = xchg(&lo->plh_lc_cred, get_cred(cred));
		put_cred(old);
	}
}
979*4882a593Smuzhiyun 
/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
			const struct cred *cred, bool update_barrier)
{
	u32 oldseq = be32_to_cpu(lo->plh_stateid.seqid);
	u32 newseq = be32_to_cpu(new->seqid);

	if (!pnfs_layout_is_valid(lo)) {
		/* (Re)initialise an invalid layout from scratch */
		pnfs_set_layout_cred(lo, cred);
		nfs4_stateid_copy(&lo->plh_stateid, new);
		lo->plh_barrier = newseq;
		pnfs_clear_layoutreturn_info(lo);
		clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
		return;
	}

	/* Only accept a stateid whose seqid moves forward */
	if (pnfs_seqid_is_newer(newseq, oldseq))
		nfs4_stateid_copy(&lo->plh_stateid, new);

	if (update_barrier) {
		pnfs_barrier_update(lo, newseq);
		return;
	}
	/*
	 * Because of wraparound, we want to keep the barrier
	 * "close" to the current seqids. We really only want to
	 * get here from a layoutget call.
	 */
	if (atomic_read(&lo->plh_outstanding) == 1)
		 pnfs_barrier_update(lo, be32_to_cpu(lo->plh_stateid.seqid));
}
1012*4882a593Smuzhiyun 
1013*4882a593Smuzhiyun static bool
pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr * lo,const nfs4_stateid * stateid)1014*4882a593Smuzhiyun pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
1015*4882a593Smuzhiyun 		const nfs4_stateid *stateid)
1016*4882a593Smuzhiyun {
1017*4882a593Smuzhiyun 	u32 seqid = be32_to_cpu(stateid->seqid);
1018*4882a593Smuzhiyun 
1019*4882a593Smuzhiyun 	return lo->plh_barrier && pnfs_seqid_is_newer(lo->plh_barrier, seqid);
1020*4882a593Smuzhiyun }
1021*4882a593Smuzhiyun 
/* Is the layout currently refusing new LAYOUTGET requests? */
static bool
pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
{
	return lo->plh_block_lgets ||
		test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
}
1029*4882a593Smuzhiyun 
1030*4882a593Smuzhiyun static struct nfs_server *
pnfs_find_server(struct inode * inode,struct nfs_open_context * ctx)1031*4882a593Smuzhiyun pnfs_find_server(struct inode *inode, struct nfs_open_context *ctx)
1032*4882a593Smuzhiyun {
1033*4882a593Smuzhiyun 	struct nfs_server *server;
1034*4882a593Smuzhiyun 
1035*4882a593Smuzhiyun 	if (inode) {
1036*4882a593Smuzhiyun 		server = NFS_SERVER(inode);
1037*4882a593Smuzhiyun 	} else {
1038*4882a593Smuzhiyun 		struct dentry *parent_dir = dget_parent(ctx->dentry);
1039*4882a593Smuzhiyun 		server = NFS_SERVER(parent_dir->d_inode);
1040*4882a593Smuzhiyun 		dput(parent_dir);
1041*4882a593Smuzhiyun 	}
1042*4882a593Smuzhiyun 	return server;
1043*4882a593Smuzhiyun }
1044*4882a593Smuzhiyun 
/*
 * nfs4_free_pages - free a partially or fully populated page array
 * @pages: array of page pointers (may be NULL)
 * @size: number of slots in the array
 *
 * Frees pages up to the first NULL slot (the array may have been only
 * partially filled by nfs4_alloc_pages()), then frees the array itself.
 */
static void nfs4_free_pages(struct page **pages, size_t size)
{
	/* size_t index: avoid signed/unsigned mismatch against @size */
	size_t i;

	if (!pages)
		return;

	for (i = 0; i < size; i++) {
		if (!pages[i])
			break;
		__free_page(pages[i]);
	}
	kfree(pages);
}
1059*4882a593Smuzhiyun 
/*
 * nfs4_alloc_pages - allocate an array of @size freshly-allocated pages
 * @size: number of pages wanted
 * @gfp_flags: allocation flags
 *
 * Returns the array, or NULL on failure (any partially allocated pages
 * are released via nfs4_free_pages()).
 */
static struct page **nfs4_alloc_pages(size_t size, gfp_t gfp_flags)
{
	struct page **pages;
	/* size_t index: avoid signed/unsigned mismatch against @size */
	size_t i;

	pages = kmalloc_array(size, sizeof(struct page *), gfp_flags);
	if (!pages) {
		dprintk("%s: can't alloc array of %zu pages\n", __func__, size);
		return NULL;
	}

	for (i = 0; i < size; i++) {
		pages[i] = alloc_page(gfp_flags);
		if (!pages[i]) {
			dprintk("%s: failed to allocate page\n", __func__);
			/* Only the first i slots are populated */
			nfs4_free_pages(pages, i);
			return NULL;
		}
	}

	return pages;
}
1082*4882a593Smuzhiyun 
/*
 * Allocate and initialise the LAYOUTGET arguments for @ino (or, when @ino
 * is NULL, for the server of the open context's parent directory — see
 * pnfs_find_server()).
 *
 * Returns NULL on allocation failure.  The caller owns the result and
 * must release it with pnfs_layoutget_free().
 */
static struct nfs4_layoutget *
pnfs_alloc_init_layoutget_args(struct inode *ino,
	   struct nfs_open_context *ctx,
	   const nfs4_stateid *stateid,
	   const struct pnfs_layout_range *range,
	   gfp_t gfp_flags)
{
	struct nfs_server *server = pnfs_find_server(ino, ctx);
	size_t max_reply_sz = server->pnfs_curr_ld->max_layoutget_response;
	size_t max_pages = max_response_pages(server);
	struct nfs4_layoutget *lgp;

	dprintk("--> %s\n", __func__);

	lgp = kzalloc(sizeof(*lgp), gfp_flags);
	if (lgp == NULL)
		return NULL;

	/* Honour the layout driver's cap on the reply buffer size */
	if (max_reply_sz) {
		size_t npages = (max_reply_sz + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages < max_pages)
			max_pages = npages;
	}

	lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags);
	if (!lgp->args.layout.pages) {
		kfree(lgp);
		return NULL;
	}
	lgp->args.layout.pglen = max_pages * PAGE_SIZE;
	lgp->res.layoutp = &lgp->args.layout;

	/* Don't confuse uninitialised result and success */
	lgp->res.status = -NFS4ERR_DELAY;

	lgp->args.minlength = PAGE_SIZE;
	if (lgp->args.minlength > range->length)
		lgp->args.minlength = range->length;
	if (ino) {
		loff_t i_size = i_size_read(ino);

		/* For reads, never require more than the file contains */
		if (range->iomode == IOMODE_READ) {
			if (range->offset >= i_size)
				lgp->args.minlength = 0;
			else if (i_size - range->offset < lgp->args.minlength)
				lgp->args.minlength = i_size - range->offset;
		}
	}
	lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
	pnfs_copy_range(&lgp->args.range, range);
	lgp->args.type = server->pnfs_curr_ld->id;
	lgp->args.inode = ino;
	lgp->args.ctx = get_nfs_open_context(ctx);
	nfs4_stateid_copy(&lgp->args.stateid, stateid);
	lgp->gfp_flags = gfp_flags;
	lgp->cred = ctx->cred;
	return lgp;
}
1141*4882a593Smuzhiyun 
/*
 * Release a LAYOUTGET argument structure: the reply pages, the layout
 * header reference held for the inode (if any), and the open context.
 */
void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
{
	/* pglen was set to max_pages * PAGE_SIZE at allocation time */
	size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE;

	nfs4_free_pages(lgp->args.layout.pages, max_pages);
	if (lgp->args.inode)
		pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout);
	put_nfs_open_context(lgp->args.ctx);
	kfree(lgp);
}
1152*4882a593Smuzhiyun 
/*
 * If a layoutcommit is pending on @inode, cancel it: clear
 * NFS_INO_LAYOUTCOMMIT and drop the per-lseg LAYOUTCOMMIT reference,
 * queueing any lseg that hits zero on @head for freeing by the caller.
 */
static void pnfs_clear_layoutcommit(struct inode *inode,
		struct list_head *head)
{
	struct nfs_inode *nfsi = NFS_I(inode);
	struct pnfs_layout_segment *lseg, *tmp;

	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
		return;
	list_for_each_entry_safe(lseg, tmp, &nfsi->layout->plh_segs, pls_list) {
		if (!test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
			continue;
		pnfs_lseg_dec_and_remove_zero(lseg, head);
	}
}
1167*4882a593Smuzhiyun 
/*
 * Finalise a completed LAYOUTRETURN: free the lsegs covered by @range and
 * update the layout stateid to @stateid.  When @stateid is NULL the whole
 * layout is invalidated instead.  Always clears the layoutreturn waitbit.
 */
void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
		const nfs4_stateid *arg_stateid,
		const struct pnfs_layout_range *range,
		const nfs4_stateid *stateid)
{
	struct inode *inode = lo->plh_inode;
	LIST_HEAD(freeme);

	spin_lock(&inode->i_lock);
	/* Ignore replies that no longer match the current layout stateid */
	if (!pnfs_layout_is_valid(lo) || !arg_stateid ||
	    !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid))
		goto out_unlock;
	if (stateid) {
		u32 seq = be32_to_cpu(arg_stateid->seqid);

		pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
		pnfs_free_returned_lsegs(lo, &freeme, range, seq);
		pnfs_set_layout_stateid(lo, stateid, NULL, true);
	} else
		pnfs_mark_layout_stateid_invalid(lo, &freeme);
out_unlock:
	pnfs_clear_layoutreturn_waitbit(lo);
	spin_unlock(&inode->i_lock);
	pnfs_free_lseg_list(&freeme);

}
1194*4882a593Smuzhiyun 
/*
 * Try to take the layoutreturn lock bit on @lo and snapshot the stateid,
 * credential and iomode the LAYOUTRETURN should use.
 *
 * Returns false if layoutgets are outstanding or a layoutreturn is
 * already in flight.  On success a layout header reference is taken;
 * the caller is responsible for sending the return and dropping it.
 */
static bool
pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
		nfs4_stateid *stateid,
		const struct cred **cred,
		enum pnfs_iomode *iomode)
{
	/* Serialise LAYOUTGET/LAYOUTRETURN */
	if (atomic_read(&lo->plh_outstanding) != 0)
		return false;
	if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
		return false;
	set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
	pnfs_get_layout_hdr(lo);
	nfs4_stateid_copy(stateid, &lo->plh_stateid);
	*cred = get_cred(lo->plh_lc_cred);
	if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
		/* Use the seqid/iomode recorded when the return was requested */
		if (lo->plh_return_seq != 0)
			stateid->seqid = cpu_to_be32(lo->plh_return_seq);
		if (iomode != NULL)
			*iomode = lo->plh_return_iomode;
		pnfs_clear_layoutreturn_info(lo);
	} else if (iomode != NULL)
		*iomode = IOMODE_ANY;
	pnfs_barrier_update(lo, be32_to_cpu(stateid->seqid));
	return true;
}
1221*4882a593Smuzhiyun 
1222*4882a593Smuzhiyun static void
pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args * args,struct pnfs_layout_hdr * lo,const nfs4_stateid * stateid,enum pnfs_iomode iomode)1223*4882a593Smuzhiyun pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
1224*4882a593Smuzhiyun 		struct pnfs_layout_hdr *lo,
1225*4882a593Smuzhiyun 		const nfs4_stateid *stateid,
1226*4882a593Smuzhiyun 		enum pnfs_iomode iomode)
1227*4882a593Smuzhiyun {
1228*4882a593Smuzhiyun 	struct inode *inode = lo->plh_inode;
1229*4882a593Smuzhiyun 
1230*4882a593Smuzhiyun 	args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id;
1231*4882a593Smuzhiyun 	args->inode = inode;
1232*4882a593Smuzhiyun 	args->range.iomode = iomode;
1233*4882a593Smuzhiyun 	args->range.offset = 0;
1234*4882a593Smuzhiyun 	args->range.length = NFS4_MAX_UINT64;
1235*4882a593Smuzhiyun 	args->layout = lo;
1236*4882a593Smuzhiyun 	nfs4_stateid_copy(&args->stateid, stateid);
1237*4882a593Smuzhiyun }
1238*4882a593Smuzhiyun 
/*
 * Allocate and transmit a LAYOUTRETURN for @lo using the given stateid
 * and iomode.  Consumes the credential reference in *pcred and the layout
 * header reference taken by pnfs_prepare_layoutreturn().
 *
 * On allocation failure the layoutreturn wait bit is cleared and both
 * references are dropped here, so the caller need not clean up.
 */
static int
pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
		       const nfs4_stateid *stateid,
		       const struct cred **pcred,
		       enum pnfs_iomode iomode,
		       bool sync)
{
	struct inode *ino = lo->plh_inode;
	struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
	struct nfs4_layoutreturn *lrp;
	const struct cred *cred = *pcred;
	int status = 0;

	/* Ownership of the cred reference transfers to this function */
	*pcred = NULL;
	lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
	if (unlikely(lrp == NULL)) {
		status = -ENOMEM;
		/* Undo the state set up by pnfs_prepare_layoutreturn() */
		spin_lock(&ino->i_lock);
		pnfs_clear_layoutreturn_waitbit(lo);
		spin_unlock(&ino->i_lock);
		put_cred(cred);
		pnfs_put_layout_hdr(lo);
		goto out;
	}

	pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
	lrp->args.ld_private = &lrp->ld_private;
	lrp->clp = NFS_SERVER(ino)->nfs_client;
	lrp->cred = cred;
	/* Give the layout driver a chance to add private return data */
	if (ld->prepare_layoutreturn)
		ld->prepare_layoutreturn(&lrp->args);

	status = nfs4_proc_layoutreturn(lrp, sync);
out:
	dprintk("<-- %s status: %d\n", __func__, status);
	return status;
}
1276*4882a593Smuzhiyun 
1277*4882a593Smuzhiyun static bool
pnfs_layout_segments_returnable(struct pnfs_layout_hdr * lo,enum pnfs_iomode iomode,u32 seq)1278*4882a593Smuzhiyun pnfs_layout_segments_returnable(struct pnfs_layout_hdr *lo,
1279*4882a593Smuzhiyun 				enum pnfs_iomode iomode,
1280*4882a593Smuzhiyun 				u32 seq)
1281*4882a593Smuzhiyun {
1282*4882a593Smuzhiyun 	struct pnfs_layout_range recall_range = {
1283*4882a593Smuzhiyun 		.length = NFS4_MAX_UINT64,
1284*4882a593Smuzhiyun 		.iomode = iomode,
1285*4882a593Smuzhiyun 	};
1286*4882a593Smuzhiyun 	return pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
1287*4882a593Smuzhiyun 					       &recall_range, seq) != -EBUSY;
1288*4882a593Smuzhiyun }
1289*4882a593Smuzhiyun 
1290*4882a593Smuzhiyun /* Return true if layoutreturn is needed */
1291*4882a593Smuzhiyun static bool
pnfs_layout_need_return(struct pnfs_layout_hdr * lo)1292*4882a593Smuzhiyun pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
1293*4882a593Smuzhiyun {
1294*4882a593Smuzhiyun 	if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
1295*4882a593Smuzhiyun 		return false;
1296*4882a593Smuzhiyun 	return pnfs_layout_segments_returnable(lo, lo->plh_return_iomode,
1297*4882a593Smuzhiyun 					       lo->plh_return_seq);
1298*4882a593Smuzhiyun }
1299*4882a593Smuzhiyun 
/*
 * If a layoutreturn has been requested for @lo and its segments are
 * returnable, send an asynchronous LAYOUTRETURN before the final layout
 * header reference is dropped.
 */
static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct inode *inode= lo->plh_inode;

	/* Cheap unlocked check first; re-checked under i_lock below */
	if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
		return;
	spin_lock(&inode->i_lock);
	if (pnfs_layout_need_return(lo)) {
		const struct cred *cred;
		nfs4_stateid stateid;
		enum pnfs_iomode iomode;
		bool send;

		send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
		spin_unlock(&inode->i_lock);
		if (send) {
			/* Send an async layoutreturn so we dont deadlock */
			pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
		}
	} else
		spin_unlock(&inode->i_lock);
}
1322*4882a593Smuzhiyun 
/*
 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
 * when the layout segment list is empty.
 *
 * Note that a pnfs_layout_hdr can exist with an empty layout segment
 * list when LAYOUTGET has failed, or when LAYOUTGET succeeded, but the
 * deviceid is marked invalid.
 */
int
_pnfs_return_layout(struct inode *ino)
{
	struct pnfs_layout_hdr *lo = NULL;
	struct nfs_inode *nfsi = NFS_I(ino);
	/* Return everything: any iomode, whole file */
	struct pnfs_layout_range range = {
		.iomode		= IOMODE_ANY,
		.offset		= 0,
		.length		= NFS4_MAX_UINT64,
	};
	LIST_HEAD(tmp_list);
	const struct cred *cred;
	nfs4_stateid stateid;
	int status = 0;
	bool send, valid_layout;

	dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino);

	spin_lock(&ino->i_lock);
	lo = nfsi->layout;
	if (!lo) {
		spin_unlock(&ino->i_lock);
		dprintk("NFS: %s no layout to return\n", __func__);
		goto out;
	}
	/* Reference matched in nfs4_layoutreturn_release */
	pnfs_get_layout_hdr(lo);
	/* Is there an outstanding layoutreturn ? */
	if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
		/* Wait for it to finish; drop i_lock while sleeping */
		spin_unlock(&ino->i_lock);
		if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
					TASK_UNINTERRUPTIBLE))
			goto out_put_layout_hdr;
		spin_lock(&ino->i_lock);
	}
	valid_layout = pnfs_layout_is_valid(lo);
	pnfs_clear_layoutcommit(ino, &tmp_list);
	pnfs_mark_matching_lsegs_return(lo, &tmp_list, &range, 0);

	/* Let the layout driver adjust the range if it implements the hook */
	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range)
		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range);

	/* Don't send a LAYOUTRETURN if list was initially empty */
	if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) ||
			!valid_layout) {
		spin_unlock(&ino->i_lock);
		dprintk("NFS: %s no layout segments to return\n", __func__);
		goto out_wait_layoutreturn;
	}

	send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
	spin_unlock(&ino->i_lock);
	if (send)
		status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true);
out_wait_layoutreturn:
	/* Make sure any layoutreturn still in flight has completed */
	wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, TASK_UNINTERRUPTIBLE);
out_put_layout_hdr:
	pnfs_free_lseg_list(&tmp_list);
	pnfs_put_layout_hdr(lo);
out:
	dprintk("<-- %s status: %d\n", __func__, status);
	return status;
}
1394*4882a593Smuzhiyun 
/*
 * Flush outstanding writes, perform a LAYOUTCOMMIT, and then return the
 * layout.  New layoutgets are blocked for the duration via
 * plh_block_lgets.  Returns 0 on success or a negative errno.
 */
int
pnfs_commit_and_return_layout(struct inode *inode)
{
	struct pnfs_layout_hdr *lo;
	int ret;

	spin_lock(&inode->i_lock);
	lo = NFS_I(inode)->layout;
	if (lo == NULL) {
		/* Nothing to commit or return */
		spin_unlock(&inode->i_lock);
		return 0;
	}
	pnfs_get_layout_hdr(lo);
	/* Block new layoutgets and read/write to ds */
	lo->plh_block_lgets++;
	spin_unlock(&inode->i_lock);
	filemap_fdatawait(inode->i_mapping);
	ret = pnfs_layoutcommit_inode(inode, true);
	if (ret == 0)
		ret = _pnfs_return_layout(inode);
	spin_lock(&inode->i_lock);
	lo->plh_block_lgets--;
	spin_unlock(&inode->i_lock);
	pnfs_put_layout_hdr(lo);
	return ret;
}
1421*4882a593Smuzhiyun 
/*
 * Decide whether a "return on close" (ROC) layoutreturn can be compounded
 * into the CLOSE/DELEGRETURN that is about to be sent with @cred.
 *
 * Returns true and fills in @args/@res if the caller should send the
 * compounded layoutreturn; completion then happens via pnfs_roc_done()
 * and pnfs_roc_release() (which drops the layout header reference taken
 * here).  Returns false otherwise; if a layoutreturn had already been
 * prepared but the credentials do not match, it is sent standalone.
 */
bool pnfs_roc(struct inode *ino,
		struct nfs4_layoutreturn_args *args,
		struct nfs4_layoutreturn_res *res,
		const struct cred *cred)
{
	struct nfs_inode *nfsi = NFS_I(ino);
	struct nfs_open_context *ctx;
	struct nfs4_state *state;
	struct pnfs_layout_hdr *lo;
	struct pnfs_layout_segment *lseg, *next;
	const struct cred *lc_cred;
	nfs4_stateid stateid;
	enum pnfs_iomode iomode = 0;
	bool layoutreturn = false, roc = false;
	bool skip_read = false;

	if (!nfs_have_layout(ino))
		return false;
retry:
	rcu_read_lock();
	spin_lock(&ino->i_lock);
	lo = nfsi->layout;
	if (!lo || !pnfs_layout_is_valid(lo) ||
	    test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
		lo = NULL;
		goto out_noroc;
	}
	pnfs_get_layout_hdr(lo);
	if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) {
		/* Another layoutreturn is in flight: wait and start over */
		spin_unlock(&ino->i_lock);
		rcu_read_unlock();
		wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
				TASK_UNINTERRUPTIBLE);
		pnfs_put_layout_hdr(lo);
		goto retry;
	}

	/* no roc if we hold a delegation */
	if (nfs4_check_delegation(ino, FMODE_READ)) {
		if (nfs4_check_delegation(ino, FMODE_WRITE))
			goto out_noroc;
		skip_read = true;
	}

	list_for_each_entry_rcu(ctx, &nfsi->open_files, list) {
		state = ctx->state;
		if (state == NULL)
			continue;
		/* Don't return layout if there is open file state */
		if (state->state & FMODE_WRITE)
			goto out_noroc;
		if (state->state & FMODE_READ)
			skip_read = true;
	}


	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) {
		if (skip_read && lseg->pls_range.iomode == IOMODE_READ)
			continue;
		/* If we are sending layoutreturn, invalidate all valid lsegs */
		if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags))
			continue;
		/*
		 * Note: mark lseg for return so pnfs_layout_remove_lseg
		 * doesn't invalidate the layout for us.
		 */
		set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
		if (!mark_lseg_invalid(lseg, &lo->plh_return_segs))
			continue;
		pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0);
	}

	if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
		goto out_noroc;

	/* ROC in two conditions:
	 * 1. there are ROC lsegs
	 * 2. we don't send layoutreturn
	 */
	/* lo ref dropped in pnfs_roc_release() */
	layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode);
	/* If the creds don't match, we can't compound the layoutreturn */
	if (!layoutreturn || cred_fscmp(cred, lc_cred) != 0)
		goto out_noroc;

	roc = layoutreturn;
	pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
	res->lrs_present = 0;
	layoutreturn = false;
	put_cred(lc_cred);

out_noroc:
	spin_unlock(&ino->i_lock);
	rcu_read_unlock();
	pnfs_layoutcommit_inode(ino, true);
	if (roc) {
		struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
		if (ld->prepare_layoutreturn)
			ld->prepare_layoutreturn(args);
		pnfs_put_layout_hdr(lo);
		return true;
	}
	/* A layoutreturn was prepared but cannot be compounded: send it now */
	if (layoutreturn)
		pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true);
	pnfs_put_layout_hdr(lo);
	return false;
}
1529*4882a593Smuzhiyun 
/*
 * Post-process the result of a compounded ROC layoutreturn.
 *
 * Returns 0 if handling is complete (clearing *argpp/*respp when done),
 * -EAGAIN if the caller should resend with a refreshed stateid.  *ret may
 * be rewritten to translate errors for the caller.
 */
int pnfs_roc_done(struct rpc_task *task, struct nfs4_layoutreturn_args **argpp,
		  struct nfs4_layoutreturn_res **respp, int *ret)
{
	struct nfs4_layoutreturn_args *arg = *argpp;
	int retval = -EAGAIN;

	if (!arg)
		return 0;
	/* Handle Layoutreturn errors */
	switch (*ret) {
	case 0:
		retval = 0;
		break;
	case -NFS4ERR_NOMATCHING_LAYOUT:
		/* Was there an RPC level error? If not, retry */
		if (task->tk_rpc_status == 0)
			break;
		/* If the call was not sent, let caller handle it */
		if (!RPC_WAS_SENT(task))
			return 0;
		/*
		 * Otherwise, assume the call succeeded and
		 * that we need to release the layout
		 */
		*ret = 0;
		(*respp)->lrs_present = 0;
		retval = 0;
		break;
	case -NFS4ERR_DELAY:
		/* Let the caller handle the retry */
		*ret = -NFS4ERR_NOMATCHING_LAYOUT;
		return 0;
	case -NFS4ERR_OLD_STATEID:
		/* Try to refresh the stateid and resend */
		if (!nfs4_layout_refresh_old_stateid(&arg->stateid,
						     &arg->range, arg->inode))
			break;
		*ret = -NFS4ERR_NOMATCHING_LAYOUT;
		return -EAGAIN;
	}
	/* Handling finished: detach args/res from the caller */
	*argpp = NULL;
	*respp = NULL;
	return retval;
}
1573*4882a593Smuzhiyun 
/*
 * Release the state set up by pnfs_roc() once the compounded
 * layoutreturn has completed with status @ret: free returned segments,
 * release any layout-driver private data, and drop the layout header
 * reference taken in pnfs_roc().
 */
void pnfs_roc_release(struct nfs4_layoutreturn_args *args,
		struct nfs4_layoutreturn_res *res,
		int ret)
{
	struct pnfs_layout_hdr *lo = args->layout;
	struct inode *inode = args->inode;
	const nfs4_stateid *arg_stateid = NULL;
	const nfs4_stateid *res_stateid = NULL;
	struct nfs4_xdr_opaque_data *ld_private = args->ld_private;

	switch (ret) {
	case -NFS4ERR_NOMATCHING_LAYOUT:
		/* Server knows no such layout: re-request the return if the
		 * layout is still valid and the stateid still matches.
		 */
		spin_lock(&inode->i_lock);
		if (pnfs_layout_is_valid(lo) &&
		    nfs4_stateid_match_other(&args->stateid, &lo->plh_stateid))
			pnfs_set_plh_return_info(lo, args->range.iomode, 0);
		spin_unlock(&inode->i_lock);
		break;
	case 0:
		if (res->lrs_present)
			res_stateid = &res->stateid;
		fallthrough;
	default:
		arg_stateid = &args->stateid;
	}
	trace_nfs4_layoutreturn_on_close(args->inode, &args->stateid, ret);
	pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range,
			res_stateid);
	if (ld_private && ld_private->ops && ld_private->ops->free)
		ld_private->ops->free(ld_private);
	pnfs_put_layout_hdr(lo);
}
1606*4882a593Smuzhiyun 
pnfs_wait_on_layoutreturn(struct inode * ino,struct rpc_task * task)1607*4882a593Smuzhiyun bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
1608*4882a593Smuzhiyun {
1609*4882a593Smuzhiyun 	struct nfs_inode *nfsi = NFS_I(ino);
1610*4882a593Smuzhiyun         struct pnfs_layout_hdr *lo;
1611*4882a593Smuzhiyun         bool sleep = false;
1612*4882a593Smuzhiyun 
1613*4882a593Smuzhiyun 	/* we might not have grabbed lo reference. so need to check under
1614*4882a593Smuzhiyun 	 * i_lock */
1615*4882a593Smuzhiyun         spin_lock(&ino->i_lock);
1616*4882a593Smuzhiyun         lo = nfsi->layout;
1617*4882a593Smuzhiyun         if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1618*4882a593Smuzhiyun                 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
1619*4882a593Smuzhiyun                 sleep = true;
1620*4882a593Smuzhiyun 	}
1621*4882a593Smuzhiyun         spin_unlock(&ino->i_lock);
1622*4882a593Smuzhiyun         return sleep;
1623*4882a593Smuzhiyun }
1624*4882a593Smuzhiyun 
1625*4882a593Smuzhiyun /*
1626*4882a593Smuzhiyun  * Compare two layout segments for sorting into layout cache.
1627*4882a593Smuzhiyun  * We want to preferentially return RW over RO layouts, so ensure those
1628*4882a593Smuzhiyun  * are seen first.
1629*4882a593Smuzhiyun  */
1630*4882a593Smuzhiyun static s64
pnfs_lseg_range_cmp(const struct pnfs_layout_range * l1,const struct pnfs_layout_range * l2)1631*4882a593Smuzhiyun pnfs_lseg_range_cmp(const struct pnfs_layout_range *l1,
1632*4882a593Smuzhiyun 	   const struct pnfs_layout_range *l2)
1633*4882a593Smuzhiyun {
1634*4882a593Smuzhiyun 	s64 d;
1635*4882a593Smuzhiyun 
1636*4882a593Smuzhiyun 	/* high offset > low offset */
1637*4882a593Smuzhiyun 	d = l1->offset - l2->offset;
1638*4882a593Smuzhiyun 	if (d)
1639*4882a593Smuzhiyun 		return d;
1640*4882a593Smuzhiyun 
1641*4882a593Smuzhiyun 	/* short length > long length */
1642*4882a593Smuzhiyun 	d = l2->length - l1->length;
1643*4882a593Smuzhiyun 	if (d)
1644*4882a593Smuzhiyun 		return d;
1645*4882a593Smuzhiyun 
1646*4882a593Smuzhiyun 	/* read > read/write */
1647*4882a593Smuzhiyun 	return (int)(l1->iomode == IOMODE_READ) - (int)(l2->iomode == IOMODE_READ);
1648*4882a593Smuzhiyun }
1649*4882a593Smuzhiyun 
1650*4882a593Smuzhiyun static bool
pnfs_lseg_range_is_after(const struct pnfs_layout_range * l1,const struct pnfs_layout_range * l2)1651*4882a593Smuzhiyun pnfs_lseg_range_is_after(const struct pnfs_layout_range *l1,
1652*4882a593Smuzhiyun 		const struct pnfs_layout_range *l2)
1653*4882a593Smuzhiyun {
1654*4882a593Smuzhiyun 	return pnfs_lseg_range_cmp(l1, l2) > 0;
1655*4882a593Smuzhiyun }
1656*4882a593Smuzhiyun 
/*
 * Default "do_merge" callback for pnfs_generic_layout_insert_lseg():
 * layout segments are never coalesced.
 */
static bool
pnfs_lseg_no_merge(struct pnfs_layout_segment *lseg,
		struct pnfs_layout_segment *old)
{
	return false;
}
1663*4882a593Smuzhiyun 
/**
 * pnfs_generic_layout_insert_lseg - insert a layout segment into a layout
 * @lo: layout header to insert into
 * @lseg: segment being inserted
 * @is_after: comparator; true if the first range sorts after the second
 * @do_merge: returns true if @lseg supersedes an existing segment, which
 *	is then invalidated and moved to @free_me
 * @free_me: list receiving any segments displaced by the merge
 *
 * Keeps lo->plh_segs ordered per @is_after and takes a layout header
 * reference on behalf of the inserted segment.
 */
void
pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo,
		   struct pnfs_layout_segment *lseg,
		   bool (*is_after)(const struct pnfs_layout_range *,
			   const struct pnfs_layout_range *),
		   bool (*do_merge)(struct pnfs_layout_segment *,
			   struct pnfs_layout_segment *),
		   struct list_head *free_me)
{
	struct pnfs_layout_segment *lp, *tmp;

	dprintk("%s:Begin\n", __func__);

	list_for_each_entry_safe(lp, tmp, &lo->plh_segs, pls_list) {
		/* Only valid segments participate in ordering/merging */
		if (test_bit(NFS_LSEG_VALID, &lp->pls_flags) == 0)
			continue;
		if (do_merge(lseg, lp)) {
			mark_lseg_invalid(lp, free_me);
			continue;
		}
		if (is_after(&lseg->pls_range, &lp->pls_range))
			continue;
		/* Found the first segment that sorts after us: insert before it */
		list_add_tail(&lseg->pls_list, &lp->pls_list);
		dprintk("%s: inserted lseg %p "
			"iomode %d offset %llu length %llu before "
			"lp %p iomode %d offset %llu length %llu\n",
			__func__, lseg, lseg->pls_range.iomode,
			lseg->pls_range.offset, lseg->pls_range.length,
			lp, lp->pls_range.iomode, lp->pls_range.offset,
			lp->pls_range.length);
		goto out;
	}
	list_add_tail(&lseg->pls_list, &lo->plh_segs);
	dprintk("%s: inserted lseg %p "
		"iomode %d offset %llu length %llu at tail\n",
		__func__, lseg, lseg->pls_range.iomode,
		lseg->pls_range.offset, lseg->pls_range.length);
out:
	pnfs_get_layout_hdr(lo);

	dprintk("%s:Return\n", __func__);
}
EXPORT_SYMBOL_GPL(pnfs_generic_layout_insert_lseg);
1707*4882a593Smuzhiyun 
1708*4882a593Smuzhiyun static void
pnfs_layout_insert_lseg(struct pnfs_layout_hdr * lo,struct pnfs_layout_segment * lseg,struct list_head * free_me)1709*4882a593Smuzhiyun pnfs_layout_insert_lseg(struct pnfs_layout_hdr *lo,
1710*4882a593Smuzhiyun 		   struct pnfs_layout_segment *lseg,
1711*4882a593Smuzhiyun 		   struct list_head *free_me)
1712*4882a593Smuzhiyun {
1713*4882a593Smuzhiyun 	struct inode *inode = lo->plh_inode;
1714*4882a593Smuzhiyun 	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
1715*4882a593Smuzhiyun 
1716*4882a593Smuzhiyun 	if (ld->add_lseg != NULL)
1717*4882a593Smuzhiyun 		ld->add_lseg(lo, lseg, free_me);
1718*4882a593Smuzhiyun 	else
1719*4882a593Smuzhiyun 		pnfs_generic_layout_insert_lseg(lo, lseg,
1720*4882a593Smuzhiyun 				pnfs_lseg_range_is_after,
1721*4882a593Smuzhiyun 				pnfs_lseg_no_merge,
1722*4882a593Smuzhiyun 				free_me);
1723*4882a593Smuzhiyun }
1724*4882a593Smuzhiyun 
1725*4882a593Smuzhiyun static struct pnfs_layout_hdr *
alloc_init_layout_hdr(struct inode * ino,struct nfs_open_context * ctx,gfp_t gfp_flags)1726*4882a593Smuzhiyun alloc_init_layout_hdr(struct inode *ino,
1727*4882a593Smuzhiyun 		      struct nfs_open_context *ctx,
1728*4882a593Smuzhiyun 		      gfp_t gfp_flags)
1729*4882a593Smuzhiyun {
1730*4882a593Smuzhiyun 	struct pnfs_layout_hdr *lo;
1731*4882a593Smuzhiyun 
1732*4882a593Smuzhiyun 	lo = pnfs_alloc_layout_hdr(ino, gfp_flags);
1733*4882a593Smuzhiyun 	if (!lo)
1734*4882a593Smuzhiyun 		return NULL;
1735*4882a593Smuzhiyun 	refcount_set(&lo->plh_refcount, 1);
1736*4882a593Smuzhiyun 	INIT_LIST_HEAD(&lo->plh_layouts);
1737*4882a593Smuzhiyun 	INIT_LIST_HEAD(&lo->plh_segs);
1738*4882a593Smuzhiyun 	INIT_LIST_HEAD(&lo->plh_return_segs);
1739*4882a593Smuzhiyun 	INIT_LIST_HEAD(&lo->plh_bulk_destroy);
1740*4882a593Smuzhiyun 	lo->plh_inode = ino;
1741*4882a593Smuzhiyun 	lo->plh_lc_cred = get_cred(ctx->cred);
1742*4882a593Smuzhiyun 	lo->plh_flags |= 1 << NFS_LAYOUT_INVALID_STID;
1743*4882a593Smuzhiyun 	return lo;
1744*4882a593Smuzhiyun }
1745*4882a593Smuzhiyun 
/*
 * Return the inode's layout header, allocating one if none exists yet.
 *
 * The i_lock is dropped around the allocation, so another task may
 * install a header in the meantime; if we lose that race our freshly
 * allocated header is freed and the winner's is used.  Always returns
 * with a reference held (either the initial refcount of the new header
 * or an extra reference on the existing one).
 */
static struct pnfs_layout_hdr *
pnfs_find_alloc_layout(struct inode *ino,
		       struct nfs_open_context *ctx,
		       gfp_t gfp_flags)
	__releases(&ino->i_lock)
	__acquires(&ino->i_lock)
{
	struct nfs_inode *nfsi = NFS_I(ino);
	struct pnfs_layout_hdr *new = NULL;

	dprintk("%s Begin ino=%p layout=%p\n", __func__, ino, nfsi->layout);

	if (nfsi->layout != NULL)
		goto out_existing;
	spin_unlock(&ino->i_lock);
	new = alloc_init_layout_hdr(ino, ctx, gfp_flags);
	spin_lock(&ino->i_lock);

	if (likely(nfsi->layout == NULL)) {	/* Won the race? */
		nfsi->layout = new;
		return new;
	} else if (new != NULL)
		pnfs_free_layout_hdr(new);
out_existing:
	pnfs_get_layout_hdr(nfsi->layout);
	return nfsi->layout;
}
1773*4882a593Smuzhiyun 
1774*4882a593Smuzhiyun /*
1775*4882a593Smuzhiyun  * iomode matching rules:
1776*4882a593Smuzhiyun  * iomode	lseg	strict match
1777*4882a593Smuzhiyun  *                      iomode
1778*4882a593Smuzhiyun  * -----	-----	------ -----
1779*4882a593Smuzhiyun  * ANY		READ	N/A    true
1780*4882a593Smuzhiyun  * ANY		RW	N/A    true
1781*4882a593Smuzhiyun  * RW		READ	N/A    false
1782*4882a593Smuzhiyun  * RW		RW	N/A    true
1783*4882a593Smuzhiyun  * READ		READ	N/A    true
1784*4882a593Smuzhiyun  * READ		RW	true   false
1785*4882a593Smuzhiyun  * READ		RW	false  true
1786*4882a593Smuzhiyun  */
1787*4882a593Smuzhiyun static bool
pnfs_lseg_range_match(const struct pnfs_layout_range * ls_range,const struct pnfs_layout_range * range,bool strict_iomode)1788*4882a593Smuzhiyun pnfs_lseg_range_match(const struct pnfs_layout_range *ls_range,
1789*4882a593Smuzhiyun 		 const struct pnfs_layout_range *range,
1790*4882a593Smuzhiyun 		 bool strict_iomode)
1791*4882a593Smuzhiyun {
1792*4882a593Smuzhiyun 	struct pnfs_layout_range range1;
1793*4882a593Smuzhiyun 
1794*4882a593Smuzhiyun 	if ((range->iomode == IOMODE_RW &&
1795*4882a593Smuzhiyun 	     ls_range->iomode != IOMODE_RW) ||
1796*4882a593Smuzhiyun 	    (range->iomode != ls_range->iomode &&
1797*4882a593Smuzhiyun 	     strict_iomode) ||
1798*4882a593Smuzhiyun 	    !pnfs_lseg_range_intersecting(ls_range, range))
1799*4882a593Smuzhiyun 		return false;
1800*4882a593Smuzhiyun 
1801*4882a593Smuzhiyun 	/* range1 covers only the first byte in the range */
1802*4882a593Smuzhiyun 	range1 = *range;
1803*4882a593Smuzhiyun 	range1.length = 1;
1804*4882a593Smuzhiyun 	return pnfs_lseg_range_contained(ls_range, &range1);
1805*4882a593Smuzhiyun }
1806*4882a593Smuzhiyun 
/*
 * Look up a cached layout segment covering @range in @lo's segment list.
 * Returns a referenced segment on success or NULL if none matches.
 * Called with the inode's i_lock held (see pnfs_update_layout()).
 */
static struct pnfs_layout_segment *
pnfs_find_lseg(struct pnfs_layout_hdr *lo,
		struct pnfs_layout_range *range,
		bool strict_iomode)
{
	struct pnfs_layout_segment *lseg, *ret = NULL;

	dprintk("%s:Begin\n", __func__);

	/* Return the first valid segment whose range satisfies the request */
	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
		    pnfs_lseg_range_match(&lseg->pls_range, range,
					  strict_iomode)) {
			ret = pnfs_get_lseg(lseg);
			break;
		}
	}

	dprintk("%s:Return lseg %p ref %d\n",
		__func__, ret, ret ? refcount_read(&ret->pls_refcount) : 0);
	return ret;
}
1832*4882a593Smuzhiyun 
1833*4882a593Smuzhiyun /*
1834*4882a593Smuzhiyun  * Use mdsthreshold hints set at each OPEN to determine if I/O should go
1835*4882a593Smuzhiyun  * to the MDS or over pNFS
1836*4882a593Smuzhiyun  *
1837*4882a593Smuzhiyun  * The nfs_inode read_io and write_io fields are cumulative counters reset
1838*4882a593Smuzhiyun  * when there are no layout segments. Note that in pnfs_update_layout iomode
1839*4882a593Smuzhiyun  * is set to IOMODE_READ for a READ request, and set to IOMODE_RW for a
1840*4882a593Smuzhiyun  * WRITE request.
1841*4882a593Smuzhiyun  *
1842*4882a593Smuzhiyun  * A return of true means use MDS I/O.
1843*4882a593Smuzhiyun  *
1844*4882a593Smuzhiyun  * From rfc 5661:
1845*4882a593Smuzhiyun  * If a file's size is smaller than the file size threshold, data accesses
1846*4882a593Smuzhiyun  * SHOULD be sent to the metadata server.  If an I/O request has a length that
1847*4882a593Smuzhiyun  * is below the I/O size threshold, the I/O SHOULD be sent to the metadata
1848*4882a593Smuzhiyun  * server.  If both file size and I/O size are provided, the client SHOULD
1849*4882a593Smuzhiyun  * reach or exceed  both thresholds before sending its read or write
1850*4882a593Smuzhiyun  * requests to the data server.
1851*4882a593Smuzhiyun  */
static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
				     struct inode *ino, int iomode)
{
	struct nfs4_threshold *t = ctx->mdsthreshold;
	struct nfs_inode *nfsi = NFS_I(ino);
	loff_t fsize = i_size_read(ino);
	bool size = false, size_set = false, io = false, io_set = false, ret = false;

	/* No threshold hints were supplied at OPEN: never redirect to MDS */
	if (t == NULL)
		return ret;

	dprintk("%s bm=0x%x rd_sz=%llu wr_sz=%llu rd_io=%llu wr_io=%llu\n",
		__func__, t->bm, t->rd_sz, t->wr_sz, t->rd_io_sz, t->wr_io_sz);

	switch (iomode) {
	case IOMODE_READ:
		/* File-size threshold for reads */
		if (t->bm & THRESHOLD_RD) {
			dprintk("%s fsize %llu\n", __func__, fsize);
			size_set = true;
			if (fsize < t->rd_sz)
				size = true;
		}
		/* Cumulative read I/O threshold */
		if (t->bm & THRESHOLD_RD_IO) {
			dprintk("%s nfsi->read_io %llu\n", __func__,
				nfsi->read_io);
			io_set = true;
			if (nfsi->read_io < t->rd_io_sz)
				io = true;
		}
		break;
	case IOMODE_RW:
		/* File-size threshold for writes */
		if (t->bm & THRESHOLD_WR) {
			dprintk("%s fsize %llu\n", __func__, fsize);
			size_set = true;
			if (fsize < t->wr_sz)
				size = true;
		}
		/* Cumulative write I/O threshold */
		if (t->bm & THRESHOLD_WR_IO) {
			dprintk("%s nfsi->write_io %llu\n", __func__,
				nfsi->write_io);
			io_set = true;
			if (nfsi->write_io < t->wr_io_sz)
				io = true;
		}
		break;
	}
	/*
	 * If both thresholds were provided, both must be under-threshold to
	 * pick the MDS (per rfc5661); otherwise either one suffices.
	 */
	if (size_set && io_set) {
		if (size && io)
			ret = true;
	} else if (size || io)
		ret = true;

	dprintk("<-- %s size %d io %d ret %d\n", __func__, size, io, ret);
	return ret;
}
1907*4882a593Smuzhiyun 
/*
 * Wait for an outstanding layoutreturn to complete before retrying a
 * layoutget. Returns 0 once NFS_LAYOUT_RETURN is clear, or a negative
 * errno if the killable wait is interrupted.
 */
static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
{
	/*
	 * send layoutcommit as it can hold up layoutreturn due to lseg
	 * reference
	 */
	pnfs_layoutcommit_inode(lo->plh_inode, false);
	return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
				   nfs_wait_bit_killable,
				   TASK_KILLABLE);
}
1919*4882a593Smuzhiyun 
/* Account the start of an outstanding LAYOUTGET against the layout header. */
static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
{
	atomic_inc(&lo->plh_outstanding);
}
1924*4882a593Smuzhiyun 
/*
 * Account the completion of an outstanding LAYOUTGET. If this was the
 * last one and NFS_LAYOUT_DRAIN is set, clear the bit and wake up any
 * tasks waiting for the layoutgets to drain.
 */
static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
{
	if (atomic_dec_and_test(&lo->plh_outstanding) &&
	    test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags))
		wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);
}
1931*4882a593Smuzhiyun 
/* True while the serialized "first layoutget" for this layout is in flight. */
static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo)
{
	return test_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags);
}
1936*4882a593Smuzhiyun 
/*
 * Release the NFS_LAYOUT_FIRST_LAYOUTGET serialization bit and wake up
 * anyone waiting on it in pnfs_update_layout().
 */
static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
{
	unsigned long *bitlock = &lo->plh_flags;

	clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
	/* Make the cleared bit visible before waking the bit waitqueue */
	smp_mb__after_atomic();
	wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
}
1945*4882a593Smuzhiyun 
/*
 * Hash the layout header onto the per-server layouts list. Idempotent:
 * only the caller that first sets NFS_LAYOUT_HASHED links the header.
 */
static void _add_to_server_list(struct pnfs_layout_hdr *lo,
				struct nfs_server *server)
{
	if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
		struct nfs_client *clp = server->nfs_client;

		/* The lo must be on the clp list if there is any
		 * chance of a CB_LAYOUTRECALL(FILE) coming in.
		 */
		spin_lock(&clp->cl_lock);
		list_add_tail_rcu(&lo->plh_layouts, &server->layouts);
		spin_unlock(&clp->cl_lock);
	}
}
1960*4882a593Smuzhiyun 
/*
 * Layout segment is retrieved from the server if not cached.
 * The appropriate layout segment is referenced and returned to the caller.
 */
struct pnfs_layout_segment *
pnfs_update_layout(struct inode *ino,
		   struct nfs_open_context *ctx,
		   loff_t pos,
		   u64 count,
		   enum pnfs_iomode iomode,
		   bool strict_iomode,
		   gfp_t gfp_flags)
{
	struct pnfs_layout_range arg = {
		.iomode = iomode,
		.offset = pos,
		.length = count,
	};
	unsigned pg_offset;
	struct nfs_server *server = NFS_SERVER(ino);
	struct nfs_client *clp = server->nfs_client;
	struct pnfs_layout_hdr *lo = NULL;
	struct pnfs_layout_segment *lseg = NULL;
	struct nfs4_layoutget *lgp;
	nfs4_stateid stateid;
	long timeout = 0;
	unsigned long giveup = jiffies + (clp->cl_lease_time << 1);
	bool first;

	/* pNFS not enabled on this superblock: fall back to MDS I/O */
	if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
				 PNFS_UPDATE_LAYOUT_NO_PNFS);
		goto out;
	}

	/* mdsthreshold OPEN hints direct this I/O to the MDS */
	if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
				 PNFS_UPDATE_LAYOUT_MDSTHRESH);
		goto out;
	}

lookup_again:
	/* Make sure the client lease is valid before (re)trying */
	lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp));
	if (IS_ERR(lseg))
		goto out;
	first = false;
	spin_lock(&ino->i_lock);
	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
	if (lo == NULL) {
		spin_unlock(&ino->i_lock);
		lseg = ERR_PTR(-ENOMEM);
		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
				 PNFS_UPDATE_LAYOUT_NOMEM);
		goto out;
	}

	/* Do we even need to bother with this? */
	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
				 PNFS_UPDATE_LAYOUT_BULK_RECALL);
		dprintk("%s matches recall, use MDS\n", __func__);
		goto out_unlock;
	}

	/* if LAYOUTGET already failed once we don't try again */
	if (pnfs_layout_io_test_failed(lo, iomode)) {
		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
				 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
		goto out_unlock;
	}

	/*
	 * If the layout segment list is empty, but there are outstanding
	 * layoutget calls, then they might be subject to a layoutrecall.
	 */
	if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
	    atomic_read(&lo->plh_outstanding) != 0) {
		spin_unlock(&ino->i_lock);
		/* Wait for the outstanding layoutgets to drain, then redo */
		lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN,
					   TASK_KILLABLE));
		if (IS_ERR(lseg))
			goto out_put_layout_hdr;
		pnfs_put_layout_hdr(lo);
		goto lookup_again;
	}

	/*
	 * Because we free lsegs when sending LAYOUTRETURN, we need to wait
	 * for LAYOUTRETURN.
	 */
	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
		spin_unlock(&ino->i_lock);
		dprintk("%s wait for layoutreturn\n", __func__);
		lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo));
		if (!IS_ERR(lseg)) {
			pnfs_put_layout_hdr(lo);
			dprintk("%s retrying\n", __func__);
			trace_pnfs_update_layout(ino, pos, count, iomode, lo,
						 lseg,
						 PNFS_UPDATE_LAYOUT_RETRY);
			goto lookup_again;
		}
		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
					 PNFS_UPDATE_LAYOUT_RETURN);
		goto out_put_layout_hdr;
	}

	/* Fast path: a cached layout segment already covers the range */
	lseg = pnfs_find_lseg(lo, &arg, strict_iomode);
	if (lseg) {
		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
				PNFS_UPDATE_LAYOUT_FOUND_CACHED);
		goto out_unlock;
	}

	/*
	 * Choose a stateid for the LAYOUTGET. If we don't have a layout
	 * stateid, or it has been invalidated, then we must use the open
	 * stateid.
	 */
	if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) {
		int status;

		/*
		 * The first layoutget for the file. Need to serialize per
		 * RFC 5661 Errata 3208.
		 */
		if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
				     &lo->plh_flags)) {
			/* Another task owns the first layoutget: wait it out */
			spin_unlock(&ino->i_lock);
			lseg = ERR_PTR(wait_on_bit(&lo->plh_flags,
						NFS_LAYOUT_FIRST_LAYOUTGET,
						TASK_KILLABLE));
			if (IS_ERR(lseg))
				goto out_put_layout_hdr;
			pnfs_put_layout_hdr(lo);
			dprintk("%s retrying\n", __func__);
			goto lookup_again;
		}

		spin_unlock(&ino->i_lock);
		first = true;
		status = nfs4_select_rw_stateid(ctx->state,
					iomode == IOMODE_RW ? FMODE_WRITE : FMODE_READ,
					NULL, &stateid, NULL);
		if (status != 0) {
			lseg = ERR_PTR(status);
			trace_pnfs_update_layout(ino, pos, count,
					iomode, lo, lseg,
					PNFS_UPDATE_LAYOUT_INVALID_OPEN);
			/* Open stateid is unusable: recover it, then retry */
			nfs4_schedule_stateid_recovery(server, ctx->state);
			pnfs_clear_first_layoutget(lo);
			pnfs_put_layout_hdr(lo);
			goto lookup_again;
		}
		spin_lock(&ino->i_lock);
	} else {
		nfs4_stateid_copy(&stateid, &lo->plh_stateid);
	}

	if (pnfs_layoutgets_blocked(lo)) {
		trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
				PNFS_UPDATE_LAYOUT_BLOCKED);
		goto out_unlock;
	}
	nfs_layoutget_begin(lo);
	spin_unlock(&ino->i_lock);

	_add_to_server_list(lo, server);

	/* Expand the requested range to whole pages */
	pg_offset = arg.offset & ~PAGE_MASK;
	if (pg_offset) {
		arg.offset -= pg_offset;
		arg.length += pg_offset;
	}
	if (arg.length != NFS4_MAX_UINT64)
		arg.length = PAGE_ALIGN(arg.length);

	lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags);
	if (!lgp) {
		lseg = ERR_PTR(-ENOMEM);
		trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL,
					 PNFS_UPDATE_LAYOUT_NOMEM);
		nfs_layoutget_end(lo);
		goto out_put_layout_hdr;
	}

	lseg = nfs4_proc_layoutget(lgp, &timeout);
	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
				 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
	nfs_layoutget_end(lo);
	if (IS_ERR(lseg)) {
		switch(PTR_ERR(lseg)) {
		case -EBUSY:
			/* Keep retrying until twice the lease time elapses */
			if (time_after(jiffies, giveup))
				lseg = NULL;
			break;
		case -ERECALLCONFLICT:
		case -EAGAIN:
			break;
		case -ENODATA:
			/* The server returned NFS4ERR_LAYOUTUNAVAILABLE */
			pnfs_layout_set_fail_bit(
				lo, pnfs_iomode_to_fail_bit(iomode));
			lseg = NULL;
			goto out_put_layout_hdr;
		default:
			if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
				pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
				lseg = NULL;
			}
			goto out_put_layout_hdr;
		}
		if (lseg) {
			/* Transient error: drop our state and retry */
			if (first)
				pnfs_clear_first_layoutget(lo);
			trace_pnfs_update_layout(ino, pos, count,
				iomode, lo, lseg, PNFS_UPDATE_LAYOUT_RETRY);
			pnfs_put_layout_hdr(lo);
			goto lookup_again;
		}
	} else {
		pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
	}

out_put_layout_hdr:
	if (first)
		pnfs_clear_first_layoutget(lo);
	trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
				 PNFS_UPDATE_LAYOUT_EXIT);
	pnfs_put_layout_hdr(lo);
out:
	dprintk("%s: inode %s/%llu pNFS layout segment %s for "
			"(%s, offset: %llu, length: %llu)\n",
			__func__, ino->i_sb->s_id,
			(unsigned long long)NFS_FILEID(ino),
			IS_ERR_OR_NULL(lseg) ? "not found" : "found",
			iomode==IOMODE_RW ?  "read/write" : "read-only",
			(unsigned long long)pos,
			(unsigned long long)count);
	return lseg;
out_unlock:
	spin_unlock(&ino->i_lock);
	goto out_put_layout_hdr;
}
EXPORT_SYMBOL_GPL(pnfs_update_layout);
2206*4882a593Smuzhiyun 
2207*4882a593Smuzhiyun static bool
pnfs_sanity_check_layout_range(struct pnfs_layout_range * range)2208*4882a593Smuzhiyun pnfs_sanity_check_layout_range(struct pnfs_layout_range *range)
2209*4882a593Smuzhiyun {
2210*4882a593Smuzhiyun 	switch (range->iomode) {
2211*4882a593Smuzhiyun 	case IOMODE_READ:
2212*4882a593Smuzhiyun 	case IOMODE_RW:
2213*4882a593Smuzhiyun 		break;
2214*4882a593Smuzhiyun 	default:
2215*4882a593Smuzhiyun 		return false;
2216*4882a593Smuzhiyun 	}
2217*4882a593Smuzhiyun 	if (range->offset == NFS4_MAX_UINT64)
2218*4882a593Smuzhiyun 		return false;
2219*4882a593Smuzhiyun 	if (range->length == 0)
2220*4882a593Smuzhiyun 		return false;
2221*4882a593Smuzhiyun 	if (range->length != NFS4_MAX_UINT64 &&
2222*4882a593Smuzhiyun 	    range->length > NFS4_MAX_UINT64 - range->offset)
2223*4882a593Smuzhiyun 		return false;
2224*4882a593Smuzhiyun 	return true;
2225*4882a593Smuzhiyun }
2226*4882a593Smuzhiyun 
/*
 * Return a referenced layout header usable for a speculative
 * LAYOUTGET-on-OPEN: the layout stateid must still be invalid and no
 * layoutreturn or blocking condition may be pending. On success the
 * NFS_LAYOUT_FIRST_LAYOUTGET serialization bit is held and an
 * outstanding layoutget is accounted. Returns NULL otherwise.
 */
static struct pnfs_layout_hdr *
_pnfs_grab_empty_layout(struct inode *ino, struct nfs_open_context *ctx)
{
	struct pnfs_layout_hdr *lo;

	spin_lock(&ino->i_lock);
	lo = pnfs_find_alloc_layout(ino, ctx, GFP_KERNEL);
	if (!lo)
		goto out_unlock;
	/* Only usable while the layout stateid is still invalid */
	if (!test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags))
		goto out_unlock;
	if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
		goto out_unlock;
	if (pnfs_layoutgets_blocked(lo))
		goto out_unlock;
	/* Serialize the first layoutget per RFC 5661 Errata 3208 */
	if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET, &lo->plh_flags))
		goto out_unlock;
	nfs_layoutget_begin(lo);
	spin_unlock(&ino->i_lock);
	_add_to_server_list(lo, NFS_SERVER(ino));
	return lo;

out_unlock:
	spin_unlock(&ino->i_lock);
	pnfs_put_layout_hdr(lo);
	return NULL;
}
2254*4882a593Smuzhiyun 
/*
 * Prepare a LAYOUTGET to ride in the OPEN compound when the target
 * inode is already known (open state exists). On success, the lgopen
 * args/results are hooked into the OPEN arguments.
 */
static void _lgopen_prepare_attached(struct nfs4_opendata *data,
				     struct nfs_open_context *ctx)
{
	struct inode *ino = data->dentry->d_inode;
	/* Request the whole file, RW iff the OPEN asked for write access */
	struct pnfs_layout_range rng = {
		.iomode = (data->o_arg.fmode & FMODE_WRITE) ?
			  IOMODE_RW: IOMODE_READ,
		.offset = 0,
		.length = NFS4_MAX_UINT64,
	};
	struct nfs4_layoutget *lgp;
	struct pnfs_layout_hdr *lo;

	/* Heuristic: don't send layoutget if we have cached data */
	if (rng.iomode == IOMODE_READ &&
	   (i_size_read(ino) == 0 || ino->i_mapping->nrpages != 0))
		return;

	lo = _pnfs_grab_empty_layout(ino, ctx);
	if (!lo)
		return;
	lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &current_stateid,
					     &rng, GFP_KERNEL);
	if (!lgp) {
		/* Undo the state taken in _pnfs_grab_empty_layout() */
		pnfs_clear_first_layoutget(lo);
		nfs_layoutget_end(lo);
		pnfs_put_layout_hdr(lo);
		return;
	}
	data->lgp = lgp;
	data->o_arg.lg_args = &lgp->args;
	data->o_res.lg_res = &lgp->res;
}
2288*4882a593Smuzhiyun 
/*
 * Prepare a LAYOUTGET to ride in the OPEN compound when no open state
 * exists yet: the args carry a NULL inode and are bound to the actual
 * inode in pnfs_parse_lgopen() once the OPEN reply arrives.
 */
static void _lgopen_prepare_floating(struct nfs4_opendata *data,
				     struct nfs_open_context *ctx)
{
	/* Request the whole file, RW iff the OPEN asked for write access */
	struct pnfs_layout_range rng = {
		.iomode = (data->o_arg.fmode & FMODE_WRITE) ?
			  IOMODE_RW: IOMODE_READ,
		.offset = 0,
		.length = NFS4_MAX_UINT64,
	};
	struct nfs4_layoutget *lgp;

	lgp = pnfs_alloc_init_layoutget_args(NULL, ctx, &current_stateid,
					     &rng, GFP_KERNEL);
	if (!lgp)
		return;
	data->lgp = lgp;
	data->o_arg.lg_args = &lgp->args;
	data->o_res.lg_res = &lgp->res;
}
2308*4882a593Smuzhiyun 
/*
 * Decide whether to attach a speculative LAYOUTGET to an OPEN compound,
 * and prepare the arguments if so. Requires the layout driver to
 * advertise PNFS_LAYOUTGET_ON_OPEN and the server NFS_CAP_LGOPEN.
 */
void pnfs_lgopen_prepare(struct nfs4_opendata *data,
			 struct nfs_open_context *ctx)
{
	struct nfs_server *server = NFS_SERVER(data->dir->d_inode);

	if (!(pnfs_enabled_sb(server) &&
	      server->pnfs_curr_ld->flags & PNFS_LAYOUTGET_ON_OPEN))
		return;
	/* Could check on max_ops, but currently hardcoded high enough */
	if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN))
		return;
	/* Attached if open state already exists, floating otherwise */
	if (data->state)
		_lgopen_prepare_attached(data, ctx);
	else
		_lgopen_prepare_floating(data, ctx);
}
2325*4882a593Smuzhiyun 
/*
 * Process the result of a LAYOUTGET that was piggy-backed on an OPEN.
 * On certain errors, disable further lgopen attempts for this server;
 * on success, hand the layout to pnfs_layout_process() and drop the
 * extra segment reference (the segment stays cached on the layout).
 */
void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp,
		       struct nfs_open_context *ctx)
{
	struct pnfs_layout_hdr *lo;
	struct pnfs_layout_segment *lseg;
	struct nfs_server *srv = NFS_SERVER(ino);
	u32 iomode;

	if (!lgp)
		return;
	dprintk("%s: entered with status %i\n", __func__, lgp->res.status);
	if (lgp->res.status) {
		switch (lgp->res.status) {
		default:
			break;
		/*
		 * Halt lgopen attempts if the server doesn't recognise
		 * the "current stateid" value, the layout type, or the
		 * layoutget operation as being valid.
		 * Also if it complains about too many ops in the compound
		 * or of the request/reply being too big.
		 */
		case -NFS4ERR_BAD_STATEID:
		case -NFS4ERR_NOTSUPP:
		case -NFS4ERR_REP_TOO_BIG:
		case -NFS4ERR_REP_TOO_BIG_TO_CACHE:
		case -NFS4ERR_REQ_TOO_BIG:
		case -NFS4ERR_TOO_MANY_OPS:
		case -NFS4ERR_UNKNOWN_LAYOUTTYPE:
			srv->caps &= ~NFS_CAP_LGOPEN;
		}
		return;
	}
	/* Floating request: bind the reply to the newly-opened inode */
	if (!lgp->args.inode) {
		lo = _pnfs_grab_empty_layout(ino, ctx);
		if (!lo)
			return;
		lgp->args.inode = ino;
	} else
		lo = NFS_I(lgp->args.inode)->layout;

	lseg = pnfs_layout_process(lgp);
	if (!IS_ERR(lseg)) {
		iomode = lgp->args.range.iomode;
		pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
		pnfs_put_lseg(lseg);
	}
}
2374*4882a593Smuzhiyun 
nfs4_lgopen_release(struct nfs4_layoutget * lgp)2375*4882a593Smuzhiyun void nfs4_lgopen_release(struct nfs4_layoutget *lgp)
2376*4882a593Smuzhiyun {
2377*4882a593Smuzhiyun 	if (lgp != NULL) {
2378*4882a593Smuzhiyun 		struct inode *inode = lgp->args.inode;
2379*4882a593Smuzhiyun 		if (inode) {
2380*4882a593Smuzhiyun 			struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
2381*4882a593Smuzhiyun 			pnfs_clear_first_layoutget(lo);
2382*4882a593Smuzhiyun 			nfs_layoutget_end(lo);
2383*4882a593Smuzhiyun 		}
2384*4882a593Smuzhiyun 		pnfs_layoutget_free(lgp);
2385*4882a593Smuzhiyun 	}
2386*4882a593Smuzhiyun }
2387*4882a593Smuzhiyun 
/*
 * Process a LAYOUTGET reply: have the layout driver allocate an lseg
 * from the returned layout blob, validate the returned stateid against
 * the cached layout state, and insert the new segment into the layout.
 * Returns a referenced lseg, or an ERR_PTR; ERR_PTR(-EAGAIN) tells the
 * caller (pnfs_update_layout) to retry the layoutget.
 */
struct pnfs_layout_segment *
pnfs_layout_process(struct nfs4_layoutget *lgp)
{
	struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout;
	struct nfs4_layoutget_res *res = &lgp->res;
	struct pnfs_layout_segment *lseg;
	struct inode *ino = lo->plh_inode;
	LIST_HEAD(free_me);

	/* Reject syntactically invalid ranges from the server */
	if (!pnfs_sanity_check_layout_range(&res->range))
		return ERR_PTR(-EINVAL);

	/* Inject layout blob into I/O device driver */
	lseg = NFS_SERVER(ino)->pnfs_curr_ld->alloc_lseg(lo, res, lgp->gfp_flags);
	if (IS_ERR_OR_NULL(lseg)) {
		if (!lseg)
			lseg = ERR_PTR(-ENOMEM);

		dprintk("%s: Could not allocate layout: error %ld\n",
		       __func__, PTR_ERR(lseg));
		return lseg;
	}

	pnfs_init_lseg(lo, lseg, &res->range, &res->stateid);

	spin_lock(&ino->i_lock);
	if (pnfs_layoutgets_blocked(lo)) {
		dprintk("%s forget reply due to state\n", __func__);
		goto out_forget;
	}

	/* Drop the reply if outstanding layoutgets are being drained,
	 * unless this is the serialized first layoutget.
	 */
	if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
	    !pnfs_is_first_layoutget(lo))
		goto out_forget;

	if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
		/* existing state ID, make sure the sequence number matches. */
		if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
			if (!pnfs_layout_is_valid(lo))
				lo->plh_barrier = 0;
			dprintk("%s forget reply due to sequence\n", __func__);
			goto out_forget;
		}
		pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false);
	} else if (pnfs_layout_is_valid(lo)) {
		/*
		 * We got an entirely new state ID.  Mark all segments for the
		 * inode invalid, and retry the layoutget
		 */
		struct pnfs_layout_range range = {
			.iomode = IOMODE_ANY,
			.length = NFS4_MAX_UINT64,
		};
		pnfs_set_plh_return_info(lo, IOMODE_ANY, 0);
		pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
						&range, 0);
		goto out_forget;
	} else {
		/* We have a completely new layout */
		pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
	}

	pnfs_get_lseg(lseg);
	pnfs_layout_insert_lseg(lo, lseg, &free_me);


	if (res->return_on_close)
		set_bit(NFS_LSEG_ROC, &lseg->pls_flags);

	spin_unlock(&ino->i_lock);
	pnfs_free_lseg_list(&free_me);
	return lseg;

out_forget:
	spin_unlock(&ino->i_lock);
	lseg->pls_layout = lo;
	NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg);
	pnfs_free_lseg_list(&free_me);
	return ERR_PTR(-EAGAIN);
}
2468*4882a593Smuzhiyun 
2469*4882a593Smuzhiyun /**
2470*4882a593Smuzhiyun  * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
2471*4882a593Smuzhiyun  * @lo: pointer to layout header
2472*4882a593Smuzhiyun  * @tmp_list: list header to be used with pnfs_free_lseg_list()
2473*4882a593Smuzhiyun  * @return_range: describe layout segment ranges to be returned
2474*4882a593Smuzhiyun  * @seq: stateid seqid to match
2475*4882a593Smuzhiyun  *
2476*4882a593Smuzhiyun  * This function is mainly intended for use by layoutrecall. It attempts
2477*4882a593Smuzhiyun  * to free the layout segment immediately, or else to mark it for return
2478*4882a593Smuzhiyun  * as soon as its reference count drops to zero.
2479*4882a593Smuzhiyun  *
2480*4882a593Smuzhiyun  * Returns
2481*4882a593Smuzhiyun  * - 0: a layoutreturn needs to be scheduled.
2482*4882a593Smuzhiyun  * - EBUSY: there are layout segments that are still in use.
2483*4882a593Smuzhiyun  * - ENOENT: there are no layout segments that need to be returned.
2484*4882a593Smuzhiyun  */
int
pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
				struct list_head *tmp_list,
				const struct pnfs_layout_range *return_range,
				u32 seq)
{
	struct pnfs_layout_segment *lseg, *next;
	int remaining = 0;

	dprintk("%s:Begin lo %p\n", __func__, lo);

	/* Caller must hold the inode lock to serialize list access. */
	assert_spin_locked(&lo->plh_inode->i_lock);

	/* If a layoutreturn is already pending, queue freed segments on
	 * plh_return_segs so they ride along with it. */
	if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
		tmp_list = &lo->plh_return_segs;

	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
		if (pnfs_match_lseg_recall(lseg, return_range, seq)) {
			dprintk("%s: marking lseg %p iomode %d "
				"offset %llu length %llu\n", __func__,
				lseg, lseg->pls_range.iomode,
				lseg->pls_range.offset,
				lseg->pls_range.length);
			/* Segments already flagged for layoutreturn must end
			 * up on plh_return_segs, not the caller's list.
			 * Note: this retargets tmp_list for the remainder of
			 * the walk as well. */
			if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
				tmp_list = &lo->plh_return_segs;
			/* mark_lseg_invalid() moves the segment immediately
			 * when it is no longer in use. */
			if (mark_lseg_invalid(lseg, tmp_list))
				continue;
			/* Still referenced: defer the return to the final
			 * pnfs_put_lseg(). */
			remaining++;
			set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
		}

	if (remaining) {
		pnfs_set_plh_return_info(lo, return_range->iomode, seq);
		return -EBUSY;
	}

	if (!list_empty(&lo->plh_return_segs)) {
		pnfs_set_plh_return_info(lo, return_range->iomode, seq);
		return 0;
	}

	return -ENOENT;
}
2528*4882a593Smuzhiyun 
/*
 * Mark all segments matching @range for return and, if none of them is
 * still in use, send the LAYOUTRETURN right away.  Otherwise flush any
 * outstanding writes so the busy segments lose their references and the
 * return is triggered from the final pnfs_put_lseg().
 */
static void
pnfs_mark_layout_for_return(struct inode *inode,
			    const struct pnfs_layout_range *range)
{
	struct pnfs_layout_hdr *lo;
	bool return_now = false;

	spin_lock(&inode->i_lock);
	lo = NFS_I(inode)->layout;
	/* Nothing to return if the layout has already been invalidated. */
	if (!pnfs_layout_is_valid(lo)) {
		spin_unlock(&inode->i_lock);
		return;
	}
	pnfs_set_plh_return_info(lo, range->iomode, 0);
	/*
	 * mark all matching lsegs so that we are sure to have no live
	 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
	 * for how it works.
	 */
	if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) {
		const struct cred *cred;
		nfs4_stateid stateid;
		enum pnfs_iomode iomode;

		/* No segment is in use: the return can be sent now. */
		return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
		spin_unlock(&inode->i_lock);
		if (return_now)
			pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
	} else {
		/* Busy segments remain: commit outstanding writes so their
		 * references are dropped. */
		spin_unlock(&inode->i_lock);
		nfs_commit_inode(inode, 0);
	}
}
2562*4882a593Smuzhiyun 
pnfs_error_mark_layout_for_return(struct inode * inode,struct pnfs_layout_segment * lseg)2563*4882a593Smuzhiyun void pnfs_error_mark_layout_for_return(struct inode *inode,
2564*4882a593Smuzhiyun 				       struct pnfs_layout_segment *lseg)
2565*4882a593Smuzhiyun {
2566*4882a593Smuzhiyun 	struct pnfs_layout_range range = {
2567*4882a593Smuzhiyun 		.iomode = lseg->pls_range.iomode,
2568*4882a593Smuzhiyun 		.offset = 0,
2569*4882a593Smuzhiyun 		.length = NFS4_MAX_UINT64,
2570*4882a593Smuzhiyun 	};
2571*4882a593Smuzhiyun 
2572*4882a593Smuzhiyun 	pnfs_mark_layout_for_return(inode, &range);
2573*4882a593Smuzhiyun }
2574*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
2575*4882a593Smuzhiyun 
2576*4882a593Smuzhiyun static bool
pnfs_layout_can_be_returned(struct pnfs_layout_hdr * lo)2577*4882a593Smuzhiyun pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo)
2578*4882a593Smuzhiyun {
2579*4882a593Smuzhiyun 	return pnfs_layout_is_valid(lo) &&
2580*4882a593Smuzhiyun 		!test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) &&
2581*4882a593Smuzhiyun 		!test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
2582*4882a593Smuzhiyun }
2583*4882a593Smuzhiyun 
2584*4882a593Smuzhiyun static struct pnfs_layout_segment *
pnfs_find_first_lseg(struct pnfs_layout_hdr * lo,const struct pnfs_layout_range * range,enum pnfs_iomode iomode)2585*4882a593Smuzhiyun pnfs_find_first_lseg(struct pnfs_layout_hdr *lo,
2586*4882a593Smuzhiyun 		     const struct pnfs_layout_range *range,
2587*4882a593Smuzhiyun 		     enum pnfs_iomode iomode)
2588*4882a593Smuzhiyun {
2589*4882a593Smuzhiyun 	struct pnfs_layout_segment *lseg;
2590*4882a593Smuzhiyun 
2591*4882a593Smuzhiyun 	list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
2592*4882a593Smuzhiyun 		if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
2593*4882a593Smuzhiyun 			continue;
2594*4882a593Smuzhiyun 		if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
2595*4882a593Smuzhiyun 			continue;
2596*4882a593Smuzhiyun 		if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY)
2597*4882a593Smuzhiyun 			continue;
2598*4882a593Smuzhiyun 		if (pnfs_lseg_range_intersecting(&lseg->pls_range, range))
2599*4882a593Smuzhiyun 			return lseg;
2600*4882a593Smuzhiyun 	}
2601*4882a593Smuzhiyun 	return NULL;
2602*4882a593Smuzhiyun }
2603*4882a593Smuzhiyun 
/* Find open file states whose mode matches that of the range */
static bool
pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
				 const struct pnfs_layout_range *range)
{
	struct list_head *head;
	struct nfs_open_context *ctx;
	fmode_t mode = 0;

	/* Bail out if the layout cannot be returned at all, or if no
	 * segment actually matches the requested range/iomode. */
	if (!pnfs_layout_can_be_returned(lo) ||
	    !pnfs_find_first_lseg(lo, range, range->iomode))
		return false;

	/* Accumulate the union of open modes over all open contexts. */
	head = &NFS_I(lo->plh_inode)->open_files;
	list_for_each_entry_rcu(ctx, head, list) {
		if (ctx->state)
			mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE);
	}

	/* Drop the mode bits this layout does not serve; it is unused
	 * iff no remaining open mode needs it. */
	switch (range->iomode) {
	default:
		break;
	case IOMODE_READ:
		/* A read-only layout cannot serve writers anyway. */
		mode &= ~FMODE_WRITE;
		break;
	case IOMODE_RW:
		/* Readers are covered by a separate READ layout, if any. */
		if (pnfs_find_first_lseg(lo, range, IOMODE_READ))
			mode &= ~FMODE_READ;
	}
	return mode == 0;
}
2635*4882a593Smuzhiyun 
/*
 * Walk @server's layout list and return every layout that is no longer
 * needed by any open file.  @data is the pnfs_layout_range to match.
 * Always returns 0 (so nfs_client_for_each_server() keeps iterating).
 */
static int
pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data)
{
	const struct pnfs_layout_range *range = data;
	struct pnfs_layout_hdr *lo;
	struct inode *inode;
restart:
	rcu_read_lock();
	list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
		/* Cheap lockless pre-filter; re-checked under the lock. */
		if (!pnfs_layout_can_be_returned(lo) ||
		    test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
			continue;
		inode = lo->plh_inode;
		spin_lock(&inode->i_lock);
		if (!pnfs_should_return_unused_layout(lo, range)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		spin_unlock(&inode->i_lock);
		/* Pin the inode so it cannot vanish once we drop RCU. */
		inode = pnfs_grab_inode_layout_hdr(lo);
		if (!inode)
			continue;
		rcu_read_unlock();
		pnfs_mark_layout_for_return(inode, range);
		iput(inode);
		cond_resched();
		/* The list may have changed while unlocked: rescan it. */
		goto restart;
	}
	rcu_read_unlock();
	return 0;
}
2667*4882a593Smuzhiyun 
2668*4882a593Smuzhiyun void
pnfs_layout_return_unused_byclid(struct nfs_client * clp,enum pnfs_iomode iomode)2669*4882a593Smuzhiyun pnfs_layout_return_unused_byclid(struct nfs_client *clp,
2670*4882a593Smuzhiyun 				 enum pnfs_iomode iomode)
2671*4882a593Smuzhiyun {
2672*4882a593Smuzhiyun 	struct pnfs_layout_range range = {
2673*4882a593Smuzhiyun 		.iomode = iomode,
2674*4882a593Smuzhiyun 		.offset = 0,
2675*4882a593Smuzhiyun 		.length = NFS4_MAX_UINT64,
2676*4882a593Smuzhiyun 	};
2677*4882a593Smuzhiyun 
2678*4882a593Smuzhiyun 	nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver,
2679*4882a593Smuzhiyun 			&range);
2680*4882a593Smuzhiyun }
2681*4882a593Smuzhiyun 
2682*4882a593Smuzhiyun void
pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor * pgio)2683*4882a593Smuzhiyun pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
2684*4882a593Smuzhiyun {
2685*4882a593Smuzhiyun 	if (pgio->pg_lseg == NULL ||
2686*4882a593Smuzhiyun 	    test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags))
2687*4882a593Smuzhiyun 		return;
2688*4882a593Smuzhiyun 	pnfs_put_lseg(pgio->pg_lseg);
2689*4882a593Smuzhiyun 	pgio->pg_lseg = NULL;
2690*4882a593Smuzhiyun }
2691*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
2692*4882a593Smuzhiyun 
2693*4882a593Smuzhiyun /*
2694*4882a593Smuzhiyun  * Check for any intersection between the request and the pgio->pg_lseg,
2695*4882a593Smuzhiyun  * and if none, put this pgio->pg_lseg away.
2696*4882a593Smuzhiyun  */
2697*4882a593Smuzhiyun void
pnfs_generic_pg_check_range(struct nfs_pageio_descriptor * pgio,struct nfs_page * req)2698*4882a593Smuzhiyun pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
2699*4882a593Smuzhiyun {
2700*4882a593Smuzhiyun 	if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
2701*4882a593Smuzhiyun 		pnfs_put_lseg(pgio->pg_lseg);
2702*4882a593Smuzhiyun 		pgio->pg_lseg = NULL;
2703*4882a593Smuzhiyun 	}
2704*4882a593Smuzhiyun }
2705*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
2706*4882a593Smuzhiyun 
2707*4882a593Smuzhiyun void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor * pgio,struct nfs_page * req)2708*4882a593Smuzhiyun pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
2709*4882a593Smuzhiyun {
2710*4882a593Smuzhiyun 	u64 rd_size = req->wb_bytes;
2711*4882a593Smuzhiyun 
2712*4882a593Smuzhiyun 	pnfs_generic_pg_check_layout(pgio);
2713*4882a593Smuzhiyun 	pnfs_generic_pg_check_range(pgio, req);
2714*4882a593Smuzhiyun 	if (pgio->pg_lseg == NULL) {
2715*4882a593Smuzhiyun 		if (pgio->pg_dreq == NULL)
2716*4882a593Smuzhiyun 			rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
2717*4882a593Smuzhiyun 		else
2718*4882a593Smuzhiyun 			rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
2719*4882a593Smuzhiyun 
2720*4882a593Smuzhiyun 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
2721*4882a593Smuzhiyun 						   nfs_req_openctx(req),
2722*4882a593Smuzhiyun 						   req_offset(req),
2723*4882a593Smuzhiyun 						   rd_size,
2724*4882a593Smuzhiyun 						   IOMODE_READ,
2725*4882a593Smuzhiyun 						   false,
2726*4882a593Smuzhiyun 						   GFP_KERNEL);
2727*4882a593Smuzhiyun 		if (IS_ERR(pgio->pg_lseg)) {
2728*4882a593Smuzhiyun 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
2729*4882a593Smuzhiyun 			pgio->pg_lseg = NULL;
2730*4882a593Smuzhiyun 			return;
2731*4882a593Smuzhiyun 		}
2732*4882a593Smuzhiyun 	}
2733*4882a593Smuzhiyun 	/* If no lseg, fall back to read through mds */
2734*4882a593Smuzhiyun 	if (pgio->pg_lseg == NULL)
2735*4882a593Smuzhiyun 		nfs_pageio_reset_read_mds(pgio);
2736*4882a593Smuzhiyun 
2737*4882a593Smuzhiyun }
2738*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_read);
2739*4882a593Smuzhiyun 
2740*4882a593Smuzhiyun void
pnfs_generic_pg_init_write(struct nfs_pageio_descriptor * pgio,struct nfs_page * req,u64 wb_size)2741*4882a593Smuzhiyun pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
2742*4882a593Smuzhiyun 			   struct nfs_page *req, u64 wb_size)
2743*4882a593Smuzhiyun {
2744*4882a593Smuzhiyun 	pnfs_generic_pg_check_layout(pgio);
2745*4882a593Smuzhiyun 	pnfs_generic_pg_check_range(pgio, req);
2746*4882a593Smuzhiyun 	if (pgio->pg_lseg == NULL) {
2747*4882a593Smuzhiyun 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
2748*4882a593Smuzhiyun 						   nfs_req_openctx(req),
2749*4882a593Smuzhiyun 						   req_offset(req),
2750*4882a593Smuzhiyun 						   wb_size,
2751*4882a593Smuzhiyun 						   IOMODE_RW,
2752*4882a593Smuzhiyun 						   false,
2753*4882a593Smuzhiyun 						   GFP_KERNEL);
2754*4882a593Smuzhiyun 		if (IS_ERR(pgio->pg_lseg)) {
2755*4882a593Smuzhiyun 			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
2756*4882a593Smuzhiyun 			pgio->pg_lseg = NULL;
2757*4882a593Smuzhiyun 			return;
2758*4882a593Smuzhiyun 		}
2759*4882a593Smuzhiyun 	}
2760*4882a593Smuzhiyun 	/* If no lseg, fall back to write through mds */
2761*4882a593Smuzhiyun 	if (pgio->pg_lseg == NULL)
2762*4882a593Smuzhiyun 		nfs_pageio_reset_write_mds(pgio);
2763*4882a593Smuzhiyun }
2764*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
2765*4882a593Smuzhiyun 
2766*4882a593Smuzhiyun void
pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor * desc)2767*4882a593Smuzhiyun pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
2768*4882a593Smuzhiyun {
2769*4882a593Smuzhiyun 	if (desc->pg_lseg) {
2770*4882a593Smuzhiyun 		pnfs_put_lseg(desc->pg_lseg);
2771*4882a593Smuzhiyun 		desc->pg_lseg = NULL;
2772*4882a593Smuzhiyun 	}
2773*4882a593Smuzhiyun }
2774*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
2775*4882a593Smuzhiyun 
2776*4882a593Smuzhiyun /*
2777*4882a593Smuzhiyun  * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
2778*4882a593Smuzhiyun  * of bytes (maximum @req->wb_bytes) that can be coalesced.
2779*4882a593Smuzhiyun  */
size_t
pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
		     struct nfs_page *prev, struct nfs_page *req)
{
	unsigned int size;
	u64 seg_end, req_start, seg_left;

	/* Apply the generic coalescing rules first. */
	size = nfs_generic_pg_test(pgio, prev, req);
	if (!size)
		return 0;

	/*
	 * 'size' contains the number of bytes left in the current page (up
	 * to the original size asked for in @req->wb_bytes).
	 *
	 * Calculate how many bytes are left in the layout segment
	 * and if there are less bytes than 'size', return that instead.
	 *
	 * Please also note that 'end_offset' is actually the offset of the
	 * first byte that lies outside the pnfs_layout_range. FIXME?
	 *
	 */
	if (pgio->pg_lseg) {
		/* seg_end is the first byte past the segment. */
		seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset,
				     pgio->pg_lseg->pls_range.length);
		req_start = req_offset(req);

		/* start of request is past the last byte of this segment */
		if (req_start >= seg_end)
			return 0;

		/* adjust 'size' iff there are fewer bytes left in the
		 * segment than what nfs_generic_pg_test returned */
		seg_left = seg_end - req_start;
		if (seg_left < size)
			size = (unsigned int)seg_left;
	}

	return size;
}
EXPORT_SYMBOL_GPL(pnfs_generic_pg_test);
2821*4882a593Smuzhiyun 
/* Redirect all writes in @hdr back through the metadata server. */
int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr)
{
	struct nfs_pageio_descriptor pgio;

	/* Resend all requests through the MDS */
	nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true,
			      hdr->completion_ops);
	/* Make later writes on this open context bypass pNFS as well. */
	set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags);
	return nfs_pageio_resend(&pgio, hdr);
}
EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds);
2833*4882a593Smuzhiyun 
/* Handle a pNFS write failure reported by the layout driver. */
static void pnfs_ld_handle_write_error(struct nfs_pgio_header *hdr)
{

	dprintk("pnfs write error = %d\n", hdr->pnfs_error);
	/* Some layout drivers want the layout returned on any I/O error. */
	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
	    PNFS_LAYOUTRET_ON_ERROR) {
		pnfs_return_layout(hdr->inode);
	}
	/* Only redo the I/O through the MDS once per header. */
	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
		hdr->task.tk_status = pnfs_write_done_resend_to_mds(hdr);
}
2845*4882a593Smuzhiyun 
2846*4882a593Smuzhiyun /*
2847*4882a593Smuzhiyun  * Called by non rpc-based layout drivers
2848*4882a593Smuzhiyun  */
void pnfs_ld_write_done(struct nfs_pgio_header *hdr)
{
	if (likely(!hdr->pnfs_error)) {
		/* Record the written range for a later LAYOUTCOMMIT. */
		pnfs_set_layoutcommit(hdr->inode, hdr->lseg,
				hdr->mds_offset + hdr->res.count);
		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
	}
	trace_nfs4_pnfs_write(hdr, hdr->pnfs_error);
	/* On error, retry the writes through the metadata server. */
	if (unlikely(hdr->pnfs_error))
		pnfs_ld_handle_write_error(hdr);
	hdr->mds_ops->rpc_release(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_ld_write_done);
2862*4882a593Smuzhiyun 
/* Requeue @hdr's requests for write-through-MDS and complete the header. */
static void
pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
		struct nfs_pgio_header *hdr)
{
	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);

	/* Requeue only once per header. */
	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
		nfs_pageio_reset_write_mds(desc);
		/* Force the descriptor to re-coalesce the requeued pages. */
		mirror->pg_recoalesce = 1;
	}
	hdr->completion_ops->completion(hdr);
}
2876*4882a593Smuzhiyun 
2877*4882a593Smuzhiyun static enum pnfs_try_status
pnfs_try_to_write_data(struct nfs_pgio_header * hdr,const struct rpc_call_ops * call_ops,struct pnfs_layout_segment * lseg,int how)2878*4882a593Smuzhiyun pnfs_try_to_write_data(struct nfs_pgio_header *hdr,
2879*4882a593Smuzhiyun 			const struct rpc_call_ops *call_ops,
2880*4882a593Smuzhiyun 			struct pnfs_layout_segment *lseg,
2881*4882a593Smuzhiyun 			int how)
2882*4882a593Smuzhiyun {
2883*4882a593Smuzhiyun 	struct inode *inode = hdr->inode;
2884*4882a593Smuzhiyun 	enum pnfs_try_status trypnfs;
2885*4882a593Smuzhiyun 	struct nfs_server *nfss = NFS_SERVER(inode);
2886*4882a593Smuzhiyun 
2887*4882a593Smuzhiyun 	hdr->mds_ops = call_ops;
2888*4882a593Smuzhiyun 
2889*4882a593Smuzhiyun 	dprintk("%s: Writing ino:%lu %u@%llu (how %d)\n", __func__,
2890*4882a593Smuzhiyun 		inode->i_ino, hdr->args.count, hdr->args.offset, how);
2891*4882a593Smuzhiyun 	trypnfs = nfss->pnfs_curr_ld->write_pagelist(hdr, how);
2892*4882a593Smuzhiyun 	if (trypnfs != PNFS_NOT_ATTEMPTED)
2893*4882a593Smuzhiyun 		nfs_inc_stats(inode, NFSIOS_PNFS_WRITE);
2894*4882a593Smuzhiyun 	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
2895*4882a593Smuzhiyun 	return trypnfs;
2896*4882a593Smuzhiyun }
2897*4882a593Smuzhiyun 
2898*4882a593Smuzhiyun static void
pnfs_do_write(struct nfs_pageio_descriptor * desc,struct nfs_pgio_header * hdr,int how)2899*4882a593Smuzhiyun pnfs_do_write(struct nfs_pageio_descriptor *desc,
2900*4882a593Smuzhiyun 	      struct nfs_pgio_header *hdr, int how)
2901*4882a593Smuzhiyun {
2902*4882a593Smuzhiyun 	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
2903*4882a593Smuzhiyun 	struct pnfs_layout_segment *lseg = desc->pg_lseg;
2904*4882a593Smuzhiyun 	enum pnfs_try_status trypnfs;
2905*4882a593Smuzhiyun 
2906*4882a593Smuzhiyun 	trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
2907*4882a593Smuzhiyun 	switch (trypnfs) {
2908*4882a593Smuzhiyun 	case PNFS_NOT_ATTEMPTED:
2909*4882a593Smuzhiyun 		pnfs_write_through_mds(desc, hdr);
2910*4882a593Smuzhiyun 	case PNFS_ATTEMPTED:
2911*4882a593Smuzhiyun 		break;
2912*4882a593Smuzhiyun 	case PNFS_TRY_AGAIN:
2913*4882a593Smuzhiyun 		/* cleanup hdr and prepare to redo pnfs */
2914*4882a593Smuzhiyun 		if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
2915*4882a593Smuzhiyun 			struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2916*4882a593Smuzhiyun 			list_splice_init(&hdr->pages, &mirror->pg_list);
2917*4882a593Smuzhiyun 			mirror->pg_recoalesce = 1;
2918*4882a593Smuzhiyun 		}
2919*4882a593Smuzhiyun 		hdr->mds_ops->rpc_release(hdr);
2920*4882a593Smuzhiyun 	}
2921*4882a593Smuzhiyun }
2922*4882a593Smuzhiyun 
/* Release the header's layout segment reference, then free the header. */
static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
{
	/* Must drop the lseg before the header memory is freed. */
	pnfs_put_lseg(hdr->lseg);
	nfs_pgio_header_free(hdr);
}
2928*4882a593Smuzhiyun 
2929*4882a593Smuzhiyun int
pnfs_generic_pg_writepages(struct nfs_pageio_descriptor * desc)2930*4882a593Smuzhiyun pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
2931*4882a593Smuzhiyun {
2932*4882a593Smuzhiyun 	struct nfs_pgio_header *hdr;
2933*4882a593Smuzhiyun 	int ret;
2934*4882a593Smuzhiyun 
2935*4882a593Smuzhiyun 	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
2936*4882a593Smuzhiyun 	if (!hdr) {
2937*4882a593Smuzhiyun 		desc->pg_error = -ENOMEM;
2938*4882a593Smuzhiyun 		return desc->pg_error;
2939*4882a593Smuzhiyun 	}
2940*4882a593Smuzhiyun 	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
2941*4882a593Smuzhiyun 
2942*4882a593Smuzhiyun 	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
2943*4882a593Smuzhiyun 	ret = nfs_generic_pgio(desc, hdr);
2944*4882a593Smuzhiyun 	if (!ret)
2945*4882a593Smuzhiyun 		pnfs_do_write(desc, hdr, desc->pg_ioflags);
2946*4882a593Smuzhiyun 
2947*4882a593Smuzhiyun 	return ret;
2948*4882a593Smuzhiyun }
2949*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
2950*4882a593Smuzhiyun 
/* Redirect all reads in @hdr back through the metadata server. */
int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *hdr)
{
	struct nfs_pageio_descriptor pgio;

	/* Resend all requests through the MDS */
	nfs_pageio_init_read(&pgio, hdr->inode, true, hdr->completion_ops);
	return nfs_pageio_resend(&pgio, hdr);
}
EXPORT_SYMBOL_GPL(pnfs_read_done_resend_to_mds);
2960*4882a593Smuzhiyun 
/* Handle a pNFS read failure reported by the layout driver. */
static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
{
	dprintk("pnfs read error = %d\n", hdr->pnfs_error);
	/* Some layout drivers want the layout returned on any I/O error. */
	if (NFS_SERVER(hdr->inode)->pnfs_curr_ld->flags &
	    PNFS_LAYOUTRET_ON_ERROR) {
		pnfs_return_layout(hdr->inode);
	}
	/* Only redo the I/O through the MDS once per header. */
	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags))
		hdr->task.tk_status = pnfs_read_done_resend_to_mds(hdr);
}
2971*4882a593Smuzhiyun 
2972*4882a593Smuzhiyun /*
2973*4882a593Smuzhiyun  * Called by non rpc-based layout drivers
2974*4882a593Smuzhiyun  */
void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
{
	/* On success, complete the RPC as if the MDS had answered. */
	if (likely(!hdr->pnfs_error))
		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
	trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
	/* On error, retry the reads through the metadata server. */
	if (unlikely(hdr->pnfs_error))
		pnfs_ld_handle_read_error(hdr);
	hdr->mds_ops->rpc_release(hdr);
}
EXPORT_SYMBOL_GPL(pnfs_ld_read_done);
2985*4882a593Smuzhiyun 
/* Requeue @hdr's requests for read-through-MDS and complete the header. */
static void
pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
		struct nfs_pgio_header *hdr)
{
	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);

	/* Requeue only once per header. */
	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		list_splice_tail_init(&hdr->pages, &mirror->pg_list);
		nfs_pageio_reset_read_mds(desc);
		/* Force the descriptor to re-coalesce the requeued pages. */
		mirror->pg_recoalesce = 1;
	}
	hdr->completion_ops->completion(hdr);
}
2999*4882a593Smuzhiyun 
3000*4882a593Smuzhiyun /*
3001*4882a593Smuzhiyun  * Call the appropriate parallel I/O subsystem read function.
3002*4882a593Smuzhiyun  */
3003*4882a593Smuzhiyun static enum pnfs_try_status
pnfs_try_to_read_data(struct nfs_pgio_header * hdr,const struct rpc_call_ops * call_ops,struct pnfs_layout_segment * lseg)3004*4882a593Smuzhiyun pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
3005*4882a593Smuzhiyun 		       const struct rpc_call_ops *call_ops,
3006*4882a593Smuzhiyun 		       struct pnfs_layout_segment *lseg)
3007*4882a593Smuzhiyun {
3008*4882a593Smuzhiyun 	struct inode *inode = hdr->inode;
3009*4882a593Smuzhiyun 	struct nfs_server *nfss = NFS_SERVER(inode);
3010*4882a593Smuzhiyun 	enum pnfs_try_status trypnfs;
3011*4882a593Smuzhiyun 
3012*4882a593Smuzhiyun 	hdr->mds_ops = call_ops;
3013*4882a593Smuzhiyun 
3014*4882a593Smuzhiyun 	dprintk("%s: Reading ino:%lu %u@%llu\n",
3015*4882a593Smuzhiyun 		__func__, inode->i_ino, hdr->args.count, hdr->args.offset);
3016*4882a593Smuzhiyun 
3017*4882a593Smuzhiyun 	trypnfs = nfss->pnfs_curr_ld->read_pagelist(hdr);
3018*4882a593Smuzhiyun 	if (trypnfs != PNFS_NOT_ATTEMPTED)
3019*4882a593Smuzhiyun 		nfs_inc_stats(inode, NFSIOS_PNFS_READ);
3020*4882a593Smuzhiyun 	dprintk("%s End (trypnfs:%d)\n", __func__, trypnfs);
3021*4882a593Smuzhiyun 	return trypnfs;
3022*4882a593Smuzhiyun }
3023*4882a593Smuzhiyun 
3024*4882a593Smuzhiyun /* Resend all requests through pnfs. */
void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr,
			   unsigned int mirror_idx)
{
	struct nfs_pageio_descriptor pgio;

	/* Resend at most once per header (NFS_IOHDR_REDO gate). */
	if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
		/* Prevent deadlocks with layoutreturn! */
		pnfs_put_lseg(hdr->lseg);
		hdr->lseg = NULL;

		/* Re-drive the I/O through a fresh read descriptor aimed
		 * at the requested mirror. */
		nfs_pageio_init_read(&pgio, hdr->inode, false,
					hdr->completion_ops);
		pgio.pg_mirror_idx = mirror_idx;
		hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr);
	}
}
EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
3042*4882a593Smuzhiyun 
3043*4882a593Smuzhiyun static void
pnfs_do_read(struct nfs_pageio_descriptor * desc,struct nfs_pgio_header * hdr)3044*4882a593Smuzhiyun pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
3045*4882a593Smuzhiyun {
3046*4882a593Smuzhiyun 	const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
3047*4882a593Smuzhiyun 	struct pnfs_layout_segment *lseg = desc->pg_lseg;
3048*4882a593Smuzhiyun 	enum pnfs_try_status trypnfs;
3049*4882a593Smuzhiyun 
3050*4882a593Smuzhiyun 	trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
3051*4882a593Smuzhiyun 	switch (trypnfs) {
3052*4882a593Smuzhiyun 	case PNFS_NOT_ATTEMPTED:
3053*4882a593Smuzhiyun 		pnfs_read_through_mds(desc, hdr);
3054*4882a593Smuzhiyun 	case PNFS_ATTEMPTED:
3055*4882a593Smuzhiyun 		break;
3056*4882a593Smuzhiyun 	case PNFS_TRY_AGAIN:
3057*4882a593Smuzhiyun 		/* cleanup hdr and prepare to redo pnfs */
3058*4882a593Smuzhiyun 		if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
3059*4882a593Smuzhiyun 			struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
3060*4882a593Smuzhiyun 			list_splice_init(&hdr->pages, &mirror->pg_list);
3061*4882a593Smuzhiyun 			mirror->pg_recoalesce = 1;
3062*4882a593Smuzhiyun 		}
3063*4882a593Smuzhiyun 		hdr->mds_ops->rpc_release(hdr);
3064*4882a593Smuzhiyun 	}
3065*4882a593Smuzhiyun }
3066*4882a593Smuzhiyun 
/* Header-free callback for pNFS reads: drop the lseg reference taken in
 * pnfs_generic_pg_readpages() before freeing the header itself. */
static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
{
	pnfs_put_lseg(hdr->lseg);
	nfs_pgio_header_free(hdr);
}
3072*4882a593Smuzhiyun 
3073*4882a593Smuzhiyun int
pnfs_generic_pg_readpages(struct nfs_pageio_descriptor * desc)3074*4882a593Smuzhiyun pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
3075*4882a593Smuzhiyun {
3076*4882a593Smuzhiyun 	struct nfs_pgio_header *hdr;
3077*4882a593Smuzhiyun 	int ret;
3078*4882a593Smuzhiyun 
3079*4882a593Smuzhiyun 	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
3080*4882a593Smuzhiyun 	if (!hdr) {
3081*4882a593Smuzhiyun 		desc->pg_error = -ENOMEM;
3082*4882a593Smuzhiyun 		return desc->pg_error;
3083*4882a593Smuzhiyun 	}
3084*4882a593Smuzhiyun 	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
3085*4882a593Smuzhiyun 	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
3086*4882a593Smuzhiyun 	ret = nfs_generic_pgio(desc, hdr);
3087*4882a593Smuzhiyun 	if (!ret)
3088*4882a593Smuzhiyun 		pnfs_do_read(desc, hdr);
3089*4882a593Smuzhiyun 	return ret;
3090*4882a593Smuzhiyun }
3091*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(pnfs_generic_pg_readpages);
3092*4882a593Smuzhiyun 
/*
 * Release the per-inode LAYOUTCOMMITTING "lock" bit and wake waiters.
 * clear_bit_unlock() gives release semantics; the barrier before
 * wake_up_bit() orders the clear against the waitqueue check.
 */
static void pnfs_clear_layoutcommitting(struct inode *inode)
{
	unsigned long *bitlock = &NFS_I(inode)->flags;

	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
	smp_mb__after_atomic();
	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
}
3101*4882a593Smuzhiyun 
3102*4882a593Smuzhiyun /*
3103*4882a593Smuzhiyun  * There can be multiple RW segments.
3104*4882a593Smuzhiyun  */
pnfs_list_write_lseg(struct inode * inode,struct list_head * listp)3105*4882a593Smuzhiyun static void pnfs_list_write_lseg(struct inode *inode, struct list_head *listp)
3106*4882a593Smuzhiyun {
3107*4882a593Smuzhiyun 	struct pnfs_layout_segment *lseg;
3108*4882a593Smuzhiyun 
3109*4882a593Smuzhiyun 	list_for_each_entry(lseg, &NFS_I(inode)->layout->plh_segs, pls_list) {
3110*4882a593Smuzhiyun 		if (lseg->pls_range.iomode == IOMODE_RW &&
3111*4882a593Smuzhiyun 		    test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags))
3112*4882a593Smuzhiyun 			list_add(&lseg->pls_lc_list, listp);
3113*4882a593Smuzhiyun 	}
3114*4882a593Smuzhiyun }
3115*4882a593Smuzhiyun 
pnfs_list_write_lseg_done(struct inode * inode,struct list_head * listp)3116*4882a593Smuzhiyun static void pnfs_list_write_lseg_done(struct inode *inode, struct list_head *listp)
3117*4882a593Smuzhiyun {
3118*4882a593Smuzhiyun 	struct pnfs_layout_segment *lseg, *tmp;
3119*4882a593Smuzhiyun 
3120*4882a593Smuzhiyun 	/* Matched by references in pnfs_set_layoutcommit */
3121*4882a593Smuzhiyun 	list_for_each_entry_safe(lseg, tmp, listp, pls_lc_list) {
3122*4882a593Smuzhiyun 		list_del_init(&lseg->pls_lc_list);
3123*4882a593Smuzhiyun 		pnfs_put_lseg(lseg);
3124*4882a593Smuzhiyun 	}
3125*4882a593Smuzhiyun 
3126*4882a593Smuzhiyun 	pnfs_clear_layoutcommitting(inode);
3127*4882a593Smuzhiyun }
3128*4882a593Smuzhiyun 
pnfs_set_lo_fail(struct pnfs_layout_segment * lseg)3129*4882a593Smuzhiyun void pnfs_set_lo_fail(struct pnfs_layout_segment *lseg)
3130*4882a593Smuzhiyun {
3131*4882a593Smuzhiyun 	pnfs_layout_io_set_failed(lseg->pls_layout, lseg->pls_range.iomode);
3132*4882a593Smuzhiyun }
3133*4882a593Smuzhiyun EXPORT_SYMBOL_GPL(pnfs_set_lo_fail);
3134*4882a593Smuzhiyun 
/*
 * Note that a LAYOUTCOMMIT covering @lseg up to byte @end_pos is
 * needed.  Under i_lock: the first caller sets NFS_INO_LAYOUTCOMMIT
 * and later marks the inode dirty; subsequent callers only extend the
 * layout's last-write-byte (plh_lwb) monotonically.  The first time a
 * given segment is flagged, an lseg reference is taken.
 */
void
pnfs_set_layoutcommit(struct inode *inode, struct pnfs_layout_segment *lseg,
		loff_t end_pos)
{
	struct nfs_inode *nfsi = NFS_I(inode);
	bool mark_as_dirty = false;

	spin_lock(&inode->i_lock);
	if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) {
		nfsi->layout->plh_lwb = end_pos;
		mark_as_dirty = true;
		dprintk("%s: Set layoutcommit for inode %lu ",
			__func__, inode->i_ino);
	} else if (end_pos > nfsi->layout->plh_lwb)
		nfsi->layout->plh_lwb = end_pos;
	if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) {
		/* references matched in nfs4_layoutcommit_release */
		pnfs_get_lseg(lseg);
	}
	spin_unlock(&inode->i_lock);
	/* NOTE(review): plh_lwb is read here after dropping i_lock, so the
	 * value printed may already be stale — debug output only. */
	dprintk("%s: lseg %p end_pos %llu\n",
		__func__, lseg, nfsi->layout->plh_lwb);

	/* if pnfs_layoutcommit_inode() runs between inode locks, the next one
	 * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */
	if (mark_as_dirty)
		mark_inode_dirty_sync(inode);
}
EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit);
3164*4882a593Smuzhiyun 
pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data * data)3165*4882a593Smuzhiyun void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data)
3166*4882a593Smuzhiyun {
3167*4882a593Smuzhiyun 	struct nfs_server *nfss = NFS_SERVER(data->args.inode);
3168*4882a593Smuzhiyun 
3169*4882a593Smuzhiyun 	if (nfss->pnfs_curr_ld->cleanup_layoutcommit)
3170*4882a593Smuzhiyun 		nfss->pnfs_curr_ld->cleanup_layoutcommit(data);
3171*4882a593Smuzhiyun 	pnfs_list_write_lseg_done(data->args.inode, &data->lseg_list);
3172*4882a593Smuzhiyun }
3173*4882a593Smuzhiyun 
3174*4882a593Smuzhiyun /*
3175*4882a593Smuzhiyun  * For the LAYOUT4_NFSV4_1_FILES layout type, NFS_DATA_SYNC WRITEs and
3176*4882a593Smuzhiyun  * NFS_UNSTABLE WRITEs with a COMMIT to data servers must store enough
3177*4882a593Smuzhiyun  * data to disk to allow the server to recover the data if it crashes.
3178*4882a593Smuzhiyun  * LAYOUTCOMMIT is only needed when the NFL4_UFLG_COMMIT_THRU_MDS flag
3179*4882a593Smuzhiyun  * is off, and a COMMIT is sent to a data server, or
3180*4882a593Smuzhiyun  * if WRITEs to a data server return NFS_DATA_SYNC.
3181*4882a593Smuzhiyun  */
/*
 * Send a LAYOUTCOMMIT for @inode if one is outstanding.
 *
 * @sync: if true, wait for a concurrent commit (and for the RPC) to
 *        finish; if false, bail out with -EAGAIN when another commit
 *        is already in flight.
 *
 * Returns 0 when nothing needed committing, -EAGAIN / -ENOMEM on the
 * paths above, or the status of nfs4_proc_layoutcommit().  On any
 * non-zero status the inode is re-marked dirty so the commit will be
 * retried later.
 */
int
pnfs_layoutcommit_inode(struct inode *inode, bool sync)
{
	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
	struct nfs4_layoutcommit_data *data;
	struct nfs_inode *nfsi = NFS_I(inode);
	loff_t end_pos;
	int status;

	if (!pnfs_layoutcommit_outstanding(inode))
		return 0;

	dprintk("--> %s inode %lu\n", __func__, inode->i_ino);

	/* Serialize commits per inode via NFS_INO_LAYOUTCOMMITTING. */
	status = -EAGAIN;
	if (test_and_set_bit(NFS_INO_LAYOUTCOMMITTING, &nfsi->flags)) {
		if (!sync)
			goto out;
		status = wait_on_bit_lock_action(&nfsi->flags,
				NFS_INO_LAYOUTCOMMITTING,
				nfs_wait_bit_killable,
				TASK_KILLABLE);
		if (status)
			goto out;
	}

	status = -ENOMEM;
	/* Note kzalloc ensures data->res.seq_res.sr_slot == NULL */
	data = kzalloc(sizeof(*data), GFP_NOFS);
	if (!data)
		goto clear_layoutcommitting;

	/* Recheck under i_lock: another task may have committed already. */
	status = 0;
	spin_lock(&inode->i_lock);
	if (!test_and_clear_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags))
		goto out_unlock;

	/* Snapshot the segments, last write byte, stateid and cred that
	 * this commit will cover, all under i_lock. */
	INIT_LIST_HEAD(&data->lseg_list);
	pnfs_list_write_lseg(inode, &data->lseg_list);

	end_pos = nfsi->layout->plh_lwb;

	nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
	data->cred = get_cred(nfsi->layout->plh_lc_cred);
	spin_unlock(&inode->i_lock);

	data->args.inode = inode;
	nfs_fattr_init(&data->fattr);
	data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
	data->res.fattr = &data->fattr;
	/* end_pos == 0 means "whole file": encode as U64_MAX on the wire. */
	if (end_pos != 0)
		data->args.lastbytewritten = end_pos - 1;
	else
		data->args.lastbytewritten = U64_MAX;
	data->res.server = NFS_SERVER(inode);

	if (ld->prepare_layoutcommit) {
		status = ld->prepare_layoutcommit(&data->args);
		if (status) {
			/* Roll back: drop the cred and restore the commit
			 * flag and last-write-byte for a later retry. */
			put_cred(data->cred);
			spin_lock(&inode->i_lock);
			set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags);
			if (end_pos > nfsi->layout->plh_lwb)
				nfsi->layout->plh_lwb = end_pos;
			goto out_unlock;
		}
	}


	status = nfs4_proc_layoutcommit(data, sync);
out:
	if (status)
		mark_inode_dirty_sync(inode);
	dprintk("<-- %s status %d\n", __func__, status);
	return status;
out_unlock:
	spin_unlock(&inode->i_lock);
	kfree(data);
clear_layoutcommitting:
	pnfs_clear_layoutcommitting(inode);
	goto out;
}
EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
3265*4882a593Smuzhiyun 
/*
 * Generic fsync helper for pNFS: always issues a synchronous
 * layoutcommit; @datasync is ignored here.
 */
int
pnfs_generic_sync(struct inode *inode, bool datasync)
{
	return pnfs_layoutcommit_inode(inode, true);
}
EXPORT_SYMBOL_GPL(pnfs_generic_sync);
3272*4882a593Smuzhiyun 
pnfs_mdsthreshold_alloc(void)3273*4882a593Smuzhiyun struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
3274*4882a593Smuzhiyun {
3275*4882a593Smuzhiyun 	struct nfs4_threshold *thp;
3276*4882a593Smuzhiyun 
3277*4882a593Smuzhiyun 	thp = kzalloc(sizeof(*thp), GFP_NOFS);
3278*4882a593Smuzhiyun 	if (!thp) {
3279*4882a593Smuzhiyun 		dprintk("%s mdsthreshold allocation failed\n", __func__);
3280*4882a593Smuzhiyun 		return NULL;
3281*4882a593Smuzhiyun 	}
3282*4882a593Smuzhiyun 	return thp;
3283*4882a593Smuzhiyun }
3284*4882a593Smuzhiyun 
3285*4882a593Smuzhiyun #if IS_ENABLED(CONFIG_NFS_V4_2)
/*
 * Send a LAYOUTSTATS report for @inode if the server and layout driver
 * both support it.  NFS_INO_LAYOUTSTATS serializes reports per inode;
 * every failure path below clears it (the labels cascade) so a later
 * call can retry.
 */
int
pnfs_report_layoutstat(struct inode *inode, gfp_t gfp_flags)
{
	struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
	struct nfs_server *server = NFS_SERVER(inode);
	struct nfs_inode *nfsi = NFS_I(inode);
	struct nfs42_layoutstat_data *data;
	struct pnfs_layout_hdr *hdr;
	int status = 0;

	if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats)
		goto out;

	if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS))
		goto out;

	/* Only one report in flight per inode. */
	if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags))
		goto out;

	/* Pin the layout header so it cannot vanish while we report. */
	spin_lock(&inode->i_lock);
	if (!NFS_I(inode)->layout) {
		spin_unlock(&inode->i_lock);
		goto out_clear_layoutstats;
	}
	hdr = NFS_I(inode)->layout;
	pnfs_get_layout_hdr(hdr);
	spin_unlock(&inode->i_lock);

	data = kzalloc(sizeof(*data), gfp_flags);
	if (!data) {
		status = -ENOMEM;
		goto out_put;
	}

	data->args.fh = NFS_FH(inode);
	data->args.inode = inode;
	status = ld->prepare_layoutstats(&data->args);
	if (status)
		goto out_free;

	status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data);

out:
	dprintk("%s returns %d\n", __func__, status);
	return status;

out_free:
	kfree(data);
out_put:
	pnfs_put_layout_hdr(hdr);
out_clear_layoutstats:
	smp_mb__before_atomic();
	clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags);
	smp_mb__after_atomic();
	goto out;
}
EXPORT_SYMBOL_GPL(pnfs_report_layoutstat);
3343*4882a593Smuzhiyun #endif
3344*4882a593Smuzhiyun 
/* Module parameter tuning the layoutstats reporting timer; consumers
 * live outside this file — presumably 0 selects a built-in default
 * interval (TODO confirm against the users of this symbol). */
unsigned int layoutstats_timer;
module_param(layoutstats_timer, uint, 0644);
EXPORT_SYMBOL_GPL(layoutstats_timer);
3348