/*
 * linux/fs/nfs/blocklayout/blocklayout.c
 *
 * Module for the NFSv4.1 pNFS block layout driver.
 *
 * Copyright (c) 2006 The Regents of the University of Michigan.
 * All rights reserved.
 *
 * Andy Adamson <andros@citi.umich.edu>
 * Fred Isaman <iisaman@umich.edu>
 *
 * permission is granted to use, copy, create derivative works and
 * redistribute this software and such derivative works for any purpose,
 * so long as the name of the university of michigan is not used in
 * any advertising or publicity pertaining to the use or distribution
 * of this software without specific, written prior authorization. if
 * the above copyright notice or any other identification of the
 * university of michigan is included in any copy of any portion of
 * this software, then the disclaimer below must also be included.
 *
 * this software is provided as is, without representation from the
 * university of michigan as to its fitness for any purpose, and without
 * warranty by the university of michigan of any kind, either express
 * or implied, including without limitation the implied warranties of
 * merchantability and fitness for a particular purpose. the regents
 * of the university of michigan shall not be liable for any damages,
 * including special, indirect, incidental, or consequential damages,
 * with respect to any claim arising out or in connection with the use
 * of the software, even if it has been or is hereafter advised of the
 * possibility of such damages.
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/bio.h>		/* struct bio */
#include <linux/prefetch.h>
#include <linux/pagevec.h>

#include "../pnfs.h"
#include "../nfs4session.h"
#include "../internal.h"
#include "blocklayout.h"

#define NFSDBG_FACILITY	NFSDBG_PNFS_LD

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>");
MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver");

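/*
 * An extent is treated as a hole if it has no backing data on the volume:
 * either the server marked it PNFS_BLOCK_NONE_DATA, or it is
 * PNFS_BLOCK_INVALID_DATA that has not been written yet (be_tag unset).
 */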
static bool is_hole(struct pnfs_block_extent *be)
{
	switch (be->be_state) {
	case PNFS_BLOCK_NONE_DATA:
		return true;
	case PNFS_BLOCK_INVALID_DATA:
		return be->be_tag ? false : true;
	default:
		return false;
	}
}

/* The data we are handed might be spread across several bios. We need
 * to track when the last one is finished.
 */
struct parallel_io {
	struct kref refcnt;
	void (*pnfs_callback) (void *data);
	void *data;
};

static inline struct parallel_io *alloc_parallel(void *data)
{
	struct parallel_io *rv;

	rv = kmalloc(sizeof(*rv), GFP_NOFS);
	if (rv) {
		rv->data = data;
		kref_init(&rv->refcnt);
	}
	return rv;
}

static inline void get_parallel(struct parallel_io *p)
{
	kref_get(&p->refcnt);
}

static void destroy_parallel(struct kref *kref)
{
	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);

	dprintk("%s enter\n", __func__);
	p->pnfs_callback(p->data);
	kfree(p);
}

static inline void put_parallel(struct parallel_io *p)
{
	kref_put(&p->refcnt, destroy_parallel);
}

static struct bio *
bl_submit_bio(struct bio *bio)
{
	if (bio) {
		get_parallel(bio->bi_private);
		dprintk("%s submitting %s bio %u@%llu\n", __func__,
			bio_op(bio) == READ ? "read" : "write",
			bio->bi_iter.bi_size,
			(unsigned long long)bio->bi_iter.bi_sector);
		submit_bio(bio);
	}
	return NULL;
}

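/*
 * Allocate a bio with room for up to @npg pages.  If the allocation fails
 * while the task is in memory reclaim (PF_MEMALLOC), retry with
 * progressively smaller page counts instead of failing outright.
 */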
static struct bio *
bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
		bio_end_io_t end_io, struct parallel_io *par)
{
	struct bio *bio;

	npg = min(npg, BIO_MAX_PAGES);
	bio = bio_alloc(GFP_NOIO, npg);
	if (!bio && (current->flags & PF_MEMALLOC)) {
		while (!bio && (npg /= 2))
			bio = bio_alloc(GFP_NOIO, npg);
	}

	if (bio) {
		bio->bi_iter.bi_sector = disk_sector;
		bio_set_dev(bio, bdev);
		bio->bi_end_io = end_io;
		bio->bi_private = par;
	}
	return bio;
}

static bool offset_in_map(u64 offset, struct pnfs_block_dev_map *map)
{
	return offset >= map->start && offset < map->start + map->len;
}

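/*
 * Add @page to the current bio, translating the file-relative sector
 * @isect into a physical offset on the extent's device.  If the target
 * falls outside the cached device mapping, the mapping is refreshed and
 * the current bio is submitted; if the bio is full, it is submitted and a
 * new one is started.  Returns the bio to keep filling, or an ERR_PTR.
 */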
static struct bio *
do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
		struct page *page, struct pnfs_block_dev_map *map,
		struct pnfs_block_extent *be, bio_end_io_t end_io,
		struct parallel_io *par, unsigned int offset, int *len)
{
	struct pnfs_block_dev *dev =
		container_of(be->be_device, struct pnfs_block_dev, node);
	u64 disk_addr, end;

	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
		npg, rw, (unsigned long long)isect, offset, *len);

	/* translate to device offset */
	isect += be->be_v_offset;
	isect -= be->be_f_offset;

	/* translate to physical disk offset */
	disk_addr = (u64)isect << SECTOR_SHIFT;
	if (!offset_in_map(disk_addr, map)) {
		if (!dev->map(dev, disk_addr, map) || !offset_in_map(disk_addr, map))
			return ERR_PTR(-EIO);
		bio = bl_submit_bio(bio);
	}
	disk_addr += map->disk_offset;
	disk_addr -= map->start;

	/* limit length to what the device mapping allows */
	end = disk_addr + *len;
	if (end >= map->start + map->len)
		*len = map->start + map->len - disk_addr;

retry:
	if (!bio) {
		bio = bl_alloc_init_bio(npg, map->bdev,
				disk_addr >> SECTOR_SHIFT, end_io, par);
		if (!bio)
			return ERR_PTR(-ENOMEM);
		bio_set_op_attrs(bio, rw, 0);
	}
	if (bio_add_page(bio, page, *len, offset) < *len) {
		bio = bl_submit_bio(bio);
		goto retry;
	}
	return bio;
}

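/*
 * On I/O failure, walk the extents covering the failed request and mark
 * each extent's device id unavailable, so bl_find_get_deviceid() refuses
 * it until the retry timeout has expired.
 */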
static void bl_mark_devices_unavailable(struct nfs_pgio_header *header, bool rw)
{
	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
	size_t bytes_left = header->args.count;
	sector_t isect, extent_length = 0;
	struct pnfs_block_extent be;

	isect = header->args.offset >> SECTOR_SHIFT;
	bytes_left += header->args.offset - (isect << SECTOR_SHIFT);

	while (bytes_left > 0) {
		if (!ext_tree_lookup(bl, isect, &be, rw))
			return;
		extent_length = be.be_length - (isect - be.be_f_offset);
		nfs4_mark_deviceid_unavailable(be.be_device);
		isect += extent_length;
		if (bytes_left > extent_length << SECTOR_SHIFT)
			bytes_left -= extent_length << SECTOR_SHIFT;
		else
			bytes_left = 0;
	}
}

static void bl_end_io_read(struct bio *bio)
{
	struct parallel_io *par = bio->bi_private;

	if (bio->bi_status) {
		struct nfs_pgio_header *header = par->data;

		if (!header->pnfs_error)
			header->pnfs_error = -EIO;
		pnfs_set_lo_fail(header->lseg);
		bl_mark_devices_unavailable(header, false);
	}

	bio_put(bio);
	put_parallel(par);
}

static void bl_read_cleanup(struct work_struct *work)
{
	struct rpc_task *task;
	struct nfs_pgio_header *hdr;
	dprintk("%s enter\n", __func__);
	task = container_of(work, struct rpc_task, u.tk_work);
	hdr = container_of(task, struct nfs_pgio_header, task);
	pnfs_ld_read_done(hdr);
}

static void
bl_end_par_io_read(void *data)
{
	struct nfs_pgio_header *hdr = data;

	hdr->task.tk_status = hdr->pnfs_error;
	INIT_WORK(&hdr->task.u.tk_work, bl_read_cleanup);
	schedule_work(&hdr->task.u.tk_work);
}

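/*
 * Read path: walk the request's pages, look up the extent covering each
 * one, and either zero the page (for holes) or queue it in a bio against
 * the underlying block device.  Completion is tracked by the parallel_io
 * refcount; the last bio to finish triggers bl_end_par_io_read().
 */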
static enum pnfs_try_status
bl_read_pagelist(struct nfs_pgio_header *header)
{
	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
	struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
	struct bio *bio = NULL;
	struct pnfs_block_extent be;
	sector_t isect, extent_length = 0;
	struct parallel_io *par;
	loff_t f_offset = header->args.offset;
	size_t bytes_left = header->args.count;
	unsigned int pg_offset = header->args.pgbase, pg_len;
	struct page **pages = header->args.pages;
	int pg_index = header->args.pgbase >> PAGE_SHIFT;
	const bool is_dio = (header->dreq != NULL);
	struct blk_plug plug;
	int i;

	dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__,
		header->page_array.npages, f_offset,
		(unsigned int)header->args.count);

	par = alloc_parallel(header);
	if (!par)
		return PNFS_NOT_ATTEMPTED;
	par->pnfs_callback = bl_end_par_io_read;

	blk_start_plug(&plug);

	isect = (sector_t) (f_offset >> SECTOR_SHIFT);
	/* Code assumes extents are page-aligned */
	for (i = pg_index; i < header->page_array.npages; i++) {
		if (extent_length <= 0) {
			/* We've used up the previous extent */
			bio = bl_submit_bio(bio);

			/* Get the next one */
			if (!ext_tree_lookup(bl, isect, &be, false)) {
				header->pnfs_error = -EIO;
				goto out;
			}
			extent_length = be.be_length - (isect - be.be_f_offset);
		}

		if (is_dio) {
			if (pg_offset + bytes_left > PAGE_SIZE)
				pg_len = PAGE_SIZE - pg_offset;
			else
				pg_len = bytes_left;
		} else {
			BUG_ON(pg_offset != 0);
			pg_len = PAGE_SIZE;
		}

		if (is_hole(&be)) {
			bio = bl_submit_bio(bio);
			/* Fill hole w/ zeroes w/o accessing device */
			dprintk("%s Zeroing page for hole\n", __func__);
			zero_user_segment(pages[i], pg_offset, pg_len);

			/* invalidate map */
			map.start = NFS4_MAX_UINT64;
		} else {
			bio = do_add_page_to_bio(bio,
						 header->page_array.npages - i,
						 READ,
						 isect, pages[i], &map, &be,
						 bl_end_io_read, par,
						 pg_offset, &pg_len);
			if (IS_ERR(bio)) {
				header->pnfs_error = PTR_ERR(bio);
				bio = NULL;
				goto out;
			}
		}
		isect += (pg_len >> SECTOR_SHIFT);
		extent_length -= (pg_len >> SECTOR_SHIFT);
		f_offset += pg_len;
		bytes_left -= pg_len;
		pg_offset = 0;
	}
	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
		header->res.eof = 1;
		header->res.count = header->inode->i_size - header->args.offset;
	} else {
		header->res.count = (isect << SECTOR_SHIFT) - header->args.offset;
	}
out:
	bl_submit_bio(bio);
	blk_finish_plug(&plug);
	put_parallel(par);
	return PNFS_ATTEMPTED;
}

static void bl_end_io_write(struct bio *bio)
{
	struct parallel_io *par = bio->bi_private;
	struct nfs_pgio_header *header = par->data;

	if (bio->bi_status) {
		if (!header->pnfs_error)
			header->pnfs_error = -EIO;
		pnfs_set_lo_fail(header->lseg);
		bl_mark_devices_unavailable(header, true);
	}
	bio_put(bio);
	put_parallel(par);
}

/* Function scheduled for call during bl_end_par_io_write,
 * it marks sectors as written and extends the commitlist.
 */
static void bl_write_cleanup(struct work_struct *work)
{
	struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work);
	struct nfs_pgio_header *hdr =
		container_of(task, struct nfs_pgio_header, task);

	dprintk("%s enter\n", __func__);

	if (likely(!hdr->pnfs_error)) {
		struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg);
		u64 start = hdr->args.offset & (loff_t)PAGE_MASK;
		u64 end = (hdr->args.offset + hdr->args.count +
			PAGE_SIZE - 1) & (loff_t)PAGE_MASK;
		u64 lwb = hdr->args.offset + hdr->args.count;

		ext_tree_mark_written(bl, start >> SECTOR_SHIFT,
					(end - start) >> SECTOR_SHIFT, lwb);
	}

	pnfs_ld_write_done(hdr);
}

/* Called when last of bios associated with a bl_write_pagelist call finishes */
static void bl_end_par_io_write(void *data)
{
	struct nfs_pgio_header *hdr = data;

	hdr->task.tk_status = hdr->pnfs_error;
	hdr->verf.committed = NFS_FILE_SYNC;
	INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup);
	schedule_work(&hdr->task.u.tk_work);
}

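/*
 * Write path: whole pages are written to the block device, one bio per
 * contiguous device range.  When all bios complete, bl_end_par_io_write()
 * schedules bl_write_cleanup() to mark the sectors written in the extent
 * tree for a later LAYOUTCOMMIT.
 */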
static enum pnfs_try_status
bl_write_pagelist(struct nfs_pgio_header *header, int sync)
{
	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
	struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
	struct bio *bio = NULL;
	struct pnfs_block_extent be;
	sector_t isect, extent_length = 0;
	struct parallel_io *par = NULL;
	loff_t offset = header->args.offset;
	size_t count = header->args.count;
	struct page **pages = header->args.pages;
	int pg_index = header->args.pgbase >> PAGE_SHIFT;
	unsigned int pg_len;
	struct blk_plug plug;
	int i;

	dprintk("%s enter, %zu@%lld\n", __func__, count, offset);

	/* At this point, header->page_array is a (sequential) list of nfs_pages.
	 * We want to write each, and if there is an error set pnfs_error
	 * to have it redone using nfs.
	 */
	par = alloc_parallel(header);
	if (!par)
		return PNFS_NOT_ATTEMPTED;
	par->pnfs_callback = bl_end_par_io_write;

	blk_start_plug(&plug);

	/* we always write out the whole page */
	offset = offset & (loff_t)PAGE_MASK;
	isect = offset >> SECTOR_SHIFT;

	for (i = pg_index; i < header->page_array.npages; i++) {
		if (extent_length <= 0) {
			/* We've used up the previous extent */
			bio = bl_submit_bio(bio);
			/* Get the next one */
			if (!ext_tree_lookup(bl, isect, &be, true)) {
				header->pnfs_error = -EINVAL;
				goto out;
			}

			extent_length = be.be_length - (isect - be.be_f_offset);
		}

		pg_len = PAGE_SIZE;
		bio = do_add_page_to_bio(bio, header->page_array.npages - i,
					 WRITE, isect, pages[i], &map, &be,
					 bl_end_io_write, par,
					 0, &pg_len);
		if (IS_ERR(bio)) {
			header->pnfs_error = PTR_ERR(bio);
			bio = NULL;
			goto out;
		}

		offset += pg_len;
		count -= pg_len;
		isect += (pg_len >> SECTOR_SHIFT);
		extent_length -= (pg_len >> SECTOR_SHIFT);
	}

	header->res.count = header->args.count;
out:
	bl_submit_bio(bio);
	blk_finish_plug(&plug);
	put_parallel(par);
	return PNFS_ATTEMPTED;
}

static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
	int err;

	dprintk("%s enter\n", __func__);

	err = ext_tree_remove(bl, true, 0, LLONG_MAX);
	WARN_ON(err);

	kfree_rcu(bl, bl_layout.plh_rcu);
}

static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
		gfp_t gfp_flags, bool is_scsi_layout)
{
	struct pnfs_block_layout *bl;

	dprintk("%s enter\n", __func__);
	bl = kzalloc(sizeof(*bl), gfp_flags);
	if (!bl)
		return NULL;

	bl->bl_ext_rw = RB_ROOT;
	bl->bl_ext_ro = RB_ROOT;
	spin_lock_init(&bl->bl_ext_lock);

	bl->bl_scsi_layout = is_scsi_layout;
	return &bl->bl_layout;
}

static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode,
						   gfp_t gfp_flags)
{
	return __bl_alloc_layout_hdr(inode, gfp_flags, false);
}

static struct pnfs_layout_hdr *sl_alloc_layout_hdr(struct inode *inode,
						   gfp_t gfp_flags)
{
	return __bl_alloc_layout_hdr(inode, gfp_flags, true);
}

static void bl_free_lseg(struct pnfs_layout_segment *lseg)
{
	dprintk("%s enter\n", __func__);
	kfree(lseg);
}

/* Tracks info needed to ensure extents in layout obey constraints of spec */
struct layout_verification {
	u32 mode;	/* R or RW */
	u64 start;	/* Expected start of next non-COW extent */
	u64 inval;	/* Start of INVAL coverage */
	u64 cowread;	/* End of COW read coverage */
};

/* Verify the extent meets the layout requirements of the pnfs-block draft,
 * section 2.3.1.
 */
static int verify_extent(struct pnfs_block_extent *be,
			 struct layout_verification *lv)
{
	if (lv->mode == IOMODE_READ) {
		if (be->be_state == PNFS_BLOCK_READWRITE_DATA ||
		    be->be_state == PNFS_BLOCK_INVALID_DATA)
			return -EIO;
		if (be->be_f_offset != lv->start)
			return -EIO;
		lv->start += be->be_length;
		return 0;
	}
	/* lv->mode == IOMODE_RW */
	if (be->be_state == PNFS_BLOCK_READWRITE_DATA) {
		if (be->be_f_offset != lv->start)
			return -EIO;
		if (lv->cowread > lv->start)
			return -EIO;
		lv->start += be->be_length;
		lv->inval = lv->start;
		return 0;
	} else if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
		if (be->be_f_offset != lv->start)
			return -EIO;
		lv->start += be->be_length;
		return 0;
	} else if (be->be_state == PNFS_BLOCK_READ_DATA) {
		if (be->be_f_offset > lv->start)
			return -EIO;
		if (be->be_f_offset < lv->inval)
			return -EIO;
		if (be->be_f_offset < lv->cowread)
			return -EIO;
		/* It looks like you might want to min this with lv->start,
		 * but you really don't.
		 */
		lv->inval = lv->inval + be->be_length;
		lv->cowread = be->be_f_offset + be->be_length;
		return 0;
	} else
		return -EIO;
}

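/*
 * Decode a 64-bit byte offset from the XDR stream and convert it to a
 * 512-byte sector number, rejecting values that are not sector aligned.
 */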
static int decode_sector_number(__be32 **rp, sector_t *sp)
{
	uint64_t s;

	*rp = xdr_decode_hyper(*rp, &s);
	if (s & 0x1ff) {
		printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__);
		return -1;
	}
	*sp = s >> SECTOR_SHIFT;
	return 0;
}

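/*
 * Look up (or instantiate) the device id node for @id.  If the device was
 * marked unavailable within the last PNFS_DEVICE_RETRY_TIMEOUT jiffies,
 * return -ENODEV; once the timeout has expired, discard the stale node
 * and look the device up again.
 */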
static struct nfs4_deviceid_node *
bl_find_get_deviceid(struct nfs_server *server,
		const struct nfs4_deviceid *id, const struct cred *cred,
		gfp_t gfp_mask)
{
	struct nfs4_deviceid_node *node;
	unsigned long start, end;

retry:
	node = nfs4_find_get_deviceid(server, id, cred, gfp_mask);
	if (!node)
		return ERR_PTR(-ENODEV);

	if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags) == 0)
		return node;

	end = jiffies;
	start = end - PNFS_DEVICE_RETRY_TIMEOUT;
	if (!time_in_range(node->timestamp_unavailable, start, end)) {
		nfs4_delete_deviceid(node->ld, node->nfs_client, id);
		goto retry;
	}
	return ERR_PTR(-ENODEV);
}

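/*
 * Decode one extent from the LAYOUTGET reply: device id, file offset,
 * length, volume offset and state.  The extent is verified against the
 * layout constraints and, on success, appended to the @extents staging
 * list; insertion into the extent tree happens later in bl_alloc_lseg().
 */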
static int
bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo,
		struct layout_verification *lv, struct list_head *extents,
		gfp_t gfp_mask)
{
	struct pnfs_block_extent *be;
	struct nfs4_deviceid id;
	int error;
	__be32 *p;

	p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE);
	if (!p)
		return -EIO;

	be = kzalloc(sizeof(*be), GFP_NOFS);
	if (!be)
		return -ENOMEM;

	memcpy(&id, p, NFS4_DEVICEID4_SIZE);
	p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE);

	be->be_device = bl_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id,
						lo->plh_lc_cred, gfp_mask);
	if (IS_ERR(be->be_device)) {
		error = PTR_ERR(be->be_device);
		goto out_free_be;
	}

	/*
	 * The next three values are read in as bytes, but stored in the
	 * extent structure in 512-byte granularity.
	 */
	error = -EIO;
	if (decode_sector_number(&p, &be->be_f_offset) < 0)
		goto out_put_deviceid;
	if (decode_sector_number(&p, &be->be_length) < 0)
		goto out_put_deviceid;
	if (decode_sector_number(&p, &be->be_v_offset) < 0)
		goto out_put_deviceid;
	be->be_state = be32_to_cpup(p++);

	error = verify_extent(be, lv);
	if (error) {
		dprintk("%s: extent verification failed\n", __func__);
		goto out_put_deviceid;
	}

	list_add_tail(&be->be_list, extents);
	return 0;

out_put_deviceid:
	nfs4_put_deviceid_node(be->be_device);
out_free_be:
	kfree(be);
	return error;
}

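/*
 * Parse a block layout segment.  Extents are decoded into a temporary
 * list first so that a partially decoded layout can be torn down cleanly;
 * only a fully verified set of extents is inserted into the extent tree.
 */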
static struct pnfs_layout_segment *
bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr,
		gfp_t gfp_mask)
{
	struct layout_verification lv = {
		.mode = lgr->range.iomode,
		.start = lgr->range.offset >> SECTOR_SHIFT,
		.inval = lgr->range.offset >> SECTOR_SHIFT,
		.cowread = lgr->range.offset >> SECTOR_SHIFT,
	};
	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
	struct pnfs_layout_segment *lseg;
	struct xdr_buf buf;
	struct xdr_stream xdr;
	struct page *scratch;
	int status, i;
	uint32_t count;
	__be32 *p;
	LIST_HEAD(extents);

	dprintk("---> %s\n", __func__);

	lseg = kzalloc(sizeof(*lseg), gfp_mask);
	if (!lseg)
		return ERR_PTR(-ENOMEM);

	status = -ENOMEM;
	scratch = alloc_page(gfp_mask);
	if (!scratch)
		goto out;

	xdr_init_decode_pages(&xdr, &buf,
			lgr->layoutp->pages, lgr->layoutp->len);
	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);

	status = -EIO;
	p = xdr_inline_decode(&xdr, 4);
	if (unlikely(!p))
		goto out_free_scratch;

	count = be32_to_cpup(p++);
	dprintk("%s: number of extents %d\n", __func__, count);

	/*
	 * Decode individual extents, putting them in temporary staging area
	 * until whole layout is decoded to make error recovery easier.
	 */
	for (i = 0; i < count; i++) {
		status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask);
		if (status)
			goto process_extents;
	}

	if (lgr->range.offset + lgr->range.length !=
			lv.start << SECTOR_SHIFT) {
		dprintk("%s Final length mismatch\n", __func__);
		status = -EIO;
		goto process_extents;
	}

	if (lv.start < lv.cowread) {
		dprintk("%s Final uncovered COW extent\n", __func__);
		status = -EIO;
	}

process_extents:
	while (!list_empty(&extents)) {
		struct pnfs_block_extent *be =
			list_first_entry(&extents, struct pnfs_block_extent,
					 be_list);
		list_del(&be->be_list);

		if (!status)
			status = ext_tree_insert(bl, be);

		if (status) {
			nfs4_put_deviceid_node(be->be_device);
			kfree(be);
		}
	}

out_free_scratch:
	__free_page(scratch);
out:
	dprintk("%s returns %d\n", __func__, status);
	switch (status) {
	case -ENODEV:
		/* Our extent block devices are unavailable */
		set_bit(NFS_LSEG_UNAVAILABLE, &lseg->pls_flags);
		fallthrough;
	case 0:
		return lseg;
	default:
		kfree(lseg);
		return ERR_PTR(status);
	}
}

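/*
 * Handle a layout return for a byte range: remove the matching extents
 * from the read-only or read-write extent tree.  Misaligned ranges are
 * ignored.
 */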
static void
bl_return_range(struct pnfs_layout_hdr *lo,
		struct pnfs_layout_range *range)
{
	struct pnfs_block_layout *bl = BLK_LO2EXT(lo);
	sector_t offset = range->offset >> SECTOR_SHIFT, end;

	if (range->offset % 8) {
		dprintk("%s: offset %lld not block size aligned\n",
			__func__, range->offset);
		return;
	}

	if (range->length != NFS4_MAX_UINT64) {
		if (range->length % 8) {
			dprintk("%s: length %lld not block size aligned\n",
				__func__, range->length);
			return;
		}

		end = offset + (range->length >> SECTOR_SHIFT);
	} else {
		end = round_down(NFS4_MAX_UINT64, PAGE_SIZE);
	}

	ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end);
}

static int
bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg)
{
	return ext_tree_prepare_commit(arg);
}

static void
bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata)
{
	ext_tree_mark_committed(&lcdata->args, lcdata->res.status);
}

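/*
 * Sanity checks when this layout driver is bound to an nfs_server: the
 * server must advertise a pNFS block size, and it must not exceed
 * PAGE_SIZE.
 */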
static int
bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh)
{
	dprintk("%s enter\n", __func__);

	if (server->pnfs_blksize == 0) {
		dprintk("%s Server did not return blksize\n", __func__);
		return -EINVAL;
	}
	if (server->pnfs_blksize > PAGE_SIZE) {
		printk(KERN_ERR "%s: pNFS blksize %d not supported.\n",
			__func__, server->pnfs_blksize);
		return -EINVAL;
	}

	return 0;
}

static bool
is_aligned_req(struct nfs_pageio_descriptor *pgio,
		struct nfs_page *req, unsigned int alignment, bool is_write)
{
	/*
	 * Always accept buffered writes, higher layers take care of the
	 * right alignment.
	 */
	if (pgio->pg_dreq == NULL)
		return true;

	if (!IS_ALIGNED(req->wb_offset, alignment))
		return false;

	if (IS_ALIGNED(req->wb_bytes, alignment))
		return true;

	if (is_write &&
	    (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode))) {
		/*
		 * If the write goes up to the inode size, just write
		 * the full page.  Data past the inode size is
		 * guaranteed to be zeroed by the higher level client
		 * code, and this behaviour is mandated by RFC 5663
		 * section 2.3.2.
		 */
		return true;
	}

	return false;
}

static void
bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
	if (!is_aligned_req(pgio, req, SECTOR_SIZE, false)) {
		nfs_pageio_reset_read_mds(pgio);
		return;
	}

	pnfs_generic_pg_init_read(pgio, req);

	if (pgio->pg_lseg &&
	    test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) {
		pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg);
		pnfs_set_lo_fail(pgio->pg_lseg);
		nfs_pageio_reset_read_mds(pgio);
	}
}

/*
 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
 * of bytes (maximum @req->wb_bytes) that can be coalesced.
 */
static size_t
bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
		struct nfs_page *req)
{
	if (!is_aligned_req(pgio, req, SECTOR_SIZE, false))
		return 0;
	return pnfs_generic_pg_test(pgio, prev, req);
}

/*
 * Return the number of contiguous bytes for a given inode
 * starting at page frame idx.
 */
static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t end;

	/* Optimize common case that writes from 0 to end of file */
	end = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
	if (end != inode->i_mapping->nrpages) {
		rcu_read_lock();
		end = page_cache_next_miss(mapping, idx + 1, ULONG_MAX);
		rcu_read_unlock();
	}

	if (!end)
		return i_size_read(inode) - (idx << PAGE_SHIFT);
	else
		return (end - idx) << PAGE_SHIFT;
}

static void
bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
	u64 wb_size;

	if (!is_aligned_req(pgio, req, PAGE_SIZE, true)) {
		nfs_pageio_reset_write_mds(pgio);
		return;
	}

	if (pgio->pg_dreq == NULL)
		wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
					      req->wb_index);
	else
		wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);

	pnfs_generic_pg_init_write(pgio, req, wb_size);

	if (pgio->pg_lseg &&
	    test_bit(NFS_LSEG_UNAVAILABLE, &pgio->pg_lseg->pls_flags)) {

		pnfs_error_mark_layout_for_return(pgio->pg_inode, pgio->pg_lseg);
		pnfs_set_lo_fail(pgio->pg_lseg);
		nfs_pageio_reset_write_mds(pgio);
	}
}

/*
 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
 * of bytes (maximum @req->wb_bytes) that can be coalesced.
 */
static size_t
bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
		 struct nfs_page *req)
{
	if (!is_aligned_req(pgio, req, PAGE_SIZE, true))
		return 0;
	return pnfs_generic_pg_test(pgio, prev, req);
}

static const struct nfs_pageio_ops bl_pg_read_ops = {
	.pg_init = bl_pg_init_read,
	.pg_test = bl_pg_test_read,
	.pg_doio = pnfs_generic_pg_readpages,
	.pg_cleanup = pnfs_generic_pg_cleanup,
};

static const struct nfs_pageio_ops bl_pg_write_ops = {
	.pg_init = bl_pg_init_write,
	.pg_test = bl_pg_test_write,
	.pg_doio = pnfs_generic_pg_writepages,
	.pg_cleanup = pnfs_generic_pg_cleanup,
};

static struct pnfs_layoutdriver_type blocklayout_type = {
	.id = LAYOUT_BLOCK_VOLUME,
	.name = "LAYOUT_BLOCK_VOLUME",
	.owner = THIS_MODULE,
	.flags = PNFS_LAYOUTRET_ON_SETATTR |
		 PNFS_LAYOUTRET_ON_ERROR |
		 PNFS_READ_WHOLE_PAGE,
	.read_pagelist = bl_read_pagelist,
	.write_pagelist = bl_write_pagelist,
	.alloc_layout_hdr = bl_alloc_layout_hdr,
	.free_layout_hdr = bl_free_layout_hdr,
	.alloc_lseg = bl_alloc_lseg,
	.free_lseg = bl_free_lseg,
	.return_range = bl_return_range,
	.prepare_layoutcommit = bl_prepare_layoutcommit,
	.cleanup_layoutcommit = bl_cleanup_layoutcommit,
	.set_layoutdriver = bl_set_layoutdriver,
	.alloc_deviceid_node = bl_alloc_deviceid_node,
	.free_deviceid_node = bl_free_deviceid_node,
	.pg_read_ops = &bl_pg_read_ops,
	.pg_write_ops = &bl_pg_write_ops,
	.sync = pnfs_generic_sync,
};

static struct pnfs_layoutdriver_type scsilayout_type = {
	.id = LAYOUT_SCSI,
	.name = "LAYOUT_SCSI",
	.owner = THIS_MODULE,
	.flags = PNFS_LAYOUTRET_ON_SETATTR |
		 PNFS_LAYOUTRET_ON_ERROR |
		 PNFS_READ_WHOLE_PAGE,
	.read_pagelist = bl_read_pagelist,
	.write_pagelist = bl_write_pagelist,
	.alloc_layout_hdr = sl_alloc_layout_hdr,
	.free_layout_hdr = bl_free_layout_hdr,
	.alloc_lseg = bl_alloc_lseg,
	.free_lseg = bl_free_lseg,
	.return_range = bl_return_range,
	.prepare_layoutcommit = bl_prepare_layoutcommit,
	.cleanup_layoutcommit = bl_cleanup_layoutcommit,
	.set_layoutdriver = bl_set_layoutdriver,
	.alloc_deviceid_node = bl_alloc_deviceid_node,
	.free_deviceid_node = bl_free_deviceid_node,
	.pg_read_ops = &bl_pg_read_ops,
	.pg_write_ops = &bl_pg_write_ops,
	.sync = pnfs_generic_sync,
};


static int __init nfs4blocklayout_init(void)
{
	int ret;

	dprintk("%s: NFSv4 Block Layout Driver Registering...\n", __func__);

	ret = bl_init_pipefs();
	if (ret)
		goto out;

	ret = pnfs_register_layoutdriver(&blocklayout_type);
	if (ret)
		goto out_cleanup_pipe;

	ret = pnfs_register_layoutdriver(&scsilayout_type);
	if (ret)
		goto out_unregister_block;
	return 0;

out_unregister_block:
	pnfs_unregister_layoutdriver(&blocklayout_type);
out_cleanup_pipe:
	bl_cleanup_pipefs();
out:
	return ret;
}

static void __exit nfs4blocklayout_exit(void)
{
	dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n",
	       __func__);

	pnfs_unregister_layoutdriver(&scsilayout_type);
	pnfs_unregister_layoutdriver(&blocklayout_type);
	bl_cleanup_pipefs();
}

MODULE_ALIAS("nfs-layouttype4-3");
MODULE_ALIAS("nfs-layouttype4-5");

module_init(nfs4blocklayout_init);
module_exit(nfs4blocklayout_exit);