// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>

#include "blk.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
	ZONE_COND_NAME(NOT_WP),
	ZONE_COND_NAME(EMPTY),
	ZONE_COND_NAME(IMP_OPEN),
	ZONE_COND_NAME(EXP_OPEN),
	ZONE_COND_NAME(CLOSED),
	ZONE_COND_NAME(READONLY),
	ZONE_COND_NAME(FULL),
	ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME

/**
 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 * @zone_cond: BLK_ZONE_COND_XXX.
 *
 * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
 * into string format. Useful for debugging and tracing zone conditions. For
 * an invalid BLK_ZONE_COND_XXX it returns the string "UNKNOWN".
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
	static const char *zone_cond_str = "UNKNOWN";

	if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
		zone_cond_str = zone_cond_name[zone_cond];

	return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
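
/*
 * A minimal usage sketch (illustrative only, not part of this file): callers
 * typically use this helper when logging or tracing zone state, e.g. from a
 * report_zones_cb callback:
 *
 *	pr_debug("zone at %llu is %s\n",
 *		 zone->start, blk_zone_cond_str(zone->cond));
 */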

static inline sector_t blk_zone_start(struct request_queue *q,
				      sector_t sector)
{
	sector_t zone_mask = blk_queue_zone_sectors(q) - 1;

	return sector & ~zone_mask;
}

/*
 * Return true if a request is a write request that needs zone write locking.
 */
bool blk_req_needs_zone_write_lock(struct request *rq)
{
	if (!rq->q->seq_zones_wlock)
		return false;

	if (blk_rq_is_passthrough(rq))
		return false;

	switch (req_op(rq)) {
	case REQ_OP_WRITE_ZEROES:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE:
		return blk_rq_zone_is_seq(rq);
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);

bool blk_req_zone_write_trylock(struct request *rq)
{
	unsigned int zno = blk_rq_zone_no(rq);

	if (test_and_set_bit(zno, rq->q->seq_zones_wlock))
		return false;

	WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
	rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;

	return true;
}
EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);

void __blk_req_zone_write_lock(struct request *rq)
{
	if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
					  rq->q->seq_zones_wlock)))
		return;

	WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
	rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);

void __blk_req_zone_write_unlock(struct request *rq)
{
	rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
	if (rq->q->seq_zones_wlock)
		WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
						 rq->q->seq_zones_wlock));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);

/**
 * blkdev_nr_zones - Get number of zones
 * @disk: Target gendisk
 *
 * Return the total number of zones of a zoned block device. For a block
 * device without zone capabilities, the number of zones is always 0.
 */
unsigned int blkdev_nr_zones(struct gendisk *disk)
{
	sector_t zone_sectors = blk_queue_zone_sectors(disk->queue);

	if (!blk_queue_is_zoned(disk->queue))
		return 0;
	return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
}
EXPORT_SYMBOL_GPL(blkdev_nr_zones);
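
/*
 * Illustrative sketch (not part of this file): a caller sizing a per-zone
 * array for a zoned disk. The zones pointer is an assumption made for the
 * example.
 *
 *	unsigned int nr_zones = blkdev_nr_zones(disk);
 *
 *	zones = kvcalloc(nr_zones, sizeof(*zones), GFP_KERNEL);
 */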

/**
 * blkdev_report_zones - Get zones information
 * @bdev: Target block device
 * @sector: Sector from which to report zones
 * @nr_zones: Maximum number of zones to report
 * @cb: Callback function called for each reported zone
 * @data: Private data for the callback
 *
 * Description:
 * Get zone information starting from the zone containing @sector for at most
 * @nr_zones, and call @cb for each zone reported by the device.
 * To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 * constant can be passed to @nr_zones.
 * Returns the number of zones reported by the device, or a negative errno
 * value in case of failure.
 *
 * Note: The caller must use memalloc_noXX_save/restore() calls to control
 * memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
			unsigned int nr_zones, report_zones_cb cb, void *data)
{
	struct gendisk *disk = bdev->bd_disk;
	sector_t capacity = get_capacity(disk);

	if (!blk_queue_is_zoned(bdev_get_queue(bdev)) ||
	    WARN_ON_ONCE(!disk->fops->report_zones))
		return -EOPNOTSUPP;

	if (!nr_zones || sector >= capacity)
		return 0;

	return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
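
/*
 * Illustrative sketch (not part of this file): reporting all zones of a
 * device with a counting callback. The names count_zones_cb and nr are made
 * up for the example.
 *
 *	static int count_zones_cb(struct blk_zone *zone, unsigned int idx,
 *				  void *data)
 *	{
 *		unsigned int *nr = data;
 *
 *		(*nr)++;
 *		return 0;
 *	}
 *
 *	unsigned int nr = 0;
 *	int ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
 *				      count_zones_cb, &nr);
 */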

static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
						sector_t sector,
						sector_t nr_sectors)
{
	if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
		return false;

	/*
	 * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors
	 * of the applicable zone range is the entire disk.
	 */
	return !sector && nr_sectors == get_capacity(bdev->bd_disk);
}

/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev: Target block device
 * @op: Operation to be performed on the zones
 * @sector: Start sector of the first zone to operate on
 * @nr_sectors: Number of sectors, should be at least the length of one zone
 *		and must be zone size aligned.
 * @gfp_mask: Memory allocation flags (for bio_alloc)
 *
 * Description:
 * Perform the specified operation on the range of zones specified by
 * @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 * is valid, but the specified range should not contain conventional zones.
 * The operation to execute on each zone can be a zone reset, open, close
 * or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
		     sector_t sector, sector_t nr_sectors,
		     gfp_t gfp_mask)
{
	struct request_queue *q = bdev_get_queue(bdev);
	sector_t zone_sectors = blk_queue_zone_sectors(q);
	sector_t capacity = get_capacity(bdev->bd_disk);
	sector_t end_sector = sector + nr_sectors;
	struct bio *bio = NULL;
	int ret;

	if (!blk_queue_is_zoned(q))
		return -EOPNOTSUPP;

	if (bdev_read_only(bdev))
		return -EPERM;

	if (!op_is_zone_mgmt(op))
		return -EOPNOTSUPP;

	if (end_sector <= sector || end_sector > capacity)
		/* Out of range */
		return -EINVAL;

	/* Check alignment (handle a possibly smaller last zone) */
	if (sector & (zone_sectors - 1))
		return -EINVAL;

	if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity)
		return -EINVAL;

	while (sector < end_sector) {
		bio = blk_next_bio(bio, 0, gfp_mask);
		bio_set_dev(bio, bdev);

		/*
		 * Special case for the zone reset operation that resets all
		 * zones; this is useful for applications like mkfs.
		 */
		if (op == REQ_OP_ZONE_RESET &&
		    blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) {
			bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
			break;
		}

		bio->bi_opf = op | REQ_SYNC;
		bio->bi_iter.bi_sector = sector;
		sector += zone_sectors;

		/* This may take a while, so be nice to others */
		cond_resched();
	}

	ret = submit_bio_wait(bio);
	bio_put(bio);

	return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
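
/*
 * Illustrative sketch (not part of this file): resetting the write pointer
 * of a single zone starting at zone_sector, assuming zone_sector is aligned
 * to the zone size. bdev and zone_sector are assumptions for the example.
 *
 *	sector_t zone_sectors = blk_queue_zone_sectors(bdev_get_queue(bdev));
 *	int ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zone_sector,
 *				   zone_sectors, GFP_KERNEL);
 */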

struct zone_report_args {
	struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
				    void *data)
{
	struct zone_report_args *args = data;

	if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
		return -EFAULT;
	return 0;
}

/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
			      unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct zone_report_args args;
	struct request_queue *q;
	struct blk_zone_report rep;
	int ret;

	if (!argp)
		return -EINVAL;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	if (!blk_queue_is_zoned(q))
		return -ENOTTY;

	if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
		return -EFAULT;

	if (!rep.nr_zones)
		return -EINVAL;

	args.zones = argp + sizeof(struct blk_zone_report);
	ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
				  blkdev_copy_zone_to_user, &args);
	if (ret < 0)
		return ret;

	rep.nr_zones = ret;
	rep.flags = BLK_ZONE_REP_CAPACITY;
	if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
		return -EFAULT;
	return 0;
}
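
/*
 * Illustrative userspace sketch (not kernel code): the BLKREPORTZONE
 * argument is a struct blk_zone_report immediately followed by
 * rep->nr_zones struct blk_zone entries, matching the args.zones pointer
 * arithmetic above. fd, nr_zones and process_zones() are assumptions for
 * the example.
 *
 *	struct blk_zone_report *rep;
 *	unsigned int nr_zones = 128;
 *
 *	rep = calloc(1, sizeof(*rep) + nr_zones * sizeof(struct blk_zone));
 *	rep->sector = 0;
 *	rep->nr_zones = nr_zones;
 *	if (!ioctl(fd, BLKREPORTZONE, rep))
 *		process_zones(rep);
 *
 * On success, rep->nr_zones is updated to the number of zones reported.
 */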

static int blkdev_truncate_zone_range(struct block_device *bdev, fmode_t mode,
				      const struct blk_zone_range *zrange)
{
	loff_t start, end;

	if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
	    zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
		/* Out of range */
		return -EINVAL;

	start = zrange->sector << SECTOR_SHIFT;
	end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

	return truncate_bdev_range(bdev, mode, start, end);
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	void __user *argp = (void __user *)arg;
	struct request_queue *q;
	struct blk_zone_range zrange;
	enum req_opf op;
	int ret;

	if (!argp)
		return -EINVAL;

	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	if (!blk_queue_is_zoned(q))
		return -ENOTTY;

	if (!(mode & FMODE_WRITE))
		return -EBADF;

	if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
		return -EFAULT;

	switch (cmd) {
	case BLKRESETZONE:
		op = REQ_OP_ZONE_RESET;

		/* Invalidate the page cache, including dirty pages. */
		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
		if (ret)
			return ret;
		break;
	case BLKOPENZONE:
		op = REQ_OP_ZONE_OPEN;
		break;
	case BLKCLOSEZONE:
		op = REQ_OP_ZONE_CLOSE;
		break;
	case BLKFINISHZONE:
		op = REQ_OP_ZONE_FINISH;
		break;
	default:
		return -ENOTTY;
	}

	ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors,
			       GFP_KERNEL);

	/*
	 * Invalidate the page cache again for zone reset: writes can only be
	 * direct for zoned devices so concurrent writes would not add any page
	 * to the page cache after/during reset. The page cache may be filled
	 * again due to concurrent reads though and dropping the pages for
	 * these is fine.
	 */
	if (!ret && cmd == BLKRESETZONE)
		ret = blkdev_truncate_zone_range(bdev, mode, &zrange);

	return ret;
}
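
/*
 * Illustrative userspace sketch (not kernel code): resetting one zone
 * through the BLKRESETZONE ioctl handled above. fd, zone_start and zone_len
 * are assumptions for the example.
 *
 *	struct blk_zone_range zrange = {
 *		.sector		= zone_start,
 *		.nr_sectors	= zone_len,
 *	};
 *
 *	int ret = ioctl(fd, BLKRESETZONE, &zrange);
 */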

static inline unsigned long *blk_alloc_zone_bitmap(int node,
						   unsigned int nr_zones)
{
	return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
			    GFP_NOIO, node);
}

void blk_queue_free_zone_bitmaps(struct request_queue *q)
{
	kfree(q->conv_zones_bitmap);
	q->conv_zones_bitmap = NULL;
	kfree(q->seq_zones_wlock);
	q->seq_zones_wlock = NULL;
}

struct blk_revalidate_zone_args {
	struct gendisk	*disk;
	unsigned long	*conv_zones_bitmap;
	unsigned long	*seq_zones_wlock;
	unsigned int	nr_zones;
	sector_t	zone_sectors;
	sector_t	sector;
};

/*
 * Helper function to check the validity of zones of a zoned block device.
 */
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
				  void *data)
{
	struct blk_revalidate_zone_args *args = data;
	struct gendisk *disk = args->disk;
	struct request_queue *q = disk->queue;
	sector_t capacity = get_capacity(disk);

	/*
	 * All zones must have the same size, with the possible exception of a
	 * smaller last zone.
	 */
	if (zone->start == 0) {
		if (zone->len == 0 || !is_power_of_2(zone->len)) {
			pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
				disk->disk_name, zone->len);
			return -ENODEV;
		}

		args->zone_sectors = zone->len;
		args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
	} else if (zone->start + args->zone_sectors < capacity) {
		if (zone->len != args->zone_sectors) {
			pr_warn("%s: Invalid zoned device with non constant zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	} else {
		if (zone->len > args->zone_sectors) {
			pr_warn("%s: Invalid zoned device with larger last zone size\n",
				disk->disk_name);
			return -ENODEV;
		}
	}

	/* Check for holes in the zone report */
	if (zone->start != args->sector) {
		pr_warn("%s: Zone gap at sectors %llu..%llu\n",
			disk->disk_name, args->sector, zone->start);
		return -ENODEV;
	}

	/* Check zone type */
	switch (zone->type) {
	case BLK_ZONE_TYPE_CONVENTIONAL:
		if (!args->conv_zones_bitmap) {
			args->conv_zones_bitmap =
				blk_alloc_zone_bitmap(q->node, args->nr_zones);
			if (!args->conv_zones_bitmap)
				return -ENOMEM;
		}
		set_bit(idx, args->conv_zones_bitmap);
		break;
	case BLK_ZONE_TYPE_SEQWRITE_REQ:
	case BLK_ZONE_TYPE_SEQWRITE_PREF:
		if (!args->seq_zones_wlock) {
			args->seq_zones_wlock =
				blk_alloc_zone_bitmap(q->node, args->nr_zones);
			if (!args->seq_zones_wlock)
				return -ENOMEM;
		}
		break;
	default:
		pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
			disk->disk_name, (int)zone->type, zone->start);
		return -ENODEV;
	}

	args->sector += zone->len;
	return 0;
}

/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
 * @disk: Target disk
 * @update_driver_data: Callback to update driver data on the frozen disk
 *
 * Helper function for low-level device drivers to (re)allocate and initialize
 * a disk request queue's zone bitmaps. This function should normally be called
 * within the disk ->revalidate method for blk-mq based drivers. For BIO based
 * drivers only q->nr_zones needs to be updated so that the sysfs exposed value
 * is correct.
 * If the @update_driver_data callback function is not NULL, the callback is
 * executed with the device request queue frozen after all zones have been
 * checked.
 */
int blk_revalidate_disk_zones(struct gendisk *disk,
			      void (*update_driver_data)(struct gendisk *disk))
{
	struct request_queue *q = disk->queue;
	struct blk_revalidate_zone_args args = {
		.disk		= disk,
	};
	unsigned int noio_flag;
	int ret;

	if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
		return -EIO;
	if (WARN_ON_ONCE(!queue_is_mq(q)))
		return -EIO;

	if (!get_capacity(disk))
		return -EIO;

	/*
	 * Ensure that all memory allocations in this context are done as if
	 * GFP_NOIO was specified.
	 */
	noio_flag = memalloc_noio_save();
	ret = disk->fops->report_zones(disk, 0, UINT_MAX,
				       blk_revalidate_zone_cb, &args);
	memalloc_noio_restore(noio_flag);

	/*
	 * Install the new bitmaps and update nr_zones only once the queue is
	 * stopped and all I/Os are completed (i.e. a scheduler is not
	 * referencing the bitmaps).
	 */
	blk_mq_freeze_queue(q);
	if (ret >= 0) {
		blk_queue_chunk_sectors(q, args.zone_sectors);
		q->nr_zones = args.nr_zones;
		swap(q->seq_zones_wlock, args.seq_zones_wlock);
		swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
		if (update_driver_data)
			update_driver_data(disk);
		ret = 0;
	} else {
		pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
		blk_queue_free_zone_bitmaps(q);
	}
	blk_mq_unfreeze_queue(q);

	kfree(args.seq_zones_wlock);
	kfree(args.conv_zones_bitmap);
	return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
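
/*
 * Illustrative sketch (not part of this file): a hypothetical blk-mq based
 * zoned driver revalidating zones after reading the device geometry.
 * my_update_zone_info() is a made-up driver callback name.
 *
 *	static void my_update_zone_info(struct gendisk *disk)
 *	{
 *		... update driver private zone data while the queue is frozen ...
 *	}
 *
 *	ret = blk_revalidate_disk_zones(disk, my_update_zone_info);
 *	if (ret)
 *		pr_warn("%s: zone revalidation failed\n", disk->disk_name);
 */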