/*
 * Copyright (C) 2014 Facebook. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include <linux/device-mapper.h>

#include <linux/module.h>
#include <linux/init.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/uio.h>

#define DM_MSG_PREFIX "log-writes"

/*
 * This target will sequentially log all writes to the target device onto the
 * log device. This is helpful for replaying writes to check for fs consistency
 * at all times. This target provides a mechanism to mark specific events to
 * check data at a later time. So for example you would:
 *
 * write data
 * fsync
 * dmsetup message /dev/whatever mark mymark
 * unmount /mnt/test
 *
 * Then replay the log up to mymark and check the contents of the replay to
 * verify it matches what was written.
 *
 * We log writes only after they have been flushed; this makes the log describe
 * close to the order in which the data hits the actual disk, not its cache. So
 * for example the following sequence (W means write, C means complete):
 *
 * Wa,Wb,Wc,Cc,Ca,FLUSH,FUAd,Cb,CFLUSH,CFUAd
 *
 * would result in the log looking like this:
 *
 * c,a,b,flush,fuad,<other writes>,<next flush>
 *
 * This is meant to help expose problems where file systems do not properly wait
 * on data being written before invoking a FLUSH. FUA bypasses the cache, so once
 * it completes it is added to the log as it should be on disk.
 *
 * We treat DISCARDs as if they don't bypass cache so that they are logged in
 * order of completion along with the normal writes. If we didn't do it this
 * way, we would process all the discards first and then write all the data, when
 * in fact we want to replay the data and the discards in the order in which they
 * completed.
 */
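
/*
 * As a rough illustration only (device names, tool names and flags below are
 * assumptions, not part of this file), a typical session could look like:
 *
 *   # echo "0 $(blockdev --getsz /dev/sdb) log-writes /dev/sdb /dev/sdc" | \
 *         dmsetup create lw
 *   # mkfs.ext4 /dev/mapper/lw && mount /dev/mapper/lw /mnt/test
 *   # <write data>; fsync
 *   # dmsetup message lw 0 mark mymark
 *   # umount /mnt/test && dmsetup remove lw
 *
 * The log on /dev/sdc can then be replayed up to "mymark" (for example with a
 * replay tool such as the replay-log utility shipped with xfstests) and the
 * result compared against what the filesystem claims was persisted.
 */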
#define LOG_FLUSH_FLAG		(1 << 0)
#define LOG_FUA_FLAG		(1 << 1)
#define LOG_DISCARD_FLAG	(1 << 2)
#define LOG_MARK_FLAG		(1 << 3)
#define LOG_METADATA_FLAG	(1 << 4)

#define WRITE_LOG_VERSION 1ULL
#define WRITE_LOG_MAGIC 0x6a736677736872ULL
#define WRITE_LOG_SUPER_SECTOR 0

/*
 * The disk format for this is braindead simple.
 *
 * At byte 0 we have our super, followed by the following sequence for
 * nr_entries:
 *
 * [    1 sector    ][ entry->nr_sectors ]
 * [log_write_entry ][   data written    ]
 *
 * The log_write_entry takes up a full sector so we can have arbitrary length
 * marks and it leaves us room for extra content in the future.
 */
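
/*
 * A worked example of the layout (an illustration only, assuming a 4096-byte
 * log device sector size): the super occupies bytes 0-4095, the first entry's
 * log_write_entry sector occupies bytes 4096-8191, its data (if any) follows
 * immediately and is rounded up to the sector size, and the next entry begins
 * right after that.
 */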

/*
 * Basic info about the log for userspace.
 */
struct log_write_super {
	__le64 magic;
	__le64 version;
	__le64 nr_entries;
	__le32 sectorsize;
};

/*
 * sector - the sector we wrote.
 * nr_sectors - the number of sectors we wrote.
 * flags - flags for this log entry.
 * data_len - the size of the data in this log entry, this is for private log
 *	      entry stuff, the MARK data provided by userspace for example.
 */
struct log_write_entry {
	__le64 sector;
	__le64 nr_sectors;
	__le64 flags;
	__le64 data_len;
};

struct log_writes_c {
	struct dm_dev *dev;
	struct dm_dev *logdev;
	u64 logged_entries;
	u32 sectorsize;
	u32 sectorshift;
	atomic_t io_blocks;
	atomic_t pending_blocks;
	sector_t next_sector;
	sector_t end_sector;
	bool logging_enabled;
	bool device_supports_discard;
	spinlock_t blocks_lock;
	struct list_head unflushed_blocks;
	struct list_head logging_blocks;
	wait_queue_head_t wait;
	struct task_struct *log_kthread;
	struct completion super_done;
};

struct pending_block {
	int vec_cnt;
	u64 flags;
	sector_t sector;
	sector_t nr_sectors;
	char *data;
	u32 datalen;
	struct list_head list;
	struct bio_vec vecs[];
};

struct per_bio_data {
	struct pending_block *block;
};

static inline sector_t bio_to_dev_sectors(struct log_writes_c *lc,
					  sector_t sectors)
{
	return sectors >> (lc->sectorshift - SECTOR_SHIFT);
}

static inline sector_t dev_to_bio_sectors(struct log_writes_c *lc,
					  sector_t sectors)
{
	return sectors << (lc->sectorshift - SECTOR_SHIFT);
}
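
/*
 * A quick worked example of the conversions above (illustrative only,
 * assuming a 4096-byte device sector size): lc->sectorshift == 12, so one
 * device sector corresponds to 8 bio (512-byte) sectors, i.e.
 * bio_to_dev_sectors(lc, 8) == 1 and dev_to_bio_sectors(lc, 1) == 8.
 */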

static void put_pending_block(struct log_writes_c *lc)
{
	if (atomic_dec_and_test(&lc->pending_blocks)) {
		smp_mb__after_atomic();
		if (waitqueue_active(&lc->wait))
			wake_up(&lc->wait);
	}
}

static void put_io_block(struct log_writes_c *lc)
{
	if (atomic_dec_and_test(&lc->io_blocks)) {
		smp_mb__after_atomic();
		if (waitqueue_active(&lc->wait))
			wake_up(&lc->wait);
	}
}

static void log_end_io(struct bio *bio)
{
	struct log_writes_c *lc = bio->bi_private;

	if (bio->bi_status) {
		unsigned long flags;

		DMERR("Error writing log block, error=%d", bio->bi_status);
		spin_lock_irqsave(&lc->blocks_lock, flags);
		lc->logging_enabled = false;
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
	}

	bio_free_pages(bio);
	put_io_block(lc);
	bio_put(bio);
}

static void log_end_super(struct bio *bio)
{
	struct log_writes_c *lc = bio->bi_private;

	complete(&lc->super_done);
	log_end_io(bio);
}

/*
 * Meant to be called if there is an error, it will free all the pages
 * associated with the block.
 */
static void free_pending_block(struct log_writes_c *lc,
			       struct pending_block *block)
{
	int i;

	for (i = 0; i < block->vec_cnt; i++) {
		if (block->vecs[i].bv_page)
			__free_page(block->vecs[i].bv_page);
	}
	kfree(block->data);
	kfree(block);
	put_pending_block(lc);
}

static int write_metadata(struct log_writes_c *lc, void *entry,
			  size_t entrylen, void *data, size_t datalen,
			  sector_t sector)
{
	struct bio *bio;
	struct page *page;
	void *ptr;
	size_t ret;

	bio = bio_alloc(GFP_KERNEL, 1);
	if (!bio) {
		DMERR("Couldn't alloc log bio");
		goto error;
	}
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_sector = sector;
	bio_set_dev(bio, lc->logdev->bdev);
	bio->bi_end_io = (sector == WRITE_LOG_SUPER_SECTOR) ?
			  log_end_super : log_end_io;
	bio->bi_private = lc;
	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

	page = alloc_page(GFP_KERNEL);
	if (!page) {
		DMERR("Couldn't alloc log page");
		bio_put(bio);
		goto error;
	}

	ptr = kmap_atomic(page);
	memcpy(ptr, entry, entrylen);
	if (datalen)
		memcpy(ptr + entrylen, data, datalen);
	memset(ptr + entrylen + datalen, 0,
	       lc->sectorsize - entrylen - datalen);
	kunmap_atomic(ptr);

	ret = bio_add_page(bio, page, lc->sectorsize, 0);
	if (ret != lc->sectorsize) {
		DMERR("Couldn't add page to the log block");
		goto error_bio;
	}
	submit_bio(bio);
	return 0;
error_bio:
	bio_put(bio);
	__free_page(page);
error:
	put_io_block(lc);
	return -1;
}

static int write_inline_data(struct log_writes_c *lc, void *entry,
			     size_t entrylen, void *data, size_t datalen,
			     sector_t sector)
{
	int num_pages, bio_pages, pg_datalen, pg_sectorlen, i;
	struct page *page;
	struct bio *bio;
	size_t ret;
	void *ptr;

	while (datalen) {
		num_pages = ALIGN(datalen, PAGE_SIZE) >> PAGE_SHIFT;
		bio_pages = min(num_pages, BIO_MAX_PAGES);

		atomic_inc(&lc->io_blocks);

		bio = bio_alloc(GFP_KERNEL, bio_pages);
		if (!bio) {
			DMERR("Couldn't alloc inline data bio");
			goto error;
		}

		bio->bi_iter.bi_size = 0;
		bio->bi_iter.bi_sector = sector;
		bio_set_dev(bio, lc->logdev->bdev);
		bio->bi_end_io = log_end_io;
		bio->bi_private = lc;
		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

		for (i = 0; i < bio_pages; i++) {
			pg_datalen = min_t(int, datalen, PAGE_SIZE);
			pg_sectorlen = ALIGN(pg_datalen, lc->sectorsize);

			page = alloc_page(GFP_KERNEL);
			if (!page) {
				DMERR("Couldn't alloc inline data page");
				goto error_bio;
			}

			ptr = kmap_atomic(page);
			memcpy(ptr, data, pg_datalen);
			if (pg_sectorlen > pg_datalen)
				memset(ptr + pg_datalen, 0, pg_sectorlen - pg_datalen);
			kunmap_atomic(ptr);

			ret = bio_add_page(bio, page, pg_sectorlen, 0);
			if (ret != pg_sectorlen) {
				DMERR("Couldn't add page of inline data");
				__free_page(page);
				goto error_bio;
			}

			datalen -= pg_datalen;
			data	+= pg_datalen;
		}
		submit_bio(bio);

		sector += bio_pages * PAGE_SECTORS;
	}
	return 0;
error_bio:
	bio_free_pages(bio);
	bio_put(bio);
error:
	put_io_block(lc);
	return -1;
}

static int log_one_block(struct log_writes_c *lc,
			 struct pending_block *block, sector_t sector)
{
	struct bio *bio;
	struct log_write_entry entry;
	size_t metadatalen, ret;
	int i;

	entry.sector = cpu_to_le64(block->sector);
	entry.nr_sectors = cpu_to_le64(block->nr_sectors);
	entry.flags = cpu_to_le64(block->flags);
	entry.data_len = cpu_to_le64(block->datalen);

	metadatalen = (block->flags & LOG_MARK_FLAG) ? block->datalen : 0;
	if (write_metadata(lc, &entry, sizeof(entry), block->data,
			   metadatalen, sector)) {
		free_pending_block(lc, block);
		return -1;
	}

	sector += dev_to_bio_sectors(lc, 1);

	if (block->datalen && metadatalen == 0) {
		if (write_inline_data(lc, &entry, sizeof(entry), block->data,
				      block->datalen, sector)) {
			free_pending_block(lc, block);
			return -1;
		}
		/* we don't support both inline data & bio data */
		goto out;
	}

	if (!block->vec_cnt)
		goto out;

	atomic_inc(&lc->io_blocks);
	bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt, BIO_MAX_PAGES));
	if (!bio) {
		DMERR("Couldn't alloc log bio");
		goto error;
	}
	bio->bi_iter.bi_size = 0;
	bio->bi_iter.bi_sector = sector;
	bio_set_dev(bio, lc->logdev->bdev);
	bio->bi_end_io = log_end_io;
	bio->bi_private = lc;
	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

	for (i = 0; i < block->vec_cnt; i++) {
		/*
		 * The page offset is always 0 because we allocate a new page
		 * for every bvec in the original bio for simplicity's sake.
		 */
		ret = bio_add_page(bio, block->vecs[i].bv_page,
				   block->vecs[i].bv_len, 0);
		if (ret != block->vecs[i].bv_len) {
			atomic_inc(&lc->io_blocks);
			submit_bio(bio);
			bio = bio_alloc(GFP_KERNEL, min(block->vec_cnt - i, BIO_MAX_PAGES));
			if (!bio) {
				DMERR("Couldn't alloc log bio");
				goto error;
			}
			bio->bi_iter.bi_size = 0;
			bio->bi_iter.bi_sector = sector;
			bio_set_dev(bio, lc->logdev->bdev);
			bio->bi_end_io = log_end_io;
			bio->bi_private = lc;
			bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

			ret = bio_add_page(bio, block->vecs[i].bv_page,
					   block->vecs[i].bv_len, 0);
			if (ret != block->vecs[i].bv_len) {
				DMERR("Couldn't add page on new bio?");
				bio_put(bio);
				goto error;
			}
		}
		sector += block->vecs[i].bv_len >> SECTOR_SHIFT;
	}
	submit_bio(bio);
out:
	kfree(block->data);
	kfree(block);
	put_pending_block(lc);
	return 0;
error:
	free_pending_block(lc, block);
	put_io_block(lc);
	return -1;
}

static int log_super(struct log_writes_c *lc)
{
	struct log_write_super super;

	super.magic = cpu_to_le64(WRITE_LOG_MAGIC);
	super.version = cpu_to_le64(WRITE_LOG_VERSION);
	super.nr_entries = cpu_to_le64(lc->logged_entries);
	super.sectorsize = cpu_to_le32(lc->sectorsize);

	if (write_metadata(lc, &super, sizeof(super), NULL, 0,
			   WRITE_LOG_SUPER_SECTOR)) {
		DMERR("Couldn't write super");
		return -1;
	}

	/*
	 * The super sector should be written in order; otherwise the
	 * nr_entries could be rewritten incorrectly by an old bio.
	 */
	wait_for_completion_io(&lc->super_done);

	return 0;
}

static inline sector_t logdev_last_sector(struct log_writes_c *lc)
{
	return i_size_read(lc->logdev->bdev->bd_inode) >> SECTOR_SHIFT;
}

static int log_writes_kthread(void *arg)
{
	struct log_writes_c *lc = (struct log_writes_c *)arg;
	sector_t sector = 0;

	while (!kthread_should_stop()) {
		bool super = false;
		bool logging_enabled;
		struct pending_block *block = NULL;
		int ret;

		spin_lock_irq(&lc->blocks_lock);
		if (!list_empty(&lc->logging_blocks)) {
			block = list_first_entry(&lc->logging_blocks,
						 struct pending_block, list);
			list_del_init(&block->list);
			if (!lc->logging_enabled)
				goto next;

			sector = lc->next_sector;
			if (!(block->flags & LOG_DISCARD_FLAG))
				lc->next_sector += dev_to_bio_sectors(lc, block->nr_sectors);
			lc->next_sector += dev_to_bio_sectors(lc, 1);

			/*
			 * Apparently the size of the device may not be known
			 * right away, so handle this properly.
			 */
			if (!lc->end_sector)
				lc->end_sector = logdev_last_sector(lc);
			if (lc->end_sector &&
			    lc->next_sector >= lc->end_sector) {
				DMERR("Ran out of space on the logdev");
				lc->logging_enabled = false;
				goto next;
			}
			lc->logged_entries++;
			atomic_inc(&lc->io_blocks);

			super = (block->flags & (LOG_FUA_FLAG | LOG_MARK_FLAG));
			if (super)
				atomic_inc(&lc->io_blocks);
		}
next:
		logging_enabled = lc->logging_enabled;
		spin_unlock_irq(&lc->blocks_lock);
		if (block) {
			if (logging_enabled) {
				ret = log_one_block(lc, block, sector);
				if (!ret && super)
					ret = log_super(lc);
				if (ret) {
					spin_lock_irq(&lc->blocks_lock);
					lc->logging_enabled = false;
					spin_unlock_irq(&lc->blocks_lock);
				}
			} else
				free_pending_block(lc, block);
			continue;
		}

		if (!try_to_freeze()) {
			set_current_state(TASK_INTERRUPTIBLE);
			if (!kthread_should_stop() &&
			    list_empty(&lc->logging_blocks))
				schedule();
			__set_current_state(TASK_RUNNING);
		}
	}
	return 0;
}

/*
 * Construct a log-writes mapping:
 * log-writes <dev_path> <log_dev_path>
 */
static int log_writes_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	struct log_writes_c *lc;
	struct dm_arg_set as;
	const char *devname, *logdevname;
	int ret;

	as.argc = argc;
	as.argv = argv;

	if (argc < 2) {
		ti->error = "Invalid argument count";
		return -EINVAL;
	}

	lc = kzalloc(sizeof(struct log_writes_c), GFP_KERNEL);
	if (!lc) {
		ti->error = "Cannot allocate context";
		return -ENOMEM;
	}
	spin_lock_init(&lc->blocks_lock);
	INIT_LIST_HEAD(&lc->unflushed_blocks);
	INIT_LIST_HEAD(&lc->logging_blocks);
	init_waitqueue_head(&lc->wait);
	init_completion(&lc->super_done);
	atomic_set(&lc->io_blocks, 0);
	atomic_set(&lc->pending_blocks, 0);

	devname = dm_shift_arg(&as);
	ret = dm_get_device(ti, devname, dm_table_get_mode(ti->table), &lc->dev);
	if (ret) {
		ti->error = "Device lookup failed";
		goto bad;
	}

	logdevname = dm_shift_arg(&as);
	ret = dm_get_device(ti, logdevname, dm_table_get_mode(ti->table),
			    &lc->logdev);
	if (ret) {
		ti->error = "Log device lookup failed";
		dm_put_device(ti, lc->dev);
		goto bad;
	}

	lc->sectorsize = bdev_logical_block_size(lc->dev->bdev);
	lc->sectorshift = ilog2(lc->sectorsize);
	lc->log_kthread = kthread_run(log_writes_kthread, lc, "log-write");
	if (IS_ERR(lc->log_kthread)) {
		ret = PTR_ERR(lc->log_kthread);
		ti->error = "Couldn't alloc kthread";
		dm_put_device(ti, lc->dev);
		dm_put_device(ti, lc->logdev);
		goto bad;
	}

	/*
	 * next_sector is in 512b sectors to correspond to what bi_sector expects.
	 * The super starts at sector 0, and the next_sector is the next logical
	 * one based on the sectorsize of the device.
	 */
	lc->next_sector = lc->sectorsize >> SECTOR_SHIFT;
	lc->logging_enabled = true;
	lc->end_sector = logdev_last_sector(lc);
	lc->device_supports_discard = true;

	ti->num_flush_bios = 1;
	ti->flush_supported = true;
	ti->num_discard_bios = 1;
	ti->discards_supported = true;
	ti->per_io_data_size = sizeof(struct per_bio_data);
	ti->private = lc;
	return 0;

bad:
	kfree(lc);
	return ret;
}

static int log_mark(struct log_writes_c *lc, char *data)
{
	struct pending_block *block;
	size_t maxsize = lc->sectorsize - sizeof(struct log_write_entry);

	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
	if (!block) {
		DMERR("Error allocating pending block");
		return -ENOMEM;
	}

	block->data = kstrndup(data, maxsize - 1, GFP_KERNEL);
	if (!block->data) {
		DMERR("Error copying mark data");
		kfree(block);
		return -ENOMEM;
	}
	atomic_inc(&lc->pending_blocks);
	block->datalen = strlen(block->data);
	block->flags |= LOG_MARK_FLAG;
	spin_lock_irq(&lc->blocks_lock);
	list_add_tail(&block->list, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);
	wake_up_process(lc->log_kthread);
	return 0;
}

static void log_writes_dtr(struct dm_target *ti)
{
	struct log_writes_c *lc = ti->private;

	spin_lock_irq(&lc->blocks_lock);
	list_splice_init(&lc->unflushed_blocks, &lc->logging_blocks);
	spin_unlock_irq(&lc->blocks_lock);

	/*
	 * This is just nice to have since it'll update the super to include the
	 * unflushed blocks; if it fails we don't really care.
	 */
	log_mark(lc, "dm-log-writes-end");
	wake_up_process(lc->log_kthread);
	wait_event(lc->wait, !atomic_read(&lc->io_blocks) &&
		   !atomic_read(&lc->pending_blocks));
	kthread_stop(lc->log_kthread);

	WARN_ON(!list_empty(&lc->logging_blocks));
	WARN_ON(!list_empty(&lc->unflushed_blocks));
	dm_put_device(ti, lc->dev);
	dm_put_device(ti, lc->logdev);
	kfree(lc);
}

static void normal_map_bio(struct dm_target *ti, struct bio *bio)
{
	struct log_writes_c *lc = ti->private;

	bio_set_dev(bio, lc->dev->bdev);
}

static int log_writes_map(struct dm_target *ti, struct bio *bio)
{
	struct log_writes_c *lc = ti->private;
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
	struct pending_block *block;
	struct bvec_iter iter;
	struct bio_vec bv;
	size_t alloc_size;
	int i = 0;
	bool flush_bio = (bio->bi_opf & REQ_PREFLUSH);
	bool fua_bio = (bio->bi_opf & REQ_FUA);
	bool discard_bio = (bio_op(bio) == REQ_OP_DISCARD);
	bool meta_bio = (bio->bi_opf & REQ_META);

	pb->block = NULL;

	/* Don't bother doing anything if logging has been disabled */
	if (!lc->logging_enabled)
		goto map_bio;

	/*
	 * Map reads as normal.
	 */
	if (bio_data_dir(bio) == READ)
		goto map_bio;

	/* No sectors and not a flush? Don't care */
	if (!bio_sectors(bio) && !flush_bio)
		goto map_bio;

	/*
	 * Discards will have bi_size set but there's no actual data, so just
	 * allocate the size of the pending block.
	 */
	if (discard_bio)
		alloc_size = sizeof(struct pending_block);
	else
		alloc_size = struct_size(block, vecs, bio_segments(bio));

	block = kzalloc(alloc_size, GFP_NOIO);
	if (!block) {
		DMERR("Error allocating pending block");
		spin_lock_irq(&lc->blocks_lock);
		lc->logging_enabled = false;
		spin_unlock_irq(&lc->blocks_lock);
		return DM_MAPIO_KILL;
	}
	INIT_LIST_HEAD(&block->list);
	pb->block = block;
	atomic_inc(&lc->pending_blocks);

	if (flush_bio)
		block->flags |= LOG_FLUSH_FLAG;
	if (fua_bio)
		block->flags |= LOG_FUA_FLAG;
	if (discard_bio)
		block->flags |= LOG_DISCARD_FLAG;
	if (meta_bio)
		block->flags |= LOG_METADATA_FLAG;

	block->sector = bio_to_dev_sectors(lc, bio->bi_iter.bi_sector);
	block->nr_sectors = bio_to_dev_sectors(lc, bio_sectors(bio));

	/* We don't need the data, just submit */
	if (discard_bio) {
		WARN_ON(flush_bio || fua_bio);
		if (lc->device_supports_discard)
			goto map_bio;
		bio_endio(bio);
		return DM_MAPIO_SUBMITTED;
	}

	/* Flush bio, splice the unflushed blocks onto this list and submit */
	if (flush_bio && !bio_sectors(bio)) {
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
		goto map_bio;
	}

	/*
	 * We will write this bio somewhere else way later so we need to copy
	 * the actual contents into new pages so we know the data will always be
	 * there.
	 *
	 * We do this because this could be a bio from O_DIRECT in which case we
	 * can't just hold onto the page until some later point, we have to
	 * manually copy the contents.
	 */
	bio_for_each_segment(bv, bio, iter) {
		struct page *page;
		void *src, *dst;

		page = alloc_page(GFP_NOIO);
		if (!page) {
760*4882a593Smuzhiyun DMERR("Error allocing page");
			free_pending_block(lc, block);
			spin_lock_irq(&lc->blocks_lock);
			lc->logging_enabled = false;
			spin_unlock_irq(&lc->blocks_lock);
			return DM_MAPIO_KILL;
		}

		src = kmap_atomic(bv.bv_page);
		dst = kmap_atomic(page);
		memcpy(dst, src + bv.bv_offset, bv.bv_len);
		kunmap_atomic(dst);
		kunmap_atomic(src);
		block->vecs[i].bv_page = page;
		block->vecs[i].bv_len = bv.bv_len;
		block->vec_cnt++;
		i++;
	}

	/* Had a flush with data in it, weird */
	if (flush_bio) {
		spin_lock_irq(&lc->blocks_lock);
		list_splice_init(&lc->unflushed_blocks, &block->list);
		spin_unlock_irq(&lc->blocks_lock);
	}
map_bio:
	normal_map_bio(ti, bio);
	return DM_MAPIO_REMAPPED;
}

static int normal_end_io(struct dm_target *ti, struct bio *bio,
			 blk_status_t *error)
{
	struct log_writes_c *lc = ti->private;
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));

	if (bio_data_dir(bio) == WRITE && pb->block) {
		struct pending_block *block = pb->block;
		unsigned long flags;

		spin_lock_irqsave(&lc->blocks_lock, flags);
		if (block->flags & LOG_FLUSH_FLAG) {
			list_splice_tail_init(&block->list, &lc->logging_blocks);
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else if (block->flags & LOG_FUA_FLAG) {
			list_add_tail(&block->list, &lc->logging_blocks);
			wake_up_process(lc->log_kthread);
		} else
			list_add_tail(&block->list, &lc->unflushed_blocks);
		spin_unlock_irqrestore(&lc->blocks_lock, flags);
	}

	return DM_ENDIO_DONE;
}

/*
 * INFO format: <logged entries> <highest allocated sector>
 */
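/*
 * For instance (illustrative only; the numbers depend on the run and the
 * "0 <len> log-writes" prefix comes from dmsetup, not this target),
 * "dmsetup status <name>" might print "0 8388608 log-writes 42 335",
 * meaning 42 entries have been logged and 335 is the highest allocated
 * 512-byte sector on the log device, with " logging_disabled" appended
 * once logging has stopped.
 */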
static void log_writes_status(struct dm_target *ti, status_type_t type,
			      unsigned status_flags, char *result,
			      unsigned maxlen)
{
	unsigned sz = 0;
	struct log_writes_c *lc = ti->private;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%llu %llu", lc->logged_entries,
		       (unsigned long long)lc->next_sector - 1);
		if (!lc->logging_enabled)
			DMEMIT(" logging_disabled");
		break;

	case STATUSTYPE_TABLE:
		DMEMIT("%s %s", lc->dev->name, lc->logdev->name);
		break;
	}
}

static int log_writes_prepare_ioctl(struct dm_target *ti,
				    struct block_device **bdev)
{
	struct log_writes_c *lc = ti->private;
	struct dm_dev *dev = lc->dev;

	*bdev = dev->bdev;
	/*
	 * Only pass ioctls through if the device sizes match exactly.
	 */
	if (ti->len != i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT)
		return 1;
	return 0;
}

static int log_writes_iterate_devices(struct dm_target *ti,
				      iterate_devices_callout_fn fn,
				      void *data)
{
	struct log_writes_c *lc = ti->private;

	return fn(ti, lc->dev, 0, ti->len, data);
}

/*
 * Messages supported:
 * mark <mark data> - specify the marked data.
 */
static int log_writes_message(struct dm_target *ti, unsigned argc, char **argv,
			      char *result, unsigned maxlen)
{
	int r = -EINVAL;
	struct log_writes_c *lc = ti->private;

	if (argc != 2) {
		DMWARN("Invalid log-writes message arguments, expect 2 arguments, got %d", argc);
		return r;
	}

	if (!strcasecmp(argv[0], "mark"))
		r = log_mark(lc, argv[1]);
	else
		DMWARN("Unrecognised log writes target message received: %s", argv[0]);

	return r;
}

static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct log_writes_c *lc = ti->private;
	struct request_queue *q = bdev_get_queue(lc->dev->bdev);

	if (!q || !blk_queue_discard(q)) {
		lc->device_supports_discard = false;
		limits->discard_granularity = lc->sectorsize;
		limits->max_discard_sectors = (UINT_MAX >> SECTOR_SHIFT);
	}
	limits->logical_block_size = bdev_logical_block_size(lc->dev->bdev);
	limits->physical_block_size = bdev_physical_block_size(lc->dev->bdev);
	limits->io_min = limits->physical_block_size;
}

#if IS_ENABLED(CONFIG_DAX_DRIVER)
static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
		   struct iov_iter *i)
{
	struct pending_block *block;

	if (!bytes)
		return 0;

	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
	if (!block) {
		DMERR("Error allocating dax pending block");
		return -ENOMEM;
	}

	block->data = kzalloc(bytes, GFP_KERNEL);
	if (!block->data) {
		DMERR("Error allocating dax data space");
		kfree(block);
		return -ENOMEM;
	}

	/* write data provided via the iterator */
	if (!copy_from_iter(block->data, bytes, i)) {
		DMERR("Error copying dax data");
		kfree(block->data);
		kfree(block);
		return -EIO;
	}

	/* rewind the iterator so that the block driver can use it */
	iov_iter_revert(i, bytes);

	block->datalen = bytes;
	block->sector = bio_to_dev_sectors(lc, sector);
	block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift;

	atomic_inc(&lc->pending_blocks);
	spin_lock_irq(&lc->blocks_lock);
	list_add_tail(&block->list, &lc->unflushed_blocks);
	spin_unlock_irq(&lc->blocks_lock);
	wake_up_process(lc->log_kthread);

	return 0;
}

static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
					 long nr_pages, void **kaddr, pfn_t *pfn)
{
	struct log_writes_c *lc = ti->private;
	sector_t sector = pgoff * PAGE_SECTORS;
	int ret;

	ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages * PAGE_SIZE, &pgoff);
	if (ret)
		return ret;
	return dax_direct_access(lc->dev->dax_dev, pgoff, nr_pages, kaddr, pfn);
}

static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
					    pgoff_t pgoff, void *addr, size_t bytes,
					    struct iov_iter *i)
{
	struct log_writes_c *lc = ti->private;
	sector_t sector = pgoff * PAGE_SECTORS;
	int err;

	if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
		return 0;

	/* Don't bother doing anything if logging has been disabled */
	if (!lc->logging_enabled)
		goto dax_copy;

	err = log_dax(lc, sector, bytes, i);
	if (err) {
		DMWARN("Error %d logging DAX write", err);
		return 0;
	}
dax_copy:
	return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
}

static size_t log_writes_dax_copy_to_iter(struct dm_target *ti,
					  pgoff_t pgoff, void *addr, size_t bytes,
					  struct iov_iter *i)
{
	struct log_writes_c *lc = ti->private;
	sector_t sector = pgoff * PAGE_SECTORS;

	if (bdev_dax_pgoff(lc->dev->bdev, sector, ALIGN(bytes, PAGE_SIZE), &pgoff))
		return 0;
	return dax_copy_to_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
}

static int log_writes_dax_zero_page_range(struct dm_target *ti, pgoff_t pgoff,
					  size_t nr_pages)
{
	int ret;
	struct log_writes_c *lc = ti->private;
	sector_t sector = pgoff * PAGE_SECTORS;

	ret = bdev_dax_pgoff(lc->dev->bdev, sector, nr_pages << PAGE_SHIFT,
			     &pgoff);
	if (ret)
		return ret;
	return dax_zero_page_range(lc->dev->dax_dev, pgoff,
				   nr_pages << PAGE_SHIFT);
}

#else
#define log_writes_dax_direct_access NULL
#define log_writes_dax_copy_from_iter NULL
#define log_writes_dax_copy_to_iter NULL
#define log_writes_dax_zero_page_range NULL
#endif

static struct target_type log_writes_target = {
	.name   = "log-writes",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr    = log_writes_ctr,
	.dtr    = log_writes_dtr,
	.map    = log_writes_map,
	.end_io = normal_end_io,
	.status = log_writes_status,
	.prepare_ioctl = log_writes_prepare_ioctl,
	.message = log_writes_message,
	.iterate_devices = log_writes_iterate_devices,
	.io_hints = log_writes_io_hints,
	.direct_access = log_writes_dax_direct_access,
	.dax_copy_from_iter = log_writes_dax_copy_from_iter,
	.dax_copy_to_iter = log_writes_dax_copy_to_iter,
	.dax_zero_page_range = log_writes_dax_zero_page_range,
};

static int __init dm_log_writes_init(void)
{
	int r = dm_register_target(&log_writes_target);

	if (r < 0)
		DMERR("register failed %d", r);

	return r;
}

static void __exit dm_log_writes_exit(void)
{
	dm_unregister_target(&log_writes_target);
}

module_init(dm_log_writes_init);
module_exit(dm_log_writes_exit);

MODULE_DESCRIPTION(DM_NAME " log writes target");
MODULE_AUTHOR("Josef Bacik <jbacik@fb.com>");
MODULE_LICENSE("GPL");