/* xref: /OK3568_Linux_fs/kernel/drivers/md/dm-cache-target.c (revision 4882a59341e53eb6f0b4789bf948001014eff981) */
/*
 * Copyright (C) 2012 Red Hat. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"
#include "dm-bio-prison-v2.h"
#include "dm-bio-record.h"
#include "dm-cache-metadata.h"

#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/jiffies.h>
#include <linux/init.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/rwsem.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#define DM_MSG_PREFIX "cache"

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
	"A percentage of time allocated for copying to and/or from cache");

/*----------------------------------------------------------------*/

/*
 * Glossary:
 *
 * oblock: index of an origin block
 * cblock: index of a cache block
 * promotion: movement of a block from origin to cache
 * demotion: movement of a block from cache to origin
 * migration: movement of a block between the origin and cache device,
 *	      either direction
 */

/*----------------------------------------------------------------*/

struct io_tracker {
	spinlock_t lock;

	/*
	 * Sectors of in-flight IO.
	 */
	sector_t in_flight;

	/*
	 * The time, in jiffies, when this device became idle (if it is
	 * indeed idle).
	 */
	unsigned long idle_time;
	unsigned long last_update_time;
};

static void iot_init(struct io_tracker *iot)
{
	spin_lock_init(&iot->lock);
	iot->in_flight = 0ul;
	iot->idle_time = 0ul;
	iot->last_update_time = jiffies;
}

static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
	if (iot->in_flight)
		return false;

	return time_after(jiffies, iot->idle_time + jifs);
}

static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
{
	bool r;

	spin_lock_irq(&iot->lock);
	r = __iot_idle_for(iot, jifs);
	spin_unlock_irq(&iot->lock);

	return r;
}

static void iot_io_begin(struct io_tracker *iot, sector_t len)
{
	spin_lock_irq(&iot->lock);
	iot->in_flight += len;
	spin_unlock_irq(&iot->lock);
}

static void __iot_io_end(struct io_tracker *iot, sector_t len)
{
	if (!len)
		return;

	iot->in_flight -= len;
	if (!iot->in_flight)
		iot->idle_time = jiffies;
}

static void iot_io_end(struct io_tracker *iot, sector_t len)
{
	unsigned long flags;

	spin_lock_irqsave(&iot->lock, flags);
	__iot_io_end(iot, len);
	spin_unlock_irqrestore(&iot->lock, flags);
}
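
/*
 * Illustrative usage sketch (not part of the original source): the cache
 * embeds one io_tracker ('tracker' in struct cache below).  Accounted bios
 * call iot_io_begin() when submitted and iot_io_end() on completion, so a
 * background worker can ask something like iot_idle_for(&cache->tracker, HZ)
 * to check whether the device has seen no accounted IO for roughly a second
 * before starting optional work.  The HZ threshold here is only an example.
 */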

/*----------------------------------------------------------------*/

/*
 * Represents a chunk of future work.  'input' allows continuations to pass
 * values between themselves, typically error values.
 */
struct continuation {
	struct work_struct ws;
	blk_status_t input;
};

static inline void init_continuation(struct continuation *k,
				     void (*fn)(struct work_struct *))
{
	INIT_WORK(&k->ws, fn);
	k->input = 0;
}

static inline void queue_continuation(struct workqueue_struct *wq,
				      struct continuation *k)
{
	queue_work(wq, &k->ws);
}
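
/*
 * Illustrative sketch (not part of the original source): a migration embeds
 * a continuation and chains its steps through the workqueue, roughly:
 *
 *	init_continuation(&mg->k, next_step_fn);
 *	// kick off async work; its completion sets mg->k.input on error,
 *	// then requeues the continuation:
 *	queue_continuation(cache->wq, &mg->k);
 *
 * next_step_fn() recovers the migration via container_of() (see ws_to_mg()
 * later in this file) and inspects k.input for the previous step's status.
 * 'next_step_fn' is a placeholder name, not a function in this driver.
 */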

/*----------------------------------------------------------------*/

/*
 * The batcher collects together pieces of work that need a particular
 * operation to occur before they can proceed (typically a commit).
 */
struct batcher {
	/*
	 * The operation that everyone is waiting for.
	 */
	blk_status_t (*commit_op)(void *context);
	void *commit_context;

	/*
	 * This is how bios should be issued once the commit op is complete
	 * (accounted_request).
	 */
	void (*issue_op)(struct bio *bio, void *context);
	void *issue_context;

	/*
	 * Queued work gets put on here after commit.
	 */
	struct workqueue_struct *wq;

	spinlock_t lock;
	struct list_head work_items;
	struct bio_list bios;
	struct work_struct commit_work;

	bool commit_scheduled;
};

static void __commit(struct work_struct *_ws)
{
	struct batcher *b = container_of(_ws, struct batcher, commit_work);
	blk_status_t r;
	struct list_head work_items;
	struct work_struct *ws, *tmp;
	struct continuation *k;
	struct bio *bio;
	struct bio_list bios;

	INIT_LIST_HEAD(&work_items);
	bio_list_init(&bios);

	/*
	 * We have to grab these before the commit_op to avoid a race
	 * condition.
	 */
	spin_lock_irq(&b->lock);
	list_splice_init(&b->work_items, &work_items);
	bio_list_merge(&bios, &b->bios);
	bio_list_init(&b->bios);
	b->commit_scheduled = false;
	spin_unlock_irq(&b->lock);

	r = b->commit_op(b->commit_context);

	list_for_each_entry_safe(ws, tmp, &work_items, entry) {
		k = container_of(ws, struct continuation, ws);
		k->input = r;
		INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
		queue_work(b->wq, ws);
	}

	while ((bio = bio_list_pop(&bios))) {
		if (r) {
			bio->bi_status = r;
			bio_endio(bio);
		} else
			b->issue_op(bio, b->issue_context);
	}
}

static void batcher_init(struct batcher *b,
			 blk_status_t (*commit_op)(void *),
			 void *commit_context,
			 void (*issue_op)(struct bio *bio, void *),
			 void *issue_context,
			 struct workqueue_struct *wq)
{
	b->commit_op = commit_op;
	b->commit_context = commit_context;
	b->issue_op = issue_op;
	b->issue_context = issue_context;
	b->wq = wq;

	spin_lock_init(&b->lock);
	INIT_LIST_HEAD(&b->work_items);
	bio_list_init(&b->bios);
	INIT_WORK(&b->commit_work, __commit);
	b->commit_scheduled = false;
}

static void async_commit(struct batcher *b)
{
	queue_work(b->wq, &b->commit_work);
}

static void continue_after_commit(struct batcher *b, struct continuation *k)
{
	bool commit_scheduled;

	spin_lock_irq(&b->lock);
	commit_scheduled = b->commit_scheduled;
	list_add_tail(&k->ws.entry, &b->work_items);
	spin_unlock_irq(&b->lock);

	if (commit_scheduled)
		async_commit(b);
}

/*
 * Bios are errored if commit failed.
 */
static void issue_after_commit(struct batcher *b, struct bio *bio)
{
	bool commit_scheduled;

	spin_lock_irq(&b->lock);
	commit_scheduled = b->commit_scheduled;
	bio_list_add(&b->bios, bio);
	spin_unlock_irq(&b->lock);

	if (commit_scheduled)
		async_commit(b);
}

/*
 * Call this if some urgent work is waiting for the commit to complete.
 */
static void schedule_commit(struct batcher *b)
{
	bool immediate;

	spin_lock_irq(&b->lock);
	immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
	b->commit_scheduled = true;
	spin_unlock_irq(&b->lock);

	if (immediate)
		async_commit(b);
}
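
/*
 * Illustrative sketch (not part of the original source): the cache embeds a
 * single batcher as 'committer', set up roughly as
 *
 *	batcher_init(&cache->committer, commit_op, cache,
 *		     issue_op, cache, cache->wq);
 *
 * where the commit op flushes the metadata and the issue op submits a
 * remapped bio (see issue_op() later in this file).  Work and bios that must
 * wait for a commit are queued with continue_after_commit() and
 * issue_after_commit(); schedule_commit() is called when something is
 * waiting, and __commit() then releases everything queued so far,
 * propagating any commit error.
 */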

/*
 * There are a couple of places where we let a bio run, but want to do some
 * work before calling its endio function.  We do this by temporarily
 * changing the endio fn.
 */
struct dm_hook_info {
	bio_end_io_t *bi_end_io;
};

static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
			bio_end_io_t *bi_end_io, void *bi_private)
{
	h->bi_end_io = bio->bi_end_io;

	bio->bi_end_io = bi_end_io;
	bio->bi_private = bi_private;
}

static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
{
	bio->bi_end_io = h->bi_end_io;
}
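
/*
 * Illustrative sketch (not part of the original source): the overwrite path
 * later in this file uses the pair as
 *
 *	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
 *	// ...submit the bio...
 *
 * and overwrite_endio() calls dm_unhook_bio(&pb->hook_info, bio) to restore
 * the original completion handler before continuing the migration.
 */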

/*----------------------------------------------------------------*/

#define MIGRATION_POOL_SIZE 128
#define COMMIT_PERIOD HZ
#define MIGRATION_COUNT_WINDOW 10

/*
 * The block size of the device holding cache data must be
 * between 32KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

enum cache_metadata_mode {
	CM_WRITE,		/* metadata may be changed */
	CM_READ_ONLY,		/* metadata may not be changed */
	CM_FAIL
};

enum cache_io_mode {
	/*
	 * Data is written to cached blocks only.  These blocks are marked
	 * dirty.  If you lose the cache device you will lose data.
	 * Potential performance increase for both reads and writes.
	 */
	CM_IO_WRITEBACK,

	/*
	 * Data is written to both cache and origin.  Blocks are never
	 * dirty.  Potential performance benefit for reads only.
	 */
	CM_IO_WRITETHROUGH,

	/*
	 * A degraded mode useful for various cache coherency situations
	 * (eg, rolling back snapshots).  Reads and writes always go to the
	 * origin.  If a write goes to a cached oblock, then the cache
	 * block is invalidated.
	 */
	CM_IO_PASSTHROUGH
};

struct cache_features {
	enum cache_metadata_mode mode;
	enum cache_io_mode io_mode;
	unsigned metadata_version;
	bool discard_passdown:1;
};

struct cache_stats {
	atomic_t read_hit;
	atomic_t read_miss;
	atomic_t write_hit;
	atomic_t write_miss;
	atomic_t demotion;
	atomic_t promotion;
	atomic_t writeback;
	atomic_t copies_avoided;
	atomic_t cache_cell_clash;
	atomic_t commit_count;
	atomic_t discard_count;
};

struct cache {
	struct dm_target *ti;
	spinlock_t lock;

	/*
	 * Fields for converting from sectors to blocks.
	 */
	int sectors_per_block_shift;
	sector_t sectors_per_block;

	struct dm_cache_metadata *cmd;

	/*
	 * Metadata is written to this device.
	 */
	struct dm_dev *metadata_dev;

	/*
	 * The slower of the two data devices.  Typically a spindle.
	 */
	struct dm_dev *origin_dev;

	/*
	 * The faster of the two data devices.  Typically an SSD.
	 */
	struct dm_dev *cache_dev;

	/*
	 * Size of the origin device in _complete_ blocks and native sectors.
	 */
	dm_oblock_t origin_blocks;
	sector_t origin_sectors;

	/*
	 * Size of the cache device in blocks.
	 */
	dm_cblock_t cache_size;

	/*
	 * Invalidation fields.
	 */
	spinlock_t invalidation_lock;
	struct list_head invalidation_requests;

	sector_t migration_threshold;
	wait_queue_head_t migration_wait;
	atomic_t nr_allocated_migrations;

	/*
	 * The number of in flight migrations that are performing
	 * background io. eg, promotion, writeback.
	 */
	atomic_t nr_io_migrations;

	struct bio_list deferred_bios;

	struct rw_semaphore quiesce_lock;

	/*
	 * origin_blocks entries, discarded if set.
	 */
	dm_dblock_t discard_nr_blocks;
	unsigned long *discard_bitset;
	uint32_t discard_block_size; /* a power of 2 times sectors per block */

	/*
	 * Rather than reconstructing the table line for the status we just
	 * save it and regurgitate.
	 */
	unsigned nr_ctr_args;
	const char **ctr_args;

	struct dm_kcopyd_client *copier;
	struct work_struct deferred_bio_worker;
	struct work_struct migration_worker;
	struct workqueue_struct *wq;
	struct delayed_work waker;
	struct dm_bio_prison_v2 *prison;

	/*
	 * cache_size entries, dirty if set
	 */
	unsigned long *dirty_bitset;
	atomic_t nr_dirty;

	unsigned policy_nr_args;
	struct dm_cache_policy *policy;

	/*
	 * Cache features such as write-through.
	 */
	struct cache_features features;

	struct cache_stats stats;

	bool need_tick_bio:1;
	bool sized:1;
	bool invalidate:1;
	bool commit_requested:1;
	bool loaded_mappings:1;
	bool loaded_discards:1;

	struct rw_semaphore background_work_lock;

	struct batcher committer;
	struct work_struct commit_ws;

	struct io_tracker tracker;

	mempool_t migration_pool;

	struct bio_set bs;
};

struct per_bio_data {
	bool tick:1;
	unsigned req_nr:2;
	struct dm_bio_prison_cell_v2 *cell;
	struct dm_hook_info hook_info;
	sector_t len;
};

struct dm_cache_migration {
	struct continuation k;
	struct cache *cache;

	struct policy_work *op;
	struct bio *overwrite_bio;
	struct dm_bio_prison_cell_v2 *cell;

	dm_cblock_t invalidate_cblock;
	dm_oblock_t invalidate_oblock;
};

/*----------------------------------------------------------------*/

static bool writethrough_mode(struct cache *cache)
{
	return cache->features.io_mode == CM_IO_WRITETHROUGH;
}

static bool writeback_mode(struct cache *cache)
{
	return cache->features.io_mode == CM_IO_WRITEBACK;
}

static inline bool passthrough_mode(struct cache *cache)
{
	return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
}

/*----------------------------------------------------------------*/

static void wake_deferred_bio_worker(struct cache *cache)
{
	queue_work(cache->wq, &cache->deferred_bio_worker);
}

static void wake_migration_worker(struct cache *cache)
{
	if (passthrough_mode(cache))
		return;

	queue_work(cache->wq, &cache->migration_worker);
}

/*----------------------------------------------------------------*/

static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
{
	return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO);
}

static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
{
	dm_bio_prison_free_cell_v2(cache->prison, cell);
}

static struct dm_cache_migration *alloc_migration(struct cache *cache)
{
	struct dm_cache_migration *mg;

	mg = mempool_alloc(&cache->migration_pool, GFP_NOIO);

	memset(mg, 0, sizeof(*mg));

	mg->cache = cache;
	atomic_inc(&cache->nr_allocated_migrations);

	return mg;
}

static void free_migration(struct dm_cache_migration *mg)
{
	struct cache *cache = mg->cache;

	if (atomic_dec_and_test(&cache->nr_allocated_migrations))
		wake_up(&cache->migration_wait);

	mempool_free(mg, &cache->migration_pool);
}

/*----------------------------------------------------------------*/

static inline dm_oblock_t oblock_succ(dm_oblock_t b)
{
	return to_oblock(from_oblock(b) + 1ull);
}

static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
{
	key->virtual = 0;
	key->dev = 0;
	key->block_begin = from_oblock(begin);
	key->block_end = from_oblock(end);
}

/*
 * We have two lock levels.  Level 0, which is used to prevent WRITEs, and
 * level 1 which prevents *both* READs and WRITEs.
 */
#define WRITE_LOCK_LEVEL 0
#define READ_WRITE_LOCK_LEVEL 1

static unsigned lock_level(struct bio *bio)
{
	return bio_data_dir(bio) == WRITE ?
		WRITE_LOCK_LEVEL :
		READ_WRITE_LOCK_LEVEL;
}

/*----------------------------------------------------------------
 * Per bio data
 *--------------------------------------------------------------*/

static struct per_bio_data *get_per_bio_data(struct bio *bio)
{
	struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
	BUG_ON(!pb);
	return pb;
}

static struct per_bio_data *init_per_bio_data(struct bio *bio)
{
	struct per_bio_data *pb = get_per_bio_data(bio);

	pb->tick = false;
	pb->req_nr = dm_bio_get_target_bio_nr(bio);
	pb->cell = NULL;
	pb->len = 0;

	return pb;
}

/*----------------------------------------------------------------*/

static void defer_bio(struct cache *cache, struct bio *bio)
{
	spin_lock_irq(&cache->lock);
	bio_list_add(&cache->deferred_bios, bio);
	spin_unlock_irq(&cache->lock);

	wake_deferred_bio_worker(cache);
}

static void defer_bios(struct cache *cache, struct bio_list *bios)
{
	spin_lock_irq(&cache->lock);
	bio_list_merge(&cache->deferred_bios, bios);
	bio_list_init(bios);
	spin_unlock_irq(&cache->lock);

	wake_deferred_bio_worker(cache);
}

/*----------------------------------------------------------------*/

static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
{
	bool r;
	struct per_bio_data *pb;
	struct dm_cell_key_v2 key;
	dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
	struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;

	cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */

	build_key(oblock, end, &key);
	r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
	if (!r) {
		/*
		 * Failed to get the lock.
		 */
		free_prison_cell(cache, cell_prealloc);
		return r;
	}

	if (cell != cell_prealloc)
		free_prison_cell(cache, cell_prealloc);

	pb = get_per_bio_data(bio);
	pb->cell = cell;

	return r;
}

/*----------------------------------------------------------------*/

static bool is_dirty(struct cache *cache, dm_cblock_t b)
{
	return test_bit(from_cblock(b), cache->dirty_bitset);
}

static void set_dirty(struct cache *cache, dm_cblock_t cblock)
{
	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
		atomic_inc(&cache->nr_dirty);
		policy_set_dirty(cache->policy, cblock);
	}
}

/*
 * These two are called when setting after migrations to force the policy
 * and dirty bitset to be in sync.
 */
static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
{
	if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
		atomic_inc(&cache->nr_dirty);
	policy_set_dirty(cache->policy, cblock);
}

static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
{
	if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
		if (atomic_dec_return(&cache->nr_dirty) == 0)
			dm_table_event(cache->ti->table);
	}

	policy_clear_dirty(cache->policy, cblock);
}

/*----------------------------------------------------------------*/

static bool block_size_is_power_of_two(struct cache *cache)
{
	return cache->sectors_per_block_shift >= 0;
}

static dm_block_t block_div(dm_block_t b, uint32_t n)
{
	do_div(b, n);

	return b;
}

static dm_block_t oblocks_per_dblock(struct cache *cache)
{
	dm_block_t oblocks = cache->discard_block_size;

	if (block_size_is_power_of_two(cache))
		oblocks >>= cache->sectors_per_block_shift;
	else
		oblocks = block_div(oblocks, cache->sectors_per_block);

	return oblocks;
}

static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
{
	return to_dblock(block_div(from_oblock(oblock),
				   oblocks_per_dblock(cache)));
}
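
/*
 * Worked example (illustrative numbers only): with a cache block of 128
 * sectors (64KiB) and discard_block_size = 2048 sectors (1MiB),
 * oblocks_per_dblock() returns 2048 / 128 = 16, so oblock 100 maps to
 * dblock 100 / 16 = 6 in oblock_to_dblock().
 */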

static void set_discard(struct cache *cache, dm_dblock_t b)
{
	BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
	atomic_inc(&cache->stats.discard_count);

	spin_lock_irq(&cache->lock);
	set_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irq(&cache->lock);
}

static void clear_discard(struct cache *cache, dm_dblock_t b)
{
	spin_lock_irq(&cache->lock);
	clear_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irq(&cache->lock);
}

static bool is_discarded(struct cache *cache, dm_dblock_t b)
{
	int r;
	spin_lock_irq(&cache->lock);
	r = test_bit(from_dblock(b), cache->discard_bitset);
	spin_unlock_irq(&cache->lock);

	return r;
}

static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
{
	int r;
	spin_lock_irq(&cache->lock);
	r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
		     cache->discard_bitset);
	spin_unlock_irq(&cache->lock);

	return r;
}

/*----------------------------------------------------------------
 * Remapping
 *--------------------------------------------------------------*/
static void remap_to_origin(struct cache *cache, struct bio *bio)
{
	bio_set_dev(bio, cache->origin_dev->bdev);
}

static void remap_to_cache(struct cache *cache, struct bio *bio,
			   dm_cblock_t cblock)
{
	sector_t bi_sector = bio->bi_iter.bi_sector;
	sector_t block = from_cblock(cblock);

	bio_set_dev(bio, cache->cache_dev->bdev);
	if (!block_size_is_power_of_two(cache))
		bio->bi_iter.bi_sector =
			(block * cache->sectors_per_block) +
			sector_div(bi_sector, cache->sectors_per_block);
	else
		bio->bi_iter.bi_sector =
			(block << cache->sectors_per_block_shift) |
			(bi_sector & (cache->sectors_per_block - 1));
}
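
/*
 * Worked example (illustrative numbers only): with sectors_per_block = 128
 * (sectors_per_block_shift = 7), a bio at sector 12345 remapped to cblock 5
 * lands at cache sector (5 << 7) | (12345 & 127) = 640 + 57 = 697.
 */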

static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
{
	struct per_bio_data *pb;

	spin_lock_irq(&cache->lock);
	if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
	    bio_op(bio) != REQ_OP_DISCARD) {
		pb = get_per_bio_data(bio);
		pb->tick = true;
		cache->need_tick_bio = false;
	}
	spin_unlock_irq(&cache->lock);
}

static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
					    dm_oblock_t oblock, bool bio_has_pbd)
{
	if (bio_has_pbd)
		check_if_tick_bio_needed(cache, bio);
	remap_to_origin(cache, bio);
	if (bio_data_dir(bio) == WRITE)
		clear_discard(cache, oblock_to_dblock(cache, oblock));
}

static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
					  dm_oblock_t oblock)
{
	// FIXME: check_if_tick_bio_needed() is called way too much through this interface
	__remap_to_origin_clear_discard(cache, bio, oblock, true);
}

static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
				 dm_oblock_t oblock, dm_cblock_t cblock)
{
	check_if_tick_bio_needed(cache, bio);
	remap_to_cache(cache, bio, cblock);
	if (bio_data_dir(bio) == WRITE) {
		set_dirty(cache, cblock);
		clear_discard(cache, oblock_to_dblock(cache, oblock));
	}
}

static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
{
	sector_t block_nr = bio->bi_iter.bi_sector;

	if (!block_size_is_power_of_two(cache))
		(void) sector_div(block_nr, cache->sectors_per_block);
	else
		block_nr >>= cache->sectors_per_block_shift;

	return to_oblock(block_nr);
}

static bool accountable_bio(struct cache *cache, struct bio *bio)
{
	return bio_op(bio) != REQ_OP_DISCARD;
}

static void accounted_begin(struct cache *cache, struct bio *bio)
{
	struct per_bio_data *pb;

	if (accountable_bio(cache, bio)) {
		pb = get_per_bio_data(bio);
		pb->len = bio_sectors(bio);
		iot_io_begin(&cache->tracker, pb->len);
	}
}

static void accounted_complete(struct cache *cache, struct bio *bio)
{
	struct per_bio_data *pb = get_per_bio_data(bio);

	iot_io_end(&cache->tracker, pb->len);
}

static void accounted_request(struct cache *cache, struct bio *bio)
{
	accounted_begin(cache, bio);
	submit_bio_noacct(bio);
}

static void issue_op(struct bio *bio, void *context)
{
	struct cache *cache = context;
	accounted_request(cache, bio);
}

/*
 * When running in writethrough mode we need to send writes to clean blocks
 * to both the cache and origin devices.  Clone the bio and send them in parallel.
 */
static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
				      dm_oblock_t oblock, dm_cblock_t cblock)
{
	struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs);

	BUG_ON(!origin_bio);

	bio_chain(origin_bio, bio);
	/*
	 * Passing false to __remap_to_origin_clear_discard() skips
	 * all code that might use per_bio_data (since clone doesn't have it)
	 */
	__remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
	submit_bio(origin_bio);

	remap_to_cache(cache, bio, cblock);
}

/*----------------------------------------------------------------
 * Failure modes
 *--------------------------------------------------------------*/
static enum cache_metadata_mode get_cache_mode(struct cache *cache)
{
	return cache->features.mode;
}

static const char *cache_device_name(struct cache *cache)
{
	return dm_table_device_name(cache->ti->table);
}

static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
{
	const char *descs[] = {
		"write",
		"read-only",
		"fail"
	};

	dm_table_event(cache->ti->table);
	DMINFO("%s: switching cache to %s mode",
	       cache_device_name(cache), descs[(int)mode]);
}

static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
{
	bool needs_check;
	enum cache_metadata_mode old_mode = get_cache_mode(cache);

	if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
		DMERR("%s: unable to read needs_check flag, setting failure mode.",
		      cache_device_name(cache));
		new_mode = CM_FAIL;
	}

	if (new_mode == CM_WRITE && needs_check) {
		DMERR("%s: unable to switch cache to write mode until repaired.",
		      cache_device_name(cache));
		if (old_mode != new_mode)
			new_mode = old_mode;
		else
			new_mode = CM_READ_ONLY;
	}

	/* Never move out of fail mode */
	if (old_mode == CM_FAIL)
		new_mode = CM_FAIL;

	switch (new_mode) {
	case CM_FAIL:
	case CM_READ_ONLY:
		dm_cache_metadata_set_read_only(cache->cmd);
		break;

	case CM_WRITE:
		dm_cache_metadata_set_read_write(cache->cmd);
		break;
	}

	cache->features.mode = new_mode;

	if (new_mode != old_mode)
		notify_mode_switch(cache, new_mode);
}

static void abort_transaction(struct cache *cache)
{
	const char *dev_name = cache_device_name(cache);

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return;

	if (dm_cache_metadata_set_needs_check(cache->cmd)) {
		DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
		set_cache_mode(cache, CM_FAIL);
	}

	DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
	if (dm_cache_metadata_abort(cache->cmd)) {
		DMERR("%s: failed to abort metadata transaction", dev_name);
		set_cache_mode(cache, CM_FAIL);
	}
}

static void metadata_operation_failed(struct cache *cache, const char *op, int r)
{
	DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
		    cache_device_name(cache), op, r);
	abort_transaction(cache);
	set_cache_mode(cache, CM_READ_ONLY);
}

/*----------------------------------------------------------------*/

static void load_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	dm_cache_metadata_get_stats(cache->cmd, &stats);
	atomic_set(&cache->stats.read_hit, stats.read_hits);
	atomic_set(&cache->stats.read_miss, stats.read_misses);
	atomic_set(&cache->stats.write_hit, stats.write_hits);
	atomic_set(&cache->stats.write_miss, stats.write_misses);
}

static void save_stats(struct cache *cache)
{
	struct dm_cache_statistics stats;

	if (get_cache_mode(cache) >= CM_READ_ONLY)
		return;

	stats.read_hits = atomic_read(&cache->stats.read_hit);
	stats.read_misses = atomic_read(&cache->stats.read_miss);
	stats.write_hits = atomic_read(&cache->stats.write_hit);
	stats.write_misses = atomic_read(&cache->stats.write_miss);

	dm_cache_metadata_set_stats(cache->cmd, &stats);
}

static void update_stats(struct cache_stats *stats, enum policy_operation op)
{
	switch (op) {
	case POLICY_PROMOTE:
		atomic_inc(&stats->promotion);
		break;

	case POLICY_DEMOTE:
		atomic_inc(&stats->demotion);
		break;

	case POLICY_WRITEBACK:
		atomic_inc(&stats->writeback);
		break;
	}
}

/*----------------------------------------------------------------
 * Migration processing
 *
 * Migration covers moving data from the origin device to the cache, or
 * vice versa.
 *--------------------------------------------------------------*/

static void inc_io_migrations(struct cache *cache)
{
	atomic_inc(&cache->nr_io_migrations);
}

static void dec_io_migrations(struct cache *cache)
{
	atomic_dec(&cache->nr_io_migrations);
}

static bool discard_or_flush(struct bio *bio)
{
	return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
}

static void calc_discard_block_range(struct cache *cache, struct bio *bio,
				     dm_dblock_t *b, dm_dblock_t *e)
{
	sector_t sb = bio->bi_iter.bi_sector;
	sector_t se = bio_end_sector(bio);

	*b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));

	if (se - sb < cache->discard_block_size)
		*e = *b;
	else
		*e = to_dblock(block_div(se, cache->discard_block_size));
}
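
/*
 * Worked example (illustrative numbers only): with discard_block_size =
 * 2048 sectors, a discard covering sectors [3000, 9000) yields
 * b = ceil(3000 / 2048) = 2 and e = 9000 / 2048 = 4, so only the fully
 * covered discard blocks 2 and 3 are marked; partially covered blocks at
 * either end are left alone.
 */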
1088*4882a593Smuzhiyun 
1089*4882a593Smuzhiyun /*----------------------------------------------------------------*/
1090*4882a593Smuzhiyun 
prevent_background_work(struct cache * cache)1091*4882a593Smuzhiyun static void prevent_background_work(struct cache *cache)
1092*4882a593Smuzhiyun {
1093*4882a593Smuzhiyun 	lockdep_off();
1094*4882a593Smuzhiyun 	down_write(&cache->background_work_lock);
1095*4882a593Smuzhiyun 	lockdep_on();
1096*4882a593Smuzhiyun }
1097*4882a593Smuzhiyun 
allow_background_work(struct cache * cache)1098*4882a593Smuzhiyun static void allow_background_work(struct cache *cache)
1099*4882a593Smuzhiyun {
1100*4882a593Smuzhiyun 	lockdep_off();
1101*4882a593Smuzhiyun 	up_write(&cache->background_work_lock);
1102*4882a593Smuzhiyun 	lockdep_on();
1103*4882a593Smuzhiyun }
1104*4882a593Smuzhiyun 
background_work_begin(struct cache * cache)1105*4882a593Smuzhiyun static bool background_work_begin(struct cache *cache)
1106*4882a593Smuzhiyun {
1107*4882a593Smuzhiyun 	bool r;
1108*4882a593Smuzhiyun 
1109*4882a593Smuzhiyun 	lockdep_off();
1110*4882a593Smuzhiyun 	r = down_read_trylock(&cache->background_work_lock);
1111*4882a593Smuzhiyun 	lockdep_on();
1112*4882a593Smuzhiyun 
1113*4882a593Smuzhiyun 	return r;
1114*4882a593Smuzhiyun }
1115*4882a593Smuzhiyun 
background_work_end(struct cache * cache)1116*4882a593Smuzhiyun static void background_work_end(struct cache *cache)
1117*4882a593Smuzhiyun {
1118*4882a593Smuzhiyun 	lockdep_off();
1119*4882a593Smuzhiyun 	up_read(&cache->background_work_lock);
1120*4882a593Smuzhiyun 	lockdep_on();
1121*4882a593Smuzhiyun }
1122*4882a593Smuzhiyun 
1123*4882a593Smuzhiyun /*----------------------------------------------------------------*/
1124*4882a593Smuzhiyun 
bio_writes_complete_block(struct cache * cache,struct bio * bio)1125*4882a593Smuzhiyun static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1126*4882a593Smuzhiyun {
1127*4882a593Smuzhiyun 	return (bio_data_dir(bio) == WRITE) &&
1128*4882a593Smuzhiyun 		(bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1129*4882a593Smuzhiyun }
1130*4882a593Smuzhiyun 
optimisable_bio(struct cache * cache,struct bio * bio,dm_oblock_t block)1131*4882a593Smuzhiyun static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
1132*4882a593Smuzhiyun {
1133*4882a593Smuzhiyun 	return writeback_mode(cache) &&
1134*4882a593Smuzhiyun 		(is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
1135*4882a593Smuzhiyun }
1136*4882a593Smuzhiyun 
quiesce(struct dm_cache_migration * mg,void (* continuation)(struct work_struct *))1137*4882a593Smuzhiyun static void quiesce(struct dm_cache_migration *mg,
1138*4882a593Smuzhiyun 		    void (*continuation)(struct work_struct *))
1139*4882a593Smuzhiyun {
1140*4882a593Smuzhiyun 	init_continuation(&mg->k, continuation);
1141*4882a593Smuzhiyun 	dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
1142*4882a593Smuzhiyun }
1143*4882a593Smuzhiyun 
ws_to_mg(struct work_struct * ws)1144*4882a593Smuzhiyun static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
1145*4882a593Smuzhiyun {
1146*4882a593Smuzhiyun 	struct continuation *k = container_of(ws, struct continuation, ws);
1147*4882a593Smuzhiyun 	return container_of(k, struct dm_cache_migration, k);
1148*4882a593Smuzhiyun }
1149*4882a593Smuzhiyun 
copy_complete(int read_err,unsigned long write_err,void * context)1150*4882a593Smuzhiyun static void copy_complete(int read_err, unsigned long write_err, void *context)
1151*4882a593Smuzhiyun {
1152*4882a593Smuzhiyun 	struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
1153*4882a593Smuzhiyun 
1154*4882a593Smuzhiyun 	if (read_err || write_err)
1155*4882a593Smuzhiyun 		mg->k.input = BLK_STS_IOERR;
1156*4882a593Smuzhiyun 
1157*4882a593Smuzhiyun 	queue_continuation(mg->cache->wq, &mg->k);
1158*4882a593Smuzhiyun }
1159*4882a593Smuzhiyun 
copy(struct dm_cache_migration * mg,bool promote)1160*4882a593Smuzhiyun static void copy(struct dm_cache_migration *mg, bool promote)
1161*4882a593Smuzhiyun {
1162*4882a593Smuzhiyun 	struct dm_io_region o_region, c_region;
1163*4882a593Smuzhiyun 	struct cache *cache = mg->cache;
1164*4882a593Smuzhiyun 
1165*4882a593Smuzhiyun 	o_region.bdev = cache->origin_dev->bdev;
1166*4882a593Smuzhiyun 	o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
1167*4882a593Smuzhiyun 	o_region.count = cache->sectors_per_block;
1168*4882a593Smuzhiyun 
1169*4882a593Smuzhiyun 	c_region.bdev = cache->cache_dev->bdev;
1170*4882a593Smuzhiyun 	c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
1171*4882a593Smuzhiyun 	c_region.count = cache->sectors_per_block;
1172*4882a593Smuzhiyun 
1173*4882a593Smuzhiyun 	if (promote)
1174*4882a593Smuzhiyun 		dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
1175*4882a593Smuzhiyun 	else
1176*4882a593Smuzhiyun 		dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
1177*4882a593Smuzhiyun }
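
/*
 * copy() runs asynchronously via kcopyd: a promotion copies the block from
 * the origin region to the cache region, otherwise the data flows the
 * other way.  copy_complete() is invoked from kcopyd's context; it records
 * any error in k.input and bounces the continuation onto the cache's
 * workqueue.
 */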
1178*4882a593Smuzhiyun 
bio_drop_shared_lock(struct cache * cache,struct bio * bio)1179*4882a593Smuzhiyun static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
1180*4882a593Smuzhiyun {
1181*4882a593Smuzhiyun 	struct per_bio_data *pb = get_per_bio_data(bio);
1182*4882a593Smuzhiyun 
1183*4882a593Smuzhiyun 	if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
1184*4882a593Smuzhiyun 		free_prison_cell(cache, pb->cell);
1185*4882a593Smuzhiyun 	pb->cell = NULL;
1186*4882a593Smuzhiyun }
1187*4882a593Smuzhiyun 
overwrite_endio(struct bio * bio)1188*4882a593Smuzhiyun static void overwrite_endio(struct bio *bio)
1189*4882a593Smuzhiyun {
1190*4882a593Smuzhiyun 	struct dm_cache_migration *mg = bio->bi_private;
1191*4882a593Smuzhiyun 	struct cache *cache = mg->cache;
1192*4882a593Smuzhiyun 	struct per_bio_data *pb = get_per_bio_data(bio);
1193*4882a593Smuzhiyun 
1194*4882a593Smuzhiyun 	dm_unhook_bio(&pb->hook_info, bio);
1195*4882a593Smuzhiyun 
1196*4882a593Smuzhiyun 	if (bio->bi_status)
1197*4882a593Smuzhiyun 		mg->k.input = bio->bi_status;
1198*4882a593Smuzhiyun 
1199*4882a593Smuzhiyun 	queue_continuation(cache->wq, &mg->k);
1200*4882a593Smuzhiyun }
1201*4882a593Smuzhiyun 
overwrite(struct dm_cache_migration * mg,void (* continuation)(struct work_struct *))1202*4882a593Smuzhiyun static void overwrite(struct dm_cache_migration *mg,
1203*4882a593Smuzhiyun 		      void (*continuation)(struct work_struct *))
1204*4882a593Smuzhiyun {
1205*4882a593Smuzhiyun 	struct bio *bio = mg->overwrite_bio;
1206*4882a593Smuzhiyun 	struct per_bio_data *pb = get_per_bio_data(bio);
1207*4882a593Smuzhiyun 
1208*4882a593Smuzhiyun 	dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1209*4882a593Smuzhiyun 
1210*4882a593Smuzhiyun 	/*
1211*4882a593Smuzhiyun 	 * The overwrite bio is part of the copy operation, as such it does
1212*4882a593Smuzhiyun 	 * not set/clear discard or dirty flags.
1213*4882a593Smuzhiyun 	 */
1214*4882a593Smuzhiyun 	if (mg->op->op == POLICY_PROMOTE)
1215*4882a593Smuzhiyun 		remap_to_cache(mg->cache, bio, mg->op->cblock);
1216*4882a593Smuzhiyun 	else
1217*4882a593Smuzhiyun 		remap_to_origin(mg->cache, bio);
1218*4882a593Smuzhiyun 
1219*4882a593Smuzhiyun 	init_continuation(&mg->k, continuation);
1220*4882a593Smuzhiyun 	accounted_request(mg->cache, bio);
1221*4882a593Smuzhiyun }
1222*4882a593Smuzhiyun 
1223*4882a593Smuzhiyun /*
1224*4882a593Smuzhiyun  * Migration steps:
1225*4882a593Smuzhiyun  *
1226*4882a593Smuzhiyun  * 1) exclusive lock preventing WRITEs
1227*4882a593Smuzhiyun  * 2) quiesce
1228*4882a593Smuzhiyun  * 3) copy or issue overwrite bio
1229*4882a593Smuzhiyun  * 4) upgrade to exclusive lock preventing READs and WRITEs
1230*4882a593Smuzhiyun  * 5) quiesce
1231*4882a593Smuzhiyun  * 6) update metadata and commit
1232*4882a593Smuzhiyun  * 7) unlock
1233*4882a593Smuzhiyun  */
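
/*
 * In the code below these steps are chained together as continuations,
 * roughly:
 *
 *   mg_start() -> mg_lock_writes() -> mg_copy()/mg_full_copy()
 *	-> mg_upgrade_lock() -> mg_update_metadata() -> mg_complete()
 *
 * (demotions insert a commit, via mg_success(), before completing).
 * Overwrite bios skip the lock upgrade and continue through
 * mg_update_metadata_after_copy() instead.
 */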
mg_complete(struct dm_cache_migration * mg,bool success)1234*4882a593Smuzhiyun static void mg_complete(struct dm_cache_migration *mg, bool success)
1235*4882a593Smuzhiyun {
1236*4882a593Smuzhiyun 	struct bio_list bios;
1237*4882a593Smuzhiyun 	struct cache *cache = mg->cache;
1238*4882a593Smuzhiyun 	struct policy_work *op = mg->op;
1239*4882a593Smuzhiyun 	dm_cblock_t cblock = op->cblock;
1240*4882a593Smuzhiyun 
1241*4882a593Smuzhiyun 	if (success)
1242*4882a593Smuzhiyun 		update_stats(&cache->stats, op->op);
1243*4882a593Smuzhiyun 
1244*4882a593Smuzhiyun 	switch (op->op) {
1245*4882a593Smuzhiyun 	case POLICY_PROMOTE:
1246*4882a593Smuzhiyun 		clear_discard(cache, oblock_to_dblock(cache, op->oblock));
1247*4882a593Smuzhiyun 		policy_complete_background_work(cache->policy, op, success);
1248*4882a593Smuzhiyun 
1249*4882a593Smuzhiyun 		if (mg->overwrite_bio) {
1250*4882a593Smuzhiyun 			if (success)
1251*4882a593Smuzhiyun 				force_set_dirty(cache, cblock);
1252*4882a593Smuzhiyun 			else if (mg->k.input)
1253*4882a593Smuzhiyun 				mg->overwrite_bio->bi_status = mg->k.input;
1254*4882a593Smuzhiyun 			else
1255*4882a593Smuzhiyun 				mg->overwrite_bio->bi_status = BLK_STS_IOERR;
1256*4882a593Smuzhiyun 			bio_endio(mg->overwrite_bio);
1257*4882a593Smuzhiyun 		} else {
1258*4882a593Smuzhiyun 			if (success)
1259*4882a593Smuzhiyun 				force_clear_dirty(cache, cblock);
1260*4882a593Smuzhiyun 			dec_io_migrations(cache);
1261*4882a593Smuzhiyun 		}
1262*4882a593Smuzhiyun 		break;
1263*4882a593Smuzhiyun 
1264*4882a593Smuzhiyun 	case POLICY_DEMOTE:
1265*4882a593Smuzhiyun 		/*
1266*4882a593Smuzhiyun 		 * We clear dirty here to update the nr_dirty counter.
1267*4882a593Smuzhiyun 		 */
1268*4882a593Smuzhiyun 		if (success)
1269*4882a593Smuzhiyun 			force_clear_dirty(cache, cblock);
1270*4882a593Smuzhiyun 		policy_complete_background_work(cache->policy, op, success);
1271*4882a593Smuzhiyun 		dec_io_migrations(cache);
1272*4882a593Smuzhiyun 		break;
1273*4882a593Smuzhiyun 
1274*4882a593Smuzhiyun 	case POLICY_WRITEBACK:
1275*4882a593Smuzhiyun 		if (success)
1276*4882a593Smuzhiyun 			force_clear_dirty(cache, cblock);
1277*4882a593Smuzhiyun 		policy_complete_background_work(cache->policy, op, success);
1278*4882a593Smuzhiyun 		dec_io_migrations(cache);
1279*4882a593Smuzhiyun 		break;
1280*4882a593Smuzhiyun 	}
1281*4882a593Smuzhiyun 
1282*4882a593Smuzhiyun 	bio_list_init(&bios);
1283*4882a593Smuzhiyun 	if (mg->cell) {
1284*4882a593Smuzhiyun 		if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1285*4882a593Smuzhiyun 			free_prison_cell(cache, mg->cell);
1286*4882a593Smuzhiyun 	}
1287*4882a593Smuzhiyun 
1288*4882a593Smuzhiyun 	free_migration(mg);
1289*4882a593Smuzhiyun 	defer_bios(cache, &bios);
1290*4882a593Smuzhiyun 	wake_migration_worker(cache);
1291*4882a593Smuzhiyun 
1292*4882a593Smuzhiyun 	background_work_end(cache);
1293*4882a593Smuzhiyun }
1294*4882a593Smuzhiyun 
mg_success(struct work_struct * ws)1295*4882a593Smuzhiyun static void mg_success(struct work_struct *ws)
1296*4882a593Smuzhiyun {
1297*4882a593Smuzhiyun 	struct dm_cache_migration *mg = ws_to_mg(ws);
1298*4882a593Smuzhiyun 	mg_complete(mg, mg->k.input == 0);
1299*4882a593Smuzhiyun }
1300*4882a593Smuzhiyun 
mg_update_metadata(struct work_struct * ws)1301*4882a593Smuzhiyun static void mg_update_metadata(struct work_struct *ws)
1302*4882a593Smuzhiyun {
1303*4882a593Smuzhiyun 	int r;
1304*4882a593Smuzhiyun 	struct dm_cache_migration *mg = ws_to_mg(ws);
1305*4882a593Smuzhiyun 	struct cache *cache = mg->cache;
1306*4882a593Smuzhiyun 	struct policy_work *op = mg->op;
1307*4882a593Smuzhiyun 
1308*4882a593Smuzhiyun 	switch (op->op) {
1309*4882a593Smuzhiyun 	case POLICY_PROMOTE:
1310*4882a593Smuzhiyun 		r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
1311*4882a593Smuzhiyun 		if (r) {
1312*4882a593Smuzhiyun 			DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
1313*4882a593Smuzhiyun 				    cache_device_name(cache));
1314*4882a593Smuzhiyun 			metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1315*4882a593Smuzhiyun 
1316*4882a593Smuzhiyun 			mg_complete(mg, false);
1317*4882a593Smuzhiyun 			return;
1318*4882a593Smuzhiyun 		}
1319*4882a593Smuzhiyun 		mg_complete(mg, true);
1320*4882a593Smuzhiyun 		break;
1321*4882a593Smuzhiyun 
1322*4882a593Smuzhiyun 	case POLICY_DEMOTE:
1323*4882a593Smuzhiyun 		r = dm_cache_remove_mapping(cache->cmd, op->cblock);
1324*4882a593Smuzhiyun 		if (r) {
1325*4882a593Smuzhiyun 			DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
1326*4882a593Smuzhiyun 				    cache_device_name(cache));
1327*4882a593Smuzhiyun 			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1328*4882a593Smuzhiyun 
1329*4882a593Smuzhiyun 			mg_complete(mg, false);
1330*4882a593Smuzhiyun 			return;
1331*4882a593Smuzhiyun 		}
1332*4882a593Smuzhiyun 
1333*4882a593Smuzhiyun 		/*
1334*4882a593Smuzhiyun 		 * It would be nice if we only had to commit when a REQ_FLUSH
1335*4882a593Smuzhiyun 		 * comes through.  But there's one scenario that we have to
1336*4882a593Smuzhiyun 		 * look out for:
1337*4882a593Smuzhiyun 		 *
1338*4882a593Smuzhiyun 		 * - oblock x is held in a cache block
1339*4882a593Smuzhiyun 		 * - demotion occurs
1340*4882a593Smuzhiyun 		 * - the cache block gets reallocated and overwritten
1341*4882a593Smuzhiyun 		 * - crash
1342*4882a593Smuzhiyun 		 *
1343*4882a593Smuzhiyun 		 * When we recover, because there was no commit the cache will
1344*4882a593Smuzhiyun 		 * roll back to having the data for oblock x in the cache block.
1345*4882a593Smuzhiyun 		 * But the cache block has since been overwritten, so it'll end
1346*4882a593Smuzhiyun 		 * up pointing to data that was never in 'x' during the history
1347*4882a593Smuzhiyun 		 * of the device.
1348*4882a593Smuzhiyun 		 *
1349*4882a593Smuzhiyun 		 * To avoid this issue we require a commit as part of the
1350*4882a593Smuzhiyun 		 * demotion operation.
1351*4882a593Smuzhiyun 		 */
1352*4882a593Smuzhiyun 		init_continuation(&mg->k, mg_success);
1353*4882a593Smuzhiyun 		continue_after_commit(&cache->committer, &mg->k);
1354*4882a593Smuzhiyun 		schedule_commit(&cache->committer);
1355*4882a593Smuzhiyun 		break;
1356*4882a593Smuzhiyun 
1357*4882a593Smuzhiyun 	case POLICY_WRITEBACK:
1358*4882a593Smuzhiyun 		mg_complete(mg, true);
1359*4882a593Smuzhiyun 		break;
1360*4882a593Smuzhiyun 	}
1361*4882a593Smuzhiyun }
1362*4882a593Smuzhiyun 
mg_update_metadata_after_copy(struct work_struct * ws)1363*4882a593Smuzhiyun static void mg_update_metadata_after_copy(struct work_struct *ws)
1364*4882a593Smuzhiyun {
1365*4882a593Smuzhiyun 	struct dm_cache_migration *mg = ws_to_mg(ws);
1366*4882a593Smuzhiyun 
1367*4882a593Smuzhiyun 	/*
1368*4882a593Smuzhiyun 	 * Did the copy succeed?
1369*4882a593Smuzhiyun 	 */
1370*4882a593Smuzhiyun 	if (mg->k.input)
1371*4882a593Smuzhiyun 		mg_complete(mg, false);
1372*4882a593Smuzhiyun 	else
1373*4882a593Smuzhiyun 		mg_update_metadata(ws);
1374*4882a593Smuzhiyun }
1375*4882a593Smuzhiyun 
mg_upgrade_lock(struct work_struct * ws)1376*4882a593Smuzhiyun static void mg_upgrade_lock(struct work_struct *ws)
1377*4882a593Smuzhiyun {
1378*4882a593Smuzhiyun 	int r;
1379*4882a593Smuzhiyun 	struct dm_cache_migration *mg = ws_to_mg(ws);
1380*4882a593Smuzhiyun 
1381*4882a593Smuzhiyun 	/*
1382*4882a593Smuzhiyun 	 * Did the copy succeed?
1383*4882a593Smuzhiyun 	 */
1384*4882a593Smuzhiyun 	if (mg->k.input)
1385*4882a593Smuzhiyun 		mg_complete(mg, false);
1386*4882a593Smuzhiyun 
1387*4882a593Smuzhiyun 	else {
1388*4882a593Smuzhiyun 		/*
1389*4882a593Smuzhiyun 		 * Now we want the lock to prevent both reads and writes.
1390*4882a593Smuzhiyun 		 */
1391*4882a593Smuzhiyun 		r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
1392*4882a593Smuzhiyun 					    READ_WRITE_LOCK_LEVEL);
1393*4882a593Smuzhiyun 		if (r < 0)
1394*4882a593Smuzhiyun 			mg_complete(mg, false);
1395*4882a593Smuzhiyun 
1396*4882a593Smuzhiyun 		else if (r)
1397*4882a593Smuzhiyun 			quiesce(mg, mg_update_metadata);
1398*4882a593Smuzhiyun 
1399*4882a593Smuzhiyun 		else
1400*4882a593Smuzhiyun 			mg_update_metadata(ws);
1401*4882a593Smuzhiyun 	}
1402*4882a593Smuzhiyun }
1403*4882a593Smuzhiyun 
mg_full_copy(struct work_struct * ws)1404*4882a593Smuzhiyun static void mg_full_copy(struct work_struct *ws)
1405*4882a593Smuzhiyun {
1406*4882a593Smuzhiyun 	struct dm_cache_migration *mg = ws_to_mg(ws);
1407*4882a593Smuzhiyun 	struct cache *cache = mg->cache;
1408*4882a593Smuzhiyun 	struct policy_work *op = mg->op;
1409*4882a593Smuzhiyun 	bool is_policy_promote = (op->op == POLICY_PROMOTE);
1410*4882a593Smuzhiyun 
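	/*
	 * No data needs to be moved when demoting a clean block, or when the
	 * origin block is marked discarded; in those cases skip the kcopyd
	 * copy and go straight to upgrading the lock and updating the
	 * metadata.
	 */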
1411*4882a593Smuzhiyun 	if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
1412*4882a593Smuzhiyun 	    is_discarded_oblock(cache, op->oblock)) {
1413*4882a593Smuzhiyun 		mg_upgrade_lock(ws);
1414*4882a593Smuzhiyun 		return;
1415*4882a593Smuzhiyun 	}
1416*4882a593Smuzhiyun 
1417*4882a593Smuzhiyun 	init_continuation(&mg->k, mg_upgrade_lock);
1418*4882a593Smuzhiyun 	copy(mg, is_policy_promote);
1419*4882a593Smuzhiyun }
1420*4882a593Smuzhiyun 
mg_copy(struct work_struct * ws)1421*4882a593Smuzhiyun static void mg_copy(struct work_struct *ws)
1422*4882a593Smuzhiyun {
1423*4882a593Smuzhiyun 	struct dm_cache_migration *mg = ws_to_mg(ws);
1424*4882a593Smuzhiyun 
1425*4882a593Smuzhiyun 	if (mg->overwrite_bio) {
1426*4882a593Smuzhiyun 		/*
1427*4882a593Smuzhiyun 		 * No exclusive lock was held when we last checked if the bio
1428*4882a593Smuzhiyun 		 * was optimisable.  So we have to check again in case things
1429*4882a593Smuzhiyun 		 * have changed (e.g. the block may no longer be discarded).
1430*4882a593Smuzhiyun 		 */
1431*4882a593Smuzhiyun 		if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
1432*4882a593Smuzhiyun 			/*
1433*4882a593Smuzhiyun 			 * Fall back to a real full copy after doing some tidying up.
1434*4882a593Smuzhiyun 			 */
1435*4882a593Smuzhiyun 			bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
1436*4882a593Smuzhiyun 			BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
1437*4882a593Smuzhiyun 			mg->overwrite_bio = NULL;
1438*4882a593Smuzhiyun 			inc_io_migrations(mg->cache);
1439*4882a593Smuzhiyun 			mg_full_copy(ws);
1440*4882a593Smuzhiyun 			return;
1441*4882a593Smuzhiyun 		}
1442*4882a593Smuzhiyun 
1443*4882a593Smuzhiyun 		/*
1444*4882a593Smuzhiyun 		 * It's safe to do this here, even though it's new data,
1445*4882a593Smuzhiyun 		 * because all IO has been locked out of the block.
1446*4882a593Smuzhiyun 		 *
1447*4882a593Smuzhiyun 		 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL,
1448*4882a593Smuzhiyun 		 * so we do _not_ use mg_upgrade_lock() as the continuation.
1449*4882a593Smuzhiyun 		 */
1450*4882a593Smuzhiyun 		overwrite(mg, mg_update_metadata_after_copy);
1451*4882a593Smuzhiyun 
1452*4882a593Smuzhiyun 	} else
1453*4882a593Smuzhiyun 		mg_full_copy(ws);
1454*4882a593Smuzhiyun }
1455*4882a593Smuzhiyun 
mg_lock_writes(struct dm_cache_migration * mg)1456*4882a593Smuzhiyun static int mg_lock_writes(struct dm_cache_migration *mg)
1457*4882a593Smuzhiyun {
1458*4882a593Smuzhiyun 	int r;
1459*4882a593Smuzhiyun 	struct dm_cell_key_v2 key;
1460*4882a593Smuzhiyun 	struct cache *cache = mg->cache;
1461*4882a593Smuzhiyun 	struct dm_bio_prison_cell_v2 *prealloc;
1462*4882a593Smuzhiyun 
1463*4882a593Smuzhiyun 	prealloc = alloc_prison_cell(cache);
1464*4882a593Smuzhiyun 
1465*4882a593Smuzhiyun 	/*
1466*4882a593Smuzhiyun 	 * Prevent writes to the block, but allow reads to continue.
1467*4882a593Smuzhiyun 	 * Unless we're using an overwrite bio, in which case we lock
1468*4882a593Smuzhiyun 	 * everything.
1469*4882a593Smuzhiyun 	 */
1470*4882a593Smuzhiyun 	build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
1471*4882a593Smuzhiyun 	r = dm_cell_lock_v2(cache->prison, &key,
1472*4882a593Smuzhiyun 			    mg->overwrite_bio ?  READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1473*4882a593Smuzhiyun 			    prealloc, &mg->cell);
1474*4882a593Smuzhiyun 	if (r < 0) {
1475*4882a593Smuzhiyun 		free_prison_cell(cache, prealloc);
1476*4882a593Smuzhiyun 		mg_complete(mg, false);
1477*4882a593Smuzhiyun 		return r;
1478*4882a593Smuzhiyun 	}
1479*4882a593Smuzhiyun 
1480*4882a593Smuzhiyun 	if (mg->cell != prealloc)
1481*4882a593Smuzhiyun 		free_prison_cell(cache, prealloc);
1482*4882a593Smuzhiyun 
1483*4882a593Smuzhiyun 	if (r == 0)
1484*4882a593Smuzhiyun 		mg_copy(&mg->k.ws);
1485*4882a593Smuzhiyun 	else
1486*4882a593Smuzhiyun 		quiesce(mg, mg_copy);
1487*4882a593Smuzhiyun 
1488*4882a593Smuzhiyun 	return 0;
1489*4882a593Smuzhiyun }
1490*4882a593Smuzhiyun 
mg_start(struct cache * cache,struct policy_work * op,struct bio * bio)1491*4882a593Smuzhiyun static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
1492*4882a593Smuzhiyun {
1493*4882a593Smuzhiyun 	struct dm_cache_migration *mg;
1494*4882a593Smuzhiyun 
1495*4882a593Smuzhiyun 	if (!background_work_begin(cache)) {
1496*4882a593Smuzhiyun 		policy_complete_background_work(cache->policy, op, false);
1497*4882a593Smuzhiyun 		return -EPERM;
1498*4882a593Smuzhiyun 	}
1499*4882a593Smuzhiyun 
1500*4882a593Smuzhiyun 	mg = alloc_migration(cache);
1501*4882a593Smuzhiyun 
1502*4882a593Smuzhiyun 	mg->op = op;
1503*4882a593Smuzhiyun 	mg->overwrite_bio = bio;
1504*4882a593Smuzhiyun 
1505*4882a593Smuzhiyun 	if (!bio)
1506*4882a593Smuzhiyun 		inc_io_migrations(cache);
1507*4882a593Smuzhiyun 
1508*4882a593Smuzhiyun 	return mg_lock_writes(mg);
1509*4882a593Smuzhiyun }
1510*4882a593Smuzhiyun 
1511*4882a593Smuzhiyun /*----------------------------------------------------------------
1512*4882a593Smuzhiyun  * invalidation processing
1513*4882a593Smuzhiyun  *--------------------------------------------------------------*/
1514*4882a593Smuzhiyun 
invalidate_complete(struct dm_cache_migration * mg,bool success)1515*4882a593Smuzhiyun static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1516*4882a593Smuzhiyun {
1517*4882a593Smuzhiyun 	struct bio_list bios;
1518*4882a593Smuzhiyun 	struct cache *cache = mg->cache;
1519*4882a593Smuzhiyun 
1520*4882a593Smuzhiyun 	bio_list_init(&bios);
1521*4882a593Smuzhiyun 	if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1522*4882a593Smuzhiyun 		free_prison_cell(cache, mg->cell);
1523*4882a593Smuzhiyun 
1524*4882a593Smuzhiyun 	if (!success && mg->overwrite_bio)
1525*4882a593Smuzhiyun 		bio_io_error(mg->overwrite_bio);
1526*4882a593Smuzhiyun 
1527*4882a593Smuzhiyun 	free_migration(mg);
1528*4882a593Smuzhiyun 	defer_bios(cache, &bios);
1529*4882a593Smuzhiyun 
1530*4882a593Smuzhiyun 	background_work_end(cache);
1531*4882a593Smuzhiyun }
1532*4882a593Smuzhiyun 
invalidate_completed(struct work_struct * ws)1533*4882a593Smuzhiyun static void invalidate_completed(struct work_struct *ws)
1534*4882a593Smuzhiyun {
1535*4882a593Smuzhiyun 	struct dm_cache_migration *mg = ws_to_mg(ws);
1536*4882a593Smuzhiyun 	invalidate_complete(mg, !mg->k.input);
1537*4882a593Smuzhiyun }
1538*4882a593Smuzhiyun 
invalidate_cblock(struct cache * cache,dm_cblock_t cblock)1539*4882a593Smuzhiyun static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1540*4882a593Smuzhiyun {
1541*4882a593Smuzhiyun 	int r = policy_invalidate_mapping(cache->policy, cblock);
1542*4882a593Smuzhiyun 	if (!r) {
1543*4882a593Smuzhiyun 		r = dm_cache_remove_mapping(cache->cmd, cblock);
1544*4882a593Smuzhiyun 		if (r) {
1545*4882a593Smuzhiyun 			DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1546*4882a593Smuzhiyun 				    cache_device_name(cache));
1547*4882a593Smuzhiyun 			metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1548*4882a593Smuzhiyun 		}
1549*4882a593Smuzhiyun 
1550*4882a593Smuzhiyun 	} else if (r == -ENODATA) {
1551*4882a593Smuzhiyun 		/*
1552*4882a593Smuzhiyun 		 * Harmless, already unmapped.
1553*4882a593Smuzhiyun 		 */
1554*4882a593Smuzhiyun 		r = 0;
1555*4882a593Smuzhiyun 
1556*4882a593Smuzhiyun 	} else
1557*4882a593Smuzhiyun 		DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1558*4882a593Smuzhiyun 
1559*4882a593Smuzhiyun 	return r;
1560*4882a593Smuzhiyun }
1561*4882a593Smuzhiyun 
invalidate_remove(struct work_struct * ws)1562*4882a593Smuzhiyun static void invalidate_remove(struct work_struct *ws)
1563*4882a593Smuzhiyun {
1564*4882a593Smuzhiyun 	int r;
1565*4882a593Smuzhiyun 	struct dm_cache_migration *mg = ws_to_mg(ws);
1566*4882a593Smuzhiyun 	struct cache *cache = mg->cache;
1567*4882a593Smuzhiyun 
1568*4882a593Smuzhiyun 	r = invalidate_cblock(cache, mg->invalidate_cblock);
1569*4882a593Smuzhiyun 	if (r) {
1570*4882a593Smuzhiyun 		invalidate_complete(mg, false);
1571*4882a593Smuzhiyun 		return;
1572*4882a593Smuzhiyun 	}
1573*4882a593Smuzhiyun 
1574*4882a593Smuzhiyun 	init_continuation(&mg->k, invalidate_completed);
1575*4882a593Smuzhiyun 	continue_after_commit(&cache->committer, &mg->k);
1576*4882a593Smuzhiyun 	remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
1577*4882a593Smuzhiyun 	mg->overwrite_bio = NULL;
1578*4882a593Smuzhiyun 	schedule_commit(&cache->committer);
1579*4882a593Smuzhiyun }
1580*4882a593Smuzhiyun 
invalidate_lock(struct dm_cache_migration * mg)1581*4882a593Smuzhiyun static int invalidate_lock(struct dm_cache_migration *mg)
1582*4882a593Smuzhiyun {
1583*4882a593Smuzhiyun 	int r;
1584*4882a593Smuzhiyun 	struct dm_cell_key_v2 key;
1585*4882a593Smuzhiyun 	struct cache *cache = mg->cache;
1586*4882a593Smuzhiyun 	struct dm_bio_prison_cell_v2 *prealloc;
1587*4882a593Smuzhiyun 
1588*4882a593Smuzhiyun 	prealloc = alloc_prison_cell(cache);
1589*4882a593Smuzhiyun 
1590*4882a593Smuzhiyun 	build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
1591*4882a593Smuzhiyun 	r = dm_cell_lock_v2(cache->prison, &key,
1592*4882a593Smuzhiyun 			    READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
1593*4882a593Smuzhiyun 	if (r < 0) {
1594*4882a593Smuzhiyun 		free_prison_cell(cache, prealloc);
1595*4882a593Smuzhiyun 		invalidate_complete(mg, false);
1596*4882a593Smuzhiyun 		return r;
1597*4882a593Smuzhiyun 	}
1598*4882a593Smuzhiyun 
1599*4882a593Smuzhiyun 	if (mg->cell != prealloc)
1600*4882a593Smuzhiyun 		free_prison_cell(cache, prealloc);
1601*4882a593Smuzhiyun 
1602*4882a593Smuzhiyun 	if (r)
1603*4882a593Smuzhiyun 		quiesce(mg, invalidate_remove);
1604*4882a593Smuzhiyun 
1605*4882a593Smuzhiyun 	else {
1606*4882a593Smuzhiyun 		/*
1607*4882a593Smuzhiyun 		 * We can't call invalidate_remove() directly here because we
1608*4882a593Smuzhiyun 		 * might still be in request context.
1609*4882a593Smuzhiyun 		 */
1610*4882a593Smuzhiyun 		init_continuation(&mg->k, invalidate_remove);
1611*4882a593Smuzhiyun 		queue_work(cache->wq, &mg->k.ws);
1612*4882a593Smuzhiyun 	}
1613*4882a593Smuzhiyun 
1614*4882a593Smuzhiyun 	return 0;
1615*4882a593Smuzhiyun }
1616*4882a593Smuzhiyun 
invalidate_start(struct cache * cache,dm_cblock_t cblock,dm_oblock_t oblock,struct bio * bio)1617*4882a593Smuzhiyun static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1618*4882a593Smuzhiyun 			    dm_oblock_t oblock, struct bio *bio)
1619*4882a593Smuzhiyun {
1620*4882a593Smuzhiyun 	struct dm_cache_migration *mg;
1621*4882a593Smuzhiyun 
1622*4882a593Smuzhiyun 	if (!background_work_begin(cache))
1623*4882a593Smuzhiyun 		return -EPERM;
1624*4882a593Smuzhiyun 
1625*4882a593Smuzhiyun 	mg = alloc_migration(cache);
1626*4882a593Smuzhiyun 
1627*4882a593Smuzhiyun 	mg->overwrite_bio = bio;
1628*4882a593Smuzhiyun 	mg->invalidate_cblock = cblock;
1629*4882a593Smuzhiyun 	mg->invalidate_oblock = oblock;
1630*4882a593Smuzhiyun 
1631*4882a593Smuzhiyun 	return invalidate_lock(mg);
1632*4882a593Smuzhiyun }
1633*4882a593Smuzhiyun 
1634*4882a593Smuzhiyun /*----------------------------------------------------------------
1635*4882a593Smuzhiyun  * bio processing
1636*4882a593Smuzhiyun  *--------------------------------------------------------------*/
1637*4882a593Smuzhiyun 
1638*4882a593Smuzhiyun enum busy {
1639*4882a593Smuzhiyun 	IDLE,
1640*4882a593Smuzhiyun 	BUSY
1641*4882a593Smuzhiyun };
1642*4882a593Smuzhiyun 
spare_migration_bandwidth(struct cache * cache)1643*4882a593Smuzhiyun static enum busy spare_migration_bandwidth(struct cache *cache)
1644*4882a593Smuzhiyun {
1645*4882a593Smuzhiyun 	bool idle = iot_idle_for(&cache->tracker, HZ);
1646*4882a593Smuzhiyun 	sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1647*4882a593Smuzhiyun 		cache->sectors_per_block;
1648*4882a593Smuzhiyun 
1649*4882a593Smuzhiyun 	if (idle && current_volume <= cache->migration_threshold)
1650*4882a593Smuzhiyun 		return IDLE;
1651*4882a593Smuzhiyun 	else
1652*4882a593Smuzhiyun 		return BUSY;
1653*4882a593Smuzhiyun }
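
/*
 * Worked example (the numbers are illustrative only): with
 * sectors_per_block = 128 and migration_threshold = 2048 sectors, the
 * cache reports IDLE only if it has seen no IO for at least a second
 * (HZ jiffies) and no more than 15 migrations are already in flight,
 * since (15 + 1) * 128 = 2048 <= 2048; one more concurrent migration
 * would tip the result to BUSY.
 */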
1654*4882a593Smuzhiyun 
inc_hit_counter(struct cache * cache,struct bio * bio)1655*4882a593Smuzhiyun static void inc_hit_counter(struct cache *cache, struct bio *bio)
1656*4882a593Smuzhiyun {
1657*4882a593Smuzhiyun 	atomic_inc(bio_data_dir(bio) == READ ?
1658*4882a593Smuzhiyun 		   &cache->stats.read_hit : &cache->stats.write_hit);
1659*4882a593Smuzhiyun }
1660*4882a593Smuzhiyun 
inc_miss_counter(struct cache * cache,struct bio * bio)1661*4882a593Smuzhiyun static void inc_miss_counter(struct cache *cache, struct bio *bio)
1662*4882a593Smuzhiyun {
1663*4882a593Smuzhiyun 	atomic_inc(bio_data_dir(bio) == READ ?
1664*4882a593Smuzhiyun 		   &cache->stats.read_miss : &cache->stats.write_miss);
1665*4882a593Smuzhiyun }
1666*4882a593Smuzhiyun 
1667*4882a593Smuzhiyun /*----------------------------------------------------------------*/
1668*4882a593Smuzhiyun 
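/*
 * map_bio() decides what to do with a single bio, roughly:
 *
 * - if the block is exclusively locked (a migration is in flight), hold
 *   the bio back and request a commit so the lock can be dropped;
 * - for an optimisable bio (see optimisable_bio() above), ask the policy
 *   for a mapping plus background work and, on a miss, start a promotion
 *   that uses this bio as the overwrite bio;
 * - otherwise do a plain policy lookup, waking the migration worker if
 *   background work was queued;
 * - misses are remapped to the origin; hits are remapped to the cache,
 *   except that passthrough always maps to the origin (invalidating on
 *   writes) and clean writethrough writes are duplicated to both devices;
 * - FUA bios are routed through the committer rather than remapped
 *   directly.
 */
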
map_bio(struct cache * cache,struct bio * bio,dm_oblock_t block,bool * commit_needed)1669*4882a593Smuzhiyun static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
1670*4882a593Smuzhiyun 		   bool *commit_needed)
1671*4882a593Smuzhiyun {
1672*4882a593Smuzhiyun 	int r, data_dir;
1673*4882a593Smuzhiyun 	bool rb, background_queued;
1674*4882a593Smuzhiyun 	dm_cblock_t cblock;
1675*4882a593Smuzhiyun 
1676*4882a593Smuzhiyun 	*commit_needed = false;
1677*4882a593Smuzhiyun 
1678*4882a593Smuzhiyun 	rb = bio_detain_shared(cache, block, bio);
1679*4882a593Smuzhiyun 	if (!rb) {
1680*4882a593Smuzhiyun 		/*
1681*4882a593Smuzhiyun 		 * An exclusive lock is held for this block, so we have to
1682*4882a593Smuzhiyun 		 * wait.  We set the commit_needed flag so the current
1683*4882a593Smuzhiyun 		 * transaction will be committed asap, allowing this lock
1684*4882a593Smuzhiyun 		 * to be dropped.
1685*4882a593Smuzhiyun 		 */
1686*4882a593Smuzhiyun 		*commit_needed = true;
1687*4882a593Smuzhiyun 		return DM_MAPIO_SUBMITTED;
1688*4882a593Smuzhiyun 	}
1689*4882a593Smuzhiyun 
1690*4882a593Smuzhiyun 	data_dir = bio_data_dir(bio);
1691*4882a593Smuzhiyun 
1692*4882a593Smuzhiyun 	if (optimisable_bio(cache, bio, block)) {
1693*4882a593Smuzhiyun 		struct policy_work *op = NULL;
1694*4882a593Smuzhiyun 
1695*4882a593Smuzhiyun 		r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
1696*4882a593Smuzhiyun 		if (unlikely(r && r != -ENOENT)) {
1697*4882a593Smuzhiyun 			DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
1698*4882a593Smuzhiyun 				    cache_device_name(cache), r);
1699*4882a593Smuzhiyun 			bio_io_error(bio);
1700*4882a593Smuzhiyun 			return DM_MAPIO_SUBMITTED;
1701*4882a593Smuzhiyun 		}
1702*4882a593Smuzhiyun 
1703*4882a593Smuzhiyun 		if (r == -ENOENT && op) {
1704*4882a593Smuzhiyun 			bio_drop_shared_lock(cache, bio);
1705*4882a593Smuzhiyun 			BUG_ON(op->op != POLICY_PROMOTE);
1706*4882a593Smuzhiyun 			mg_start(cache, op, bio);
1707*4882a593Smuzhiyun 			return DM_MAPIO_SUBMITTED;
1708*4882a593Smuzhiyun 		}
1709*4882a593Smuzhiyun 	} else {
1710*4882a593Smuzhiyun 		r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
1711*4882a593Smuzhiyun 		if (unlikely(r && r != -ENOENT)) {
1712*4882a593Smuzhiyun 			DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
1713*4882a593Smuzhiyun 				    cache_device_name(cache), r);
1714*4882a593Smuzhiyun 			bio_io_error(bio);
1715*4882a593Smuzhiyun 			return DM_MAPIO_SUBMITTED;
1716*4882a593Smuzhiyun 		}
1717*4882a593Smuzhiyun 
1718*4882a593Smuzhiyun 		if (background_queued)
1719*4882a593Smuzhiyun 			wake_migration_worker(cache);
1720*4882a593Smuzhiyun 	}
1721*4882a593Smuzhiyun 
1722*4882a593Smuzhiyun 	if (r == -ENOENT) {
1723*4882a593Smuzhiyun 		struct per_bio_data *pb = get_per_bio_data(bio);
1724*4882a593Smuzhiyun 
1725*4882a593Smuzhiyun 		/*
1726*4882a593Smuzhiyun 		 * Miss.
1727*4882a593Smuzhiyun 		 */
1728*4882a593Smuzhiyun 		inc_miss_counter(cache, bio);
1729*4882a593Smuzhiyun 		if (pb->req_nr == 0) {
1730*4882a593Smuzhiyun 			accounted_begin(cache, bio);
1731*4882a593Smuzhiyun 			remap_to_origin_clear_discard(cache, bio, block);
1732*4882a593Smuzhiyun 		} else {
1733*4882a593Smuzhiyun 			/*
1734*4882a593Smuzhiyun 			 * This is a duplicate writethrough io that is no
1735*4882a593Smuzhiyun 			 * longer needed because the block has been demoted.
1736*4882a593Smuzhiyun 			 */
1737*4882a593Smuzhiyun 			bio_endio(bio);
1738*4882a593Smuzhiyun 			return DM_MAPIO_SUBMITTED;
1739*4882a593Smuzhiyun 		}
1740*4882a593Smuzhiyun 	} else {
1741*4882a593Smuzhiyun 		/*
1742*4882a593Smuzhiyun 		 * Hit.
1743*4882a593Smuzhiyun 		 */
1744*4882a593Smuzhiyun 		inc_hit_counter(cache, bio);
1745*4882a593Smuzhiyun 
1746*4882a593Smuzhiyun 		/*
1747*4882a593Smuzhiyun 		 * Passthrough always maps to the origin, invalidating any
1748*4882a593Smuzhiyun 		 * cache blocks that are written to.
1749*4882a593Smuzhiyun 		 */
1750*4882a593Smuzhiyun 		if (passthrough_mode(cache)) {
1751*4882a593Smuzhiyun 			if (bio_data_dir(bio) == WRITE) {
1752*4882a593Smuzhiyun 				bio_drop_shared_lock(cache, bio);
1753*4882a593Smuzhiyun 				atomic_inc(&cache->stats.demotion);
1754*4882a593Smuzhiyun 				invalidate_start(cache, cblock, block, bio);
1755*4882a593Smuzhiyun 			} else
1756*4882a593Smuzhiyun 				remap_to_origin_clear_discard(cache, bio, block);
1757*4882a593Smuzhiyun 		} else {
1758*4882a593Smuzhiyun 			if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
1759*4882a593Smuzhiyun 			    !is_dirty(cache, cblock)) {
1760*4882a593Smuzhiyun 				remap_to_origin_and_cache(cache, bio, block, cblock);
1761*4882a593Smuzhiyun 				accounted_begin(cache, bio);
1762*4882a593Smuzhiyun 			} else
1763*4882a593Smuzhiyun 				remap_to_cache_dirty(cache, bio, block, cblock);
1764*4882a593Smuzhiyun 		}
1765*4882a593Smuzhiyun 	}
1766*4882a593Smuzhiyun 
1767*4882a593Smuzhiyun 	/*
1768*4882a593Smuzhiyun 	 * dm core turns FUA requests into a separate payload and FLUSH req.
1769*4882a593Smuzhiyun 	 */
1770*4882a593Smuzhiyun 	if (bio->bi_opf & REQ_FUA) {
1771*4882a593Smuzhiyun 		/*
1772*4882a593Smuzhiyun 		 * issue_after_commit will call accounted_begin a second time.  So
1773*4882a593Smuzhiyun 		 * we call accounted_complete() to avoid double accounting.
1774*4882a593Smuzhiyun 		 */
1775*4882a593Smuzhiyun 		accounted_complete(cache, bio);
1776*4882a593Smuzhiyun 		issue_after_commit(&cache->committer, bio);
1777*4882a593Smuzhiyun 		*commit_needed = true;
1778*4882a593Smuzhiyun 		return DM_MAPIO_SUBMITTED;
1779*4882a593Smuzhiyun 	}
1780*4882a593Smuzhiyun 
1781*4882a593Smuzhiyun 	return DM_MAPIO_REMAPPED;
1782*4882a593Smuzhiyun }
1783*4882a593Smuzhiyun 
process_bio(struct cache * cache,struct bio * bio)1784*4882a593Smuzhiyun static bool process_bio(struct cache *cache, struct bio *bio)
1785*4882a593Smuzhiyun {
1786*4882a593Smuzhiyun 	bool commit_needed;
1787*4882a593Smuzhiyun 
1788*4882a593Smuzhiyun 	if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
1789*4882a593Smuzhiyun 		submit_bio_noacct(bio);
1790*4882a593Smuzhiyun 
1791*4882a593Smuzhiyun 	return commit_needed;
1792*4882a593Smuzhiyun }
1793*4882a593Smuzhiyun 
1794*4882a593Smuzhiyun /*
1795*4882a593Smuzhiyun  * A non-zero return indicates read_only or fail_io mode.
1796*4882a593Smuzhiyun  */
commit(struct cache * cache,bool clean_shutdown)1797*4882a593Smuzhiyun static int commit(struct cache *cache, bool clean_shutdown)
1798*4882a593Smuzhiyun {
1799*4882a593Smuzhiyun 	int r;
1800*4882a593Smuzhiyun 
1801*4882a593Smuzhiyun 	if (get_cache_mode(cache) >= CM_READ_ONLY)
1802*4882a593Smuzhiyun 		return -EINVAL;
1803*4882a593Smuzhiyun 
1804*4882a593Smuzhiyun 	atomic_inc(&cache->stats.commit_count);
1805*4882a593Smuzhiyun 	r = dm_cache_commit(cache->cmd, clean_shutdown);
1806*4882a593Smuzhiyun 	if (r)
1807*4882a593Smuzhiyun 		metadata_operation_failed(cache, "dm_cache_commit", r);
1808*4882a593Smuzhiyun 
1809*4882a593Smuzhiyun 	return r;
1810*4882a593Smuzhiyun }
1811*4882a593Smuzhiyun 
1812*4882a593Smuzhiyun /*
1813*4882a593Smuzhiyun  * Used by the batcher.
1814*4882a593Smuzhiyun  */
commit_op(void * context)1815*4882a593Smuzhiyun static blk_status_t commit_op(void *context)
1816*4882a593Smuzhiyun {
1817*4882a593Smuzhiyun 	struct cache *cache = context;
1818*4882a593Smuzhiyun 
1819*4882a593Smuzhiyun 	if (dm_cache_changed_this_transaction(cache->cmd))
1820*4882a593Smuzhiyun 		return errno_to_blk_status(commit(cache, false));
1821*4882a593Smuzhiyun 
1822*4882a593Smuzhiyun 	return 0;
1823*4882a593Smuzhiyun }
1824*4882a593Smuzhiyun 
1825*4882a593Smuzhiyun /*----------------------------------------------------------------*/
1826*4882a593Smuzhiyun 
process_flush_bio(struct cache * cache,struct bio * bio)1827*4882a593Smuzhiyun static bool process_flush_bio(struct cache *cache, struct bio *bio)
1828*4882a593Smuzhiyun {
1829*4882a593Smuzhiyun 	struct per_bio_data *pb = get_per_bio_data(bio);
1830*4882a593Smuzhiyun 
1831*4882a593Smuzhiyun 	if (!pb->req_nr)
1832*4882a593Smuzhiyun 		remap_to_origin(cache, bio);
1833*4882a593Smuzhiyun 	else
1834*4882a593Smuzhiyun 		remap_to_cache(cache, bio, 0);
1835*4882a593Smuzhiyun 
1836*4882a593Smuzhiyun 	issue_after_commit(&cache->committer, bio);
1837*4882a593Smuzhiyun 	return true;
1838*4882a593Smuzhiyun }
1839*4882a593Smuzhiyun 
process_discard_bio(struct cache * cache,struct bio * bio)1840*4882a593Smuzhiyun static bool process_discard_bio(struct cache *cache, struct bio *bio)
1841*4882a593Smuzhiyun {
1842*4882a593Smuzhiyun 	dm_dblock_t b, e;
1843*4882a593Smuzhiyun 
1844*4882a593Smuzhiyun 	// FIXME: do we need to lock the region?  Or can we just assume the
1845*4882a593Smuzhiyun 	// user won't be so foolish as to issue discard concurrently with
1846*4882a593Smuzhiyun 	// other IO?
1847*4882a593Smuzhiyun 	calc_discard_block_range(cache, bio, &b, &e);
1848*4882a593Smuzhiyun 	while (b != e) {
1849*4882a593Smuzhiyun 		set_discard(cache, b);
1850*4882a593Smuzhiyun 		b = to_dblock(from_dblock(b) + 1);
1851*4882a593Smuzhiyun 	}
1852*4882a593Smuzhiyun 
1853*4882a593Smuzhiyun 	if (cache->features.discard_passdown) {
1854*4882a593Smuzhiyun 		remap_to_origin(cache, bio);
1855*4882a593Smuzhiyun 		submit_bio_noacct(bio);
1856*4882a593Smuzhiyun 	} else
1857*4882a593Smuzhiyun 		bio_endio(bio);
1858*4882a593Smuzhiyun 
1859*4882a593Smuzhiyun 	return false;
1860*4882a593Smuzhiyun }
1861*4882a593Smuzhiyun 
process_deferred_bios(struct work_struct * ws)1862*4882a593Smuzhiyun static void process_deferred_bios(struct work_struct *ws)
1863*4882a593Smuzhiyun {
1864*4882a593Smuzhiyun 	struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
1865*4882a593Smuzhiyun 
1866*4882a593Smuzhiyun 	bool commit_needed = false;
1867*4882a593Smuzhiyun 	struct bio_list bios;
1868*4882a593Smuzhiyun 	struct bio *bio;
1869*4882a593Smuzhiyun 
1870*4882a593Smuzhiyun 	bio_list_init(&bios);
1871*4882a593Smuzhiyun 
1872*4882a593Smuzhiyun 	spin_lock_irq(&cache->lock);
1873*4882a593Smuzhiyun 	bio_list_merge(&bios, &cache->deferred_bios);
1874*4882a593Smuzhiyun 	bio_list_init(&cache->deferred_bios);
1875*4882a593Smuzhiyun 	spin_unlock_irq(&cache->lock);
1876*4882a593Smuzhiyun 
1877*4882a593Smuzhiyun 	while ((bio = bio_list_pop(&bios))) {
1878*4882a593Smuzhiyun 		if (bio->bi_opf & REQ_PREFLUSH)
1879*4882a593Smuzhiyun 			commit_needed = process_flush_bio(cache, bio) || commit_needed;
1880*4882a593Smuzhiyun 
1881*4882a593Smuzhiyun 		else if (bio_op(bio) == REQ_OP_DISCARD)
1882*4882a593Smuzhiyun 			commit_needed = process_discard_bio(cache, bio) || commit_needed;
1883*4882a593Smuzhiyun 
1884*4882a593Smuzhiyun 		else
1885*4882a593Smuzhiyun 			commit_needed = process_bio(cache, bio) || commit_needed;
1886*4882a593Smuzhiyun 	}
1887*4882a593Smuzhiyun 
1888*4882a593Smuzhiyun 	if (commit_needed)
1889*4882a593Smuzhiyun 		schedule_commit(&cache->committer);
1890*4882a593Smuzhiyun }
1891*4882a593Smuzhiyun 
1892*4882a593Smuzhiyun /*----------------------------------------------------------------
1893*4882a593Smuzhiyun  * Main worker loop
1894*4882a593Smuzhiyun  *--------------------------------------------------------------*/
1895*4882a593Smuzhiyun 
requeue_deferred_bios(struct cache * cache)1896*4882a593Smuzhiyun static void requeue_deferred_bios(struct cache *cache)
1897*4882a593Smuzhiyun {
1898*4882a593Smuzhiyun 	struct bio *bio;
1899*4882a593Smuzhiyun 	struct bio_list bios;
1900*4882a593Smuzhiyun 
1901*4882a593Smuzhiyun 	bio_list_init(&bios);
1902*4882a593Smuzhiyun 	bio_list_merge(&bios, &cache->deferred_bios);
1903*4882a593Smuzhiyun 	bio_list_init(&cache->deferred_bios);
1904*4882a593Smuzhiyun 
1905*4882a593Smuzhiyun 	while ((bio = bio_list_pop(&bios))) {
1906*4882a593Smuzhiyun 		bio->bi_status = BLK_STS_DM_REQUEUE;
1907*4882a593Smuzhiyun 		bio_endio(bio);
1908*4882a593Smuzhiyun 	}
1909*4882a593Smuzhiyun }
1910*4882a593Smuzhiyun 
1911*4882a593Smuzhiyun /*
1912*4882a593Smuzhiyun  * We want to commit periodically so that not too much
1913*4882a593Smuzhiyun  * unwritten metadata builds up.
1914*4882a593Smuzhiyun  */
do_waker(struct work_struct * ws)1915*4882a593Smuzhiyun static void do_waker(struct work_struct *ws)
1916*4882a593Smuzhiyun {
1917*4882a593Smuzhiyun 	struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1918*4882a593Smuzhiyun 
1919*4882a593Smuzhiyun 	policy_tick(cache->policy, true);
1920*4882a593Smuzhiyun 	wake_migration_worker(cache);
1921*4882a593Smuzhiyun 	schedule_commit(&cache->committer);
1922*4882a593Smuzhiyun 	queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1923*4882a593Smuzhiyun }
1924*4882a593Smuzhiyun 
check_migrations(struct work_struct * ws)1925*4882a593Smuzhiyun static void check_migrations(struct work_struct *ws)
1926*4882a593Smuzhiyun {
1927*4882a593Smuzhiyun 	int r;
1928*4882a593Smuzhiyun 	struct policy_work *op;
1929*4882a593Smuzhiyun 	struct cache *cache = container_of(ws, struct cache, migration_worker);
1930*4882a593Smuzhiyun 	enum busy b;
1931*4882a593Smuzhiyun 
1932*4882a593Smuzhiyun 	for (;;) {
1933*4882a593Smuzhiyun 		b = spare_migration_bandwidth(cache);
1934*4882a593Smuzhiyun 
1935*4882a593Smuzhiyun 		r = policy_get_background_work(cache->policy, b == IDLE, &op);
1936*4882a593Smuzhiyun 		if (r == -ENODATA)
1937*4882a593Smuzhiyun 			break;
1938*4882a593Smuzhiyun 
1939*4882a593Smuzhiyun 		if (r) {
1940*4882a593Smuzhiyun 			DMERR_LIMIT("%s: policy_background_work failed",
1941*4882a593Smuzhiyun 				    cache_device_name(cache));
1942*4882a593Smuzhiyun 			break;
1943*4882a593Smuzhiyun 		}
1944*4882a593Smuzhiyun 
1945*4882a593Smuzhiyun 		r = mg_start(cache, op, NULL);
1946*4882a593Smuzhiyun 		if (r)
1947*4882a593Smuzhiyun 			break;
1948*4882a593Smuzhiyun 	}
1949*4882a593Smuzhiyun }
1950*4882a593Smuzhiyun 
1951*4882a593Smuzhiyun /*----------------------------------------------------------------
1952*4882a593Smuzhiyun  * Target methods
1953*4882a593Smuzhiyun  *--------------------------------------------------------------*/
1954*4882a593Smuzhiyun 
1955*4882a593Smuzhiyun /*
1956*4882a593Smuzhiyun  * This function gets called on the error paths of the constructor, so we
1957*4882a593Smuzhiyun  * have to cope with a partially initialised struct.
1958*4882a593Smuzhiyun  */
destroy(struct cache * cache)1959*4882a593Smuzhiyun static void destroy(struct cache *cache)
1960*4882a593Smuzhiyun {
1961*4882a593Smuzhiyun 	unsigned i;
1962*4882a593Smuzhiyun 
1963*4882a593Smuzhiyun 	mempool_exit(&cache->migration_pool);
1964*4882a593Smuzhiyun 
1965*4882a593Smuzhiyun 	if (cache->prison)
1966*4882a593Smuzhiyun 		dm_bio_prison_destroy_v2(cache->prison);
1967*4882a593Smuzhiyun 
1968*4882a593Smuzhiyun 	if (cache->wq)
1969*4882a593Smuzhiyun 		destroy_workqueue(cache->wq);
1970*4882a593Smuzhiyun 
1971*4882a593Smuzhiyun 	if (cache->dirty_bitset)
1972*4882a593Smuzhiyun 		free_bitset(cache->dirty_bitset);
1973*4882a593Smuzhiyun 
1974*4882a593Smuzhiyun 	if (cache->discard_bitset)
1975*4882a593Smuzhiyun 		free_bitset(cache->discard_bitset);
1976*4882a593Smuzhiyun 
1977*4882a593Smuzhiyun 	if (cache->copier)
1978*4882a593Smuzhiyun 		dm_kcopyd_client_destroy(cache->copier);
1979*4882a593Smuzhiyun 
1980*4882a593Smuzhiyun 	if (cache->cmd)
1981*4882a593Smuzhiyun 		dm_cache_metadata_close(cache->cmd);
1982*4882a593Smuzhiyun 
1983*4882a593Smuzhiyun 	if (cache->metadata_dev)
1984*4882a593Smuzhiyun 		dm_put_device(cache->ti, cache->metadata_dev);
1985*4882a593Smuzhiyun 
1986*4882a593Smuzhiyun 	if (cache->origin_dev)
1987*4882a593Smuzhiyun 		dm_put_device(cache->ti, cache->origin_dev);
1988*4882a593Smuzhiyun 
1989*4882a593Smuzhiyun 	if (cache->cache_dev)
1990*4882a593Smuzhiyun 		dm_put_device(cache->ti, cache->cache_dev);
1991*4882a593Smuzhiyun 
1992*4882a593Smuzhiyun 	if (cache->policy)
1993*4882a593Smuzhiyun 		dm_cache_policy_destroy(cache->policy);
1994*4882a593Smuzhiyun 
1995*4882a593Smuzhiyun 	for (i = 0; i < cache->nr_ctr_args ; i++)
1996*4882a593Smuzhiyun 		kfree(cache->ctr_args[i]);
1997*4882a593Smuzhiyun 	kfree(cache->ctr_args);
1998*4882a593Smuzhiyun 
1999*4882a593Smuzhiyun 	bioset_exit(&cache->bs);
2000*4882a593Smuzhiyun 
2001*4882a593Smuzhiyun 	kfree(cache);
2002*4882a593Smuzhiyun }
2003*4882a593Smuzhiyun 
cache_dtr(struct dm_target * ti)2004*4882a593Smuzhiyun static void cache_dtr(struct dm_target *ti)
2005*4882a593Smuzhiyun {
2006*4882a593Smuzhiyun 	struct cache *cache = ti->private;
2007*4882a593Smuzhiyun 
2008*4882a593Smuzhiyun 	destroy(cache);
2009*4882a593Smuzhiyun }
2010*4882a593Smuzhiyun 
get_dev_size(struct dm_dev * dev)2011*4882a593Smuzhiyun static sector_t get_dev_size(struct dm_dev *dev)
2012*4882a593Smuzhiyun {
2013*4882a593Smuzhiyun 	return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
2014*4882a593Smuzhiyun }
2015*4882a593Smuzhiyun 
2016*4882a593Smuzhiyun /*----------------------------------------------------------------*/
2017*4882a593Smuzhiyun 
2018*4882a593Smuzhiyun /*
2019*4882a593Smuzhiyun  * Construct a cache device mapping.
2020*4882a593Smuzhiyun  *
2021*4882a593Smuzhiyun  * cache <metadata dev> <cache dev> <origin dev> <block size>
2022*4882a593Smuzhiyun  *       <#feature args> [<feature arg>]*
2023*4882a593Smuzhiyun  *       <policy> <#policy args> [<policy arg>]*
2024*4882a593Smuzhiyun  *
2025*4882a593Smuzhiyun  * metadata dev    : fast device holding the persistent metadata
2026*4882a593Smuzhiyun  * cache dev	   : fast device holding cached data blocks
2027*4882a593Smuzhiyun  * origin dev	   : slow device holding original data blocks
2028*4882a593Smuzhiyun  * block size	   : cache unit size in sectors
2029*4882a593Smuzhiyun  *
2030*4882a593Smuzhiyun  * #feature args   : number of feature arguments passed
2031*4882a593Smuzhiyun  * feature args    : writethrough.  (The default is writeback.)
2032*4882a593Smuzhiyun  *
2033*4882a593Smuzhiyun  * policy	   : the replacement policy to use
2034*4882a593Smuzhiyun  * #policy args    : an even number of policy arguments corresponding
2035*4882a593Smuzhiyun  *		     to key/value pairs passed to the policy
2036*4882a593Smuzhiyun  * policy args	   : key/value pairs passed to the policy
2037*4882a593Smuzhiyun  *		     E.g. 'sequential_threshold 1024'
2038*4882a593Smuzhiyun  *		     See cache-policies.txt for details.
2039*4882a593Smuzhiyun  *
2040*4882a593Smuzhiyun  * Optional feature arguments are:
2041*4882a593Smuzhiyun  *   writethrough  : write through caching that prohibits cache block
2042*4882a593Smuzhiyun  *		     content from being different from origin block content.
2043*4882a593Smuzhiyun  *		     Without this argument, the default behaviour is to write
2044*4882a593Smuzhiyun  *		     back cache block contents later for performance reasons,
2045*4882a593Smuzhiyun  *		     so they may differ from the corresponding origin blocks.
2046*4882a593Smuzhiyun  */
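
/*
 * An illustrative table line (the device names and sizes below are made
 * up, not taken from a real system):
 *
 *   dmsetup create cached --table "0 41943040 cache /dev/fast-meta \
 *       /dev/fast-blocks /dev/slow 512 1 writethrough default 0"
 *
 * i.e. a 20 GiB mapping using 512 sector (256 KiB) cache blocks, the
 * writethrough feature and the default policy with no policy arguments.
 */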
2047*4882a593Smuzhiyun struct cache_args {
2048*4882a593Smuzhiyun 	struct dm_target *ti;
2049*4882a593Smuzhiyun 
2050*4882a593Smuzhiyun 	struct dm_dev *metadata_dev;
2051*4882a593Smuzhiyun 
2052*4882a593Smuzhiyun 	struct dm_dev *cache_dev;
2053*4882a593Smuzhiyun 	sector_t cache_sectors;
2054*4882a593Smuzhiyun 
2055*4882a593Smuzhiyun 	struct dm_dev *origin_dev;
2056*4882a593Smuzhiyun 	sector_t origin_sectors;
2057*4882a593Smuzhiyun 
2058*4882a593Smuzhiyun 	uint32_t block_size;
2059*4882a593Smuzhiyun 
2060*4882a593Smuzhiyun 	const char *policy_name;
2061*4882a593Smuzhiyun 	int policy_argc;
2062*4882a593Smuzhiyun 	const char **policy_argv;
2063*4882a593Smuzhiyun 
2064*4882a593Smuzhiyun 	struct cache_features features;
2065*4882a593Smuzhiyun };
2066*4882a593Smuzhiyun 
destroy_cache_args(struct cache_args * ca)2067*4882a593Smuzhiyun static void destroy_cache_args(struct cache_args *ca)
2068*4882a593Smuzhiyun {
2069*4882a593Smuzhiyun 	if (ca->metadata_dev)
2070*4882a593Smuzhiyun 		dm_put_device(ca->ti, ca->metadata_dev);
2071*4882a593Smuzhiyun 
2072*4882a593Smuzhiyun 	if (ca->cache_dev)
2073*4882a593Smuzhiyun 		dm_put_device(ca->ti, ca->cache_dev);
2074*4882a593Smuzhiyun 
2075*4882a593Smuzhiyun 	if (ca->origin_dev)
2076*4882a593Smuzhiyun 		dm_put_device(ca->ti, ca->origin_dev);
2077*4882a593Smuzhiyun 
2078*4882a593Smuzhiyun 	kfree(ca);
2079*4882a593Smuzhiyun }
2080*4882a593Smuzhiyun 
at_least_one_arg(struct dm_arg_set * as,char ** error)2081*4882a593Smuzhiyun static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2082*4882a593Smuzhiyun {
2083*4882a593Smuzhiyun 	if (!as->argc) {
2084*4882a593Smuzhiyun 		*error = "Insufficient args";
2085*4882a593Smuzhiyun 		return false;
2086*4882a593Smuzhiyun 	}
2087*4882a593Smuzhiyun 
2088*4882a593Smuzhiyun 	return true;
2089*4882a593Smuzhiyun }
2090*4882a593Smuzhiyun 
parse_metadata_dev(struct cache_args * ca,struct dm_arg_set * as,char ** error)2091*4882a593Smuzhiyun static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2092*4882a593Smuzhiyun 			      char **error)
2093*4882a593Smuzhiyun {
2094*4882a593Smuzhiyun 	int r;
2095*4882a593Smuzhiyun 	sector_t metadata_dev_size;
2096*4882a593Smuzhiyun 	char b[BDEVNAME_SIZE];
2097*4882a593Smuzhiyun 
2098*4882a593Smuzhiyun 	if (!at_least_one_arg(as, error))
2099*4882a593Smuzhiyun 		return -EINVAL;
2100*4882a593Smuzhiyun 
2101*4882a593Smuzhiyun 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2102*4882a593Smuzhiyun 			  &ca->metadata_dev);
2103*4882a593Smuzhiyun 	if (r) {
2104*4882a593Smuzhiyun 		*error = "Error opening metadata device";
2105*4882a593Smuzhiyun 		return r;
2106*4882a593Smuzhiyun 	}
2107*4882a593Smuzhiyun 
2108*4882a593Smuzhiyun 	metadata_dev_size = get_dev_size(ca->metadata_dev);
2109*4882a593Smuzhiyun 	if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2110*4882a593Smuzhiyun 		DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2111*4882a593Smuzhiyun 		       bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
2112*4882a593Smuzhiyun 
2113*4882a593Smuzhiyun 	return 0;
2114*4882a593Smuzhiyun }
2115*4882a593Smuzhiyun 
parse_cache_dev(struct cache_args * ca,struct dm_arg_set * as,char ** error)2116*4882a593Smuzhiyun static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2117*4882a593Smuzhiyun 			   char **error)
2118*4882a593Smuzhiyun {
2119*4882a593Smuzhiyun 	int r;
2120*4882a593Smuzhiyun 
2121*4882a593Smuzhiyun 	if (!at_least_one_arg(as, error))
2122*4882a593Smuzhiyun 		return -EINVAL;
2123*4882a593Smuzhiyun 
2124*4882a593Smuzhiyun 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2125*4882a593Smuzhiyun 			  &ca->cache_dev);
2126*4882a593Smuzhiyun 	if (r) {
2127*4882a593Smuzhiyun 		*error = "Error opening cache device";
2128*4882a593Smuzhiyun 		return r;
2129*4882a593Smuzhiyun 	}
2130*4882a593Smuzhiyun 	ca->cache_sectors = get_dev_size(ca->cache_dev);
2131*4882a593Smuzhiyun 
2132*4882a593Smuzhiyun 	return 0;
2133*4882a593Smuzhiyun }
2134*4882a593Smuzhiyun 
parse_origin_dev(struct cache_args * ca,struct dm_arg_set * as,char ** error)2135*4882a593Smuzhiyun static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2136*4882a593Smuzhiyun 			    char **error)
2137*4882a593Smuzhiyun {
2138*4882a593Smuzhiyun 	int r;
2139*4882a593Smuzhiyun 
2140*4882a593Smuzhiyun 	if (!at_least_one_arg(as, error))
2141*4882a593Smuzhiyun 		return -EINVAL;
2142*4882a593Smuzhiyun 
2143*4882a593Smuzhiyun 	r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2144*4882a593Smuzhiyun 			  &ca->origin_dev);
2145*4882a593Smuzhiyun 	if (r) {
2146*4882a593Smuzhiyun 		*error = "Error opening origin device";
2147*4882a593Smuzhiyun 		return r;
2148*4882a593Smuzhiyun 	}
2149*4882a593Smuzhiyun 
2150*4882a593Smuzhiyun 	ca->origin_sectors = get_dev_size(ca->origin_dev);
2151*4882a593Smuzhiyun 	if (ca->ti->len > ca->origin_sectors) {
2152*4882a593Smuzhiyun 		*error = "Device size larger than cached device";
2153*4882a593Smuzhiyun 		return -EINVAL;
2154*4882a593Smuzhiyun 	}
2155*4882a593Smuzhiyun 
2156*4882a593Smuzhiyun 	return 0;
2157*4882a593Smuzhiyun }
2158*4882a593Smuzhiyun 
parse_block_size(struct cache_args * ca,struct dm_arg_set * as,char ** error)2159*4882a593Smuzhiyun static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2160*4882a593Smuzhiyun 			    char **error)
2161*4882a593Smuzhiyun {
2162*4882a593Smuzhiyun 	unsigned long block_size;
2163*4882a593Smuzhiyun 
2164*4882a593Smuzhiyun 	if (!at_least_one_arg(as, error))
2165*4882a593Smuzhiyun 		return -EINVAL;
2166*4882a593Smuzhiyun 
2167*4882a593Smuzhiyun 	if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2168*4882a593Smuzhiyun 	    block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2169*4882a593Smuzhiyun 	    block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2170*4882a593Smuzhiyun 	    block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2171*4882a593Smuzhiyun 		*error = "Invalid data block size";
2172*4882a593Smuzhiyun 		return -EINVAL;
2173*4882a593Smuzhiyun 	}
2174*4882a593Smuzhiyun 
2175*4882a593Smuzhiyun 	if (block_size > ca->cache_sectors) {
2176*4882a593Smuzhiyun 		*error = "Data block size is larger than the cache device";
2177*4882a593Smuzhiyun 		return -EINVAL;
2178*4882a593Smuzhiyun 	}
2179*4882a593Smuzhiyun 
2180*4882a593Smuzhiyun 	ca->block_size = block_size;
2181*4882a593Smuzhiyun 
2182*4882a593Smuzhiyun 	return 0;
2183*4882a593Smuzhiyun }
2184*4882a593Smuzhiyun 
init_features(struct cache_features * cf)2185*4882a593Smuzhiyun static void init_features(struct cache_features *cf)
2186*4882a593Smuzhiyun {
2187*4882a593Smuzhiyun 	cf->mode = CM_WRITE;
2188*4882a593Smuzhiyun 	cf->io_mode = CM_IO_WRITEBACK;
2189*4882a593Smuzhiyun 	cf->metadata_version = 1;
2190*4882a593Smuzhiyun 	cf->discard_passdown = true;
2191*4882a593Smuzhiyun }
2192*4882a593Smuzhiyun 
parse_features(struct cache_args * ca,struct dm_arg_set * as,char ** error)2193*4882a593Smuzhiyun static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2194*4882a593Smuzhiyun 			  char **error)
2195*4882a593Smuzhiyun {
2196*4882a593Smuzhiyun 	static const struct dm_arg _args[] = {
2197*4882a593Smuzhiyun 		{0, 3, "Invalid number of cache feature arguments"},
2198*4882a593Smuzhiyun 	};
2199*4882a593Smuzhiyun 
2200*4882a593Smuzhiyun 	int r, mode_ctr = 0;
2201*4882a593Smuzhiyun 	unsigned argc;
2202*4882a593Smuzhiyun 	const char *arg;
2203*4882a593Smuzhiyun 	struct cache_features *cf = &ca->features;
2204*4882a593Smuzhiyun 
2205*4882a593Smuzhiyun 	init_features(cf);
2206*4882a593Smuzhiyun 
2207*4882a593Smuzhiyun 	r = dm_read_arg_group(_args, as, &argc, error);
2208*4882a593Smuzhiyun 	if (r)
2209*4882a593Smuzhiyun 		return -EINVAL;
2210*4882a593Smuzhiyun 
2211*4882a593Smuzhiyun 	while (argc--) {
2212*4882a593Smuzhiyun 		arg = dm_shift_arg(as);
2213*4882a593Smuzhiyun 
2214*4882a593Smuzhiyun 		if (!strcasecmp(arg, "writeback")) {
2215*4882a593Smuzhiyun 			cf->io_mode = CM_IO_WRITEBACK;
2216*4882a593Smuzhiyun 			mode_ctr++;
2217*4882a593Smuzhiyun 		}
2218*4882a593Smuzhiyun 
2219*4882a593Smuzhiyun 		else if (!strcasecmp(arg, "writethrough")) {
2220*4882a593Smuzhiyun 			cf->io_mode = CM_IO_WRITETHROUGH;
2221*4882a593Smuzhiyun 			mode_ctr++;
2222*4882a593Smuzhiyun 		}
2223*4882a593Smuzhiyun 
2224*4882a593Smuzhiyun 		else if (!strcasecmp(arg, "passthrough")) {
2225*4882a593Smuzhiyun 			cf->io_mode = CM_IO_PASSTHROUGH;
2226*4882a593Smuzhiyun 			mode_ctr++;
2227*4882a593Smuzhiyun 		}
2228*4882a593Smuzhiyun 
2229*4882a593Smuzhiyun 		else if (!strcasecmp(arg, "metadata2"))
2230*4882a593Smuzhiyun 			cf->metadata_version = 2;
2231*4882a593Smuzhiyun 
2232*4882a593Smuzhiyun 		else if (!strcasecmp(arg, "no_discard_passdown"))
2233*4882a593Smuzhiyun 			cf->discard_passdown = false;
2234*4882a593Smuzhiyun 
2235*4882a593Smuzhiyun 		else {
2236*4882a593Smuzhiyun 			*error = "Unrecognised cache feature requested";
2237*4882a593Smuzhiyun 			return -EINVAL;
2238*4882a593Smuzhiyun 		}
2239*4882a593Smuzhiyun 	}
2240*4882a593Smuzhiyun 
2241*4882a593Smuzhiyun 	if (mode_ctr > 1) {
2242*4882a593Smuzhiyun 		*error = "Duplicate cache io_mode features requested";
2243*4882a593Smuzhiyun 		return -EINVAL;
2244*4882a593Smuzhiyun 	}
2245*4882a593Smuzhiyun 
2246*4882a593Smuzhiyun 	return 0;
2247*4882a593Smuzhiyun }
2248*4882a593Smuzhiyun 
parse_policy(struct cache_args * ca,struct dm_arg_set * as,char ** error)2249*4882a593Smuzhiyun static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2250*4882a593Smuzhiyun 			char **error)
2251*4882a593Smuzhiyun {
2252*4882a593Smuzhiyun 	static const struct dm_arg _args[] = {
2253*4882a593Smuzhiyun 		{0, 1024, "Invalid number of policy arguments"},
2254*4882a593Smuzhiyun 	};
2255*4882a593Smuzhiyun 
2256*4882a593Smuzhiyun 	int r;
2257*4882a593Smuzhiyun 
2258*4882a593Smuzhiyun 	if (!at_least_one_arg(as, error))
2259*4882a593Smuzhiyun 		return -EINVAL;
2260*4882a593Smuzhiyun 
2261*4882a593Smuzhiyun 	ca->policy_name = dm_shift_arg(as);
2262*4882a593Smuzhiyun 
2263*4882a593Smuzhiyun 	r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2264*4882a593Smuzhiyun 	if (r)
2265*4882a593Smuzhiyun 		return -EINVAL;
2266*4882a593Smuzhiyun 
2267*4882a593Smuzhiyun 	ca->policy_argv = (const char **)as->argv;
2268*4882a593Smuzhiyun 	dm_consume_args(as, ca->policy_argc);
2269*4882a593Smuzhiyun 
2270*4882a593Smuzhiyun 	return 0;
2271*4882a593Smuzhiyun }
2272*4882a593Smuzhiyun 
parse_cache_args(struct cache_args * ca,int argc,char ** argv,char ** error)2273*4882a593Smuzhiyun static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2274*4882a593Smuzhiyun 			    char **error)
2275*4882a593Smuzhiyun {
2276*4882a593Smuzhiyun 	int r;
2277*4882a593Smuzhiyun 	struct dm_arg_set as;
2278*4882a593Smuzhiyun 
2279*4882a593Smuzhiyun 	as.argc = argc;
2280*4882a593Smuzhiyun 	as.argv = argv;
2281*4882a593Smuzhiyun 
2282*4882a593Smuzhiyun 	r = parse_metadata_dev(ca, &as, error);
2283*4882a593Smuzhiyun 	if (r)
2284*4882a593Smuzhiyun 		return r;
2285*4882a593Smuzhiyun 
2286*4882a593Smuzhiyun 	r = parse_cache_dev(ca, &as, error);
2287*4882a593Smuzhiyun 	if (r)
2288*4882a593Smuzhiyun 		return r;
2289*4882a593Smuzhiyun 
2290*4882a593Smuzhiyun 	r = parse_origin_dev(ca, &as, error);
2291*4882a593Smuzhiyun 	if (r)
2292*4882a593Smuzhiyun 		return r;
2293*4882a593Smuzhiyun 
2294*4882a593Smuzhiyun 	r = parse_block_size(ca, &as, error);
2295*4882a593Smuzhiyun 	if (r)
2296*4882a593Smuzhiyun 		return r;
2297*4882a593Smuzhiyun 
2298*4882a593Smuzhiyun 	r = parse_features(ca, &as, error);
2299*4882a593Smuzhiyun 	if (r)
2300*4882a593Smuzhiyun 		return r;
2301*4882a593Smuzhiyun 
2302*4882a593Smuzhiyun 	r = parse_policy(ca, &as, error);
2303*4882a593Smuzhiyun 	if (r)
2304*4882a593Smuzhiyun 		return r;
2305*4882a593Smuzhiyun 
2306*4882a593Smuzhiyun 	return 0;
2307*4882a593Smuzhiyun }
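/*
 * Illustrative target line (not from the original source; device paths and
 * numbers are invented).  The arguments are consumed in the order parsed
 * above: metadata dev, cache dev, origin dev, block size, feature args,
 * then the policy and its arguments.  A minimal sketch with one feature
 * argument and the smq policy taking no arguments:
 *
 *   cache /dev/mapper/cache-meta /dev/mapper/cache-ssd /dev/mapper/origin \
 *         512 1 writethrough smq 0
 */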
2308*4882a593Smuzhiyun 
2309*4882a593Smuzhiyun /*----------------------------------------------------------------*/
2310*4882a593Smuzhiyun 
2311*4882a593Smuzhiyun static struct kmem_cache *migration_cache;
2312*4882a593Smuzhiyun 
2313*4882a593Smuzhiyun #define NOT_CORE_OPTION 1
2314*4882a593Smuzhiyun 
2315*4882a593Smuzhiyun static int process_config_option(struct cache *cache, const char *key, const char *value)
2316*4882a593Smuzhiyun {
2317*4882a593Smuzhiyun 	unsigned long tmp;
2318*4882a593Smuzhiyun 
2319*4882a593Smuzhiyun 	if (!strcasecmp(key, "migration_threshold")) {
2320*4882a593Smuzhiyun 		if (kstrtoul(value, 10, &tmp))
2321*4882a593Smuzhiyun 			return -EINVAL;
2322*4882a593Smuzhiyun 
2323*4882a593Smuzhiyun 		cache->migration_threshold = tmp;
2324*4882a593Smuzhiyun 		return 0;
2325*4882a593Smuzhiyun 	}
2326*4882a593Smuzhiyun 
2327*4882a593Smuzhiyun 	return NOT_CORE_OPTION;
2328*4882a593Smuzhiyun }
2329*4882a593Smuzhiyun 
2330*4882a593Smuzhiyun static int set_config_value(struct cache *cache, const char *key, const char *value)
2331*4882a593Smuzhiyun {
2332*4882a593Smuzhiyun 	int r = process_config_option(cache, key, value);
2333*4882a593Smuzhiyun 
2334*4882a593Smuzhiyun 	if (r == NOT_CORE_OPTION)
2335*4882a593Smuzhiyun 		r = policy_set_config_value(cache->policy, key, value);
2336*4882a593Smuzhiyun 
2337*4882a593Smuzhiyun 	if (r)
2338*4882a593Smuzhiyun 		DMWARN("bad config value for %s: %s", key, value);
2339*4882a593Smuzhiyun 
2340*4882a593Smuzhiyun 	return r;
2341*4882a593Smuzhiyun }
2342*4882a593Smuzhiyun 
2343*4882a593Smuzhiyun static int set_config_values(struct cache *cache, int argc, const char **argv)
2344*4882a593Smuzhiyun {
2345*4882a593Smuzhiyun 	int r = 0;
2346*4882a593Smuzhiyun 
2347*4882a593Smuzhiyun 	if (argc & 1) {
2348*4882a593Smuzhiyun 		DMWARN("Odd number of policy arguments given; they should be <key> <value> pairs.");
2349*4882a593Smuzhiyun 		return -EINVAL;
2350*4882a593Smuzhiyun 	}
2351*4882a593Smuzhiyun 
2352*4882a593Smuzhiyun 	while (argc) {
2353*4882a593Smuzhiyun 		r = set_config_value(cache, argv[0], argv[1]);
2354*4882a593Smuzhiyun 		if (r)
2355*4882a593Smuzhiyun 			break;
2356*4882a593Smuzhiyun 
2357*4882a593Smuzhiyun 		argc -= 2;
2358*4882a593Smuzhiyun 		argv += 2;
2359*4882a593Smuzhiyun 	}
2360*4882a593Smuzhiyun 
2361*4882a593Smuzhiyun 	return r;
2362*4882a593Smuzhiyun }
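/*
 * Worked example (illustrative): the pair "migration_threshold 4096" is
 * recognised by process_config_option() above and updates
 * cache->migration_threshold directly, whereas an unknown key falls
 * through with NOT_CORE_OPTION and is offered to the policy via
 * policy_set_config_value(), which may accept or reject it.
 */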
2363*4882a593Smuzhiyun 
2364*4882a593Smuzhiyun static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2365*4882a593Smuzhiyun 			       char **error)
2366*4882a593Smuzhiyun {
2367*4882a593Smuzhiyun 	struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2368*4882a593Smuzhiyun 							   cache->cache_size,
2369*4882a593Smuzhiyun 							   cache->origin_sectors,
2370*4882a593Smuzhiyun 							   cache->sectors_per_block);
2371*4882a593Smuzhiyun 	if (IS_ERR(p)) {
2372*4882a593Smuzhiyun 		*error = "Error creating cache's policy";
2373*4882a593Smuzhiyun 		return PTR_ERR(p);
2374*4882a593Smuzhiyun 	}
2375*4882a593Smuzhiyun 	cache->policy = p;
2376*4882a593Smuzhiyun 	BUG_ON(!cache->policy);
2377*4882a593Smuzhiyun 
2378*4882a593Smuzhiyun 	return 0;
2379*4882a593Smuzhiyun }
2380*4882a593Smuzhiyun 
2381*4882a593Smuzhiyun /*
2382*4882a593Smuzhiyun  * We want the discard block size to be at least the size of the cache
2383*4882a593Smuzhiyun  * block size and have no more than 2^14 discard blocks across the origin.
2384*4882a593Smuzhiyun  */
2385*4882a593Smuzhiyun #define MAX_DISCARD_BLOCKS (1 << 14)
2386*4882a593Smuzhiyun 
2387*4882a593Smuzhiyun static bool too_many_discard_blocks(sector_t discard_block_size,
2388*4882a593Smuzhiyun 				    sector_t origin_size)
2389*4882a593Smuzhiyun {
2390*4882a593Smuzhiyun 	(void) sector_div(origin_size, discard_block_size);
2391*4882a593Smuzhiyun 
2392*4882a593Smuzhiyun 	return origin_size > MAX_DISCARD_BLOCKS;
2393*4882a593Smuzhiyun }
2394*4882a593Smuzhiyun 
2395*4882a593Smuzhiyun static sector_t calculate_discard_block_size(sector_t cache_block_size,
2396*4882a593Smuzhiyun 					     sector_t origin_size)
2397*4882a593Smuzhiyun {
2398*4882a593Smuzhiyun 	sector_t discard_block_size = cache_block_size;
2399*4882a593Smuzhiyun 
2400*4882a593Smuzhiyun 	if (origin_size)
2401*4882a593Smuzhiyun 		while (too_many_discard_blocks(discard_block_size, origin_size))
2402*4882a593Smuzhiyun 			discard_block_size *= 2;
2403*4882a593Smuzhiyun 
2404*4882a593Smuzhiyun 	return discard_block_size;
2405*4882a593Smuzhiyun }
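/*
 * Worked example (illustrative numbers): with 512-sector (256 KiB) cache
 * blocks and a 2 TiB origin (4294967296 sectors), starting at 512 sectors
 * would give 2^23 discard blocks, well above MAX_DISCARD_BLOCKS (2^14).
 * The loop keeps doubling until 4294967296 / discard_block_size <= 16384,
 * settling on 262144 sectors (128 MiB) per discard block.
 */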
2406*4882a593Smuzhiyun 
2407*4882a593Smuzhiyun static void set_cache_size(struct cache *cache, dm_cblock_t size)
2408*4882a593Smuzhiyun {
2409*4882a593Smuzhiyun 	dm_block_t nr_blocks = from_cblock(size);
2410*4882a593Smuzhiyun 
2411*4882a593Smuzhiyun 	if (nr_blocks > (1 << 20) && cache->cache_size != size)
2412*4882a593Smuzhiyun 		DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2413*4882a593Smuzhiyun 			     "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2414*4882a593Smuzhiyun 			     "Please consider increasing the cache block size to reduce the overall cache block count.",
2415*4882a593Smuzhiyun 			     (unsigned long long) nr_blocks);
2416*4882a593Smuzhiyun 
2417*4882a593Smuzhiyun 	cache->cache_size = size;
2418*4882a593Smuzhiyun }
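/*
 * Illustrative example: a 64 GiB cache device carved into 32 KiB blocks
 * gives 2097152 cache blocks, which exceeds the 1 << 20 threshold above
 * and triggers the warning; doubling the block size to 64 KiB halves the
 * number of mappings and stays just inside the limit.
 */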
2419*4882a593Smuzhiyun 
2420*4882a593Smuzhiyun #define DEFAULT_MIGRATION_THRESHOLD 2048
2421*4882a593Smuzhiyun 
2422*4882a593Smuzhiyun static int cache_create(struct cache_args *ca, struct cache **result)
2423*4882a593Smuzhiyun {
2424*4882a593Smuzhiyun 	int r = 0;
2425*4882a593Smuzhiyun 	char **error = &ca->ti->error;
2426*4882a593Smuzhiyun 	struct cache *cache;
2427*4882a593Smuzhiyun 	struct dm_target *ti = ca->ti;
2428*4882a593Smuzhiyun 	dm_block_t origin_blocks;
2429*4882a593Smuzhiyun 	struct dm_cache_metadata *cmd;
2430*4882a593Smuzhiyun 	bool may_format = ca->features.mode == CM_WRITE;
2431*4882a593Smuzhiyun 
2432*4882a593Smuzhiyun 	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2433*4882a593Smuzhiyun 	if (!cache)
2434*4882a593Smuzhiyun 		return -ENOMEM;
2435*4882a593Smuzhiyun 
2436*4882a593Smuzhiyun 	cache->ti = ca->ti;
2437*4882a593Smuzhiyun 	ti->private = cache;
2438*4882a593Smuzhiyun 	ti->num_flush_bios = 2;
2439*4882a593Smuzhiyun 	ti->flush_supported = true;
2440*4882a593Smuzhiyun 
2441*4882a593Smuzhiyun 	ti->num_discard_bios = 1;
2442*4882a593Smuzhiyun 	ti->discards_supported = true;
2443*4882a593Smuzhiyun 
2444*4882a593Smuzhiyun 	ti->per_io_data_size = sizeof(struct per_bio_data);
2445*4882a593Smuzhiyun 
2446*4882a593Smuzhiyun 	cache->features = ca->features;
2447*4882a593Smuzhiyun 	if (writethrough_mode(cache)) {
2448*4882a593Smuzhiyun 		/* Create bioset for writethrough bios issued to origin */
2449*4882a593Smuzhiyun 		r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0);
2450*4882a593Smuzhiyun 		if (r)
2451*4882a593Smuzhiyun 			goto bad;
2452*4882a593Smuzhiyun 	}
2453*4882a593Smuzhiyun 
2454*4882a593Smuzhiyun 	cache->metadata_dev = ca->metadata_dev;
2455*4882a593Smuzhiyun 	cache->origin_dev = ca->origin_dev;
2456*4882a593Smuzhiyun 	cache->cache_dev = ca->cache_dev;
2457*4882a593Smuzhiyun 
2458*4882a593Smuzhiyun 	ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2459*4882a593Smuzhiyun 
2460*4882a593Smuzhiyun 	origin_blocks = cache->origin_sectors = ca->origin_sectors;
2461*4882a593Smuzhiyun 	origin_blocks = block_div(origin_blocks, ca->block_size);
2462*4882a593Smuzhiyun 	cache->origin_blocks = to_oblock(origin_blocks);
2463*4882a593Smuzhiyun 
2464*4882a593Smuzhiyun 	cache->sectors_per_block = ca->block_size;
2465*4882a593Smuzhiyun 	if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2466*4882a593Smuzhiyun 		r = -EINVAL;
2467*4882a593Smuzhiyun 		goto bad;
2468*4882a593Smuzhiyun 	}
2469*4882a593Smuzhiyun 
2470*4882a593Smuzhiyun 	if (ca->block_size & (ca->block_size - 1)) {
2471*4882a593Smuzhiyun 		dm_block_t cache_size = ca->cache_sectors;
2472*4882a593Smuzhiyun 
2473*4882a593Smuzhiyun 		cache->sectors_per_block_shift = -1;
2474*4882a593Smuzhiyun 		cache_size = block_div(cache_size, ca->block_size);
2475*4882a593Smuzhiyun 		set_cache_size(cache, to_cblock(cache_size));
2476*4882a593Smuzhiyun 	} else {
2477*4882a593Smuzhiyun 		cache->sectors_per_block_shift = __ffs(ca->block_size);
2478*4882a593Smuzhiyun 		set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2479*4882a593Smuzhiyun 	}
2480*4882a593Smuzhiyun 
2481*4882a593Smuzhiyun 	r = create_cache_policy(cache, ca, error);
2482*4882a593Smuzhiyun 	if (r)
2483*4882a593Smuzhiyun 		goto bad;
2484*4882a593Smuzhiyun 
2485*4882a593Smuzhiyun 	cache->policy_nr_args = ca->policy_argc;
2486*4882a593Smuzhiyun 	cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2487*4882a593Smuzhiyun 
2488*4882a593Smuzhiyun 	r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2489*4882a593Smuzhiyun 	if (r) {
2490*4882a593Smuzhiyun 		*error = "Error setting cache policy's config values";
2491*4882a593Smuzhiyun 		goto bad;
2492*4882a593Smuzhiyun 	}
2493*4882a593Smuzhiyun 
2494*4882a593Smuzhiyun 	cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2495*4882a593Smuzhiyun 				     ca->block_size, may_format,
2496*4882a593Smuzhiyun 				     dm_cache_policy_get_hint_size(cache->policy),
2497*4882a593Smuzhiyun 				     ca->features.metadata_version);
2498*4882a593Smuzhiyun 	if (IS_ERR(cmd)) {
2499*4882a593Smuzhiyun 		*error = "Error creating metadata object";
2500*4882a593Smuzhiyun 		r = PTR_ERR(cmd);
2501*4882a593Smuzhiyun 		goto bad;
2502*4882a593Smuzhiyun 	}
2503*4882a593Smuzhiyun 	cache->cmd = cmd;
2504*4882a593Smuzhiyun 	set_cache_mode(cache, CM_WRITE);
2505*4882a593Smuzhiyun 	if (get_cache_mode(cache) != CM_WRITE) {
2506*4882a593Smuzhiyun 		*error = "Unable to get write access to metadata, please check/repair metadata.";
2507*4882a593Smuzhiyun 		r = -EINVAL;
2508*4882a593Smuzhiyun 		goto bad;
2509*4882a593Smuzhiyun 	}
2510*4882a593Smuzhiyun 
2511*4882a593Smuzhiyun 	if (passthrough_mode(cache)) {
2512*4882a593Smuzhiyun 		bool all_clean;
2513*4882a593Smuzhiyun 
2514*4882a593Smuzhiyun 		r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2515*4882a593Smuzhiyun 		if (r) {
2516*4882a593Smuzhiyun 			*error = "dm_cache_metadata_all_clean() failed";
2517*4882a593Smuzhiyun 			goto bad;
2518*4882a593Smuzhiyun 		}
2519*4882a593Smuzhiyun 
2520*4882a593Smuzhiyun 		if (!all_clean) {
2521*4882a593Smuzhiyun 			*error = "Cannot enter passthrough mode unless all blocks are clean";
2522*4882a593Smuzhiyun 			r = -EINVAL;
2523*4882a593Smuzhiyun 			goto bad;
2524*4882a593Smuzhiyun 		}
2525*4882a593Smuzhiyun 
2526*4882a593Smuzhiyun 		policy_allow_migrations(cache->policy, false);
2527*4882a593Smuzhiyun 	}
2528*4882a593Smuzhiyun 
2529*4882a593Smuzhiyun 	spin_lock_init(&cache->lock);
2530*4882a593Smuzhiyun 	bio_list_init(&cache->deferred_bios);
2531*4882a593Smuzhiyun 	atomic_set(&cache->nr_allocated_migrations, 0);
2532*4882a593Smuzhiyun 	atomic_set(&cache->nr_io_migrations, 0);
2533*4882a593Smuzhiyun 	init_waitqueue_head(&cache->migration_wait);
2534*4882a593Smuzhiyun 
2535*4882a593Smuzhiyun 	r = -ENOMEM;
2536*4882a593Smuzhiyun 	atomic_set(&cache->nr_dirty, 0);
2537*4882a593Smuzhiyun 	cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2538*4882a593Smuzhiyun 	if (!cache->dirty_bitset) {
2539*4882a593Smuzhiyun 		*error = "could not allocate dirty bitset";
2540*4882a593Smuzhiyun 		goto bad;
2541*4882a593Smuzhiyun 	}
2542*4882a593Smuzhiyun 	clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2543*4882a593Smuzhiyun 
2544*4882a593Smuzhiyun 	cache->discard_block_size =
2545*4882a593Smuzhiyun 		calculate_discard_block_size(cache->sectors_per_block,
2546*4882a593Smuzhiyun 					     cache->origin_sectors);
2547*4882a593Smuzhiyun 	cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2548*4882a593Smuzhiyun 							      cache->discard_block_size));
2549*4882a593Smuzhiyun 	cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2550*4882a593Smuzhiyun 	if (!cache->discard_bitset) {
2551*4882a593Smuzhiyun 		*error = "could not allocate discard bitset";
2552*4882a593Smuzhiyun 		goto bad;
2553*4882a593Smuzhiyun 	}
2554*4882a593Smuzhiyun 	clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2555*4882a593Smuzhiyun 
2556*4882a593Smuzhiyun 	cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2557*4882a593Smuzhiyun 	if (IS_ERR(cache->copier)) {
2558*4882a593Smuzhiyun 		*error = "could not create kcopyd client";
2559*4882a593Smuzhiyun 		r = PTR_ERR(cache->copier);
2560*4882a593Smuzhiyun 		goto bad;
2561*4882a593Smuzhiyun 	}
2562*4882a593Smuzhiyun 
2563*4882a593Smuzhiyun 	cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
2564*4882a593Smuzhiyun 	if (!cache->wq) {
2565*4882a593Smuzhiyun 		*error = "could not create workqueue for metadata object";
2566*4882a593Smuzhiyun 		goto bad;
2567*4882a593Smuzhiyun 	}
2568*4882a593Smuzhiyun 	INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2569*4882a593Smuzhiyun 	INIT_WORK(&cache->migration_worker, check_migrations);
2570*4882a593Smuzhiyun 	INIT_DELAYED_WORK(&cache->waker, do_waker);
2571*4882a593Smuzhiyun 
2572*4882a593Smuzhiyun 	cache->prison = dm_bio_prison_create_v2(cache->wq);
2573*4882a593Smuzhiyun 	if (!cache->prison) {
2574*4882a593Smuzhiyun 		*error = "could not create bio prison";
2575*4882a593Smuzhiyun 		goto bad;
2576*4882a593Smuzhiyun 	}
2577*4882a593Smuzhiyun 
2578*4882a593Smuzhiyun 	r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
2579*4882a593Smuzhiyun 				   migration_cache);
2580*4882a593Smuzhiyun 	if (r) {
2581*4882a593Smuzhiyun 		*error = "Error creating cache's migration mempool";
2582*4882a593Smuzhiyun 		goto bad;
2583*4882a593Smuzhiyun 	}
2584*4882a593Smuzhiyun 
2585*4882a593Smuzhiyun 	cache->need_tick_bio = true;
2586*4882a593Smuzhiyun 	cache->sized = false;
2587*4882a593Smuzhiyun 	cache->invalidate = false;
2588*4882a593Smuzhiyun 	cache->commit_requested = false;
2589*4882a593Smuzhiyun 	cache->loaded_mappings = false;
2590*4882a593Smuzhiyun 	cache->loaded_discards = false;
2591*4882a593Smuzhiyun 
2592*4882a593Smuzhiyun 	load_stats(cache);
2593*4882a593Smuzhiyun 
2594*4882a593Smuzhiyun 	atomic_set(&cache->stats.demotion, 0);
2595*4882a593Smuzhiyun 	atomic_set(&cache->stats.promotion, 0);
2596*4882a593Smuzhiyun 	atomic_set(&cache->stats.copies_avoided, 0);
2597*4882a593Smuzhiyun 	atomic_set(&cache->stats.cache_cell_clash, 0);
2598*4882a593Smuzhiyun 	atomic_set(&cache->stats.commit_count, 0);
2599*4882a593Smuzhiyun 	atomic_set(&cache->stats.discard_count, 0);
2600*4882a593Smuzhiyun 
2601*4882a593Smuzhiyun 	spin_lock_init(&cache->invalidation_lock);
2602*4882a593Smuzhiyun 	INIT_LIST_HEAD(&cache->invalidation_requests);
2603*4882a593Smuzhiyun 
2604*4882a593Smuzhiyun 	batcher_init(&cache->committer, commit_op, cache,
2605*4882a593Smuzhiyun 		     issue_op, cache, cache->wq);
2606*4882a593Smuzhiyun 	iot_init(&cache->tracker);
2607*4882a593Smuzhiyun 
2608*4882a593Smuzhiyun 	init_rwsem(&cache->background_work_lock);
2609*4882a593Smuzhiyun 	prevent_background_work(cache);
2610*4882a593Smuzhiyun 
2611*4882a593Smuzhiyun 	*result = cache;
2612*4882a593Smuzhiyun 	return 0;
2613*4882a593Smuzhiyun bad:
2614*4882a593Smuzhiyun 	destroy(cache);
2615*4882a593Smuzhiyun 	return r;
2616*4882a593Smuzhiyun }
2617*4882a593Smuzhiyun 
2618*4882a593Smuzhiyun static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2619*4882a593Smuzhiyun {
2620*4882a593Smuzhiyun 	unsigned i;
2621*4882a593Smuzhiyun 	const char **copy;
2622*4882a593Smuzhiyun 
2623*4882a593Smuzhiyun 	copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2624*4882a593Smuzhiyun 	if (!copy)
2625*4882a593Smuzhiyun 		return -ENOMEM;
2626*4882a593Smuzhiyun 	for (i = 0; i < argc; i++) {
2627*4882a593Smuzhiyun 		copy[i] = kstrdup(argv[i], GFP_KERNEL);
2628*4882a593Smuzhiyun 		if (!copy[i]) {
2629*4882a593Smuzhiyun 			while (i--)
2630*4882a593Smuzhiyun 				kfree(copy[i]);
2631*4882a593Smuzhiyun 			kfree(copy);
2632*4882a593Smuzhiyun 			return -ENOMEM;
2633*4882a593Smuzhiyun 		}
2634*4882a593Smuzhiyun 	}
2635*4882a593Smuzhiyun 
2636*4882a593Smuzhiyun 	cache->nr_ctr_args = argc;
2637*4882a593Smuzhiyun 	cache->ctr_args = copy;
2638*4882a593Smuzhiyun 
2639*4882a593Smuzhiyun 	return 0;
2640*4882a593Smuzhiyun }
2641*4882a593Smuzhiyun 
2642*4882a593Smuzhiyun static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2643*4882a593Smuzhiyun {
2644*4882a593Smuzhiyun 	int r = -EINVAL;
2645*4882a593Smuzhiyun 	struct cache_args *ca;
2646*4882a593Smuzhiyun 	struct cache *cache = NULL;
2647*4882a593Smuzhiyun 
2648*4882a593Smuzhiyun 	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2649*4882a593Smuzhiyun 	if (!ca) {
2650*4882a593Smuzhiyun 		ti->error = "Error allocating memory for cache";
2651*4882a593Smuzhiyun 		return -ENOMEM;
2652*4882a593Smuzhiyun 	}
2653*4882a593Smuzhiyun 	ca->ti = ti;
2654*4882a593Smuzhiyun 
2655*4882a593Smuzhiyun 	r = parse_cache_args(ca, argc, argv, &ti->error);
2656*4882a593Smuzhiyun 	if (r)
2657*4882a593Smuzhiyun 		goto out;
2658*4882a593Smuzhiyun 
2659*4882a593Smuzhiyun 	r = cache_create(ca, &cache);
2660*4882a593Smuzhiyun 	if (r)
2661*4882a593Smuzhiyun 		goto out;
2662*4882a593Smuzhiyun 
2663*4882a593Smuzhiyun 	r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2664*4882a593Smuzhiyun 	if (r) {
2665*4882a593Smuzhiyun 		destroy(cache);
2666*4882a593Smuzhiyun 		goto out;
2667*4882a593Smuzhiyun 	}
2668*4882a593Smuzhiyun 
2669*4882a593Smuzhiyun 	ti->private = cache;
2670*4882a593Smuzhiyun out:
2671*4882a593Smuzhiyun 	destroy_cache_args(ca);
2672*4882a593Smuzhiyun 	return r;
2673*4882a593Smuzhiyun }
2674*4882a593Smuzhiyun 
2675*4882a593Smuzhiyun /*----------------------------------------------------------------*/
2676*4882a593Smuzhiyun 
2677*4882a593Smuzhiyun static int cache_map(struct dm_target *ti, struct bio *bio)
2678*4882a593Smuzhiyun {
2679*4882a593Smuzhiyun 	struct cache *cache = ti->private;
2680*4882a593Smuzhiyun 
2681*4882a593Smuzhiyun 	int r;
2682*4882a593Smuzhiyun 	bool commit_needed;
2683*4882a593Smuzhiyun 	dm_oblock_t block = get_bio_block(cache, bio);
2684*4882a593Smuzhiyun 
2685*4882a593Smuzhiyun 	init_per_bio_data(bio);
2686*4882a593Smuzhiyun 	if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2687*4882a593Smuzhiyun 		/*
2688*4882a593Smuzhiyun 		 * This can only occur if the io goes to a partial block at
2689*4882a593Smuzhiyun 		 * the end of the origin device.  We don't cache these.
2690*4882a593Smuzhiyun 		 * Just remap to the origin and carry on.
2691*4882a593Smuzhiyun 		 */
2692*4882a593Smuzhiyun 		remap_to_origin(cache, bio);
2693*4882a593Smuzhiyun 		accounted_begin(cache, bio);
2694*4882a593Smuzhiyun 		return DM_MAPIO_REMAPPED;
2695*4882a593Smuzhiyun 	}
2696*4882a593Smuzhiyun 
2697*4882a593Smuzhiyun 	if (discard_or_flush(bio)) {
2698*4882a593Smuzhiyun 		defer_bio(cache, bio);
2699*4882a593Smuzhiyun 		return DM_MAPIO_SUBMITTED;
2700*4882a593Smuzhiyun 	}
2701*4882a593Smuzhiyun 
2702*4882a593Smuzhiyun 	r = map_bio(cache, bio, block, &commit_needed);
2703*4882a593Smuzhiyun 	if (commit_needed)
2704*4882a593Smuzhiyun 		schedule_commit(&cache->committer);
2705*4882a593Smuzhiyun 
2706*4882a593Smuzhiyun 	return r;
2707*4882a593Smuzhiyun }
2708*4882a593Smuzhiyun 
2709*4882a593Smuzhiyun static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
2710*4882a593Smuzhiyun {
2711*4882a593Smuzhiyun 	struct cache *cache = ti->private;
2712*4882a593Smuzhiyun 	unsigned long flags;
2713*4882a593Smuzhiyun 	struct per_bio_data *pb = get_per_bio_data(bio);
2714*4882a593Smuzhiyun 
2715*4882a593Smuzhiyun 	if (pb->tick) {
2716*4882a593Smuzhiyun 		policy_tick(cache->policy, false);
2717*4882a593Smuzhiyun 
2718*4882a593Smuzhiyun 		spin_lock_irqsave(&cache->lock, flags);
2719*4882a593Smuzhiyun 		cache->need_tick_bio = true;
2720*4882a593Smuzhiyun 		spin_unlock_irqrestore(&cache->lock, flags);
2721*4882a593Smuzhiyun 	}
2722*4882a593Smuzhiyun 
2723*4882a593Smuzhiyun 	bio_drop_shared_lock(cache, bio);
2724*4882a593Smuzhiyun 	accounted_complete(cache, bio);
2725*4882a593Smuzhiyun 
2726*4882a593Smuzhiyun 	return DM_ENDIO_DONE;
2727*4882a593Smuzhiyun }
2728*4882a593Smuzhiyun 
2729*4882a593Smuzhiyun static int write_dirty_bitset(struct cache *cache)
2730*4882a593Smuzhiyun {
2731*4882a593Smuzhiyun 	int r;
2732*4882a593Smuzhiyun 
2733*4882a593Smuzhiyun 	if (get_cache_mode(cache) >= CM_READ_ONLY)
2734*4882a593Smuzhiyun 		return -EINVAL;
2735*4882a593Smuzhiyun 
2736*4882a593Smuzhiyun 	r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
2737*4882a593Smuzhiyun 	if (r)
2738*4882a593Smuzhiyun 		metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
2739*4882a593Smuzhiyun 
2740*4882a593Smuzhiyun 	return r;
2741*4882a593Smuzhiyun }
2742*4882a593Smuzhiyun 
2743*4882a593Smuzhiyun static int write_discard_bitset(struct cache *cache)
2744*4882a593Smuzhiyun {
2745*4882a593Smuzhiyun 	unsigned i, r;
2746*4882a593Smuzhiyun 
2747*4882a593Smuzhiyun 	if (get_cache_mode(cache) >= CM_READ_ONLY)
2748*4882a593Smuzhiyun 		return -EINVAL;
2749*4882a593Smuzhiyun 
2750*4882a593Smuzhiyun 	r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2751*4882a593Smuzhiyun 					   cache->discard_nr_blocks);
2752*4882a593Smuzhiyun 	if (r) {
2753*4882a593Smuzhiyun 		DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
2754*4882a593Smuzhiyun 		metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
2755*4882a593Smuzhiyun 		return r;
2756*4882a593Smuzhiyun 	}
2757*4882a593Smuzhiyun 
2758*4882a593Smuzhiyun 	for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2759*4882a593Smuzhiyun 		r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2760*4882a593Smuzhiyun 					 is_discarded(cache, to_dblock(i)));
2761*4882a593Smuzhiyun 		if (r) {
2762*4882a593Smuzhiyun 			metadata_operation_failed(cache, "dm_cache_set_discard", r);
2763*4882a593Smuzhiyun 			return r;
2764*4882a593Smuzhiyun 		}
2765*4882a593Smuzhiyun 	}
2766*4882a593Smuzhiyun 
2767*4882a593Smuzhiyun 	return 0;
2768*4882a593Smuzhiyun }
2769*4882a593Smuzhiyun 
2770*4882a593Smuzhiyun static int write_hints(struct cache *cache)
2771*4882a593Smuzhiyun {
2772*4882a593Smuzhiyun 	int r;
2773*4882a593Smuzhiyun 
2774*4882a593Smuzhiyun 	if (get_cache_mode(cache) >= CM_READ_ONLY)
2775*4882a593Smuzhiyun 		return -EINVAL;
2776*4882a593Smuzhiyun 
2777*4882a593Smuzhiyun 	r = dm_cache_write_hints(cache->cmd, cache->policy);
2778*4882a593Smuzhiyun 	if (r) {
2779*4882a593Smuzhiyun 		metadata_operation_failed(cache, "dm_cache_write_hints", r);
2780*4882a593Smuzhiyun 		return r;
2781*4882a593Smuzhiyun 	}
2782*4882a593Smuzhiyun 
2783*4882a593Smuzhiyun 	return 0;
2784*4882a593Smuzhiyun }
2785*4882a593Smuzhiyun 
2786*4882a593Smuzhiyun /*
2787*4882a593Smuzhiyun  * returns true on success
2788*4882a593Smuzhiyun  */
2789*4882a593Smuzhiyun static bool sync_metadata(struct cache *cache)
2790*4882a593Smuzhiyun {
2791*4882a593Smuzhiyun 	int r1, r2, r3, r4;
2792*4882a593Smuzhiyun 
2793*4882a593Smuzhiyun 	r1 = write_dirty_bitset(cache);
2794*4882a593Smuzhiyun 	if (r1)
2795*4882a593Smuzhiyun 		DMERR("%s: could not write dirty bitset", cache_device_name(cache));
2796*4882a593Smuzhiyun 
2797*4882a593Smuzhiyun 	r2 = write_discard_bitset(cache);
2798*4882a593Smuzhiyun 	if (r2)
2799*4882a593Smuzhiyun 		DMERR("%s: could not write discard bitset", cache_device_name(cache));
2800*4882a593Smuzhiyun 
2801*4882a593Smuzhiyun 	save_stats(cache);
2802*4882a593Smuzhiyun 
2803*4882a593Smuzhiyun 	r3 = write_hints(cache);
2804*4882a593Smuzhiyun 	if (r3)
2805*4882a593Smuzhiyun 		DMERR("%s: could not write hints", cache_device_name(cache));
2806*4882a593Smuzhiyun 
2807*4882a593Smuzhiyun 	/*
2808*4882a593Smuzhiyun 	 * If writing the above metadata failed, we still commit, but don't
2809*4882a593Smuzhiyun 	 * set the clean shutdown flag.  This will effectively force every
2810*4882a593Smuzhiyun 	 * dirty bit to be set on reload.
2811*4882a593Smuzhiyun 	 */
2812*4882a593Smuzhiyun 	r4 = commit(cache, !r1 && !r2 && !r3);
2813*4882a593Smuzhiyun 	if (r4)
2814*4882a593Smuzhiyun 		DMERR("%s: could not write cache metadata", cache_device_name(cache));
2815*4882a593Smuzhiyun 
2816*4882a593Smuzhiyun 	return !r1 && !r2 && !r3 && !r4;
2817*4882a593Smuzhiyun }
2818*4882a593Smuzhiyun 
2819*4882a593Smuzhiyun static void cache_postsuspend(struct dm_target *ti)
2820*4882a593Smuzhiyun {
2821*4882a593Smuzhiyun 	struct cache *cache = ti->private;
2822*4882a593Smuzhiyun 
2823*4882a593Smuzhiyun 	prevent_background_work(cache);
2824*4882a593Smuzhiyun 	BUG_ON(atomic_read(&cache->nr_io_migrations));
2825*4882a593Smuzhiyun 
2826*4882a593Smuzhiyun 	cancel_delayed_work_sync(&cache->waker);
2827*4882a593Smuzhiyun 	drain_workqueue(cache->wq);
2828*4882a593Smuzhiyun 	WARN_ON(cache->tracker.in_flight);
2829*4882a593Smuzhiyun 
2830*4882a593Smuzhiyun 	/*
2831*4882a593Smuzhiyun 	 * If it's a flush suspend there won't be any deferred bios, so this
2832*4882a593Smuzhiyun 	 * call is harmless.
2833*4882a593Smuzhiyun 	 */
2834*4882a593Smuzhiyun 	requeue_deferred_bios(cache);
2835*4882a593Smuzhiyun 
2836*4882a593Smuzhiyun 	if (get_cache_mode(cache) == CM_WRITE)
2837*4882a593Smuzhiyun 		(void) sync_metadata(cache);
2838*4882a593Smuzhiyun }
2839*4882a593Smuzhiyun 
2840*4882a593Smuzhiyun static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2841*4882a593Smuzhiyun 			bool dirty, uint32_t hint, bool hint_valid)
2842*4882a593Smuzhiyun {
2843*4882a593Smuzhiyun 	int r;
2844*4882a593Smuzhiyun 	struct cache *cache = context;
2845*4882a593Smuzhiyun 
2846*4882a593Smuzhiyun 	if (dirty) {
2847*4882a593Smuzhiyun 		set_bit(from_cblock(cblock), cache->dirty_bitset);
2848*4882a593Smuzhiyun 		atomic_inc(&cache->nr_dirty);
2849*4882a593Smuzhiyun 	} else
2850*4882a593Smuzhiyun 		clear_bit(from_cblock(cblock), cache->dirty_bitset);
2851*4882a593Smuzhiyun 
2852*4882a593Smuzhiyun 	r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
2853*4882a593Smuzhiyun 	if (r)
2854*4882a593Smuzhiyun 		return r;
2855*4882a593Smuzhiyun 
2856*4882a593Smuzhiyun 	return 0;
2857*4882a593Smuzhiyun }
2858*4882a593Smuzhiyun 
2859*4882a593Smuzhiyun /*
2860*4882a593Smuzhiyun  * The discard block size in the on disk metadata is not
2861*4882a593Smuzhiyun  * necessarily the same as we're currently using.  So we have to
2862*4882a593Smuzhiyun  * be careful to only set the discarded attribute if we know it
2863*4882a593Smuzhiyun  * covers a complete block of the new size.
2864*4882a593Smuzhiyun  */
2865*4882a593Smuzhiyun struct discard_load_info {
2866*4882a593Smuzhiyun 	struct cache *cache;
2867*4882a593Smuzhiyun 
2868*4882a593Smuzhiyun 	/*
2869*4882a593Smuzhiyun 	 * These blocks are sized using the on disk dblock size, rather
2870*4882a593Smuzhiyun 	 * than the current one.
2871*4882a593Smuzhiyun 	 */
2872*4882a593Smuzhiyun 	dm_block_t block_size;
2873*4882a593Smuzhiyun 	dm_block_t discard_begin, discard_end;
2874*4882a593Smuzhiyun };
2875*4882a593Smuzhiyun 
2876*4882a593Smuzhiyun static void discard_load_info_init(struct cache *cache,
2877*4882a593Smuzhiyun 				   struct discard_load_info *li)
2878*4882a593Smuzhiyun {
2879*4882a593Smuzhiyun 	li->cache = cache;
2880*4882a593Smuzhiyun 	li->discard_begin = li->discard_end = 0;
2881*4882a593Smuzhiyun }
2882*4882a593Smuzhiyun 
2883*4882a593Smuzhiyun static void set_discard_range(struct discard_load_info *li)
2884*4882a593Smuzhiyun {
2885*4882a593Smuzhiyun 	sector_t b, e;
2886*4882a593Smuzhiyun 
2887*4882a593Smuzhiyun 	if (li->discard_begin == li->discard_end)
2888*4882a593Smuzhiyun 		return;
2889*4882a593Smuzhiyun 
2890*4882a593Smuzhiyun 	/*
2891*4882a593Smuzhiyun 	 * Convert to sectors.
2892*4882a593Smuzhiyun 	 */
2893*4882a593Smuzhiyun 	b = li->discard_begin * li->block_size;
2894*4882a593Smuzhiyun 	e = li->discard_end * li->block_size;
2895*4882a593Smuzhiyun 
2896*4882a593Smuzhiyun 	/*
2897*4882a593Smuzhiyun 	 * Then convert back to the current dblock size.
2898*4882a593Smuzhiyun 	 */
2899*4882a593Smuzhiyun 	b = dm_sector_div_up(b, li->cache->discard_block_size);
2900*4882a593Smuzhiyun 	sector_div(e, li->cache->discard_block_size);
2901*4882a593Smuzhiyun 
2902*4882a593Smuzhiyun 	/*
2903*4882a593Smuzhiyun 	 * The origin may have shrunk, so we need to check we're still in
2904*4882a593Smuzhiyun 	 * bounds.
2905*4882a593Smuzhiyun 	 */
2906*4882a593Smuzhiyun 	if (e > from_dblock(li->cache->discard_nr_blocks))
2907*4882a593Smuzhiyun 		e = from_dblock(li->cache->discard_nr_blocks);
2908*4882a593Smuzhiyun 
2909*4882a593Smuzhiyun 	for (; b < e; b++)
2910*4882a593Smuzhiyun 		set_discard(li->cache, to_dblock(b));
2911*4882a593Smuzhiyun }
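/*
 * Worked example (illustrative): suppose the metadata was written with a
 * 128-sector dblock size and records blocks [8, 16) as discarded, while
 * the current discard_block_size is 512 sectors.  That range covers
 * sectors [1024, 2048), which rounds inwards to current dblocks [2, 4),
 * so dblocks 2 and 3 are marked.  A narrower on-disk range such as
 * [10, 14) covers sectors [1280, 1792), rounds inwards to the empty range
 * [3, 3), and marks nothing; only complete blocks of the new size are
 * ever set.
 */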
2912*4882a593Smuzhiyun 
2913*4882a593Smuzhiyun static int load_discard(void *context, sector_t discard_block_size,
2914*4882a593Smuzhiyun 			dm_dblock_t dblock, bool discard)
2915*4882a593Smuzhiyun {
2916*4882a593Smuzhiyun 	struct discard_load_info *li = context;
2917*4882a593Smuzhiyun 
2918*4882a593Smuzhiyun 	li->block_size = discard_block_size;
2919*4882a593Smuzhiyun 
2920*4882a593Smuzhiyun 	if (discard) {
2921*4882a593Smuzhiyun 		if (from_dblock(dblock) == li->discard_end)
2922*4882a593Smuzhiyun 			/*
2923*4882a593Smuzhiyun 			 * We're already in a discard range, just extend it.
2924*4882a593Smuzhiyun 			 */
2925*4882a593Smuzhiyun 			li->discard_end = li->discard_end + 1ULL;
2926*4882a593Smuzhiyun 
2927*4882a593Smuzhiyun 		else {
2928*4882a593Smuzhiyun 			/*
2929*4882a593Smuzhiyun 			 * Emit the old range and start a new one.
2930*4882a593Smuzhiyun 			 */
2931*4882a593Smuzhiyun 			set_discard_range(li);
2932*4882a593Smuzhiyun 			li->discard_begin = from_dblock(dblock);
2933*4882a593Smuzhiyun 			li->discard_end = li->discard_begin + 1ULL;
2934*4882a593Smuzhiyun 		}
2935*4882a593Smuzhiyun 	} else {
2936*4882a593Smuzhiyun 		set_discard_range(li);
2937*4882a593Smuzhiyun 		li->discard_begin = li->discard_end = 0;
2938*4882a593Smuzhiyun 	}
2939*4882a593Smuzhiyun 
2940*4882a593Smuzhiyun 	return 0;
2941*4882a593Smuzhiyun }
2942*4882a593Smuzhiyun 
2943*4882a593Smuzhiyun static dm_cblock_t get_cache_dev_size(struct cache *cache)
2944*4882a593Smuzhiyun {
2945*4882a593Smuzhiyun 	sector_t size = get_dev_size(cache->cache_dev);
2946*4882a593Smuzhiyun 	(void) sector_div(size, cache->sectors_per_block);
2947*4882a593Smuzhiyun 	return to_cblock(size);
2948*4882a593Smuzhiyun }
2949*4882a593Smuzhiyun 
2950*4882a593Smuzhiyun static bool can_resize(struct cache *cache, dm_cblock_t new_size)
2951*4882a593Smuzhiyun {
2952*4882a593Smuzhiyun 	if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
2953*4882a593Smuzhiyun 		if (cache->sized) {
2954*4882a593Smuzhiyun 			DMERR("%s: unable to extend cache due to missing cache table reload",
2955*4882a593Smuzhiyun 			      cache_device_name(cache));
2956*4882a593Smuzhiyun 			return false;
2957*4882a593Smuzhiyun 		}
2958*4882a593Smuzhiyun 	}
2959*4882a593Smuzhiyun 
2960*4882a593Smuzhiyun 	/*
2961*4882a593Smuzhiyun 	 * We can't drop a dirty block when shrinking the cache.
2962*4882a593Smuzhiyun 	 */
2963*4882a593Smuzhiyun 	while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
2964*4882a593Smuzhiyun 		new_size = to_cblock(from_cblock(new_size) + 1);
2965*4882a593Smuzhiyun 		if (is_dirty(cache, new_size)) {
2966*4882a593Smuzhiyun 			DMERR("%s: unable to shrink cache; cache block %llu is dirty",
2967*4882a593Smuzhiyun 			      cache_device_name(cache),
2968*4882a593Smuzhiyun 			      (unsigned long long) from_cblock(new_size));
2969*4882a593Smuzhiyun 			return false;
2970*4882a593Smuzhiyun 		}
2971*4882a593Smuzhiyun 	}
2972*4882a593Smuzhiyun 
2973*4882a593Smuzhiyun 	return true;
2974*4882a593Smuzhiyun }
2975*4882a593Smuzhiyun 
2976*4882a593Smuzhiyun static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
2977*4882a593Smuzhiyun {
2978*4882a593Smuzhiyun 	int r;
2979*4882a593Smuzhiyun 
2980*4882a593Smuzhiyun 	r = dm_cache_resize(cache->cmd, new_size);
2981*4882a593Smuzhiyun 	if (r) {
2982*4882a593Smuzhiyun 		DMERR("%s: could not resize cache metadata", cache_device_name(cache));
2983*4882a593Smuzhiyun 		metadata_operation_failed(cache, "dm_cache_resize", r);
2984*4882a593Smuzhiyun 		return r;
2985*4882a593Smuzhiyun 	}
2986*4882a593Smuzhiyun 
2987*4882a593Smuzhiyun 	set_cache_size(cache, new_size);
2988*4882a593Smuzhiyun 
2989*4882a593Smuzhiyun 	return 0;
2990*4882a593Smuzhiyun }
2991*4882a593Smuzhiyun 
2992*4882a593Smuzhiyun static int cache_preresume(struct dm_target *ti)
2993*4882a593Smuzhiyun {
2994*4882a593Smuzhiyun 	int r = 0;
2995*4882a593Smuzhiyun 	struct cache *cache = ti->private;
2996*4882a593Smuzhiyun 	dm_cblock_t csize = get_cache_dev_size(cache);
2997*4882a593Smuzhiyun 
2998*4882a593Smuzhiyun 	/*
2999*4882a593Smuzhiyun 	 * Check to see if the cache has resized.
3000*4882a593Smuzhiyun 	 */
3001*4882a593Smuzhiyun 	if (!cache->sized) {
3002*4882a593Smuzhiyun 		r = resize_cache_dev(cache, csize);
3003*4882a593Smuzhiyun 		if (r)
3004*4882a593Smuzhiyun 			return r;
3005*4882a593Smuzhiyun 
3006*4882a593Smuzhiyun 		cache->sized = true;
3007*4882a593Smuzhiyun 
3008*4882a593Smuzhiyun 	} else if (csize != cache->cache_size) {
3009*4882a593Smuzhiyun 		if (!can_resize(cache, csize))
3010*4882a593Smuzhiyun 			return -EINVAL;
3011*4882a593Smuzhiyun 
3012*4882a593Smuzhiyun 		r = resize_cache_dev(cache, csize);
3013*4882a593Smuzhiyun 		if (r)
3014*4882a593Smuzhiyun 			return r;
3015*4882a593Smuzhiyun 	}
3016*4882a593Smuzhiyun 
3017*4882a593Smuzhiyun 	if (!cache->loaded_mappings) {
3018*4882a593Smuzhiyun 		r = dm_cache_load_mappings(cache->cmd, cache->policy,
3019*4882a593Smuzhiyun 					   load_mapping, cache);
3020*4882a593Smuzhiyun 		if (r) {
3021*4882a593Smuzhiyun 			DMERR("%s: could not load cache mappings", cache_device_name(cache));
3022*4882a593Smuzhiyun 			metadata_operation_failed(cache, "dm_cache_load_mappings", r);
3023*4882a593Smuzhiyun 			return r;
3024*4882a593Smuzhiyun 		}
3025*4882a593Smuzhiyun 
3026*4882a593Smuzhiyun 		cache->loaded_mappings = true;
3027*4882a593Smuzhiyun 	}
3028*4882a593Smuzhiyun 
3029*4882a593Smuzhiyun 	if (!cache->loaded_discards) {
3030*4882a593Smuzhiyun 		struct discard_load_info li;
3031*4882a593Smuzhiyun 
3032*4882a593Smuzhiyun 		/*
3033*4882a593Smuzhiyun 		 * The discard bitset could have been resized, or the
3034*4882a593Smuzhiyun 		 * discard block size changed.  To be safe we start by
3035*4882a593Smuzhiyun 		 * setting every dblock to not discarded.
3036*4882a593Smuzhiyun 		 */
3037*4882a593Smuzhiyun 		clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
3038*4882a593Smuzhiyun 
3039*4882a593Smuzhiyun 		discard_load_info_init(cache, &li);
3040*4882a593Smuzhiyun 		r = dm_cache_load_discards(cache->cmd, load_discard, &li);
3041*4882a593Smuzhiyun 		if (r) {
3042*4882a593Smuzhiyun 			DMERR("%s: could not load origin discards", cache_device_name(cache));
3043*4882a593Smuzhiyun 			metadata_operation_failed(cache, "dm_cache_load_discards", r);
3044*4882a593Smuzhiyun 			return r;
3045*4882a593Smuzhiyun 		}
3046*4882a593Smuzhiyun 		set_discard_range(&li);
3047*4882a593Smuzhiyun 
3048*4882a593Smuzhiyun 		cache->loaded_discards = true;
3049*4882a593Smuzhiyun 	}
3050*4882a593Smuzhiyun 
3051*4882a593Smuzhiyun 	return r;
3052*4882a593Smuzhiyun }
3053*4882a593Smuzhiyun 
3054*4882a593Smuzhiyun static void cache_resume(struct dm_target *ti)
3055*4882a593Smuzhiyun {
3056*4882a593Smuzhiyun 	struct cache *cache = ti->private;
3057*4882a593Smuzhiyun 
3058*4882a593Smuzhiyun 	cache->need_tick_bio = true;
3059*4882a593Smuzhiyun 	allow_background_work(cache);
3060*4882a593Smuzhiyun 	do_waker(&cache->waker.work);
3061*4882a593Smuzhiyun }
3062*4882a593Smuzhiyun 
3063*4882a593Smuzhiyun static void emit_flags(struct cache *cache, char *result,
3064*4882a593Smuzhiyun 		       unsigned maxlen, ssize_t *sz_ptr)
3065*4882a593Smuzhiyun {
3066*4882a593Smuzhiyun 	ssize_t sz = *sz_ptr;
3067*4882a593Smuzhiyun 	struct cache_features *cf = &cache->features;
3068*4882a593Smuzhiyun 	unsigned count = (cf->metadata_version == 2) + !cf->discard_passdown + 1;
3069*4882a593Smuzhiyun 
3070*4882a593Smuzhiyun 	DMEMIT("%u ", count);
3071*4882a593Smuzhiyun 
3072*4882a593Smuzhiyun 	if (cf->metadata_version == 2)
3073*4882a593Smuzhiyun 		DMEMIT("metadata2 ");
3074*4882a593Smuzhiyun 
3075*4882a593Smuzhiyun 	if (writethrough_mode(cache))
3076*4882a593Smuzhiyun 		DMEMIT("writethrough ");
3077*4882a593Smuzhiyun 
3078*4882a593Smuzhiyun 	else if (passthrough_mode(cache))
3079*4882a593Smuzhiyun 		DMEMIT("passthrough ");
3080*4882a593Smuzhiyun 
3081*4882a593Smuzhiyun 	else if (writeback_mode(cache))
3082*4882a593Smuzhiyun 		DMEMIT("writeback ");
3083*4882a593Smuzhiyun 
3084*4882a593Smuzhiyun 	else {
3085*4882a593Smuzhiyun 		DMEMIT("unknown ");
3086*4882a593Smuzhiyun 		DMERR("%s: internal error: unknown io mode: %d",
3087*4882a593Smuzhiyun 		      cache_device_name(cache), (int) cf->io_mode);
3088*4882a593Smuzhiyun 	}
3089*4882a593Smuzhiyun 
3090*4882a593Smuzhiyun 	if (!cf->discard_passdown)
3091*4882a593Smuzhiyun 		DMEMIT("no_discard_passdown ");
3092*4882a593Smuzhiyun 
3093*4882a593Smuzhiyun 	*sz_ptr = sz;
3094*4882a593Smuzhiyun }
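/*
 * Illustrative example: with metadata2 enabled, writethrough mode and
 * discard passdown disabled, count evaluates to 1 + 1 + 1 = 3 and the
 * emitted fragment is "3 metadata2 writethrough no_discard_passdown ".
 */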
3095*4882a593Smuzhiyun 
3096*4882a593Smuzhiyun /*
3097*4882a593Smuzhiyun  * Status format:
3098*4882a593Smuzhiyun  *
3099*4882a593Smuzhiyun  * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3100*4882a593Smuzhiyun  * <cache block size> <#used cache blocks>/<#total cache blocks>
3101*4882a593Smuzhiyun  * <#read hits> <#read misses> <#write hits> <#write misses>
3102*4882a593Smuzhiyun  * <#demotions> <#promotions> <#dirty>
3103*4882a593Smuzhiyun  * <#features> <features>*
3104*4882a593Smuzhiyun  * <#core args> <core args>
3105*4882a593Smuzhiyun  * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
3106*4882a593Smuzhiyun  */
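/*
 * Example STATUSTYPE_INFO line under normal operation (all values are
 * invented, wrapped here for readability):
 *
 *   8 72/65536 128 4985/131072 3842 1017 2801 512 61 282 96
 *   1 writeback 2 migration_threshold 2048 smq 0 rw -
 */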
3107*4882a593Smuzhiyun static void cache_status(struct dm_target *ti, status_type_t type,
3108*4882a593Smuzhiyun 			 unsigned status_flags, char *result, unsigned maxlen)
3109*4882a593Smuzhiyun {
3110*4882a593Smuzhiyun 	int r = 0;
3111*4882a593Smuzhiyun 	unsigned i;
3112*4882a593Smuzhiyun 	ssize_t sz = 0;
3113*4882a593Smuzhiyun 	dm_block_t nr_free_blocks_metadata = 0;
3114*4882a593Smuzhiyun 	dm_block_t nr_blocks_metadata = 0;
3115*4882a593Smuzhiyun 	char buf[BDEVNAME_SIZE];
3116*4882a593Smuzhiyun 	struct cache *cache = ti->private;
3117*4882a593Smuzhiyun 	dm_cblock_t residency;
3118*4882a593Smuzhiyun 	bool needs_check;
3119*4882a593Smuzhiyun 
3120*4882a593Smuzhiyun 	switch (type) {
3121*4882a593Smuzhiyun 	case STATUSTYPE_INFO:
3122*4882a593Smuzhiyun 		if (get_cache_mode(cache) == CM_FAIL) {
3123*4882a593Smuzhiyun 			DMEMIT("Fail");
3124*4882a593Smuzhiyun 			break;
3125*4882a593Smuzhiyun 		}
3126*4882a593Smuzhiyun 
3127*4882a593Smuzhiyun 		/* Commit to ensure statistics aren't out-of-date */
3128*4882a593Smuzhiyun 		if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3129*4882a593Smuzhiyun 			(void) commit(cache, false);
3130*4882a593Smuzhiyun 
3131*4882a593Smuzhiyun 		r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
3132*4882a593Smuzhiyun 		if (r) {
3133*4882a593Smuzhiyun 			DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
3134*4882a593Smuzhiyun 			      cache_device_name(cache), r);
3135*4882a593Smuzhiyun 			goto err;
3136*4882a593Smuzhiyun 		}
3137*4882a593Smuzhiyun 
3138*4882a593Smuzhiyun 		r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3139*4882a593Smuzhiyun 		if (r) {
3140*4882a593Smuzhiyun 			DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
3141*4882a593Smuzhiyun 			      cache_device_name(cache), r);
3142*4882a593Smuzhiyun 			goto err;
3143*4882a593Smuzhiyun 		}
3144*4882a593Smuzhiyun 
3145*4882a593Smuzhiyun 		residency = policy_residency(cache->policy);
3146*4882a593Smuzhiyun 
3147*4882a593Smuzhiyun 		DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
3148*4882a593Smuzhiyun 		       (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3149*4882a593Smuzhiyun 		       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3150*4882a593Smuzhiyun 		       (unsigned long long)nr_blocks_metadata,
3151*4882a593Smuzhiyun 		       (unsigned long long)cache->sectors_per_block,
3152*4882a593Smuzhiyun 		       (unsigned long long) from_cblock(residency),
3153*4882a593Smuzhiyun 		       (unsigned long long) from_cblock(cache->cache_size),
3154*4882a593Smuzhiyun 		       (unsigned) atomic_read(&cache->stats.read_hit),
3155*4882a593Smuzhiyun 		       (unsigned) atomic_read(&cache->stats.read_miss),
3156*4882a593Smuzhiyun 		       (unsigned) atomic_read(&cache->stats.write_hit),
3157*4882a593Smuzhiyun 		       (unsigned) atomic_read(&cache->stats.write_miss),
3158*4882a593Smuzhiyun 		       (unsigned) atomic_read(&cache->stats.demotion),
3159*4882a593Smuzhiyun 		       (unsigned) atomic_read(&cache->stats.promotion),
3160*4882a593Smuzhiyun 		       (unsigned long) atomic_read(&cache->nr_dirty));
3161*4882a593Smuzhiyun 
3162*4882a593Smuzhiyun 		emit_flags(cache, result, maxlen, &sz);
3163*4882a593Smuzhiyun 
3164*4882a593Smuzhiyun 		DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3165*4882a593Smuzhiyun 
3166*4882a593Smuzhiyun 		DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3167*4882a593Smuzhiyun 		if (sz < maxlen) {
3168*4882a593Smuzhiyun 			r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
3169*4882a593Smuzhiyun 			if (r)
3170*4882a593Smuzhiyun 				DMERR("%s: policy_emit_config_values returned %d",
3171*4882a593Smuzhiyun 				      cache_device_name(cache), r);
3172*4882a593Smuzhiyun 		}
3173*4882a593Smuzhiyun 
3174*4882a593Smuzhiyun 		if (get_cache_mode(cache) == CM_READ_ONLY)
3175*4882a593Smuzhiyun 			DMEMIT("ro ");
3176*4882a593Smuzhiyun 		else
3177*4882a593Smuzhiyun 			DMEMIT("rw ");
3178*4882a593Smuzhiyun 
3179*4882a593Smuzhiyun 		r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
3180*4882a593Smuzhiyun 
3181*4882a593Smuzhiyun 		if (r || needs_check)
3182*4882a593Smuzhiyun 			DMEMIT("needs_check ");
3183*4882a593Smuzhiyun 		else
3184*4882a593Smuzhiyun 			DMEMIT("- ");
3185*4882a593Smuzhiyun 
3186*4882a593Smuzhiyun 		break;
3187*4882a593Smuzhiyun 
3188*4882a593Smuzhiyun 	case STATUSTYPE_TABLE:
3189*4882a593Smuzhiyun 		format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3190*4882a593Smuzhiyun 		DMEMIT("%s ", buf);
3191*4882a593Smuzhiyun 		format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3192*4882a593Smuzhiyun 		DMEMIT("%s ", buf);
3193*4882a593Smuzhiyun 		format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3194*4882a593Smuzhiyun 		DMEMIT("%s", buf);
3195*4882a593Smuzhiyun 
3196*4882a593Smuzhiyun 		for (i = 0; i < cache->nr_ctr_args - 1; i++)
3197*4882a593Smuzhiyun 			DMEMIT(" %s", cache->ctr_args[i]);
3198*4882a593Smuzhiyun 		if (cache->nr_ctr_args)
3199*4882a593Smuzhiyun 			DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3200*4882a593Smuzhiyun 	}
3201*4882a593Smuzhiyun 
3202*4882a593Smuzhiyun 	return;
3203*4882a593Smuzhiyun 
3204*4882a593Smuzhiyun err:
3205*4882a593Smuzhiyun 	DMEMIT("Error");
3206*4882a593Smuzhiyun }
3207*4882a593Smuzhiyun 
3208*4882a593Smuzhiyun /*
3209*4882a593Smuzhiyun  * Defines a range of cblocks, begin to (end - 1) are in the range.  end is
3210*4882a593Smuzhiyun  * the one-past-the-end value.
3211*4882a593Smuzhiyun  */
3212*4882a593Smuzhiyun struct cblock_range {
3213*4882a593Smuzhiyun 	dm_cblock_t begin;
3214*4882a593Smuzhiyun 	dm_cblock_t end;
3215*4882a593Smuzhiyun };
3216*4882a593Smuzhiyun 
3217*4882a593Smuzhiyun /*
3218*4882a593Smuzhiyun  * A cache block range can take two forms:
3219*4882a593Smuzhiyun  *
3220*4882a593Smuzhiyun  * i) A single cblock, eg. '3456'
3221*4882a593Smuzhiyun  * ii) A begin and end cblock with a dash between, eg. 123-234
3222*4882a593Smuzhiyun  */
3223*4882a593Smuzhiyun static int parse_cblock_range(struct cache *cache, const char *str,
3224*4882a593Smuzhiyun 			      struct cblock_range *result)
3225*4882a593Smuzhiyun {
3226*4882a593Smuzhiyun 	char dummy;
3227*4882a593Smuzhiyun 	uint64_t b, e;
3228*4882a593Smuzhiyun 	int r;
3229*4882a593Smuzhiyun 
3230*4882a593Smuzhiyun 	/*
3231*4882a593Smuzhiyun 	 * Try and parse form (ii) first.
3232*4882a593Smuzhiyun 	 */
3233*4882a593Smuzhiyun 	r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3234*4882a593Smuzhiyun 	if (r < 0)
3235*4882a593Smuzhiyun 		return r;
3236*4882a593Smuzhiyun 
3237*4882a593Smuzhiyun 	if (r == 2) {
3238*4882a593Smuzhiyun 		result->begin = to_cblock(b);
3239*4882a593Smuzhiyun 		result->end = to_cblock(e);
3240*4882a593Smuzhiyun 		return 0;
3241*4882a593Smuzhiyun 	}
3242*4882a593Smuzhiyun 
3243*4882a593Smuzhiyun 	/*
3244*4882a593Smuzhiyun 	 * That didn't work, try form (i).
3245*4882a593Smuzhiyun 	 */
3246*4882a593Smuzhiyun 	r = sscanf(str, "%llu%c", &b, &dummy);
3247*4882a593Smuzhiyun 	if (r < 0)
3248*4882a593Smuzhiyun 		return r;
3249*4882a593Smuzhiyun 
3250*4882a593Smuzhiyun 	if (r == 1) {
3251*4882a593Smuzhiyun 		result->begin = to_cblock(b);
3252*4882a593Smuzhiyun 		result->end = to_cblock(from_cblock(result->begin) + 1u);
3253*4882a593Smuzhiyun 		return 0;
3254*4882a593Smuzhiyun 	}
3255*4882a593Smuzhiyun 
3256*4882a593Smuzhiyun 	DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
3257*4882a593Smuzhiyun 	return -EINVAL;
3258*4882a593Smuzhiyun }
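/*
 * Illustrative parses: "3456" yields the single-block range [3456, 3457),
 * while "123-234" yields [123, 234), i.e. cblocks 123 up to and including
 * 233.  A string with trailing garbage matches neither sscanf() pattern
 * exactly (the %c conversion fires), so it falls through to the error
 * path and returns -EINVAL.
 */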
3259*4882a593Smuzhiyun 
3260*4882a593Smuzhiyun static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3261*4882a593Smuzhiyun {
3262*4882a593Smuzhiyun 	uint64_t b = from_cblock(range->begin);
3263*4882a593Smuzhiyun 	uint64_t e = from_cblock(range->end);
3264*4882a593Smuzhiyun 	uint64_t n = from_cblock(cache->cache_size);
3265*4882a593Smuzhiyun 
3266*4882a593Smuzhiyun 	if (b >= n) {
3267*4882a593Smuzhiyun 		DMERR("%s: begin cblock out of range: %llu >= %llu",
3268*4882a593Smuzhiyun 		      cache_device_name(cache), b, n);
3269*4882a593Smuzhiyun 		return -EINVAL;
3270*4882a593Smuzhiyun 	}
3271*4882a593Smuzhiyun 
3272*4882a593Smuzhiyun 	if (e > n) {
3273*4882a593Smuzhiyun 		DMERR("%s: end cblock out of range: %llu > %llu",
3274*4882a593Smuzhiyun 		      cache_device_name(cache), e, n);
3275*4882a593Smuzhiyun 		return -EINVAL;
3276*4882a593Smuzhiyun 	}
3277*4882a593Smuzhiyun 
3278*4882a593Smuzhiyun 	if (b >= e) {
3279*4882a593Smuzhiyun 		DMERR("%s: invalid cblock range: %llu >= %llu",
3280*4882a593Smuzhiyun 		      cache_device_name(cache), b, e);
3281*4882a593Smuzhiyun 		return -EINVAL;
3282*4882a593Smuzhiyun 	}
3283*4882a593Smuzhiyun 
3284*4882a593Smuzhiyun 	return 0;
3285*4882a593Smuzhiyun }
3286*4882a593Smuzhiyun 
3287*4882a593Smuzhiyun static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3288*4882a593Smuzhiyun {
3289*4882a593Smuzhiyun 	return to_cblock(from_cblock(b) + 1);
3290*4882a593Smuzhiyun }
3291*4882a593Smuzhiyun 
3292*4882a593Smuzhiyun static int request_invalidation(struct cache *cache, struct cblock_range *range)
3293*4882a593Smuzhiyun {
3294*4882a593Smuzhiyun 	int r = 0;
3295*4882a593Smuzhiyun 
3296*4882a593Smuzhiyun 	/*
3297*4882a593Smuzhiyun 	 * We don't need to do any locking here because we know we're in
3298*4882a593Smuzhiyun 	 * passthrough mode.  There is potential for a race between an
3299*4882a593Smuzhiyun 	 * invalidation triggered by an io and an invalidation message.  This
3300*4882a593Smuzhiyun 	 * is harmless, we must not worry if the policy call fails.
3301*4882a593Smuzhiyun 	 */
3302*4882a593Smuzhiyun 	while (range->begin != range->end) {
3303*4882a593Smuzhiyun 		r = invalidate_cblock(cache, range->begin);
3304*4882a593Smuzhiyun 		if (r)
3305*4882a593Smuzhiyun 			return r;
3306*4882a593Smuzhiyun 
3307*4882a593Smuzhiyun 		range->begin = cblock_succ(range->begin);
3308*4882a593Smuzhiyun 	}
3309*4882a593Smuzhiyun 
3310*4882a593Smuzhiyun 	cache->commit_requested = true;
3311*4882a593Smuzhiyun 	return r;
3312*4882a593Smuzhiyun }
3313*4882a593Smuzhiyun 
3314*4882a593Smuzhiyun static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3315*4882a593Smuzhiyun 					      const char **cblock_ranges)
3316*4882a593Smuzhiyun {
3317*4882a593Smuzhiyun 	int r = 0;
3318*4882a593Smuzhiyun 	unsigned i;
3319*4882a593Smuzhiyun 	struct cblock_range range;
3320*4882a593Smuzhiyun 
3321*4882a593Smuzhiyun 	if (!passthrough_mode(cache)) {
3322*4882a593Smuzhiyun 		DMERR("%s: cache has to be in passthrough mode for invalidation",
3323*4882a593Smuzhiyun 		      cache_device_name(cache));
3324*4882a593Smuzhiyun 		return -EPERM;
3325*4882a593Smuzhiyun 	}
3326*4882a593Smuzhiyun 
3327*4882a593Smuzhiyun 	for (i = 0; i < count; i++) {
3328*4882a593Smuzhiyun 		r = parse_cblock_range(cache, cblock_ranges[i], &range);
3329*4882a593Smuzhiyun 		if (r)
3330*4882a593Smuzhiyun 			break;
3331*4882a593Smuzhiyun 
3332*4882a593Smuzhiyun 		r = validate_cblock_range(cache, &range);
3333*4882a593Smuzhiyun 		if (r)
3334*4882a593Smuzhiyun 			break;
3335*4882a593Smuzhiyun 
3336*4882a593Smuzhiyun 		/*
3337*4882a593Smuzhiyun 		 * Pass begin and end origin blocks to the worker and wake it.
3338*4882a593Smuzhiyun 		 */
3339*4882a593Smuzhiyun 		r = request_invalidation(cache, &range);
3340*4882a593Smuzhiyun 		if (r)
3341*4882a593Smuzhiyun 			break;
3342*4882a593Smuzhiyun 	}
3343*4882a593Smuzhiyun 
3344*4882a593Smuzhiyun 	return r;
3345*4882a593Smuzhiyun }
3346*4882a593Smuzhiyun 
3347*4882a593Smuzhiyun /*
3348*4882a593Smuzhiyun  * Supports
3349*4882a593Smuzhiyun  *	"<key> <value>"
3350*4882a593Smuzhiyun  * and
3351*4882a593Smuzhiyun  *	"invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
3352*4882a593Smuzhiyun  *
3353*4882a593Smuzhiyun  * The key migration_threshold is supported by the cache target core.
3354*4882a593Smuzhiyun  */
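/*
 * Illustrative userspace usage (device name invented):
 *
 *   dmsetup message my-cache 0 migration_threshold 4096
 *   dmsetup message my-cache 0 invalidate_cblocks 3456 7890-8899
 *
 * The invalidate_cblocks form is only honoured while the target is in
 * passthrough mode, as checked in process_invalidate_cblocks_message().
 */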
3355*4882a593Smuzhiyun static int cache_message(struct dm_target *ti, unsigned argc, char **argv,
3356*4882a593Smuzhiyun 			 char *result, unsigned maxlen)
3357*4882a593Smuzhiyun {
3358*4882a593Smuzhiyun 	struct cache *cache = ti->private;
3359*4882a593Smuzhiyun 
3360*4882a593Smuzhiyun 	if (!argc)
3361*4882a593Smuzhiyun 		return -EINVAL;
3362*4882a593Smuzhiyun 
3363*4882a593Smuzhiyun 	if (get_cache_mode(cache) >= CM_READ_ONLY) {
3364*4882a593Smuzhiyun 		DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
3365*4882a593Smuzhiyun 		      cache_device_name(cache));
3366*4882a593Smuzhiyun 		return -EOPNOTSUPP;
3367*4882a593Smuzhiyun 	}
3368*4882a593Smuzhiyun 
3369*4882a593Smuzhiyun 	if (!strcasecmp(argv[0], "invalidate_cblocks"))
3370*4882a593Smuzhiyun 		return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3371*4882a593Smuzhiyun 
3372*4882a593Smuzhiyun 	if (argc != 2)
3373*4882a593Smuzhiyun 		return -EINVAL;
3374*4882a593Smuzhiyun 
3375*4882a593Smuzhiyun 	return set_config_value(cache, argv[0], argv[1]);
3376*4882a593Smuzhiyun }
3377*4882a593Smuzhiyun 
3378*4882a593Smuzhiyun static int cache_iterate_devices(struct dm_target *ti,
3379*4882a593Smuzhiyun 				 iterate_devices_callout_fn fn, void *data)
3380*4882a593Smuzhiyun {
3381*4882a593Smuzhiyun 	int r = 0;
3382*4882a593Smuzhiyun 	struct cache *cache = ti->private;
3383*4882a593Smuzhiyun 
3384*4882a593Smuzhiyun 	r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3385*4882a593Smuzhiyun 	if (!r)
3386*4882a593Smuzhiyun 		r = fn(ti, cache->origin_dev, 0, ti->len, data);
3387*4882a593Smuzhiyun 
3388*4882a593Smuzhiyun 	return r;
3389*4882a593Smuzhiyun }
3390*4882a593Smuzhiyun 
3391*4882a593Smuzhiyun static bool origin_dev_supports_discard(struct block_device *origin_bdev)
3392*4882a593Smuzhiyun {
3393*4882a593Smuzhiyun 	struct request_queue *q = bdev_get_queue(origin_bdev);
3394*4882a593Smuzhiyun 
3395*4882a593Smuzhiyun 	return q && blk_queue_discard(q);
3396*4882a593Smuzhiyun }
3397*4882a593Smuzhiyun 
3398*4882a593Smuzhiyun /*
3399*4882a593Smuzhiyun  * If discard_passdown was enabled verify that the origin device
3400*4882a593Smuzhiyun  * supports discards.  Disable discard_passdown if not.
3401*4882a593Smuzhiyun  */
3402*4882a593Smuzhiyun static void disable_passdown_if_not_supported(struct cache *cache)
3403*4882a593Smuzhiyun {
3404*4882a593Smuzhiyun 	struct block_device *origin_bdev = cache->origin_dev->bdev;
3405*4882a593Smuzhiyun 	struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
3406*4882a593Smuzhiyun 	const char *reason = NULL;
3407*4882a593Smuzhiyun 	char buf[BDEVNAME_SIZE];
3408*4882a593Smuzhiyun 
3409*4882a593Smuzhiyun 	if (!cache->features.discard_passdown)
3410*4882a593Smuzhiyun 		return;
3411*4882a593Smuzhiyun 
3412*4882a593Smuzhiyun 	if (!origin_dev_supports_discard(origin_bdev))
3413*4882a593Smuzhiyun 		reason = "discard unsupported";
3414*4882a593Smuzhiyun 
3415*4882a593Smuzhiyun 	else if (origin_limits->max_discard_sectors < cache->sectors_per_block)
3416*4882a593Smuzhiyun 		reason = "max discard sectors smaller than a block";
3417*4882a593Smuzhiyun 
3418*4882a593Smuzhiyun 	if (reason) {
3419*4882a593Smuzhiyun 		DMWARN("Origin device (%s) %s: Disabling discard passdown.",
3420*4882a593Smuzhiyun 		       bdevname(origin_bdev, buf), reason);
3421*4882a593Smuzhiyun 		cache->features.discard_passdown = false;
3422*4882a593Smuzhiyun 	}
3423*4882a593Smuzhiyun }
3424*4882a593Smuzhiyun 
3425*4882a593Smuzhiyun static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3426*4882a593Smuzhiyun {
3427*4882a593Smuzhiyun 	struct block_device *origin_bdev = cache->origin_dev->bdev;
3428*4882a593Smuzhiyun 	struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
3429*4882a593Smuzhiyun 
3430*4882a593Smuzhiyun 	if (!cache->features.discard_passdown) {
3431*4882a593Smuzhiyun 		/* No passdown is done, so set our own virtual limits. */
3432*4882a593Smuzhiyun 		limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3433*4882a593Smuzhiyun 						    cache->origin_sectors);
3434*4882a593Smuzhiyun 		limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3435*4882a593Smuzhiyun 		return;
3436*4882a593Smuzhiyun 	}
3437*4882a593Smuzhiyun 
3438*4882a593Smuzhiyun 	/*
3439*4882a593Smuzhiyun 	 * cache_iterate_devices() is stacking both origin and fast device limits
3440*4882a593Smuzhiyun 	 * but discards aren't passed to the fast device, so inherit origin's limits.
3441*4882a593Smuzhiyun 	 */
3442*4882a593Smuzhiyun 	limits->max_discard_sectors = origin_limits->max_discard_sectors;
3443*4882a593Smuzhiyun 	limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors;
3444*4882a593Smuzhiyun 	limits->discard_granularity = origin_limits->discard_granularity;
3445*4882a593Smuzhiyun 	limits->discard_alignment = origin_limits->discard_alignment;
3446*4882a593Smuzhiyun 	limits->discard_misaligned = origin_limits->discard_misaligned;
3447*4882a593Smuzhiyun }
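
/*
 * Example with illustrative (hypothetical) numbers: without passdown and a
 * 128-sector discard block, the virtual limits above come out as
 * max_discard_sectors = min(128 * 1024, origin_sectors) sectors and
 * discard_granularity = 128 << SECTOR_SHIFT = 64KiB.
 */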

static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct cache *cache = ti->private;
	uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;

	/*
	 * If the system-determined stacked limits are compatible with the
	 * cache's blocksize (io_opt is a multiple of the block size), do
	 * not override them.
	 */
	if (io_opt_sectors < cache->sectors_per_block ||
	    do_div(io_opt_sectors, cache->sectors_per_block)) {
		blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
		blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
	}

	disable_passdown_if_not_supported(cache);
	set_discard_limits(cache, limits);
}
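
/*
 * Example with illustrative (hypothetical) numbers: with 128-sector (64KiB)
 * cache blocks, a stacked io_opt of 1024 sectors (512KiB) is an exact
 * multiple and is left alone, while an io_opt of 96 sectors is smaller than
 * a block, so io_min and io_opt are both overridden to 64KiB above.
 */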

/*----------------------------------------------------------------*/

static struct target_type cache_target = {
	.name = "cache",
	.version = {2, 2, 0},
	.module = THIS_MODULE,
	.ctr = cache_ctr,
	.dtr = cache_dtr,
	.map = cache_map,
	.end_io = cache_end_io,
	.postsuspend = cache_postsuspend,
	.preresume = cache_preresume,
	.resume = cache_resume,
	.status = cache_status,
	.message = cache_message,
	.iterate_devices = cache_iterate_devices,
	.io_hints = cache_io_hints,
};
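
/*
 * Constructor usage sketch (argument layout as described in the dm-cache
 * documentation; the device paths and numbers below are placeholders, not
 * taken from this file):
 *
 *   cache <metadata dev> <cache dev> <origin dev> <block size>
 *         <#feature args> [<feature arg>]*
 *         <policy> <#policy args> [policy args]*
 *
 * e.g. via dmsetup:
 *
 *   dmsetup create cached --table "0 41943040 cache /dev/mapper/meta \
 *       /dev/mapper/ssd /dev/mapper/origin 512 1 writethrough default 0"
 */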

static int __init dm_cache_init(void)
{
	int r;

	migration_cache = KMEM_CACHE(dm_cache_migration, 0);
	if (!migration_cache)
		return -ENOMEM;

	r = dm_register_target(&cache_target);
	if (r) {
		DMERR("cache target registration failed: %d", r);
		kmem_cache_destroy(migration_cache);
		return r;
	}

	return 0;
}

static void __exit dm_cache_exit(void)
{
	dm_unregister_target(&cache_target);
	kmem_cache_destroy(migration_cache);
}

module_init(dm_cache_init);
module_exit(dm_cache_exit);

MODULE_DESCRIPTION(DM_NAME " cache target");
MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
MODULE_LICENSE("GPL");