// SPDX-License-Identifier: GPL-2.0
/*
 * Main bcache entry point - handle a read or a write request and decide what to
 * do with it; the make_request functions are called by the block layer.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"
#include "writeback.h"

#include <linux/module.h>
#include <linux/hash.h>
#include <linux/random.h>
#include <linux/backing-dev.h>

#include <trace/events/bcache.h>

#define CUTOFF_CACHE_ADD	95
#define CUTOFF_CACHE_READA	90

struct kmem_cache *bch_search_cache;

static void bch_data_insert_start(struct closure *cl);

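/* Cache mode (writethrough, writeback, writearound or none) from the backing device's superblock */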
static unsigned int cache_mode(struct cached_dev *dc)
{
	return BDEV_CACHE_MODE(&dc->sb);
}

static bool verify(struct cached_dev *dc)
{
	return dc->verify;
}

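/*
 * Checksum the data in @bio and stash the result in the slot after @k's last
 * pointer; the top bit is reserved, so only 63 bits of the crc are kept.
 */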
static void bio_csum(struct bio *bio, struct bkey *k)
{
	struct bio_vec bv;
	struct bvec_iter iter;
	uint64_t csum = 0;

	bio_for_each_segment(bv, bio, iter) {
		void *d = kmap(bv.bv_page) + bv.bv_offset;

		csum = bch_crc64_update(csum, d, bv.bv_len);
		kunmap(bv.bv_page);
	}

	k->ptr[KEY_PTRS(k)] = csum & (~0ULL >> 1);
}

/* Insert data into cache */

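/*
 * Journal the keys accumulated so far, then insert them into the btree. A
 * replace collision (-ESRCH) or a hard failure is recorded on the op; if
 * there's still data left to write, continue at bch_data_insert_start().
 */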
static void bch_data_insert_keys(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	atomic_t *journal_ref = NULL;
	struct bkey *replace_key = op->replace ? &op->replace_key : NULL;
	int ret;

	if (!op->replace)
		journal_ref = bch_journal(op->c, &op->insert_keys,
					  op->flush_journal ? cl : NULL);

	ret = bch_btree_insert(op->c, &op->insert_keys,
			       journal_ref, replace_key);
	if (ret == -ESRCH) {
		op->replace_collision = true;
	} else if (ret) {
		op->status = BLK_STS_RESOURCE;
		op->insert_data_done = true;
	}

	if (journal_ref)
		atomic_dec_bug(journal_ref);

	if (!op->insert_data_done) {
		continue_at(cl, bch_data_insert_start, op->wq);
		return;
	}

	bch_keylist_free(&op->insert_keys);
	closure_return(cl);
}

static int bch_keylist_realloc(struct keylist *l, unsigned int u64s,
			       struct cache_set *c)
{
	size_t oldsize = bch_keylist_nkeys(l);
	size_t newsize = oldsize + u64s;

	/*
	 * The journalling code doesn't handle the case where the keys to
	 * insert are bigger than an empty journal write: if we just return
	 * -ENOMEM here, bch_data_insert_keys() will insert the keys created
	 * so far and finish the rest when the keylist is empty.
	 */
	if (newsize * sizeof(uint64_t) > block_bytes(c->cache) - sizeof(struct jset))
		return -ENOMEM;

	return __bch_keylist_realloc(l, u64s);
}

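/*
 * Bypass path: instead of writing data to the cache, emit zero-pointer keys
 * that invalidate the region of the cache covered by op->bio, chunked to the
 * maximum key size.
 */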
static void bch_data_invalidate(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	struct bio *bio = op->bio;

	pr_debug("invalidating %i sectors from %llu\n",
		 bio_sectors(bio), (uint64_t) bio->bi_iter.bi_sector);

	while (bio_sectors(bio)) {
		unsigned int sectors = min(bio_sectors(bio),
					   1U << (KEY_SIZE_BITS - 1));

		if (bch_keylist_realloc(&op->insert_keys, 2, op->c))
			goto out;

		bio->bi_iter.bi_sector += sectors;
		bio->bi_iter.bi_size -= sectors << 9;

		bch_keylist_add(&op->insert_keys,
				&KEY(op->inode,
				     bio->bi_iter.bi_sector,
				     sectors));
	}

	op->insert_data_done = true;
	/* Drop the ref taken by bio_get() in bch_data_insert() */
	bio_put(bio);
out:
	continue_at(cl, bch_data_insert_keys, op->wq);
}

static void bch_data_insert_error(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	/*
	 * Our data write just errored, which means we've got a bunch of keys to
	 * insert that point to data that wasn't successfully written.
	 *
	 * We don't have to insert those keys but we still have to invalidate
	 * that region of the cache - so, if we just strip off all the pointers
	 * from the keys we'll accomplish just that.
	 */

	struct bkey *src = op->insert_keys.keys, *dst = op->insert_keys.keys;

	while (src != op->insert_keys.top) {
		struct bkey *n = bkey_next(src);

		SET_KEY_PTRS(src, 0);
		memmove(dst, src, bkey_bytes(src));

		dst = bkey_next(dst);
		src = n;
	}

	op->insert_keys.top = dst;

	bch_data_insert_keys(cl);
}

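/*
 * Completion for a data write to the cache: on error, either record the
 * status (writeback write), redirect the closure to bch_data_insert_error()
 * to strip the pointers (normal write), or just bail (replace/cache miss).
 */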
static void bch_data_insert_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	if (bio->bi_status) {
		/* TODO: We could try to recover from this. */
		if (op->writeback)
			op->status = bio->bi_status;
		else if (!op->replace)
			set_closure_fn(cl, bch_data_insert_error, op->wq);
		else
			set_closure_fn(cl, NULL, NULL);
	}

	bch_bbio_endio(op->c, bio, bio->bi_status, "writing data to cache");
}

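/*
 * Allocate cache space for op->bio and submit the write(s), splitting the
 * bio to fit whatever contiguous space bch_alloc_sectors() hands back and
 * building one key per fragment.
 */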
static void bch_data_insert_start(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);
	struct bio *bio = op->bio, *n;

	if (op->bypass)
		return bch_data_invalidate(cl);

	if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0)
		wake_up_gc(op->c);

	/*
	 * Journal writes are marked REQ_PREFLUSH; if the original write was a
	 * flush, it'll wait on the journal write.
	 */
	bio->bi_opf &= ~(REQ_PREFLUSH|REQ_FUA);

	do {
		unsigned int i;
		struct bkey *k;
		struct bio_set *split = &op->c->bio_split;

		/* 1 for the device pointer and 1 for the checksum */
		if (bch_keylist_realloc(&op->insert_keys,
					3 + (op->csum ? 1 : 0),
					op->c)) {
			continue_at(cl, bch_data_insert_keys, op->wq);
			return;
		}

		k = op->insert_keys.top;
		bkey_init(k);
		SET_KEY_INODE(k, op->inode);
		SET_KEY_OFFSET(k, bio->bi_iter.bi_sector);

		if (!bch_alloc_sectors(op->c, k, bio_sectors(bio),
				       op->write_point, op->write_prio,
				       op->writeback))
			goto err;

		n = bio_next_split(bio, KEY_SIZE(k), GFP_NOIO, split);

		n->bi_end_io = bch_data_insert_endio;
		n->bi_private = cl;

		if (op->writeback) {
			SET_KEY_DIRTY(k, true);

			for (i = 0; i < KEY_PTRS(k); i++)
				SET_GC_MARK(PTR_BUCKET(op->c, k, i),
					    GC_MARK_DIRTY);
		}

		SET_KEY_CSUM(k, op->csum);
		if (KEY_CSUM(k))
			bio_csum(n, k);

		trace_bcache_cache_insert(k);
		bch_keylist_push(&op->insert_keys);

		bio_set_op_attrs(n, REQ_OP_WRITE, 0);
		bch_submit_bbio(n, op->c, k, 0);
	} while (n != bio);

	op->insert_data_done = true;
	continue_at(cl, bch_data_insert_keys, op->wq);
	return;
err:
	/* bch_alloc_sectors() blocks if op->writeback is true */
	BUG_ON(op->writeback);

	/*
	 * But if it's not a writeback write we'd rather just bail out if
	 * there aren't any buckets ready to write to - it might take a while
	 * and we might be starving btree writes for gc or something.
	 */

	if (!op->replace) {
		/*
		 * Writethrough write: We can't complete the write until we've
		 * updated the index. But we don't want to delay the write while
		 * we wait for buckets to be freed up, so just invalidate the
		 * rest of the write.
		 */
		op->bypass = true;
		return bch_data_invalidate(cl);
	} else {
		/*
		 * From a cache miss, we can just insert the keys for the data
		 * we have written or bail out if we didn't do anything.
		 */
		op->insert_data_done = true;
		bio_put(bio);

		if (!bch_keylist_empty(&op->insert_keys))
			continue_at(cl, bch_data_insert_keys, op->wq);
		else
			closure_return(cl);
	}
}

/**
 * bch_data_insert - stick some data in the cache
 * @cl: closure pointer.
 *
 * This is the starting point for any data to end up in a cache device; it
 * could be from a normal write, or a writeback write, or a write to a
 * flash-only volume - it's also used by the moving garbage collector to
 * compact data in mostly empty buckets.
 *
 * It first writes the data to the cache, creating a list of keys to be
 * inserted (if the data had to be fragmented there will be multiple keys);
 * after the data is written it calls bch_journal, and after the keys have
 * been added to the next journal write they're inserted into the btree.
 *
 * It inserts the data in op->bio; bi_sector is used for the key offset,
 * and op->inode is used for the key inode.
 *
 * If op->bypass is true, instead of inserting the data it invalidates the
 * region of the cache represented by op->bio and op->inode.
 */
void bch_data_insert(struct closure *cl)
{
	struct data_insert_op *op = container_of(cl, struct data_insert_op, cl);

	trace_bcache_write(op->c, op->inode, op->bio,
			   op->writeback, op->bypass);

	bch_keylist_init(&op->insert_keys);
	bio_get(op->bio);
	bch_data_insert_start(cl);
}

/*
 * Congested? Return 0 (not congested) or the limit (in sectors)
 * beyond which we should bypass the cache due to congestion.
 */
unsigned int bch_get_congested(const struct cache_set *c)
{
	int i;

	if (!c->congested_read_threshold_us &&
	    !c->congested_write_threshold_us)
		return 0;

	i = (local_clock_us() - c->congested_last_us) / 1024;
	if (i < 0)
		return 0;

	i += atomic_read(&c->congested);
	if (i >= 0)
		return 0;

	i += CONGESTED_MAX;

	if (i > 0)
		i = fract_exp_two(i, 6);

	i -= hweight32(get_random_u32());

	return i > 0 ? i : 1;
}

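/*
 * Fold the task's just-finished sequential I/O size into its exponentially
 * weighted moving average, then reset the running counter.
 */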
static void add_sequential(struct task_struct *t)
{
	ewma_add(t->sequential_io_avg,
		 t->sequential_io, 8, 0);

	t->sequential_io = 0;
}

static struct hlist_head *iohash(struct cached_dev *dc, uint64_t k)
{
	return &dc->io_hash[hash_64(k, RECENT_IO_BITS)];
}

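/*
 * Decide whether a bio should bypass the cache entirely: returns true for
 * discards, unaligned I/O, cache-full or detaching conditions, sequential
 * I/O past the cutoff, and congestion; returns false if the bio should be
 * considered for caching.
 */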
static bool check_should_bypass(struct cached_dev *dc, struct bio *bio)
{
	struct cache_set *c = dc->disk.c;
	unsigned int mode = cache_mode(dc);
	unsigned int sectors, congested;
	struct task_struct *task = current;
	struct io *i;

	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    c->gc_stats.in_use > CUTOFF_CACHE_ADD ||
	    (bio_op(bio) == REQ_OP_DISCARD))
		goto skip;

	if (mode == CACHE_MODE_NONE ||
	    (mode == CACHE_MODE_WRITEAROUND &&
	     op_is_write(bio_op(bio))))
		goto skip;

	/*
	 * Whether a read-ahead or background IO bypasses the cache depends on
	 * the following:
	 * - If the IO is for metadata, always cache it, no bypass.
	 * - Otherwise, check dc->cache_readahead_policy:
	 *   BCH_CACHE_READA_ALL: cache it, no bypass
	 *   BCH_CACHE_READA_META_ONLY: don't cache it, bypass
	 * That is, a read-ahead request for metadata always gets cached
	 * (eg, for gfs2 or xfs).
	 */
	if ((bio->bi_opf & (REQ_RAHEAD|REQ_BACKGROUND))) {
		if (!(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
		    (dc->cache_readahead_policy != BCH_CACHE_READA_ALL))
			goto skip;
	}

	if (bio->bi_iter.bi_sector & (c->cache->sb.block_size - 1) ||
	    bio_sectors(bio) & (c->cache->sb.block_size - 1)) {
		pr_debug("skipping unaligned io\n");
		goto skip;
	}

	if (bypass_torture_test(dc)) {
		if ((get_random_int() & 3) == 3)
			goto skip;
		else
			goto rescale;
	}

	congested = bch_get_congested(c);
	if (!congested && !dc->sequential_cutoff)
		goto rescale;

	spin_lock(&dc->io_lock);

	hlist_for_each_entry(i, iohash(dc, bio->bi_iter.bi_sector), hash)
		if (i->last == bio->bi_iter.bi_sector &&
		    time_before(jiffies, i->jiffies))
			goto found;

	i = list_first_entry(&dc->io_lru, struct io, lru);

	add_sequential(task);
	i->sequential = 0;
found:
	if (i->sequential + bio->bi_iter.bi_size > i->sequential)
		i->sequential += bio->bi_iter.bi_size;

	i->last = bio_end_sector(bio);
	i->jiffies = jiffies + msecs_to_jiffies(5000);
	task->sequential_io = i->sequential;

	hlist_del(&i->hash);
	hlist_add_head(&i->hash, iohash(dc, i->last));
	list_move_tail(&i->lru, &dc->io_lru);

	spin_unlock(&dc->io_lock);

	sectors = max(task->sequential_io,
		      task->sequential_io_avg) >> 9;

	if (dc->sequential_cutoff &&
	    sectors >= dc->sequential_cutoff >> 9) {
		trace_bcache_bypass_sequential(bio);
		goto skip;
	}

	if (congested && sectors >= congested) {
		trace_bcache_bypass_congested(bio);
		goto skip;
	}

rescale:
	bch_rescale_priorities(c, bio_sectors(bio));
	return false;
skip:
	bch_mark_sectors_bypassed(c, dc, bio_sectors(bio));
	return true;
}

/* Cache lookup */

struct search {
	/* Stack frame for bio_complete */
	struct closure		cl;

	struct bbio		bio;
	struct bio		*orig_bio;
	struct bio		*cache_miss;
	struct bcache_device	*d;

	unsigned int		insert_bio_sectors;
	unsigned int		recoverable:1;
	unsigned int		write:1;
	unsigned int		read_dirty_data:1;
	unsigned int		cache_missed:1;

	struct hd_struct	*part;
	unsigned long		start_time;

	struct btree_op		op;
	struct data_insert_op	iop;
};

static void bch_cache_read_endio(struct bio *bio)
{
	struct bbio *b = container_of(bio, struct bbio, bio);
	struct closure *cl = bio->bi_private;
	struct search *s = container_of(cl, struct search, cl);

	/*
	 * If the bucket was reused while our bio was in flight, we might have
	 * read the wrong data. Set s->iop.status but not bio->bi_status, so
	 * the error doesn't get counted against the cache device; we'll still
	 * reread the data from the backing device.
	 */

	if (bio->bi_status)
		s->iop.status = bio->bi_status;
	else if (!KEY_DIRTY(&b->key) &&
		 ptr_stale(s->iop.c, &b->key, 0)) {
		atomic_long_inc(&s->iop.c->cache_read_races);
		s->iop.status = BLK_STS_IOERR;
	}

	bch_bbio_endio(s->iop.c, bio, bio->bi_status, "reading from cache");
}

/*
 * Read from a single key, handling the initial cache miss if the key starts in
 * the middle of the bio
 */
static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *n, *bio = &s->bio.bio;
	struct bkey *bio_key;
	unsigned int ptr;

	if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0)
		return MAP_CONTINUE;

	if (KEY_INODE(k) != s->iop.inode ||
	    KEY_START(k) > bio->bi_iter.bi_sector) {
		unsigned int bio_sectors = bio_sectors(bio);
		unsigned int sectors = KEY_INODE(k) == s->iop.inode
			? min_t(uint64_t, INT_MAX,
				KEY_START(k) - bio->bi_iter.bi_sector)
			: INT_MAX;
		int ret = s->d->cache_miss(b, s, bio, sectors);

		if (ret != MAP_CONTINUE)
			return ret;

		/* if this was a complete miss we shouldn't get here */
		BUG_ON(bio_sectors <= sectors);
	}

	if (!KEY_SIZE(k))
		return MAP_CONTINUE;

	/* XXX: figure out best pointer - for multiple cache devices */
	ptr = 0;

	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;

	if (KEY_DIRTY(k))
		s->read_dirty_data = true;

	n = bio_next_split(bio, min_t(uint64_t, INT_MAX,
				      KEY_OFFSET(k) - bio->bi_iter.bi_sector),
			   GFP_NOIO, &s->d->bio_split);

	bio_key = &container_of(n, struct bbio, bio)->key;
	bch_bkey_copy_single_ptr(bio_key, k, ptr);

	bch_cut_front(&KEY(s->iop.inode, n->bi_iter.bi_sector, 0), bio_key);
	bch_cut_back(&KEY(s->iop.inode, bio_end_sector(n), 0), bio_key);

	n->bi_end_io = bch_cache_read_endio;
	n->bi_private = &s->cl;

	/*
	 * The bucket we're reading from might be reused while our bio
	 * is in flight, and we could then end up reading the wrong
	 * data.
	 *
	 * We guard against this by checking (in bch_cache_read_endio()) if
	 * the pointer is stale again; if so, we treat it as an error
	 * and reread from the backing device (but we don't pass that
	 * error up anywhere).
	 */

	__bch_submit_bbio(n, b->c);
	return n == bio ? MAP_DONE : MAP_CONTINUE;
}

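/*
 * Walk the btree keys covering the request, dispatching cache hits and
 * misses via cache_lookup_fn(); reschedules itself on -EAGAIN. On a btree
 * error the search is marked unrecoverable if the backing device has dirty
 * data, since we can't tell whether the covered keys were all clean.
 */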
static void cache_lookup(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, iop.cl);
	struct bio *bio = &s->bio.bio;
	struct cached_dev *dc;
	int ret;

	bch_btree_op_init(&s->op, -1);

	ret = bch_btree_map_keys(&s->op, s->iop.c,
				 &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0),
				 cache_lookup_fn, MAP_END_KEY);
	if (ret == -EAGAIN) {
		continue_at(cl, cache_lookup, bcache_wq);
		return;
	}

	/*
	 * We might get an error while searching the btree; if that happens,
	 * ret is negative. In this scenario we should not recover data from
	 * the backing device (when the cache device is dirty), because we
	 * don't know whether the bkeys the read request covered are all
	 * clean.
	 *
	 * And if that happened, s->iop.status is still its initial value
	 * from before we submitted s->bio.bio.
	 */
	if (ret < 0) {
		BUG_ON(ret == -EINTR);
		if (s->d && s->d->c &&
		    !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) {
			dc = container_of(s->d, struct cached_dev, disk);
			if (dc && atomic_read(&dc->has_dirty))
				s->recoverable = false;
		}
		if (!s->iop.status)
			s->iop.status = BLK_STS_IOERR;
	}

	closure_return(cl);
}

/* Common code for the make_request functions */

static void request_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;

	if (bio->bi_status) {
		struct search *s = container_of(cl, struct search, cl);

		s->iop.status = bio->bi_status;
		/* Only cache read errors are recoverable */
		s->recoverable = false;
	}

	bio_put(bio);
	closure_put(cl);
}

static void backing_request_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;

	if (bio->bi_status) {
		struct search *s = container_of(cl, struct search, cl);
		struct cached_dev *dc = container_of(s->d,
						     struct cached_dev, disk);
		/*
		 * If a bio has REQ_PREFLUSH for writeback mode, it is
		 * specially assembled in cached_dev_write() for a non-zero
		 * write request which has REQ_PREFLUSH. We don't set
		 * s->iop.status for this failure; the status will be decided
		 * by the result of the bch_data_insert() operation.
		 */
		if (unlikely(s->iop.writeback &&
			     bio->bi_opf & REQ_PREFLUSH)) {
			pr_err("Can't flush %s: returned bi_status %i\n",
			       dc->backing_dev_name, bio->bi_status);
		} else {
			/* set to orig_bio->bi_status in bio_complete() */
			s->iop.status = bio->bi_status;
		}
		s->recoverable = false;
		/* should count I/O error for backing device here */
		bch_count_backing_io_errors(dc, bio);
	}

	bio_put(bio);
	closure_put(cl);
}

static void bio_complete(struct search *s)
{
	if (s->orig_bio) {
		/* Count on bcache device */
		part_end_io_acct(s->part, s->orig_bio, s->start_time);

		trace_bcache_request_end(s->d, s->orig_bio);
		s->orig_bio->bi_status = s->iop.status;
		bio_endio(s->orig_bio);
		s->orig_bio = NULL;
	}
}

static void do_bio_hook(struct search *s,
			struct bio *orig_bio,
			bio_end_io_t *end_io_fn)
{
	struct bio *bio = &s->bio.bio;

	bio_init(bio, NULL, 0);
	__bio_clone_fast(bio, orig_bio);
	/*
	 * bi_end_io can be set separately somewhere else, e.g. the
	 * variants in,
	 * - cache_bio->bi_end_io from cached_dev_cache_miss()
	 * - n->bi_end_io from cache_lookup_fn()
	 */
	bio->bi_end_io = end_io_fn;
	bio->bi_private = &s->cl;

	bio_cnt_set(bio, 3);
}

static void search_free(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);

	atomic_dec(&s->iop.c->search_inflight);

	if (s->iop.bio)
		bio_put(s->iop.bio);

	bio_complete(s);
	closure_debug_destroy(cl);
	mempool_free(s, &s->iop.c->search);
}

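/*
 * Allocate and initialize a struct search for a bio submitted to @d: clone
 * the bio, start I/O accounting, and seed the embedded data_insert_op with
 * the device's cache set, inode, and a per-task write point.
 */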
static inline struct search *search_alloc(struct bio *bio,
					  struct bcache_device *d)
{
	struct search *s;

	s = mempool_alloc(&d->c->search, GFP_NOIO);

	closure_init(&s->cl, NULL);
	do_bio_hook(s, bio, request_endio);
	atomic_inc(&d->c->search_inflight);

	s->orig_bio		= bio;
	s->cache_miss		= NULL;
	s->cache_missed		= 0;
	s->d			= d;
	s->recoverable		= 1;
	s->write		= op_is_write(bio_op(bio));
	s->read_dirty_data	= 0;
	/* Count on the bcache device */
	s->start_time		= part_start_io_acct(d->disk, &s->part, bio);
	s->iop.c		= d->c;
	s->iop.bio		= NULL;
	s->iop.inode		= d->id;
	s->iop.write_point	= hash_long((unsigned long) current, 16);
	s->iop.write_prio	= 0;
	s->iop.status		= 0;
	s->iop.flags		= 0;
	s->iop.flush_journal	= op_is_flush(bio->bi_opf);
	s->iop.wq		= bcache_wq;

	return s;
}

/* Cached devices */

static void cached_dev_bio_complete(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	cached_dev_put(dc);
	search_free(cl);
}

/* Process reads */

static void cached_dev_read_error_done(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);

	if (s->iop.replace_collision)
		bch_mark_cache_miss_collision(s->iop.c, s->d);

	if (s->iop.bio)
		bio_free_pages(s->iop.bio);

	cached_dev_bio_complete(cl);
}

static void cached_dev_read_error(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct bio *bio = &s->bio.bio;

	/*
	 * If the read request hit dirty data (s->read_dirty_data is true),
	 * then recovering a failed read from the backing device may return
	 * stale data. So read failure recovery is only permitted when the
	 * read request hit clean data in the cache device, or when a cache
	 * read race happened.
	 */
	if (s->recoverable && !s->read_dirty_data) {
		/* Retry from the backing device: */
		trace_bcache_read_retry(s->orig_bio);

		s->iop.status = 0;
		do_bio_hook(s, s->orig_bio, backing_request_endio);

		/* XXX: invalidate cache */

		/* I/O request sent to backing device */
		closure_bio_submit(s->iop.c, bio, cl);
	}

	continue_at(cl, cached_dev_read_error_done, NULL);
}

static void cached_dev_cache_miss_done(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct bcache_device *d = s->d;

	if (s->iop.replace_collision)
		bch_mark_cache_miss_collision(s->iop.c, s->d);

	if (s->iop.bio)
		bio_free_pages(s->iop.bio);

	cached_dev_bio_complete(cl);
	closure_put(&d->cl);
}

static void cached_dev_read_done(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	/*
	 * We had a cache miss; cache_bio now contains data ready to be inserted
	 * into the cache.
	 *
	 * First, we copy the data we just read from cache_bio's bounce buffers
	 * to the buffers the original bio pointed to:
	 */

	if (s->iop.bio) {
		bio_reset(s->iop.bio);
		s->iop.bio->bi_iter.bi_sector =
			s->cache_miss->bi_iter.bi_sector;
		bio_copy_dev(s->iop.bio, s->cache_miss);
		s->iop.bio->bi_iter.bi_size = s->insert_bio_sectors << 9;
		bch_bio_map(s->iop.bio, NULL);

		bio_copy_data(s->cache_miss, s->iop.bio);

		bio_put(s->cache_miss);
		s->cache_miss = NULL;
	}

	if (verify(dc) && s->recoverable && !s->read_dirty_data)
		bch_data_verify(dc, s->orig_bio);

	closure_get(&dc->disk.cl);
	bio_complete(s);

	if (s->iop.bio &&
	    !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) {
		BUG_ON(!s->iop.replace);
		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	}

	continue_at(cl, cached_dev_cache_miss_done, NULL);
}

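/*
 * Bottom half of a cached device read: update cache hit/miss accounting,
 * then route to the error path, the cache-insert path, or straight to
 * completion depending on how the lookup went.
 */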
static void cached_dev_read_done_bh(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	bch_mark_cache_accounting(s->iop.c, s->d,
				  !s->cache_missed, s->iop.bypass);
	trace_bcache_read(s->orig_bio, !s->cache_missed, s->iop.bypass);

	if (s->iop.status)
		continue_at_nobarrier(cl, cached_dev_read_error, bcache_wq);
	else if (s->iop.bio || verify(dc))
		continue_at_nobarrier(cl, cached_dev_read_done, bcache_wq);
	else
		continue_at_nobarrier(cl, cached_dev_bio_complete, NULL);
}

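/*
 * Handle the portion of a read that missed the cache: optionally extend the
 * miss with readahead, reserve the range in the btree with a check key, and
 * read from the backing device into a bounce bio (cache_bio) that will later
 * be inserted into the cache.
 */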
static int cached_dev_cache_miss(struct btree *b, struct search *s,
				 struct bio *bio, unsigned int sectors)
{
	int ret = MAP_CONTINUE;
	unsigned int reada = 0;
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);
	struct bio *miss, *cache_bio;

	s->cache_missed = 1;

	if (s->cache_miss || s->iop.bypass) {
		miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);
		ret = miss == bio ? MAP_DONE : MAP_CONTINUE;
		goto out_submit;
	}

	if (!(bio->bi_opf & REQ_RAHEAD) &&
	    !(bio->bi_opf & (REQ_META|REQ_PRIO)) &&
	    s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA)
		reada = min_t(sector_t, dc->readahead >> 9,
			      get_capacity(bio->bi_disk) - bio_end_sector(bio));

	s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada);

	s->iop.replace_key = KEY(s->iop.inode,
				 bio->bi_iter.bi_sector + s->insert_bio_sectors,
				 s->insert_bio_sectors);

	ret = bch_btree_insert_check_key(b, &s->op, &s->iop.replace_key);
	if (ret)
		return ret;

	s->iop.replace = true;

	miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split);

	/* btree_search_recurse()'s btree iterator is no good anymore */
	ret = miss == bio ? MAP_DONE : -EINTR;

	cache_bio = bio_alloc_bioset(GFP_NOWAIT,
			DIV_ROUND_UP(s->insert_bio_sectors, PAGE_SECTORS),
			&dc->disk.bio_split);
	if (!cache_bio)
		goto out_submit;

	cache_bio->bi_iter.bi_sector = miss->bi_iter.bi_sector;
	bio_copy_dev(cache_bio, miss);
	cache_bio->bi_iter.bi_size = s->insert_bio_sectors << 9;

	cache_bio->bi_end_io = backing_request_endio;
	cache_bio->bi_private = &s->cl;

	bch_bio_map(cache_bio, NULL);
	if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
		goto out_put;

	if (reada)
		bch_mark_cache_readahead(s->iop.c, s->d);

	s->cache_miss = miss;
	s->iop.bio = cache_bio;
	bio_get(cache_bio);
	/* I/O request sent to backing device */
	closure_bio_submit(s->iop.c, cache_bio, &s->cl);

	return ret;
out_put:
	bio_put(cache_bio);
out_submit:
	miss->bi_end_io = backing_request_endio;
	miss->bi_private = &s->cl;
	/* I/O request sent to backing device */
	closure_bio_submit(s->iop.c, miss, &s->cl);
	return ret;
}

static void cached_dev_read(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;

	closure_call(&s->iop.cl, cache_lookup, NULL, cl);
	continue_at(cl, cached_dev_read_done_bh, NULL);
}

/* Process writes */

static void cached_dev_write_complete(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct cached_dev *dc = container_of(s->d, struct cached_dev, disk);

	up_read_non_owner(&dc->writeback_lock);
	cached_dev_bio_complete(cl);
}

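/*
 * Route a write according to cache mode and overlap with in-flight
 * writeback: bypassed writes go straight to the backing device, writeback
 * writes go only to the cache (plus a separate flush bio if needed), and
 * writethrough writes are cloned so they hit both devices.
 */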
static void cached_dev_write(struct cached_dev *dc, struct search *s)
{
	struct closure *cl = &s->cl;
	struct bio *bio = &s->bio.bio;
	struct bkey start = KEY(dc->disk.id, bio->bi_iter.bi_sector, 0);
	struct bkey end = KEY(dc->disk.id, bio_end_sector(bio), 0);

	bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end);

	down_read_non_owner(&dc->writeback_lock);
	if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) {
		/*
		 * We overlap with some dirty data undergoing background
		 * writeback, force this write to writeback
		 */
		s->iop.bypass = false;
		s->iop.writeback = true;
	}

	/*
	 * Discards aren't _required_ to do anything, so skipping if
	 * check_overlapping returned true is ok
	 *
	 * But check_overlapping drops dirty keys for which io hasn't started,
	 * so we still want to call it.
	 */
	if (bio_op(bio) == REQ_OP_DISCARD)
		s->iop.bypass = true;

	if (should_writeback(dc, s->orig_bio,
			     cache_mode(dc),
			     s->iop.bypass)) {
		s->iop.bypass = false;
		s->iop.writeback = true;
	}

	if (s->iop.bypass) {
		s->iop.bio = s->orig_bio;
		bio_get(s->iop.bio);

		if (bio_op(bio) == REQ_OP_DISCARD &&
		    !blk_queue_discard(bdev_get_queue(dc->bdev)))
			goto insert_data;

		/* I/O request sent to backing device */
		bio->bi_end_io = backing_request_endio;
		closure_bio_submit(s->iop.c, bio, cl);

	} else if (s->iop.writeback) {
		bch_writeback_add(dc);
		s->iop.bio = bio;

		if (bio->bi_opf & REQ_PREFLUSH) {
			/*
			 * Also need to send a flush to the backing
			 * device.
			 */
			struct bio *flush;

			flush = bio_alloc_bioset(GFP_NOIO, 0,
						 &dc->disk.bio_split);
			if (!flush) {
				s->iop.status = BLK_STS_RESOURCE;
				goto insert_data;
			}
			bio_copy_dev(flush, bio);
			flush->bi_end_io = backing_request_endio;
			flush->bi_private = cl;
			flush->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
			/* I/O request sent to backing device */
			closure_bio_submit(s->iop.c, flush, cl);
		}
	} else {
		s->iop.bio = bio_clone_fast(bio, GFP_NOIO, &dc->disk.bio_split);
		/* I/O request sent to backing device */
		bio->bi_end_io = backing_request_endio;
		closure_bio_submit(s->iop.c, bio, cl);
	}

insert_data:
	closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	continue_at(cl, cached_dev_write_complete, NULL);
}

static void cached_dev_nodata(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);
	struct bio *bio = &s->bio.bio;

	if (s->iop.flush_journal)
		bch_journal_meta(s->iop.c, cl);

	/* If it's a flush, we send the flush to the backing device too */
	bio->bi_end_io = backing_request_endio;
	closure_bio_submit(s->iop.c, bio, cl);

	continue_at(cl, cached_dev_bio_complete, NULL);
}

struct detached_dev_io_private {
	struct bcache_device	*d;
	unsigned long		start_time;
	bio_end_io_t		*bi_end_io;
	void			*bi_private;
	struct hd_struct	*part;
};

static void detached_dev_end_io(struct bio *bio)
{
	struct detached_dev_io_private *ddip;

	ddip = bio->bi_private;
	bio->bi_end_io = ddip->bi_end_io;
	bio->bi_private = ddip->bi_private;

	/* Count on the bcache device */
	part_end_io_acct(ddip->part, bio, ddip->start_time);

	if (bio->bi_status) {
		struct cached_dev *dc = container_of(ddip->d,
						     struct cached_dev, disk);
		/* should count I/O error for backing device here */
		bch_count_backing_io_errors(dc, bio);
	}

	kfree(ddip);
	bio->bi_end_io(bio);
}

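/*
 * Pass a bio for a bcache device running detached from its cache set
 * straight through to the backing device, restoring the original endio from
 * a small private wrapper so I/O accounting and error counting still happen.
 */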
static void detached_dev_do_request(struct bcache_device *d, struct bio *bio)
{
	struct detached_dev_io_private *ddip;
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);

	/*
	 * No need to call closure_get(&dc->disk.cl) here, because the upper
	 * layer has already opened the bcache device, which called
	 * closure_get(&dc->disk.cl).
	 */
	ddip = kzalloc(sizeof(struct detached_dev_io_private), GFP_NOIO);
	if (!ddip) {
		bio->bi_status = BLK_STS_RESOURCE;
		bio->bi_end_io(bio);
		return;
	}

	ddip->d = d;
	/* Count on the bcache device */
	ddip->start_time = part_start_io_acct(d->disk, &ddip->part, bio);
	ddip->bi_end_io = bio->bi_end_io;
	ddip->bi_private = bio->bi_private;
	bio->bi_end_io = detached_dev_end_io;
	bio->bi_private = ddip;

	if ((bio_op(bio) == REQ_OP_DISCARD) &&
	    !blk_queue_discard(bdev_get_queue(dc->bdev)))
		bio->bi_end_io(bio);
	else
		submit_bio_noacct(bio);
}

static void quit_max_writeback_rate(struct cache_set *c,
				    struct cached_dev *this_dc)
{
	int i;
	struct bcache_device *d;
	struct cached_dev *dc;

	/*
	 * The mutex bch_register_lock may be contended by other parallel
	 * requesters, or by attach/detach operations on other backing
	 * devices. Waiting for the mutex lock may increase I/O request
	 * latency by seconds or more. To avoid such a situation, if
	 * mutex_trylock() fails, only the writeback rate of the current
	 * cached device is set to 1, and __update_writeback_rate() will
	 * decide the writeback rate of the other cached devices (remember
	 * that c->idle_counter is 0 already).
	 */
	if (mutex_trylock(&bch_register_lock)) {
		for (i = 0; i < c->devices_max_used; i++) {
			if (!c->devices[i])
				continue;

			if (UUID_FLASH_ONLY(&c->uuids[i]))
				continue;

			d = c->devices[i];
			dc = container_of(d, struct cached_dev, disk);
			/*
			 * Set the writeback rate to the default minimum
			 * value, then let update_writeback_rate() decide
			 * the upcoming rate.
			 */
			atomic_long_set(&dc->writeback_rate.rate, 1);
		}
		mutex_unlock(&bch_register_lock);
	} else
		atomic_long_set(&this_dc->writeback_rate.rate, 1);
}

/* Cached devices - read & write stuff */

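/*
 * Entry point for bios submitted to a cached device: fail fast if I/O is
 * disabled, kick the cache set out of max-writeback-rate mode on new I/O,
 * remap the bio past the backing superblock, and dispatch to the read,
 * write, nodata, or detached path.
 */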
cached_dev_submit_bio(struct bio * bio)1173*4882a593Smuzhiyun blk_qc_t cached_dev_submit_bio(struct bio *bio)
1174*4882a593Smuzhiyun {
1175*4882a593Smuzhiyun struct search *s;
1176*4882a593Smuzhiyun struct bcache_device *d = bio->bi_disk->private_data;
1177*4882a593Smuzhiyun struct cached_dev *dc = container_of(d, struct cached_dev, disk);
1178*4882a593Smuzhiyun int rw = bio_data_dir(bio);
1179*4882a593Smuzhiyun
1180*4882a593Smuzhiyun if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) ||
1181*4882a593Smuzhiyun dc->io_disable)) {
1182*4882a593Smuzhiyun bio->bi_status = BLK_STS_IOERR;
1183*4882a593Smuzhiyun bio_endio(bio);
1184*4882a593Smuzhiyun return BLK_QC_T_NONE;
1185*4882a593Smuzhiyun }
1186*4882a593Smuzhiyun
1187*4882a593Smuzhiyun if (likely(d->c)) {
1188*4882a593Smuzhiyun if (atomic_read(&d->c->idle_counter))
1189*4882a593Smuzhiyun atomic_set(&d->c->idle_counter, 0);
1190*4882a593Smuzhiyun /*
1191*4882a593Smuzhiyun * If at_max_writeback_rate of cache set is true and new I/O
1192*4882a593Smuzhiyun * comes, quit max writeback rate of all cached devices
1193*4882a593Smuzhiyun * attached to this cache set, and set at_max_writeback_rate
1194*4882a593Smuzhiyun * to false.
1195*4882a593Smuzhiyun */
1196*4882a593Smuzhiyun if (unlikely(atomic_read(&d->c->at_max_writeback_rate) == 1)) {
1197*4882a593Smuzhiyun atomic_set(&d->c->at_max_writeback_rate, 0);
1198*4882a593Smuzhiyun quit_max_writeback_rate(d->c, dc);
1199*4882a593Smuzhiyun }
1200*4882a593Smuzhiyun }
1201*4882a593Smuzhiyun
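	/*
	 * Remap the bio to the backing device: dc->sb.data_offset skips
	 * the bcache superblock stored at the start of the backing device.
	 */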
	bio_set_dev(bio, dc->bdev);
	bio->bi_iter.bi_sector += dc->sb.data_offset;

	if (cached_dev_get(dc)) {
		s = search_alloc(bio, d);
		trace_bcache_request_start(s->d, bio);

		if (!bio->bi_iter.bi_size) {
			/*
			 * We can't call bch_journal_meta() from under
			 * submit_bio_noacct(), so punt to a workqueue.
			 */
			continue_at_nobarrier(&s->cl,
					      cached_dev_nodata,
					      bcache_wq);
		} else {
			s->iop.bypass = check_should_bypass(dc, bio);

			if (rw)
				cached_dev_write(dc, s);
			else
				cached_dev_read(dc, s);
		}
	} else
		/* I/O request sent directly to the backing device */
		detached_dev_do_request(d, bio);

	return BLK_QC_T_NONE;
}

static int cached_dev_ioctl(struct bcache_device *d, fmode_t mode,
			    unsigned int cmd, unsigned long arg)
{
	struct cached_dev *dc = container_of(d, struct cached_dev, disk);

	if (dc->io_disable)
		return -EIO;

	return __blkdev_driver_ioctl(dc->bdev, mode, cmd, arg);
}

void bch_cached_dev_request_init(struct cached_dev *dc)
{
	dc->disk.cache_miss = cached_dev_cache_miss;
	dc->disk.ioctl = cached_dev_ioctl;
}

/* Flash backed devices */

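/*
 * A flash-only volume has no backing device, so a read of a region with
 * no cached data simply returns zeroes for the missing sectors.
 */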
static int flash_dev_cache_miss(struct btree *b, struct search *s,
				struct bio *bio, unsigned int sectors)
{
	unsigned int bytes = min(sectors, bio_sectors(bio)) << 9;

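	/*
	 * Temporarily shrink bi_size so zero_fill_bio() touches only the
	 * missing range, then restore it and advance past those sectors.
	 */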
	swap(bio->bi_iter.bi_size, bytes);
	zero_fill_bio(bio);
	swap(bio->bi_iter.bi_size, bytes);

	bio_advance(bio, bytes);

	if (!bio->bi_iter.bi_size)
		return MAP_DONE;

	return MAP_CONTINUE;
}

static void flash_dev_nodata(struct closure *cl)
{
	struct search *s = container_of(cl, struct search, cl);

	if (s->iop.flush_journal)
		bch_journal_meta(s->iop.c, cl);

	continue_at(cl, search_free, NULL);
}

blk_qc_t flash_dev_submit_bio(struct bio *bio)
{
	struct search *s;
	struct closure *cl;
	struct bcache_device *d = bio->bi_disk->private_data;

	if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) {
		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return BLK_QC_T_NONE;
	}

	s = search_alloc(bio, d);
	cl = &s->cl;
	bio = &s->bio.bio;

	trace_bcache_request_start(s->d, bio);

	if (!bio->bi_iter.bi_size) {
		/*
		 * We can't call bch_journal_meta() from under
		 * submit_bio_noacct(), so punt to a workqueue.
		 */
		continue_at_nobarrier(&s->cl,
				      flash_dev_nodata,
				      bcache_wq);
		return BLK_QC_T_NONE;
	} else if (bio_data_dir(bio)) {
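		/*
		 * This region is about to be overwritten, so any moving-GC
		 * candidate keys that overlap it are stale and can be
		 * dropped from the keybuf.
		 */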
		bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys,
					     &KEY(d->id, bio->bi_iter.bi_sector, 0),
					     &KEY(d->id, bio_end_sector(bio), 0));

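		/*
		 * A discard carries no payload: with bypass set,
		 * bch_data_insert() invalidates the range instead of
		 * writing data.
		 */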
		s->iop.bypass = (bio_op(bio) == REQ_OP_DISCARD) != 0;
		s->iop.writeback = true;
		s->iop.bio = bio;

		closure_call(&s->iop.cl, bch_data_insert, NULL, cl);
	} else {
		closure_call(&s->iop.cl, cache_lookup, NULL, cl);
	}

	continue_at(cl, search_free, NULL);
	return BLK_QC_T_NONE;
}

static int flash_dev_ioctl(struct bcache_device *d, fmode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	return -ENOTTY;
}

void bch_flash_dev_request_init(struct bcache_device *d)
{
	d->cache_miss = flash_dev_cache_miss;
	d->ioctl = flash_dev_ioctl;
}

void bch_request_exit(void)
{
	kmem_cache_destroy(bch_search_cache);
}

int __init bch_request_init(void)
{
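	/* Slab cache for the per-request struct search allocations */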
	bch_search_cache = KMEM_CACHE(search, 0);
	if (!bch_search_cache)
		return -ENOMEM;

	return 0;
}