1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * fs/f2fs/gc.c
4 *
5 * Copyright (c) 2012 Samsung Electronics Co., Ltd.
6 * http://www.samsung.com/
7 */
8 #include <linux/fs.h>
9 #include <linux/module.h>
10 #include <linux/backing-dev.h>
11 #include <linux/init.h>
12 #include <linux/f2fs_fs.h>
13 #include <linux/kthread.h>
14 #include <linux/delay.h>
15 #include <linux/freezer.h>
16 #include <linux/sched/signal.h>
17
18 #include "f2fs.h"
19 #include "node.h"
20 #include "segment.h"
21 #include "gc.h"
22 #include <trace/events/f2fs.h>
23
24 static struct kmem_cache *victim_entry_slab;
25
26 static unsigned int count_bits(const unsigned long *addr,
27 unsigned int offset, unsigned int len);
28
29 static int gc_thread_func(void *data)
30 {
31 struct f2fs_sb_info *sbi = data;
32 struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
33 wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head;
34 wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq;
35 unsigned int wait_ms;
36
37 wait_ms = gc_th->min_sleep_time;
38
39 set_freezable();
40 do {
41 bool sync_mode, foreground = false;
42
43 wait_event_interruptible_timeout(*wq,
44 kthread_should_stop() || freezing(current) ||
45 waitqueue_active(fggc_wq) ||
46 gc_th->gc_wake,
47 msecs_to_jiffies(wait_ms));
48
49 if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq))
50 foreground = true;
51
52 /* give it a try one time */
53 if (gc_th->gc_wake)
54 gc_th->gc_wake = 0;
55
56 if (try_to_freeze()) {
57 stat_other_skip_bggc_count(sbi);
58 continue;
59 }
60 if (kthread_should_stop())
61 break;
62
63 if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
64 increase_sleep_time(gc_th, &wait_ms);
65 stat_other_skip_bggc_count(sbi);
66 continue;
67 }
68
69 if (time_to_inject(sbi, FAULT_CHECKPOINT)) {
70 f2fs_show_injection_info(sbi, FAULT_CHECKPOINT);
71 f2fs_stop_checkpoint(sbi, false,
72 STOP_CP_REASON_FAULT_INJECT);
73 }
74
75 if (!sb_start_write_trylock(sbi->sb)) {
76 stat_other_skip_bggc_count(sbi);
77 continue;
78 }
79
80 /*
81 * [GC triggering condition]
82 * 0. GC is not conducted currently.
83 * 1. There are enough dirty segments.
84 * 2. IO subsystem is idle by checking the # of writeback pages.
85 * 3. IO subsystem is idle by checking the # of requests in
86 * bdev's request list.
87 *
88 * Note) We have to avoid triggering GC too frequently,
89 * because it is possible that some segments will be
90 * invalidated soon afterwards by user updates or deletions.
91 * So, we'd like to wait some time to collect dirty segments.
92 */
93 if (sbi->gc_mode == GC_URGENT_HIGH) {
94 wait_ms = gc_th->urgent_sleep_time;
95 f2fs_down_write(&sbi->gc_lock);
96 goto do_gc;
97 }
98
99 if (foreground) {
100 f2fs_down_write(&sbi->gc_lock);
101 goto do_gc;
102 } else if (!f2fs_down_write_trylock(&sbi->gc_lock)) {
103 stat_other_skip_bggc_count(sbi);
104 goto next;
105 }
106
107 if (!is_idle(sbi, GC_TIME)) {
108 increase_sleep_time(gc_th, &wait_ms);
109 f2fs_up_write(&sbi->gc_lock);
110 stat_io_skip_bggc_count(sbi);
111 goto next;
112 }
113
114 if (has_enough_invalid_blocks(sbi))
115 decrease_sleep_time(gc_th, &wait_ms);
116 else
117 increase_sleep_time(gc_th, &wait_ms);
118 do_gc:
119 if (!foreground)
120 stat_inc_bggc_count(sbi->stat_info);
121
122 sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;
123
124 /* foreground GC was triggered via f2fs_balance_fs() */
125 if (foreground)
126 sync_mode = false;
127
128 /* if return value is not zero, no victim was selected */
129 if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO))
130 wait_ms = gc_th->no_gc_sleep_time;
131
132 if (foreground)
133 wake_up_all(&gc_th->fggc_wq);
134
135 trace_f2fs_background_gc(sbi->sb, wait_ms,
136 prefree_segments(sbi), free_segments(sbi));
137
138 /* balancing f2fs's metadata periodically */
139 f2fs_balance_fs_bg(sbi, true);
140 next:
141 sb_end_write(sbi->sb);
142
143 } while (!kthread_should_stop());
144 return 0;
145 }
146
147 int f2fs_start_gc_thread(struct f2fs_sb_info *sbi)
148 {
149 struct f2fs_gc_kthread *gc_th;
150 dev_t dev = sbi->sb->s_bdev->bd_dev;
151 int err = 0;
152
153 gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL);
154 if (!gc_th) {
155 err = -ENOMEM;
156 goto out;
157 }
158
159 gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME;
160 gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME;
161 gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME;
162 gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME;
163
164 gc_th->gc_wake = 0;
165
166 sbi->gc_thread = gc_th;
167 init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head);
168 init_waitqueue_head(&sbi->gc_thread->fggc_wq);
169 sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi,
170 "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev));
171 if (IS_ERR(gc_th->f2fs_gc_task)) {
172 err = PTR_ERR(gc_th->f2fs_gc_task);
173 kfree(gc_th);
174 sbi->gc_thread = NULL;
175 }
176 out:
177 return err;
178 }
179
180 void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi)
181 {
182 struct f2fs_gc_kthread *gc_th = sbi->gc_thread;
183
184 if (!gc_th)
185 return;
186 kthread_stop(gc_th->f2fs_gc_task);
187 wake_up_all(&gc_th->fggc_wq);
188 kfree(gc_th);
189 sbi->gc_thread = NULL;
190 }
191
192 static int select_gc_type(struct f2fs_sb_info *sbi, int gc_type)
193 {
194 int gc_mode;
195
196 if (gc_type == BG_GC) {
197 if (sbi->am.atgc_enabled)
198 gc_mode = GC_AT;
199 else
200 gc_mode = GC_CB;
201 } else {
202 gc_mode = GC_GREEDY;
203 }
204
205 switch (sbi->gc_mode) {
206 case GC_IDLE_CB:
207 gc_mode = GC_CB;
208 break;
209 case GC_IDLE_GREEDY:
210 case GC_URGENT_HIGH:
211 gc_mode = GC_GREEDY;
212 break;
213 case GC_IDLE_AT:
214 gc_mode = GC_AT;
215 break;
216 }
217
218 return gc_mode;
219 }
220
221 static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
222 int type, struct victim_sel_policy *p)
223 {
224 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
225
226 if (p->alloc_mode == SSR) {
227 p->gc_mode = GC_GREEDY;
228 p->dirty_bitmap = dirty_i->dirty_segmap[type];
229 p->max_search = dirty_i->nr_dirty[type];
230 p->ofs_unit = 1;
231 } else if (p->alloc_mode == AT_SSR) {
232 p->gc_mode = GC_GREEDY;
233 p->dirty_bitmap = dirty_i->dirty_segmap[type];
234 p->max_search = dirty_i->nr_dirty[type];
235 p->ofs_unit = 1;
236 } else {
237 p->gc_mode = select_gc_type(sbi, gc_type);
238 p->ofs_unit = sbi->segs_per_sec;
239 if (__is_large_section(sbi)) {
240 p->dirty_bitmap = dirty_i->dirty_secmap;
241 p->max_search = count_bits(p->dirty_bitmap,
242 0, MAIN_SECS(sbi));
243 } else {
244 p->dirty_bitmap = dirty_i->dirty_segmap[DIRTY];
245 p->max_search = dirty_i->nr_dirty[DIRTY];
246 }
247 }
248
249 /*
250 * adjust the candidate range; all dirty segments should be selected
251 * for foreground GC and urgent GC cases.
252 */
253 if (gc_type != FG_GC &&
254 (sbi->gc_mode != GC_URGENT_HIGH) &&
255 (p->gc_mode != GC_AT && p->alloc_mode != AT_SSR) &&
256 p->max_search > sbi->max_victim_search)
257 p->max_search = sbi->max_victim_search;
258
259 /* let's select the beginning hot/small space first in no_heap mode */
260 if (test_opt(sbi, NOHEAP) &&
261 (type == CURSEG_HOT_DATA || IS_NODESEG(type)))
262 p->offset = 0;
263 else
264 p->offset = SIT_I(sbi)->last_victim[p->gc_mode];
265 }
266
267 static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
268 struct victim_sel_policy *p)
269 {
270 /* SSR allocates in a segment unit */
271 if (p->alloc_mode == SSR)
272 return sbi->blocks_per_seg;
273 else if (p->alloc_mode == AT_SSR)
274 return UINT_MAX;
275
276 /* LFS */
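/*
 * Greedy cost is the section's valid-block count (see get_gc_cost()), so
 * 2 * blocks_per_seg * ofs_unit is a safe upper bound above any real cost;
 * cost-benefit and age-based costs are scaled against UINT_MAX instead.
 */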
277 if (p->gc_mode == GC_GREEDY)
278 return 2 * sbi->blocks_per_seg * p->ofs_unit;
279 else if (p->gc_mode == GC_CB)
280 return UINT_MAX;
281 else if (p->gc_mode == GC_AT)
282 return UINT_MAX;
283 else /* No other gc_mode */
284 return 0;
285 }
286
287 static unsigned int check_bg_victims(struct f2fs_sb_info *sbi)
288 {
289 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
290 unsigned int secno;
291
292 /*
293 * If the gc_type is FG_GC, we can select victim segments
294 * previously selected by background GC.
295 * Those segments are guaranteed to have few valid blocks.
296 */
297 for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) {
298 if (sec_usage_check(sbi, secno))
299 continue;
300 clear_bit(secno, dirty_i->victim_secmap);
301 return GET_SEG_FROM_SEC(sbi, secno);
302 }
303 return NULL_SEGNO;
304 }
305
306 static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
307 {
308 struct sit_info *sit_i = SIT_I(sbi);
309 unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
310 unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
311 unsigned long long mtime = 0;
312 unsigned int vblocks;
313 unsigned char age = 0;
314 unsigned char u;
315 unsigned int i;
316 unsigned int usable_segs_per_sec = f2fs_usable_segs_in_sec(sbi, segno);
317
318 for (i = 0; i < usable_segs_per_sec; i++)
319 mtime += get_seg_entry(sbi, start + i)->mtime;
320 vblocks = get_valid_blocks(sbi, segno, true);
321
322 mtime = div_u64(mtime, usable_segs_per_sec);
323 vblocks = div_u64(vblocks, usable_segs_per_sec);
324
325 u = (vblocks * 100) >> sbi->log_blocks_per_seg;
326
327 /* Handle the case where the system time has been changed by the user */
328 if (mtime < sit_i->min_mtime)
329 sit_i->min_mtime = mtime;
330 if (mtime > sit_i->max_mtime)
331 sit_i->max_mtime = mtime;
332 if (sit_i->max_mtime != sit_i->min_mtime)
333 age = 100 - div64_u64(100 * (mtime - sit_i->min_mtime),
334 sit_i->max_mtime - sit_i->min_mtime);
335
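/*
 * Classic cost-benefit cleaning heuristic: the benefit of cleaning this
 * section is the freed space weighted by age, (100 - u) * age, while the
 * cleaning cost grows with utilization, (100 + u). The ratio is subtracted
 * from UINT_MAX so that a smaller return value means a better victim.
 */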
336 return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
337 }
338
339 static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
340 unsigned int segno, struct victim_sel_policy *p)
341 {
342 if (p->alloc_mode == SSR)
343 return get_seg_entry(sbi, segno)->ckpt_valid_blocks;
344
345 /* alloc_mode == LFS */
346 if (p->gc_mode == GC_GREEDY)
347 return get_valid_blocks(sbi, segno, true);
348 else if (p->gc_mode == GC_CB)
349 return get_cb_cost(sbi, segno);
350
351 f2fs_bug_on(sbi, 1);
352 return 0;
353 }
354
355 static unsigned int count_bits(const unsigned long *addr,
356 unsigned int offset, unsigned int len)
357 {
358 unsigned int end = offset + len, sum = 0;
359
360 while (offset < end) {
361 if (test_bit(offset++, addr))
362 ++sum;
363 }
364 return sum;
365 }
366
367 static struct victim_entry *attach_victim_entry(struct f2fs_sb_info *sbi,
368 unsigned long long mtime, unsigned int segno,
369 struct rb_node *parent, struct rb_node **p,
370 bool left_most)
371 {
372 struct atgc_management *am = &sbi->am;
373 struct victim_entry *ve;
374
375 ve = f2fs_kmem_cache_alloc(victim_entry_slab, GFP_NOFS);
376
377 ve->mtime = mtime;
378 ve->segno = segno;
379
380 rb_link_node(&ve->rb_node, parent, p);
381 rb_insert_color_cached(&ve->rb_node, &am->root, left_most);
382
383 list_add_tail(&ve->list, &am->victim_list);
384
385 am->victim_count++;
386
387 return ve;
388 }
389
390 static void insert_victim_entry(struct f2fs_sb_info *sbi,
391 unsigned long long mtime, unsigned int segno)
392 {
393 struct atgc_management *am = &sbi->am;
394 struct rb_node **p;
395 struct rb_node *parent = NULL;
396 bool left_most = true;
397
398 p = f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, mtime, &left_most);
399 attach_victim_entry(sbi, mtime, segno, parent, p, left_most);
400 }
401
402 static void add_victim_entry(struct f2fs_sb_info *sbi,
403 struct victim_sel_policy *p, unsigned int segno)
404 {
405 struct sit_info *sit_i = SIT_I(sbi);
406 unsigned int secno = GET_SEC_FROM_SEG(sbi, segno);
407 unsigned int start = GET_SEG_FROM_SEC(sbi, secno);
408 unsigned long long mtime = 0;
409 unsigned int i;
410
411 if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
412 if (p->gc_mode == GC_AT &&
413 get_valid_blocks(sbi, segno, true) == 0)
414 return;
415 }
416
417 for (i = 0; i < sbi->segs_per_sec; i++)
418 mtime += get_seg_entry(sbi, start + i)->mtime;
419 mtime = div_u64(mtime, sbi->segs_per_sec);
420
421 /* Handle the case where the system time has been changed by the user */
422 if (mtime < sit_i->min_mtime)
423 sit_i->min_mtime = mtime;
424 if (mtime > sit_i->max_mtime)
425 sit_i->max_mtime = mtime;
426 if (mtime < sit_i->dirty_min_mtime)
427 sit_i->dirty_min_mtime = mtime;
428 if (mtime > sit_i->dirty_max_mtime)
429 sit_i->dirty_max_mtime = mtime;
430
431 /* don't choose a young section as a candidate */
432 if (sit_i->dirty_max_mtime - mtime < p->age_threshold)
433 return;
434
435 insert_victim_entry(sbi, mtime, segno);
436 }
437
438 static struct rb_node *lookup_central_victim(struct f2fs_sb_info *sbi,
439 struct victim_sel_policy *p)
440 {
441 struct atgc_management *am = &sbi->am;
442 struct rb_node *parent = NULL;
443 bool left_most;
444
445 f2fs_lookup_rb_tree_ext(sbi, &am->root, &parent, p->age, &left_most);
446
447 return parent;
448 }
449
450 static void atgc_lookup_victim(struct f2fs_sb_info *sbi,
451 struct victim_sel_policy *p)
452 {
453 struct sit_info *sit_i = SIT_I(sbi);
454 struct atgc_management *am = &sbi->am;
455 struct rb_root_cached *root = &am->root;
456 struct rb_node *node;
457 struct rb_entry *re;
458 struct victim_entry *ve;
459 unsigned long long total_time;
460 unsigned long long age, u, accu;
461 unsigned long long max_mtime = sit_i->dirty_max_mtime;
462 unsigned long long min_mtime = sit_i->dirty_min_mtime;
463 unsigned int sec_blocks = BLKS_PER_SEC(sbi);
464 unsigned int vblocks;
465 unsigned int dirty_threshold = max(am->max_candidate_count,
466 am->candidate_ratio *
467 am->victim_count / 100);
468 unsigned int age_weight = am->age_weight;
469 unsigned int cost;
470 unsigned int iter = 0;
471
472 if (max_mtime < min_mtime)
473 return;
474
475 max_mtime += 1;
476 total_time = max_mtime - min_mtime;
477
478 accu = div64_u64(ULLONG_MAX, total_time);
479 accu = min_t(unsigned long long, div_u64(accu, 100),
480 DEFAULT_ACCURACY_CLASS);
481
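/*
 * accu rescales the dirty mtime range so that the age term and the
 * free-space term land on a common fixed-point scale; the two are then
 * combined using age_weight (age_weight% for age vs. (100 - age_weight)%
 * for invalid space), and the smallest resulting cost wins.
 */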
482 node = rb_first_cached(root);
483 next:
484 re = rb_entry_safe(node, struct rb_entry, rb_node);
485 if (!re)
486 return;
487
488 ve = (struct victim_entry *)re;
489
490 if (ve->mtime >= max_mtime || ve->mtime < min_mtime)
491 goto skip;
492
493 /* age = 10000 * x% * 60 */
494 age = div64_u64(accu * (max_mtime - ve->mtime), total_time) *
495 age_weight;
496
497 vblocks = get_valid_blocks(sbi, ve->segno, true);
498 f2fs_bug_on(sbi, !vblocks || vblocks == sec_blocks);
499
500 /* u = 10000 * x% * 40 */
501 u = div64_u64(accu * (sec_blocks - vblocks), sec_blocks) *
502 (100 - age_weight);
503
504 f2fs_bug_on(sbi, age + u >= UINT_MAX);
505
506 cost = UINT_MAX - (age + u);
507 iter++;
508
509 if (cost < p->min_cost ||
510 (cost == p->min_cost && age > p->oldest_age)) {
511 p->min_cost = cost;
512 p->oldest_age = age;
513 p->min_segno = ve->segno;
514 }
515 skip:
516 if (iter < dirty_threshold) {
517 node = rb_next(node);
518 goto next;
519 }
520 }
521
522 /*
523 * select candidates around source section in range of
524 * [target - dirty_threshold, target + dirty_threshold]
525 */
526 static void atssr_lookup_victim(struct f2fs_sb_info *sbi,
527 struct victim_sel_policy *p)
528 {
529 struct sit_info *sit_i = SIT_I(sbi);
530 struct atgc_management *am = &sbi->am;
531 struct rb_node *node;
532 struct rb_entry *re;
533 struct victim_entry *ve;
534 unsigned long long age;
535 unsigned long long max_mtime = sit_i->dirty_max_mtime;
536 unsigned long long min_mtime = sit_i->dirty_min_mtime;
537 unsigned int seg_blocks = sbi->blocks_per_seg;
538 unsigned int vblocks;
539 unsigned int dirty_threshold = max(am->max_candidate_count,
540 am->candidate_ratio *
541 am->victim_count / 100);
542 unsigned int cost;
543 unsigned int iter = 0;
544 int stage = 0;
545
546 if (max_mtime < min_mtime)
547 return;
548 max_mtime += 1;
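/* stage 0 walks left (towards older mtime) from the central victim, stage 1 walks right */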
549 next_stage:
550 node = lookup_central_victim(sbi, p);
551 next_node:
552 re = rb_entry_safe(node, struct rb_entry, rb_node);
553 if (!re) {
554 if (stage == 0)
555 goto skip_stage;
556 return;
557 }
558
559 ve = (struct victim_entry *)re;
560
561 if (ve->mtime >= max_mtime || ve->mtime < min_mtime)
562 goto skip_node;
563
564 age = max_mtime - ve->mtime;
565
566 vblocks = get_seg_entry(sbi, ve->segno)->ckpt_valid_blocks;
567 f2fs_bug_on(sbi, !vblocks);
568
569 /* rare case */
570 if (vblocks == seg_blocks)
571 goto skip_node;
572
573 iter++;
574
575 age = max_mtime - abs(p->age - age);
576 cost = UINT_MAX - vblocks;
577
578 if (cost < p->min_cost ||
579 (cost == p->min_cost && age > p->oldest_age)) {
580 p->min_cost = cost;
581 p->oldest_age = age;
582 p->min_segno = ve->segno;
583 }
584 skip_node:
585 if (iter < dirty_threshold) {
586 if (stage == 0)
587 node = rb_prev(node);
588 else if (stage == 1)
589 node = rb_next(node);
590 goto next_node;
591 }
592 skip_stage:
593 if (stage < 1) {
594 stage++;
595 iter = 0;
596 goto next_stage;
597 }
598 }
599 static void lookup_victim_by_age(struct f2fs_sb_info *sbi,
600 struct victim_sel_policy *p)
601 {
602 f2fs_bug_on(sbi, !f2fs_check_rb_tree_consistence(sbi,
603 &sbi->am.root, true));
604
605 if (p->gc_mode == GC_AT)
606 atgc_lookup_victim(sbi, p);
607 else if (p->alloc_mode == AT_SSR)
608 atssr_lookup_victim(sbi, p);
609 else
610 f2fs_bug_on(sbi, 1);
611 }
612
613 static void release_victim_entry(struct f2fs_sb_info *sbi)
614 {
615 struct atgc_management *am = &sbi->am;
616 struct victim_entry *ve, *tmp;
617
618 list_for_each_entry_safe(ve, tmp, &am->victim_list, list) {
619 list_del(&ve->list);
620 kmem_cache_free(victim_entry_slab, ve);
621 am->victim_count--;
622 }
623
624 am->root = RB_ROOT_CACHED;
625
626 f2fs_bug_on(sbi, am->victim_count);
627 f2fs_bug_on(sbi, !list_empty(&am->victim_list));
628 }
629
630 /*
631 * This function is called from two paths.
632 * One is garbage collection and the other is SSR segment selection.
633 * When it is called during GC, it just gets a victim segment
634 * and does not remove it from the dirty seglist.
635 * When it is called from SSR segment selection, it finds a segment
636 * with the minimum number of valid blocks and removes it from the dirty seglist.
637 */
638 static int get_victim_by_default(struct f2fs_sb_info *sbi,
639 unsigned int *result, int gc_type, int type,
640 char alloc_mode, unsigned long long age)
641 {
642 struct dirty_seglist_info *dirty_i = DIRTY_I(sbi);
643 struct sit_info *sm = SIT_I(sbi);
644 struct victim_sel_policy p;
645 unsigned int secno, last_victim;
646 unsigned int last_segment;
647 unsigned int nsearched;
648 bool is_atgc;
649 int ret = 0;
650
651 mutex_lock(&dirty_i->seglist_lock);
652 last_segment = MAIN_SECS(sbi) * sbi->segs_per_sec;
653
654 p.alloc_mode = alloc_mode;
655 p.age = age;
656 p.age_threshold = sbi->am.age_threshold;
657
658 retry:
659 select_policy(sbi, gc_type, type, &p);
660 p.min_segno = NULL_SEGNO;
661 p.oldest_age = 0;
662 p.min_cost = get_max_cost(sbi, &p);
663
664 is_atgc = (p.gc_mode == GC_AT || p.alloc_mode == AT_SSR);
665 nsearched = 0;
666
667 if (is_atgc)
668 SIT_I(sbi)->dirty_min_mtime = ULLONG_MAX;
669
670 if (*result != NULL_SEGNO) {
671 if (!get_valid_blocks(sbi, *result, false)) {
672 ret = -ENODATA;
673 goto out;
674 }
675
676 if (sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result)))
677 ret = -EBUSY;
678 else
679 p.min_segno = *result;
680 goto out;
681 }
682
683 ret = -ENODATA;
684 if (p.max_search == 0)
685 goto out;
686
687 if (__is_large_section(sbi) && p.alloc_mode == LFS) {
688 if (sbi->next_victim_seg[BG_GC] != NULL_SEGNO) {
689 p.min_segno = sbi->next_victim_seg[BG_GC];
690 *result = p.min_segno;
691 sbi->next_victim_seg[BG_GC] = NULL_SEGNO;
692 goto got_result;
693 }
694 if (gc_type == FG_GC &&
695 sbi->next_victim_seg[FG_GC] != NULL_SEGNO) {
696 p.min_segno = sbi->next_victim_seg[FG_GC];
697 *result = p.min_segno;
698 sbi->next_victim_seg[FG_GC] = NULL_SEGNO;
699 goto got_result;
700 }
701 }
702
703 last_victim = sm->last_victim[p.gc_mode];
704 if (p.alloc_mode == LFS && gc_type == FG_GC) {
705 p.min_segno = check_bg_victims(sbi);
706 if (p.min_segno != NULL_SEGNO)
707 goto got_it;
708 }
709
710 while (1) {
711 unsigned long cost, *dirty_bitmap;
712 unsigned int unit_no, segno;
713
714 dirty_bitmap = p.dirty_bitmap;
715 unit_no = find_next_bit(dirty_bitmap,
716 last_segment / p.ofs_unit,
717 p.offset / p.ofs_unit);
718 segno = unit_no * p.ofs_unit;
719 if (segno >= last_segment) {
720 if (sm->last_victim[p.gc_mode]) {
721 last_segment =
722 sm->last_victim[p.gc_mode];
723 sm->last_victim[p.gc_mode] = 0;
724 p.offset = 0;
725 continue;
726 }
727 break;
728 }
729
730 p.offset = segno + p.ofs_unit;
731 nsearched++;
732
733 #ifdef CONFIG_F2FS_CHECK_FS
734 /*
735 * skip selecting an invalid segno (i.e. one that failed the block
736 * validity check during GC) to avoid an endless GC loop in
737 * such cases.
738 */
739 if (test_bit(segno, sm->invalid_segmap))
740 goto next;
741 #endif
742
743 secno = GET_SEC_FROM_SEG(sbi, segno);
744
745 if (sec_usage_check(sbi, secno))
746 goto next;
747
748 /* Don't touch checkpointed data */
749 if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) {
750 if (p.alloc_mode == LFS) {
751 /*
752 * LFS is set to find source section during GC.
753 * The victim should have no checkpointed data.
754 */
755 if (get_ckpt_valid_blocks(sbi, segno, true))
756 goto next;
757 } else {
758 /*
759 * SSR | AT_SSR are set to find a target segment
760 * for writes, which may be filled by checkpointed
761 * and newly written blocks.
762 */
763 if (!f2fs_segment_has_free_slot(sbi, segno))
764 goto next;
765 }
766 }
767
768 if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap))
769 goto next;
770
771 if (is_atgc) {
772 add_victim_entry(sbi, &p, segno);
773 goto next;
774 }
775
776 cost = get_gc_cost(sbi, segno, &p);
777
778 if (p.min_cost > cost) {
779 p.min_segno = segno;
780 p.min_cost = cost;
781 }
782 next:
783 if (nsearched >= p.max_search) {
784 if (!sm->last_victim[p.gc_mode] && segno <= last_victim)
785 sm->last_victim[p.gc_mode] =
786 last_victim + p.ofs_unit;
787 else
788 sm->last_victim[p.gc_mode] = segno + p.ofs_unit;
789 sm->last_victim[p.gc_mode] %=
790 (MAIN_SECS(sbi) * sbi->segs_per_sec);
791 break;
792 }
793 }
794
795 /* get victim for GC_AT/AT_SSR */
796 if (is_atgc) {
797 lookup_victim_by_age(sbi, &p);
798 release_victim_entry(sbi);
799 }
800
801 if (is_atgc && p.min_segno == NULL_SEGNO &&
802 sm->elapsed_time < p.age_threshold) {
803 p.age_threshold = 0;
804 goto retry;
805 }
806
807 if (p.min_segno != NULL_SEGNO) {
808 got_it:
809 *result = (p.min_segno / p.ofs_unit) * p.ofs_unit;
810 got_result:
811 if (p.alloc_mode == LFS) {
812 secno = GET_SEC_FROM_SEG(sbi, p.min_segno);
813 if (gc_type == FG_GC)
814 sbi->cur_victim_sec = secno;
815 else
816 set_bit(secno, dirty_i->victim_secmap);
817 }
818 ret = 0;
819
820 }
821 out:
822 if (p.min_segno != NULL_SEGNO)
823 trace_f2fs_get_victim(sbi->sb, type, gc_type, &p,
824 sbi->cur_victim_sec,
825 prefree_segments(sbi), free_segments(sbi));
826 mutex_unlock(&dirty_i->seglist_lock);
827
828 return ret;
829 }
830
831 static const struct victim_selection default_v_ops = {
832 .get_victim = get_victim_by_default,
833 };
834
835 static struct inode *find_gc_inode(struct gc_inode_list *gc_list, nid_t ino)
836 {
837 struct inode_entry *ie;
838
839 ie = radix_tree_lookup(&gc_list->iroot, ino);
840 if (ie)
841 return ie->inode;
842 return NULL;
843 }
844
845 static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
846 {
847 struct inode_entry *new_ie;
848
849 if (inode == find_gc_inode(gc_list, inode->i_ino)) {
850 iput(inode);
851 return;
852 }
853 new_ie = f2fs_kmem_cache_alloc(f2fs_inode_entry_slab, GFP_NOFS);
854 new_ie->inode = inode;
855
856 f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
857 list_add_tail(&new_ie->list, &gc_list->ilist);
858 }
859
860 static void put_gc_inode(struct gc_inode_list *gc_list)
861 {
862 struct inode_entry *ie, *next_ie;
863
864 list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) {
865 radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
866 iput(ie->inode);
867 list_del(&ie->list);
868 kmem_cache_free(f2fs_inode_entry_slab, ie);
869 }
870 }
871
872 static int check_valid_map(struct f2fs_sb_info *sbi,
873 unsigned int segno, int offset)
874 {
875 struct sit_info *sit_i = SIT_I(sbi);
876 struct seg_entry *sentry;
877 int ret;
878
879 down_read(&sit_i->sentry_lock);
880 sentry = get_seg_entry(sbi, segno);
881 ret = f2fs_test_bit(offset, sentry->cur_valid_map);
882 up_read(&sit_i->sentry_lock);
883 return ret;
884 }
885
886 /*
887 * This function compares the node address recorded in the summary with
888 * that in the NAT. If it is valid, the node is migrated with cold status;
889 * otherwise (an invalid node) it is ignored.
890 */
891 static int gc_node_segment(struct f2fs_sb_info *sbi,
892 struct f2fs_summary *sum, unsigned int segno, int gc_type)
893 {
894 struct f2fs_summary *entry;
895 block_t start_addr;
896 int off;
897 int phase = 0;
898 bool fggc = (gc_type == FG_GC);
899 int submitted = 0;
900 unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);
901
902 start_addr = START_BLOCK(sbi, segno);
903
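/*
 * Three passes over the summary entries: phase 0 readaheads the NAT
 * blocks, phase 1 readaheads the node pages, and phase 2 actually
 * migrates the valid node blocks.
 */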
904 next_step:
905 entry = sum;
906
907 if (fggc && phase == 2)
908 atomic_inc(&sbi->wb_sync_req[NODE]);
909
910 for (off = 0; off < usable_blks_in_seg; off++, entry++) {
911 nid_t nid = le32_to_cpu(entry->nid);
912 struct page *node_page;
913 struct node_info ni;
914 int err;
915
916 /* stop BG_GC if there are not enough free sections. */
917 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0))
918 return submitted;
919
920 if (check_valid_map(sbi, segno, off) == 0)
921 continue;
922
923 if (phase == 0) {
924 f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
925 META_NAT, true);
926 continue;
927 }
928
929 if (phase == 1) {
930 f2fs_ra_node_page(sbi, nid);
931 continue;
932 }
933
934 /* phase == 2 */
935 node_page = f2fs_get_node_page(sbi, nid);
936 if (IS_ERR(node_page))
937 continue;
938
939 /* block may become invalid during f2fs_get_node_page */
940 if (check_valid_map(sbi, segno, off) == 0) {
941 f2fs_put_page(node_page, 1);
942 continue;
943 }
944
945 if (f2fs_get_node_info(sbi, nid, &ni, false)) {
946 f2fs_put_page(node_page, 1);
947 continue;
948 }
949
950 if (ni.blk_addr != start_addr + off) {
951 f2fs_put_page(node_page, 1);
952 continue;
953 }
954
955 err = f2fs_move_node_page(node_page, gc_type);
956 if (!err && gc_type == FG_GC)
957 submitted++;
958 stat_inc_node_blk_count(sbi, 1, gc_type);
959 }
960
961 if (++phase < 3)
962 goto next_step;
963
964 if (fggc)
965 atomic_dec(&sbi->wb_sync_req[NODE]);
966 return submitted;
967 }
968
969 /*
970 * Calculate the start block index corresponding to the given node offset.
971 * Be careful: the caller should pass a node offset that refers only to direct
972 * node blocks. Passing a node offset that points to other types of node
973 * blocks, such as indirect or double indirect node blocks, is a caller bug.
975 */
976 block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode)
977 {
978 unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4;
979 unsigned int bidx;
980
981 if (node_ofs == 0)
982 return 0;
983
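/*
 * node_ofs counts node blocks belonging to the inode (assuming the usual
 * f2fs layout: two direct nodes, two indirect nodes, one double indirect):
 * 0 is the inode itself, 1-2 are the two direct nodes, and larger offsets
 * interleave indirect nodes with their direct-node children. bidx ends up
 * as the number of direct node blocks preceding this one, e.g. node_ofs == 4
 * (first child of the first indirect node) gives bidx = 2, so its data
 * starts at 2 * ADDRS_PER_BLOCK + ADDRS_PER_INODE.
 */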
984 if (node_ofs <= 2) {
985 bidx = node_ofs - 1;
986 } else if (node_ofs <= indirect_blks) {
987 int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1);
988
989 bidx = node_ofs - 2 - dec;
990 } else {
991 int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1);
992
993 bidx = node_ofs - 5 - dec;
994 }
995 return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode);
996 }
997
998 static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
999 struct node_info *dni, block_t blkaddr, unsigned int *nofs)
1000 {
1001 struct page *node_page;
1002 nid_t nid;
1003 unsigned int ofs_in_node, max_addrs;
1004 block_t source_blkaddr;
1005
1006 nid = le32_to_cpu(sum->nid);
1007 ofs_in_node = le16_to_cpu(sum->ofs_in_node);
1008
1009 node_page = f2fs_get_node_page(sbi, nid);
1010 if (IS_ERR(node_page))
1011 return false;
1012
1013 if (f2fs_get_node_info(sbi, nid, dni, false)) {
1014 f2fs_put_page(node_page, 1);
1015 return false;
1016 }
1017
1018 if (sum->version != dni->version) {
1019 f2fs_warn(sbi, "%s: valid data with mismatched node version.",
1020 __func__);
1021 set_sbi_flag(sbi, SBI_NEED_FSCK);
1022 }
1023
1024 if (f2fs_check_nid_range(sbi, dni->ino)) {
1025 f2fs_put_page(node_page, 1);
1026 return false;
1027 }
1028
1029 max_addrs = IS_INODE(node_page) ? DEF_ADDRS_PER_INODE :
1030 DEF_ADDRS_PER_BLOCK;
1031 if (ofs_in_node >= max_addrs) {
1032 f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u",
1033 ofs_in_node, dni->ino, dni->nid, max_addrs);
1034 return false;
1035 }
1036
1037 *nofs = ofs_of_node(node_page);
1038 source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
1039 f2fs_put_page(node_page, 1);
1040
1041 if (source_blkaddr != blkaddr) {
1042 #ifdef CONFIG_F2FS_CHECK_FS
1043 unsigned int segno = GET_SEGNO(sbi, blkaddr);
1044 unsigned long offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
1045
1046 if (unlikely(check_valid_map(sbi, segno, offset))) {
1047 if (!test_and_set_bit(segno, SIT_I(sbi)->invalid_segmap)) {
1048 f2fs_err(sbi, "mismatched blkaddr %u (source_blkaddr %u) in seg %u",
1049 blkaddr, source_blkaddr, segno);
1050 f2fs_bug_on(sbi, 1);
1051 }
1052 }
1053 #endif
1054 return false;
1055 }
1056 return true;
1057 }
1058
1059 static int ra_data_block(struct inode *inode, pgoff_t index)
1060 {
1061 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
1062 struct address_space *mapping = inode->i_mapping;
1063 struct dnode_of_data dn;
1064 struct page *page;
1065 struct extent_info ei = {0, };
1066 struct f2fs_io_info fio = {
1067 .sbi = sbi,
1068 .ino = inode->i_ino,
1069 .type = DATA,
1070 .temp = COLD,
1071 .op = REQ_OP_READ,
1072 .op_flags = 0,
1073 .encrypted_page = NULL,
1074 .in_list = false,
1075 .retry = false,
1076 };
1077 int err;
1078
1079 page = f2fs_grab_cache_page(mapping, index, true);
1080 if (!page)
1081 return -ENOMEM;
1082
1083 if (f2fs_lookup_read_extent_cache(inode, index, &ei)) {
1084 dn.data_blkaddr = ei.blk + index - ei.fofs;
1085 if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
1086 DATA_GENERIC_ENHANCE_READ))) {
1087 err = -EFSCORRUPTED;
1088 goto put_page;
1089 }
1090 goto got_it;
1091 }
1092
1093 set_new_dnode(&dn, inode, NULL, NULL, 0);
1094 err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
1095 if (err)
1096 goto put_page;
1097 f2fs_put_dnode(&dn);
1098
1099 if (!__is_valid_data_blkaddr(dn.data_blkaddr)) {
1100 err = -ENOENT;
1101 goto put_page;
1102 }
1103 if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr,
1104 DATA_GENERIC_ENHANCE))) {
1105 err = -EFSCORRUPTED;
1106 goto put_page;
1107 }
1108 got_it:
1109 /* read page */
1110 fio.page = page;
1111 fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
1112
1113 /*
1114 * don't cache encrypted data in the meta inode until previous dirty
1115 * data has been written back, to avoid racing between GC and flush.
1116 */
1117 f2fs_wait_on_page_writeback(page, DATA, true, true);
1118
1119 f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
1120
1121 fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi),
1122 dn.data_blkaddr,
1123 FGP_LOCK | FGP_CREAT, GFP_NOFS);
1124 if (!fio.encrypted_page) {
1125 err = -ENOMEM;
1126 goto put_page;
1127 }
1128
1129 err = f2fs_submit_page_bio(&fio);
1130 if (err)
1131 goto put_encrypted_page;
1132 f2fs_put_page(fio.encrypted_page, 0);
1133 f2fs_put_page(page, 1);
1134
1135 f2fs_update_iostat(sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
1136 f2fs_update_iostat(sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);
1137
1138 return 0;
1139 put_encrypted_page:
1140 f2fs_put_page(fio.encrypted_page, 1);
1141 put_page:
1142 f2fs_put_page(page, 1);
1143 return err;
1144 }
1145
1146 /*
1147 * Move a data block via META_MAPPING while keeping the data page locked.
1148 * This can be used to move blocks, aka LBAs, directly on disk.
1149 */
1150 static int move_data_block(struct inode *inode, block_t bidx,
1151 int gc_type, unsigned int segno, int off)
1152 {
1153 struct f2fs_io_info fio = {
1154 .sbi = F2FS_I_SB(inode),
1155 .ino = inode->i_ino,
1156 .type = DATA,
1157 .temp = COLD,
1158 .op = REQ_OP_READ,
1159 .op_flags = 0,
1160 .encrypted_page = NULL,
1161 .in_list = false,
1162 .retry = false,
1163 };
1164 struct dnode_of_data dn;
1165 struct f2fs_summary sum;
1166 struct node_info ni;
1167 struct page *page, *mpage;
1168 block_t newaddr;
1169 int err = 0;
1170 bool lfs_mode = f2fs_lfs_mode(fio.sbi);
1171 int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) &&
1172 (fio.sbi->gc_mode != GC_URGENT_HIGH) ?
1173 CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA;
1174
1175 /* do not read out */
1176 page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
1177 if (!page)
1178 return -ENOMEM;
1179
1180 if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
1181 err = -ENOENT;
1182 goto out;
1183 }
1184
1185 if (f2fs_is_atomic_file(inode)) {
1186 F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
1187 F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
1188 err = -EAGAIN;
1189 goto out;
1190 }
1191
1192 if (f2fs_is_pinned_file(inode)) {
1193 if (gc_type == FG_GC)
1194 f2fs_pin_file_control(inode, true);
1195 err = -EAGAIN;
1196 goto out;
1197 }
1198
1199 set_new_dnode(&dn, inode, NULL, NULL, 0);
1200 err = f2fs_get_dnode_of_data(&dn, bidx, LOOKUP_NODE);
1201 if (err)
1202 goto out;
1203
1204 if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
1205 ClearPageUptodate(page);
1206 err = -ENOENT;
1207 goto put_out;
1208 }
1209
1210 /*
1211 * don't cache encrypted data in the meta inode until previous dirty
1212 * data has been written back, to avoid racing between GC and flush.
1213 */
1214 f2fs_wait_on_page_writeback(page, DATA, true, true);
1215
1216 f2fs_wait_on_block_writeback(inode, dn.data_blkaddr);
1217
1218 err = f2fs_get_node_info(fio.sbi, dn.nid, &ni, false);
1219 if (err)
1220 goto put_out;
1221
1222 /* read page */
1223 fio.page = page;
1224 fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr;
1225
1226 if (lfs_mode)
1227 f2fs_down_write(&fio.sbi->io_order_lock);
1228
1229 mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi),
1230 fio.old_blkaddr, false);
1231 if (!mpage) {
1232 err = -ENOMEM;
1233 goto up_out;
1234 }
1235
1236 fio.encrypted_page = mpage;
1237
1238 /* read source block in mpage */
1239 if (!PageUptodate(mpage)) {
1240 err = f2fs_submit_page_bio(&fio);
1241 if (err) {
1242 f2fs_put_page(mpage, 1);
1243 goto up_out;
1244 }
1245
1246 f2fs_update_iostat(fio.sbi, FS_DATA_READ_IO, F2FS_BLKSIZE);
1247 f2fs_update_iostat(fio.sbi, FS_GDATA_READ_IO, F2FS_BLKSIZE);
1248
1249 lock_page(mpage);
1250 if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) ||
1251 !PageUptodate(mpage))) {
1252 err = -EIO;
1253 f2fs_put_page(mpage, 1);
1254 goto up_out;
1255 }
1256 }
1257
1258 set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version);
1259
1260 /* allocate block address */
1261 f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
1262 &sum, type, NULL);
1263
1264 fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
1265 newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
1266 if (!fio.encrypted_page) {
1267 err = -ENOMEM;
1268 f2fs_put_page(mpage, 1);
1269 goto recover_block;
1270 }
1271
1272 /* write target block */
1273 f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true);
1274 memcpy(page_address(fio.encrypted_page),
1275 page_address(mpage), PAGE_SIZE);
1276 f2fs_put_page(mpage, 1);
1277 invalidate_mapping_pages(META_MAPPING(fio.sbi),
1278 fio.old_blkaddr, fio.old_blkaddr);
1279 f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr);
1280
1281 set_page_dirty(fio.encrypted_page);
1282 if (clear_page_dirty_for_io(fio.encrypted_page))
1283 dec_page_count(fio.sbi, F2FS_DIRTY_META);
1284
1285 set_page_writeback(fio.encrypted_page);
1286 ClearPageError(page);
1287
1288 fio.op = REQ_OP_WRITE;
1289 fio.op_flags = REQ_SYNC;
1290 fio.new_blkaddr = newaddr;
1291 f2fs_submit_page_write(&fio);
1292 if (fio.retry) {
1293 err = -EAGAIN;
1294 if (PageWriteback(fio.encrypted_page))
1295 end_page_writeback(fio.encrypted_page);
1296 goto put_page_out;
1297 }
1298
1299 f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE);
1300
1301 f2fs_update_data_blkaddr(&dn, newaddr);
1302 set_inode_flag(inode, FI_APPEND_WRITE);
1303 if (page->index == 0)
1304 set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
1305 put_page_out:
1306 f2fs_put_page(fio.encrypted_page, 1);
1307 recover_block:
1308 if (err)
1309 f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr,
1310 true, true, true);
1311 up_out:
1312 if (lfs_mode)
1313 f2fs_up_write(&fio.sbi->io_order_lock);
1314 put_out:
1315 f2fs_put_dnode(&dn);
1316 out:
1317 f2fs_put_page(page, 1);
1318 return err;
1319 }
1320
1321 static int move_data_page(struct inode *inode, block_t bidx, int gc_type,
1322 unsigned int segno, int off)
1323 {
1324 struct page *page;
1325 int err = 0;
1326
1327 page = f2fs_get_lock_data_page(inode, bidx, true);
1328 if (IS_ERR(page))
1329 return PTR_ERR(page);
1330
1331 if (!check_valid_map(F2FS_I_SB(inode), segno, off)) {
1332 err = -ENOENT;
1333 goto out;
1334 }
1335
1336 if (f2fs_is_atomic_file(inode)) {
1337 F2FS_I(inode)->i_gc_failures[GC_FAILURE_ATOMIC]++;
1338 F2FS_I_SB(inode)->skipped_atomic_files[gc_type]++;
1339 err = -EAGAIN;
1340 goto out;
1341 }
1342 if (f2fs_is_pinned_file(inode)) {
1343 if (gc_type == FG_GC)
1344 f2fs_pin_file_control(inode, true);
1345 err = -EAGAIN;
1346 goto out;
1347 }
1348
1349 if (gc_type == BG_GC) {
1350 if (PageWriteback(page)) {
1351 err = -EAGAIN;
1352 goto out;
1353 }
1354 set_page_dirty(page);
1355 set_page_private_gcing(page);
1356 } else {
1357 struct f2fs_io_info fio = {
1358 .sbi = F2FS_I_SB(inode),
1359 .ino = inode->i_ino,
1360 .type = DATA,
1361 .temp = COLD,
1362 .op = REQ_OP_WRITE,
1363 .op_flags = REQ_SYNC,
1364 .old_blkaddr = NULL_ADDR,
1365 .page = page,
1366 .encrypted_page = NULL,
1367 .need_lock = LOCK_REQ,
1368 .io_type = FS_GC_DATA_IO,
1369 };
1370 bool is_dirty = PageDirty(page);
1371
1372 retry:
1373 f2fs_wait_on_page_writeback(page, DATA, true, true);
1374
1375 set_page_dirty(page);
1376 if (clear_page_dirty_for_io(page)) {
1377 inode_dec_dirty_pages(inode);
1378 f2fs_remove_dirty_inode(inode);
1379 }
1380
1381 set_page_private_gcing(page);
1382
1383 err = f2fs_do_write_data_page(&fio);
1384 if (err) {
1385 clear_page_private_gcing(page);
1386 if (err == -ENOMEM) {
1387 congestion_wait(BLK_RW_ASYNC,
1388 DEFAULT_IO_TIMEOUT);
1389 goto retry;
1390 }
1391 if (is_dirty)
1392 set_page_dirty(page);
1393 }
1394 }
1395 out:
1396 f2fs_put_page(page, 1);
1397 return err;
1398 }
1399
1400 /*
1401 * This function tries to get the parent node of a victim data block and
1402 * checks the data block's validity. If the block is valid, it is copied with
1403 * cold status and the parent node is updated.
1404 * If the parent node is not valid or the data block address is different,
1405 * the victim data block is ignored.
1406 */
1407 static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
1408 struct gc_inode_list *gc_list, unsigned int segno, int gc_type,
1409 bool force_migrate)
1410 {
1411 struct super_block *sb = sbi->sb;
1412 struct f2fs_summary *entry;
1413 block_t start_addr;
1414 int off;
1415 int phase = 0;
1416 int submitted = 0;
1417 unsigned int usable_blks_in_seg = f2fs_usable_blks_in_seg(sbi, segno);
1418
1419 start_addr = START_BLOCK(sbi, segno);
1420
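/*
 * Five passes over the summary entries: phase 0 readaheads the NAT blocks,
 * phase 1 readaheads the dnode pages, phase 2 readaheads the owner inodes'
 * node pages, phase 3 grabs the inodes and pre-reads the data pages, and
 * phase 4 actually moves the data blocks.
 */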
1421 next_step:
1422 entry = sum;
1423
1424 for (off = 0; off < usable_blks_in_seg; off++, entry++) {
1425 struct page *data_page;
1426 struct inode *inode;
1427 struct node_info dni; /* dnode info for the data */
1428 unsigned int ofs_in_node, nofs;
1429 block_t start_bidx;
1430 nid_t nid = le32_to_cpu(entry->nid);
1431
1432 /*
1433 * stop BG_GC if there are not enough free sections.
1434 * Also, stop GC if the segment has become fully valid due to
1435 * a race condition with SSR block allocation.
1436 */
1437 if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
1438 (!force_migrate && get_valid_blocks(sbi, segno, true) ==
1439 BLKS_PER_SEC(sbi)))
1440 return submitted;
1441
1442 if (check_valid_map(sbi, segno, off) == 0)
1443 continue;
1444
1445 if (phase == 0) {
1446 f2fs_ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1,
1447 META_NAT, true);
1448 continue;
1449 }
1450
1451 if (phase == 1) {
1452 f2fs_ra_node_page(sbi, nid);
1453 continue;
1454 }
1455
1456 /* Get an inode by ino, checking its validity */
1457 if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs))
1458 continue;
1459
1460 if (phase == 2) {
1461 f2fs_ra_node_page(sbi, dni.ino);
1462 continue;
1463 }
1464
1465 ofs_in_node = le16_to_cpu(entry->ofs_in_node);
1466
1467 if (phase == 3) {
1468 inode = f2fs_iget(sb, dni.ino);
1469 if (IS_ERR(inode) || is_bad_inode(inode))
1470 continue;
1471
1472 if (!f2fs_down_write_trylock(
1473 &F2FS_I(inode)->i_gc_rwsem[WRITE])) {
1474 iput(inode);
1475 sbi->skipped_gc_rwsem++;
1476 continue;
1477 }
1478
1479 start_bidx = f2fs_start_bidx_of_node(nofs, inode) +
1480 ofs_in_node;
1481
1482 if (f2fs_post_read_required(inode)) {
1483 int err = ra_data_block(inode, start_bidx);
1484
1485 f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
1486 if (err) {
1487 iput(inode);
1488 continue;
1489 }
1490 add_gc_inode(gc_list, inode);
1491 continue;
1492 }
1493
1494 data_page = f2fs_get_read_data_page(inode,
1495 start_bidx, REQ_RAHEAD, true);
1496 f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
1497 if (IS_ERR(data_page)) {
1498 iput(inode);
1499 continue;
1500 }
1501
1502 f2fs_put_page(data_page, 0);
1503 add_gc_inode(gc_list, inode);
1504 continue;
1505 }
1506
1507 /* phase 4 */
1508 inode = find_gc_inode(gc_list, dni.ino);
1509 if (inode) {
1510 struct f2fs_inode_info *fi = F2FS_I(inode);
1511 bool locked = false;
1512 int err;
1513
1514 if (S_ISREG(inode->i_mode)) {
1515 if (!f2fs_down_write_trylock(&fi->i_gc_rwsem[READ])) {
1516 sbi->skipped_gc_rwsem++;
1517 continue;
1518 }
1519 if (!f2fs_down_write_trylock(
1520 &fi->i_gc_rwsem[WRITE])) {
1521 sbi->skipped_gc_rwsem++;
1522 f2fs_up_write(&fi->i_gc_rwsem[READ]);
1523 continue;
1524 }
1525 locked = true;
1526
1527 /* wait for all inflight aio data */
1528 inode_dio_wait(inode);
1529 }
1530
1531 start_bidx = f2fs_start_bidx_of_node(nofs, inode)
1532 + ofs_in_node;
1533 if (f2fs_post_read_required(inode))
1534 err = move_data_block(inode, start_bidx,
1535 gc_type, segno, off);
1536 else
1537 err = move_data_page(inode, start_bidx, gc_type,
1538 segno, off);
1539
1540 if (!err && (gc_type == FG_GC ||
1541 f2fs_post_read_required(inode)))
1542 submitted++;
1543
1544 if (locked) {
1545 f2fs_up_write(&fi->i_gc_rwsem[WRITE]);
1546 f2fs_up_write(&fi->i_gc_rwsem[READ]);
1547 }
1548
1549 stat_inc_data_blk_count(sbi, 1, gc_type);
1550 }
1551 }
1552
1553 if (++phase < 5)
1554 goto next_step;
1555
1556 return submitted;
1557 }
1558
1559 static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
1560 int gc_type)
1561 {
1562 struct sit_info *sit_i = SIT_I(sbi);
1563 int ret;
1564
1565 down_write(&sit_i->sentry_lock);
1566 ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
1567 NO_CHECK_TYPE, LFS, 0);
1568 up_write(&sit_i->sentry_lock);
1569 return ret;
1570 }
1571
1572 static int do_garbage_collect(struct f2fs_sb_info *sbi,
1573 unsigned int start_segno,
1574 struct gc_inode_list *gc_list, int gc_type,
1575 bool force_migrate)
1576 {
1577 struct page *sum_page;
1578 struct f2fs_summary_block *sum;
1579 struct blk_plug plug;
1580 unsigned int segno = start_segno;
1581 unsigned int end_segno = start_segno + sbi->segs_per_sec;
1582 int seg_freed = 0, migrated = 0;
1583 unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
1584 SUM_TYPE_DATA : SUM_TYPE_NODE;
1585 int submitted = 0;
1586
1587 if (__is_large_section(sbi))
1588 end_segno = rounddown(end_segno, sbi->segs_per_sec);
1589
1590 /*
1591 * zone-capacity can be less than zone-size on zoned devices,
1592 * resulting in fewer usable segments in the zone than expected;
1593 * calculate the end segno of the zone that can be garbage collected
1594 */
1595 if (f2fs_sb_has_blkzoned(sbi))
1596 end_segno -= sbi->segs_per_sec -
1597 f2fs_usable_segs_in_sec(sbi, segno);
1598
1599 sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type);
1600
1601 /* readahead multiple SSA blocks that have contiguous addresses */
1602 if (__is_large_section(sbi))
1603 f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno),
1604 end_segno - segno, META_SSA, true);
1605
1606 /* reference all summary pages */
1607 while (segno < end_segno) {
1608 sum_page = f2fs_get_sum_page(sbi, segno++);
1609 if (IS_ERR(sum_page)) {
1610 int err = PTR_ERR(sum_page);
1611
1612 end_segno = segno - 1;
1613 for (segno = start_segno; segno < end_segno; segno++) {
1614 sum_page = find_get_page(META_MAPPING(sbi),
1615 GET_SUM_BLOCK(sbi, segno));
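/*
 * drop both the reference taken by find_get_page() above and the
 * one still held from f2fs_get_sum_page() earlier.
 */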
1616 f2fs_put_page(sum_page, 0);
1617 f2fs_put_page(sum_page, 0);
1618 }
1619 return err;
1620 }
1621 unlock_page(sum_page);
1622 }
1623
1624 blk_start_plug(&plug);
1625
1626 for (segno = start_segno; segno < end_segno; segno++) {
1627
1628 /* find segment summary of victim */
1629 sum_page = find_get_page(META_MAPPING(sbi),
1630 GET_SUM_BLOCK(sbi, segno));
1631 f2fs_put_page(sum_page, 0);
1632
1633 if (get_valid_blocks(sbi, segno, false) == 0)
1634 goto freed;
1635 if (gc_type == BG_GC && __is_large_section(sbi) &&
1636 migrated >= sbi->migration_granularity)
1637 goto skip;
1638 if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
1639 goto skip;
1640
1641 sum = page_address(sum_page);
1642 if (type != GET_SUM_TYPE((&sum->footer))) {
1643 f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT",
1644 segno, type, GET_SUM_TYPE((&sum->footer)));
1645 set_sbi_flag(sbi, SBI_NEED_FSCK);
1646 f2fs_stop_checkpoint(sbi, false,
1647 STOP_CP_REASON_CORRUPTED_SUMMARY);
1648 goto skip;
1649 }
1650
1651 /*
1652 * this is to avoid deadlock:
1653 * - lock_page(sum_page) - f2fs_replace_block
1654 * - check_valid_map() - down_write(sentry_lock)
1655 * - down_read(sentry_lock) - change_curseg()
1656 * - lock_page(sum_page)
1657 */
1658 if (type == SUM_TYPE_NODE)
1659 submitted += gc_node_segment(sbi, sum->entries, segno,
1660 gc_type);
1661 else
1662 submitted += gc_data_segment(sbi, sum->entries, gc_list,
1663 segno, gc_type,
1664 force_migrate);
1665
1666 stat_inc_seg_count(sbi, type, gc_type);
1667 sbi->gc_reclaimed_segs[sbi->gc_mode]++;
1668 migrated++;
1669
1670 freed:
1671 if (gc_type == FG_GC &&
1672 get_valid_blocks(sbi, segno, false) == 0)
1673 seg_freed++;
1674
1675 if (__is_large_section(sbi) && segno + 1 < end_segno)
1676 sbi->next_victim_seg[gc_type] = segno + 1;
1677 skip:
1678 f2fs_put_page(sum_page, 0);
1679 }
1680
1681 if (submitted)
1682 f2fs_submit_merged_write(sbi,
1683 (type == SUM_TYPE_NODE) ? NODE : DATA);
1684
1685 blk_finish_plug(&plug);
1686
1687 stat_inc_call_count(sbi->stat_info);
1688
1689 return seg_freed;
1690 }
1691
1692 int f2fs_gc(struct f2fs_sb_info *sbi, bool sync,
1693 bool background, bool force, unsigned int segno)
1694 {
1695 int gc_type = sync ? FG_GC : BG_GC;
1696 int sec_freed = 0, seg_freed = 0, total_freed = 0;
1697 int ret = 0;
1698 struct cp_control cpc;
1699 unsigned int init_segno = segno;
1700 struct gc_inode_list gc_list = {
1701 .ilist = LIST_HEAD_INIT(gc_list.ilist),
1702 .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
1703 };
1704 unsigned long long last_skipped = sbi->skipped_atomic_files[FG_GC];
1705 unsigned long long first_skipped;
1706 unsigned int skipped_round = 0, round = 0;
1707
1708 trace_f2fs_gc_begin(sbi->sb, sync, background,
1709 get_pages(sbi, F2FS_DIRTY_NODES),
1710 get_pages(sbi, F2FS_DIRTY_DENTS),
1711 get_pages(sbi, F2FS_DIRTY_IMETA),
1712 free_sections(sbi),
1713 free_segments(sbi),
1714 reserved_segments(sbi),
1715 prefree_segments(sbi));
1716
1717 cpc.reason = __get_cp_reason(sbi);
1718 sbi->skipped_gc_rwsem = 0;
1719 first_skipped = last_skipped;
1720 gc_more:
1721 if (unlikely(!(sbi->sb->s_flags & SB_ACTIVE))) {
1722 ret = -EINVAL;
1723 goto stop;
1724 }
1725 if (unlikely(f2fs_cp_error(sbi))) {
1726 ret = -EIO;
1727 goto stop;
1728 }
1729
1730 if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) {
1731 /*
1732 * For example, if there are many prefree_segments below the given
1733 * threshold, we can free them via a checkpoint. Then, we
1734 * secure free segments, which no longer need FG_GC.
1735 */
1736 if (prefree_segments(sbi) &&
1737 !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
1738 ret = f2fs_write_checkpoint(sbi, &cpc);
1739 if (ret)
1740 goto stop;
1741 }
1742 if (has_not_enough_free_secs(sbi, 0, 0))
1743 gc_type = FG_GC;
1744 }
1745
1746 /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */
1747 if (gc_type == BG_GC && !background) {
1748 ret = -EINVAL;
1749 goto stop;
1750 }
1751 ret = __get_victim(sbi, &segno, gc_type);
1752 if (ret)
1753 goto stop;
1754
1755 seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force);
1756 if (gc_type == FG_GC &&
1757 seg_freed == f2fs_usable_segs_in_sec(sbi, segno))
1758 sec_freed++;
1759 total_freed += seg_freed;
1760
1761 if (gc_type == FG_GC) {
1762 if (sbi->skipped_atomic_files[FG_GC] > last_skipped ||
1763 sbi->skipped_gc_rwsem)
1764 skipped_round++;
1765 last_skipped = sbi->skipped_atomic_files[FG_GC];
1766 round++;
1767 }
1768
1769 if (gc_type == FG_GC && seg_freed)
1770 sbi->cur_victim_sec = NULL_SEGNO;
1771
1772 if (sync)
1773 goto stop;
1774
1775 if (!has_not_enough_free_secs(sbi, sec_freed, 0))
1776 goto stop;
1777
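/*
 * Keep collecting while most FG_GC rounds make progress; if too many
 * rounds were skipped and the skips are dominated by atomic-write files
 * rather than i_gc_rwsem contention, drop all in-memory pages to unblock
 * further GC.
 */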
1778 if (skipped_round <= MAX_SKIP_GC_COUNT || skipped_round * 2 < round) {
1779
1780 /* Write checkpoint to reclaim prefree segments */
1781 if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE &&
1782 prefree_segments(sbi) &&
1783 !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
1784 ret = f2fs_write_checkpoint(sbi, &cpc);
1785 if (ret)
1786 goto stop;
1787 }
1788 segno = NULL_SEGNO;
1789 goto gc_more;
1790 }
1791 if (first_skipped < last_skipped &&
1792 (last_skipped - first_skipped) >
1793 sbi->skipped_gc_rwsem) {
1794 f2fs_drop_inmem_pages_all(sbi, true);
1795 segno = NULL_SEGNO;
1796 goto gc_more;
1797 }
1798 if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))
1799 ret = f2fs_write_checkpoint(sbi, &cpc);
1800 stop:
1801 SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
1802 SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
1803
1804 trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed,
1805 get_pages(sbi, F2FS_DIRTY_NODES),
1806 get_pages(sbi, F2FS_DIRTY_DENTS),
1807 get_pages(sbi, F2FS_DIRTY_IMETA),
1808 free_sections(sbi),
1809 free_segments(sbi),
1810 reserved_segments(sbi),
1811 prefree_segments(sbi));
1812
1813 f2fs_up_write(&sbi->gc_lock);
1814
1815 put_gc_inode(&gc_list);
1816
1817 if (sync && !ret)
1818 ret = sec_freed ? 0 : -EAGAIN;
1819 return ret;
1820 }
1821
1822 int __init f2fs_create_garbage_collection_cache(void)
1823 {
1824 victim_entry_slab = f2fs_kmem_cache_create("f2fs_victim_entry",
1825 sizeof(struct victim_entry));
1826 if (!victim_entry_slab)
1827 return -ENOMEM;
1828 return 0;
1829 }
1830
1831 void f2fs_destroy_garbage_collection_cache(void)
1832 {
1833 kmem_cache_destroy(victim_entry_slab);
1834 }
1835
1836 static void init_atgc_management(struct f2fs_sb_info *sbi)
1837 {
1838 struct atgc_management *am = &sbi->am;
1839
1840 if (test_opt(sbi, ATGC) &&
1841 SIT_I(sbi)->elapsed_time >= DEF_GC_THREAD_AGE_THRESHOLD)
1842 am->atgc_enabled = true;
1843
1844 am->root = RB_ROOT_CACHED;
1845 INIT_LIST_HEAD(&am->victim_list);
1846 am->victim_count = 0;
1847
1848 am->candidate_ratio = DEF_GC_THREAD_CANDIDATE_RATIO;
1849 am->max_candidate_count = DEF_GC_THREAD_MAX_CANDIDATE_COUNT;
1850 am->age_weight = DEF_GC_THREAD_AGE_WEIGHT;
1851 am->age_threshold = DEF_GC_THREAD_AGE_THRESHOLD;
1852 }
1853
1854 void f2fs_build_gc_manager(struct f2fs_sb_info *sbi)
1855 {
1856 DIRTY_I(sbi)->v_ops = &default_v_ops;
1857
1858 sbi->gc_pin_file_threshold = DEF_GC_FAILED_PINNED_FILES;
1859
1860 /* give warm/cold data area from slower device */
1861 if (f2fs_is_multi_device(sbi) && !__is_large_section(sbi))
1862 SIT_I(sbi)->last_victim[ALLOC_NEXT] =
1863 GET_SEGNO(sbi, FDEV(0).end_blk) + 1;
1864
1865 init_atgc_management(sbi);
1866 }
1867
1868 static int free_segment_range(struct f2fs_sb_info *sbi,
1869 unsigned int secs, bool gc_only)
1870 {
1871 unsigned int segno, next_inuse, start, end;
1872 struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
1873 int gc_mode, gc_type;
1874 int err = 0;
1875 int type;
1876
1877 /* Force block allocation for GC */
1878 MAIN_SECS(sbi) -= secs;
1879 start = MAIN_SECS(sbi) * sbi->segs_per_sec;
1880 end = MAIN_SEGS(sbi) - 1;
1881
1882 mutex_lock(&DIRTY_I(sbi)->seglist_lock);
1883 for (gc_mode = 0; gc_mode < MAX_GC_POLICY; gc_mode++)
1884 if (SIT_I(sbi)->last_victim[gc_mode] >= start)
1885 SIT_I(sbi)->last_victim[gc_mode] = 0;
1886
1887 for (gc_type = BG_GC; gc_type <= FG_GC; gc_type++)
1888 if (sbi->next_victim_seg[gc_type] >= start)
1889 sbi->next_victim_seg[gc_type] = NULL_SEGNO;
1890 mutex_unlock(&DIRTY_I(sbi)->seglist_lock);
1891
1892 /* Move out cursegs from the target range */
1893 for (type = CURSEG_HOT_DATA; type < NR_CURSEG_PERSIST_TYPE; type++)
1894 f2fs_allocate_segment_for_resize(sbi, type, start, end);
1895
1896 /* do GC to move out valid blocks in the range */
1897 for (segno = start; segno <= end; segno += sbi->segs_per_sec) {
1898 struct gc_inode_list gc_list = {
1899 .ilist = LIST_HEAD_INIT(gc_list.ilist),
1900 .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS),
1901 };
1902
1903 do_garbage_collect(sbi, segno, &gc_list, FG_GC, true);
1904 put_gc_inode(&gc_list);
1905
1906 if (!gc_only && get_valid_blocks(sbi, segno, true)) {
1907 err = -EAGAIN;
1908 goto out;
1909 }
1910 if (fatal_signal_pending(current)) {
1911 err = -ERESTARTSYS;
1912 goto out;
1913 }
1914 }
1915 if (gc_only)
1916 goto out;
1917
1918 err = f2fs_write_checkpoint(sbi, &cpc);
1919 if (err)
1920 goto out;
1921
1922 next_inuse = find_next_inuse(FREE_I(sbi), end + 1, start);
1923 if (next_inuse <= end) {
1924 f2fs_err(sbi, "segno %u should be free but still inuse!",
1925 next_inuse);
1926 f2fs_bug_on(sbi, 1);
1927 }
1928 out:
1929 MAIN_SECS(sbi) += secs;
1930 return err;
1931 }
1932
1933 static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
1934 {
1935 struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi);
1936 int section_count;
1937 int segment_count;
1938 int segment_count_main;
1939 long long block_count;
1940 int segs = secs * sbi->segs_per_sec;
1941
1942 f2fs_down_write(&sbi->sb_lock);
1943
1944 section_count = le32_to_cpu(raw_sb->section_count);
1945 segment_count = le32_to_cpu(raw_sb->segment_count);
1946 segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
1947 block_count = le64_to_cpu(raw_sb->block_count);
1948
1949 raw_sb->section_count = cpu_to_le32(section_count + secs);
1950 raw_sb->segment_count = cpu_to_le32(segment_count + segs);
1951 raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
1952 raw_sb->block_count = cpu_to_le64(block_count +
1953 (long long)segs * sbi->blocks_per_seg);
1954 if (f2fs_is_multi_device(sbi)) {
1955 int last_dev = sbi->s_ndevs - 1;
1956 int dev_segs =
1957 le32_to_cpu(raw_sb->devs[last_dev].total_segments);
1958
1959 raw_sb->devs[last_dev].total_segments =
1960 cpu_to_le32(dev_segs + segs);
1961 }
1962
1963 f2fs_up_write(&sbi->sb_lock);
1964 }
1965
1966 static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
1967 {
1968 int segs = secs * sbi->segs_per_sec;
1969 long long blks = (long long)segs * sbi->blocks_per_seg;
1970 long long user_block_count =
1971 le64_to_cpu(F2FS_CKPT(sbi)->user_block_count);
1972
1973 SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs;
1974 MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs;
1975 MAIN_SECS(sbi) += secs;
1976 FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs;
1977 FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs;
1978 F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks);
1979
1980 if (f2fs_is_multi_device(sbi)) {
1981 int last_dev = sbi->s_ndevs - 1;
1982
1983 FDEV(last_dev).total_segments =
1984 (int)FDEV(last_dev).total_segments + segs;
1985 FDEV(last_dev).end_blk =
1986 (long long)FDEV(last_dev).end_blk + blks;
1987 #ifdef CONFIG_BLK_DEV_ZONED
1988 FDEV(last_dev).nr_blkz = (int)FDEV(last_dev).nr_blkz +
1989 (int)(blks >> sbi->log_blocks_per_blkz);
1990 #endif
1991 }
1992 }
1993
1994 int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
1995 {
1996 __u64 old_block_count, shrunk_blocks;
1997 struct cp_control cpc = { CP_RESIZE, 0, 0, 0 };
1998 unsigned int secs;
1999 int err = 0;
2000 __u32 rem;
2001
2002 old_block_count = le64_to_cpu(F2FS_RAW_SUPER(sbi)->block_count);
2003 if (block_count > old_block_count)
2004 return -EINVAL;
2005
2006 if (f2fs_is_multi_device(sbi)) {
2007 int last_dev = sbi->s_ndevs - 1;
2008 __u64 last_segs = FDEV(last_dev).total_segments;
2009
2010 if (block_count + last_segs * sbi->blocks_per_seg <=
2011 old_block_count)
2012 return -EINVAL;
2013 }
2014
2015 /* new fs size should align to section size */
2016 div_u64_rem(block_count, BLKS_PER_SEC(sbi), &rem);
2017 if (rem)
2018 return -EINVAL;
2019
2020 if (block_count == old_block_count)
2021 return 0;
2022
2023 if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) {
2024 f2fs_err(sbi, "Should run fsck to repair first.");
2025 return -EFSCORRUPTED;
2026 }
2027
2028 if (test_opt(sbi, DISABLE_CHECKPOINT)) {
2029 f2fs_err(sbi, "Checkpoint should be enabled.");
2030 return -EINVAL;
2031 }
2032
2033 shrunk_blocks = old_block_count - block_count;
2034 secs = div_u64(shrunk_blocks, BLKS_PER_SEC(sbi));
2035
2036 /* stop other GC */
2037 if (!f2fs_down_write_trylock(&sbi->gc_lock))
2038 return -EAGAIN;
2039
2040 /* stop CP to protect MAIN_SEC in free_segment_range */
2041 f2fs_lock_op(sbi);
2042
2043 spin_lock(&sbi->stat_lock);
2044 if (shrunk_blocks + valid_user_blocks(sbi) +
2045 sbi->current_reserved_blocks + sbi->unusable_block_count +
2046 F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count)
2047 err = -ENOSPC;
2048 spin_unlock(&sbi->stat_lock);
2049
2050 if (err)
2051 goto out_unlock;
2052
2053 err = free_segment_range(sbi, secs, true);
2054
2055 out_unlock:
2056 f2fs_unlock_op(sbi);
2057 f2fs_up_write(&sbi->gc_lock);
2058 if (err)
2059 return err;
2060
2061 set_sbi_flag(sbi, SBI_IS_RESIZEFS);
2062
2063 freeze_super(sbi->sb);
2064 f2fs_down_write(&sbi->gc_lock);
2065 f2fs_down_write(&sbi->cp_global_sem);
2066
2067 spin_lock(&sbi->stat_lock);
2068 if (shrunk_blocks + valid_user_blocks(sbi) +
2069 sbi->current_reserved_blocks + sbi->unusable_block_count +
2070 F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count)
2071 err = -ENOSPC;
2072 else
2073 sbi->user_block_count -= shrunk_blocks;
2074 spin_unlock(&sbi->stat_lock);
2075 if (err)
2076 goto out_err;
2077
2078 err = free_segment_range(sbi, secs, false);
2079 if (err)
2080 goto recover_out;
2081
2082 update_sb_metadata(sbi, -secs);
2083
2084 err = f2fs_commit_super(sbi, false);
2085 if (err) {
2086 update_sb_metadata(sbi, secs);
2087 goto recover_out;
2088 }
2089
2090 update_fs_metadata(sbi, -secs);
2091 clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
2092 set_sbi_flag(sbi, SBI_IS_DIRTY);
2093
2094 err = f2fs_write_checkpoint(sbi, &cpc);
2095 if (err) {
2096 update_fs_metadata(sbi, secs);
2097 update_sb_metadata(sbi, secs);
2098 f2fs_commit_super(sbi, false);
2099 }
2100 recover_out:
2101 if (err) {
2102 set_sbi_flag(sbi, SBI_NEED_FSCK);
2103 f2fs_err(sbi, "resize_fs failed, should run fsck to repair!");
2104
2105 spin_lock(&sbi->stat_lock);
2106 sbi->user_block_count += shrunk_blocks;
2107 spin_unlock(&sbi->stat_lock);
2108 }
2109 out_err:
2110 f2fs_up_write(&sbi->cp_global_sem);
2111 f2fs_up_write(&sbi->gc_lock);
2112 thaw_super(sbi->sb);
2113 clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
2114 return err;
2115 }
2116