1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun * Copyright (C) 2008 Oracle. All rights reserved.
4*4882a593Smuzhiyun */
5*4882a593Smuzhiyun
6*4882a593Smuzhiyun #include <linux/sched.h>
7*4882a593Smuzhiyun #include <linux/slab.h>
8*4882a593Smuzhiyun #include <linux/blkdev.h>
9*4882a593Smuzhiyun #include <linux/list_sort.h>
10*4882a593Smuzhiyun #include <linux/iversion.h>
11*4882a593Smuzhiyun #include "misc.h"
12*4882a593Smuzhiyun #include "ctree.h"
13*4882a593Smuzhiyun #include "tree-log.h"
14*4882a593Smuzhiyun #include "disk-io.h"
15*4882a593Smuzhiyun #include "locking.h"
16*4882a593Smuzhiyun #include "print-tree.h"
17*4882a593Smuzhiyun #include "backref.h"
18*4882a593Smuzhiyun #include "compression.h"
19*4882a593Smuzhiyun #include "qgroup.h"
20*4882a593Smuzhiyun #include "inode-map.h"
21*4882a593Smuzhiyun #include "block-group.h"
22*4882a593Smuzhiyun #include "space-info.h"
23*4882a593Smuzhiyun
/*
 * Values accepted by the inode_only argument of btrfs_log_inode():
 *
 * LOG_INODE_ALL     - log everything
 * LOG_INODE_EXISTS  - log just enough to recreate the inode during
 *                     log replay
 *
 * LOG_OTHER_INODE and LOG_OTHER_INODE_ALL are further modes whose meaning
 * is defined by their users later in this file.
 */
enum {
	LOG_INODE_ALL = 0,
	LOG_INODE_EXISTS,
	LOG_OTHER_INODE,
	LOG_OTHER_INODE_ALL,
};
36*4882a593Smuzhiyun
37*4882a593Smuzhiyun /*
38*4882a593Smuzhiyun * directory trouble cases
39*4882a593Smuzhiyun *
40*4882a593Smuzhiyun * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
41*4882a593Smuzhiyun * log, we must force a full commit before doing an fsync of the directory
42*4882a593Smuzhiyun * where the unlink was done.
43*4882a593Smuzhiyun * ---> record transid of last unlink/rename per directory
44*4882a593Smuzhiyun *
45*4882a593Smuzhiyun * mkdir foo/some_dir
46*4882a593Smuzhiyun * normal commit
47*4882a593Smuzhiyun * rename foo/some_dir foo2/some_dir
48*4882a593Smuzhiyun * mkdir foo/some_dir
49*4882a593Smuzhiyun * fsync foo/some_dir/some_file
50*4882a593Smuzhiyun *
51*4882a593Smuzhiyun * The fsync above will unlink the original some_dir without recording
52*4882a593Smuzhiyun * it in its new location (foo2). After a crash, some_dir will be gone
53*4882a593Smuzhiyun * unless the fsync of some_file forces a full commit
54*4882a593Smuzhiyun *
55*4882a593Smuzhiyun * 2) we must log any new names for any file or dir that is in the fsync
56*4882a593Smuzhiyun * log. ---> check inode while renaming/linking.
57*4882a593Smuzhiyun *
58*4882a593Smuzhiyun * 2a) we must log any new names for any file or dir during rename
59*4882a593Smuzhiyun * when the directory they are being removed from was logged.
60*4882a593Smuzhiyun * ---> check inode and old parent dir during rename
61*4882a593Smuzhiyun *
 * 2a is actually the more important variant. Without the extra logging
 * a crash might unlink the old name without recreating the new one
64*4882a593Smuzhiyun *
65*4882a593Smuzhiyun * 3) after a crash, we must go through any directories with a link count
66*4882a593Smuzhiyun * of zero and redo the rm -rf
67*4882a593Smuzhiyun *
68*4882a593Smuzhiyun * mkdir f1/foo
69*4882a593Smuzhiyun * normal commit
70*4882a593Smuzhiyun * rm -rf f1/foo
71*4882a593Smuzhiyun * fsync(f1)
72*4882a593Smuzhiyun *
 * The directory foo was fully removed from the FS, but fsync was never
 * called on foo, only on its parent dir f1. After a crash the rm -rf must
75*4882a593Smuzhiyun * be replayed. This must be able to recurse down the entire
76*4882a593Smuzhiyun * directory tree. The inode link count fixup code takes care of the
77*4882a593Smuzhiyun * ugly details.
78*4882a593Smuzhiyun */
79*4882a593Smuzhiyun
/*
 * Stages of the log tree walk, in the order they are declared:
 *
 * - the first stage (value 0) only pins down the blocks that are found
 * - the second stage (value 1) makes sure every inode found in the log
 *   is created in the subvolume
 * - the last stage deals with directories, links, extents and all the
 *   other fun semantics
 *
 * LOG_WALK_REPLAY_DIR_INDEX sits between inode replay and the final stage.
 */
enum {
	LOG_WALK_PIN_ONLY = 0,
	LOG_WALK_REPLAY_INODES,
	LOG_WALK_REPLAY_DIR_INDEX,
	LOG_WALK_REPLAY_ALL,
};
95*4882a593Smuzhiyun
96*4882a593Smuzhiyun static int btrfs_log_inode(struct btrfs_trans_handle *trans,
97*4882a593Smuzhiyun struct btrfs_root *root, struct btrfs_inode *inode,
98*4882a593Smuzhiyun int inode_only,
99*4882a593Smuzhiyun struct btrfs_log_ctx *ctx);
100*4882a593Smuzhiyun static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
101*4882a593Smuzhiyun struct btrfs_root *root,
102*4882a593Smuzhiyun struct btrfs_path *path, u64 objectid);
103*4882a593Smuzhiyun static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
104*4882a593Smuzhiyun struct btrfs_root *root,
105*4882a593Smuzhiyun struct btrfs_root *log,
106*4882a593Smuzhiyun struct btrfs_path *path,
107*4882a593Smuzhiyun u64 dirid, int del_all);
108*4882a593Smuzhiyun
109*4882a593Smuzhiyun /*
110*4882a593Smuzhiyun * tree logging is a special write ahead log used to make sure that
111*4882a593Smuzhiyun * fsyncs and O_SYNCs can happen without doing full tree commits.
112*4882a593Smuzhiyun *
 * Full tree commits are expensive because they require commonly
 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
116*4882a593Smuzhiyun *
117*4882a593Smuzhiyun * Instead of doing a tree commit on every fsync, we use the
118*4882a593Smuzhiyun * key ranges and transaction ids to find items for a given file or directory
119*4882a593Smuzhiyun * that have changed in this transaction. Those items are copied into
120*4882a593Smuzhiyun * a special tree (one per subvolume root), that tree is written to disk
121*4882a593Smuzhiyun * and then the fsync is considered complete.
122*4882a593Smuzhiyun *
123*4882a593Smuzhiyun * After a crash, items are copied out of the log-tree back into the
124*4882a593Smuzhiyun * subvolume tree. Any file data extents found are recorded in the extent
125*4882a593Smuzhiyun * allocation tree, and the log-tree freed.
126*4882a593Smuzhiyun *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
130*4882a593Smuzhiyun */
131*4882a593Smuzhiyun
132*4882a593Smuzhiyun /*
133*4882a593Smuzhiyun * start a sub transaction and setup the log tree
134*4882a593Smuzhiyun * this increments the log tree writer count to make the people
135*4882a593Smuzhiyun * syncing the tree wait for us to finish
136*4882a593Smuzhiyun */
start_log_trans(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_log_ctx * ctx)137*4882a593Smuzhiyun static int start_log_trans(struct btrfs_trans_handle *trans,
138*4882a593Smuzhiyun struct btrfs_root *root,
139*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
140*4882a593Smuzhiyun {
141*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = root->fs_info;
142*4882a593Smuzhiyun int ret = 0;
143*4882a593Smuzhiyun
144*4882a593Smuzhiyun mutex_lock(&root->log_mutex);
145*4882a593Smuzhiyun
146*4882a593Smuzhiyun if (root->log_root) {
147*4882a593Smuzhiyun if (btrfs_need_log_full_commit(trans)) {
148*4882a593Smuzhiyun ret = -EAGAIN;
149*4882a593Smuzhiyun goto out;
150*4882a593Smuzhiyun }
151*4882a593Smuzhiyun
152*4882a593Smuzhiyun if (!root->log_start_pid) {
153*4882a593Smuzhiyun clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
154*4882a593Smuzhiyun root->log_start_pid = current->pid;
155*4882a593Smuzhiyun } else if (root->log_start_pid != current->pid) {
156*4882a593Smuzhiyun set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
157*4882a593Smuzhiyun }
158*4882a593Smuzhiyun } else {
159*4882a593Smuzhiyun mutex_lock(&fs_info->tree_log_mutex);
160*4882a593Smuzhiyun if (!fs_info->log_root_tree)
161*4882a593Smuzhiyun ret = btrfs_init_log_root_tree(trans, fs_info);
162*4882a593Smuzhiyun mutex_unlock(&fs_info->tree_log_mutex);
163*4882a593Smuzhiyun if (ret)
164*4882a593Smuzhiyun goto out;
165*4882a593Smuzhiyun
166*4882a593Smuzhiyun ret = btrfs_add_log_tree(trans, root);
167*4882a593Smuzhiyun if (ret)
168*4882a593Smuzhiyun goto out;
169*4882a593Smuzhiyun
170*4882a593Smuzhiyun set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
171*4882a593Smuzhiyun clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
172*4882a593Smuzhiyun root->log_start_pid = current->pid;
173*4882a593Smuzhiyun }
174*4882a593Smuzhiyun
175*4882a593Smuzhiyun atomic_inc(&root->log_batch);
176*4882a593Smuzhiyun atomic_inc(&root->log_writers);
177*4882a593Smuzhiyun if (ctx && !ctx->logging_new_name) {
178*4882a593Smuzhiyun int index = root->log_transid % 2;
179*4882a593Smuzhiyun list_add_tail(&ctx->list, &root->log_ctxs[index]);
180*4882a593Smuzhiyun ctx->log_transid = root->log_transid;
181*4882a593Smuzhiyun }
182*4882a593Smuzhiyun
183*4882a593Smuzhiyun out:
184*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
185*4882a593Smuzhiyun return ret;
186*4882a593Smuzhiyun }
187*4882a593Smuzhiyun
188*4882a593Smuzhiyun /*
189*4882a593Smuzhiyun * returns 0 if there was a log transaction running and we were able
190*4882a593Smuzhiyun * to join, or returns -ENOENT if there were not transactions
191*4882a593Smuzhiyun * in progress
192*4882a593Smuzhiyun */
join_running_log_trans(struct btrfs_root * root)193*4882a593Smuzhiyun static int join_running_log_trans(struct btrfs_root *root)
194*4882a593Smuzhiyun {
195*4882a593Smuzhiyun int ret = -ENOENT;
196*4882a593Smuzhiyun
197*4882a593Smuzhiyun if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
198*4882a593Smuzhiyun return ret;
199*4882a593Smuzhiyun
200*4882a593Smuzhiyun mutex_lock(&root->log_mutex);
201*4882a593Smuzhiyun if (root->log_root) {
202*4882a593Smuzhiyun ret = 0;
203*4882a593Smuzhiyun atomic_inc(&root->log_writers);
204*4882a593Smuzhiyun }
205*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
206*4882a593Smuzhiyun return ret;
207*4882a593Smuzhiyun }
208*4882a593Smuzhiyun
209*4882a593Smuzhiyun /*
210*4882a593Smuzhiyun * This either makes the current running log transaction wait
211*4882a593Smuzhiyun * until you call btrfs_end_log_trans() or it makes any future
212*4882a593Smuzhiyun * log transactions wait until you call btrfs_end_log_trans()
213*4882a593Smuzhiyun */
btrfs_pin_log_trans(struct btrfs_root * root)214*4882a593Smuzhiyun void btrfs_pin_log_trans(struct btrfs_root *root)
215*4882a593Smuzhiyun {
216*4882a593Smuzhiyun atomic_inc(&root->log_writers);
217*4882a593Smuzhiyun }
218*4882a593Smuzhiyun
219*4882a593Smuzhiyun /*
220*4882a593Smuzhiyun * indicate we're done making changes to the log tree
221*4882a593Smuzhiyun * and wake up anyone waiting to do a sync
222*4882a593Smuzhiyun */
btrfs_end_log_trans(struct btrfs_root * root)223*4882a593Smuzhiyun void btrfs_end_log_trans(struct btrfs_root *root)
224*4882a593Smuzhiyun {
225*4882a593Smuzhiyun if (atomic_dec_and_test(&root->log_writers)) {
226*4882a593Smuzhiyun /* atomic_dec_and_test implies a barrier */
227*4882a593Smuzhiyun cond_wake_up_nomb(&root->log_writer_wait);
228*4882a593Smuzhiyun }
229*4882a593Smuzhiyun }
230*4882a593Smuzhiyun
btrfs_write_tree_block(struct extent_buffer * buf)231*4882a593Smuzhiyun static int btrfs_write_tree_block(struct extent_buffer *buf)
232*4882a593Smuzhiyun {
233*4882a593Smuzhiyun return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
234*4882a593Smuzhiyun buf->start + buf->len - 1);
235*4882a593Smuzhiyun }
236*4882a593Smuzhiyun
btrfs_wait_tree_block_writeback(struct extent_buffer * buf)237*4882a593Smuzhiyun static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
238*4882a593Smuzhiyun {
239*4882a593Smuzhiyun filemap_fdatawait_range(buf->pages[0]->mapping,
240*4882a593Smuzhiyun buf->start, buf->start + buf->len - 1);
241*4882a593Smuzhiyun }
242*4882a593Smuzhiyun
243*4882a593Smuzhiyun /*
244*4882a593Smuzhiyun * the walk control struct is used to pass state down the chain when
245*4882a593Smuzhiyun * processing the log tree. The stage field tells us which part
246*4882a593Smuzhiyun * of the log tree processing we are currently doing. The others
247*4882a593Smuzhiyun * are state fields used for that specific part
248*4882a593Smuzhiyun */
249*4882a593Smuzhiyun struct walk_control {
250*4882a593Smuzhiyun /* should we free the extent on disk when done? This is used
251*4882a593Smuzhiyun * at transaction commit time while freeing a log tree
252*4882a593Smuzhiyun */
253*4882a593Smuzhiyun int free;
254*4882a593Smuzhiyun
255*4882a593Smuzhiyun /* should we write out the extent buffer? This is used
256*4882a593Smuzhiyun * while flushing the log tree to disk during a sync
257*4882a593Smuzhiyun */
258*4882a593Smuzhiyun int write;
259*4882a593Smuzhiyun
260*4882a593Smuzhiyun /* should we wait for the extent buffer io to finish? Also used
261*4882a593Smuzhiyun * while flushing the log tree to disk for a sync
262*4882a593Smuzhiyun */
263*4882a593Smuzhiyun int wait;
264*4882a593Smuzhiyun
265*4882a593Smuzhiyun /* pin only walk, we record which extents on disk belong to the
266*4882a593Smuzhiyun * log trees
267*4882a593Smuzhiyun */
268*4882a593Smuzhiyun int pin;
269*4882a593Smuzhiyun
270*4882a593Smuzhiyun /* what stage of the replay code we're currently in */
271*4882a593Smuzhiyun int stage;
272*4882a593Smuzhiyun
273*4882a593Smuzhiyun /*
274*4882a593Smuzhiyun * Ignore any items from the inode currently being processed. Needs
275*4882a593Smuzhiyun * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
276*4882a593Smuzhiyun * the LOG_WALK_REPLAY_INODES stage.
277*4882a593Smuzhiyun */
278*4882a593Smuzhiyun bool ignore_cur_inode;
279*4882a593Smuzhiyun
280*4882a593Smuzhiyun /* the root we are currently replaying */
281*4882a593Smuzhiyun struct btrfs_root *replay_dest;
282*4882a593Smuzhiyun
283*4882a593Smuzhiyun /* the trans handle for the current replay */
284*4882a593Smuzhiyun struct btrfs_trans_handle *trans;
285*4882a593Smuzhiyun
286*4882a593Smuzhiyun /* the function that gets used to process blocks we find in the
287*4882a593Smuzhiyun * tree. Note the extent_buffer might not be up to date when it is
288*4882a593Smuzhiyun * passed in, and it must be checked or read if you need the data
289*4882a593Smuzhiyun * inside it
290*4882a593Smuzhiyun */
291*4882a593Smuzhiyun int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
292*4882a593Smuzhiyun struct walk_control *wc, u64 gen, int level);
293*4882a593Smuzhiyun };
294*4882a593Smuzhiyun
295*4882a593Smuzhiyun /*
296*4882a593Smuzhiyun * process_func used to pin down extents, write them or wait on them
297*4882a593Smuzhiyun */
process_one_buffer(struct btrfs_root * log,struct extent_buffer * eb,struct walk_control * wc,u64 gen,int level)298*4882a593Smuzhiyun static int process_one_buffer(struct btrfs_root *log,
299*4882a593Smuzhiyun struct extent_buffer *eb,
300*4882a593Smuzhiyun struct walk_control *wc, u64 gen, int level)
301*4882a593Smuzhiyun {
302*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = log->fs_info;
303*4882a593Smuzhiyun int ret = 0;
304*4882a593Smuzhiyun
305*4882a593Smuzhiyun /*
306*4882a593Smuzhiyun * If this fs is mixed then we need to be able to process the leaves to
307*4882a593Smuzhiyun * pin down any logged extents, so we have to read the block.
308*4882a593Smuzhiyun */
309*4882a593Smuzhiyun if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
310*4882a593Smuzhiyun ret = btrfs_read_buffer(eb, gen, level, NULL);
311*4882a593Smuzhiyun if (ret)
312*4882a593Smuzhiyun return ret;
313*4882a593Smuzhiyun }
314*4882a593Smuzhiyun
315*4882a593Smuzhiyun if (wc->pin)
316*4882a593Smuzhiyun ret = btrfs_pin_extent_for_log_replay(wc->trans, eb->start,
317*4882a593Smuzhiyun eb->len);
318*4882a593Smuzhiyun
319*4882a593Smuzhiyun if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
320*4882a593Smuzhiyun if (wc->pin && btrfs_header_level(eb) == 0)
321*4882a593Smuzhiyun ret = btrfs_exclude_logged_extents(eb);
322*4882a593Smuzhiyun if (wc->write)
323*4882a593Smuzhiyun btrfs_write_tree_block(eb);
324*4882a593Smuzhiyun if (wc->wait)
325*4882a593Smuzhiyun btrfs_wait_tree_block_writeback(eb);
326*4882a593Smuzhiyun }
327*4882a593Smuzhiyun return ret;
328*4882a593Smuzhiyun }
329*4882a593Smuzhiyun
330*4882a593Smuzhiyun /*
331*4882a593Smuzhiyun * Item overwrite used by replay and tree logging. eb, slot and key all refer
332*4882a593Smuzhiyun * to the src data we are copying out.
333*4882a593Smuzhiyun *
334*4882a593Smuzhiyun * root is the tree we are copying into, and path is a scratch
335*4882a593Smuzhiyun * path for use in this function (it should be released on entry and
336*4882a593Smuzhiyun * will be released on exit).
337*4882a593Smuzhiyun *
338*4882a593Smuzhiyun * If the key is already in the destination tree the existing item is
339*4882a593Smuzhiyun * overwritten. If the existing item isn't big enough, it is extended.
340*4882a593Smuzhiyun * If it is too large, it is truncated.
341*4882a593Smuzhiyun *
342*4882a593Smuzhiyun * If the key isn't in the destination yet, a new item is inserted.
343*4882a593Smuzhiyun */
overwrite_item(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct extent_buffer * eb,int slot,struct btrfs_key * key)344*4882a593Smuzhiyun static noinline int overwrite_item(struct btrfs_trans_handle *trans,
345*4882a593Smuzhiyun struct btrfs_root *root,
346*4882a593Smuzhiyun struct btrfs_path *path,
347*4882a593Smuzhiyun struct extent_buffer *eb, int slot,
348*4882a593Smuzhiyun struct btrfs_key *key)
349*4882a593Smuzhiyun {
350*4882a593Smuzhiyun int ret;
351*4882a593Smuzhiyun u32 item_size;
352*4882a593Smuzhiyun u64 saved_i_size = 0;
353*4882a593Smuzhiyun int save_old_i_size = 0;
354*4882a593Smuzhiyun unsigned long src_ptr;
355*4882a593Smuzhiyun unsigned long dst_ptr;
356*4882a593Smuzhiyun int overwrite_root = 0;
357*4882a593Smuzhiyun bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
358*4882a593Smuzhiyun
359*4882a593Smuzhiyun if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
360*4882a593Smuzhiyun overwrite_root = 1;
361*4882a593Smuzhiyun
362*4882a593Smuzhiyun item_size = btrfs_item_size_nr(eb, slot);
363*4882a593Smuzhiyun src_ptr = btrfs_item_ptr_offset(eb, slot);
364*4882a593Smuzhiyun
365*4882a593Smuzhiyun /* look for the key in the destination tree */
366*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
367*4882a593Smuzhiyun if (ret < 0)
368*4882a593Smuzhiyun return ret;
369*4882a593Smuzhiyun
370*4882a593Smuzhiyun if (ret == 0) {
371*4882a593Smuzhiyun char *src_copy;
372*4882a593Smuzhiyun char *dst_copy;
373*4882a593Smuzhiyun u32 dst_size = btrfs_item_size_nr(path->nodes[0],
374*4882a593Smuzhiyun path->slots[0]);
375*4882a593Smuzhiyun if (dst_size != item_size)
376*4882a593Smuzhiyun goto insert;
377*4882a593Smuzhiyun
378*4882a593Smuzhiyun if (item_size == 0) {
379*4882a593Smuzhiyun btrfs_release_path(path);
380*4882a593Smuzhiyun return 0;
381*4882a593Smuzhiyun }
382*4882a593Smuzhiyun dst_copy = kmalloc(item_size, GFP_NOFS);
383*4882a593Smuzhiyun src_copy = kmalloc(item_size, GFP_NOFS);
384*4882a593Smuzhiyun if (!dst_copy || !src_copy) {
385*4882a593Smuzhiyun btrfs_release_path(path);
386*4882a593Smuzhiyun kfree(dst_copy);
387*4882a593Smuzhiyun kfree(src_copy);
388*4882a593Smuzhiyun return -ENOMEM;
389*4882a593Smuzhiyun }
390*4882a593Smuzhiyun
391*4882a593Smuzhiyun read_extent_buffer(eb, src_copy, src_ptr, item_size);
392*4882a593Smuzhiyun
393*4882a593Smuzhiyun dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
394*4882a593Smuzhiyun read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
395*4882a593Smuzhiyun item_size);
396*4882a593Smuzhiyun ret = memcmp(dst_copy, src_copy, item_size);
397*4882a593Smuzhiyun
398*4882a593Smuzhiyun kfree(dst_copy);
399*4882a593Smuzhiyun kfree(src_copy);
400*4882a593Smuzhiyun /*
401*4882a593Smuzhiyun * they have the same contents, just return, this saves
402*4882a593Smuzhiyun * us from cowing blocks in the destination tree and doing
403*4882a593Smuzhiyun * extra writes that may not have been done by a previous
404*4882a593Smuzhiyun * sync
405*4882a593Smuzhiyun */
406*4882a593Smuzhiyun if (ret == 0) {
407*4882a593Smuzhiyun btrfs_release_path(path);
408*4882a593Smuzhiyun return 0;
409*4882a593Smuzhiyun }
410*4882a593Smuzhiyun
411*4882a593Smuzhiyun /*
412*4882a593Smuzhiyun * We need to load the old nbytes into the inode so when we
413*4882a593Smuzhiyun * replay the extents we've logged we get the right nbytes.
414*4882a593Smuzhiyun */
415*4882a593Smuzhiyun if (inode_item) {
416*4882a593Smuzhiyun struct btrfs_inode_item *item;
417*4882a593Smuzhiyun u64 nbytes;
418*4882a593Smuzhiyun u32 mode;
419*4882a593Smuzhiyun
420*4882a593Smuzhiyun item = btrfs_item_ptr(path->nodes[0], path->slots[0],
421*4882a593Smuzhiyun struct btrfs_inode_item);
422*4882a593Smuzhiyun nbytes = btrfs_inode_nbytes(path->nodes[0], item);
423*4882a593Smuzhiyun item = btrfs_item_ptr(eb, slot,
424*4882a593Smuzhiyun struct btrfs_inode_item);
425*4882a593Smuzhiyun btrfs_set_inode_nbytes(eb, item, nbytes);
426*4882a593Smuzhiyun
427*4882a593Smuzhiyun /*
428*4882a593Smuzhiyun * If this is a directory we need to reset the i_size to
429*4882a593Smuzhiyun * 0 so that we can set it up properly when replaying
430*4882a593Smuzhiyun * the rest of the items in this log.
431*4882a593Smuzhiyun */
432*4882a593Smuzhiyun mode = btrfs_inode_mode(eb, item);
433*4882a593Smuzhiyun if (S_ISDIR(mode))
434*4882a593Smuzhiyun btrfs_set_inode_size(eb, item, 0);
435*4882a593Smuzhiyun }
436*4882a593Smuzhiyun } else if (inode_item) {
437*4882a593Smuzhiyun struct btrfs_inode_item *item;
438*4882a593Smuzhiyun u32 mode;
439*4882a593Smuzhiyun
440*4882a593Smuzhiyun /*
441*4882a593Smuzhiyun * New inode, set nbytes to 0 so that the nbytes comes out
442*4882a593Smuzhiyun * properly when we replay the extents.
443*4882a593Smuzhiyun */
444*4882a593Smuzhiyun item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
445*4882a593Smuzhiyun btrfs_set_inode_nbytes(eb, item, 0);
446*4882a593Smuzhiyun
447*4882a593Smuzhiyun /*
448*4882a593Smuzhiyun * If this is a directory we need to reset the i_size to 0 so
449*4882a593Smuzhiyun * that we can set it up properly when replaying the rest of
450*4882a593Smuzhiyun * the items in this log.
451*4882a593Smuzhiyun */
452*4882a593Smuzhiyun mode = btrfs_inode_mode(eb, item);
453*4882a593Smuzhiyun if (S_ISDIR(mode))
454*4882a593Smuzhiyun btrfs_set_inode_size(eb, item, 0);
455*4882a593Smuzhiyun }
456*4882a593Smuzhiyun insert:
457*4882a593Smuzhiyun btrfs_release_path(path);
458*4882a593Smuzhiyun /* try to insert the key into the destination tree */
459*4882a593Smuzhiyun path->skip_release_on_error = 1;
460*4882a593Smuzhiyun ret = btrfs_insert_empty_item(trans, root, path,
461*4882a593Smuzhiyun key, item_size);
462*4882a593Smuzhiyun path->skip_release_on_error = 0;
463*4882a593Smuzhiyun
464*4882a593Smuzhiyun /* make sure any existing item is the correct size */
465*4882a593Smuzhiyun if (ret == -EEXIST || ret == -EOVERFLOW) {
466*4882a593Smuzhiyun u32 found_size;
467*4882a593Smuzhiyun found_size = btrfs_item_size_nr(path->nodes[0],
468*4882a593Smuzhiyun path->slots[0]);
469*4882a593Smuzhiyun if (found_size > item_size)
470*4882a593Smuzhiyun btrfs_truncate_item(path, item_size, 1);
471*4882a593Smuzhiyun else if (found_size < item_size)
472*4882a593Smuzhiyun btrfs_extend_item(path, item_size - found_size);
473*4882a593Smuzhiyun } else if (ret) {
474*4882a593Smuzhiyun return ret;
475*4882a593Smuzhiyun }
476*4882a593Smuzhiyun dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
477*4882a593Smuzhiyun path->slots[0]);
478*4882a593Smuzhiyun
479*4882a593Smuzhiyun /* don't overwrite an existing inode if the generation number
480*4882a593Smuzhiyun * was logged as zero. This is done when the tree logging code
481*4882a593Smuzhiyun * is just logging an inode to make sure it exists after recovery.
482*4882a593Smuzhiyun *
483*4882a593Smuzhiyun * Also, don't overwrite i_size on directories during replay.
484*4882a593Smuzhiyun * log replay inserts and removes directory items based on the
485*4882a593Smuzhiyun * state of the tree found in the subvolume, and i_size is modified
486*4882a593Smuzhiyun * as it goes
487*4882a593Smuzhiyun */
488*4882a593Smuzhiyun if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
489*4882a593Smuzhiyun struct btrfs_inode_item *src_item;
490*4882a593Smuzhiyun struct btrfs_inode_item *dst_item;
491*4882a593Smuzhiyun
492*4882a593Smuzhiyun src_item = (struct btrfs_inode_item *)src_ptr;
493*4882a593Smuzhiyun dst_item = (struct btrfs_inode_item *)dst_ptr;
494*4882a593Smuzhiyun
495*4882a593Smuzhiyun if (btrfs_inode_generation(eb, src_item) == 0) {
496*4882a593Smuzhiyun struct extent_buffer *dst_eb = path->nodes[0];
497*4882a593Smuzhiyun const u64 ino_size = btrfs_inode_size(eb, src_item);
498*4882a593Smuzhiyun
499*4882a593Smuzhiyun /*
500*4882a593Smuzhiyun * For regular files an ino_size == 0 is used only when
501*4882a593Smuzhiyun * logging that an inode exists, as part of a directory
502*4882a593Smuzhiyun * fsync, and the inode wasn't fsynced before. In this
503*4882a593Smuzhiyun * case don't set the size of the inode in the fs/subvol
504*4882a593Smuzhiyun * tree, otherwise we would be throwing valid data away.
505*4882a593Smuzhiyun */
506*4882a593Smuzhiyun if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
507*4882a593Smuzhiyun S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
508*4882a593Smuzhiyun ino_size != 0)
509*4882a593Smuzhiyun btrfs_set_inode_size(dst_eb, dst_item, ino_size);
510*4882a593Smuzhiyun goto no_copy;
511*4882a593Smuzhiyun }
512*4882a593Smuzhiyun
513*4882a593Smuzhiyun if (overwrite_root &&
514*4882a593Smuzhiyun S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
515*4882a593Smuzhiyun S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
516*4882a593Smuzhiyun save_old_i_size = 1;
517*4882a593Smuzhiyun saved_i_size = btrfs_inode_size(path->nodes[0],
518*4882a593Smuzhiyun dst_item);
519*4882a593Smuzhiyun }
520*4882a593Smuzhiyun }
521*4882a593Smuzhiyun
522*4882a593Smuzhiyun copy_extent_buffer(path->nodes[0], eb, dst_ptr,
523*4882a593Smuzhiyun src_ptr, item_size);
524*4882a593Smuzhiyun
525*4882a593Smuzhiyun if (save_old_i_size) {
526*4882a593Smuzhiyun struct btrfs_inode_item *dst_item;
527*4882a593Smuzhiyun dst_item = (struct btrfs_inode_item *)dst_ptr;
528*4882a593Smuzhiyun btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
529*4882a593Smuzhiyun }
530*4882a593Smuzhiyun
531*4882a593Smuzhiyun /* make sure the generation is filled in */
532*4882a593Smuzhiyun if (key->type == BTRFS_INODE_ITEM_KEY) {
533*4882a593Smuzhiyun struct btrfs_inode_item *dst_item;
534*4882a593Smuzhiyun dst_item = (struct btrfs_inode_item *)dst_ptr;
535*4882a593Smuzhiyun if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
536*4882a593Smuzhiyun btrfs_set_inode_generation(path->nodes[0], dst_item,
537*4882a593Smuzhiyun trans->transid);
538*4882a593Smuzhiyun }
539*4882a593Smuzhiyun }
540*4882a593Smuzhiyun no_copy:
541*4882a593Smuzhiyun btrfs_mark_buffer_dirty(path->nodes[0]);
542*4882a593Smuzhiyun btrfs_release_path(path);
543*4882a593Smuzhiyun return 0;
544*4882a593Smuzhiyun }
545*4882a593Smuzhiyun
546*4882a593Smuzhiyun /*
547*4882a593Smuzhiyun * simple helper to read an inode off the disk from a given root
548*4882a593Smuzhiyun * This can only be called for subvolume roots and not for the log
549*4882a593Smuzhiyun */
read_one_inode(struct btrfs_root * root,u64 objectid)550*4882a593Smuzhiyun static noinline struct inode *read_one_inode(struct btrfs_root *root,
551*4882a593Smuzhiyun u64 objectid)
552*4882a593Smuzhiyun {
553*4882a593Smuzhiyun struct inode *inode;
554*4882a593Smuzhiyun
555*4882a593Smuzhiyun inode = btrfs_iget(root->fs_info->sb, objectid, root);
556*4882a593Smuzhiyun if (IS_ERR(inode))
557*4882a593Smuzhiyun inode = NULL;
558*4882a593Smuzhiyun return inode;
559*4882a593Smuzhiyun }
560*4882a593Smuzhiyun
561*4882a593Smuzhiyun /* replays a single extent in 'eb' at 'slot' with 'key' into the
562*4882a593Smuzhiyun * subvolume 'root'. path is released on entry and should be released
563*4882a593Smuzhiyun * on exit.
564*4882a593Smuzhiyun *
565*4882a593Smuzhiyun * extents in the log tree have not been allocated out of the extent
566*4882a593Smuzhiyun * tree yet. So, this completes the allocation, taking a reference
567*4882a593Smuzhiyun * as required if the extent already exists or creating a new extent
568*4882a593Smuzhiyun * if it isn't in the extent allocation tree yet.
569*4882a593Smuzhiyun *
570*4882a593Smuzhiyun * The extent is inserted into the file, dropping any existing extents
571*4882a593Smuzhiyun * from the file that overlap the new one.
572*4882a593Smuzhiyun */
replay_one_extent(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct extent_buffer * eb,int slot,struct btrfs_key * key)573*4882a593Smuzhiyun static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
574*4882a593Smuzhiyun struct btrfs_root *root,
575*4882a593Smuzhiyun struct btrfs_path *path,
576*4882a593Smuzhiyun struct extent_buffer *eb, int slot,
577*4882a593Smuzhiyun struct btrfs_key *key)
578*4882a593Smuzhiyun {
579*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = root->fs_info;
580*4882a593Smuzhiyun int found_type;
581*4882a593Smuzhiyun u64 extent_end;
582*4882a593Smuzhiyun u64 start = key->offset;
583*4882a593Smuzhiyun u64 nbytes = 0;
584*4882a593Smuzhiyun struct btrfs_file_extent_item *item;
585*4882a593Smuzhiyun struct inode *inode = NULL;
586*4882a593Smuzhiyun unsigned long size;
587*4882a593Smuzhiyun int ret = 0;
588*4882a593Smuzhiyun
589*4882a593Smuzhiyun item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
590*4882a593Smuzhiyun found_type = btrfs_file_extent_type(eb, item);
591*4882a593Smuzhiyun
592*4882a593Smuzhiyun if (found_type == BTRFS_FILE_EXTENT_REG ||
593*4882a593Smuzhiyun found_type == BTRFS_FILE_EXTENT_PREALLOC) {
594*4882a593Smuzhiyun nbytes = btrfs_file_extent_num_bytes(eb, item);
595*4882a593Smuzhiyun extent_end = start + nbytes;
596*4882a593Smuzhiyun
597*4882a593Smuzhiyun /*
598*4882a593Smuzhiyun * We don't add to the inodes nbytes if we are prealloc or a
599*4882a593Smuzhiyun * hole.
600*4882a593Smuzhiyun */
601*4882a593Smuzhiyun if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
602*4882a593Smuzhiyun nbytes = 0;
603*4882a593Smuzhiyun } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
604*4882a593Smuzhiyun size = btrfs_file_extent_ram_bytes(eb, item);
605*4882a593Smuzhiyun nbytes = btrfs_file_extent_ram_bytes(eb, item);
606*4882a593Smuzhiyun extent_end = ALIGN(start + size,
607*4882a593Smuzhiyun fs_info->sectorsize);
608*4882a593Smuzhiyun } else {
609*4882a593Smuzhiyun ret = 0;
610*4882a593Smuzhiyun goto out;
611*4882a593Smuzhiyun }
612*4882a593Smuzhiyun
613*4882a593Smuzhiyun inode = read_one_inode(root, key->objectid);
614*4882a593Smuzhiyun if (!inode) {
615*4882a593Smuzhiyun ret = -EIO;
616*4882a593Smuzhiyun goto out;
617*4882a593Smuzhiyun }
618*4882a593Smuzhiyun
619*4882a593Smuzhiyun /*
620*4882a593Smuzhiyun * first check to see if we already have this extent in the
621*4882a593Smuzhiyun * file. This must be done before the btrfs_drop_extents run
622*4882a593Smuzhiyun * so we don't try to drop this extent.
623*4882a593Smuzhiyun */
624*4882a593Smuzhiyun ret = btrfs_lookup_file_extent(trans, root, path,
625*4882a593Smuzhiyun btrfs_ino(BTRFS_I(inode)), start, 0);
626*4882a593Smuzhiyun
627*4882a593Smuzhiyun if (ret == 0 &&
628*4882a593Smuzhiyun (found_type == BTRFS_FILE_EXTENT_REG ||
629*4882a593Smuzhiyun found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
630*4882a593Smuzhiyun struct btrfs_file_extent_item cmp1;
631*4882a593Smuzhiyun struct btrfs_file_extent_item cmp2;
632*4882a593Smuzhiyun struct btrfs_file_extent_item *existing;
633*4882a593Smuzhiyun struct extent_buffer *leaf;
634*4882a593Smuzhiyun
635*4882a593Smuzhiyun leaf = path->nodes[0];
636*4882a593Smuzhiyun existing = btrfs_item_ptr(leaf, path->slots[0],
637*4882a593Smuzhiyun struct btrfs_file_extent_item);
638*4882a593Smuzhiyun
639*4882a593Smuzhiyun read_extent_buffer(eb, &cmp1, (unsigned long)item,
640*4882a593Smuzhiyun sizeof(cmp1));
641*4882a593Smuzhiyun read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
642*4882a593Smuzhiyun sizeof(cmp2));
643*4882a593Smuzhiyun
644*4882a593Smuzhiyun /*
645*4882a593Smuzhiyun * we already have a pointer to this exact extent,
646*4882a593Smuzhiyun * we don't have to do anything
647*4882a593Smuzhiyun */
648*4882a593Smuzhiyun if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
649*4882a593Smuzhiyun btrfs_release_path(path);
650*4882a593Smuzhiyun goto out;
651*4882a593Smuzhiyun }
652*4882a593Smuzhiyun }
653*4882a593Smuzhiyun btrfs_release_path(path);
654*4882a593Smuzhiyun
655*4882a593Smuzhiyun /* drop any overlapping extents */
656*4882a593Smuzhiyun ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
657*4882a593Smuzhiyun if (ret)
658*4882a593Smuzhiyun goto out;
659*4882a593Smuzhiyun
660*4882a593Smuzhiyun if (found_type == BTRFS_FILE_EXTENT_REG ||
661*4882a593Smuzhiyun found_type == BTRFS_FILE_EXTENT_PREALLOC) {
662*4882a593Smuzhiyun u64 offset;
663*4882a593Smuzhiyun unsigned long dest_offset;
664*4882a593Smuzhiyun struct btrfs_key ins;
665*4882a593Smuzhiyun
666*4882a593Smuzhiyun if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
667*4882a593Smuzhiyun btrfs_fs_incompat(fs_info, NO_HOLES))
668*4882a593Smuzhiyun goto update_inode;
669*4882a593Smuzhiyun
670*4882a593Smuzhiyun ret = btrfs_insert_empty_item(trans, root, path, key,
671*4882a593Smuzhiyun sizeof(*item));
672*4882a593Smuzhiyun if (ret)
673*4882a593Smuzhiyun goto out;
674*4882a593Smuzhiyun dest_offset = btrfs_item_ptr_offset(path->nodes[0],
675*4882a593Smuzhiyun path->slots[0]);
676*4882a593Smuzhiyun copy_extent_buffer(path->nodes[0], eb, dest_offset,
677*4882a593Smuzhiyun (unsigned long)item, sizeof(*item));
678*4882a593Smuzhiyun
679*4882a593Smuzhiyun ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
680*4882a593Smuzhiyun ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
681*4882a593Smuzhiyun ins.type = BTRFS_EXTENT_ITEM_KEY;
682*4882a593Smuzhiyun offset = key->offset - btrfs_file_extent_offset(eb, item);
683*4882a593Smuzhiyun
684*4882a593Smuzhiyun /*
685*4882a593Smuzhiyun * Manually record dirty extent, as here we did a shallow
686*4882a593Smuzhiyun * file extent item copy and skip normal backref update,
687*4882a593Smuzhiyun * but modifying extent tree all by ourselves.
688*4882a593Smuzhiyun * So need to manually record dirty extent for qgroup,
689*4882a593Smuzhiyun * as the owner of the file extent changed from log tree
690*4882a593Smuzhiyun * (doesn't affect qgroup) to fs/file tree(affects qgroup)
691*4882a593Smuzhiyun */
692*4882a593Smuzhiyun ret = btrfs_qgroup_trace_extent(trans,
693*4882a593Smuzhiyun btrfs_file_extent_disk_bytenr(eb, item),
694*4882a593Smuzhiyun btrfs_file_extent_disk_num_bytes(eb, item),
695*4882a593Smuzhiyun GFP_NOFS);
696*4882a593Smuzhiyun if (ret < 0)
697*4882a593Smuzhiyun goto out;
698*4882a593Smuzhiyun
699*4882a593Smuzhiyun if (ins.objectid > 0) {
700*4882a593Smuzhiyun struct btrfs_ref ref = { 0 };
701*4882a593Smuzhiyun u64 csum_start;
702*4882a593Smuzhiyun u64 csum_end;
703*4882a593Smuzhiyun LIST_HEAD(ordered_sums);
704*4882a593Smuzhiyun
705*4882a593Smuzhiyun /*
706*4882a593Smuzhiyun * is this extent already allocated in the extent
707*4882a593Smuzhiyun * allocation tree? If so, just add a reference
708*4882a593Smuzhiyun */
709*4882a593Smuzhiyun ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
710*4882a593Smuzhiyun ins.offset);
711*4882a593Smuzhiyun if (ret < 0) {
712*4882a593Smuzhiyun goto out;
713*4882a593Smuzhiyun } else if (ret == 0) {
714*4882a593Smuzhiyun btrfs_init_generic_ref(&ref,
715*4882a593Smuzhiyun BTRFS_ADD_DELAYED_REF,
716*4882a593Smuzhiyun ins.objectid, ins.offset, 0);
717*4882a593Smuzhiyun btrfs_init_data_ref(&ref,
718*4882a593Smuzhiyun root->root_key.objectid,
719*4882a593Smuzhiyun key->objectid, offset);
720*4882a593Smuzhiyun ret = btrfs_inc_extent_ref(trans, &ref);
721*4882a593Smuzhiyun if (ret)
722*4882a593Smuzhiyun goto out;
723*4882a593Smuzhiyun } else {
724*4882a593Smuzhiyun /*
725*4882a593Smuzhiyun * insert the extent pointer in the extent
726*4882a593Smuzhiyun * allocation tree
727*4882a593Smuzhiyun */
728*4882a593Smuzhiyun ret = btrfs_alloc_logged_file_extent(trans,
729*4882a593Smuzhiyun root->root_key.objectid,
730*4882a593Smuzhiyun key->objectid, offset, &ins);
731*4882a593Smuzhiyun if (ret)
732*4882a593Smuzhiyun goto out;
733*4882a593Smuzhiyun }
734*4882a593Smuzhiyun btrfs_release_path(path);
735*4882a593Smuzhiyun
736*4882a593Smuzhiyun if (btrfs_file_extent_compression(eb, item)) {
737*4882a593Smuzhiyun csum_start = ins.objectid;
738*4882a593Smuzhiyun csum_end = csum_start + ins.offset;
739*4882a593Smuzhiyun } else {
740*4882a593Smuzhiyun csum_start = ins.objectid +
741*4882a593Smuzhiyun btrfs_file_extent_offset(eb, item);
742*4882a593Smuzhiyun csum_end = csum_start +
743*4882a593Smuzhiyun btrfs_file_extent_num_bytes(eb, item);
744*4882a593Smuzhiyun }
745*4882a593Smuzhiyun
746*4882a593Smuzhiyun ret = btrfs_lookup_csums_range(root->log_root,
747*4882a593Smuzhiyun csum_start, csum_end - 1,
748*4882a593Smuzhiyun &ordered_sums, 0);
749*4882a593Smuzhiyun if (ret)
750*4882a593Smuzhiyun goto out;
751*4882a593Smuzhiyun /*
			 * Now delete all existing csums in the csum root that
753*4882a593Smuzhiyun * cover our range. We do this because we can have an
754*4882a593Smuzhiyun * extent that is completely referenced by one file
755*4882a593Smuzhiyun * extent item and partially referenced by another
756*4882a593Smuzhiyun * file extent item (like after using the clone or
757*4882a593Smuzhiyun * extent_same ioctls). In this case if we end up doing
758*4882a593Smuzhiyun * the replay of the one that partially references the
759*4882a593Smuzhiyun * extent first, and we do not do the csum deletion
760*4882a593Smuzhiyun * below, we can get 2 csum items in the csum tree that
761*4882a593Smuzhiyun * overlap each other. For example, imagine our log has
762*4882a593Smuzhiyun * the two following file extent items:
763*4882a593Smuzhiyun *
764*4882a593Smuzhiyun * key (257 EXTENT_DATA 409600)
765*4882a593Smuzhiyun * extent data disk byte 12845056 nr 102400
766*4882a593Smuzhiyun * extent data offset 20480 nr 20480 ram 102400
767*4882a593Smuzhiyun *
768*4882a593Smuzhiyun * key (257 EXTENT_DATA 819200)
769*4882a593Smuzhiyun * extent data disk byte 12845056 nr 102400
770*4882a593Smuzhiyun * extent data offset 0 nr 102400 ram 102400
771*4882a593Smuzhiyun *
772*4882a593Smuzhiyun * Where the second one fully references the 100K extent
773*4882a593Smuzhiyun * that starts at disk byte 12845056, and the log tree
774*4882a593Smuzhiyun * has a single csum item that covers the entire range
775*4882a593Smuzhiyun * of the extent:
776*4882a593Smuzhiyun *
777*4882a593Smuzhiyun * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
778*4882a593Smuzhiyun *
779*4882a593Smuzhiyun * After the first file extent item is replayed, the
780*4882a593Smuzhiyun * csum tree gets the following csum item:
781*4882a593Smuzhiyun *
782*4882a593Smuzhiyun * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
783*4882a593Smuzhiyun *
784*4882a593Smuzhiyun * Which covers the 20K sub-range starting at offset 20K
785*4882a593Smuzhiyun * of our extent. Now when we replay the second file
786*4882a593Smuzhiyun * extent item, if we do not delete existing csum items
787*4882a593Smuzhiyun * that cover any of its blocks, we end up getting two
788*4882a593Smuzhiyun * csum items in our csum tree that overlap each other:
789*4882a593Smuzhiyun *
790*4882a593Smuzhiyun * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
791*4882a593Smuzhiyun * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
792*4882a593Smuzhiyun *
793*4882a593Smuzhiyun * Which is a problem, because after this anyone trying
			 * to look up the checksum of any block of our
795*4882a593Smuzhiyun * extent starting at an offset of 40K or higher, will
796*4882a593Smuzhiyun * end up looking at the second csum item only, which
797*4882a593Smuzhiyun * does not contain the checksum for any block starting
798*4882a593Smuzhiyun * at offset 40K or higher of our extent.
799*4882a593Smuzhiyun */
800*4882a593Smuzhiyun while (!list_empty(&ordered_sums)) {
801*4882a593Smuzhiyun struct btrfs_ordered_sum *sums;
802*4882a593Smuzhiyun sums = list_entry(ordered_sums.next,
803*4882a593Smuzhiyun struct btrfs_ordered_sum,
804*4882a593Smuzhiyun list);
805*4882a593Smuzhiyun if (!ret)
806*4882a593Smuzhiyun ret = btrfs_del_csums(trans,
807*4882a593Smuzhiyun fs_info->csum_root,
808*4882a593Smuzhiyun sums->bytenr,
809*4882a593Smuzhiyun sums->len);
810*4882a593Smuzhiyun if (!ret)
811*4882a593Smuzhiyun ret = btrfs_csum_file_blocks(trans,
812*4882a593Smuzhiyun fs_info->csum_root, sums);
813*4882a593Smuzhiyun list_del(&sums->list);
814*4882a593Smuzhiyun kfree(sums);
815*4882a593Smuzhiyun }
816*4882a593Smuzhiyun if (ret)
817*4882a593Smuzhiyun goto out;
818*4882a593Smuzhiyun } else {
819*4882a593Smuzhiyun btrfs_release_path(path);
820*4882a593Smuzhiyun }
821*4882a593Smuzhiyun } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
822*4882a593Smuzhiyun /* inline extents are easy, we just overwrite them */
823*4882a593Smuzhiyun ret = overwrite_item(trans, root, path, eb, slot, key);
824*4882a593Smuzhiyun if (ret)
825*4882a593Smuzhiyun goto out;
826*4882a593Smuzhiyun }
827*4882a593Smuzhiyun
828*4882a593Smuzhiyun ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start,
829*4882a593Smuzhiyun extent_end - start);
830*4882a593Smuzhiyun if (ret)
831*4882a593Smuzhiyun goto out;
832*4882a593Smuzhiyun
833*4882a593Smuzhiyun inode_add_bytes(inode, nbytes);
834*4882a593Smuzhiyun update_inode:
835*4882a593Smuzhiyun ret = btrfs_update_inode(trans, root, inode);
836*4882a593Smuzhiyun out:
837*4882a593Smuzhiyun if (inode)
838*4882a593Smuzhiyun iput(inode);
839*4882a593Smuzhiyun return ret;
840*4882a593Smuzhiyun }
841*4882a593Smuzhiyun
842*4882a593Smuzhiyun /*
843*4882a593Smuzhiyun * when cleaning up conflicts between the directory names in the
844*4882a593Smuzhiyun * subvolume, directory names in the log and directory names in the
845*4882a593Smuzhiyun * inode back references, we may have to unlink inodes from directories.
846*4882a593Smuzhiyun *
847*4882a593Smuzhiyun * This is a helper function to do the unlink of a specific directory
848*4882a593Smuzhiyun * item
849*4882a593Smuzhiyun */
drop_one_dir_item(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct btrfs_inode * dir,struct btrfs_dir_item * di)850*4882a593Smuzhiyun static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
851*4882a593Smuzhiyun struct btrfs_root *root,
852*4882a593Smuzhiyun struct btrfs_path *path,
853*4882a593Smuzhiyun struct btrfs_inode *dir,
854*4882a593Smuzhiyun struct btrfs_dir_item *di)
855*4882a593Smuzhiyun {
856*4882a593Smuzhiyun struct inode *inode;
857*4882a593Smuzhiyun char *name;
858*4882a593Smuzhiyun int name_len;
859*4882a593Smuzhiyun struct extent_buffer *leaf;
860*4882a593Smuzhiyun struct btrfs_key location;
861*4882a593Smuzhiyun int ret;
862*4882a593Smuzhiyun
863*4882a593Smuzhiyun leaf = path->nodes[0];
864*4882a593Smuzhiyun
865*4882a593Smuzhiyun btrfs_dir_item_key_to_cpu(leaf, di, &location);
866*4882a593Smuzhiyun name_len = btrfs_dir_name_len(leaf, di);
867*4882a593Smuzhiyun name = kmalloc(name_len, GFP_NOFS);
868*4882a593Smuzhiyun if (!name)
869*4882a593Smuzhiyun return -ENOMEM;
870*4882a593Smuzhiyun
871*4882a593Smuzhiyun read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
872*4882a593Smuzhiyun btrfs_release_path(path);
873*4882a593Smuzhiyun
874*4882a593Smuzhiyun inode = read_one_inode(root, location.objectid);
875*4882a593Smuzhiyun if (!inode) {
876*4882a593Smuzhiyun ret = -EIO;
877*4882a593Smuzhiyun goto out;
878*4882a593Smuzhiyun }
879*4882a593Smuzhiyun
880*4882a593Smuzhiyun ret = link_to_fixup_dir(trans, root, path, location.objectid);
881*4882a593Smuzhiyun if (ret)
882*4882a593Smuzhiyun goto out;
883*4882a593Smuzhiyun
884*4882a593Smuzhiyun ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
885*4882a593Smuzhiyun name_len);
886*4882a593Smuzhiyun if (ret)
887*4882a593Smuzhiyun goto out;
888*4882a593Smuzhiyun else
889*4882a593Smuzhiyun ret = btrfs_run_delayed_items(trans);
890*4882a593Smuzhiyun out:
891*4882a593Smuzhiyun kfree(name);
892*4882a593Smuzhiyun iput(inode);
893*4882a593Smuzhiyun return ret;
894*4882a593Smuzhiyun }
895*4882a593Smuzhiyun
896*4882a593Smuzhiyun /*
897*4882a593Smuzhiyun * See if a given name and sequence number found in an inode back reference are
898*4882a593Smuzhiyun * already in a directory and correctly point to this inode.
899*4882a593Smuzhiyun *
900*4882a593Smuzhiyun * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
901*4882a593Smuzhiyun * exists.
902*4882a593Smuzhiyun */
inode_in_dir(struct btrfs_root * root,struct btrfs_path * path,u64 dirid,u64 objectid,u64 index,const char * name,int name_len)903*4882a593Smuzhiyun static noinline int inode_in_dir(struct btrfs_root *root,
904*4882a593Smuzhiyun struct btrfs_path *path,
905*4882a593Smuzhiyun u64 dirid, u64 objectid, u64 index,
906*4882a593Smuzhiyun const char *name, int name_len)
907*4882a593Smuzhiyun {
908*4882a593Smuzhiyun struct btrfs_dir_item *di;
909*4882a593Smuzhiyun struct btrfs_key location;
910*4882a593Smuzhiyun int ret = 0;
911*4882a593Smuzhiyun
912*4882a593Smuzhiyun di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
913*4882a593Smuzhiyun index, name, name_len, 0);
914*4882a593Smuzhiyun if (IS_ERR(di)) {
915*4882a593Smuzhiyun if (PTR_ERR(di) != -ENOENT)
916*4882a593Smuzhiyun ret = PTR_ERR(di);
917*4882a593Smuzhiyun goto out;
918*4882a593Smuzhiyun } else if (di) {
919*4882a593Smuzhiyun btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
920*4882a593Smuzhiyun if (location.objectid != objectid)
921*4882a593Smuzhiyun goto out;
922*4882a593Smuzhiyun } else {
923*4882a593Smuzhiyun goto out;
924*4882a593Smuzhiyun }
925*4882a593Smuzhiyun
926*4882a593Smuzhiyun btrfs_release_path(path);
927*4882a593Smuzhiyun di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
928*4882a593Smuzhiyun if (IS_ERR(di)) {
929*4882a593Smuzhiyun ret = PTR_ERR(di);
930*4882a593Smuzhiyun goto out;
931*4882a593Smuzhiyun } else if (di) {
932*4882a593Smuzhiyun btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
933*4882a593Smuzhiyun if (location.objectid == objectid)
934*4882a593Smuzhiyun ret = 1;
935*4882a593Smuzhiyun }
936*4882a593Smuzhiyun out:
937*4882a593Smuzhiyun btrfs_release_path(path);
938*4882a593Smuzhiyun return ret;
939*4882a593Smuzhiyun }
940*4882a593Smuzhiyun
941*4882a593Smuzhiyun /*
942*4882a593Smuzhiyun * helper function to check a log tree for a named back reference in
943*4882a593Smuzhiyun * an inode. This is used to decide if a back reference that is
944*4882a593Smuzhiyun * found in the subvolume conflicts with what we find in the log.
945*4882a593Smuzhiyun *
946*4882a593Smuzhiyun * inode backreferences may have multiple refs in a single item,
947*4882a593Smuzhiyun * during replay we process one reference at a time, and we don't
948*4882a593Smuzhiyun * want to delete valid links to a file from the subvolume if that
949*4882a593Smuzhiyun * link is also in the log.
950*4882a593Smuzhiyun */
backref_in_log(struct btrfs_root * log,struct btrfs_key * key,u64 ref_objectid,const char * name,int namelen)951*4882a593Smuzhiyun static noinline int backref_in_log(struct btrfs_root *log,
952*4882a593Smuzhiyun struct btrfs_key *key,
953*4882a593Smuzhiyun u64 ref_objectid,
954*4882a593Smuzhiyun const char *name, int namelen)
955*4882a593Smuzhiyun {
956*4882a593Smuzhiyun struct btrfs_path *path;
957*4882a593Smuzhiyun int ret;
958*4882a593Smuzhiyun
959*4882a593Smuzhiyun path = btrfs_alloc_path();
960*4882a593Smuzhiyun if (!path)
961*4882a593Smuzhiyun return -ENOMEM;
962*4882a593Smuzhiyun
963*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
964*4882a593Smuzhiyun if (ret < 0) {
965*4882a593Smuzhiyun goto out;
966*4882a593Smuzhiyun } else if (ret == 1) {
967*4882a593Smuzhiyun ret = 0;
968*4882a593Smuzhiyun goto out;
969*4882a593Smuzhiyun }
970*4882a593Smuzhiyun
971*4882a593Smuzhiyun if (key->type == BTRFS_INODE_EXTREF_KEY)
972*4882a593Smuzhiyun ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
973*4882a593Smuzhiyun path->slots[0],
974*4882a593Smuzhiyun ref_objectid,
975*4882a593Smuzhiyun name, namelen);
976*4882a593Smuzhiyun else
977*4882a593Smuzhiyun ret = !!btrfs_find_name_in_backref(path->nodes[0],
978*4882a593Smuzhiyun path->slots[0],
979*4882a593Smuzhiyun name, namelen);
980*4882a593Smuzhiyun out:
981*4882a593Smuzhiyun btrfs_free_path(path);
982*4882a593Smuzhiyun return ret;
983*4882a593Smuzhiyun }
984*4882a593Smuzhiyun
/*
 * Remove from the subvolume tree everything that conflicts with the back
 * reference (@name, @namelen, @ref_index) from @dir to @inode that log
 * replay is about to add.
 *
 * Three kinds of conflicts are handled, in this order:
 *
 * 1) names in the subvolume's INODE_REF item keyed by (@inode_objectid,
 *    @parent_objectid) that backref_in_log() does not find in @log_root;
 * 2) names with parent @parent_objectid in the subvolume's INODE_EXTREF
 *    item that backref_in_log() does not find in @log_root;
 * 3) directory entries of @dir that already use @ref_index or @name.
 *
 * Every stale name found by 1) or 2) is unlinked, delayed items are run and
 * the whole search restarts from the top, because the unlink released @path
 * and changed the item that was being walked.
 *
 * *@search_done is set to 1 once a ref or extref item of the subvolume has
 * been examined.
 *
 * Returns 0 on success, 1 if the reference is the root directory's reference
 * to itself (nothing to do), or a negative errno. @path may still be held on
 * the early return paths; NOTE(review): the caller is presumably expected to
 * release it - confirm.
 */
static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_root *log_root,
				  struct btrfs_inode *dir,
				  struct btrfs_inode *inode,
				  u64 inode_objectid, u64 parent_objectid,
				  u64 ref_index, char *name, int namelen,
				  int *search_done)
{
	int ret;
	char *victim_name;
	int victim_name_len;
	struct extent_buffer *leaf;
	struct btrfs_dir_item *di;
	struct btrfs_key search_key;
	struct btrfs_inode_extref *extref;

again:
	/* Search old style refs */
	search_key.objectid = inode_objectid;
	search_key.type = BTRFS_INODE_REF_KEY;
	search_key.offset = parent_objectid;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0) {
		/*
		 * A failed search must not be mistaken for "no such item":
		 * carrying on would silently skip the conflict checks for
		 * old style refs.
		 */
		return ret;
	} else if (ret == 0) {
		struct btrfs_inode_ref *victim_ref;
		unsigned long ptr;
		unsigned long ptr_end;

		leaf = path->nodes[0];

		/*
		 * Are we trying to overwrite a back ref for the root
		 * directory? If so, just jump out, we're done.
		 */
		if (search_key.objectid == search_key.offset)
			return 1;

		/*
		 * Check all the names in this back reference to see if they
		 * are in the log. If so, we allow them to stay, otherwise
		 * they must be unlinked as a conflict.
		 */
		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
		ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
		while (ptr < ptr_end) {
			victim_ref = (struct btrfs_inode_ref *)ptr;
			victim_name_len = btrfs_inode_ref_name_len(leaf,
								   victim_ref);
			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;

			read_extent_buffer(leaf, victim_name,
					   (unsigned long)(victim_ref + 1),
					   victim_name_len);

			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				/* Not in the log: unlink it and start over. */
				inc_nlink(&inode->vfs_inode);
				btrfs_release_path(path);

				ret = btrfs_unlink_inode(trans, root, dir, inode,
						victim_name, victim_name_len);
				kfree(victim_name);
				if (ret)
					return ret;
				ret = btrfs_run_delayed_items(trans);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);

			/* Step over the ref header and its name. */
			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
		}

		/*
		 * NOTE: we have searched root tree and checked the
		 * corresponding ref, it does not need to check again.
		 */
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* Same search but for extended refs */
	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
					   inode_objectid, parent_objectid, 0,
					   0);
	if (IS_ERR(extref)) {
		return PTR_ERR(extref);
	} else if (extref) {
		u32 item_size;
		u32 cur_offset = 0;
		unsigned long base;
		struct inode *victim_parent;

		leaf = path->nodes[0];

		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
		base = btrfs_item_ptr_offset(leaf, path->slots[0]);

		while (cur_offset < item_size) {
			extref = (struct btrfs_inode_extref *)(base + cur_offset);

			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);

			/* Only names under the same parent can conflict. */
			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
				goto next;

			victim_name = kmalloc(victim_name_len, GFP_NOFS);
			if (!victim_name)
				return -ENOMEM;
			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
					   victim_name_len);

			search_key.objectid = inode_objectid;
			search_key.type = BTRFS_INODE_EXTREF_KEY;
			search_key.offset = btrfs_extref_hash(parent_objectid,
							      victim_name,
							      victim_name_len);
			ret = backref_in_log(log_root, &search_key,
					     parent_objectid, victim_name,
					     victim_name_len);
			if (ret < 0) {
				kfree(victim_name);
				return ret;
			} else if (!ret) {
				/* Stays -ENOENT if the parent can't be read. */
				ret = -ENOENT;
				victim_parent = read_one_inode(root,
						parent_objectid);
				if (victim_parent) {
					inc_nlink(&inode->vfs_inode);
					btrfs_release_path(path);

					ret = btrfs_unlink_inode(trans, root,
							BTRFS_I(victim_parent),
							inode,
							victim_name,
							victim_name_len);
					if (!ret)
						ret = btrfs_run_delayed_items(
								  trans);
				}
				iput(victim_parent);
				kfree(victim_name);
				if (ret)
					return ret;
				*search_done = 1;
				goto again;
			}
			kfree(victim_name);
next:
			cur_offset += victim_name_len + sizeof(*extref);
		}
		*search_done = 1;
	}
	btrfs_release_path(path);

	/* look for a conflicting sequence number */
	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
					 ref_index, name, namelen, 0);
	if (IS_ERR(di)) {
		if (PTR_ERR(di) != -ENOENT)
			return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	/* look for a conflicting name */
	di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
				   name, namelen, 0);
	if (IS_ERR(di)) {
		return PTR_ERR(di);
	} else if (di) {
		ret = drop_one_dir_item(trans, root, path, dir, di);
		if (ret)
			return ret;
	}
	btrfs_release_path(path);

	return 0;
}
1175*4882a593Smuzhiyun
/*
 * Decode one extended inode reference located at offset @ref_ptr of @eb.
 *
 * @namelen receives the name length and @name a freshly allocated copy of
 * the name bytes, which the caller has to kfree(). @index and
 * @parent_objectid are optional outputs, pass NULL to skip them.
 *
 * Returns 0 on success or -ENOMEM, in which case *@name is NULL.
 */
static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			     u32 *namelen, char **name, u64 *index,
			     u64 *parent_objectid)
{
	struct btrfs_inode_extref *xref = (struct btrfs_inode_extref *)ref_ptr;
	const u32 len = btrfs_inode_extref_name_len(eb, xref);
	char *copy = kmalloc(len, GFP_NOFS);

	*namelen = len;
	*name = copy;
	if (!copy)
		return -ENOMEM;

	read_extent_buffer(eb, copy, (unsigned long)&xref->name, len);

	if (index)
		*index = btrfs_inode_extref_index(eb, xref);
	if (parent_objectid)
		*parent_objectid = btrfs_inode_extref_parent(eb, xref);

	return 0;
}
1199*4882a593Smuzhiyun
/*
 * Decode one regular inode reference located at offset @ref_ptr of @eb.
 *
 * @namelen receives the name length and @name a freshly allocated copy of
 * the name bytes, which the caller has to kfree(). @index is an optional
 * output, pass NULL to skip it.
 *
 * Returns 0 on success or -ENOMEM, in which case *@name is NULL.
 */
static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
			  u32 *namelen, char **name, u64 *index)
{
	struct btrfs_inode_ref *iref = (struct btrfs_inode_ref *)ref_ptr;
	const u32 len = btrfs_inode_ref_name_len(eb, iref);
	char *copy = kmalloc(len, GFP_NOFS);

	*namelen = len;
	*name = copy;
	if (!copy)
		return -ENOMEM;

	/* The name is read from right behind the ref header. */
	read_extent_buffer(eb, copy, (unsigned long)(iref + 1), len);

	if (index)
		*index = btrfs_inode_ref_index(eb, iref);

	return 0;
}
1219*4882a593Smuzhiyun
1220*4882a593Smuzhiyun /*
1221*4882a593Smuzhiyun * Take an inode reference item from the log tree and iterate all names from the
1222*4882a593Smuzhiyun * inode reference item in the subvolume tree with the same key (if it exists).
1223*4882a593Smuzhiyun * For any name that is not in the inode reference item from the log tree, do a
1224*4882a593Smuzhiyun * proper unlink of that name (that is, remove its entry from the inode
1225*4882a593Smuzhiyun * reference item and both dir index keys).
1226*4882a593Smuzhiyun */
unlink_old_inode_refs(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct btrfs_inode * inode,struct extent_buffer * log_eb,int log_slot,struct btrfs_key * key)1227*4882a593Smuzhiyun static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1228*4882a593Smuzhiyun struct btrfs_root *root,
1229*4882a593Smuzhiyun struct btrfs_path *path,
1230*4882a593Smuzhiyun struct btrfs_inode *inode,
1231*4882a593Smuzhiyun struct extent_buffer *log_eb,
1232*4882a593Smuzhiyun int log_slot,
1233*4882a593Smuzhiyun struct btrfs_key *key)
1234*4882a593Smuzhiyun {
1235*4882a593Smuzhiyun int ret;
1236*4882a593Smuzhiyun unsigned long ref_ptr;
1237*4882a593Smuzhiyun unsigned long ref_end;
1238*4882a593Smuzhiyun struct extent_buffer *eb;
1239*4882a593Smuzhiyun
1240*4882a593Smuzhiyun again:
1241*4882a593Smuzhiyun btrfs_release_path(path);
1242*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1243*4882a593Smuzhiyun if (ret > 0) {
1244*4882a593Smuzhiyun ret = 0;
1245*4882a593Smuzhiyun goto out;
1246*4882a593Smuzhiyun }
1247*4882a593Smuzhiyun if (ret < 0)
1248*4882a593Smuzhiyun goto out;
1249*4882a593Smuzhiyun
1250*4882a593Smuzhiyun eb = path->nodes[0];
1251*4882a593Smuzhiyun ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1252*4882a593Smuzhiyun ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
1253*4882a593Smuzhiyun while (ref_ptr < ref_end) {
1254*4882a593Smuzhiyun char *name = NULL;
1255*4882a593Smuzhiyun int namelen;
1256*4882a593Smuzhiyun u64 parent_id;
1257*4882a593Smuzhiyun
1258*4882a593Smuzhiyun if (key->type == BTRFS_INODE_EXTREF_KEY) {
1259*4882a593Smuzhiyun ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1260*4882a593Smuzhiyun NULL, &parent_id);
1261*4882a593Smuzhiyun } else {
1262*4882a593Smuzhiyun parent_id = key->offset;
1263*4882a593Smuzhiyun ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1264*4882a593Smuzhiyun NULL);
1265*4882a593Smuzhiyun }
1266*4882a593Smuzhiyun if (ret)
1267*4882a593Smuzhiyun goto out;
1268*4882a593Smuzhiyun
1269*4882a593Smuzhiyun if (key->type == BTRFS_INODE_EXTREF_KEY)
1270*4882a593Smuzhiyun ret = !!btrfs_find_name_in_ext_backref(log_eb, log_slot,
1271*4882a593Smuzhiyun parent_id, name,
1272*4882a593Smuzhiyun namelen);
1273*4882a593Smuzhiyun else
1274*4882a593Smuzhiyun ret = !!btrfs_find_name_in_backref(log_eb, log_slot,
1275*4882a593Smuzhiyun name, namelen);
1276*4882a593Smuzhiyun
1277*4882a593Smuzhiyun if (!ret) {
1278*4882a593Smuzhiyun struct inode *dir;
1279*4882a593Smuzhiyun
1280*4882a593Smuzhiyun btrfs_release_path(path);
1281*4882a593Smuzhiyun dir = read_one_inode(root, parent_id);
1282*4882a593Smuzhiyun if (!dir) {
1283*4882a593Smuzhiyun ret = -ENOENT;
1284*4882a593Smuzhiyun kfree(name);
1285*4882a593Smuzhiyun goto out;
1286*4882a593Smuzhiyun }
1287*4882a593Smuzhiyun ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
1288*4882a593Smuzhiyun inode, name, namelen);
1289*4882a593Smuzhiyun kfree(name);
1290*4882a593Smuzhiyun iput(dir);
1291*4882a593Smuzhiyun /*
1292*4882a593Smuzhiyun * Whenever we need to check if a name exists or not, we
1293*4882a593Smuzhiyun * check the subvolume tree. So after an unlink we must
1294*4882a593Smuzhiyun * run delayed items, so that future checks for a name
1295*4882a593Smuzhiyun * during log replay see that the name does not exists
1296*4882a593Smuzhiyun * anymore.
1297*4882a593Smuzhiyun */
1298*4882a593Smuzhiyun if (!ret)
1299*4882a593Smuzhiyun ret = btrfs_run_delayed_items(trans);
1300*4882a593Smuzhiyun if (ret)
1301*4882a593Smuzhiyun goto out;
1302*4882a593Smuzhiyun goto again;
1303*4882a593Smuzhiyun }
1304*4882a593Smuzhiyun
1305*4882a593Smuzhiyun kfree(name);
1306*4882a593Smuzhiyun ref_ptr += namelen;
1307*4882a593Smuzhiyun if (key->type == BTRFS_INODE_EXTREF_KEY)
1308*4882a593Smuzhiyun ref_ptr += sizeof(struct btrfs_inode_extref);
1309*4882a593Smuzhiyun else
1310*4882a593Smuzhiyun ref_ptr += sizeof(struct btrfs_inode_ref);
1311*4882a593Smuzhiyun }
1312*4882a593Smuzhiyun ret = 0;
1313*4882a593Smuzhiyun out:
1314*4882a593Smuzhiyun btrfs_release_path(path);
1315*4882a593Smuzhiyun return ret;
1316*4882a593Smuzhiyun }
1317*4882a593Smuzhiyun
btrfs_inode_ref_exists(struct inode * inode,struct inode * dir,const u8 ref_type,const char * name,const int namelen)1318*4882a593Smuzhiyun static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
1319*4882a593Smuzhiyun const u8 ref_type, const char *name,
1320*4882a593Smuzhiyun const int namelen)
1321*4882a593Smuzhiyun {
1322*4882a593Smuzhiyun struct btrfs_key key;
1323*4882a593Smuzhiyun struct btrfs_path *path;
1324*4882a593Smuzhiyun const u64 parent_id = btrfs_ino(BTRFS_I(dir));
1325*4882a593Smuzhiyun int ret;
1326*4882a593Smuzhiyun
1327*4882a593Smuzhiyun path = btrfs_alloc_path();
1328*4882a593Smuzhiyun if (!path)
1329*4882a593Smuzhiyun return -ENOMEM;
1330*4882a593Smuzhiyun
1331*4882a593Smuzhiyun key.objectid = btrfs_ino(BTRFS_I(inode));
1332*4882a593Smuzhiyun key.type = ref_type;
1333*4882a593Smuzhiyun if (key.type == BTRFS_INODE_REF_KEY)
1334*4882a593Smuzhiyun key.offset = parent_id;
1335*4882a593Smuzhiyun else
1336*4882a593Smuzhiyun key.offset = btrfs_extref_hash(parent_id, name, namelen);
1337*4882a593Smuzhiyun
1338*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
1339*4882a593Smuzhiyun if (ret < 0)
1340*4882a593Smuzhiyun goto out;
1341*4882a593Smuzhiyun if (ret > 0) {
1342*4882a593Smuzhiyun ret = 0;
1343*4882a593Smuzhiyun goto out;
1344*4882a593Smuzhiyun }
1345*4882a593Smuzhiyun if (key.type == BTRFS_INODE_EXTREF_KEY)
1346*4882a593Smuzhiyun ret = !!btrfs_find_name_in_ext_backref(path->nodes[0],
1347*4882a593Smuzhiyun path->slots[0], parent_id, name, namelen);
1348*4882a593Smuzhiyun else
1349*4882a593Smuzhiyun ret = !!btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
1350*4882a593Smuzhiyun name, namelen);
1351*4882a593Smuzhiyun
1352*4882a593Smuzhiyun out:
1353*4882a593Smuzhiyun btrfs_free_path(path);
1354*4882a593Smuzhiyun return ret;
1355*4882a593Smuzhiyun }
1356*4882a593Smuzhiyun
/*
 * Link @inode into @dir under @name using @ref_index as the dir index.
 *
 * If @dir already has a dir item for @name, the inode that item points to is
 * unlinked from @dir first and the delayed items are run before the new link
 * is added.
 *
 * Returns 0 on success and < 0 on error: -ENOMEM if the path can't be
 * allocated, -ENOENT if read_one_inode() returns NULL for the colliding
 * entry, or the error of any of the helpers called.
 */
static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct inode *dir, struct inode *inode, const char *name,
		    int namelen, u64 ref_index)
{
	struct inode *other_inode = NULL;
	struct btrfs_dir_item *dir_item;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	dir_item = btrfs_lookup_dir_item(NULL, root, path,
					 btrfs_ino(BTRFS_I(dir)),
					 name, namelen, 0);
	if (IS_ERR(dir_item)) {
		ret = PTR_ERR(dir_item);
		goto out;
	}

	if (dir_item) {
		/*
		 * Our inode's dentry collides with the dentry of another
		 * inode which is in the log but not yet processed since it
		 * has a higher inode number. So delete that other dentry.
		 */
		btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
		btrfs_release_path(path);

		other_inode = read_one_inode(root, key.objectid);
		if (!other_inode) {
			ret = -ENOENT;
			goto out;
		}

		ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
					 BTRFS_I(other_inode), name, namelen);
		if (ret)
			goto out;

		/*
		 * If we dropped the link count to 0, bump it so that later
		 * the iput() on the inode will not free it. We will fixup the
		 * link count later.
		 */
		if (!other_inode->i_nlink)
			inc_nlink(other_inode);

		ret = btrfs_run_delayed_items(trans);
		if (ret)
			goto out;
	} else {
		/* The name is free, nothing to get out of the way. */
		btrfs_release_path(path);
	}

	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
			     name, namelen, 0, ref_index);
out:
	/* other_inode is still NULL when no colliding entry was read. */
	iput(other_inode);
	btrfs_free_path(path);

	return ret;
}
1417*4882a593Smuzhiyun
1418*4882a593Smuzhiyun /*
1419*4882a593Smuzhiyun * replay one inode back reference item found in the log tree.
1420*4882a593Smuzhiyun * eb, slot and key refer to the buffer and key found in the log tree.
1421*4882a593Smuzhiyun * root is the destination we are replaying into, and path is for temp
1422*4882a593Smuzhiyun * use by this function. (it should be released on return).
1423*4882a593Smuzhiyun */
add_inode_ref(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_root * log,struct btrfs_path * path,struct extent_buffer * eb,int slot,struct btrfs_key * key)1424*4882a593Smuzhiyun static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1425*4882a593Smuzhiyun struct btrfs_root *root,
1426*4882a593Smuzhiyun struct btrfs_root *log,
1427*4882a593Smuzhiyun struct btrfs_path *path,
1428*4882a593Smuzhiyun struct extent_buffer *eb, int slot,
1429*4882a593Smuzhiyun struct btrfs_key *key)
1430*4882a593Smuzhiyun {
1431*4882a593Smuzhiyun struct inode *dir = NULL;
1432*4882a593Smuzhiyun struct inode *inode = NULL;
1433*4882a593Smuzhiyun unsigned long ref_ptr;
1434*4882a593Smuzhiyun unsigned long ref_end;
1435*4882a593Smuzhiyun char *name = NULL;
1436*4882a593Smuzhiyun int namelen;
1437*4882a593Smuzhiyun int ret;
1438*4882a593Smuzhiyun int search_done = 0;
1439*4882a593Smuzhiyun int log_ref_ver = 0;
1440*4882a593Smuzhiyun u64 parent_objectid;
1441*4882a593Smuzhiyun u64 inode_objectid;
1442*4882a593Smuzhiyun u64 ref_index = 0;
1443*4882a593Smuzhiyun int ref_struct_size;
1444*4882a593Smuzhiyun
1445*4882a593Smuzhiyun ref_ptr = btrfs_item_ptr_offset(eb, slot);
1446*4882a593Smuzhiyun ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1447*4882a593Smuzhiyun
1448*4882a593Smuzhiyun if (key->type == BTRFS_INODE_EXTREF_KEY) {
1449*4882a593Smuzhiyun struct btrfs_inode_extref *r;
1450*4882a593Smuzhiyun
1451*4882a593Smuzhiyun ref_struct_size = sizeof(struct btrfs_inode_extref);
1452*4882a593Smuzhiyun log_ref_ver = 1;
1453*4882a593Smuzhiyun r = (struct btrfs_inode_extref *)ref_ptr;
1454*4882a593Smuzhiyun parent_objectid = btrfs_inode_extref_parent(eb, r);
1455*4882a593Smuzhiyun } else {
1456*4882a593Smuzhiyun ref_struct_size = sizeof(struct btrfs_inode_ref);
1457*4882a593Smuzhiyun parent_objectid = key->offset;
1458*4882a593Smuzhiyun }
1459*4882a593Smuzhiyun inode_objectid = key->objectid;
1460*4882a593Smuzhiyun
1461*4882a593Smuzhiyun /*
1462*4882a593Smuzhiyun * it is possible that we didn't log all the parent directories
1463*4882a593Smuzhiyun * for a given inode. If we don't find the dir, just don't
1464*4882a593Smuzhiyun * copy the back ref in. The link count fixup code will take
1465*4882a593Smuzhiyun * care of the rest
1466*4882a593Smuzhiyun */
1467*4882a593Smuzhiyun dir = read_one_inode(root, parent_objectid);
1468*4882a593Smuzhiyun if (!dir) {
1469*4882a593Smuzhiyun ret = -ENOENT;
1470*4882a593Smuzhiyun goto out;
1471*4882a593Smuzhiyun }
1472*4882a593Smuzhiyun
1473*4882a593Smuzhiyun inode = read_one_inode(root, inode_objectid);
1474*4882a593Smuzhiyun if (!inode) {
1475*4882a593Smuzhiyun ret = -EIO;
1476*4882a593Smuzhiyun goto out;
1477*4882a593Smuzhiyun }
1478*4882a593Smuzhiyun
1479*4882a593Smuzhiyun while (ref_ptr < ref_end) {
1480*4882a593Smuzhiyun if (log_ref_ver) {
1481*4882a593Smuzhiyun ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1482*4882a593Smuzhiyun &ref_index, &parent_objectid);
1483*4882a593Smuzhiyun /*
1484*4882a593Smuzhiyun * parent object can change from one array
1485*4882a593Smuzhiyun * item to another.
1486*4882a593Smuzhiyun */
1487*4882a593Smuzhiyun if (!dir)
1488*4882a593Smuzhiyun dir = read_one_inode(root, parent_objectid);
1489*4882a593Smuzhiyun if (!dir) {
1490*4882a593Smuzhiyun ret = -ENOENT;
1491*4882a593Smuzhiyun goto out;
1492*4882a593Smuzhiyun }
1493*4882a593Smuzhiyun } else {
1494*4882a593Smuzhiyun ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1495*4882a593Smuzhiyun &ref_index);
1496*4882a593Smuzhiyun }
1497*4882a593Smuzhiyun if (ret)
1498*4882a593Smuzhiyun goto out;
1499*4882a593Smuzhiyun
1500*4882a593Smuzhiyun ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1501*4882a593Smuzhiyun btrfs_ino(BTRFS_I(inode)), ref_index,
1502*4882a593Smuzhiyun name, namelen);
1503*4882a593Smuzhiyun if (ret < 0) {
1504*4882a593Smuzhiyun goto out;
1505*4882a593Smuzhiyun } else if (ret == 0) {
1506*4882a593Smuzhiyun /*
1507*4882a593Smuzhiyun * look for a conflicting back reference in the
1508*4882a593Smuzhiyun * metadata. if we find one we have to unlink that name
1509*4882a593Smuzhiyun * of the file before we add our new link. Later on, we
1510*4882a593Smuzhiyun * overwrite any existing back reference, and we don't
1511*4882a593Smuzhiyun * want to create dangling pointers in the directory.
1512*4882a593Smuzhiyun */
1513*4882a593Smuzhiyun
1514*4882a593Smuzhiyun if (!search_done) {
1515*4882a593Smuzhiyun ret = __add_inode_ref(trans, root, path, log,
1516*4882a593Smuzhiyun BTRFS_I(dir),
1517*4882a593Smuzhiyun BTRFS_I(inode),
1518*4882a593Smuzhiyun inode_objectid,
1519*4882a593Smuzhiyun parent_objectid,
1520*4882a593Smuzhiyun ref_index, name, namelen,
1521*4882a593Smuzhiyun &search_done);
1522*4882a593Smuzhiyun if (ret) {
1523*4882a593Smuzhiyun if (ret == 1)
1524*4882a593Smuzhiyun ret = 0;
1525*4882a593Smuzhiyun goto out;
1526*4882a593Smuzhiyun }
1527*4882a593Smuzhiyun }
1528*4882a593Smuzhiyun
1529*4882a593Smuzhiyun /*
1530*4882a593Smuzhiyun * If a reference item already exists for this inode
1531*4882a593Smuzhiyun * with the same parent and name, but different index,
1532*4882a593Smuzhiyun * drop it and the corresponding directory index entries
1533*4882a593Smuzhiyun * from the parent before adding the new reference item
1534*4882a593Smuzhiyun * and dir index entries, otherwise we would fail with
1535*4882a593Smuzhiyun * -EEXIST returned from btrfs_add_link() below.
1536*4882a593Smuzhiyun */
1537*4882a593Smuzhiyun ret = btrfs_inode_ref_exists(inode, dir, key->type,
1538*4882a593Smuzhiyun name, namelen);
1539*4882a593Smuzhiyun if (ret > 0) {
1540*4882a593Smuzhiyun ret = btrfs_unlink_inode(trans, root,
1541*4882a593Smuzhiyun BTRFS_I(dir),
1542*4882a593Smuzhiyun BTRFS_I(inode),
1543*4882a593Smuzhiyun name, namelen);
1544*4882a593Smuzhiyun /*
1545*4882a593Smuzhiyun * If we dropped the link count to 0, bump it so
1546*4882a593Smuzhiyun * that later the iput() on the inode will not
1547*4882a593Smuzhiyun * free it. We will fixup the link count later.
1548*4882a593Smuzhiyun */
1549*4882a593Smuzhiyun if (!ret && inode->i_nlink == 0)
1550*4882a593Smuzhiyun inc_nlink(inode);
1551*4882a593Smuzhiyun /*
1552*4882a593Smuzhiyun * Whenever we need to check if a name exists or
1553*4882a593Smuzhiyun * not, we check the subvolume tree. So after an
1554*4882a593Smuzhiyun * unlink we must run delayed items, so that future
1555*4882a593Smuzhiyun * checks for a name during log replay see that the
1556*4882a593Smuzhiyun * name does not exists anymore.
1557*4882a593Smuzhiyun */
1558*4882a593Smuzhiyun if (!ret)
1559*4882a593Smuzhiyun ret = btrfs_run_delayed_items(trans);
1560*4882a593Smuzhiyun }
1561*4882a593Smuzhiyun if (ret < 0)
1562*4882a593Smuzhiyun goto out;
1563*4882a593Smuzhiyun
1564*4882a593Smuzhiyun /* insert our name */
1565*4882a593Smuzhiyun ret = add_link(trans, root, dir, inode, name, namelen,
1566*4882a593Smuzhiyun ref_index);
1567*4882a593Smuzhiyun if (ret)
1568*4882a593Smuzhiyun goto out;
1569*4882a593Smuzhiyun
1570*4882a593Smuzhiyun btrfs_update_inode(trans, root, inode);
1571*4882a593Smuzhiyun }
1572*4882a593Smuzhiyun /* Else, ret == 1, we already have a perfect match, we're done. */
1573*4882a593Smuzhiyun
1574*4882a593Smuzhiyun ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
1575*4882a593Smuzhiyun kfree(name);
1576*4882a593Smuzhiyun name = NULL;
1577*4882a593Smuzhiyun if (log_ref_ver) {
1578*4882a593Smuzhiyun iput(dir);
1579*4882a593Smuzhiyun dir = NULL;
1580*4882a593Smuzhiyun }
1581*4882a593Smuzhiyun }
1582*4882a593Smuzhiyun
1583*4882a593Smuzhiyun /*
1584*4882a593Smuzhiyun * Before we overwrite the inode reference item in the subvolume tree
1585*4882a593Smuzhiyun * with the item from the log tree, we must unlink all names from the
1586*4882a593Smuzhiyun * parent directory that are in the subvolume's tree inode reference
1587*4882a593Smuzhiyun * item, otherwise we end up with an inconsistent subvolume tree where
1588*4882a593Smuzhiyun * dir index entries exist for a name but there is no inode reference
1589*4882a593Smuzhiyun * item with the same name.
1590*4882a593Smuzhiyun */
1591*4882a593Smuzhiyun ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1592*4882a593Smuzhiyun key);
1593*4882a593Smuzhiyun if (ret)
1594*4882a593Smuzhiyun goto out;
1595*4882a593Smuzhiyun
1596*4882a593Smuzhiyun /* finally write the back reference in the inode */
1597*4882a593Smuzhiyun ret = overwrite_item(trans, root, path, eb, slot, key);
1598*4882a593Smuzhiyun out:
1599*4882a593Smuzhiyun btrfs_release_path(path);
1600*4882a593Smuzhiyun kfree(name);
1601*4882a593Smuzhiyun iput(dir);
1602*4882a593Smuzhiyun iput(inode);
1603*4882a593Smuzhiyun return ret;
1604*4882a593Smuzhiyun }
1605*4882a593Smuzhiyun
/*
 * Insert an orphan item for @ino into @root. An item that already exists
 * (-EEXIST) counts as success; any other result of
 * btrfs_insert_orphan_item() is returned unchanged.
 */
static int insert_orphan_item(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root, u64 ino)
{
	const int ret = btrfs_insert_orphan_item(trans, root, ino);

	return (ret == -EEXIST) ? 0 : ret;
}
1617*4882a593Smuzhiyun
count_inode_extrefs(struct btrfs_root * root,struct btrfs_inode * inode,struct btrfs_path * path)1618*4882a593Smuzhiyun static int count_inode_extrefs(struct btrfs_root *root,
1619*4882a593Smuzhiyun struct btrfs_inode *inode, struct btrfs_path *path)
1620*4882a593Smuzhiyun {
1621*4882a593Smuzhiyun int ret = 0;
1622*4882a593Smuzhiyun int name_len;
1623*4882a593Smuzhiyun unsigned int nlink = 0;
1624*4882a593Smuzhiyun u32 item_size;
1625*4882a593Smuzhiyun u32 cur_offset = 0;
1626*4882a593Smuzhiyun u64 inode_objectid = btrfs_ino(inode);
1627*4882a593Smuzhiyun u64 offset = 0;
1628*4882a593Smuzhiyun unsigned long ptr;
1629*4882a593Smuzhiyun struct btrfs_inode_extref *extref;
1630*4882a593Smuzhiyun struct extent_buffer *leaf;
1631*4882a593Smuzhiyun
1632*4882a593Smuzhiyun while (1) {
1633*4882a593Smuzhiyun ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1634*4882a593Smuzhiyun &extref, &offset);
1635*4882a593Smuzhiyun if (ret)
1636*4882a593Smuzhiyun break;
1637*4882a593Smuzhiyun
1638*4882a593Smuzhiyun leaf = path->nodes[0];
1639*4882a593Smuzhiyun item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1640*4882a593Smuzhiyun ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1641*4882a593Smuzhiyun cur_offset = 0;
1642*4882a593Smuzhiyun
1643*4882a593Smuzhiyun while (cur_offset < item_size) {
1644*4882a593Smuzhiyun extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1645*4882a593Smuzhiyun name_len = btrfs_inode_extref_name_len(leaf, extref);
1646*4882a593Smuzhiyun
1647*4882a593Smuzhiyun nlink++;
1648*4882a593Smuzhiyun
1649*4882a593Smuzhiyun cur_offset += name_len + sizeof(*extref);
1650*4882a593Smuzhiyun }
1651*4882a593Smuzhiyun
1652*4882a593Smuzhiyun offset++;
1653*4882a593Smuzhiyun btrfs_release_path(path);
1654*4882a593Smuzhiyun }
1655*4882a593Smuzhiyun btrfs_release_path(path);
1656*4882a593Smuzhiyun
1657*4882a593Smuzhiyun if (ret < 0 && ret != -ENOENT)
1658*4882a593Smuzhiyun return ret;
1659*4882a593Smuzhiyun return nlink;
1660*4882a593Smuzhiyun }
1661*4882a593Smuzhiyun
count_inode_refs(struct btrfs_root * root,struct btrfs_inode * inode,struct btrfs_path * path)1662*4882a593Smuzhiyun static int count_inode_refs(struct btrfs_root *root,
1663*4882a593Smuzhiyun struct btrfs_inode *inode, struct btrfs_path *path)
1664*4882a593Smuzhiyun {
1665*4882a593Smuzhiyun int ret;
1666*4882a593Smuzhiyun struct btrfs_key key;
1667*4882a593Smuzhiyun unsigned int nlink = 0;
1668*4882a593Smuzhiyun unsigned long ptr;
1669*4882a593Smuzhiyun unsigned long ptr_end;
1670*4882a593Smuzhiyun int name_len;
1671*4882a593Smuzhiyun u64 ino = btrfs_ino(inode);
1672*4882a593Smuzhiyun
1673*4882a593Smuzhiyun key.objectid = ino;
1674*4882a593Smuzhiyun key.type = BTRFS_INODE_REF_KEY;
1675*4882a593Smuzhiyun key.offset = (u64)-1;
1676*4882a593Smuzhiyun
1677*4882a593Smuzhiyun while (1) {
1678*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1679*4882a593Smuzhiyun if (ret < 0)
1680*4882a593Smuzhiyun break;
1681*4882a593Smuzhiyun if (ret > 0) {
1682*4882a593Smuzhiyun if (path->slots[0] == 0)
1683*4882a593Smuzhiyun break;
1684*4882a593Smuzhiyun path->slots[0]--;
1685*4882a593Smuzhiyun }
1686*4882a593Smuzhiyun process_slot:
1687*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], &key,
1688*4882a593Smuzhiyun path->slots[0]);
1689*4882a593Smuzhiyun if (key.objectid != ino ||
1690*4882a593Smuzhiyun key.type != BTRFS_INODE_REF_KEY)
1691*4882a593Smuzhiyun break;
1692*4882a593Smuzhiyun ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1693*4882a593Smuzhiyun ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1694*4882a593Smuzhiyun path->slots[0]);
1695*4882a593Smuzhiyun while (ptr < ptr_end) {
1696*4882a593Smuzhiyun struct btrfs_inode_ref *ref;
1697*4882a593Smuzhiyun
1698*4882a593Smuzhiyun ref = (struct btrfs_inode_ref *)ptr;
1699*4882a593Smuzhiyun name_len = btrfs_inode_ref_name_len(path->nodes[0],
1700*4882a593Smuzhiyun ref);
1701*4882a593Smuzhiyun ptr = (unsigned long)(ref + 1) + name_len;
1702*4882a593Smuzhiyun nlink++;
1703*4882a593Smuzhiyun }
1704*4882a593Smuzhiyun
1705*4882a593Smuzhiyun if (key.offset == 0)
1706*4882a593Smuzhiyun break;
1707*4882a593Smuzhiyun if (path->slots[0] > 0) {
1708*4882a593Smuzhiyun path->slots[0]--;
1709*4882a593Smuzhiyun goto process_slot;
1710*4882a593Smuzhiyun }
1711*4882a593Smuzhiyun key.offset--;
1712*4882a593Smuzhiyun btrfs_release_path(path);
1713*4882a593Smuzhiyun }
1714*4882a593Smuzhiyun btrfs_release_path(path);
1715*4882a593Smuzhiyun
1716*4882a593Smuzhiyun return nlink;
1717*4882a593Smuzhiyun }
1718*4882a593Smuzhiyun
1719*4882a593Smuzhiyun /*
1720*4882a593Smuzhiyun * There are a few corners where the link count of the file can't
1721*4882a593Smuzhiyun * be properly maintained during replay. So, instead of adding
1722*4882a593Smuzhiyun * lots of complexity to the log code, we just scan the backrefs
1723*4882a593Smuzhiyun * for any file that has been through replay.
1724*4882a593Smuzhiyun *
1725*4882a593Smuzhiyun * The scan will update the link count on the inode to reflect the
1726*4882a593Smuzhiyun * number of back refs found. If it goes down to zero, the iput
1727*4882a593Smuzhiyun * will free the inode.
1728*4882a593Smuzhiyun */
fixup_inode_link_count(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct inode * inode)1729*4882a593Smuzhiyun static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1730*4882a593Smuzhiyun struct btrfs_root *root,
1731*4882a593Smuzhiyun struct inode *inode)
1732*4882a593Smuzhiyun {
1733*4882a593Smuzhiyun struct btrfs_path *path;
1734*4882a593Smuzhiyun int ret;
1735*4882a593Smuzhiyun u64 nlink = 0;
1736*4882a593Smuzhiyun u64 ino = btrfs_ino(BTRFS_I(inode));
1737*4882a593Smuzhiyun
1738*4882a593Smuzhiyun path = btrfs_alloc_path();
1739*4882a593Smuzhiyun if (!path)
1740*4882a593Smuzhiyun return -ENOMEM;
1741*4882a593Smuzhiyun
1742*4882a593Smuzhiyun ret = count_inode_refs(root, BTRFS_I(inode), path);
1743*4882a593Smuzhiyun if (ret < 0)
1744*4882a593Smuzhiyun goto out;
1745*4882a593Smuzhiyun
1746*4882a593Smuzhiyun nlink = ret;
1747*4882a593Smuzhiyun
1748*4882a593Smuzhiyun ret = count_inode_extrefs(root, BTRFS_I(inode), path);
1749*4882a593Smuzhiyun if (ret < 0)
1750*4882a593Smuzhiyun goto out;
1751*4882a593Smuzhiyun
1752*4882a593Smuzhiyun nlink += ret;
1753*4882a593Smuzhiyun
1754*4882a593Smuzhiyun ret = 0;
1755*4882a593Smuzhiyun
1756*4882a593Smuzhiyun if (nlink != inode->i_nlink) {
1757*4882a593Smuzhiyun set_nlink(inode, nlink);
1758*4882a593Smuzhiyun btrfs_update_inode(trans, root, inode);
1759*4882a593Smuzhiyun }
1760*4882a593Smuzhiyun BTRFS_I(inode)->index_cnt = (u64)-1;
1761*4882a593Smuzhiyun
1762*4882a593Smuzhiyun if (inode->i_nlink == 0) {
1763*4882a593Smuzhiyun if (S_ISDIR(inode->i_mode)) {
1764*4882a593Smuzhiyun ret = replay_dir_deletes(trans, root, NULL, path,
1765*4882a593Smuzhiyun ino, 1);
1766*4882a593Smuzhiyun if (ret)
1767*4882a593Smuzhiyun goto out;
1768*4882a593Smuzhiyun }
1769*4882a593Smuzhiyun ret = insert_orphan_item(trans, root, ino);
1770*4882a593Smuzhiyun }
1771*4882a593Smuzhiyun
1772*4882a593Smuzhiyun out:
1773*4882a593Smuzhiyun btrfs_free_path(path);
1774*4882a593Smuzhiyun return ret;
1775*4882a593Smuzhiyun }
1776*4882a593Smuzhiyun
fixup_inode_link_counts(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path)1777*4882a593Smuzhiyun static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1778*4882a593Smuzhiyun struct btrfs_root *root,
1779*4882a593Smuzhiyun struct btrfs_path *path)
1780*4882a593Smuzhiyun {
1781*4882a593Smuzhiyun int ret;
1782*4882a593Smuzhiyun struct btrfs_key key;
1783*4882a593Smuzhiyun struct inode *inode;
1784*4882a593Smuzhiyun
1785*4882a593Smuzhiyun key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1786*4882a593Smuzhiyun key.type = BTRFS_ORPHAN_ITEM_KEY;
1787*4882a593Smuzhiyun key.offset = (u64)-1;
1788*4882a593Smuzhiyun while (1) {
1789*4882a593Smuzhiyun ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1790*4882a593Smuzhiyun if (ret < 0)
1791*4882a593Smuzhiyun break;
1792*4882a593Smuzhiyun
1793*4882a593Smuzhiyun if (ret == 1) {
1794*4882a593Smuzhiyun ret = 0;
1795*4882a593Smuzhiyun if (path->slots[0] == 0)
1796*4882a593Smuzhiyun break;
1797*4882a593Smuzhiyun path->slots[0]--;
1798*4882a593Smuzhiyun }
1799*4882a593Smuzhiyun
1800*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1801*4882a593Smuzhiyun if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1802*4882a593Smuzhiyun key.type != BTRFS_ORPHAN_ITEM_KEY)
1803*4882a593Smuzhiyun break;
1804*4882a593Smuzhiyun
1805*4882a593Smuzhiyun ret = btrfs_del_item(trans, root, path);
1806*4882a593Smuzhiyun if (ret)
1807*4882a593Smuzhiyun break;
1808*4882a593Smuzhiyun
1809*4882a593Smuzhiyun btrfs_release_path(path);
1810*4882a593Smuzhiyun inode = read_one_inode(root, key.offset);
1811*4882a593Smuzhiyun if (!inode) {
1812*4882a593Smuzhiyun ret = -EIO;
1813*4882a593Smuzhiyun break;
1814*4882a593Smuzhiyun }
1815*4882a593Smuzhiyun
1816*4882a593Smuzhiyun ret = fixup_inode_link_count(trans, root, inode);
1817*4882a593Smuzhiyun iput(inode);
1818*4882a593Smuzhiyun if (ret)
1819*4882a593Smuzhiyun break;
1820*4882a593Smuzhiyun
1821*4882a593Smuzhiyun /*
1822*4882a593Smuzhiyun * fixup on a directory may create new entries,
1823*4882a593Smuzhiyun * make sure we always look for the highset possible
1824*4882a593Smuzhiyun * offset
1825*4882a593Smuzhiyun */
1826*4882a593Smuzhiyun key.offset = (u64)-1;
1827*4882a593Smuzhiyun }
1828*4882a593Smuzhiyun btrfs_release_path(path);
1829*4882a593Smuzhiyun return ret;
1830*4882a593Smuzhiyun }
1831*4882a593Smuzhiyun
1832*4882a593Smuzhiyun
1833*4882a593Smuzhiyun /*
1834*4882a593Smuzhiyun * record a given inode in the fixup dir so we can check its link
1835*4882a593Smuzhiyun * count when replay is done. The link count is incremented here
1836*4882a593Smuzhiyun * so the inode won't go away until we check it
1837*4882a593Smuzhiyun */
link_to_fixup_dir(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,u64 objectid)1838*4882a593Smuzhiyun static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1839*4882a593Smuzhiyun struct btrfs_root *root,
1840*4882a593Smuzhiyun struct btrfs_path *path,
1841*4882a593Smuzhiyun u64 objectid)
1842*4882a593Smuzhiyun {
1843*4882a593Smuzhiyun struct btrfs_key key;
1844*4882a593Smuzhiyun int ret = 0;
1845*4882a593Smuzhiyun struct inode *inode;
1846*4882a593Smuzhiyun
1847*4882a593Smuzhiyun inode = read_one_inode(root, objectid);
1848*4882a593Smuzhiyun if (!inode)
1849*4882a593Smuzhiyun return -EIO;
1850*4882a593Smuzhiyun
1851*4882a593Smuzhiyun key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1852*4882a593Smuzhiyun key.type = BTRFS_ORPHAN_ITEM_KEY;
1853*4882a593Smuzhiyun key.offset = objectid;
1854*4882a593Smuzhiyun
1855*4882a593Smuzhiyun ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1856*4882a593Smuzhiyun
1857*4882a593Smuzhiyun btrfs_release_path(path);
1858*4882a593Smuzhiyun if (ret == 0) {
1859*4882a593Smuzhiyun if (!inode->i_nlink)
1860*4882a593Smuzhiyun set_nlink(inode, 1);
1861*4882a593Smuzhiyun else
1862*4882a593Smuzhiyun inc_nlink(inode);
1863*4882a593Smuzhiyun ret = btrfs_update_inode(trans, root, inode);
1864*4882a593Smuzhiyun } else if (ret == -EEXIST) {
1865*4882a593Smuzhiyun ret = 0;
1866*4882a593Smuzhiyun }
1867*4882a593Smuzhiyun iput(inode);
1868*4882a593Smuzhiyun
1869*4882a593Smuzhiyun return ret;
1870*4882a593Smuzhiyun }
1871*4882a593Smuzhiyun
1872*4882a593Smuzhiyun /*
1873*4882a593Smuzhiyun * when replaying the log for a directory, we only insert names
1874*4882a593Smuzhiyun * for inodes that actually exist. This means an fsync on a directory
1875*4882a593Smuzhiyun * does not implicitly fsync all the new files in it
1876*4882a593Smuzhiyun */
insert_one_name(struct btrfs_trans_handle * trans,struct btrfs_root * root,u64 dirid,u64 index,char * name,int name_len,struct btrfs_key * location)1877*4882a593Smuzhiyun static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1878*4882a593Smuzhiyun struct btrfs_root *root,
1879*4882a593Smuzhiyun u64 dirid, u64 index,
1880*4882a593Smuzhiyun char *name, int name_len,
1881*4882a593Smuzhiyun struct btrfs_key *location)
1882*4882a593Smuzhiyun {
1883*4882a593Smuzhiyun struct inode *inode;
1884*4882a593Smuzhiyun struct inode *dir;
1885*4882a593Smuzhiyun int ret;
1886*4882a593Smuzhiyun
1887*4882a593Smuzhiyun inode = read_one_inode(root, location->objectid);
1888*4882a593Smuzhiyun if (!inode)
1889*4882a593Smuzhiyun return -ENOENT;
1890*4882a593Smuzhiyun
1891*4882a593Smuzhiyun dir = read_one_inode(root, dirid);
1892*4882a593Smuzhiyun if (!dir) {
1893*4882a593Smuzhiyun iput(inode);
1894*4882a593Smuzhiyun return -EIO;
1895*4882a593Smuzhiyun }
1896*4882a593Smuzhiyun
1897*4882a593Smuzhiyun ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1898*4882a593Smuzhiyun name_len, 1, index);
1899*4882a593Smuzhiyun
1900*4882a593Smuzhiyun /* FIXME, put inode into FIXUP list */
1901*4882a593Smuzhiyun
1902*4882a593Smuzhiyun iput(inode);
1903*4882a593Smuzhiyun iput(dir);
1904*4882a593Smuzhiyun return ret;
1905*4882a593Smuzhiyun }
1906*4882a593Smuzhiyun
1907*4882a593Smuzhiyun /*
1908*4882a593Smuzhiyun * take a single entry in a log directory item and replay it into
1909*4882a593Smuzhiyun * the subvolume.
1910*4882a593Smuzhiyun *
1911*4882a593Smuzhiyun * if a conflicting item exists in the subdirectory already,
1912*4882a593Smuzhiyun * the inode it points to is unlinked and put into the link count
1913*4882a593Smuzhiyun * fix up tree.
1914*4882a593Smuzhiyun *
1915*4882a593Smuzhiyun * If a name from the log points to a file or directory that does
1916*4882a593Smuzhiyun * not exist in the FS, it is skipped. fsyncs on directories
1917*4882a593Smuzhiyun * do not force down inodes inside that directory, just changes to the
1918*4882a593Smuzhiyun * names or unlinks in a directory.
1919*4882a593Smuzhiyun *
1920*4882a593Smuzhiyun * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1921*4882a593Smuzhiyun * non-existing inode) and 1 if the name was replayed.
1922*4882a593Smuzhiyun */
replay_one_name(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct extent_buffer * eb,struct btrfs_dir_item * di,struct btrfs_key * key)1923*4882a593Smuzhiyun static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1924*4882a593Smuzhiyun struct btrfs_root *root,
1925*4882a593Smuzhiyun struct btrfs_path *path,
1926*4882a593Smuzhiyun struct extent_buffer *eb,
1927*4882a593Smuzhiyun struct btrfs_dir_item *di,
1928*4882a593Smuzhiyun struct btrfs_key *key)
1929*4882a593Smuzhiyun {
1930*4882a593Smuzhiyun char *name;
1931*4882a593Smuzhiyun int name_len;
1932*4882a593Smuzhiyun struct btrfs_dir_item *dst_di;
1933*4882a593Smuzhiyun struct btrfs_key found_key;
1934*4882a593Smuzhiyun struct btrfs_key log_key;
1935*4882a593Smuzhiyun struct inode *dir;
1936*4882a593Smuzhiyun u8 log_type;
1937*4882a593Smuzhiyun bool exists;
1938*4882a593Smuzhiyun int ret;
1939*4882a593Smuzhiyun bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
1940*4882a593Smuzhiyun bool name_added = false;
1941*4882a593Smuzhiyun
1942*4882a593Smuzhiyun dir = read_one_inode(root, key->objectid);
1943*4882a593Smuzhiyun if (!dir)
1944*4882a593Smuzhiyun return -EIO;
1945*4882a593Smuzhiyun
1946*4882a593Smuzhiyun name_len = btrfs_dir_name_len(eb, di);
1947*4882a593Smuzhiyun name = kmalloc(name_len, GFP_NOFS);
1948*4882a593Smuzhiyun if (!name) {
1949*4882a593Smuzhiyun ret = -ENOMEM;
1950*4882a593Smuzhiyun goto out;
1951*4882a593Smuzhiyun }
1952*4882a593Smuzhiyun
1953*4882a593Smuzhiyun log_type = btrfs_dir_type(eb, di);
1954*4882a593Smuzhiyun read_extent_buffer(eb, name, (unsigned long)(di + 1),
1955*4882a593Smuzhiyun name_len);
1956*4882a593Smuzhiyun
1957*4882a593Smuzhiyun btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1958*4882a593Smuzhiyun ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1959*4882a593Smuzhiyun btrfs_release_path(path);
1960*4882a593Smuzhiyun if (ret < 0)
1961*4882a593Smuzhiyun goto out;
1962*4882a593Smuzhiyun exists = (ret == 0);
1963*4882a593Smuzhiyun ret = 0;
1964*4882a593Smuzhiyun
1965*4882a593Smuzhiyun if (key->type == BTRFS_DIR_ITEM_KEY) {
1966*4882a593Smuzhiyun dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1967*4882a593Smuzhiyun name, name_len, 1);
1968*4882a593Smuzhiyun } else if (key->type == BTRFS_DIR_INDEX_KEY) {
1969*4882a593Smuzhiyun dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1970*4882a593Smuzhiyun key->objectid,
1971*4882a593Smuzhiyun key->offset, name,
1972*4882a593Smuzhiyun name_len, 1);
1973*4882a593Smuzhiyun } else {
1974*4882a593Smuzhiyun /* Corruption */
1975*4882a593Smuzhiyun ret = -EINVAL;
1976*4882a593Smuzhiyun goto out;
1977*4882a593Smuzhiyun }
1978*4882a593Smuzhiyun
1979*4882a593Smuzhiyun if (dst_di == ERR_PTR(-ENOENT))
1980*4882a593Smuzhiyun dst_di = NULL;
1981*4882a593Smuzhiyun
1982*4882a593Smuzhiyun if (IS_ERR(dst_di)) {
1983*4882a593Smuzhiyun ret = PTR_ERR(dst_di);
1984*4882a593Smuzhiyun goto out;
1985*4882a593Smuzhiyun } else if (!dst_di) {
1986*4882a593Smuzhiyun /* we need a sequence number to insert, so we only
1987*4882a593Smuzhiyun * do inserts for the BTRFS_DIR_INDEX_KEY types
1988*4882a593Smuzhiyun */
1989*4882a593Smuzhiyun if (key->type != BTRFS_DIR_INDEX_KEY)
1990*4882a593Smuzhiyun goto out;
1991*4882a593Smuzhiyun goto insert;
1992*4882a593Smuzhiyun }
1993*4882a593Smuzhiyun
1994*4882a593Smuzhiyun btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1995*4882a593Smuzhiyun /* the existing item matches the logged item */
1996*4882a593Smuzhiyun if (found_key.objectid == log_key.objectid &&
1997*4882a593Smuzhiyun found_key.type == log_key.type &&
1998*4882a593Smuzhiyun found_key.offset == log_key.offset &&
1999*4882a593Smuzhiyun btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
2000*4882a593Smuzhiyun update_size = false;
2001*4882a593Smuzhiyun goto out;
2002*4882a593Smuzhiyun }
2003*4882a593Smuzhiyun
2004*4882a593Smuzhiyun /*
2005*4882a593Smuzhiyun * don't drop the conflicting directory entry if the inode
2006*4882a593Smuzhiyun * for the new entry doesn't exist
2007*4882a593Smuzhiyun */
2008*4882a593Smuzhiyun if (!exists)
2009*4882a593Smuzhiyun goto out;
2010*4882a593Smuzhiyun
2011*4882a593Smuzhiyun ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
2012*4882a593Smuzhiyun if (ret)
2013*4882a593Smuzhiyun goto out;
2014*4882a593Smuzhiyun
2015*4882a593Smuzhiyun if (key->type == BTRFS_DIR_INDEX_KEY)
2016*4882a593Smuzhiyun goto insert;
2017*4882a593Smuzhiyun out:
2018*4882a593Smuzhiyun btrfs_release_path(path);
2019*4882a593Smuzhiyun if (!ret && update_size) {
2020*4882a593Smuzhiyun btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
2021*4882a593Smuzhiyun ret = btrfs_update_inode(trans, root, dir);
2022*4882a593Smuzhiyun }
2023*4882a593Smuzhiyun kfree(name);
2024*4882a593Smuzhiyun iput(dir);
2025*4882a593Smuzhiyun if (!ret && name_added)
2026*4882a593Smuzhiyun ret = 1;
2027*4882a593Smuzhiyun return ret;
2028*4882a593Smuzhiyun
2029*4882a593Smuzhiyun insert:
2030*4882a593Smuzhiyun /*
2031*4882a593Smuzhiyun * Check if the inode reference exists in the log for the given name,
2032*4882a593Smuzhiyun * inode and parent inode
2033*4882a593Smuzhiyun */
2034*4882a593Smuzhiyun found_key.objectid = log_key.objectid;
2035*4882a593Smuzhiyun found_key.type = BTRFS_INODE_REF_KEY;
2036*4882a593Smuzhiyun found_key.offset = key->objectid;
2037*4882a593Smuzhiyun ret = backref_in_log(root->log_root, &found_key, 0, name, name_len);
2038*4882a593Smuzhiyun if (ret < 0) {
2039*4882a593Smuzhiyun goto out;
2040*4882a593Smuzhiyun } else if (ret) {
2041*4882a593Smuzhiyun /* The dentry will be added later. */
2042*4882a593Smuzhiyun ret = 0;
2043*4882a593Smuzhiyun update_size = false;
2044*4882a593Smuzhiyun goto out;
2045*4882a593Smuzhiyun }
2046*4882a593Smuzhiyun
2047*4882a593Smuzhiyun found_key.objectid = log_key.objectid;
2048*4882a593Smuzhiyun found_key.type = BTRFS_INODE_EXTREF_KEY;
2049*4882a593Smuzhiyun found_key.offset = key->objectid;
2050*4882a593Smuzhiyun ret = backref_in_log(root->log_root, &found_key, key->objectid, name,
2051*4882a593Smuzhiyun name_len);
2052*4882a593Smuzhiyun if (ret < 0) {
2053*4882a593Smuzhiyun goto out;
2054*4882a593Smuzhiyun } else if (ret) {
2055*4882a593Smuzhiyun /* The dentry will be added later. */
2056*4882a593Smuzhiyun ret = 0;
2057*4882a593Smuzhiyun update_size = false;
2058*4882a593Smuzhiyun goto out;
2059*4882a593Smuzhiyun }
2060*4882a593Smuzhiyun btrfs_release_path(path);
2061*4882a593Smuzhiyun ret = insert_one_name(trans, root, key->objectid, key->offset,
2062*4882a593Smuzhiyun name, name_len, &log_key);
2063*4882a593Smuzhiyun if (ret && ret != -ENOENT && ret != -EEXIST)
2064*4882a593Smuzhiyun goto out;
2065*4882a593Smuzhiyun if (!ret)
2066*4882a593Smuzhiyun name_added = true;
2067*4882a593Smuzhiyun update_size = false;
2068*4882a593Smuzhiyun ret = 0;
2069*4882a593Smuzhiyun goto out;
2070*4882a593Smuzhiyun }
2071*4882a593Smuzhiyun
2072*4882a593Smuzhiyun /*
2073*4882a593Smuzhiyun * find all the names in a directory item and reconcile them into
2074*4882a593Smuzhiyun * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
2075*4882a593Smuzhiyun * one name in a directory item, but the same code gets used for
2076*4882a593Smuzhiyun * both directory index types
2077*4882a593Smuzhiyun */
replay_one_dir_item(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct extent_buffer * eb,int slot,struct btrfs_key * key)2078*4882a593Smuzhiyun static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2079*4882a593Smuzhiyun struct btrfs_root *root,
2080*4882a593Smuzhiyun struct btrfs_path *path,
2081*4882a593Smuzhiyun struct extent_buffer *eb, int slot,
2082*4882a593Smuzhiyun struct btrfs_key *key)
2083*4882a593Smuzhiyun {
2084*4882a593Smuzhiyun int ret = 0;
2085*4882a593Smuzhiyun u32 item_size = btrfs_item_size_nr(eb, slot);
2086*4882a593Smuzhiyun struct btrfs_dir_item *di;
2087*4882a593Smuzhiyun int name_len;
2088*4882a593Smuzhiyun unsigned long ptr;
2089*4882a593Smuzhiyun unsigned long ptr_end;
2090*4882a593Smuzhiyun struct btrfs_path *fixup_path = NULL;
2091*4882a593Smuzhiyun
2092*4882a593Smuzhiyun ptr = btrfs_item_ptr_offset(eb, slot);
2093*4882a593Smuzhiyun ptr_end = ptr + item_size;
2094*4882a593Smuzhiyun while (ptr < ptr_end) {
2095*4882a593Smuzhiyun di = (struct btrfs_dir_item *)ptr;
2096*4882a593Smuzhiyun name_len = btrfs_dir_name_len(eb, di);
2097*4882a593Smuzhiyun ret = replay_one_name(trans, root, path, eb, di, key);
2098*4882a593Smuzhiyun if (ret < 0)
2099*4882a593Smuzhiyun break;
2100*4882a593Smuzhiyun ptr = (unsigned long)(di + 1);
2101*4882a593Smuzhiyun ptr += name_len;
2102*4882a593Smuzhiyun
2103*4882a593Smuzhiyun /*
2104*4882a593Smuzhiyun * If this entry refers to a non-directory (directories can not
2105*4882a593Smuzhiyun * have a link count > 1) and it was added in the transaction
2106*4882a593Smuzhiyun * that was not committed, make sure we fixup the link count of
2107*4882a593Smuzhiyun * the inode it the entry points to. Otherwise something like
2108*4882a593Smuzhiyun * the following would result in a directory pointing to an
2109*4882a593Smuzhiyun * inode with a wrong link that does not account for this dir
2110*4882a593Smuzhiyun * entry:
2111*4882a593Smuzhiyun *
2112*4882a593Smuzhiyun * mkdir testdir
2113*4882a593Smuzhiyun * touch testdir/foo
2114*4882a593Smuzhiyun * touch testdir/bar
2115*4882a593Smuzhiyun * sync
2116*4882a593Smuzhiyun *
2117*4882a593Smuzhiyun * ln testdir/bar testdir/bar_link
2118*4882a593Smuzhiyun * ln testdir/foo testdir/foo_link
2119*4882a593Smuzhiyun * xfs_io -c "fsync" testdir/bar
2120*4882a593Smuzhiyun *
2121*4882a593Smuzhiyun * <power failure>
2122*4882a593Smuzhiyun *
2123*4882a593Smuzhiyun * mount fs, log replay happens
2124*4882a593Smuzhiyun *
2125*4882a593Smuzhiyun * File foo would remain with a link count of 1 when it has two
2126*4882a593Smuzhiyun * entries pointing to it in the directory testdir. This would
2127*4882a593Smuzhiyun * make it impossible to ever delete the parent directory has
2128*4882a593Smuzhiyun * it would result in stale dentries that can never be deleted.
2129*4882a593Smuzhiyun */
2130*4882a593Smuzhiyun if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2131*4882a593Smuzhiyun struct btrfs_key di_key;
2132*4882a593Smuzhiyun
2133*4882a593Smuzhiyun if (!fixup_path) {
2134*4882a593Smuzhiyun fixup_path = btrfs_alloc_path();
2135*4882a593Smuzhiyun if (!fixup_path) {
2136*4882a593Smuzhiyun ret = -ENOMEM;
2137*4882a593Smuzhiyun break;
2138*4882a593Smuzhiyun }
2139*4882a593Smuzhiyun }
2140*4882a593Smuzhiyun
2141*4882a593Smuzhiyun btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2142*4882a593Smuzhiyun ret = link_to_fixup_dir(trans, root, fixup_path,
2143*4882a593Smuzhiyun di_key.objectid);
2144*4882a593Smuzhiyun if (ret)
2145*4882a593Smuzhiyun break;
2146*4882a593Smuzhiyun }
2147*4882a593Smuzhiyun ret = 0;
2148*4882a593Smuzhiyun }
2149*4882a593Smuzhiyun btrfs_free_path(fixup_path);
2150*4882a593Smuzhiyun return ret;
2151*4882a593Smuzhiyun }
2152*4882a593Smuzhiyun
2153*4882a593Smuzhiyun /*
2154*4882a593Smuzhiyun * directory replay has two parts. There are the standard directory
2155*4882a593Smuzhiyun * items in the log copied from the subvolume, and range items
2156*4882a593Smuzhiyun * created in the log while the subvolume was logged.
2157*4882a593Smuzhiyun *
2158*4882a593Smuzhiyun * The range items tell us which parts of the key space the log
2159*4882a593Smuzhiyun * is authoritative for. During replay, if a key in the subvolume
2160*4882a593Smuzhiyun * directory is in a logged range item, but not actually in the log
2161*4882a593Smuzhiyun * that means it was deleted from the directory before the fsync
2162*4882a593Smuzhiyun * and should be removed.
2163*4882a593Smuzhiyun */
find_dir_range(struct btrfs_root * root,struct btrfs_path * path,u64 dirid,int key_type,u64 * start_ret,u64 * end_ret)2164*4882a593Smuzhiyun static noinline int find_dir_range(struct btrfs_root *root,
2165*4882a593Smuzhiyun struct btrfs_path *path,
2166*4882a593Smuzhiyun u64 dirid, int key_type,
2167*4882a593Smuzhiyun u64 *start_ret, u64 *end_ret)
2168*4882a593Smuzhiyun {
2169*4882a593Smuzhiyun struct btrfs_key key;
2170*4882a593Smuzhiyun u64 found_end;
2171*4882a593Smuzhiyun struct btrfs_dir_log_item *item;
2172*4882a593Smuzhiyun int ret;
2173*4882a593Smuzhiyun int nritems;
2174*4882a593Smuzhiyun
2175*4882a593Smuzhiyun if (*start_ret == (u64)-1)
2176*4882a593Smuzhiyun return 1;
2177*4882a593Smuzhiyun
2178*4882a593Smuzhiyun key.objectid = dirid;
2179*4882a593Smuzhiyun key.type = key_type;
2180*4882a593Smuzhiyun key.offset = *start_ret;
2181*4882a593Smuzhiyun
2182*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2183*4882a593Smuzhiyun if (ret < 0)
2184*4882a593Smuzhiyun goto out;
2185*4882a593Smuzhiyun if (ret > 0) {
2186*4882a593Smuzhiyun if (path->slots[0] == 0)
2187*4882a593Smuzhiyun goto out;
2188*4882a593Smuzhiyun path->slots[0]--;
2189*4882a593Smuzhiyun }
2190*4882a593Smuzhiyun if (ret != 0)
2191*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2192*4882a593Smuzhiyun
2193*4882a593Smuzhiyun if (key.type != key_type || key.objectid != dirid) {
2194*4882a593Smuzhiyun ret = 1;
2195*4882a593Smuzhiyun goto next;
2196*4882a593Smuzhiyun }
2197*4882a593Smuzhiyun item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2198*4882a593Smuzhiyun struct btrfs_dir_log_item);
2199*4882a593Smuzhiyun found_end = btrfs_dir_log_end(path->nodes[0], item);
2200*4882a593Smuzhiyun
2201*4882a593Smuzhiyun if (*start_ret >= key.offset && *start_ret <= found_end) {
2202*4882a593Smuzhiyun ret = 0;
2203*4882a593Smuzhiyun *start_ret = key.offset;
2204*4882a593Smuzhiyun *end_ret = found_end;
2205*4882a593Smuzhiyun goto out;
2206*4882a593Smuzhiyun }
2207*4882a593Smuzhiyun ret = 1;
2208*4882a593Smuzhiyun next:
2209*4882a593Smuzhiyun /* check the next slot in the tree to see if it is a valid item */
2210*4882a593Smuzhiyun nritems = btrfs_header_nritems(path->nodes[0]);
2211*4882a593Smuzhiyun path->slots[0]++;
2212*4882a593Smuzhiyun if (path->slots[0] >= nritems) {
2213*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
2214*4882a593Smuzhiyun if (ret)
2215*4882a593Smuzhiyun goto out;
2216*4882a593Smuzhiyun }
2217*4882a593Smuzhiyun
2218*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2219*4882a593Smuzhiyun
2220*4882a593Smuzhiyun if (key.type != key_type || key.objectid != dirid) {
2221*4882a593Smuzhiyun ret = 1;
2222*4882a593Smuzhiyun goto out;
2223*4882a593Smuzhiyun }
2224*4882a593Smuzhiyun item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2225*4882a593Smuzhiyun struct btrfs_dir_log_item);
2226*4882a593Smuzhiyun found_end = btrfs_dir_log_end(path->nodes[0], item);
2227*4882a593Smuzhiyun *start_ret = key.offset;
2228*4882a593Smuzhiyun *end_ret = found_end;
2229*4882a593Smuzhiyun ret = 0;
2230*4882a593Smuzhiyun out:
2231*4882a593Smuzhiyun btrfs_release_path(path);
2232*4882a593Smuzhiyun return ret;
2233*4882a593Smuzhiyun }
2234*4882a593Smuzhiyun
2235*4882a593Smuzhiyun /*
2236*4882a593Smuzhiyun * this looks for a given directory item in the log. If the directory
2237*4882a593Smuzhiyun * item is not in the log, the item is removed and the inode it points
2238*4882a593Smuzhiyun * to is unlinked
2239*4882a593Smuzhiyun */
check_item_in_log(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_root * log,struct btrfs_path * path,struct btrfs_path * log_path,struct inode * dir,struct btrfs_key * dir_key)2240*4882a593Smuzhiyun static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2241*4882a593Smuzhiyun struct btrfs_root *root,
2242*4882a593Smuzhiyun struct btrfs_root *log,
2243*4882a593Smuzhiyun struct btrfs_path *path,
2244*4882a593Smuzhiyun struct btrfs_path *log_path,
2245*4882a593Smuzhiyun struct inode *dir,
2246*4882a593Smuzhiyun struct btrfs_key *dir_key)
2247*4882a593Smuzhiyun {
2248*4882a593Smuzhiyun int ret;
2249*4882a593Smuzhiyun struct extent_buffer *eb;
2250*4882a593Smuzhiyun int slot;
2251*4882a593Smuzhiyun u32 item_size;
2252*4882a593Smuzhiyun struct btrfs_dir_item *di;
2253*4882a593Smuzhiyun struct btrfs_dir_item *log_di;
2254*4882a593Smuzhiyun int name_len;
2255*4882a593Smuzhiyun unsigned long ptr;
2256*4882a593Smuzhiyun unsigned long ptr_end;
2257*4882a593Smuzhiyun char *name;
2258*4882a593Smuzhiyun struct inode *inode;
2259*4882a593Smuzhiyun struct btrfs_key location;
2260*4882a593Smuzhiyun
2261*4882a593Smuzhiyun again:
2262*4882a593Smuzhiyun eb = path->nodes[0];
2263*4882a593Smuzhiyun slot = path->slots[0];
2264*4882a593Smuzhiyun item_size = btrfs_item_size_nr(eb, slot);
2265*4882a593Smuzhiyun ptr = btrfs_item_ptr_offset(eb, slot);
2266*4882a593Smuzhiyun ptr_end = ptr + item_size;
2267*4882a593Smuzhiyun while (ptr < ptr_end) {
2268*4882a593Smuzhiyun di = (struct btrfs_dir_item *)ptr;
2269*4882a593Smuzhiyun name_len = btrfs_dir_name_len(eb, di);
2270*4882a593Smuzhiyun name = kmalloc(name_len, GFP_NOFS);
2271*4882a593Smuzhiyun if (!name) {
2272*4882a593Smuzhiyun ret = -ENOMEM;
2273*4882a593Smuzhiyun goto out;
2274*4882a593Smuzhiyun }
2275*4882a593Smuzhiyun read_extent_buffer(eb, name, (unsigned long)(di + 1),
2276*4882a593Smuzhiyun name_len);
2277*4882a593Smuzhiyun log_di = NULL;
2278*4882a593Smuzhiyun if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
2279*4882a593Smuzhiyun log_di = btrfs_lookup_dir_item(trans, log, log_path,
2280*4882a593Smuzhiyun dir_key->objectid,
2281*4882a593Smuzhiyun name, name_len, 0);
2282*4882a593Smuzhiyun } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
2283*4882a593Smuzhiyun log_di = btrfs_lookup_dir_index_item(trans, log,
2284*4882a593Smuzhiyun log_path,
2285*4882a593Smuzhiyun dir_key->objectid,
2286*4882a593Smuzhiyun dir_key->offset,
2287*4882a593Smuzhiyun name, name_len, 0);
2288*4882a593Smuzhiyun }
2289*4882a593Smuzhiyun if (!log_di || log_di == ERR_PTR(-ENOENT)) {
2290*4882a593Smuzhiyun btrfs_dir_item_key_to_cpu(eb, di, &location);
2291*4882a593Smuzhiyun btrfs_release_path(path);
2292*4882a593Smuzhiyun btrfs_release_path(log_path);
2293*4882a593Smuzhiyun inode = read_one_inode(root, location.objectid);
2294*4882a593Smuzhiyun if (!inode) {
2295*4882a593Smuzhiyun kfree(name);
2296*4882a593Smuzhiyun return -EIO;
2297*4882a593Smuzhiyun }
2298*4882a593Smuzhiyun
2299*4882a593Smuzhiyun ret = link_to_fixup_dir(trans, root,
2300*4882a593Smuzhiyun path, location.objectid);
2301*4882a593Smuzhiyun if (ret) {
2302*4882a593Smuzhiyun kfree(name);
2303*4882a593Smuzhiyun iput(inode);
2304*4882a593Smuzhiyun goto out;
2305*4882a593Smuzhiyun }
2306*4882a593Smuzhiyun
2307*4882a593Smuzhiyun inc_nlink(inode);
2308*4882a593Smuzhiyun ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2309*4882a593Smuzhiyun BTRFS_I(inode), name, name_len);
2310*4882a593Smuzhiyun if (!ret)
2311*4882a593Smuzhiyun ret = btrfs_run_delayed_items(trans);
2312*4882a593Smuzhiyun kfree(name);
2313*4882a593Smuzhiyun iput(inode);
2314*4882a593Smuzhiyun if (ret)
2315*4882a593Smuzhiyun goto out;
2316*4882a593Smuzhiyun
2317*4882a593Smuzhiyun /* there might still be more names under this key
2318*4882a593Smuzhiyun * check and repeat if required
2319*4882a593Smuzhiyun */
2320*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, dir_key, path,
2321*4882a593Smuzhiyun 0, 0);
2322*4882a593Smuzhiyun if (ret == 0)
2323*4882a593Smuzhiyun goto again;
2324*4882a593Smuzhiyun ret = 0;
2325*4882a593Smuzhiyun goto out;
2326*4882a593Smuzhiyun } else if (IS_ERR(log_di)) {
2327*4882a593Smuzhiyun kfree(name);
2328*4882a593Smuzhiyun return PTR_ERR(log_di);
2329*4882a593Smuzhiyun }
2330*4882a593Smuzhiyun btrfs_release_path(log_path);
2331*4882a593Smuzhiyun kfree(name);
2332*4882a593Smuzhiyun
2333*4882a593Smuzhiyun ptr = (unsigned long)(di + 1);
2334*4882a593Smuzhiyun ptr += name_len;
2335*4882a593Smuzhiyun }
2336*4882a593Smuzhiyun ret = 0;
2337*4882a593Smuzhiyun out:
2338*4882a593Smuzhiyun btrfs_release_path(path);
2339*4882a593Smuzhiyun btrfs_release_path(log_path);
2340*4882a593Smuzhiyun return ret;
2341*4882a593Smuzhiyun }
2342*4882a593Smuzhiyun
replay_xattr_deletes(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_root * log,struct btrfs_path * path,const u64 ino)2343*4882a593Smuzhiyun static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2344*4882a593Smuzhiyun struct btrfs_root *root,
2345*4882a593Smuzhiyun struct btrfs_root *log,
2346*4882a593Smuzhiyun struct btrfs_path *path,
2347*4882a593Smuzhiyun const u64 ino)
2348*4882a593Smuzhiyun {
2349*4882a593Smuzhiyun struct btrfs_key search_key;
2350*4882a593Smuzhiyun struct btrfs_path *log_path;
2351*4882a593Smuzhiyun int i;
2352*4882a593Smuzhiyun int nritems;
2353*4882a593Smuzhiyun int ret;
2354*4882a593Smuzhiyun
2355*4882a593Smuzhiyun log_path = btrfs_alloc_path();
2356*4882a593Smuzhiyun if (!log_path)
2357*4882a593Smuzhiyun return -ENOMEM;
2358*4882a593Smuzhiyun
2359*4882a593Smuzhiyun search_key.objectid = ino;
2360*4882a593Smuzhiyun search_key.type = BTRFS_XATTR_ITEM_KEY;
2361*4882a593Smuzhiyun search_key.offset = 0;
2362*4882a593Smuzhiyun again:
2363*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2364*4882a593Smuzhiyun if (ret < 0)
2365*4882a593Smuzhiyun goto out;
2366*4882a593Smuzhiyun process_leaf:
2367*4882a593Smuzhiyun nritems = btrfs_header_nritems(path->nodes[0]);
2368*4882a593Smuzhiyun for (i = path->slots[0]; i < nritems; i++) {
2369*4882a593Smuzhiyun struct btrfs_key key;
2370*4882a593Smuzhiyun struct btrfs_dir_item *di;
2371*4882a593Smuzhiyun struct btrfs_dir_item *log_di;
2372*4882a593Smuzhiyun u32 total_size;
2373*4882a593Smuzhiyun u32 cur;
2374*4882a593Smuzhiyun
2375*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2376*4882a593Smuzhiyun if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2377*4882a593Smuzhiyun ret = 0;
2378*4882a593Smuzhiyun goto out;
2379*4882a593Smuzhiyun }
2380*4882a593Smuzhiyun
2381*4882a593Smuzhiyun di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2382*4882a593Smuzhiyun total_size = btrfs_item_size_nr(path->nodes[0], i);
2383*4882a593Smuzhiyun cur = 0;
2384*4882a593Smuzhiyun while (cur < total_size) {
2385*4882a593Smuzhiyun u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2386*4882a593Smuzhiyun u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2387*4882a593Smuzhiyun u32 this_len = sizeof(*di) + name_len + data_len;
2388*4882a593Smuzhiyun char *name;
2389*4882a593Smuzhiyun
2390*4882a593Smuzhiyun name = kmalloc(name_len, GFP_NOFS);
2391*4882a593Smuzhiyun if (!name) {
2392*4882a593Smuzhiyun ret = -ENOMEM;
2393*4882a593Smuzhiyun goto out;
2394*4882a593Smuzhiyun }
2395*4882a593Smuzhiyun read_extent_buffer(path->nodes[0], name,
2396*4882a593Smuzhiyun (unsigned long)(di + 1), name_len);
2397*4882a593Smuzhiyun
2398*4882a593Smuzhiyun log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2399*4882a593Smuzhiyun name, name_len, 0);
2400*4882a593Smuzhiyun btrfs_release_path(log_path);
2401*4882a593Smuzhiyun if (!log_di) {
2402*4882a593Smuzhiyun /* Doesn't exist in log tree, so delete it. */
2403*4882a593Smuzhiyun btrfs_release_path(path);
2404*4882a593Smuzhiyun di = btrfs_lookup_xattr(trans, root, path, ino,
2405*4882a593Smuzhiyun name, name_len, -1);
2406*4882a593Smuzhiyun kfree(name);
2407*4882a593Smuzhiyun if (IS_ERR(di)) {
2408*4882a593Smuzhiyun ret = PTR_ERR(di);
2409*4882a593Smuzhiyun goto out;
2410*4882a593Smuzhiyun }
2411*4882a593Smuzhiyun ASSERT(di);
2412*4882a593Smuzhiyun ret = btrfs_delete_one_dir_name(trans, root,
2413*4882a593Smuzhiyun path, di);
2414*4882a593Smuzhiyun if (ret)
2415*4882a593Smuzhiyun goto out;
2416*4882a593Smuzhiyun btrfs_release_path(path);
2417*4882a593Smuzhiyun search_key = key;
2418*4882a593Smuzhiyun goto again;
2419*4882a593Smuzhiyun }
2420*4882a593Smuzhiyun kfree(name);
2421*4882a593Smuzhiyun if (IS_ERR(log_di)) {
2422*4882a593Smuzhiyun ret = PTR_ERR(log_di);
2423*4882a593Smuzhiyun goto out;
2424*4882a593Smuzhiyun }
2425*4882a593Smuzhiyun cur += this_len;
2426*4882a593Smuzhiyun di = (struct btrfs_dir_item *)((char *)di + this_len);
2427*4882a593Smuzhiyun }
2428*4882a593Smuzhiyun }
2429*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
2430*4882a593Smuzhiyun if (ret > 0)
2431*4882a593Smuzhiyun ret = 0;
2432*4882a593Smuzhiyun else if (ret == 0)
2433*4882a593Smuzhiyun goto process_leaf;
2434*4882a593Smuzhiyun out:
2435*4882a593Smuzhiyun btrfs_free_path(log_path);
2436*4882a593Smuzhiyun btrfs_release_path(path);
2437*4882a593Smuzhiyun return ret;
2438*4882a593Smuzhiyun }
2439*4882a593Smuzhiyun
2440*4882a593Smuzhiyun
2441*4882a593Smuzhiyun /*
2442*4882a593Smuzhiyun * deletion replay happens before we copy any new directory items
2443*4882a593Smuzhiyun * out of the log or out of backreferences from inodes. It
2444*4882a593Smuzhiyun * scans the log to find ranges of keys that log is authoritative for,
2445*4882a593Smuzhiyun * and then scans the directory to find items in those ranges that are
2446*4882a593Smuzhiyun * not present in the log.
2447*4882a593Smuzhiyun *
2448*4882a593Smuzhiyun * Anything we don't find in the log is unlinked and removed from the
2449*4882a593Smuzhiyun * directory.
2450*4882a593Smuzhiyun */
replay_dir_deletes(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_root * log,struct btrfs_path * path,u64 dirid,int del_all)2451*4882a593Smuzhiyun static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2452*4882a593Smuzhiyun struct btrfs_root *root,
2453*4882a593Smuzhiyun struct btrfs_root *log,
2454*4882a593Smuzhiyun struct btrfs_path *path,
2455*4882a593Smuzhiyun u64 dirid, int del_all)
2456*4882a593Smuzhiyun {
2457*4882a593Smuzhiyun u64 range_start;
2458*4882a593Smuzhiyun u64 range_end;
2459*4882a593Smuzhiyun int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2460*4882a593Smuzhiyun int ret = 0;
2461*4882a593Smuzhiyun struct btrfs_key dir_key;
2462*4882a593Smuzhiyun struct btrfs_key found_key;
2463*4882a593Smuzhiyun struct btrfs_path *log_path;
2464*4882a593Smuzhiyun struct inode *dir;
2465*4882a593Smuzhiyun
2466*4882a593Smuzhiyun dir_key.objectid = dirid;
2467*4882a593Smuzhiyun dir_key.type = BTRFS_DIR_ITEM_KEY;
2468*4882a593Smuzhiyun log_path = btrfs_alloc_path();
2469*4882a593Smuzhiyun if (!log_path)
2470*4882a593Smuzhiyun return -ENOMEM;
2471*4882a593Smuzhiyun
2472*4882a593Smuzhiyun dir = read_one_inode(root, dirid);
2473*4882a593Smuzhiyun /* it isn't an error if the inode isn't there, that can happen
2474*4882a593Smuzhiyun * because we replay the deletes before we copy in the inode item
2475*4882a593Smuzhiyun * from the log
2476*4882a593Smuzhiyun */
2477*4882a593Smuzhiyun if (!dir) {
2478*4882a593Smuzhiyun btrfs_free_path(log_path);
2479*4882a593Smuzhiyun return 0;
2480*4882a593Smuzhiyun }
2481*4882a593Smuzhiyun again:
2482*4882a593Smuzhiyun range_start = 0;
2483*4882a593Smuzhiyun range_end = 0;
2484*4882a593Smuzhiyun while (1) {
2485*4882a593Smuzhiyun if (del_all)
2486*4882a593Smuzhiyun range_end = (u64)-1;
2487*4882a593Smuzhiyun else {
2488*4882a593Smuzhiyun ret = find_dir_range(log, path, dirid, key_type,
2489*4882a593Smuzhiyun &range_start, &range_end);
2490*4882a593Smuzhiyun if (ret < 0)
2491*4882a593Smuzhiyun goto out;
2492*4882a593Smuzhiyun else if (ret > 0)
2493*4882a593Smuzhiyun break;
2494*4882a593Smuzhiyun }
2495*4882a593Smuzhiyun
2496*4882a593Smuzhiyun dir_key.offset = range_start;
2497*4882a593Smuzhiyun while (1) {
2498*4882a593Smuzhiyun int nritems;
2499*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &dir_key, path,
2500*4882a593Smuzhiyun 0, 0);
2501*4882a593Smuzhiyun if (ret < 0)
2502*4882a593Smuzhiyun goto out;
2503*4882a593Smuzhiyun
2504*4882a593Smuzhiyun nritems = btrfs_header_nritems(path->nodes[0]);
2505*4882a593Smuzhiyun if (path->slots[0] >= nritems) {
2506*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
2507*4882a593Smuzhiyun if (ret == 1)
2508*4882a593Smuzhiyun break;
2509*4882a593Smuzhiyun else if (ret < 0)
2510*4882a593Smuzhiyun goto out;
2511*4882a593Smuzhiyun }
2512*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2513*4882a593Smuzhiyun path->slots[0]);
2514*4882a593Smuzhiyun if (found_key.objectid != dirid ||
2515*4882a593Smuzhiyun found_key.type != dir_key.type)
2516*4882a593Smuzhiyun goto next_type;
2517*4882a593Smuzhiyun
2518*4882a593Smuzhiyun if (found_key.offset > range_end)
2519*4882a593Smuzhiyun break;
2520*4882a593Smuzhiyun
2521*4882a593Smuzhiyun ret = check_item_in_log(trans, root, log, path,
2522*4882a593Smuzhiyun log_path, dir,
2523*4882a593Smuzhiyun &found_key);
2524*4882a593Smuzhiyun if (ret)
2525*4882a593Smuzhiyun goto out;
2526*4882a593Smuzhiyun if (found_key.offset == (u64)-1)
2527*4882a593Smuzhiyun break;
2528*4882a593Smuzhiyun dir_key.offset = found_key.offset + 1;
2529*4882a593Smuzhiyun }
2530*4882a593Smuzhiyun btrfs_release_path(path);
2531*4882a593Smuzhiyun if (range_end == (u64)-1)
2532*4882a593Smuzhiyun break;
2533*4882a593Smuzhiyun range_start = range_end + 1;
2534*4882a593Smuzhiyun }
2535*4882a593Smuzhiyun
2536*4882a593Smuzhiyun next_type:
2537*4882a593Smuzhiyun ret = 0;
2538*4882a593Smuzhiyun if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2539*4882a593Smuzhiyun key_type = BTRFS_DIR_LOG_INDEX_KEY;
2540*4882a593Smuzhiyun dir_key.type = BTRFS_DIR_INDEX_KEY;
2541*4882a593Smuzhiyun btrfs_release_path(path);
2542*4882a593Smuzhiyun goto again;
2543*4882a593Smuzhiyun }
2544*4882a593Smuzhiyun out:
2545*4882a593Smuzhiyun btrfs_release_path(path);
2546*4882a593Smuzhiyun btrfs_free_path(log_path);
2547*4882a593Smuzhiyun iput(dir);
2548*4882a593Smuzhiyun return ret;
2549*4882a593Smuzhiyun }
2550*4882a593Smuzhiyun
2551*4882a593Smuzhiyun /*
2552*4882a593Smuzhiyun * the process_func used to replay items from the log tree. This
2553*4882a593Smuzhiyun * gets called in two different stages. The first stage just looks
2554*4882a593Smuzhiyun * for inodes and makes sure they are all copied into the subvolume.
2555*4882a593Smuzhiyun *
2556*4882a593Smuzhiyun * The second stage copies all the other item types from the log into
2557*4882a593Smuzhiyun * the subvolume. The two stage approach is slower, but gets rid of
2558*4882a593Smuzhiyun * lots of complexity around inodes referencing other inodes that exist
2559*4882a593Smuzhiyun * only in the log (references come from either directory items or inode
2560*4882a593Smuzhiyun * back refs).
2561*4882a593Smuzhiyun */
replay_one_buffer(struct btrfs_root * log,struct extent_buffer * eb,struct walk_control * wc,u64 gen,int level)2562*4882a593Smuzhiyun static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
2563*4882a593Smuzhiyun struct walk_control *wc, u64 gen, int level)
2564*4882a593Smuzhiyun {
2565*4882a593Smuzhiyun int nritems;
2566*4882a593Smuzhiyun struct btrfs_path *path;
2567*4882a593Smuzhiyun struct btrfs_root *root = wc->replay_dest;
2568*4882a593Smuzhiyun struct btrfs_key key;
2569*4882a593Smuzhiyun int i;
2570*4882a593Smuzhiyun int ret;
2571*4882a593Smuzhiyun
2572*4882a593Smuzhiyun ret = btrfs_read_buffer(eb, gen, level, NULL);
2573*4882a593Smuzhiyun if (ret)
2574*4882a593Smuzhiyun return ret;
2575*4882a593Smuzhiyun
2576*4882a593Smuzhiyun level = btrfs_header_level(eb);
2577*4882a593Smuzhiyun
2578*4882a593Smuzhiyun if (level != 0)
2579*4882a593Smuzhiyun return 0;
2580*4882a593Smuzhiyun
2581*4882a593Smuzhiyun path = btrfs_alloc_path();
2582*4882a593Smuzhiyun if (!path)
2583*4882a593Smuzhiyun return -ENOMEM;
2584*4882a593Smuzhiyun
2585*4882a593Smuzhiyun nritems = btrfs_header_nritems(eb);
2586*4882a593Smuzhiyun for (i = 0; i < nritems; i++) {
2587*4882a593Smuzhiyun btrfs_item_key_to_cpu(eb, &key, i);
2588*4882a593Smuzhiyun
2589*4882a593Smuzhiyun /* inode keys are done during the first stage */
2590*4882a593Smuzhiyun if (key.type == BTRFS_INODE_ITEM_KEY &&
2591*4882a593Smuzhiyun wc->stage == LOG_WALK_REPLAY_INODES) {
2592*4882a593Smuzhiyun struct btrfs_inode_item *inode_item;
2593*4882a593Smuzhiyun u32 mode;
2594*4882a593Smuzhiyun
2595*4882a593Smuzhiyun inode_item = btrfs_item_ptr(eb, i,
2596*4882a593Smuzhiyun struct btrfs_inode_item);
2597*4882a593Smuzhiyun /*
2598*4882a593Smuzhiyun * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2599*4882a593Smuzhiyun * and never got linked before the fsync, skip it, as
2600*4882a593Smuzhiyun * replaying it is pointless since it would be deleted
2601*4882a593Smuzhiyun * later. We skip logging tmpfiles, but it's always
2602*4882a593Smuzhiyun * possible we are replaying a log created with a kernel
2603*4882a593Smuzhiyun * that used to log tmpfiles.
2604*4882a593Smuzhiyun */
2605*4882a593Smuzhiyun if (btrfs_inode_nlink(eb, inode_item) == 0) {
2606*4882a593Smuzhiyun wc->ignore_cur_inode = true;
2607*4882a593Smuzhiyun continue;
2608*4882a593Smuzhiyun } else {
2609*4882a593Smuzhiyun wc->ignore_cur_inode = false;
2610*4882a593Smuzhiyun }
2611*4882a593Smuzhiyun ret = replay_xattr_deletes(wc->trans, root, log,
2612*4882a593Smuzhiyun path, key.objectid);
2613*4882a593Smuzhiyun if (ret)
2614*4882a593Smuzhiyun break;
2615*4882a593Smuzhiyun mode = btrfs_inode_mode(eb, inode_item);
2616*4882a593Smuzhiyun if (S_ISDIR(mode)) {
2617*4882a593Smuzhiyun ret = replay_dir_deletes(wc->trans,
2618*4882a593Smuzhiyun root, log, path, key.objectid, 0);
2619*4882a593Smuzhiyun if (ret)
2620*4882a593Smuzhiyun break;
2621*4882a593Smuzhiyun }
2622*4882a593Smuzhiyun ret = overwrite_item(wc->trans, root, path,
2623*4882a593Smuzhiyun eb, i, &key);
2624*4882a593Smuzhiyun if (ret)
2625*4882a593Smuzhiyun break;
2626*4882a593Smuzhiyun
2627*4882a593Smuzhiyun /*
2628*4882a593Smuzhiyun * Before replaying extents, truncate the inode to its
2629*4882a593Smuzhiyun * size. We need to do it now and not after log replay
2630*4882a593Smuzhiyun * because before an fsync we can have prealloc extents
2631*4882a593Smuzhiyun * added beyond the inode's i_size. If we did it after,
2632*4882a593Smuzhiyun * through orphan cleanup for example, we would drop
2633*4882a593Smuzhiyun * those prealloc extents just after replaying them.
2634*4882a593Smuzhiyun */
2635*4882a593Smuzhiyun if (S_ISREG(mode)) {
2636*4882a593Smuzhiyun struct inode *inode;
2637*4882a593Smuzhiyun u64 from;
2638*4882a593Smuzhiyun
2639*4882a593Smuzhiyun inode = read_one_inode(root, key.objectid);
2640*4882a593Smuzhiyun if (!inode) {
2641*4882a593Smuzhiyun ret = -EIO;
2642*4882a593Smuzhiyun break;
2643*4882a593Smuzhiyun }
2644*4882a593Smuzhiyun from = ALIGN(i_size_read(inode),
2645*4882a593Smuzhiyun root->fs_info->sectorsize);
2646*4882a593Smuzhiyun ret = btrfs_drop_extents(wc->trans, root, inode,
2647*4882a593Smuzhiyun from, (u64)-1, 1);
2648*4882a593Smuzhiyun if (!ret) {
2649*4882a593Smuzhiyun /* Update the inode's nbytes. */
2650*4882a593Smuzhiyun ret = btrfs_update_inode(wc->trans,
2651*4882a593Smuzhiyun root, inode);
2652*4882a593Smuzhiyun }
2653*4882a593Smuzhiyun iput(inode);
2654*4882a593Smuzhiyun if (ret)
2655*4882a593Smuzhiyun break;
2656*4882a593Smuzhiyun }
2657*4882a593Smuzhiyun
2658*4882a593Smuzhiyun ret = link_to_fixup_dir(wc->trans, root,
2659*4882a593Smuzhiyun path, key.objectid);
2660*4882a593Smuzhiyun if (ret)
2661*4882a593Smuzhiyun break;
2662*4882a593Smuzhiyun }
2663*4882a593Smuzhiyun
2664*4882a593Smuzhiyun if (wc->ignore_cur_inode)
2665*4882a593Smuzhiyun continue;
2666*4882a593Smuzhiyun
2667*4882a593Smuzhiyun if (key.type == BTRFS_DIR_INDEX_KEY &&
2668*4882a593Smuzhiyun wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2669*4882a593Smuzhiyun ret = replay_one_dir_item(wc->trans, root, path,
2670*4882a593Smuzhiyun eb, i, &key);
2671*4882a593Smuzhiyun if (ret)
2672*4882a593Smuzhiyun break;
2673*4882a593Smuzhiyun }
2674*4882a593Smuzhiyun
2675*4882a593Smuzhiyun if (wc->stage < LOG_WALK_REPLAY_ALL)
2676*4882a593Smuzhiyun continue;
2677*4882a593Smuzhiyun
2678*4882a593Smuzhiyun /* these keys are simply copied */
2679*4882a593Smuzhiyun if (key.type == BTRFS_XATTR_ITEM_KEY) {
2680*4882a593Smuzhiyun ret = overwrite_item(wc->trans, root, path,
2681*4882a593Smuzhiyun eb, i, &key);
2682*4882a593Smuzhiyun if (ret)
2683*4882a593Smuzhiyun break;
2684*4882a593Smuzhiyun } else if (key.type == BTRFS_INODE_REF_KEY ||
2685*4882a593Smuzhiyun key.type == BTRFS_INODE_EXTREF_KEY) {
2686*4882a593Smuzhiyun ret = add_inode_ref(wc->trans, root, log, path,
2687*4882a593Smuzhiyun eb, i, &key);
2688*4882a593Smuzhiyun if (ret && ret != -ENOENT)
2689*4882a593Smuzhiyun break;
2690*4882a593Smuzhiyun ret = 0;
2691*4882a593Smuzhiyun } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2692*4882a593Smuzhiyun ret = replay_one_extent(wc->trans, root, path,
2693*4882a593Smuzhiyun eb, i, &key);
2694*4882a593Smuzhiyun if (ret)
2695*4882a593Smuzhiyun break;
2696*4882a593Smuzhiyun } else if (key.type == BTRFS_DIR_ITEM_KEY) {
2697*4882a593Smuzhiyun ret = replay_one_dir_item(wc->trans, root, path,
2698*4882a593Smuzhiyun eb, i, &key);
2699*4882a593Smuzhiyun if (ret)
2700*4882a593Smuzhiyun break;
2701*4882a593Smuzhiyun }
2702*4882a593Smuzhiyun }
2703*4882a593Smuzhiyun btrfs_free_path(path);
2704*4882a593Smuzhiyun return ret;
2705*4882a593Smuzhiyun }
2706*4882a593Smuzhiyun
2707*4882a593Smuzhiyun /*
2708*4882a593Smuzhiyun * Correctly adjust the reserved bytes occupied by a log tree extent buffer
2709*4882a593Smuzhiyun */
unaccount_log_buffer(struct btrfs_fs_info * fs_info,u64 start)2710*4882a593Smuzhiyun static void unaccount_log_buffer(struct btrfs_fs_info *fs_info, u64 start)
2711*4882a593Smuzhiyun {
2712*4882a593Smuzhiyun struct btrfs_block_group *cache;
2713*4882a593Smuzhiyun
2714*4882a593Smuzhiyun cache = btrfs_lookup_block_group(fs_info, start);
2715*4882a593Smuzhiyun if (!cache) {
2716*4882a593Smuzhiyun btrfs_err(fs_info, "unable to find block group for %llu", start);
2717*4882a593Smuzhiyun return;
2718*4882a593Smuzhiyun }
2719*4882a593Smuzhiyun
2720*4882a593Smuzhiyun spin_lock(&cache->space_info->lock);
2721*4882a593Smuzhiyun spin_lock(&cache->lock);
2722*4882a593Smuzhiyun cache->reserved -= fs_info->nodesize;
2723*4882a593Smuzhiyun cache->space_info->bytes_reserved -= fs_info->nodesize;
2724*4882a593Smuzhiyun spin_unlock(&cache->lock);
2725*4882a593Smuzhiyun spin_unlock(&cache->space_info->lock);
2726*4882a593Smuzhiyun
2727*4882a593Smuzhiyun btrfs_put_block_group(cache);
2728*4882a593Smuzhiyun }
2729*4882a593Smuzhiyun
walk_down_log_tree(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,int * level,struct walk_control * wc)2730*4882a593Smuzhiyun static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2731*4882a593Smuzhiyun struct btrfs_root *root,
2732*4882a593Smuzhiyun struct btrfs_path *path, int *level,
2733*4882a593Smuzhiyun struct walk_control *wc)
2734*4882a593Smuzhiyun {
2735*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = root->fs_info;
2736*4882a593Smuzhiyun u64 bytenr;
2737*4882a593Smuzhiyun u64 ptr_gen;
2738*4882a593Smuzhiyun struct extent_buffer *next;
2739*4882a593Smuzhiyun struct extent_buffer *cur;
2740*4882a593Smuzhiyun u32 blocksize;
2741*4882a593Smuzhiyun int ret = 0;
2742*4882a593Smuzhiyun
2743*4882a593Smuzhiyun while (*level > 0) {
2744*4882a593Smuzhiyun struct btrfs_key first_key;
2745*4882a593Smuzhiyun
2746*4882a593Smuzhiyun cur = path->nodes[*level];
2747*4882a593Smuzhiyun
2748*4882a593Smuzhiyun WARN_ON(btrfs_header_level(cur) != *level);
2749*4882a593Smuzhiyun
2750*4882a593Smuzhiyun if (path->slots[*level] >=
2751*4882a593Smuzhiyun btrfs_header_nritems(cur))
2752*4882a593Smuzhiyun break;
2753*4882a593Smuzhiyun
2754*4882a593Smuzhiyun bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2755*4882a593Smuzhiyun ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
2756*4882a593Smuzhiyun btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
2757*4882a593Smuzhiyun blocksize = fs_info->nodesize;
2758*4882a593Smuzhiyun
2759*4882a593Smuzhiyun next = btrfs_find_create_tree_block(fs_info, bytenr);
2760*4882a593Smuzhiyun if (IS_ERR(next))
2761*4882a593Smuzhiyun return PTR_ERR(next);
2762*4882a593Smuzhiyun
2763*4882a593Smuzhiyun if (*level == 1) {
2764*4882a593Smuzhiyun ret = wc->process_func(root, next, wc, ptr_gen,
2765*4882a593Smuzhiyun *level - 1);
2766*4882a593Smuzhiyun if (ret) {
2767*4882a593Smuzhiyun free_extent_buffer(next);
2768*4882a593Smuzhiyun return ret;
2769*4882a593Smuzhiyun }
2770*4882a593Smuzhiyun
2771*4882a593Smuzhiyun path->slots[*level]++;
2772*4882a593Smuzhiyun if (wc->free) {
2773*4882a593Smuzhiyun ret = btrfs_read_buffer(next, ptr_gen,
2774*4882a593Smuzhiyun *level - 1, &first_key);
2775*4882a593Smuzhiyun if (ret) {
2776*4882a593Smuzhiyun free_extent_buffer(next);
2777*4882a593Smuzhiyun return ret;
2778*4882a593Smuzhiyun }
2779*4882a593Smuzhiyun
2780*4882a593Smuzhiyun if (trans) {
2781*4882a593Smuzhiyun btrfs_tree_lock(next);
2782*4882a593Smuzhiyun btrfs_set_lock_blocking_write(next);
2783*4882a593Smuzhiyun btrfs_clean_tree_block(next);
2784*4882a593Smuzhiyun btrfs_wait_tree_block_writeback(next);
2785*4882a593Smuzhiyun btrfs_tree_unlock(next);
2786*4882a593Smuzhiyun ret = btrfs_pin_reserved_extent(trans,
2787*4882a593Smuzhiyun bytenr, blocksize);
2788*4882a593Smuzhiyun if (ret) {
2789*4882a593Smuzhiyun free_extent_buffer(next);
2790*4882a593Smuzhiyun return ret;
2791*4882a593Smuzhiyun }
2792*4882a593Smuzhiyun } else {
2793*4882a593Smuzhiyun if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2794*4882a593Smuzhiyun clear_extent_buffer_dirty(next);
2795*4882a593Smuzhiyun unaccount_log_buffer(fs_info, bytenr);
2796*4882a593Smuzhiyun }
2797*4882a593Smuzhiyun }
2798*4882a593Smuzhiyun free_extent_buffer(next);
2799*4882a593Smuzhiyun continue;
2800*4882a593Smuzhiyun }
2801*4882a593Smuzhiyun ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
2802*4882a593Smuzhiyun if (ret) {
2803*4882a593Smuzhiyun free_extent_buffer(next);
2804*4882a593Smuzhiyun return ret;
2805*4882a593Smuzhiyun }
2806*4882a593Smuzhiyun
2807*4882a593Smuzhiyun if (path->nodes[*level-1])
2808*4882a593Smuzhiyun free_extent_buffer(path->nodes[*level-1]);
2809*4882a593Smuzhiyun path->nodes[*level-1] = next;
2810*4882a593Smuzhiyun *level = btrfs_header_level(next);
2811*4882a593Smuzhiyun path->slots[*level] = 0;
2812*4882a593Smuzhiyun cond_resched();
2813*4882a593Smuzhiyun }
2814*4882a593Smuzhiyun path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
2815*4882a593Smuzhiyun
2816*4882a593Smuzhiyun cond_resched();
2817*4882a593Smuzhiyun return 0;
2818*4882a593Smuzhiyun }
2819*4882a593Smuzhiyun
walk_up_log_tree(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,int * level,struct walk_control * wc)2820*4882a593Smuzhiyun static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
2821*4882a593Smuzhiyun struct btrfs_root *root,
2822*4882a593Smuzhiyun struct btrfs_path *path, int *level,
2823*4882a593Smuzhiyun struct walk_control *wc)
2824*4882a593Smuzhiyun {
2825*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = root->fs_info;
2826*4882a593Smuzhiyun int i;
2827*4882a593Smuzhiyun int slot;
2828*4882a593Smuzhiyun int ret;
2829*4882a593Smuzhiyun
2830*4882a593Smuzhiyun for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
2831*4882a593Smuzhiyun slot = path->slots[i];
2832*4882a593Smuzhiyun if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
2833*4882a593Smuzhiyun path->slots[i]++;
2834*4882a593Smuzhiyun *level = i;
2835*4882a593Smuzhiyun WARN_ON(*level == 0);
2836*4882a593Smuzhiyun return 0;
2837*4882a593Smuzhiyun } else {
2838*4882a593Smuzhiyun ret = wc->process_func(root, path->nodes[*level], wc,
2839*4882a593Smuzhiyun btrfs_header_generation(path->nodes[*level]),
2840*4882a593Smuzhiyun *level);
2841*4882a593Smuzhiyun if (ret)
2842*4882a593Smuzhiyun return ret;
2843*4882a593Smuzhiyun
2844*4882a593Smuzhiyun if (wc->free) {
2845*4882a593Smuzhiyun struct extent_buffer *next;
2846*4882a593Smuzhiyun
2847*4882a593Smuzhiyun next = path->nodes[*level];
2848*4882a593Smuzhiyun
2849*4882a593Smuzhiyun if (trans) {
2850*4882a593Smuzhiyun btrfs_tree_lock(next);
2851*4882a593Smuzhiyun btrfs_set_lock_blocking_write(next);
2852*4882a593Smuzhiyun btrfs_clean_tree_block(next);
2853*4882a593Smuzhiyun btrfs_wait_tree_block_writeback(next);
2854*4882a593Smuzhiyun btrfs_tree_unlock(next);
2855*4882a593Smuzhiyun ret = btrfs_pin_reserved_extent(trans,
2856*4882a593Smuzhiyun path->nodes[*level]->start,
2857*4882a593Smuzhiyun path->nodes[*level]->len);
2858*4882a593Smuzhiyun if (ret)
2859*4882a593Smuzhiyun return ret;
2860*4882a593Smuzhiyun } else {
2861*4882a593Smuzhiyun if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2862*4882a593Smuzhiyun clear_extent_buffer_dirty(next);
2863*4882a593Smuzhiyun
2864*4882a593Smuzhiyun unaccount_log_buffer(fs_info,
2865*4882a593Smuzhiyun path->nodes[*level]->start);
2866*4882a593Smuzhiyun }
2867*4882a593Smuzhiyun }
2868*4882a593Smuzhiyun free_extent_buffer(path->nodes[*level]);
2869*4882a593Smuzhiyun path->nodes[*level] = NULL;
2870*4882a593Smuzhiyun *level = i + 1;
2871*4882a593Smuzhiyun }
2872*4882a593Smuzhiyun }
2873*4882a593Smuzhiyun return 1;
2874*4882a593Smuzhiyun }
2875*4882a593Smuzhiyun
2876*4882a593Smuzhiyun /*
2877*4882a593Smuzhiyun * drop the reference count on the tree rooted at 'snap'. This traverses
2878*4882a593Smuzhiyun * the tree freeing any blocks that have a ref count of zero after being
2879*4882a593Smuzhiyun * decremented.
2880*4882a593Smuzhiyun */
walk_log_tree(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct walk_control * wc)2881*4882a593Smuzhiyun static int walk_log_tree(struct btrfs_trans_handle *trans,
2882*4882a593Smuzhiyun struct btrfs_root *log, struct walk_control *wc)
2883*4882a593Smuzhiyun {
2884*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = log->fs_info;
2885*4882a593Smuzhiyun int ret = 0;
2886*4882a593Smuzhiyun int wret;
2887*4882a593Smuzhiyun int level;
2888*4882a593Smuzhiyun struct btrfs_path *path;
2889*4882a593Smuzhiyun int orig_level;
2890*4882a593Smuzhiyun
2891*4882a593Smuzhiyun path = btrfs_alloc_path();
2892*4882a593Smuzhiyun if (!path)
2893*4882a593Smuzhiyun return -ENOMEM;
2894*4882a593Smuzhiyun
2895*4882a593Smuzhiyun level = btrfs_header_level(log->node);
2896*4882a593Smuzhiyun orig_level = level;
2897*4882a593Smuzhiyun path->nodes[level] = log->node;
2898*4882a593Smuzhiyun atomic_inc(&log->node->refs);
2899*4882a593Smuzhiyun path->slots[level] = 0;
2900*4882a593Smuzhiyun
2901*4882a593Smuzhiyun while (1) {
2902*4882a593Smuzhiyun wret = walk_down_log_tree(trans, log, path, &level, wc);
2903*4882a593Smuzhiyun if (wret > 0)
2904*4882a593Smuzhiyun break;
2905*4882a593Smuzhiyun if (wret < 0) {
2906*4882a593Smuzhiyun ret = wret;
2907*4882a593Smuzhiyun goto out;
2908*4882a593Smuzhiyun }
2909*4882a593Smuzhiyun
2910*4882a593Smuzhiyun wret = walk_up_log_tree(trans, log, path, &level, wc);
2911*4882a593Smuzhiyun if (wret > 0)
2912*4882a593Smuzhiyun break;
2913*4882a593Smuzhiyun if (wret < 0) {
2914*4882a593Smuzhiyun ret = wret;
2915*4882a593Smuzhiyun goto out;
2916*4882a593Smuzhiyun }
2917*4882a593Smuzhiyun }
2918*4882a593Smuzhiyun
2919*4882a593Smuzhiyun /* was the root node processed? if not, catch it here */
2920*4882a593Smuzhiyun if (path->nodes[orig_level]) {
2921*4882a593Smuzhiyun ret = wc->process_func(log, path->nodes[orig_level], wc,
2922*4882a593Smuzhiyun btrfs_header_generation(path->nodes[orig_level]),
2923*4882a593Smuzhiyun orig_level);
2924*4882a593Smuzhiyun if (ret)
2925*4882a593Smuzhiyun goto out;
2926*4882a593Smuzhiyun if (wc->free) {
2927*4882a593Smuzhiyun struct extent_buffer *next;
2928*4882a593Smuzhiyun
2929*4882a593Smuzhiyun next = path->nodes[orig_level];
2930*4882a593Smuzhiyun
2931*4882a593Smuzhiyun if (trans) {
2932*4882a593Smuzhiyun btrfs_tree_lock(next);
2933*4882a593Smuzhiyun btrfs_set_lock_blocking_write(next);
2934*4882a593Smuzhiyun btrfs_clean_tree_block(next);
2935*4882a593Smuzhiyun btrfs_wait_tree_block_writeback(next);
2936*4882a593Smuzhiyun btrfs_tree_unlock(next);
2937*4882a593Smuzhiyun ret = btrfs_pin_reserved_extent(trans,
2938*4882a593Smuzhiyun next->start, next->len);
2939*4882a593Smuzhiyun if (ret)
2940*4882a593Smuzhiyun goto out;
2941*4882a593Smuzhiyun } else {
2942*4882a593Smuzhiyun if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2943*4882a593Smuzhiyun clear_extent_buffer_dirty(next);
2944*4882a593Smuzhiyun unaccount_log_buffer(fs_info, next->start);
2945*4882a593Smuzhiyun }
2946*4882a593Smuzhiyun }
2947*4882a593Smuzhiyun }
2948*4882a593Smuzhiyun
2949*4882a593Smuzhiyun out:
2950*4882a593Smuzhiyun btrfs_free_path(path);
2951*4882a593Smuzhiyun return ret;
2952*4882a593Smuzhiyun }
2953*4882a593Smuzhiyun
2954*4882a593Smuzhiyun /*
2955*4882a593Smuzhiyun * helper function to update the item for a given subvolumes log root
2956*4882a593Smuzhiyun * in the tree of log roots
2957*4882a593Smuzhiyun */
update_log_root(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_root_item * root_item)2958*4882a593Smuzhiyun static int update_log_root(struct btrfs_trans_handle *trans,
2959*4882a593Smuzhiyun struct btrfs_root *log,
2960*4882a593Smuzhiyun struct btrfs_root_item *root_item)
2961*4882a593Smuzhiyun {
2962*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = log->fs_info;
2963*4882a593Smuzhiyun int ret;
2964*4882a593Smuzhiyun
2965*4882a593Smuzhiyun if (log->log_transid == 1) {
2966*4882a593Smuzhiyun /* insert root item on the first sync */
2967*4882a593Smuzhiyun ret = btrfs_insert_root(trans, fs_info->log_root_tree,
2968*4882a593Smuzhiyun &log->root_key, root_item);
2969*4882a593Smuzhiyun } else {
2970*4882a593Smuzhiyun ret = btrfs_update_root(trans, fs_info->log_root_tree,
2971*4882a593Smuzhiyun &log->root_key, root_item);
2972*4882a593Smuzhiyun }
2973*4882a593Smuzhiyun return ret;
2974*4882a593Smuzhiyun }
2975*4882a593Smuzhiyun
wait_log_commit(struct btrfs_root * root,int transid)2976*4882a593Smuzhiyun static void wait_log_commit(struct btrfs_root *root, int transid)
2977*4882a593Smuzhiyun {
2978*4882a593Smuzhiyun DEFINE_WAIT(wait);
2979*4882a593Smuzhiyun int index = transid % 2;
2980*4882a593Smuzhiyun
2981*4882a593Smuzhiyun /*
2982*4882a593Smuzhiyun * we only allow two pending log transactions at a time,
2983*4882a593Smuzhiyun * so we know that if ours is more than 2 older than the
2984*4882a593Smuzhiyun * current transaction, we're done
2985*4882a593Smuzhiyun */
2986*4882a593Smuzhiyun for (;;) {
2987*4882a593Smuzhiyun prepare_to_wait(&root->log_commit_wait[index],
2988*4882a593Smuzhiyun &wait, TASK_UNINTERRUPTIBLE);
2989*4882a593Smuzhiyun
2990*4882a593Smuzhiyun if (!(root->log_transid_committed < transid &&
2991*4882a593Smuzhiyun atomic_read(&root->log_commit[index])))
2992*4882a593Smuzhiyun break;
2993*4882a593Smuzhiyun
2994*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
2995*4882a593Smuzhiyun schedule();
2996*4882a593Smuzhiyun mutex_lock(&root->log_mutex);
2997*4882a593Smuzhiyun }
2998*4882a593Smuzhiyun finish_wait(&root->log_commit_wait[index], &wait);
2999*4882a593Smuzhiyun }
3000*4882a593Smuzhiyun
wait_for_writer(struct btrfs_root * root)3001*4882a593Smuzhiyun static void wait_for_writer(struct btrfs_root *root)
3002*4882a593Smuzhiyun {
3003*4882a593Smuzhiyun DEFINE_WAIT(wait);
3004*4882a593Smuzhiyun
3005*4882a593Smuzhiyun for (;;) {
3006*4882a593Smuzhiyun prepare_to_wait(&root->log_writer_wait, &wait,
3007*4882a593Smuzhiyun TASK_UNINTERRUPTIBLE);
3008*4882a593Smuzhiyun if (!atomic_read(&root->log_writers))
3009*4882a593Smuzhiyun break;
3010*4882a593Smuzhiyun
3011*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
3012*4882a593Smuzhiyun schedule();
3013*4882a593Smuzhiyun mutex_lock(&root->log_mutex);
3014*4882a593Smuzhiyun }
3015*4882a593Smuzhiyun finish_wait(&root->log_writer_wait, &wait);
3016*4882a593Smuzhiyun }
3017*4882a593Smuzhiyun
btrfs_remove_log_ctx(struct btrfs_root * root,struct btrfs_log_ctx * ctx)3018*4882a593Smuzhiyun static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
3019*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
3020*4882a593Smuzhiyun {
3021*4882a593Smuzhiyun if (!ctx)
3022*4882a593Smuzhiyun return;
3023*4882a593Smuzhiyun
3024*4882a593Smuzhiyun mutex_lock(&root->log_mutex);
3025*4882a593Smuzhiyun list_del_init(&ctx->list);
3026*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
3027*4882a593Smuzhiyun }
3028*4882a593Smuzhiyun
3029*4882a593Smuzhiyun /*
3030*4882a593Smuzhiyun * Invoked in log mutex context, or be sure there is no other task which
3031*4882a593Smuzhiyun * can access the list.
3032*4882a593Smuzhiyun */
btrfs_remove_all_log_ctxs(struct btrfs_root * root,int index,int error)3033*4882a593Smuzhiyun static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3034*4882a593Smuzhiyun int index, int error)
3035*4882a593Smuzhiyun {
3036*4882a593Smuzhiyun struct btrfs_log_ctx *ctx;
3037*4882a593Smuzhiyun struct btrfs_log_ctx *safe;
3038*4882a593Smuzhiyun
3039*4882a593Smuzhiyun list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3040*4882a593Smuzhiyun list_del_init(&ctx->list);
3041*4882a593Smuzhiyun ctx->log_ret = error;
3042*4882a593Smuzhiyun }
3043*4882a593Smuzhiyun
3044*4882a593Smuzhiyun INIT_LIST_HEAD(&root->log_ctxs[index]);
3045*4882a593Smuzhiyun }
3046*4882a593Smuzhiyun
3047*4882a593Smuzhiyun /*
3048*4882a593Smuzhiyun * btrfs_sync_log does sends a given tree log down to the disk and
3049*4882a593Smuzhiyun * updates the super blocks to record it. When this call is done,
3050*4882a593Smuzhiyun * you know that any inodes previously logged are safely on disk only
3051*4882a593Smuzhiyun * if it returns 0.
3052*4882a593Smuzhiyun *
3053*4882a593Smuzhiyun * Any other return value means you need to call btrfs_commit_transaction.
3054*4882a593Smuzhiyun * Some of the edge cases for fsyncing directories that have had unlinks
3055*4882a593Smuzhiyun * or renames done in the past mean that sometimes the only safe
3056*4882a593Smuzhiyun * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
3057*4882a593Smuzhiyun * that has happened.
3058*4882a593Smuzhiyun */
btrfs_sync_log(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_log_ctx * ctx)3059*4882a593Smuzhiyun int btrfs_sync_log(struct btrfs_trans_handle *trans,
3060*4882a593Smuzhiyun struct btrfs_root *root, struct btrfs_log_ctx *ctx)
3061*4882a593Smuzhiyun {
3062*4882a593Smuzhiyun int index1;
3063*4882a593Smuzhiyun int index2;
3064*4882a593Smuzhiyun int mark;
3065*4882a593Smuzhiyun int ret;
3066*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = root->fs_info;
3067*4882a593Smuzhiyun struct btrfs_root *log = root->log_root;
3068*4882a593Smuzhiyun struct btrfs_root *log_root_tree = fs_info->log_root_tree;
3069*4882a593Smuzhiyun struct btrfs_root_item new_root_item;
3070*4882a593Smuzhiyun int log_transid = 0;
3071*4882a593Smuzhiyun struct btrfs_log_ctx root_log_ctx;
3072*4882a593Smuzhiyun struct blk_plug plug;
3073*4882a593Smuzhiyun
3074*4882a593Smuzhiyun mutex_lock(&root->log_mutex);
3075*4882a593Smuzhiyun log_transid = ctx->log_transid;
3076*4882a593Smuzhiyun if (root->log_transid_committed >= log_transid) {
3077*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
3078*4882a593Smuzhiyun return ctx->log_ret;
3079*4882a593Smuzhiyun }
3080*4882a593Smuzhiyun
3081*4882a593Smuzhiyun index1 = log_transid % 2;
3082*4882a593Smuzhiyun if (atomic_read(&root->log_commit[index1])) {
3083*4882a593Smuzhiyun wait_log_commit(root, log_transid);
3084*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
3085*4882a593Smuzhiyun return ctx->log_ret;
3086*4882a593Smuzhiyun }
3087*4882a593Smuzhiyun ASSERT(log_transid == root->log_transid);
3088*4882a593Smuzhiyun atomic_set(&root->log_commit[index1], 1);
3089*4882a593Smuzhiyun
3090*4882a593Smuzhiyun /* wait for previous tree log sync to complete */
3091*4882a593Smuzhiyun if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
3092*4882a593Smuzhiyun wait_log_commit(root, log_transid - 1);
3093*4882a593Smuzhiyun
3094*4882a593Smuzhiyun while (1) {
3095*4882a593Smuzhiyun int batch = atomic_read(&root->log_batch);
3096*4882a593Smuzhiyun /* when we're on an ssd, just kick the log commit out */
3097*4882a593Smuzhiyun if (!btrfs_test_opt(fs_info, SSD) &&
3098*4882a593Smuzhiyun test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
3099*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
3100*4882a593Smuzhiyun schedule_timeout_uninterruptible(1);
3101*4882a593Smuzhiyun mutex_lock(&root->log_mutex);
3102*4882a593Smuzhiyun }
3103*4882a593Smuzhiyun wait_for_writer(root);
3104*4882a593Smuzhiyun if (batch == atomic_read(&root->log_batch))
3105*4882a593Smuzhiyun break;
3106*4882a593Smuzhiyun }
3107*4882a593Smuzhiyun
3108*4882a593Smuzhiyun /* bail out if we need to do a full commit */
3109*4882a593Smuzhiyun if (btrfs_need_log_full_commit(trans)) {
3110*4882a593Smuzhiyun ret = -EAGAIN;
3111*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
3112*4882a593Smuzhiyun goto out;
3113*4882a593Smuzhiyun }
3114*4882a593Smuzhiyun
3115*4882a593Smuzhiyun if (log_transid % 2 == 0)
3116*4882a593Smuzhiyun mark = EXTENT_DIRTY;
3117*4882a593Smuzhiyun else
3118*4882a593Smuzhiyun mark = EXTENT_NEW;
3119*4882a593Smuzhiyun
3120*4882a593Smuzhiyun /* we start IO on all the marked extents here, but we don't actually
3121*4882a593Smuzhiyun * wait for them until later.
3122*4882a593Smuzhiyun */
3123*4882a593Smuzhiyun blk_start_plug(&plug);
3124*4882a593Smuzhiyun ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
3125*4882a593Smuzhiyun if (ret) {
3126*4882a593Smuzhiyun blk_finish_plug(&plug);
3127*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
3128*4882a593Smuzhiyun btrfs_set_log_full_commit(trans);
3129*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
3130*4882a593Smuzhiyun goto out;
3131*4882a593Smuzhiyun }
3132*4882a593Smuzhiyun
3133*4882a593Smuzhiyun /*
3134*4882a593Smuzhiyun * We _must_ update under the root->log_mutex in order to make sure we
3135*4882a593Smuzhiyun * have a consistent view of the log root we are trying to commit at
3136*4882a593Smuzhiyun * this moment.
3137*4882a593Smuzhiyun *
3138*4882a593Smuzhiyun * We _must_ copy this into a local copy, because we are not holding the
3139*4882a593Smuzhiyun * log_root_tree->log_mutex yet. This is important because when we
3140*4882a593Smuzhiyun * commit the log_root_tree we must have a consistent view of the
3141*4882a593Smuzhiyun * log_root_tree when we update the super block to point at the
3142*4882a593Smuzhiyun * log_root_tree bytenr. If we update the log_root_tree here we'll race
3143*4882a593Smuzhiyun * with the commit and possibly point at the new block which we may not
3144*4882a593Smuzhiyun * have written out.
3145*4882a593Smuzhiyun */
3146*4882a593Smuzhiyun btrfs_set_root_node(&log->root_item, log->node);
3147*4882a593Smuzhiyun memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
3148*4882a593Smuzhiyun
3149*4882a593Smuzhiyun root->log_transid++;
3150*4882a593Smuzhiyun log->log_transid = root->log_transid;
3151*4882a593Smuzhiyun root->log_start_pid = 0;
3152*4882a593Smuzhiyun /*
3153*4882a593Smuzhiyun * IO has been started, blocks of the log tree have WRITTEN flag set
3154*4882a593Smuzhiyun * in their headers. new modifications of the log will be written to
3155*4882a593Smuzhiyun * new positions. so it's safe to allow log writers to go in.
3156*4882a593Smuzhiyun */
3157*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
3158*4882a593Smuzhiyun
3159*4882a593Smuzhiyun btrfs_init_log_ctx(&root_log_ctx, NULL);
3160*4882a593Smuzhiyun
3161*4882a593Smuzhiyun mutex_lock(&log_root_tree->log_mutex);
3162*4882a593Smuzhiyun
3163*4882a593Smuzhiyun index2 = log_root_tree->log_transid % 2;
3164*4882a593Smuzhiyun list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3165*4882a593Smuzhiyun root_log_ctx.log_transid = log_root_tree->log_transid;
3166*4882a593Smuzhiyun
3167*4882a593Smuzhiyun /*
3168*4882a593Smuzhiyun * Now we are safe to update the log_root_tree because we're under the
3169*4882a593Smuzhiyun * log_mutex, and we're a current writer so we're holding the commit
3170*4882a593Smuzhiyun * open until we drop the log_mutex.
3171*4882a593Smuzhiyun */
3172*4882a593Smuzhiyun ret = update_log_root(trans, log, &new_root_item);
3173*4882a593Smuzhiyun if (ret) {
3174*4882a593Smuzhiyun if (!list_empty(&root_log_ctx.list))
3175*4882a593Smuzhiyun list_del_init(&root_log_ctx.list);
3176*4882a593Smuzhiyun
3177*4882a593Smuzhiyun blk_finish_plug(&plug);
3178*4882a593Smuzhiyun btrfs_set_log_full_commit(trans);
3179*4882a593Smuzhiyun
3180*4882a593Smuzhiyun if (ret != -ENOSPC) {
3181*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
3182*4882a593Smuzhiyun mutex_unlock(&log_root_tree->log_mutex);
3183*4882a593Smuzhiyun goto out;
3184*4882a593Smuzhiyun }
3185*4882a593Smuzhiyun btrfs_wait_tree_log_extents(log, mark);
3186*4882a593Smuzhiyun mutex_unlock(&log_root_tree->log_mutex);
3187*4882a593Smuzhiyun ret = -EAGAIN;
3188*4882a593Smuzhiyun goto out;
3189*4882a593Smuzhiyun }
3190*4882a593Smuzhiyun
3191*4882a593Smuzhiyun if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
3192*4882a593Smuzhiyun blk_finish_plug(&plug);
3193*4882a593Smuzhiyun list_del_init(&root_log_ctx.list);
3194*4882a593Smuzhiyun mutex_unlock(&log_root_tree->log_mutex);
3195*4882a593Smuzhiyun ret = root_log_ctx.log_ret;
3196*4882a593Smuzhiyun goto out;
3197*4882a593Smuzhiyun }
3198*4882a593Smuzhiyun
3199*4882a593Smuzhiyun index2 = root_log_ctx.log_transid % 2;
3200*4882a593Smuzhiyun if (atomic_read(&log_root_tree->log_commit[index2])) {
3201*4882a593Smuzhiyun blk_finish_plug(&plug);
3202*4882a593Smuzhiyun ret = btrfs_wait_tree_log_extents(log, mark);
3203*4882a593Smuzhiyun wait_log_commit(log_root_tree,
3204*4882a593Smuzhiyun root_log_ctx.log_transid);
3205*4882a593Smuzhiyun mutex_unlock(&log_root_tree->log_mutex);
3206*4882a593Smuzhiyun if (!ret)
3207*4882a593Smuzhiyun ret = root_log_ctx.log_ret;
3208*4882a593Smuzhiyun goto out;
3209*4882a593Smuzhiyun }
3210*4882a593Smuzhiyun ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
3211*4882a593Smuzhiyun atomic_set(&log_root_tree->log_commit[index2], 1);
3212*4882a593Smuzhiyun
3213*4882a593Smuzhiyun if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
3214*4882a593Smuzhiyun wait_log_commit(log_root_tree,
3215*4882a593Smuzhiyun root_log_ctx.log_transid - 1);
3216*4882a593Smuzhiyun }
3217*4882a593Smuzhiyun
3218*4882a593Smuzhiyun /*
3219*4882a593Smuzhiyun * now that we've moved on to the tree of log tree roots,
3220*4882a593Smuzhiyun * check the full commit flag again
3221*4882a593Smuzhiyun */
3222*4882a593Smuzhiyun if (btrfs_need_log_full_commit(trans)) {
3223*4882a593Smuzhiyun blk_finish_plug(&plug);
3224*4882a593Smuzhiyun btrfs_wait_tree_log_extents(log, mark);
3225*4882a593Smuzhiyun mutex_unlock(&log_root_tree->log_mutex);
3226*4882a593Smuzhiyun ret = -EAGAIN;
3227*4882a593Smuzhiyun goto out_wake_log_root;
3228*4882a593Smuzhiyun }
3229*4882a593Smuzhiyun
3230*4882a593Smuzhiyun ret = btrfs_write_marked_extents(fs_info,
3231*4882a593Smuzhiyun &log_root_tree->dirty_log_pages,
3232*4882a593Smuzhiyun EXTENT_DIRTY | EXTENT_NEW);
3233*4882a593Smuzhiyun blk_finish_plug(&plug);
3234*4882a593Smuzhiyun if (ret) {
3235*4882a593Smuzhiyun btrfs_set_log_full_commit(trans);
3236*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
3237*4882a593Smuzhiyun mutex_unlock(&log_root_tree->log_mutex);
3238*4882a593Smuzhiyun goto out_wake_log_root;
3239*4882a593Smuzhiyun }
3240*4882a593Smuzhiyun ret = btrfs_wait_tree_log_extents(log, mark);
3241*4882a593Smuzhiyun if (!ret)
3242*4882a593Smuzhiyun ret = btrfs_wait_tree_log_extents(log_root_tree,
3243*4882a593Smuzhiyun EXTENT_NEW | EXTENT_DIRTY);
3244*4882a593Smuzhiyun if (ret) {
3245*4882a593Smuzhiyun btrfs_set_log_full_commit(trans);
3246*4882a593Smuzhiyun mutex_unlock(&log_root_tree->log_mutex);
3247*4882a593Smuzhiyun goto out_wake_log_root;
3248*4882a593Smuzhiyun }
3249*4882a593Smuzhiyun
3250*4882a593Smuzhiyun btrfs_set_super_log_root(fs_info->super_for_commit,
3251*4882a593Smuzhiyun log_root_tree->node->start);
3252*4882a593Smuzhiyun btrfs_set_super_log_root_level(fs_info->super_for_commit,
3253*4882a593Smuzhiyun btrfs_header_level(log_root_tree->node));
3254*4882a593Smuzhiyun
3255*4882a593Smuzhiyun log_root_tree->log_transid++;
3256*4882a593Smuzhiyun mutex_unlock(&log_root_tree->log_mutex);
3257*4882a593Smuzhiyun
3258*4882a593Smuzhiyun /*
3259*4882a593Smuzhiyun * Nobody else is going to jump in and write the ctree
3260*4882a593Smuzhiyun * super here because the log_commit atomic below is protecting
3261*4882a593Smuzhiyun * us. We must be called with a transaction handle pinning
3262*4882a593Smuzhiyun * the running transaction open, so a full commit can't hop
3263*4882a593Smuzhiyun * in and cause problems either.
3264*4882a593Smuzhiyun */
3265*4882a593Smuzhiyun ret = write_all_supers(fs_info, 1);
3266*4882a593Smuzhiyun if (ret) {
3267*4882a593Smuzhiyun btrfs_set_log_full_commit(trans);
3268*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
3269*4882a593Smuzhiyun goto out_wake_log_root;
3270*4882a593Smuzhiyun }
3271*4882a593Smuzhiyun
3272*4882a593Smuzhiyun mutex_lock(&root->log_mutex);
3273*4882a593Smuzhiyun if (root->last_log_commit < log_transid)
3274*4882a593Smuzhiyun root->last_log_commit = log_transid;
3275*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
3276*4882a593Smuzhiyun
3277*4882a593Smuzhiyun out_wake_log_root:
3278*4882a593Smuzhiyun mutex_lock(&log_root_tree->log_mutex);
3279*4882a593Smuzhiyun btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3280*4882a593Smuzhiyun
3281*4882a593Smuzhiyun log_root_tree->log_transid_committed++;
3282*4882a593Smuzhiyun atomic_set(&log_root_tree->log_commit[index2], 0);
3283*4882a593Smuzhiyun mutex_unlock(&log_root_tree->log_mutex);
3284*4882a593Smuzhiyun
3285*4882a593Smuzhiyun /*
3286*4882a593Smuzhiyun * The barrier before waitqueue_active (in cond_wake_up) is needed so
3287*4882a593Smuzhiyun * all the updates above are seen by the woken threads. It might not be
3288*4882a593Smuzhiyun * necessary, but proving that seems to be hard.
3289*4882a593Smuzhiyun */
3290*4882a593Smuzhiyun cond_wake_up(&log_root_tree->log_commit_wait[index2]);
3291*4882a593Smuzhiyun out:
3292*4882a593Smuzhiyun mutex_lock(&root->log_mutex);
3293*4882a593Smuzhiyun btrfs_remove_all_log_ctxs(root, index1, ret);
3294*4882a593Smuzhiyun root->log_transid_committed++;
3295*4882a593Smuzhiyun atomic_set(&root->log_commit[index1], 0);
3296*4882a593Smuzhiyun mutex_unlock(&root->log_mutex);
3297*4882a593Smuzhiyun
3298*4882a593Smuzhiyun /*
3299*4882a593Smuzhiyun * The barrier before waitqueue_active (in cond_wake_up) is needed so
3300*4882a593Smuzhiyun * all the updates above are seen by the woken threads. It might not be
3301*4882a593Smuzhiyun * necessary, but proving that seems to be hard.
3302*4882a593Smuzhiyun */
3303*4882a593Smuzhiyun cond_wake_up(&root->log_commit_wait[index1]);
3304*4882a593Smuzhiyun return ret;
3305*4882a593Smuzhiyun }
3306*4882a593Smuzhiyun
free_log_tree(struct btrfs_trans_handle * trans,struct btrfs_root * log)3307*4882a593Smuzhiyun static void free_log_tree(struct btrfs_trans_handle *trans,
3308*4882a593Smuzhiyun struct btrfs_root *log)
3309*4882a593Smuzhiyun {
3310*4882a593Smuzhiyun int ret;
3311*4882a593Smuzhiyun struct walk_control wc = {
3312*4882a593Smuzhiyun .free = 1,
3313*4882a593Smuzhiyun .process_func = process_one_buffer
3314*4882a593Smuzhiyun };
3315*4882a593Smuzhiyun
3316*4882a593Smuzhiyun ret = walk_log_tree(trans, log, &wc);
3317*4882a593Smuzhiyun if (ret) {
3318*4882a593Smuzhiyun if (trans)
3319*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
3320*4882a593Smuzhiyun else
3321*4882a593Smuzhiyun btrfs_handle_fs_error(log->fs_info, ret, NULL);
3322*4882a593Smuzhiyun }
3323*4882a593Smuzhiyun
3324*4882a593Smuzhiyun clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3325*4882a593Smuzhiyun EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
3326*4882a593Smuzhiyun extent_io_tree_release(&log->log_csum_range);
3327*4882a593Smuzhiyun btrfs_put_root(log);
3328*4882a593Smuzhiyun }
3329*4882a593Smuzhiyun
3330*4882a593Smuzhiyun /*
3331*4882a593Smuzhiyun * free all the extents used by the tree log. This should be called
3332*4882a593Smuzhiyun * at commit time of the full transaction
3333*4882a593Smuzhiyun */
btrfs_free_log(struct btrfs_trans_handle * trans,struct btrfs_root * root)3334*4882a593Smuzhiyun int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3335*4882a593Smuzhiyun {
3336*4882a593Smuzhiyun if (root->log_root) {
3337*4882a593Smuzhiyun free_log_tree(trans, root->log_root);
3338*4882a593Smuzhiyun root->log_root = NULL;
3339*4882a593Smuzhiyun clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
3340*4882a593Smuzhiyun }
3341*4882a593Smuzhiyun return 0;
3342*4882a593Smuzhiyun }
3343*4882a593Smuzhiyun
btrfs_free_log_root_tree(struct btrfs_trans_handle * trans,struct btrfs_fs_info * fs_info)3344*4882a593Smuzhiyun int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3345*4882a593Smuzhiyun struct btrfs_fs_info *fs_info)
3346*4882a593Smuzhiyun {
3347*4882a593Smuzhiyun if (fs_info->log_root_tree) {
3348*4882a593Smuzhiyun free_log_tree(trans, fs_info->log_root_tree);
3349*4882a593Smuzhiyun fs_info->log_root_tree = NULL;
3350*4882a593Smuzhiyun }
3351*4882a593Smuzhiyun return 0;
3352*4882a593Smuzhiyun }
3353*4882a593Smuzhiyun
3354*4882a593Smuzhiyun /*
3355*4882a593Smuzhiyun * Check if an inode was logged in the current transaction. We can't always rely
3356*4882a593Smuzhiyun * on an inode's logged_trans value, because it's an in-memory only field and
3357*4882a593Smuzhiyun * therefore not persisted. This means that its value is lost if the inode gets
3358*4882a593Smuzhiyun * evicted and loaded again from disk (in which case it has a value of 0, and
3359*4882a593Smuzhiyun * certainly it is smaller then any possible transaction ID), when that happens
3360*4882a593Smuzhiyun * the full_sync flag is set in the inode's runtime flags, so on that case we
3361*4882a593Smuzhiyun * assume eviction happened and ignore the logged_trans value, assuming the
3362*4882a593Smuzhiyun * worst case, that the inode was logged before in the current transaction.
3363*4882a593Smuzhiyun */
inode_logged(struct btrfs_trans_handle * trans,struct btrfs_inode * inode)3364*4882a593Smuzhiyun static bool inode_logged(struct btrfs_trans_handle *trans,
3365*4882a593Smuzhiyun struct btrfs_inode *inode)
3366*4882a593Smuzhiyun {
3367*4882a593Smuzhiyun if (inode->logged_trans == trans->transid)
3368*4882a593Smuzhiyun return true;
3369*4882a593Smuzhiyun
3370*4882a593Smuzhiyun if (inode->last_trans == trans->transid &&
3371*4882a593Smuzhiyun test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
3372*4882a593Smuzhiyun !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
3373*4882a593Smuzhiyun return true;
3374*4882a593Smuzhiyun
3375*4882a593Smuzhiyun return false;
3376*4882a593Smuzhiyun }
3377*4882a593Smuzhiyun
3378*4882a593Smuzhiyun /*
3379*4882a593Smuzhiyun * If both a file and directory are logged, and unlinks or renames are
3380*4882a593Smuzhiyun * mixed in, we have a few interesting corners:
3381*4882a593Smuzhiyun *
3382*4882a593Smuzhiyun * create file X in dir Y
3383*4882a593Smuzhiyun * link file X to X.link in dir Y
3384*4882a593Smuzhiyun * fsync file X
3385*4882a593Smuzhiyun * unlink file X but leave X.link
3386*4882a593Smuzhiyun * fsync dir Y
3387*4882a593Smuzhiyun *
3388*4882a593Smuzhiyun * After a crash we would expect only X.link to exist. But file X
3389*4882a593Smuzhiyun * didn't get fsync'd again so the log has back refs for X and X.link.
3390*4882a593Smuzhiyun *
3391*4882a593Smuzhiyun * We solve this by removing directory entries and inode backrefs from the
3392*4882a593Smuzhiyun * log when a file that was logged in the current transaction is
3393*4882a593Smuzhiyun * unlinked. Any later fsync will include the updated log entries, and
3394*4882a593Smuzhiyun * we'll be able to reconstruct the proper directory items from backrefs.
3395*4882a593Smuzhiyun *
3396*4882a593Smuzhiyun * This optimizations allows us to avoid relogging the entire inode
3397*4882a593Smuzhiyun * or the entire directory.
3398*4882a593Smuzhiyun */
btrfs_del_dir_entries_in_log(struct btrfs_trans_handle * trans,struct btrfs_root * root,const char * name,int name_len,struct btrfs_inode * dir,u64 index)3399*4882a593Smuzhiyun int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3400*4882a593Smuzhiyun struct btrfs_root *root,
3401*4882a593Smuzhiyun const char *name, int name_len,
3402*4882a593Smuzhiyun struct btrfs_inode *dir, u64 index)
3403*4882a593Smuzhiyun {
3404*4882a593Smuzhiyun struct btrfs_root *log;
3405*4882a593Smuzhiyun struct btrfs_dir_item *di;
3406*4882a593Smuzhiyun struct btrfs_path *path;
3407*4882a593Smuzhiyun int ret;
3408*4882a593Smuzhiyun int err = 0;
3409*4882a593Smuzhiyun int bytes_del = 0;
3410*4882a593Smuzhiyun u64 dir_ino = btrfs_ino(dir);
3411*4882a593Smuzhiyun
3412*4882a593Smuzhiyun if (!inode_logged(trans, dir))
3413*4882a593Smuzhiyun return 0;
3414*4882a593Smuzhiyun
3415*4882a593Smuzhiyun ret = join_running_log_trans(root);
3416*4882a593Smuzhiyun if (ret)
3417*4882a593Smuzhiyun return 0;
3418*4882a593Smuzhiyun
3419*4882a593Smuzhiyun mutex_lock(&dir->log_mutex);
3420*4882a593Smuzhiyun
3421*4882a593Smuzhiyun log = root->log_root;
3422*4882a593Smuzhiyun path = btrfs_alloc_path();
3423*4882a593Smuzhiyun if (!path) {
3424*4882a593Smuzhiyun err = -ENOMEM;
3425*4882a593Smuzhiyun goto out_unlock;
3426*4882a593Smuzhiyun }
3427*4882a593Smuzhiyun
3428*4882a593Smuzhiyun di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
3429*4882a593Smuzhiyun name, name_len, -1);
3430*4882a593Smuzhiyun if (IS_ERR(di)) {
3431*4882a593Smuzhiyun err = PTR_ERR(di);
3432*4882a593Smuzhiyun goto fail;
3433*4882a593Smuzhiyun }
3434*4882a593Smuzhiyun if (di) {
3435*4882a593Smuzhiyun ret = btrfs_delete_one_dir_name(trans, log, path, di);
3436*4882a593Smuzhiyun bytes_del += name_len;
3437*4882a593Smuzhiyun if (ret) {
3438*4882a593Smuzhiyun err = ret;
3439*4882a593Smuzhiyun goto fail;
3440*4882a593Smuzhiyun }
3441*4882a593Smuzhiyun }
3442*4882a593Smuzhiyun btrfs_release_path(path);
3443*4882a593Smuzhiyun di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
3444*4882a593Smuzhiyun index, name, name_len, -1);
3445*4882a593Smuzhiyun if (IS_ERR(di)) {
3446*4882a593Smuzhiyun err = PTR_ERR(di);
3447*4882a593Smuzhiyun goto fail;
3448*4882a593Smuzhiyun }
3449*4882a593Smuzhiyun if (di) {
3450*4882a593Smuzhiyun ret = btrfs_delete_one_dir_name(trans, log, path, di);
3451*4882a593Smuzhiyun bytes_del += name_len;
3452*4882a593Smuzhiyun if (ret) {
3453*4882a593Smuzhiyun err = ret;
3454*4882a593Smuzhiyun goto fail;
3455*4882a593Smuzhiyun }
3456*4882a593Smuzhiyun }
3457*4882a593Smuzhiyun
3458*4882a593Smuzhiyun /* update the directory size in the log to reflect the names
3459*4882a593Smuzhiyun * we have removed
3460*4882a593Smuzhiyun */
3461*4882a593Smuzhiyun if (bytes_del) {
3462*4882a593Smuzhiyun struct btrfs_key key;
3463*4882a593Smuzhiyun
3464*4882a593Smuzhiyun key.objectid = dir_ino;
3465*4882a593Smuzhiyun key.offset = 0;
3466*4882a593Smuzhiyun key.type = BTRFS_INODE_ITEM_KEY;
3467*4882a593Smuzhiyun btrfs_release_path(path);
3468*4882a593Smuzhiyun
3469*4882a593Smuzhiyun ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
3470*4882a593Smuzhiyun if (ret < 0) {
3471*4882a593Smuzhiyun err = ret;
3472*4882a593Smuzhiyun goto fail;
3473*4882a593Smuzhiyun }
3474*4882a593Smuzhiyun if (ret == 0) {
3475*4882a593Smuzhiyun struct btrfs_inode_item *item;
3476*4882a593Smuzhiyun u64 i_size;
3477*4882a593Smuzhiyun
3478*4882a593Smuzhiyun item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3479*4882a593Smuzhiyun struct btrfs_inode_item);
3480*4882a593Smuzhiyun i_size = btrfs_inode_size(path->nodes[0], item);
3481*4882a593Smuzhiyun if (i_size > bytes_del)
3482*4882a593Smuzhiyun i_size -= bytes_del;
3483*4882a593Smuzhiyun else
3484*4882a593Smuzhiyun i_size = 0;
3485*4882a593Smuzhiyun btrfs_set_inode_size(path->nodes[0], item, i_size);
3486*4882a593Smuzhiyun btrfs_mark_buffer_dirty(path->nodes[0]);
3487*4882a593Smuzhiyun } else
3488*4882a593Smuzhiyun ret = 0;
3489*4882a593Smuzhiyun btrfs_release_path(path);
3490*4882a593Smuzhiyun }
3491*4882a593Smuzhiyun fail:
3492*4882a593Smuzhiyun btrfs_free_path(path);
3493*4882a593Smuzhiyun out_unlock:
3494*4882a593Smuzhiyun mutex_unlock(&dir->log_mutex);
3495*4882a593Smuzhiyun if (err == -ENOSPC) {
3496*4882a593Smuzhiyun btrfs_set_log_full_commit(trans);
3497*4882a593Smuzhiyun err = 0;
3498*4882a593Smuzhiyun } else if (err < 0 && err != -ENOENT) {
3499*4882a593Smuzhiyun /* ENOENT can be returned if the entry hasn't been fsynced yet */
3500*4882a593Smuzhiyun btrfs_abort_transaction(trans, err);
3501*4882a593Smuzhiyun }
3502*4882a593Smuzhiyun
3503*4882a593Smuzhiyun btrfs_end_log_trans(root);
3504*4882a593Smuzhiyun
3505*4882a593Smuzhiyun return err;
3506*4882a593Smuzhiyun }
3507*4882a593Smuzhiyun
3508*4882a593Smuzhiyun /* see comments for btrfs_del_dir_entries_in_log */
btrfs_del_inode_ref_in_log(struct btrfs_trans_handle * trans,struct btrfs_root * root,const char * name,int name_len,struct btrfs_inode * inode,u64 dirid)3509*4882a593Smuzhiyun int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3510*4882a593Smuzhiyun struct btrfs_root *root,
3511*4882a593Smuzhiyun const char *name, int name_len,
3512*4882a593Smuzhiyun struct btrfs_inode *inode, u64 dirid)
3513*4882a593Smuzhiyun {
3514*4882a593Smuzhiyun struct btrfs_root *log;
3515*4882a593Smuzhiyun u64 index;
3516*4882a593Smuzhiyun int ret;
3517*4882a593Smuzhiyun
3518*4882a593Smuzhiyun if (!inode_logged(trans, inode))
3519*4882a593Smuzhiyun return 0;
3520*4882a593Smuzhiyun
3521*4882a593Smuzhiyun ret = join_running_log_trans(root);
3522*4882a593Smuzhiyun if (ret)
3523*4882a593Smuzhiyun return 0;
3524*4882a593Smuzhiyun log = root->log_root;
3525*4882a593Smuzhiyun mutex_lock(&inode->log_mutex);
3526*4882a593Smuzhiyun
3527*4882a593Smuzhiyun ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
3528*4882a593Smuzhiyun dirid, &index);
3529*4882a593Smuzhiyun mutex_unlock(&inode->log_mutex);
3530*4882a593Smuzhiyun if (ret == -ENOSPC) {
3531*4882a593Smuzhiyun btrfs_set_log_full_commit(trans);
3532*4882a593Smuzhiyun ret = 0;
3533*4882a593Smuzhiyun } else if (ret < 0 && ret != -ENOENT)
3534*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
3535*4882a593Smuzhiyun btrfs_end_log_trans(root);
3536*4882a593Smuzhiyun
3537*4882a593Smuzhiyun return ret;
3538*4882a593Smuzhiyun }
3539*4882a593Smuzhiyun
3540*4882a593Smuzhiyun /*
3541*4882a593Smuzhiyun * creates a range item in the log for 'dirid'. first_offset and
3542*4882a593Smuzhiyun * last_offset tell us which parts of the key space the log should
3543*4882a593Smuzhiyun * be considered authoritative for.
3544*4882a593Smuzhiyun */
insert_dir_log_key(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,int key_type,u64 dirid,u64 first_offset,u64 last_offset)3545*4882a593Smuzhiyun static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3546*4882a593Smuzhiyun struct btrfs_root *log,
3547*4882a593Smuzhiyun struct btrfs_path *path,
3548*4882a593Smuzhiyun int key_type, u64 dirid,
3549*4882a593Smuzhiyun u64 first_offset, u64 last_offset)
3550*4882a593Smuzhiyun {
3551*4882a593Smuzhiyun int ret;
3552*4882a593Smuzhiyun struct btrfs_key key;
3553*4882a593Smuzhiyun struct btrfs_dir_log_item *item;
3554*4882a593Smuzhiyun
3555*4882a593Smuzhiyun key.objectid = dirid;
3556*4882a593Smuzhiyun key.offset = first_offset;
3557*4882a593Smuzhiyun if (key_type == BTRFS_DIR_ITEM_KEY)
3558*4882a593Smuzhiyun key.type = BTRFS_DIR_LOG_ITEM_KEY;
3559*4882a593Smuzhiyun else
3560*4882a593Smuzhiyun key.type = BTRFS_DIR_LOG_INDEX_KEY;
3561*4882a593Smuzhiyun ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
3562*4882a593Smuzhiyun if (ret)
3563*4882a593Smuzhiyun return ret;
3564*4882a593Smuzhiyun
3565*4882a593Smuzhiyun item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3566*4882a593Smuzhiyun struct btrfs_dir_log_item);
3567*4882a593Smuzhiyun btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3568*4882a593Smuzhiyun btrfs_mark_buffer_dirty(path->nodes[0]);
3569*4882a593Smuzhiyun btrfs_release_path(path);
3570*4882a593Smuzhiyun return 0;
3571*4882a593Smuzhiyun }
3572*4882a593Smuzhiyun
3573*4882a593Smuzhiyun /*
3574*4882a593Smuzhiyun * log all the items included in the current transaction for a given
3575*4882a593Smuzhiyun * directory. This also creates the range items in the log tree required
3576*4882a593Smuzhiyun * to replay anything deleted before the fsync
3577*4882a593Smuzhiyun */
log_dir_items(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_path * dst_path,int key_type,struct btrfs_log_ctx * ctx,u64 min_offset,u64 * last_offset_ret)3578*4882a593Smuzhiyun static noinline int log_dir_items(struct btrfs_trans_handle *trans,
3579*4882a593Smuzhiyun struct btrfs_root *root, struct btrfs_inode *inode,
3580*4882a593Smuzhiyun struct btrfs_path *path,
3581*4882a593Smuzhiyun struct btrfs_path *dst_path, int key_type,
3582*4882a593Smuzhiyun struct btrfs_log_ctx *ctx,
3583*4882a593Smuzhiyun u64 min_offset, u64 *last_offset_ret)
3584*4882a593Smuzhiyun {
3585*4882a593Smuzhiyun struct btrfs_key min_key;
3586*4882a593Smuzhiyun struct btrfs_root *log = root->log_root;
3587*4882a593Smuzhiyun struct extent_buffer *src;
3588*4882a593Smuzhiyun int err = 0;
3589*4882a593Smuzhiyun int ret;
3590*4882a593Smuzhiyun int i;
3591*4882a593Smuzhiyun int nritems;
3592*4882a593Smuzhiyun u64 first_offset = min_offset;
3593*4882a593Smuzhiyun u64 last_offset = (u64)-1;
3594*4882a593Smuzhiyun u64 ino = btrfs_ino(inode);
3595*4882a593Smuzhiyun
3596*4882a593Smuzhiyun log = root->log_root;
3597*4882a593Smuzhiyun
3598*4882a593Smuzhiyun min_key.objectid = ino;
3599*4882a593Smuzhiyun min_key.type = key_type;
3600*4882a593Smuzhiyun min_key.offset = min_offset;
3601*4882a593Smuzhiyun
3602*4882a593Smuzhiyun ret = btrfs_search_forward(root, &min_key, path, trans->transid);
3603*4882a593Smuzhiyun
3604*4882a593Smuzhiyun /*
3605*4882a593Smuzhiyun * we didn't find anything from this transaction, see if there
3606*4882a593Smuzhiyun * is anything at all
3607*4882a593Smuzhiyun */
3608*4882a593Smuzhiyun if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3609*4882a593Smuzhiyun min_key.objectid = ino;
3610*4882a593Smuzhiyun min_key.type = key_type;
3611*4882a593Smuzhiyun min_key.offset = (u64)-1;
3612*4882a593Smuzhiyun btrfs_release_path(path);
3613*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3614*4882a593Smuzhiyun if (ret < 0) {
3615*4882a593Smuzhiyun btrfs_release_path(path);
3616*4882a593Smuzhiyun return ret;
3617*4882a593Smuzhiyun }
3618*4882a593Smuzhiyun ret = btrfs_previous_item(root, path, ino, key_type);
3619*4882a593Smuzhiyun
3620*4882a593Smuzhiyun /* if ret == 0 there are items for this type,
3621*4882a593Smuzhiyun * create a range to tell us the last key of this type.
3622*4882a593Smuzhiyun * otherwise, there are no items in this directory after
3623*4882a593Smuzhiyun * *min_offset, and we create a range to indicate that.
3624*4882a593Smuzhiyun */
3625*4882a593Smuzhiyun if (ret == 0) {
3626*4882a593Smuzhiyun struct btrfs_key tmp;
3627*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3628*4882a593Smuzhiyun path->slots[0]);
3629*4882a593Smuzhiyun if (key_type == tmp.type)
3630*4882a593Smuzhiyun first_offset = max(min_offset, tmp.offset) + 1;
3631*4882a593Smuzhiyun }
3632*4882a593Smuzhiyun goto done;
3633*4882a593Smuzhiyun }
3634*4882a593Smuzhiyun
3635*4882a593Smuzhiyun /* go backward to find any previous key */
3636*4882a593Smuzhiyun ret = btrfs_previous_item(root, path, ino, key_type);
3637*4882a593Smuzhiyun if (ret == 0) {
3638*4882a593Smuzhiyun struct btrfs_key tmp;
3639*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3640*4882a593Smuzhiyun if (key_type == tmp.type) {
3641*4882a593Smuzhiyun first_offset = tmp.offset;
3642*4882a593Smuzhiyun ret = overwrite_item(trans, log, dst_path,
3643*4882a593Smuzhiyun path->nodes[0], path->slots[0],
3644*4882a593Smuzhiyun &tmp);
3645*4882a593Smuzhiyun if (ret) {
3646*4882a593Smuzhiyun err = ret;
3647*4882a593Smuzhiyun goto done;
3648*4882a593Smuzhiyun }
3649*4882a593Smuzhiyun }
3650*4882a593Smuzhiyun }
3651*4882a593Smuzhiyun btrfs_release_path(path);
3652*4882a593Smuzhiyun
3653*4882a593Smuzhiyun /*
3654*4882a593Smuzhiyun * Find the first key from this transaction again. See the note for
3655*4882a593Smuzhiyun * log_new_dir_dentries, if we're logging a directory recursively we
3656*4882a593Smuzhiyun * won't be holding its i_mutex, which means we can modify the directory
3657*4882a593Smuzhiyun * while we're logging it. If we remove an entry between our first
3658*4882a593Smuzhiyun * search and this search we'll not find the key again and can just
3659*4882a593Smuzhiyun * bail.
3660*4882a593Smuzhiyun */
3661*4882a593Smuzhiyun search:
3662*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3663*4882a593Smuzhiyun if (ret != 0)
3664*4882a593Smuzhiyun goto done;
3665*4882a593Smuzhiyun
3666*4882a593Smuzhiyun /*
3667*4882a593Smuzhiyun * we have a block from this transaction, log every item in it
3668*4882a593Smuzhiyun * from our directory
3669*4882a593Smuzhiyun */
3670*4882a593Smuzhiyun while (1) {
3671*4882a593Smuzhiyun struct btrfs_key tmp;
3672*4882a593Smuzhiyun src = path->nodes[0];
3673*4882a593Smuzhiyun nritems = btrfs_header_nritems(src);
3674*4882a593Smuzhiyun for (i = path->slots[0]; i < nritems; i++) {
3675*4882a593Smuzhiyun struct btrfs_dir_item *di;
3676*4882a593Smuzhiyun
3677*4882a593Smuzhiyun btrfs_item_key_to_cpu(src, &min_key, i);
3678*4882a593Smuzhiyun
3679*4882a593Smuzhiyun if (min_key.objectid != ino || min_key.type != key_type)
3680*4882a593Smuzhiyun goto done;
3681*4882a593Smuzhiyun
3682*4882a593Smuzhiyun if (need_resched()) {
3683*4882a593Smuzhiyun btrfs_release_path(path);
3684*4882a593Smuzhiyun cond_resched();
3685*4882a593Smuzhiyun goto search;
3686*4882a593Smuzhiyun }
3687*4882a593Smuzhiyun
3688*4882a593Smuzhiyun ret = overwrite_item(trans, log, dst_path, src, i,
3689*4882a593Smuzhiyun &min_key);
3690*4882a593Smuzhiyun if (ret) {
3691*4882a593Smuzhiyun err = ret;
3692*4882a593Smuzhiyun goto done;
3693*4882a593Smuzhiyun }
3694*4882a593Smuzhiyun
3695*4882a593Smuzhiyun /*
3696*4882a593Smuzhiyun * We must make sure that when we log a directory entry,
3697*4882a593Smuzhiyun * the corresponding inode, after log replay, has a
3698*4882a593Smuzhiyun * matching link count. For example:
3699*4882a593Smuzhiyun *
3700*4882a593Smuzhiyun * touch foo
3701*4882a593Smuzhiyun * mkdir mydir
3702*4882a593Smuzhiyun * sync
3703*4882a593Smuzhiyun * ln foo mydir/bar
3704*4882a593Smuzhiyun * xfs_io -c "fsync" mydir
3705*4882a593Smuzhiyun * <crash>
3706*4882a593Smuzhiyun * <mount fs and log replay>
3707*4882a593Smuzhiyun *
3708*4882a593Smuzhiyun * Would result in a fsync log that when replayed, our
3709*4882a593Smuzhiyun * file inode would have a link count of 1, but we get
3710*4882a593Smuzhiyun * two directory entries pointing to the same inode.
3711*4882a593Smuzhiyun * After removing one of the names, it would not be
3712*4882a593Smuzhiyun * possible to remove the other name, which resulted
3713*4882a593Smuzhiyun * always in stale file handle errors, and would not
3714*4882a593Smuzhiyun * be possible to rmdir the parent directory, since
3715*4882a593Smuzhiyun * its i_size could never decrement to the value
3716*4882a593Smuzhiyun * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3717*4882a593Smuzhiyun */
3718*4882a593Smuzhiyun di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3719*4882a593Smuzhiyun btrfs_dir_item_key_to_cpu(src, di, &tmp);
3720*4882a593Smuzhiyun if (ctx &&
3721*4882a593Smuzhiyun (btrfs_dir_transid(src, di) == trans->transid ||
3722*4882a593Smuzhiyun btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3723*4882a593Smuzhiyun tmp.type != BTRFS_ROOT_ITEM_KEY)
3724*4882a593Smuzhiyun ctx->log_new_dentries = true;
3725*4882a593Smuzhiyun }
3726*4882a593Smuzhiyun path->slots[0] = nritems;
3727*4882a593Smuzhiyun
3728*4882a593Smuzhiyun /*
3729*4882a593Smuzhiyun * look ahead to the next item and see if it is also
3730*4882a593Smuzhiyun * from this directory and from this transaction
3731*4882a593Smuzhiyun */
3732*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
3733*4882a593Smuzhiyun if (ret) {
3734*4882a593Smuzhiyun if (ret == 1)
3735*4882a593Smuzhiyun last_offset = (u64)-1;
3736*4882a593Smuzhiyun else
3737*4882a593Smuzhiyun err = ret;
3738*4882a593Smuzhiyun goto done;
3739*4882a593Smuzhiyun }
3740*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3741*4882a593Smuzhiyun if (tmp.objectid != ino || tmp.type != key_type) {
3742*4882a593Smuzhiyun last_offset = (u64)-1;
3743*4882a593Smuzhiyun goto done;
3744*4882a593Smuzhiyun }
3745*4882a593Smuzhiyun if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3746*4882a593Smuzhiyun ret = overwrite_item(trans, log, dst_path,
3747*4882a593Smuzhiyun path->nodes[0], path->slots[0],
3748*4882a593Smuzhiyun &tmp);
3749*4882a593Smuzhiyun if (ret)
3750*4882a593Smuzhiyun err = ret;
3751*4882a593Smuzhiyun else
3752*4882a593Smuzhiyun last_offset = tmp.offset;
3753*4882a593Smuzhiyun goto done;
3754*4882a593Smuzhiyun }
3755*4882a593Smuzhiyun }
3756*4882a593Smuzhiyun done:
3757*4882a593Smuzhiyun btrfs_release_path(path);
3758*4882a593Smuzhiyun btrfs_release_path(dst_path);
3759*4882a593Smuzhiyun
3760*4882a593Smuzhiyun if (err == 0) {
3761*4882a593Smuzhiyun *last_offset_ret = last_offset;
3762*4882a593Smuzhiyun /*
3763*4882a593Smuzhiyun * insert the log range keys to indicate where the log
3764*4882a593Smuzhiyun * is valid
3765*4882a593Smuzhiyun */
3766*4882a593Smuzhiyun ret = insert_dir_log_key(trans, log, path, key_type,
3767*4882a593Smuzhiyun ino, first_offset, last_offset);
3768*4882a593Smuzhiyun if (ret)
3769*4882a593Smuzhiyun err = ret;
3770*4882a593Smuzhiyun }
3771*4882a593Smuzhiyun return err;
3772*4882a593Smuzhiyun }
3773*4882a593Smuzhiyun
3774*4882a593Smuzhiyun /*
3775*4882a593Smuzhiyun * logging directories is very similar to logging inodes, We find all the items
3776*4882a593Smuzhiyun * from the current transaction and write them to the log.
3777*4882a593Smuzhiyun *
3778*4882a593Smuzhiyun * The recovery code scans the directory in the subvolume, and if it finds a
3779*4882a593Smuzhiyun * key in the range logged that is not present in the log tree, then it means
3780*4882a593Smuzhiyun * that dir entry was unlinked during the transaction.
3781*4882a593Smuzhiyun *
3782*4882a593Smuzhiyun * In order for that scan to work, we must include one key smaller than
3783*4882a593Smuzhiyun * the smallest logged by this transaction and one key larger than the largest
3784*4882a593Smuzhiyun * key logged by this transaction.
3785*4882a593Smuzhiyun */
log_directory_changes(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_path * dst_path,struct btrfs_log_ctx * ctx)3786*4882a593Smuzhiyun static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
3787*4882a593Smuzhiyun struct btrfs_root *root, struct btrfs_inode *inode,
3788*4882a593Smuzhiyun struct btrfs_path *path,
3789*4882a593Smuzhiyun struct btrfs_path *dst_path,
3790*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
3791*4882a593Smuzhiyun {
3792*4882a593Smuzhiyun u64 min_key;
3793*4882a593Smuzhiyun u64 max_key;
3794*4882a593Smuzhiyun int ret;
3795*4882a593Smuzhiyun int key_type = BTRFS_DIR_ITEM_KEY;
3796*4882a593Smuzhiyun
3797*4882a593Smuzhiyun again:
3798*4882a593Smuzhiyun min_key = 0;
3799*4882a593Smuzhiyun max_key = 0;
3800*4882a593Smuzhiyun while (1) {
3801*4882a593Smuzhiyun ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3802*4882a593Smuzhiyun ctx, min_key, &max_key);
3803*4882a593Smuzhiyun if (ret)
3804*4882a593Smuzhiyun return ret;
3805*4882a593Smuzhiyun if (max_key == (u64)-1)
3806*4882a593Smuzhiyun break;
3807*4882a593Smuzhiyun min_key = max_key + 1;
3808*4882a593Smuzhiyun }
3809*4882a593Smuzhiyun
3810*4882a593Smuzhiyun if (key_type == BTRFS_DIR_ITEM_KEY) {
3811*4882a593Smuzhiyun key_type = BTRFS_DIR_INDEX_KEY;
3812*4882a593Smuzhiyun goto again;
3813*4882a593Smuzhiyun }
3814*4882a593Smuzhiyun return 0;
3815*4882a593Smuzhiyun }
3816*4882a593Smuzhiyun
3817*4882a593Smuzhiyun /*
3818*4882a593Smuzhiyun * a helper function to drop items from the log before we relog an
3819*4882a593Smuzhiyun * inode. max_key_type indicates the highest item type to remove.
3820*4882a593Smuzhiyun * This cannot be run for file data extents because it does not
3821*4882a593Smuzhiyun * free the extents they point to.
3822*4882a593Smuzhiyun */
drop_objectid_items(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,u64 objectid,int max_key_type)3823*4882a593Smuzhiyun static int drop_objectid_items(struct btrfs_trans_handle *trans,
3824*4882a593Smuzhiyun struct btrfs_root *log,
3825*4882a593Smuzhiyun struct btrfs_path *path,
3826*4882a593Smuzhiyun u64 objectid, int max_key_type)
3827*4882a593Smuzhiyun {
3828*4882a593Smuzhiyun int ret;
3829*4882a593Smuzhiyun struct btrfs_key key;
3830*4882a593Smuzhiyun struct btrfs_key found_key;
3831*4882a593Smuzhiyun int start_slot;
3832*4882a593Smuzhiyun
3833*4882a593Smuzhiyun key.objectid = objectid;
3834*4882a593Smuzhiyun key.type = max_key_type;
3835*4882a593Smuzhiyun key.offset = (u64)-1;
3836*4882a593Smuzhiyun
3837*4882a593Smuzhiyun while (1) {
3838*4882a593Smuzhiyun ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
3839*4882a593Smuzhiyun BUG_ON(ret == 0); /* Logic error */
3840*4882a593Smuzhiyun if (ret < 0)
3841*4882a593Smuzhiyun break;
3842*4882a593Smuzhiyun
3843*4882a593Smuzhiyun if (path->slots[0] == 0)
3844*4882a593Smuzhiyun break;
3845*4882a593Smuzhiyun
3846*4882a593Smuzhiyun path->slots[0]--;
3847*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3848*4882a593Smuzhiyun path->slots[0]);
3849*4882a593Smuzhiyun
3850*4882a593Smuzhiyun if (found_key.objectid != objectid)
3851*4882a593Smuzhiyun break;
3852*4882a593Smuzhiyun
3853*4882a593Smuzhiyun found_key.offset = 0;
3854*4882a593Smuzhiyun found_key.type = 0;
3855*4882a593Smuzhiyun ret = btrfs_bin_search(path->nodes[0], &found_key, &start_slot);
3856*4882a593Smuzhiyun if (ret < 0)
3857*4882a593Smuzhiyun break;
3858*4882a593Smuzhiyun
3859*4882a593Smuzhiyun ret = btrfs_del_items(trans, log, path, start_slot,
3860*4882a593Smuzhiyun path->slots[0] - start_slot + 1);
3861*4882a593Smuzhiyun /*
3862*4882a593Smuzhiyun * If start slot isn't 0 then we don't need to re-search, we've
3863*4882a593Smuzhiyun * found the last guy with the objectid in this tree.
3864*4882a593Smuzhiyun */
3865*4882a593Smuzhiyun if (ret || start_slot != 0)
3866*4882a593Smuzhiyun break;
3867*4882a593Smuzhiyun btrfs_release_path(path);
3868*4882a593Smuzhiyun }
3869*4882a593Smuzhiyun btrfs_release_path(path);
3870*4882a593Smuzhiyun if (ret > 0)
3871*4882a593Smuzhiyun ret = 0;
3872*4882a593Smuzhiyun return ret;
3873*4882a593Smuzhiyun }
3874*4882a593Smuzhiyun
/*
 * Fill an inode item inside @leaf from the in-memory state of @inode.
 *
 * @trans:          transaction handle, its transid is stored in the item
 * @leaf:           extent buffer that contains @item
 * @item:           the inode item to fill
 * @inode:          the VFS inode the values are read from
 * @log_inode_only: when non-zero, store a generation of 0 and @logged_isize
 *                  as the size instead of the inode's own values
 * @logged_isize:   size to store when @log_inode_only is set
 */
static void fill_inode_item(struct btrfs_trans_handle *trans,
			    struct extent_buffer *leaf,
			    struct btrfs_inode_item *item,
			    struct inode *inode, int log_inode_only,
			    u64 logged_isize)
{
	struct btrfs_map_token token;
	u64 generation;
	u64 size;

	btrfs_init_map_token(&token, leaf);

	if (log_inode_only) {
		/*
		 * Set the generation to zero so the recovery code can tell
		 * the difference between a logging just to say 'this inode
		 * exists' and a logging to say 'update this inode with these
		 * values'.
		 */
		generation = 0;
		size = logged_isize;
	} else {
		generation = BTRFS_I(inode)->generation;
		size = inode->i_size;
	}
	btrfs_set_token_inode_generation(&token, item, generation);
	btrfs_set_token_inode_size(&token, item, size);

	/* Ownership, mode and link count. */
	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);

	/* Access, modification and change timestamps. */
	btrfs_set_token_timespec_sec(&token, &item->atime,
				     inode->i_atime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->atime,
				      inode->i_atime.tv_nsec);

	btrfs_set_token_timespec_sec(&token, &item->mtime,
				     inode->i_mtime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->mtime,
				      inode->i_mtime.tv_nsec);

	btrfs_set_token_timespec_sec(&token, &item->ctime,
				     inode->i_ctime.tv_sec);
	btrfs_set_token_timespec_nsec(&token, &item->ctime,
				      inode->i_ctime.tv_nsec);

	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));

	btrfs_set_token_inode_sequence(&token, item,
				       inode_peek_iversion(inode));
	btrfs_set_token_inode_transid(&token, item, trans->transid);
	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
	btrfs_set_token_inode_flags(&token, item, BTRFS_I(inode)->flags);
	btrfs_set_token_inode_block_group(&token, item, 0);
}
3927*4882a593Smuzhiyun
log_inode_item(struct btrfs_trans_handle * trans,struct btrfs_root * log,struct btrfs_path * path,struct btrfs_inode * inode)3928*4882a593Smuzhiyun static int log_inode_item(struct btrfs_trans_handle *trans,
3929*4882a593Smuzhiyun struct btrfs_root *log, struct btrfs_path *path,
3930*4882a593Smuzhiyun struct btrfs_inode *inode)
3931*4882a593Smuzhiyun {
3932*4882a593Smuzhiyun struct btrfs_inode_item *inode_item;
3933*4882a593Smuzhiyun int ret;
3934*4882a593Smuzhiyun
3935*4882a593Smuzhiyun ret = btrfs_insert_empty_item(trans, log, path,
3936*4882a593Smuzhiyun &inode->location, sizeof(*inode_item));
3937*4882a593Smuzhiyun if (ret && ret != -EEXIST)
3938*4882a593Smuzhiyun return ret;
3939*4882a593Smuzhiyun inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3940*4882a593Smuzhiyun struct btrfs_inode_item);
3941*4882a593Smuzhiyun fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
3942*4882a593Smuzhiyun 0, 0);
3943*4882a593Smuzhiyun btrfs_release_path(path);
3944*4882a593Smuzhiyun return 0;
3945*4882a593Smuzhiyun }
3946*4882a593Smuzhiyun
log_csums(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_root * log_root,struct btrfs_ordered_sum * sums)3947*4882a593Smuzhiyun static int log_csums(struct btrfs_trans_handle *trans,
3948*4882a593Smuzhiyun struct btrfs_inode *inode,
3949*4882a593Smuzhiyun struct btrfs_root *log_root,
3950*4882a593Smuzhiyun struct btrfs_ordered_sum *sums)
3951*4882a593Smuzhiyun {
3952*4882a593Smuzhiyun const u64 lock_end = sums->bytenr + sums->len - 1;
3953*4882a593Smuzhiyun struct extent_state *cached_state = NULL;
3954*4882a593Smuzhiyun int ret;
3955*4882a593Smuzhiyun
3956*4882a593Smuzhiyun /*
3957*4882a593Smuzhiyun * If this inode was not used for reflink operations in the current
3958*4882a593Smuzhiyun * transaction with new extents, then do the fast path, no need to
3959*4882a593Smuzhiyun * worry about logging checksum items with overlapping ranges.
3960*4882a593Smuzhiyun */
3961*4882a593Smuzhiyun if (inode->last_reflink_trans < trans->transid)
3962*4882a593Smuzhiyun return btrfs_csum_file_blocks(trans, log_root, sums);
3963*4882a593Smuzhiyun
3964*4882a593Smuzhiyun /*
3965*4882a593Smuzhiyun * Serialize logging for checksums. This is to avoid racing with the
3966*4882a593Smuzhiyun * same checksum being logged by another task that is logging another
3967*4882a593Smuzhiyun * file which happens to refer to the same extent as well. Such races
3968*4882a593Smuzhiyun * can leave checksum items in the log with overlapping ranges.
3969*4882a593Smuzhiyun */
3970*4882a593Smuzhiyun ret = lock_extent_bits(&log_root->log_csum_range, sums->bytenr,
3971*4882a593Smuzhiyun lock_end, &cached_state);
3972*4882a593Smuzhiyun if (ret)
3973*4882a593Smuzhiyun return ret;
3974*4882a593Smuzhiyun /*
3975*4882a593Smuzhiyun * Due to extent cloning, we might have logged a csum item that covers a
3976*4882a593Smuzhiyun * subrange of a cloned extent, and later we can end up logging a csum
3977*4882a593Smuzhiyun * item for a larger subrange of the same extent or the entire range.
3978*4882a593Smuzhiyun * This would leave csum items in the log tree that cover the same range
3979*4882a593Smuzhiyun * and break the searches for checksums in the log tree, resulting in
3980*4882a593Smuzhiyun * some checksums missing in the fs/subvolume tree. So just delete (or
3981*4882a593Smuzhiyun * trim and adjust) any existing csum items in the log for this range.
3982*4882a593Smuzhiyun */
3983*4882a593Smuzhiyun ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
3984*4882a593Smuzhiyun if (!ret)
3985*4882a593Smuzhiyun ret = btrfs_csum_file_blocks(trans, log_root, sums);
3986*4882a593Smuzhiyun
3987*4882a593Smuzhiyun unlock_extent_cached(&log_root->log_csum_range, sums->bytenr, lock_end,
3988*4882a593Smuzhiyun &cached_state);
3989*4882a593Smuzhiyun
3990*4882a593Smuzhiyun return ret;
3991*4882a593Smuzhiyun }
3992*4882a593Smuzhiyun
copy_items(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * dst_path,struct btrfs_path * src_path,int start_slot,int nr,int inode_only,u64 logged_isize)3993*4882a593Smuzhiyun static noinline int copy_items(struct btrfs_trans_handle *trans,
3994*4882a593Smuzhiyun struct btrfs_inode *inode,
3995*4882a593Smuzhiyun struct btrfs_path *dst_path,
3996*4882a593Smuzhiyun struct btrfs_path *src_path,
3997*4882a593Smuzhiyun int start_slot, int nr, int inode_only,
3998*4882a593Smuzhiyun u64 logged_isize)
3999*4882a593Smuzhiyun {
4000*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = trans->fs_info;
4001*4882a593Smuzhiyun unsigned long src_offset;
4002*4882a593Smuzhiyun unsigned long dst_offset;
4003*4882a593Smuzhiyun struct btrfs_root *log = inode->root->log_root;
4004*4882a593Smuzhiyun struct btrfs_file_extent_item *extent;
4005*4882a593Smuzhiyun struct btrfs_inode_item *inode_item;
4006*4882a593Smuzhiyun struct extent_buffer *src = src_path->nodes[0];
4007*4882a593Smuzhiyun int ret;
4008*4882a593Smuzhiyun struct btrfs_key *ins_keys;
4009*4882a593Smuzhiyun u32 *ins_sizes;
4010*4882a593Smuzhiyun char *ins_data;
4011*4882a593Smuzhiyun int i;
4012*4882a593Smuzhiyun struct list_head ordered_sums;
4013*4882a593Smuzhiyun int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
4014*4882a593Smuzhiyun
4015*4882a593Smuzhiyun INIT_LIST_HEAD(&ordered_sums);
4016*4882a593Smuzhiyun
4017*4882a593Smuzhiyun ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
4018*4882a593Smuzhiyun nr * sizeof(u32), GFP_NOFS);
4019*4882a593Smuzhiyun if (!ins_data)
4020*4882a593Smuzhiyun return -ENOMEM;
4021*4882a593Smuzhiyun
4022*4882a593Smuzhiyun ins_sizes = (u32 *)ins_data;
4023*4882a593Smuzhiyun ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
4024*4882a593Smuzhiyun
4025*4882a593Smuzhiyun for (i = 0; i < nr; i++) {
4026*4882a593Smuzhiyun ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
4027*4882a593Smuzhiyun btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
4028*4882a593Smuzhiyun }
4029*4882a593Smuzhiyun ret = btrfs_insert_empty_items(trans, log, dst_path,
4030*4882a593Smuzhiyun ins_keys, ins_sizes, nr);
4031*4882a593Smuzhiyun if (ret) {
4032*4882a593Smuzhiyun kfree(ins_data);
4033*4882a593Smuzhiyun return ret;
4034*4882a593Smuzhiyun }
4035*4882a593Smuzhiyun
4036*4882a593Smuzhiyun for (i = 0; i < nr; i++, dst_path->slots[0]++) {
4037*4882a593Smuzhiyun dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
4038*4882a593Smuzhiyun dst_path->slots[0]);
4039*4882a593Smuzhiyun
4040*4882a593Smuzhiyun src_offset = btrfs_item_ptr_offset(src, start_slot + i);
4041*4882a593Smuzhiyun
4042*4882a593Smuzhiyun if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
4043*4882a593Smuzhiyun inode_item = btrfs_item_ptr(dst_path->nodes[0],
4044*4882a593Smuzhiyun dst_path->slots[0],
4045*4882a593Smuzhiyun struct btrfs_inode_item);
4046*4882a593Smuzhiyun fill_inode_item(trans, dst_path->nodes[0], inode_item,
4047*4882a593Smuzhiyun &inode->vfs_inode,
4048*4882a593Smuzhiyun inode_only == LOG_INODE_EXISTS,
4049*4882a593Smuzhiyun logged_isize);
4050*4882a593Smuzhiyun } else {
4051*4882a593Smuzhiyun copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
4052*4882a593Smuzhiyun src_offset, ins_sizes[i]);
4053*4882a593Smuzhiyun }
4054*4882a593Smuzhiyun
4055*4882a593Smuzhiyun /* take a reference on file data extents so that truncates
4056*4882a593Smuzhiyun * or deletes of this inode don't have to relog the inode
4057*4882a593Smuzhiyun * again
4058*4882a593Smuzhiyun */
4059*4882a593Smuzhiyun if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
4060*4882a593Smuzhiyun !skip_csum) {
4061*4882a593Smuzhiyun int found_type;
4062*4882a593Smuzhiyun extent = btrfs_item_ptr(src, start_slot + i,
4063*4882a593Smuzhiyun struct btrfs_file_extent_item);
4064*4882a593Smuzhiyun
4065*4882a593Smuzhiyun if (btrfs_file_extent_generation(src, extent) < trans->transid)
4066*4882a593Smuzhiyun continue;
4067*4882a593Smuzhiyun
4068*4882a593Smuzhiyun found_type = btrfs_file_extent_type(src, extent);
4069*4882a593Smuzhiyun if (found_type == BTRFS_FILE_EXTENT_REG) {
4070*4882a593Smuzhiyun u64 ds, dl, cs, cl;
4071*4882a593Smuzhiyun ds = btrfs_file_extent_disk_bytenr(src,
4072*4882a593Smuzhiyun extent);
4073*4882a593Smuzhiyun /* ds == 0 is a hole */
4074*4882a593Smuzhiyun if (ds == 0)
4075*4882a593Smuzhiyun continue;
4076*4882a593Smuzhiyun
4077*4882a593Smuzhiyun dl = btrfs_file_extent_disk_num_bytes(src,
4078*4882a593Smuzhiyun extent);
4079*4882a593Smuzhiyun cs = btrfs_file_extent_offset(src, extent);
4080*4882a593Smuzhiyun cl = btrfs_file_extent_num_bytes(src,
4081*4882a593Smuzhiyun extent);
4082*4882a593Smuzhiyun if (btrfs_file_extent_compression(src,
4083*4882a593Smuzhiyun extent)) {
4084*4882a593Smuzhiyun cs = 0;
4085*4882a593Smuzhiyun cl = dl;
4086*4882a593Smuzhiyun }
4087*4882a593Smuzhiyun
4088*4882a593Smuzhiyun ret = btrfs_lookup_csums_range(
4089*4882a593Smuzhiyun fs_info->csum_root,
4090*4882a593Smuzhiyun ds + cs, ds + cs + cl - 1,
4091*4882a593Smuzhiyun &ordered_sums, 0);
4092*4882a593Smuzhiyun if (ret)
4093*4882a593Smuzhiyun break;
4094*4882a593Smuzhiyun }
4095*4882a593Smuzhiyun }
4096*4882a593Smuzhiyun }
4097*4882a593Smuzhiyun
4098*4882a593Smuzhiyun btrfs_mark_buffer_dirty(dst_path->nodes[0]);
4099*4882a593Smuzhiyun btrfs_release_path(dst_path);
4100*4882a593Smuzhiyun kfree(ins_data);
4101*4882a593Smuzhiyun
4102*4882a593Smuzhiyun /*
4103*4882a593Smuzhiyun * we have to do this after the loop above to avoid changing the
4104*4882a593Smuzhiyun * log tree while trying to change the log tree.
4105*4882a593Smuzhiyun */
4106*4882a593Smuzhiyun while (!list_empty(&ordered_sums)) {
4107*4882a593Smuzhiyun struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4108*4882a593Smuzhiyun struct btrfs_ordered_sum,
4109*4882a593Smuzhiyun list);
4110*4882a593Smuzhiyun if (!ret)
4111*4882a593Smuzhiyun ret = log_csums(trans, inode, log, sums);
4112*4882a593Smuzhiyun list_del(&sums->list);
4113*4882a593Smuzhiyun kfree(sums);
4114*4882a593Smuzhiyun }
4115*4882a593Smuzhiyun
4116*4882a593Smuzhiyun return ret;
4117*4882a593Smuzhiyun }
4118*4882a593Smuzhiyun
extent_cmp(void * priv,struct list_head * a,struct list_head * b)4119*4882a593Smuzhiyun static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
4120*4882a593Smuzhiyun {
4121*4882a593Smuzhiyun struct extent_map *em1, *em2;
4122*4882a593Smuzhiyun
4123*4882a593Smuzhiyun em1 = list_entry(a, struct extent_map, list);
4124*4882a593Smuzhiyun em2 = list_entry(b, struct extent_map, list);
4125*4882a593Smuzhiyun
4126*4882a593Smuzhiyun if (em1->start < em2->start)
4127*4882a593Smuzhiyun return -1;
4128*4882a593Smuzhiyun else if (em1->start > em2->start)
4129*4882a593Smuzhiyun return 1;
4130*4882a593Smuzhiyun return 0;
4131*4882a593Smuzhiyun }
4132*4882a593Smuzhiyun
log_extent_csums(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_root * log_root,const struct extent_map * em,struct btrfs_log_ctx * ctx)4133*4882a593Smuzhiyun static int log_extent_csums(struct btrfs_trans_handle *trans,
4134*4882a593Smuzhiyun struct btrfs_inode *inode,
4135*4882a593Smuzhiyun struct btrfs_root *log_root,
4136*4882a593Smuzhiyun const struct extent_map *em,
4137*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
4138*4882a593Smuzhiyun {
4139*4882a593Smuzhiyun struct btrfs_ordered_extent *ordered;
4140*4882a593Smuzhiyun u64 csum_offset;
4141*4882a593Smuzhiyun u64 csum_len;
4142*4882a593Smuzhiyun u64 mod_start = em->mod_start;
4143*4882a593Smuzhiyun u64 mod_len = em->mod_len;
4144*4882a593Smuzhiyun LIST_HEAD(ordered_sums);
4145*4882a593Smuzhiyun int ret = 0;
4146*4882a593Smuzhiyun
4147*4882a593Smuzhiyun if (inode->flags & BTRFS_INODE_NODATASUM ||
4148*4882a593Smuzhiyun test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
4149*4882a593Smuzhiyun em->block_start == EXTENT_MAP_HOLE)
4150*4882a593Smuzhiyun return 0;
4151*4882a593Smuzhiyun
4152*4882a593Smuzhiyun list_for_each_entry(ordered, &ctx->ordered_extents, log_list) {
4153*4882a593Smuzhiyun const u64 ordered_end = ordered->file_offset + ordered->num_bytes;
4154*4882a593Smuzhiyun const u64 mod_end = mod_start + mod_len;
4155*4882a593Smuzhiyun struct btrfs_ordered_sum *sums;
4156*4882a593Smuzhiyun
4157*4882a593Smuzhiyun if (mod_len == 0)
4158*4882a593Smuzhiyun break;
4159*4882a593Smuzhiyun
4160*4882a593Smuzhiyun if (ordered_end <= mod_start)
4161*4882a593Smuzhiyun continue;
4162*4882a593Smuzhiyun if (mod_end <= ordered->file_offset)
4163*4882a593Smuzhiyun break;
4164*4882a593Smuzhiyun
4165*4882a593Smuzhiyun /*
4166*4882a593Smuzhiyun * We are going to copy all the csums on this ordered extent, so
4167*4882a593Smuzhiyun * go ahead and adjust mod_start and mod_len in case this ordered
4168*4882a593Smuzhiyun * extent has already been logged.
4169*4882a593Smuzhiyun */
4170*4882a593Smuzhiyun if (ordered->file_offset > mod_start) {
4171*4882a593Smuzhiyun if (ordered_end >= mod_end)
4172*4882a593Smuzhiyun mod_len = ordered->file_offset - mod_start;
4173*4882a593Smuzhiyun /*
4174*4882a593Smuzhiyun * If we have this case
4175*4882a593Smuzhiyun *
4176*4882a593Smuzhiyun * |--------- logged extent ---------|
4177*4882a593Smuzhiyun * |----- ordered extent ----|
4178*4882a593Smuzhiyun *
4179*4882a593Smuzhiyun * Just don't mess with mod_start and mod_len, we'll
4180*4882a593Smuzhiyun * just end up logging more csums than we need and it
4181*4882a593Smuzhiyun * will be ok.
4182*4882a593Smuzhiyun */
4183*4882a593Smuzhiyun } else {
4184*4882a593Smuzhiyun if (ordered_end < mod_end) {
4185*4882a593Smuzhiyun mod_len = mod_end - ordered_end;
4186*4882a593Smuzhiyun mod_start = ordered_end;
4187*4882a593Smuzhiyun } else {
4188*4882a593Smuzhiyun mod_len = 0;
4189*4882a593Smuzhiyun }
4190*4882a593Smuzhiyun }
4191*4882a593Smuzhiyun
4192*4882a593Smuzhiyun /*
4193*4882a593Smuzhiyun * To keep us from looping for the above case of an ordered
4194*4882a593Smuzhiyun * extent that falls inside of the logged extent.
4195*4882a593Smuzhiyun */
4196*4882a593Smuzhiyun if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags))
4197*4882a593Smuzhiyun continue;
4198*4882a593Smuzhiyun
4199*4882a593Smuzhiyun list_for_each_entry(sums, &ordered->list, list) {
4200*4882a593Smuzhiyun ret = log_csums(trans, inode, log_root, sums);
4201*4882a593Smuzhiyun if (ret)
4202*4882a593Smuzhiyun return ret;
4203*4882a593Smuzhiyun }
4204*4882a593Smuzhiyun }
4205*4882a593Smuzhiyun
4206*4882a593Smuzhiyun /* We're done, found all csums in the ordered extents. */
4207*4882a593Smuzhiyun if (mod_len == 0)
4208*4882a593Smuzhiyun return 0;
4209*4882a593Smuzhiyun
4210*4882a593Smuzhiyun /* If we're compressed we have to save the entire range of csums. */
4211*4882a593Smuzhiyun if (em->compress_type) {
4212*4882a593Smuzhiyun csum_offset = 0;
4213*4882a593Smuzhiyun csum_len = max(em->block_len, em->orig_block_len);
4214*4882a593Smuzhiyun } else {
4215*4882a593Smuzhiyun csum_offset = mod_start - em->start;
4216*4882a593Smuzhiyun csum_len = mod_len;
4217*4882a593Smuzhiyun }
4218*4882a593Smuzhiyun
4219*4882a593Smuzhiyun /* block start is already adjusted for the file extent offset. */
4220*4882a593Smuzhiyun ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
4221*4882a593Smuzhiyun em->block_start + csum_offset,
4222*4882a593Smuzhiyun em->block_start + csum_offset +
4223*4882a593Smuzhiyun csum_len - 1, &ordered_sums, 0);
4224*4882a593Smuzhiyun if (ret)
4225*4882a593Smuzhiyun return ret;
4226*4882a593Smuzhiyun
4227*4882a593Smuzhiyun while (!list_empty(&ordered_sums)) {
4228*4882a593Smuzhiyun struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4229*4882a593Smuzhiyun struct btrfs_ordered_sum,
4230*4882a593Smuzhiyun list);
4231*4882a593Smuzhiyun if (!ret)
4232*4882a593Smuzhiyun ret = log_csums(trans, inode, log_root, sums);
4233*4882a593Smuzhiyun list_del(&sums->list);
4234*4882a593Smuzhiyun kfree(sums);
4235*4882a593Smuzhiyun }
4236*4882a593Smuzhiyun
4237*4882a593Smuzhiyun return ret;
4238*4882a593Smuzhiyun }
4239*4882a593Smuzhiyun
log_one_extent(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_root * root,const struct extent_map * em,struct btrfs_path * path,struct btrfs_log_ctx * ctx)4240*4882a593Smuzhiyun static int log_one_extent(struct btrfs_trans_handle *trans,
4241*4882a593Smuzhiyun struct btrfs_inode *inode, struct btrfs_root *root,
4242*4882a593Smuzhiyun const struct extent_map *em,
4243*4882a593Smuzhiyun struct btrfs_path *path,
4244*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
4245*4882a593Smuzhiyun {
4246*4882a593Smuzhiyun struct btrfs_root *log = root->log_root;
4247*4882a593Smuzhiyun struct btrfs_file_extent_item *fi;
4248*4882a593Smuzhiyun struct extent_buffer *leaf;
4249*4882a593Smuzhiyun struct btrfs_map_token token;
4250*4882a593Smuzhiyun struct btrfs_key key;
4251*4882a593Smuzhiyun u64 extent_offset = em->start - em->orig_start;
4252*4882a593Smuzhiyun u64 block_len;
4253*4882a593Smuzhiyun int ret;
4254*4882a593Smuzhiyun int extent_inserted = 0;
4255*4882a593Smuzhiyun
4256*4882a593Smuzhiyun ret = log_extent_csums(trans, inode, log, em, ctx);
4257*4882a593Smuzhiyun if (ret)
4258*4882a593Smuzhiyun return ret;
4259*4882a593Smuzhiyun
4260*4882a593Smuzhiyun ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
4261*4882a593Smuzhiyun em->start + em->len, NULL, 0, 1,
4262*4882a593Smuzhiyun sizeof(*fi), &extent_inserted);
4263*4882a593Smuzhiyun if (ret)
4264*4882a593Smuzhiyun return ret;
4265*4882a593Smuzhiyun
4266*4882a593Smuzhiyun if (!extent_inserted) {
4267*4882a593Smuzhiyun key.objectid = btrfs_ino(inode);
4268*4882a593Smuzhiyun key.type = BTRFS_EXTENT_DATA_KEY;
4269*4882a593Smuzhiyun key.offset = em->start;
4270*4882a593Smuzhiyun
4271*4882a593Smuzhiyun ret = btrfs_insert_empty_item(trans, log, path, &key,
4272*4882a593Smuzhiyun sizeof(*fi));
4273*4882a593Smuzhiyun if (ret)
4274*4882a593Smuzhiyun return ret;
4275*4882a593Smuzhiyun }
4276*4882a593Smuzhiyun leaf = path->nodes[0];
4277*4882a593Smuzhiyun btrfs_init_map_token(&token, leaf);
4278*4882a593Smuzhiyun fi = btrfs_item_ptr(leaf, path->slots[0],
4279*4882a593Smuzhiyun struct btrfs_file_extent_item);
4280*4882a593Smuzhiyun
4281*4882a593Smuzhiyun btrfs_set_token_file_extent_generation(&token, fi, trans->transid);
4282*4882a593Smuzhiyun if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4283*4882a593Smuzhiyun btrfs_set_token_file_extent_type(&token, fi,
4284*4882a593Smuzhiyun BTRFS_FILE_EXTENT_PREALLOC);
4285*4882a593Smuzhiyun else
4286*4882a593Smuzhiyun btrfs_set_token_file_extent_type(&token, fi,
4287*4882a593Smuzhiyun BTRFS_FILE_EXTENT_REG);
4288*4882a593Smuzhiyun
4289*4882a593Smuzhiyun block_len = max(em->block_len, em->orig_block_len);
4290*4882a593Smuzhiyun if (em->compress_type != BTRFS_COMPRESS_NONE) {
4291*4882a593Smuzhiyun btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4292*4882a593Smuzhiyun em->block_start);
4293*4882a593Smuzhiyun btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4294*4882a593Smuzhiyun } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4295*4882a593Smuzhiyun btrfs_set_token_file_extent_disk_bytenr(&token, fi,
4296*4882a593Smuzhiyun em->block_start -
4297*4882a593Smuzhiyun extent_offset);
4298*4882a593Smuzhiyun btrfs_set_token_file_extent_disk_num_bytes(&token, fi, block_len);
4299*4882a593Smuzhiyun } else {
4300*4882a593Smuzhiyun btrfs_set_token_file_extent_disk_bytenr(&token, fi, 0);
4301*4882a593Smuzhiyun btrfs_set_token_file_extent_disk_num_bytes(&token, fi, 0);
4302*4882a593Smuzhiyun }
4303*4882a593Smuzhiyun
4304*4882a593Smuzhiyun btrfs_set_token_file_extent_offset(&token, fi, extent_offset);
4305*4882a593Smuzhiyun btrfs_set_token_file_extent_num_bytes(&token, fi, em->len);
4306*4882a593Smuzhiyun btrfs_set_token_file_extent_ram_bytes(&token, fi, em->ram_bytes);
4307*4882a593Smuzhiyun btrfs_set_token_file_extent_compression(&token, fi, em->compress_type);
4308*4882a593Smuzhiyun btrfs_set_token_file_extent_encryption(&token, fi, 0);
4309*4882a593Smuzhiyun btrfs_set_token_file_extent_other_encoding(&token, fi, 0);
4310*4882a593Smuzhiyun btrfs_mark_buffer_dirty(leaf);
4311*4882a593Smuzhiyun
4312*4882a593Smuzhiyun btrfs_release_path(path);
4313*4882a593Smuzhiyun
4314*4882a593Smuzhiyun return ret;
4315*4882a593Smuzhiyun }
4316*4882a593Smuzhiyun
4317*4882a593Smuzhiyun /*
4318*4882a593Smuzhiyun * Log all prealloc extents beyond the inode's i_size to make sure we do not
4319*4882a593Smuzhiyun * lose them after doing a full/fast fsync and replaying the log. We scan the
4320*4882a593Smuzhiyun * subvolume's root instead of iterating the inode's extent map tree because
4321*4882a593Smuzhiyun * otherwise we can log incorrect extent items based on extent map conversion.
4322*4882a593Smuzhiyun * That can happen due to the fact that extent maps are merged when they
4323*4882a593Smuzhiyun * are not in the extent map tree's list of modified extents.
4324*4882a593Smuzhiyun */
btrfs_log_prealloc_extents(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_path * path)4325*4882a593Smuzhiyun static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4326*4882a593Smuzhiyun struct btrfs_inode *inode,
4327*4882a593Smuzhiyun struct btrfs_path *path)
4328*4882a593Smuzhiyun {
4329*4882a593Smuzhiyun struct btrfs_root *root = inode->root;
4330*4882a593Smuzhiyun struct btrfs_key key;
4331*4882a593Smuzhiyun const u64 i_size = i_size_read(&inode->vfs_inode);
4332*4882a593Smuzhiyun const u64 ino = btrfs_ino(inode);
4333*4882a593Smuzhiyun struct btrfs_path *dst_path = NULL;
4334*4882a593Smuzhiyun bool dropped_extents = false;
4335*4882a593Smuzhiyun u64 truncate_offset = i_size;
4336*4882a593Smuzhiyun struct extent_buffer *leaf;
4337*4882a593Smuzhiyun int slot;
4338*4882a593Smuzhiyun int ins_nr = 0;
4339*4882a593Smuzhiyun int start_slot;
4340*4882a593Smuzhiyun int ret;
4341*4882a593Smuzhiyun
4342*4882a593Smuzhiyun if (!(inode->flags & BTRFS_INODE_PREALLOC))
4343*4882a593Smuzhiyun return 0;
4344*4882a593Smuzhiyun
4345*4882a593Smuzhiyun key.objectid = ino;
4346*4882a593Smuzhiyun key.type = BTRFS_EXTENT_DATA_KEY;
4347*4882a593Smuzhiyun key.offset = i_size;
4348*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4349*4882a593Smuzhiyun if (ret < 0)
4350*4882a593Smuzhiyun goto out;
4351*4882a593Smuzhiyun
4352*4882a593Smuzhiyun /*
4353*4882a593Smuzhiyun * We must check if there is a prealloc extent that starts before the
4354*4882a593Smuzhiyun * i_size and crosses the i_size boundary. This is to ensure later we
4355*4882a593Smuzhiyun * truncate down to the end of that extent and not to the i_size, as
4356*4882a593Smuzhiyun * otherwise we end up losing part of the prealloc extent after a log
4357*4882a593Smuzhiyun * replay and with an implicit hole if there is another prealloc extent
4358*4882a593Smuzhiyun * that starts at an offset beyond i_size.
4359*4882a593Smuzhiyun */
4360*4882a593Smuzhiyun ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
4361*4882a593Smuzhiyun if (ret < 0)
4362*4882a593Smuzhiyun goto out;
4363*4882a593Smuzhiyun
4364*4882a593Smuzhiyun if (ret == 0) {
4365*4882a593Smuzhiyun struct btrfs_file_extent_item *ei;
4366*4882a593Smuzhiyun
4367*4882a593Smuzhiyun leaf = path->nodes[0];
4368*4882a593Smuzhiyun slot = path->slots[0];
4369*4882a593Smuzhiyun ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4370*4882a593Smuzhiyun
4371*4882a593Smuzhiyun if (btrfs_file_extent_type(leaf, ei) ==
4372*4882a593Smuzhiyun BTRFS_FILE_EXTENT_PREALLOC) {
4373*4882a593Smuzhiyun u64 extent_end;
4374*4882a593Smuzhiyun
4375*4882a593Smuzhiyun btrfs_item_key_to_cpu(leaf, &key, slot);
4376*4882a593Smuzhiyun extent_end = key.offset +
4377*4882a593Smuzhiyun btrfs_file_extent_num_bytes(leaf, ei);
4378*4882a593Smuzhiyun
4379*4882a593Smuzhiyun if (extent_end > i_size)
4380*4882a593Smuzhiyun truncate_offset = extent_end;
4381*4882a593Smuzhiyun }
4382*4882a593Smuzhiyun } else {
4383*4882a593Smuzhiyun ret = 0;
4384*4882a593Smuzhiyun }
4385*4882a593Smuzhiyun
4386*4882a593Smuzhiyun while (true) {
4387*4882a593Smuzhiyun leaf = path->nodes[0];
4388*4882a593Smuzhiyun slot = path->slots[0];
4389*4882a593Smuzhiyun
4390*4882a593Smuzhiyun if (slot >= btrfs_header_nritems(leaf)) {
4391*4882a593Smuzhiyun if (ins_nr > 0) {
4392*4882a593Smuzhiyun ret = copy_items(trans, inode, dst_path, path,
4393*4882a593Smuzhiyun start_slot, ins_nr, 1, 0);
4394*4882a593Smuzhiyun if (ret < 0)
4395*4882a593Smuzhiyun goto out;
4396*4882a593Smuzhiyun ins_nr = 0;
4397*4882a593Smuzhiyun }
4398*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
4399*4882a593Smuzhiyun if (ret < 0)
4400*4882a593Smuzhiyun goto out;
4401*4882a593Smuzhiyun if (ret > 0) {
4402*4882a593Smuzhiyun ret = 0;
4403*4882a593Smuzhiyun break;
4404*4882a593Smuzhiyun }
4405*4882a593Smuzhiyun continue;
4406*4882a593Smuzhiyun }
4407*4882a593Smuzhiyun
4408*4882a593Smuzhiyun btrfs_item_key_to_cpu(leaf, &key, slot);
4409*4882a593Smuzhiyun if (key.objectid > ino)
4410*4882a593Smuzhiyun break;
4411*4882a593Smuzhiyun if (WARN_ON_ONCE(key.objectid < ino) ||
4412*4882a593Smuzhiyun key.type < BTRFS_EXTENT_DATA_KEY ||
4413*4882a593Smuzhiyun key.offset < i_size) {
4414*4882a593Smuzhiyun path->slots[0]++;
4415*4882a593Smuzhiyun continue;
4416*4882a593Smuzhiyun }
4417*4882a593Smuzhiyun if (!dropped_extents) {
4418*4882a593Smuzhiyun /*
4419*4882a593Smuzhiyun * Avoid logging extent items logged in past fsync calls
4420*4882a593Smuzhiyun * and leading to duplicate keys in the log tree.
4421*4882a593Smuzhiyun */
4422*4882a593Smuzhiyun do {
4423*4882a593Smuzhiyun ret = btrfs_truncate_inode_items(trans,
4424*4882a593Smuzhiyun root->log_root,
4425*4882a593Smuzhiyun &inode->vfs_inode,
4426*4882a593Smuzhiyun truncate_offset,
4427*4882a593Smuzhiyun BTRFS_EXTENT_DATA_KEY);
4428*4882a593Smuzhiyun } while (ret == -EAGAIN);
4429*4882a593Smuzhiyun if (ret)
4430*4882a593Smuzhiyun goto out;
4431*4882a593Smuzhiyun dropped_extents = true;
4432*4882a593Smuzhiyun }
4433*4882a593Smuzhiyun if (ins_nr == 0)
4434*4882a593Smuzhiyun start_slot = slot;
4435*4882a593Smuzhiyun ins_nr++;
4436*4882a593Smuzhiyun path->slots[0]++;
4437*4882a593Smuzhiyun if (!dst_path) {
4438*4882a593Smuzhiyun dst_path = btrfs_alloc_path();
4439*4882a593Smuzhiyun if (!dst_path) {
4440*4882a593Smuzhiyun ret = -ENOMEM;
4441*4882a593Smuzhiyun goto out;
4442*4882a593Smuzhiyun }
4443*4882a593Smuzhiyun }
4444*4882a593Smuzhiyun }
4445*4882a593Smuzhiyun if (ins_nr > 0)
4446*4882a593Smuzhiyun ret = copy_items(trans, inode, dst_path, path,
4447*4882a593Smuzhiyun start_slot, ins_nr, 1, 0);
4448*4882a593Smuzhiyun out:
4449*4882a593Smuzhiyun btrfs_release_path(path);
4450*4882a593Smuzhiyun btrfs_free_path(dst_path);
4451*4882a593Smuzhiyun return ret;
4452*4882a593Smuzhiyun }
4453*4882a593Smuzhiyun
btrfs_log_changed_extents(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_log_ctx * ctx)4454*4882a593Smuzhiyun static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4455*4882a593Smuzhiyun struct btrfs_root *root,
4456*4882a593Smuzhiyun struct btrfs_inode *inode,
4457*4882a593Smuzhiyun struct btrfs_path *path,
4458*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
4459*4882a593Smuzhiyun {
4460*4882a593Smuzhiyun struct btrfs_ordered_extent *ordered;
4461*4882a593Smuzhiyun struct btrfs_ordered_extent *tmp;
4462*4882a593Smuzhiyun struct extent_map *em, *n;
4463*4882a593Smuzhiyun struct list_head extents;
4464*4882a593Smuzhiyun struct extent_map_tree *tree = &inode->extent_tree;
4465*4882a593Smuzhiyun u64 test_gen;
4466*4882a593Smuzhiyun int ret = 0;
4467*4882a593Smuzhiyun int num = 0;
4468*4882a593Smuzhiyun
4469*4882a593Smuzhiyun INIT_LIST_HEAD(&extents);
4470*4882a593Smuzhiyun
4471*4882a593Smuzhiyun write_lock(&tree->lock);
4472*4882a593Smuzhiyun test_gen = root->fs_info->last_trans_committed;
4473*4882a593Smuzhiyun
4474*4882a593Smuzhiyun list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
4475*4882a593Smuzhiyun list_del_init(&em->list);
4476*4882a593Smuzhiyun /*
4477*4882a593Smuzhiyun * Just an arbitrary number, this can be really CPU intensive
4478*4882a593Smuzhiyun * once we start getting a lot of extents, and really once we
4479*4882a593Smuzhiyun * have a bunch of extents we just want to commit since it will
4480*4882a593Smuzhiyun * be faster.
4481*4882a593Smuzhiyun */
4482*4882a593Smuzhiyun if (++num > 32768) {
4483*4882a593Smuzhiyun list_del_init(&tree->modified_extents);
4484*4882a593Smuzhiyun ret = -EFBIG;
4485*4882a593Smuzhiyun goto process;
4486*4882a593Smuzhiyun }
4487*4882a593Smuzhiyun
4488*4882a593Smuzhiyun if (em->generation <= test_gen)
4489*4882a593Smuzhiyun continue;
4490*4882a593Smuzhiyun
4491*4882a593Smuzhiyun /* We log prealloc extents beyond eof later. */
4492*4882a593Smuzhiyun if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4493*4882a593Smuzhiyun em->start >= i_size_read(&inode->vfs_inode))
4494*4882a593Smuzhiyun continue;
4495*4882a593Smuzhiyun
4496*4882a593Smuzhiyun /* Need a ref to keep it from getting evicted from cache */
4497*4882a593Smuzhiyun refcount_inc(&em->refs);
4498*4882a593Smuzhiyun set_bit(EXTENT_FLAG_LOGGING, &em->flags);
4499*4882a593Smuzhiyun list_add_tail(&em->list, &extents);
4500*4882a593Smuzhiyun num++;
4501*4882a593Smuzhiyun }
4502*4882a593Smuzhiyun
4503*4882a593Smuzhiyun list_sort(NULL, &extents, extent_cmp);
4504*4882a593Smuzhiyun process:
4505*4882a593Smuzhiyun while (!list_empty(&extents)) {
4506*4882a593Smuzhiyun em = list_entry(extents.next, struct extent_map, list);
4507*4882a593Smuzhiyun
4508*4882a593Smuzhiyun list_del_init(&em->list);
4509*4882a593Smuzhiyun
4510*4882a593Smuzhiyun /*
4511*4882a593Smuzhiyun * If we had an error we just need to delete everybody from our
4512*4882a593Smuzhiyun * private list.
4513*4882a593Smuzhiyun */
4514*4882a593Smuzhiyun if (ret) {
4515*4882a593Smuzhiyun clear_em_logging(tree, em);
4516*4882a593Smuzhiyun free_extent_map(em);
4517*4882a593Smuzhiyun continue;
4518*4882a593Smuzhiyun }
4519*4882a593Smuzhiyun
4520*4882a593Smuzhiyun write_unlock(&tree->lock);
4521*4882a593Smuzhiyun
4522*4882a593Smuzhiyun ret = log_one_extent(trans, inode, root, em, path, ctx);
4523*4882a593Smuzhiyun write_lock(&tree->lock);
4524*4882a593Smuzhiyun clear_em_logging(tree, em);
4525*4882a593Smuzhiyun free_extent_map(em);
4526*4882a593Smuzhiyun }
4527*4882a593Smuzhiyun WARN_ON(!list_empty(&extents));
4528*4882a593Smuzhiyun write_unlock(&tree->lock);
4529*4882a593Smuzhiyun
4530*4882a593Smuzhiyun btrfs_release_path(path);
4531*4882a593Smuzhiyun if (!ret)
4532*4882a593Smuzhiyun ret = btrfs_log_prealloc_extents(trans, inode, path);
4533*4882a593Smuzhiyun if (ret)
4534*4882a593Smuzhiyun return ret;
4535*4882a593Smuzhiyun
4536*4882a593Smuzhiyun /*
4537*4882a593Smuzhiyun * We have logged all extents successfully, now make sure the commit of
4538*4882a593Smuzhiyun * the current transaction waits for the ordered extents to complete
4539*4882a593Smuzhiyun * before it commits and wipes out the log trees, otherwise we would
4540*4882a593Smuzhiyun * lose data if an ordered extents completes after the transaction
4541*4882a593Smuzhiyun * commits and a power failure happens after the transaction commit.
4542*4882a593Smuzhiyun */
4543*4882a593Smuzhiyun list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) {
4544*4882a593Smuzhiyun list_del_init(&ordered->log_list);
4545*4882a593Smuzhiyun set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags);
4546*4882a593Smuzhiyun
4547*4882a593Smuzhiyun if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4548*4882a593Smuzhiyun spin_lock_irq(&inode->ordered_tree.lock);
4549*4882a593Smuzhiyun if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) {
4550*4882a593Smuzhiyun set_bit(BTRFS_ORDERED_PENDING, &ordered->flags);
4551*4882a593Smuzhiyun atomic_inc(&trans->transaction->pending_ordered);
4552*4882a593Smuzhiyun }
4553*4882a593Smuzhiyun spin_unlock_irq(&inode->ordered_tree.lock);
4554*4882a593Smuzhiyun }
4555*4882a593Smuzhiyun btrfs_put_ordered_extent(ordered);
4556*4882a593Smuzhiyun }
4557*4882a593Smuzhiyun
4558*4882a593Smuzhiyun return 0;
4559*4882a593Smuzhiyun }
4560*4882a593Smuzhiyun
/*
 * Look up the size recorded in the inode item that @log holds for @inode.
 *
 * @log:      log tree to search
 * @inode:    inode whose logged size is wanted
 * @path:     scratch path; released before a successful return
 * @size_ret: receives the result
 *
 * *@size_ret is set to 0 when the log has no inode item for @inode, and
 * otherwise to the smaller of the logged size and the inode's current
 * in-memory i_size.
 *
 * Returns 0 on success or a negative errno from the tree search, in which
 * case *@size_ret is left untouched.
 */
static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
			     struct btrfs_path *path, u64 *size_ret)
{
	struct btrfs_key key;
	int ret;

	key.objectid = btrfs_ino(inode);
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
	if (ret < 0) {
		return ret;
	} else if (ret > 0) {
		*size_ret = 0;
	} else {
		/*
		 * Take a single snapshot of i_size with i_size_read(), as the
		 * other users in this file do, so that the comparison and the
		 * assignment below are guaranteed to see the same value.
		 */
		const u64 i_size = i_size_read(&inode->vfs_inode);
		struct btrfs_inode_item *item;

		item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				      struct btrfs_inode_item);
		*size_ret = btrfs_inode_size(path->nodes[0], item);
		/*
		 * If the in-memory inode's i_size is smaller than the inode
		 * size stored in the btree, return the inode's i_size, so
		 * that we get a correct inode size after replaying the log
		 * when before a power failure we had a shrinking truncate
		 * followed by addition of a new name (rename / new hard link).
		 * Otherwise return the inode size from the btree, to avoid
		 * data loss when replaying a log due to previously doing a
		 * write that expands the inode's size and logging a new name
		 * immediately after.
		 */
		if (*size_ret > i_size)
			*size_ret = i_size;
	}

	btrfs_release_path(path);
	return 0;
}
4600*4882a593Smuzhiyun
4601*4882a593Smuzhiyun /*
4602*4882a593Smuzhiyun * At the moment we always log all xattrs. This is to figure out at log replay
4603*4882a593Smuzhiyun * time which xattrs must have their deletion replayed. If a xattr is missing
4604*4882a593Smuzhiyun * in the log tree and exists in the fs/subvol tree, we delete it. This is
4605*4882a593Smuzhiyun * because if a xattr is deleted, the inode is fsynced and a power failure
4606*4882a593Smuzhiyun * happens, causing the log to be replayed the next time the fs is mounted,
4607*4882a593Smuzhiyun * we want the xattr to not exist anymore (same behaviour as other filesystems
4608*4882a593Smuzhiyun * with a journal, ext3/4, xfs, f2fs, etc).
4609*4882a593Smuzhiyun */
btrfs_log_all_xattrs(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_inode * inode,struct btrfs_path * path,struct btrfs_path * dst_path)4610*4882a593Smuzhiyun static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4611*4882a593Smuzhiyun struct btrfs_root *root,
4612*4882a593Smuzhiyun struct btrfs_inode *inode,
4613*4882a593Smuzhiyun struct btrfs_path *path,
4614*4882a593Smuzhiyun struct btrfs_path *dst_path)
4615*4882a593Smuzhiyun {
4616*4882a593Smuzhiyun int ret;
4617*4882a593Smuzhiyun struct btrfs_key key;
4618*4882a593Smuzhiyun const u64 ino = btrfs_ino(inode);
4619*4882a593Smuzhiyun int ins_nr = 0;
4620*4882a593Smuzhiyun int start_slot = 0;
4621*4882a593Smuzhiyun bool found_xattrs = false;
4622*4882a593Smuzhiyun
4623*4882a593Smuzhiyun if (test_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags))
4624*4882a593Smuzhiyun return 0;
4625*4882a593Smuzhiyun
4626*4882a593Smuzhiyun key.objectid = ino;
4627*4882a593Smuzhiyun key.type = BTRFS_XATTR_ITEM_KEY;
4628*4882a593Smuzhiyun key.offset = 0;
4629*4882a593Smuzhiyun
4630*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4631*4882a593Smuzhiyun if (ret < 0)
4632*4882a593Smuzhiyun return ret;
4633*4882a593Smuzhiyun
4634*4882a593Smuzhiyun while (true) {
4635*4882a593Smuzhiyun int slot = path->slots[0];
4636*4882a593Smuzhiyun struct extent_buffer *leaf = path->nodes[0];
4637*4882a593Smuzhiyun int nritems = btrfs_header_nritems(leaf);
4638*4882a593Smuzhiyun
4639*4882a593Smuzhiyun if (slot >= nritems) {
4640*4882a593Smuzhiyun if (ins_nr > 0) {
4641*4882a593Smuzhiyun ret = copy_items(trans, inode, dst_path, path,
4642*4882a593Smuzhiyun start_slot, ins_nr, 1, 0);
4643*4882a593Smuzhiyun if (ret < 0)
4644*4882a593Smuzhiyun return ret;
4645*4882a593Smuzhiyun ins_nr = 0;
4646*4882a593Smuzhiyun }
4647*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
4648*4882a593Smuzhiyun if (ret < 0)
4649*4882a593Smuzhiyun return ret;
4650*4882a593Smuzhiyun else if (ret > 0)
4651*4882a593Smuzhiyun break;
4652*4882a593Smuzhiyun continue;
4653*4882a593Smuzhiyun }
4654*4882a593Smuzhiyun
4655*4882a593Smuzhiyun btrfs_item_key_to_cpu(leaf, &key, slot);
4656*4882a593Smuzhiyun if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4657*4882a593Smuzhiyun break;
4658*4882a593Smuzhiyun
4659*4882a593Smuzhiyun if (ins_nr == 0)
4660*4882a593Smuzhiyun start_slot = slot;
4661*4882a593Smuzhiyun ins_nr++;
4662*4882a593Smuzhiyun path->slots[0]++;
4663*4882a593Smuzhiyun found_xattrs = true;
4664*4882a593Smuzhiyun cond_resched();
4665*4882a593Smuzhiyun }
4666*4882a593Smuzhiyun if (ins_nr > 0) {
4667*4882a593Smuzhiyun ret = copy_items(trans, inode, dst_path, path,
4668*4882a593Smuzhiyun start_slot, ins_nr, 1, 0);
4669*4882a593Smuzhiyun if (ret < 0)
4670*4882a593Smuzhiyun return ret;
4671*4882a593Smuzhiyun }
4672*4882a593Smuzhiyun
4673*4882a593Smuzhiyun if (!found_xattrs)
4674*4882a593Smuzhiyun set_bit(BTRFS_INODE_NO_XATTRS, &inode->runtime_flags);
4675*4882a593Smuzhiyun
4676*4882a593Smuzhiyun return 0;
4677*4882a593Smuzhiyun }
4678*4882a593Smuzhiyun
4679*4882a593Smuzhiyun /*
4680*4882a593Smuzhiyun * When using the NO_HOLES feature if we punched a hole that causes the
4681*4882a593Smuzhiyun * deletion of entire leafs or all the extent items of the first leaf (the one
4682*4882a593Smuzhiyun * that contains the inode item and references) we may end up not processing
4683*4882a593Smuzhiyun * any extents, because there are no leafs with a generation matching the
4684*4882a593Smuzhiyun * current transaction that have extent items for our inode. So we need to find
4685*4882a593Smuzhiyun * if any holes exist and then log them. We also need to log holes after any
4686*4882a593Smuzhiyun * truncate operation that changes the inode's size.
4687*4882a593Smuzhiyun */
btrfs_log_holes(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_inode * inode,struct btrfs_path * path)4688*4882a593Smuzhiyun static int btrfs_log_holes(struct btrfs_trans_handle *trans,
4689*4882a593Smuzhiyun struct btrfs_root *root,
4690*4882a593Smuzhiyun struct btrfs_inode *inode,
4691*4882a593Smuzhiyun struct btrfs_path *path)
4692*4882a593Smuzhiyun {
4693*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = root->fs_info;
4694*4882a593Smuzhiyun struct btrfs_key key;
4695*4882a593Smuzhiyun const u64 ino = btrfs_ino(inode);
4696*4882a593Smuzhiyun const u64 i_size = i_size_read(&inode->vfs_inode);
4697*4882a593Smuzhiyun u64 prev_extent_end = 0;
4698*4882a593Smuzhiyun int ret;
4699*4882a593Smuzhiyun
4700*4882a593Smuzhiyun if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
4701*4882a593Smuzhiyun return 0;
4702*4882a593Smuzhiyun
4703*4882a593Smuzhiyun key.objectid = ino;
4704*4882a593Smuzhiyun key.type = BTRFS_EXTENT_DATA_KEY;
4705*4882a593Smuzhiyun key.offset = 0;
4706*4882a593Smuzhiyun
4707*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4708*4882a593Smuzhiyun if (ret < 0)
4709*4882a593Smuzhiyun return ret;
4710*4882a593Smuzhiyun
4711*4882a593Smuzhiyun while (true) {
4712*4882a593Smuzhiyun struct extent_buffer *leaf = path->nodes[0];
4713*4882a593Smuzhiyun
4714*4882a593Smuzhiyun if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
4715*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
4716*4882a593Smuzhiyun if (ret < 0)
4717*4882a593Smuzhiyun return ret;
4718*4882a593Smuzhiyun if (ret > 0) {
4719*4882a593Smuzhiyun ret = 0;
4720*4882a593Smuzhiyun break;
4721*4882a593Smuzhiyun }
4722*4882a593Smuzhiyun leaf = path->nodes[0];
4723*4882a593Smuzhiyun }
4724*4882a593Smuzhiyun
4725*4882a593Smuzhiyun btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4726*4882a593Smuzhiyun if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
4727*4882a593Smuzhiyun break;
4728*4882a593Smuzhiyun
4729*4882a593Smuzhiyun /* We have a hole, log it. */
4730*4882a593Smuzhiyun if (prev_extent_end < key.offset) {
4731*4882a593Smuzhiyun const u64 hole_len = key.offset - prev_extent_end;
4732*4882a593Smuzhiyun
4733*4882a593Smuzhiyun /*
4734*4882a593Smuzhiyun * Release the path to avoid deadlocks with other code
4735*4882a593Smuzhiyun * paths that search the root while holding locks on
4736*4882a593Smuzhiyun * leafs from the log root.
4737*4882a593Smuzhiyun */
4738*4882a593Smuzhiyun btrfs_release_path(path);
4739*4882a593Smuzhiyun ret = btrfs_insert_file_extent(trans, root->log_root,
4740*4882a593Smuzhiyun ino, prev_extent_end, 0,
4741*4882a593Smuzhiyun 0, hole_len, 0, hole_len,
4742*4882a593Smuzhiyun 0, 0, 0);
4743*4882a593Smuzhiyun if (ret < 0)
4744*4882a593Smuzhiyun return ret;
4745*4882a593Smuzhiyun
4746*4882a593Smuzhiyun /*
4747*4882a593Smuzhiyun * Search for the same key again in the root. Since it's
4748*4882a593Smuzhiyun * an extent item and we are holding the inode lock, the
4749*4882a593Smuzhiyun * key must still exist. If it doesn't just emit warning
4750*4882a593Smuzhiyun * and return an error to fall back to a transaction
4751*4882a593Smuzhiyun * commit.
4752*4882a593Smuzhiyun */
4753*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4754*4882a593Smuzhiyun if (ret < 0)
4755*4882a593Smuzhiyun return ret;
4756*4882a593Smuzhiyun if (WARN_ON(ret > 0))
4757*4882a593Smuzhiyun return -ENOENT;
4758*4882a593Smuzhiyun leaf = path->nodes[0];
4759*4882a593Smuzhiyun }
4760*4882a593Smuzhiyun
4761*4882a593Smuzhiyun prev_extent_end = btrfs_file_extent_end(path);
4762*4882a593Smuzhiyun path->slots[0]++;
4763*4882a593Smuzhiyun cond_resched();
4764*4882a593Smuzhiyun }
4765*4882a593Smuzhiyun
4766*4882a593Smuzhiyun if (prev_extent_end < i_size) {
4767*4882a593Smuzhiyun u64 hole_len;
4768*4882a593Smuzhiyun
4769*4882a593Smuzhiyun btrfs_release_path(path);
4770*4882a593Smuzhiyun hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
4771*4882a593Smuzhiyun ret = btrfs_insert_file_extent(trans, root->log_root,
4772*4882a593Smuzhiyun ino, prev_extent_end, 0, 0,
4773*4882a593Smuzhiyun hole_len, 0, hole_len,
4774*4882a593Smuzhiyun 0, 0, 0);
4775*4882a593Smuzhiyun if (ret < 0)
4776*4882a593Smuzhiyun return ret;
4777*4882a593Smuzhiyun }
4778*4882a593Smuzhiyun
4779*4882a593Smuzhiyun return 0;
4780*4882a593Smuzhiyun }
4781*4882a593Smuzhiyun
4782*4882a593Smuzhiyun /*
4783*4882a593Smuzhiyun * When we are logging a new inode X, check if it doesn't have a reference that
4784*4882a593Smuzhiyun * matches the reference from some other inode Y created in a past transaction
4785*4882a593Smuzhiyun * and that was renamed in the current transaction. If we don't do this, then at
4786*4882a593Smuzhiyun * log replay time we can lose inode Y (and all its files if it's a directory):
4787*4882a593Smuzhiyun *
4788*4882a593Smuzhiyun * mkdir /mnt/x
4789*4882a593Smuzhiyun * echo "hello world" > /mnt/x/foobar
4790*4882a593Smuzhiyun * sync
4791*4882a593Smuzhiyun * mv /mnt/x /mnt/y
4792*4882a593Smuzhiyun * mkdir /mnt/x # or touch /mnt/x
4793*4882a593Smuzhiyun * xfs_io -c fsync /mnt/x
4794*4882a593Smuzhiyun * <power fail>
4795*4882a593Smuzhiyun * mount fs, trigger log replay
4796*4882a593Smuzhiyun *
4797*4882a593Smuzhiyun * After the log replay procedure, we would lose the first directory and all its
4798*4882a593Smuzhiyun * files (file foobar).
4799*4882a593Smuzhiyun * For the case where inode Y is not a directory we simply end up losing it:
4800*4882a593Smuzhiyun *
4801*4882a593Smuzhiyun * echo "123" > /mnt/foo
4802*4882a593Smuzhiyun * sync
4803*4882a593Smuzhiyun * mv /mnt/foo /mnt/bar
4804*4882a593Smuzhiyun * echo "abc" > /mnt/foo
4805*4882a593Smuzhiyun * xfs_io -c fsync /mnt/foo
4806*4882a593Smuzhiyun * <power fail>
4807*4882a593Smuzhiyun *
4808*4882a593Smuzhiyun * We also need this for cases where a snapshot entry is replaced by some other
4809*4882a593Smuzhiyun * entry (file or directory) otherwise we end up with an unreplayable log due to
4810*4882a593Smuzhiyun * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4811*4882a593Smuzhiyun * if it were a regular entry:
4812*4882a593Smuzhiyun *
4813*4882a593Smuzhiyun * mkdir /mnt/x
4814*4882a593Smuzhiyun * btrfs subvolume snapshot /mnt /mnt/x/snap
4815*4882a593Smuzhiyun * btrfs subvolume delete /mnt/x/snap
4816*4882a593Smuzhiyun * rmdir /mnt/x
4817*4882a593Smuzhiyun * mkdir /mnt/x
4818*4882a593Smuzhiyun * fsync /mnt/x or fsync some new file inside it
4819*4882a593Smuzhiyun * <power fail>
4820*4882a593Smuzhiyun *
4821*4882a593Smuzhiyun * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4822*4882a593Smuzhiyun * the same transaction.
4823*4882a593Smuzhiyun */
btrfs_check_ref_name_override(struct extent_buffer * eb,const int slot,const struct btrfs_key * key,struct btrfs_inode * inode,u64 * other_ino,u64 * other_parent)4824*4882a593Smuzhiyun static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4825*4882a593Smuzhiyun const int slot,
4826*4882a593Smuzhiyun const struct btrfs_key *key,
4827*4882a593Smuzhiyun struct btrfs_inode *inode,
4828*4882a593Smuzhiyun u64 *other_ino, u64 *other_parent)
4829*4882a593Smuzhiyun {
4830*4882a593Smuzhiyun int ret;
4831*4882a593Smuzhiyun struct btrfs_path *search_path;
4832*4882a593Smuzhiyun char *name = NULL;
4833*4882a593Smuzhiyun u32 name_len = 0;
4834*4882a593Smuzhiyun u32 item_size = btrfs_item_size_nr(eb, slot);
4835*4882a593Smuzhiyun u32 cur_offset = 0;
4836*4882a593Smuzhiyun unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4837*4882a593Smuzhiyun
4838*4882a593Smuzhiyun search_path = btrfs_alloc_path();
4839*4882a593Smuzhiyun if (!search_path)
4840*4882a593Smuzhiyun return -ENOMEM;
4841*4882a593Smuzhiyun search_path->search_commit_root = 1;
4842*4882a593Smuzhiyun search_path->skip_locking = 1;
4843*4882a593Smuzhiyun
4844*4882a593Smuzhiyun while (cur_offset < item_size) {
4845*4882a593Smuzhiyun u64 parent;
4846*4882a593Smuzhiyun u32 this_name_len;
4847*4882a593Smuzhiyun u32 this_len;
4848*4882a593Smuzhiyun unsigned long name_ptr;
4849*4882a593Smuzhiyun struct btrfs_dir_item *di;
4850*4882a593Smuzhiyun
4851*4882a593Smuzhiyun if (key->type == BTRFS_INODE_REF_KEY) {
4852*4882a593Smuzhiyun struct btrfs_inode_ref *iref;
4853*4882a593Smuzhiyun
4854*4882a593Smuzhiyun iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4855*4882a593Smuzhiyun parent = key->offset;
4856*4882a593Smuzhiyun this_name_len = btrfs_inode_ref_name_len(eb, iref);
4857*4882a593Smuzhiyun name_ptr = (unsigned long)(iref + 1);
4858*4882a593Smuzhiyun this_len = sizeof(*iref) + this_name_len;
4859*4882a593Smuzhiyun } else {
4860*4882a593Smuzhiyun struct btrfs_inode_extref *extref;
4861*4882a593Smuzhiyun
4862*4882a593Smuzhiyun extref = (struct btrfs_inode_extref *)(ptr +
4863*4882a593Smuzhiyun cur_offset);
4864*4882a593Smuzhiyun parent = btrfs_inode_extref_parent(eb, extref);
4865*4882a593Smuzhiyun this_name_len = btrfs_inode_extref_name_len(eb, extref);
4866*4882a593Smuzhiyun name_ptr = (unsigned long)&extref->name;
4867*4882a593Smuzhiyun this_len = sizeof(*extref) + this_name_len;
4868*4882a593Smuzhiyun }
4869*4882a593Smuzhiyun
4870*4882a593Smuzhiyun if (this_name_len > name_len) {
4871*4882a593Smuzhiyun char *new_name;
4872*4882a593Smuzhiyun
4873*4882a593Smuzhiyun new_name = krealloc(name, this_name_len, GFP_NOFS);
4874*4882a593Smuzhiyun if (!new_name) {
4875*4882a593Smuzhiyun ret = -ENOMEM;
4876*4882a593Smuzhiyun goto out;
4877*4882a593Smuzhiyun }
4878*4882a593Smuzhiyun name_len = this_name_len;
4879*4882a593Smuzhiyun name = new_name;
4880*4882a593Smuzhiyun }
4881*4882a593Smuzhiyun
4882*4882a593Smuzhiyun read_extent_buffer(eb, name, name_ptr, this_name_len);
4883*4882a593Smuzhiyun di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
4884*4882a593Smuzhiyun parent, name, this_name_len, 0);
4885*4882a593Smuzhiyun if (di && !IS_ERR(di)) {
4886*4882a593Smuzhiyun struct btrfs_key di_key;
4887*4882a593Smuzhiyun
4888*4882a593Smuzhiyun btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4889*4882a593Smuzhiyun di, &di_key);
4890*4882a593Smuzhiyun if (di_key.type == BTRFS_INODE_ITEM_KEY) {
4891*4882a593Smuzhiyun if (di_key.objectid != key->objectid) {
4892*4882a593Smuzhiyun ret = 1;
4893*4882a593Smuzhiyun *other_ino = di_key.objectid;
4894*4882a593Smuzhiyun *other_parent = parent;
4895*4882a593Smuzhiyun } else {
4896*4882a593Smuzhiyun ret = 0;
4897*4882a593Smuzhiyun }
4898*4882a593Smuzhiyun } else {
4899*4882a593Smuzhiyun ret = -EAGAIN;
4900*4882a593Smuzhiyun }
4901*4882a593Smuzhiyun goto out;
4902*4882a593Smuzhiyun } else if (IS_ERR(di)) {
4903*4882a593Smuzhiyun ret = PTR_ERR(di);
4904*4882a593Smuzhiyun goto out;
4905*4882a593Smuzhiyun }
4906*4882a593Smuzhiyun btrfs_release_path(search_path);
4907*4882a593Smuzhiyun
4908*4882a593Smuzhiyun cur_offset += this_len;
4909*4882a593Smuzhiyun }
4910*4882a593Smuzhiyun ret = 0;
4911*4882a593Smuzhiyun out:
4912*4882a593Smuzhiyun btrfs_free_path(search_path);
4913*4882a593Smuzhiyun kfree(name);
4914*4882a593Smuzhiyun return ret;
4915*4882a593Smuzhiyun }
4916*4882a593Smuzhiyun
/*
 * Work list element used by log_conflicting_inodes(): one inode that still
 * has to be looked at, queued on a local list.
 */
4917*4882a593Smuzhiyun struct btrfs_ino_list {
/* Number of the inode to log. */
4918*4882a593Smuzhiyun u64 ino;
/* Inode number of a directory, logged instead if @ino no longer exists. */
4919*4882a593Smuzhiyun u64 parent;
/* Link in the work list. */
4920*4882a593Smuzhiyun struct list_head list;
4921*4882a593Smuzhiyun };
4922*4882a593Smuzhiyun
/*
 * Log the inode @ino, found to have a directory entry that conflicts with a
 * name of the inode being logged, plus every inode that in turn conflicts
 * with it.
 *
 * A work list of (inode number, parent directory) pairs is seeded with
 * @ino/@parent and processed until empty. Each inode taken from the list is
 * logged with LOG_OTHER_INODE, then its inode ref/extref items in @root are
 * scanned with btrfs_check_ref_name_override() and any further conflicting
 * inode is appended to the list. If the inode can not be loaded because it no
 * longer exists (-ENOENT), its parent directory is logged with
 * LOG_OTHER_INODE_ALL instead.
 *
 * Once an error happens no more inodes are logged; the remaining list elements
 * are only freed. @path is released at the start of each processed element.
 *
 * Returns 0 on success or the first error encountered.
 */
static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  struct btrfs_log_ctx *ctx,
				  u64 ino, u64 parent)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ino_list *entry;
	LIST_HEAD(pending);
	int ret = 0;

	entry = kmalloc(sizeof(*entry), GFP_NOFS);
	if (!entry)
		return -ENOMEM;
	entry->ino = ino;
	entry->parent = parent;
	list_add_tail(&entry->list, &pending);

	while (!list_empty(&pending)) {
		struct btrfs_inode *binode;
		struct btrfs_key key;
		struct inode *inode;
		bool already_logged;

		entry = list_first_entry(&pending, struct btrfs_ino_list,
					 list);
		ino = entry->ino;
		parent = entry->parent;
		list_del(&entry->list);
		kfree(entry);
		/* After a failure just drain and free the rest of the list. */
		if (ret)
			continue;

		btrfs_release_path(path);

		inode = btrfs_iget(fs_info->sb, ino, root);
		if (IS_ERR(inode)) {
			ret = PTR_ERR(inode);
			if (ret != -ENOENT)
				continue;
			/*
			 * If the other inode that had a conflicting dir entry
			 * was deleted in the current transaction, we need to
			 * log its parent directory.
			 */
			inode = btrfs_iget(fs_info->sb, parent, root);
			if (IS_ERR(inode)) {
				ret = PTR_ERR(inode);
				continue;
			}
			ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
					      LOG_OTHER_INODE_ALL, ctx);
			btrfs_add_delayed_iput(inode);
			continue;
		}
		binode = BTRFS_I(inode);

		/*
		 * If the inode was already logged skip it - otherwise we can
		 * hit an infinite loop. Example:
		 *
		 * From the commit root (previous transaction) we have the
		 * following inodes:
		 *
		 * inode 257 a directory
		 * inode 258 with references "zz" and "zz_link" on inode 257
		 * inode 259 with reference "a" on inode 257
		 *
		 * And in the current (uncommitted) transaction we have:
		 *
		 * inode 257 a directory, unchanged
		 * inode 258 with references "a" and "a2" on inode 257
		 * inode 259 with reference "zz_link" on inode 257
		 * inode 261 with reference "zz" on inode 257
		 *
		 * When logging inode 261 the following infinite loop could
		 * happen if we don't skip already logged inodes:
		 *
		 * - we detect inode 258 as a conflicting inode, with inode 261
		 *   on reference "zz", and log it;
		 *
		 * - we detect inode 259 as a conflicting inode, with inode 258
		 *   on reference "a", and log it;
		 *
		 * - we detect inode 258 as a conflicting inode, with inode 259
		 *   on reference "zz_link", and log it - again! After this we
		 *   repeat the above steps forever.
		 *
		 * Check the inode's logged_trans only instead of
		 * btrfs_inode_in_log(). This is because the last_log_commit of
		 * the inode is not updated when we only log that it exists and
		 * it has the full sync bit set (see btrfs_log_inode()).
		 */
		spin_lock(&binode->lock);
		already_logged = (binode->logged_trans == trans->transid);
		spin_unlock(&binode->lock);
		if (already_logged)
			goto put_inode;

		/*
		 * We are safe logging the other inode without acquiring its
		 * lock as long as we log with the LOG_INODE_EXISTS mode. We
		 * are safe against concurrent renames of the other inode as
		 * well because during a rename we pin the log and update the
		 * log with the new name before we unpin it.
		 */
		ret = btrfs_log_inode(trans, root, binode, LOG_OTHER_INODE,
				      ctx);
		if (ret)
			goto put_inode;

		key.objectid = ino;
		key.type = BTRFS_INODE_REF_KEY;
		key.offset = 0;
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0)
			goto put_inode;

		/* Look for inodes conflicting with the names of this one. */
		for (;;) {
			struct extent_buffer *leaf = path->nodes[0];
			const int slot = path->slots[0];
			u64 other_ino = 0;
			u64 other_parent = 0;

			if (slot >= btrfs_header_nritems(leaf)) {
				ret = btrfs_next_leaf(root, path);
				if (ret == 0)
					continue;
				/* No more leaves is not an error. */
				if (ret > 0)
					ret = 0;
				break;
			}

			btrfs_item_key_to_cpu(leaf, &key, slot);
			if (key.objectid != ino ||
			    (key.type != BTRFS_INODE_REF_KEY &&
			     key.type != BTRFS_INODE_EXTREF_KEY)) {
				ret = 0;
				break;
			}

			ret = btrfs_check_ref_name_override(leaf, slot, &key,
							    binode, &other_ino,
							    &other_parent);
			if (ret < 0)
				break;
			if (ret > 0) {
				/* Queue the newly found conflicting inode. */
				entry = kmalloc(sizeof(*entry), GFP_NOFS);
				if (!entry) {
					ret = -ENOMEM;
					break;
				}
				entry->ino = other_ino;
				entry->parent = other_parent;
				list_add_tail(&entry->list, &pending);
				ret = 0;
			}
			path->slots[0]++;
		}
put_inode:
		btrfs_add_delayed_iput(inode);
	}

	return ret;
}
5093*4882a593Smuzhiyun
/*
 * Copy items of @inode with keys between @min_key and @max_key to the log.
 *
 * Items are located with btrfs_search_forward(), which is passed the id of
 * the current transaction, and are copied with copy_items() in batches of
 * adjacent leaf slots. @min_key is updated as the walk advances.
 *
 * Special cases handled while walking:
 *  - seeing the inode item clears *@need_log_inode_item;
 *  - an extent data key at or beyond i_size ends the walk;
 *  - for inode ref/extref items of an inode whose generation is the current
 *    transaction, and when @recursive_logging is false, names are checked with
 *    btrfs_check_ref_name_override() and conflicting inodes are logged through
 *    log_conflicting_inodes();
 *  - xattr items are skipped.
 *
 * When @inode_only is LOG_INODE_ALL and the inode is a regular file,
 * btrfs_log_prealloc_extents() is called at the end and its result returned.
 *
 * Returns 0 on success or a negative error value from one of the helpers.
 */
copy_inode_items_to_log(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_key * min_key,const struct btrfs_key * max_key,struct btrfs_path * path,struct btrfs_path * dst_path,const u64 logged_isize,const bool recursive_logging,const int inode_only,struct btrfs_log_ctx * ctx,bool * need_log_inode_item)5094*4882a593Smuzhiyun static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
5095*4882a593Smuzhiyun struct btrfs_inode *inode,
5096*4882a593Smuzhiyun struct btrfs_key *min_key,
5097*4882a593Smuzhiyun const struct btrfs_key *max_key,
5098*4882a593Smuzhiyun struct btrfs_path *path,
5099*4882a593Smuzhiyun struct btrfs_path *dst_path,
5100*4882a593Smuzhiyun const u64 logged_isize,
5101*4882a593Smuzhiyun const bool recursive_logging,
5102*4882a593Smuzhiyun const int inode_only,
5103*4882a593Smuzhiyun struct btrfs_log_ctx *ctx,
5104*4882a593Smuzhiyun bool *need_log_inode_item)
5105*4882a593Smuzhiyun {
5106*4882a593Smuzhiyun const u64 i_size = i_size_read(&inode->vfs_inode);
5107*4882a593Smuzhiyun struct btrfs_root *root = inode->root;
/* First slot and length of the batch of adjacent slots not yet copied. */
5108*4882a593Smuzhiyun int ins_start_slot = 0;
5109*4882a593Smuzhiyun int ins_nr = 0;
5110*4882a593Smuzhiyun int ret;
5111*4882a593Smuzhiyun
5112*4882a593Smuzhiyun while (1) {
5113*4882a593Smuzhiyun ret = btrfs_search_forward(root, min_key, path, trans->transid);
5114*4882a593Smuzhiyun if (ret < 0)
5115*4882a593Smuzhiyun return ret;
/* A positive return ends the walk without being treated as an error. */
5116*4882a593Smuzhiyun if (ret > 0) {
5117*4882a593Smuzhiyun ret = 0;
5118*4882a593Smuzhiyun break;
5119*4882a593Smuzhiyun }
5120*4882a593Smuzhiyun again:
5121*4882a593Smuzhiyun /* Note, ins_nr might be > 0 here, cleanup outside the loop */
5122*4882a593Smuzhiyun if (min_key->objectid != max_key->objectid)
5123*4882a593Smuzhiyun break;
5124*4882a593Smuzhiyun if (min_key->type > max_key->type)
5125*4882a593Smuzhiyun break;
5126*4882a593Smuzhiyun
5127*4882a593Smuzhiyun if (min_key->type == BTRFS_INODE_ITEM_KEY) {
5128*4882a593Smuzhiyun *need_log_inode_item = false;
5129*4882a593Smuzhiyun } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
5130*4882a593Smuzhiyun min_key->offset >= i_size) {
5131*4882a593Smuzhiyun /*
5132*4882a593Smuzhiyun * Extents at and beyond eof are logged with
5133*4882a593Smuzhiyun * btrfs_log_prealloc_extents().
5134*4882a593Smuzhiyun * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
5135*4882a593Smuzhiyun * and no keys greater than that, so bail out.
5136*4882a593Smuzhiyun */
5137*4882a593Smuzhiyun break;
5138*4882a593Smuzhiyun } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
5139*4882a593Smuzhiyun min_key->type == BTRFS_INODE_EXTREF_KEY) &&
5140*4882a593Smuzhiyun inode->generation == trans->transid &&
5141*4882a593Smuzhiyun !recursive_logging) {
5142*4882a593Smuzhiyun u64 other_ino = 0;
5143*4882a593Smuzhiyun u64 other_parent = 0;
5144*4882a593Smuzhiyun
5145*4882a593Smuzhiyun ret = btrfs_check_ref_name_override(path->nodes[0],
5146*4882a593Smuzhiyun path->slots[0], min_key, inode,
5147*4882a593Smuzhiyun &other_ino, &other_parent);
5148*4882a593Smuzhiyun if (ret < 0) {
5149*4882a593Smuzhiyun return ret;
5150*4882a593Smuzhiyun } else if (ret > 0 && ctx &&
5151*4882a593Smuzhiyun other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
/*
 * Add the current slot to the batch and copy the batch now, because
 * log_conflicting_inodes() releases the path.
 */
5152*4882a593Smuzhiyun if (ins_nr > 0) {
5153*4882a593Smuzhiyun ins_nr++;
5154*4882a593Smuzhiyun } else {
5155*4882a593Smuzhiyun ins_nr = 1;
5156*4882a593Smuzhiyun ins_start_slot = path->slots[0];
5157*4882a593Smuzhiyun }
5158*4882a593Smuzhiyun ret = copy_items(trans, inode, dst_path, path,
5159*4882a593Smuzhiyun ins_start_slot, ins_nr,
5160*4882a593Smuzhiyun inode_only, logged_isize);
5161*4882a593Smuzhiyun if (ret < 0)
5162*4882a593Smuzhiyun return ret;
5163*4882a593Smuzhiyun ins_nr = 0;
5164*4882a593Smuzhiyun
5165*4882a593Smuzhiyun ret = log_conflicting_inodes(trans, root, path,
5166*4882a593Smuzhiyun ctx, other_ino, other_parent);
5167*4882a593Smuzhiyun if (ret)
5168*4882a593Smuzhiyun return ret;
5169*4882a593Smuzhiyun btrfs_release_path(path);
5170*4882a593Smuzhiyun goto next_key;
5171*4882a593Smuzhiyun }
5172*4882a593Smuzhiyun } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
5173*4882a593Smuzhiyun /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
5174*4882a593Smuzhiyun if (ins_nr == 0)
5175*4882a593Smuzhiyun goto next_slot;
5176*4882a593Smuzhiyun ret = copy_items(trans, inode, dst_path, path,
5177*4882a593Smuzhiyun ins_start_slot,
5178*4882a593Smuzhiyun ins_nr, inode_only, logged_isize);
5179*4882a593Smuzhiyun if (ret < 0)
5180*4882a593Smuzhiyun return ret;
5181*4882a593Smuzhiyun ins_nr = 0;
5182*4882a593Smuzhiyun goto next_slot;
5183*4882a593Smuzhiyun }
5184*4882a593Smuzhiyun
/*
 * Grow the pending batch when the current slot directly follows it, or
 * start a new batch when none is pending.
 */
5185*4882a593Smuzhiyun if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5186*4882a593Smuzhiyun ins_nr++;
5187*4882a593Smuzhiyun goto next_slot;
5188*4882a593Smuzhiyun } else if (!ins_nr) {
5189*4882a593Smuzhiyun ins_start_slot = path->slots[0];
5190*4882a593Smuzhiyun ins_nr = 1;
5191*4882a593Smuzhiyun goto next_slot;
5192*4882a593Smuzhiyun }
5193*4882a593Smuzhiyun
/*
 * The current slot is not adjacent to the pending batch: copy that batch
 * and start a new one at the current slot.
 */
5194*4882a593Smuzhiyun ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5195*4882a593Smuzhiyun ins_nr, inode_only, logged_isize);
5196*4882a593Smuzhiyun if (ret < 0)
5197*4882a593Smuzhiyun return ret;
5198*4882a593Smuzhiyun ins_nr = 1;
5199*4882a593Smuzhiyun ins_start_slot = path->slots[0];
5200*4882a593Smuzhiyun next_slot:
5201*4882a593Smuzhiyun path->slots[0]++;
/* Keep walking the same leaf while it has more items. */
5202*4882a593Smuzhiyun if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5203*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], min_key,
5204*4882a593Smuzhiyun path->slots[0]);
5205*4882a593Smuzhiyun goto again;
5206*4882a593Smuzhiyun }
/* End of the leaf: copy the pending batch before releasing the path. */
5207*4882a593Smuzhiyun if (ins_nr) {
5208*4882a593Smuzhiyun ret = copy_items(trans, inode, dst_path, path,
5209*4882a593Smuzhiyun ins_start_slot, ins_nr, inode_only,
5210*4882a593Smuzhiyun logged_isize);
5211*4882a593Smuzhiyun if (ret < 0)
5212*4882a593Smuzhiyun return ret;
5213*4882a593Smuzhiyun ins_nr = 0;
5214*4882a593Smuzhiyun }
5215*4882a593Smuzhiyun btrfs_release_path(path);
5216*4882a593Smuzhiyun next_key:
/*
 * Move min_key just past the last key handled, for the next
 * btrfs_search_forward() call; stop once nothing can follow it.
 */
5217*4882a593Smuzhiyun if (min_key->offset < (u64)-1) {
5218*4882a593Smuzhiyun min_key->offset++;
5219*4882a593Smuzhiyun } else if (min_key->type < max_key->type) {
5220*4882a593Smuzhiyun min_key->type++;
5221*4882a593Smuzhiyun min_key->offset = 0;
5222*4882a593Smuzhiyun } else {
5223*4882a593Smuzhiyun break;
5224*4882a593Smuzhiyun }
5225*4882a593Smuzhiyun }
/* Copy the batch left pending when the loop was exited with a break. */
5226*4882a593Smuzhiyun if (ins_nr) {
5227*4882a593Smuzhiyun ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
5228*4882a593Smuzhiyun ins_nr, inode_only, logged_isize);
5229*4882a593Smuzhiyun if (ret)
5230*4882a593Smuzhiyun return ret;
5231*4882a593Smuzhiyun }
5232*4882a593Smuzhiyun
5233*4882a593Smuzhiyun if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
5234*4882a593Smuzhiyun /*
5235*4882a593Smuzhiyun * Release the path because otherwise we might attempt to double
5236*4882a593Smuzhiyun * lock the same leaf with btrfs_log_prealloc_extents() below.
5237*4882a593Smuzhiyun */
5238*4882a593Smuzhiyun btrfs_release_path(path);
5239*4882a593Smuzhiyun ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
5240*4882a593Smuzhiyun }
5241*4882a593Smuzhiyun
5242*4882a593Smuzhiyun return ret;
5243*4882a593Smuzhiyun }
5244*4882a593Smuzhiyun
5245*4882a593Smuzhiyun /* log a single inode in the tree log.
5246*4882a593Smuzhiyun * At least one parent directory for this inode must exist in the tree
5247*4882a593Smuzhiyun * or be logged already.
5248*4882a593Smuzhiyun *
5249*4882a593Smuzhiyun * Any items from this inode changed by the current transaction are copied
5250*4882a593Smuzhiyun * to the log tree. An extra reference is taken on any extents in this
5251*4882a593Smuzhiyun * file, allowing us to avoid a whole pile of corner cases around logging
5252*4882a593Smuzhiyun * blocks that have been removed from the tree.
5253*4882a593Smuzhiyun *
5254*4882a593Smuzhiyun * See LOG_INODE_ALL and related defines for a description of what inode_only
5255*4882a593Smuzhiyun * does.
5256*4882a593Smuzhiyun *
5257*4882a593Smuzhiyun * This handles both files and directories.
 *
 * @trans:      transaction handle; trans->transid is stored in
 *              inode->logged_trans when the inode gets marked as logged.
 * @root:       root whose log tree (root->log_root) receives the items.
 * @inode:      inode to log. Its log_mutex is taken after the delayed
 *              items/inode have been committed and is released before
 *              returning.
 * @inode_only: LOG_INODE_ALL, LOG_INODE_EXISTS, LOG_OTHER_INODE or
 *              LOG_OTHER_INODE_ALL. The two LOG_OTHER_* values are converted
 *              to LOG_INODE_EXISTS / LOG_INODE_ALL respectively and make the
 *              log_mutex be taken with mutex_lock_nested().
 * @ctx:        log context forwarded to the helpers. This function tests it
 *              for NULL before reading ctx->logging_new_name and ctx->inode;
 *              NOTE(review): whether the helpers accept a NULL ctx is not
 *              visible here - confirm against them.
 *
 * Returns 0 on success, -ENOMEM if a path cannot be allocated, or the
 * non-zero value returned by the first helper that fails.
5258*4882a593Smuzhiyun */
btrfs_log_inode(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_inode * inode,int inode_only,struct btrfs_log_ctx * ctx)5259*4882a593Smuzhiyun static int btrfs_log_inode(struct btrfs_trans_handle *trans,
5260*4882a593Smuzhiyun struct btrfs_root *root, struct btrfs_inode *inode,
5261*4882a593Smuzhiyun int inode_only,
5262*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
5263*4882a593Smuzhiyun {
5264*4882a593Smuzhiyun struct btrfs_path *path;
5265*4882a593Smuzhiyun struct btrfs_path *dst_path;
5266*4882a593Smuzhiyun struct btrfs_key min_key;
5267*4882a593Smuzhiyun struct btrfs_key max_key;
5268*4882a593Smuzhiyun struct btrfs_root *log = root->log_root;
5269*4882a593Smuzhiyun int err = 0; /* value returned once log_mutex is held */
5270*4882a593Smuzhiyun int ret = 0; /* scratch result, copied into err on failure */
5271*4882a593Smuzhiyun bool fast_search = false;
5272*4882a593Smuzhiyun u64 ino = btrfs_ino(inode);
5273*4882a593Smuzhiyun struct extent_map_tree *em_tree = &inode->extent_tree;
5274*4882a593Smuzhiyun u64 logged_isize = 0;
5275*4882a593Smuzhiyun bool need_log_inode_item = true;
5276*4882a593Smuzhiyun bool xattrs_logged = false;
5277*4882a593Smuzhiyun bool recursive_logging = false;
5278*4882a593Smuzhiyun
5279*4882a593Smuzhiyun path = btrfs_alloc_path();
5280*4882a593Smuzhiyun if (!path)
5281*4882a593Smuzhiyun return -ENOMEM;
5282*4882a593Smuzhiyun dst_path = btrfs_alloc_path();
5283*4882a593Smuzhiyun if (!dst_path) {
5284*4882a593Smuzhiyun btrfs_free_path(path);
5285*4882a593Smuzhiyun return -ENOMEM;
5286*4882a593Smuzhiyun }
5287*4882a593Smuzhiyun
5288*4882a593Smuzhiyun min_key.objectid = ino;
5289*4882a593Smuzhiyun min_key.type = BTRFS_INODE_ITEM_KEY;
5290*4882a593Smuzhiyun min_key.offset = 0;
5291*4882a593Smuzhiyun
5292*4882a593Smuzhiyun max_key.objectid = ino;
5293*4882a593Smuzhiyun
5294*4882a593Smuzhiyun
5295*4882a593Smuzhiyun /* today the code can only do partial logging of directories */
5296*4882a593Smuzhiyun if (S_ISDIR(inode->vfs_inode.i_mode) ||
5297*4882a593Smuzhiyun (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5298*4882a593Smuzhiyun &inode->runtime_flags) &&
5299*4882a593Smuzhiyun inode_only >= LOG_INODE_EXISTS))
5300*4882a593Smuzhiyun max_key.type = BTRFS_XATTR_ITEM_KEY;
5301*4882a593Smuzhiyun else
5302*4882a593Smuzhiyun max_key.type = (u8)-1; /* largest u8 value: no upper bound on the key type */
5303*4882a593Smuzhiyun max_key.offset = (u64)-1;
5304*4882a593Smuzhiyun
5305*4882a593Smuzhiyun /*
5306*4882a593Smuzhiyun * Only run delayed items if we are a directory. We want to make sure
5307*4882a593Smuzhiyun * all directory indexes hit the fs/subvolume tree so we can find them
5308*4882a593Smuzhiyun * and figure out which index ranges have to be logged.
5309*4882a593Smuzhiyun *
5310*4882a593Smuzhiyun * Otherwise commit the delayed inode only if the full sync flag is set,
5311*4882a593Smuzhiyun * as we want to make sure an up to date version is in the subvolume
5312*4882a593Smuzhiyun * tree so copy_inode_items_to_log() / copy_items() can find it and copy
5313*4882a593Smuzhiyun * it to the log tree. For a non full sync, we always log the inode item
5314*4882a593Smuzhiyun * based on the in-memory struct btrfs_inode which is always up to date.
5315*4882a593Smuzhiyun */
5316*4882a593Smuzhiyun if (S_ISDIR(inode->vfs_inode.i_mode))
5317*4882a593Smuzhiyun ret = btrfs_commit_inode_delayed_items(trans, inode);
5318*4882a593Smuzhiyun else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5319*4882a593Smuzhiyun ret = btrfs_commit_inode_delayed_inode(inode);
5320*4882a593Smuzhiyun
/* log_mutex is not held yet, so this failure path frees the paths itself */
5321*4882a593Smuzhiyun if (ret) {
5322*4882a593Smuzhiyun btrfs_free_path(path);
5323*4882a593Smuzhiyun btrfs_free_path(dst_path);
5324*4882a593Smuzhiyun return ret;
5325*4882a593Smuzhiyun }
5326*4882a593Smuzhiyun
5327*4882a593Smuzhiyun if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5328*4882a593Smuzhiyun recursive_logging = true;
5329*4882a593Smuzhiyun if (inode_only == LOG_OTHER_INODE)
5330*4882a593Smuzhiyun inode_only = LOG_INODE_EXISTS;
5331*4882a593Smuzhiyun else
5332*4882a593Smuzhiyun inode_only = LOG_INODE_ALL;
5333*4882a593Smuzhiyun mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
5334*4882a593Smuzhiyun } else {
5335*4882a593Smuzhiyun mutex_lock(&inode->log_mutex);
5336*4882a593Smuzhiyun }
5337*4882a593Smuzhiyun
5338*4882a593Smuzhiyun /*
5339*4882a593Smuzhiyun * For symlinks, we must always log their content, which is stored in an
5340*4882a593Smuzhiyun * inline extent, otherwise we could end up with an empty symlink after
5341*4882a593Smuzhiyun * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
5342*4882a593Smuzhiyun * one attempts to create an empty symlink).
5343*4882a593Smuzhiyun * We don't need to worry about flushing delalloc, because we create
5344*4882a593Smuzhiyun * the inline extent when the symlink is created (we never have delalloc
5345*4882a593Smuzhiyun * for symlinks).
5346*4882a593Smuzhiyun */
5347*4882a593Smuzhiyun if (S_ISLNK(inode->vfs_inode.i_mode))
5348*4882a593Smuzhiyun inode_only = LOG_INODE_ALL;
5349*4882a593Smuzhiyun
5350*4882a593Smuzhiyun /*
5351*4882a593Smuzhiyun * a brute force approach to making sure we get the most uptodate
5352*4882a593Smuzhiyun * copies of everything.
5353*4882a593Smuzhiyun */
5354*4882a593Smuzhiyun if (S_ISDIR(inode->vfs_inode.i_mode)) {
5355*4882a593Smuzhiyun int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
5356*4882a593Smuzhiyun
5357*4882a593Smuzhiyun if (inode_only == LOG_INODE_EXISTS)
5358*4882a593Smuzhiyun max_key_type = BTRFS_XATTR_ITEM_KEY;
5359*4882a593Smuzhiyun ret = drop_objectid_items(trans, log, path, ino, max_key_type);
5360*4882a593Smuzhiyun } else {
5361*4882a593Smuzhiyun if (inode_only == LOG_INODE_EXISTS) {
5362*4882a593Smuzhiyun /*
5363*4882a593Smuzhiyun * Make sure the new inode item we write to the log has
5364*4882a593Smuzhiyun * the same isize as the current one (if it exists).
5365*4882a593Smuzhiyun * This is necessary to prevent data loss after log
5366*4882a593Smuzhiyun * replay, and also to prevent doing a wrong expanding
5367*4882a593Smuzhiyun * truncate - for e.g. create file, write 4K into offset
5368*4882a593Smuzhiyun * 0, fsync, write 4K into offset 4096, add hard link,
5369*4882a593Smuzhiyun * fsync some other file (to sync log), power fail - if
5370*4882a593Smuzhiyun * we use the inode's current i_size, after log replay
5371*4882a593Smuzhiyun * we get a 8Kb file, with the last 4Kb extent as a hole
5372*4882a593Smuzhiyun * (zeroes), as if an expanding truncate happened,
5373*4882a593Smuzhiyun * instead of getting a file of 4Kb only.
5374*4882a593Smuzhiyun */
5375*4882a593Smuzhiyun err = logged_inode_size(log, inode, path, &logged_isize);
5376*4882a593Smuzhiyun if (err)
5377*4882a593Smuzhiyun goto out_unlock;
5378*4882a593Smuzhiyun }
5379*4882a593Smuzhiyun if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5380*4882a593Smuzhiyun &inode->runtime_flags)) {
5381*4882a593Smuzhiyun if (inode_only == LOG_INODE_EXISTS) {
5382*4882a593Smuzhiyun max_key.type = BTRFS_XATTR_ITEM_KEY;
5383*4882a593Smuzhiyun ret = drop_objectid_items(trans, log, path, ino,
5384*4882a593Smuzhiyun max_key.type);
5385*4882a593Smuzhiyun } else {
5386*4882a593Smuzhiyun clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
5387*4882a593Smuzhiyun &inode->runtime_flags);
5388*4882a593Smuzhiyun clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5389*4882a593Smuzhiyun &inode->runtime_flags);
5390*4882a593Smuzhiyun while(1) { /* retry for as long as -EAGAIN is returned */
5391*4882a593Smuzhiyun ret = btrfs_truncate_inode_items(trans,
5392*4882a593Smuzhiyun log, &inode->vfs_inode, 0, 0);
5393*4882a593Smuzhiyun if (ret != -EAGAIN)
5394*4882a593Smuzhiyun break;
5395*4882a593Smuzhiyun }
5396*4882a593Smuzhiyun }
5397*4882a593Smuzhiyun } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
5398*4882a593Smuzhiyun &inode->runtime_flags) ||
5399*4882a593Smuzhiyun inode_only == LOG_INODE_EXISTS) {
5400*4882a593Smuzhiyun if (inode_only == LOG_INODE_ALL)
5401*4882a593Smuzhiyun fast_search = true;
5402*4882a593Smuzhiyun max_key.type = BTRFS_XATTR_ITEM_KEY;
5403*4882a593Smuzhiyun ret = drop_objectid_items(trans, log, path, ino,
5404*4882a593Smuzhiyun max_key.type);
5405*4882a593Smuzhiyun } else {
5406*4882a593Smuzhiyun if (inode_only == LOG_INODE_ALL)
5407*4882a593Smuzhiyun fast_search = true;
/*
 * Jump past copy_inode_items_to_log(), the xattr logging and
 * btrfs_log_holes() below.
 */
5408*4882a593Smuzhiyun goto log_extents;
5409*4882a593Smuzhiyun }
5410*4882a593Smuzhiyun
5411*4882a593Smuzhiyun }
5412*4882a593Smuzhiyun if (ret) {
5413*4882a593Smuzhiyun err = ret;
5414*4882a593Smuzhiyun goto out_unlock;
5415*4882a593Smuzhiyun }
5416*4882a593Smuzhiyun
5417*4882a593Smuzhiyun err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
5418*4882a593Smuzhiyun path, dst_path, logged_isize,
5419*4882a593Smuzhiyun recursive_logging, inode_only, ctx,
5420*4882a593Smuzhiyun &need_log_inode_item);
5421*4882a593Smuzhiyun if (err)
5422*4882a593Smuzhiyun goto out_unlock;
5423*4882a593Smuzhiyun
5424*4882a593Smuzhiyun btrfs_release_path(path);
5425*4882a593Smuzhiyun btrfs_release_path(dst_path);
5426*4882a593Smuzhiyun err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
5427*4882a593Smuzhiyun if (err)
5428*4882a593Smuzhiyun goto out_unlock;
5429*4882a593Smuzhiyun xattrs_logged = true;
5430*4882a593Smuzhiyun if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5431*4882a593Smuzhiyun btrfs_release_path(path);
5432*4882a593Smuzhiyun btrfs_release_path(dst_path);
5433*4882a593Smuzhiyun err = btrfs_log_holes(trans, root, inode, path);
5434*4882a593Smuzhiyun if (err)
5435*4882a593Smuzhiyun goto out_unlock;
5436*4882a593Smuzhiyun }
5437*4882a593Smuzhiyun log_extents:
5438*4882a593Smuzhiyun btrfs_release_path(path);
5439*4882a593Smuzhiyun btrfs_release_path(dst_path);
5440*4882a593Smuzhiyun if (need_log_inode_item) {
5441*4882a593Smuzhiyun err = log_inode_item(trans, log, dst_path, inode);
/*
 * xattrs_logged is still false only when the goto log_extents above
 * was taken, i.e. the xattrs were not logged earlier in this call.
 */
5442*4882a593Smuzhiyun if (!err && !xattrs_logged) {
5443*4882a593Smuzhiyun err = btrfs_log_all_xattrs(trans, root, inode, path,
5444*4882a593Smuzhiyun dst_path);
5445*4882a593Smuzhiyun btrfs_release_path(path);
5446*4882a593Smuzhiyun }
5447*4882a593Smuzhiyun if (err)
5448*4882a593Smuzhiyun goto out_unlock;
5449*4882a593Smuzhiyun }
5450*4882a593Smuzhiyun if (fast_search) {
5451*4882a593Smuzhiyun ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
5452*4882a593Smuzhiyun ctx);
5453*4882a593Smuzhiyun if (ret) {
5454*4882a593Smuzhiyun err = ret;
5455*4882a593Smuzhiyun goto out_unlock;
5456*4882a593Smuzhiyun }
5457*4882a593Smuzhiyun } else if (inode_only == LOG_INODE_ALL) {
5458*4882a593Smuzhiyun struct extent_map *em, *n;
5459*4882a593Smuzhiyun
/* empty the modified extents list; nothing is logged from it here */
5460*4882a593Smuzhiyun write_lock(&em_tree->lock);
5461*4882a593Smuzhiyun list_for_each_entry_safe(em, n, &em_tree->modified_extents, list)
5462*4882a593Smuzhiyun list_del_init(&em->list);
5463*4882a593Smuzhiyun write_unlock(&em_tree->lock);
5464*4882a593Smuzhiyun }
5465*4882a593Smuzhiyun
5466*4882a593Smuzhiyun if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5467*4882a593Smuzhiyun ret = log_directory_changes(trans, root, inode, path, dst_path,
5468*4882a593Smuzhiyun ctx);
5469*4882a593Smuzhiyun if (ret) {
5470*4882a593Smuzhiyun err = ret;
5471*4882a593Smuzhiyun goto out_unlock;
5472*4882a593Smuzhiyun }
5473*4882a593Smuzhiyun }
5474*4882a593Smuzhiyun
5475*4882a593Smuzhiyun /*
5476*4882a593Smuzhiyun * If we are logging that an ancestor inode exists as part of logging a
5477*4882a593Smuzhiyun * new name from a link or rename operation, don't mark the inode as
5478*4882a593Smuzhiyun * logged - otherwise if an explicit fsync is made against an ancestor,
5479*4882a593Smuzhiyun * the fsync considers the inode in the log and doesn't sync the log,
5480*4882a593Smuzhiyun * resulting in the ancestor missing after a power failure unless the
5481*4882a593Smuzhiyun * log was synced as part of an fsync against any other unrelated inode.
5482*4882a593Smuzhiyun * So keep it simple for this case and just don't flag the ancestors as
5483*4882a593Smuzhiyun * logged.
5484*4882a593Smuzhiyun */
5485*4882a593Smuzhiyun if (!ctx ||
5486*4882a593Smuzhiyun !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
5487*4882a593Smuzhiyun &inode->vfs_inode != ctx->inode)) {
5488*4882a593Smuzhiyun spin_lock(&inode->lock);
5489*4882a593Smuzhiyun inode->logged_trans = trans->transid;
5490*4882a593Smuzhiyun /*
5491*4882a593Smuzhiyun * Don't update last_log_commit if we logged that an inode exists
5492*4882a593Smuzhiyun * after it was loaded to memory (full_sync bit set).
5493*4882a593Smuzhiyun * This is to prevent data loss when we do a write to the inode,
5494*4882a593Smuzhiyun * then the inode gets evicted after all delalloc was flushed,
5495*4882a593Smuzhiyun * then we log it exists (due to a rename for example) and then
5496*4882a593Smuzhiyun * fsync it. This last fsync would do nothing (not logging the
5497*4882a593Smuzhiyun * extents previously written).
5498*4882a593Smuzhiyun */
5499*4882a593Smuzhiyun if (inode_only != LOG_INODE_EXISTS ||
5500*4882a593Smuzhiyun !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
5501*4882a593Smuzhiyun inode->last_log_commit = inode->last_sub_trans;
5502*4882a593Smuzhiyun spin_unlock(&inode->lock);
5503*4882a593Smuzhiyun }
/* common exit once log_mutex is held: unlock, free both paths, return err */
5504*4882a593Smuzhiyun out_unlock:
5505*4882a593Smuzhiyun mutex_unlock(&inode->log_mutex);
5506*4882a593Smuzhiyun
5507*4882a593Smuzhiyun btrfs_free_path(path);
5508*4882a593Smuzhiyun btrfs_free_path(dst_path);
5509*4882a593Smuzhiyun return err;
5510*4882a593Smuzhiyun }
5511*4882a593Smuzhiyun
5512*4882a593Smuzhiyun /*
5513*4882a593Smuzhiyun * Check if we must fallback to a transaction commit when logging an inode.
5514*4882a593Smuzhiyun * This must be called after logging the inode and is used only in the context
5515*4882a593Smuzhiyun * when fsyncing an inode requires the need to log some other inode - in which
5516*4882a593Smuzhiyun * case we can't lock the i_mutex of each other inode we need to log as that
5517*4882a593Smuzhiyun * can lead to deadlocks with concurrent fsync against other inodes (as we can
5518*4882a593Smuzhiyun * log inodes up or down in the hierarchy) or rename operations for example. So
5519*4882a593Smuzhiyun * we take the log_mutex of the inode after we have logged it and then check for
5520*4882a593Smuzhiyun * its last_unlink_trans value - this is safe because any task setting
5521*4882a593Smuzhiyun * last_unlink_trans must take the log_mutex and it must do this before it does
5522*4882a593Smuzhiyun * the actual unlink operation, so if we do this check before a concurrent task
5523*4882a593Smuzhiyun * sets last_unlink_trans it means we've logged a consistent version/state of
5524*4882a593Smuzhiyun * all the inode items, otherwise we are not sure and must do a transaction
5525*4882a593Smuzhiyun * commit (the concurrent task might have only updated last_unlink_trans before
5526*4882a593Smuzhiyun * we logged the inode or it might have also done the unlink).
5527*4882a593Smuzhiyun */
btrfs_must_commit_transaction(struct btrfs_trans_handle * trans,struct btrfs_inode * inode)5528*4882a593Smuzhiyun static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
5529*4882a593Smuzhiyun struct btrfs_inode *inode)
5530*4882a593Smuzhiyun {
5531*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = inode->root->fs_info;
5532*4882a593Smuzhiyun bool ret = false;
5533*4882a593Smuzhiyun
5534*4882a593Smuzhiyun mutex_lock(&inode->log_mutex);
5535*4882a593Smuzhiyun if (inode->last_unlink_trans > fs_info->last_trans_committed) {
5536*4882a593Smuzhiyun /*
5537*4882a593Smuzhiyun * Make sure any commits to the log are forced to be full
5538*4882a593Smuzhiyun * commits.
5539*4882a593Smuzhiyun */
5540*4882a593Smuzhiyun btrfs_set_log_full_commit(trans);
5541*4882a593Smuzhiyun ret = true;
5542*4882a593Smuzhiyun }
5543*4882a593Smuzhiyun mutex_unlock(&inode->log_mutex);
5544*4882a593Smuzhiyun
5545*4882a593Smuzhiyun return ret;
5546*4882a593Smuzhiyun }
5547*4882a593Smuzhiyun
5548*4882a593Smuzhiyun /*
5549*4882a593Smuzhiyun * follow the dentry parent pointers up the chain and see if any
5550*4882a593Smuzhiyun * of the directories in it require a full commit before they can
5551*4882a593Smuzhiyun * be logged. Returns zero if nothing special needs to be done or 1 if
5552*4882a593Smuzhiyun * a full commit is required.
5553*4882a593Smuzhiyun */
check_parent_dirs_for_sync(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct dentry * parent,struct super_block * sb,u64 last_committed)5554*4882a593Smuzhiyun static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
5555*4882a593Smuzhiyun struct btrfs_inode *inode,
5556*4882a593Smuzhiyun struct dentry *parent,
5557*4882a593Smuzhiyun struct super_block *sb,
5558*4882a593Smuzhiyun u64 last_committed)
5559*4882a593Smuzhiyun {
5560*4882a593Smuzhiyun int ret = 0;
5561*4882a593Smuzhiyun struct dentry *old_parent = NULL;
5562*4882a593Smuzhiyun
5563*4882a593Smuzhiyun /*
5564*4882a593Smuzhiyun * for regular files, if its inode is already on disk, we don't
5565*4882a593Smuzhiyun * have to worry about the parents at all. This is because
5566*4882a593Smuzhiyun * we can use the last_unlink_trans field to record renames
5567*4882a593Smuzhiyun * and other fun in this file.
5568*4882a593Smuzhiyun */
5569*4882a593Smuzhiyun if (S_ISREG(inode->vfs_inode.i_mode) &&
5570*4882a593Smuzhiyun inode->generation <= last_committed &&
5571*4882a593Smuzhiyun inode->last_unlink_trans <= last_committed)
5572*4882a593Smuzhiyun goto out;
5573*4882a593Smuzhiyun
5574*4882a593Smuzhiyun if (!S_ISDIR(inode->vfs_inode.i_mode)) {
5575*4882a593Smuzhiyun if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5576*4882a593Smuzhiyun goto out;
5577*4882a593Smuzhiyun inode = BTRFS_I(d_inode(parent));
5578*4882a593Smuzhiyun }
5579*4882a593Smuzhiyun
5580*4882a593Smuzhiyun while (1) {
5581*4882a593Smuzhiyun if (btrfs_must_commit_transaction(trans, inode)) {
5582*4882a593Smuzhiyun ret = 1;
5583*4882a593Smuzhiyun break;
5584*4882a593Smuzhiyun }
5585*4882a593Smuzhiyun
5586*4882a593Smuzhiyun if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
5587*4882a593Smuzhiyun break;
5588*4882a593Smuzhiyun
5589*4882a593Smuzhiyun if (IS_ROOT(parent)) {
5590*4882a593Smuzhiyun inode = BTRFS_I(d_inode(parent));
5591*4882a593Smuzhiyun if (btrfs_must_commit_transaction(trans, inode))
5592*4882a593Smuzhiyun ret = 1;
5593*4882a593Smuzhiyun break;
5594*4882a593Smuzhiyun }
5595*4882a593Smuzhiyun
5596*4882a593Smuzhiyun parent = dget_parent(parent);
5597*4882a593Smuzhiyun dput(old_parent);
5598*4882a593Smuzhiyun old_parent = parent;
5599*4882a593Smuzhiyun inode = BTRFS_I(d_inode(parent));
5600*4882a593Smuzhiyun
5601*4882a593Smuzhiyun }
5602*4882a593Smuzhiyun dput(old_parent);
5603*4882a593Smuzhiyun out:
5604*4882a593Smuzhiyun return ret;
5605*4882a593Smuzhiyun }
5606*4882a593Smuzhiyun
/*
 * Work-list entry used by log_new_dir_dentries(): one pending directory,
 * identified by inode number, linked into that function's local dir_list.
 */
5607*4882a593Smuzhiyun struct btrfs_dir_list {
5608*4882a593Smuzhiyun u64 ino; /* inode number of the directory still to be processed */
5609*4882a593Smuzhiyun struct list_head list; /* link in the dir_list work list */
5610*4882a593Smuzhiyun };
5611*4882a593Smuzhiyun
5612*4882a593Smuzhiyun /*
5613*4882a593Smuzhiyun * Log the inodes of the new dentries of a directory. See log_dir_items() for
5614*4882a593Smuzhiyun * details about why it is needed.
5615*4882a593Smuzhiyun * This is a recursive operation - if an existing dentry corresponds to a
5616*4882a593Smuzhiyun * directory, that directory's new entries are logged too (same behaviour as
5617*4882a593Smuzhiyun * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5618*4882a593Smuzhiyun * the dentries point to we do not lock their i_mutex, otherwise lockdep
5619*4882a593Smuzhiyun * complains about the following circular lock dependency / possible deadlock:
5620*4882a593Smuzhiyun *
5621*4882a593Smuzhiyun * CPU0 CPU1
5622*4882a593Smuzhiyun * ---- ----
5623*4882a593Smuzhiyun * lock(&type->i_mutex_dir_key#3/2);
5624*4882a593Smuzhiyun * lock(sb_internal#2);
5625*4882a593Smuzhiyun * lock(&type->i_mutex_dir_key#3/2);
5626*4882a593Smuzhiyun * lock(&sb->s_type->i_mutex_key#14);
5627*4882a593Smuzhiyun *
5628*4882a593Smuzhiyun * Where sb_internal is the lock (a counter that works as a lock) acquired by
5629*4882a593Smuzhiyun * sb_start_intwrite() in btrfs_start_transaction().
5630*4882a593Smuzhiyun * Not locking i_mutex of the inodes is still safe because:
5631*4882a593Smuzhiyun *
5632*4882a593Smuzhiyun * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5633*4882a593Smuzhiyun * that while logging the inode new references (names) are added or removed
5634*4882a593Smuzhiyun * from the inode, leaving the logged inode item with a link count that does
5635*4882a593Smuzhiyun * not match the number of logged inode reference items. This is fine because
5636*4882a593Smuzhiyun * at log replay time we compute the real number of links and correct the
5637*4882a593Smuzhiyun * link count in the inode item (see replay_one_buffer() and
5638*4882a593Smuzhiyun * link_to_fixup_dir());
5639*4882a593Smuzhiyun *
5640*4882a593Smuzhiyun * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5641*4882a593Smuzhiyun * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5642*4882a593Smuzhiyun * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5643*4882a593Smuzhiyun * has a size that doesn't match the sum of the lengths of all the logged
5644*4882a593Smuzhiyun * names. This does not result in a problem because if a dir_item key is
5645*4882a593Smuzhiyun * logged but its matching dir_index key is not logged, at log replay time we
5646*4882a593Smuzhiyun * don't use it to replay the respective name (see replay_one_name()). On the
5647*4882a593Smuzhiyun * other hand if only the dir_index key ends up being logged, the respective
5648*4882a593Smuzhiyun * name is added to the fs/subvol tree with both the dir_item and dir_index
5649*4882a593Smuzhiyun * keys created (see replay_one_name()).
5650*4882a593Smuzhiyun * The directory's inode item with a wrong i_size is not a problem as well,
5651*4882a593Smuzhiyun * since we don't use it at log replay time to set the i_size in the inode
5652*4882a593Smuzhiyun * item of the fs/subvol tree (see overwrite_item()).
 *
 * @trans:       transaction handle; trans->transid is used for the forward
 *               search and to decide which dir items are new.
 * @root:        root the inodes belong to; its log tree (root->log_root) is
 *               the tree that gets searched for dir items.
 * @start_inode: directory whose logged dentries are processed first.
 * @ctx:         log context; dereferenced unconditionally, so it must not be
 *               NULL. ctx->log_new_dentries is cleared before each inode is
 *               logged and read afterwards.
 *
 * Returns 0 on success, a negative errno on failure, or 1 when
 * btrfs_must_commit_transaction() reports that one of the logged inodes
 * requires a transaction commit.
5653*4882a593Smuzhiyun */
log_new_dir_dentries(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_inode * start_inode,struct btrfs_log_ctx * ctx)5654*4882a593Smuzhiyun static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5655*4882a593Smuzhiyun struct btrfs_root *root,
5656*4882a593Smuzhiyun struct btrfs_inode *start_inode,
5657*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
5658*4882a593Smuzhiyun {
5659*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = root->fs_info;
5660*4882a593Smuzhiyun struct btrfs_root *log = root->log_root;
5661*4882a593Smuzhiyun struct btrfs_path *path;
5662*4882a593Smuzhiyun LIST_HEAD(dir_list);
5663*4882a593Smuzhiyun struct btrfs_dir_list *dir_elem;
5664*4882a593Smuzhiyun int ret = 0;
5665*4882a593Smuzhiyun
5666*4882a593Smuzhiyun path = btrfs_alloc_path();
5667*4882a593Smuzhiyun if (!path)
5668*4882a593Smuzhiyun return -ENOMEM;
5669*4882a593Smuzhiyun
/* seed the work list with the starting directory */
5670*4882a593Smuzhiyun dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5671*4882a593Smuzhiyun if (!dir_elem) {
5672*4882a593Smuzhiyun btrfs_free_path(path);
5673*4882a593Smuzhiyun return -ENOMEM;
5674*4882a593Smuzhiyun }
5675*4882a593Smuzhiyun dir_elem->ino = btrfs_ino(start_inode);
5676*4882a593Smuzhiyun list_add_tail(&dir_elem->list, &dir_list);
5677*4882a593Smuzhiyun
5678*4882a593Smuzhiyun while (!list_empty(&dir_list)) {
5679*4882a593Smuzhiyun struct extent_buffer *leaf;
5680*4882a593Smuzhiyun struct btrfs_key min_key;
5681*4882a593Smuzhiyun int nritems;
5682*4882a593Smuzhiyun int i;
5683*4882a593Smuzhiyun
5684*4882a593Smuzhiyun dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5685*4882a593Smuzhiyun list);
/* once ret is set, the loop only keeps running to free the remaining entries */
5686*4882a593Smuzhiyun if (ret)
5687*4882a593Smuzhiyun goto next_dir_inode;
5688*4882a593Smuzhiyun
5689*4882a593Smuzhiyun min_key.objectid = dir_elem->ino;
5690*4882a593Smuzhiyun min_key.type = BTRFS_DIR_ITEM_KEY;
5691*4882a593Smuzhiyun min_key.offset = 0;
5692*4882a593Smuzhiyun again:
5693*4882a593Smuzhiyun btrfs_release_path(path);
5694*4882a593Smuzhiyun ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5695*4882a593Smuzhiyun if (ret < 0) {
5696*4882a593Smuzhiyun goto next_dir_inode;
5697*4882a593Smuzhiyun } else if (ret > 0) {
/* not treated as an error: clear it and move on to the next directory */
5698*4882a593Smuzhiyun ret = 0;
5699*4882a593Smuzhiyun goto next_dir_inode;
5700*4882a593Smuzhiyun }
5701*4882a593Smuzhiyun
5702*4882a593Smuzhiyun process_leaf:
5703*4882a593Smuzhiyun leaf = path->nodes[0];
5704*4882a593Smuzhiyun nritems = btrfs_header_nritems(leaf);
5705*4882a593Smuzhiyun for (i = path->slots[0]; i < nritems; i++) {
5706*4882a593Smuzhiyun struct btrfs_dir_item *di;
5707*4882a593Smuzhiyun struct btrfs_key di_key;
5708*4882a593Smuzhiyun struct inode *di_inode;
5709*4882a593Smuzhiyun struct btrfs_dir_list *new_dir_elem;
5710*4882a593Smuzhiyun int log_mode = LOG_INODE_EXISTS;
5711*4882a593Smuzhiyun int type;
5712*4882a593Smuzhiyun
5713*4882a593Smuzhiyun btrfs_item_key_to_cpu(leaf, &min_key, i);
/* stop once the key is no longer a DIR_ITEM of this directory */
5714*4882a593Smuzhiyun if (min_key.objectid != dir_elem->ino ||
5715*4882a593Smuzhiyun min_key.type != BTRFS_DIR_ITEM_KEY)
5716*4882a593Smuzhiyun goto next_dir_inode;
5717*4882a593Smuzhiyun
5718*4882a593Smuzhiyun di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5719*4882a593Smuzhiyun type = btrfs_dir_type(leaf, di);
/* skip non-directory entries whose transid predates this transaction */
5720*4882a593Smuzhiyun if (btrfs_dir_transid(leaf, di) < trans->transid &&
5721*4882a593Smuzhiyun type != BTRFS_FT_DIR)
5722*4882a593Smuzhiyun continue;
5723*4882a593Smuzhiyun btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
/* skip entries whose location key is a root item */
5724*4882a593Smuzhiyun if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5725*4882a593Smuzhiyun continue;
5726*4882a593Smuzhiyun
/*
 * The path is released here, so leaf is not used again below; every
 * code path from this point leaves the for loop via break or goto.
 */
5727*4882a593Smuzhiyun btrfs_release_path(path);
5728*4882a593Smuzhiyun di_inode = btrfs_iget(fs_info->sb, di_key.objectid, root);
5729*4882a593Smuzhiyun if (IS_ERR(di_inode)) {
5730*4882a593Smuzhiyun ret = PTR_ERR(di_inode);
5731*4882a593Smuzhiyun goto next_dir_inode;
5732*4882a593Smuzhiyun }
5733*4882a593Smuzhiyun
/* nothing to log for this entry: drop the reference and search again from the next key */
5734*4882a593Smuzhiyun if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
5735*4882a593Smuzhiyun btrfs_add_delayed_iput(di_inode);
5736*4882a593Smuzhiyun break;
5737*4882a593Smuzhiyun }
5738*4882a593Smuzhiyun
/* cleared so the value read after btrfs_log_inode() refers to this inode only */
5739*4882a593Smuzhiyun ctx->log_new_dentries = false;
5740*4882a593Smuzhiyun if (type == BTRFS_FT_DIR)
5741*4882a593Smuzhiyun log_mode = LOG_INODE_ALL;
5742*4882a593Smuzhiyun ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
5743*4882a593Smuzhiyun log_mode, ctx);
5744*4882a593Smuzhiyun if (!ret &&
5745*4882a593Smuzhiyun btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
5746*4882a593Smuzhiyun ret = 1; /* a transaction commit is required */
5747*4882a593Smuzhiyun btrfs_add_delayed_iput(di_inode);
5748*4882a593Smuzhiyun if (ret)
5749*4882a593Smuzhiyun goto next_dir_inode;
/* queue the inode just logged so its own dentries get processed as well */
5750*4882a593Smuzhiyun if (ctx->log_new_dentries) {
5751*4882a593Smuzhiyun new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5752*4882a593Smuzhiyun GFP_NOFS);
5753*4882a593Smuzhiyun if (!new_dir_elem) {
5754*4882a593Smuzhiyun ret = -ENOMEM;
5755*4882a593Smuzhiyun goto next_dir_inode;
5756*4882a593Smuzhiyun }
5757*4882a593Smuzhiyun new_dir_elem->ino = di_key.objectid;
5758*4882a593Smuzhiyun list_add_tail(&new_dir_elem->list, &dir_list);
5759*4882a593Smuzhiyun }
5760*4882a593Smuzhiyun break;
5761*4882a593Smuzhiyun }
/* i == nritems: the for loop reached the end of the leaf without a break */
5762*4882a593Smuzhiyun if (i == nritems) {
5763*4882a593Smuzhiyun ret = btrfs_next_leaf(log, path);
5764*4882a593Smuzhiyun if (ret < 0) {
5765*4882a593Smuzhiyun goto next_dir_inode;
5766*4882a593Smuzhiyun } else if (ret > 0) {
5767*4882a593Smuzhiyun ret = 0;
5768*4882a593Smuzhiyun goto next_dir_inode;
5769*4882a593Smuzhiyun }
5770*4882a593Smuzhiyun goto process_leaf;
5771*4882a593Smuzhiyun }
/* resume the search right after the last key that was looked at */
5772*4882a593Smuzhiyun if (min_key.offset < (u64)-1) {
5773*4882a593Smuzhiyun min_key.offset++;
5774*4882a593Smuzhiyun goto again;
5775*4882a593Smuzhiyun }
5776*4882a593Smuzhiyun next_dir_inode:
5777*4882a593Smuzhiyun list_del(&dir_elem->list);
5778*4882a593Smuzhiyun kfree(dir_elem);
5779*4882a593Smuzhiyun }
5780*4882a593Smuzhiyun
5781*4882a593Smuzhiyun btrfs_free_path(path);
5782*4882a593Smuzhiyun return ret;
5783*4882a593Smuzhiyun }
5784*4882a593Smuzhiyun
btrfs_log_all_parents(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_log_ctx * ctx)5785*4882a593Smuzhiyun static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
5786*4882a593Smuzhiyun struct btrfs_inode *inode,
5787*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
5788*4882a593Smuzhiyun {
5789*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = trans->fs_info;
5790*4882a593Smuzhiyun int ret;
5791*4882a593Smuzhiyun struct btrfs_path *path;
5792*4882a593Smuzhiyun struct btrfs_key key;
5793*4882a593Smuzhiyun struct btrfs_root *root = inode->root;
5794*4882a593Smuzhiyun const u64 ino = btrfs_ino(inode);
5795*4882a593Smuzhiyun
5796*4882a593Smuzhiyun path = btrfs_alloc_path();
5797*4882a593Smuzhiyun if (!path)
5798*4882a593Smuzhiyun return -ENOMEM;
5799*4882a593Smuzhiyun path->skip_locking = 1;
5800*4882a593Smuzhiyun path->search_commit_root = 1;
5801*4882a593Smuzhiyun
5802*4882a593Smuzhiyun key.objectid = ino;
5803*4882a593Smuzhiyun key.type = BTRFS_INODE_REF_KEY;
5804*4882a593Smuzhiyun key.offset = 0;
5805*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5806*4882a593Smuzhiyun if (ret < 0)
5807*4882a593Smuzhiyun goto out;
5808*4882a593Smuzhiyun
5809*4882a593Smuzhiyun while (true) {
5810*4882a593Smuzhiyun struct extent_buffer *leaf = path->nodes[0];
5811*4882a593Smuzhiyun int slot = path->slots[0];
5812*4882a593Smuzhiyun u32 cur_offset = 0;
5813*4882a593Smuzhiyun u32 item_size;
5814*4882a593Smuzhiyun unsigned long ptr;
5815*4882a593Smuzhiyun
5816*4882a593Smuzhiyun if (slot >= btrfs_header_nritems(leaf)) {
5817*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
5818*4882a593Smuzhiyun if (ret < 0)
5819*4882a593Smuzhiyun goto out;
5820*4882a593Smuzhiyun else if (ret > 0)
5821*4882a593Smuzhiyun break;
5822*4882a593Smuzhiyun continue;
5823*4882a593Smuzhiyun }
5824*4882a593Smuzhiyun
5825*4882a593Smuzhiyun btrfs_item_key_to_cpu(leaf, &key, slot);
5826*4882a593Smuzhiyun /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5827*4882a593Smuzhiyun if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5828*4882a593Smuzhiyun break;
5829*4882a593Smuzhiyun
5830*4882a593Smuzhiyun item_size = btrfs_item_size_nr(leaf, slot);
5831*4882a593Smuzhiyun ptr = btrfs_item_ptr_offset(leaf, slot);
5832*4882a593Smuzhiyun while (cur_offset < item_size) {
5833*4882a593Smuzhiyun struct btrfs_key inode_key;
5834*4882a593Smuzhiyun struct inode *dir_inode;
5835*4882a593Smuzhiyun
5836*4882a593Smuzhiyun inode_key.type = BTRFS_INODE_ITEM_KEY;
5837*4882a593Smuzhiyun inode_key.offset = 0;
5838*4882a593Smuzhiyun
5839*4882a593Smuzhiyun if (key.type == BTRFS_INODE_EXTREF_KEY) {
5840*4882a593Smuzhiyun struct btrfs_inode_extref *extref;
5841*4882a593Smuzhiyun
5842*4882a593Smuzhiyun extref = (struct btrfs_inode_extref *)
5843*4882a593Smuzhiyun (ptr + cur_offset);
5844*4882a593Smuzhiyun inode_key.objectid = btrfs_inode_extref_parent(
5845*4882a593Smuzhiyun leaf, extref);
5846*4882a593Smuzhiyun cur_offset += sizeof(*extref);
5847*4882a593Smuzhiyun cur_offset += btrfs_inode_extref_name_len(leaf,
5848*4882a593Smuzhiyun extref);
5849*4882a593Smuzhiyun } else {
5850*4882a593Smuzhiyun inode_key.objectid = key.offset;
5851*4882a593Smuzhiyun cur_offset = item_size;
5852*4882a593Smuzhiyun }
5853*4882a593Smuzhiyun
5854*4882a593Smuzhiyun dir_inode = btrfs_iget(fs_info->sb, inode_key.objectid,
5855*4882a593Smuzhiyun root);
5856*4882a593Smuzhiyun /*
5857*4882a593Smuzhiyun * If the parent inode was deleted, return an error to
5858*4882a593Smuzhiyun * fallback to a transaction commit. This is to prevent
5859*4882a593Smuzhiyun * getting an inode that was moved from one parent A to
5860*4882a593Smuzhiyun * a parent B, got its former parent A deleted and then
5861*4882a593Smuzhiyun * it got fsync'ed, from existing at both parents after
5862*4882a593Smuzhiyun * a log replay (and the old parent still existing).
5863*4882a593Smuzhiyun * Example:
5864*4882a593Smuzhiyun *
5865*4882a593Smuzhiyun * mkdir /mnt/A
5866*4882a593Smuzhiyun * mkdir /mnt/B
5867*4882a593Smuzhiyun * touch /mnt/B/bar
5868*4882a593Smuzhiyun * sync
5869*4882a593Smuzhiyun * mv /mnt/B/bar /mnt/A/bar
5870*4882a593Smuzhiyun * mv -T /mnt/A /mnt/B
5871*4882a593Smuzhiyun * fsync /mnt/B/bar
5872*4882a593Smuzhiyun * <power fail>
5873*4882a593Smuzhiyun *
5874*4882a593Smuzhiyun * If we ignore the old parent B which got deleted,
5875*4882a593Smuzhiyun * after a log replay we would have file bar linked
5876*4882a593Smuzhiyun * at both parents and the old parent B would still
5877*4882a593Smuzhiyun * exist.
5878*4882a593Smuzhiyun */
5879*4882a593Smuzhiyun if (IS_ERR(dir_inode)) {
5880*4882a593Smuzhiyun ret = PTR_ERR(dir_inode);
5881*4882a593Smuzhiyun goto out;
5882*4882a593Smuzhiyun }
5883*4882a593Smuzhiyun
5884*4882a593Smuzhiyun if (ctx)
5885*4882a593Smuzhiyun ctx->log_new_dentries = false;
5886*4882a593Smuzhiyun ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
5887*4882a593Smuzhiyun LOG_INODE_ALL, ctx);
5888*4882a593Smuzhiyun if (!ret &&
5889*4882a593Smuzhiyun btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
5890*4882a593Smuzhiyun ret = 1;
5891*4882a593Smuzhiyun if (!ret && ctx && ctx->log_new_dentries)
5892*4882a593Smuzhiyun ret = log_new_dir_dentries(trans, root,
5893*4882a593Smuzhiyun BTRFS_I(dir_inode), ctx);
5894*4882a593Smuzhiyun btrfs_add_delayed_iput(dir_inode);
5895*4882a593Smuzhiyun if (ret)
5896*4882a593Smuzhiyun goto out;
5897*4882a593Smuzhiyun }
5898*4882a593Smuzhiyun path->slots[0]++;
5899*4882a593Smuzhiyun }
5900*4882a593Smuzhiyun ret = 0;
5901*4882a593Smuzhiyun out:
5902*4882a593Smuzhiyun btrfs_free_path(path);
5903*4882a593Smuzhiyun return ret;
5904*4882a593Smuzhiyun }
5905*4882a593Smuzhiyun
log_new_ancestors(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_path * path,struct btrfs_log_ctx * ctx)5906*4882a593Smuzhiyun static int log_new_ancestors(struct btrfs_trans_handle *trans,
5907*4882a593Smuzhiyun struct btrfs_root *root,
5908*4882a593Smuzhiyun struct btrfs_path *path,
5909*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
5910*4882a593Smuzhiyun {
5911*4882a593Smuzhiyun struct btrfs_key found_key;
5912*4882a593Smuzhiyun
5913*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5914*4882a593Smuzhiyun
5915*4882a593Smuzhiyun while (true) {
5916*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = root->fs_info;
5917*4882a593Smuzhiyun const u64 last_committed = fs_info->last_trans_committed;
5918*4882a593Smuzhiyun struct extent_buffer *leaf = path->nodes[0];
5919*4882a593Smuzhiyun int slot = path->slots[0];
5920*4882a593Smuzhiyun struct btrfs_key search_key;
5921*4882a593Smuzhiyun struct inode *inode;
5922*4882a593Smuzhiyun u64 ino;
5923*4882a593Smuzhiyun int ret = 0;
5924*4882a593Smuzhiyun
5925*4882a593Smuzhiyun btrfs_release_path(path);
5926*4882a593Smuzhiyun
5927*4882a593Smuzhiyun ino = found_key.offset;
5928*4882a593Smuzhiyun
5929*4882a593Smuzhiyun search_key.objectid = found_key.offset;
5930*4882a593Smuzhiyun search_key.type = BTRFS_INODE_ITEM_KEY;
5931*4882a593Smuzhiyun search_key.offset = 0;
5932*4882a593Smuzhiyun inode = btrfs_iget(fs_info->sb, ino, root);
5933*4882a593Smuzhiyun if (IS_ERR(inode))
5934*4882a593Smuzhiyun return PTR_ERR(inode);
5935*4882a593Smuzhiyun
5936*4882a593Smuzhiyun if (BTRFS_I(inode)->generation > last_committed)
5937*4882a593Smuzhiyun ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
5938*4882a593Smuzhiyun LOG_INODE_EXISTS, ctx);
5939*4882a593Smuzhiyun btrfs_add_delayed_iput(inode);
5940*4882a593Smuzhiyun if (ret)
5941*4882a593Smuzhiyun return ret;
5942*4882a593Smuzhiyun
5943*4882a593Smuzhiyun if (search_key.objectid == BTRFS_FIRST_FREE_OBJECTID)
5944*4882a593Smuzhiyun break;
5945*4882a593Smuzhiyun
5946*4882a593Smuzhiyun search_key.type = BTRFS_INODE_REF_KEY;
5947*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
5948*4882a593Smuzhiyun if (ret < 0)
5949*4882a593Smuzhiyun return ret;
5950*4882a593Smuzhiyun
5951*4882a593Smuzhiyun leaf = path->nodes[0];
5952*4882a593Smuzhiyun slot = path->slots[0];
5953*4882a593Smuzhiyun if (slot >= btrfs_header_nritems(leaf)) {
5954*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
5955*4882a593Smuzhiyun if (ret < 0)
5956*4882a593Smuzhiyun return ret;
5957*4882a593Smuzhiyun else if (ret > 0)
5958*4882a593Smuzhiyun return -ENOENT;
5959*4882a593Smuzhiyun leaf = path->nodes[0];
5960*4882a593Smuzhiyun slot = path->slots[0];
5961*4882a593Smuzhiyun }
5962*4882a593Smuzhiyun
5963*4882a593Smuzhiyun btrfs_item_key_to_cpu(leaf, &found_key, slot);
5964*4882a593Smuzhiyun if (found_key.objectid != search_key.objectid ||
5965*4882a593Smuzhiyun found_key.type != BTRFS_INODE_REF_KEY)
5966*4882a593Smuzhiyun return -ENOENT;
5967*4882a593Smuzhiyun }
5968*4882a593Smuzhiyun return 0;
5969*4882a593Smuzhiyun }
5970*4882a593Smuzhiyun
log_new_ancestors_fast(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct dentry * parent,struct btrfs_log_ctx * ctx)5971*4882a593Smuzhiyun static int log_new_ancestors_fast(struct btrfs_trans_handle *trans,
5972*4882a593Smuzhiyun struct btrfs_inode *inode,
5973*4882a593Smuzhiyun struct dentry *parent,
5974*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
5975*4882a593Smuzhiyun {
5976*4882a593Smuzhiyun struct btrfs_root *root = inode->root;
5977*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = root->fs_info;
5978*4882a593Smuzhiyun struct dentry *old_parent = NULL;
5979*4882a593Smuzhiyun struct super_block *sb = inode->vfs_inode.i_sb;
5980*4882a593Smuzhiyun int ret = 0;
5981*4882a593Smuzhiyun
5982*4882a593Smuzhiyun while (true) {
5983*4882a593Smuzhiyun if (!parent || d_really_is_negative(parent) ||
5984*4882a593Smuzhiyun sb != parent->d_sb)
5985*4882a593Smuzhiyun break;
5986*4882a593Smuzhiyun
5987*4882a593Smuzhiyun inode = BTRFS_I(d_inode(parent));
5988*4882a593Smuzhiyun if (root != inode->root)
5989*4882a593Smuzhiyun break;
5990*4882a593Smuzhiyun
5991*4882a593Smuzhiyun if (inode->generation > fs_info->last_trans_committed) {
5992*4882a593Smuzhiyun ret = btrfs_log_inode(trans, root, inode,
5993*4882a593Smuzhiyun LOG_INODE_EXISTS, ctx);
5994*4882a593Smuzhiyun if (ret)
5995*4882a593Smuzhiyun break;
5996*4882a593Smuzhiyun }
5997*4882a593Smuzhiyun if (IS_ROOT(parent))
5998*4882a593Smuzhiyun break;
5999*4882a593Smuzhiyun
6000*4882a593Smuzhiyun parent = dget_parent(parent);
6001*4882a593Smuzhiyun dput(old_parent);
6002*4882a593Smuzhiyun old_parent = parent;
6003*4882a593Smuzhiyun }
6004*4882a593Smuzhiyun dput(old_parent);
6005*4882a593Smuzhiyun
6006*4882a593Smuzhiyun return ret;
6007*4882a593Smuzhiyun }
6008*4882a593Smuzhiyun
log_all_new_ancestors(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct dentry * parent,struct btrfs_log_ctx * ctx)6009*4882a593Smuzhiyun static int log_all_new_ancestors(struct btrfs_trans_handle *trans,
6010*4882a593Smuzhiyun struct btrfs_inode *inode,
6011*4882a593Smuzhiyun struct dentry *parent,
6012*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
6013*4882a593Smuzhiyun {
6014*4882a593Smuzhiyun struct btrfs_root *root = inode->root;
6015*4882a593Smuzhiyun const u64 ino = btrfs_ino(inode);
6016*4882a593Smuzhiyun struct btrfs_path *path;
6017*4882a593Smuzhiyun struct btrfs_key search_key;
6018*4882a593Smuzhiyun int ret;
6019*4882a593Smuzhiyun
6020*4882a593Smuzhiyun /*
6021*4882a593Smuzhiyun * For a single hard link case, go through a fast path that does not
6022*4882a593Smuzhiyun * need to iterate the fs/subvolume tree.
6023*4882a593Smuzhiyun */
6024*4882a593Smuzhiyun if (inode->vfs_inode.i_nlink < 2)
6025*4882a593Smuzhiyun return log_new_ancestors_fast(trans, inode, parent, ctx);
6026*4882a593Smuzhiyun
6027*4882a593Smuzhiyun path = btrfs_alloc_path();
6028*4882a593Smuzhiyun if (!path)
6029*4882a593Smuzhiyun return -ENOMEM;
6030*4882a593Smuzhiyun
6031*4882a593Smuzhiyun search_key.objectid = ino;
6032*4882a593Smuzhiyun search_key.type = BTRFS_INODE_REF_KEY;
6033*4882a593Smuzhiyun search_key.offset = 0;
6034*4882a593Smuzhiyun again:
6035*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
6036*4882a593Smuzhiyun if (ret < 0)
6037*4882a593Smuzhiyun goto out;
6038*4882a593Smuzhiyun if (ret == 0)
6039*4882a593Smuzhiyun path->slots[0]++;
6040*4882a593Smuzhiyun
6041*4882a593Smuzhiyun while (true) {
6042*4882a593Smuzhiyun struct extent_buffer *leaf = path->nodes[0];
6043*4882a593Smuzhiyun int slot = path->slots[0];
6044*4882a593Smuzhiyun struct btrfs_key found_key;
6045*4882a593Smuzhiyun
6046*4882a593Smuzhiyun if (slot >= btrfs_header_nritems(leaf)) {
6047*4882a593Smuzhiyun ret = btrfs_next_leaf(root, path);
6048*4882a593Smuzhiyun if (ret < 0)
6049*4882a593Smuzhiyun goto out;
6050*4882a593Smuzhiyun else if (ret > 0)
6051*4882a593Smuzhiyun break;
6052*4882a593Smuzhiyun continue;
6053*4882a593Smuzhiyun }
6054*4882a593Smuzhiyun
6055*4882a593Smuzhiyun btrfs_item_key_to_cpu(leaf, &found_key, slot);
6056*4882a593Smuzhiyun if (found_key.objectid != ino ||
6057*4882a593Smuzhiyun found_key.type > BTRFS_INODE_EXTREF_KEY)
6058*4882a593Smuzhiyun break;
6059*4882a593Smuzhiyun
6060*4882a593Smuzhiyun /*
6061*4882a593Smuzhiyun * Don't deal with extended references because they are rare
6062*4882a593Smuzhiyun * cases and too complex to deal with (we would need to keep
6063*4882a593Smuzhiyun * track of which subitem we are processing for each item in
6064*4882a593Smuzhiyun * this loop, etc). So just return some error to fallback to
6065*4882a593Smuzhiyun * a transaction commit.
6066*4882a593Smuzhiyun */
6067*4882a593Smuzhiyun if (found_key.type == BTRFS_INODE_EXTREF_KEY) {
6068*4882a593Smuzhiyun ret = -EMLINK;
6069*4882a593Smuzhiyun goto out;
6070*4882a593Smuzhiyun }
6071*4882a593Smuzhiyun
6072*4882a593Smuzhiyun /*
6073*4882a593Smuzhiyun * Logging ancestors needs to do more searches on the fs/subvol
6074*4882a593Smuzhiyun * tree, so it releases the path as needed to avoid deadlocks.
6075*4882a593Smuzhiyun * Keep track of the last inode ref key and resume from that key
6076*4882a593Smuzhiyun * after logging all new ancestors for the current hard link.
6077*4882a593Smuzhiyun */
6078*4882a593Smuzhiyun memcpy(&search_key, &found_key, sizeof(search_key));
6079*4882a593Smuzhiyun
6080*4882a593Smuzhiyun ret = log_new_ancestors(trans, root, path, ctx);
6081*4882a593Smuzhiyun if (ret)
6082*4882a593Smuzhiyun goto out;
6083*4882a593Smuzhiyun btrfs_release_path(path);
6084*4882a593Smuzhiyun goto again;
6085*4882a593Smuzhiyun }
6086*4882a593Smuzhiyun ret = 0;
6087*4882a593Smuzhiyun out:
6088*4882a593Smuzhiyun btrfs_free_path(path);
6089*4882a593Smuzhiyun return ret;
6090*4882a593Smuzhiyun }
6091*4882a593Smuzhiyun
6092*4882a593Smuzhiyun /*
6093*4882a593Smuzhiyun * helper function around btrfs_log_inode to make sure newly created
6094*4882a593Smuzhiyun * parent directories also end up in the log. A minimal inode and backref
6095*4882a593Smuzhiyun * only logging is done of any parent directories that are older than
6096*4882a593Smuzhiyun * the last committed transaction
6097*4882a593Smuzhiyun */
btrfs_log_inode_parent(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct dentry * parent,int inode_only,struct btrfs_log_ctx * ctx)6098*4882a593Smuzhiyun static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
6099*4882a593Smuzhiyun struct btrfs_inode *inode,
6100*4882a593Smuzhiyun struct dentry *parent,
6101*4882a593Smuzhiyun int inode_only,
6102*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
6103*4882a593Smuzhiyun {
6104*4882a593Smuzhiyun struct btrfs_root *root = inode->root;
6105*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = root->fs_info;
6106*4882a593Smuzhiyun struct super_block *sb;
6107*4882a593Smuzhiyun int ret = 0;
6108*4882a593Smuzhiyun u64 last_committed = fs_info->last_trans_committed;
6109*4882a593Smuzhiyun bool log_dentries = false;
6110*4882a593Smuzhiyun
6111*4882a593Smuzhiyun sb = inode->vfs_inode.i_sb;
6112*4882a593Smuzhiyun
6113*4882a593Smuzhiyun if (btrfs_test_opt(fs_info, NOTREELOG)) {
6114*4882a593Smuzhiyun ret = 1;
6115*4882a593Smuzhiyun goto end_no_trans;
6116*4882a593Smuzhiyun }
6117*4882a593Smuzhiyun
6118*4882a593Smuzhiyun /*
6119*4882a593Smuzhiyun * The prev transaction commit doesn't complete, we need do
6120*4882a593Smuzhiyun * full commit by ourselves.
6121*4882a593Smuzhiyun */
6122*4882a593Smuzhiyun if (fs_info->last_trans_log_full_commit >
6123*4882a593Smuzhiyun fs_info->last_trans_committed) {
6124*4882a593Smuzhiyun ret = 1;
6125*4882a593Smuzhiyun goto end_no_trans;
6126*4882a593Smuzhiyun }
6127*4882a593Smuzhiyun
6128*4882a593Smuzhiyun if (btrfs_root_refs(&root->root_item) == 0) {
6129*4882a593Smuzhiyun ret = 1;
6130*4882a593Smuzhiyun goto end_no_trans;
6131*4882a593Smuzhiyun }
6132*4882a593Smuzhiyun
6133*4882a593Smuzhiyun ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
6134*4882a593Smuzhiyun last_committed);
6135*4882a593Smuzhiyun if (ret)
6136*4882a593Smuzhiyun goto end_no_trans;
6137*4882a593Smuzhiyun
6138*4882a593Smuzhiyun /*
6139*4882a593Smuzhiyun * Skip already logged inodes or inodes corresponding to tmpfiles
6140*4882a593Smuzhiyun * (since logging them is pointless, a link count of 0 means they
6141*4882a593Smuzhiyun * will never be accessible).
6142*4882a593Smuzhiyun */
6143*4882a593Smuzhiyun if ((btrfs_inode_in_log(inode, trans->transid) &&
6144*4882a593Smuzhiyun list_empty(&ctx->ordered_extents)) ||
6145*4882a593Smuzhiyun inode->vfs_inode.i_nlink == 0) {
6146*4882a593Smuzhiyun ret = BTRFS_NO_LOG_SYNC;
6147*4882a593Smuzhiyun goto end_no_trans;
6148*4882a593Smuzhiyun }
6149*4882a593Smuzhiyun
6150*4882a593Smuzhiyun ret = start_log_trans(trans, root, ctx);
6151*4882a593Smuzhiyun if (ret)
6152*4882a593Smuzhiyun goto end_no_trans;
6153*4882a593Smuzhiyun
6154*4882a593Smuzhiyun ret = btrfs_log_inode(trans, root, inode, inode_only, ctx);
6155*4882a593Smuzhiyun if (ret)
6156*4882a593Smuzhiyun goto end_trans;
6157*4882a593Smuzhiyun
6158*4882a593Smuzhiyun /*
6159*4882a593Smuzhiyun * for regular files, if its inode is already on disk, we don't
6160*4882a593Smuzhiyun * have to worry about the parents at all. This is because
6161*4882a593Smuzhiyun * we can use the last_unlink_trans field to record renames
6162*4882a593Smuzhiyun * and other fun in this file.
6163*4882a593Smuzhiyun */
6164*4882a593Smuzhiyun if (S_ISREG(inode->vfs_inode.i_mode) &&
6165*4882a593Smuzhiyun inode->generation <= last_committed &&
6166*4882a593Smuzhiyun inode->last_unlink_trans <= last_committed) {
6167*4882a593Smuzhiyun ret = 0;
6168*4882a593Smuzhiyun goto end_trans;
6169*4882a593Smuzhiyun }
6170*4882a593Smuzhiyun
6171*4882a593Smuzhiyun if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
6172*4882a593Smuzhiyun log_dentries = true;
6173*4882a593Smuzhiyun
6174*4882a593Smuzhiyun /*
6175*4882a593Smuzhiyun * On unlink we must make sure all our current and old parent directory
6176*4882a593Smuzhiyun * inodes are fully logged. This is to prevent leaving dangling
6177*4882a593Smuzhiyun * directory index entries in directories that were our parents but are
6178*4882a593Smuzhiyun * not anymore. Not doing this results in old parent directory being
6179*4882a593Smuzhiyun * impossible to delete after log replay (rmdir will always fail with
6180*4882a593Smuzhiyun * error -ENOTEMPTY).
6181*4882a593Smuzhiyun *
6182*4882a593Smuzhiyun * Example 1:
6183*4882a593Smuzhiyun *
6184*4882a593Smuzhiyun * mkdir testdir
6185*4882a593Smuzhiyun * touch testdir/foo
6186*4882a593Smuzhiyun * ln testdir/foo testdir/bar
6187*4882a593Smuzhiyun * sync
6188*4882a593Smuzhiyun * unlink testdir/bar
6189*4882a593Smuzhiyun * xfs_io -c fsync testdir/foo
6190*4882a593Smuzhiyun * <power failure>
6191*4882a593Smuzhiyun * mount fs, triggers log replay
6192*4882a593Smuzhiyun *
6193*4882a593Smuzhiyun * If we don't log the parent directory (testdir), after log replay the
6194*4882a593Smuzhiyun * directory still has an entry pointing to the file inode using the bar
6195*4882a593Smuzhiyun * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
6196*4882a593Smuzhiyun * the file inode has a link count of 1.
6197*4882a593Smuzhiyun *
6198*4882a593Smuzhiyun * Example 2:
6199*4882a593Smuzhiyun *
6200*4882a593Smuzhiyun * mkdir testdir
6201*4882a593Smuzhiyun * touch foo
6202*4882a593Smuzhiyun * ln foo testdir/foo2
6203*4882a593Smuzhiyun * ln foo testdir/foo3
6204*4882a593Smuzhiyun * sync
6205*4882a593Smuzhiyun * unlink testdir/foo3
6206*4882a593Smuzhiyun * xfs_io -c fsync foo
6207*4882a593Smuzhiyun * <power failure>
6208*4882a593Smuzhiyun * mount fs, triggers log replay
6209*4882a593Smuzhiyun *
6210*4882a593Smuzhiyun * Similar as the first example, after log replay the parent directory
6211*4882a593Smuzhiyun * testdir still has an entry pointing to the inode file with name foo3
6212*4882a593Smuzhiyun * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
6213*4882a593Smuzhiyun * and has a link count of 2.
6214*4882a593Smuzhiyun */
6215*4882a593Smuzhiyun if (inode->last_unlink_trans > last_committed) {
6216*4882a593Smuzhiyun ret = btrfs_log_all_parents(trans, inode, ctx);
6217*4882a593Smuzhiyun if (ret)
6218*4882a593Smuzhiyun goto end_trans;
6219*4882a593Smuzhiyun }
6220*4882a593Smuzhiyun
6221*4882a593Smuzhiyun ret = log_all_new_ancestors(trans, inode, parent, ctx);
6222*4882a593Smuzhiyun if (ret)
6223*4882a593Smuzhiyun goto end_trans;
6224*4882a593Smuzhiyun
6225*4882a593Smuzhiyun if (log_dentries)
6226*4882a593Smuzhiyun ret = log_new_dir_dentries(trans, root, inode, ctx);
6227*4882a593Smuzhiyun else
6228*4882a593Smuzhiyun ret = 0;
6229*4882a593Smuzhiyun end_trans:
6230*4882a593Smuzhiyun if (ret < 0) {
6231*4882a593Smuzhiyun btrfs_set_log_full_commit(trans);
6232*4882a593Smuzhiyun ret = 1;
6233*4882a593Smuzhiyun }
6234*4882a593Smuzhiyun
6235*4882a593Smuzhiyun if (ret)
6236*4882a593Smuzhiyun btrfs_remove_log_ctx(root, ctx);
6237*4882a593Smuzhiyun btrfs_end_log_trans(root);
6238*4882a593Smuzhiyun end_no_trans:
6239*4882a593Smuzhiyun return ret;
6240*4882a593Smuzhiyun }
6241*4882a593Smuzhiyun
6242*4882a593Smuzhiyun /*
6243*4882a593Smuzhiyun * it is not safe to log dentry if the chunk root has added new
6244*4882a593Smuzhiyun * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
6245*4882a593Smuzhiyun * If this returns 1, you must commit the transaction to safely get your
6246*4882a593Smuzhiyun * data on disk.
6247*4882a593Smuzhiyun */
btrfs_log_dentry_safe(struct btrfs_trans_handle * trans,struct dentry * dentry,struct btrfs_log_ctx * ctx)6248*4882a593Smuzhiyun int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
6249*4882a593Smuzhiyun struct dentry *dentry,
6250*4882a593Smuzhiyun struct btrfs_log_ctx *ctx)
6251*4882a593Smuzhiyun {
6252*4882a593Smuzhiyun struct dentry *parent = dget_parent(dentry);
6253*4882a593Smuzhiyun int ret;
6254*4882a593Smuzhiyun
6255*4882a593Smuzhiyun ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
6256*4882a593Smuzhiyun LOG_INODE_ALL, ctx);
6257*4882a593Smuzhiyun dput(parent);
6258*4882a593Smuzhiyun
6259*4882a593Smuzhiyun return ret;
6260*4882a593Smuzhiyun }
6261*4882a593Smuzhiyun
6262*4882a593Smuzhiyun /*
6263*4882a593Smuzhiyun * should be called during mount to recover any replay any log trees
6264*4882a593Smuzhiyun * from the FS
6265*4882a593Smuzhiyun */
btrfs_recover_log_trees(struct btrfs_root * log_root_tree)6266*4882a593Smuzhiyun int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
6267*4882a593Smuzhiyun {
6268*4882a593Smuzhiyun int ret;
6269*4882a593Smuzhiyun struct btrfs_path *path;
6270*4882a593Smuzhiyun struct btrfs_trans_handle *trans;
6271*4882a593Smuzhiyun struct btrfs_key key;
6272*4882a593Smuzhiyun struct btrfs_key found_key;
6273*4882a593Smuzhiyun struct btrfs_root *log;
6274*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
6275*4882a593Smuzhiyun struct walk_control wc = {
6276*4882a593Smuzhiyun .process_func = process_one_buffer,
6277*4882a593Smuzhiyun .stage = LOG_WALK_PIN_ONLY,
6278*4882a593Smuzhiyun };
6279*4882a593Smuzhiyun
6280*4882a593Smuzhiyun path = btrfs_alloc_path();
6281*4882a593Smuzhiyun if (!path)
6282*4882a593Smuzhiyun return -ENOMEM;
6283*4882a593Smuzhiyun
6284*4882a593Smuzhiyun set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6285*4882a593Smuzhiyun
6286*4882a593Smuzhiyun trans = btrfs_start_transaction(fs_info->tree_root, 0);
6287*4882a593Smuzhiyun if (IS_ERR(trans)) {
6288*4882a593Smuzhiyun ret = PTR_ERR(trans);
6289*4882a593Smuzhiyun goto error;
6290*4882a593Smuzhiyun }
6291*4882a593Smuzhiyun
6292*4882a593Smuzhiyun wc.trans = trans;
6293*4882a593Smuzhiyun wc.pin = 1;
6294*4882a593Smuzhiyun
6295*4882a593Smuzhiyun ret = walk_log_tree(trans, log_root_tree, &wc);
6296*4882a593Smuzhiyun if (ret) {
6297*4882a593Smuzhiyun btrfs_handle_fs_error(fs_info, ret,
6298*4882a593Smuzhiyun "Failed to pin buffers while recovering log root tree.");
6299*4882a593Smuzhiyun goto error;
6300*4882a593Smuzhiyun }
6301*4882a593Smuzhiyun
6302*4882a593Smuzhiyun again:
6303*4882a593Smuzhiyun key.objectid = BTRFS_TREE_LOG_OBJECTID;
6304*4882a593Smuzhiyun key.offset = (u64)-1;
6305*4882a593Smuzhiyun key.type = BTRFS_ROOT_ITEM_KEY;
6306*4882a593Smuzhiyun
6307*4882a593Smuzhiyun while (1) {
6308*4882a593Smuzhiyun ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
6309*4882a593Smuzhiyun
6310*4882a593Smuzhiyun if (ret < 0) {
6311*4882a593Smuzhiyun btrfs_handle_fs_error(fs_info, ret,
6312*4882a593Smuzhiyun "Couldn't find tree log root.");
6313*4882a593Smuzhiyun goto error;
6314*4882a593Smuzhiyun }
6315*4882a593Smuzhiyun if (ret > 0) {
6316*4882a593Smuzhiyun if (path->slots[0] == 0)
6317*4882a593Smuzhiyun break;
6318*4882a593Smuzhiyun path->slots[0]--;
6319*4882a593Smuzhiyun }
6320*4882a593Smuzhiyun btrfs_item_key_to_cpu(path->nodes[0], &found_key,
6321*4882a593Smuzhiyun path->slots[0]);
6322*4882a593Smuzhiyun btrfs_release_path(path);
6323*4882a593Smuzhiyun if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
6324*4882a593Smuzhiyun break;
6325*4882a593Smuzhiyun
6326*4882a593Smuzhiyun log = btrfs_read_tree_root(log_root_tree, &found_key);
6327*4882a593Smuzhiyun if (IS_ERR(log)) {
6328*4882a593Smuzhiyun ret = PTR_ERR(log);
6329*4882a593Smuzhiyun btrfs_handle_fs_error(fs_info, ret,
6330*4882a593Smuzhiyun "Couldn't read tree log root.");
6331*4882a593Smuzhiyun goto error;
6332*4882a593Smuzhiyun }
6333*4882a593Smuzhiyun
6334*4882a593Smuzhiyun wc.replay_dest = btrfs_get_fs_root(fs_info, found_key.offset,
6335*4882a593Smuzhiyun true);
6336*4882a593Smuzhiyun if (IS_ERR(wc.replay_dest)) {
6337*4882a593Smuzhiyun ret = PTR_ERR(wc.replay_dest);
6338*4882a593Smuzhiyun
6339*4882a593Smuzhiyun /*
6340*4882a593Smuzhiyun * We didn't find the subvol, likely because it was
6341*4882a593Smuzhiyun * deleted. This is ok, simply skip this log and go to
6342*4882a593Smuzhiyun * the next one.
6343*4882a593Smuzhiyun *
6344*4882a593Smuzhiyun * We need to exclude the root because we can't have
6345*4882a593Smuzhiyun * other log replays overwriting this log as we'll read
6346*4882a593Smuzhiyun * it back in a few more times. This will keep our
6347*4882a593Smuzhiyun * block from being modified, and we'll just bail for
6348*4882a593Smuzhiyun * each subsequent pass.
6349*4882a593Smuzhiyun */
6350*4882a593Smuzhiyun if (ret == -ENOENT)
6351*4882a593Smuzhiyun ret = btrfs_pin_extent_for_log_replay(trans,
6352*4882a593Smuzhiyun log->node->start,
6353*4882a593Smuzhiyun log->node->len);
6354*4882a593Smuzhiyun btrfs_put_root(log);
6355*4882a593Smuzhiyun
6356*4882a593Smuzhiyun if (!ret)
6357*4882a593Smuzhiyun goto next;
6358*4882a593Smuzhiyun btrfs_handle_fs_error(fs_info, ret,
6359*4882a593Smuzhiyun "Couldn't read target root for tree log recovery.");
6360*4882a593Smuzhiyun goto error;
6361*4882a593Smuzhiyun }
6362*4882a593Smuzhiyun
6363*4882a593Smuzhiyun wc.replay_dest->log_root = log;
6364*4882a593Smuzhiyun btrfs_record_root_in_trans(trans, wc.replay_dest);
6365*4882a593Smuzhiyun ret = walk_log_tree(trans, log, &wc);
6366*4882a593Smuzhiyun
6367*4882a593Smuzhiyun if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6368*4882a593Smuzhiyun ret = fixup_inode_link_counts(trans, wc.replay_dest,
6369*4882a593Smuzhiyun path);
6370*4882a593Smuzhiyun }
6371*4882a593Smuzhiyun
6372*4882a593Smuzhiyun if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6373*4882a593Smuzhiyun struct btrfs_root *root = wc.replay_dest;
6374*4882a593Smuzhiyun
6375*4882a593Smuzhiyun btrfs_release_path(path);
6376*4882a593Smuzhiyun
6377*4882a593Smuzhiyun /*
6378*4882a593Smuzhiyun * We have just replayed everything, and the highest
6379*4882a593Smuzhiyun * objectid of fs roots probably has changed in case
6380*4882a593Smuzhiyun * some inode_item's got replayed.
6381*4882a593Smuzhiyun *
6382*4882a593Smuzhiyun * root->objectid_mutex is not acquired as log replay
6383*4882a593Smuzhiyun * could only happen during mount.
6384*4882a593Smuzhiyun */
6385*4882a593Smuzhiyun ret = btrfs_find_highest_objectid(root,
6386*4882a593Smuzhiyun &root->highest_objectid);
6387*4882a593Smuzhiyun }
6388*4882a593Smuzhiyun
6389*4882a593Smuzhiyun wc.replay_dest->log_root = NULL;
6390*4882a593Smuzhiyun btrfs_put_root(wc.replay_dest);
6391*4882a593Smuzhiyun btrfs_put_root(log);
6392*4882a593Smuzhiyun
6393*4882a593Smuzhiyun if (ret)
6394*4882a593Smuzhiyun goto error;
6395*4882a593Smuzhiyun next:
6396*4882a593Smuzhiyun if (found_key.offset == 0)
6397*4882a593Smuzhiyun break;
6398*4882a593Smuzhiyun key.offset = found_key.offset - 1;
6399*4882a593Smuzhiyun }
6400*4882a593Smuzhiyun btrfs_release_path(path);
6401*4882a593Smuzhiyun
6402*4882a593Smuzhiyun /* step one is to pin it all, step two is to replay just inodes */
6403*4882a593Smuzhiyun if (wc.pin) {
6404*4882a593Smuzhiyun wc.pin = 0;
6405*4882a593Smuzhiyun wc.process_func = replay_one_buffer;
6406*4882a593Smuzhiyun wc.stage = LOG_WALK_REPLAY_INODES;
6407*4882a593Smuzhiyun goto again;
6408*4882a593Smuzhiyun }
6409*4882a593Smuzhiyun /* step three is to replay everything */
6410*4882a593Smuzhiyun if (wc.stage < LOG_WALK_REPLAY_ALL) {
6411*4882a593Smuzhiyun wc.stage++;
6412*4882a593Smuzhiyun goto again;
6413*4882a593Smuzhiyun }
6414*4882a593Smuzhiyun
6415*4882a593Smuzhiyun btrfs_free_path(path);
6416*4882a593Smuzhiyun
6417*4882a593Smuzhiyun /* step 4: commit the transaction, which also unpins the blocks */
6418*4882a593Smuzhiyun ret = btrfs_commit_transaction(trans);
6419*4882a593Smuzhiyun if (ret)
6420*4882a593Smuzhiyun return ret;
6421*4882a593Smuzhiyun
6422*4882a593Smuzhiyun log_root_tree->log_root = NULL;
6423*4882a593Smuzhiyun clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6424*4882a593Smuzhiyun btrfs_put_root(log_root_tree);
6425*4882a593Smuzhiyun
6426*4882a593Smuzhiyun return 0;
6427*4882a593Smuzhiyun error:
6428*4882a593Smuzhiyun if (wc.trans)
6429*4882a593Smuzhiyun btrfs_end_transaction(wc.trans);
6430*4882a593Smuzhiyun clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
6431*4882a593Smuzhiyun btrfs_free_path(path);
6432*4882a593Smuzhiyun return ret;
6433*4882a593Smuzhiyun }
6434*4882a593Smuzhiyun
6435*4882a593Smuzhiyun /*
6436*4882a593Smuzhiyun * there are some corner cases where we want to force a full
6437*4882a593Smuzhiyun * commit instead of allowing a directory to be logged.
6438*4882a593Smuzhiyun *
6439*4882a593Smuzhiyun * They revolve around files there were unlinked from the directory, and
6440*4882a593Smuzhiyun * this function updates the parent directory so that a full commit is
6441*4882a593Smuzhiyun * properly done if it is fsync'd later after the unlinks are done.
6442*4882a593Smuzhiyun *
6443*4882a593Smuzhiyun * Must be called before the unlink operations (updates to the subvolume tree,
6444*4882a593Smuzhiyun * inodes, etc) are done.
6445*4882a593Smuzhiyun */
btrfs_record_unlink_dir(struct btrfs_trans_handle * trans,struct btrfs_inode * dir,struct btrfs_inode * inode,int for_rename)6446*4882a593Smuzhiyun void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
6447*4882a593Smuzhiyun struct btrfs_inode *dir, struct btrfs_inode *inode,
6448*4882a593Smuzhiyun int for_rename)
6449*4882a593Smuzhiyun {
6450*4882a593Smuzhiyun /*
6451*4882a593Smuzhiyun * when we're logging a file, if it hasn't been renamed
6452*4882a593Smuzhiyun * or unlinked, and its inode is fully committed on disk,
6453*4882a593Smuzhiyun * we don't have to worry about walking up the directory chain
6454*4882a593Smuzhiyun * to log its parents.
6455*4882a593Smuzhiyun *
6456*4882a593Smuzhiyun * So, we use the last_unlink_trans field to put this transid
6457*4882a593Smuzhiyun * into the file. When the file is logged we check it and
6458*4882a593Smuzhiyun * don't log the parents if the file is fully on disk.
6459*4882a593Smuzhiyun */
6460*4882a593Smuzhiyun mutex_lock(&inode->log_mutex);
6461*4882a593Smuzhiyun inode->last_unlink_trans = trans->transid;
6462*4882a593Smuzhiyun mutex_unlock(&inode->log_mutex);
6463*4882a593Smuzhiyun
6464*4882a593Smuzhiyun /*
6465*4882a593Smuzhiyun * if this directory was already logged any new
6466*4882a593Smuzhiyun * names for this file/dir will get recorded
6467*4882a593Smuzhiyun */
6468*4882a593Smuzhiyun if (dir->logged_trans == trans->transid)
6469*4882a593Smuzhiyun return;
6470*4882a593Smuzhiyun
6471*4882a593Smuzhiyun /*
6472*4882a593Smuzhiyun * if the inode we're about to unlink was logged,
6473*4882a593Smuzhiyun * the log will be properly updated for any new names
6474*4882a593Smuzhiyun */
6475*4882a593Smuzhiyun if (inode->logged_trans == trans->transid)
6476*4882a593Smuzhiyun return;
6477*4882a593Smuzhiyun
6478*4882a593Smuzhiyun /*
6479*4882a593Smuzhiyun * when renaming files across directories, if the directory
6480*4882a593Smuzhiyun * there we're unlinking from gets fsync'd later on, there's
6481*4882a593Smuzhiyun * no way to find the destination directory later and fsync it
6482*4882a593Smuzhiyun * properly. So, we have to be conservative and force commits
6483*4882a593Smuzhiyun * so the new name gets discovered.
6484*4882a593Smuzhiyun */
6485*4882a593Smuzhiyun if (for_rename)
6486*4882a593Smuzhiyun goto record;
6487*4882a593Smuzhiyun
6488*4882a593Smuzhiyun /* we can safely do the unlink without any special recording */
6489*4882a593Smuzhiyun return;
6490*4882a593Smuzhiyun
6491*4882a593Smuzhiyun record:
6492*4882a593Smuzhiyun mutex_lock(&dir->log_mutex);
6493*4882a593Smuzhiyun dir->last_unlink_trans = trans->transid;
6494*4882a593Smuzhiyun mutex_unlock(&dir->log_mutex);
6495*4882a593Smuzhiyun }
6496*4882a593Smuzhiyun
6497*4882a593Smuzhiyun /*
6498*4882a593Smuzhiyun * Make sure that if someone attempts to fsync the parent directory of a deleted
6499*4882a593Smuzhiyun * snapshot, it ends up triggering a transaction commit. This is to guarantee
6500*4882a593Smuzhiyun * that after replaying the log tree of the parent directory's root we will not
6501*4882a593Smuzhiyun * see the snapshot anymore and at log replay time we will not see any log tree
6502*4882a593Smuzhiyun * corresponding to the deleted snapshot's root, which could lead to replaying
6503*4882a593Smuzhiyun * it after replaying the log tree of the parent directory (which would replay
6504*4882a593Smuzhiyun * the snapshot delete operation).
6505*4882a593Smuzhiyun *
6506*4882a593Smuzhiyun * Must be called before the actual snapshot destroy operation (updates to the
6507*4882a593Smuzhiyun * parent root and tree of tree roots trees, etc) are done.
6508*4882a593Smuzhiyun */
btrfs_record_snapshot_destroy(struct btrfs_trans_handle * trans,struct btrfs_inode * dir)6509*4882a593Smuzhiyun void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
6510*4882a593Smuzhiyun struct btrfs_inode *dir)
6511*4882a593Smuzhiyun {
6512*4882a593Smuzhiyun mutex_lock(&dir->log_mutex);
6513*4882a593Smuzhiyun dir->last_unlink_trans = trans->transid;
6514*4882a593Smuzhiyun mutex_unlock(&dir->log_mutex);
6515*4882a593Smuzhiyun }
6516*4882a593Smuzhiyun
6517*4882a593Smuzhiyun /*
6518*4882a593Smuzhiyun * Call this after adding a new name for a file and it will properly
6519*4882a593Smuzhiyun * update the log to reflect the new name.
6520*4882a593Smuzhiyun */
btrfs_log_new_name(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,struct btrfs_inode * old_dir,struct dentry * parent)6521*4882a593Smuzhiyun void btrfs_log_new_name(struct btrfs_trans_handle *trans,
6522*4882a593Smuzhiyun struct btrfs_inode *inode, struct btrfs_inode *old_dir,
6523*4882a593Smuzhiyun struct dentry *parent)
6524*4882a593Smuzhiyun {
6525*4882a593Smuzhiyun struct btrfs_log_ctx ctx;
6526*4882a593Smuzhiyun
6527*4882a593Smuzhiyun /*
6528*4882a593Smuzhiyun * this will force the logging code to walk the dentry chain
6529*4882a593Smuzhiyun * up for the file
6530*4882a593Smuzhiyun */
6531*4882a593Smuzhiyun if (!S_ISDIR(inode->vfs_inode.i_mode))
6532*4882a593Smuzhiyun inode->last_unlink_trans = trans->transid;
6533*4882a593Smuzhiyun
6534*4882a593Smuzhiyun /*
6535*4882a593Smuzhiyun * if this inode hasn't been logged and directory we're renaming it
6536*4882a593Smuzhiyun * from hasn't been logged, we don't need to log it
6537*4882a593Smuzhiyun */
6538*4882a593Smuzhiyun if (!inode_logged(trans, inode) &&
6539*4882a593Smuzhiyun (!old_dir || !inode_logged(trans, old_dir)))
6540*4882a593Smuzhiyun return;
6541*4882a593Smuzhiyun
6542*4882a593Smuzhiyun btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
6543*4882a593Smuzhiyun ctx.logging_new_name = true;
6544*4882a593Smuzhiyun /*
6545*4882a593Smuzhiyun * We don't care about the return value. If we fail to log the new name
6546*4882a593Smuzhiyun * then we know the next attempt to sync the log will fallback to a full
6547*4882a593Smuzhiyun * transaction commit (due to a call to btrfs_set_log_full_commit()), so
6548*4882a593Smuzhiyun * we don't need to worry about getting a log committed that has an
6549*4882a593Smuzhiyun * inconsistent state after a rename operation.
6550*4882a593Smuzhiyun */
6551*4882a593Smuzhiyun btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx);
6552*4882a593Smuzhiyun }
6553*4882a593Smuzhiyun
6554