// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle. All rights reserved.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/btrfs.h>
#include <linux/uio.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
#include "compression.h"
#include "delalloc-space.h"
#include "reflink.h"

static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
 * When auto defrag is enabled we queue up these defrag structs to remember
 * which inodes need defragging passes.
 */
struct inode_defrag {
	struct rb_node rb_node;
	/* objectid */
	u64 ino;
	/*
	 * transid where the defrag was added, we search for
	 * extents newer than this
	 */
	u64 transid;

	/* root objectid */
	u64 root;

	/* last offset we were able to defrag */
	u64 last_offset;

	/* if we've wrapped around back to zero once already */
	int cycled;
};

static int __compare_inode_defrag(struct inode_defrag *defrag1,
				  struct inode_defrag *defrag2)
{
	if (defrag1->root > defrag2->root)
		return 1;
	else if (defrag1->root < defrag2->root)
		return -1;
	else if (defrag1->ino > defrag2->ino)
		return 1;
	else if (defrag1->ino < defrag2->ino)
		return -1;
	else
		return 0;
}

/*
 * Insert a record for an inode into the defrag tree.  The lock must be
 * held already.
 *
 * If you're inserting a record for an older transid than an existing
 * record, the transid already in the tree is lowered.
 *
 * If an existing record is found, the defrag item you pass in is freed.
 */
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
				    struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct inode_defrag *entry;
	struct rb_node **p;
	struct rb_node *parent = NULL;
	int ret;

	p = &fs_info->defrag_inodes.rb_node;
	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(defrag, entry);
		if (ret < 0)
			p = &parent->rb_left;
		else if (ret > 0)
			p = &parent->rb_right;
		else {
			/*
			 * If we're reinserting an entry for an old defrag run,
			 * make sure to lower the transid of our existing
			 * record.
			 */
			if (defrag->transid < entry->transid)
				entry->transid = defrag->transid;
			if (defrag->last_offset > entry->last_offset)
				entry->last_offset = defrag->last_offset;
			return -EEXIST;
		}
	}
	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
	rb_link_node(&defrag->rb_node, parent, p);
	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
	return 0;
}

static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
		return 0;

	if (btrfs_fs_closing(fs_info))
		return 0;

	return 1;
}

/*
 * insert a defrag record for this inode if auto defrag is
 * enabled
 */
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
			   struct btrfs_inode *inode)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode_defrag *defrag;
	u64 transid;
	int ret;

	if (!__need_auto_defrag(fs_info))
		return 0;

	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
		return 0;

	if (trans)
		transid = trans->transid;
	else
		transid = inode->root->last_trans;

	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
	if (!defrag)
		return -ENOMEM;

	defrag->ino = btrfs_ino(inode);
	defrag->transid = transid;
	defrag->root = root->root_key.objectid;

	spin_lock(&fs_info->defrag_inodes_lock);
	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
		/*
		 * If we set the IN_DEFRAG flag and then evict the inode from
		 * memory, the re-read inode won't have the IN_DEFRAG flag set.
		 * In that case we may still find an existing defrag record in
		 * the tree.
		 */
		ret = __btrfs_add_inode_defrag(inode, defrag);
		if (ret)
			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
	return 0;
}

/*
 * Requeue the defrag object. If there is a defrag object that points to
 * the same inode in the tree, we will merge them together (by
 * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
 */
static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
				       struct inode_defrag *defrag)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int ret;

	if (!__need_auto_defrag(fs_info))
		goto out;

	/*
	 * Here we don't check the IN_DEFRAG flag, because we want to merge
	 * the records together.
	 */
	spin_lock(&fs_info->defrag_inodes_lock);
	ret = __btrfs_add_inode_defrag(inode, defrag);
	spin_unlock(&fs_info->defrag_inodes_lock);
	if (ret)
		goto out;
	return;
out:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}

/*
 * Pick the defraggable inode that we want; if it doesn't exist, return
 * the next one.
 */
static struct inode_defrag *
btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
	struct inode_defrag *entry = NULL;
	struct inode_defrag tmp;
	struct rb_node *p;
	struct rb_node *parent = NULL;
	int ret;

	tmp.ino = ino;
	tmp.root = root;

	spin_lock(&fs_info->defrag_inodes_lock);
	p = fs_info->defrag_inodes.rb_node;
	while (p) {
		parent = p;
		entry = rb_entry(parent, struct inode_defrag, rb_node);

		ret = __compare_inode_defrag(&tmp, entry);
		if (ret < 0)
			p = parent->rb_left;
		else if (ret > 0)
			p = parent->rb_right;
		else
			goto out;
	}

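	/*
	 * No exact match: entry is the last node we visited.  If it sorts
	 * before the requested (root, ino), step to its in-order successor so
	 * that we return the first record at or after the requested position.
	 */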
	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
		parent = rb_next(parent);
		if (parent)
			entry = rb_entry(parent, struct inode_defrag, rb_node);
		else
			entry = NULL;
	}
out:
	if (entry)
		rb_erase(parent, &fs_info->defrag_inodes);
	spin_unlock(&fs_info->defrag_inodes_lock);
	return entry;
}

void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	struct rb_node *node;

	spin_lock(&fs_info->defrag_inodes_lock);
	node = rb_first(&fs_info->defrag_inodes);
	while (node) {
		rb_erase(node, &fs_info->defrag_inodes);
		defrag = rb_entry(node, struct inode_defrag, rb_node);
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);

		cond_resched_lock(&fs_info->defrag_inodes_lock);

		node = rb_first(&fs_info->defrag_inodes);
	}
	spin_unlock(&fs_info->defrag_inodes_lock);
}

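/*
 * Maximum amount of work (as counted by btrfs_defrag_file()) to do in one
 * pass before the inode is requeued by __btrfs_run_defrag_inode().
 */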
#define BTRFS_DEFRAG_BATCH	1024

static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
				    struct inode_defrag *defrag)
{
	struct btrfs_root *inode_root;
	struct inode *inode;
	struct btrfs_ioctl_defrag_range_args range;
	int num_defrag;
	int ret;

	/* get the inode */
	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
	if (IS_ERR(inode_root)) {
		ret = PTR_ERR(inode_root);
		goto cleanup;
	}

	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
	btrfs_put_root(inode_root);
	if (IS_ERR(inode)) {
		ret = PTR_ERR(inode);
		goto cleanup;
	}

	/* do a chunk of defrag */
	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
	memset(&range, 0, sizeof(range));
	range.len = (u64)-1;
	range.start = defrag->last_offset;

	sb_start_write(fs_info->sb);
	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
				       BTRFS_DEFRAG_BATCH);
	sb_end_write(fs_info->sb);
	/*
	 * if we filled the whole defrag batch, there
	 * must be more work to do.  Queue this defrag
	 * again
	 */
	if (num_defrag == BTRFS_DEFRAG_BATCH) {
		defrag->last_offset = range.start;
		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
	} else if (defrag->last_offset && !defrag->cycled) {
		/*
		 * we didn't fill our defrag batch, but
		 * we didn't start at zero.  Make sure we loop
		 * around to the start of the file.
		 */
		defrag->last_offset = 0;
		defrag->cycled = 1;
		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
	} else {
		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	}

	iput(inode);
	return 0;
cleanup:
	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
	return ret;
}

/*
 * run through the list of inodes in the FS that need
 * defragging
 */
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
	struct inode_defrag *defrag;
	u64 first_ino = 0;
	u64 root_objectid = 0;

	atomic_inc(&fs_info->defrag_running);
	while (1) {
		/* Pause the auto defragger. */
		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
			     &fs_info->fs_state))
			break;

		if (!__need_auto_defrag(fs_info))
			break;

		/* find an inode to defrag */
		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
						 first_ino);
		if (!defrag) {
			if (root_objectid || first_ino) {
				root_objectid = 0;
				first_ino = 0;
				continue;
			} else {
				break;
			}
		}

		first_ino = defrag->ino + 1;
		root_objectid = defrag->root;

		__btrfs_run_defrag_inode(fs_info, defrag);
	}
	atomic_dec(&fs_info->defrag_running);

	/*
	 * during unmount, we use the transaction_wait queue to
	 * wait for the defragger to stop
	 */
	wake_up(&fs_info->transaction_wait);
	return 0;
}

/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
					 struct page **prepared_pages,
					 struct iov_iter *i)
{
	size_t copied = 0;
	size_t total_copied = 0;
	int pg = 0;
	int offset = offset_in_page(pos);

	while (write_bytes > 0) {
		size_t count = min_t(size_t,
				     PAGE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[pg];
		/*
		 * Copy data from userspace to the current page
		 */
		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);

		/* Flush processor's dcache for this page */
		flush_dcache_page(page);

		/*
		 * if we get a partial write, we can end up with
		 * partially up to date pages.  These add
		 * a lot of complexity, so make sure they don't
		 * happen by forcing this copy to be retried.
		 *
		 * The rest of the btrfs_file_write code will fall
		 * back to page at a time copies after we return 0.
		 */
		if (!PageUptodate(page) && copied < count)
			copied = 0;

		iov_iter_advance(i, copied);
		write_bytes -= copied;
		total_copied += copied;

		/* Return to btrfs_file_write_iter to fault page */
		if (unlikely(copied == 0))
			break;

		if (copied < PAGE_SIZE - offset) {
			offset += copied;
		} else {
			pg++;
			offset = 0;
		}
	}
	return total_copied;
}

/*
 * unlocks pages after btrfs_file_write is done with them
 */
static void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
	size_t i;
	for (i = 0; i < num_pages; i++) {
		/*
		 * The "page checked" flag is some magic around finding pages
		 * that have been modified without going through
		 * btrfs_set_page_dirty; clear it here.  There should be no
		 * need to mark the pages accessed as prepare_pages should
		 * have marked them accessed in prepare_pages via
		 * find_or_create_page().
		 */
		ClearPageChecked(pages[i]);
		unlock_page(pages[i]);
		put_page(pages[i]);
	}
}

/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
		      size_t num_pages, loff_t pos, size_t write_bytes,
		      struct extent_state **cached)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	int err = 0;
	int i;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(&inode->vfs_inode);
	unsigned int extra_bits = 0;

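	/* Round the written range out to full sectors. */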
	start_pos = pos & ~((u64) fs_info->sectorsize - 1);
	num_bytes = round_up(write_bytes + pos - start_pos,
			     fs_info->sectorsize);

	end_of_last_block = start_pos + num_bytes - 1;

	/*
	 * The pages may have already been dirty, clear out old accounting so
	 * we can set things up properly
	 */
	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
			 0, 0, cached);

	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					extra_bits, cached);
	if (err)
		return err;

	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
	}

	/*
	 * we've only changed i_size in ram, and we haven't updated
	 * the disk i_size.  There is no need to log the inode
	 * at this time.
	 */
	if (end_pos > isize)
		i_size_write(&inode->vfs_inode, end_pos);
	return 0;
}

/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
			     int skip_pinned)
{
	struct extent_map *em;
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
	struct extent_map_tree *em_tree = &inode->extent_tree;
	u64 len = end - start + 1;
	u64 gen;
	int ret;
	int testend = 1;
	unsigned long flags;
	int compressed = 0;
	bool modified;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		len = (u64)-1;
		testend = 0;
	}
	while (1) {
		int no_splits = 0;

		modified = false;
		if (!split)
			split = alloc_extent_map();
		if (!split2)
			split2 = alloc_extent_map();
		if (!split || !split2)
			no_splits = 1;

		write_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em) {
			write_unlock(&em_tree->lock);
			break;
		}
		flags = em->flags;
		gen = em->generation;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			if (testend && em->start + em->len >= start + len) {
				free_extent_map(em);
				write_unlock(&em_tree->lock);
				break;
			}
			start = em->start + em->len;
			if (testend)
				len = start + len - (em->start + em->len);
			free_extent_map(em);
			write_unlock(&em_tree->lock);
			continue;
		}
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		clear_bit(EXTENT_FLAG_LOGGING, &flags);
		modified = !list_empty(&em->list);
		if (no_splits)
			goto next;

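		/*
		 * The extent map overlaps the start of the dropped range:
		 * keep the part in front of the range as a new (split) map.
		 */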
		if (em->start < start) {
			split->start = em->start;
			split->len = start - em->start;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_start = em->orig_start;
				split->block_start = em->block_start;

				if (compressed)
					split->block_len = em->block_len;
				else
					split->block_len = split->len;
				split->orig_block_len = max(split->block_len,
							    em->orig_block_len);
				split->ram_bytes = em->ram_bytes;
			} else {
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
				split->ram_bytes = split->len;
			}

			split->generation = gen;
			split->flags = flags;
			split->compress_type = em->compress_type;
			replace_extent_mapping(em_tree, em, split, modified);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
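		/*
		 * The extent map extends past the end of the dropped range:
		 * keep the tail as a second split map.
		 */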
		if (testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->flags = flags;
			split->compress_type = em->compress_type;
			split->generation = gen;

			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
				split->orig_block_len = max(em->block_len,
							    em->orig_block_len);

				split->ram_bytes = em->ram_bytes;
				if (compressed) {
					split->block_len = em->block_len;
					split->block_start = em->block_start;
					split->orig_start = em->orig_start;
				} else {
					split->block_len = split->len;
					split->block_start = em->block_start
							     + diff;
					split->orig_start = em->orig_start;
				}
			} else {
				split->ram_bytes = split->len;
				split->orig_start = split->start;
				split->block_len = 0;
				split->block_start = em->block_start;
				split->orig_block_len = 0;
			}

			if (extent_map_in_tree(em)) {
				replace_extent_mapping(em_tree, em, split,
						       modified);
			} else {
				ret = add_extent_mapping(em_tree, split,
							 modified);
				ASSERT(ret == 0); /* Logic error */
			}
			free_extent_map(split);
			split = NULL;
		}
next:
		if (extent_map_in_tree(em))
			remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);

		/* once for us */
		free_extent_map(em);
		/* once for the tree*/
		free_extent_map(em);
	}
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
}

/*
 * This is very complex, but the basic idea is to drop all extents
 * in the range start - end.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 */
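/*
 * On return, if @drop_end is not NULL, it is set to min(@end, end of the last
 * extent dropped or truncated), or to @end if no extent in the range was
 * found.
 */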
int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
			 struct btrfs_root *root, struct btrfs_inode *inode,
			 struct btrfs_path *path, u64 start, u64 end,
			 u64 *drop_end, int drop_cache,
			 int replace_extent,
			 u32 extent_item_size,
			 int *key_inserted)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	struct inode *vfs_inode = &inode->vfs_inode;
	u64 ino = btrfs_ino(inode);
	u64 search_start = start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	u64 last_end = start;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;
	int modify_tree = -1;
	int update_refs;
	int found = 0;
	int leafs_visited = 0;

	if (drop_cache)
		btrfs_drop_extent_cache(inode, start, end - 1, 0);

	if (start >= inode->disk_i_size && !replace_extent)
		modify_tree = 0;

	update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       search_start, modify_tree);
		if (ret < 0)
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
		leafs_visited++;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			BUG_ON(del_nr > 0);
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leafs_visited++;
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

		if (key.objectid > ino)
			break;
		if (WARN_ON_ONCE(key.objectid < ino) ||
		    key.type < BTRFS_EXTENT_DATA_KEY) {
			ASSERT(del_nr == 0);
			path->slots[0]++;
			goto next_slot;
		}
		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_ram_bytes(leaf, fi);
		} else {
			/* can't happen */
			BUG();
		}

		/*
		 * Don't skip extent items representing 0 byte lengths. They
		 * used to be created (bug) if while punching holes we hit
		 * -ENOSPC condition. So if we find one here, just ensure we
		 * delete it, otherwise we would insert a new file extent item
		 * with the same key (offset) as that 0 bytes length file
		 * extent item in the call to setup_items_for_insert() later
		 * in this function.
		 */
		if (extent_end == key.offset && extent_end >= search_start) {
			last_end = extent_end;
			goto delete_extent_item;
		}

		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		found = 1;
		search_start = max(key.offset, start);
		if (recow || !modify_tree) {
			modify_tree = -1;
			btrfs_release_path(path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end < extent_end) {
			BUG_ON(del_nr > 0);
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - start);
			btrfs_mark_buffer_dirty(leaf);

			if (update_refs && disk_bytenr > 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_ADD_DELAYED_REF,
						disk_bytenr, num_bytes, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						new_key.objectid,
						start - extent_offset);
				ret = btrfs_inc_extent_ref(trans, &ref);
				BUG_ON(ret); /* -ENOMEM */
			}
			key.offset = start;
		}
		/*
		 * From here on out we will have actually dropped something, so
		 * last_end can be updated.
		 */
		last_end = extent_end;

		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (start <= key.offset && end < extent_end) {
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = end;
			btrfs_set_item_key_safe(fs_info, path, &new_key);

			extent_offset += end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_mark_buffer_dirty(leaf);
			if (update_refs && disk_bytenr > 0)
				inode_sub_bytes(vfs_inode, end - key.offset);
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end >= extent_end) {
			BUG_ON(del_nr > 0);
			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				ret = -EOPNOTSUPP;
				break;
			}

			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_mark_buffer_dirty(leaf);
			if (update_refs && disk_bytenr > 0)
				inode_sub_bytes(vfs_inode, extent_end - start);
			if (end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (start <= key.offset && end >= extent_end) {
delete_extent_item:
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				BUG_ON(del_slot + del_nr != path->slots[0]);
				del_nr++;
			}

			if (update_refs &&
			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
				inode_sub_bytes(vfs_inode,
						extent_end - key.offset);
				extent_end = ALIGN(extent_end,
						   fs_info->sectorsize);
			} else if (update_refs && disk_bytenr > 0) {
				btrfs_init_generic_ref(&ref,
						BTRFS_DROP_DELAYED_REF,
						disk_bytenr, num_bytes, 0);
				btrfs_init_data_ref(&ref,
						root->root_key.objectid,
						key.objectid,
						key.offset - extent_offset);
				ret = btrfs_free_extent(trans, &ref);
				BUG_ON(ret); /* -ENOMEM */
				inode_sub_bytes(vfs_inode,
						extent_end - key.offset);
			}

			if (end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			if (ret) {
				btrfs_abort_transaction(trans, ret);
				break;
			}

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(path);
			continue;
		}

		BUG();
	}

	if (!ret && del_nr > 0) {
		/*
		 * Set path->slots[0] to first slot, so that after the delete
		 * if items are moved off from our leaf to its immediate left
		 * or right neighbor leaves, we end up with a correct and
		 * adjusted path->slots[0] for our insertion (if
		 * replace_extent != 0).
		 */
		path->slots[0] = del_slot;
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		if (ret)
			btrfs_abort_transaction(trans, ret);
	}

	leaf = path->nodes[0];
	/*
	 * If btrfs_del_items() was called, it might have deleted a leaf, in
	 * which case it unlocked our path, so check path->locks[0] matches a
	 * write lock.
	 */
	if (!ret && replace_extent && leafs_visited == 1 &&
	    (path->locks[0] == BTRFS_WRITE_LOCK_BLOCKING ||
	     path->locks[0] == BTRFS_WRITE_LOCK) &&
	    btrfs_leaf_free_space(leaf) >=
	    sizeof(struct btrfs_item) + extent_item_size) {

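		/*
		 * Reserve space in the leaf for the replacement file extent
		 * item at offset @start, so the caller can fill it in without
		 * doing another tree search.
		 */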
		key.objectid = ino;
		key.type = BTRFS_EXTENT_DATA_KEY;
		key.offset = start;
		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
			struct btrfs_key slot_key;

			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
				path->slots[0]++;
		}
		setup_items_for_insert(root, path, &key, &extent_item_size, 1);
		*key_inserted = 1;
	}

	if (!replace_extent || !(*key_inserted))
		btrfs_release_path(path);
	if (drop_end)
		*drop_end = found ? min(end, last_end) : end;
	return ret;
}

int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct inode *inode, u64 start,
		       u64 end, int drop_cache)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path, start,
				   end, NULL, drop_cache, 0, 0, NULL);
	btrfs_free_path(path);
	return ret;
}

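/*
 * Check whether the file extent item at @slot refers to the same physical
 * extent (same disk bytenr and original offset) as the one being written,
 * without compression, encryption or other encoding.  If *start or *end are
 * non-zero on entry they must match the item's range.  On success, return 1
 * and report the item's range through *start and *end.
 */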
static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *root = inode->root;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_ref ref = { 0 };
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret = 0;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
again:
	recow = 0;
	split = start;
	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.objectid != ino ||
	    key.type != BTRFS_EXTENT_DATA_KEY) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if (key.offset > start || extent_end < end) {
		ret = -EINVAL;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

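	/*
	 * The written range covers the front of the preallocated extent and
	 * the previous item maps the same physical extent: extend the previous
	 * item over [start, end) and shrink this one to [end, extent_end).
	 */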
	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(fs_info, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_generation(leaf, fi,
							 trans->transid);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

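	/*
	 * The written range covers the tail of the preallocated extent and
	 * the next item maps the same physical extent: extend the next item
	 * backwards over [start, end) and shrink this one to
	 * [key.offset, start).
	 */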
1163*4882a593Smuzhiyun if (start > key.offset && end == extent_end) {
1164*4882a593Smuzhiyun other_start = end;
1165*4882a593Smuzhiyun other_end = 0;
1166*4882a593Smuzhiyun if (extent_mergeable(leaf, path->slots[0] + 1,
1167*4882a593Smuzhiyun ino, bytenr, orig_offset,
1168*4882a593Smuzhiyun &other_start, &other_end)) {
1169*4882a593Smuzhiyun fi = btrfs_item_ptr(leaf, path->slots[0],
1170*4882a593Smuzhiyun struct btrfs_file_extent_item);
1171*4882a593Smuzhiyun btrfs_set_file_extent_num_bytes(leaf, fi,
1172*4882a593Smuzhiyun start - key.offset);
1173*4882a593Smuzhiyun btrfs_set_file_extent_generation(leaf, fi,
1174*4882a593Smuzhiyun trans->transid);
1175*4882a593Smuzhiyun path->slots[0]++;
1176*4882a593Smuzhiyun new_key.offset = start;
1177*4882a593Smuzhiyun btrfs_set_item_key_safe(fs_info, path, &new_key);
1178*4882a593Smuzhiyun
1179*4882a593Smuzhiyun fi = btrfs_item_ptr(leaf, path->slots[0],
1180*4882a593Smuzhiyun struct btrfs_file_extent_item);
1181*4882a593Smuzhiyun btrfs_set_file_extent_generation(leaf, fi,
1182*4882a593Smuzhiyun trans->transid);
1183*4882a593Smuzhiyun btrfs_set_file_extent_num_bytes(leaf, fi,
1184*4882a593Smuzhiyun other_end - start);
1185*4882a593Smuzhiyun btrfs_set_file_extent_offset(leaf, fi,
1186*4882a593Smuzhiyun start - orig_offset);
1187*4882a593Smuzhiyun btrfs_mark_buffer_dirty(leaf);
1188*4882a593Smuzhiyun goto out;
1189*4882a593Smuzhiyun }
1190*4882a593Smuzhiyun }
1191*4882a593Smuzhiyun
1192*4882a593Smuzhiyun while (start > key.offset || end < extent_end) {
1193*4882a593Smuzhiyun if (key.offset == start)
1194*4882a593Smuzhiyun split = end;
1195*4882a593Smuzhiyun
1196*4882a593Smuzhiyun new_key.offset = split;
1197*4882a593Smuzhiyun ret = btrfs_duplicate_item(trans, root, path, &new_key);
1198*4882a593Smuzhiyun if (ret == -EAGAIN) {
1199*4882a593Smuzhiyun btrfs_release_path(path);
1200*4882a593Smuzhiyun goto again;
1201*4882a593Smuzhiyun }
1202*4882a593Smuzhiyun if (ret < 0) {
1203*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
1204*4882a593Smuzhiyun goto out;
1205*4882a593Smuzhiyun }
1206*4882a593Smuzhiyun
1207*4882a593Smuzhiyun leaf = path->nodes[0];
1208*4882a593Smuzhiyun fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1209*4882a593Smuzhiyun struct btrfs_file_extent_item);
1210*4882a593Smuzhiyun btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1211*4882a593Smuzhiyun btrfs_set_file_extent_num_bytes(leaf, fi,
1212*4882a593Smuzhiyun split - key.offset);
1213*4882a593Smuzhiyun
1214*4882a593Smuzhiyun fi = btrfs_item_ptr(leaf, path->slots[0],
1215*4882a593Smuzhiyun struct btrfs_file_extent_item);
1216*4882a593Smuzhiyun
1217*4882a593Smuzhiyun btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1218*4882a593Smuzhiyun btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1219*4882a593Smuzhiyun btrfs_set_file_extent_num_bytes(leaf, fi,
1220*4882a593Smuzhiyun extent_end - split);
1221*4882a593Smuzhiyun btrfs_mark_buffer_dirty(leaf);
1222*4882a593Smuzhiyun
1223*4882a593Smuzhiyun btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
1224*4882a593Smuzhiyun num_bytes, 0);
1225*4882a593Smuzhiyun btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
1226*4882a593Smuzhiyun orig_offset);
1227*4882a593Smuzhiyun ret = btrfs_inc_extent_ref(trans, &ref);
1228*4882a593Smuzhiyun if (ret) {
1229*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
1230*4882a593Smuzhiyun goto out;
1231*4882a593Smuzhiyun }
1232*4882a593Smuzhiyun
1233*4882a593Smuzhiyun if (split == start) {
1234*4882a593Smuzhiyun key.offset = start;
1235*4882a593Smuzhiyun } else {
1236*4882a593Smuzhiyun if (start != key.offset) {
1237*4882a593Smuzhiyun ret = -EINVAL;
1238*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
1239*4882a593Smuzhiyun goto out;
1240*4882a593Smuzhiyun }
1241*4882a593Smuzhiyun path->slots[0]--;
1242*4882a593Smuzhiyun extent_end = end;
1243*4882a593Smuzhiyun }
1244*4882a593Smuzhiyun recow = 1;
1245*4882a593Smuzhiyun }
1246*4882a593Smuzhiyun
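	/*
	 * At this point path->slots[0] points to the item covering exactly
	 * [start, end).  Try to merge it with adjacent items that reference
	 * the same on-disk extent; every item merged away drops one reference
	 * on that extent.
	 */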
1247*4882a593Smuzhiyun other_start = end;
1248*4882a593Smuzhiyun other_end = 0;
1249*4882a593Smuzhiyun btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1250*4882a593Smuzhiyun num_bytes, 0);
1251*4882a593Smuzhiyun btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
1252*4882a593Smuzhiyun if (extent_mergeable(leaf, path->slots[0] + 1,
1253*4882a593Smuzhiyun ino, bytenr, orig_offset,
1254*4882a593Smuzhiyun &other_start, &other_end)) {
1255*4882a593Smuzhiyun if (recow) {
1256*4882a593Smuzhiyun btrfs_release_path(path);
1257*4882a593Smuzhiyun goto again;
1258*4882a593Smuzhiyun }
1259*4882a593Smuzhiyun extent_end = other_end;
1260*4882a593Smuzhiyun del_slot = path->slots[0] + 1;
1261*4882a593Smuzhiyun del_nr++;
1262*4882a593Smuzhiyun ret = btrfs_free_extent(trans, &ref);
1263*4882a593Smuzhiyun if (ret) {
1264*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
1265*4882a593Smuzhiyun goto out;
1266*4882a593Smuzhiyun }
1267*4882a593Smuzhiyun }
1268*4882a593Smuzhiyun other_start = 0;
1269*4882a593Smuzhiyun other_end = start;
1270*4882a593Smuzhiyun if (extent_mergeable(leaf, path->slots[0] - 1,
1271*4882a593Smuzhiyun ino, bytenr, orig_offset,
1272*4882a593Smuzhiyun &other_start, &other_end)) {
1273*4882a593Smuzhiyun if (recow) {
1274*4882a593Smuzhiyun btrfs_release_path(path);
1275*4882a593Smuzhiyun goto again;
1276*4882a593Smuzhiyun }
1277*4882a593Smuzhiyun key.offset = other_start;
1278*4882a593Smuzhiyun del_slot = path->slots[0];
1279*4882a593Smuzhiyun del_nr++;
1280*4882a593Smuzhiyun ret = btrfs_free_extent(trans, &ref);
1281*4882a593Smuzhiyun if (ret) {
1282*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
1283*4882a593Smuzhiyun goto out;
1284*4882a593Smuzhiyun }
1285*4882a593Smuzhiyun }
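	/*
	 * Mark the surviving item as a regular (written) extent.  If
	 * neighbours were merged away above, extend it to cover the merged
	 * range and delete the now-redundant items.
	 */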
1286*4882a593Smuzhiyun if (del_nr == 0) {
1287*4882a593Smuzhiyun fi = btrfs_item_ptr(leaf, path->slots[0],
1288*4882a593Smuzhiyun struct btrfs_file_extent_item);
1289*4882a593Smuzhiyun btrfs_set_file_extent_type(leaf, fi,
1290*4882a593Smuzhiyun BTRFS_FILE_EXTENT_REG);
1291*4882a593Smuzhiyun btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1292*4882a593Smuzhiyun btrfs_mark_buffer_dirty(leaf);
1293*4882a593Smuzhiyun } else {
1294*4882a593Smuzhiyun fi = btrfs_item_ptr(leaf, del_slot - 1,
1295*4882a593Smuzhiyun struct btrfs_file_extent_item);
1296*4882a593Smuzhiyun btrfs_set_file_extent_type(leaf, fi,
1297*4882a593Smuzhiyun BTRFS_FILE_EXTENT_REG);
1298*4882a593Smuzhiyun btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1299*4882a593Smuzhiyun btrfs_set_file_extent_num_bytes(leaf, fi,
1300*4882a593Smuzhiyun extent_end - key.offset);
1301*4882a593Smuzhiyun btrfs_mark_buffer_dirty(leaf);
1302*4882a593Smuzhiyun
1303*4882a593Smuzhiyun ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1304*4882a593Smuzhiyun if (ret < 0) {
1305*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
1306*4882a593Smuzhiyun goto out;
1307*4882a593Smuzhiyun }
1308*4882a593Smuzhiyun }
1309*4882a593Smuzhiyun out:
1310*4882a593Smuzhiyun btrfs_free_path(path);
1311*4882a593Smuzhiyun return ret;
1312*4882a593Smuzhiyun }
1313*4882a593Smuzhiyun
1314*4882a593Smuzhiyun /*
1315*4882a593Smuzhiyun * On error we return an unlocked page and the error value.
1316*4882a593Smuzhiyun * On success we return a locked page and 0.
1317*4882a593Smuzhiyun */
1318*4882a593Smuzhiyun static int prepare_uptodate_page(struct inode *inode,
1319*4882a593Smuzhiyun struct page *page, u64 pos,
1320*4882a593Smuzhiyun bool force_uptodate)
1321*4882a593Smuzhiyun {
1322*4882a593Smuzhiyun int ret = 0;
1323*4882a593Smuzhiyun
1324*4882a593Smuzhiyun if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
1325*4882a593Smuzhiyun !PageUptodate(page)) {
1326*4882a593Smuzhiyun ret = btrfs_readpage(NULL, page);
1327*4882a593Smuzhiyun if (ret)
1328*4882a593Smuzhiyun return ret;
1329*4882a593Smuzhiyun lock_page(page);
1330*4882a593Smuzhiyun if (!PageUptodate(page)) {
1331*4882a593Smuzhiyun unlock_page(page);
1332*4882a593Smuzhiyun return -EIO;
1333*4882a593Smuzhiyun }
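		/*
		 * The page was unlocked while being read, so it may have been
		 * truncated or otherwise dropped from the mapping in the
		 * meantime; return -EAGAIN so the caller retries with a fresh
		 * page.
		 */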
1334*4882a593Smuzhiyun if (page->mapping != inode->i_mapping) {
1335*4882a593Smuzhiyun unlock_page(page);
1336*4882a593Smuzhiyun return -EAGAIN;
1337*4882a593Smuzhiyun }
1338*4882a593Smuzhiyun }
1339*4882a593Smuzhiyun return 0;
1340*4882a593Smuzhiyun }
1341*4882a593Smuzhiyun
1342*4882a593Smuzhiyun /*
1343*4882a593Smuzhiyun * this just gets pages into the page cache and locks them down.
1344*4882a593Smuzhiyun */
1345*4882a593Smuzhiyun static noinline int prepare_pages(struct inode *inode, struct page **pages,
1346*4882a593Smuzhiyun size_t num_pages, loff_t pos,
1347*4882a593Smuzhiyun size_t write_bytes, bool force_uptodate)
1348*4882a593Smuzhiyun {
1349*4882a593Smuzhiyun int i;
1350*4882a593Smuzhiyun unsigned long index = pos >> PAGE_SHIFT;
1351*4882a593Smuzhiyun gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1352*4882a593Smuzhiyun int err = 0;
1353*4882a593Smuzhiyun int faili;
1354*4882a593Smuzhiyun
1355*4882a593Smuzhiyun for (i = 0; i < num_pages; i++) {
1356*4882a593Smuzhiyun again:
1357*4882a593Smuzhiyun pages[i] = find_or_create_page(inode->i_mapping, index + i,
1358*4882a593Smuzhiyun mask | __GFP_WRITE);
1359*4882a593Smuzhiyun if (!pages[i]) {
1360*4882a593Smuzhiyun faili = i - 1;
1361*4882a593Smuzhiyun err = -ENOMEM;
1362*4882a593Smuzhiyun goto fail;
1363*4882a593Smuzhiyun }
1364*4882a593Smuzhiyun
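		/*
		 * Only the first and last pages can be partially covered by
		 * this write, so only they may need to be read in and made
		 * uptodate first; all pages in between are overwritten
		 * completely.
		 */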
1365*4882a593Smuzhiyun if (i == 0)
1366*4882a593Smuzhiyun err = prepare_uptodate_page(inode, pages[i], pos,
1367*4882a593Smuzhiyun force_uptodate);
1368*4882a593Smuzhiyun if (!err && i == num_pages - 1)
1369*4882a593Smuzhiyun err = prepare_uptodate_page(inode, pages[i],
1370*4882a593Smuzhiyun pos + write_bytes, false);
1371*4882a593Smuzhiyun if (err) {
1372*4882a593Smuzhiyun put_page(pages[i]);
1373*4882a593Smuzhiyun if (err == -EAGAIN) {
1374*4882a593Smuzhiyun err = 0;
1375*4882a593Smuzhiyun goto again;
1376*4882a593Smuzhiyun }
1377*4882a593Smuzhiyun faili = i - 1;
1378*4882a593Smuzhiyun goto fail;
1379*4882a593Smuzhiyun }
1380*4882a593Smuzhiyun wait_on_page_writeback(pages[i]);
1381*4882a593Smuzhiyun }
1382*4882a593Smuzhiyun
1383*4882a593Smuzhiyun return 0;
1384*4882a593Smuzhiyun fail:
1385*4882a593Smuzhiyun while (faili >= 0) {
1386*4882a593Smuzhiyun unlock_page(pages[faili]);
1387*4882a593Smuzhiyun put_page(pages[faili]);
1388*4882a593Smuzhiyun faili--;
1389*4882a593Smuzhiyun }
1390*4882a593Smuzhiyun return err;
1391*4882a593Smuzhiyun
1392*4882a593Smuzhiyun }
1393*4882a593Smuzhiyun
1394*4882a593Smuzhiyun /*
1395*4882a593Smuzhiyun * This function locks the extent and properly waits for data=ordered extents
1396*4882a593Smuzhiyun * to finish before allowing the pages to be modified if needed.
1397*4882a593Smuzhiyun *
1398*4882a593Smuzhiyun * The return value:
1399*4882a593Smuzhiyun * 1 - the extent is locked
1400*4882a593Smuzhiyun * 0 - the extent is not locked, and everything is OK
1401*4882a593Smuzhiyun * -EAGAIN - the pages need to be re-prepared
1402*4882a593Smuzhiyun * any other < 0 value - something went wrong
1403*4882a593Smuzhiyun */
1404*4882a593Smuzhiyun static noinline int
1405*4882a593Smuzhiyun lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
1406*4882a593Smuzhiyun size_t num_pages, loff_t pos,
1407*4882a593Smuzhiyun size_t write_bytes,
1408*4882a593Smuzhiyun u64 *lockstart, u64 *lockend,
1409*4882a593Smuzhiyun struct extent_state **cached_state)
1410*4882a593Smuzhiyun {
1411*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = inode->root->fs_info;
1412*4882a593Smuzhiyun u64 start_pos;
1413*4882a593Smuzhiyun u64 last_pos;
1414*4882a593Smuzhiyun int i;
1415*4882a593Smuzhiyun int ret = 0;
1416*4882a593Smuzhiyun
1417*4882a593Smuzhiyun start_pos = round_down(pos, fs_info->sectorsize);
1418*4882a593Smuzhiyun last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
1419*4882a593Smuzhiyun
1420*4882a593Smuzhiyun if (start_pos < inode->vfs_inode.i_size) {
1421*4882a593Smuzhiyun struct btrfs_ordered_extent *ordered;
1422*4882a593Smuzhiyun
1423*4882a593Smuzhiyun lock_extent_bits(&inode->io_tree, start_pos, last_pos,
1424*4882a593Smuzhiyun cached_state);
1425*4882a593Smuzhiyun ordered = btrfs_lookup_ordered_range(inode, start_pos,
1426*4882a593Smuzhiyun last_pos - start_pos + 1);
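		/*
		 * If a pending ordered extent (writeback of previously dirtied
		 * data that has not finished yet) overlaps the range, unlock
		 * everything, wait for it to complete and have the caller
		 * re-prepare the pages.
		 */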
1427*4882a593Smuzhiyun if (ordered &&
1428*4882a593Smuzhiyun ordered->file_offset + ordered->num_bytes > start_pos &&
1429*4882a593Smuzhiyun ordered->file_offset <= last_pos) {
1430*4882a593Smuzhiyun unlock_extent_cached(&inode->io_tree, start_pos,
1431*4882a593Smuzhiyun last_pos, cached_state);
1432*4882a593Smuzhiyun for (i = 0; i < num_pages; i++) {
1433*4882a593Smuzhiyun unlock_page(pages[i]);
1434*4882a593Smuzhiyun put_page(pages[i]);
1435*4882a593Smuzhiyun }
1436*4882a593Smuzhiyun btrfs_start_ordered_extent(ordered, 1);
1437*4882a593Smuzhiyun btrfs_put_ordered_extent(ordered);
1438*4882a593Smuzhiyun return -EAGAIN;
1439*4882a593Smuzhiyun }
1440*4882a593Smuzhiyun if (ordered)
1441*4882a593Smuzhiyun btrfs_put_ordered_extent(ordered);
1442*4882a593Smuzhiyun
1443*4882a593Smuzhiyun *lockstart = start_pos;
1444*4882a593Smuzhiyun *lockend = last_pos;
1445*4882a593Smuzhiyun ret = 1;
1446*4882a593Smuzhiyun }
1447*4882a593Smuzhiyun
1448*4882a593Smuzhiyun /*
1449*4882a593Smuzhiyun * It's possible the pages are dirty right now, but we don't want
1450*4882a593Smuzhiyun * to clean them yet because copy_from_user may catch a page fault
1451*4882a593Smuzhiyun * and we might have to fall back to one page at a time. If that
1452*4882a593Smuzhiyun * happens, we'll unlock these pages and we'd have a window where
1453*4882a593Smuzhiyun * reclaim could sneak in and drop the once-dirty page on the floor
1454*4882a593Smuzhiyun * without writing it.
1455*4882a593Smuzhiyun *
1456*4882a593Smuzhiyun * We have the pages locked and the extent range locked, so there's
1457*4882a593Smuzhiyun * no way someone can start IO on any dirty pages in this range.
1458*4882a593Smuzhiyun *
1459*4882a593Smuzhiyun * We'll call btrfs_dirty_pages() later on, and that will flip around
1460*4882a593Smuzhiyun * delalloc bits and dirty the pages as required.
1461*4882a593Smuzhiyun */
1462*4882a593Smuzhiyun for (i = 0; i < num_pages; i++) {
1463*4882a593Smuzhiyun set_page_extent_mapped(pages[i]);
1464*4882a593Smuzhiyun WARN_ON(!PageLocked(pages[i]));
1465*4882a593Smuzhiyun }
1466*4882a593Smuzhiyun
1467*4882a593Smuzhiyun return ret;
1468*4882a593Smuzhiyun }
1469*4882a593Smuzhiyun
1470*4882a593Smuzhiyun static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
1471*4882a593Smuzhiyun size_t *write_bytes, bool nowait)
1472*4882a593Smuzhiyun {
1473*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = inode->root->fs_info;
1474*4882a593Smuzhiyun struct btrfs_root *root = inode->root;
1475*4882a593Smuzhiyun u64 lockstart, lockend;
1476*4882a593Smuzhiyun u64 num_bytes;
1477*4882a593Smuzhiyun int ret;
1478*4882a593Smuzhiyun
1479*4882a593Smuzhiyun if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1480*4882a593Smuzhiyun return 0;
1481*4882a593Smuzhiyun
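	/*
	 * In the blocking case, take the root's snapshot drew lock so that a
	 * NOCOW write cannot race with snapshot creation; bail with -EAGAIN
	 * if it cannot be acquired immediately.
	 */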
1482*4882a593Smuzhiyun if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
1483*4882a593Smuzhiyun return -EAGAIN;
1484*4882a593Smuzhiyun
1485*4882a593Smuzhiyun lockstart = round_down(pos, fs_info->sectorsize);
1486*4882a593Smuzhiyun lockend = round_up(pos + *write_bytes,
1487*4882a593Smuzhiyun fs_info->sectorsize) - 1;
1488*4882a593Smuzhiyun num_bytes = lockend - lockstart + 1;
1489*4882a593Smuzhiyun
1490*4882a593Smuzhiyun if (nowait) {
1491*4882a593Smuzhiyun struct btrfs_ordered_extent *ordered;
1492*4882a593Smuzhiyun
1493*4882a593Smuzhiyun if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
1494*4882a593Smuzhiyun return -EAGAIN;
1495*4882a593Smuzhiyun
1496*4882a593Smuzhiyun ordered = btrfs_lookup_ordered_range(inode, lockstart,
1497*4882a593Smuzhiyun num_bytes);
1498*4882a593Smuzhiyun if (ordered) {
1499*4882a593Smuzhiyun btrfs_put_ordered_extent(ordered);
1500*4882a593Smuzhiyun ret = -EAGAIN;
1501*4882a593Smuzhiyun goto out_unlock;
1502*4882a593Smuzhiyun }
1503*4882a593Smuzhiyun } else {
1504*4882a593Smuzhiyun btrfs_lock_and_flush_ordered_range(inode, lockstart,
1505*4882a593Smuzhiyun lockend, NULL);
1506*4882a593Smuzhiyun }
1507*4882a593Smuzhiyun
1508*4882a593Smuzhiyun ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1509*4882a593Smuzhiyun NULL, NULL, NULL, false);
1510*4882a593Smuzhiyun if (ret <= 0) {
1511*4882a593Smuzhiyun ret = 0;
1512*4882a593Smuzhiyun if (!nowait)
1513*4882a593Smuzhiyun btrfs_drew_write_unlock(&root->snapshot_lock);
1514*4882a593Smuzhiyun } else {
1515*4882a593Smuzhiyun *write_bytes = min_t(size_t, *write_bytes,
1516*4882a593Smuzhiyun num_bytes - pos + lockstart);
1517*4882a593Smuzhiyun }
1518*4882a593Smuzhiyun out_unlock:
1519*4882a593Smuzhiyun unlock_extent(&inode->io_tree, lockstart, lockend);
1520*4882a593Smuzhiyun
1521*4882a593Smuzhiyun return ret;
1522*4882a593Smuzhiyun }
1523*4882a593Smuzhiyun
1524*4882a593Smuzhiyun static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
1525*4882a593Smuzhiyun size_t *write_bytes)
1526*4882a593Smuzhiyun {
1527*4882a593Smuzhiyun return check_can_nocow(inode, pos, write_bytes, true);
1528*4882a593Smuzhiyun }
1529*4882a593Smuzhiyun
1530*4882a593Smuzhiyun /*
1531*4882a593Smuzhiyun * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1532*4882a593Smuzhiyun *
1533*4882a593Smuzhiyun * @pos: File offset
1534*4882a593Smuzhiyun * @write_bytes: The length to write, will be updated to the nocow writeable
1535*4882a593Smuzhiyun * range
1536*4882a593Smuzhiyun *
1537*4882a593Smuzhiyun * This function will flush ordered extents in the range to ensure proper
1538*4882a593Smuzhiyun * nocow checks.
1539*4882a593Smuzhiyun *
1540*4882a593Smuzhiyun * Return:
1541*4882a593Smuzhiyun * >0 and update @write_bytes if we can do nocow write
1542*4882a593Smuzhiyun * 0 if we can't do nocow write
1543*4882a593Smuzhiyun * -EAGAIN if we can't get the needed lock or there are ordered extents
1544*4882a593Smuzhiyun * for the (nowait == true) case
1545*4882a593Smuzhiyun * <0 if other error happened
1546*4882a593Smuzhiyun *
1547*4882a593Smuzhiyun * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
1548*4882a593Smuzhiyun */
1549*4882a593Smuzhiyun int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1550*4882a593Smuzhiyun size_t *write_bytes)
1551*4882a593Smuzhiyun {
1552*4882a593Smuzhiyun return check_can_nocow(inode, pos, write_bytes, false);
1553*4882a593Smuzhiyun }
1554*4882a593Smuzhiyun
1555*4882a593Smuzhiyun void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1556*4882a593Smuzhiyun {
1557*4882a593Smuzhiyun btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1558*4882a593Smuzhiyun }
1559*4882a593Smuzhiyun
1560*4882a593Smuzhiyun static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1561*4882a593Smuzhiyun struct iov_iter *i)
1562*4882a593Smuzhiyun {
1563*4882a593Smuzhiyun struct file *file = iocb->ki_filp;
1564*4882a593Smuzhiyun loff_t pos = iocb->ki_pos;
1565*4882a593Smuzhiyun struct inode *inode = file_inode(file);
1566*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1567*4882a593Smuzhiyun struct page **pages = NULL;
1568*4882a593Smuzhiyun struct extent_changeset *data_reserved = NULL;
1569*4882a593Smuzhiyun u64 release_bytes = 0;
1570*4882a593Smuzhiyun u64 lockstart;
1571*4882a593Smuzhiyun u64 lockend;
1572*4882a593Smuzhiyun size_t num_written = 0;
1573*4882a593Smuzhiyun int nrptrs;
1574*4882a593Smuzhiyun int ret = 0;
1575*4882a593Smuzhiyun bool only_release_metadata = false;
1576*4882a593Smuzhiyun bool force_page_uptodate = false;
1577*4882a593Smuzhiyun
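	/*
	 * Size the page pointer batch: enough pages for the whole write but
	 * no more than one page worth of pointers, capped by the task's
	 * remaining dirty-page allowance and never less than 8.
	 */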
1578*4882a593Smuzhiyun nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1579*4882a593Smuzhiyun PAGE_SIZE / (sizeof(struct page *)));
1580*4882a593Smuzhiyun nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1581*4882a593Smuzhiyun nrptrs = max(nrptrs, 8);
1582*4882a593Smuzhiyun pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1583*4882a593Smuzhiyun if (!pages)
1584*4882a593Smuzhiyun return -ENOMEM;
1585*4882a593Smuzhiyun
1586*4882a593Smuzhiyun while (iov_iter_count(i) > 0) {
1587*4882a593Smuzhiyun struct extent_state *cached_state = NULL;
1588*4882a593Smuzhiyun size_t offset = offset_in_page(pos);
1589*4882a593Smuzhiyun size_t sector_offset;
1590*4882a593Smuzhiyun size_t write_bytes = min(iov_iter_count(i),
1591*4882a593Smuzhiyun nrptrs * (size_t)PAGE_SIZE -
1592*4882a593Smuzhiyun offset);
1593*4882a593Smuzhiyun size_t num_pages = DIV_ROUND_UP(write_bytes + offset,
1594*4882a593Smuzhiyun PAGE_SIZE);
1595*4882a593Smuzhiyun size_t reserve_bytes;
1596*4882a593Smuzhiyun size_t dirty_pages;
1597*4882a593Smuzhiyun size_t copied;
1598*4882a593Smuzhiyun size_t dirty_sectors;
1599*4882a593Smuzhiyun size_t num_sectors;
1600*4882a593Smuzhiyun int extents_locked;
1601*4882a593Smuzhiyun
1602*4882a593Smuzhiyun WARN_ON(num_pages > nrptrs);
1603*4882a593Smuzhiyun
1604*4882a593Smuzhiyun /*
1605*4882a593Smuzhiyun * Fault in the pages before locking them in prepare_pages()
1606*4882a593Smuzhiyun * to avoid a recursive lock.
1607*4882a593Smuzhiyun */
1608*4882a593Smuzhiyun if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1609*4882a593Smuzhiyun ret = -EFAULT;
1610*4882a593Smuzhiyun break;
1611*4882a593Smuzhiyun }
1612*4882a593Smuzhiyun
1613*4882a593Smuzhiyun only_release_metadata = false;
1614*4882a593Smuzhiyun sector_offset = pos & (fs_info->sectorsize - 1);
1615*4882a593Smuzhiyun reserve_bytes = round_up(write_bytes + sector_offset,
1616*4882a593Smuzhiyun fs_info->sectorsize);
1617*4882a593Smuzhiyun
1618*4882a593Smuzhiyun extent_changeset_release(data_reserved);
1619*4882a593Smuzhiyun ret = btrfs_check_data_free_space(BTRFS_I(inode),
1620*4882a593Smuzhiyun &data_reserved, pos,
1621*4882a593Smuzhiyun write_bytes);
1622*4882a593Smuzhiyun if (ret < 0) {
1623*4882a593Smuzhiyun if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1624*4882a593Smuzhiyun &write_bytes) > 0) {
1625*4882a593Smuzhiyun /*
1626*4882a593Smuzhiyun * For the nodatacow case, there is no need to reserve
1627*4882a593Smuzhiyun * data space.
1628*4882a593Smuzhiyun */
1629*4882a593Smuzhiyun only_release_metadata = true;
1630*4882a593Smuzhiyun /*
1631*4882a593Smuzhiyun * our prealloc extent may be smaller than
1632*4882a593Smuzhiyun * write_bytes, so scale down.
1633*4882a593Smuzhiyun */
1634*4882a593Smuzhiyun num_pages = DIV_ROUND_UP(write_bytes + offset,
1635*4882a593Smuzhiyun PAGE_SIZE);
1636*4882a593Smuzhiyun reserve_bytes = round_up(write_bytes +
1637*4882a593Smuzhiyun sector_offset,
1638*4882a593Smuzhiyun fs_info->sectorsize);
1639*4882a593Smuzhiyun } else {
1640*4882a593Smuzhiyun break;
1641*4882a593Smuzhiyun }
1642*4882a593Smuzhiyun }
1643*4882a593Smuzhiyun
1644*4882a593Smuzhiyun WARN_ON(reserve_bytes == 0);
1645*4882a593Smuzhiyun ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1646*4882a593Smuzhiyun reserve_bytes);
1647*4882a593Smuzhiyun if (ret) {
1648*4882a593Smuzhiyun if (!only_release_metadata)
1649*4882a593Smuzhiyun btrfs_free_reserved_data_space(BTRFS_I(inode),
1650*4882a593Smuzhiyun data_reserved, pos,
1651*4882a593Smuzhiyun write_bytes);
1652*4882a593Smuzhiyun else
1653*4882a593Smuzhiyun btrfs_check_nocow_unlock(BTRFS_I(inode));
1654*4882a593Smuzhiyun break;
1655*4882a593Smuzhiyun }
1656*4882a593Smuzhiyun
1657*4882a593Smuzhiyun release_bytes = reserve_bytes;
1658*4882a593Smuzhiyun again:
1659*4882a593Smuzhiyun /*
1660*4882a593Smuzhiyun * This is going to set up the pages array with the number of
1661*4882a593Smuzhiyun * pages we want, so we don't really need to worry about the
1662*4882a593Smuzhiyun * contents of pages from loop to loop
1663*4882a593Smuzhiyun */
1664*4882a593Smuzhiyun ret = prepare_pages(inode, pages, num_pages,
1665*4882a593Smuzhiyun pos, write_bytes,
1666*4882a593Smuzhiyun force_page_uptodate);
1667*4882a593Smuzhiyun if (ret) {
1668*4882a593Smuzhiyun btrfs_delalloc_release_extents(BTRFS_I(inode),
1669*4882a593Smuzhiyun reserve_bytes);
1670*4882a593Smuzhiyun break;
1671*4882a593Smuzhiyun }
1672*4882a593Smuzhiyun
1673*4882a593Smuzhiyun extents_locked = lock_and_cleanup_extent_if_need(
1674*4882a593Smuzhiyun BTRFS_I(inode), pages,
1675*4882a593Smuzhiyun num_pages, pos, write_bytes, &lockstart,
1676*4882a593Smuzhiyun &lockend, &cached_state);
1677*4882a593Smuzhiyun if (extents_locked < 0) {
1678*4882a593Smuzhiyun if (extents_locked == -EAGAIN)
1679*4882a593Smuzhiyun goto again;
1680*4882a593Smuzhiyun btrfs_delalloc_release_extents(BTRFS_I(inode),
1681*4882a593Smuzhiyun reserve_bytes);
1682*4882a593Smuzhiyun ret = extents_locked;
1683*4882a593Smuzhiyun break;
1684*4882a593Smuzhiyun }
1685*4882a593Smuzhiyun
1686*4882a593Smuzhiyun copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1687*4882a593Smuzhiyun
1688*4882a593Smuzhiyun num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1689*4882a593Smuzhiyun dirty_sectors = round_up(copied + sector_offset,
1690*4882a593Smuzhiyun fs_info->sectorsize);
1691*4882a593Smuzhiyun dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1692*4882a593Smuzhiyun
1693*4882a593Smuzhiyun /*
1694*4882a593Smuzhiyun * if we have trouble faulting in the pages, fall
1695*4882a593Smuzhiyun * back to one page at a time
1696*4882a593Smuzhiyun */
1697*4882a593Smuzhiyun if (copied < write_bytes)
1698*4882a593Smuzhiyun nrptrs = 1;
1699*4882a593Smuzhiyun
1700*4882a593Smuzhiyun if (copied == 0) {
1701*4882a593Smuzhiyun force_page_uptodate = true;
1702*4882a593Smuzhiyun dirty_sectors = 0;
1703*4882a593Smuzhiyun dirty_pages = 0;
1704*4882a593Smuzhiyun } else {
1705*4882a593Smuzhiyun force_page_uptodate = false;
1706*4882a593Smuzhiyun dirty_pages = DIV_ROUND_UP(copied + offset,
1707*4882a593Smuzhiyun PAGE_SIZE);
1708*4882a593Smuzhiyun }
1709*4882a593Smuzhiyun
1710*4882a593Smuzhiyun if (num_sectors > dirty_sectors) {
1711*4882a593Smuzhiyun /* release everything except the sectors we dirtied */
1712*4882a593Smuzhiyun release_bytes -= dirty_sectors <<
1713*4882a593Smuzhiyun fs_info->sb->s_blocksize_bits;
1714*4882a593Smuzhiyun if (only_release_metadata) {
1715*4882a593Smuzhiyun btrfs_delalloc_release_metadata(BTRFS_I(inode),
1716*4882a593Smuzhiyun release_bytes, true);
1717*4882a593Smuzhiyun } else {
1718*4882a593Smuzhiyun u64 __pos;
1719*4882a593Smuzhiyun
1720*4882a593Smuzhiyun __pos = round_down(pos,
1721*4882a593Smuzhiyun fs_info->sectorsize) +
1722*4882a593Smuzhiyun (dirty_pages << PAGE_SHIFT);
1723*4882a593Smuzhiyun btrfs_delalloc_release_space(BTRFS_I(inode),
1724*4882a593Smuzhiyun data_reserved, __pos,
1725*4882a593Smuzhiyun release_bytes, true);
1726*4882a593Smuzhiyun }
1727*4882a593Smuzhiyun }
1728*4882a593Smuzhiyun
1729*4882a593Smuzhiyun release_bytes = round_up(copied + sector_offset,
1730*4882a593Smuzhiyun fs_info->sectorsize);
1731*4882a593Smuzhiyun
1732*4882a593Smuzhiyun if (copied > 0)
1733*4882a593Smuzhiyun ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
1734*4882a593Smuzhiyun dirty_pages, pos, copied,
1735*4882a593Smuzhiyun &cached_state);
1736*4882a593Smuzhiyun
1737*4882a593Smuzhiyun /*
1738*4882a593Smuzhiyun * If we have not locked the extent range, because the range's
1739*4882a593Smuzhiyun * start offset is >= i_size, we might still have a non-NULL
1740*4882a593Smuzhiyun * cached extent state, acquired while marking the extent range
1741*4882a593Smuzhiyun * as delalloc through btrfs_dirty_pages(). Therefore free any
1742*4882a593Smuzhiyun * possible cached extent state to avoid a memory leak.
1743*4882a593Smuzhiyun */
1744*4882a593Smuzhiyun if (extents_locked)
1745*4882a593Smuzhiyun unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1746*4882a593Smuzhiyun lockstart, lockend, &cached_state);
1747*4882a593Smuzhiyun else
1748*4882a593Smuzhiyun free_extent_state(cached_state);
1749*4882a593Smuzhiyun
1750*4882a593Smuzhiyun btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1751*4882a593Smuzhiyun if (ret) {
1752*4882a593Smuzhiyun btrfs_drop_pages(pages, num_pages);
1753*4882a593Smuzhiyun break;
1754*4882a593Smuzhiyun }
1755*4882a593Smuzhiyun
1756*4882a593Smuzhiyun release_bytes = 0;
1757*4882a593Smuzhiyun if (only_release_metadata)
1758*4882a593Smuzhiyun btrfs_check_nocow_unlock(BTRFS_I(inode));
1759*4882a593Smuzhiyun
1760*4882a593Smuzhiyun if (only_release_metadata && copied > 0) {
1761*4882a593Smuzhiyun lockstart = round_down(pos,
1762*4882a593Smuzhiyun fs_info->sectorsize);
1763*4882a593Smuzhiyun lockend = round_up(pos + copied,
1764*4882a593Smuzhiyun fs_info->sectorsize) - 1;
1765*4882a593Smuzhiyun
1766*4882a593Smuzhiyun set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
1767*4882a593Smuzhiyun lockend, EXTENT_NORESERVE, NULL,
1768*4882a593Smuzhiyun NULL, GFP_NOFS);
1769*4882a593Smuzhiyun }
1770*4882a593Smuzhiyun
1771*4882a593Smuzhiyun btrfs_drop_pages(pages, num_pages);
1772*4882a593Smuzhiyun
1773*4882a593Smuzhiyun cond_resched();
1774*4882a593Smuzhiyun
1775*4882a593Smuzhiyun balance_dirty_pages_ratelimited(inode->i_mapping);
1776*4882a593Smuzhiyun
1777*4882a593Smuzhiyun pos += copied;
1778*4882a593Smuzhiyun num_written += copied;
1779*4882a593Smuzhiyun }
1780*4882a593Smuzhiyun
1781*4882a593Smuzhiyun kfree(pages);
1782*4882a593Smuzhiyun
1783*4882a593Smuzhiyun if (release_bytes) {
1784*4882a593Smuzhiyun if (only_release_metadata) {
1785*4882a593Smuzhiyun btrfs_check_nocow_unlock(BTRFS_I(inode));
1786*4882a593Smuzhiyun btrfs_delalloc_release_metadata(BTRFS_I(inode),
1787*4882a593Smuzhiyun release_bytes, true);
1788*4882a593Smuzhiyun } else {
1789*4882a593Smuzhiyun btrfs_delalloc_release_space(BTRFS_I(inode),
1790*4882a593Smuzhiyun data_reserved,
1791*4882a593Smuzhiyun round_down(pos, fs_info->sectorsize),
1792*4882a593Smuzhiyun release_bytes, true);
1793*4882a593Smuzhiyun }
1794*4882a593Smuzhiyun }
1795*4882a593Smuzhiyun
1796*4882a593Smuzhiyun extent_changeset_free(data_reserved);
1797*4882a593Smuzhiyun return num_written ? num_written : ret;
1798*4882a593Smuzhiyun }
1799*4882a593Smuzhiyun
1800*4882a593Smuzhiyun static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1801*4882a593Smuzhiyun {
1802*4882a593Smuzhiyun struct file *file = iocb->ki_filp;
1803*4882a593Smuzhiyun struct inode *inode = file_inode(file);
1804*4882a593Smuzhiyun loff_t pos;
1805*4882a593Smuzhiyun ssize_t written;
1806*4882a593Smuzhiyun ssize_t written_buffered;
1807*4882a593Smuzhiyun loff_t endbyte;
1808*4882a593Smuzhiyun int err;
1809*4882a593Smuzhiyun
1810*4882a593Smuzhiyun written = btrfs_direct_IO(iocb, from);
1811*4882a593Smuzhiyun
1812*4882a593Smuzhiyun if (written < 0 || !iov_iter_count(from))
1813*4882a593Smuzhiyun return written;
1814*4882a593Smuzhiyun
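	/*
	 * The direct IO write did not consume the whole iterator (for example
	 * part of the range could not be written directly), so write the
	 * remainder through the page cache, then flush and invalidate that
	 * range so a later direct read sees the new data.
	 */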
1815*4882a593Smuzhiyun pos = iocb->ki_pos;
1816*4882a593Smuzhiyun written_buffered = btrfs_buffered_write(iocb, from);
1817*4882a593Smuzhiyun if (written_buffered < 0) {
1818*4882a593Smuzhiyun err = written_buffered;
1819*4882a593Smuzhiyun goto out;
1820*4882a593Smuzhiyun }
1821*4882a593Smuzhiyun /*
1822*4882a593Smuzhiyun * Ensure all data is persisted. We want the next direct IO read to be
1823*4882a593Smuzhiyun * able to read what was just written.
1824*4882a593Smuzhiyun */
1825*4882a593Smuzhiyun endbyte = pos + written_buffered - 1;
1826*4882a593Smuzhiyun err = btrfs_fdatawrite_range(inode, pos, endbyte);
1827*4882a593Smuzhiyun if (err)
1828*4882a593Smuzhiyun goto out;
1829*4882a593Smuzhiyun err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1830*4882a593Smuzhiyun if (err)
1831*4882a593Smuzhiyun goto out;
1832*4882a593Smuzhiyun written += written_buffered;
1833*4882a593Smuzhiyun iocb->ki_pos = pos + written_buffered;
1834*4882a593Smuzhiyun invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1835*4882a593Smuzhiyun endbyte >> PAGE_SHIFT);
1836*4882a593Smuzhiyun out:
1837*4882a593Smuzhiyun return written ? written : err;
1838*4882a593Smuzhiyun }
1839*4882a593Smuzhiyun
1840*4882a593Smuzhiyun static void update_time_for_write(struct inode *inode)
1841*4882a593Smuzhiyun {
1842*4882a593Smuzhiyun struct timespec64 now;
1843*4882a593Smuzhiyun
1844*4882a593Smuzhiyun if (IS_NOCMTIME(inode))
1845*4882a593Smuzhiyun return;
1846*4882a593Smuzhiyun
1847*4882a593Smuzhiyun now = current_time(inode);
1848*4882a593Smuzhiyun if (!timespec64_equal(&inode->i_mtime, &now))
1849*4882a593Smuzhiyun inode->i_mtime = now;
1850*4882a593Smuzhiyun
1851*4882a593Smuzhiyun if (!timespec64_equal(&inode->i_ctime, &now))
1852*4882a593Smuzhiyun inode->i_ctime = now;
1853*4882a593Smuzhiyun
1854*4882a593Smuzhiyun if (IS_I_VERSION(inode))
1855*4882a593Smuzhiyun inode_inc_iversion(inode);
1856*4882a593Smuzhiyun }
1857*4882a593Smuzhiyun
1858*4882a593Smuzhiyun static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1859*4882a593Smuzhiyun struct iov_iter *from)
1860*4882a593Smuzhiyun {
1861*4882a593Smuzhiyun struct file *file = iocb->ki_filp;
1862*4882a593Smuzhiyun struct inode *inode = file_inode(file);
1863*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1864*4882a593Smuzhiyun u64 start_pos;
1865*4882a593Smuzhiyun u64 end_pos;
1866*4882a593Smuzhiyun ssize_t num_written = 0;
1867*4882a593Smuzhiyun const bool sync = iocb->ki_flags & IOCB_DSYNC;
1868*4882a593Smuzhiyun ssize_t err;
1869*4882a593Smuzhiyun loff_t pos;
1870*4882a593Smuzhiyun size_t count;
1871*4882a593Smuzhiyun loff_t oldsize;
1872*4882a593Smuzhiyun int clean_page = 0;
1873*4882a593Smuzhiyun
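	/*
	 * RWF_NOWAIT is only supported together with direct IO here; the
	 * buffered write path may need to block for pages and space
	 * reservations.
	 */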
1874*4882a593Smuzhiyun if (!(iocb->ki_flags & IOCB_DIRECT) &&
1875*4882a593Smuzhiyun (iocb->ki_flags & IOCB_NOWAIT))
1876*4882a593Smuzhiyun return -EOPNOTSUPP;
1877*4882a593Smuzhiyun
1878*4882a593Smuzhiyun if (iocb->ki_flags & IOCB_NOWAIT) {
1879*4882a593Smuzhiyun if (!inode_trylock(inode))
1880*4882a593Smuzhiyun return -EAGAIN;
1881*4882a593Smuzhiyun } else {
1882*4882a593Smuzhiyun inode_lock(inode);
1883*4882a593Smuzhiyun }
1884*4882a593Smuzhiyun
1885*4882a593Smuzhiyun err = generic_write_checks(iocb, from);
1886*4882a593Smuzhiyun if (err <= 0) {
1887*4882a593Smuzhiyun inode_unlock(inode);
1888*4882a593Smuzhiyun return err;
1889*4882a593Smuzhiyun }
1890*4882a593Smuzhiyun
1891*4882a593Smuzhiyun pos = iocb->ki_pos;
1892*4882a593Smuzhiyun count = iov_iter_count(from);
1893*4882a593Smuzhiyun if (iocb->ki_flags & IOCB_NOWAIT) {
1894*4882a593Smuzhiyun size_t nocow_bytes = count;
1895*4882a593Smuzhiyun
1896*4882a593Smuzhiyun /*
1897*4882a593Smuzhiyun * We will allocate space in case nodatacow is not set,
1898*4882a593Smuzhiyun * so bail
1899*4882a593Smuzhiyun */
1900*4882a593Smuzhiyun if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes)
1901*4882a593Smuzhiyun <= 0) {
1902*4882a593Smuzhiyun inode_unlock(inode);
1903*4882a593Smuzhiyun return -EAGAIN;
1904*4882a593Smuzhiyun }
1905*4882a593Smuzhiyun /*
1906*4882a593Smuzhiyun * There are holes in the range or parts of the range that must
1907*4882a593Smuzhiyun * be COWed (shared extents, RO block groups, etc), so just bail
1908*4882a593Smuzhiyun * out.
1909*4882a593Smuzhiyun */
1910*4882a593Smuzhiyun if (nocow_bytes < count) {
1911*4882a593Smuzhiyun inode_unlock(inode);
1912*4882a593Smuzhiyun return -EAGAIN;
1913*4882a593Smuzhiyun }
1914*4882a593Smuzhiyun }
1915*4882a593Smuzhiyun
1916*4882a593Smuzhiyun current->backing_dev_info = inode_to_bdi(inode);
1917*4882a593Smuzhiyun err = file_remove_privs(file);
1918*4882a593Smuzhiyun if (err) {
1919*4882a593Smuzhiyun inode_unlock(inode);
1920*4882a593Smuzhiyun goto out;
1921*4882a593Smuzhiyun }
1922*4882a593Smuzhiyun
1923*4882a593Smuzhiyun /*
1924*4882a593Smuzhiyun * If BTRFS flips readonly due to some impossible error
1925*4882a593Smuzhiyun * (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1926*4882a593Smuzhiyun * although we have opened a file as writable, we have
1927*4882a593Smuzhiyun * to stop this write operation to ensure FS consistency.
1928*4882a593Smuzhiyun */
1929*4882a593Smuzhiyun if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
1930*4882a593Smuzhiyun inode_unlock(inode);
1931*4882a593Smuzhiyun err = -EROFS;
1932*4882a593Smuzhiyun goto out;
1933*4882a593Smuzhiyun }
1934*4882a593Smuzhiyun
1935*4882a593Smuzhiyun /*
1936*4882a593Smuzhiyun * We reserve space for updating the inode when we reserve space for the
1937*4882a593Smuzhiyun * extent we are going to write, so we will enospc out there. We don't
1938*4882a593Smuzhiyun * need to start yet another transaction to update the inode as we will
1939*4882a593Smuzhiyun * update the inode when we finish writing whatever data we write.
1940*4882a593Smuzhiyun */
1941*4882a593Smuzhiyun update_time_for_write(inode);
1942*4882a593Smuzhiyun
1943*4882a593Smuzhiyun start_pos = round_down(pos, fs_info->sectorsize);
1944*4882a593Smuzhiyun oldsize = i_size_read(inode);
1945*4882a593Smuzhiyun if (start_pos > oldsize) {
1946*4882a593Smuzhiyun /* Expand hole size to cover write data, preventing empty gap */
1947*4882a593Smuzhiyun end_pos = round_up(pos + count,
1948*4882a593Smuzhiyun fs_info->sectorsize);
1949*4882a593Smuzhiyun err = btrfs_cont_expand(inode, oldsize, end_pos);
1950*4882a593Smuzhiyun if (err) {
1951*4882a593Smuzhiyun inode_unlock(inode);
1952*4882a593Smuzhiyun goto out;
1953*4882a593Smuzhiyun }
1954*4882a593Smuzhiyun if (start_pos > round_up(oldsize, fs_info->sectorsize))
1955*4882a593Smuzhiyun clean_page = 1;
1956*4882a593Smuzhiyun }
1957*4882a593Smuzhiyun
1958*4882a593Smuzhiyun if (sync)
1959*4882a593Smuzhiyun atomic_inc(&BTRFS_I(inode)->sync_writers);
1960*4882a593Smuzhiyun
1961*4882a593Smuzhiyun if (iocb->ki_flags & IOCB_DIRECT) {
1962*4882a593Smuzhiyun /*
1963*4882a593Smuzhiyun * 1. We must always clear IOCB_DSYNC in order to not deadlock
1964*4882a593Smuzhiyun * in iomap, as it calls generic_write_sync() in this case.
1965*4882a593Smuzhiyun * 2. If we are async, we can call iomap_dio_complete() either
1966*4882a593Smuzhiyun * in
1967*4882a593Smuzhiyun *
1968*4882a593Smuzhiyun * 2.1. A worker thread from the last bio completed. In this
1969*4882a593Smuzhiyun * case we need to mark the btrfs_dio_data that it is
1970*4882a593Smuzhiyun * async in order to call generic_write_sync() properly.
1971*4882a593Smuzhiyun * This is handled by setting BTRFS_DIO_SYNC_STUB in the
1972*4882a593Smuzhiyun * current->journal_info.
1973*4882a593Smuzhiyun * 2.2 The submitter context, because all IO completed
1974*4882a593Smuzhiyun * before we exited iomap_dio_rw(). In this case we can
1975*4882a593Smuzhiyun * just re-set the IOCB_DSYNC on the iocb and we'll do
1976*4882a593Smuzhiyun * the sync below. If our ->end_io() gets called and
1977*4882a593Smuzhiyun * current->journal_info is set, then we know we're in
1978*4882a593Smuzhiyun * our current context and we will clear
1979*4882a593Smuzhiyun * current->journal_info to indicate that we need to
1980*4882a593Smuzhiyun * sync below.
1981*4882a593Smuzhiyun */
1982*4882a593Smuzhiyun if (sync) {
1983*4882a593Smuzhiyun ASSERT(current->journal_info == NULL);
1984*4882a593Smuzhiyun iocb->ki_flags &= ~IOCB_DSYNC;
1985*4882a593Smuzhiyun current->journal_info = BTRFS_DIO_SYNC_STUB;
1986*4882a593Smuzhiyun }
1987*4882a593Smuzhiyun num_written = __btrfs_direct_write(iocb, from);
1988*4882a593Smuzhiyun
1989*4882a593Smuzhiyun /*
1990*4882a593Smuzhiyun * As stated above, we cleared journal_info, so we need to do
1991*4882a593Smuzhiyun * the sync ourselves.
1992*4882a593Smuzhiyun */
1993*4882a593Smuzhiyun if (sync && current->journal_info == NULL)
1994*4882a593Smuzhiyun iocb->ki_flags |= IOCB_DSYNC;
1995*4882a593Smuzhiyun current->journal_info = NULL;
1996*4882a593Smuzhiyun } else {
1997*4882a593Smuzhiyun num_written = btrfs_buffered_write(iocb, from);
1998*4882a593Smuzhiyun if (num_written > 0)
1999*4882a593Smuzhiyun iocb->ki_pos = pos + num_written;
2000*4882a593Smuzhiyun if (clean_page)
2001*4882a593Smuzhiyun pagecache_isize_extended(inode, oldsize,
2002*4882a593Smuzhiyun i_size_read(inode));
2003*4882a593Smuzhiyun }
2004*4882a593Smuzhiyun
2005*4882a593Smuzhiyun inode_unlock(inode);
2006*4882a593Smuzhiyun
2007*4882a593Smuzhiyun btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
2008*4882a593Smuzhiyun
2009*4882a593Smuzhiyun if (num_written > 0)
2010*4882a593Smuzhiyun num_written = generic_write_sync(iocb, num_written);
2011*4882a593Smuzhiyun
2012*4882a593Smuzhiyun if (sync)
2013*4882a593Smuzhiyun atomic_dec(&BTRFS_I(inode)->sync_writers);
2014*4882a593Smuzhiyun out:
2015*4882a593Smuzhiyun current->backing_dev_info = NULL;
2016*4882a593Smuzhiyun return num_written ? num_written : err;
2017*4882a593Smuzhiyun }
2018*4882a593Smuzhiyun
2019*4882a593Smuzhiyun int btrfs_release_file(struct inode *inode, struct file *filp)
2020*4882a593Smuzhiyun {
2021*4882a593Smuzhiyun struct btrfs_file_private *private = filp->private_data;
2022*4882a593Smuzhiyun
2023*4882a593Smuzhiyun if (private && private->filldir_buf)
2024*4882a593Smuzhiyun kfree(private->filldir_buf);
2025*4882a593Smuzhiyun kfree(private);
2026*4882a593Smuzhiyun filp->private_data = NULL;
2027*4882a593Smuzhiyun
2028*4882a593Smuzhiyun /*
2029*4882a593Smuzhiyun * Set by setattr when we are about to truncate a file from a non-zero
2030*4882a593Smuzhiyun * size to a zero size. This tries to flush down new bytes that may
2031*4882a593Smuzhiyun * have been written if the application were using truncate to replace
2032*4882a593Smuzhiyun * a file in place.
2033*4882a593Smuzhiyun */
2034*4882a593Smuzhiyun if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
2035*4882a593Smuzhiyun &BTRFS_I(inode)->runtime_flags))
2036*4882a593Smuzhiyun filemap_flush(inode->i_mapping);
2037*4882a593Smuzhiyun return 0;
2038*4882a593Smuzhiyun }
2039*4882a593Smuzhiyun
2040*4882a593Smuzhiyun static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
2041*4882a593Smuzhiyun {
2042*4882a593Smuzhiyun int ret;
2043*4882a593Smuzhiyun struct blk_plug plug;
2044*4882a593Smuzhiyun
2045*4882a593Smuzhiyun /*
2046*4882a593Smuzhiyun * This is only called in fsync, which does synchronous writes, so a
2047*4882a593Smuzhiyun * plug can merge adjacent IOs as much as possible. Especially in the
2048*4882a593Smuzhiyun * case of multiple disks using a raid profile, a large IO can be split
2049*4882a593Smuzhiyun * into several segments of stripe length (currently 64K).
2050*4882a593Smuzhiyun */
2051*4882a593Smuzhiyun blk_start_plug(&plug);
2052*4882a593Smuzhiyun atomic_inc(&BTRFS_I(inode)->sync_writers);
2053*4882a593Smuzhiyun ret = btrfs_fdatawrite_range(inode, start, end);
2054*4882a593Smuzhiyun atomic_dec(&BTRFS_I(inode)->sync_writers);
2055*4882a593Smuzhiyun blk_finish_plug(&plug);
2056*4882a593Smuzhiyun
2057*4882a593Smuzhiyun return ret;
2058*4882a593Smuzhiyun }
2059*4882a593Smuzhiyun
2060*4882a593Smuzhiyun static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
2061*4882a593Smuzhiyun {
2062*4882a593Smuzhiyun struct btrfs_inode *inode = BTRFS_I(ctx->inode);
2063*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = inode->root->fs_info;
2064*4882a593Smuzhiyun
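	/*
	 * If the inode was already fully logged in the current transaction
	 * and there are no new ordered extents to attach to the log context,
	 * there is nothing left to log.
	 */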
2065*4882a593Smuzhiyun if (btrfs_inode_in_log(inode, fs_info->generation) &&
2066*4882a593Smuzhiyun list_empty(&ctx->ordered_extents))
2067*4882a593Smuzhiyun return true;
2068*4882a593Smuzhiyun
2069*4882a593Smuzhiyun /*
2070*4882a593Smuzhiyun * If we are doing a fast fsync we cannot bail out if the inode's
2071*4882a593Smuzhiyun * last_trans is <= the last committed transaction, because we only
2072*4882a593Smuzhiyun * update the last_trans of the inode during ordered extent completion,
2073*4882a593Smuzhiyun * and for a fast fsync we don't wait for that, we only wait for the
2074*4882a593Smuzhiyun * writeback to complete.
2075*4882a593Smuzhiyun */
2076*4882a593Smuzhiyun if (inode->last_trans <= fs_info->last_trans_committed &&
2077*4882a593Smuzhiyun (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
2078*4882a593Smuzhiyun list_empty(&ctx->ordered_extents)))
2079*4882a593Smuzhiyun return true;
2080*4882a593Smuzhiyun
2081*4882a593Smuzhiyun return false;
2082*4882a593Smuzhiyun }
2083*4882a593Smuzhiyun
2084*4882a593Smuzhiyun /*
2085*4882a593Smuzhiyun * fsync call for both files and directories. This logs the inode into
2086*4882a593Smuzhiyun * the tree log instead of forcing full commits whenever possible.
2087*4882a593Smuzhiyun *
2088*4882a593Smuzhiyun * It needs to call filemap_fdatawait so that all ordered extent updates
2089*4882a593Smuzhiyun * in the metadata btree are up to date for copying to the log.
2090*4882a593Smuzhiyun *
2091*4882a593Smuzhiyun * It drops the inode mutex before doing the tree log commit. This is an
2092*4882a593Smuzhiyun * important optimization for directories because holding the mutex prevents
2093*4882a593Smuzhiyun * new operations on the dir while we write to disk.
2094*4882a593Smuzhiyun */
2095*4882a593Smuzhiyun int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2096*4882a593Smuzhiyun {
2097*4882a593Smuzhiyun struct dentry *dentry = file_dentry(file);
2098*4882a593Smuzhiyun struct inode *inode = d_inode(dentry);
2099*4882a593Smuzhiyun struct btrfs_root *root = BTRFS_I(inode)->root;
2100*4882a593Smuzhiyun struct btrfs_trans_handle *trans;
2101*4882a593Smuzhiyun struct btrfs_log_ctx ctx;
2102*4882a593Smuzhiyun int ret = 0, err;
2103*4882a593Smuzhiyun u64 len;
2104*4882a593Smuzhiyun bool full_sync;
2105*4882a593Smuzhiyun
2106*4882a593Smuzhiyun trace_btrfs_sync_file(file, datasync);
2107*4882a593Smuzhiyun
2108*4882a593Smuzhiyun btrfs_init_log_ctx(&ctx, inode);
2109*4882a593Smuzhiyun
2110*4882a593Smuzhiyun /*
2111*4882a593Smuzhiyun * Always set the range to a full range, otherwise we can get into
2112*4882a593Smuzhiyun * several problems, from missing file extent items to represent holes
2113*4882a593Smuzhiyun * when not using the NO_HOLES feature, to log tree corruption due to
2114*4882a593Smuzhiyun * races between hole detection during logging and completion of ordered
2115*4882a593Smuzhiyun * extents outside the range, to missing checksums due to ordered extents
2116*4882a593Smuzhiyun * for which we flushed only a subset of their pages.
2117*4882a593Smuzhiyun */
2118*4882a593Smuzhiyun start = 0;
2119*4882a593Smuzhiyun end = LLONG_MAX;
2120*4882a593Smuzhiyun len = (u64)LLONG_MAX + 1;
2121*4882a593Smuzhiyun
2122*4882a593Smuzhiyun /*
2123*4882a593Smuzhiyun * We write the dirty pages in the range and wait until they complete
2124*4882a593Smuzhiyun * outside of the ->i_mutex. That way we can flush the dirty pages with
2125*4882a593Smuzhiyun * multiple tasks and improve performance. See btrfs_wait_ordered_range
2126*4882a593Smuzhiyun * for an explanation of the ASYNC check.
2127*4882a593Smuzhiyun */
2128*4882a593Smuzhiyun ret = start_ordered_ops(inode, start, end);
2129*4882a593Smuzhiyun if (ret)
2130*4882a593Smuzhiyun goto out;
2131*4882a593Smuzhiyun
2132*4882a593Smuzhiyun inode_lock(inode);
2133*4882a593Smuzhiyun
2134*4882a593Smuzhiyun /*
2135*4882a593Smuzhiyun * We take the dio_sem here because the tree log stuff can race with
2136*4882a593Smuzhiyun * lockless dio writes and get an extent map logged for an extent we
2137*4882a593Smuzhiyun * never waited on. We need it this high up for lockdep reasons.
2138*4882a593Smuzhiyun */
2139*4882a593Smuzhiyun down_write(&BTRFS_I(inode)->dio_sem);
2140*4882a593Smuzhiyun
2141*4882a593Smuzhiyun atomic_inc(&root->log_batch);
2142*4882a593Smuzhiyun
2143*4882a593Smuzhiyun /*
2144*4882a593Smuzhiyun * Always check for the full sync flag while holding the inode's lock,
2145*4882a593Smuzhiyun * to avoid races with other tasks. The flag must be either set all the
2146*4882a593Smuzhiyun * time during logging or always off all the time while logging.
2147*4882a593Smuzhiyun */
2148*4882a593Smuzhiyun full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2149*4882a593Smuzhiyun &BTRFS_I(inode)->runtime_flags);
2150*4882a593Smuzhiyun
2151*4882a593Smuzhiyun /*
2152*4882a593Smuzhiyun * Before we acquired the inode's lock, someone may have dirtied more
2153*4882a593Smuzhiyun * pages in the target range. We need to make sure that writeback for
2154*4882a593Smuzhiyun * any such pages does not start while we are logging the inode, because
2155*4882a593Smuzhiyun * if it does, any of the following might happen when we are not doing a
2156*4882a593Smuzhiyun * full inode sync:
2157*4882a593Smuzhiyun *
2158*4882a593Smuzhiyun * 1) We log an extent after its writeback finishes but before its
2159*4882a593Smuzhiyun * checksums are added to the csum tree, leading to -EIO errors
2160*4882a593Smuzhiyun * when attempting to read the extent after a log replay.
2161*4882a593Smuzhiyun *
2162*4882a593Smuzhiyun * 2) We can end up logging an extent before its writeback finishes.
2163*4882a593Smuzhiyun * Therefore after the log replay we will have a file extent item
2164*4882a593Smuzhiyun * pointing to an unwritten extent (and no data checksums as well).
2165*4882a593Smuzhiyun *
2166*4882a593Smuzhiyun * So trigger writeback for any eventual new dirty pages and then we
2167*4882a593Smuzhiyun * wait for all ordered extents to complete below.
2168*4882a593Smuzhiyun */
2169*4882a593Smuzhiyun ret = start_ordered_ops(inode, start, end);
2170*4882a593Smuzhiyun if (ret) {
2171*4882a593Smuzhiyun up_write(&BTRFS_I(inode)->dio_sem);
2172*4882a593Smuzhiyun inode_unlock(inode);
2173*4882a593Smuzhiyun goto out;
2174*4882a593Smuzhiyun }
2175*4882a593Smuzhiyun
2176*4882a593Smuzhiyun /*
2177*4882a593Smuzhiyun * We have to do this here to avoid the priority inversion of waiting on
2178*4882a593Smuzhiyun * IO of a lower priority task while holding a transaction open.
2179*4882a593Smuzhiyun *
2180*4882a593Smuzhiyun * For a full fsync we wait for the ordered extents to complete while
2181*4882a593Smuzhiyun * for a fast fsync we wait just for writeback to complete, and then
2182*4882a593Smuzhiyun * attach the ordered extents to the transaction so that a transaction
2183*4882a593Smuzhiyun * commit waits for their completion, to avoid data loss if we fsync,
2184*4882a593Smuzhiyun * the current transaction commits before the ordered extents complete
2185*4882a593Smuzhiyun * and a power failure happens right after that.
2186*4882a593Smuzhiyun */
2187*4882a593Smuzhiyun if (full_sync) {
2188*4882a593Smuzhiyun ret = btrfs_wait_ordered_range(inode, start, len);
2189*4882a593Smuzhiyun } else {
2190*4882a593Smuzhiyun /*
2191*4882a593Smuzhiyun * Get our ordered extents as soon as possible to avoid doing
2192*4882a593Smuzhiyun * checksum lookups in the csum tree, and use instead the
2193*4882a593Smuzhiyun * checksums attached to the ordered extents.
2194*4882a593Smuzhiyun */
2195*4882a593Smuzhiyun btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
2196*4882a593Smuzhiyun &ctx.ordered_extents);
2197*4882a593Smuzhiyun ret = filemap_fdatawait_range(inode->i_mapping, start, end);
2198*4882a593Smuzhiyun }
2199*4882a593Smuzhiyun
2200*4882a593Smuzhiyun if (ret)
2201*4882a593Smuzhiyun goto out_release_extents;
2202*4882a593Smuzhiyun
2203*4882a593Smuzhiyun atomic_inc(&root->log_batch);
2204*4882a593Smuzhiyun
2205*4882a593Smuzhiyun smp_mb();
2206*4882a593Smuzhiyun if (skip_inode_logging(&ctx)) {
2207*4882a593Smuzhiyun /*
2208*4882a593Smuzhiyun * We've had everything committed since the last time we were
2209*4882a593Smuzhiyun * modified, so clear this flag in case it was set for whatever
2210*4882a593Smuzhiyun * reason; it's no longer relevant.
2211*4882a593Smuzhiyun */
2212*4882a593Smuzhiyun clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2213*4882a593Smuzhiyun &BTRFS_I(inode)->runtime_flags);
2214*4882a593Smuzhiyun /*
2215*4882a593Smuzhiyun * An ordered extent might have started before and completed
2216*4882a593Smuzhiyun * already with io errors, in which case the inode was not
2217*4882a593Smuzhiyun * updated and we end up here. So check the inode's mapping
2218*4882a593Smuzhiyun * for any errors that might have happened since we last
2219*4882a593Smuzhiyun * called fsync.
2220*4882a593Smuzhiyun */
2221*4882a593Smuzhiyun ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
2222*4882a593Smuzhiyun goto out_release_extents;
2223*4882a593Smuzhiyun }
2224*4882a593Smuzhiyun
2225*4882a593Smuzhiyun /*
2226*4882a593Smuzhiyun * We use start here because we will need to wait on the IO to complete
2227*4882a593Smuzhiyun * in btrfs_sync_log, which could require joining a transaction (for
2228*4882a593Smuzhiyun * example checking cross references in the nocow path). If we use join
2229*4882a593Smuzhiyun * here we could get into a situation where we're waiting on IO to
2230*4882a593Smuzhiyun * happen that is blocked on a transaction trying to commit. With start
2231*4882a593Smuzhiyun * we inc the extwriter counter, so we wait for all extwriters to exit
2232*4882a593Smuzhiyun * before we start blocking joiners. This comment is to keep somebody
2233*4882a593Smuzhiyun * from thinking they are super smart and changing this to
2234*4882a593Smuzhiyun * btrfs_join_transaction *cough*Josef*cough*.
2235*4882a593Smuzhiyun */
2236*4882a593Smuzhiyun trans = btrfs_start_transaction(root, 0);
2237*4882a593Smuzhiyun if (IS_ERR(trans)) {
2238*4882a593Smuzhiyun ret = PTR_ERR(trans);
2239*4882a593Smuzhiyun goto out_release_extents;
2240*4882a593Smuzhiyun }
2241*4882a593Smuzhiyun
2242*4882a593Smuzhiyun ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
2243*4882a593Smuzhiyun btrfs_release_log_ctx_extents(&ctx);
2244*4882a593Smuzhiyun if (ret < 0) {
2245*4882a593Smuzhiyun /* Fallthrough and commit/free transaction. */
2246*4882a593Smuzhiyun ret = 1;
2247*4882a593Smuzhiyun }
2248*4882a593Smuzhiyun
2249*4882a593Smuzhiyun /* we've logged all the items and now have a consistent
2250*4882a593Smuzhiyun * version of the file in the log. It is possible that
2251*4882a593Smuzhiyun * someone will come in and modify the file, but that's
2252*4882a593Smuzhiyun * fine because the log is consistent on disk, and we
2253*4882a593Smuzhiyun * have references to all of the file's extents
2254*4882a593Smuzhiyun *
2255*4882a593Smuzhiyun * It is possible that someone will come in and log the
2256*4882a593Smuzhiyun * file again, but that will end up using the synchronization
2257*4882a593Smuzhiyun * inside btrfs_sync_log to keep things safe.
2258*4882a593Smuzhiyun */
2259*4882a593Smuzhiyun up_write(&BTRFS_I(inode)->dio_sem);
2260*4882a593Smuzhiyun inode_unlock(inode);
2261*4882a593Smuzhiyun
2262*4882a593Smuzhiyun if (ret != BTRFS_NO_LOG_SYNC) {
2263*4882a593Smuzhiyun if (!ret) {
2264*4882a593Smuzhiyun ret = btrfs_sync_log(trans, root, &ctx);
2265*4882a593Smuzhiyun if (!ret) {
2266*4882a593Smuzhiyun ret = btrfs_end_transaction(trans);
2267*4882a593Smuzhiyun goto out;
2268*4882a593Smuzhiyun }
2269*4882a593Smuzhiyun }
2270*4882a593Smuzhiyun if (!full_sync) {
2271*4882a593Smuzhiyun ret = btrfs_wait_ordered_range(inode, start, len);
2272*4882a593Smuzhiyun if (ret) {
2273*4882a593Smuzhiyun btrfs_end_transaction(trans);
2274*4882a593Smuzhiyun goto out;
2275*4882a593Smuzhiyun }
2276*4882a593Smuzhiyun }
2277*4882a593Smuzhiyun ret = btrfs_commit_transaction(trans);
2278*4882a593Smuzhiyun } else {
2279*4882a593Smuzhiyun ret = btrfs_end_transaction(trans);
2280*4882a593Smuzhiyun }
2281*4882a593Smuzhiyun out:
2282*4882a593Smuzhiyun ASSERT(list_empty(&ctx.list));
2283*4882a593Smuzhiyun err = file_check_and_advance_wb_err(file);
2284*4882a593Smuzhiyun if (!ret)
2285*4882a593Smuzhiyun ret = err;
2286*4882a593Smuzhiyun return ret > 0 ? -EIO : ret;
2287*4882a593Smuzhiyun
2288*4882a593Smuzhiyun out_release_extents:
2289*4882a593Smuzhiyun btrfs_release_log_ctx_extents(&ctx);
2290*4882a593Smuzhiyun up_write(&BTRFS_I(inode)->dio_sem);
2291*4882a593Smuzhiyun inode_unlock(inode);
2292*4882a593Smuzhiyun goto out;
2293*4882a593Smuzhiyun }
2294*4882a593Smuzhiyun
2295*4882a593Smuzhiyun static const struct vm_operations_struct btrfs_file_vm_ops = {
2296*4882a593Smuzhiyun .fault = filemap_fault,
2297*4882a593Smuzhiyun .map_pages = filemap_map_pages,
2298*4882a593Smuzhiyun .page_mkwrite = btrfs_page_mkwrite,
2299*4882a593Smuzhiyun };
2300*4882a593Smuzhiyun
2301*4882a593Smuzhiyun static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
2302*4882a593Smuzhiyun {
2303*4882a593Smuzhiyun struct address_space *mapping = filp->f_mapping;
2304*4882a593Smuzhiyun
2305*4882a593Smuzhiyun if (!mapping->a_ops->readpage)
2306*4882a593Smuzhiyun return -ENOEXEC;
2307*4882a593Smuzhiyun
2308*4882a593Smuzhiyun file_accessed(filp);
2309*4882a593Smuzhiyun vma->vm_ops = &btrfs_file_vm_ops;
2310*4882a593Smuzhiyun
2311*4882a593Smuzhiyun return 0;
2312*4882a593Smuzhiyun }
2313*4882a593Smuzhiyun
2314*4882a593Smuzhiyun static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2315*4882a593Smuzhiyun int slot, u64 start, u64 end)
2316*4882a593Smuzhiyun {
2317*4882a593Smuzhiyun struct btrfs_file_extent_item *fi;
2318*4882a593Smuzhiyun struct btrfs_key key;
2319*4882a593Smuzhiyun
2320*4882a593Smuzhiyun if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2321*4882a593Smuzhiyun return 0;
2322*4882a593Smuzhiyun
2323*4882a593Smuzhiyun btrfs_item_key_to_cpu(leaf, &key, slot);
2324*4882a593Smuzhiyun if (key.objectid != btrfs_ino(inode) ||
2325*4882a593Smuzhiyun key.type != BTRFS_EXTENT_DATA_KEY)
2326*4882a593Smuzhiyun return 0;
2327*4882a593Smuzhiyun
2328*4882a593Smuzhiyun fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2329*4882a593Smuzhiyun
2330*4882a593Smuzhiyun if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2331*4882a593Smuzhiyun return 0;
2332*4882a593Smuzhiyun
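	/*
	 * A disk_bytenr of zero means the neighbouring extent item is itself
	 * a hole; only holes can be merged with the hole we are inserting.
	 */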
2333*4882a593Smuzhiyun if (btrfs_file_extent_disk_bytenr(leaf, fi))
2334*4882a593Smuzhiyun return 0;
2335*4882a593Smuzhiyun
2336*4882a593Smuzhiyun if (key.offset == end)
2337*4882a593Smuzhiyun return 1;
2338*4882a593Smuzhiyun if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2339*4882a593Smuzhiyun return 1;
2340*4882a593Smuzhiyun return 0;
2341*4882a593Smuzhiyun }
2342*4882a593Smuzhiyun
2343*4882a593Smuzhiyun static int fill_holes(struct btrfs_trans_handle *trans,
2344*4882a593Smuzhiyun struct btrfs_inode *inode,
2345*4882a593Smuzhiyun struct btrfs_path *path, u64 offset, u64 end)
2346*4882a593Smuzhiyun {
2347*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = trans->fs_info;
2348*4882a593Smuzhiyun struct btrfs_root *root = inode->root;
2349*4882a593Smuzhiyun struct extent_buffer *leaf;
2350*4882a593Smuzhiyun struct btrfs_file_extent_item *fi;
2351*4882a593Smuzhiyun struct extent_map *hole_em;
2352*4882a593Smuzhiyun struct extent_map_tree *em_tree = &inode->extent_tree;
2353*4882a593Smuzhiyun struct btrfs_key key;
2354*4882a593Smuzhiyun int ret;
2355*4882a593Smuzhiyun
2356*4882a593Smuzhiyun if (btrfs_fs_incompat(fs_info, NO_HOLES))
2357*4882a593Smuzhiyun goto out;
2358*4882a593Smuzhiyun
2359*4882a593Smuzhiyun key.objectid = btrfs_ino(inode);
2360*4882a593Smuzhiyun key.type = BTRFS_EXTENT_DATA_KEY;
2361*4882a593Smuzhiyun key.offset = offset;
2362*4882a593Smuzhiyun
2363*4882a593Smuzhiyun ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2364*4882a593Smuzhiyun if (ret <= 0) {
2365*4882a593Smuzhiyun /*
2366*4882a593Smuzhiyun * We should have dropped this offset, so if we find it then
2367*4882a593Smuzhiyun * something has gone horribly wrong.
2368*4882a593Smuzhiyun */
2369*4882a593Smuzhiyun if (ret == 0)
2370*4882a593Smuzhiyun ret = -EINVAL;
2371*4882a593Smuzhiyun return ret;
2372*4882a593Smuzhiyun }
2373*4882a593Smuzhiyun
2374*4882a593Smuzhiyun leaf = path->nodes[0];
2375*4882a593Smuzhiyun if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2376*4882a593Smuzhiyun u64 num_bytes;
2377*4882a593Smuzhiyun
2378*4882a593Smuzhiyun path->slots[0]--;
2379*4882a593Smuzhiyun fi = btrfs_item_ptr(leaf, path->slots[0],
2380*4882a593Smuzhiyun struct btrfs_file_extent_item);
2381*4882a593Smuzhiyun num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2382*4882a593Smuzhiyun end - offset;
2383*4882a593Smuzhiyun btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2384*4882a593Smuzhiyun btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2385*4882a593Smuzhiyun btrfs_set_file_extent_offset(leaf, fi, 0);
2386*4882a593Smuzhiyun btrfs_mark_buffer_dirty(leaf);
2387*4882a593Smuzhiyun goto out;
2388*4882a593Smuzhiyun }
2389*4882a593Smuzhiyun
2390*4882a593Smuzhiyun if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2391*4882a593Smuzhiyun u64 num_bytes;
2392*4882a593Smuzhiyun
2393*4882a593Smuzhiyun key.offset = offset;
2394*4882a593Smuzhiyun btrfs_set_item_key_safe(fs_info, path, &key);
2395*4882a593Smuzhiyun fi = btrfs_item_ptr(leaf, path->slots[0],
2396*4882a593Smuzhiyun struct btrfs_file_extent_item);
2397*4882a593Smuzhiyun num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2398*4882a593Smuzhiyun offset;
2399*4882a593Smuzhiyun btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2400*4882a593Smuzhiyun btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2401*4882a593Smuzhiyun btrfs_set_file_extent_offset(leaf, fi, 0);
2402*4882a593Smuzhiyun btrfs_mark_buffer_dirty(leaf);
2403*4882a593Smuzhiyun goto out;
2404*4882a593Smuzhiyun }
2405*4882a593Smuzhiyun btrfs_release_path(path);
2406*4882a593Smuzhiyun
2407*4882a593Smuzhiyun ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
2408*4882a593Smuzhiyun offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
2409*4882a593Smuzhiyun if (ret)
2410*4882a593Smuzhiyun return ret;
2411*4882a593Smuzhiyun
2412*4882a593Smuzhiyun out:
2413*4882a593Smuzhiyun btrfs_release_path(path);
2414*4882a593Smuzhiyun
2415*4882a593Smuzhiyun hole_em = alloc_extent_map();
2416*4882a593Smuzhiyun if (!hole_em) {
2417*4882a593Smuzhiyun btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2418*4882a593Smuzhiyun set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
2419*4882a593Smuzhiyun } else {
2420*4882a593Smuzhiyun hole_em->start = offset;
2421*4882a593Smuzhiyun hole_em->len = end - offset;
2422*4882a593Smuzhiyun hole_em->ram_bytes = hole_em->len;
2423*4882a593Smuzhiyun hole_em->orig_start = offset;
2424*4882a593Smuzhiyun
2425*4882a593Smuzhiyun hole_em->block_start = EXTENT_MAP_HOLE;
2426*4882a593Smuzhiyun hole_em->block_len = 0;
2427*4882a593Smuzhiyun hole_em->orig_block_len = 0;
2428*4882a593Smuzhiyun hole_em->compress_type = BTRFS_COMPRESS_NONE;
2429*4882a593Smuzhiyun hole_em->generation = trans->transid;
2430*4882a593Smuzhiyun
2431*4882a593Smuzhiyun do {
2432*4882a593Smuzhiyun btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2433*4882a593Smuzhiyun write_lock(&em_tree->lock);
2434*4882a593Smuzhiyun ret = add_extent_mapping(em_tree, hole_em, 1);
2435*4882a593Smuzhiyun write_unlock(&em_tree->lock);
2436*4882a593Smuzhiyun } while (ret == -EEXIST);
2437*4882a593Smuzhiyun free_extent_map(hole_em);
2438*4882a593Smuzhiyun if (ret)
2439*4882a593Smuzhiyun set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2440*4882a593Smuzhiyun &inode->runtime_flags);
2441*4882a593Smuzhiyun }
2442*4882a593Smuzhiyun
2443*4882a593Smuzhiyun return 0;
2444*4882a593Smuzhiyun }
2445*4882a593Smuzhiyun
2446*4882a593Smuzhiyun /*
2447*4882a593Smuzhiyun  * Find a hole extent on the given inode and change start/len to the end of
2448*4882a593Smuzhiyun  * the hole extent (a hole/vacuum extent whose em->start <= start &&
2449*4882a593Smuzhiyun  * em->start + em->len > start).
2450*4882a593Smuzhiyun  * When a hole extent is found, return 1 and modify start/len.
2451*4882a593Smuzhiyun  */
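/*
 * For example, with a 4K sector size, *start = 5000 and *len = 3000, if the
 * extent map at offset 4096 is a hole covering [4096, 16384), the whole
 * requested range lies inside the hole: we return 1 with *start = 16384 and
 * *len = 0.
 */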
2452*4882a593Smuzhiyun static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len)
2453*4882a593Smuzhiyun {
2454*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2455*4882a593Smuzhiyun struct extent_map *em;
2456*4882a593Smuzhiyun int ret = 0;
2457*4882a593Smuzhiyun
2458*4882a593Smuzhiyun em = btrfs_get_extent(BTRFS_I(inode), NULL, 0,
2459*4882a593Smuzhiyun round_down(*start, fs_info->sectorsize),
2460*4882a593Smuzhiyun round_up(*len, fs_info->sectorsize));
2461*4882a593Smuzhiyun if (IS_ERR(em))
2462*4882a593Smuzhiyun return PTR_ERR(em);
2463*4882a593Smuzhiyun
2464*4882a593Smuzhiyun /* Hole or vacuum extent(only exists in no-hole mode) */
2465*4882a593Smuzhiyun if (em->block_start == EXTENT_MAP_HOLE) {
2466*4882a593Smuzhiyun ret = 1;
2467*4882a593Smuzhiyun *len = em->start + em->len > *start + *len ?
2468*4882a593Smuzhiyun 0 : *start + *len - em->start - em->len;
2469*4882a593Smuzhiyun *start = em->start + em->len;
2470*4882a593Smuzhiyun }
2471*4882a593Smuzhiyun free_extent_map(em);
2472*4882a593Smuzhiyun return ret;
2473*4882a593Smuzhiyun }
2474*4882a593Smuzhiyun
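/*
 * Truncate the page cache for the range, then take the extent lock and make
 * sure there is neither an ordered extent overlapping the range nor a page
 * that was faulted back in while the cache was dropped. If either is found,
 * drop the lock, wait for ordered IO on the range and retry.
 */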
2475*4882a593Smuzhiyun static int btrfs_punch_hole_lock_range(struct inode *inode,
2476*4882a593Smuzhiyun const u64 lockstart,
2477*4882a593Smuzhiyun const u64 lockend,
2478*4882a593Smuzhiyun struct extent_state **cached_state)
2479*4882a593Smuzhiyun {
2480*4882a593Smuzhiyun while (1) {
2481*4882a593Smuzhiyun struct btrfs_ordered_extent *ordered;
2482*4882a593Smuzhiyun int ret;
2483*4882a593Smuzhiyun
2484*4882a593Smuzhiyun truncate_pagecache_range(inode, lockstart, lockend);
2485*4882a593Smuzhiyun
2486*4882a593Smuzhiyun lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2487*4882a593Smuzhiyun cached_state);
2488*4882a593Smuzhiyun ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
2489*4882a593Smuzhiyun lockend);
2490*4882a593Smuzhiyun
2491*4882a593Smuzhiyun /*
2492*4882a593Smuzhiyun * We need to make sure we have no ordered extents in this range
2493*4882a593Smuzhiyun  * and that nobody raced in and read a page in this range; if
2494*4882a593Smuzhiyun  * that happened we need to try again.
2495*4882a593Smuzhiyun */
2496*4882a593Smuzhiyun if ((!ordered ||
2497*4882a593Smuzhiyun (ordered->file_offset + ordered->num_bytes <= lockstart ||
2498*4882a593Smuzhiyun ordered->file_offset > lockend)) &&
2499*4882a593Smuzhiyun !filemap_range_has_page(inode->i_mapping,
2500*4882a593Smuzhiyun lockstart, lockend)) {
2501*4882a593Smuzhiyun if (ordered)
2502*4882a593Smuzhiyun btrfs_put_ordered_extent(ordered);
2503*4882a593Smuzhiyun break;
2504*4882a593Smuzhiyun }
2505*4882a593Smuzhiyun if (ordered)
2506*4882a593Smuzhiyun btrfs_put_ordered_extent(ordered);
2507*4882a593Smuzhiyun unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2508*4882a593Smuzhiyun lockend, cached_state);
2509*4882a593Smuzhiyun ret = btrfs_wait_ordered_range(inode, lockstart,
2510*4882a593Smuzhiyun lockend - lockstart + 1);
2511*4882a593Smuzhiyun if (ret)
2512*4882a593Smuzhiyun return ret;
2513*4882a593Smuzhiyun }
2514*4882a593Smuzhiyun return 0;
2515*4882a593Smuzhiyun }
2516*4882a593Smuzhiyun
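/*
 * Insert a new file extent item for the replacement extent described by
 * @extent_info at @extent_info->file_offset, covering @replace_len bytes.
 * For a real (non-hole) extent this also accounts the bytes to the inode and
 * either adds the reserved extent item (for a newly allocated extent on its
 * first insertion) or increments the reference on the existing disk extent.
 */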
2517*4882a593Smuzhiyun static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2518*4882a593Smuzhiyun struct inode *inode,
2519*4882a593Smuzhiyun struct btrfs_path *path,
2520*4882a593Smuzhiyun struct btrfs_replace_extent_info *extent_info,
2521*4882a593Smuzhiyun const u64 replace_len)
2522*4882a593Smuzhiyun {
2523*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2524*4882a593Smuzhiyun struct btrfs_root *root = BTRFS_I(inode)->root;
2525*4882a593Smuzhiyun struct btrfs_file_extent_item *extent;
2526*4882a593Smuzhiyun struct extent_buffer *leaf;
2527*4882a593Smuzhiyun struct btrfs_key key;
2528*4882a593Smuzhiyun int slot;
2529*4882a593Smuzhiyun struct btrfs_ref ref = { 0 };
2530*4882a593Smuzhiyun int ret;
2531*4882a593Smuzhiyun
2532*4882a593Smuzhiyun if (replace_len == 0)
2533*4882a593Smuzhiyun return 0;
2534*4882a593Smuzhiyun
2535*4882a593Smuzhiyun if (extent_info->disk_offset == 0 &&
2536*4882a593Smuzhiyun btrfs_fs_incompat(fs_info, NO_HOLES))
2537*4882a593Smuzhiyun return 0;
2538*4882a593Smuzhiyun
2539*4882a593Smuzhiyun key.objectid = btrfs_ino(BTRFS_I(inode));
2540*4882a593Smuzhiyun key.type = BTRFS_EXTENT_DATA_KEY;
2541*4882a593Smuzhiyun key.offset = extent_info->file_offset;
2542*4882a593Smuzhiyun ret = btrfs_insert_empty_item(trans, root, path, &key,
2543*4882a593Smuzhiyun sizeof(struct btrfs_file_extent_item));
2544*4882a593Smuzhiyun if (ret)
2545*4882a593Smuzhiyun return ret;
2546*4882a593Smuzhiyun leaf = path->nodes[0];
2547*4882a593Smuzhiyun slot = path->slots[0];
2548*4882a593Smuzhiyun write_extent_buffer(leaf, extent_info->extent_buf,
2549*4882a593Smuzhiyun btrfs_item_ptr_offset(leaf, slot),
2550*4882a593Smuzhiyun sizeof(struct btrfs_file_extent_item));
2551*4882a593Smuzhiyun extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2552*4882a593Smuzhiyun ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2553*4882a593Smuzhiyun btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2554*4882a593Smuzhiyun btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2555*4882a593Smuzhiyun if (extent_info->is_new_extent)
2556*4882a593Smuzhiyun btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2557*4882a593Smuzhiyun btrfs_mark_buffer_dirty(leaf);
2558*4882a593Smuzhiyun btrfs_release_path(path);
2559*4882a593Smuzhiyun
2560*4882a593Smuzhiyun ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode),
2561*4882a593Smuzhiyun extent_info->file_offset, replace_len);
2562*4882a593Smuzhiyun if (ret)
2563*4882a593Smuzhiyun return ret;
2564*4882a593Smuzhiyun
2565*4882a593Smuzhiyun /* If it's a hole, nothing more needs to be done. */
2566*4882a593Smuzhiyun if (extent_info->disk_offset == 0)
2567*4882a593Smuzhiyun return 0;
2568*4882a593Smuzhiyun
2569*4882a593Smuzhiyun inode_add_bytes(inode, replace_len);
2570*4882a593Smuzhiyun
2571*4882a593Smuzhiyun if (extent_info->is_new_extent && extent_info->insertions == 0) {
2572*4882a593Smuzhiyun key.objectid = extent_info->disk_offset;
2573*4882a593Smuzhiyun key.type = BTRFS_EXTENT_ITEM_KEY;
2574*4882a593Smuzhiyun key.offset = extent_info->disk_len;
2575*4882a593Smuzhiyun ret = btrfs_alloc_reserved_file_extent(trans, root,
2576*4882a593Smuzhiyun btrfs_ino(BTRFS_I(inode)),
2577*4882a593Smuzhiyun extent_info->file_offset,
2578*4882a593Smuzhiyun extent_info->qgroup_reserved,
2579*4882a593Smuzhiyun &key);
2580*4882a593Smuzhiyun } else {
2581*4882a593Smuzhiyun u64 ref_offset;
2582*4882a593Smuzhiyun
2583*4882a593Smuzhiyun btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2584*4882a593Smuzhiyun extent_info->disk_offset,
2585*4882a593Smuzhiyun extent_info->disk_len, 0);
2586*4882a593Smuzhiyun ref_offset = extent_info->file_offset - extent_info->data_offset;
2587*4882a593Smuzhiyun btrfs_init_data_ref(&ref, root->root_key.objectid,
2588*4882a593Smuzhiyun btrfs_ino(BTRFS_I(inode)), ref_offset);
2589*4882a593Smuzhiyun ret = btrfs_inc_extent_ref(trans, &ref);
2590*4882a593Smuzhiyun }
2591*4882a593Smuzhiyun
2592*4882a593Smuzhiyun extent_info->insertions++;
2593*4882a593Smuzhiyun
2594*4882a593Smuzhiyun return ret;
2595*4882a593Smuzhiyun }
2596*4882a593Smuzhiyun
2597*4882a593Smuzhiyun /*
2598*4882a593Smuzhiyun * The respective range must have been previously locked, as well as the inode.
2599*4882a593Smuzhiyun * The end offset is inclusive (last byte of the range).
2600*4882a593Smuzhiyun * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2601*4882a593Smuzhiyun * the file range with an extent.
2602*4882a593Smuzhiyun * When not punching a hole, we don't want to end up in a state where we dropped
2603*4882a593Smuzhiyun * extents without inserting a new one, so we must abort the transaction to avoid
2604*4882a593Smuzhiyun * a corruption.
2605*4882a593Smuzhiyun */
2606*4882a593Smuzhiyun int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
2607*4882a593Smuzhiyun const u64 start, const u64 end,
2608*4882a593Smuzhiyun struct btrfs_replace_extent_info *extent_info,
2609*4882a593Smuzhiyun struct btrfs_trans_handle **trans_out)
2610*4882a593Smuzhiyun {
2611*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2612*4882a593Smuzhiyun u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2613*4882a593Smuzhiyun u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
2614*4882a593Smuzhiyun struct btrfs_root *root = BTRFS_I(inode)->root;
2615*4882a593Smuzhiyun struct btrfs_trans_handle *trans = NULL;
2616*4882a593Smuzhiyun struct btrfs_block_rsv *rsv;
2617*4882a593Smuzhiyun unsigned int rsv_count;
2618*4882a593Smuzhiyun u64 cur_offset;
2619*4882a593Smuzhiyun u64 drop_end;
2620*4882a593Smuzhiyun u64 len = end - start;
2621*4882a593Smuzhiyun int ret = 0;
2622*4882a593Smuzhiyun
2623*4882a593Smuzhiyun if (end <= start)
2624*4882a593Smuzhiyun return -EINVAL;
2625*4882a593Smuzhiyun
2626*4882a593Smuzhiyun rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2627*4882a593Smuzhiyun if (!rsv) {
2628*4882a593Smuzhiyun ret = -ENOMEM;
2629*4882a593Smuzhiyun goto out;
2630*4882a593Smuzhiyun }
2631*4882a593Smuzhiyun rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2632*4882a593Smuzhiyun rsv->failfast = 1;
2633*4882a593Smuzhiyun
2634*4882a593Smuzhiyun /*
2635*4882a593Smuzhiyun * 1 - update the inode
2636*4882a593Smuzhiyun * 1 - removing the extents in the range
2637*4882a593Smuzhiyun * 1 - adding the hole extent if no_holes isn't set or if we are
2638*4882a593Smuzhiyun * replacing the range with a new extent
2639*4882a593Smuzhiyun */
2640*4882a593Smuzhiyun if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2641*4882a593Smuzhiyun rsv_count = 3;
2642*4882a593Smuzhiyun else
2643*4882a593Smuzhiyun rsv_count = 2;
2644*4882a593Smuzhiyun
2645*4882a593Smuzhiyun trans = btrfs_start_transaction(root, rsv_count);
2646*4882a593Smuzhiyun if (IS_ERR(trans)) {
2647*4882a593Smuzhiyun ret = PTR_ERR(trans);
2648*4882a593Smuzhiyun trans = NULL;
2649*4882a593Smuzhiyun goto out_free;
2650*4882a593Smuzhiyun }
2651*4882a593Smuzhiyun
2652*4882a593Smuzhiyun ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2653*4882a593Smuzhiyun min_size, false);
2654*4882a593Smuzhiyun BUG_ON(ret);
2655*4882a593Smuzhiyun trans->block_rsv = rsv;
2656*4882a593Smuzhiyun
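	/*
	 * Drop the extents in the target range in chunks. After each chunk we
	 * fill the resulting hole (or insert part of the replacement extent),
	 * update the inode and restart the transaction, so that dropping a
	 * large range does not pin a single transaction for too long.
	 */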
2657*4882a593Smuzhiyun cur_offset = start;
2658*4882a593Smuzhiyun while (cur_offset < end) {
2659*4882a593Smuzhiyun ret = __btrfs_drop_extents(trans, root, BTRFS_I(inode), path,
2660*4882a593Smuzhiyun cur_offset, end + 1, &drop_end,
2661*4882a593Smuzhiyun 1, 0, 0, NULL);
2662*4882a593Smuzhiyun if (ret != -ENOSPC) {
2663*4882a593Smuzhiyun /*
2664*4882a593Smuzhiyun * The only time we don't want to abort is if we are
2665*4882a593Smuzhiyun * attempting to clone a partial inline extent, in which
2666*4882a593Smuzhiyun * case we'll get EOPNOTSUPP. However if we aren't
2667*4882a593Smuzhiyun  * cloning we need to abort no matter what, because if we
2668*4882a593Smuzhiyun * got EOPNOTSUPP via prealloc then we messed up and
2669*4882a593Smuzhiyun * need to abort.
2670*4882a593Smuzhiyun */
2671*4882a593Smuzhiyun if (ret &&
2672*4882a593Smuzhiyun (ret != -EOPNOTSUPP ||
2673*4882a593Smuzhiyun (extent_info && extent_info->is_new_extent)))
2674*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
2675*4882a593Smuzhiyun break;
2676*4882a593Smuzhiyun }
2677*4882a593Smuzhiyun
2678*4882a593Smuzhiyun trans->block_rsv = &fs_info->trans_block_rsv;
2679*4882a593Smuzhiyun
2680*4882a593Smuzhiyun if (!extent_info && cur_offset < drop_end &&
2681*4882a593Smuzhiyun cur_offset < ino_size) {
2682*4882a593Smuzhiyun ret = fill_holes(trans, BTRFS_I(inode), path,
2683*4882a593Smuzhiyun cur_offset, drop_end);
2684*4882a593Smuzhiyun if (ret) {
2685*4882a593Smuzhiyun /*
2686*4882a593Smuzhiyun * If we failed then we didn't insert our hole
2687*4882a593Smuzhiyun * entries for the area we dropped, so now the
2688*4882a593Smuzhiyun * fs is corrupted, so we must abort the
2689*4882a593Smuzhiyun * transaction.
2690*4882a593Smuzhiyun */
2691*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
2692*4882a593Smuzhiyun break;
2693*4882a593Smuzhiyun }
2694*4882a593Smuzhiyun } else if (!extent_info && cur_offset < drop_end) {
2695*4882a593Smuzhiyun /*
2696*4882a593Smuzhiyun * We are past the i_size here, but since we didn't
2697*4882a593Smuzhiyun * insert holes we need to clear the mapped area so we
2698*4882a593Smuzhiyun * know to not set disk_i_size in this area until a new
2699*4882a593Smuzhiyun * file extent is inserted here.
2700*4882a593Smuzhiyun */
2701*4882a593Smuzhiyun ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
2702*4882a593Smuzhiyun cur_offset, drop_end - cur_offset);
2703*4882a593Smuzhiyun if (ret) {
2704*4882a593Smuzhiyun /*
2705*4882a593Smuzhiyun * We couldn't clear our area, so we could
2706*4882a593Smuzhiyun * presumably adjust up and corrupt the fs, so
2707*4882a593Smuzhiyun * we need to abort.
2708*4882a593Smuzhiyun */
2709*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
2710*4882a593Smuzhiyun break;
2711*4882a593Smuzhiyun }
2712*4882a593Smuzhiyun }
2713*4882a593Smuzhiyun
2714*4882a593Smuzhiyun if (extent_info && drop_end > extent_info->file_offset) {
2715*4882a593Smuzhiyun u64 replace_len = drop_end - extent_info->file_offset;
2716*4882a593Smuzhiyun
2717*4882a593Smuzhiyun ret = btrfs_insert_replace_extent(trans, inode, path,
2718*4882a593Smuzhiyun extent_info, replace_len);
2719*4882a593Smuzhiyun if (ret) {
2720*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
2721*4882a593Smuzhiyun break;
2722*4882a593Smuzhiyun }
2723*4882a593Smuzhiyun extent_info->data_len -= replace_len;
2724*4882a593Smuzhiyun extent_info->data_offset += replace_len;
2725*4882a593Smuzhiyun extent_info->file_offset += replace_len;
2726*4882a593Smuzhiyun }
2727*4882a593Smuzhiyun
2728*4882a593Smuzhiyun cur_offset = drop_end;
2729*4882a593Smuzhiyun
2730*4882a593Smuzhiyun ret = btrfs_update_inode(trans, root, inode);
2731*4882a593Smuzhiyun if (ret)
2732*4882a593Smuzhiyun break;
2733*4882a593Smuzhiyun
2734*4882a593Smuzhiyun btrfs_end_transaction(trans);
2735*4882a593Smuzhiyun btrfs_btree_balance_dirty(fs_info);
2736*4882a593Smuzhiyun
2737*4882a593Smuzhiyun trans = btrfs_start_transaction(root, rsv_count);
2738*4882a593Smuzhiyun if (IS_ERR(trans)) {
2739*4882a593Smuzhiyun ret = PTR_ERR(trans);
2740*4882a593Smuzhiyun trans = NULL;
2741*4882a593Smuzhiyun break;
2742*4882a593Smuzhiyun }
2743*4882a593Smuzhiyun
2744*4882a593Smuzhiyun ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2745*4882a593Smuzhiyun rsv, min_size, false);
2746*4882a593Smuzhiyun BUG_ON(ret); /* shouldn't happen */
2747*4882a593Smuzhiyun trans->block_rsv = rsv;
2748*4882a593Smuzhiyun
2749*4882a593Smuzhiyun if (!extent_info) {
2750*4882a593Smuzhiyun ret = find_first_non_hole(inode, &cur_offset, &len);
2751*4882a593Smuzhiyun if (unlikely(ret < 0))
2752*4882a593Smuzhiyun break;
2753*4882a593Smuzhiyun if (ret && !len) {
2754*4882a593Smuzhiyun ret = 0;
2755*4882a593Smuzhiyun break;
2756*4882a593Smuzhiyun }
2757*4882a593Smuzhiyun }
2758*4882a593Smuzhiyun }
2759*4882a593Smuzhiyun
2760*4882a593Smuzhiyun /*
2761*4882a593Smuzhiyun  * If we were cloning, force the next fsync to be a full one since we
2762*4882a593Smuzhiyun  * replaced (or just dropped in the case of cloning holes when
2763*4882a593Smuzhiyun  * NO_HOLES is enabled) extents and extent maps.
2764*4882a593Smuzhiyun  * This is for the sake of simplicity, and cloning into files larger
2765*4882a593Smuzhiyun  * than 16Mb would force a full fsync anyway (when
2766*4882a593Smuzhiyun  * try_release_extent_mapping() is invoked during page cache truncation).
2767*4882a593Smuzhiyun */
2768*4882a593Smuzhiyun if (extent_info && !extent_info->is_new_extent)
2769*4882a593Smuzhiyun set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2770*4882a593Smuzhiyun &BTRFS_I(inode)->runtime_flags);
2771*4882a593Smuzhiyun
2772*4882a593Smuzhiyun if (ret)
2773*4882a593Smuzhiyun goto out_trans;
2774*4882a593Smuzhiyun
2775*4882a593Smuzhiyun trans->block_rsv = &fs_info->trans_block_rsv;
2776*4882a593Smuzhiyun /*
2777*4882a593Smuzhiyun  * If we are using the NO_HOLES feature we might already have had a
2778*4882a593Smuzhiyun  * hole that overlaps a part of the region [lockstart, lockend] and
2779*4882a593Smuzhiyun * ends at (or beyond) lockend. Since we have no file extent items to
2780*4882a593Smuzhiyun * represent holes, drop_end can be less than lockend and so we must
2781*4882a593Smuzhiyun * make sure we have an extent map representing the existing hole (the
2782*4882a593Smuzhiyun * call to __btrfs_drop_extents() might have dropped the existing extent
2783*4882a593Smuzhiyun * map representing the existing hole), otherwise the fast fsync path
2784*4882a593Smuzhiyun * will not record the existence of the hole region
2785*4882a593Smuzhiyun * [existing_hole_start, lockend].
2786*4882a593Smuzhiyun */
2787*4882a593Smuzhiyun if (drop_end <= end)
2788*4882a593Smuzhiyun drop_end = end + 1;
2789*4882a593Smuzhiyun /*
2790*4882a593Smuzhiyun * Don't insert file hole extent item if it's for a range beyond eof
2791*4882a593Smuzhiyun  * (because it's useless) or if it represents a zero-length range (when
2792*4882a593Smuzhiyun * cur_offset == drop_end).
2793*4882a593Smuzhiyun */
2794*4882a593Smuzhiyun if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) {
2795*4882a593Smuzhiyun ret = fill_holes(trans, BTRFS_I(inode), path,
2796*4882a593Smuzhiyun cur_offset, drop_end);
2797*4882a593Smuzhiyun if (ret) {
2798*4882a593Smuzhiyun /* Same comment as above. */
2799*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
2800*4882a593Smuzhiyun goto out_trans;
2801*4882a593Smuzhiyun }
2802*4882a593Smuzhiyun } else if (!extent_info && cur_offset < drop_end) {
2803*4882a593Smuzhiyun /* See the comment in the loop above for the reasoning here. */
2804*4882a593Smuzhiyun ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
2805*4882a593Smuzhiyun cur_offset, drop_end - cur_offset);
2806*4882a593Smuzhiyun if (ret) {
2807*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
2808*4882a593Smuzhiyun goto out_trans;
2809*4882a593Smuzhiyun }
2810*4882a593Smuzhiyun
2811*4882a593Smuzhiyun }
2812*4882a593Smuzhiyun if (extent_info) {
2813*4882a593Smuzhiyun ret = btrfs_insert_replace_extent(trans, inode, path, extent_info,
2814*4882a593Smuzhiyun extent_info->data_len);
2815*4882a593Smuzhiyun if (ret) {
2816*4882a593Smuzhiyun btrfs_abort_transaction(trans, ret);
2817*4882a593Smuzhiyun goto out_trans;
2818*4882a593Smuzhiyun }
2819*4882a593Smuzhiyun }
2820*4882a593Smuzhiyun
2821*4882a593Smuzhiyun out_trans:
2822*4882a593Smuzhiyun if (!trans)
2823*4882a593Smuzhiyun goto out_free;
2824*4882a593Smuzhiyun
2825*4882a593Smuzhiyun trans->block_rsv = &fs_info->trans_block_rsv;
2826*4882a593Smuzhiyun if (ret)
2827*4882a593Smuzhiyun btrfs_end_transaction(trans);
2828*4882a593Smuzhiyun else
2829*4882a593Smuzhiyun *trans_out = trans;
2830*4882a593Smuzhiyun out_free:
2831*4882a593Smuzhiyun btrfs_free_block_rsv(fs_info, rsv);
2832*4882a593Smuzhiyun out:
2833*4882a593Smuzhiyun return ret;
2834*4882a593Smuzhiyun }
2835*4882a593Smuzhiyun
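/*
 * Implement FALLOC_FL_PUNCH_HOLE: zero out the unaligned head and tail blocks
 * in place, then drop all extents in the block-aligned middle of the range
 * and replace them with hole extent items (unless NO_HOLES is enabled).
 */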
2836*4882a593Smuzhiyun static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2837*4882a593Smuzhiyun {
2838*4882a593Smuzhiyun struct inode *inode = file_inode(file);
2839*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2840*4882a593Smuzhiyun struct btrfs_root *root = BTRFS_I(inode)->root;
2841*4882a593Smuzhiyun struct extent_state *cached_state = NULL;
2842*4882a593Smuzhiyun struct btrfs_path *path;
2843*4882a593Smuzhiyun struct btrfs_trans_handle *trans = NULL;
2844*4882a593Smuzhiyun u64 lockstart;
2845*4882a593Smuzhiyun u64 lockend;
2846*4882a593Smuzhiyun u64 tail_start;
2847*4882a593Smuzhiyun u64 tail_len;
2848*4882a593Smuzhiyun u64 orig_start = offset;
2849*4882a593Smuzhiyun int ret = 0;
2850*4882a593Smuzhiyun bool same_block;
2851*4882a593Smuzhiyun u64 ino_size;
2852*4882a593Smuzhiyun bool truncated_block = false;
2853*4882a593Smuzhiyun bool updated_inode = false;
2854*4882a593Smuzhiyun
2855*4882a593Smuzhiyun ret = btrfs_wait_ordered_range(inode, offset, len);
2856*4882a593Smuzhiyun if (ret)
2857*4882a593Smuzhiyun return ret;
2858*4882a593Smuzhiyun
2859*4882a593Smuzhiyun inode_lock(inode);
2860*4882a593Smuzhiyun ino_size = round_up(inode->i_size, fs_info->sectorsize);
2861*4882a593Smuzhiyun ret = find_first_non_hole(inode, &offset, &len);
2862*4882a593Smuzhiyun if (ret < 0)
2863*4882a593Smuzhiyun goto out_only_mutex;
2864*4882a593Smuzhiyun if (ret && !len) {
2865*4882a593Smuzhiyun /* Already in a large hole */
2866*4882a593Smuzhiyun ret = 0;
2867*4882a593Smuzhiyun goto out_only_mutex;
2868*4882a593Smuzhiyun }
2869*4882a593Smuzhiyun
2870*4882a593Smuzhiyun ret = file_modified(file);
2871*4882a593Smuzhiyun if (ret)
2872*4882a593Smuzhiyun goto out_only_mutex;
2873*4882a593Smuzhiyun
2874*4882a593Smuzhiyun lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
2875*4882a593Smuzhiyun lockend = round_down(offset + len,
2876*4882a593Smuzhiyun btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
2877*4882a593Smuzhiyun same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2878*4882a593Smuzhiyun == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2879*4882a593Smuzhiyun /*
2880*4882a593Smuzhiyun * We needn't truncate any block which is beyond the end of the file
2881*4882a593Smuzhiyun * because we are sure there is no data there.
2882*4882a593Smuzhiyun */
2883*4882a593Smuzhiyun /*
2884*4882a593Smuzhiyun * Only do this if we are in the same block and we aren't doing the
2885*4882a593Smuzhiyun * entire block.
2886*4882a593Smuzhiyun */
2887*4882a593Smuzhiyun if (same_block && len < fs_info->sectorsize) {
2888*4882a593Smuzhiyun if (offset < ino_size) {
2889*4882a593Smuzhiyun truncated_block = true;
2890*4882a593Smuzhiyun ret = btrfs_truncate_block(inode, offset, len, 0);
2891*4882a593Smuzhiyun } else {
2892*4882a593Smuzhiyun ret = 0;
2893*4882a593Smuzhiyun }
2894*4882a593Smuzhiyun goto out_only_mutex;
2895*4882a593Smuzhiyun }
2896*4882a593Smuzhiyun
2897*4882a593Smuzhiyun /* zero back part of the first block */
2898*4882a593Smuzhiyun if (offset < ino_size) {
2899*4882a593Smuzhiyun truncated_block = true;
2900*4882a593Smuzhiyun ret = btrfs_truncate_block(inode, offset, 0, 0);
2901*4882a593Smuzhiyun if (ret) {
2902*4882a593Smuzhiyun inode_unlock(inode);
2903*4882a593Smuzhiyun return ret;
2904*4882a593Smuzhiyun }
2905*4882a593Smuzhiyun }
2906*4882a593Smuzhiyun
2907*4882a593Smuzhiyun /* Check the aligned pages after the first unaligned page. If
2908*4882a593Smuzhiyun  * offset != orig_start, the first unaligned page and several
2909*4882a593Smuzhiyun  * following pages are already inside holes, so the extra
2910*4882a593Smuzhiyun  * check can be skipped. */
2911*4882a593Smuzhiyun if (offset == orig_start) {
2912*4882a593Smuzhiyun /* after truncate page, check hole again */
2913*4882a593Smuzhiyun len = offset + len - lockstart;
2914*4882a593Smuzhiyun offset = lockstart;
2915*4882a593Smuzhiyun ret = find_first_non_hole(inode, &offset, &len);
2916*4882a593Smuzhiyun if (ret < 0)
2917*4882a593Smuzhiyun goto out_only_mutex;
2918*4882a593Smuzhiyun if (ret && !len) {
2919*4882a593Smuzhiyun ret = 0;
2920*4882a593Smuzhiyun goto out_only_mutex;
2921*4882a593Smuzhiyun }
2922*4882a593Smuzhiyun lockstart = offset;
2923*4882a593Smuzhiyun }
2924*4882a593Smuzhiyun
2925*4882a593Smuzhiyun /* Check the tail unaligned part is in a hole */
2926*4882a593Smuzhiyun tail_start = lockend + 1;
2927*4882a593Smuzhiyun tail_len = offset + len - tail_start;
2928*4882a593Smuzhiyun if (tail_len) {
2929*4882a593Smuzhiyun ret = find_first_non_hole(inode, &tail_start, &tail_len);
2930*4882a593Smuzhiyun if (unlikely(ret < 0))
2931*4882a593Smuzhiyun goto out_only_mutex;
2932*4882a593Smuzhiyun if (!ret) {
2933*4882a593Smuzhiyun /* zero the front end of the last page */
2934*4882a593Smuzhiyun if (tail_start + tail_len < ino_size) {
2935*4882a593Smuzhiyun truncated_block = true;
2936*4882a593Smuzhiyun ret = btrfs_truncate_block(inode,
2937*4882a593Smuzhiyun tail_start + tail_len,
2938*4882a593Smuzhiyun 0, 1);
2939*4882a593Smuzhiyun if (ret)
2940*4882a593Smuzhiyun goto out_only_mutex;
2941*4882a593Smuzhiyun }
2942*4882a593Smuzhiyun }
2943*4882a593Smuzhiyun }
2944*4882a593Smuzhiyun
2945*4882a593Smuzhiyun if (lockend < lockstart) {
2946*4882a593Smuzhiyun ret = 0;
2947*4882a593Smuzhiyun goto out_only_mutex;
2948*4882a593Smuzhiyun }
2949*4882a593Smuzhiyun
2950*4882a593Smuzhiyun ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2951*4882a593Smuzhiyun &cached_state);
2952*4882a593Smuzhiyun if (ret)
2953*4882a593Smuzhiyun goto out_only_mutex;
2954*4882a593Smuzhiyun
2955*4882a593Smuzhiyun path = btrfs_alloc_path();
2956*4882a593Smuzhiyun if (!path) {
2957*4882a593Smuzhiyun ret = -ENOMEM;
2958*4882a593Smuzhiyun goto out;
2959*4882a593Smuzhiyun }
2960*4882a593Smuzhiyun
2961*4882a593Smuzhiyun ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
2962*4882a593Smuzhiyun &trans);
2963*4882a593Smuzhiyun btrfs_free_path(path);
2964*4882a593Smuzhiyun if (ret)
2965*4882a593Smuzhiyun goto out;
2966*4882a593Smuzhiyun
2967*4882a593Smuzhiyun ASSERT(trans != NULL);
2968*4882a593Smuzhiyun inode_inc_iversion(inode);
2969*4882a593Smuzhiyun inode->i_mtime = inode->i_ctime = current_time(inode);
2970*4882a593Smuzhiyun ret = btrfs_update_inode(trans, root, inode);
2971*4882a593Smuzhiyun updated_inode = true;
2972*4882a593Smuzhiyun btrfs_end_transaction(trans);
2973*4882a593Smuzhiyun btrfs_btree_balance_dirty(fs_info);
2974*4882a593Smuzhiyun out:
2975*4882a593Smuzhiyun unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2976*4882a593Smuzhiyun &cached_state);
2977*4882a593Smuzhiyun out_only_mutex:
2978*4882a593Smuzhiyun if (!updated_inode && truncated_block && !ret) {
2979*4882a593Smuzhiyun /*
2980*4882a593Smuzhiyun * If we only end up zeroing part of a page, we still need to
2981*4882a593Smuzhiyun * update the inode item, so that all the time fields are
2982*4882a593Smuzhiyun * updated as well as the necessary btrfs inode in memory fields
2983*4882a593Smuzhiyun * for detecting, at fsync time, if the inode isn't yet in the
2984*4882a593Smuzhiyun * log tree or it's there but not up to date.
2985*4882a593Smuzhiyun */
2986*4882a593Smuzhiyun struct timespec64 now = current_time(inode);
2987*4882a593Smuzhiyun
2988*4882a593Smuzhiyun inode_inc_iversion(inode);
2989*4882a593Smuzhiyun inode->i_mtime = now;
2990*4882a593Smuzhiyun inode->i_ctime = now;
2991*4882a593Smuzhiyun trans = btrfs_start_transaction(root, 1);
2992*4882a593Smuzhiyun if (IS_ERR(trans)) {
2993*4882a593Smuzhiyun ret = PTR_ERR(trans);
2994*4882a593Smuzhiyun } else {
2995*4882a593Smuzhiyun int ret2;
2996*4882a593Smuzhiyun
2997*4882a593Smuzhiyun ret = btrfs_update_inode(trans, root, inode);
2998*4882a593Smuzhiyun ret2 = btrfs_end_transaction(trans);
2999*4882a593Smuzhiyun if (!ret)
3000*4882a593Smuzhiyun ret = ret2;
3001*4882a593Smuzhiyun }
3002*4882a593Smuzhiyun }
3003*4882a593Smuzhiyun inode_unlock(inode);
3004*4882a593Smuzhiyun return ret;
3005*4882a593Smuzhiyun }
3006*4882a593Smuzhiyun
3007*4882a593Smuzhiyun /* Helper structure to record which range is already reserved */
3008*4882a593Smuzhiyun struct falloc_range {
3009*4882a593Smuzhiyun struct list_head list;
3010*4882a593Smuzhiyun u64 start;
3011*4882a593Smuzhiyun u64 len;
3012*4882a593Smuzhiyun };
3013*4882a593Smuzhiyun
3014*4882a593Smuzhiyun /*
3015*4882a593Smuzhiyun * Helper function to add falloc range
3016*4882a593Smuzhiyun *
3017*4882a593Smuzhiyun  * Caller should have locked a larger extent range containing
3018*4882a593Smuzhiyun  * [start, start + len)
3019*4882a593Smuzhiyun */
3020*4882a593Smuzhiyun static int add_falloc_range(struct list_head *head, u64 start, u64 len)
3021*4882a593Smuzhiyun {
3022*4882a593Smuzhiyun struct falloc_range *prev = NULL;
3023*4882a593Smuzhiyun struct falloc_range *range = NULL;
3024*4882a593Smuzhiyun
3025*4882a593Smuzhiyun if (list_empty(head))
3026*4882a593Smuzhiyun goto insert;
3027*4882a593Smuzhiyun
3028*4882a593Smuzhiyun /*
3029*4882a593Smuzhiyun  * As fallocate iterates in bytenr order, we only need to check
3030*4882a593Smuzhiyun * the last range.
3031*4882a593Smuzhiyun */
3032*4882a593Smuzhiyun prev = list_entry(head->prev, struct falloc_range, list);
3033*4882a593Smuzhiyun if (prev->start + prev->len == start) {
3034*4882a593Smuzhiyun prev->len += len;
3035*4882a593Smuzhiyun return 0;
3036*4882a593Smuzhiyun }
3037*4882a593Smuzhiyun insert:
3038*4882a593Smuzhiyun range = kmalloc(sizeof(*range), GFP_KERNEL);
3039*4882a593Smuzhiyun if (!range)
3040*4882a593Smuzhiyun return -ENOMEM;
3041*4882a593Smuzhiyun range->start = start;
3042*4882a593Smuzhiyun range->len = len;
3043*4882a593Smuzhiyun list_add_tail(&range->list, head);
3044*4882a593Smuzhiyun return 0;
3045*4882a593Smuzhiyun }
3046*4882a593Smuzhiyun
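/*
 * Extend the in-memory and on-disk i_size to @end after an fallocate that
 * grew the file, unless FALLOC_FL_KEEP_SIZE was requested or @end does not
 * actually extend the file.
 */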
3047*4882a593Smuzhiyun static int btrfs_fallocate_update_isize(struct inode *inode,
3048*4882a593Smuzhiyun const u64 end,
3049*4882a593Smuzhiyun const int mode)
3050*4882a593Smuzhiyun {
3051*4882a593Smuzhiyun struct btrfs_trans_handle *trans;
3052*4882a593Smuzhiyun struct btrfs_root *root = BTRFS_I(inode)->root;
3053*4882a593Smuzhiyun int ret;
3054*4882a593Smuzhiyun int ret2;
3055*4882a593Smuzhiyun
3056*4882a593Smuzhiyun if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
3057*4882a593Smuzhiyun return 0;
3058*4882a593Smuzhiyun
3059*4882a593Smuzhiyun trans = btrfs_start_transaction(root, 1);
3060*4882a593Smuzhiyun if (IS_ERR(trans))
3061*4882a593Smuzhiyun return PTR_ERR(trans);
3062*4882a593Smuzhiyun
3063*4882a593Smuzhiyun inode->i_ctime = current_time(inode);
3064*4882a593Smuzhiyun i_size_write(inode, end);
3065*4882a593Smuzhiyun btrfs_inode_safe_disk_i_size_write(inode, 0);
3066*4882a593Smuzhiyun ret = btrfs_update_inode(trans, root, inode);
3067*4882a593Smuzhiyun ret2 = btrfs_end_transaction(trans);
3068*4882a593Smuzhiyun
3069*4882a593Smuzhiyun return ret ? ret : ret2;
3070*4882a593Smuzhiyun }
3071*4882a593Smuzhiyun
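/*
 * Classification of the block containing a zero-range boundary: it may
 * already be a hole, sit inside a prealloc (unwritten) extent, or belong to
 * a written extent that needs partial zeroing.
 */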
3072*4882a593Smuzhiyun enum {
3073*4882a593Smuzhiyun RANGE_BOUNDARY_WRITTEN_EXTENT,
3074*4882a593Smuzhiyun RANGE_BOUNDARY_PREALLOC_EXTENT,
3075*4882a593Smuzhiyun RANGE_BOUNDARY_HOLE,
3076*4882a593Smuzhiyun };
3077*4882a593Smuzhiyun
3078*4882a593Smuzhiyun static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
3079*4882a593Smuzhiyun u64 offset)
3080*4882a593Smuzhiyun {
3081*4882a593Smuzhiyun const u64 sectorsize = btrfs_inode_sectorsize(inode);
3082*4882a593Smuzhiyun struct extent_map *em;
3083*4882a593Smuzhiyun int ret;
3084*4882a593Smuzhiyun
3085*4882a593Smuzhiyun offset = round_down(offset, sectorsize);
3086*4882a593Smuzhiyun em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
3087*4882a593Smuzhiyun if (IS_ERR(em))
3088*4882a593Smuzhiyun return PTR_ERR(em);
3089*4882a593Smuzhiyun
3090*4882a593Smuzhiyun if (em->block_start == EXTENT_MAP_HOLE)
3091*4882a593Smuzhiyun ret = RANGE_BOUNDARY_HOLE;
3092*4882a593Smuzhiyun else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3093*4882a593Smuzhiyun ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
3094*4882a593Smuzhiyun else
3095*4882a593Smuzhiyun ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
3096*4882a593Smuzhiyun
3097*4882a593Smuzhiyun free_extent_map(em);
3098*4882a593Smuzhiyun return ret;
3099*4882a593Smuzhiyun }
3100*4882a593Smuzhiyun
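/*
 * Implement FALLOC_FL_ZERO_RANGE: zero the unaligned head and tail of the
 * range in place when they land on written extents, and turn the aligned
 * middle into a prealloc extent so it reads back as zeroes, updating i_size
 * when the range extends the file and FALLOC_FL_KEEP_SIZE is not set.
 */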
3101*4882a593Smuzhiyun static int btrfs_zero_range(struct inode *inode,
3102*4882a593Smuzhiyun loff_t offset,
3103*4882a593Smuzhiyun loff_t len,
3104*4882a593Smuzhiyun const int mode)
3105*4882a593Smuzhiyun {
3106*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3107*4882a593Smuzhiyun struct extent_map *em;
3108*4882a593Smuzhiyun struct extent_changeset *data_reserved = NULL;
3109*4882a593Smuzhiyun int ret;
3110*4882a593Smuzhiyun u64 alloc_hint = 0;
3111*4882a593Smuzhiyun const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
3112*4882a593Smuzhiyun u64 alloc_start = round_down(offset, sectorsize);
3113*4882a593Smuzhiyun u64 alloc_end = round_up(offset + len, sectorsize);
3114*4882a593Smuzhiyun u64 bytes_to_reserve = 0;
3115*4882a593Smuzhiyun bool space_reserved = false;
3116*4882a593Smuzhiyun
3117*4882a593Smuzhiyun inode_dio_wait(inode);
3118*4882a593Smuzhiyun
3119*4882a593Smuzhiyun em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3120*4882a593Smuzhiyun alloc_end - alloc_start);
3121*4882a593Smuzhiyun if (IS_ERR(em)) {
3122*4882a593Smuzhiyun ret = PTR_ERR(em);
3123*4882a593Smuzhiyun goto out;
3124*4882a593Smuzhiyun }
3125*4882a593Smuzhiyun
3126*4882a593Smuzhiyun /*
3127*4882a593Smuzhiyun * Avoid hole punching and extent allocation for some cases. More cases
3128*4882a593Smuzhiyun  * could be considered, but these are unlikely to be common and we keep things
3129*4882a593Smuzhiyun * as simple as possible for now. Also, intentionally, if the target
3130*4882a593Smuzhiyun * range contains one or more prealloc extents together with regular
3131*4882a593Smuzhiyun * extents and holes, we drop all the existing extents and allocate a
3132*4882a593Smuzhiyun * new prealloc extent, so that we get a larger contiguous disk extent.
3133*4882a593Smuzhiyun */
3134*4882a593Smuzhiyun if (em->start <= alloc_start &&
3135*4882a593Smuzhiyun test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3136*4882a593Smuzhiyun const u64 em_end = em->start + em->len;
3137*4882a593Smuzhiyun
3138*4882a593Smuzhiyun if (em_end >= offset + len) {
3139*4882a593Smuzhiyun /*
3140*4882a593Smuzhiyun * The whole range is already a prealloc extent,
3141*4882a593Smuzhiyun * do nothing except updating the inode's i_size if
3142*4882a593Smuzhiyun * needed.
3143*4882a593Smuzhiyun */
3144*4882a593Smuzhiyun free_extent_map(em);
3145*4882a593Smuzhiyun ret = btrfs_fallocate_update_isize(inode, offset + len,
3146*4882a593Smuzhiyun mode);
3147*4882a593Smuzhiyun goto out;
3148*4882a593Smuzhiyun }
3149*4882a593Smuzhiyun /*
3150*4882a593Smuzhiyun * Part of the range is already a prealloc extent, so operate
3151*4882a593Smuzhiyun * only on the remaining part of the range.
3152*4882a593Smuzhiyun */
3153*4882a593Smuzhiyun alloc_start = em_end;
3154*4882a593Smuzhiyun ASSERT(IS_ALIGNED(alloc_start, sectorsize));
3155*4882a593Smuzhiyun len = offset + len - alloc_start;
3156*4882a593Smuzhiyun offset = alloc_start;
3157*4882a593Smuzhiyun alloc_hint = em->block_start + em->len;
3158*4882a593Smuzhiyun }
3159*4882a593Smuzhiyun free_extent_map(em);
3160*4882a593Smuzhiyun
3161*4882a593Smuzhiyun if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
3162*4882a593Smuzhiyun BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
3163*4882a593Smuzhiyun em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3164*4882a593Smuzhiyun sectorsize);
3165*4882a593Smuzhiyun if (IS_ERR(em)) {
3166*4882a593Smuzhiyun ret = PTR_ERR(em);
3167*4882a593Smuzhiyun goto out;
3168*4882a593Smuzhiyun }
3169*4882a593Smuzhiyun
3170*4882a593Smuzhiyun if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3171*4882a593Smuzhiyun free_extent_map(em);
3172*4882a593Smuzhiyun ret = btrfs_fallocate_update_isize(inode, offset + len,
3173*4882a593Smuzhiyun mode);
3174*4882a593Smuzhiyun goto out;
3175*4882a593Smuzhiyun }
3176*4882a593Smuzhiyun if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
3177*4882a593Smuzhiyun free_extent_map(em);
3178*4882a593Smuzhiyun ret = btrfs_truncate_block(inode, offset, len, 0);
3179*4882a593Smuzhiyun if (!ret)
3180*4882a593Smuzhiyun ret = btrfs_fallocate_update_isize(inode,
3181*4882a593Smuzhiyun offset + len,
3182*4882a593Smuzhiyun mode);
3183*4882a593Smuzhiyun return ret;
3184*4882a593Smuzhiyun }
3185*4882a593Smuzhiyun free_extent_map(em);
3186*4882a593Smuzhiyun alloc_start = round_down(offset, sectorsize);
3187*4882a593Smuzhiyun alloc_end = alloc_start + sectorsize;
3188*4882a593Smuzhiyun goto reserve_space;
3189*4882a593Smuzhiyun }
3190*4882a593Smuzhiyun
3191*4882a593Smuzhiyun alloc_start = round_up(offset, sectorsize);
3192*4882a593Smuzhiyun alloc_end = round_down(offset + len, sectorsize);
3193*4882a593Smuzhiyun
3194*4882a593Smuzhiyun /*
3195*4882a593Smuzhiyun * For unaligned ranges, check the pages at the boundaries, they might
3196*4882a593Smuzhiyun * map to an extent, in which case we need to partially zero them, or
3197*4882a593Smuzhiyun * they might map to a hole, in which case we need our allocation range
3198*4882a593Smuzhiyun * to cover them.
3199*4882a593Smuzhiyun */
3200*4882a593Smuzhiyun if (!IS_ALIGNED(offset, sectorsize)) {
3201*4882a593Smuzhiyun ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3202*4882a593Smuzhiyun offset);
3203*4882a593Smuzhiyun if (ret < 0)
3204*4882a593Smuzhiyun goto out;
3205*4882a593Smuzhiyun if (ret == RANGE_BOUNDARY_HOLE) {
3206*4882a593Smuzhiyun alloc_start = round_down(offset, sectorsize);
3207*4882a593Smuzhiyun ret = 0;
3208*4882a593Smuzhiyun } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3209*4882a593Smuzhiyun ret = btrfs_truncate_block(inode, offset, 0, 0);
3210*4882a593Smuzhiyun if (ret)
3211*4882a593Smuzhiyun goto out;
3212*4882a593Smuzhiyun } else {
3213*4882a593Smuzhiyun ret = 0;
3214*4882a593Smuzhiyun }
3215*4882a593Smuzhiyun }
3216*4882a593Smuzhiyun
3217*4882a593Smuzhiyun if (!IS_ALIGNED(offset + len, sectorsize)) {
3218*4882a593Smuzhiyun ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3219*4882a593Smuzhiyun offset + len);
3220*4882a593Smuzhiyun if (ret < 0)
3221*4882a593Smuzhiyun goto out;
3222*4882a593Smuzhiyun if (ret == RANGE_BOUNDARY_HOLE) {
3223*4882a593Smuzhiyun alloc_end = round_up(offset + len, sectorsize);
3224*4882a593Smuzhiyun ret = 0;
3225*4882a593Smuzhiyun } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3226*4882a593Smuzhiyun ret = btrfs_truncate_block(inode, offset + len, 0, 1);
3227*4882a593Smuzhiyun if (ret)
3228*4882a593Smuzhiyun goto out;
3229*4882a593Smuzhiyun } else {
3230*4882a593Smuzhiyun ret = 0;
3231*4882a593Smuzhiyun }
3232*4882a593Smuzhiyun }
3233*4882a593Smuzhiyun
3234*4882a593Smuzhiyun reserve_space:
3235*4882a593Smuzhiyun if (alloc_start < alloc_end) {
3236*4882a593Smuzhiyun struct extent_state *cached_state = NULL;
3237*4882a593Smuzhiyun const u64 lockstart = alloc_start;
3238*4882a593Smuzhiyun const u64 lockend = alloc_end - 1;
3239*4882a593Smuzhiyun
3240*4882a593Smuzhiyun bytes_to_reserve = alloc_end - alloc_start;
3241*4882a593Smuzhiyun ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3242*4882a593Smuzhiyun bytes_to_reserve);
3243*4882a593Smuzhiyun if (ret < 0)
3244*4882a593Smuzhiyun goto out;
3245*4882a593Smuzhiyun space_reserved = true;
3246*4882a593Smuzhiyun ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
3247*4882a593Smuzhiyun &cached_state);
3248*4882a593Smuzhiyun if (ret)
3249*4882a593Smuzhiyun goto out;
3250*4882a593Smuzhiyun ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
3251*4882a593Smuzhiyun alloc_start, bytes_to_reserve);
3252*4882a593Smuzhiyun if (ret) {
3253*4882a593Smuzhiyun unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3254*4882a593Smuzhiyun lockend, &cached_state);
3255*4882a593Smuzhiyun goto out;
3256*4882a593Smuzhiyun }
3257*4882a593Smuzhiyun ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3258*4882a593Smuzhiyun alloc_end - alloc_start,
3259*4882a593Smuzhiyun i_blocksize(inode),
3260*4882a593Smuzhiyun offset + len, &alloc_hint);
3261*4882a593Smuzhiyun unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3262*4882a593Smuzhiyun lockend, &cached_state);
3263*4882a593Smuzhiyun /* btrfs_prealloc_file_range releases reserved space on error */
3264*4882a593Smuzhiyun if (ret) {
3265*4882a593Smuzhiyun space_reserved = false;
3266*4882a593Smuzhiyun goto out;
3267*4882a593Smuzhiyun }
3268*4882a593Smuzhiyun }
3269*4882a593Smuzhiyun ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3270*4882a593Smuzhiyun out:
3271*4882a593Smuzhiyun if (ret && space_reserved)
3272*4882a593Smuzhiyun btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3273*4882a593Smuzhiyun alloc_start, bytes_to_reserve);
3274*4882a593Smuzhiyun extent_changeset_free(data_reserved);
3275*4882a593Smuzhiyun
3276*4882a593Smuzhiyun return ret;
3277*4882a593Smuzhiyun }
3278*4882a593Smuzhiyun
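/*
 * fallocate(2) entry point. Hole punching and zero-range are handled by the
 * helpers above; the default mode reserves data space up front, waits for
 * ordered IO, locks the range, collects into reserve_list the sub-ranges that
 * are still holes (or lie beyond i_size and are not already preallocated),
 * preallocates each of them, and finally updates i_size unless
 * FALLOC_FL_KEEP_SIZE was requested.
 */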
3279*4882a593Smuzhiyun static long btrfs_fallocate(struct file *file, int mode,
3280*4882a593Smuzhiyun loff_t offset, loff_t len)
3281*4882a593Smuzhiyun {
3282*4882a593Smuzhiyun struct inode *inode = file_inode(file);
3283*4882a593Smuzhiyun struct extent_state *cached_state = NULL;
3284*4882a593Smuzhiyun struct extent_changeset *data_reserved = NULL;
3285*4882a593Smuzhiyun struct falloc_range *range;
3286*4882a593Smuzhiyun struct falloc_range *tmp;
3287*4882a593Smuzhiyun struct list_head reserve_list;
3288*4882a593Smuzhiyun u64 cur_offset;
3289*4882a593Smuzhiyun u64 last_byte;
3290*4882a593Smuzhiyun u64 alloc_start;
3291*4882a593Smuzhiyun u64 alloc_end;
3292*4882a593Smuzhiyun u64 alloc_hint = 0;
3293*4882a593Smuzhiyun u64 locked_end;
3294*4882a593Smuzhiyun u64 actual_end = 0;
3295*4882a593Smuzhiyun struct extent_map *em;
3296*4882a593Smuzhiyun int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
3297*4882a593Smuzhiyun int ret;
3298*4882a593Smuzhiyun
3299*4882a593Smuzhiyun alloc_start = round_down(offset, blocksize);
3300*4882a593Smuzhiyun alloc_end = round_up(offset + len, blocksize);
3301*4882a593Smuzhiyun cur_offset = alloc_start;
3302*4882a593Smuzhiyun
3303*4882a593Smuzhiyun /* Make sure we aren't being given some crap mode */
3304*4882a593Smuzhiyun if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3305*4882a593Smuzhiyun FALLOC_FL_ZERO_RANGE))
3306*4882a593Smuzhiyun return -EOPNOTSUPP;
3307*4882a593Smuzhiyun
3308*4882a593Smuzhiyun if (mode & FALLOC_FL_PUNCH_HOLE)
3309*4882a593Smuzhiyun return btrfs_punch_hole(file, offset, len);
3310*4882a593Smuzhiyun
3311*4882a593Smuzhiyun /*
3312*4882a593Smuzhiyun * Only trigger disk allocation, don't trigger qgroup reserve
3313*4882a593Smuzhiyun *
3314*4882a593Smuzhiyun * For qgroup space, it will be checked later.
3315*4882a593Smuzhiyun */
3316*4882a593Smuzhiyun if (!(mode & FALLOC_FL_ZERO_RANGE)) {
3317*4882a593Smuzhiyun ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3318*4882a593Smuzhiyun alloc_end - alloc_start);
3319*4882a593Smuzhiyun if (ret < 0)
3320*4882a593Smuzhiyun return ret;
3321*4882a593Smuzhiyun }
3322*4882a593Smuzhiyun
3323*4882a593Smuzhiyun inode_lock(inode);
3324*4882a593Smuzhiyun
3325*4882a593Smuzhiyun if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3326*4882a593Smuzhiyun ret = inode_newsize_ok(inode, offset + len);
3327*4882a593Smuzhiyun if (ret)
3328*4882a593Smuzhiyun goto out;
3329*4882a593Smuzhiyun }
3330*4882a593Smuzhiyun
3331*4882a593Smuzhiyun ret = file_modified(file);
3332*4882a593Smuzhiyun if (ret)
3333*4882a593Smuzhiyun goto out;
3334*4882a593Smuzhiyun
3335*4882a593Smuzhiyun /*
3336*4882a593Smuzhiyun * TODO: Move these two operations after we have checked
3337*4882a593Smuzhiyun * accurate reserved space, or fallocate can still fail but
3338*4882a593Smuzhiyun * with page truncated or size expanded.
3339*4882a593Smuzhiyun *
3340*4882a593Smuzhiyun * But that's a minor problem and won't do much harm BTW.
3341*4882a593Smuzhiyun */
3342*4882a593Smuzhiyun if (alloc_start > inode->i_size) {
3343*4882a593Smuzhiyun ret = btrfs_cont_expand(inode, i_size_read(inode),
3344*4882a593Smuzhiyun alloc_start);
3345*4882a593Smuzhiyun if (ret)
3346*4882a593Smuzhiyun goto out;
3347*4882a593Smuzhiyun } else if (offset + len > inode->i_size) {
3348*4882a593Smuzhiyun /*
3349*4882a593Smuzhiyun * If we are fallocating from the end of the file onward we
3350*4882a593Smuzhiyun * need to zero out the end of the block if i_size lands in the
3351*4882a593Smuzhiyun * middle of a block.
3352*4882a593Smuzhiyun */
3353*4882a593Smuzhiyun ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
3354*4882a593Smuzhiyun if (ret)
3355*4882a593Smuzhiyun goto out;
3356*4882a593Smuzhiyun }
3357*4882a593Smuzhiyun
3358*4882a593Smuzhiyun /*
3359*4882a593Smuzhiyun * wait for ordered IO before we have any locks. We'll loop again
3360*4882a593Smuzhiyun * below with the locks held.
3361*4882a593Smuzhiyun */
3362*4882a593Smuzhiyun ret = btrfs_wait_ordered_range(inode, alloc_start,
3363*4882a593Smuzhiyun alloc_end - alloc_start);
3364*4882a593Smuzhiyun if (ret)
3365*4882a593Smuzhiyun goto out;
3366*4882a593Smuzhiyun
3367*4882a593Smuzhiyun if (mode & FALLOC_FL_ZERO_RANGE) {
3368*4882a593Smuzhiyun ret = btrfs_zero_range(inode, offset, len, mode);
3369*4882a593Smuzhiyun inode_unlock(inode);
3370*4882a593Smuzhiyun return ret;
3371*4882a593Smuzhiyun }
3372*4882a593Smuzhiyun
3373*4882a593Smuzhiyun locked_end = alloc_end - 1;
3374*4882a593Smuzhiyun while (1) {
3375*4882a593Smuzhiyun struct btrfs_ordered_extent *ordered;
3376*4882a593Smuzhiyun
3377*4882a593Smuzhiyun /* the extent lock is ordered inside the running
3378*4882a593Smuzhiyun * transaction
3379*4882a593Smuzhiyun */
3380*4882a593Smuzhiyun lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
3381*4882a593Smuzhiyun locked_end, &cached_state);
3382*4882a593Smuzhiyun ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
3383*4882a593Smuzhiyun locked_end);
3384*4882a593Smuzhiyun
3385*4882a593Smuzhiyun if (ordered &&
3386*4882a593Smuzhiyun ordered->file_offset + ordered->num_bytes > alloc_start &&
3387*4882a593Smuzhiyun ordered->file_offset < alloc_end) {
3388*4882a593Smuzhiyun btrfs_put_ordered_extent(ordered);
3389*4882a593Smuzhiyun unlock_extent_cached(&BTRFS_I(inode)->io_tree,
3390*4882a593Smuzhiyun alloc_start, locked_end,
3391*4882a593Smuzhiyun &cached_state);
3392*4882a593Smuzhiyun /*
3393*4882a593Smuzhiyun * we can't wait on the range with the transaction
3394*4882a593Smuzhiyun * running or with the extent lock held
3395*4882a593Smuzhiyun */
3396*4882a593Smuzhiyun ret = btrfs_wait_ordered_range(inode, alloc_start,
3397*4882a593Smuzhiyun alloc_end - alloc_start);
3398*4882a593Smuzhiyun if (ret)
3399*4882a593Smuzhiyun goto out;
3400*4882a593Smuzhiyun } else {
3401*4882a593Smuzhiyun if (ordered)
3402*4882a593Smuzhiyun btrfs_put_ordered_extent(ordered);
3403*4882a593Smuzhiyun break;
3404*4882a593Smuzhiyun }
3405*4882a593Smuzhiyun }
3406*4882a593Smuzhiyun
3407*4882a593Smuzhiyun /* First, check if we exceed the qgroup limit */
3408*4882a593Smuzhiyun INIT_LIST_HEAD(&reserve_list);
3409*4882a593Smuzhiyun while (cur_offset < alloc_end) {
3410*4882a593Smuzhiyun em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
3411*4882a593Smuzhiyun alloc_end - cur_offset);
3412*4882a593Smuzhiyun if (IS_ERR(em)) {
3413*4882a593Smuzhiyun ret = PTR_ERR(em);
3414*4882a593Smuzhiyun break;
3415*4882a593Smuzhiyun }
3416*4882a593Smuzhiyun last_byte = min(extent_map_end(em), alloc_end);
3417*4882a593Smuzhiyun actual_end = min_t(u64, extent_map_end(em), offset + len);
3418*4882a593Smuzhiyun last_byte = ALIGN(last_byte, blocksize);
3419*4882a593Smuzhiyun if (em->block_start == EXTENT_MAP_HOLE ||
3420*4882a593Smuzhiyun (cur_offset >= inode->i_size &&
3421*4882a593Smuzhiyun !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
3422*4882a593Smuzhiyun ret = add_falloc_range(&reserve_list, cur_offset,
3423*4882a593Smuzhiyun last_byte - cur_offset);
3424*4882a593Smuzhiyun if (ret < 0) {
3425*4882a593Smuzhiyun free_extent_map(em);
3426*4882a593Smuzhiyun break;
3427*4882a593Smuzhiyun }
3428*4882a593Smuzhiyun ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3429*4882a593Smuzhiyun &data_reserved, cur_offset,
3430*4882a593Smuzhiyun last_byte - cur_offset);
3431*4882a593Smuzhiyun if (ret < 0) {
3432*4882a593Smuzhiyun cur_offset = last_byte;
3433*4882a593Smuzhiyun free_extent_map(em);
3434*4882a593Smuzhiyun break;
3435*4882a593Smuzhiyun }
3436*4882a593Smuzhiyun } else {
3437*4882a593Smuzhiyun /*
3438*4882a593Smuzhiyun  * We do not need to reserve an unwritten extent for this
3439*4882a593Smuzhiyun  * range; free the reserved data space first, otherwise
3440*4882a593Smuzhiyun  * it'll result in a false ENOSPC error.
3441*4882a593Smuzhiyun */
3442*4882a593Smuzhiyun btrfs_free_reserved_data_space(BTRFS_I(inode),
3443*4882a593Smuzhiyun data_reserved, cur_offset,
3444*4882a593Smuzhiyun last_byte - cur_offset);
3445*4882a593Smuzhiyun }
3446*4882a593Smuzhiyun free_extent_map(em);
3447*4882a593Smuzhiyun cur_offset = last_byte;
3448*4882a593Smuzhiyun }
3449*4882a593Smuzhiyun
3450*4882a593Smuzhiyun /*
3451*4882a593Smuzhiyun  * If ret is still 0, it means we're OK to fallocate.
3452*4882a593Smuzhiyun  * Otherwise just clean up the list and exit.
3453*4882a593Smuzhiyun */
3454*4882a593Smuzhiyun list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3455*4882a593Smuzhiyun if (!ret)
3456*4882a593Smuzhiyun ret = btrfs_prealloc_file_range(inode, mode,
3457*4882a593Smuzhiyun range->start,
3458*4882a593Smuzhiyun range->len, i_blocksize(inode),
3459*4882a593Smuzhiyun offset + len, &alloc_hint);
3460*4882a593Smuzhiyun else
3461*4882a593Smuzhiyun btrfs_free_reserved_data_space(BTRFS_I(inode),
3462*4882a593Smuzhiyun data_reserved, range->start,
3463*4882a593Smuzhiyun range->len);
3464*4882a593Smuzhiyun list_del(&range->list);
3465*4882a593Smuzhiyun kfree(range);
3466*4882a593Smuzhiyun }
3467*4882a593Smuzhiyun if (ret < 0)
3468*4882a593Smuzhiyun goto out_unlock;
3469*4882a593Smuzhiyun
3470*4882a593Smuzhiyun /*
3471*4882a593Smuzhiyun * We didn't need to allocate any more space, but we still extended the
3472*4882a593Smuzhiyun * size of the file so we need to update i_size and the inode item.
3473*4882a593Smuzhiyun */
3474*4882a593Smuzhiyun ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3475*4882a593Smuzhiyun out_unlock:
3476*4882a593Smuzhiyun unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3477*4882a593Smuzhiyun &cached_state);
3478*4882a593Smuzhiyun out:
3479*4882a593Smuzhiyun inode_unlock(inode);
3480*4882a593Smuzhiyun /* Let go of our reservation. */
3481*4882a593Smuzhiyun if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
3482*4882a593Smuzhiyun btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3483*4882a593Smuzhiyun cur_offset, alloc_end - cur_offset);
3484*4882a593Smuzhiyun extent_changeset_free(data_reserved);
3485*4882a593Smuzhiyun return ret;
3486*4882a593Smuzhiyun }
3487*4882a593Smuzhiyun
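/*
 * Walk the extent maps starting at @offset to implement SEEK_DATA and
 * SEEK_HOLE. Prealloc (unwritten) extents are treated as holes, and a
 * SEEK_DATA search that reaches i_size without finding data returns -ENXIO.
 */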
3488*4882a593Smuzhiyun static loff_t find_desired_extent(struct inode *inode, loff_t offset,
3489*4882a593Smuzhiyun int whence)
3490*4882a593Smuzhiyun {
3491*4882a593Smuzhiyun struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3492*4882a593Smuzhiyun struct extent_map *em = NULL;
3493*4882a593Smuzhiyun struct extent_state *cached_state = NULL;
3494*4882a593Smuzhiyun loff_t i_size = inode->i_size;
3495*4882a593Smuzhiyun u64 lockstart;
3496*4882a593Smuzhiyun u64 lockend;
3497*4882a593Smuzhiyun u64 start;
3498*4882a593Smuzhiyun u64 len;
3499*4882a593Smuzhiyun int ret = 0;
3500*4882a593Smuzhiyun
3501*4882a593Smuzhiyun if (i_size == 0 || offset >= i_size)
3502*4882a593Smuzhiyun return -ENXIO;
3503*4882a593Smuzhiyun
3504*4882a593Smuzhiyun /*
3505*4882a593Smuzhiyun * offset can be negative; in that case we start looking for DATA/HOLE
3506*4882a593Smuzhiyun * from the very start of the file.
3507*4882a593Smuzhiyun */
3508*4882a593Smuzhiyun start = max_t(loff_t, 0, offset);
3509*4882a593Smuzhiyun
3510*4882a593Smuzhiyun lockstart = round_down(start, fs_info->sectorsize);
3511*4882a593Smuzhiyun lockend = round_up(i_size, fs_info->sectorsize);
3512*4882a593Smuzhiyun if (lockend <= lockstart)
3513*4882a593Smuzhiyun lockend = lockstart + fs_info->sectorsize;
3514*4882a593Smuzhiyun lockend--;
3515*4882a593Smuzhiyun len = lockend - lockstart + 1;
3516*4882a593Smuzhiyun
3517*4882a593Smuzhiyun lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3518*4882a593Smuzhiyun &cached_state);
3519*4882a593Smuzhiyun
3520*4882a593Smuzhiyun while (start < i_size) {
3521*4882a593Smuzhiyun em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
3522*4882a593Smuzhiyun if (IS_ERR(em)) {
3523*4882a593Smuzhiyun ret = PTR_ERR(em);
3524*4882a593Smuzhiyun em = NULL;
3525*4882a593Smuzhiyun break;
3526*4882a593Smuzhiyun }
3527*4882a593Smuzhiyun
3528*4882a593Smuzhiyun if (whence == SEEK_HOLE &&
3529*4882a593Smuzhiyun (em->block_start == EXTENT_MAP_HOLE ||
3530*4882a593Smuzhiyun test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3531*4882a593Smuzhiyun break;
3532*4882a593Smuzhiyun else if (whence == SEEK_DATA &&
3533*4882a593Smuzhiyun (em->block_start != EXTENT_MAP_HOLE &&
3534*4882a593Smuzhiyun !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3535*4882a593Smuzhiyun break;
3536*4882a593Smuzhiyun
3537*4882a593Smuzhiyun start = em->start + em->len;
3538*4882a593Smuzhiyun free_extent_map(em);
3539*4882a593Smuzhiyun em = NULL;
3540*4882a593Smuzhiyun cond_resched();
3541*4882a593Smuzhiyun }
3542*4882a593Smuzhiyun free_extent_map(em);
3543*4882a593Smuzhiyun unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3544*4882a593Smuzhiyun &cached_state);
3545*4882a593Smuzhiyun if (ret) {
3546*4882a593Smuzhiyun offset = ret;
3547*4882a593Smuzhiyun } else {
3548*4882a593Smuzhiyun if (whence == SEEK_DATA && start >= i_size)
3549*4882a593Smuzhiyun offset = -ENXIO;
3550*4882a593Smuzhiyun else
3551*4882a593Smuzhiyun offset = min_t(loff_t, start, i_size);
3552*4882a593Smuzhiyun }
3553*4882a593Smuzhiyun
3554*4882a593Smuzhiyun return offset;
3555*4882a593Smuzhiyun }
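
/*
 * Hedged userspace sketch (illustration only): the DATA/HOLE search above
 * is what backs lseek(2) with SEEK_DATA and SEEK_HOLE.  The descriptor is
 * assumed to be an open btrfs file; the offsets are made up.
 *
 *	#include <unistd.h>
 *
 *	off_t data = lseek(fd, 0, SEEK_DATA);     // first data byte at/after 0
 *	off_t hole = lseek(fd, data, SEEK_HOLE);  // next hole (or EOF) after it
 *
 * Both return -1 with errno set to ENXIO when the offset is at or beyond
 * i_size, matching the -ENXIO returns above.  Preallocated (unwritten)
 * extents are reported as holes, per the EXTENT_FLAG_PREALLOC checks.
 */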
3556*4882a593Smuzhiyun
3557*4882a593Smuzhiyun static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3558*4882a593Smuzhiyun {
3559*4882a593Smuzhiyun struct inode *inode = file->f_mapping->host;
3560*4882a593Smuzhiyun
3561*4882a593Smuzhiyun switch (whence) {
3562*4882a593Smuzhiyun default:
3563*4882a593Smuzhiyun return generic_file_llseek(file, offset, whence);
3564*4882a593Smuzhiyun case SEEK_DATA:
3565*4882a593Smuzhiyun case SEEK_HOLE:
3566*4882a593Smuzhiyun inode_lock_shared(inode);
3567*4882a593Smuzhiyun offset = find_desired_extent(inode, offset, whence);
3568*4882a593Smuzhiyun inode_unlock_shared(inode);
3569*4882a593Smuzhiyun break;
3570*4882a593Smuzhiyun }
3571*4882a593Smuzhiyun
3572*4882a593Smuzhiyun if (offset < 0)
3573*4882a593Smuzhiyun return offset;
3574*4882a593Smuzhiyun
3575*4882a593Smuzhiyun return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3576*4882a593Smuzhiyun }
3577*4882a593Smuzhiyun
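/*
 * FMODE_NOWAIT lets callers issue non-blocking I/O (RWF_NOWAIT /
 * IOCB_NOWAIT), and FMODE_BUF_RASYNC advertises support for async
 * buffered reads (used by io_uring).
 */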
3578*4882a593Smuzhiyun static int btrfs_file_open(struct inode *inode, struct file *filp)
3579*4882a593Smuzhiyun {
3580*4882a593Smuzhiyun filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
3581*4882a593Smuzhiyun return generic_file_open(inode, filp);
3582*4882a593Smuzhiyun }
3583*4882a593Smuzhiyun
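/*
 * For O_DIRECT reads, do the direct I/O under the shared inode lock and
 * return if it failed, consumed the whole iov, or reached EOF.  Otherwise
 * fall back to a buffered read for the remainder, passing in the bytes
 * already read so the return value accumulates correctly.
 */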
3584*4882a593Smuzhiyun static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3585*4882a593Smuzhiyun {
3586*4882a593Smuzhiyun ssize_t ret = 0;
3587*4882a593Smuzhiyun
3588*4882a593Smuzhiyun if (iocb->ki_flags & IOCB_DIRECT) {
3589*4882a593Smuzhiyun struct inode *inode = file_inode(iocb->ki_filp);
3590*4882a593Smuzhiyun
3591*4882a593Smuzhiyun inode_lock_shared(inode);
3592*4882a593Smuzhiyun ret = btrfs_direct_IO(iocb, to);
3593*4882a593Smuzhiyun inode_unlock_shared(inode);
3594*4882a593Smuzhiyun if (ret < 0 || !iov_iter_count(to) ||
3595*4882a593Smuzhiyun iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3596*4882a593Smuzhiyun return ret;
3597*4882a593Smuzhiyun }
3598*4882a593Smuzhiyun
3599*4882a593Smuzhiyun return generic_file_buffered_read(iocb, to, ret);
3600*4882a593Smuzhiyun }
3601*4882a593Smuzhiyun
3602*4882a593Smuzhiyun const struct file_operations btrfs_file_operations = {
3603*4882a593Smuzhiyun .llseek = btrfs_file_llseek,
3604*4882a593Smuzhiyun .read_iter = btrfs_file_read_iter,
3605*4882a593Smuzhiyun .splice_read = generic_file_splice_read,
3606*4882a593Smuzhiyun .write_iter = btrfs_file_write_iter,
3607*4882a593Smuzhiyun .splice_write = iter_file_splice_write,
3608*4882a593Smuzhiyun .mmap = btrfs_file_mmap,
3609*4882a593Smuzhiyun .open = btrfs_file_open,
3610*4882a593Smuzhiyun .release = btrfs_release_file,
3611*4882a593Smuzhiyun .fsync = btrfs_sync_file,
3612*4882a593Smuzhiyun .fallocate = btrfs_fallocate,
3613*4882a593Smuzhiyun .unlocked_ioctl = btrfs_ioctl,
3614*4882a593Smuzhiyun #ifdef CONFIG_COMPAT
3615*4882a593Smuzhiyun .compat_ioctl = btrfs_compat_ioctl,
3616*4882a593Smuzhiyun #endif
3617*4882a593Smuzhiyun .remap_file_range = btrfs_remap_file_range,
3618*4882a593Smuzhiyun };
3619*4882a593Smuzhiyun
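/*
 * Init/exit helpers for the slab cache used to allocate struct
 * inode_defrag records for auto defrag.
 */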
3620*4882a593Smuzhiyun void __cold btrfs_auto_defrag_exit(void)
3621*4882a593Smuzhiyun {
3622*4882a593Smuzhiyun kmem_cache_destroy(btrfs_inode_defrag_cachep);
3623*4882a593Smuzhiyun }
3624*4882a593Smuzhiyun
3625*4882a593Smuzhiyun int __init btrfs_auto_defrag_init(void)
3626*4882a593Smuzhiyun {
3627*4882a593Smuzhiyun btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
3628*4882a593Smuzhiyun sizeof(struct inode_defrag), 0,
3629*4882a593Smuzhiyun SLAB_MEM_SPREAD,
3630*4882a593Smuzhiyun NULL);
3631*4882a593Smuzhiyun if (!btrfs_inode_defrag_cachep)
3632*4882a593Smuzhiyun return -ENOMEM;
3633*4882a593Smuzhiyun
3634*4882a593Smuzhiyun return 0;
3635*4882a593Smuzhiyun }
3636*4882a593Smuzhiyun
3637*4882a593Smuzhiyun int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3638*4882a593Smuzhiyun {
3639*4882a593Smuzhiyun int ret;
3640*4882a593Smuzhiyun
3641*4882a593Smuzhiyun /*
3642*4882a593Smuzhiyun * With compression we will find and lock a dirty page, clear the first
3643*4882a593Smuzhiyun * one as dirty, set up an async extent, and immediately return with the
3644*4882a593Smuzhiyun * entire range locked but with nobody actually marked with writeback.
3645*4882a593Smuzhiyun * So we can't just filemap_write_and_wait_range() and expect it to work,
3646*4882a593Smuzhiyun * since it will just kick off a thread to do the actual work.  Instead
3647*4882a593Smuzhiyun * we need to call filemap_fdatawrite_range() _again_, since it will wait
3648*4882a593Smuzhiyun * on the page lock, which won't be unlocked until after the pages have
3649*4882a593Smuzhiyun * been marked as writeback, and so we're good to go from there.  We have
3650*4882a593Smuzhiyun * to do this, otherwise we'll miss the ordered extents and that results
3651*4882a593Smuzhiyun * in badness.  Please Josef, do not think you know better and pull this
3652*4882a593Smuzhiyun * out at some point in the future; it is right and you are wrong.
3653*4882a593Smuzhiyun */
3655*4882a593Smuzhiyun ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3656*4882a593Smuzhiyun if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3657*4882a593Smuzhiyun &BTRFS_I(inode)->runtime_flags))
3658*4882a593Smuzhiyun ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3659*4882a593Smuzhiyun
3660*4882a593Smuzhiyun return ret;
3661*4882a593Smuzhiyun }
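
/*
 * Hedged usage sketch (assumption, not taken from this file): callers that
 * need the data durable typically pair the write-out above with a wait on
 * the ordered extents, roughly:
 *
 *	ret = btrfs_fdatawrite_range(inode, start, end);
 *	if (!ret)
 *		ret = btrfs_wait_ordered_range(inode, start, end - start + 1);
 *
 * The second filemap_fdatawrite_range() call above is what guarantees the
 * async compressed extents have actually reached writeback before such a
 * wait begins.
 */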
3662