1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun * Copyright (c) 2000-2005 Silicon Graphics, Inc.
4*4882a593Smuzhiyun * Copyright (c) 2016-2018 Christoph Hellwig.
5*4882a593Smuzhiyun * All Rights Reserved.
6*4882a593Smuzhiyun */
7*4882a593Smuzhiyun #include "xfs.h"
8*4882a593Smuzhiyun #include "xfs_shared.h"
9*4882a593Smuzhiyun #include "xfs_format.h"
10*4882a593Smuzhiyun #include "xfs_log_format.h"
11*4882a593Smuzhiyun #include "xfs_trans_resv.h"
12*4882a593Smuzhiyun #include "xfs_mount.h"
13*4882a593Smuzhiyun #include "xfs_inode.h"
14*4882a593Smuzhiyun #include "xfs_trans.h"
15*4882a593Smuzhiyun #include "xfs_iomap.h"
16*4882a593Smuzhiyun #include "xfs_trace.h"
17*4882a593Smuzhiyun #include "xfs_bmap.h"
18*4882a593Smuzhiyun #include "xfs_bmap_util.h"
19*4882a593Smuzhiyun #include "xfs_reflink.h"
20*4882a593Smuzhiyun
21*4882a593Smuzhiyun struct xfs_writepage_ctx {
22*4882a593Smuzhiyun struct iomap_writepage_ctx ctx;
23*4882a593Smuzhiyun unsigned int data_seq;
24*4882a593Smuzhiyun unsigned int cow_seq;
25*4882a593Smuzhiyun };
26*4882a593Smuzhiyun
27*4882a593Smuzhiyun static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx * ctx)28*4882a593Smuzhiyun XFS_WPC(struct iomap_writepage_ctx *ctx)
29*4882a593Smuzhiyun {
30*4882a593Smuzhiyun return container_of(ctx, struct xfs_writepage_ctx, ctx);
31*4882a593Smuzhiyun }
32*4882a593Smuzhiyun
33*4882a593Smuzhiyun /*
34*4882a593Smuzhiyun * Fast and loose check if this write could update the on-disk inode size.
35*4882a593Smuzhiyun */
xfs_ioend_is_append(struct iomap_ioend * ioend)36*4882a593Smuzhiyun static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
37*4882a593Smuzhiyun {
38*4882a593Smuzhiyun return ioend->io_offset + ioend->io_size >
39*4882a593Smuzhiyun XFS_I(ioend->io_inode)->i_d.di_size;
40*4882a593Smuzhiyun }
41*4882a593Smuzhiyun
42*4882a593Smuzhiyun STATIC int
xfs_setfilesize_trans_alloc(struct iomap_ioend * ioend)43*4882a593Smuzhiyun xfs_setfilesize_trans_alloc(
44*4882a593Smuzhiyun struct iomap_ioend *ioend)
45*4882a593Smuzhiyun {
46*4882a593Smuzhiyun struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount;
47*4882a593Smuzhiyun struct xfs_trans *tp;
48*4882a593Smuzhiyun int error;
49*4882a593Smuzhiyun
50*4882a593Smuzhiyun error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
51*4882a593Smuzhiyun if (error)
52*4882a593Smuzhiyun return error;
53*4882a593Smuzhiyun
54*4882a593Smuzhiyun ioend->io_private = tp;
55*4882a593Smuzhiyun
56*4882a593Smuzhiyun /*
57*4882a593Smuzhiyun * We may pass freeze protection with a transaction. So tell lockdep
58*4882a593Smuzhiyun * we released it.
59*4882a593Smuzhiyun */
60*4882a593Smuzhiyun __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
61*4882a593Smuzhiyun /*
62*4882a593Smuzhiyun * We hand off the transaction to the completion thread now, so
63*4882a593Smuzhiyun * clear the flag here.
64*4882a593Smuzhiyun */
65*4882a593Smuzhiyun xfs_trans_clear_context(tp);
66*4882a593Smuzhiyun return 0;
67*4882a593Smuzhiyun }
68*4882a593Smuzhiyun
69*4882a593Smuzhiyun /*
70*4882a593Smuzhiyun * Update on-disk file size now that data has been written to disk.
71*4882a593Smuzhiyun */
72*4882a593Smuzhiyun STATIC int
__xfs_setfilesize(struct xfs_inode * ip,struct xfs_trans * tp,xfs_off_t offset,size_t size)73*4882a593Smuzhiyun __xfs_setfilesize(
74*4882a593Smuzhiyun struct xfs_inode *ip,
75*4882a593Smuzhiyun struct xfs_trans *tp,
76*4882a593Smuzhiyun xfs_off_t offset,
77*4882a593Smuzhiyun size_t size)
78*4882a593Smuzhiyun {
79*4882a593Smuzhiyun xfs_fsize_t isize;
80*4882a593Smuzhiyun
81*4882a593Smuzhiyun xfs_ilock(ip, XFS_ILOCK_EXCL);
82*4882a593Smuzhiyun isize = xfs_new_eof(ip, offset + size);
83*4882a593Smuzhiyun if (!isize) {
84*4882a593Smuzhiyun xfs_iunlock(ip, XFS_ILOCK_EXCL);
85*4882a593Smuzhiyun xfs_trans_cancel(tp);
86*4882a593Smuzhiyun return 0;
87*4882a593Smuzhiyun }
88*4882a593Smuzhiyun
89*4882a593Smuzhiyun trace_xfs_setfilesize(ip, offset, size);
90*4882a593Smuzhiyun
91*4882a593Smuzhiyun ip->i_d.di_size = isize;
92*4882a593Smuzhiyun xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
93*4882a593Smuzhiyun xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
94*4882a593Smuzhiyun
95*4882a593Smuzhiyun return xfs_trans_commit(tp);
96*4882a593Smuzhiyun }
97*4882a593Smuzhiyun
98*4882a593Smuzhiyun int
xfs_setfilesize(struct xfs_inode * ip,xfs_off_t offset,size_t size)99*4882a593Smuzhiyun xfs_setfilesize(
100*4882a593Smuzhiyun struct xfs_inode *ip,
101*4882a593Smuzhiyun xfs_off_t offset,
102*4882a593Smuzhiyun size_t size)
103*4882a593Smuzhiyun {
104*4882a593Smuzhiyun struct xfs_mount *mp = ip->i_mount;
105*4882a593Smuzhiyun struct xfs_trans *tp;
106*4882a593Smuzhiyun int error;
107*4882a593Smuzhiyun
108*4882a593Smuzhiyun error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
109*4882a593Smuzhiyun if (error)
110*4882a593Smuzhiyun return error;
111*4882a593Smuzhiyun
112*4882a593Smuzhiyun return __xfs_setfilesize(ip, tp, offset, size);
113*4882a593Smuzhiyun }
114*4882a593Smuzhiyun
115*4882a593Smuzhiyun STATIC int
xfs_setfilesize_ioend(struct iomap_ioend * ioend,int error)116*4882a593Smuzhiyun xfs_setfilesize_ioend(
117*4882a593Smuzhiyun struct iomap_ioend *ioend,
118*4882a593Smuzhiyun int error)
119*4882a593Smuzhiyun {
120*4882a593Smuzhiyun struct xfs_inode *ip = XFS_I(ioend->io_inode);
121*4882a593Smuzhiyun struct xfs_trans *tp = ioend->io_private;
122*4882a593Smuzhiyun
123*4882a593Smuzhiyun /*
124*4882a593Smuzhiyun * The transaction may have been allocated in the I/O submission thread,
125*4882a593Smuzhiyun * thus we need to mark ourselves as being in a transaction manually.
126*4882a593Smuzhiyun * Similarly for freeze protection.
127*4882a593Smuzhiyun */
128*4882a593Smuzhiyun xfs_trans_set_context(tp);
129*4882a593Smuzhiyun __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
130*4882a593Smuzhiyun
131*4882a593Smuzhiyun /* we abort the update if there was an IO error */
132*4882a593Smuzhiyun if (error) {
133*4882a593Smuzhiyun xfs_trans_cancel(tp);
134*4882a593Smuzhiyun return error;
135*4882a593Smuzhiyun }
136*4882a593Smuzhiyun
137*4882a593Smuzhiyun return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
138*4882a593Smuzhiyun }
139*4882a593Smuzhiyun
140*4882a593Smuzhiyun /*
141*4882a593Smuzhiyun * IO write completion.
142*4882a593Smuzhiyun */
143*4882a593Smuzhiyun STATIC void
xfs_end_ioend(struct iomap_ioend * ioend)144*4882a593Smuzhiyun xfs_end_ioend(
145*4882a593Smuzhiyun struct iomap_ioend *ioend)
146*4882a593Smuzhiyun {
147*4882a593Smuzhiyun struct xfs_inode *ip = XFS_I(ioend->io_inode);
148*4882a593Smuzhiyun struct xfs_mount *mp = ip->i_mount;
149*4882a593Smuzhiyun xfs_off_t offset = ioend->io_offset;
150*4882a593Smuzhiyun size_t size = ioend->io_size;
151*4882a593Smuzhiyun unsigned int nofs_flag;
152*4882a593Smuzhiyun int error;
153*4882a593Smuzhiyun
154*4882a593Smuzhiyun /*
155*4882a593Smuzhiyun * We can allocate memory here while doing writeback on behalf of
156*4882a593Smuzhiyun * memory reclaim. To avoid memory allocation deadlocks set the
157*4882a593Smuzhiyun * task-wide nofs context for the following operations.
158*4882a593Smuzhiyun */
159*4882a593Smuzhiyun nofs_flag = memalloc_nofs_save();
160*4882a593Smuzhiyun
161*4882a593Smuzhiyun /*
162*4882a593Smuzhiyun * Just clean up the in-memory strutures if the fs has been shut down.
163*4882a593Smuzhiyun */
164*4882a593Smuzhiyun if (XFS_FORCED_SHUTDOWN(mp)) {
165*4882a593Smuzhiyun error = -EIO;
166*4882a593Smuzhiyun goto done;
167*4882a593Smuzhiyun }
168*4882a593Smuzhiyun
169*4882a593Smuzhiyun /*
170*4882a593Smuzhiyun * Clean up all COW blocks and underlying data fork delalloc blocks on
171*4882a593Smuzhiyun * I/O error. The delalloc punch is required because this ioend was
172*4882a593Smuzhiyun * mapped to blocks in the COW fork and the associated pages are no
173*4882a593Smuzhiyun * longer dirty. If we don't remove delalloc blocks here, they become
174*4882a593Smuzhiyun * stale and can corrupt free space accounting on unmount.
175*4882a593Smuzhiyun */
176*4882a593Smuzhiyun error = blk_status_to_errno(ioend->io_bio->bi_status);
177*4882a593Smuzhiyun if (unlikely(error)) {
178*4882a593Smuzhiyun if (ioend->io_flags & IOMAP_F_SHARED) {
179*4882a593Smuzhiyun xfs_reflink_cancel_cow_range(ip, offset, size, true);
180*4882a593Smuzhiyun xfs_bmap_punch_delalloc_range(ip,
181*4882a593Smuzhiyun XFS_B_TO_FSBT(mp, offset),
182*4882a593Smuzhiyun XFS_B_TO_FSB(mp, size));
183*4882a593Smuzhiyun }
184*4882a593Smuzhiyun goto done;
185*4882a593Smuzhiyun }
186*4882a593Smuzhiyun
187*4882a593Smuzhiyun /*
188*4882a593Smuzhiyun * Success: commit the COW or unwritten blocks if needed.
189*4882a593Smuzhiyun */
190*4882a593Smuzhiyun if (ioend->io_flags & IOMAP_F_SHARED)
191*4882a593Smuzhiyun error = xfs_reflink_end_cow(ip, offset, size);
192*4882a593Smuzhiyun else if (ioend->io_type == IOMAP_UNWRITTEN)
193*4882a593Smuzhiyun error = xfs_iomap_write_unwritten(ip, offset, size, false);
194*4882a593Smuzhiyun else
195*4882a593Smuzhiyun ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_private);
196*4882a593Smuzhiyun
197*4882a593Smuzhiyun done:
198*4882a593Smuzhiyun if (ioend->io_private)
199*4882a593Smuzhiyun error = xfs_setfilesize_ioend(ioend, error);
200*4882a593Smuzhiyun iomap_finish_ioends(ioend, error);
201*4882a593Smuzhiyun memalloc_nofs_restore(nofs_flag);
202*4882a593Smuzhiyun }
203*4882a593Smuzhiyun
204*4882a593Smuzhiyun /*
205*4882a593Smuzhiyun * If the to be merged ioend has a preallocated transaction for file
206*4882a593Smuzhiyun * size updates we need to ensure the ioend it is merged into also
207*4882a593Smuzhiyun * has one. If it already has one we can simply cancel the transaction
208*4882a593Smuzhiyun * as it is guaranteed to be clean.
209*4882a593Smuzhiyun */
210*4882a593Smuzhiyun static void
xfs_ioend_merge_private(struct iomap_ioend * ioend,struct iomap_ioend * next)211*4882a593Smuzhiyun xfs_ioend_merge_private(
212*4882a593Smuzhiyun struct iomap_ioend *ioend,
213*4882a593Smuzhiyun struct iomap_ioend *next)
214*4882a593Smuzhiyun {
215*4882a593Smuzhiyun if (!ioend->io_private) {
216*4882a593Smuzhiyun ioend->io_private = next->io_private;
217*4882a593Smuzhiyun next->io_private = NULL;
218*4882a593Smuzhiyun } else {
219*4882a593Smuzhiyun xfs_setfilesize_ioend(next, -ECANCELED);
220*4882a593Smuzhiyun }
221*4882a593Smuzhiyun }
222*4882a593Smuzhiyun
223*4882a593Smuzhiyun /* Finish all pending io completions. */
224*4882a593Smuzhiyun void
xfs_end_io(struct work_struct * work)225*4882a593Smuzhiyun xfs_end_io(
226*4882a593Smuzhiyun struct work_struct *work)
227*4882a593Smuzhiyun {
228*4882a593Smuzhiyun struct xfs_inode *ip =
229*4882a593Smuzhiyun container_of(work, struct xfs_inode, i_ioend_work);
230*4882a593Smuzhiyun struct iomap_ioend *ioend;
231*4882a593Smuzhiyun struct list_head tmp;
232*4882a593Smuzhiyun unsigned long flags;
233*4882a593Smuzhiyun
234*4882a593Smuzhiyun spin_lock_irqsave(&ip->i_ioend_lock, flags);
235*4882a593Smuzhiyun list_replace_init(&ip->i_ioend_list, &tmp);
236*4882a593Smuzhiyun spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
237*4882a593Smuzhiyun
238*4882a593Smuzhiyun iomap_sort_ioends(&tmp);
239*4882a593Smuzhiyun while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
240*4882a593Smuzhiyun io_list))) {
241*4882a593Smuzhiyun list_del_init(&ioend->io_list);
242*4882a593Smuzhiyun iomap_ioend_try_merge(ioend, &tmp, xfs_ioend_merge_private);
243*4882a593Smuzhiyun xfs_end_ioend(ioend);
244*4882a593Smuzhiyun }
245*4882a593Smuzhiyun }
246*4882a593Smuzhiyun
xfs_ioend_needs_workqueue(struct iomap_ioend * ioend)247*4882a593Smuzhiyun static inline bool xfs_ioend_needs_workqueue(struct iomap_ioend *ioend)
248*4882a593Smuzhiyun {
249*4882a593Smuzhiyun return ioend->io_private ||
250*4882a593Smuzhiyun ioend->io_type == IOMAP_UNWRITTEN ||
251*4882a593Smuzhiyun (ioend->io_flags & IOMAP_F_SHARED);
252*4882a593Smuzhiyun }
253*4882a593Smuzhiyun
254*4882a593Smuzhiyun STATIC void
xfs_end_bio(struct bio * bio)255*4882a593Smuzhiyun xfs_end_bio(
256*4882a593Smuzhiyun struct bio *bio)
257*4882a593Smuzhiyun {
258*4882a593Smuzhiyun struct iomap_ioend *ioend = bio->bi_private;
259*4882a593Smuzhiyun struct xfs_inode *ip = XFS_I(ioend->io_inode);
260*4882a593Smuzhiyun unsigned long flags;
261*4882a593Smuzhiyun
262*4882a593Smuzhiyun ASSERT(xfs_ioend_needs_workqueue(ioend));
263*4882a593Smuzhiyun
264*4882a593Smuzhiyun spin_lock_irqsave(&ip->i_ioend_lock, flags);
265*4882a593Smuzhiyun if (list_empty(&ip->i_ioend_list))
266*4882a593Smuzhiyun WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
267*4882a593Smuzhiyun &ip->i_ioend_work));
268*4882a593Smuzhiyun list_add_tail(&ioend->io_list, &ip->i_ioend_list);
269*4882a593Smuzhiyun spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
270*4882a593Smuzhiyun }
271*4882a593Smuzhiyun
272*4882a593Smuzhiyun /*
273*4882a593Smuzhiyun * Fast revalidation of the cached writeback mapping. Return true if the current
274*4882a593Smuzhiyun * mapping is valid, false otherwise.
275*4882a593Smuzhiyun */
276*4882a593Smuzhiyun static bool
xfs_imap_valid(struct iomap_writepage_ctx * wpc,struct xfs_inode * ip,loff_t offset)277*4882a593Smuzhiyun xfs_imap_valid(
278*4882a593Smuzhiyun struct iomap_writepage_ctx *wpc,
279*4882a593Smuzhiyun struct xfs_inode *ip,
280*4882a593Smuzhiyun loff_t offset)
281*4882a593Smuzhiyun {
282*4882a593Smuzhiyun if (offset < wpc->iomap.offset ||
283*4882a593Smuzhiyun offset >= wpc->iomap.offset + wpc->iomap.length)
284*4882a593Smuzhiyun return false;
285*4882a593Smuzhiyun /*
286*4882a593Smuzhiyun * If this is a COW mapping, it is sufficient to check that the mapping
287*4882a593Smuzhiyun * covers the offset. Be careful to check this first because the caller
288*4882a593Smuzhiyun * can revalidate a COW mapping without updating the data seqno.
289*4882a593Smuzhiyun */
290*4882a593Smuzhiyun if (wpc->iomap.flags & IOMAP_F_SHARED)
291*4882a593Smuzhiyun return true;
292*4882a593Smuzhiyun
293*4882a593Smuzhiyun /*
294*4882a593Smuzhiyun * This is not a COW mapping. Check the sequence number of the data fork
295*4882a593Smuzhiyun * because concurrent changes could have invalidated the extent. Check
296*4882a593Smuzhiyun * the COW fork because concurrent changes since the last time we
297*4882a593Smuzhiyun * checked (and found nothing at this offset) could have added
298*4882a593Smuzhiyun * overlapping blocks.
299*4882a593Smuzhiyun */
300*4882a593Smuzhiyun if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
301*4882a593Smuzhiyun return false;
302*4882a593Smuzhiyun if (xfs_inode_has_cow_data(ip) &&
303*4882a593Smuzhiyun XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
304*4882a593Smuzhiyun return false;
305*4882a593Smuzhiyun return true;
306*4882a593Smuzhiyun }
307*4882a593Smuzhiyun
308*4882a593Smuzhiyun /*
309*4882a593Smuzhiyun * Pass in a dellalloc extent and convert it to real extents, return the real
310*4882a593Smuzhiyun * extent that maps offset_fsb in wpc->iomap.
311*4882a593Smuzhiyun *
312*4882a593Smuzhiyun * The current page is held locked so nothing could have removed the block
313*4882a593Smuzhiyun * backing offset_fsb, although it could have moved from the COW to the data
314*4882a593Smuzhiyun * fork by another thread.
315*4882a593Smuzhiyun */
316*4882a593Smuzhiyun static int
xfs_convert_blocks(struct iomap_writepage_ctx * wpc,struct xfs_inode * ip,int whichfork,loff_t offset)317*4882a593Smuzhiyun xfs_convert_blocks(
318*4882a593Smuzhiyun struct iomap_writepage_ctx *wpc,
319*4882a593Smuzhiyun struct xfs_inode *ip,
320*4882a593Smuzhiyun int whichfork,
321*4882a593Smuzhiyun loff_t offset)
322*4882a593Smuzhiyun {
323*4882a593Smuzhiyun int error;
324*4882a593Smuzhiyun unsigned *seq;
325*4882a593Smuzhiyun
326*4882a593Smuzhiyun if (whichfork == XFS_COW_FORK)
327*4882a593Smuzhiyun seq = &XFS_WPC(wpc)->cow_seq;
328*4882a593Smuzhiyun else
329*4882a593Smuzhiyun seq = &XFS_WPC(wpc)->data_seq;
330*4882a593Smuzhiyun
331*4882a593Smuzhiyun /*
332*4882a593Smuzhiyun * Attempt to allocate whatever delalloc extent currently backs offset
333*4882a593Smuzhiyun * and put the result into wpc->iomap. Allocate in a loop because it
334*4882a593Smuzhiyun * may take several attempts to allocate real blocks for a contiguous
335*4882a593Smuzhiyun * delalloc extent if free space is sufficiently fragmented.
336*4882a593Smuzhiyun */
337*4882a593Smuzhiyun do {
338*4882a593Smuzhiyun error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
339*4882a593Smuzhiyun &wpc->iomap, seq);
340*4882a593Smuzhiyun if (error)
341*4882a593Smuzhiyun return error;
342*4882a593Smuzhiyun } while (wpc->iomap.offset + wpc->iomap.length <= offset);
343*4882a593Smuzhiyun
344*4882a593Smuzhiyun return 0;
345*4882a593Smuzhiyun }
346*4882a593Smuzhiyun
347*4882a593Smuzhiyun static int
xfs_map_blocks(struct iomap_writepage_ctx * wpc,struct inode * inode,loff_t offset)348*4882a593Smuzhiyun xfs_map_blocks(
349*4882a593Smuzhiyun struct iomap_writepage_ctx *wpc,
350*4882a593Smuzhiyun struct inode *inode,
351*4882a593Smuzhiyun loff_t offset)
352*4882a593Smuzhiyun {
353*4882a593Smuzhiyun struct xfs_inode *ip = XFS_I(inode);
354*4882a593Smuzhiyun struct xfs_mount *mp = ip->i_mount;
355*4882a593Smuzhiyun ssize_t count = i_blocksize(inode);
356*4882a593Smuzhiyun xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
357*4882a593Smuzhiyun xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
358*4882a593Smuzhiyun xfs_fileoff_t cow_fsb;
359*4882a593Smuzhiyun int whichfork;
360*4882a593Smuzhiyun struct xfs_bmbt_irec imap;
361*4882a593Smuzhiyun struct xfs_iext_cursor icur;
362*4882a593Smuzhiyun int retries = 0;
363*4882a593Smuzhiyun int error = 0;
364*4882a593Smuzhiyun
365*4882a593Smuzhiyun if (XFS_FORCED_SHUTDOWN(mp))
366*4882a593Smuzhiyun return -EIO;
367*4882a593Smuzhiyun
368*4882a593Smuzhiyun /*
369*4882a593Smuzhiyun * COW fork blocks can overlap data fork blocks even if the blocks
370*4882a593Smuzhiyun * aren't shared. COW I/O always takes precedent, so we must always
371*4882a593Smuzhiyun * check for overlap on reflink inodes unless the mapping is already a
372*4882a593Smuzhiyun * COW one, or the COW fork hasn't changed from the last time we looked
373*4882a593Smuzhiyun * at it.
374*4882a593Smuzhiyun *
375*4882a593Smuzhiyun * It's safe to check the COW fork if_seq here without the ILOCK because
376*4882a593Smuzhiyun * we've indirectly protected against concurrent updates: writeback has
377*4882a593Smuzhiyun * the page locked, which prevents concurrent invalidations by reflink
378*4882a593Smuzhiyun * and directio and prevents concurrent buffered writes to the same
379*4882a593Smuzhiyun * page. Changes to if_seq always happen under i_lock, which protects
380*4882a593Smuzhiyun * against concurrent updates and provides a memory barrier on the way
381*4882a593Smuzhiyun * out that ensures that we always see the current value.
382*4882a593Smuzhiyun */
383*4882a593Smuzhiyun if (xfs_imap_valid(wpc, ip, offset))
384*4882a593Smuzhiyun return 0;
385*4882a593Smuzhiyun
386*4882a593Smuzhiyun /*
387*4882a593Smuzhiyun * If we don't have a valid map, now it's time to get a new one for this
388*4882a593Smuzhiyun * offset. This will convert delayed allocations (including COW ones)
389*4882a593Smuzhiyun * into real extents. If we return without a valid map, it means we
390*4882a593Smuzhiyun * landed in a hole and we skip the block.
391*4882a593Smuzhiyun */
392*4882a593Smuzhiyun retry:
393*4882a593Smuzhiyun cow_fsb = NULLFILEOFF;
394*4882a593Smuzhiyun whichfork = XFS_DATA_FORK;
395*4882a593Smuzhiyun xfs_ilock(ip, XFS_ILOCK_SHARED);
396*4882a593Smuzhiyun ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
397*4882a593Smuzhiyun (ip->i_df.if_flags & XFS_IFEXTENTS));
398*4882a593Smuzhiyun
399*4882a593Smuzhiyun /*
400*4882a593Smuzhiyun * Check if this is offset is covered by a COW extents, and if yes use
401*4882a593Smuzhiyun * it directly instead of looking up anything in the data fork.
402*4882a593Smuzhiyun */
403*4882a593Smuzhiyun if (xfs_inode_has_cow_data(ip) &&
404*4882a593Smuzhiyun xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
405*4882a593Smuzhiyun cow_fsb = imap.br_startoff;
406*4882a593Smuzhiyun if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
407*4882a593Smuzhiyun XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
408*4882a593Smuzhiyun xfs_iunlock(ip, XFS_ILOCK_SHARED);
409*4882a593Smuzhiyun
410*4882a593Smuzhiyun whichfork = XFS_COW_FORK;
411*4882a593Smuzhiyun goto allocate_blocks;
412*4882a593Smuzhiyun }
413*4882a593Smuzhiyun
414*4882a593Smuzhiyun /*
415*4882a593Smuzhiyun * No COW extent overlap. Revalidate now that we may have updated
416*4882a593Smuzhiyun * ->cow_seq. If the data mapping is still valid, we're done.
417*4882a593Smuzhiyun */
418*4882a593Smuzhiyun if (xfs_imap_valid(wpc, ip, offset)) {
419*4882a593Smuzhiyun xfs_iunlock(ip, XFS_ILOCK_SHARED);
420*4882a593Smuzhiyun return 0;
421*4882a593Smuzhiyun }
422*4882a593Smuzhiyun
423*4882a593Smuzhiyun /*
424*4882a593Smuzhiyun * If we don't have a valid map, now it's time to get a new one for this
425*4882a593Smuzhiyun * offset. This will convert delayed allocations (including COW ones)
426*4882a593Smuzhiyun * into real extents.
427*4882a593Smuzhiyun */
428*4882a593Smuzhiyun if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
429*4882a593Smuzhiyun imap.br_startoff = end_fsb; /* fake a hole past EOF */
430*4882a593Smuzhiyun XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
431*4882a593Smuzhiyun xfs_iunlock(ip, XFS_ILOCK_SHARED);
432*4882a593Smuzhiyun
433*4882a593Smuzhiyun /* landed in a hole or beyond EOF? */
434*4882a593Smuzhiyun if (imap.br_startoff > offset_fsb) {
435*4882a593Smuzhiyun imap.br_blockcount = imap.br_startoff - offset_fsb;
436*4882a593Smuzhiyun imap.br_startoff = offset_fsb;
437*4882a593Smuzhiyun imap.br_startblock = HOLESTARTBLOCK;
438*4882a593Smuzhiyun imap.br_state = XFS_EXT_NORM;
439*4882a593Smuzhiyun }
440*4882a593Smuzhiyun
441*4882a593Smuzhiyun /*
442*4882a593Smuzhiyun * Truncate to the next COW extent if there is one. This is the only
443*4882a593Smuzhiyun * opportunity to do this because we can skip COW fork lookups for the
444*4882a593Smuzhiyun * subsequent blocks in the mapping; however, the requirement to treat
445*4882a593Smuzhiyun * the COW range separately remains.
446*4882a593Smuzhiyun */
447*4882a593Smuzhiyun if (cow_fsb != NULLFILEOFF &&
448*4882a593Smuzhiyun cow_fsb < imap.br_startoff + imap.br_blockcount)
449*4882a593Smuzhiyun imap.br_blockcount = cow_fsb - imap.br_startoff;
450*4882a593Smuzhiyun
451*4882a593Smuzhiyun /* got a delalloc extent? */
452*4882a593Smuzhiyun if (imap.br_startblock != HOLESTARTBLOCK &&
453*4882a593Smuzhiyun isnullstartblock(imap.br_startblock))
454*4882a593Smuzhiyun goto allocate_blocks;
455*4882a593Smuzhiyun
456*4882a593Smuzhiyun xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0);
457*4882a593Smuzhiyun trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
458*4882a593Smuzhiyun return 0;
459*4882a593Smuzhiyun allocate_blocks:
460*4882a593Smuzhiyun error = xfs_convert_blocks(wpc, ip, whichfork, offset);
461*4882a593Smuzhiyun if (error) {
462*4882a593Smuzhiyun /*
463*4882a593Smuzhiyun * If we failed to find the extent in the COW fork we might have
464*4882a593Smuzhiyun * raced with a COW to data fork conversion or truncate.
465*4882a593Smuzhiyun * Restart the lookup to catch the extent in the data fork for
466*4882a593Smuzhiyun * the former case, but prevent additional retries to avoid
467*4882a593Smuzhiyun * looping forever for the latter case.
468*4882a593Smuzhiyun */
469*4882a593Smuzhiyun if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
470*4882a593Smuzhiyun goto retry;
471*4882a593Smuzhiyun ASSERT(error != -EAGAIN);
472*4882a593Smuzhiyun return error;
473*4882a593Smuzhiyun }
474*4882a593Smuzhiyun
475*4882a593Smuzhiyun /*
476*4882a593Smuzhiyun * Due to merging the return real extent might be larger than the
477*4882a593Smuzhiyun * original delalloc one. Trim the return extent to the next COW
478*4882a593Smuzhiyun * boundary again to force a re-lookup.
479*4882a593Smuzhiyun */
480*4882a593Smuzhiyun if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
481*4882a593Smuzhiyun loff_t cow_offset = XFS_FSB_TO_B(mp, cow_fsb);
482*4882a593Smuzhiyun
483*4882a593Smuzhiyun if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
484*4882a593Smuzhiyun wpc->iomap.length = cow_offset - wpc->iomap.offset;
485*4882a593Smuzhiyun }
486*4882a593Smuzhiyun
487*4882a593Smuzhiyun ASSERT(wpc->iomap.offset <= offset);
488*4882a593Smuzhiyun ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
489*4882a593Smuzhiyun trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
490*4882a593Smuzhiyun return 0;
491*4882a593Smuzhiyun }
492*4882a593Smuzhiyun
493*4882a593Smuzhiyun static int
xfs_prepare_ioend(struct iomap_ioend * ioend,int status)494*4882a593Smuzhiyun xfs_prepare_ioend(
495*4882a593Smuzhiyun struct iomap_ioend *ioend,
496*4882a593Smuzhiyun int status)
497*4882a593Smuzhiyun {
498*4882a593Smuzhiyun unsigned int nofs_flag;
499*4882a593Smuzhiyun
500*4882a593Smuzhiyun /*
501*4882a593Smuzhiyun * We can allocate memory here while doing writeback on behalf of
502*4882a593Smuzhiyun * memory reclaim. To avoid memory allocation deadlocks set the
503*4882a593Smuzhiyun * task-wide nofs context for the following operations.
504*4882a593Smuzhiyun */
505*4882a593Smuzhiyun nofs_flag = memalloc_nofs_save();
506*4882a593Smuzhiyun
507*4882a593Smuzhiyun /* Convert CoW extents to regular */
508*4882a593Smuzhiyun if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
509*4882a593Smuzhiyun status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
510*4882a593Smuzhiyun ioend->io_offset, ioend->io_size);
511*4882a593Smuzhiyun }
512*4882a593Smuzhiyun
513*4882a593Smuzhiyun /* Reserve log space if we might write beyond the on-disk inode size. */
514*4882a593Smuzhiyun if (!status &&
515*4882a593Smuzhiyun ((ioend->io_flags & IOMAP_F_SHARED) ||
516*4882a593Smuzhiyun ioend->io_type != IOMAP_UNWRITTEN) &&
517*4882a593Smuzhiyun xfs_ioend_is_append(ioend) &&
518*4882a593Smuzhiyun !ioend->io_private)
519*4882a593Smuzhiyun status = xfs_setfilesize_trans_alloc(ioend);
520*4882a593Smuzhiyun
521*4882a593Smuzhiyun memalloc_nofs_restore(nofs_flag);
522*4882a593Smuzhiyun
523*4882a593Smuzhiyun if (xfs_ioend_needs_workqueue(ioend))
524*4882a593Smuzhiyun ioend->io_bio->bi_end_io = xfs_end_bio;
525*4882a593Smuzhiyun return status;
526*4882a593Smuzhiyun }
527*4882a593Smuzhiyun
528*4882a593Smuzhiyun /*
529*4882a593Smuzhiyun * If the page has delalloc blocks on it, we need to punch them out before we
530*4882a593Smuzhiyun * invalidate the page. If we don't, we leave a stale delalloc mapping on the
531*4882a593Smuzhiyun * inode that can trip up a later direct I/O read operation on the same region.
532*4882a593Smuzhiyun *
533*4882a593Smuzhiyun * We prevent this by truncating away the delalloc regions on the page. Because
534*4882a593Smuzhiyun * they are delalloc, we can do this without needing a transaction. Indeed - if
535*4882a593Smuzhiyun * we get ENOSPC errors, we have to be able to do this truncation without a
536*4882a593Smuzhiyun * transaction as there is no space left for block reservation (typically why we
537*4882a593Smuzhiyun * see a ENOSPC in writeback).
538*4882a593Smuzhiyun */
539*4882a593Smuzhiyun static void
xfs_discard_page(struct page * page,loff_t fileoff)540*4882a593Smuzhiyun xfs_discard_page(
541*4882a593Smuzhiyun struct page *page,
542*4882a593Smuzhiyun loff_t fileoff)
543*4882a593Smuzhiyun {
544*4882a593Smuzhiyun struct inode *inode = page->mapping->host;
545*4882a593Smuzhiyun struct xfs_inode *ip = XFS_I(inode);
546*4882a593Smuzhiyun struct xfs_mount *mp = ip->i_mount;
547*4882a593Smuzhiyun unsigned int pageoff = offset_in_page(fileoff);
548*4882a593Smuzhiyun xfs_fileoff_t start_fsb = XFS_B_TO_FSBT(mp, fileoff);
549*4882a593Smuzhiyun xfs_fileoff_t pageoff_fsb = XFS_B_TO_FSBT(mp, pageoff);
550*4882a593Smuzhiyun int error;
551*4882a593Smuzhiyun
552*4882a593Smuzhiyun if (XFS_FORCED_SHUTDOWN(mp))
553*4882a593Smuzhiyun goto out_invalidate;
554*4882a593Smuzhiyun
555*4882a593Smuzhiyun xfs_alert_ratelimited(mp,
556*4882a593Smuzhiyun "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
557*4882a593Smuzhiyun page, ip->i_ino, fileoff);
558*4882a593Smuzhiyun
559*4882a593Smuzhiyun error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
560*4882a593Smuzhiyun i_blocks_per_page(inode, page) - pageoff_fsb);
561*4882a593Smuzhiyun if (error && !XFS_FORCED_SHUTDOWN(mp))
562*4882a593Smuzhiyun xfs_alert(mp, "page discard unable to remove delalloc mapping.");
563*4882a593Smuzhiyun out_invalidate:
564*4882a593Smuzhiyun iomap_invalidatepage(page, pageoff, PAGE_SIZE - pageoff);
565*4882a593Smuzhiyun }
566*4882a593Smuzhiyun
567*4882a593Smuzhiyun static const struct iomap_writeback_ops xfs_writeback_ops = {
568*4882a593Smuzhiyun .map_blocks = xfs_map_blocks,
569*4882a593Smuzhiyun .prepare_ioend = xfs_prepare_ioend,
570*4882a593Smuzhiyun .discard_page = xfs_discard_page,
571*4882a593Smuzhiyun };
572*4882a593Smuzhiyun
573*4882a593Smuzhiyun STATIC int
xfs_vm_writepage(struct page * page,struct writeback_control * wbc)574*4882a593Smuzhiyun xfs_vm_writepage(
575*4882a593Smuzhiyun struct page *page,
576*4882a593Smuzhiyun struct writeback_control *wbc)
577*4882a593Smuzhiyun {
578*4882a593Smuzhiyun struct xfs_writepage_ctx wpc = { };
579*4882a593Smuzhiyun
580*4882a593Smuzhiyun if (WARN_ON_ONCE(current->journal_info)) {
581*4882a593Smuzhiyun redirty_page_for_writepage(wbc, page);
582*4882a593Smuzhiyun unlock_page(page);
583*4882a593Smuzhiyun return 0;
584*4882a593Smuzhiyun }
585*4882a593Smuzhiyun
586*4882a593Smuzhiyun return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
587*4882a593Smuzhiyun }
588*4882a593Smuzhiyun
589*4882a593Smuzhiyun STATIC int
xfs_vm_writepages(struct address_space * mapping,struct writeback_control * wbc)590*4882a593Smuzhiyun xfs_vm_writepages(
591*4882a593Smuzhiyun struct address_space *mapping,
592*4882a593Smuzhiyun struct writeback_control *wbc)
593*4882a593Smuzhiyun {
594*4882a593Smuzhiyun struct xfs_writepage_ctx wpc = { };
595*4882a593Smuzhiyun
596*4882a593Smuzhiyun /*
597*4882a593Smuzhiyun * Writing back data in a transaction context can result in recursive
598*4882a593Smuzhiyun * transactions. This is bad, so issue a warning and get out of here.
599*4882a593Smuzhiyun */
600*4882a593Smuzhiyun if (WARN_ON_ONCE(current->journal_info))
601*4882a593Smuzhiyun return 0;
602*4882a593Smuzhiyun
603*4882a593Smuzhiyun xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
604*4882a593Smuzhiyun return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
605*4882a593Smuzhiyun }
606*4882a593Smuzhiyun
607*4882a593Smuzhiyun STATIC int
xfs_dax_writepages(struct address_space * mapping,struct writeback_control * wbc)608*4882a593Smuzhiyun xfs_dax_writepages(
609*4882a593Smuzhiyun struct address_space *mapping,
610*4882a593Smuzhiyun struct writeback_control *wbc)
611*4882a593Smuzhiyun {
612*4882a593Smuzhiyun struct xfs_inode *ip = XFS_I(mapping->host);
613*4882a593Smuzhiyun
614*4882a593Smuzhiyun xfs_iflags_clear(ip, XFS_ITRUNCATED);
615*4882a593Smuzhiyun return dax_writeback_mapping_range(mapping,
616*4882a593Smuzhiyun xfs_inode_buftarg(ip)->bt_daxdev, wbc);
617*4882a593Smuzhiyun }
618*4882a593Smuzhiyun
619*4882a593Smuzhiyun STATIC sector_t
xfs_vm_bmap(struct address_space * mapping,sector_t block)620*4882a593Smuzhiyun xfs_vm_bmap(
621*4882a593Smuzhiyun struct address_space *mapping,
622*4882a593Smuzhiyun sector_t block)
623*4882a593Smuzhiyun {
624*4882a593Smuzhiyun struct xfs_inode *ip = XFS_I(mapping->host);
625*4882a593Smuzhiyun
626*4882a593Smuzhiyun trace_xfs_vm_bmap(ip);
627*4882a593Smuzhiyun
628*4882a593Smuzhiyun /*
629*4882a593Smuzhiyun * The swap code (ab-)uses ->bmap to get a block mapping and then
630*4882a593Smuzhiyun * bypasses the file system for actual I/O. We really can't allow
631*4882a593Smuzhiyun * that on reflinks inodes, so we have to skip out here. And yes,
632*4882a593Smuzhiyun * 0 is the magic code for a bmap error.
633*4882a593Smuzhiyun *
634*4882a593Smuzhiyun * Since we don't pass back blockdev info, we can't return bmap
635*4882a593Smuzhiyun * information for rt files either.
636*4882a593Smuzhiyun */
637*4882a593Smuzhiyun if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
638*4882a593Smuzhiyun return 0;
639*4882a593Smuzhiyun return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
640*4882a593Smuzhiyun }
641*4882a593Smuzhiyun
642*4882a593Smuzhiyun STATIC int
xfs_vm_readpage(struct file * unused,struct page * page)643*4882a593Smuzhiyun xfs_vm_readpage(
644*4882a593Smuzhiyun struct file *unused,
645*4882a593Smuzhiyun struct page *page)
646*4882a593Smuzhiyun {
647*4882a593Smuzhiyun return iomap_readpage(page, &xfs_read_iomap_ops);
648*4882a593Smuzhiyun }
649*4882a593Smuzhiyun
650*4882a593Smuzhiyun STATIC void
xfs_vm_readahead(struct readahead_control * rac)651*4882a593Smuzhiyun xfs_vm_readahead(
652*4882a593Smuzhiyun struct readahead_control *rac)
653*4882a593Smuzhiyun {
654*4882a593Smuzhiyun iomap_readahead(rac, &xfs_read_iomap_ops);
655*4882a593Smuzhiyun }
656*4882a593Smuzhiyun
657*4882a593Smuzhiyun static int
xfs_iomap_swapfile_activate(struct swap_info_struct * sis,struct file * swap_file,sector_t * span)658*4882a593Smuzhiyun xfs_iomap_swapfile_activate(
659*4882a593Smuzhiyun struct swap_info_struct *sis,
660*4882a593Smuzhiyun struct file *swap_file,
661*4882a593Smuzhiyun sector_t *span)
662*4882a593Smuzhiyun {
663*4882a593Smuzhiyun sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
664*4882a593Smuzhiyun return iomap_swapfile_activate(sis, swap_file, span,
665*4882a593Smuzhiyun &xfs_read_iomap_ops);
666*4882a593Smuzhiyun }
667*4882a593Smuzhiyun
668*4882a593Smuzhiyun const struct address_space_operations xfs_address_space_operations = {
669*4882a593Smuzhiyun .readpage = xfs_vm_readpage,
670*4882a593Smuzhiyun .readahead = xfs_vm_readahead,
671*4882a593Smuzhiyun .writepage = xfs_vm_writepage,
672*4882a593Smuzhiyun .writepages = xfs_vm_writepages,
673*4882a593Smuzhiyun .set_page_dirty = iomap_set_page_dirty,
674*4882a593Smuzhiyun .releasepage = iomap_releasepage,
675*4882a593Smuzhiyun .invalidatepage = iomap_invalidatepage,
676*4882a593Smuzhiyun .bmap = xfs_vm_bmap,
677*4882a593Smuzhiyun .direct_IO = noop_direct_IO,
678*4882a593Smuzhiyun .migratepage = iomap_migrate_page,
679*4882a593Smuzhiyun .is_partially_uptodate = iomap_is_partially_uptodate,
680*4882a593Smuzhiyun .error_remove_page = generic_error_remove_page,
681*4882a593Smuzhiyun .swap_activate = xfs_iomap_swapfile_activate,
682*4882a593Smuzhiyun };
683*4882a593Smuzhiyun
684*4882a593Smuzhiyun const struct address_space_operations xfs_dax_aops = {
685*4882a593Smuzhiyun .writepages = xfs_dax_writepages,
686*4882a593Smuzhiyun .direct_IO = noop_direct_IO,
687*4882a593Smuzhiyun .set_page_dirty = noop_set_page_dirty,
688*4882a593Smuzhiyun .invalidatepage = noop_invalidatepage,
689*4882a593Smuzhiyun .swap_activate = xfs_iomap_swapfile_activate,
690*4882a593Smuzhiyun };
691